diff --git a/.gitattributes b/.gitattributes index d610e02cdcd3a96b5c2a6b3b8877c0d37655ffac..73d4fc5ed2aa9d1c0bfb2344954b846c4f6370e1 100644 --- a/.gitattributes +++ b/.gitattributes @@ -650,3 +650,12 @@ Qwen2-7B-Instruct_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r- Qwen2-7B-Instruct_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-8628-sd-10000/checkpoint-448/tokenizer.json filter=lfs diff=lfs merge=lfs -text Qwen2-7B-Instruct_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-8628-sd-10000/checkpoint-896/tokenizer.json filter=lfs diff=lfs merge=lfs -text Qwen2-7B-Instruct_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-8628-sd-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..805cd021178a92a5f6aee3f744a941cf0ca13916 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a4b931f6963f7a4f6edb095ef574385c121f99f4cf59b5fdda7531d710a5271 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7f25d34f0134a7016eb9a029e896b0ffd16f751f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f9e37a39cb8d75f9584d5da14ce516df0aea257a64514c351dafac1490270ca +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b69197b69c302e7c0e58f1e7c1e325b8059469a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6717a0476210c674c25a0fe088b0d21143746d738bb24ad668a6f5c017422a8 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1d1684c3a8ebf09d0f551ae2c347e6d37b58c65c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dae728819e4a0cb83003ccd066c42471432ccc1a68e009ac714e5df8a8c08b7d +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9f7ae2aec92b71d3c47e1923674f4242047ff73 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fcec17e8bb722f589f5c9b540a4846addf3c9bfe19726aa9b467b6c0c15627b +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c5637455c7c09fcf1d65f01850ea59ec25837773 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/trainer_state.json @@ -0,0 +1,8738 @@ +{ + "best_metric": 1.0958120822906494, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 12392, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032278889606197547, + "grad_norm": 0.7092075347900391, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 10 + }, + { + "epoch": 0.006455777921239509, + "grad_norm": 0.6900479793548584, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 20 + }, + { + "epoch": 0.009683666881859263, + "grad_norm": 0.6788288950920105, + "learning_rate": 0.0002, + "loss": 0.9807, + "step": 30 + }, + { + "epoch": 0.012911555842479019, + "grad_norm": 0.5590243339538574, + "learning_rate": 0.0002, + "loss": 0.9385, + "step": 40 + }, + { + "epoch": 0.016139444803098774, + "grad_norm": 0.5136010646820068, + "learning_rate": 0.0002, + "loss": 0.931, + "step": 50 + }, + { + "epoch": 0.019367333763718526, + "grad_norm": 0.45298320055007935, + "learning_rate": 0.0002, + "loss": 0.8896, + "step": 60 + }, + { + "epoch": 0.022595222724338282, + "grad_norm": 0.5917162299156189, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 70 + }, + { + "epoch": 0.025823111684958037, + "grad_norm": 0.4414856433868408, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 80 + }, + { + "epoch": 0.029051000645577793, + "grad_norm": 0.5547978281974792, + "learning_rate": 0.0002, + "loss": 0.8419, + "step": 90 + }, + { + "epoch": 0.03227888960619755, + "grad_norm": 0.5271288156509399, + "learning_rate": 0.0002, + "loss": 0.8987, + "step": 100 + }, + { + "epoch": 0.035506778566817304, + "grad_norm": 0.5506119728088379, + "learning_rate": 0.0002, + "loss": 0.8543, + "step": 110 + }, + { + "epoch": 0.03873466752743705, + "grad_norm": 0.5579327940940857, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 120 + }, + { + "epoch": 0.04196255648805681, + "grad_norm": 0.5099632740020752, + "learning_rate": 0.0002, + "loss": 0.8826, + "step": 130 + }, + { + "epoch": 0.045190445448676564, + "grad_norm": 0.40396833419799805, + "learning_rate": 0.0002, + "loss": 0.9239, + "step": 140 + }, + { + "epoch": 0.04841833440929632, + "grad_norm": 0.5008092522621155, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 150 + }, + { + "epoch": 0.051646223369916075, + "grad_norm": 0.4388776421546936, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 160 + }, + { + "epoch": 0.05487411233053583, + "grad_norm": 0.44138944149017334, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 170 + }, + { + "epoch": 0.058102001291155586, + "grad_norm": 0.358484148979187, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 180 + }, + { + "epoch": 0.06132989025177534, + "grad_norm": 0.457052081823349, + "learning_rate": 0.0002, + "loss": 0.8956, + "step": 190 + }, + { + "epoch": 0.0645577792123951, + "grad_norm": 0.5537622570991516, + "learning_rate": 0.0002, + "loss": 0.9138, + "step": 200 + }, + { + "epoch": 0.06778566817301485, + "grad_norm": 0.552631676197052, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 210 + }, + { + "epoch": 0.07101355713363461, + "grad_norm": 0.4414575397968292, + "learning_rate": 0.0002, + "loss": 0.8854, + "step": 220 + }, + { + "epoch": 0.07424144609425436, + "grad_norm": 0.4996664226055145, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 230 + }, + { + "epoch": 0.0774693350548741, + "grad_norm": 0.7321897149085999, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 240 + }, + { + "epoch": 0.08069722401549387, + "grad_norm": 0.4553901255130768, + "learning_rate": 0.0002, + "loss": 0.8848, + "step": 250 + }, + { + "epoch": 0.08392511297611362, + "grad_norm": 0.5039054751396179, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 260 + }, + { + "epoch": 0.08715300193673338, + "grad_norm": 0.4113094210624695, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 270 + }, + { + "epoch": 0.09038089089735313, + "grad_norm": 0.450436532497406, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 280 + }, + { + "epoch": 0.09360877985797289, + "grad_norm": 0.4548024535179138, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 290 + }, + { + "epoch": 0.09683666881859264, + "grad_norm": 0.4932962656021118, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 300 + }, + { + "epoch": 0.1000645577792124, + "grad_norm": 0.4005250334739685, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 310 + }, + { + "epoch": 0.10329244673983215, + "grad_norm": 1.8321624994277954, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 320 + }, + { + "epoch": 0.1065203357004519, + "grad_norm": 0.45815610885620117, + "learning_rate": 0.0002, + "loss": 0.8411, + "step": 330 + }, + { + "epoch": 0.10974822466107166, + "grad_norm": 0.39324095845222473, + "learning_rate": 0.0002, + "loss": 0.857, + "step": 340 + }, + { + "epoch": 0.11297611362169141, + "grad_norm": 0.546273946762085, + "learning_rate": 0.0002, + "loss": 0.8258, + "step": 350 + }, + { + "epoch": 0.11620400258231117, + "grad_norm": 0.497448593378067, + "learning_rate": 0.0002, + "loss": 0.882, + "step": 360 + }, + { + "epoch": 0.11943189154293092, + "grad_norm": 0.37508800625801086, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 370 + }, + { + "epoch": 0.12265978050355068, + "grad_norm": 0.45849609375, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 380 + }, + { + "epoch": 0.12588766946417043, + "grad_norm": 0.5488408803939819, + "learning_rate": 0.0002, + "loss": 0.8437, + "step": 390 + }, + { + "epoch": 0.1291155584247902, + "grad_norm": 0.4477061331272125, + "learning_rate": 0.0002, + "loss": 0.8349, + "step": 400 + }, + { + "epoch": 0.13234344738540993, + "grad_norm": 0.39227980375289917, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 410 + }, + { + "epoch": 0.1355713363460297, + "grad_norm": 0.3922233581542969, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 420 + }, + { + "epoch": 0.13879922530664945, + "grad_norm": 0.42901909351348877, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 430 + }, + { + "epoch": 0.14202711426726922, + "grad_norm": 0.4217798709869385, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 440 + }, + { + "epoch": 0.14525500322788895, + "grad_norm": 0.43470677733421326, + "learning_rate": 0.0002, + "loss": 0.8594, + "step": 450 + }, + { + "epoch": 0.1484828921885087, + "grad_norm": 0.5324403047561646, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 460 + }, + { + "epoch": 0.15171078114912848, + "grad_norm": 0.3999756872653961, + "learning_rate": 0.0002, + "loss": 0.8729, + "step": 470 + }, + { + "epoch": 0.1549386701097482, + "grad_norm": 0.404933363199234, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 480 + }, + { + "epoch": 0.15816655907036797, + "grad_norm": 0.44122636318206787, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 490 + }, + { + "epoch": 0.16139444803098774, + "grad_norm": 0.510166347026825, + "learning_rate": 0.0002, + "loss": 0.8457, + "step": 500 + }, + { + "epoch": 0.1646223369916075, + "grad_norm": 0.4549732506275177, + "learning_rate": 0.0002, + "loss": 0.8692, + "step": 510 + }, + { + "epoch": 0.16785022595222723, + "grad_norm": 0.5148182511329651, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 520 + }, + { + "epoch": 0.171078114912847, + "grad_norm": 0.3596806824207306, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 530 + }, + { + "epoch": 0.17430600387346676, + "grad_norm": 0.4388909339904785, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 540 + }, + { + "epoch": 0.17753389283408652, + "grad_norm": 0.5052742958068848, + "learning_rate": 0.0002, + "loss": 0.8322, + "step": 550 + }, + { + "epoch": 0.18076178179470626, + "grad_norm": 0.48248958587646484, + "learning_rate": 0.0002, + "loss": 0.791, + "step": 560 + }, + { + "epoch": 0.18398967075532602, + "grad_norm": 0.5360197424888611, + "learning_rate": 0.0002, + "loss": 0.8593, + "step": 570 + }, + { + "epoch": 0.18721755971594578, + "grad_norm": 0.43999341130256653, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 580 + }, + { + "epoch": 0.19044544867656552, + "grad_norm": 0.3685208261013031, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 590 + }, + { + "epoch": 0.19367333763718528, + "grad_norm": 0.4601275622844696, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 600 + }, + { + "epoch": 0.19690122659780504, + "grad_norm": 0.4778369665145874, + "learning_rate": 0.0002, + "loss": 0.8483, + "step": 610 + }, + { + "epoch": 0.2001291155584248, + "grad_norm": 0.4867003560066223, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 620 + }, + { + "epoch": 0.20335700451904454, + "grad_norm": 0.4583742916584015, + "learning_rate": 0.0002, + "loss": 0.8554, + "step": 630 + }, + { + "epoch": 0.2065848934796643, + "grad_norm": 0.47958165407180786, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 640 + }, + { + "epoch": 0.20981278244028406, + "grad_norm": 0.4526064097881317, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 650 + }, + { + "epoch": 0.2130406714009038, + "grad_norm": 0.45890581607818604, + "learning_rate": 0.0002, + "loss": 0.8313, + "step": 660 + }, + { + "epoch": 0.21626856036152356, + "grad_norm": 0.42725905776023865, + "learning_rate": 0.0002, + "loss": 0.8143, + "step": 670 + }, + { + "epoch": 0.21949644932214332, + "grad_norm": 0.40380963683128357, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 680 + }, + { + "epoch": 0.22272433828276308, + "grad_norm": 0.4372998774051666, + "learning_rate": 0.0002, + "loss": 0.9004, + "step": 690 + }, + { + "epoch": 0.22595222724338282, + "grad_norm": 0.4245864450931549, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 700 + }, + { + "epoch": 0.22918011620400258, + "grad_norm": 0.4061129689216614, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 710 + }, + { + "epoch": 0.23240800516462234, + "grad_norm": 0.474454790353775, + "learning_rate": 0.0002, + "loss": 0.8275, + "step": 720 + }, + { + "epoch": 0.23563589412524208, + "grad_norm": 0.4908486008644104, + "learning_rate": 0.0002, + "loss": 0.8346, + "step": 730 + }, + { + "epoch": 0.23886378308586184, + "grad_norm": 0.4284191429615021, + "learning_rate": 0.0002, + "loss": 0.8755, + "step": 740 + }, + { + "epoch": 0.2420916720464816, + "grad_norm": 0.44730308651924133, + "learning_rate": 0.0002, + "loss": 0.8387, + "step": 750 + }, + { + "epoch": 0.24531956100710137, + "grad_norm": 0.4433246850967407, + "learning_rate": 0.0002, + "loss": 0.8135, + "step": 760 + }, + { + "epoch": 0.2485474499677211, + "grad_norm": 0.43668854236602783, + "learning_rate": 0.0002, + "loss": 0.8644, + "step": 770 + }, + { + "epoch": 0.25177533892834086, + "grad_norm": 0.34324130415916443, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 780 + }, + { + "epoch": 0.2550032278889606, + "grad_norm": 0.46476295590400696, + "learning_rate": 0.0002, + "loss": 0.8725, + "step": 790 + }, + { + "epoch": 0.2582311168495804, + "grad_norm": 0.5047039985656738, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 800 + }, + { + "epoch": 0.26145900581020015, + "grad_norm": 0.4402127265930176, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 810 + }, + { + "epoch": 0.26468689477081986, + "grad_norm": 0.4642465114593506, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 820 + }, + { + "epoch": 0.2679147837314396, + "grad_norm": 0.40093424916267395, + "learning_rate": 0.0002, + "loss": 0.8836, + "step": 830 + }, + { + "epoch": 0.2711426726920594, + "grad_norm": 0.42501842975616455, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 840 + }, + { + "epoch": 0.27437056165267915, + "grad_norm": 0.43279722332954407, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 850 + }, + { + "epoch": 0.2775984506132989, + "grad_norm": 0.5991243720054626, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 860 + }, + { + "epoch": 0.28082633957391867, + "grad_norm": 0.4217848777770996, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 870 + }, + { + "epoch": 0.28405422853453843, + "grad_norm": 0.3933536410331726, + "learning_rate": 0.0002, + "loss": 0.8135, + "step": 880 + }, + { + "epoch": 0.28728211749515814, + "grad_norm": 0.5868505239486694, + "learning_rate": 0.0002, + "loss": 0.8846, + "step": 890 + }, + { + "epoch": 0.2905100064557779, + "grad_norm": 0.5209547877311707, + "learning_rate": 0.0002, + "loss": 0.8759, + "step": 900 + }, + { + "epoch": 0.29373789541639767, + "grad_norm": 0.49307361245155334, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 910 + }, + { + "epoch": 0.2969657843770174, + "grad_norm": 0.4288382828235626, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 920 + }, + { + "epoch": 0.3001936733376372, + "grad_norm": 0.33568474650382996, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 930 + }, + { + "epoch": 0.30342156229825695, + "grad_norm": 1.0915930271148682, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 940 + }, + { + "epoch": 0.3066494512588767, + "grad_norm": 0.5489798188209534, + "learning_rate": 0.0002, + "loss": 0.8535, + "step": 950 + }, + { + "epoch": 0.3098773402194964, + "grad_norm": 0.42971742153167725, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 960 + }, + { + "epoch": 0.3131052291801162, + "grad_norm": 0.43375834822654724, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 970 + }, + { + "epoch": 0.31633311814073595, + "grad_norm": 0.47488611936569214, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 980 + }, + { + "epoch": 0.3195610071013557, + "grad_norm": 0.46296775341033936, + "learning_rate": 0.0002, + "loss": 0.7906, + "step": 990 + }, + { + "epoch": 0.32278889606197547, + "grad_norm": 0.4548890292644501, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 1000 + }, + { + "epoch": 0.32601678502259523, + "grad_norm": 0.41834497451782227, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 1010 + }, + { + "epoch": 0.329244673983215, + "grad_norm": 0.441092312335968, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 1020 + }, + { + "epoch": 0.33247256294383476, + "grad_norm": 0.637322187423706, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 1030 + }, + { + "epoch": 0.33570045190445447, + "grad_norm": 0.4374958574771881, + "learning_rate": 0.0002, + "loss": 0.8685, + "step": 1040 + }, + { + "epoch": 0.33892834086507423, + "grad_norm": 0.3935825824737549, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1050 + }, + { + "epoch": 0.342156229825694, + "grad_norm": 0.43526220321655273, + "learning_rate": 0.0002, + "loss": 0.8287, + "step": 1060 + }, + { + "epoch": 0.34538411878631375, + "grad_norm": 0.45327696204185486, + "learning_rate": 0.0002, + "loss": 0.8413, + "step": 1070 + }, + { + "epoch": 0.3486120077469335, + "grad_norm": 0.4126075506210327, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 1080 + }, + { + "epoch": 0.3518398967075533, + "grad_norm": 0.4714072048664093, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 1090 + }, + { + "epoch": 0.35506778566817304, + "grad_norm": 0.518127977848053, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 1100 + }, + { + "epoch": 0.35829567462879275, + "grad_norm": 0.43264099955558777, + "learning_rate": 0.0002, + "loss": 0.8479, + "step": 1110 + }, + { + "epoch": 0.3615235635894125, + "grad_norm": 0.4857400357723236, + "learning_rate": 0.0002, + "loss": 0.8724, + "step": 1120 + }, + { + "epoch": 0.3647514525500323, + "grad_norm": 0.37591469287872314, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 1130 + }, + { + "epoch": 0.36797934151065204, + "grad_norm": 0.4165478050708771, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 1140 + }, + { + "epoch": 0.3712072304712718, + "grad_norm": 0.42911383509635925, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 1150 + }, + { + "epoch": 0.37443511943189156, + "grad_norm": 0.44980287551879883, + "learning_rate": 0.0002, + "loss": 0.8722, + "step": 1160 + }, + { + "epoch": 0.3776630083925113, + "grad_norm": 0.4066573679447174, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 1170 + }, + { + "epoch": 0.38089089735313103, + "grad_norm": 0.5056195855140686, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 1180 + }, + { + "epoch": 0.3841187863137508, + "grad_norm": 0.4141536355018616, + "learning_rate": 0.0002, + "loss": 0.8387, + "step": 1190 + }, + { + "epoch": 0.38734667527437056, + "grad_norm": 0.4501924514770508, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 1200 + }, + { + "epoch": 0.3905745642349903, + "grad_norm": 0.43304240703582764, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 1210 + }, + { + "epoch": 0.3938024531956101, + "grad_norm": 0.475777804851532, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 1220 + }, + { + "epoch": 0.39703034215622984, + "grad_norm": 0.5846465826034546, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 1230 + }, + { + "epoch": 0.4002582311168496, + "grad_norm": 0.42899325489997864, + "learning_rate": 0.0002, + "loss": 0.8078, + "step": 1240 + }, + { + "epoch": 0.4034861200774693, + "grad_norm": 0.3980463147163391, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 1250 + }, + { + "epoch": 0.4067140090380891, + "grad_norm": 0.45769768953323364, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 1260 + }, + { + "epoch": 0.40994189799870884, + "grad_norm": 0.5101280212402344, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 1270 + }, + { + "epoch": 0.4131697869593286, + "grad_norm": 0.47374317049980164, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 1280 + }, + { + "epoch": 0.41639767591994836, + "grad_norm": 0.4261878728866577, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 1290 + }, + { + "epoch": 0.4196255648805681, + "grad_norm": 0.46954256296157837, + "learning_rate": 0.0002, + "loss": 0.9004, + "step": 1300 + }, + { + "epoch": 0.4228534538411879, + "grad_norm": 0.5205738544464111, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 1310 + }, + { + "epoch": 0.4260813428018076, + "grad_norm": 0.5176340937614441, + "learning_rate": 0.0002, + "loss": 0.8964, + "step": 1320 + }, + { + "epoch": 0.42930923176242736, + "grad_norm": 0.5155916810035706, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 1330 + }, + { + "epoch": 0.4325371207230471, + "grad_norm": 0.44548553228378296, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 1340 + }, + { + "epoch": 0.4357650096836669, + "grad_norm": 0.5633558630943298, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 1350 + }, + { + "epoch": 0.43899289864428664, + "grad_norm": 0.42444056272506714, + "learning_rate": 0.0002, + "loss": 0.7889, + "step": 1360 + }, + { + "epoch": 0.4422207876049064, + "grad_norm": 0.5226860642433167, + "learning_rate": 0.0002, + "loss": 0.8588, + "step": 1370 + }, + { + "epoch": 0.44544867656552617, + "grad_norm": 0.5354582071304321, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 1380 + }, + { + "epoch": 0.4486765655261459, + "grad_norm": 0.472646564245224, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 1390 + }, + { + "epoch": 0.45190445448676564, + "grad_norm": 0.6312310099601746, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 1400 + }, + { + "epoch": 0.4551323434473854, + "grad_norm": 0.4298408031463623, + "learning_rate": 0.0002, + "loss": 0.8212, + "step": 1410 + }, + { + "epoch": 0.45836023240800516, + "grad_norm": 0.43427202105522156, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 1420 + }, + { + "epoch": 0.4615881213686249, + "grad_norm": 0.44097861647605896, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 1430 + }, + { + "epoch": 0.4648160103292447, + "grad_norm": 0.5142693519592285, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1440 + }, + { + "epoch": 0.46804389928986445, + "grad_norm": 0.46416547894477844, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 1450 + }, + { + "epoch": 0.47127178825048416, + "grad_norm": 0.4858551025390625, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 1460 + }, + { + "epoch": 0.4744996772111039, + "grad_norm": 0.4709177315235138, + "learning_rate": 0.0002, + "loss": 0.8354, + "step": 1470 + }, + { + "epoch": 0.4777275661717237, + "grad_norm": 0.5500252842903137, + "learning_rate": 0.0002, + "loss": 0.8391, + "step": 1480 + }, + { + "epoch": 0.48095545513234345, + "grad_norm": 0.43364381790161133, + "learning_rate": 0.0002, + "loss": 0.8359, + "step": 1490 + }, + { + "epoch": 0.4841833440929632, + "grad_norm": 0.47712287306785583, + "learning_rate": 0.0002, + "loss": 0.8446, + "step": 1500 + }, + { + "epoch": 0.48741123305358297, + "grad_norm": 0.4518495202064514, + "learning_rate": 0.0002, + "loss": 0.8518, + "step": 1510 + }, + { + "epoch": 0.49063912201420273, + "grad_norm": 0.4539008140563965, + "learning_rate": 0.0002, + "loss": 0.819, + "step": 1520 + }, + { + "epoch": 0.49386701097482244, + "grad_norm": 0.4993067979812622, + "learning_rate": 0.0002, + "loss": 0.8276, + "step": 1530 + }, + { + "epoch": 0.4970948999354422, + "grad_norm": 0.6094803214073181, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 1540 + }, + { + "epoch": 0.500322788896062, + "grad_norm": 0.48602527379989624, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 1550 + }, + { + "epoch": 0.5035506778566817, + "grad_norm": 0.40245795249938965, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 1560 + }, + { + "epoch": 0.5067785668173015, + "grad_norm": 0.456787645816803, + "learning_rate": 0.0002, + "loss": 0.7907, + "step": 1570 + }, + { + "epoch": 0.5100064557779213, + "grad_norm": 0.43936216831207275, + "learning_rate": 0.0002, + "loss": 0.86, + "step": 1580 + }, + { + "epoch": 0.513234344738541, + "grad_norm": 0.549018144607544, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 1590 + }, + { + "epoch": 0.5164622336991608, + "grad_norm": 0.41746795177459717, + "learning_rate": 0.0002, + "loss": 0.8169, + "step": 1600 + }, + { + "epoch": 0.5196901226597805, + "grad_norm": 0.4217053949832916, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 1610 + }, + { + "epoch": 0.5229180116204003, + "grad_norm": 0.449913889169693, + "learning_rate": 0.0002, + "loss": 0.8161, + "step": 1620 + }, + { + "epoch": 0.5261459005810201, + "grad_norm": 0.5084872245788574, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 1630 + }, + { + "epoch": 0.5293737895416397, + "grad_norm": 0.46248653531074524, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 1640 + }, + { + "epoch": 0.5326016785022595, + "grad_norm": 0.4824236035346985, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 1650 + }, + { + "epoch": 0.5358295674628792, + "grad_norm": 0.6010985374450684, + "learning_rate": 0.0002, + "loss": 0.8711, + "step": 1660 + }, + { + "epoch": 0.539057456423499, + "grad_norm": 0.4757920801639557, + "learning_rate": 0.0002, + "loss": 0.8266, + "step": 1670 + }, + { + "epoch": 0.5422853453841188, + "grad_norm": 0.45161882042884827, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 1680 + }, + { + "epoch": 0.5455132343447385, + "grad_norm": 0.49314990639686584, + "learning_rate": 0.0002, + "loss": 0.8141, + "step": 1690 + }, + { + "epoch": 0.5487411233053583, + "grad_norm": 0.3918305039405823, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 1700 + }, + { + "epoch": 0.551969012265978, + "grad_norm": 0.5966728925704956, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 1710 + }, + { + "epoch": 0.5551969012265978, + "grad_norm": 0.4208986163139343, + "learning_rate": 0.0002, + "loss": 0.8438, + "step": 1720 + }, + { + "epoch": 0.5584247901872176, + "grad_norm": 0.43724218010902405, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 1730 + }, + { + "epoch": 0.5616526791478373, + "grad_norm": 0.5287272930145264, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 1740 + }, + { + "epoch": 0.5648805681084571, + "grad_norm": 0.4961899518966675, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 1750 + }, + { + "epoch": 0.5681084570690769, + "grad_norm": 0.4468635320663452, + "learning_rate": 0.0002, + "loss": 0.8029, + "step": 1760 + }, + { + "epoch": 0.5713363460296966, + "grad_norm": 0.6423530578613281, + "learning_rate": 0.0002, + "loss": 0.7968, + "step": 1770 + }, + { + "epoch": 0.5745642349903163, + "grad_norm": 0.4601971507072449, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 1780 + }, + { + "epoch": 0.577792123950936, + "grad_norm": 0.46514901518821716, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 1790 + }, + { + "epoch": 0.5810200129115558, + "grad_norm": 0.4771687388420105, + "learning_rate": 0.0002, + "loss": 0.8186, + "step": 1800 + }, + { + "epoch": 0.5842479018721756, + "grad_norm": 0.46514490246772766, + "learning_rate": 0.0002, + "loss": 0.856, + "step": 1810 + }, + { + "epoch": 0.5874757908327953, + "grad_norm": 0.5373936295509338, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 1820 + }, + { + "epoch": 0.5907036797934151, + "grad_norm": 0.5175791382789612, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 1830 + }, + { + "epoch": 0.5939315687540349, + "grad_norm": 0.4522802233695984, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 1840 + }, + { + "epoch": 0.5971594577146546, + "grad_norm": 0.42987772822380066, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 1850 + }, + { + "epoch": 0.6003873466752744, + "grad_norm": 0.5566838383674622, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 1860 + }, + { + "epoch": 0.6036152356358941, + "grad_norm": 0.42807698249816895, + "learning_rate": 0.0002, + "loss": 0.8312, + "step": 1870 + }, + { + "epoch": 0.6068431245965139, + "grad_norm": 0.4957767724990845, + "learning_rate": 0.0002, + "loss": 0.8035, + "step": 1880 + }, + { + "epoch": 0.6100710135571337, + "grad_norm": 0.4260980188846588, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 1890 + }, + { + "epoch": 0.6132989025177534, + "grad_norm": 0.4777357876300812, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 1900 + }, + { + "epoch": 0.6165267914783732, + "grad_norm": 0.4434216022491455, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 1910 + }, + { + "epoch": 0.6197546804389928, + "grad_norm": 0.5215433835983276, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 1920 + }, + { + "epoch": 0.6229825693996126, + "grad_norm": 0.5143248438835144, + "learning_rate": 0.0002, + "loss": 0.82, + "step": 1930 + }, + { + "epoch": 0.6262104583602324, + "grad_norm": 0.5213413238525391, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 1940 + }, + { + "epoch": 0.6294383473208521, + "grad_norm": 0.5408226251602173, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 1950 + }, + { + "epoch": 0.6326662362814719, + "grad_norm": 0.5479708909988403, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 1960 + }, + { + "epoch": 0.6358941252420917, + "grad_norm": 0.4490949809551239, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 1970 + }, + { + "epoch": 0.6391220142027114, + "grad_norm": 0.48815059661865234, + "learning_rate": 0.0002, + "loss": 0.854, + "step": 1980 + }, + { + "epoch": 0.6423499031633312, + "grad_norm": 0.46498045325279236, + "learning_rate": 0.0002, + "loss": 0.8568, + "step": 1990 + }, + { + "epoch": 0.6455777921239509, + "grad_norm": 0.5136561393737793, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 2000 + }, + { + "epoch": 0.6488056810845707, + "grad_norm": 0.5145719647407532, + "learning_rate": 0.0002, + "loss": 0.8503, + "step": 2010 + }, + { + "epoch": 0.6520335700451905, + "grad_norm": 0.5430373549461365, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 2020 + }, + { + "epoch": 0.6552614590058102, + "grad_norm": 0.46347954869270325, + "learning_rate": 0.0002, + "loss": 0.8115, + "step": 2030 + }, + { + "epoch": 0.65848934796643, + "grad_norm": 0.5189562439918518, + "learning_rate": 0.0002, + "loss": 0.8769, + "step": 2040 + }, + { + "epoch": 0.6617172369270498, + "grad_norm": 0.43843990564346313, + "learning_rate": 0.0002, + "loss": 0.8453, + "step": 2050 + }, + { + "epoch": 0.6649451258876695, + "grad_norm": 0.4654983580112457, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 2060 + }, + { + "epoch": 0.6681730148482892, + "grad_norm": 0.44835716485977173, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 2070 + }, + { + "epoch": 0.6714009038089089, + "grad_norm": 0.38811734318733215, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2080 + }, + { + "epoch": 0.6746287927695287, + "grad_norm": 0.5709853172302246, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 2090 + }, + { + "epoch": 0.6778566817301485, + "grad_norm": 0.49994757771492004, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 2100 + }, + { + "epoch": 0.6810845706907682, + "grad_norm": 0.5505402684211731, + "learning_rate": 0.0002, + "loss": 0.8, + "step": 2110 + }, + { + "epoch": 0.684312459651388, + "grad_norm": 0.48195120692253113, + "learning_rate": 0.0002, + "loss": 0.8227, + "step": 2120 + }, + { + "epoch": 0.6875403486120077, + "grad_norm": 0.4854775071144104, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 2130 + }, + { + "epoch": 0.6907682375726275, + "grad_norm": 0.6422494649887085, + "learning_rate": 0.0002, + "loss": 0.8231, + "step": 2140 + }, + { + "epoch": 0.6939961265332473, + "grad_norm": 0.3972536027431488, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 2150 + }, + { + "epoch": 0.697224015493867, + "grad_norm": 0.4297836422920227, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 2160 + }, + { + "epoch": 0.7004519044544868, + "grad_norm": 0.45486778020858765, + "learning_rate": 0.0002, + "loss": 0.8017, + "step": 2170 + }, + { + "epoch": 0.7036797934151066, + "grad_norm": 0.4706047773361206, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 2180 + }, + { + "epoch": 0.7069076823757263, + "grad_norm": 0.46426892280578613, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 2190 + }, + { + "epoch": 0.7101355713363461, + "grad_norm": 0.46333715319633484, + "learning_rate": 0.0002, + "loss": 0.8472, + "step": 2200 + }, + { + "epoch": 0.7133634602969657, + "grad_norm": 0.4632524251937866, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 2210 + }, + { + "epoch": 0.7165913492575855, + "grad_norm": 0.4610830843448639, + "learning_rate": 0.0002, + "loss": 0.8452, + "step": 2220 + }, + { + "epoch": 0.7198192382182053, + "grad_norm": 0.4905324876308441, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 2230 + }, + { + "epoch": 0.723047127178825, + "grad_norm": 0.4936263859272003, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 2240 + }, + { + "epoch": 0.7262750161394448, + "grad_norm": 0.40778425335884094, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 2250 + }, + { + "epoch": 0.7295029051000645, + "grad_norm": 0.50351482629776, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 2260 + }, + { + "epoch": 0.7327307940606843, + "grad_norm": 0.4894128143787384, + "learning_rate": 0.0002, + "loss": 0.8475, + "step": 2270 + }, + { + "epoch": 0.7359586830213041, + "grad_norm": 0.5580906271934509, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 2280 + }, + { + "epoch": 0.7391865719819238, + "grad_norm": 0.4655369520187378, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 2290 + }, + { + "epoch": 0.7424144609425436, + "grad_norm": 0.4666965901851654, + "learning_rate": 0.0002, + "loss": 0.8395, + "step": 2300 + }, + { + "epoch": 0.7456423499031634, + "grad_norm": 0.46259936690330505, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 2310 + }, + { + "epoch": 0.7488702388637831, + "grad_norm": 0.520706832408905, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 2320 + }, + { + "epoch": 0.7520981278244029, + "grad_norm": 0.5142408013343811, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 2330 + }, + { + "epoch": 0.7553260167850226, + "grad_norm": 0.5355164408683777, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 2340 + }, + { + "epoch": 0.7585539057456423, + "grad_norm": 0.5517185926437378, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 2350 + }, + { + "epoch": 0.7617817947062621, + "grad_norm": 0.7162677049636841, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 2360 + }, + { + "epoch": 0.7650096836668818, + "grad_norm": 0.42402133345603943, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 2370 + }, + { + "epoch": 0.7682375726275016, + "grad_norm": 0.47180113196372986, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 2380 + }, + { + "epoch": 0.7714654615881213, + "grad_norm": 0.6262288689613342, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 2390 + }, + { + "epoch": 0.7746933505487411, + "grad_norm": 0.5177528262138367, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 2400 + }, + { + "epoch": 0.7779212395093609, + "grad_norm": 0.555721640586853, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 2410 + }, + { + "epoch": 0.7811491284699806, + "grad_norm": 0.5592644810676575, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 2420 + }, + { + "epoch": 0.7843770174306004, + "grad_norm": 0.38025397062301636, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 2430 + }, + { + "epoch": 0.7876049063912202, + "grad_norm": 0.4597472548484802, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 2440 + }, + { + "epoch": 0.7908327953518399, + "grad_norm": 0.4929825961589813, + "learning_rate": 0.0002, + "loss": 0.8575, + "step": 2450 + }, + { + "epoch": 0.7940606843124597, + "grad_norm": 0.45277655124664307, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 2460 + }, + { + "epoch": 0.7972885732730794, + "grad_norm": 0.6224122643470764, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2470 + }, + { + "epoch": 0.8005164622336992, + "grad_norm": 0.5740901827812195, + "learning_rate": 0.0002, + "loss": 0.8449, + "step": 2480 + }, + { + "epoch": 0.8037443511943189, + "grad_norm": 0.41335329413414, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 2490 + }, + { + "epoch": 0.8069722401549386, + "grad_norm": 0.4738694131374359, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 2500 + }, + { + "epoch": 0.8102001291155584, + "grad_norm": 0.5288197994232178, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 2510 + }, + { + "epoch": 0.8134280180761781, + "grad_norm": 0.5404666066169739, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 2520 + }, + { + "epoch": 0.8166559070367979, + "grad_norm": 0.4444909691810608, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 2530 + }, + { + "epoch": 0.8198837959974177, + "grad_norm": 0.542061448097229, + "learning_rate": 0.0002, + "loss": 0.8683, + "step": 2540 + }, + { + "epoch": 0.8231116849580374, + "grad_norm": 0.4914741814136505, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 2550 + }, + { + "epoch": 0.8263395739186572, + "grad_norm": 0.41703441739082336, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 2560 + }, + { + "epoch": 0.829567462879277, + "grad_norm": 0.5489841103553772, + "learning_rate": 0.0002, + "loss": 0.824, + "step": 2570 + }, + { + "epoch": 0.8327953518398967, + "grad_norm": 0.5359883308410645, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 2580 + }, + { + "epoch": 0.8360232408005165, + "grad_norm": 0.5541019439697266, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 2590 + }, + { + "epoch": 0.8392511297611362, + "grad_norm": 0.4746638834476471, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 2600 + }, + { + "epoch": 0.842479018721756, + "grad_norm": 0.5243194103240967, + "learning_rate": 0.0002, + "loss": 0.8116, + "step": 2610 + }, + { + "epoch": 0.8457069076823758, + "grad_norm": 0.46824976801872253, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 2620 + }, + { + "epoch": 0.8489347966429954, + "grad_norm": 0.49487847089767456, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 2630 + }, + { + "epoch": 0.8521626856036152, + "grad_norm": 0.42180097103118896, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 2640 + }, + { + "epoch": 0.855390574564235, + "grad_norm": 0.5516560077667236, + "learning_rate": 0.0002, + "loss": 0.8304, + "step": 2650 + }, + { + "epoch": 0.8586184635248547, + "grad_norm": 0.4392191767692566, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 2660 + }, + { + "epoch": 0.8618463524854745, + "grad_norm": 0.5387210845947266, + "learning_rate": 0.0002, + "loss": 0.848, + "step": 2670 + }, + { + "epoch": 0.8650742414460942, + "grad_norm": 0.6232406497001648, + "learning_rate": 0.0002, + "loss": 0.8094, + "step": 2680 + }, + { + "epoch": 0.868302130406714, + "grad_norm": 0.53749018907547, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 2690 + }, + { + "epoch": 0.8715300193673338, + "grad_norm": 0.47480374574661255, + "learning_rate": 0.0002, + "loss": 0.8299, + "step": 2700 + }, + { + "epoch": 0.8747579083279535, + "grad_norm": 0.44618046283721924, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 2710 + }, + { + "epoch": 0.8779857972885733, + "grad_norm": 0.4173581302165985, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 2720 + }, + { + "epoch": 0.881213686249193, + "grad_norm": 0.524081289768219, + "learning_rate": 0.0002, + "loss": 0.7713, + "step": 2730 + }, + { + "epoch": 0.8844415752098128, + "grad_norm": 0.5608431100845337, + "learning_rate": 0.0002, + "loss": 0.8738, + "step": 2740 + }, + { + "epoch": 0.8876694641704326, + "grad_norm": 0.5212284922599792, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 2750 + }, + { + "epoch": 0.8908973531310523, + "grad_norm": 0.5601475834846497, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 2760 + }, + { + "epoch": 0.8941252420916721, + "grad_norm": 0.4499223828315735, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 2770 + }, + { + "epoch": 0.8973531310522918, + "grad_norm": 0.46945226192474365, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 2780 + }, + { + "epoch": 0.9005810200129115, + "grad_norm": 0.4837495684623718, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 2790 + }, + { + "epoch": 0.9038089089735313, + "grad_norm": 0.5059258937835693, + "learning_rate": 0.0002, + "loss": 0.7887, + "step": 2800 + }, + { + "epoch": 0.907036797934151, + "grad_norm": 0.4857945144176483, + "learning_rate": 0.0002, + "loss": 0.8571, + "step": 2810 + }, + { + "epoch": 0.9102646868947708, + "grad_norm": 0.5001962780952454, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 2820 + }, + { + "epoch": 0.9134925758553906, + "grad_norm": 0.5468648672103882, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 2830 + }, + { + "epoch": 0.9167204648160103, + "grad_norm": 0.5533056259155273, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 2840 + }, + { + "epoch": 0.9199483537766301, + "grad_norm": 0.5909785628318787, + "learning_rate": 0.0002, + "loss": 0.7895, + "step": 2850 + }, + { + "epoch": 0.9231762427372499, + "grad_norm": 0.47428104281425476, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 2860 + }, + { + "epoch": 0.9264041316978696, + "grad_norm": 0.548814058303833, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2870 + }, + { + "epoch": 0.9296320206584894, + "grad_norm": 0.5576745271682739, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 2880 + }, + { + "epoch": 0.9328599096191091, + "grad_norm": 0.47094792127609253, + "learning_rate": 0.0002, + "loss": 0.8399, + "step": 2890 + }, + { + "epoch": 0.9360877985797289, + "grad_norm": 0.5408539772033691, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 2900 + }, + { + "epoch": 0.9393156875403487, + "grad_norm": 0.5922889113426208, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 2910 + }, + { + "epoch": 0.9425435765009683, + "grad_norm": 0.45462584495544434, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2920 + }, + { + "epoch": 0.9457714654615881, + "grad_norm": 0.6864947080612183, + "learning_rate": 0.0002, + "loss": 0.8344, + "step": 2930 + }, + { + "epoch": 0.9489993544222078, + "grad_norm": 0.4706299304962158, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 2940 + }, + { + "epoch": 0.9522272433828276, + "grad_norm": 0.5583269596099854, + "learning_rate": 0.0002, + "loss": 0.8422, + "step": 2950 + }, + { + "epoch": 0.9554551323434474, + "grad_norm": 0.51015704870224, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 2960 + }, + { + "epoch": 0.9586830213040671, + "grad_norm": 0.5325582027435303, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 2970 + }, + { + "epoch": 0.9619109102646869, + "grad_norm": 0.49008598923683167, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 2980 + }, + { + "epoch": 0.9651387992253067, + "grad_norm": 0.4422132074832916, + "learning_rate": 0.0002, + "loss": 0.8093, + "step": 2990 + }, + { + "epoch": 0.9683666881859264, + "grad_norm": 0.5053589344024658, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 3000 + }, + { + "epoch": 0.9715945771465462, + "grad_norm": 0.46754521131515503, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 3010 + }, + { + "epoch": 0.9748224661071659, + "grad_norm": 0.5613434910774231, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 3020 + }, + { + "epoch": 0.9780503550677857, + "grad_norm": 0.5052843689918518, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 3030 + }, + { + "epoch": 0.9812782440284055, + "grad_norm": 0.4270972013473511, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 3040 + }, + { + "epoch": 0.9845061329890252, + "grad_norm": 0.4974991977214813, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 3050 + }, + { + "epoch": 0.9877340219496449, + "grad_norm": 0.4432311952114105, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 3060 + }, + { + "epoch": 0.9909619109102646, + "grad_norm": 0.466457724571228, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 3070 + }, + { + "epoch": 0.9941897998708844, + "grad_norm": 0.6438009142875671, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 3080 + }, + { + "epoch": 0.9974176888315042, + "grad_norm": 0.5593604445457458, + "learning_rate": 0.0002, + "loss": 0.8425, + "step": 3090 + }, + { + "epoch": 1.0, + "eval_loss": 1.0958120822906494, + "eval_runtime": 148.3273, + "eval_samples_per_second": 4.942, + "eval_steps_per_second": 0.62, + "step": 3098 + }, + { + "epoch": 1.000645577792124, + "grad_norm": 0.5701445937156677, + "learning_rate": 0.0002, + "loss": 0.8275, + "step": 3100 + }, + { + "epoch": 1.0038734667527438, + "grad_norm": 0.6089657545089722, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 3110 + }, + { + "epoch": 1.0071013557133635, + "grad_norm": 0.5619552135467529, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 3120 + }, + { + "epoch": 1.010329244673983, + "grad_norm": 0.5550283789634705, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 3130 + }, + { + "epoch": 1.013557133634603, + "grad_norm": 0.6221792101860046, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3140 + }, + { + "epoch": 1.0167850225952226, + "grad_norm": 0.5450758934020996, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 3150 + }, + { + "epoch": 1.0200129115558425, + "grad_norm": 0.4359588027000427, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 3160 + }, + { + "epoch": 1.0232408005164622, + "grad_norm": 0.5932239890098572, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 3170 + }, + { + "epoch": 1.026468689477082, + "grad_norm": 0.45478707551956177, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 3180 + }, + { + "epoch": 1.0296965784377017, + "grad_norm": 0.677615761756897, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 3190 + }, + { + "epoch": 1.0329244673983216, + "grad_norm": 0.6231790781021118, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3200 + }, + { + "epoch": 1.0361523563589412, + "grad_norm": 0.5074195861816406, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 3210 + }, + { + "epoch": 1.039380245319561, + "grad_norm": 0.4844142198562622, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 3220 + }, + { + "epoch": 1.0426081342801807, + "grad_norm": 0.5372750759124756, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 3230 + }, + { + "epoch": 1.0458360232408006, + "grad_norm": 0.46296265721321106, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 3240 + }, + { + "epoch": 1.0490639122014203, + "grad_norm": 0.5417148470878601, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 3250 + }, + { + "epoch": 1.0522918011620401, + "grad_norm": 0.5695074200630188, + "learning_rate": 0.0002, + "loss": 0.7637, + "step": 3260 + }, + { + "epoch": 1.0555196901226598, + "grad_norm": 0.5050092935562134, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 3270 + }, + { + "epoch": 1.0587475790832794, + "grad_norm": 0.5320752263069153, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 3280 + }, + { + "epoch": 1.0619754680438993, + "grad_norm": 0.5832052230834961, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 3290 + }, + { + "epoch": 1.065203357004519, + "grad_norm": 0.5228804349899292, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 3300 + }, + { + "epoch": 1.0684312459651388, + "grad_norm": 0.5819445252418518, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 3310 + }, + { + "epoch": 1.0716591349257585, + "grad_norm": 0.4201328754425049, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 3320 + }, + { + "epoch": 1.0748870238863784, + "grad_norm": 0.5424145460128784, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 3330 + }, + { + "epoch": 1.078114912846998, + "grad_norm": 0.6169946789741516, + "learning_rate": 0.0002, + "loss": 0.7828, + "step": 3340 + }, + { + "epoch": 1.0813428018076179, + "grad_norm": 0.607676088809967, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 3350 + }, + { + "epoch": 1.0845706907682375, + "grad_norm": 0.5191982388496399, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 3360 + }, + { + "epoch": 1.0877985797288574, + "grad_norm": 0.5728003978729248, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 3370 + }, + { + "epoch": 1.091026468689477, + "grad_norm": 0.5402643084526062, + "learning_rate": 0.0002, + "loss": 0.7381, + "step": 3380 + }, + { + "epoch": 1.094254357650097, + "grad_norm": 0.5377541780471802, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 3390 + }, + { + "epoch": 1.0974822466107166, + "grad_norm": 0.4751385748386383, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 3400 + }, + { + "epoch": 1.1007101355713362, + "grad_norm": 0.559158444404602, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 3410 + }, + { + "epoch": 1.103938024531956, + "grad_norm": 0.4917701482772827, + "learning_rate": 0.0002, + "loss": 0.7366, + "step": 3420 + }, + { + "epoch": 1.1071659134925758, + "grad_norm": 0.5507875084877014, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 3430 + }, + { + "epoch": 1.1103938024531956, + "grad_norm": 0.45458680391311646, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 3440 + }, + { + "epoch": 1.1136216914138153, + "grad_norm": 0.5721744894981384, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 3450 + }, + { + "epoch": 1.1168495803744352, + "grad_norm": 0.5776081681251526, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 3460 + }, + { + "epoch": 1.1200774693350548, + "grad_norm": 0.5261953473091125, + "learning_rate": 0.0002, + "loss": 0.7644, + "step": 3470 + }, + { + "epoch": 1.1233053582956747, + "grad_norm": 0.47759532928466797, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 3480 + }, + { + "epoch": 1.1265332472562943, + "grad_norm": 0.5697659850120544, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 3490 + }, + { + "epoch": 1.1297611362169142, + "grad_norm": 0.5643419623374939, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 3500 + }, + { + "epoch": 1.1329890251775339, + "grad_norm": 0.6502931118011475, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 3510 + }, + { + "epoch": 1.1362169141381537, + "grad_norm": 0.5236507654190063, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 3520 + }, + { + "epoch": 1.1394448030987734, + "grad_norm": 0.6521499156951904, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 3530 + }, + { + "epoch": 1.142672692059393, + "grad_norm": 0.5893217325210571, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 3540 + }, + { + "epoch": 1.145900581020013, + "grad_norm": 0.5300073027610779, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 3550 + }, + { + "epoch": 1.1491284699806328, + "grad_norm": 0.6794660091400146, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 3560 + }, + { + "epoch": 1.1523563589412524, + "grad_norm": 0.5420064926147461, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 3570 + }, + { + "epoch": 1.155584247901872, + "grad_norm": 0.5096590518951416, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 3580 + }, + { + "epoch": 1.158812136862492, + "grad_norm": 0.5726043581962585, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 3590 + }, + { + "epoch": 1.1620400258231116, + "grad_norm": 0.7388110160827637, + "learning_rate": 0.0002, + "loss": 0.7728, + "step": 3600 + }, + { + "epoch": 1.1652679147837315, + "grad_norm": 0.5597969889640808, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 3610 + }, + { + "epoch": 1.1684958037443511, + "grad_norm": 0.5067800283432007, + "learning_rate": 0.0002, + "loss": 0.7132, + "step": 3620 + }, + { + "epoch": 1.171723692704971, + "grad_norm": 0.6625118255615234, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 3630 + }, + { + "epoch": 1.1749515816655907, + "grad_norm": 0.5830849409103394, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 3640 + }, + { + "epoch": 1.1781794706262105, + "grad_norm": 0.6140692830085754, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 3650 + }, + { + "epoch": 1.1814073595868302, + "grad_norm": 0.714523434638977, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 3660 + }, + { + "epoch": 1.18463524854745, + "grad_norm": 0.5196696519851685, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 3670 + }, + { + "epoch": 1.1878631375080697, + "grad_norm": 0.6677889823913574, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 3680 + }, + { + "epoch": 1.1910910264686896, + "grad_norm": 0.47095245122909546, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 3690 + }, + { + "epoch": 1.1943189154293092, + "grad_norm": 0.5197778940200806, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 3700 + }, + { + "epoch": 1.1975468043899289, + "grad_norm": 0.5156530141830444, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 3710 + }, + { + "epoch": 1.2007746933505488, + "grad_norm": 0.6968549489974976, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 3720 + }, + { + "epoch": 1.2040025823111684, + "grad_norm": 0.48983848094940186, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 3730 + }, + { + "epoch": 1.2072304712717883, + "grad_norm": 0.6709973216056824, + "learning_rate": 0.0002, + "loss": 0.7163, + "step": 3740 + }, + { + "epoch": 1.210458360232408, + "grad_norm": 0.48681750893592834, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 3750 + }, + { + "epoch": 1.2136862491930278, + "grad_norm": 0.49475061893463135, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 3760 + }, + { + "epoch": 1.2169141381536475, + "grad_norm": 0.6163983345031738, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 3770 + }, + { + "epoch": 1.2201420271142673, + "grad_norm": 0.5481411218643188, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 3780 + }, + { + "epoch": 1.223369916074887, + "grad_norm": 0.620639979839325, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 3790 + }, + { + "epoch": 1.2265978050355069, + "grad_norm": 0.7017222046852112, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 3800 + }, + { + "epoch": 1.2298256939961265, + "grad_norm": 0.5872400403022766, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 3810 + }, + { + "epoch": 1.2330535829567464, + "grad_norm": 0.45765596628189087, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 3820 + }, + { + "epoch": 1.236281471917366, + "grad_norm": 0.5676377415657043, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 3830 + }, + { + "epoch": 1.2395093608779857, + "grad_norm": 0.4793425500392914, + "learning_rate": 0.0002, + "loss": 0.7696, + "step": 3840 + }, + { + "epoch": 1.2427372498386056, + "grad_norm": 0.5060022473335266, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 3850 + }, + { + "epoch": 1.2459651387992252, + "grad_norm": 0.6140682697296143, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 3860 + }, + { + "epoch": 1.249193027759845, + "grad_norm": 0.5030326843261719, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 3870 + }, + { + "epoch": 1.2524209167204647, + "grad_norm": 0.6609430909156799, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 3880 + }, + { + "epoch": 1.2556488056810846, + "grad_norm": 0.5459545850753784, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 3890 + }, + { + "epoch": 1.2588766946417043, + "grad_norm": 0.5328870415687561, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 3900 + }, + { + "epoch": 1.2621045836023241, + "grad_norm": 0.5840652585029602, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 3910 + }, + { + "epoch": 1.2653324725629438, + "grad_norm": 0.5587584376335144, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 3920 + }, + { + "epoch": 1.2685603615235637, + "grad_norm": 0.5886949896812439, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 3930 + }, + { + "epoch": 1.2717882504841833, + "grad_norm": 0.5128693580627441, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 3940 + }, + { + "epoch": 1.2750161394448032, + "grad_norm": 0.6207669377326965, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 3950 + }, + { + "epoch": 1.2782440284054228, + "grad_norm": 0.5789574384689331, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 3960 + }, + { + "epoch": 1.2814719173660425, + "grad_norm": 0.503162145614624, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 3970 + }, + { + "epoch": 1.2846998063266624, + "grad_norm": 0.6670064926147461, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 3980 + }, + { + "epoch": 1.2879276952872822, + "grad_norm": 0.5676213502883911, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 3990 + }, + { + "epoch": 1.2911555842479019, + "grad_norm": 0.5383169054985046, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 4000 + }, + { + "epoch": 1.2943834732085215, + "grad_norm": 0.714743971824646, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 4010 + }, + { + "epoch": 1.2976113621691414, + "grad_norm": 0.5740262269973755, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 4020 + }, + { + "epoch": 1.300839251129761, + "grad_norm": 0.6143045425415039, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 4030 + }, + { + "epoch": 1.304067140090381, + "grad_norm": 0.501025378704071, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 4040 + }, + { + "epoch": 1.3072950290510006, + "grad_norm": 0.5784100294113159, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 4050 + }, + { + "epoch": 1.3105229180116205, + "grad_norm": 0.6182606220245361, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 4060 + }, + { + "epoch": 1.3137508069722401, + "grad_norm": 0.5072231292724609, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 4070 + }, + { + "epoch": 1.31697869593286, + "grad_norm": 0.6841012835502625, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 4080 + }, + { + "epoch": 1.3202065848934796, + "grad_norm": 0.697257936000824, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 4090 + }, + { + "epoch": 1.3234344738540993, + "grad_norm": 0.5113214254379272, + "learning_rate": 0.0002, + "loss": 0.7401, + "step": 4100 + }, + { + "epoch": 1.3266623628147192, + "grad_norm": 0.6270561814308167, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 4110 + }, + { + "epoch": 1.329890251775339, + "grad_norm": 0.5525947213172913, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 4120 + }, + { + "epoch": 1.3331181407359587, + "grad_norm": 0.546071469783783, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 4130 + }, + { + "epoch": 1.3363460296965783, + "grad_norm": 0.6516721248626709, + "learning_rate": 0.0002, + "loss": 0.7884, + "step": 4140 + }, + { + "epoch": 1.3395739186571982, + "grad_norm": 0.6235111355781555, + "learning_rate": 0.0002, + "loss": 0.755, + "step": 4150 + }, + { + "epoch": 1.3428018076178179, + "grad_norm": 0.538649320602417, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 4160 + }, + { + "epoch": 1.3460296965784377, + "grad_norm": 0.5367001891136169, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 4170 + }, + { + "epoch": 1.3492575855390574, + "grad_norm": 0.6134631037712097, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 4180 + }, + { + "epoch": 1.3524854744996773, + "grad_norm": 0.5827262997627258, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 4190 + }, + { + "epoch": 1.355713363460297, + "grad_norm": 0.5706096291542053, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 4200 + }, + { + "epoch": 1.3589412524209168, + "grad_norm": 0.6422057151794434, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 4210 + }, + { + "epoch": 1.3621691413815364, + "grad_norm": 0.6316141486167908, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 4220 + }, + { + "epoch": 1.365397030342156, + "grad_norm": 0.6946983933448792, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 4230 + }, + { + "epoch": 1.368624919302776, + "grad_norm": 0.5381525754928589, + "learning_rate": 0.0002, + "loss": 0.7388, + "step": 4240 + }, + { + "epoch": 1.3718528082633958, + "grad_norm": 0.5484845638275146, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 4250 + }, + { + "epoch": 1.3750806972240155, + "grad_norm": 0.5961896777153015, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 4260 + }, + { + "epoch": 1.3783085861846351, + "grad_norm": 0.6041752696037292, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 4270 + }, + { + "epoch": 1.381536475145255, + "grad_norm": 0.6283464431762695, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 4280 + }, + { + "epoch": 1.384764364105875, + "grad_norm": 0.6761324405670166, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 4290 + }, + { + "epoch": 1.3879922530664945, + "grad_norm": 0.504311203956604, + "learning_rate": 0.0002, + "loss": 0.7381, + "step": 4300 + }, + { + "epoch": 1.3912201420271142, + "grad_norm": 0.6100395917892456, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 4310 + }, + { + "epoch": 1.394448030987734, + "grad_norm": 0.6245788335800171, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 4320 + }, + { + "epoch": 1.3976759199483537, + "grad_norm": 0.6074621081352234, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 4330 + }, + { + "epoch": 1.4009038089089736, + "grad_norm": 0.6683838963508606, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 4340 + }, + { + "epoch": 1.4041316978695932, + "grad_norm": 0.622998058795929, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 4350 + }, + { + "epoch": 1.4073595868302131, + "grad_norm": 0.6089423894882202, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 4360 + }, + { + "epoch": 1.4105874757908328, + "grad_norm": 0.6381658911705017, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 4370 + }, + { + "epoch": 1.4138153647514526, + "grad_norm": 0.5419308543205261, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4380 + }, + { + "epoch": 1.4170432537120723, + "grad_norm": 0.6026232242584229, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 4390 + }, + { + "epoch": 1.420271142672692, + "grad_norm": 0.4911101162433624, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 4400 + }, + { + "epoch": 1.4234990316333118, + "grad_norm": 0.6302908062934875, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 4410 + }, + { + "epoch": 1.4267269205939317, + "grad_norm": 0.6692768931388855, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 4420 + }, + { + "epoch": 1.4299548095545513, + "grad_norm": 0.46294572949409485, + "learning_rate": 0.0002, + "loss": 0.7312, + "step": 4430 + }, + { + "epoch": 1.433182698515171, + "grad_norm": 0.5452619194984436, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 4440 + }, + { + "epoch": 1.4364105874757909, + "grad_norm": 0.7809233069419861, + "learning_rate": 0.0002, + "loss": 0.7974, + "step": 4450 + }, + { + "epoch": 1.4396384764364105, + "grad_norm": 0.550088107585907, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 4460 + }, + { + "epoch": 1.4428663653970304, + "grad_norm": 0.7139151096343994, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 4470 + }, + { + "epoch": 1.44609425435765, + "grad_norm": 0.6187090873718262, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 4480 + }, + { + "epoch": 1.44932214331827, + "grad_norm": 0.5948249101638794, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 4490 + }, + { + "epoch": 1.4525500322788896, + "grad_norm": 0.6510892510414124, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 4500 + }, + { + "epoch": 1.4557779212395094, + "grad_norm": 0.6552293300628662, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 4510 + }, + { + "epoch": 1.459005810200129, + "grad_norm": 0.585574209690094, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 4520 + }, + { + "epoch": 1.4622336991607487, + "grad_norm": 0.4830162823200226, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 4530 + }, + { + "epoch": 1.4654615881213686, + "grad_norm": 0.5780223608016968, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 4540 + }, + { + "epoch": 1.4686894770819885, + "grad_norm": 0.5462607145309448, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 4550 + }, + { + "epoch": 1.4719173660426081, + "grad_norm": 0.5183546543121338, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 4560 + }, + { + "epoch": 1.4751452550032278, + "grad_norm": 0.676917552947998, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 4570 + }, + { + "epoch": 1.4783731439638477, + "grad_norm": 0.5772345066070557, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 4580 + }, + { + "epoch": 1.4816010329244673, + "grad_norm": 0.7320035696029663, + "learning_rate": 0.0002, + "loss": 0.7709, + "step": 4590 + }, + { + "epoch": 1.4848289218850872, + "grad_norm": 0.5024042129516602, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 4600 + }, + { + "epoch": 1.4880568108457068, + "grad_norm": 0.5482868552207947, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 4610 + }, + { + "epoch": 1.4912846998063267, + "grad_norm": 0.5447399616241455, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 4620 + }, + { + "epoch": 1.4945125887669464, + "grad_norm": 0.5953414440155029, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 4630 + }, + { + "epoch": 1.4977404777275662, + "grad_norm": 0.6983066201210022, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 4640 + }, + { + "epoch": 1.500968366688186, + "grad_norm": 0.586327075958252, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 4650 + }, + { + "epoch": 1.5041962556488055, + "grad_norm": 0.5839682221412659, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 4660 + }, + { + "epoch": 1.5074241446094254, + "grad_norm": 0.5959209203720093, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 4670 + }, + { + "epoch": 1.5106520335700453, + "grad_norm": 0.5073857307434082, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 4680 + }, + { + "epoch": 1.513879922530665, + "grad_norm": 0.5183001160621643, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 4690 + }, + { + "epoch": 1.5171078114912846, + "grad_norm": 0.593530535697937, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 4700 + }, + { + "epoch": 1.5203357004519045, + "grad_norm": 0.675993025302887, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 4710 + }, + { + "epoch": 1.5235635894125243, + "grad_norm": 0.5823286771774292, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 4720 + }, + { + "epoch": 1.526791478373144, + "grad_norm": 0.5825035572052002, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 4730 + }, + { + "epoch": 1.5300193673337636, + "grad_norm": 0.5689691305160522, + "learning_rate": 0.0002, + "loss": 0.8287, + "step": 4740 + }, + { + "epoch": 1.5332472562943835, + "grad_norm": 0.6037150621414185, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 4750 + }, + { + "epoch": 1.5364751452550034, + "grad_norm": 0.6393677592277527, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 4760 + }, + { + "epoch": 1.539703034215623, + "grad_norm": 0.5926381945610046, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 4770 + }, + { + "epoch": 1.5429309231762427, + "grad_norm": 0.9468599557876587, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 4780 + }, + { + "epoch": 1.5461588121368623, + "grad_norm": 0.7544237375259399, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 4790 + }, + { + "epoch": 1.5493867010974822, + "grad_norm": 0.5308566093444824, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 4800 + }, + { + "epoch": 1.552614590058102, + "grad_norm": 0.6590296030044556, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 4810 + }, + { + "epoch": 1.5558424790187217, + "grad_norm": 0.5630404353141785, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 4820 + }, + { + "epoch": 1.5590703679793414, + "grad_norm": 0.6800200939178467, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 4830 + }, + { + "epoch": 1.5622982569399613, + "grad_norm": 0.5463718175888062, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 4840 + }, + { + "epoch": 1.5655261459005811, + "grad_norm": 0.505135178565979, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 4850 + }, + { + "epoch": 1.5687540348612008, + "grad_norm": 0.5469676852226257, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 4860 + }, + { + "epoch": 1.5719819238218204, + "grad_norm": 0.5318337678909302, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 4870 + }, + { + "epoch": 1.5752098127824403, + "grad_norm": 0.7287914752960205, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 4880 + }, + { + "epoch": 1.5784377017430602, + "grad_norm": 0.7318989038467407, + "learning_rate": 0.0002, + "loss": 0.7532, + "step": 4890 + }, + { + "epoch": 1.5816655907036798, + "grad_norm": 0.6499921679496765, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 4900 + }, + { + "epoch": 1.5848934796642995, + "grad_norm": 0.47907355427742004, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 4910 + }, + { + "epoch": 1.5881213686249191, + "grad_norm": 0.7338833808898926, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 4920 + }, + { + "epoch": 1.591349257585539, + "grad_norm": 0.5800719261169434, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 4930 + }, + { + "epoch": 1.594577146546159, + "grad_norm": 0.5365763306617737, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 4940 + }, + { + "epoch": 1.5978050355067785, + "grad_norm": 0.5800772309303284, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 4950 + }, + { + "epoch": 1.6010329244673982, + "grad_norm": 0.7878010869026184, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 4960 + }, + { + "epoch": 1.604260813428018, + "grad_norm": 0.5919058918952942, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 4970 + }, + { + "epoch": 1.607488702388638, + "grad_norm": 0.5004435181617737, + "learning_rate": 0.0002, + "loss": 0.7762, + "step": 4980 + }, + { + "epoch": 1.6107165913492576, + "grad_norm": 0.6299242377281189, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 4990 + }, + { + "epoch": 1.6139444803098772, + "grad_norm": 0.6307242512702942, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 5000 + }, + { + "epoch": 1.6171723692704971, + "grad_norm": 0.7838703989982605, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 5010 + }, + { + "epoch": 1.620400258231117, + "grad_norm": 0.6454671621322632, + "learning_rate": 0.0002, + "loss": 0.7364, + "step": 5020 + }, + { + "epoch": 1.6236281471917366, + "grad_norm": 0.5907095670700073, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 5030 + }, + { + "epoch": 1.6268560361523563, + "grad_norm": 0.6053501963615417, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 5040 + }, + { + "epoch": 1.630083925112976, + "grad_norm": 0.5644670128822327, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 5050 + }, + { + "epoch": 1.6333118140735958, + "grad_norm": 0.6320949792861938, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 5060 + }, + { + "epoch": 1.6365397030342157, + "grad_norm": 0.6101489067077637, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 5070 + }, + { + "epoch": 1.6397675919948353, + "grad_norm": 0.9435283541679382, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 5080 + }, + { + "epoch": 1.642995480955455, + "grad_norm": 0.6668919324874878, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 5090 + }, + { + "epoch": 1.6462233699160749, + "grad_norm": 0.6160340905189514, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 5100 + }, + { + "epoch": 1.6494512588766947, + "grad_norm": 0.5999835729598999, + "learning_rate": 0.0002, + "loss": 0.7461, + "step": 5110 + }, + { + "epoch": 1.6526791478373144, + "grad_norm": 0.9378551840782166, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 5120 + }, + { + "epoch": 1.655907036797934, + "grad_norm": 0.4795055389404297, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 5130 + }, + { + "epoch": 1.659134925758554, + "grad_norm": 0.4878861606121063, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 5140 + }, + { + "epoch": 1.6623628147191738, + "grad_norm": 0.6042965054512024, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 5150 + }, + { + "epoch": 1.6655907036797934, + "grad_norm": 0.5829901695251465, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 5160 + }, + { + "epoch": 1.668818592640413, + "grad_norm": 0.5168480277061462, + "learning_rate": 0.0002, + "loss": 0.7498, + "step": 5170 + }, + { + "epoch": 1.672046481601033, + "grad_norm": 0.6489511132240295, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 5180 + }, + { + "epoch": 1.6752743705616526, + "grad_norm": 0.5955966114997864, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 5190 + }, + { + "epoch": 1.6785022595222725, + "grad_norm": 0.6228088140487671, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 5200 + }, + { + "epoch": 1.6817301484828922, + "grad_norm": 0.5726390480995178, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 5210 + }, + { + "epoch": 1.6849580374435118, + "grad_norm": 0.6116343140602112, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 5220 + }, + { + "epoch": 1.6881859264041317, + "grad_norm": 0.5483687520027161, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 5230 + }, + { + "epoch": 1.6914138153647515, + "grad_norm": 0.570941686630249, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 5240 + }, + { + "epoch": 1.6946417043253712, + "grad_norm": 0.6048086285591125, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 5250 + }, + { + "epoch": 1.6978695932859909, + "grad_norm": 0.6769003868103027, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 5260 + }, + { + "epoch": 1.7010974822466107, + "grad_norm": 0.5629057884216309, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 5270 + }, + { + "epoch": 1.7043253712072306, + "grad_norm": 0.657341480255127, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 5280 + }, + { + "epoch": 1.7075532601678503, + "grad_norm": 0.6256147623062134, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 5290 + }, + { + "epoch": 1.71078114912847, + "grad_norm": 0.5498088002204895, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 5300 + }, + { + "epoch": 1.7140090380890898, + "grad_norm": 0.5078358054161072, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 5310 + }, + { + "epoch": 1.7172369270497096, + "grad_norm": 0.6696692705154419, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 5320 + }, + { + "epoch": 1.7204648160103293, + "grad_norm": 0.6692847013473511, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 5330 + }, + { + "epoch": 1.723692704970949, + "grad_norm": 0.5415751934051514, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 5340 + }, + { + "epoch": 1.7269205939315686, + "grad_norm": 0.5367611050605774, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 5350 + }, + { + "epoch": 1.7301484828921885, + "grad_norm": 0.7321061491966248, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 5360 + }, + { + "epoch": 1.7333763718528084, + "grad_norm": 0.723972499370575, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5370 + }, + { + "epoch": 1.736604260813428, + "grad_norm": 0.7328100204467773, + "learning_rate": 0.0002, + "loss": 0.7077, + "step": 5380 + }, + { + "epoch": 1.7398321497740477, + "grad_norm": 0.5785264372825623, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 5390 + }, + { + "epoch": 1.7430600387346675, + "grad_norm": 0.7812932133674622, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 5400 + }, + { + "epoch": 1.7462879276952874, + "grad_norm": 0.6493327617645264, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 5410 + }, + { + "epoch": 1.749515816655907, + "grad_norm": 0.5825939774513245, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 5420 + }, + { + "epoch": 1.7527437056165267, + "grad_norm": 0.6969610452651978, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 5430 + }, + { + "epoch": 1.7559715945771466, + "grad_norm": 0.5558062195777893, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 5440 + }, + { + "epoch": 1.7591994835377665, + "grad_norm": 0.49222221970558167, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 5450 + }, + { + "epoch": 1.762427372498386, + "grad_norm": 0.5844656825065613, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 5460 + }, + { + "epoch": 1.7656552614590058, + "grad_norm": 0.8706597685813904, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 5470 + }, + { + "epoch": 1.7688831504196254, + "grad_norm": 0.6167706251144409, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 5480 + }, + { + "epoch": 1.7721110393802453, + "grad_norm": 0.5890011787414551, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 5490 + }, + { + "epoch": 1.7753389283408652, + "grad_norm": 0.6551728248596191, + "learning_rate": 0.0002, + "loss": 0.8319, + "step": 5500 + }, + { + "epoch": 1.7785668173014848, + "grad_norm": 0.5848751068115234, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 5510 + }, + { + "epoch": 1.7817947062621045, + "grad_norm": 0.6664014458656311, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 5520 + }, + { + "epoch": 1.7850225952227243, + "grad_norm": 0.5931693911552429, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 5530 + }, + { + "epoch": 1.7882504841833442, + "grad_norm": 0.5534724593162537, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 5540 + }, + { + "epoch": 1.7914783731439639, + "grad_norm": 0.5590878129005432, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 5550 + }, + { + "epoch": 1.7947062621045835, + "grad_norm": 0.6947470903396606, + "learning_rate": 0.0002, + "loss": 0.7406, + "step": 5560 + }, + { + "epoch": 1.7979341510652034, + "grad_norm": 0.6104130148887634, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 5570 + }, + { + "epoch": 1.8011620400258233, + "grad_norm": 0.6135714054107666, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 5580 + }, + { + "epoch": 1.804389928986443, + "grad_norm": 0.6626853346824646, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 5590 + }, + { + "epoch": 1.8076178179470626, + "grad_norm": 0.6977612972259521, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 5600 + }, + { + "epoch": 1.8108457069076824, + "grad_norm": 0.6275238394737244, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 5610 + }, + { + "epoch": 1.814073595868302, + "grad_norm": 0.5017505288124084, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 5620 + }, + { + "epoch": 1.817301484828922, + "grad_norm": 0.8314290642738342, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 5630 + }, + { + "epoch": 1.8205293737895416, + "grad_norm": 0.6863582134246826, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 5640 + }, + { + "epoch": 1.8237572627501613, + "grad_norm": 0.69544917345047, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 5650 + }, + { + "epoch": 1.8269851517107811, + "grad_norm": 0.515499472618103, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 5660 + }, + { + "epoch": 1.830213040671401, + "grad_norm": 0.6100873947143555, + "learning_rate": 0.0002, + "loss": 0.7166, + "step": 5670 + }, + { + "epoch": 1.8334409296320207, + "grad_norm": 0.67416912317276, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 5680 + }, + { + "epoch": 1.8366688185926403, + "grad_norm": 0.7057772278785706, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 5690 + }, + { + "epoch": 1.8398967075532602, + "grad_norm": 0.7374551892280579, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 5700 + }, + { + "epoch": 1.84312459651388, + "grad_norm": 0.6266297101974487, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 5710 + }, + { + "epoch": 1.8463524854744997, + "grad_norm": 0.5629227757453918, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 5720 + }, + { + "epoch": 1.8495803744351194, + "grad_norm": 0.6603655815124512, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 5730 + }, + { + "epoch": 1.8528082633957392, + "grad_norm": 0.8113715052604675, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 5740 + }, + { + "epoch": 1.856036152356359, + "grad_norm": 0.7143914103507996, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 5750 + }, + { + "epoch": 1.8592640413169788, + "grad_norm": 0.6273732781410217, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 5760 + }, + { + "epoch": 1.8624919302775984, + "grad_norm": 0.5428690910339355, + "learning_rate": 0.0002, + "loss": 0.7962, + "step": 5770 + }, + { + "epoch": 1.865719819238218, + "grad_norm": 0.6405037641525269, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 5780 + }, + { + "epoch": 1.868947708198838, + "grad_norm": 0.700873613357544, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 5790 + }, + { + "epoch": 1.8721755971594578, + "grad_norm": 0.5645238161087036, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 5800 + }, + { + "epoch": 1.8754034861200775, + "grad_norm": 0.8780353665351868, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 5810 + }, + { + "epoch": 1.878631375080697, + "grad_norm": 0.6295409798622131, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 5820 + }, + { + "epoch": 1.881859264041317, + "grad_norm": 0.678269624710083, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 5830 + }, + { + "epoch": 1.8850871530019369, + "grad_norm": 0.6464608907699585, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5840 + }, + { + "epoch": 1.8883150419625565, + "grad_norm": 0.6201048493385315, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 5850 + }, + { + "epoch": 1.8915429309231762, + "grad_norm": 0.6046274304389954, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 5860 + }, + { + "epoch": 1.894770819883796, + "grad_norm": 0.7532408833503723, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 5870 + }, + { + "epoch": 1.897998708844416, + "grad_norm": 0.6066767573356628, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 5880 + }, + { + "epoch": 1.9012265978050356, + "grad_norm": 0.6289830207824707, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 5890 + }, + { + "epoch": 1.9044544867656552, + "grad_norm": 0.5204319953918457, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 5900 + }, + { + "epoch": 1.9076823757262749, + "grad_norm": 0.6708219647407532, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 5910 + }, + { + "epoch": 1.9109102646868947, + "grad_norm": 0.4915677309036255, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 5920 + }, + { + "epoch": 1.9141381536475146, + "grad_norm": 0.652717113494873, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 5930 + }, + { + "epoch": 1.9173660426081343, + "grad_norm": 0.5446316003799438, + "learning_rate": 0.0002, + "loss": 0.7687, + "step": 5940 + }, + { + "epoch": 1.920593931568754, + "grad_norm": 0.4958149194717407, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 5950 + }, + { + "epoch": 1.9238218205293738, + "grad_norm": 0.5623434782028198, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 5960 + }, + { + "epoch": 1.9270497094899937, + "grad_norm": 0.6855450868606567, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 5970 + }, + { + "epoch": 1.9302775984506133, + "grad_norm": 0.5710492730140686, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 5980 + }, + { + "epoch": 1.933505487411233, + "grad_norm": 0.5379431843757629, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 5990 + }, + { + "epoch": 1.9367333763718528, + "grad_norm": 0.557129442691803, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 6000 + }, + { + "epoch": 1.9399612653324727, + "grad_norm": 0.6336663961410522, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 6010 + }, + { + "epoch": 1.9431891542930924, + "grad_norm": 0.5950582027435303, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 6020 + }, + { + "epoch": 1.946417043253712, + "grad_norm": 0.5905954837799072, + "learning_rate": 0.0002, + "loss": 0.7443, + "step": 6030 + }, + { + "epoch": 1.9496449322143317, + "grad_norm": 0.6688982844352722, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 6040 + }, + { + "epoch": 1.9528728211749515, + "grad_norm": 0.5440775752067566, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 6050 + }, + { + "epoch": 1.9561007101355714, + "grad_norm": 0.6207906603813171, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 6060 + }, + { + "epoch": 1.959328599096191, + "grad_norm": 0.6999374628067017, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 6070 + }, + { + "epoch": 1.9625564880568107, + "grad_norm": 0.6310848593711853, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 6080 + }, + { + "epoch": 1.9657843770174306, + "grad_norm": 0.5903388261795044, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 6090 + }, + { + "epoch": 1.9690122659780505, + "grad_norm": 0.6333889961242676, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 6100 + }, + { + "epoch": 1.97224015493867, + "grad_norm": 0.5604711174964905, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 6110 + }, + { + "epoch": 1.9754680438992898, + "grad_norm": 0.9234541654586792, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 6120 + }, + { + "epoch": 1.9786959328599096, + "grad_norm": 0.6149102449417114, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 6130 + }, + { + "epoch": 1.9819238218205295, + "grad_norm": 0.615446150302887, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 6140 + }, + { + "epoch": 1.9851517107811492, + "grad_norm": 0.5176635980606079, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 6150 + }, + { + "epoch": 1.9883795997417688, + "grad_norm": 0.7124109864234924, + "learning_rate": 0.0002, + "loss": 0.718, + "step": 6160 + }, + { + "epoch": 1.9916074887023887, + "grad_norm": 0.6317567825317383, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 6170 + }, + { + "epoch": 1.9948353776630086, + "grad_norm": 0.6855016350746155, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 6180 + }, + { + "epoch": 1.9980632666236282, + "grad_norm": 0.6423715353012085, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 6190 + }, + { + "epoch": 2.0, + "eval_loss": 1.1096643209457397, + "eval_runtime": 147.7997, + "eval_samples_per_second": 4.959, + "eval_steps_per_second": 0.622, + "step": 6196 + }, + { + "epoch": 2.001291155584248, + "grad_norm": 0.5322932600975037, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 6200 + }, + { + "epoch": 2.0045190445448675, + "grad_norm": 0.8152306079864502, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 6210 + }, + { + "epoch": 2.0077469335054876, + "grad_norm": 0.6215983033180237, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 6220 + }, + { + "epoch": 2.0109748224661073, + "grad_norm": 0.845498263835907, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 6230 + }, + { + "epoch": 2.014202711426727, + "grad_norm": 0.733559787273407, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 6240 + }, + { + "epoch": 2.0174306003873466, + "grad_norm": 0.51433926820755, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 6250 + }, + { + "epoch": 2.020658489347966, + "grad_norm": 0.6374049782752991, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 6260 + }, + { + "epoch": 2.0238863783085863, + "grad_norm": 0.7833638191223145, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 6270 + }, + { + "epoch": 2.027114267269206, + "grad_norm": 0.8929463028907776, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 6280 + }, + { + "epoch": 2.0303421562298256, + "grad_norm": 0.669731855392456, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 6290 + }, + { + "epoch": 2.0335700451904453, + "grad_norm": 0.5846071243286133, + "learning_rate": 0.0002, + "loss": 0.646, + "step": 6300 + }, + { + "epoch": 2.0367979341510654, + "grad_norm": 0.7087787985801697, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 6310 + }, + { + "epoch": 2.040025823111685, + "grad_norm": 0.6739160418510437, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 6320 + }, + { + "epoch": 2.0432537120723047, + "grad_norm": 0.4860886335372925, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 6330 + }, + { + "epoch": 2.0464816010329243, + "grad_norm": 0.7201244831085205, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 6340 + }, + { + "epoch": 2.0497094899935444, + "grad_norm": 0.7409170269966125, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 6350 + }, + { + "epoch": 2.052937378954164, + "grad_norm": 0.6843920350074768, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 6360 + }, + { + "epoch": 2.0561652679147837, + "grad_norm": 0.7519999742507935, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 6370 + }, + { + "epoch": 2.0593931568754034, + "grad_norm": 0.5732819437980652, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 6380 + }, + { + "epoch": 2.062621045836023, + "grad_norm": 0.7565118074417114, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 6390 + }, + { + "epoch": 2.065848934796643, + "grad_norm": 0.8147150278091431, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 6400 + }, + { + "epoch": 2.0690768237572628, + "grad_norm": 0.6941924691200256, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 6410 + }, + { + "epoch": 2.0723047127178824, + "grad_norm": 0.6549784541130066, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 6420 + }, + { + "epoch": 2.075532601678502, + "grad_norm": 0.7224905490875244, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 6430 + }, + { + "epoch": 2.078760490639122, + "grad_norm": 0.7754863500595093, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 6440 + }, + { + "epoch": 2.081988379599742, + "grad_norm": 0.691318154335022, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 6450 + }, + { + "epoch": 2.0852162685603615, + "grad_norm": 0.6009294986724854, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 6460 + }, + { + "epoch": 2.088444157520981, + "grad_norm": 0.6753945350646973, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 6470 + }, + { + "epoch": 2.091672046481601, + "grad_norm": 0.6899921298027039, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 6480 + }, + { + "epoch": 2.094899935442221, + "grad_norm": 0.846510648727417, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 6490 + }, + { + "epoch": 2.0981278244028405, + "grad_norm": 0.6432605981826782, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 6500 + }, + { + "epoch": 2.10135571336346, + "grad_norm": 0.8125239014625549, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 6510 + }, + { + "epoch": 2.1045836023240803, + "grad_norm": 0.628302812576294, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 6520 + }, + { + "epoch": 2.1078114912847, + "grad_norm": 0.7164334654808044, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 6530 + }, + { + "epoch": 2.1110393802453196, + "grad_norm": 0.7476949095726013, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 6540 + }, + { + "epoch": 2.114267269205939, + "grad_norm": 0.7577515840530396, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 6550 + }, + { + "epoch": 2.117495158166559, + "grad_norm": 0.5684467554092407, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 6560 + }, + { + "epoch": 2.120723047127179, + "grad_norm": 0.6121789216995239, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 6570 + }, + { + "epoch": 2.1239509360877986, + "grad_norm": 0.6095348596572876, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 6580 + }, + { + "epoch": 2.1271788250484183, + "grad_norm": 0.7803651690483093, + "learning_rate": 0.0002, + "loss": 0.6276, + "step": 6590 + }, + { + "epoch": 2.130406714009038, + "grad_norm": 0.5990583300590515, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 6600 + }, + { + "epoch": 2.133634602969658, + "grad_norm": 0.6569220423698425, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 6610 + }, + { + "epoch": 2.1368624919302777, + "grad_norm": 0.5961166620254517, + "learning_rate": 0.0002, + "loss": 0.7049, + "step": 6620 + }, + { + "epoch": 2.1400903808908973, + "grad_norm": 0.5860554575920105, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 6630 + }, + { + "epoch": 2.143318269851517, + "grad_norm": 0.5994001626968384, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 6640 + }, + { + "epoch": 2.146546158812137, + "grad_norm": 0.7723015546798706, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 6650 + }, + { + "epoch": 2.1497740477727567, + "grad_norm": 0.676355242729187, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 6660 + }, + { + "epoch": 2.1530019367333764, + "grad_norm": 0.5689092874526978, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 6670 + }, + { + "epoch": 2.156229825693996, + "grad_norm": 0.6933727264404297, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 6680 + }, + { + "epoch": 2.159457714654616, + "grad_norm": 0.8380527496337891, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 6690 + }, + { + "epoch": 2.1626856036152358, + "grad_norm": 0.6876497268676758, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 6700 + }, + { + "epoch": 2.1659134925758554, + "grad_norm": 0.6418334245681763, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 6710 + }, + { + "epoch": 2.169141381536475, + "grad_norm": 0.7169192433357239, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 6720 + }, + { + "epoch": 2.1723692704970947, + "grad_norm": 0.6664170622825623, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 6730 + }, + { + "epoch": 2.175597159457715, + "grad_norm": 0.6011993288993835, + "learning_rate": 0.0002, + "loss": 0.6751, + "step": 6740 + }, + { + "epoch": 2.1788250484183345, + "grad_norm": 0.5529947280883789, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 6750 + }, + { + "epoch": 2.182052937378954, + "grad_norm": 0.6879532933235168, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 6760 + }, + { + "epoch": 2.1852808263395738, + "grad_norm": 0.6426113843917847, + "learning_rate": 0.0002, + "loss": 0.6634, + "step": 6770 + }, + { + "epoch": 2.188508715300194, + "grad_norm": 0.6571047306060791, + "learning_rate": 0.0002, + "loss": 0.6592, + "step": 6780 + }, + { + "epoch": 2.1917366042608135, + "grad_norm": 0.6400564908981323, + "learning_rate": 0.0002, + "loss": 0.6494, + "step": 6790 + }, + { + "epoch": 2.194964493221433, + "grad_norm": 0.6509664058685303, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 6800 + }, + { + "epoch": 2.198192382182053, + "grad_norm": 0.6673197150230408, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 6810 + }, + { + "epoch": 2.2014202711426725, + "grad_norm": 0.48205727338790894, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 6820 + }, + { + "epoch": 2.2046481601032926, + "grad_norm": 0.849525511264801, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 6830 + }, + { + "epoch": 2.207876049063912, + "grad_norm": 0.6150892376899719, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 6840 + }, + { + "epoch": 2.211103938024532, + "grad_norm": 0.7826945781707764, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 6850 + }, + { + "epoch": 2.2143318269851515, + "grad_norm": 0.5711963772773743, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 6860 + }, + { + "epoch": 2.2175597159457716, + "grad_norm": 0.6017758846282959, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 6870 + }, + { + "epoch": 2.2207876049063913, + "grad_norm": 0.785434901714325, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 6880 + }, + { + "epoch": 2.224015493867011, + "grad_norm": 0.6251688599586487, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 6890 + }, + { + "epoch": 2.2272433828276306, + "grad_norm": 0.8242034316062927, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 6900 + }, + { + "epoch": 2.2304712717882507, + "grad_norm": 0.7272933125495911, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 6910 + }, + { + "epoch": 2.2336991607488703, + "grad_norm": 0.7159379720687866, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 6920 + }, + { + "epoch": 2.23692704970949, + "grad_norm": 0.6518042087554932, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 6930 + }, + { + "epoch": 2.2401549386701096, + "grad_norm": 0.7365370392799377, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 6940 + }, + { + "epoch": 2.2433828276307297, + "grad_norm": 0.5674061179161072, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 6950 + }, + { + "epoch": 2.2466107165913494, + "grad_norm": 0.669185996055603, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 6960 + }, + { + "epoch": 2.249838605551969, + "grad_norm": 0.6638304591178894, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 6970 + }, + { + "epoch": 2.2530664945125887, + "grad_norm": 0.757006824016571, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 6980 + }, + { + "epoch": 2.2562943834732083, + "grad_norm": 0.7574930787086487, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 6990 + }, + { + "epoch": 2.2595222724338284, + "grad_norm": 0.7819514870643616, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 7000 + }, + { + "epoch": 2.262750161394448, + "grad_norm": 0.6987583041191101, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 7010 + }, + { + "epoch": 2.2659780503550677, + "grad_norm": 0.6628551483154297, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 7020 + }, + { + "epoch": 2.2692059393156874, + "grad_norm": 0.7855866551399231, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 7030 + }, + { + "epoch": 2.2724338282763075, + "grad_norm": 0.6102892756462097, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 7040 + }, + { + "epoch": 2.275661717236927, + "grad_norm": 0.7844198942184448, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 7050 + }, + { + "epoch": 2.2788896061975468, + "grad_norm": 0.6209492087364197, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 7060 + }, + { + "epoch": 2.2821174951581664, + "grad_norm": 0.8351290225982666, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 7070 + }, + { + "epoch": 2.285345384118786, + "grad_norm": 0.6883546710014343, + "learning_rate": 0.0002, + "loss": 0.6648, + "step": 7080 + }, + { + "epoch": 2.288573273079406, + "grad_norm": 0.6626381874084473, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 7090 + }, + { + "epoch": 2.291801162040026, + "grad_norm": 0.7216270565986633, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 7100 + }, + { + "epoch": 2.2950290510006455, + "grad_norm": 0.8246777057647705, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 7110 + }, + { + "epoch": 2.2982569399612656, + "grad_norm": 0.614326000213623, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 7120 + }, + { + "epoch": 2.301484828921885, + "grad_norm": 0.8785578012466431, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 7130 + }, + { + "epoch": 2.304712717882505, + "grad_norm": 0.7021808624267578, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 7140 + }, + { + "epoch": 2.3079406068431245, + "grad_norm": 0.6999403238296509, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 7150 + }, + { + "epoch": 2.311168495803744, + "grad_norm": 0.8013143539428711, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 7160 + }, + { + "epoch": 2.3143963847643643, + "grad_norm": 0.6592583060264587, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 7170 + }, + { + "epoch": 2.317624273724984, + "grad_norm": 0.6260249018669128, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 7180 + }, + { + "epoch": 2.3208521626856036, + "grad_norm": 0.9352797269821167, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 7190 + }, + { + "epoch": 2.324080051646223, + "grad_norm": 0.6629612445831299, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 7200 + }, + { + "epoch": 2.3273079406068433, + "grad_norm": 0.7062810063362122, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 7210 + }, + { + "epoch": 2.330535829567463, + "grad_norm": 0.7236241102218628, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 7220 + }, + { + "epoch": 2.3337637185280826, + "grad_norm": 0.7528148293495178, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 7230 + }, + { + "epoch": 2.3369916074887023, + "grad_norm": 0.7604748606681824, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 7240 + }, + { + "epoch": 2.340219496449322, + "grad_norm": 0.5601189136505127, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 7250 + }, + { + "epoch": 2.343447385409942, + "grad_norm": 0.7099230885505676, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 7260 + }, + { + "epoch": 2.3466752743705617, + "grad_norm": 0.6699047684669495, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 7270 + }, + { + "epoch": 2.3499031633311813, + "grad_norm": 0.7315047979354858, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 7280 + }, + { + "epoch": 2.353131052291801, + "grad_norm": 0.632836103439331, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 7290 + }, + { + "epoch": 2.356358941252421, + "grad_norm": 0.9410115480422974, + "learning_rate": 0.0002, + "loss": 0.6458, + "step": 7300 + }, + { + "epoch": 2.3595868302130407, + "grad_norm": 0.626554012298584, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 7310 + }, + { + "epoch": 2.3628147191736604, + "grad_norm": 0.7538444399833679, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 7320 + }, + { + "epoch": 2.36604260813428, + "grad_norm": 0.6826626062393188, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 7330 + }, + { + "epoch": 2.3692704970949, + "grad_norm": 0.6739391088485718, + "learning_rate": 0.0002, + "loss": 0.6752, + "step": 7340 + }, + { + "epoch": 2.3724983860555198, + "grad_norm": 0.7518446445465088, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 7350 + }, + { + "epoch": 2.3757262750161394, + "grad_norm": 0.714133083820343, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 7360 + }, + { + "epoch": 2.378954163976759, + "grad_norm": 0.7144588232040405, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 7370 + }, + { + "epoch": 2.382182052937379, + "grad_norm": 0.6598120927810669, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 7380 + }, + { + "epoch": 2.385409941897999, + "grad_norm": 0.7079148292541504, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 7390 + }, + { + "epoch": 2.3886378308586185, + "grad_norm": 0.6750902533531189, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 7400 + }, + { + "epoch": 2.391865719819238, + "grad_norm": 0.7181967496871948, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 7410 + }, + { + "epoch": 2.3950936087798578, + "grad_norm": 0.7720552086830139, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 7420 + }, + { + "epoch": 2.398321497740478, + "grad_norm": 0.7592426538467407, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 7430 + }, + { + "epoch": 2.4015493867010975, + "grad_norm": 0.7161896824836731, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 7440 + }, + { + "epoch": 2.404777275661717, + "grad_norm": 0.8019260764122009, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 7450 + }, + { + "epoch": 2.408005164622337, + "grad_norm": 0.7093342542648315, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 7460 + }, + { + "epoch": 2.411233053582957, + "grad_norm": 0.8464207649230957, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 7470 + }, + { + "epoch": 2.4144609425435766, + "grad_norm": 0.773666501045227, + "learning_rate": 0.0002, + "loss": 0.6724, + "step": 7480 + }, + { + "epoch": 2.4176888315041962, + "grad_norm": 0.8451611995697021, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 7490 + }, + { + "epoch": 2.420916720464816, + "grad_norm": 0.656795084476471, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 7500 + }, + { + "epoch": 2.4241446094254355, + "grad_norm": 0.7129034996032715, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 7510 + }, + { + "epoch": 2.4273724983860556, + "grad_norm": 0.8325763940811157, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 7520 + }, + { + "epoch": 2.4306003873466753, + "grad_norm": 0.7806527614593506, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 7530 + }, + { + "epoch": 2.433828276307295, + "grad_norm": 0.6994536519050598, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 7540 + }, + { + "epoch": 2.437056165267915, + "grad_norm": 0.6898999214172363, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 7550 + }, + { + "epoch": 2.4402840542285347, + "grad_norm": 0.719490647315979, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 7560 + }, + { + "epoch": 2.4435119431891543, + "grad_norm": 0.6841562390327454, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 7570 + }, + { + "epoch": 2.446739832149774, + "grad_norm": 0.7573311924934387, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 7580 + }, + { + "epoch": 2.4499677211103936, + "grad_norm": 0.7295880317687988, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 7590 + }, + { + "epoch": 2.4531956100710137, + "grad_norm": 0.710136353969574, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 7600 + }, + { + "epoch": 2.4564234990316334, + "grad_norm": 0.6126235127449036, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 7610 + }, + { + "epoch": 2.459651387992253, + "grad_norm": 0.8025609850883484, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 7620 + }, + { + "epoch": 2.4628792769528727, + "grad_norm": 0.7839472889900208, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 7630 + }, + { + "epoch": 2.4661071659134928, + "grad_norm": 0.7253499031066895, + "learning_rate": 0.0002, + "loss": 0.6797, + "step": 7640 + }, + { + "epoch": 2.4693350548741124, + "grad_norm": 0.7918946743011475, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 7650 + }, + { + "epoch": 2.472562943834732, + "grad_norm": 0.7930178046226501, + "learning_rate": 0.0002, + "loss": 0.6646, + "step": 7660 + }, + { + "epoch": 2.4757908327953517, + "grad_norm": 0.6826170086860657, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 7670 + }, + { + "epoch": 2.4790187217559714, + "grad_norm": 0.6576805114746094, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 7680 + }, + { + "epoch": 2.4822466107165915, + "grad_norm": 0.7012448310852051, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 7690 + }, + { + "epoch": 2.485474499677211, + "grad_norm": 0.7774284482002258, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 7700 + }, + { + "epoch": 2.4887023886378308, + "grad_norm": 0.6502766013145447, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 7710 + }, + { + "epoch": 2.4919302775984504, + "grad_norm": 0.7638739347457886, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 7720 + }, + { + "epoch": 2.4951581665590705, + "grad_norm": 0.6217384338378906, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 7730 + }, + { + "epoch": 2.49838605551969, + "grad_norm": 0.7576302886009216, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 7740 + }, + { + "epoch": 2.50161394448031, + "grad_norm": 0.6877137422561646, + "learning_rate": 0.0002, + "loss": 0.6855, + "step": 7750 + }, + { + "epoch": 2.5048418334409295, + "grad_norm": 0.6998329162597656, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 7760 + }, + { + "epoch": 2.508069722401549, + "grad_norm": 0.7879213690757751, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 7770 + }, + { + "epoch": 2.5112976113621692, + "grad_norm": 0.7834980487823486, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 7780 + }, + { + "epoch": 2.514525500322789, + "grad_norm": 0.7789630889892578, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 7790 + }, + { + "epoch": 2.5177533892834085, + "grad_norm": 0.7403590083122253, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 7800 + }, + { + "epoch": 2.5209812782440286, + "grad_norm": 0.6029766201972961, + "learning_rate": 0.0002, + "loss": 0.6964, + "step": 7810 + }, + { + "epoch": 2.5242091672046483, + "grad_norm": 0.7061092257499695, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 7820 + }, + { + "epoch": 2.527437056165268, + "grad_norm": 0.7120763659477234, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 7830 + }, + { + "epoch": 2.5306649451258876, + "grad_norm": 0.6173675656318665, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 7840 + }, + { + "epoch": 2.5338928340865072, + "grad_norm": 0.9566813111305237, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 7850 + }, + { + "epoch": 2.5371207230471273, + "grad_norm": 0.8497620224952698, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 7860 + }, + { + "epoch": 2.540348612007747, + "grad_norm": 0.7663498520851135, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 7870 + }, + { + "epoch": 2.5435765009683666, + "grad_norm": 0.6329668760299683, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 7880 + }, + { + "epoch": 2.5468043899289863, + "grad_norm": 0.8128195405006409, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 7890 + }, + { + "epoch": 2.5500322788896064, + "grad_norm": 0.6622284650802612, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 7900 + }, + { + "epoch": 2.553260167850226, + "grad_norm": 0.8460057973861694, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 7910 + }, + { + "epoch": 2.5564880568108457, + "grad_norm": 0.6586956977844238, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 7920 + }, + { + "epoch": 2.5597159457714653, + "grad_norm": 0.7569382190704346, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 7930 + }, + { + "epoch": 2.562943834732085, + "grad_norm": 0.6409714221954346, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 7940 + }, + { + "epoch": 2.566171723692705, + "grad_norm": 0.7031713128089905, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 7950 + }, + { + "epoch": 2.5693996126533247, + "grad_norm": 0.7983605265617371, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 7960 + }, + { + "epoch": 2.5726275016139444, + "grad_norm": 0.7165433168411255, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 7970 + }, + { + "epoch": 2.5758553905745645, + "grad_norm": 0.6630598902702332, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 7980 + }, + { + "epoch": 2.579083279535184, + "grad_norm": 0.5883122086524963, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 7990 + }, + { + "epoch": 2.5823111684958038, + "grad_norm": 0.5928755402565002, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 8000 + }, + { + "epoch": 2.5855390574564234, + "grad_norm": 0.7843712568283081, + "learning_rate": 0.0002, + "loss": 0.6701, + "step": 8010 + }, + { + "epoch": 2.588766946417043, + "grad_norm": 0.7206324338912964, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 8020 + }, + { + "epoch": 2.5919948353776627, + "grad_norm": 0.812480092048645, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 8030 + }, + { + "epoch": 2.595222724338283, + "grad_norm": 0.9843078255653381, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 8040 + }, + { + "epoch": 2.5984506132989025, + "grad_norm": 0.7524392604827881, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 8050 + }, + { + "epoch": 2.601678502259522, + "grad_norm": 0.6220380067825317, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 8060 + }, + { + "epoch": 2.6049063912201422, + "grad_norm": 0.7461398243904114, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 8070 + }, + { + "epoch": 2.608134280180762, + "grad_norm": 0.720974326133728, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 8080 + }, + { + "epoch": 2.6113621691413815, + "grad_norm": 0.649509847164154, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 8090 + }, + { + "epoch": 2.614590058102001, + "grad_norm": 0.6894662976264954, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 8100 + }, + { + "epoch": 2.617817947062621, + "grad_norm": 0.734433114528656, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 8110 + }, + { + "epoch": 2.621045836023241, + "grad_norm": 0.7468628883361816, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 8120 + }, + { + "epoch": 2.6242737249838606, + "grad_norm": 0.6508180499076843, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 8130 + }, + { + "epoch": 2.6275016139444802, + "grad_norm": 0.8735209107398987, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 8140 + }, + { + "epoch": 2.6307295029051003, + "grad_norm": 0.8162857294082642, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 8150 + }, + { + "epoch": 2.63395739186572, + "grad_norm": 0.628872811794281, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 8160 + }, + { + "epoch": 2.6371852808263396, + "grad_norm": 0.8078708052635193, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 8170 + }, + { + "epoch": 2.6404131697869593, + "grad_norm": 0.7849429845809937, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 8180 + }, + { + "epoch": 2.643641058747579, + "grad_norm": 0.8115387558937073, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 8190 + }, + { + "epoch": 2.6468689477081986, + "grad_norm": 0.7462222576141357, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 8200 + }, + { + "epoch": 2.6500968366688187, + "grad_norm": 0.753662645816803, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 8210 + }, + { + "epoch": 2.6533247256294383, + "grad_norm": 0.6100404858589172, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 8220 + }, + { + "epoch": 2.656552614590058, + "grad_norm": 0.9084606766700745, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 8230 + }, + { + "epoch": 2.659780503550678, + "grad_norm": 0.6412538886070251, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 8240 + }, + { + "epoch": 2.6630083925112977, + "grad_norm": 0.7640451192855835, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 8250 + }, + { + "epoch": 2.6662362814719174, + "grad_norm": 0.5972344875335693, + "learning_rate": 0.0002, + "loss": 0.6846, + "step": 8260 + }, + { + "epoch": 2.669464170432537, + "grad_norm": 0.6935883164405823, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 8270 + }, + { + "epoch": 2.6726920593931567, + "grad_norm": 0.789399266242981, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 8280 + }, + { + "epoch": 2.675919948353777, + "grad_norm": 0.7143490314483643, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 8290 + }, + { + "epoch": 2.6791478373143964, + "grad_norm": 0.6670652627944946, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 8300 + }, + { + "epoch": 2.682375726275016, + "grad_norm": 0.687108039855957, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 8310 + }, + { + "epoch": 2.6856036152356357, + "grad_norm": 0.7914147973060608, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 8320 + }, + { + "epoch": 2.688831504196256, + "grad_norm": 0.8398420214653015, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 8330 + }, + { + "epoch": 2.6920593931568755, + "grad_norm": 0.6592720746994019, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 8340 + }, + { + "epoch": 2.695287282117495, + "grad_norm": 0.6888470649719238, + "learning_rate": 0.0002, + "loss": 0.6673, + "step": 8350 + }, + { + "epoch": 2.698515171078115, + "grad_norm": 0.7127556800842285, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 8360 + }, + { + "epoch": 2.7017430600387344, + "grad_norm": 0.6630286574363708, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 8370 + }, + { + "epoch": 2.7049709489993545, + "grad_norm": 0.8261964321136475, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 8380 + }, + { + "epoch": 2.708198837959974, + "grad_norm": 0.717339813709259, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 8390 + }, + { + "epoch": 2.711426726920594, + "grad_norm": 0.651637613773346, + "learning_rate": 0.0002, + "loss": 0.6929, + "step": 8400 + }, + { + "epoch": 2.714654615881214, + "grad_norm": 0.7936098575592041, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 8410 + }, + { + "epoch": 2.7178825048418336, + "grad_norm": 0.8761560320854187, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 8420 + }, + { + "epoch": 2.7211103938024532, + "grad_norm": 0.6768006086349487, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 8430 + }, + { + "epoch": 2.724338282763073, + "grad_norm": 0.7121055722236633, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 8440 + }, + { + "epoch": 2.7275661717236925, + "grad_norm": 0.6811696887016296, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 8450 + }, + { + "epoch": 2.730794060684312, + "grad_norm": 0.8168250918388367, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 8460 + }, + { + "epoch": 2.7340219496449323, + "grad_norm": 0.660682737827301, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 8470 + }, + { + "epoch": 2.737249838605552, + "grad_norm": 0.7369356155395508, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 8480 + }, + { + "epoch": 2.7404777275661716, + "grad_norm": 0.7545099854469299, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 8490 + }, + { + "epoch": 2.7437056165267917, + "grad_norm": 0.6991257667541504, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 8500 + }, + { + "epoch": 2.7469335054874113, + "grad_norm": 0.7195324301719666, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 8510 + }, + { + "epoch": 2.750161394448031, + "grad_norm": 0.8995378017425537, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 8520 + }, + { + "epoch": 2.7533892834086506, + "grad_norm": 0.6924123764038086, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 8530 + }, + { + "epoch": 2.7566171723692703, + "grad_norm": 0.6260585784912109, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 8540 + }, + { + "epoch": 2.7598450613298904, + "grad_norm": 0.7273091673851013, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 8550 + }, + { + "epoch": 2.76307295029051, + "grad_norm": 0.720562219619751, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 8560 + }, + { + "epoch": 2.7663008392511297, + "grad_norm": 0.6360004544258118, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 8570 + }, + { + "epoch": 2.76952872821175, + "grad_norm": 0.7634525895118713, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 8580 + }, + { + "epoch": 2.7727566171723694, + "grad_norm": 0.6586076021194458, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 8590 + }, + { + "epoch": 2.775984506132989, + "grad_norm": 0.6542639136314392, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 8600 + }, + { + "epoch": 2.7792123950936087, + "grad_norm": 0.7650290727615356, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 8610 + }, + { + "epoch": 2.7824402840542284, + "grad_norm": 0.6551542282104492, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 8620 + }, + { + "epoch": 2.785668173014848, + "grad_norm": 0.6915501952171326, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 8630 + }, + { + "epoch": 2.788896061975468, + "grad_norm": 0.8061493635177612, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 8640 + }, + { + "epoch": 2.792123950936088, + "grad_norm": 0.8403584957122803, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 8650 + }, + { + "epoch": 2.7953518398967074, + "grad_norm": 0.6455532312393188, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 8660 + }, + { + "epoch": 2.7985797288573275, + "grad_norm": 0.8296352028846741, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 8670 + }, + { + "epoch": 2.801807617817947, + "grad_norm": 0.7288752794265747, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 8680 + }, + { + "epoch": 2.805035506778567, + "grad_norm": 0.7628464102745056, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 8690 + }, + { + "epoch": 2.8082633957391865, + "grad_norm": 0.9993878602981567, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 8700 + }, + { + "epoch": 2.811491284699806, + "grad_norm": 0.6972465515136719, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 8710 + }, + { + "epoch": 2.8147191736604262, + "grad_norm": 0.645042896270752, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 8720 + }, + { + "epoch": 2.817947062621046, + "grad_norm": 0.6853853464126587, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 8730 + }, + { + "epoch": 2.8211749515816655, + "grad_norm": 0.5935067534446716, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 8740 + }, + { + "epoch": 2.824402840542285, + "grad_norm": 0.7336633205413818, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 8750 + }, + { + "epoch": 2.8276307295029053, + "grad_norm": 0.7074962854385376, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 8760 + }, + { + "epoch": 2.830858618463525, + "grad_norm": 0.6667559742927551, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 8770 + }, + { + "epoch": 2.8340865074241446, + "grad_norm": 0.8101205229759216, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 8780 + }, + { + "epoch": 2.8373143963847642, + "grad_norm": 0.8841480016708374, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 8790 + }, + { + "epoch": 2.840542285345384, + "grad_norm": 0.5891591310501099, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 8800 + }, + { + "epoch": 2.843770174306004, + "grad_norm": 0.667032778263092, + "learning_rate": 0.0002, + "loss": 0.7114, + "step": 8810 + }, + { + "epoch": 2.8469980632666236, + "grad_norm": 0.7629773020744324, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 8820 + }, + { + "epoch": 2.8502259522272433, + "grad_norm": 0.79471355676651, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 8830 + }, + { + "epoch": 2.8534538411878634, + "grad_norm": 0.7529178261756897, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 8840 + }, + { + "epoch": 2.856681730148483, + "grad_norm": 0.7014923691749573, + "learning_rate": 0.0002, + "loss": 0.7163, + "step": 8850 + }, + { + "epoch": 2.8599096191091027, + "grad_norm": 0.7996514439582825, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 8860 + }, + { + "epoch": 2.8631375080697223, + "grad_norm": 0.7044785618782043, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 8870 + }, + { + "epoch": 2.866365397030342, + "grad_norm": 0.6792093515396118, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 8880 + }, + { + "epoch": 2.8695932859909616, + "grad_norm": 0.69175124168396, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 8890 + }, + { + "epoch": 2.8728211749515817, + "grad_norm": 0.7499129176139832, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 8900 + }, + { + "epoch": 2.8760490639122014, + "grad_norm": 0.7678789496421814, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 8910 + }, + { + "epoch": 2.879276952872821, + "grad_norm": 0.7478128671646118, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 8920 + }, + { + "epoch": 2.882504841833441, + "grad_norm": 0.6767086386680603, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 8930 + }, + { + "epoch": 2.885732730794061, + "grad_norm": 0.7222196459770203, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 8940 + }, + { + "epoch": 2.8889606197546804, + "grad_norm": 0.6950580477714539, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 8950 + }, + { + "epoch": 2.8921885087153, + "grad_norm": 0.7759528160095215, + "learning_rate": 0.0002, + "loss": 0.7064, + "step": 8960 + }, + { + "epoch": 2.8954163976759197, + "grad_norm": 0.6686919927597046, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 8970 + }, + { + "epoch": 2.89864428663654, + "grad_norm": 0.9245954751968384, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 8980 + }, + { + "epoch": 2.9018721755971595, + "grad_norm": 0.8734814524650574, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 8990 + }, + { + "epoch": 2.905100064557779, + "grad_norm": 0.6056219339370728, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 9000 + }, + { + "epoch": 2.9083279535183992, + "grad_norm": 0.7364102005958557, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 9010 + }, + { + "epoch": 2.911555842479019, + "grad_norm": 0.6563605070114136, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 9020 + }, + { + "epoch": 2.9147837314396385, + "grad_norm": 0.659978985786438, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 9030 + }, + { + "epoch": 2.918011620400258, + "grad_norm": 0.8176041841506958, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 9040 + }, + { + "epoch": 2.921239509360878, + "grad_norm": 0.743677020072937, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 9050 + }, + { + "epoch": 2.9244673983214975, + "grad_norm": 0.7418383359909058, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 9060 + }, + { + "epoch": 2.9276952872821176, + "grad_norm": 0.6916524767875671, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 9070 + }, + { + "epoch": 2.9309231762427372, + "grad_norm": 0.6559975743293762, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 9080 + }, + { + "epoch": 2.934151065203357, + "grad_norm": 0.7431221008300781, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 9090 + }, + { + "epoch": 2.937378954163977, + "grad_norm": 0.7525941133499146, + "learning_rate": 0.0002, + "loss": 0.6829, + "step": 9100 + }, + { + "epoch": 2.9406068431245966, + "grad_norm": 0.6860167384147644, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 9110 + }, + { + "epoch": 2.9438347320852163, + "grad_norm": 0.6467666029930115, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 9120 + }, + { + "epoch": 2.947062621045836, + "grad_norm": 0.7595751285552979, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 9130 + }, + { + "epoch": 2.9502905100064556, + "grad_norm": 0.6558279991149902, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 9140 + }, + { + "epoch": 2.9535183989670757, + "grad_norm": 0.6818708181381226, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 9150 + }, + { + "epoch": 2.9567462879276953, + "grad_norm": 0.8387085795402527, + "learning_rate": 0.0002, + "loss": 0.6921, + "step": 9160 + }, + { + "epoch": 2.959974176888315, + "grad_norm": 0.7705109715461731, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 9170 + }, + { + "epoch": 2.9632020658489346, + "grad_norm": 0.688106894493103, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 9180 + }, + { + "epoch": 2.9664299548095547, + "grad_norm": 0.659532368183136, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 9190 + }, + { + "epoch": 2.9696578437701744, + "grad_norm": 0.6839388608932495, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 9200 + }, + { + "epoch": 2.972885732730794, + "grad_norm": 0.6927599310874939, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 9210 + }, + { + "epoch": 2.9761136216914137, + "grad_norm": 0.6902472972869873, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 9220 + }, + { + "epoch": 2.9793415106520333, + "grad_norm": 0.620399534702301, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 9230 + }, + { + "epoch": 2.9825693996126534, + "grad_norm": 0.6812364459037781, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 9240 + }, + { + "epoch": 2.985797288573273, + "grad_norm": 0.7681456208229065, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 9250 + }, + { + "epoch": 2.9890251775338927, + "grad_norm": 0.7621907591819763, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 9260 + }, + { + "epoch": 2.992253066494513, + "grad_norm": 0.6075740456581116, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 9270 + }, + { + "epoch": 2.9954809554551325, + "grad_norm": 0.7100434899330139, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 9280 + }, + { + "epoch": 2.998708844415752, + "grad_norm": 0.7314488887786865, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 9290 + }, + { + "epoch": 3.0, + "eval_loss": 1.1434104442596436, + "eval_runtime": 166.3732, + "eval_samples_per_second": 4.406, + "eval_steps_per_second": 0.553, + "step": 9294 + }, + { + "epoch": 3.001936733376372, + "grad_norm": 0.7408893704414368, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 9300 + }, + { + "epoch": 3.0051646223369914, + "grad_norm": 0.9773574471473694, + "learning_rate": 0.0002, + "loss": 0.5182, + "step": 9310 + }, + { + "epoch": 3.0083925112976115, + "grad_norm": 0.7919653058052063, + "learning_rate": 0.0002, + "loss": 0.5432, + "step": 9320 + }, + { + "epoch": 3.011620400258231, + "grad_norm": 0.9139202833175659, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 9330 + }, + { + "epoch": 3.014848289218851, + "grad_norm": 0.8296737670898438, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 9340 + }, + { + "epoch": 3.0180761781794705, + "grad_norm": 0.786868155002594, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 9350 + }, + { + "epoch": 3.0213040671400906, + "grad_norm": 0.5928055644035339, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 9360 + }, + { + "epoch": 3.0245319561007102, + "grad_norm": 0.8785701394081116, + "learning_rate": 0.0002, + "loss": 0.5376, + "step": 9370 + }, + { + "epoch": 3.02775984506133, + "grad_norm": 0.7978872060775757, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 9380 + }, + { + "epoch": 3.0309877340219495, + "grad_norm": 0.7160913348197937, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 9390 + }, + { + "epoch": 3.034215622982569, + "grad_norm": 0.904465913772583, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 9400 + }, + { + "epoch": 3.0374435119431893, + "grad_norm": 0.7082195281982422, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 9410 + }, + { + "epoch": 3.040671400903809, + "grad_norm": 0.9686778783798218, + "learning_rate": 0.0002, + "loss": 0.5434, + "step": 9420 + }, + { + "epoch": 3.0438992898644286, + "grad_norm": 0.8788613677024841, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 9430 + }, + { + "epoch": 3.0471271788250482, + "grad_norm": 0.8217582106590271, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 9440 + }, + { + "epoch": 3.0503550677856683, + "grad_norm": 0.7380914092063904, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 9450 + }, + { + "epoch": 3.053582956746288, + "grad_norm": 0.7339285612106323, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 9460 + }, + { + "epoch": 3.0568108457069076, + "grad_norm": 0.7175183296203613, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 9470 + }, + { + "epoch": 3.0600387346675273, + "grad_norm": 0.8275379538536072, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 9480 + }, + { + "epoch": 3.0632666236281474, + "grad_norm": 0.6544256806373596, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 9490 + }, + { + "epoch": 3.066494512588767, + "grad_norm": 0.8193472623825073, + "learning_rate": 0.0002, + "loss": 0.5365, + "step": 9500 + }, + { + "epoch": 3.0697224015493867, + "grad_norm": 0.7967836856842041, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 9510 + }, + { + "epoch": 3.0729502905100063, + "grad_norm": 0.8788684010505676, + "learning_rate": 0.0002, + "loss": 0.5629, + "step": 9520 + }, + { + "epoch": 3.0761781794706264, + "grad_norm": 0.9410629868507385, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 9530 + }, + { + "epoch": 3.079406068431246, + "grad_norm": 0.7448706030845642, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 9540 + }, + { + "epoch": 3.0826339573918657, + "grad_norm": 0.9149372577667236, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 9550 + }, + { + "epoch": 3.0858618463524854, + "grad_norm": 0.7265563607215881, + "learning_rate": 0.0002, + "loss": 0.5347, + "step": 9560 + }, + { + "epoch": 3.089089735313105, + "grad_norm": 1.0305068492889404, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 9570 + }, + { + "epoch": 3.092317624273725, + "grad_norm": 0.7987357974052429, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 9580 + }, + { + "epoch": 3.095545513234345, + "grad_norm": 0.7733123898506165, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 9590 + }, + { + "epoch": 3.0987734021949644, + "grad_norm": 1.0438069105148315, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 9600 + }, + { + "epoch": 3.102001291155584, + "grad_norm": 0.7951784729957581, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 9610 + }, + { + "epoch": 3.105229180116204, + "grad_norm": 0.7776783108711243, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 9620 + }, + { + "epoch": 3.108457069076824, + "grad_norm": 0.7060676217079163, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 9630 + }, + { + "epoch": 3.1116849580374435, + "grad_norm": 0.871569037437439, + "learning_rate": 0.0002, + "loss": 0.5731, + "step": 9640 + }, + { + "epoch": 3.114912846998063, + "grad_norm": 0.8873385787010193, + "learning_rate": 0.0002, + "loss": 0.5168, + "step": 9650 + }, + { + "epoch": 3.118140735958683, + "grad_norm": 0.750998318195343, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 9660 + }, + { + "epoch": 3.121368624919303, + "grad_norm": 0.8678529262542725, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 9670 + }, + { + "epoch": 3.1245965138799225, + "grad_norm": 0.7706599235534668, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 9680 + }, + { + "epoch": 3.127824402840542, + "grad_norm": 0.8317574858665466, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 9690 + }, + { + "epoch": 3.131052291801162, + "grad_norm": 0.801800012588501, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 9700 + }, + { + "epoch": 3.134280180761782, + "grad_norm": 0.8574623465538025, + "learning_rate": 0.0002, + "loss": 0.6044, + "step": 9710 + }, + { + "epoch": 3.1375080697224016, + "grad_norm": 0.6556540727615356, + "learning_rate": 0.0002, + "loss": 0.6072, + "step": 9720 + }, + { + "epoch": 3.1407359586830212, + "grad_norm": 0.8555161952972412, + "learning_rate": 0.0002, + "loss": 0.6058, + "step": 9730 + }, + { + "epoch": 3.143963847643641, + "grad_norm": 0.8825467824935913, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 9740 + }, + { + "epoch": 3.147191736604261, + "grad_norm": 0.8297156691551208, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 9750 + }, + { + "epoch": 3.1504196255648806, + "grad_norm": 0.7710384726524353, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 9760 + }, + { + "epoch": 3.1536475145255003, + "grad_norm": 0.8778039216995239, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 9770 + }, + { + "epoch": 3.15687540348612, + "grad_norm": 0.9014058113098145, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 9780 + }, + { + "epoch": 3.16010329244674, + "grad_norm": 0.6856890320777893, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 9790 + }, + { + "epoch": 3.1633311814073597, + "grad_norm": 0.6520644426345825, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 9800 + }, + { + "epoch": 3.1665590703679793, + "grad_norm": 0.7250499129295349, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 9810 + }, + { + "epoch": 3.169786959328599, + "grad_norm": 0.8331542015075684, + "learning_rate": 0.0002, + "loss": 0.5823, + "step": 9820 + }, + { + "epoch": 3.1730148482892186, + "grad_norm": 0.8531261682510376, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 9830 + }, + { + "epoch": 3.1762427372498387, + "grad_norm": 0.8997558355331421, + "learning_rate": 0.0002, + "loss": 0.57, + "step": 9840 + }, + { + "epoch": 3.1794706262104584, + "grad_norm": 0.708335280418396, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 9850 + }, + { + "epoch": 3.182698515171078, + "grad_norm": 1.0074886083602905, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 9860 + }, + { + "epoch": 3.1859264041316977, + "grad_norm": 1.0804681777954102, + "learning_rate": 0.0002, + "loss": 0.573, + "step": 9870 + }, + { + "epoch": 3.189154293092318, + "grad_norm": 0.9510730504989624, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 9880 + }, + { + "epoch": 3.1923821820529374, + "grad_norm": 0.7211061716079712, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 9890 + }, + { + "epoch": 3.195610071013557, + "grad_norm": 0.8767086267471313, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 9900 + }, + { + "epoch": 3.1988379599741767, + "grad_norm": 0.8388153314590454, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 9910 + }, + { + "epoch": 3.202065848934797, + "grad_norm": 0.8038473725318909, + "learning_rate": 0.0002, + "loss": 0.5681, + "step": 9920 + }, + { + "epoch": 3.2052937378954165, + "grad_norm": 0.8187747001647949, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 9930 + }, + { + "epoch": 3.208521626856036, + "grad_norm": 0.7427355051040649, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 9940 + }, + { + "epoch": 3.211749515816656, + "grad_norm": 0.8017025589942932, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 9950 + }, + { + "epoch": 3.214977404777276, + "grad_norm": 0.738595187664032, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 9960 + }, + { + "epoch": 3.2182052937378955, + "grad_norm": 0.7521342039108276, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 9970 + }, + { + "epoch": 3.221433182698515, + "grad_norm": 0.840329110622406, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 9980 + }, + { + "epoch": 3.224661071659135, + "grad_norm": 0.9809671640396118, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 9990 + }, + { + "epoch": 3.2278889606197545, + "grad_norm": 0.8456943035125732, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 10000 + }, + { + "epoch": 3.2311168495803746, + "grad_norm": 0.8962995409965515, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 10010 + }, + { + "epoch": 3.2343447385409942, + "grad_norm": 0.6492817401885986, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 10020 + }, + { + "epoch": 3.237572627501614, + "grad_norm": 1.0471255779266357, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 10030 + }, + { + "epoch": 3.2408005164622335, + "grad_norm": 0.7995471358299255, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 10040 + }, + { + "epoch": 3.2440284054228536, + "grad_norm": 0.7231964468955994, + "learning_rate": 0.0002, + "loss": 0.615, + "step": 10050 + }, + { + "epoch": 3.2472562943834733, + "grad_norm": 0.639630138874054, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 10060 + }, + { + "epoch": 3.250484183344093, + "grad_norm": 0.7957055568695068, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 10070 + }, + { + "epoch": 3.2537120723047126, + "grad_norm": 0.7735482454299927, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 10080 + }, + { + "epoch": 3.2569399612653323, + "grad_norm": 0.8139488101005554, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 10090 + }, + { + "epoch": 3.2601678502259523, + "grad_norm": 0.8113240003585815, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 10100 + }, + { + "epoch": 3.263395739186572, + "grad_norm": 0.7735909819602966, + "learning_rate": 0.0002, + "loss": 0.5617, + "step": 10110 + }, + { + "epoch": 3.2666236281471916, + "grad_norm": 0.7760744094848633, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 10120 + }, + { + "epoch": 3.2698515171078113, + "grad_norm": 0.8078505396842957, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 10130 + }, + { + "epoch": 3.2730794060684314, + "grad_norm": 0.983648955821991, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 10140 + }, + { + "epoch": 3.276307295029051, + "grad_norm": 0.7131832242012024, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 10150 + }, + { + "epoch": 3.2795351839896707, + "grad_norm": 0.924493134021759, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 10160 + }, + { + "epoch": 3.2827630729502904, + "grad_norm": 0.9371112585067749, + "learning_rate": 0.0002, + "loss": 0.5733, + "step": 10170 + }, + { + "epoch": 3.2859909619109104, + "grad_norm": 0.8989261388778687, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 10180 + }, + { + "epoch": 3.28921885087153, + "grad_norm": 0.8130394816398621, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 10190 + }, + { + "epoch": 3.2924467398321497, + "grad_norm": 0.9899941086769104, + "learning_rate": 0.0002, + "loss": 0.5555, + "step": 10200 + }, + { + "epoch": 3.2956746287927694, + "grad_norm": 1.007038950920105, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 10210 + }, + { + "epoch": 3.2989025177533895, + "grad_norm": 0.7465066313743591, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 10220 + }, + { + "epoch": 3.302130406714009, + "grad_norm": 0.7202590703964233, + "learning_rate": 0.0002, + "loss": 0.6307, + "step": 10230 + }, + { + "epoch": 3.305358295674629, + "grad_norm": 0.6258249282836914, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 10240 + }, + { + "epoch": 3.3085861846352485, + "grad_norm": 0.8996058702468872, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 10250 + }, + { + "epoch": 3.311814073595868, + "grad_norm": 0.9550982713699341, + "learning_rate": 0.0002, + "loss": 0.5825, + "step": 10260 + }, + { + "epoch": 3.315041962556488, + "grad_norm": 0.7010059952735901, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 10270 + }, + { + "epoch": 3.318269851517108, + "grad_norm": 0.9639869332313538, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 10280 + }, + { + "epoch": 3.3214977404777275, + "grad_norm": 1.0192502737045288, + "learning_rate": 0.0002, + "loss": 0.5362, + "step": 10290 + }, + { + "epoch": 3.324725629438347, + "grad_norm": 0.7953670024871826, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 10300 + }, + { + "epoch": 3.3279535183989672, + "grad_norm": 0.7436774969100952, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 10310 + }, + { + "epoch": 3.331181407359587, + "grad_norm": 0.7846777439117432, + "learning_rate": 0.0002, + "loss": 0.5823, + "step": 10320 + }, + { + "epoch": 3.3344092963202066, + "grad_norm": 0.8963494896888733, + "learning_rate": 0.0002, + "loss": 0.6119, + "step": 10330 + }, + { + "epoch": 3.337637185280826, + "grad_norm": 0.6876392364501953, + "learning_rate": 0.0002, + "loss": 0.5872, + "step": 10340 + }, + { + "epoch": 3.340865074241446, + "grad_norm": 0.9161638021469116, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 10350 + }, + { + "epoch": 3.344092963202066, + "grad_norm": 0.8964458107948303, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 10360 + }, + { + "epoch": 3.3473208521626856, + "grad_norm": 0.9052296280860901, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 10370 + }, + { + "epoch": 3.3505487411233053, + "grad_norm": 0.9292596578598022, + "learning_rate": 0.0002, + "loss": 0.5958, + "step": 10380 + }, + { + "epoch": 3.3537766300839253, + "grad_norm": 0.9605957269668579, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 10390 + }, + { + "epoch": 3.357004519044545, + "grad_norm": 1.0198872089385986, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 10400 + }, + { + "epoch": 3.3602324080051647, + "grad_norm": 0.7043630480766296, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 10410 + }, + { + "epoch": 3.3634602969657843, + "grad_norm": 1.0533326864242554, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 10420 + }, + { + "epoch": 3.366688185926404, + "grad_norm": 0.7552485466003418, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 10430 + }, + { + "epoch": 3.369916074887024, + "grad_norm": 0.692708432674408, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 10440 + }, + { + "epoch": 3.3731439638476437, + "grad_norm": 0.985952615737915, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 10450 + }, + { + "epoch": 3.3763718528082634, + "grad_norm": 0.6749676465988159, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 10460 + }, + { + "epoch": 3.379599741768883, + "grad_norm": 0.9514535665512085, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 10470 + }, + { + "epoch": 3.382827630729503, + "grad_norm": 1.2681142091751099, + "learning_rate": 0.0002, + "loss": 0.5982, + "step": 10480 + }, + { + "epoch": 3.3860555196901228, + "grad_norm": 1.031968355178833, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 10490 + }, + { + "epoch": 3.3892834086507424, + "grad_norm": 0.8061563968658447, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 10500 + }, + { + "epoch": 3.392511297611362, + "grad_norm": 1.0515062808990479, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 10510 + }, + { + "epoch": 3.3957391865719817, + "grad_norm": 0.9055540561676025, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 10520 + }, + { + "epoch": 3.398967075532602, + "grad_norm": 0.9318141341209412, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 10530 + }, + { + "epoch": 3.4021949644932215, + "grad_norm": 0.8266817331314087, + "learning_rate": 0.0002, + "loss": 0.5722, + "step": 10540 + }, + { + "epoch": 3.405422853453841, + "grad_norm": 1.2322112321853638, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 10550 + }, + { + "epoch": 3.4086507424144608, + "grad_norm": 0.9535136818885803, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 10560 + }, + { + "epoch": 3.411878631375081, + "grad_norm": 0.9243819117546082, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 10570 + }, + { + "epoch": 3.4151065203357005, + "grad_norm": 0.9011809825897217, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 10580 + }, + { + "epoch": 3.41833440929632, + "grad_norm": 0.9923036694526672, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 10590 + }, + { + "epoch": 3.42156229825694, + "grad_norm": 0.8903067111968994, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 10600 + }, + { + "epoch": 3.42479018721756, + "grad_norm": 0.7101534605026245, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 10610 + }, + { + "epoch": 3.4280180761781796, + "grad_norm": 0.8186570405960083, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 10620 + }, + { + "epoch": 3.431245965138799, + "grad_norm": 0.9480205774307251, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 10630 + }, + { + "epoch": 3.434473854099419, + "grad_norm": 1.1370961666107178, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 10640 + }, + { + "epoch": 3.437701743060039, + "grad_norm": 1.017669677734375, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 10650 + }, + { + "epoch": 3.4409296320206586, + "grad_norm": 0.7625100016593933, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 10660 + }, + { + "epoch": 3.4441575209812783, + "grad_norm": 0.9288196563720703, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 10670 + }, + { + "epoch": 3.447385409941898, + "grad_norm": 0.8800460696220398, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 10680 + }, + { + "epoch": 3.4506132989025176, + "grad_norm": 0.7499661445617676, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 10690 + }, + { + "epoch": 3.4538411878631377, + "grad_norm": 0.8254973292350769, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 10700 + }, + { + "epoch": 3.4570690768237573, + "grad_norm": 0.8735857605934143, + "learning_rate": 0.0002, + "loss": 0.5742, + "step": 10710 + }, + { + "epoch": 3.460296965784377, + "grad_norm": 0.9601819515228271, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 10720 + }, + { + "epoch": 3.4635248547449966, + "grad_norm": 0.8031058311462402, + "learning_rate": 0.0002, + "loss": 0.5574, + "step": 10730 + }, + { + "epoch": 3.4667527437056167, + "grad_norm": 0.8039247393608093, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 10740 + }, + { + "epoch": 3.4699806326662364, + "grad_norm": 0.8936953544616699, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 10750 + }, + { + "epoch": 3.473208521626856, + "grad_norm": 0.8201186060905457, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 10760 + }, + { + "epoch": 3.4764364105874757, + "grad_norm": 1.0064148902893066, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 10770 + }, + { + "epoch": 3.4796642995480953, + "grad_norm": 0.8617483377456665, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 10780 + }, + { + "epoch": 3.4828921885087154, + "grad_norm": 0.8532096147537231, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 10790 + }, + { + "epoch": 3.486120077469335, + "grad_norm": 0.8646879196166992, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 10800 + }, + { + "epoch": 3.4893479664299547, + "grad_norm": 0.7962660789489746, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 10810 + }, + { + "epoch": 3.492575855390575, + "grad_norm": 0.9560028314590454, + "learning_rate": 0.0002, + "loss": 0.5398, + "step": 10820 + }, + { + "epoch": 3.4958037443511945, + "grad_norm": 0.928439736366272, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 10830 + }, + { + "epoch": 3.499031633311814, + "grad_norm": 0.8219282627105713, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 10840 + }, + { + "epoch": 3.5022595222724338, + "grad_norm": 0.7918338179588318, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 10850 + }, + { + "epoch": 3.5054874112330534, + "grad_norm": 0.961295485496521, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 10860 + }, + { + "epoch": 3.5087153001936735, + "grad_norm": 1.0731624364852905, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 10870 + }, + { + "epoch": 3.511943189154293, + "grad_norm": 0.9551863074302673, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 10880 + }, + { + "epoch": 3.515171078114913, + "grad_norm": 0.8409819602966309, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 10890 + }, + { + "epoch": 3.5183989670755325, + "grad_norm": 0.7546320557594299, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 10900 + }, + { + "epoch": 3.5216268560361526, + "grad_norm": 0.7505252361297607, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 10910 + }, + { + "epoch": 3.524854744996772, + "grad_norm": 0.7505561113357544, + "learning_rate": 0.0002, + "loss": 0.5649, + "step": 10920 + }, + { + "epoch": 3.528082633957392, + "grad_norm": 1.086177945137024, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 10930 + }, + { + "epoch": 3.5313105229180115, + "grad_norm": 0.7721118330955505, + "learning_rate": 0.0002, + "loss": 0.5983, + "step": 10940 + }, + { + "epoch": 3.534538411878631, + "grad_norm": 0.9567878246307373, + "learning_rate": 0.0002, + "loss": 0.5919, + "step": 10950 + }, + { + "epoch": 3.5377663008392513, + "grad_norm": 0.8377360105514526, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 10960 + }, + { + "epoch": 3.540994189799871, + "grad_norm": 1.0174858570098877, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 10970 + }, + { + "epoch": 3.5442220787604906, + "grad_norm": 0.8164418935775757, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 10980 + }, + { + "epoch": 3.5474499677211107, + "grad_norm": 0.8959241509437561, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 10990 + }, + { + "epoch": 3.5506778566817303, + "grad_norm": 1.0154379606246948, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 11000 + }, + { + "epoch": 3.55390574564235, + "grad_norm": 0.7812292575836182, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 11010 + }, + { + "epoch": 3.5571336346029696, + "grad_norm": 0.9849029779434204, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 11020 + }, + { + "epoch": 3.5603615235635893, + "grad_norm": 0.8826184272766113, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 11030 + }, + { + "epoch": 3.563589412524209, + "grad_norm": 0.9039685726165771, + "learning_rate": 0.0002, + "loss": 0.601, + "step": 11040 + }, + { + "epoch": 3.566817301484829, + "grad_norm": 0.9585249423980713, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 11050 + }, + { + "epoch": 3.5700451904454487, + "grad_norm": 0.8083069324493408, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 11060 + }, + { + "epoch": 3.5732730794060683, + "grad_norm": 0.9528678059577942, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 11070 + }, + { + "epoch": 3.5765009683666884, + "grad_norm": 0.8297588229179382, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 11080 + }, + { + "epoch": 3.579728857327308, + "grad_norm": 0.8191716074943542, + "learning_rate": 0.0002, + "loss": 0.5919, + "step": 11090 + }, + { + "epoch": 3.5829567462879277, + "grad_norm": 0.8056275844573975, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 11100 + }, + { + "epoch": 3.5861846352485474, + "grad_norm": 0.701930582523346, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 11110 + }, + { + "epoch": 3.589412524209167, + "grad_norm": 0.7644643187522888, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 11120 + }, + { + "epoch": 3.592640413169787, + "grad_norm": 0.668004035949707, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 11130 + }, + { + "epoch": 3.5958683021304068, + "grad_norm": 0.8849539756774902, + "learning_rate": 0.0002, + "loss": 0.5735, + "step": 11140 + }, + { + "epoch": 3.5990961910910264, + "grad_norm": 0.8123571276664734, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 11150 + }, + { + "epoch": 3.602324080051646, + "grad_norm": 0.7591469287872314, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 11160 + }, + { + "epoch": 3.605551969012266, + "grad_norm": 0.776466965675354, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 11170 + }, + { + "epoch": 3.608779857972886, + "grad_norm": 0.9156150221824646, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 11180 + }, + { + "epoch": 3.6120077469335055, + "grad_norm": 0.7517618536949158, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 11190 + }, + { + "epoch": 3.615235635894125, + "grad_norm": 0.931239128112793, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 11200 + }, + { + "epoch": 3.6184635248547448, + "grad_norm": 0.9107872843742371, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 11210 + }, + { + "epoch": 3.621691413815365, + "grad_norm": 0.7624770998954773, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 11220 + }, + { + "epoch": 3.6249193027759845, + "grad_norm": 0.8129580616950989, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 11230 + }, + { + "epoch": 3.628147191736604, + "grad_norm": 0.7339836955070496, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 11240 + }, + { + "epoch": 3.6313750806972243, + "grad_norm": 0.8901296854019165, + "learning_rate": 0.0002, + "loss": 0.5976, + "step": 11250 + }, + { + "epoch": 3.634602969657844, + "grad_norm": 1.1374726295471191, + "learning_rate": 0.0002, + "loss": 0.5977, + "step": 11260 + }, + { + "epoch": 3.6378308586184636, + "grad_norm": 0.7438275218009949, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 11270 + }, + { + "epoch": 3.641058747579083, + "grad_norm": 0.808646559715271, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 11280 + }, + { + "epoch": 3.644286636539703, + "grad_norm": 1.091810941696167, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 11290 + }, + { + "epoch": 3.6475145255003225, + "grad_norm": 0.8439257144927979, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 11300 + }, + { + "epoch": 3.6507424144609426, + "grad_norm": 0.9720633029937744, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 11310 + }, + { + "epoch": 3.6539703034215623, + "grad_norm": 0.738571047782898, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 11320 + }, + { + "epoch": 3.657198192382182, + "grad_norm": 0.6961580514907837, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 11330 + }, + { + "epoch": 3.660426081342802, + "grad_norm": 0.8192131519317627, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 11340 + }, + { + "epoch": 3.6636539703034217, + "grad_norm": 0.8367205858230591, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 11350 + }, + { + "epoch": 3.6668818592640413, + "grad_norm": 0.7735666632652283, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 11360 + }, + { + "epoch": 3.670109748224661, + "grad_norm": 0.6507132649421692, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 11370 + }, + { + "epoch": 3.6733376371852806, + "grad_norm": 0.8271192312240601, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 11380 + }, + { + "epoch": 3.6765655261459007, + "grad_norm": 0.8724204301834106, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 11390 + }, + { + "epoch": 3.6797934151065204, + "grad_norm": 0.8448445200920105, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 11400 + }, + { + "epoch": 3.68302130406714, + "grad_norm": 0.6756882071495056, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 11410 + }, + { + "epoch": 3.68624919302776, + "grad_norm": 0.7859625816345215, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 11420 + }, + { + "epoch": 3.6894770819883798, + "grad_norm": 0.8929487466812134, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 11430 + }, + { + "epoch": 3.6927049709489994, + "grad_norm": 0.8163391351699829, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 11440 + }, + { + "epoch": 3.695932859909619, + "grad_norm": 0.8948464393615723, + "learning_rate": 0.0002, + "loss": 0.6467, + "step": 11450 + }, + { + "epoch": 3.6991607488702387, + "grad_norm": 0.8654782176017761, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 11460 + }, + { + "epoch": 3.7023886378308584, + "grad_norm": 0.9514864683151245, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 11470 + }, + { + "epoch": 3.7056165267914785, + "grad_norm": 0.7298579812049866, + "learning_rate": 0.0002, + "loss": 0.606, + "step": 11480 + }, + { + "epoch": 3.708844415752098, + "grad_norm": 0.9266309142112732, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 11490 + }, + { + "epoch": 3.7120723047127178, + "grad_norm": 0.8608686923980713, + "learning_rate": 0.0002, + "loss": 0.6122, + "step": 11500 + }, + { + "epoch": 3.715300193673338, + "grad_norm": 0.921788215637207, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 11510 + }, + { + "epoch": 3.7185280826339575, + "grad_norm": 0.8537021279335022, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 11520 + }, + { + "epoch": 3.721755971594577, + "grad_norm": 1.115194320678711, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 11530 + }, + { + "epoch": 3.724983860555197, + "grad_norm": 0.7614817023277283, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 11540 + }, + { + "epoch": 3.7282117495158165, + "grad_norm": 0.871999204158783, + "learning_rate": 0.0002, + "loss": 0.5776, + "step": 11550 + }, + { + "epoch": 3.7314396384764366, + "grad_norm": 0.9668049812316895, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 11560 + }, + { + "epoch": 3.734667527437056, + "grad_norm": 1.2185815572738647, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 11570 + }, + { + "epoch": 3.737895416397676, + "grad_norm": 0.8258453011512756, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 11580 + }, + { + "epoch": 3.7411233053582955, + "grad_norm": 0.8708966374397278, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 11590 + }, + { + "epoch": 3.7443511943189156, + "grad_norm": 0.7784267663955688, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 11600 + }, + { + "epoch": 3.7475790832795353, + "grad_norm": 0.7504425048828125, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 11610 + }, + { + "epoch": 3.750806972240155, + "grad_norm": 0.9144526124000549, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 11620 + }, + { + "epoch": 3.7540348612007746, + "grad_norm": 0.922581672668457, + "learning_rate": 0.0002, + "loss": 0.6425, + "step": 11630 + }, + { + "epoch": 3.757262750161394, + "grad_norm": 0.9348630905151367, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 11640 + }, + { + "epoch": 3.7604906391220143, + "grad_norm": 1.0740231275558472, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 11650 + }, + { + "epoch": 3.763718528082634, + "grad_norm": 0.884830117225647, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 11660 + }, + { + "epoch": 3.7669464170432536, + "grad_norm": 1.0256348848342896, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 11670 + }, + { + "epoch": 3.7701743060038737, + "grad_norm": 0.6795592904090881, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 11680 + }, + { + "epoch": 3.7734021949644934, + "grad_norm": 0.9381206631660461, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 11690 + }, + { + "epoch": 3.776630083925113, + "grad_norm": 0.7633092403411865, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 11700 + }, + { + "epoch": 3.7798579728857327, + "grad_norm": 0.7506213188171387, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 11710 + }, + { + "epoch": 3.7830858618463523, + "grad_norm": 0.8182913064956665, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 11720 + }, + { + "epoch": 3.786313750806972, + "grad_norm": 1.019322156906128, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 11730 + }, + { + "epoch": 3.789541639767592, + "grad_norm": 0.8895221948623657, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 11740 + }, + { + "epoch": 3.7927695287282117, + "grad_norm": 0.948847770690918, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 11750 + }, + { + "epoch": 3.7959974176888314, + "grad_norm": 0.9068999886512756, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 11760 + }, + { + "epoch": 3.7992253066494515, + "grad_norm": 0.7920539975166321, + "learning_rate": 0.0002, + "loss": 0.6163, + "step": 11770 + }, + { + "epoch": 3.802453195610071, + "grad_norm": 0.8441922068595886, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 11780 + }, + { + "epoch": 3.8056810845706908, + "grad_norm": 0.9258501529693604, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 11790 + }, + { + "epoch": 3.8089089735313104, + "grad_norm": 0.7354241609573364, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 11800 + }, + { + "epoch": 3.81213686249193, + "grad_norm": 0.9494872689247131, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 11810 + }, + { + "epoch": 3.81536475145255, + "grad_norm": 0.8266556859016418, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 11820 + }, + { + "epoch": 3.81859264041317, + "grad_norm": 0.7951219081878662, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 11830 + }, + { + "epoch": 3.8218205293737895, + "grad_norm": 0.7688382267951965, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 11840 + }, + { + "epoch": 3.8250484183344096, + "grad_norm": 1.0917940139770508, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 11850 + }, + { + "epoch": 3.828276307295029, + "grad_norm": 0.9880442023277283, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 11860 + }, + { + "epoch": 3.831504196255649, + "grad_norm": 0.8433151245117188, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 11870 + }, + { + "epoch": 3.8347320852162685, + "grad_norm": 0.8691204786300659, + "learning_rate": 0.0002, + "loss": 0.5876, + "step": 11880 + }, + { + "epoch": 3.837959974176888, + "grad_norm": 0.7698143124580383, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 11890 + }, + { + "epoch": 3.841187863137508, + "grad_norm": 0.8874883651733398, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 11900 + }, + { + "epoch": 3.844415752098128, + "grad_norm": 1.1209359169006348, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 11910 + }, + { + "epoch": 3.8476436410587476, + "grad_norm": 0.7723544239997864, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 11920 + }, + { + "epoch": 3.850871530019367, + "grad_norm": 0.8363937139511108, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 11930 + }, + { + "epoch": 3.8540994189799873, + "grad_norm": 0.9209707975387573, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 11940 + }, + { + "epoch": 3.857327307940607, + "grad_norm": 0.9456894993782043, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 11950 + }, + { + "epoch": 3.8605551969012266, + "grad_norm": 1.5748413801193237, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 11960 + }, + { + "epoch": 3.8637830858618463, + "grad_norm": 0.9083569049835205, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 11970 + }, + { + "epoch": 3.867010974822466, + "grad_norm": 0.7672823071479797, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 11980 + }, + { + "epoch": 3.870238863783086, + "grad_norm": 0.8647152185440063, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 11990 + }, + { + "epoch": 3.8734667527437057, + "grad_norm": 0.9564255475997925, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 12000 + }, + { + "epoch": 3.8766946417043253, + "grad_norm": 0.773267924785614, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 12010 + }, + { + "epoch": 3.879922530664945, + "grad_norm": 0.8030173182487488, + "learning_rate": 0.0002, + "loss": 0.6057, + "step": 12020 + }, + { + "epoch": 3.883150419625565, + "grad_norm": 0.8002150058746338, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 12030 + }, + { + "epoch": 3.8863783085861847, + "grad_norm": 0.98802250623703, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 12040 + }, + { + "epoch": 3.8896061975468044, + "grad_norm": 0.7868124842643738, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 12050 + }, + { + "epoch": 3.892834086507424, + "grad_norm": 0.932182788848877, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 12060 + }, + { + "epoch": 3.8960619754680437, + "grad_norm": 0.8576806783676147, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 12070 + }, + { + "epoch": 3.8992898644286638, + "grad_norm": 0.8985713124275208, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 12080 + }, + { + "epoch": 3.9025177533892834, + "grad_norm": 0.7876521944999695, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 12090 + }, + { + "epoch": 3.905745642349903, + "grad_norm": 0.773936927318573, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 12100 + }, + { + "epoch": 3.908973531310523, + "grad_norm": 0.7274761199951172, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 12110 + }, + { + "epoch": 3.912201420271143, + "grad_norm": 0.8625598549842834, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 12120 + }, + { + "epoch": 3.9154293092317625, + "grad_norm": 0.8702362179756165, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 12130 + }, + { + "epoch": 3.918657198192382, + "grad_norm": 0.912579357624054, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 12140 + }, + { + "epoch": 3.9218850871530018, + "grad_norm": 0.8697066903114319, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 12150 + }, + { + "epoch": 3.9251129761136214, + "grad_norm": 1.005232572555542, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 12160 + }, + { + "epoch": 3.9283408650742415, + "grad_norm": 0.793902575969696, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 12170 + }, + { + "epoch": 3.931568754034861, + "grad_norm": 0.7025905847549438, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 12180 + }, + { + "epoch": 3.934796642995481, + "grad_norm": 0.97635817527771, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 12190 + }, + { + "epoch": 3.938024531956101, + "grad_norm": 0.855417013168335, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 12200 + }, + { + "epoch": 3.9412524209167206, + "grad_norm": 0.8841291666030884, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 12210 + }, + { + "epoch": 3.94448030987734, + "grad_norm": 1.1762064695358276, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 12220 + }, + { + "epoch": 3.94770819883796, + "grad_norm": 0.8393193483352661, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 12230 + }, + { + "epoch": 3.9509360877985795, + "grad_norm": 0.9324905276298523, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 12240 + }, + { + "epoch": 3.9541639767591996, + "grad_norm": 0.8607982993125916, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 12250 + }, + { + "epoch": 3.9573918657198193, + "grad_norm": 0.8586681485176086, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 12260 + }, + { + "epoch": 3.960619754680439, + "grad_norm": 1.1082909107208252, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 12270 + }, + { + "epoch": 3.963847643641059, + "grad_norm": 1.065027117729187, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 12280 + }, + { + "epoch": 3.9670755326016787, + "grad_norm": 0.9544363021850586, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 12290 + }, + { + "epoch": 3.9703034215622983, + "grad_norm": 0.9008927345275879, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 12300 + }, + { + "epoch": 3.973531310522918, + "grad_norm": 0.8717467188835144, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 12310 + }, + { + "epoch": 3.9767591994835376, + "grad_norm": 0.9718339443206787, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 12320 + }, + { + "epoch": 3.9799870884441573, + "grad_norm": 1.0362015962600708, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 12330 + }, + { + "epoch": 3.9832149774047774, + "grad_norm": 1.0844318866729736, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 12340 + }, + { + "epoch": 3.986442866365397, + "grad_norm": 0.7506240606307983, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 12350 + }, + { + "epoch": 3.9896707553260167, + "grad_norm": 1.005982756614685, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 12360 + }, + { + "epoch": 3.9928986442866368, + "grad_norm": 0.7566431164741516, + "learning_rate": 0.0002, + "loss": 0.5926, + "step": 12370 + }, + { + "epoch": 3.9961265332472564, + "grad_norm": 0.8819181323051453, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 12380 + }, + { + "epoch": 3.999354422207876, + "grad_norm": 0.884497880935669, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 12390 + }, + { + "epoch": 4.0, + "eval_loss": 1.1907150745391846, + "eval_runtime": 161.5766, + "eval_samples_per_second": 4.537, + "eval_steps_per_second": 0.569, + "step": 12392 + } + ], + "logging_steps": 10, + "max_steps": 24784, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.734740131881943e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f46f2b8e8752b125339f36f172c3878be4cdb152 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-12392/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfc2a69e44a51edf5586ebed4b7ee915a23244c18c1f59e580471e4c9becfa98 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d878f77f43cd98a559bc0359c3e9820f030be242 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dc3520742eabe5c0adf4f49b0179b0af27ebf110b3bf827c217d1488cb35fa3 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e017f35c65e2cfa9184ff399898b7d1e886083f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f68ab2e0259281587db7900d00a3323b8f5acb1e1e3ef32758cde9da57e655c4 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9e7286bc4c9f2a978050df674f33c30f10a13b5 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0e404560fa33934d836e487cb33ba85efe12767ce547d115c0fe6bf1f277512 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..99c62021b5d3a61f8b332cd16d40c1d475481bc1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cc89e179b3c3f9f228ae22777e7ba2007ea1132fc9274b300842b2a89da1732 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..151557996658e8516a5579ef61ad1a9d63f11756 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/trainer_state.json @@ -0,0 +1,10916 @@ +{ + "best_metric": 1.0958120822906494, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098", + "epoch": 5.0, + "eval_steps": 10, + "global_step": 15490, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032278889606197547, + "grad_norm": 0.7092075347900391, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 10 + }, + { + "epoch": 0.006455777921239509, + "grad_norm": 0.6900479793548584, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 20 + }, + { + "epoch": 0.009683666881859263, + "grad_norm": 0.6788288950920105, + "learning_rate": 0.0002, + "loss": 0.9807, + "step": 30 + }, + { + "epoch": 0.012911555842479019, + "grad_norm": 0.5590243339538574, + "learning_rate": 0.0002, + "loss": 0.9385, + "step": 40 + }, + { + "epoch": 0.016139444803098774, + "grad_norm": 0.5136010646820068, + "learning_rate": 0.0002, + "loss": 0.931, + "step": 50 + }, + { + "epoch": 0.019367333763718526, + "grad_norm": 0.45298320055007935, + "learning_rate": 0.0002, + "loss": 0.8896, + "step": 60 + }, + { + "epoch": 0.022595222724338282, + "grad_norm": 0.5917162299156189, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 70 + }, + { + "epoch": 0.025823111684958037, + "grad_norm": 0.4414856433868408, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 80 + }, + { + "epoch": 0.029051000645577793, + "grad_norm": 0.5547978281974792, + "learning_rate": 0.0002, + "loss": 0.8419, + "step": 90 + }, + { + "epoch": 0.03227888960619755, + "grad_norm": 0.5271288156509399, + "learning_rate": 0.0002, + "loss": 0.8987, + "step": 100 + }, + { + "epoch": 0.035506778566817304, + "grad_norm": 0.5506119728088379, + "learning_rate": 0.0002, + "loss": 0.8543, + "step": 110 + }, + { + "epoch": 0.03873466752743705, + "grad_norm": 0.5579327940940857, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 120 + }, + { + "epoch": 0.04196255648805681, + "grad_norm": 0.5099632740020752, + "learning_rate": 0.0002, + "loss": 0.8826, + "step": 130 + }, + { + "epoch": 0.045190445448676564, + "grad_norm": 0.40396833419799805, + "learning_rate": 0.0002, + "loss": 0.9239, + "step": 140 + }, + { + "epoch": 0.04841833440929632, + "grad_norm": 0.5008092522621155, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 150 + }, + { + "epoch": 0.051646223369916075, + "grad_norm": 0.4388776421546936, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 160 + }, + { + "epoch": 0.05487411233053583, + "grad_norm": 0.44138944149017334, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 170 + }, + { + "epoch": 0.058102001291155586, + "grad_norm": 0.358484148979187, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 180 + }, + { + "epoch": 0.06132989025177534, + "grad_norm": 0.457052081823349, + "learning_rate": 0.0002, + "loss": 0.8956, + "step": 190 + }, + { + "epoch": 0.0645577792123951, + "grad_norm": 0.5537622570991516, + "learning_rate": 0.0002, + "loss": 0.9138, + "step": 200 + }, + { + "epoch": 0.06778566817301485, + "grad_norm": 0.552631676197052, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 210 + }, + { + "epoch": 0.07101355713363461, + "grad_norm": 0.4414575397968292, + "learning_rate": 0.0002, + "loss": 0.8854, + "step": 220 + }, + { + "epoch": 0.07424144609425436, + "grad_norm": 0.4996664226055145, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 230 + }, + { + "epoch": 0.0774693350548741, + "grad_norm": 0.7321897149085999, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 240 + }, + { + "epoch": 0.08069722401549387, + "grad_norm": 0.4553901255130768, + "learning_rate": 0.0002, + "loss": 0.8848, + "step": 250 + }, + { + "epoch": 0.08392511297611362, + "grad_norm": 0.5039054751396179, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 260 + }, + { + "epoch": 0.08715300193673338, + "grad_norm": 0.4113094210624695, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 270 + }, + { + "epoch": 0.09038089089735313, + "grad_norm": 0.450436532497406, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 280 + }, + { + "epoch": 0.09360877985797289, + "grad_norm": 0.4548024535179138, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 290 + }, + { + "epoch": 0.09683666881859264, + "grad_norm": 0.4932962656021118, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 300 + }, + { + "epoch": 0.1000645577792124, + "grad_norm": 0.4005250334739685, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 310 + }, + { + "epoch": 0.10329244673983215, + "grad_norm": 1.8321624994277954, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 320 + }, + { + "epoch": 0.1065203357004519, + "grad_norm": 0.45815610885620117, + "learning_rate": 0.0002, + "loss": 0.8411, + "step": 330 + }, + { + "epoch": 0.10974822466107166, + "grad_norm": 0.39324095845222473, + "learning_rate": 0.0002, + "loss": 0.857, + "step": 340 + }, + { + "epoch": 0.11297611362169141, + "grad_norm": 0.546273946762085, + "learning_rate": 0.0002, + "loss": 0.8258, + "step": 350 + }, + { + "epoch": 0.11620400258231117, + "grad_norm": 0.497448593378067, + "learning_rate": 0.0002, + "loss": 0.882, + "step": 360 + }, + { + "epoch": 0.11943189154293092, + "grad_norm": 0.37508800625801086, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 370 + }, + { + "epoch": 0.12265978050355068, + "grad_norm": 0.45849609375, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 380 + }, + { + "epoch": 0.12588766946417043, + "grad_norm": 0.5488408803939819, + "learning_rate": 0.0002, + "loss": 0.8437, + "step": 390 + }, + { + "epoch": 0.1291155584247902, + "grad_norm": 0.4477061331272125, + "learning_rate": 0.0002, + "loss": 0.8349, + "step": 400 + }, + { + "epoch": 0.13234344738540993, + "grad_norm": 0.39227980375289917, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 410 + }, + { + "epoch": 0.1355713363460297, + "grad_norm": 0.3922233581542969, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 420 + }, + { + "epoch": 0.13879922530664945, + "grad_norm": 0.42901909351348877, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 430 + }, + { + "epoch": 0.14202711426726922, + "grad_norm": 0.4217798709869385, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 440 + }, + { + "epoch": 0.14525500322788895, + "grad_norm": 0.43470677733421326, + "learning_rate": 0.0002, + "loss": 0.8594, + "step": 450 + }, + { + "epoch": 0.1484828921885087, + "grad_norm": 0.5324403047561646, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 460 + }, + { + "epoch": 0.15171078114912848, + "grad_norm": 0.3999756872653961, + "learning_rate": 0.0002, + "loss": 0.8729, + "step": 470 + }, + { + "epoch": 0.1549386701097482, + "grad_norm": 0.404933363199234, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 480 + }, + { + "epoch": 0.15816655907036797, + "grad_norm": 0.44122636318206787, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 490 + }, + { + "epoch": 0.16139444803098774, + "grad_norm": 0.510166347026825, + "learning_rate": 0.0002, + "loss": 0.8457, + "step": 500 + }, + { + "epoch": 0.1646223369916075, + "grad_norm": 0.4549732506275177, + "learning_rate": 0.0002, + "loss": 0.8692, + "step": 510 + }, + { + "epoch": 0.16785022595222723, + "grad_norm": 0.5148182511329651, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 520 + }, + { + "epoch": 0.171078114912847, + "grad_norm": 0.3596806824207306, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 530 + }, + { + "epoch": 0.17430600387346676, + "grad_norm": 0.4388909339904785, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 540 + }, + { + "epoch": 0.17753389283408652, + "grad_norm": 0.5052742958068848, + "learning_rate": 0.0002, + "loss": 0.8322, + "step": 550 + }, + { + "epoch": 0.18076178179470626, + "grad_norm": 0.48248958587646484, + "learning_rate": 0.0002, + "loss": 0.791, + "step": 560 + }, + { + "epoch": 0.18398967075532602, + "grad_norm": 0.5360197424888611, + "learning_rate": 0.0002, + "loss": 0.8593, + "step": 570 + }, + { + "epoch": 0.18721755971594578, + "grad_norm": 0.43999341130256653, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 580 + }, + { + "epoch": 0.19044544867656552, + "grad_norm": 0.3685208261013031, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 590 + }, + { + "epoch": 0.19367333763718528, + "grad_norm": 0.4601275622844696, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 600 + }, + { + "epoch": 0.19690122659780504, + "grad_norm": 0.4778369665145874, + "learning_rate": 0.0002, + "loss": 0.8483, + "step": 610 + }, + { + "epoch": 0.2001291155584248, + "grad_norm": 0.4867003560066223, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 620 + }, + { + "epoch": 0.20335700451904454, + "grad_norm": 0.4583742916584015, + "learning_rate": 0.0002, + "loss": 0.8554, + "step": 630 + }, + { + "epoch": 0.2065848934796643, + "grad_norm": 0.47958165407180786, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 640 + }, + { + "epoch": 0.20981278244028406, + "grad_norm": 0.4526064097881317, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 650 + }, + { + "epoch": 0.2130406714009038, + "grad_norm": 0.45890581607818604, + "learning_rate": 0.0002, + "loss": 0.8313, + "step": 660 + }, + { + "epoch": 0.21626856036152356, + "grad_norm": 0.42725905776023865, + "learning_rate": 0.0002, + "loss": 0.8143, + "step": 670 + }, + { + "epoch": 0.21949644932214332, + "grad_norm": 0.40380963683128357, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 680 + }, + { + "epoch": 0.22272433828276308, + "grad_norm": 0.4372998774051666, + "learning_rate": 0.0002, + "loss": 0.9004, + "step": 690 + }, + { + "epoch": 0.22595222724338282, + "grad_norm": 0.4245864450931549, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 700 + }, + { + "epoch": 0.22918011620400258, + "grad_norm": 0.4061129689216614, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 710 + }, + { + "epoch": 0.23240800516462234, + "grad_norm": 0.474454790353775, + "learning_rate": 0.0002, + "loss": 0.8275, + "step": 720 + }, + { + "epoch": 0.23563589412524208, + "grad_norm": 0.4908486008644104, + "learning_rate": 0.0002, + "loss": 0.8346, + "step": 730 + }, + { + "epoch": 0.23886378308586184, + "grad_norm": 0.4284191429615021, + "learning_rate": 0.0002, + "loss": 0.8755, + "step": 740 + }, + { + "epoch": 0.2420916720464816, + "grad_norm": 0.44730308651924133, + "learning_rate": 0.0002, + "loss": 0.8387, + "step": 750 + }, + { + "epoch": 0.24531956100710137, + "grad_norm": 0.4433246850967407, + "learning_rate": 0.0002, + "loss": 0.8135, + "step": 760 + }, + { + "epoch": 0.2485474499677211, + "grad_norm": 0.43668854236602783, + "learning_rate": 0.0002, + "loss": 0.8644, + "step": 770 + }, + { + "epoch": 0.25177533892834086, + "grad_norm": 0.34324130415916443, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 780 + }, + { + "epoch": 0.2550032278889606, + "grad_norm": 0.46476295590400696, + "learning_rate": 0.0002, + "loss": 0.8725, + "step": 790 + }, + { + "epoch": 0.2582311168495804, + "grad_norm": 0.5047039985656738, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 800 + }, + { + "epoch": 0.26145900581020015, + "grad_norm": 0.4402127265930176, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 810 + }, + { + "epoch": 0.26468689477081986, + "grad_norm": 0.4642465114593506, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 820 + }, + { + "epoch": 0.2679147837314396, + "grad_norm": 0.40093424916267395, + "learning_rate": 0.0002, + "loss": 0.8836, + "step": 830 + }, + { + "epoch": 0.2711426726920594, + "grad_norm": 0.42501842975616455, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 840 + }, + { + "epoch": 0.27437056165267915, + "grad_norm": 0.43279722332954407, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 850 + }, + { + "epoch": 0.2775984506132989, + "grad_norm": 0.5991243720054626, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 860 + }, + { + "epoch": 0.28082633957391867, + "grad_norm": 0.4217848777770996, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 870 + }, + { + "epoch": 0.28405422853453843, + "grad_norm": 0.3933536410331726, + "learning_rate": 0.0002, + "loss": 0.8135, + "step": 880 + }, + { + "epoch": 0.28728211749515814, + "grad_norm": 0.5868505239486694, + "learning_rate": 0.0002, + "loss": 0.8846, + "step": 890 + }, + { + "epoch": 0.2905100064557779, + "grad_norm": 0.5209547877311707, + "learning_rate": 0.0002, + "loss": 0.8759, + "step": 900 + }, + { + "epoch": 0.29373789541639767, + "grad_norm": 0.49307361245155334, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 910 + }, + { + "epoch": 0.2969657843770174, + "grad_norm": 0.4288382828235626, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 920 + }, + { + "epoch": 0.3001936733376372, + "grad_norm": 0.33568474650382996, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 930 + }, + { + "epoch": 0.30342156229825695, + "grad_norm": 1.0915930271148682, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 940 + }, + { + "epoch": 0.3066494512588767, + "grad_norm": 0.5489798188209534, + "learning_rate": 0.0002, + "loss": 0.8535, + "step": 950 + }, + { + "epoch": 0.3098773402194964, + "grad_norm": 0.42971742153167725, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 960 + }, + { + "epoch": 0.3131052291801162, + "grad_norm": 0.43375834822654724, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 970 + }, + { + "epoch": 0.31633311814073595, + "grad_norm": 0.47488611936569214, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 980 + }, + { + "epoch": 0.3195610071013557, + "grad_norm": 0.46296775341033936, + "learning_rate": 0.0002, + "loss": 0.7906, + "step": 990 + }, + { + "epoch": 0.32278889606197547, + "grad_norm": 0.4548890292644501, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 1000 + }, + { + "epoch": 0.32601678502259523, + "grad_norm": 0.41834497451782227, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 1010 + }, + { + "epoch": 0.329244673983215, + "grad_norm": 0.441092312335968, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 1020 + }, + { + "epoch": 0.33247256294383476, + "grad_norm": 0.637322187423706, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 1030 + }, + { + "epoch": 0.33570045190445447, + "grad_norm": 0.4374958574771881, + "learning_rate": 0.0002, + "loss": 0.8685, + "step": 1040 + }, + { + "epoch": 0.33892834086507423, + "grad_norm": 0.3935825824737549, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1050 + }, + { + "epoch": 0.342156229825694, + "grad_norm": 0.43526220321655273, + "learning_rate": 0.0002, + "loss": 0.8287, + "step": 1060 + }, + { + "epoch": 0.34538411878631375, + "grad_norm": 0.45327696204185486, + "learning_rate": 0.0002, + "loss": 0.8413, + "step": 1070 + }, + { + "epoch": 0.3486120077469335, + "grad_norm": 0.4126075506210327, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 1080 + }, + { + "epoch": 0.3518398967075533, + "grad_norm": 0.4714072048664093, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 1090 + }, + { + "epoch": 0.35506778566817304, + "grad_norm": 0.518127977848053, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 1100 + }, + { + "epoch": 0.35829567462879275, + "grad_norm": 0.43264099955558777, + "learning_rate": 0.0002, + "loss": 0.8479, + "step": 1110 + }, + { + "epoch": 0.3615235635894125, + "grad_norm": 0.4857400357723236, + "learning_rate": 0.0002, + "loss": 0.8724, + "step": 1120 + }, + { + "epoch": 0.3647514525500323, + "grad_norm": 0.37591469287872314, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 1130 + }, + { + "epoch": 0.36797934151065204, + "grad_norm": 0.4165478050708771, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 1140 + }, + { + "epoch": 0.3712072304712718, + "grad_norm": 0.42911383509635925, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 1150 + }, + { + "epoch": 0.37443511943189156, + "grad_norm": 0.44980287551879883, + "learning_rate": 0.0002, + "loss": 0.8722, + "step": 1160 + }, + { + "epoch": 0.3776630083925113, + "grad_norm": 0.4066573679447174, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 1170 + }, + { + "epoch": 0.38089089735313103, + "grad_norm": 0.5056195855140686, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 1180 + }, + { + "epoch": 0.3841187863137508, + "grad_norm": 0.4141536355018616, + "learning_rate": 0.0002, + "loss": 0.8387, + "step": 1190 + }, + { + "epoch": 0.38734667527437056, + "grad_norm": 0.4501924514770508, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 1200 + }, + { + "epoch": 0.3905745642349903, + "grad_norm": 0.43304240703582764, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 1210 + }, + { + "epoch": 0.3938024531956101, + "grad_norm": 0.475777804851532, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 1220 + }, + { + "epoch": 0.39703034215622984, + "grad_norm": 0.5846465826034546, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 1230 + }, + { + "epoch": 0.4002582311168496, + "grad_norm": 0.42899325489997864, + "learning_rate": 0.0002, + "loss": 0.8078, + "step": 1240 + }, + { + "epoch": 0.4034861200774693, + "grad_norm": 0.3980463147163391, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 1250 + }, + { + "epoch": 0.4067140090380891, + "grad_norm": 0.45769768953323364, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 1260 + }, + { + "epoch": 0.40994189799870884, + "grad_norm": 0.5101280212402344, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 1270 + }, + { + "epoch": 0.4131697869593286, + "grad_norm": 0.47374317049980164, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 1280 + }, + { + "epoch": 0.41639767591994836, + "grad_norm": 0.4261878728866577, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 1290 + }, + { + "epoch": 0.4196255648805681, + "grad_norm": 0.46954256296157837, + "learning_rate": 0.0002, + "loss": 0.9004, + "step": 1300 + }, + { + "epoch": 0.4228534538411879, + "grad_norm": 0.5205738544464111, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 1310 + }, + { + "epoch": 0.4260813428018076, + "grad_norm": 0.5176340937614441, + "learning_rate": 0.0002, + "loss": 0.8964, + "step": 1320 + }, + { + "epoch": 0.42930923176242736, + "grad_norm": 0.5155916810035706, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 1330 + }, + { + "epoch": 0.4325371207230471, + "grad_norm": 0.44548553228378296, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 1340 + }, + { + "epoch": 0.4357650096836669, + "grad_norm": 0.5633558630943298, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 1350 + }, + { + "epoch": 0.43899289864428664, + "grad_norm": 0.42444056272506714, + "learning_rate": 0.0002, + "loss": 0.7889, + "step": 1360 + }, + { + "epoch": 0.4422207876049064, + "grad_norm": 0.5226860642433167, + "learning_rate": 0.0002, + "loss": 0.8588, + "step": 1370 + }, + { + "epoch": 0.44544867656552617, + "grad_norm": 0.5354582071304321, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 1380 + }, + { + "epoch": 0.4486765655261459, + "grad_norm": 0.472646564245224, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 1390 + }, + { + "epoch": 0.45190445448676564, + "grad_norm": 0.6312310099601746, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 1400 + }, + { + "epoch": 0.4551323434473854, + "grad_norm": 0.4298408031463623, + "learning_rate": 0.0002, + "loss": 0.8212, + "step": 1410 + }, + { + "epoch": 0.45836023240800516, + "grad_norm": 0.43427202105522156, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 1420 + }, + { + "epoch": 0.4615881213686249, + "grad_norm": 0.44097861647605896, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 1430 + }, + { + "epoch": 0.4648160103292447, + "grad_norm": 0.5142693519592285, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1440 + }, + { + "epoch": 0.46804389928986445, + "grad_norm": 0.46416547894477844, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 1450 + }, + { + "epoch": 0.47127178825048416, + "grad_norm": 0.4858551025390625, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 1460 + }, + { + "epoch": 0.4744996772111039, + "grad_norm": 0.4709177315235138, + "learning_rate": 0.0002, + "loss": 0.8354, + "step": 1470 + }, + { + "epoch": 0.4777275661717237, + "grad_norm": 0.5500252842903137, + "learning_rate": 0.0002, + "loss": 0.8391, + "step": 1480 + }, + { + "epoch": 0.48095545513234345, + "grad_norm": 0.43364381790161133, + "learning_rate": 0.0002, + "loss": 0.8359, + "step": 1490 + }, + { + "epoch": 0.4841833440929632, + "grad_norm": 0.47712287306785583, + "learning_rate": 0.0002, + "loss": 0.8446, + "step": 1500 + }, + { + "epoch": 0.48741123305358297, + "grad_norm": 0.4518495202064514, + "learning_rate": 0.0002, + "loss": 0.8518, + "step": 1510 + }, + { + "epoch": 0.49063912201420273, + "grad_norm": 0.4539008140563965, + "learning_rate": 0.0002, + "loss": 0.819, + "step": 1520 + }, + { + "epoch": 0.49386701097482244, + "grad_norm": 0.4993067979812622, + "learning_rate": 0.0002, + "loss": 0.8276, + "step": 1530 + }, + { + "epoch": 0.4970948999354422, + "grad_norm": 0.6094803214073181, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 1540 + }, + { + "epoch": 0.500322788896062, + "grad_norm": 0.48602527379989624, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 1550 + }, + { + "epoch": 0.5035506778566817, + "grad_norm": 0.40245795249938965, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 1560 + }, + { + "epoch": 0.5067785668173015, + "grad_norm": 0.456787645816803, + "learning_rate": 0.0002, + "loss": 0.7907, + "step": 1570 + }, + { + "epoch": 0.5100064557779213, + "grad_norm": 0.43936216831207275, + "learning_rate": 0.0002, + "loss": 0.86, + "step": 1580 + }, + { + "epoch": 0.513234344738541, + "grad_norm": 0.549018144607544, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 1590 + }, + { + "epoch": 0.5164622336991608, + "grad_norm": 0.41746795177459717, + "learning_rate": 0.0002, + "loss": 0.8169, + "step": 1600 + }, + { + "epoch": 0.5196901226597805, + "grad_norm": 0.4217053949832916, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 1610 + }, + { + "epoch": 0.5229180116204003, + "grad_norm": 0.449913889169693, + "learning_rate": 0.0002, + "loss": 0.8161, + "step": 1620 + }, + { + "epoch": 0.5261459005810201, + "grad_norm": 0.5084872245788574, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 1630 + }, + { + "epoch": 0.5293737895416397, + "grad_norm": 0.46248653531074524, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 1640 + }, + { + "epoch": 0.5326016785022595, + "grad_norm": 0.4824236035346985, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 1650 + }, + { + "epoch": 0.5358295674628792, + "grad_norm": 0.6010985374450684, + "learning_rate": 0.0002, + "loss": 0.8711, + "step": 1660 + }, + { + "epoch": 0.539057456423499, + "grad_norm": 0.4757920801639557, + "learning_rate": 0.0002, + "loss": 0.8266, + "step": 1670 + }, + { + "epoch": 0.5422853453841188, + "grad_norm": 0.45161882042884827, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 1680 + }, + { + "epoch": 0.5455132343447385, + "grad_norm": 0.49314990639686584, + "learning_rate": 0.0002, + "loss": 0.8141, + "step": 1690 + }, + { + "epoch": 0.5487411233053583, + "grad_norm": 0.3918305039405823, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 1700 + }, + { + "epoch": 0.551969012265978, + "grad_norm": 0.5966728925704956, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 1710 + }, + { + "epoch": 0.5551969012265978, + "grad_norm": 0.4208986163139343, + "learning_rate": 0.0002, + "loss": 0.8438, + "step": 1720 + }, + { + "epoch": 0.5584247901872176, + "grad_norm": 0.43724218010902405, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 1730 + }, + { + "epoch": 0.5616526791478373, + "grad_norm": 0.5287272930145264, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 1740 + }, + { + "epoch": 0.5648805681084571, + "grad_norm": 0.4961899518966675, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 1750 + }, + { + "epoch": 0.5681084570690769, + "grad_norm": 0.4468635320663452, + "learning_rate": 0.0002, + "loss": 0.8029, + "step": 1760 + }, + { + "epoch": 0.5713363460296966, + "grad_norm": 0.6423530578613281, + "learning_rate": 0.0002, + "loss": 0.7968, + "step": 1770 + }, + { + "epoch": 0.5745642349903163, + "grad_norm": 0.4601971507072449, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 1780 + }, + { + "epoch": 0.577792123950936, + "grad_norm": 0.46514901518821716, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 1790 + }, + { + "epoch": 0.5810200129115558, + "grad_norm": 0.4771687388420105, + "learning_rate": 0.0002, + "loss": 0.8186, + "step": 1800 + }, + { + "epoch": 0.5842479018721756, + "grad_norm": 0.46514490246772766, + "learning_rate": 0.0002, + "loss": 0.856, + "step": 1810 + }, + { + "epoch": 0.5874757908327953, + "grad_norm": 0.5373936295509338, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 1820 + }, + { + "epoch": 0.5907036797934151, + "grad_norm": 0.5175791382789612, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 1830 + }, + { + "epoch": 0.5939315687540349, + "grad_norm": 0.4522802233695984, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 1840 + }, + { + "epoch": 0.5971594577146546, + "grad_norm": 0.42987772822380066, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 1850 + }, + { + "epoch": 0.6003873466752744, + "grad_norm": 0.5566838383674622, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 1860 + }, + { + "epoch": 0.6036152356358941, + "grad_norm": 0.42807698249816895, + "learning_rate": 0.0002, + "loss": 0.8312, + "step": 1870 + }, + { + "epoch": 0.6068431245965139, + "grad_norm": 0.4957767724990845, + "learning_rate": 0.0002, + "loss": 0.8035, + "step": 1880 + }, + { + "epoch": 0.6100710135571337, + "grad_norm": 0.4260980188846588, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 1890 + }, + { + "epoch": 0.6132989025177534, + "grad_norm": 0.4777357876300812, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 1900 + }, + { + "epoch": 0.6165267914783732, + "grad_norm": 0.4434216022491455, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 1910 + }, + { + "epoch": 0.6197546804389928, + "grad_norm": 0.5215433835983276, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 1920 + }, + { + "epoch": 0.6229825693996126, + "grad_norm": 0.5143248438835144, + "learning_rate": 0.0002, + "loss": 0.82, + "step": 1930 + }, + { + "epoch": 0.6262104583602324, + "grad_norm": 0.5213413238525391, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 1940 + }, + { + "epoch": 0.6294383473208521, + "grad_norm": 0.5408226251602173, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 1950 + }, + { + "epoch": 0.6326662362814719, + "grad_norm": 0.5479708909988403, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 1960 + }, + { + "epoch": 0.6358941252420917, + "grad_norm": 0.4490949809551239, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 1970 + }, + { + "epoch": 0.6391220142027114, + "grad_norm": 0.48815059661865234, + "learning_rate": 0.0002, + "loss": 0.854, + "step": 1980 + }, + { + "epoch": 0.6423499031633312, + "grad_norm": 0.46498045325279236, + "learning_rate": 0.0002, + "loss": 0.8568, + "step": 1990 + }, + { + "epoch": 0.6455777921239509, + "grad_norm": 0.5136561393737793, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 2000 + }, + { + "epoch": 0.6488056810845707, + "grad_norm": 0.5145719647407532, + "learning_rate": 0.0002, + "loss": 0.8503, + "step": 2010 + }, + { + "epoch": 0.6520335700451905, + "grad_norm": 0.5430373549461365, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 2020 + }, + { + "epoch": 0.6552614590058102, + "grad_norm": 0.46347954869270325, + "learning_rate": 0.0002, + "loss": 0.8115, + "step": 2030 + }, + { + "epoch": 0.65848934796643, + "grad_norm": 0.5189562439918518, + "learning_rate": 0.0002, + "loss": 0.8769, + "step": 2040 + }, + { + "epoch": 0.6617172369270498, + "grad_norm": 0.43843990564346313, + "learning_rate": 0.0002, + "loss": 0.8453, + "step": 2050 + }, + { + "epoch": 0.6649451258876695, + "grad_norm": 0.4654983580112457, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 2060 + }, + { + "epoch": 0.6681730148482892, + "grad_norm": 0.44835716485977173, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 2070 + }, + { + "epoch": 0.6714009038089089, + "grad_norm": 0.38811734318733215, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2080 + }, + { + "epoch": 0.6746287927695287, + "grad_norm": 0.5709853172302246, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 2090 + }, + { + "epoch": 0.6778566817301485, + "grad_norm": 0.49994757771492004, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 2100 + }, + { + "epoch": 0.6810845706907682, + "grad_norm": 0.5505402684211731, + "learning_rate": 0.0002, + "loss": 0.8, + "step": 2110 + }, + { + "epoch": 0.684312459651388, + "grad_norm": 0.48195120692253113, + "learning_rate": 0.0002, + "loss": 0.8227, + "step": 2120 + }, + { + "epoch": 0.6875403486120077, + "grad_norm": 0.4854775071144104, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 2130 + }, + { + "epoch": 0.6907682375726275, + "grad_norm": 0.6422494649887085, + "learning_rate": 0.0002, + "loss": 0.8231, + "step": 2140 + }, + { + "epoch": 0.6939961265332473, + "grad_norm": 0.3972536027431488, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 2150 + }, + { + "epoch": 0.697224015493867, + "grad_norm": 0.4297836422920227, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 2160 + }, + { + "epoch": 0.7004519044544868, + "grad_norm": 0.45486778020858765, + "learning_rate": 0.0002, + "loss": 0.8017, + "step": 2170 + }, + { + "epoch": 0.7036797934151066, + "grad_norm": 0.4706047773361206, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 2180 + }, + { + "epoch": 0.7069076823757263, + "grad_norm": 0.46426892280578613, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 2190 + }, + { + "epoch": 0.7101355713363461, + "grad_norm": 0.46333715319633484, + "learning_rate": 0.0002, + "loss": 0.8472, + "step": 2200 + }, + { + "epoch": 0.7133634602969657, + "grad_norm": 0.4632524251937866, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 2210 + }, + { + "epoch": 0.7165913492575855, + "grad_norm": 0.4610830843448639, + "learning_rate": 0.0002, + "loss": 0.8452, + "step": 2220 + }, + { + "epoch": 0.7198192382182053, + "grad_norm": 0.4905324876308441, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 2230 + }, + { + "epoch": 0.723047127178825, + "grad_norm": 0.4936263859272003, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 2240 + }, + { + "epoch": 0.7262750161394448, + "grad_norm": 0.40778425335884094, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 2250 + }, + { + "epoch": 0.7295029051000645, + "grad_norm": 0.50351482629776, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 2260 + }, + { + "epoch": 0.7327307940606843, + "grad_norm": 0.4894128143787384, + "learning_rate": 0.0002, + "loss": 0.8475, + "step": 2270 + }, + { + "epoch": 0.7359586830213041, + "grad_norm": 0.5580906271934509, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 2280 + }, + { + "epoch": 0.7391865719819238, + "grad_norm": 0.4655369520187378, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 2290 + }, + { + "epoch": 0.7424144609425436, + "grad_norm": 0.4666965901851654, + "learning_rate": 0.0002, + "loss": 0.8395, + "step": 2300 + }, + { + "epoch": 0.7456423499031634, + "grad_norm": 0.46259936690330505, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 2310 + }, + { + "epoch": 0.7488702388637831, + "grad_norm": 0.520706832408905, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 2320 + }, + { + "epoch": 0.7520981278244029, + "grad_norm": 0.5142408013343811, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 2330 + }, + { + "epoch": 0.7553260167850226, + "grad_norm": 0.5355164408683777, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 2340 + }, + { + "epoch": 0.7585539057456423, + "grad_norm": 0.5517185926437378, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 2350 + }, + { + "epoch": 0.7617817947062621, + "grad_norm": 0.7162677049636841, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 2360 + }, + { + "epoch": 0.7650096836668818, + "grad_norm": 0.42402133345603943, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 2370 + }, + { + "epoch": 0.7682375726275016, + "grad_norm": 0.47180113196372986, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 2380 + }, + { + "epoch": 0.7714654615881213, + "grad_norm": 0.6262288689613342, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 2390 + }, + { + "epoch": 0.7746933505487411, + "grad_norm": 0.5177528262138367, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 2400 + }, + { + "epoch": 0.7779212395093609, + "grad_norm": 0.555721640586853, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 2410 + }, + { + "epoch": 0.7811491284699806, + "grad_norm": 0.5592644810676575, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 2420 + }, + { + "epoch": 0.7843770174306004, + "grad_norm": 0.38025397062301636, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 2430 + }, + { + "epoch": 0.7876049063912202, + "grad_norm": 0.4597472548484802, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 2440 + }, + { + "epoch": 0.7908327953518399, + "grad_norm": 0.4929825961589813, + "learning_rate": 0.0002, + "loss": 0.8575, + "step": 2450 + }, + { + "epoch": 0.7940606843124597, + "grad_norm": 0.45277655124664307, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 2460 + }, + { + "epoch": 0.7972885732730794, + "grad_norm": 0.6224122643470764, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2470 + }, + { + "epoch": 0.8005164622336992, + "grad_norm": 0.5740901827812195, + "learning_rate": 0.0002, + "loss": 0.8449, + "step": 2480 + }, + { + "epoch": 0.8037443511943189, + "grad_norm": 0.41335329413414, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 2490 + }, + { + "epoch": 0.8069722401549386, + "grad_norm": 0.4738694131374359, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 2500 + }, + { + "epoch": 0.8102001291155584, + "grad_norm": 0.5288197994232178, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 2510 + }, + { + "epoch": 0.8134280180761781, + "grad_norm": 0.5404666066169739, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 2520 + }, + { + "epoch": 0.8166559070367979, + "grad_norm": 0.4444909691810608, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 2530 + }, + { + "epoch": 0.8198837959974177, + "grad_norm": 0.542061448097229, + "learning_rate": 0.0002, + "loss": 0.8683, + "step": 2540 + }, + { + "epoch": 0.8231116849580374, + "grad_norm": 0.4914741814136505, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 2550 + }, + { + "epoch": 0.8263395739186572, + "grad_norm": 0.41703441739082336, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 2560 + }, + { + "epoch": 0.829567462879277, + "grad_norm": 0.5489841103553772, + "learning_rate": 0.0002, + "loss": 0.824, + "step": 2570 + }, + { + "epoch": 0.8327953518398967, + "grad_norm": 0.5359883308410645, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 2580 + }, + { + "epoch": 0.8360232408005165, + "grad_norm": 0.5541019439697266, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 2590 + }, + { + "epoch": 0.8392511297611362, + "grad_norm": 0.4746638834476471, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 2600 + }, + { + "epoch": 0.842479018721756, + "grad_norm": 0.5243194103240967, + "learning_rate": 0.0002, + "loss": 0.8116, + "step": 2610 + }, + { + "epoch": 0.8457069076823758, + "grad_norm": 0.46824976801872253, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 2620 + }, + { + "epoch": 0.8489347966429954, + "grad_norm": 0.49487847089767456, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 2630 + }, + { + "epoch": 0.8521626856036152, + "grad_norm": 0.42180097103118896, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 2640 + }, + { + "epoch": 0.855390574564235, + "grad_norm": 0.5516560077667236, + "learning_rate": 0.0002, + "loss": 0.8304, + "step": 2650 + }, + { + "epoch": 0.8586184635248547, + "grad_norm": 0.4392191767692566, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 2660 + }, + { + "epoch": 0.8618463524854745, + "grad_norm": 0.5387210845947266, + "learning_rate": 0.0002, + "loss": 0.848, + "step": 2670 + }, + { + "epoch": 0.8650742414460942, + "grad_norm": 0.6232406497001648, + "learning_rate": 0.0002, + "loss": 0.8094, + "step": 2680 + }, + { + "epoch": 0.868302130406714, + "grad_norm": 0.53749018907547, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 2690 + }, + { + "epoch": 0.8715300193673338, + "grad_norm": 0.47480374574661255, + "learning_rate": 0.0002, + "loss": 0.8299, + "step": 2700 + }, + { + "epoch": 0.8747579083279535, + "grad_norm": 0.44618046283721924, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 2710 + }, + { + "epoch": 0.8779857972885733, + "grad_norm": 0.4173581302165985, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 2720 + }, + { + "epoch": 0.881213686249193, + "grad_norm": 0.524081289768219, + "learning_rate": 0.0002, + "loss": 0.7713, + "step": 2730 + }, + { + "epoch": 0.8844415752098128, + "grad_norm": 0.5608431100845337, + "learning_rate": 0.0002, + "loss": 0.8738, + "step": 2740 + }, + { + "epoch": 0.8876694641704326, + "grad_norm": 0.5212284922599792, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 2750 + }, + { + "epoch": 0.8908973531310523, + "grad_norm": 0.5601475834846497, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 2760 + }, + { + "epoch": 0.8941252420916721, + "grad_norm": 0.4499223828315735, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 2770 + }, + { + "epoch": 0.8973531310522918, + "grad_norm": 0.46945226192474365, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 2780 + }, + { + "epoch": 0.9005810200129115, + "grad_norm": 0.4837495684623718, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 2790 + }, + { + "epoch": 0.9038089089735313, + "grad_norm": 0.5059258937835693, + "learning_rate": 0.0002, + "loss": 0.7887, + "step": 2800 + }, + { + "epoch": 0.907036797934151, + "grad_norm": 0.4857945144176483, + "learning_rate": 0.0002, + "loss": 0.8571, + "step": 2810 + }, + { + "epoch": 0.9102646868947708, + "grad_norm": 0.5001962780952454, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 2820 + }, + { + "epoch": 0.9134925758553906, + "grad_norm": 0.5468648672103882, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 2830 + }, + { + "epoch": 0.9167204648160103, + "grad_norm": 0.5533056259155273, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 2840 + }, + { + "epoch": 0.9199483537766301, + "grad_norm": 0.5909785628318787, + "learning_rate": 0.0002, + "loss": 0.7895, + "step": 2850 + }, + { + "epoch": 0.9231762427372499, + "grad_norm": 0.47428104281425476, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 2860 + }, + { + "epoch": 0.9264041316978696, + "grad_norm": 0.548814058303833, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2870 + }, + { + "epoch": 0.9296320206584894, + "grad_norm": 0.5576745271682739, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 2880 + }, + { + "epoch": 0.9328599096191091, + "grad_norm": 0.47094792127609253, + "learning_rate": 0.0002, + "loss": 0.8399, + "step": 2890 + }, + { + "epoch": 0.9360877985797289, + "grad_norm": 0.5408539772033691, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 2900 + }, + { + "epoch": 0.9393156875403487, + "grad_norm": 0.5922889113426208, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 2910 + }, + { + "epoch": 0.9425435765009683, + "grad_norm": 0.45462584495544434, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2920 + }, + { + "epoch": 0.9457714654615881, + "grad_norm": 0.6864947080612183, + "learning_rate": 0.0002, + "loss": 0.8344, + "step": 2930 + }, + { + "epoch": 0.9489993544222078, + "grad_norm": 0.4706299304962158, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 2940 + }, + { + "epoch": 0.9522272433828276, + "grad_norm": 0.5583269596099854, + "learning_rate": 0.0002, + "loss": 0.8422, + "step": 2950 + }, + { + "epoch": 0.9554551323434474, + "grad_norm": 0.51015704870224, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 2960 + }, + { + "epoch": 0.9586830213040671, + "grad_norm": 0.5325582027435303, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 2970 + }, + { + "epoch": 0.9619109102646869, + "grad_norm": 0.49008598923683167, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 2980 + }, + { + "epoch": 0.9651387992253067, + "grad_norm": 0.4422132074832916, + "learning_rate": 0.0002, + "loss": 0.8093, + "step": 2990 + }, + { + "epoch": 0.9683666881859264, + "grad_norm": 0.5053589344024658, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 3000 + }, + { + "epoch": 0.9715945771465462, + "grad_norm": 0.46754521131515503, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 3010 + }, + { + "epoch": 0.9748224661071659, + "grad_norm": 0.5613434910774231, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 3020 + }, + { + "epoch": 0.9780503550677857, + "grad_norm": 0.5052843689918518, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 3030 + }, + { + "epoch": 0.9812782440284055, + "grad_norm": 0.4270972013473511, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 3040 + }, + { + "epoch": 0.9845061329890252, + "grad_norm": 0.4974991977214813, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 3050 + }, + { + "epoch": 0.9877340219496449, + "grad_norm": 0.4432311952114105, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 3060 + }, + { + "epoch": 0.9909619109102646, + "grad_norm": 0.466457724571228, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 3070 + }, + { + "epoch": 0.9941897998708844, + "grad_norm": 0.6438009142875671, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 3080 + }, + { + "epoch": 0.9974176888315042, + "grad_norm": 0.5593604445457458, + "learning_rate": 0.0002, + "loss": 0.8425, + "step": 3090 + }, + { + "epoch": 1.0, + "eval_loss": 1.0958120822906494, + "eval_runtime": 148.3273, + "eval_samples_per_second": 4.942, + "eval_steps_per_second": 0.62, + "step": 3098 + }, + { + "epoch": 1.000645577792124, + "grad_norm": 0.5701445937156677, + "learning_rate": 0.0002, + "loss": 0.8275, + "step": 3100 + }, + { + "epoch": 1.0038734667527438, + "grad_norm": 0.6089657545089722, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 3110 + }, + { + "epoch": 1.0071013557133635, + "grad_norm": 0.5619552135467529, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 3120 + }, + { + "epoch": 1.010329244673983, + "grad_norm": 0.5550283789634705, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 3130 + }, + { + "epoch": 1.013557133634603, + "grad_norm": 0.6221792101860046, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3140 + }, + { + "epoch": 1.0167850225952226, + "grad_norm": 0.5450758934020996, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 3150 + }, + { + "epoch": 1.0200129115558425, + "grad_norm": 0.4359588027000427, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 3160 + }, + { + "epoch": 1.0232408005164622, + "grad_norm": 0.5932239890098572, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 3170 + }, + { + "epoch": 1.026468689477082, + "grad_norm": 0.45478707551956177, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 3180 + }, + { + "epoch": 1.0296965784377017, + "grad_norm": 0.677615761756897, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 3190 + }, + { + "epoch": 1.0329244673983216, + "grad_norm": 0.6231790781021118, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3200 + }, + { + "epoch": 1.0361523563589412, + "grad_norm": 0.5074195861816406, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 3210 + }, + { + "epoch": 1.039380245319561, + "grad_norm": 0.4844142198562622, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 3220 + }, + { + "epoch": 1.0426081342801807, + "grad_norm": 0.5372750759124756, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 3230 + }, + { + "epoch": 1.0458360232408006, + "grad_norm": 0.46296265721321106, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 3240 + }, + { + "epoch": 1.0490639122014203, + "grad_norm": 0.5417148470878601, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 3250 + }, + { + "epoch": 1.0522918011620401, + "grad_norm": 0.5695074200630188, + "learning_rate": 0.0002, + "loss": 0.7637, + "step": 3260 + }, + { + "epoch": 1.0555196901226598, + "grad_norm": 0.5050092935562134, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 3270 + }, + { + "epoch": 1.0587475790832794, + "grad_norm": 0.5320752263069153, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 3280 + }, + { + "epoch": 1.0619754680438993, + "grad_norm": 0.5832052230834961, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 3290 + }, + { + "epoch": 1.065203357004519, + "grad_norm": 0.5228804349899292, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 3300 + }, + { + "epoch": 1.0684312459651388, + "grad_norm": 0.5819445252418518, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 3310 + }, + { + "epoch": 1.0716591349257585, + "grad_norm": 0.4201328754425049, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 3320 + }, + { + "epoch": 1.0748870238863784, + "grad_norm": 0.5424145460128784, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 3330 + }, + { + "epoch": 1.078114912846998, + "grad_norm": 0.6169946789741516, + "learning_rate": 0.0002, + "loss": 0.7828, + "step": 3340 + }, + { + "epoch": 1.0813428018076179, + "grad_norm": 0.607676088809967, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 3350 + }, + { + "epoch": 1.0845706907682375, + "grad_norm": 0.5191982388496399, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 3360 + }, + { + "epoch": 1.0877985797288574, + "grad_norm": 0.5728003978729248, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 3370 + }, + { + "epoch": 1.091026468689477, + "grad_norm": 0.5402643084526062, + "learning_rate": 0.0002, + "loss": 0.7381, + "step": 3380 + }, + { + "epoch": 1.094254357650097, + "grad_norm": 0.5377541780471802, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 3390 + }, + { + "epoch": 1.0974822466107166, + "grad_norm": 0.4751385748386383, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 3400 + }, + { + "epoch": 1.1007101355713362, + "grad_norm": 0.559158444404602, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 3410 + }, + { + "epoch": 1.103938024531956, + "grad_norm": 0.4917701482772827, + "learning_rate": 0.0002, + "loss": 0.7366, + "step": 3420 + }, + { + "epoch": 1.1071659134925758, + "grad_norm": 0.5507875084877014, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 3430 + }, + { + "epoch": 1.1103938024531956, + "grad_norm": 0.45458680391311646, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 3440 + }, + { + "epoch": 1.1136216914138153, + "grad_norm": 0.5721744894981384, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 3450 + }, + { + "epoch": 1.1168495803744352, + "grad_norm": 0.5776081681251526, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 3460 + }, + { + "epoch": 1.1200774693350548, + "grad_norm": 0.5261953473091125, + "learning_rate": 0.0002, + "loss": 0.7644, + "step": 3470 + }, + { + "epoch": 1.1233053582956747, + "grad_norm": 0.47759532928466797, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 3480 + }, + { + "epoch": 1.1265332472562943, + "grad_norm": 0.5697659850120544, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 3490 + }, + { + "epoch": 1.1297611362169142, + "grad_norm": 0.5643419623374939, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 3500 + }, + { + "epoch": 1.1329890251775339, + "grad_norm": 0.6502931118011475, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 3510 + }, + { + "epoch": 1.1362169141381537, + "grad_norm": 0.5236507654190063, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 3520 + }, + { + "epoch": 1.1394448030987734, + "grad_norm": 0.6521499156951904, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 3530 + }, + { + "epoch": 1.142672692059393, + "grad_norm": 0.5893217325210571, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 3540 + }, + { + "epoch": 1.145900581020013, + "grad_norm": 0.5300073027610779, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 3550 + }, + { + "epoch": 1.1491284699806328, + "grad_norm": 0.6794660091400146, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 3560 + }, + { + "epoch": 1.1523563589412524, + "grad_norm": 0.5420064926147461, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 3570 + }, + { + "epoch": 1.155584247901872, + "grad_norm": 0.5096590518951416, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 3580 + }, + { + "epoch": 1.158812136862492, + "grad_norm": 0.5726043581962585, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 3590 + }, + { + "epoch": 1.1620400258231116, + "grad_norm": 0.7388110160827637, + "learning_rate": 0.0002, + "loss": 0.7728, + "step": 3600 + }, + { + "epoch": 1.1652679147837315, + "grad_norm": 0.5597969889640808, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 3610 + }, + { + "epoch": 1.1684958037443511, + "grad_norm": 0.5067800283432007, + "learning_rate": 0.0002, + "loss": 0.7132, + "step": 3620 + }, + { + "epoch": 1.171723692704971, + "grad_norm": 0.6625118255615234, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 3630 + }, + { + "epoch": 1.1749515816655907, + "grad_norm": 0.5830849409103394, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 3640 + }, + { + "epoch": 1.1781794706262105, + "grad_norm": 0.6140692830085754, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 3650 + }, + { + "epoch": 1.1814073595868302, + "grad_norm": 0.714523434638977, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 3660 + }, + { + "epoch": 1.18463524854745, + "grad_norm": 0.5196696519851685, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 3670 + }, + { + "epoch": 1.1878631375080697, + "grad_norm": 0.6677889823913574, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 3680 + }, + { + "epoch": 1.1910910264686896, + "grad_norm": 0.47095245122909546, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 3690 + }, + { + "epoch": 1.1943189154293092, + "grad_norm": 0.5197778940200806, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 3700 + }, + { + "epoch": 1.1975468043899289, + "grad_norm": 0.5156530141830444, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 3710 + }, + { + "epoch": 1.2007746933505488, + "grad_norm": 0.6968549489974976, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 3720 + }, + { + "epoch": 1.2040025823111684, + "grad_norm": 0.48983848094940186, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 3730 + }, + { + "epoch": 1.2072304712717883, + "grad_norm": 0.6709973216056824, + "learning_rate": 0.0002, + "loss": 0.7163, + "step": 3740 + }, + { + "epoch": 1.210458360232408, + "grad_norm": 0.48681750893592834, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 3750 + }, + { + "epoch": 1.2136862491930278, + "grad_norm": 0.49475061893463135, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 3760 + }, + { + "epoch": 1.2169141381536475, + "grad_norm": 0.6163983345031738, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 3770 + }, + { + "epoch": 1.2201420271142673, + "grad_norm": 0.5481411218643188, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 3780 + }, + { + "epoch": 1.223369916074887, + "grad_norm": 0.620639979839325, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 3790 + }, + { + "epoch": 1.2265978050355069, + "grad_norm": 0.7017222046852112, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 3800 + }, + { + "epoch": 1.2298256939961265, + "grad_norm": 0.5872400403022766, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 3810 + }, + { + "epoch": 1.2330535829567464, + "grad_norm": 0.45765596628189087, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 3820 + }, + { + "epoch": 1.236281471917366, + "grad_norm": 0.5676377415657043, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 3830 + }, + { + "epoch": 1.2395093608779857, + "grad_norm": 0.4793425500392914, + "learning_rate": 0.0002, + "loss": 0.7696, + "step": 3840 + }, + { + "epoch": 1.2427372498386056, + "grad_norm": 0.5060022473335266, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 3850 + }, + { + "epoch": 1.2459651387992252, + "grad_norm": 0.6140682697296143, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 3860 + }, + { + "epoch": 1.249193027759845, + "grad_norm": 0.5030326843261719, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 3870 + }, + { + "epoch": 1.2524209167204647, + "grad_norm": 0.6609430909156799, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 3880 + }, + { + "epoch": 1.2556488056810846, + "grad_norm": 0.5459545850753784, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 3890 + }, + { + "epoch": 1.2588766946417043, + "grad_norm": 0.5328870415687561, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 3900 + }, + { + "epoch": 1.2621045836023241, + "grad_norm": 0.5840652585029602, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 3910 + }, + { + "epoch": 1.2653324725629438, + "grad_norm": 0.5587584376335144, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 3920 + }, + { + "epoch": 1.2685603615235637, + "grad_norm": 0.5886949896812439, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 3930 + }, + { + "epoch": 1.2717882504841833, + "grad_norm": 0.5128693580627441, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 3940 + }, + { + "epoch": 1.2750161394448032, + "grad_norm": 0.6207669377326965, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 3950 + }, + { + "epoch": 1.2782440284054228, + "grad_norm": 0.5789574384689331, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 3960 + }, + { + "epoch": 1.2814719173660425, + "grad_norm": 0.503162145614624, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 3970 + }, + { + "epoch": 1.2846998063266624, + "grad_norm": 0.6670064926147461, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 3980 + }, + { + "epoch": 1.2879276952872822, + "grad_norm": 0.5676213502883911, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 3990 + }, + { + "epoch": 1.2911555842479019, + "grad_norm": 0.5383169054985046, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 4000 + }, + { + "epoch": 1.2943834732085215, + "grad_norm": 0.714743971824646, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 4010 + }, + { + "epoch": 1.2976113621691414, + "grad_norm": 0.5740262269973755, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 4020 + }, + { + "epoch": 1.300839251129761, + "grad_norm": 0.6143045425415039, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 4030 + }, + { + "epoch": 1.304067140090381, + "grad_norm": 0.501025378704071, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 4040 + }, + { + "epoch": 1.3072950290510006, + "grad_norm": 0.5784100294113159, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 4050 + }, + { + "epoch": 1.3105229180116205, + "grad_norm": 0.6182606220245361, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 4060 + }, + { + "epoch": 1.3137508069722401, + "grad_norm": 0.5072231292724609, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 4070 + }, + { + "epoch": 1.31697869593286, + "grad_norm": 0.6841012835502625, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 4080 + }, + { + "epoch": 1.3202065848934796, + "grad_norm": 0.697257936000824, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 4090 + }, + { + "epoch": 1.3234344738540993, + "grad_norm": 0.5113214254379272, + "learning_rate": 0.0002, + "loss": 0.7401, + "step": 4100 + }, + { + "epoch": 1.3266623628147192, + "grad_norm": 0.6270561814308167, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 4110 + }, + { + "epoch": 1.329890251775339, + "grad_norm": 0.5525947213172913, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 4120 + }, + { + "epoch": 1.3331181407359587, + "grad_norm": 0.546071469783783, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 4130 + }, + { + "epoch": 1.3363460296965783, + "grad_norm": 0.6516721248626709, + "learning_rate": 0.0002, + "loss": 0.7884, + "step": 4140 + }, + { + "epoch": 1.3395739186571982, + "grad_norm": 0.6235111355781555, + "learning_rate": 0.0002, + "loss": 0.755, + "step": 4150 + }, + { + "epoch": 1.3428018076178179, + "grad_norm": 0.538649320602417, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 4160 + }, + { + "epoch": 1.3460296965784377, + "grad_norm": 0.5367001891136169, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 4170 + }, + { + "epoch": 1.3492575855390574, + "grad_norm": 0.6134631037712097, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 4180 + }, + { + "epoch": 1.3524854744996773, + "grad_norm": 0.5827262997627258, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 4190 + }, + { + "epoch": 1.355713363460297, + "grad_norm": 0.5706096291542053, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 4200 + }, + { + "epoch": 1.3589412524209168, + "grad_norm": 0.6422057151794434, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 4210 + }, + { + "epoch": 1.3621691413815364, + "grad_norm": 0.6316141486167908, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 4220 + }, + { + "epoch": 1.365397030342156, + "grad_norm": 0.6946983933448792, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 4230 + }, + { + "epoch": 1.368624919302776, + "grad_norm": 0.5381525754928589, + "learning_rate": 0.0002, + "loss": 0.7388, + "step": 4240 + }, + { + "epoch": 1.3718528082633958, + "grad_norm": 0.5484845638275146, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 4250 + }, + { + "epoch": 1.3750806972240155, + "grad_norm": 0.5961896777153015, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 4260 + }, + { + "epoch": 1.3783085861846351, + "grad_norm": 0.6041752696037292, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 4270 + }, + { + "epoch": 1.381536475145255, + "grad_norm": 0.6283464431762695, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 4280 + }, + { + "epoch": 1.384764364105875, + "grad_norm": 0.6761324405670166, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 4290 + }, + { + "epoch": 1.3879922530664945, + "grad_norm": 0.504311203956604, + "learning_rate": 0.0002, + "loss": 0.7381, + "step": 4300 + }, + { + "epoch": 1.3912201420271142, + "grad_norm": 0.6100395917892456, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 4310 + }, + { + "epoch": 1.394448030987734, + "grad_norm": 0.6245788335800171, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 4320 + }, + { + "epoch": 1.3976759199483537, + "grad_norm": 0.6074621081352234, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 4330 + }, + { + "epoch": 1.4009038089089736, + "grad_norm": 0.6683838963508606, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 4340 + }, + { + "epoch": 1.4041316978695932, + "grad_norm": 0.622998058795929, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 4350 + }, + { + "epoch": 1.4073595868302131, + "grad_norm": 0.6089423894882202, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 4360 + }, + { + "epoch": 1.4105874757908328, + "grad_norm": 0.6381658911705017, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 4370 + }, + { + "epoch": 1.4138153647514526, + "grad_norm": 0.5419308543205261, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4380 + }, + { + "epoch": 1.4170432537120723, + "grad_norm": 0.6026232242584229, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 4390 + }, + { + "epoch": 1.420271142672692, + "grad_norm": 0.4911101162433624, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 4400 + }, + { + "epoch": 1.4234990316333118, + "grad_norm": 0.6302908062934875, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 4410 + }, + { + "epoch": 1.4267269205939317, + "grad_norm": 0.6692768931388855, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 4420 + }, + { + "epoch": 1.4299548095545513, + "grad_norm": 0.46294572949409485, + "learning_rate": 0.0002, + "loss": 0.7312, + "step": 4430 + }, + { + "epoch": 1.433182698515171, + "grad_norm": 0.5452619194984436, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 4440 + }, + { + "epoch": 1.4364105874757909, + "grad_norm": 0.7809233069419861, + "learning_rate": 0.0002, + "loss": 0.7974, + "step": 4450 + }, + { + "epoch": 1.4396384764364105, + "grad_norm": 0.550088107585907, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 4460 + }, + { + "epoch": 1.4428663653970304, + "grad_norm": 0.7139151096343994, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 4470 + }, + { + "epoch": 1.44609425435765, + "grad_norm": 0.6187090873718262, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 4480 + }, + { + "epoch": 1.44932214331827, + "grad_norm": 0.5948249101638794, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 4490 + }, + { + "epoch": 1.4525500322788896, + "grad_norm": 0.6510892510414124, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 4500 + }, + { + "epoch": 1.4557779212395094, + "grad_norm": 0.6552293300628662, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 4510 + }, + { + "epoch": 1.459005810200129, + "grad_norm": 0.585574209690094, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 4520 + }, + { + "epoch": 1.4622336991607487, + "grad_norm": 0.4830162823200226, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 4530 + }, + { + "epoch": 1.4654615881213686, + "grad_norm": 0.5780223608016968, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 4540 + }, + { + "epoch": 1.4686894770819885, + "grad_norm": 0.5462607145309448, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 4550 + }, + { + "epoch": 1.4719173660426081, + "grad_norm": 0.5183546543121338, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 4560 + }, + { + "epoch": 1.4751452550032278, + "grad_norm": 0.676917552947998, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 4570 + }, + { + "epoch": 1.4783731439638477, + "grad_norm": 0.5772345066070557, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 4580 + }, + { + "epoch": 1.4816010329244673, + "grad_norm": 0.7320035696029663, + "learning_rate": 0.0002, + "loss": 0.7709, + "step": 4590 + }, + { + "epoch": 1.4848289218850872, + "grad_norm": 0.5024042129516602, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 4600 + }, + { + "epoch": 1.4880568108457068, + "grad_norm": 0.5482868552207947, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 4610 + }, + { + "epoch": 1.4912846998063267, + "grad_norm": 0.5447399616241455, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 4620 + }, + { + "epoch": 1.4945125887669464, + "grad_norm": 0.5953414440155029, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 4630 + }, + { + "epoch": 1.4977404777275662, + "grad_norm": 0.6983066201210022, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 4640 + }, + { + "epoch": 1.500968366688186, + "grad_norm": 0.586327075958252, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 4650 + }, + { + "epoch": 1.5041962556488055, + "grad_norm": 0.5839682221412659, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 4660 + }, + { + "epoch": 1.5074241446094254, + "grad_norm": 0.5959209203720093, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 4670 + }, + { + "epoch": 1.5106520335700453, + "grad_norm": 0.5073857307434082, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 4680 + }, + { + "epoch": 1.513879922530665, + "grad_norm": 0.5183001160621643, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 4690 + }, + { + "epoch": 1.5171078114912846, + "grad_norm": 0.593530535697937, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 4700 + }, + { + "epoch": 1.5203357004519045, + "grad_norm": 0.675993025302887, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 4710 + }, + { + "epoch": 1.5235635894125243, + "grad_norm": 0.5823286771774292, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 4720 + }, + { + "epoch": 1.526791478373144, + "grad_norm": 0.5825035572052002, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 4730 + }, + { + "epoch": 1.5300193673337636, + "grad_norm": 0.5689691305160522, + "learning_rate": 0.0002, + "loss": 0.8287, + "step": 4740 + }, + { + "epoch": 1.5332472562943835, + "grad_norm": 0.6037150621414185, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 4750 + }, + { + "epoch": 1.5364751452550034, + "grad_norm": 0.6393677592277527, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 4760 + }, + { + "epoch": 1.539703034215623, + "grad_norm": 0.5926381945610046, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 4770 + }, + { + "epoch": 1.5429309231762427, + "grad_norm": 0.9468599557876587, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 4780 + }, + { + "epoch": 1.5461588121368623, + "grad_norm": 0.7544237375259399, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 4790 + }, + { + "epoch": 1.5493867010974822, + "grad_norm": 0.5308566093444824, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 4800 + }, + { + "epoch": 1.552614590058102, + "grad_norm": 0.6590296030044556, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 4810 + }, + { + "epoch": 1.5558424790187217, + "grad_norm": 0.5630404353141785, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 4820 + }, + { + "epoch": 1.5590703679793414, + "grad_norm": 0.6800200939178467, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 4830 + }, + { + "epoch": 1.5622982569399613, + "grad_norm": 0.5463718175888062, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 4840 + }, + { + "epoch": 1.5655261459005811, + "grad_norm": 0.505135178565979, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 4850 + }, + { + "epoch": 1.5687540348612008, + "grad_norm": 0.5469676852226257, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 4860 + }, + { + "epoch": 1.5719819238218204, + "grad_norm": 0.5318337678909302, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 4870 + }, + { + "epoch": 1.5752098127824403, + "grad_norm": 0.7287914752960205, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 4880 + }, + { + "epoch": 1.5784377017430602, + "grad_norm": 0.7318989038467407, + "learning_rate": 0.0002, + "loss": 0.7532, + "step": 4890 + }, + { + "epoch": 1.5816655907036798, + "grad_norm": 0.6499921679496765, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 4900 + }, + { + "epoch": 1.5848934796642995, + "grad_norm": 0.47907355427742004, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 4910 + }, + { + "epoch": 1.5881213686249191, + "grad_norm": 0.7338833808898926, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 4920 + }, + { + "epoch": 1.591349257585539, + "grad_norm": 0.5800719261169434, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 4930 + }, + { + "epoch": 1.594577146546159, + "grad_norm": 0.5365763306617737, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 4940 + }, + { + "epoch": 1.5978050355067785, + "grad_norm": 0.5800772309303284, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 4950 + }, + { + "epoch": 1.6010329244673982, + "grad_norm": 0.7878010869026184, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 4960 + }, + { + "epoch": 1.604260813428018, + "grad_norm": 0.5919058918952942, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 4970 + }, + { + "epoch": 1.607488702388638, + "grad_norm": 0.5004435181617737, + "learning_rate": 0.0002, + "loss": 0.7762, + "step": 4980 + }, + { + "epoch": 1.6107165913492576, + "grad_norm": 0.6299242377281189, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 4990 + }, + { + "epoch": 1.6139444803098772, + "grad_norm": 0.6307242512702942, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 5000 + }, + { + "epoch": 1.6171723692704971, + "grad_norm": 0.7838703989982605, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 5010 + }, + { + "epoch": 1.620400258231117, + "grad_norm": 0.6454671621322632, + "learning_rate": 0.0002, + "loss": 0.7364, + "step": 5020 + }, + { + "epoch": 1.6236281471917366, + "grad_norm": 0.5907095670700073, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 5030 + }, + { + "epoch": 1.6268560361523563, + "grad_norm": 0.6053501963615417, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 5040 + }, + { + "epoch": 1.630083925112976, + "grad_norm": 0.5644670128822327, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 5050 + }, + { + "epoch": 1.6333118140735958, + "grad_norm": 0.6320949792861938, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 5060 + }, + { + "epoch": 1.6365397030342157, + "grad_norm": 0.6101489067077637, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 5070 + }, + { + "epoch": 1.6397675919948353, + "grad_norm": 0.9435283541679382, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 5080 + }, + { + "epoch": 1.642995480955455, + "grad_norm": 0.6668919324874878, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 5090 + }, + { + "epoch": 1.6462233699160749, + "grad_norm": 0.6160340905189514, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 5100 + }, + { + "epoch": 1.6494512588766947, + "grad_norm": 0.5999835729598999, + "learning_rate": 0.0002, + "loss": 0.7461, + "step": 5110 + }, + { + "epoch": 1.6526791478373144, + "grad_norm": 0.9378551840782166, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 5120 + }, + { + "epoch": 1.655907036797934, + "grad_norm": 0.4795055389404297, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 5130 + }, + { + "epoch": 1.659134925758554, + "grad_norm": 0.4878861606121063, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 5140 + }, + { + "epoch": 1.6623628147191738, + "grad_norm": 0.6042965054512024, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 5150 + }, + { + "epoch": 1.6655907036797934, + "grad_norm": 0.5829901695251465, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 5160 + }, + { + "epoch": 1.668818592640413, + "grad_norm": 0.5168480277061462, + "learning_rate": 0.0002, + "loss": 0.7498, + "step": 5170 + }, + { + "epoch": 1.672046481601033, + "grad_norm": 0.6489511132240295, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 5180 + }, + { + "epoch": 1.6752743705616526, + "grad_norm": 0.5955966114997864, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 5190 + }, + { + "epoch": 1.6785022595222725, + "grad_norm": 0.6228088140487671, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 5200 + }, + { + "epoch": 1.6817301484828922, + "grad_norm": 0.5726390480995178, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 5210 + }, + { + "epoch": 1.6849580374435118, + "grad_norm": 0.6116343140602112, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 5220 + }, + { + "epoch": 1.6881859264041317, + "grad_norm": 0.5483687520027161, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 5230 + }, + { + "epoch": 1.6914138153647515, + "grad_norm": 0.570941686630249, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 5240 + }, + { + "epoch": 1.6946417043253712, + "grad_norm": 0.6048086285591125, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 5250 + }, + { + "epoch": 1.6978695932859909, + "grad_norm": 0.6769003868103027, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 5260 + }, + { + "epoch": 1.7010974822466107, + "grad_norm": 0.5629057884216309, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 5270 + }, + { + "epoch": 1.7043253712072306, + "grad_norm": 0.657341480255127, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 5280 + }, + { + "epoch": 1.7075532601678503, + "grad_norm": 0.6256147623062134, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 5290 + }, + { + "epoch": 1.71078114912847, + "grad_norm": 0.5498088002204895, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 5300 + }, + { + "epoch": 1.7140090380890898, + "grad_norm": 0.5078358054161072, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 5310 + }, + { + "epoch": 1.7172369270497096, + "grad_norm": 0.6696692705154419, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 5320 + }, + { + "epoch": 1.7204648160103293, + "grad_norm": 0.6692847013473511, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 5330 + }, + { + "epoch": 1.723692704970949, + "grad_norm": 0.5415751934051514, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 5340 + }, + { + "epoch": 1.7269205939315686, + "grad_norm": 0.5367611050605774, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 5350 + }, + { + "epoch": 1.7301484828921885, + "grad_norm": 0.7321061491966248, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 5360 + }, + { + "epoch": 1.7333763718528084, + "grad_norm": 0.723972499370575, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5370 + }, + { + "epoch": 1.736604260813428, + "grad_norm": 0.7328100204467773, + "learning_rate": 0.0002, + "loss": 0.7077, + "step": 5380 + }, + { + "epoch": 1.7398321497740477, + "grad_norm": 0.5785264372825623, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 5390 + }, + { + "epoch": 1.7430600387346675, + "grad_norm": 0.7812932133674622, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 5400 + }, + { + "epoch": 1.7462879276952874, + "grad_norm": 0.6493327617645264, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 5410 + }, + { + "epoch": 1.749515816655907, + "grad_norm": 0.5825939774513245, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 5420 + }, + { + "epoch": 1.7527437056165267, + "grad_norm": 0.6969610452651978, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 5430 + }, + { + "epoch": 1.7559715945771466, + "grad_norm": 0.5558062195777893, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 5440 + }, + { + "epoch": 1.7591994835377665, + "grad_norm": 0.49222221970558167, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 5450 + }, + { + "epoch": 1.762427372498386, + "grad_norm": 0.5844656825065613, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 5460 + }, + { + "epoch": 1.7656552614590058, + "grad_norm": 0.8706597685813904, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 5470 + }, + { + "epoch": 1.7688831504196254, + "grad_norm": 0.6167706251144409, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 5480 + }, + { + "epoch": 1.7721110393802453, + "grad_norm": 0.5890011787414551, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 5490 + }, + { + "epoch": 1.7753389283408652, + "grad_norm": 0.6551728248596191, + "learning_rate": 0.0002, + "loss": 0.8319, + "step": 5500 + }, + { + "epoch": 1.7785668173014848, + "grad_norm": 0.5848751068115234, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 5510 + }, + { + "epoch": 1.7817947062621045, + "grad_norm": 0.6664014458656311, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 5520 + }, + { + "epoch": 1.7850225952227243, + "grad_norm": 0.5931693911552429, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 5530 + }, + { + "epoch": 1.7882504841833442, + "grad_norm": 0.5534724593162537, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 5540 + }, + { + "epoch": 1.7914783731439639, + "grad_norm": 0.5590878129005432, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 5550 + }, + { + "epoch": 1.7947062621045835, + "grad_norm": 0.6947470903396606, + "learning_rate": 0.0002, + "loss": 0.7406, + "step": 5560 + }, + { + "epoch": 1.7979341510652034, + "grad_norm": 0.6104130148887634, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 5570 + }, + { + "epoch": 1.8011620400258233, + "grad_norm": 0.6135714054107666, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 5580 + }, + { + "epoch": 1.804389928986443, + "grad_norm": 0.6626853346824646, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 5590 + }, + { + "epoch": 1.8076178179470626, + "grad_norm": 0.6977612972259521, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 5600 + }, + { + "epoch": 1.8108457069076824, + "grad_norm": 0.6275238394737244, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 5610 + }, + { + "epoch": 1.814073595868302, + "grad_norm": 0.5017505288124084, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 5620 + }, + { + "epoch": 1.817301484828922, + "grad_norm": 0.8314290642738342, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 5630 + }, + { + "epoch": 1.8205293737895416, + "grad_norm": 0.6863582134246826, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 5640 + }, + { + "epoch": 1.8237572627501613, + "grad_norm": 0.69544917345047, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 5650 + }, + { + "epoch": 1.8269851517107811, + "grad_norm": 0.515499472618103, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 5660 + }, + { + "epoch": 1.830213040671401, + "grad_norm": 0.6100873947143555, + "learning_rate": 0.0002, + "loss": 0.7166, + "step": 5670 + }, + { + "epoch": 1.8334409296320207, + "grad_norm": 0.67416912317276, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 5680 + }, + { + "epoch": 1.8366688185926403, + "grad_norm": 0.7057772278785706, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 5690 + }, + { + "epoch": 1.8398967075532602, + "grad_norm": 0.7374551892280579, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 5700 + }, + { + "epoch": 1.84312459651388, + "grad_norm": 0.6266297101974487, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 5710 + }, + { + "epoch": 1.8463524854744997, + "grad_norm": 0.5629227757453918, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 5720 + }, + { + "epoch": 1.8495803744351194, + "grad_norm": 0.6603655815124512, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 5730 + }, + { + "epoch": 1.8528082633957392, + "grad_norm": 0.8113715052604675, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 5740 + }, + { + "epoch": 1.856036152356359, + "grad_norm": 0.7143914103507996, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 5750 + }, + { + "epoch": 1.8592640413169788, + "grad_norm": 0.6273732781410217, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 5760 + }, + { + "epoch": 1.8624919302775984, + "grad_norm": 0.5428690910339355, + "learning_rate": 0.0002, + "loss": 0.7962, + "step": 5770 + }, + { + "epoch": 1.865719819238218, + "grad_norm": 0.6405037641525269, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 5780 + }, + { + "epoch": 1.868947708198838, + "grad_norm": 0.700873613357544, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 5790 + }, + { + "epoch": 1.8721755971594578, + "grad_norm": 0.5645238161087036, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 5800 + }, + { + "epoch": 1.8754034861200775, + "grad_norm": 0.8780353665351868, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 5810 + }, + { + "epoch": 1.878631375080697, + "grad_norm": 0.6295409798622131, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 5820 + }, + { + "epoch": 1.881859264041317, + "grad_norm": 0.678269624710083, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 5830 + }, + { + "epoch": 1.8850871530019369, + "grad_norm": 0.6464608907699585, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5840 + }, + { + "epoch": 1.8883150419625565, + "grad_norm": 0.6201048493385315, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 5850 + }, + { + "epoch": 1.8915429309231762, + "grad_norm": 0.6046274304389954, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 5860 + }, + { + "epoch": 1.894770819883796, + "grad_norm": 0.7532408833503723, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 5870 + }, + { + "epoch": 1.897998708844416, + "grad_norm": 0.6066767573356628, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 5880 + }, + { + "epoch": 1.9012265978050356, + "grad_norm": 0.6289830207824707, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 5890 + }, + { + "epoch": 1.9044544867656552, + "grad_norm": 0.5204319953918457, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 5900 + }, + { + "epoch": 1.9076823757262749, + "grad_norm": 0.6708219647407532, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 5910 + }, + { + "epoch": 1.9109102646868947, + "grad_norm": 0.4915677309036255, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 5920 + }, + { + "epoch": 1.9141381536475146, + "grad_norm": 0.652717113494873, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 5930 + }, + { + "epoch": 1.9173660426081343, + "grad_norm": 0.5446316003799438, + "learning_rate": 0.0002, + "loss": 0.7687, + "step": 5940 + }, + { + "epoch": 1.920593931568754, + "grad_norm": 0.4958149194717407, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 5950 + }, + { + "epoch": 1.9238218205293738, + "grad_norm": 0.5623434782028198, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 5960 + }, + { + "epoch": 1.9270497094899937, + "grad_norm": 0.6855450868606567, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 5970 + }, + { + "epoch": 1.9302775984506133, + "grad_norm": 0.5710492730140686, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 5980 + }, + { + "epoch": 1.933505487411233, + "grad_norm": 0.5379431843757629, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 5990 + }, + { + "epoch": 1.9367333763718528, + "grad_norm": 0.557129442691803, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 6000 + }, + { + "epoch": 1.9399612653324727, + "grad_norm": 0.6336663961410522, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 6010 + }, + { + "epoch": 1.9431891542930924, + "grad_norm": 0.5950582027435303, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 6020 + }, + { + "epoch": 1.946417043253712, + "grad_norm": 0.5905954837799072, + "learning_rate": 0.0002, + "loss": 0.7443, + "step": 6030 + }, + { + "epoch": 1.9496449322143317, + "grad_norm": 0.6688982844352722, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 6040 + }, + { + "epoch": 1.9528728211749515, + "grad_norm": 0.5440775752067566, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 6050 + }, + { + "epoch": 1.9561007101355714, + "grad_norm": 0.6207906603813171, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 6060 + }, + { + "epoch": 1.959328599096191, + "grad_norm": 0.6999374628067017, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 6070 + }, + { + "epoch": 1.9625564880568107, + "grad_norm": 0.6310848593711853, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 6080 + }, + { + "epoch": 1.9657843770174306, + "grad_norm": 0.5903388261795044, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 6090 + }, + { + "epoch": 1.9690122659780505, + "grad_norm": 0.6333889961242676, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 6100 + }, + { + "epoch": 1.97224015493867, + "grad_norm": 0.5604711174964905, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 6110 + }, + { + "epoch": 1.9754680438992898, + "grad_norm": 0.9234541654586792, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 6120 + }, + { + "epoch": 1.9786959328599096, + "grad_norm": 0.6149102449417114, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 6130 + }, + { + "epoch": 1.9819238218205295, + "grad_norm": 0.615446150302887, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 6140 + }, + { + "epoch": 1.9851517107811492, + "grad_norm": 0.5176635980606079, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 6150 + }, + { + "epoch": 1.9883795997417688, + "grad_norm": 0.7124109864234924, + "learning_rate": 0.0002, + "loss": 0.718, + "step": 6160 + }, + { + "epoch": 1.9916074887023887, + "grad_norm": 0.6317567825317383, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 6170 + }, + { + "epoch": 1.9948353776630086, + "grad_norm": 0.6855016350746155, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 6180 + }, + { + "epoch": 1.9980632666236282, + "grad_norm": 0.6423715353012085, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 6190 + }, + { + "epoch": 2.0, + "eval_loss": 1.1096643209457397, + "eval_runtime": 147.7997, + "eval_samples_per_second": 4.959, + "eval_steps_per_second": 0.622, + "step": 6196 + }, + { + "epoch": 2.001291155584248, + "grad_norm": 0.5322932600975037, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 6200 + }, + { + "epoch": 2.0045190445448675, + "grad_norm": 0.8152306079864502, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 6210 + }, + { + "epoch": 2.0077469335054876, + "grad_norm": 0.6215983033180237, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 6220 + }, + { + "epoch": 2.0109748224661073, + "grad_norm": 0.845498263835907, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 6230 + }, + { + "epoch": 2.014202711426727, + "grad_norm": 0.733559787273407, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 6240 + }, + { + "epoch": 2.0174306003873466, + "grad_norm": 0.51433926820755, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 6250 + }, + { + "epoch": 2.020658489347966, + "grad_norm": 0.6374049782752991, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 6260 + }, + { + "epoch": 2.0238863783085863, + "grad_norm": 0.7833638191223145, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 6270 + }, + { + "epoch": 2.027114267269206, + "grad_norm": 0.8929463028907776, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 6280 + }, + { + "epoch": 2.0303421562298256, + "grad_norm": 0.669731855392456, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 6290 + }, + { + "epoch": 2.0335700451904453, + "grad_norm": 0.5846071243286133, + "learning_rate": 0.0002, + "loss": 0.646, + "step": 6300 + }, + { + "epoch": 2.0367979341510654, + "grad_norm": 0.7087787985801697, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 6310 + }, + { + "epoch": 2.040025823111685, + "grad_norm": 0.6739160418510437, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 6320 + }, + { + "epoch": 2.0432537120723047, + "grad_norm": 0.4860886335372925, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 6330 + }, + { + "epoch": 2.0464816010329243, + "grad_norm": 0.7201244831085205, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 6340 + }, + { + "epoch": 2.0497094899935444, + "grad_norm": 0.7409170269966125, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 6350 + }, + { + "epoch": 2.052937378954164, + "grad_norm": 0.6843920350074768, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 6360 + }, + { + "epoch": 2.0561652679147837, + "grad_norm": 0.7519999742507935, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 6370 + }, + { + "epoch": 2.0593931568754034, + "grad_norm": 0.5732819437980652, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 6380 + }, + { + "epoch": 2.062621045836023, + "grad_norm": 0.7565118074417114, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 6390 + }, + { + "epoch": 2.065848934796643, + "grad_norm": 0.8147150278091431, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 6400 + }, + { + "epoch": 2.0690768237572628, + "grad_norm": 0.6941924691200256, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 6410 + }, + { + "epoch": 2.0723047127178824, + "grad_norm": 0.6549784541130066, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 6420 + }, + { + "epoch": 2.075532601678502, + "grad_norm": 0.7224905490875244, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 6430 + }, + { + "epoch": 2.078760490639122, + "grad_norm": 0.7754863500595093, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 6440 + }, + { + "epoch": 2.081988379599742, + "grad_norm": 0.691318154335022, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 6450 + }, + { + "epoch": 2.0852162685603615, + "grad_norm": 0.6009294986724854, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 6460 + }, + { + "epoch": 2.088444157520981, + "grad_norm": 0.6753945350646973, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 6470 + }, + { + "epoch": 2.091672046481601, + "grad_norm": 0.6899921298027039, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 6480 + }, + { + "epoch": 2.094899935442221, + "grad_norm": 0.846510648727417, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 6490 + }, + { + "epoch": 2.0981278244028405, + "grad_norm": 0.6432605981826782, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 6500 + }, + { + "epoch": 2.10135571336346, + "grad_norm": 0.8125239014625549, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 6510 + }, + { + "epoch": 2.1045836023240803, + "grad_norm": 0.628302812576294, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 6520 + }, + { + "epoch": 2.1078114912847, + "grad_norm": 0.7164334654808044, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 6530 + }, + { + "epoch": 2.1110393802453196, + "grad_norm": 0.7476949095726013, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 6540 + }, + { + "epoch": 2.114267269205939, + "grad_norm": 0.7577515840530396, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 6550 + }, + { + "epoch": 2.117495158166559, + "grad_norm": 0.5684467554092407, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 6560 + }, + { + "epoch": 2.120723047127179, + "grad_norm": 0.6121789216995239, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 6570 + }, + { + "epoch": 2.1239509360877986, + "grad_norm": 0.6095348596572876, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 6580 + }, + { + "epoch": 2.1271788250484183, + "grad_norm": 0.7803651690483093, + "learning_rate": 0.0002, + "loss": 0.6276, + "step": 6590 + }, + { + "epoch": 2.130406714009038, + "grad_norm": 0.5990583300590515, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 6600 + }, + { + "epoch": 2.133634602969658, + "grad_norm": 0.6569220423698425, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 6610 + }, + { + "epoch": 2.1368624919302777, + "grad_norm": 0.5961166620254517, + "learning_rate": 0.0002, + "loss": 0.7049, + "step": 6620 + }, + { + "epoch": 2.1400903808908973, + "grad_norm": 0.5860554575920105, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 6630 + }, + { + "epoch": 2.143318269851517, + "grad_norm": 0.5994001626968384, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 6640 + }, + { + "epoch": 2.146546158812137, + "grad_norm": 0.7723015546798706, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 6650 + }, + { + "epoch": 2.1497740477727567, + "grad_norm": 0.676355242729187, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 6660 + }, + { + "epoch": 2.1530019367333764, + "grad_norm": 0.5689092874526978, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 6670 + }, + { + "epoch": 2.156229825693996, + "grad_norm": 0.6933727264404297, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 6680 + }, + { + "epoch": 2.159457714654616, + "grad_norm": 0.8380527496337891, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 6690 + }, + { + "epoch": 2.1626856036152358, + "grad_norm": 0.6876497268676758, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 6700 + }, + { + "epoch": 2.1659134925758554, + "grad_norm": 0.6418334245681763, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 6710 + }, + { + "epoch": 2.169141381536475, + "grad_norm": 0.7169192433357239, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 6720 + }, + { + "epoch": 2.1723692704970947, + "grad_norm": 0.6664170622825623, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 6730 + }, + { + "epoch": 2.175597159457715, + "grad_norm": 0.6011993288993835, + "learning_rate": 0.0002, + "loss": 0.6751, + "step": 6740 + }, + { + "epoch": 2.1788250484183345, + "grad_norm": 0.5529947280883789, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 6750 + }, + { + "epoch": 2.182052937378954, + "grad_norm": 0.6879532933235168, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 6760 + }, + { + "epoch": 2.1852808263395738, + "grad_norm": 0.6426113843917847, + "learning_rate": 0.0002, + "loss": 0.6634, + "step": 6770 + }, + { + "epoch": 2.188508715300194, + "grad_norm": 0.6571047306060791, + "learning_rate": 0.0002, + "loss": 0.6592, + "step": 6780 + }, + { + "epoch": 2.1917366042608135, + "grad_norm": 0.6400564908981323, + "learning_rate": 0.0002, + "loss": 0.6494, + "step": 6790 + }, + { + "epoch": 2.194964493221433, + "grad_norm": 0.6509664058685303, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 6800 + }, + { + "epoch": 2.198192382182053, + "grad_norm": 0.6673197150230408, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 6810 + }, + { + "epoch": 2.2014202711426725, + "grad_norm": 0.48205727338790894, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 6820 + }, + { + "epoch": 2.2046481601032926, + "grad_norm": 0.849525511264801, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 6830 + }, + { + "epoch": 2.207876049063912, + "grad_norm": 0.6150892376899719, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 6840 + }, + { + "epoch": 2.211103938024532, + "grad_norm": 0.7826945781707764, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 6850 + }, + { + "epoch": 2.2143318269851515, + "grad_norm": 0.5711963772773743, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 6860 + }, + { + "epoch": 2.2175597159457716, + "grad_norm": 0.6017758846282959, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 6870 + }, + { + "epoch": 2.2207876049063913, + "grad_norm": 0.785434901714325, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 6880 + }, + { + "epoch": 2.224015493867011, + "grad_norm": 0.6251688599586487, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 6890 + }, + { + "epoch": 2.2272433828276306, + "grad_norm": 0.8242034316062927, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 6900 + }, + { + "epoch": 2.2304712717882507, + "grad_norm": 0.7272933125495911, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 6910 + }, + { + "epoch": 2.2336991607488703, + "grad_norm": 0.7159379720687866, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 6920 + }, + { + "epoch": 2.23692704970949, + "grad_norm": 0.6518042087554932, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 6930 + }, + { + "epoch": 2.2401549386701096, + "grad_norm": 0.7365370392799377, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 6940 + }, + { + "epoch": 2.2433828276307297, + "grad_norm": 0.5674061179161072, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 6950 + }, + { + "epoch": 2.2466107165913494, + "grad_norm": 0.669185996055603, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 6960 + }, + { + "epoch": 2.249838605551969, + "grad_norm": 0.6638304591178894, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 6970 + }, + { + "epoch": 2.2530664945125887, + "grad_norm": 0.757006824016571, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 6980 + }, + { + "epoch": 2.2562943834732083, + "grad_norm": 0.7574930787086487, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 6990 + }, + { + "epoch": 2.2595222724338284, + "grad_norm": 0.7819514870643616, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 7000 + }, + { + "epoch": 2.262750161394448, + "grad_norm": 0.6987583041191101, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 7010 + }, + { + "epoch": 2.2659780503550677, + "grad_norm": 0.6628551483154297, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 7020 + }, + { + "epoch": 2.2692059393156874, + "grad_norm": 0.7855866551399231, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 7030 + }, + { + "epoch": 2.2724338282763075, + "grad_norm": 0.6102892756462097, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 7040 + }, + { + "epoch": 2.275661717236927, + "grad_norm": 0.7844198942184448, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 7050 + }, + { + "epoch": 2.2788896061975468, + "grad_norm": 0.6209492087364197, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 7060 + }, + { + "epoch": 2.2821174951581664, + "grad_norm": 0.8351290225982666, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 7070 + }, + { + "epoch": 2.285345384118786, + "grad_norm": 0.6883546710014343, + "learning_rate": 0.0002, + "loss": 0.6648, + "step": 7080 + }, + { + "epoch": 2.288573273079406, + "grad_norm": 0.6626381874084473, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 7090 + }, + { + "epoch": 2.291801162040026, + "grad_norm": 0.7216270565986633, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 7100 + }, + { + "epoch": 2.2950290510006455, + "grad_norm": 0.8246777057647705, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 7110 + }, + { + "epoch": 2.2982569399612656, + "grad_norm": 0.614326000213623, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 7120 + }, + { + "epoch": 2.301484828921885, + "grad_norm": 0.8785578012466431, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 7130 + }, + { + "epoch": 2.304712717882505, + "grad_norm": 0.7021808624267578, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 7140 + }, + { + "epoch": 2.3079406068431245, + "grad_norm": 0.6999403238296509, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 7150 + }, + { + "epoch": 2.311168495803744, + "grad_norm": 0.8013143539428711, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 7160 + }, + { + "epoch": 2.3143963847643643, + "grad_norm": 0.6592583060264587, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 7170 + }, + { + "epoch": 2.317624273724984, + "grad_norm": 0.6260249018669128, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 7180 + }, + { + "epoch": 2.3208521626856036, + "grad_norm": 0.9352797269821167, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 7190 + }, + { + "epoch": 2.324080051646223, + "grad_norm": 0.6629612445831299, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 7200 + }, + { + "epoch": 2.3273079406068433, + "grad_norm": 0.7062810063362122, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 7210 + }, + { + "epoch": 2.330535829567463, + "grad_norm": 0.7236241102218628, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 7220 + }, + { + "epoch": 2.3337637185280826, + "grad_norm": 0.7528148293495178, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 7230 + }, + { + "epoch": 2.3369916074887023, + "grad_norm": 0.7604748606681824, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 7240 + }, + { + "epoch": 2.340219496449322, + "grad_norm": 0.5601189136505127, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 7250 + }, + { + "epoch": 2.343447385409942, + "grad_norm": 0.7099230885505676, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 7260 + }, + { + "epoch": 2.3466752743705617, + "grad_norm": 0.6699047684669495, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 7270 + }, + { + "epoch": 2.3499031633311813, + "grad_norm": 0.7315047979354858, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 7280 + }, + { + "epoch": 2.353131052291801, + "grad_norm": 0.632836103439331, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 7290 + }, + { + "epoch": 2.356358941252421, + "grad_norm": 0.9410115480422974, + "learning_rate": 0.0002, + "loss": 0.6458, + "step": 7300 + }, + { + "epoch": 2.3595868302130407, + "grad_norm": 0.626554012298584, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 7310 + }, + { + "epoch": 2.3628147191736604, + "grad_norm": 0.7538444399833679, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 7320 + }, + { + "epoch": 2.36604260813428, + "grad_norm": 0.6826626062393188, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 7330 + }, + { + "epoch": 2.3692704970949, + "grad_norm": 0.6739391088485718, + "learning_rate": 0.0002, + "loss": 0.6752, + "step": 7340 + }, + { + "epoch": 2.3724983860555198, + "grad_norm": 0.7518446445465088, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 7350 + }, + { + "epoch": 2.3757262750161394, + "grad_norm": 0.714133083820343, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 7360 + }, + { + "epoch": 2.378954163976759, + "grad_norm": 0.7144588232040405, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 7370 + }, + { + "epoch": 2.382182052937379, + "grad_norm": 0.6598120927810669, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 7380 + }, + { + "epoch": 2.385409941897999, + "grad_norm": 0.7079148292541504, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 7390 + }, + { + "epoch": 2.3886378308586185, + "grad_norm": 0.6750902533531189, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 7400 + }, + { + "epoch": 2.391865719819238, + "grad_norm": 0.7181967496871948, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 7410 + }, + { + "epoch": 2.3950936087798578, + "grad_norm": 0.7720552086830139, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 7420 + }, + { + "epoch": 2.398321497740478, + "grad_norm": 0.7592426538467407, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 7430 + }, + { + "epoch": 2.4015493867010975, + "grad_norm": 0.7161896824836731, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 7440 + }, + { + "epoch": 2.404777275661717, + "grad_norm": 0.8019260764122009, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 7450 + }, + { + "epoch": 2.408005164622337, + "grad_norm": 0.7093342542648315, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 7460 + }, + { + "epoch": 2.411233053582957, + "grad_norm": 0.8464207649230957, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 7470 + }, + { + "epoch": 2.4144609425435766, + "grad_norm": 0.773666501045227, + "learning_rate": 0.0002, + "loss": 0.6724, + "step": 7480 + }, + { + "epoch": 2.4176888315041962, + "grad_norm": 0.8451611995697021, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 7490 + }, + { + "epoch": 2.420916720464816, + "grad_norm": 0.656795084476471, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 7500 + }, + { + "epoch": 2.4241446094254355, + "grad_norm": 0.7129034996032715, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 7510 + }, + { + "epoch": 2.4273724983860556, + "grad_norm": 0.8325763940811157, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 7520 + }, + { + "epoch": 2.4306003873466753, + "grad_norm": 0.7806527614593506, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 7530 + }, + { + "epoch": 2.433828276307295, + "grad_norm": 0.6994536519050598, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 7540 + }, + { + "epoch": 2.437056165267915, + "grad_norm": 0.6898999214172363, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 7550 + }, + { + "epoch": 2.4402840542285347, + "grad_norm": 0.719490647315979, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 7560 + }, + { + "epoch": 2.4435119431891543, + "grad_norm": 0.6841562390327454, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 7570 + }, + { + "epoch": 2.446739832149774, + "grad_norm": 0.7573311924934387, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 7580 + }, + { + "epoch": 2.4499677211103936, + "grad_norm": 0.7295880317687988, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 7590 + }, + { + "epoch": 2.4531956100710137, + "grad_norm": 0.710136353969574, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 7600 + }, + { + "epoch": 2.4564234990316334, + "grad_norm": 0.6126235127449036, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 7610 + }, + { + "epoch": 2.459651387992253, + "grad_norm": 0.8025609850883484, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 7620 + }, + { + "epoch": 2.4628792769528727, + "grad_norm": 0.7839472889900208, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 7630 + }, + { + "epoch": 2.4661071659134928, + "grad_norm": 0.7253499031066895, + "learning_rate": 0.0002, + "loss": 0.6797, + "step": 7640 + }, + { + "epoch": 2.4693350548741124, + "grad_norm": 0.7918946743011475, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 7650 + }, + { + "epoch": 2.472562943834732, + "grad_norm": 0.7930178046226501, + "learning_rate": 0.0002, + "loss": 0.6646, + "step": 7660 + }, + { + "epoch": 2.4757908327953517, + "grad_norm": 0.6826170086860657, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 7670 + }, + { + "epoch": 2.4790187217559714, + "grad_norm": 0.6576805114746094, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 7680 + }, + { + "epoch": 2.4822466107165915, + "grad_norm": 0.7012448310852051, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 7690 + }, + { + "epoch": 2.485474499677211, + "grad_norm": 0.7774284482002258, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 7700 + }, + { + "epoch": 2.4887023886378308, + "grad_norm": 0.6502766013145447, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 7710 + }, + { + "epoch": 2.4919302775984504, + "grad_norm": 0.7638739347457886, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 7720 + }, + { + "epoch": 2.4951581665590705, + "grad_norm": 0.6217384338378906, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 7730 + }, + { + "epoch": 2.49838605551969, + "grad_norm": 0.7576302886009216, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 7740 + }, + { + "epoch": 2.50161394448031, + "grad_norm": 0.6877137422561646, + "learning_rate": 0.0002, + "loss": 0.6855, + "step": 7750 + }, + { + "epoch": 2.5048418334409295, + "grad_norm": 0.6998329162597656, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 7760 + }, + { + "epoch": 2.508069722401549, + "grad_norm": 0.7879213690757751, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 7770 + }, + { + "epoch": 2.5112976113621692, + "grad_norm": 0.7834980487823486, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 7780 + }, + { + "epoch": 2.514525500322789, + "grad_norm": 0.7789630889892578, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 7790 + }, + { + "epoch": 2.5177533892834085, + "grad_norm": 0.7403590083122253, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 7800 + }, + { + "epoch": 2.5209812782440286, + "grad_norm": 0.6029766201972961, + "learning_rate": 0.0002, + "loss": 0.6964, + "step": 7810 + }, + { + "epoch": 2.5242091672046483, + "grad_norm": 0.7061092257499695, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 7820 + }, + { + "epoch": 2.527437056165268, + "grad_norm": 0.7120763659477234, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 7830 + }, + { + "epoch": 2.5306649451258876, + "grad_norm": 0.6173675656318665, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 7840 + }, + { + "epoch": 2.5338928340865072, + "grad_norm": 0.9566813111305237, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 7850 + }, + { + "epoch": 2.5371207230471273, + "grad_norm": 0.8497620224952698, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 7860 + }, + { + "epoch": 2.540348612007747, + "grad_norm": 0.7663498520851135, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 7870 + }, + { + "epoch": 2.5435765009683666, + "grad_norm": 0.6329668760299683, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 7880 + }, + { + "epoch": 2.5468043899289863, + "grad_norm": 0.8128195405006409, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 7890 + }, + { + "epoch": 2.5500322788896064, + "grad_norm": 0.6622284650802612, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 7900 + }, + { + "epoch": 2.553260167850226, + "grad_norm": 0.8460057973861694, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 7910 + }, + { + "epoch": 2.5564880568108457, + "grad_norm": 0.6586956977844238, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 7920 + }, + { + "epoch": 2.5597159457714653, + "grad_norm": 0.7569382190704346, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 7930 + }, + { + "epoch": 2.562943834732085, + "grad_norm": 0.6409714221954346, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 7940 + }, + { + "epoch": 2.566171723692705, + "grad_norm": 0.7031713128089905, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 7950 + }, + { + "epoch": 2.5693996126533247, + "grad_norm": 0.7983605265617371, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 7960 + }, + { + "epoch": 2.5726275016139444, + "grad_norm": 0.7165433168411255, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 7970 + }, + { + "epoch": 2.5758553905745645, + "grad_norm": 0.6630598902702332, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 7980 + }, + { + "epoch": 2.579083279535184, + "grad_norm": 0.5883122086524963, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 7990 + }, + { + "epoch": 2.5823111684958038, + "grad_norm": 0.5928755402565002, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 8000 + }, + { + "epoch": 2.5855390574564234, + "grad_norm": 0.7843712568283081, + "learning_rate": 0.0002, + "loss": 0.6701, + "step": 8010 + }, + { + "epoch": 2.588766946417043, + "grad_norm": 0.7206324338912964, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 8020 + }, + { + "epoch": 2.5919948353776627, + "grad_norm": 0.812480092048645, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 8030 + }, + { + "epoch": 2.595222724338283, + "grad_norm": 0.9843078255653381, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 8040 + }, + { + "epoch": 2.5984506132989025, + "grad_norm": 0.7524392604827881, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 8050 + }, + { + "epoch": 2.601678502259522, + "grad_norm": 0.6220380067825317, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 8060 + }, + { + "epoch": 2.6049063912201422, + "grad_norm": 0.7461398243904114, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 8070 + }, + { + "epoch": 2.608134280180762, + "grad_norm": 0.720974326133728, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 8080 + }, + { + "epoch": 2.6113621691413815, + "grad_norm": 0.649509847164154, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 8090 + }, + { + "epoch": 2.614590058102001, + "grad_norm": 0.6894662976264954, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 8100 + }, + { + "epoch": 2.617817947062621, + "grad_norm": 0.734433114528656, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 8110 + }, + { + "epoch": 2.621045836023241, + "grad_norm": 0.7468628883361816, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 8120 + }, + { + "epoch": 2.6242737249838606, + "grad_norm": 0.6508180499076843, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 8130 + }, + { + "epoch": 2.6275016139444802, + "grad_norm": 0.8735209107398987, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 8140 + }, + { + "epoch": 2.6307295029051003, + "grad_norm": 0.8162857294082642, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 8150 + }, + { + "epoch": 2.63395739186572, + "grad_norm": 0.628872811794281, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 8160 + }, + { + "epoch": 2.6371852808263396, + "grad_norm": 0.8078708052635193, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 8170 + }, + { + "epoch": 2.6404131697869593, + "grad_norm": 0.7849429845809937, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 8180 + }, + { + "epoch": 2.643641058747579, + "grad_norm": 0.8115387558937073, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 8190 + }, + { + "epoch": 2.6468689477081986, + "grad_norm": 0.7462222576141357, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 8200 + }, + { + "epoch": 2.6500968366688187, + "grad_norm": 0.753662645816803, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 8210 + }, + { + "epoch": 2.6533247256294383, + "grad_norm": 0.6100404858589172, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 8220 + }, + { + "epoch": 2.656552614590058, + "grad_norm": 0.9084606766700745, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 8230 + }, + { + "epoch": 2.659780503550678, + "grad_norm": 0.6412538886070251, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 8240 + }, + { + "epoch": 2.6630083925112977, + "grad_norm": 0.7640451192855835, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 8250 + }, + { + "epoch": 2.6662362814719174, + "grad_norm": 0.5972344875335693, + "learning_rate": 0.0002, + "loss": 0.6846, + "step": 8260 + }, + { + "epoch": 2.669464170432537, + "grad_norm": 0.6935883164405823, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 8270 + }, + { + "epoch": 2.6726920593931567, + "grad_norm": 0.789399266242981, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 8280 + }, + { + "epoch": 2.675919948353777, + "grad_norm": 0.7143490314483643, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 8290 + }, + { + "epoch": 2.6791478373143964, + "grad_norm": 0.6670652627944946, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 8300 + }, + { + "epoch": 2.682375726275016, + "grad_norm": 0.687108039855957, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 8310 + }, + { + "epoch": 2.6856036152356357, + "grad_norm": 0.7914147973060608, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 8320 + }, + { + "epoch": 2.688831504196256, + "grad_norm": 0.8398420214653015, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 8330 + }, + { + "epoch": 2.6920593931568755, + "grad_norm": 0.6592720746994019, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 8340 + }, + { + "epoch": 2.695287282117495, + "grad_norm": 0.6888470649719238, + "learning_rate": 0.0002, + "loss": 0.6673, + "step": 8350 + }, + { + "epoch": 2.698515171078115, + "grad_norm": 0.7127556800842285, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 8360 + }, + { + "epoch": 2.7017430600387344, + "grad_norm": 0.6630286574363708, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 8370 + }, + { + "epoch": 2.7049709489993545, + "grad_norm": 0.8261964321136475, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 8380 + }, + { + "epoch": 2.708198837959974, + "grad_norm": 0.717339813709259, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 8390 + }, + { + "epoch": 2.711426726920594, + "grad_norm": 0.651637613773346, + "learning_rate": 0.0002, + "loss": 0.6929, + "step": 8400 + }, + { + "epoch": 2.714654615881214, + "grad_norm": 0.7936098575592041, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 8410 + }, + { + "epoch": 2.7178825048418336, + "grad_norm": 0.8761560320854187, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 8420 + }, + { + "epoch": 2.7211103938024532, + "grad_norm": 0.6768006086349487, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 8430 + }, + { + "epoch": 2.724338282763073, + "grad_norm": 0.7121055722236633, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 8440 + }, + { + "epoch": 2.7275661717236925, + "grad_norm": 0.6811696887016296, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 8450 + }, + { + "epoch": 2.730794060684312, + "grad_norm": 0.8168250918388367, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 8460 + }, + { + "epoch": 2.7340219496449323, + "grad_norm": 0.660682737827301, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 8470 + }, + { + "epoch": 2.737249838605552, + "grad_norm": 0.7369356155395508, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 8480 + }, + { + "epoch": 2.7404777275661716, + "grad_norm": 0.7545099854469299, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 8490 + }, + { + "epoch": 2.7437056165267917, + "grad_norm": 0.6991257667541504, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 8500 + }, + { + "epoch": 2.7469335054874113, + "grad_norm": 0.7195324301719666, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 8510 + }, + { + "epoch": 2.750161394448031, + "grad_norm": 0.8995378017425537, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 8520 + }, + { + "epoch": 2.7533892834086506, + "grad_norm": 0.6924123764038086, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 8530 + }, + { + "epoch": 2.7566171723692703, + "grad_norm": 0.6260585784912109, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 8540 + }, + { + "epoch": 2.7598450613298904, + "grad_norm": 0.7273091673851013, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 8550 + }, + { + "epoch": 2.76307295029051, + "grad_norm": 0.720562219619751, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 8560 + }, + { + "epoch": 2.7663008392511297, + "grad_norm": 0.6360004544258118, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 8570 + }, + { + "epoch": 2.76952872821175, + "grad_norm": 0.7634525895118713, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 8580 + }, + { + "epoch": 2.7727566171723694, + "grad_norm": 0.6586076021194458, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 8590 + }, + { + "epoch": 2.775984506132989, + "grad_norm": 0.6542639136314392, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 8600 + }, + { + "epoch": 2.7792123950936087, + "grad_norm": 0.7650290727615356, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 8610 + }, + { + "epoch": 2.7824402840542284, + "grad_norm": 0.6551542282104492, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 8620 + }, + { + "epoch": 2.785668173014848, + "grad_norm": 0.6915501952171326, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 8630 + }, + { + "epoch": 2.788896061975468, + "grad_norm": 0.8061493635177612, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 8640 + }, + { + "epoch": 2.792123950936088, + "grad_norm": 0.8403584957122803, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 8650 + }, + { + "epoch": 2.7953518398967074, + "grad_norm": 0.6455532312393188, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 8660 + }, + { + "epoch": 2.7985797288573275, + "grad_norm": 0.8296352028846741, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 8670 + }, + { + "epoch": 2.801807617817947, + "grad_norm": 0.7288752794265747, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 8680 + }, + { + "epoch": 2.805035506778567, + "grad_norm": 0.7628464102745056, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 8690 + }, + { + "epoch": 2.8082633957391865, + "grad_norm": 0.9993878602981567, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 8700 + }, + { + "epoch": 2.811491284699806, + "grad_norm": 0.6972465515136719, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 8710 + }, + { + "epoch": 2.8147191736604262, + "grad_norm": 0.645042896270752, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 8720 + }, + { + "epoch": 2.817947062621046, + "grad_norm": 0.6853853464126587, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 8730 + }, + { + "epoch": 2.8211749515816655, + "grad_norm": 0.5935067534446716, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 8740 + }, + { + "epoch": 2.824402840542285, + "grad_norm": 0.7336633205413818, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 8750 + }, + { + "epoch": 2.8276307295029053, + "grad_norm": 0.7074962854385376, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 8760 + }, + { + "epoch": 2.830858618463525, + "grad_norm": 0.6667559742927551, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 8770 + }, + { + "epoch": 2.8340865074241446, + "grad_norm": 0.8101205229759216, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 8780 + }, + { + "epoch": 2.8373143963847642, + "grad_norm": 0.8841480016708374, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 8790 + }, + { + "epoch": 2.840542285345384, + "grad_norm": 0.5891591310501099, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 8800 + }, + { + "epoch": 2.843770174306004, + "grad_norm": 0.667032778263092, + "learning_rate": 0.0002, + "loss": 0.7114, + "step": 8810 + }, + { + "epoch": 2.8469980632666236, + "grad_norm": 0.7629773020744324, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 8820 + }, + { + "epoch": 2.8502259522272433, + "grad_norm": 0.79471355676651, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 8830 + }, + { + "epoch": 2.8534538411878634, + "grad_norm": 0.7529178261756897, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 8840 + }, + { + "epoch": 2.856681730148483, + "grad_norm": 0.7014923691749573, + "learning_rate": 0.0002, + "loss": 0.7163, + "step": 8850 + }, + { + "epoch": 2.8599096191091027, + "grad_norm": 0.7996514439582825, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 8860 + }, + { + "epoch": 2.8631375080697223, + "grad_norm": 0.7044785618782043, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 8870 + }, + { + "epoch": 2.866365397030342, + "grad_norm": 0.6792093515396118, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 8880 + }, + { + "epoch": 2.8695932859909616, + "grad_norm": 0.69175124168396, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 8890 + }, + { + "epoch": 2.8728211749515817, + "grad_norm": 0.7499129176139832, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 8900 + }, + { + "epoch": 2.8760490639122014, + "grad_norm": 0.7678789496421814, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 8910 + }, + { + "epoch": 2.879276952872821, + "grad_norm": 0.7478128671646118, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 8920 + }, + { + "epoch": 2.882504841833441, + "grad_norm": 0.6767086386680603, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 8930 + }, + { + "epoch": 2.885732730794061, + "grad_norm": 0.7222196459770203, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 8940 + }, + { + "epoch": 2.8889606197546804, + "grad_norm": 0.6950580477714539, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 8950 + }, + { + "epoch": 2.8921885087153, + "grad_norm": 0.7759528160095215, + "learning_rate": 0.0002, + "loss": 0.7064, + "step": 8960 + }, + { + "epoch": 2.8954163976759197, + "grad_norm": 0.6686919927597046, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 8970 + }, + { + "epoch": 2.89864428663654, + "grad_norm": 0.9245954751968384, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 8980 + }, + { + "epoch": 2.9018721755971595, + "grad_norm": 0.8734814524650574, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 8990 + }, + { + "epoch": 2.905100064557779, + "grad_norm": 0.6056219339370728, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 9000 + }, + { + "epoch": 2.9083279535183992, + "grad_norm": 0.7364102005958557, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 9010 + }, + { + "epoch": 2.911555842479019, + "grad_norm": 0.6563605070114136, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 9020 + }, + { + "epoch": 2.9147837314396385, + "grad_norm": 0.659978985786438, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 9030 + }, + { + "epoch": 2.918011620400258, + "grad_norm": 0.8176041841506958, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 9040 + }, + { + "epoch": 2.921239509360878, + "grad_norm": 0.743677020072937, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 9050 + }, + { + "epoch": 2.9244673983214975, + "grad_norm": 0.7418383359909058, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 9060 + }, + { + "epoch": 2.9276952872821176, + "grad_norm": 0.6916524767875671, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 9070 + }, + { + "epoch": 2.9309231762427372, + "grad_norm": 0.6559975743293762, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 9080 + }, + { + "epoch": 2.934151065203357, + "grad_norm": 0.7431221008300781, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 9090 + }, + { + "epoch": 2.937378954163977, + "grad_norm": 0.7525941133499146, + "learning_rate": 0.0002, + "loss": 0.6829, + "step": 9100 + }, + { + "epoch": 2.9406068431245966, + "grad_norm": 0.6860167384147644, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 9110 + }, + { + "epoch": 2.9438347320852163, + "grad_norm": 0.6467666029930115, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 9120 + }, + { + "epoch": 2.947062621045836, + "grad_norm": 0.7595751285552979, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 9130 + }, + { + "epoch": 2.9502905100064556, + "grad_norm": 0.6558279991149902, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 9140 + }, + { + "epoch": 2.9535183989670757, + "grad_norm": 0.6818708181381226, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 9150 + }, + { + "epoch": 2.9567462879276953, + "grad_norm": 0.8387085795402527, + "learning_rate": 0.0002, + "loss": 0.6921, + "step": 9160 + }, + { + "epoch": 2.959974176888315, + "grad_norm": 0.7705109715461731, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 9170 + }, + { + "epoch": 2.9632020658489346, + "grad_norm": 0.688106894493103, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 9180 + }, + { + "epoch": 2.9664299548095547, + "grad_norm": 0.659532368183136, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 9190 + }, + { + "epoch": 2.9696578437701744, + "grad_norm": 0.6839388608932495, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 9200 + }, + { + "epoch": 2.972885732730794, + "grad_norm": 0.6927599310874939, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 9210 + }, + { + "epoch": 2.9761136216914137, + "grad_norm": 0.6902472972869873, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 9220 + }, + { + "epoch": 2.9793415106520333, + "grad_norm": 0.620399534702301, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 9230 + }, + { + "epoch": 2.9825693996126534, + "grad_norm": 0.6812364459037781, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 9240 + }, + { + "epoch": 2.985797288573273, + "grad_norm": 0.7681456208229065, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 9250 + }, + { + "epoch": 2.9890251775338927, + "grad_norm": 0.7621907591819763, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 9260 + }, + { + "epoch": 2.992253066494513, + "grad_norm": 0.6075740456581116, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 9270 + }, + { + "epoch": 2.9954809554551325, + "grad_norm": 0.7100434899330139, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 9280 + }, + { + "epoch": 2.998708844415752, + "grad_norm": 0.7314488887786865, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 9290 + }, + { + "epoch": 3.0, + "eval_loss": 1.1434104442596436, + "eval_runtime": 166.3732, + "eval_samples_per_second": 4.406, + "eval_steps_per_second": 0.553, + "step": 9294 + }, + { + "epoch": 3.001936733376372, + "grad_norm": 0.7408893704414368, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 9300 + }, + { + "epoch": 3.0051646223369914, + "grad_norm": 0.9773574471473694, + "learning_rate": 0.0002, + "loss": 0.5182, + "step": 9310 + }, + { + "epoch": 3.0083925112976115, + "grad_norm": 0.7919653058052063, + "learning_rate": 0.0002, + "loss": 0.5432, + "step": 9320 + }, + { + "epoch": 3.011620400258231, + "grad_norm": 0.9139202833175659, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 9330 + }, + { + "epoch": 3.014848289218851, + "grad_norm": 0.8296737670898438, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 9340 + }, + { + "epoch": 3.0180761781794705, + "grad_norm": 0.786868155002594, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 9350 + }, + { + "epoch": 3.0213040671400906, + "grad_norm": 0.5928055644035339, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 9360 + }, + { + "epoch": 3.0245319561007102, + "grad_norm": 0.8785701394081116, + "learning_rate": 0.0002, + "loss": 0.5376, + "step": 9370 + }, + { + "epoch": 3.02775984506133, + "grad_norm": 0.7978872060775757, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 9380 + }, + { + "epoch": 3.0309877340219495, + "grad_norm": 0.7160913348197937, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 9390 + }, + { + "epoch": 3.034215622982569, + "grad_norm": 0.904465913772583, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 9400 + }, + { + "epoch": 3.0374435119431893, + "grad_norm": 0.7082195281982422, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 9410 + }, + { + "epoch": 3.040671400903809, + "grad_norm": 0.9686778783798218, + "learning_rate": 0.0002, + "loss": 0.5434, + "step": 9420 + }, + { + "epoch": 3.0438992898644286, + "grad_norm": 0.8788613677024841, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 9430 + }, + { + "epoch": 3.0471271788250482, + "grad_norm": 0.8217582106590271, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 9440 + }, + { + "epoch": 3.0503550677856683, + "grad_norm": 0.7380914092063904, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 9450 + }, + { + "epoch": 3.053582956746288, + "grad_norm": 0.7339285612106323, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 9460 + }, + { + "epoch": 3.0568108457069076, + "grad_norm": 0.7175183296203613, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 9470 + }, + { + "epoch": 3.0600387346675273, + "grad_norm": 0.8275379538536072, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 9480 + }, + { + "epoch": 3.0632666236281474, + "grad_norm": 0.6544256806373596, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 9490 + }, + { + "epoch": 3.066494512588767, + "grad_norm": 0.8193472623825073, + "learning_rate": 0.0002, + "loss": 0.5365, + "step": 9500 + }, + { + "epoch": 3.0697224015493867, + "grad_norm": 0.7967836856842041, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 9510 + }, + { + "epoch": 3.0729502905100063, + "grad_norm": 0.8788684010505676, + "learning_rate": 0.0002, + "loss": 0.5629, + "step": 9520 + }, + { + "epoch": 3.0761781794706264, + "grad_norm": 0.9410629868507385, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 9530 + }, + { + "epoch": 3.079406068431246, + "grad_norm": 0.7448706030845642, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 9540 + }, + { + "epoch": 3.0826339573918657, + "grad_norm": 0.9149372577667236, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 9550 + }, + { + "epoch": 3.0858618463524854, + "grad_norm": 0.7265563607215881, + "learning_rate": 0.0002, + "loss": 0.5347, + "step": 9560 + }, + { + "epoch": 3.089089735313105, + "grad_norm": 1.0305068492889404, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 9570 + }, + { + "epoch": 3.092317624273725, + "grad_norm": 0.7987357974052429, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 9580 + }, + { + "epoch": 3.095545513234345, + "grad_norm": 0.7733123898506165, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 9590 + }, + { + "epoch": 3.0987734021949644, + "grad_norm": 1.0438069105148315, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 9600 + }, + { + "epoch": 3.102001291155584, + "grad_norm": 0.7951784729957581, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 9610 + }, + { + "epoch": 3.105229180116204, + "grad_norm": 0.7776783108711243, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 9620 + }, + { + "epoch": 3.108457069076824, + "grad_norm": 0.7060676217079163, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 9630 + }, + { + "epoch": 3.1116849580374435, + "grad_norm": 0.871569037437439, + "learning_rate": 0.0002, + "loss": 0.5731, + "step": 9640 + }, + { + "epoch": 3.114912846998063, + "grad_norm": 0.8873385787010193, + "learning_rate": 0.0002, + "loss": 0.5168, + "step": 9650 + }, + { + "epoch": 3.118140735958683, + "grad_norm": 0.750998318195343, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 9660 + }, + { + "epoch": 3.121368624919303, + "grad_norm": 0.8678529262542725, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 9670 + }, + { + "epoch": 3.1245965138799225, + "grad_norm": 0.7706599235534668, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 9680 + }, + { + "epoch": 3.127824402840542, + "grad_norm": 0.8317574858665466, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 9690 + }, + { + "epoch": 3.131052291801162, + "grad_norm": 0.801800012588501, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 9700 + }, + { + "epoch": 3.134280180761782, + "grad_norm": 0.8574623465538025, + "learning_rate": 0.0002, + "loss": 0.6044, + "step": 9710 + }, + { + "epoch": 3.1375080697224016, + "grad_norm": 0.6556540727615356, + "learning_rate": 0.0002, + "loss": 0.6072, + "step": 9720 + }, + { + "epoch": 3.1407359586830212, + "grad_norm": 0.8555161952972412, + "learning_rate": 0.0002, + "loss": 0.6058, + "step": 9730 + }, + { + "epoch": 3.143963847643641, + "grad_norm": 0.8825467824935913, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 9740 + }, + { + "epoch": 3.147191736604261, + "grad_norm": 0.8297156691551208, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 9750 + }, + { + "epoch": 3.1504196255648806, + "grad_norm": 0.7710384726524353, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 9760 + }, + { + "epoch": 3.1536475145255003, + "grad_norm": 0.8778039216995239, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 9770 + }, + { + "epoch": 3.15687540348612, + "grad_norm": 0.9014058113098145, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 9780 + }, + { + "epoch": 3.16010329244674, + "grad_norm": 0.6856890320777893, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 9790 + }, + { + "epoch": 3.1633311814073597, + "grad_norm": 0.6520644426345825, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 9800 + }, + { + "epoch": 3.1665590703679793, + "grad_norm": 0.7250499129295349, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 9810 + }, + { + "epoch": 3.169786959328599, + "grad_norm": 0.8331542015075684, + "learning_rate": 0.0002, + "loss": 0.5823, + "step": 9820 + }, + { + "epoch": 3.1730148482892186, + "grad_norm": 0.8531261682510376, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 9830 + }, + { + "epoch": 3.1762427372498387, + "grad_norm": 0.8997558355331421, + "learning_rate": 0.0002, + "loss": 0.57, + "step": 9840 + }, + { + "epoch": 3.1794706262104584, + "grad_norm": 0.708335280418396, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 9850 + }, + { + "epoch": 3.182698515171078, + "grad_norm": 1.0074886083602905, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 9860 + }, + { + "epoch": 3.1859264041316977, + "grad_norm": 1.0804681777954102, + "learning_rate": 0.0002, + "loss": 0.573, + "step": 9870 + }, + { + "epoch": 3.189154293092318, + "grad_norm": 0.9510730504989624, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 9880 + }, + { + "epoch": 3.1923821820529374, + "grad_norm": 0.7211061716079712, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 9890 + }, + { + "epoch": 3.195610071013557, + "grad_norm": 0.8767086267471313, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 9900 + }, + { + "epoch": 3.1988379599741767, + "grad_norm": 0.8388153314590454, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 9910 + }, + { + "epoch": 3.202065848934797, + "grad_norm": 0.8038473725318909, + "learning_rate": 0.0002, + "loss": 0.5681, + "step": 9920 + }, + { + "epoch": 3.2052937378954165, + "grad_norm": 0.8187747001647949, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 9930 + }, + { + "epoch": 3.208521626856036, + "grad_norm": 0.7427355051040649, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 9940 + }, + { + "epoch": 3.211749515816656, + "grad_norm": 0.8017025589942932, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 9950 + }, + { + "epoch": 3.214977404777276, + "grad_norm": 0.738595187664032, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 9960 + }, + { + "epoch": 3.2182052937378955, + "grad_norm": 0.7521342039108276, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 9970 + }, + { + "epoch": 3.221433182698515, + "grad_norm": 0.840329110622406, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 9980 + }, + { + "epoch": 3.224661071659135, + "grad_norm": 0.9809671640396118, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 9990 + }, + { + "epoch": 3.2278889606197545, + "grad_norm": 0.8456943035125732, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 10000 + }, + { + "epoch": 3.2311168495803746, + "grad_norm": 0.8962995409965515, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 10010 + }, + { + "epoch": 3.2343447385409942, + "grad_norm": 0.6492817401885986, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 10020 + }, + { + "epoch": 3.237572627501614, + "grad_norm": 1.0471255779266357, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 10030 + }, + { + "epoch": 3.2408005164622335, + "grad_norm": 0.7995471358299255, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 10040 + }, + { + "epoch": 3.2440284054228536, + "grad_norm": 0.7231964468955994, + "learning_rate": 0.0002, + "loss": 0.615, + "step": 10050 + }, + { + "epoch": 3.2472562943834733, + "grad_norm": 0.639630138874054, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 10060 + }, + { + "epoch": 3.250484183344093, + "grad_norm": 0.7957055568695068, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 10070 + }, + { + "epoch": 3.2537120723047126, + "grad_norm": 0.7735482454299927, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 10080 + }, + { + "epoch": 3.2569399612653323, + "grad_norm": 0.8139488101005554, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 10090 + }, + { + "epoch": 3.2601678502259523, + "grad_norm": 0.8113240003585815, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 10100 + }, + { + "epoch": 3.263395739186572, + "grad_norm": 0.7735909819602966, + "learning_rate": 0.0002, + "loss": 0.5617, + "step": 10110 + }, + { + "epoch": 3.2666236281471916, + "grad_norm": 0.7760744094848633, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 10120 + }, + { + "epoch": 3.2698515171078113, + "grad_norm": 0.8078505396842957, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 10130 + }, + { + "epoch": 3.2730794060684314, + "grad_norm": 0.983648955821991, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 10140 + }, + { + "epoch": 3.276307295029051, + "grad_norm": 0.7131832242012024, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 10150 + }, + { + "epoch": 3.2795351839896707, + "grad_norm": 0.924493134021759, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 10160 + }, + { + "epoch": 3.2827630729502904, + "grad_norm": 0.9371112585067749, + "learning_rate": 0.0002, + "loss": 0.5733, + "step": 10170 + }, + { + "epoch": 3.2859909619109104, + "grad_norm": 0.8989261388778687, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 10180 + }, + { + "epoch": 3.28921885087153, + "grad_norm": 0.8130394816398621, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 10190 + }, + { + "epoch": 3.2924467398321497, + "grad_norm": 0.9899941086769104, + "learning_rate": 0.0002, + "loss": 0.5555, + "step": 10200 + }, + { + "epoch": 3.2956746287927694, + "grad_norm": 1.007038950920105, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 10210 + }, + { + "epoch": 3.2989025177533895, + "grad_norm": 0.7465066313743591, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 10220 + }, + { + "epoch": 3.302130406714009, + "grad_norm": 0.7202590703964233, + "learning_rate": 0.0002, + "loss": 0.6307, + "step": 10230 + }, + { + "epoch": 3.305358295674629, + "grad_norm": 0.6258249282836914, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 10240 + }, + { + "epoch": 3.3085861846352485, + "grad_norm": 0.8996058702468872, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 10250 + }, + { + "epoch": 3.311814073595868, + "grad_norm": 0.9550982713699341, + "learning_rate": 0.0002, + "loss": 0.5825, + "step": 10260 + }, + { + "epoch": 3.315041962556488, + "grad_norm": 0.7010059952735901, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 10270 + }, + { + "epoch": 3.318269851517108, + "grad_norm": 0.9639869332313538, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 10280 + }, + { + "epoch": 3.3214977404777275, + "grad_norm": 1.0192502737045288, + "learning_rate": 0.0002, + "loss": 0.5362, + "step": 10290 + }, + { + "epoch": 3.324725629438347, + "grad_norm": 0.7953670024871826, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 10300 + }, + { + "epoch": 3.3279535183989672, + "grad_norm": 0.7436774969100952, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 10310 + }, + { + "epoch": 3.331181407359587, + "grad_norm": 0.7846777439117432, + "learning_rate": 0.0002, + "loss": 0.5823, + "step": 10320 + }, + { + "epoch": 3.3344092963202066, + "grad_norm": 0.8963494896888733, + "learning_rate": 0.0002, + "loss": 0.6119, + "step": 10330 + }, + { + "epoch": 3.337637185280826, + "grad_norm": 0.6876392364501953, + "learning_rate": 0.0002, + "loss": 0.5872, + "step": 10340 + }, + { + "epoch": 3.340865074241446, + "grad_norm": 0.9161638021469116, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 10350 + }, + { + "epoch": 3.344092963202066, + "grad_norm": 0.8964458107948303, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 10360 + }, + { + "epoch": 3.3473208521626856, + "grad_norm": 0.9052296280860901, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 10370 + }, + { + "epoch": 3.3505487411233053, + "grad_norm": 0.9292596578598022, + "learning_rate": 0.0002, + "loss": 0.5958, + "step": 10380 + }, + { + "epoch": 3.3537766300839253, + "grad_norm": 0.9605957269668579, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 10390 + }, + { + "epoch": 3.357004519044545, + "grad_norm": 1.0198872089385986, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 10400 + }, + { + "epoch": 3.3602324080051647, + "grad_norm": 0.7043630480766296, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 10410 + }, + { + "epoch": 3.3634602969657843, + "grad_norm": 1.0533326864242554, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 10420 + }, + { + "epoch": 3.366688185926404, + "grad_norm": 0.7552485466003418, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 10430 + }, + { + "epoch": 3.369916074887024, + "grad_norm": 0.692708432674408, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 10440 + }, + { + "epoch": 3.3731439638476437, + "grad_norm": 0.985952615737915, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 10450 + }, + { + "epoch": 3.3763718528082634, + "grad_norm": 0.6749676465988159, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 10460 + }, + { + "epoch": 3.379599741768883, + "grad_norm": 0.9514535665512085, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 10470 + }, + { + "epoch": 3.382827630729503, + "grad_norm": 1.2681142091751099, + "learning_rate": 0.0002, + "loss": 0.5982, + "step": 10480 + }, + { + "epoch": 3.3860555196901228, + "grad_norm": 1.031968355178833, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 10490 + }, + { + "epoch": 3.3892834086507424, + "grad_norm": 0.8061563968658447, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 10500 + }, + { + "epoch": 3.392511297611362, + "grad_norm": 1.0515062808990479, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 10510 + }, + { + "epoch": 3.3957391865719817, + "grad_norm": 0.9055540561676025, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 10520 + }, + { + "epoch": 3.398967075532602, + "grad_norm": 0.9318141341209412, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 10530 + }, + { + "epoch": 3.4021949644932215, + "grad_norm": 0.8266817331314087, + "learning_rate": 0.0002, + "loss": 0.5722, + "step": 10540 + }, + { + "epoch": 3.405422853453841, + "grad_norm": 1.2322112321853638, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 10550 + }, + { + "epoch": 3.4086507424144608, + "grad_norm": 0.9535136818885803, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 10560 + }, + { + "epoch": 3.411878631375081, + "grad_norm": 0.9243819117546082, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 10570 + }, + { + "epoch": 3.4151065203357005, + "grad_norm": 0.9011809825897217, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 10580 + }, + { + "epoch": 3.41833440929632, + "grad_norm": 0.9923036694526672, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 10590 + }, + { + "epoch": 3.42156229825694, + "grad_norm": 0.8903067111968994, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 10600 + }, + { + "epoch": 3.42479018721756, + "grad_norm": 0.7101534605026245, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 10610 + }, + { + "epoch": 3.4280180761781796, + "grad_norm": 0.8186570405960083, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 10620 + }, + { + "epoch": 3.431245965138799, + "grad_norm": 0.9480205774307251, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 10630 + }, + { + "epoch": 3.434473854099419, + "grad_norm": 1.1370961666107178, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 10640 + }, + { + "epoch": 3.437701743060039, + "grad_norm": 1.017669677734375, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 10650 + }, + { + "epoch": 3.4409296320206586, + "grad_norm": 0.7625100016593933, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 10660 + }, + { + "epoch": 3.4441575209812783, + "grad_norm": 0.9288196563720703, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 10670 + }, + { + "epoch": 3.447385409941898, + "grad_norm": 0.8800460696220398, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 10680 + }, + { + "epoch": 3.4506132989025176, + "grad_norm": 0.7499661445617676, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 10690 + }, + { + "epoch": 3.4538411878631377, + "grad_norm": 0.8254973292350769, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 10700 + }, + { + "epoch": 3.4570690768237573, + "grad_norm": 0.8735857605934143, + "learning_rate": 0.0002, + "loss": 0.5742, + "step": 10710 + }, + { + "epoch": 3.460296965784377, + "grad_norm": 0.9601819515228271, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 10720 + }, + { + "epoch": 3.4635248547449966, + "grad_norm": 0.8031058311462402, + "learning_rate": 0.0002, + "loss": 0.5574, + "step": 10730 + }, + { + "epoch": 3.4667527437056167, + "grad_norm": 0.8039247393608093, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 10740 + }, + { + "epoch": 3.4699806326662364, + "grad_norm": 0.8936953544616699, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 10750 + }, + { + "epoch": 3.473208521626856, + "grad_norm": 0.8201186060905457, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 10760 + }, + { + "epoch": 3.4764364105874757, + "grad_norm": 1.0064148902893066, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 10770 + }, + { + "epoch": 3.4796642995480953, + "grad_norm": 0.8617483377456665, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 10780 + }, + { + "epoch": 3.4828921885087154, + "grad_norm": 0.8532096147537231, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 10790 + }, + { + "epoch": 3.486120077469335, + "grad_norm": 0.8646879196166992, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 10800 + }, + { + "epoch": 3.4893479664299547, + "grad_norm": 0.7962660789489746, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 10810 + }, + { + "epoch": 3.492575855390575, + "grad_norm": 0.9560028314590454, + "learning_rate": 0.0002, + "loss": 0.5398, + "step": 10820 + }, + { + "epoch": 3.4958037443511945, + "grad_norm": 0.928439736366272, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 10830 + }, + { + "epoch": 3.499031633311814, + "grad_norm": 0.8219282627105713, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 10840 + }, + { + "epoch": 3.5022595222724338, + "grad_norm": 0.7918338179588318, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 10850 + }, + { + "epoch": 3.5054874112330534, + "grad_norm": 0.961295485496521, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 10860 + }, + { + "epoch": 3.5087153001936735, + "grad_norm": 1.0731624364852905, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 10870 + }, + { + "epoch": 3.511943189154293, + "grad_norm": 0.9551863074302673, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 10880 + }, + { + "epoch": 3.515171078114913, + "grad_norm": 0.8409819602966309, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 10890 + }, + { + "epoch": 3.5183989670755325, + "grad_norm": 0.7546320557594299, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 10900 + }, + { + "epoch": 3.5216268560361526, + "grad_norm": 0.7505252361297607, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 10910 + }, + { + "epoch": 3.524854744996772, + "grad_norm": 0.7505561113357544, + "learning_rate": 0.0002, + "loss": 0.5649, + "step": 10920 + }, + { + "epoch": 3.528082633957392, + "grad_norm": 1.086177945137024, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 10930 + }, + { + "epoch": 3.5313105229180115, + "grad_norm": 0.7721118330955505, + "learning_rate": 0.0002, + "loss": 0.5983, + "step": 10940 + }, + { + "epoch": 3.534538411878631, + "grad_norm": 0.9567878246307373, + "learning_rate": 0.0002, + "loss": 0.5919, + "step": 10950 + }, + { + "epoch": 3.5377663008392513, + "grad_norm": 0.8377360105514526, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 10960 + }, + { + "epoch": 3.540994189799871, + "grad_norm": 1.0174858570098877, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 10970 + }, + { + "epoch": 3.5442220787604906, + "grad_norm": 0.8164418935775757, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 10980 + }, + { + "epoch": 3.5474499677211107, + "grad_norm": 0.8959241509437561, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 10990 + }, + { + "epoch": 3.5506778566817303, + "grad_norm": 1.0154379606246948, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 11000 + }, + { + "epoch": 3.55390574564235, + "grad_norm": 0.7812292575836182, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 11010 + }, + { + "epoch": 3.5571336346029696, + "grad_norm": 0.9849029779434204, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 11020 + }, + { + "epoch": 3.5603615235635893, + "grad_norm": 0.8826184272766113, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 11030 + }, + { + "epoch": 3.563589412524209, + "grad_norm": 0.9039685726165771, + "learning_rate": 0.0002, + "loss": 0.601, + "step": 11040 + }, + { + "epoch": 3.566817301484829, + "grad_norm": 0.9585249423980713, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 11050 + }, + { + "epoch": 3.5700451904454487, + "grad_norm": 0.8083069324493408, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 11060 + }, + { + "epoch": 3.5732730794060683, + "grad_norm": 0.9528678059577942, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 11070 + }, + { + "epoch": 3.5765009683666884, + "grad_norm": 0.8297588229179382, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 11080 + }, + { + "epoch": 3.579728857327308, + "grad_norm": 0.8191716074943542, + "learning_rate": 0.0002, + "loss": 0.5919, + "step": 11090 + }, + { + "epoch": 3.5829567462879277, + "grad_norm": 0.8056275844573975, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 11100 + }, + { + "epoch": 3.5861846352485474, + "grad_norm": 0.701930582523346, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 11110 + }, + { + "epoch": 3.589412524209167, + "grad_norm": 0.7644643187522888, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 11120 + }, + { + "epoch": 3.592640413169787, + "grad_norm": 0.668004035949707, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 11130 + }, + { + "epoch": 3.5958683021304068, + "grad_norm": 0.8849539756774902, + "learning_rate": 0.0002, + "loss": 0.5735, + "step": 11140 + }, + { + "epoch": 3.5990961910910264, + "grad_norm": 0.8123571276664734, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 11150 + }, + { + "epoch": 3.602324080051646, + "grad_norm": 0.7591469287872314, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 11160 + }, + { + "epoch": 3.605551969012266, + "grad_norm": 0.776466965675354, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 11170 + }, + { + "epoch": 3.608779857972886, + "grad_norm": 0.9156150221824646, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 11180 + }, + { + "epoch": 3.6120077469335055, + "grad_norm": 0.7517618536949158, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 11190 + }, + { + "epoch": 3.615235635894125, + "grad_norm": 0.931239128112793, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 11200 + }, + { + "epoch": 3.6184635248547448, + "grad_norm": 0.9107872843742371, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 11210 + }, + { + "epoch": 3.621691413815365, + "grad_norm": 0.7624770998954773, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 11220 + }, + { + "epoch": 3.6249193027759845, + "grad_norm": 0.8129580616950989, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 11230 + }, + { + "epoch": 3.628147191736604, + "grad_norm": 0.7339836955070496, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 11240 + }, + { + "epoch": 3.6313750806972243, + "grad_norm": 0.8901296854019165, + "learning_rate": 0.0002, + "loss": 0.5976, + "step": 11250 + }, + { + "epoch": 3.634602969657844, + "grad_norm": 1.1374726295471191, + "learning_rate": 0.0002, + "loss": 0.5977, + "step": 11260 + }, + { + "epoch": 3.6378308586184636, + "grad_norm": 0.7438275218009949, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 11270 + }, + { + "epoch": 3.641058747579083, + "grad_norm": 0.808646559715271, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 11280 + }, + { + "epoch": 3.644286636539703, + "grad_norm": 1.091810941696167, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 11290 + }, + { + "epoch": 3.6475145255003225, + "grad_norm": 0.8439257144927979, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 11300 + }, + { + "epoch": 3.6507424144609426, + "grad_norm": 0.9720633029937744, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 11310 + }, + { + "epoch": 3.6539703034215623, + "grad_norm": 0.738571047782898, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 11320 + }, + { + "epoch": 3.657198192382182, + "grad_norm": 0.6961580514907837, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 11330 + }, + { + "epoch": 3.660426081342802, + "grad_norm": 0.8192131519317627, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 11340 + }, + { + "epoch": 3.6636539703034217, + "grad_norm": 0.8367205858230591, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 11350 + }, + { + "epoch": 3.6668818592640413, + "grad_norm": 0.7735666632652283, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 11360 + }, + { + "epoch": 3.670109748224661, + "grad_norm": 0.6507132649421692, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 11370 + }, + { + "epoch": 3.6733376371852806, + "grad_norm": 0.8271192312240601, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 11380 + }, + { + "epoch": 3.6765655261459007, + "grad_norm": 0.8724204301834106, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 11390 + }, + { + "epoch": 3.6797934151065204, + "grad_norm": 0.8448445200920105, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 11400 + }, + { + "epoch": 3.68302130406714, + "grad_norm": 0.6756882071495056, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 11410 + }, + { + "epoch": 3.68624919302776, + "grad_norm": 0.7859625816345215, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 11420 + }, + { + "epoch": 3.6894770819883798, + "grad_norm": 0.8929487466812134, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 11430 + }, + { + "epoch": 3.6927049709489994, + "grad_norm": 0.8163391351699829, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 11440 + }, + { + "epoch": 3.695932859909619, + "grad_norm": 0.8948464393615723, + "learning_rate": 0.0002, + "loss": 0.6467, + "step": 11450 + }, + { + "epoch": 3.6991607488702387, + "grad_norm": 0.8654782176017761, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 11460 + }, + { + "epoch": 3.7023886378308584, + "grad_norm": 0.9514864683151245, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 11470 + }, + { + "epoch": 3.7056165267914785, + "grad_norm": 0.7298579812049866, + "learning_rate": 0.0002, + "loss": 0.606, + "step": 11480 + }, + { + "epoch": 3.708844415752098, + "grad_norm": 0.9266309142112732, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 11490 + }, + { + "epoch": 3.7120723047127178, + "grad_norm": 0.8608686923980713, + "learning_rate": 0.0002, + "loss": 0.6122, + "step": 11500 + }, + { + "epoch": 3.715300193673338, + "grad_norm": 0.921788215637207, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 11510 + }, + { + "epoch": 3.7185280826339575, + "grad_norm": 0.8537021279335022, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 11520 + }, + { + "epoch": 3.721755971594577, + "grad_norm": 1.115194320678711, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 11530 + }, + { + "epoch": 3.724983860555197, + "grad_norm": 0.7614817023277283, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 11540 + }, + { + "epoch": 3.7282117495158165, + "grad_norm": 0.871999204158783, + "learning_rate": 0.0002, + "loss": 0.5776, + "step": 11550 + }, + { + "epoch": 3.7314396384764366, + "grad_norm": 0.9668049812316895, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 11560 + }, + { + "epoch": 3.734667527437056, + "grad_norm": 1.2185815572738647, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 11570 + }, + { + "epoch": 3.737895416397676, + "grad_norm": 0.8258453011512756, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 11580 + }, + { + "epoch": 3.7411233053582955, + "grad_norm": 0.8708966374397278, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 11590 + }, + { + "epoch": 3.7443511943189156, + "grad_norm": 0.7784267663955688, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 11600 + }, + { + "epoch": 3.7475790832795353, + "grad_norm": 0.7504425048828125, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 11610 + }, + { + "epoch": 3.750806972240155, + "grad_norm": 0.9144526124000549, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 11620 + }, + { + "epoch": 3.7540348612007746, + "grad_norm": 0.922581672668457, + "learning_rate": 0.0002, + "loss": 0.6425, + "step": 11630 + }, + { + "epoch": 3.757262750161394, + "grad_norm": 0.9348630905151367, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 11640 + }, + { + "epoch": 3.7604906391220143, + "grad_norm": 1.0740231275558472, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 11650 + }, + { + "epoch": 3.763718528082634, + "grad_norm": 0.884830117225647, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 11660 + }, + { + "epoch": 3.7669464170432536, + "grad_norm": 1.0256348848342896, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 11670 + }, + { + "epoch": 3.7701743060038737, + "grad_norm": 0.6795592904090881, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 11680 + }, + { + "epoch": 3.7734021949644934, + "grad_norm": 0.9381206631660461, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 11690 + }, + { + "epoch": 3.776630083925113, + "grad_norm": 0.7633092403411865, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 11700 + }, + { + "epoch": 3.7798579728857327, + "grad_norm": 0.7506213188171387, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 11710 + }, + { + "epoch": 3.7830858618463523, + "grad_norm": 0.8182913064956665, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 11720 + }, + { + "epoch": 3.786313750806972, + "grad_norm": 1.019322156906128, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 11730 + }, + { + "epoch": 3.789541639767592, + "grad_norm": 0.8895221948623657, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 11740 + }, + { + "epoch": 3.7927695287282117, + "grad_norm": 0.948847770690918, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 11750 + }, + { + "epoch": 3.7959974176888314, + "grad_norm": 0.9068999886512756, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 11760 + }, + { + "epoch": 3.7992253066494515, + "grad_norm": 0.7920539975166321, + "learning_rate": 0.0002, + "loss": 0.6163, + "step": 11770 + }, + { + "epoch": 3.802453195610071, + "grad_norm": 0.8441922068595886, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 11780 + }, + { + "epoch": 3.8056810845706908, + "grad_norm": 0.9258501529693604, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 11790 + }, + { + "epoch": 3.8089089735313104, + "grad_norm": 0.7354241609573364, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 11800 + }, + { + "epoch": 3.81213686249193, + "grad_norm": 0.9494872689247131, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 11810 + }, + { + "epoch": 3.81536475145255, + "grad_norm": 0.8266556859016418, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 11820 + }, + { + "epoch": 3.81859264041317, + "grad_norm": 0.7951219081878662, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 11830 + }, + { + "epoch": 3.8218205293737895, + "grad_norm": 0.7688382267951965, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 11840 + }, + { + "epoch": 3.8250484183344096, + "grad_norm": 1.0917940139770508, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 11850 + }, + { + "epoch": 3.828276307295029, + "grad_norm": 0.9880442023277283, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 11860 + }, + { + "epoch": 3.831504196255649, + "grad_norm": 0.8433151245117188, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 11870 + }, + { + "epoch": 3.8347320852162685, + "grad_norm": 0.8691204786300659, + "learning_rate": 0.0002, + "loss": 0.5876, + "step": 11880 + }, + { + "epoch": 3.837959974176888, + "grad_norm": 0.7698143124580383, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 11890 + }, + { + "epoch": 3.841187863137508, + "grad_norm": 0.8874883651733398, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 11900 + }, + { + "epoch": 3.844415752098128, + "grad_norm": 1.1209359169006348, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 11910 + }, + { + "epoch": 3.8476436410587476, + "grad_norm": 0.7723544239997864, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 11920 + }, + { + "epoch": 3.850871530019367, + "grad_norm": 0.8363937139511108, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 11930 + }, + { + "epoch": 3.8540994189799873, + "grad_norm": 0.9209707975387573, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 11940 + }, + { + "epoch": 3.857327307940607, + "grad_norm": 0.9456894993782043, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 11950 + }, + { + "epoch": 3.8605551969012266, + "grad_norm": 1.5748413801193237, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 11960 + }, + { + "epoch": 3.8637830858618463, + "grad_norm": 0.9083569049835205, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 11970 + }, + { + "epoch": 3.867010974822466, + "grad_norm": 0.7672823071479797, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 11980 + }, + { + "epoch": 3.870238863783086, + "grad_norm": 0.8647152185440063, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 11990 + }, + { + "epoch": 3.8734667527437057, + "grad_norm": 0.9564255475997925, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 12000 + }, + { + "epoch": 3.8766946417043253, + "grad_norm": 0.773267924785614, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 12010 + }, + { + "epoch": 3.879922530664945, + "grad_norm": 0.8030173182487488, + "learning_rate": 0.0002, + "loss": 0.6057, + "step": 12020 + }, + { + "epoch": 3.883150419625565, + "grad_norm": 0.8002150058746338, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 12030 + }, + { + "epoch": 3.8863783085861847, + "grad_norm": 0.98802250623703, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 12040 + }, + { + "epoch": 3.8896061975468044, + "grad_norm": 0.7868124842643738, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 12050 + }, + { + "epoch": 3.892834086507424, + "grad_norm": 0.932182788848877, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 12060 + }, + { + "epoch": 3.8960619754680437, + "grad_norm": 0.8576806783676147, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 12070 + }, + { + "epoch": 3.8992898644286638, + "grad_norm": 0.8985713124275208, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 12080 + }, + { + "epoch": 3.9025177533892834, + "grad_norm": 0.7876521944999695, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 12090 + }, + { + "epoch": 3.905745642349903, + "grad_norm": 0.773936927318573, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 12100 + }, + { + "epoch": 3.908973531310523, + "grad_norm": 0.7274761199951172, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 12110 + }, + { + "epoch": 3.912201420271143, + "grad_norm": 0.8625598549842834, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 12120 + }, + { + "epoch": 3.9154293092317625, + "grad_norm": 0.8702362179756165, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 12130 + }, + { + "epoch": 3.918657198192382, + "grad_norm": 0.912579357624054, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 12140 + }, + { + "epoch": 3.9218850871530018, + "grad_norm": 0.8697066903114319, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 12150 + }, + { + "epoch": 3.9251129761136214, + "grad_norm": 1.005232572555542, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 12160 + }, + { + "epoch": 3.9283408650742415, + "grad_norm": 0.793902575969696, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 12170 + }, + { + "epoch": 3.931568754034861, + "grad_norm": 0.7025905847549438, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 12180 + }, + { + "epoch": 3.934796642995481, + "grad_norm": 0.97635817527771, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 12190 + }, + { + "epoch": 3.938024531956101, + "grad_norm": 0.855417013168335, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 12200 + }, + { + "epoch": 3.9412524209167206, + "grad_norm": 0.8841291666030884, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 12210 + }, + { + "epoch": 3.94448030987734, + "grad_norm": 1.1762064695358276, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 12220 + }, + { + "epoch": 3.94770819883796, + "grad_norm": 0.8393193483352661, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 12230 + }, + { + "epoch": 3.9509360877985795, + "grad_norm": 0.9324905276298523, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 12240 + }, + { + "epoch": 3.9541639767591996, + "grad_norm": 0.8607982993125916, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 12250 + }, + { + "epoch": 3.9573918657198193, + "grad_norm": 0.8586681485176086, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 12260 + }, + { + "epoch": 3.960619754680439, + "grad_norm": 1.1082909107208252, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 12270 + }, + { + "epoch": 3.963847643641059, + "grad_norm": 1.065027117729187, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 12280 + }, + { + "epoch": 3.9670755326016787, + "grad_norm": 0.9544363021850586, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 12290 + }, + { + "epoch": 3.9703034215622983, + "grad_norm": 0.9008927345275879, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 12300 + }, + { + "epoch": 3.973531310522918, + "grad_norm": 0.8717467188835144, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 12310 + }, + { + "epoch": 3.9767591994835376, + "grad_norm": 0.9718339443206787, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 12320 + }, + { + "epoch": 3.9799870884441573, + "grad_norm": 1.0362015962600708, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 12330 + }, + { + "epoch": 3.9832149774047774, + "grad_norm": 1.0844318866729736, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 12340 + }, + { + "epoch": 3.986442866365397, + "grad_norm": 0.7506240606307983, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 12350 + }, + { + "epoch": 3.9896707553260167, + "grad_norm": 1.005982756614685, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 12360 + }, + { + "epoch": 3.9928986442866368, + "grad_norm": 0.7566431164741516, + "learning_rate": 0.0002, + "loss": 0.5926, + "step": 12370 + }, + { + "epoch": 3.9961265332472564, + "grad_norm": 0.8819181323051453, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 12380 + }, + { + "epoch": 3.999354422207876, + "grad_norm": 0.884497880935669, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 12390 + }, + { + "epoch": 4.0, + "eval_loss": 1.1907150745391846, + "eval_runtime": 161.5766, + "eval_samples_per_second": 4.537, + "eval_steps_per_second": 0.569, + "step": 12392 + }, + { + "epoch": 4.002582311168496, + "grad_norm": 1.0407241582870483, + "learning_rate": 0.0002, + "loss": 0.5203, + "step": 12400 + }, + { + "epoch": 4.005810200129115, + "grad_norm": 1.0199295282363892, + "learning_rate": 0.0002, + "loss": 0.4978, + "step": 12410 + }, + { + "epoch": 4.009038089089735, + "grad_norm": 0.8456302881240845, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 12420 + }, + { + "epoch": 4.012265978050355, + "grad_norm": 1.0621124505996704, + "learning_rate": 0.0002, + "loss": 0.4669, + "step": 12430 + }, + { + "epoch": 4.015493867010975, + "grad_norm": 0.8984712362289429, + "learning_rate": 0.0002, + "loss": 0.5277, + "step": 12440 + }, + { + "epoch": 4.018721755971595, + "grad_norm": 1.3785864114761353, + "learning_rate": 0.0002, + "loss": 0.5508, + "step": 12450 + }, + { + "epoch": 4.0219496449322145, + "grad_norm": 0.7911781668663025, + "learning_rate": 0.0002, + "loss": 0.5244, + "step": 12460 + }, + { + "epoch": 4.025177533892834, + "grad_norm": 1.0977907180786133, + "learning_rate": 0.0002, + "loss": 0.4746, + "step": 12470 + }, + { + "epoch": 4.028405422853454, + "grad_norm": 1.0664983987808228, + "learning_rate": 0.0002, + "loss": 0.4632, + "step": 12480 + }, + { + "epoch": 4.0316333118140735, + "grad_norm": 1.0807124376296997, + "learning_rate": 0.0002, + "loss": 0.5151, + "step": 12490 + }, + { + "epoch": 4.034861200774693, + "grad_norm": 1.2650192975997925, + "learning_rate": 0.0002, + "loss": 0.4712, + "step": 12500 + }, + { + "epoch": 4.038089089735313, + "grad_norm": 0.7164070010185242, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 12510 + }, + { + "epoch": 4.041316978695932, + "grad_norm": 1.0047489404678345, + "learning_rate": 0.0002, + "loss": 0.5015, + "step": 12520 + }, + { + "epoch": 4.044544867656553, + "grad_norm": 0.9303901791572571, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 12530 + }, + { + "epoch": 4.047772756617173, + "grad_norm": 1.0319702625274658, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 12540 + }, + { + "epoch": 4.051000645577792, + "grad_norm": 0.9549729228019714, + "learning_rate": 0.0002, + "loss": 0.4834, + "step": 12550 + }, + { + "epoch": 4.054228534538412, + "grad_norm": 0.7175564765930176, + "learning_rate": 0.0002, + "loss": 0.5235, + "step": 12560 + }, + { + "epoch": 4.057456423499032, + "grad_norm": 1.0622259378433228, + "learning_rate": 0.0002, + "loss": 0.5257, + "step": 12570 + }, + { + "epoch": 4.060684312459651, + "grad_norm": 1.172074556350708, + "learning_rate": 0.0002, + "loss": 0.5098, + "step": 12580 + }, + { + "epoch": 4.063912201420271, + "grad_norm": 0.9702366590499878, + "learning_rate": 0.0002, + "loss": 0.5112, + "step": 12590 + }, + { + "epoch": 4.0671400903808905, + "grad_norm": 0.741511344909668, + "learning_rate": 0.0002, + "loss": 0.5042, + "step": 12600 + }, + { + "epoch": 4.070367979341511, + "grad_norm": 0.8632621169090271, + "learning_rate": 0.0002, + "loss": 0.4996, + "step": 12610 + }, + { + "epoch": 4.073595868302131, + "grad_norm": 0.9695962071418762, + "learning_rate": 0.0002, + "loss": 0.4927, + "step": 12620 + }, + { + "epoch": 4.07682375726275, + "grad_norm": 0.9401052594184875, + "learning_rate": 0.0002, + "loss": 0.4618, + "step": 12630 + }, + { + "epoch": 4.08005164622337, + "grad_norm": 0.8068707585334778, + "learning_rate": 0.0002, + "loss": 0.4889, + "step": 12640 + }, + { + "epoch": 4.08327953518399, + "grad_norm": 0.9554762840270996, + "learning_rate": 0.0002, + "loss": 0.5046, + "step": 12650 + }, + { + "epoch": 4.086507424144609, + "grad_norm": 0.7637128233909607, + "learning_rate": 0.0002, + "loss": 0.5081, + "step": 12660 + }, + { + "epoch": 4.089735313105229, + "grad_norm": 0.6703744530677795, + "learning_rate": 0.0002, + "loss": 0.4997, + "step": 12670 + }, + { + "epoch": 4.092963202065849, + "grad_norm": 0.8623828887939453, + "learning_rate": 0.0002, + "loss": 0.4977, + "step": 12680 + }, + { + "epoch": 4.096191091026468, + "grad_norm": 0.8198223114013672, + "learning_rate": 0.0002, + "loss": 0.4616, + "step": 12690 + }, + { + "epoch": 4.099418979987089, + "grad_norm": 1.3449875116348267, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 12700 + }, + { + "epoch": 4.1026468689477085, + "grad_norm": 0.8333606123924255, + "learning_rate": 0.0002, + "loss": 0.4782, + "step": 12710 + }, + { + "epoch": 4.105874757908328, + "grad_norm": 1.1647733449935913, + "learning_rate": 0.0002, + "loss": 0.5135, + "step": 12720 + }, + { + "epoch": 4.109102646868948, + "grad_norm": 1.0560213327407837, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 12730 + }, + { + "epoch": 4.112330535829567, + "grad_norm": 0.9479449987411499, + "learning_rate": 0.0002, + "loss": 0.5244, + "step": 12740 + }, + { + "epoch": 4.115558424790187, + "grad_norm": 1.1634587049484253, + "learning_rate": 0.0002, + "loss": 0.4596, + "step": 12750 + }, + { + "epoch": 4.118786313750807, + "grad_norm": 0.813987672328949, + "learning_rate": 0.0002, + "loss": 0.4966, + "step": 12760 + }, + { + "epoch": 4.122014202711426, + "grad_norm": 0.968461275100708, + "learning_rate": 0.0002, + "loss": 0.5133, + "step": 12770 + }, + { + "epoch": 4.125242091672046, + "grad_norm": 0.9324830770492554, + "learning_rate": 0.0002, + "loss": 0.5113, + "step": 12780 + }, + { + "epoch": 4.128469980632667, + "grad_norm": 0.8313411474227905, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 12790 + }, + { + "epoch": 4.131697869593286, + "grad_norm": 1.0177634954452515, + "learning_rate": 0.0002, + "loss": 0.5169, + "step": 12800 + }, + { + "epoch": 4.134925758553906, + "grad_norm": 1.0890623331069946, + "learning_rate": 0.0002, + "loss": 0.4635, + "step": 12810 + }, + { + "epoch": 4.1381536475145255, + "grad_norm": 0.9131693840026855, + "learning_rate": 0.0002, + "loss": 0.519, + "step": 12820 + }, + { + "epoch": 4.141381536475145, + "grad_norm": 0.8400680422782898, + "learning_rate": 0.0002, + "loss": 0.5017, + "step": 12830 + }, + { + "epoch": 4.144609425435765, + "grad_norm": 0.8988795876502991, + "learning_rate": 0.0002, + "loss": 0.5195, + "step": 12840 + }, + { + "epoch": 4.1478373143963845, + "grad_norm": 0.9224025011062622, + "learning_rate": 0.0002, + "loss": 0.5052, + "step": 12850 + }, + { + "epoch": 4.151065203357004, + "grad_norm": 0.7453159689903259, + "learning_rate": 0.0002, + "loss": 0.5001, + "step": 12860 + }, + { + "epoch": 4.154293092317625, + "grad_norm": 0.9815868139266968, + "learning_rate": 0.0002, + "loss": 0.4874, + "step": 12870 + }, + { + "epoch": 4.157520981278244, + "grad_norm": 1.2542768716812134, + "learning_rate": 0.0002, + "loss": 0.5485, + "step": 12880 + }, + { + "epoch": 4.160748870238864, + "grad_norm": 1.0092132091522217, + "learning_rate": 0.0002, + "loss": 0.5287, + "step": 12890 + }, + { + "epoch": 4.163976759199484, + "grad_norm": 1.1836622953414917, + "learning_rate": 0.0002, + "loss": 0.5125, + "step": 12900 + }, + { + "epoch": 4.167204648160103, + "grad_norm": 0.7706810235977173, + "learning_rate": 0.0002, + "loss": 0.5089, + "step": 12910 + }, + { + "epoch": 4.170432537120723, + "grad_norm": 1.00058913230896, + "learning_rate": 0.0002, + "loss": 0.5123, + "step": 12920 + }, + { + "epoch": 4.173660426081343, + "grad_norm": 1.2326250076293945, + "learning_rate": 0.0002, + "loss": 0.5238, + "step": 12930 + }, + { + "epoch": 4.176888315041962, + "grad_norm": 0.8829123377799988, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 12940 + }, + { + "epoch": 4.180116204002582, + "grad_norm": 0.936042845249176, + "learning_rate": 0.0002, + "loss": 0.517, + "step": 12950 + }, + { + "epoch": 4.183344092963202, + "grad_norm": 0.9773517847061157, + "learning_rate": 0.0002, + "loss": 0.4991, + "step": 12960 + }, + { + "epoch": 4.186571981923822, + "grad_norm": 0.9786297678947449, + "learning_rate": 0.0002, + "loss": 0.5025, + "step": 12970 + }, + { + "epoch": 4.189799870884442, + "grad_norm": 0.7524558901786804, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 12980 + }, + { + "epoch": 4.193027759845061, + "grad_norm": 1.0107866525650024, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 12990 + }, + { + "epoch": 4.196255648805681, + "grad_norm": 1.0092947483062744, + "learning_rate": 0.0002, + "loss": 0.5304, + "step": 13000 + }, + { + "epoch": 4.199483537766301, + "grad_norm": 1.18181312084198, + "learning_rate": 0.0002, + "loss": 0.5061, + "step": 13010 + }, + { + "epoch": 4.20271142672692, + "grad_norm": 0.8845750093460083, + "learning_rate": 0.0002, + "loss": 0.512, + "step": 13020 + }, + { + "epoch": 4.20593931568754, + "grad_norm": 1.0789145231246948, + "learning_rate": 0.0002, + "loss": 0.5329, + "step": 13030 + }, + { + "epoch": 4.2091672046481605, + "grad_norm": 0.9562082886695862, + "learning_rate": 0.0002, + "loss": 0.5001, + "step": 13040 + }, + { + "epoch": 4.21239509360878, + "grad_norm": 0.875755786895752, + "learning_rate": 0.0002, + "loss": 0.5211, + "step": 13050 + }, + { + "epoch": 4.2156229825694, + "grad_norm": 1.0694596767425537, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 13060 + }, + { + "epoch": 4.2188508715300195, + "grad_norm": 1.0053378343582153, + "learning_rate": 0.0002, + "loss": 0.4917, + "step": 13070 + }, + { + "epoch": 4.222078760490639, + "grad_norm": 1.1628689765930176, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 13080 + }, + { + "epoch": 4.225306649451259, + "grad_norm": 0.9455991983413696, + "learning_rate": 0.0002, + "loss": 0.4796, + "step": 13090 + }, + { + "epoch": 4.228534538411878, + "grad_norm": 0.9736765623092651, + "learning_rate": 0.0002, + "loss": 0.4802, + "step": 13100 + }, + { + "epoch": 4.231762427372498, + "grad_norm": 0.8653560876846313, + "learning_rate": 0.0002, + "loss": 0.5411, + "step": 13110 + }, + { + "epoch": 4.234990316333118, + "grad_norm": 0.9335988163948059, + "learning_rate": 0.0002, + "loss": 0.5347, + "step": 13120 + }, + { + "epoch": 4.238218205293738, + "grad_norm": 0.9102661609649658, + "learning_rate": 0.0002, + "loss": 0.5217, + "step": 13130 + }, + { + "epoch": 4.241446094254358, + "grad_norm": 1.0595461130142212, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 13140 + }, + { + "epoch": 4.244673983214978, + "grad_norm": 0.8947662711143494, + "learning_rate": 0.0002, + "loss": 0.517, + "step": 13150 + }, + { + "epoch": 4.247901872175597, + "grad_norm": 1.0835723876953125, + "learning_rate": 0.0002, + "loss": 0.5116, + "step": 13160 + }, + { + "epoch": 4.251129761136217, + "grad_norm": 0.8496462106704712, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 13170 + }, + { + "epoch": 4.2543576500968365, + "grad_norm": 0.9395631551742554, + "learning_rate": 0.0002, + "loss": 0.5079, + "step": 13180 + }, + { + "epoch": 4.257585539057456, + "grad_norm": 1.2939592599868774, + "learning_rate": 0.0002, + "loss": 0.5076, + "step": 13190 + }, + { + "epoch": 4.260813428018076, + "grad_norm": 0.9325923919677734, + "learning_rate": 0.0002, + "loss": 0.5209, + "step": 13200 + }, + { + "epoch": 4.264041316978696, + "grad_norm": 0.9220664501190186, + "learning_rate": 0.0002, + "loss": 0.4984, + "step": 13210 + }, + { + "epoch": 4.267269205939316, + "grad_norm": 0.9505137205123901, + "learning_rate": 0.0002, + "loss": 0.5553, + "step": 13220 + }, + { + "epoch": 4.270497094899936, + "grad_norm": 1.0713751316070557, + "learning_rate": 0.0002, + "loss": 0.5238, + "step": 13230 + }, + { + "epoch": 4.273724983860555, + "grad_norm": 0.8390375971794128, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 13240 + }, + { + "epoch": 4.276952872821175, + "grad_norm": 0.8943426012992859, + "learning_rate": 0.0002, + "loss": 0.5217, + "step": 13250 + }, + { + "epoch": 4.280180761781795, + "grad_norm": 0.9175868630409241, + "learning_rate": 0.0002, + "loss": 0.5486, + "step": 13260 + }, + { + "epoch": 4.283408650742414, + "grad_norm": 0.9969881176948547, + "learning_rate": 0.0002, + "loss": 0.5208, + "step": 13270 + }, + { + "epoch": 4.286636539703034, + "grad_norm": 1.2271877527236938, + "learning_rate": 0.0002, + "loss": 0.5376, + "step": 13280 + }, + { + "epoch": 4.289864428663654, + "grad_norm": 0.9463263154029846, + "learning_rate": 0.0002, + "loss": 0.4811, + "step": 13290 + }, + { + "epoch": 4.293092317624274, + "grad_norm": 1.0306228399276733, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 13300 + }, + { + "epoch": 4.296320206584894, + "grad_norm": 0.8454763889312744, + "learning_rate": 0.0002, + "loss": 0.5092, + "step": 13310 + }, + { + "epoch": 4.299548095545513, + "grad_norm": 0.9843119978904724, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 13320 + }, + { + "epoch": 4.302775984506133, + "grad_norm": 1.0836851596832275, + "learning_rate": 0.0002, + "loss": 0.5407, + "step": 13330 + }, + { + "epoch": 4.306003873466753, + "grad_norm": 1.0719412565231323, + "learning_rate": 0.0002, + "loss": 0.5336, + "step": 13340 + }, + { + "epoch": 4.309231762427372, + "grad_norm": 0.9276487827301025, + "learning_rate": 0.0002, + "loss": 0.4798, + "step": 13350 + }, + { + "epoch": 4.312459651387992, + "grad_norm": 0.897072434425354, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 13360 + }, + { + "epoch": 4.315687540348612, + "grad_norm": 1.0493228435516357, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 13370 + }, + { + "epoch": 4.318915429309232, + "grad_norm": 0.9446353316307068, + "learning_rate": 0.0002, + "loss": 0.5218, + "step": 13380 + }, + { + "epoch": 4.322143318269852, + "grad_norm": 0.7765224575996399, + "learning_rate": 0.0002, + "loss": 0.4765, + "step": 13390 + }, + { + "epoch": 4.3253712072304715, + "grad_norm": 0.9100048542022705, + "learning_rate": 0.0002, + "loss": 0.5907, + "step": 13400 + }, + { + "epoch": 4.328599096191091, + "grad_norm": 1.0913089513778687, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 13410 + }, + { + "epoch": 4.331826985151711, + "grad_norm": 0.9607733488082886, + "learning_rate": 0.0002, + "loss": 0.494, + "step": 13420 + }, + { + "epoch": 4.3350548741123305, + "grad_norm": 0.8774219155311584, + "learning_rate": 0.0002, + "loss": 0.5273, + "step": 13430 + }, + { + "epoch": 4.33828276307295, + "grad_norm": 0.8366804122924805, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 13440 + }, + { + "epoch": 4.34151065203357, + "grad_norm": 1.034727931022644, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 13450 + }, + { + "epoch": 4.344738540994189, + "grad_norm": 0.942743182182312, + "learning_rate": 0.0002, + "loss": 0.4995, + "step": 13460 + }, + { + "epoch": 4.347966429954809, + "grad_norm": 0.7237029075622559, + "learning_rate": 0.0002, + "loss": 0.5222, + "step": 13470 + }, + { + "epoch": 4.35119431891543, + "grad_norm": 0.8216196894645691, + "learning_rate": 0.0002, + "loss": 0.5461, + "step": 13480 + }, + { + "epoch": 4.354422207876049, + "grad_norm": 1.031860113143921, + "learning_rate": 0.0002, + "loss": 0.5104, + "step": 13490 + }, + { + "epoch": 4.357650096836669, + "grad_norm": 0.8880493640899658, + "learning_rate": 0.0002, + "loss": 0.547, + "step": 13500 + }, + { + "epoch": 4.360877985797289, + "grad_norm": 0.8442490696907043, + "learning_rate": 0.0002, + "loss": 0.5259, + "step": 13510 + }, + { + "epoch": 4.364105874757908, + "grad_norm": 1.270971655845642, + "learning_rate": 0.0002, + "loss": 0.5176, + "step": 13520 + }, + { + "epoch": 4.367333763718528, + "grad_norm": 0.9657870531082153, + "learning_rate": 0.0002, + "loss": 0.5028, + "step": 13530 + }, + { + "epoch": 4.3705616526791475, + "grad_norm": 0.7477133870124817, + "learning_rate": 0.0002, + "loss": 0.5136, + "step": 13540 + }, + { + "epoch": 4.373789541639767, + "grad_norm": 1.0209243297576904, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 13550 + }, + { + "epoch": 4.377017430600388, + "grad_norm": 0.8714015483856201, + "learning_rate": 0.0002, + "loss": 0.4888, + "step": 13560 + }, + { + "epoch": 4.380245319561007, + "grad_norm": 1.0490189790725708, + "learning_rate": 0.0002, + "loss": 0.5428, + "step": 13570 + }, + { + "epoch": 4.383473208521627, + "grad_norm": 0.9454663991928101, + "learning_rate": 0.0002, + "loss": 0.5398, + "step": 13580 + }, + { + "epoch": 4.386701097482247, + "grad_norm": 1.154146432876587, + "learning_rate": 0.0002, + "loss": 0.5072, + "step": 13590 + }, + { + "epoch": 4.389928986442866, + "grad_norm": 1.155090570449829, + "learning_rate": 0.0002, + "loss": 0.5096, + "step": 13600 + }, + { + "epoch": 4.393156875403486, + "grad_norm": 0.9853842854499817, + "learning_rate": 0.0002, + "loss": 0.5679, + "step": 13610 + }, + { + "epoch": 4.396384764364106, + "grad_norm": 0.9265837669372559, + "learning_rate": 0.0002, + "loss": 0.4992, + "step": 13620 + }, + { + "epoch": 4.399612653324725, + "grad_norm": 0.8367540240287781, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 13630 + }, + { + "epoch": 4.402840542285345, + "grad_norm": 1.1453629732131958, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 13640 + }, + { + "epoch": 4.4060684312459655, + "grad_norm": 1.0856295824050903, + "learning_rate": 0.0002, + "loss": 0.573, + "step": 13650 + }, + { + "epoch": 4.409296320206585, + "grad_norm": 0.9284523129463196, + "learning_rate": 0.0002, + "loss": 0.5178, + "step": 13660 + }, + { + "epoch": 4.412524209167205, + "grad_norm": 0.9632299542427063, + "learning_rate": 0.0002, + "loss": 0.4862, + "step": 13670 + }, + { + "epoch": 4.415752098127824, + "grad_norm": 1.048524260520935, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 13680 + }, + { + "epoch": 4.418979987088444, + "grad_norm": 0.9787682294845581, + "learning_rate": 0.0002, + "loss": 0.5258, + "step": 13690 + }, + { + "epoch": 4.422207876049064, + "grad_norm": 1.0728684663772583, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 13700 + }, + { + "epoch": 4.425435765009683, + "grad_norm": 0.72867351770401, + "learning_rate": 0.0002, + "loss": 0.5243, + "step": 13710 + }, + { + "epoch": 4.428663653970303, + "grad_norm": 0.8932793736457825, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 13720 + }, + { + "epoch": 4.431891542930924, + "grad_norm": 1.098343849182129, + "learning_rate": 0.0002, + "loss": 0.5156, + "step": 13730 + }, + { + "epoch": 4.435119431891543, + "grad_norm": 0.9321235418319702, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 13740 + }, + { + "epoch": 4.438347320852163, + "grad_norm": 0.8868634104728699, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 13750 + }, + { + "epoch": 4.4415752098127825, + "grad_norm": 1.200064778327942, + "learning_rate": 0.0002, + "loss": 0.5284, + "step": 13760 + }, + { + "epoch": 4.444803098773402, + "grad_norm": 0.8968019485473633, + "learning_rate": 0.0002, + "loss": 0.5208, + "step": 13770 + }, + { + "epoch": 4.448030987734022, + "grad_norm": 0.9560935497283936, + "learning_rate": 0.0002, + "loss": 0.4979, + "step": 13780 + }, + { + "epoch": 4.4512588766946415, + "grad_norm": 0.7985701560974121, + "learning_rate": 0.0002, + "loss": 0.5134, + "step": 13790 + }, + { + "epoch": 4.454486765655261, + "grad_norm": 1.062540888786316, + "learning_rate": 0.0002, + "loss": 0.5113, + "step": 13800 + }, + { + "epoch": 4.457714654615881, + "grad_norm": 1.0827109813690186, + "learning_rate": 0.0002, + "loss": 0.525, + "step": 13810 + }, + { + "epoch": 4.460942543576501, + "grad_norm": 1.0853543281555176, + "learning_rate": 0.0002, + "loss": 0.5541, + "step": 13820 + }, + { + "epoch": 4.464170432537121, + "grad_norm": 1.0613641738891602, + "learning_rate": 0.0002, + "loss": 0.5381, + "step": 13830 + }, + { + "epoch": 4.467398321497741, + "grad_norm": 0.9037535190582275, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 13840 + }, + { + "epoch": 4.47062621045836, + "grad_norm": 0.9216223955154419, + "learning_rate": 0.0002, + "loss": 0.5112, + "step": 13850 + }, + { + "epoch": 4.47385409941898, + "grad_norm": 0.8952260613441467, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 13860 + }, + { + "epoch": 4.4770819883796, + "grad_norm": 0.9997953176498413, + "learning_rate": 0.0002, + "loss": 0.5026, + "step": 13870 + }, + { + "epoch": 4.480309877340219, + "grad_norm": 1.062458872795105, + "learning_rate": 0.0002, + "loss": 0.5107, + "step": 13880 + }, + { + "epoch": 4.483537766300839, + "grad_norm": 0.9185126423835754, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 13890 + }, + { + "epoch": 4.486765655261459, + "grad_norm": 1.2389954328536987, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 13900 + }, + { + "epoch": 4.489993544222079, + "grad_norm": 1.1632126569747925, + "learning_rate": 0.0002, + "loss": 0.5199, + "step": 13910 + }, + { + "epoch": 4.493221433182699, + "grad_norm": 1.0304487943649292, + "learning_rate": 0.0002, + "loss": 0.5128, + "step": 13920 + }, + { + "epoch": 4.496449322143318, + "grad_norm": 0.9144788384437561, + "learning_rate": 0.0002, + "loss": 0.5331, + "step": 13930 + }, + { + "epoch": 4.499677211103938, + "grad_norm": 1.0285682678222656, + "learning_rate": 0.0002, + "loss": 0.5312, + "step": 13940 + }, + { + "epoch": 4.502905100064558, + "grad_norm": 1.1187206506729126, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 13950 + }, + { + "epoch": 4.506132989025177, + "grad_norm": 0.7917197942733765, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 13960 + }, + { + "epoch": 4.509360877985797, + "grad_norm": 0.8495619297027588, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 13970 + }, + { + "epoch": 4.512588766946417, + "grad_norm": 1.0450760126113892, + "learning_rate": 0.0002, + "loss": 0.4971, + "step": 13980 + }, + { + "epoch": 4.515816655907037, + "grad_norm": 1.0061010122299194, + "learning_rate": 0.0002, + "loss": 0.5402, + "step": 13990 + }, + { + "epoch": 4.519044544867657, + "grad_norm": 1.0232428312301636, + "learning_rate": 0.0002, + "loss": 0.527, + "step": 14000 + }, + { + "epoch": 4.5222724338282765, + "grad_norm": 0.8734631538391113, + "learning_rate": 0.0002, + "loss": 0.5002, + "step": 14010 + }, + { + "epoch": 4.525500322788896, + "grad_norm": 1.1085621118545532, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 14020 + }, + { + "epoch": 4.528728211749516, + "grad_norm": 0.9178624749183655, + "learning_rate": 0.0002, + "loss": 0.5167, + "step": 14030 + }, + { + "epoch": 4.531956100710135, + "grad_norm": 1.0687317848205566, + "learning_rate": 0.0002, + "loss": 0.5589, + "step": 14040 + }, + { + "epoch": 4.535183989670755, + "grad_norm": 0.9237300157546997, + "learning_rate": 0.0002, + "loss": 0.5576, + "step": 14050 + }, + { + "epoch": 4.538411878631375, + "grad_norm": 0.9667123556137085, + "learning_rate": 0.0002, + "loss": 0.5062, + "step": 14060 + }, + { + "epoch": 4.541639767591995, + "grad_norm": 1.1286747455596924, + "learning_rate": 0.0002, + "loss": 0.5645, + "step": 14070 + }, + { + "epoch": 4.544867656552615, + "grad_norm": 1.055392861366272, + "learning_rate": 0.0002, + "loss": 0.5226, + "step": 14080 + }, + { + "epoch": 4.548095545513235, + "grad_norm": 0.9492936134338379, + "learning_rate": 0.0002, + "loss": 0.5428, + "step": 14090 + }, + { + "epoch": 4.551323434473854, + "grad_norm": 0.9881349802017212, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 14100 + }, + { + "epoch": 4.554551323434474, + "grad_norm": 0.9389023184776306, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 14110 + }, + { + "epoch": 4.5577792123950935, + "grad_norm": 0.8395606875419617, + "learning_rate": 0.0002, + "loss": 0.5511, + "step": 14120 + }, + { + "epoch": 4.561007101355713, + "grad_norm": 0.9019067287445068, + "learning_rate": 0.0002, + "loss": 0.5696, + "step": 14130 + }, + { + "epoch": 4.564234990316333, + "grad_norm": 1.1058136224746704, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 14140 + }, + { + "epoch": 4.5674628792769525, + "grad_norm": 1.0683821439743042, + "learning_rate": 0.0002, + "loss": 0.5323, + "step": 14150 + }, + { + "epoch": 4.570690768237572, + "grad_norm": 1.3398395776748657, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 14160 + }, + { + "epoch": 4.573918657198193, + "grad_norm": 0.7829096913337708, + "learning_rate": 0.0002, + "loss": 0.4713, + "step": 14170 + }, + { + "epoch": 4.577146546158812, + "grad_norm": 0.9636675119400024, + "learning_rate": 0.0002, + "loss": 0.525, + "step": 14180 + }, + { + "epoch": 4.580374435119432, + "grad_norm": 1.0291401147842407, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 14190 + }, + { + "epoch": 4.583602324080052, + "grad_norm": 1.0894310474395752, + "learning_rate": 0.0002, + "loss": 0.5366, + "step": 14200 + }, + { + "epoch": 4.586830213040671, + "grad_norm": 1.111573576927185, + "learning_rate": 0.0002, + "loss": 0.5125, + "step": 14210 + }, + { + "epoch": 4.590058102001291, + "grad_norm": 0.9345336556434631, + "learning_rate": 0.0002, + "loss": 0.5444, + "step": 14220 + }, + { + "epoch": 4.593285990961911, + "grad_norm": 1.3338757753372192, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 14230 + }, + { + "epoch": 4.596513879922531, + "grad_norm": 1.1146448850631714, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 14240 + }, + { + "epoch": 4.599741768883151, + "grad_norm": 1.1576755046844482, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 14250 + }, + { + "epoch": 4.60296965784377, + "grad_norm": 0.6851092576980591, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 14260 + }, + { + "epoch": 4.60619754680439, + "grad_norm": 0.9067938923835754, + "learning_rate": 0.0002, + "loss": 0.5027, + "step": 14270 + }, + { + "epoch": 4.60942543576501, + "grad_norm": 0.8767340183258057, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 14280 + }, + { + "epoch": 4.612653324725629, + "grad_norm": 1.024880290031433, + "learning_rate": 0.0002, + "loss": 0.5294, + "step": 14290 + }, + { + "epoch": 4.615881213686249, + "grad_norm": 0.9226394891738892, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 14300 + }, + { + "epoch": 4.619109102646869, + "grad_norm": 1.018187165260315, + "learning_rate": 0.0002, + "loss": 0.5281, + "step": 14310 + }, + { + "epoch": 4.622336991607488, + "grad_norm": 0.8851249814033508, + "learning_rate": 0.0002, + "loss": 0.5546, + "step": 14320 + }, + { + "epoch": 4.625564880568108, + "grad_norm": 0.745798647403717, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 14330 + }, + { + "epoch": 4.6287927695287285, + "grad_norm": 1.2082698345184326, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 14340 + }, + { + "epoch": 4.632020658489348, + "grad_norm": 0.901454508304596, + "learning_rate": 0.0002, + "loss": 0.5449, + "step": 14350 + }, + { + "epoch": 4.635248547449968, + "grad_norm": 0.9593124985694885, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 14360 + }, + { + "epoch": 4.6384764364105875, + "grad_norm": 1.1241410970687866, + "learning_rate": 0.0002, + "loss": 0.4939, + "step": 14370 + }, + { + "epoch": 4.641704325371207, + "grad_norm": 0.9221102595329285, + "learning_rate": 0.0002, + "loss": 0.5319, + "step": 14380 + }, + { + "epoch": 4.644932214331827, + "grad_norm": 1.0035039186477661, + "learning_rate": 0.0002, + "loss": 0.524, + "step": 14390 + }, + { + "epoch": 4.648160103292446, + "grad_norm": 1.1270662546157837, + "learning_rate": 0.0002, + "loss": 0.5617, + "step": 14400 + }, + { + "epoch": 4.651387992253067, + "grad_norm": 0.8631120324134827, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 14410 + }, + { + "epoch": 4.654615881213687, + "grad_norm": 1.0604606866836548, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 14420 + }, + { + "epoch": 4.657843770174306, + "grad_norm": 0.8002706170082092, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 14430 + }, + { + "epoch": 4.661071659134926, + "grad_norm": 1.0642075538635254, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 14440 + }, + { + "epoch": 4.664299548095546, + "grad_norm": 0.9315671324729919, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 14450 + }, + { + "epoch": 4.667527437056165, + "grad_norm": 0.8311864137649536, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 14460 + }, + { + "epoch": 4.670755326016785, + "grad_norm": 0.8900430202484131, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 14470 + }, + { + "epoch": 4.6739832149774045, + "grad_norm": 1.059267282485962, + "learning_rate": 0.0002, + "loss": 0.5086, + "step": 14480 + }, + { + "epoch": 4.677211103938024, + "grad_norm": 0.9864052534103394, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 14490 + }, + { + "epoch": 4.680438992898644, + "grad_norm": 1.210854411125183, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 14500 + }, + { + "epoch": 4.683666881859264, + "grad_norm": 1.030693769454956, + "learning_rate": 0.0002, + "loss": 0.536, + "step": 14510 + }, + { + "epoch": 4.686894770819884, + "grad_norm": 0.9809406995773315, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 14520 + }, + { + "epoch": 4.690122659780504, + "grad_norm": 1.0471004247665405, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 14530 + }, + { + "epoch": 4.693350548741123, + "grad_norm": 1.1583727598190308, + "learning_rate": 0.0002, + "loss": 0.5613, + "step": 14540 + }, + { + "epoch": 4.696578437701743, + "grad_norm": 0.9664418697357178, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 14550 + }, + { + "epoch": 4.699806326662363, + "grad_norm": 0.9511209726333618, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 14560 + }, + { + "epoch": 4.703034215622982, + "grad_norm": 1.0211684703826904, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 14570 + }, + { + "epoch": 4.706262104583602, + "grad_norm": 1.097276210784912, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 14580 + }, + { + "epoch": 4.7094899935442225, + "grad_norm": 0.9363943338394165, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 14590 + }, + { + "epoch": 4.712717882504842, + "grad_norm": 1.4700615406036377, + "learning_rate": 0.0002, + "loss": 0.5261, + "step": 14600 + }, + { + "epoch": 4.715945771465462, + "grad_norm": 1.0001553297042847, + "learning_rate": 0.0002, + "loss": 0.5489, + "step": 14610 + }, + { + "epoch": 4.719173660426081, + "grad_norm": 1.0489927530288696, + "learning_rate": 0.0002, + "loss": 0.5236, + "step": 14620 + }, + { + "epoch": 4.722401549386701, + "grad_norm": 1.0483676195144653, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 14630 + }, + { + "epoch": 4.725629438347321, + "grad_norm": 1.1501940488815308, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 14640 + }, + { + "epoch": 4.72885732730794, + "grad_norm": 1.1703146696090698, + "learning_rate": 0.0002, + "loss": 0.5059, + "step": 14650 + }, + { + "epoch": 4.73208521626856, + "grad_norm": 0.8842985033988953, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 14660 + }, + { + "epoch": 4.73531310522918, + "grad_norm": 0.9147908687591553, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 14670 + }, + { + "epoch": 4.7385409941898, + "grad_norm": 1.0391576290130615, + "learning_rate": 0.0002, + "loss": 0.5436, + "step": 14680 + }, + { + "epoch": 4.74176888315042, + "grad_norm": 0.9469179511070251, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 14690 + }, + { + "epoch": 4.7449967721110395, + "grad_norm": 1.0529530048370361, + "learning_rate": 0.0002, + "loss": 0.5201, + "step": 14700 + }, + { + "epoch": 4.748224661071659, + "grad_norm": 0.9645711183547974, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 14710 + }, + { + "epoch": 4.751452550032279, + "grad_norm": 0.8163343071937561, + "learning_rate": 0.0002, + "loss": 0.5123, + "step": 14720 + }, + { + "epoch": 4.7546804389928985, + "grad_norm": 1.0581341981887817, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 14730 + }, + { + "epoch": 4.757908327953518, + "grad_norm": 1.0913853645324707, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 14740 + }, + { + "epoch": 4.761136216914138, + "grad_norm": 1.1071174144744873, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 14750 + }, + { + "epoch": 4.764364105874758, + "grad_norm": 1.0060709714889526, + "learning_rate": 0.0002, + "loss": 0.5353, + "step": 14760 + }, + { + "epoch": 4.767591994835378, + "grad_norm": 1.012024164199829, + "learning_rate": 0.0002, + "loss": 0.5415, + "step": 14770 + }, + { + "epoch": 4.770819883795998, + "grad_norm": 0.8438148498535156, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 14780 + }, + { + "epoch": 4.774047772756617, + "grad_norm": 0.8136811256408691, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 14790 + }, + { + "epoch": 4.777275661717237, + "grad_norm": 1.0765691995620728, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 14800 + }, + { + "epoch": 4.780503550677857, + "grad_norm": 1.0582574605941772, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 14810 + }, + { + "epoch": 4.783731439638476, + "grad_norm": 0.9419516921043396, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 14820 + }, + { + "epoch": 4.786959328599096, + "grad_norm": 0.9626181721687317, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 14830 + }, + { + "epoch": 4.7901872175597155, + "grad_norm": 1.2552800178527832, + "learning_rate": 0.0002, + "loss": 0.565, + "step": 14840 + }, + { + "epoch": 4.793415106520336, + "grad_norm": 0.9379919171333313, + "learning_rate": 0.0002, + "loss": 0.5402, + "step": 14850 + }, + { + "epoch": 4.796642995480956, + "grad_norm": 0.8166947364807129, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 14860 + }, + { + "epoch": 4.799870884441575, + "grad_norm": 0.9008694887161255, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 14870 + }, + { + "epoch": 4.803098773402195, + "grad_norm": 1.0256156921386719, + "learning_rate": 0.0002, + "loss": 0.5049, + "step": 14880 + }, + { + "epoch": 4.806326662362815, + "grad_norm": 0.9486594200134277, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 14890 + }, + { + "epoch": 4.809554551323434, + "grad_norm": 0.955238401889801, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 14900 + }, + { + "epoch": 4.812782440284054, + "grad_norm": 1.03775954246521, + "learning_rate": 0.0002, + "loss": 0.5269, + "step": 14910 + }, + { + "epoch": 4.816010329244674, + "grad_norm": 1.1383405923843384, + "learning_rate": 0.0002, + "loss": 0.5445, + "step": 14920 + }, + { + "epoch": 4.819238218205294, + "grad_norm": 0.9411700963973999, + "learning_rate": 0.0002, + "loss": 0.5347, + "step": 14930 + }, + { + "epoch": 4.822466107165914, + "grad_norm": 0.8188554644584656, + "learning_rate": 0.0002, + "loss": 0.4899, + "step": 14940 + }, + { + "epoch": 4.8256939961265335, + "grad_norm": 1.1336265802383423, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 14950 + }, + { + "epoch": 4.828921885087153, + "grad_norm": 1.106121301651001, + "learning_rate": 0.0002, + "loss": 0.5578, + "step": 14960 + }, + { + "epoch": 4.832149774047773, + "grad_norm": 1.0206533670425415, + "learning_rate": 0.0002, + "loss": 0.5306, + "step": 14970 + }, + { + "epoch": 4.8353776630083924, + "grad_norm": 1.1123926639556885, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 14980 + }, + { + "epoch": 4.838605551969012, + "grad_norm": 0.7879418730735779, + "learning_rate": 0.0002, + "loss": 0.5208, + "step": 14990 + }, + { + "epoch": 4.841833440929632, + "grad_norm": 1.0171709060668945, + "learning_rate": 0.0002, + "loss": 0.5385, + "step": 15000 + }, + { + "epoch": 4.845061329890251, + "grad_norm": 1.010671615600586, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 15010 + }, + { + "epoch": 4.848289218850871, + "grad_norm": 1.0778919458389282, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 15020 + }, + { + "epoch": 4.851517107811492, + "grad_norm": 1.0479968786239624, + "learning_rate": 0.0002, + "loss": 0.5587, + "step": 15030 + }, + { + "epoch": 4.854744996772111, + "grad_norm": 1.0345100164413452, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 15040 + }, + { + "epoch": 4.857972885732731, + "grad_norm": 0.9539691805839539, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 15050 + }, + { + "epoch": 4.8612007746933505, + "grad_norm": 0.9914752840995789, + "learning_rate": 0.0002, + "loss": 0.5314, + "step": 15060 + }, + { + "epoch": 4.86442866365397, + "grad_norm": 1.1935476064682007, + "learning_rate": 0.0002, + "loss": 0.5277, + "step": 15070 + }, + { + "epoch": 4.86765655261459, + "grad_norm": 1.0065057277679443, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 15080 + }, + { + "epoch": 4.8708844415752095, + "grad_norm": 0.9320993423461914, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 15090 + }, + { + "epoch": 4.87411233053583, + "grad_norm": 1.0578069686889648, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 15100 + }, + { + "epoch": 4.87734021949645, + "grad_norm": 0.9666239023208618, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 15110 + }, + { + "epoch": 4.880568108457069, + "grad_norm": 1.1322687864303589, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 15120 + }, + { + "epoch": 4.883795997417689, + "grad_norm": 0.955674409866333, + "learning_rate": 0.0002, + "loss": 0.5381, + "step": 15130 + }, + { + "epoch": 4.887023886378309, + "grad_norm": 1.119413137435913, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 15140 + }, + { + "epoch": 4.890251775338928, + "grad_norm": 0.863646924495697, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 15150 + }, + { + "epoch": 4.893479664299548, + "grad_norm": 1.1823450326919556, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 15160 + }, + { + "epoch": 4.896707553260168, + "grad_norm": 0.8657588958740234, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 15170 + }, + { + "epoch": 4.899935442220787, + "grad_norm": 0.8575737476348877, + "learning_rate": 0.0002, + "loss": 0.5239, + "step": 15180 + }, + { + "epoch": 4.903163331181407, + "grad_norm": 0.9611830711364746, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 15190 + }, + { + "epoch": 4.906391220142027, + "grad_norm": 1.1981453895568848, + "learning_rate": 0.0002, + "loss": 0.5505, + "step": 15200 + }, + { + "epoch": 4.909619109102647, + "grad_norm": 0.9401199221611023, + "learning_rate": 0.0002, + "loss": 0.5582, + "step": 15210 + }, + { + "epoch": 4.912846998063267, + "grad_norm": 0.8420369625091553, + "learning_rate": 0.0002, + "loss": 0.5631, + "step": 15220 + }, + { + "epoch": 4.916074887023886, + "grad_norm": 0.7877969145774841, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 15230 + }, + { + "epoch": 4.919302775984506, + "grad_norm": 0.8988324403762817, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 15240 + }, + { + "epoch": 4.922530664945126, + "grad_norm": 1.1103752851486206, + "learning_rate": 0.0002, + "loss": 0.5274, + "step": 15250 + }, + { + "epoch": 4.925758553905745, + "grad_norm": 0.8874443173408508, + "learning_rate": 0.0002, + "loss": 0.5249, + "step": 15260 + }, + { + "epoch": 4.928986442866366, + "grad_norm": 1.1001752614974976, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 15270 + }, + { + "epoch": 4.9322143318269855, + "grad_norm": 0.9661307334899902, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 15280 + }, + { + "epoch": 4.935442220787605, + "grad_norm": 1.1738812923431396, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 15290 + }, + { + "epoch": 4.938670109748225, + "grad_norm": 0.9773507714271545, + "learning_rate": 0.0002, + "loss": 0.5057, + "step": 15300 + }, + { + "epoch": 4.9418979987088445, + "grad_norm": 1.0735599994659424, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 15310 + }, + { + "epoch": 4.945125887669464, + "grad_norm": 1.0552113056182861, + "learning_rate": 0.0002, + "loss": 0.4996, + "step": 15320 + }, + { + "epoch": 4.948353776630084, + "grad_norm": 1.0900797843933105, + "learning_rate": 0.0002, + "loss": 0.5201, + "step": 15330 + }, + { + "epoch": 4.9515816655907035, + "grad_norm": 1.0908405780792236, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 15340 + }, + { + "epoch": 4.954809554551323, + "grad_norm": 1.010221004486084, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 15350 + }, + { + "epoch": 4.958037443511943, + "grad_norm": 1.0321437120437622, + "learning_rate": 0.0002, + "loss": 0.5423, + "step": 15360 + }, + { + "epoch": 4.961265332472563, + "grad_norm": 0.8430278897285461, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 15370 + }, + { + "epoch": 4.964493221433183, + "grad_norm": 0.8775330185890198, + "learning_rate": 0.0002, + "loss": 0.538, + "step": 15380 + }, + { + "epoch": 4.967721110393803, + "grad_norm": 0.9796988368034363, + "learning_rate": 0.0002, + "loss": 0.5344, + "step": 15390 + }, + { + "epoch": 4.970948999354422, + "grad_norm": 0.8782257437705994, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 15400 + }, + { + "epoch": 4.974176888315042, + "grad_norm": 0.9959840774536133, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 15410 + }, + { + "epoch": 4.9774047772756616, + "grad_norm": 1.0730273723602295, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 15420 + }, + { + "epoch": 4.980632666236281, + "grad_norm": 0.8653680682182312, + "learning_rate": 0.0002, + "loss": 0.5277, + "step": 15430 + }, + { + "epoch": 4.983860555196901, + "grad_norm": 1.0769985914230347, + "learning_rate": 0.0002, + "loss": 0.5301, + "step": 15440 + }, + { + "epoch": 4.987088444157521, + "grad_norm": 1.1336040496826172, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 15450 + }, + { + "epoch": 4.990316333118141, + "grad_norm": 0.9844824075698853, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 15460 + }, + { + "epoch": 4.993544222078761, + "grad_norm": 0.8368769288063049, + "learning_rate": 0.0002, + "loss": 0.5316, + "step": 15470 + }, + { + "epoch": 4.99677211103938, + "grad_norm": 1.0238676071166992, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 15480 + }, + { + "epoch": 5.0, + "grad_norm": 1.064820408821106, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 15490 + }, + { + "epoch": 5.0, + "eval_loss": 1.241918921470642, + "eval_runtime": 158.4099, + "eval_samples_per_second": 4.627, + "eval_steps_per_second": 0.581, + "step": 15490 + } + ], + "logging_steps": 10, + "max_steps": 24784, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.168425164852429e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f46f2b8e8752b125339f36f172c3878be4cdb152 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-15490/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfc2a69e44a51edf5586ebed4b7ee915a23244c18c1f59e580471e4c9becfa98 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..be2b0cfff05fd5948131171eb82c80e2858199d4 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3bc25075db315827a4beba479b8f0c1e0ae13eb2ad5a4049963b0c5a4b98a5f +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9aae928d82610ff13bdaa87487624c3ebf2365e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c6c43fb77e4198a9991b92cf418159c06824813da585151ebd36c35b52880e0 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e3675b51a8eb3905d0333a00f2736197b69baee9 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7e76cc0607e3141ad1ad19da835151ce646d0459f3765353d7f7f6ad6d098d9 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..431b5a3f9717d60b495dfad4d4b379c77256b21f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bc8ff81c1a401f1f68e18a3397254fa5e563cdc2cf55a3f3f5ff7897b978da7 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..567f2dc8c77bb43c7759238b20392df456fff521 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/trainer_state.json @@ -0,0 +1,13087 @@ +{ + "best_metric": 1.0958120822906494, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 18588, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032278889606197547, + "grad_norm": 0.7092075347900391, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 10 + }, + { + "epoch": 0.006455777921239509, + "grad_norm": 0.6900479793548584, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 20 + }, + { + "epoch": 0.009683666881859263, + "grad_norm": 0.6788288950920105, + "learning_rate": 0.0002, + "loss": 0.9807, + "step": 30 + }, + { + "epoch": 0.012911555842479019, + "grad_norm": 0.5590243339538574, + "learning_rate": 0.0002, + "loss": 0.9385, + "step": 40 + }, + { + "epoch": 0.016139444803098774, + "grad_norm": 0.5136010646820068, + "learning_rate": 0.0002, + "loss": 0.931, + "step": 50 + }, + { + "epoch": 0.019367333763718526, + "grad_norm": 0.45298320055007935, + "learning_rate": 0.0002, + "loss": 0.8896, + "step": 60 + }, + { + "epoch": 0.022595222724338282, + "grad_norm": 0.5917162299156189, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 70 + }, + { + "epoch": 0.025823111684958037, + "grad_norm": 0.4414856433868408, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 80 + }, + { + "epoch": 0.029051000645577793, + "grad_norm": 0.5547978281974792, + "learning_rate": 0.0002, + "loss": 0.8419, + "step": 90 + }, + { + "epoch": 0.03227888960619755, + "grad_norm": 0.5271288156509399, + "learning_rate": 0.0002, + "loss": 0.8987, + "step": 100 + }, + { + "epoch": 0.035506778566817304, + "grad_norm": 0.5506119728088379, + "learning_rate": 0.0002, + "loss": 0.8543, + "step": 110 + }, + { + "epoch": 0.03873466752743705, + "grad_norm": 0.5579327940940857, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 120 + }, + { + "epoch": 0.04196255648805681, + "grad_norm": 0.5099632740020752, + "learning_rate": 0.0002, + "loss": 0.8826, + "step": 130 + }, + { + "epoch": 0.045190445448676564, + "grad_norm": 0.40396833419799805, + "learning_rate": 0.0002, + "loss": 0.9239, + "step": 140 + }, + { + "epoch": 0.04841833440929632, + "grad_norm": 0.5008092522621155, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 150 + }, + { + "epoch": 0.051646223369916075, + "grad_norm": 0.4388776421546936, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 160 + }, + { + "epoch": 0.05487411233053583, + "grad_norm": 0.44138944149017334, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 170 + }, + { + "epoch": 0.058102001291155586, + "grad_norm": 0.358484148979187, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 180 + }, + { + "epoch": 0.06132989025177534, + "grad_norm": 0.457052081823349, + "learning_rate": 0.0002, + "loss": 0.8956, + "step": 190 + }, + { + "epoch": 0.0645577792123951, + "grad_norm": 0.5537622570991516, + "learning_rate": 0.0002, + "loss": 0.9138, + "step": 200 + }, + { + "epoch": 0.06778566817301485, + "grad_norm": 0.552631676197052, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 210 + }, + { + "epoch": 0.07101355713363461, + "grad_norm": 0.4414575397968292, + "learning_rate": 0.0002, + "loss": 0.8854, + "step": 220 + }, + { + "epoch": 0.07424144609425436, + "grad_norm": 0.4996664226055145, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 230 + }, + { + "epoch": 0.0774693350548741, + "grad_norm": 0.7321897149085999, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 240 + }, + { + "epoch": 0.08069722401549387, + "grad_norm": 0.4553901255130768, + "learning_rate": 0.0002, + "loss": 0.8848, + "step": 250 + }, + { + "epoch": 0.08392511297611362, + "grad_norm": 0.5039054751396179, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 260 + }, + { + "epoch": 0.08715300193673338, + "grad_norm": 0.4113094210624695, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 270 + }, + { + "epoch": 0.09038089089735313, + "grad_norm": 0.450436532497406, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 280 + }, + { + "epoch": 0.09360877985797289, + "grad_norm": 0.4548024535179138, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 290 + }, + { + "epoch": 0.09683666881859264, + "grad_norm": 0.4932962656021118, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 300 + }, + { + "epoch": 0.1000645577792124, + "grad_norm": 0.4005250334739685, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 310 + }, + { + "epoch": 0.10329244673983215, + "grad_norm": 1.8321624994277954, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 320 + }, + { + "epoch": 0.1065203357004519, + "grad_norm": 0.45815610885620117, + "learning_rate": 0.0002, + "loss": 0.8411, + "step": 330 + }, + { + "epoch": 0.10974822466107166, + "grad_norm": 0.39324095845222473, + "learning_rate": 0.0002, + "loss": 0.857, + "step": 340 + }, + { + "epoch": 0.11297611362169141, + "grad_norm": 0.546273946762085, + "learning_rate": 0.0002, + "loss": 0.8258, + "step": 350 + }, + { + "epoch": 0.11620400258231117, + "grad_norm": 0.497448593378067, + "learning_rate": 0.0002, + "loss": 0.882, + "step": 360 + }, + { + "epoch": 0.11943189154293092, + "grad_norm": 0.37508800625801086, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 370 + }, + { + "epoch": 0.12265978050355068, + "grad_norm": 0.45849609375, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 380 + }, + { + "epoch": 0.12588766946417043, + "grad_norm": 0.5488408803939819, + "learning_rate": 0.0002, + "loss": 0.8437, + "step": 390 + }, + { + "epoch": 0.1291155584247902, + "grad_norm": 0.4477061331272125, + "learning_rate": 0.0002, + "loss": 0.8349, + "step": 400 + }, + { + "epoch": 0.13234344738540993, + "grad_norm": 0.39227980375289917, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 410 + }, + { + "epoch": 0.1355713363460297, + "grad_norm": 0.3922233581542969, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 420 + }, + { + "epoch": 0.13879922530664945, + "grad_norm": 0.42901909351348877, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 430 + }, + { + "epoch": 0.14202711426726922, + "grad_norm": 0.4217798709869385, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 440 + }, + { + "epoch": 0.14525500322788895, + "grad_norm": 0.43470677733421326, + "learning_rate": 0.0002, + "loss": 0.8594, + "step": 450 + }, + { + "epoch": 0.1484828921885087, + "grad_norm": 0.5324403047561646, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 460 + }, + { + "epoch": 0.15171078114912848, + "grad_norm": 0.3999756872653961, + "learning_rate": 0.0002, + "loss": 0.8729, + "step": 470 + }, + { + "epoch": 0.1549386701097482, + "grad_norm": 0.404933363199234, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 480 + }, + { + "epoch": 0.15816655907036797, + "grad_norm": 0.44122636318206787, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 490 + }, + { + "epoch": 0.16139444803098774, + "grad_norm": 0.510166347026825, + "learning_rate": 0.0002, + "loss": 0.8457, + "step": 500 + }, + { + "epoch": 0.1646223369916075, + "grad_norm": 0.4549732506275177, + "learning_rate": 0.0002, + "loss": 0.8692, + "step": 510 + }, + { + "epoch": 0.16785022595222723, + "grad_norm": 0.5148182511329651, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 520 + }, + { + "epoch": 0.171078114912847, + "grad_norm": 0.3596806824207306, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 530 + }, + { + "epoch": 0.17430600387346676, + "grad_norm": 0.4388909339904785, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 540 + }, + { + "epoch": 0.17753389283408652, + "grad_norm": 0.5052742958068848, + "learning_rate": 0.0002, + "loss": 0.8322, + "step": 550 + }, + { + "epoch": 0.18076178179470626, + "grad_norm": 0.48248958587646484, + "learning_rate": 0.0002, + "loss": 0.791, + "step": 560 + }, + { + "epoch": 0.18398967075532602, + "grad_norm": 0.5360197424888611, + "learning_rate": 0.0002, + "loss": 0.8593, + "step": 570 + }, + { + "epoch": 0.18721755971594578, + "grad_norm": 0.43999341130256653, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 580 + }, + { + "epoch": 0.19044544867656552, + "grad_norm": 0.3685208261013031, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 590 + }, + { + "epoch": 0.19367333763718528, + "grad_norm": 0.4601275622844696, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 600 + }, + { + "epoch": 0.19690122659780504, + "grad_norm": 0.4778369665145874, + "learning_rate": 0.0002, + "loss": 0.8483, + "step": 610 + }, + { + "epoch": 0.2001291155584248, + "grad_norm": 0.4867003560066223, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 620 + }, + { + "epoch": 0.20335700451904454, + "grad_norm": 0.4583742916584015, + "learning_rate": 0.0002, + "loss": 0.8554, + "step": 630 + }, + { + "epoch": 0.2065848934796643, + "grad_norm": 0.47958165407180786, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 640 + }, + { + "epoch": 0.20981278244028406, + "grad_norm": 0.4526064097881317, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 650 + }, + { + "epoch": 0.2130406714009038, + "grad_norm": 0.45890581607818604, + "learning_rate": 0.0002, + "loss": 0.8313, + "step": 660 + }, + { + "epoch": 0.21626856036152356, + "grad_norm": 0.42725905776023865, + "learning_rate": 0.0002, + "loss": 0.8143, + "step": 670 + }, + { + "epoch": 0.21949644932214332, + "grad_norm": 0.40380963683128357, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 680 + }, + { + "epoch": 0.22272433828276308, + "grad_norm": 0.4372998774051666, + "learning_rate": 0.0002, + "loss": 0.9004, + "step": 690 + }, + { + "epoch": 0.22595222724338282, + "grad_norm": 0.4245864450931549, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 700 + }, + { + "epoch": 0.22918011620400258, + "grad_norm": 0.4061129689216614, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 710 + }, + { + "epoch": 0.23240800516462234, + "grad_norm": 0.474454790353775, + "learning_rate": 0.0002, + "loss": 0.8275, + "step": 720 + }, + { + "epoch": 0.23563589412524208, + "grad_norm": 0.4908486008644104, + "learning_rate": 0.0002, + "loss": 0.8346, + "step": 730 + }, + { + "epoch": 0.23886378308586184, + "grad_norm": 0.4284191429615021, + "learning_rate": 0.0002, + "loss": 0.8755, + "step": 740 + }, + { + "epoch": 0.2420916720464816, + "grad_norm": 0.44730308651924133, + "learning_rate": 0.0002, + "loss": 0.8387, + "step": 750 + }, + { + "epoch": 0.24531956100710137, + "grad_norm": 0.4433246850967407, + "learning_rate": 0.0002, + "loss": 0.8135, + "step": 760 + }, + { + "epoch": 0.2485474499677211, + "grad_norm": 0.43668854236602783, + "learning_rate": 0.0002, + "loss": 0.8644, + "step": 770 + }, + { + "epoch": 0.25177533892834086, + "grad_norm": 0.34324130415916443, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 780 + }, + { + "epoch": 0.2550032278889606, + "grad_norm": 0.46476295590400696, + "learning_rate": 0.0002, + "loss": 0.8725, + "step": 790 + }, + { + "epoch": 0.2582311168495804, + "grad_norm": 0.5047039985656738, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 800 + }, + { + "epoch": 0.26145900581020015, + "grad_norm": 0.4402127265930176, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 810 + }, + { + "epoch": 0.26468689477081986, + "grad_norm": 0.4642465114593506, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 820 + }, + { + "epoch": 0.2679147837314396, + "grad_norm": 0.40093424916267395, + "learning_rate": 0.0002, + "loss": 0.8836, + "step": 830 + }, + { + "epoch": 0.2711426726920594, + "grad_norm": 0.42501842975616455, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 840 + }, + { + "epoch": 0.27437056165267915, + "grad_norm": 0.43279722332954407, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 850 + }, + { + "epoch": 0.2775984506132989, + "grad_norm": 0.5991243720054626, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 860 + }, + { + "epoch": 0.28082633957391867, + "grad_norm": 0.4217848777770996, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 870 + }, + { + "epoch": 0.28405422853453843, + "grad_norm": 0.3933536410331726, + "learning_rate": 0.0002, + "loss": 0.8135, + "step": 880 + }, + { + "epoch": 0.28728211749515814, + "grad_norm": 0.5868505239486694, + "learning_rate": 0.0002, + "loss": 0.8846, + "step": 890 + }, + { + "epoch": 0.2905100064557779, + "grad_norm": 0.5209547877311707, + "learning_rate": 0.0002, + "loss": 0.8759, + "step": 900 + }, + { + "epoch": 0.29373789541639767, + "grad_norm": 0.49307361245155334, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 910 + }, + { + "epoch": 0.2969657843770174, + "grad_norm": 0.4288382828235626, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 920 + }, + { + "epoch": 0.3001936733376372, + "grad_norm": 0.33568474650382996, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 930 + }, + { + "epoch": 0.30342156229825695, + "grad_norm": 1.0915930271148682, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 940 + }, + { + "epoch": 0.3066494512588767, + "grad_norm": 0.5489798188209534, + "learning_rate": 0.0002, + "loss": 0.8535, + "step": 950 + }, + { + "epoch": 0.3098773402194964, + "grad_norm": 0.42971742153167725, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 960 + }, + { + "epoch": 0.3131052291801162, + "grad_norm": 0.43375834822654724, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 970 + }, + { + "epoch": 0.31633311814073595, + "grad_norm": 0.47488611936569214, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 980 + }, + { + "epoch": 0.3195610071013557, + "grad_norm": 0.46296775341033936, + "learning_rate": 0.0002, + "loss": 0.7906, + "step": 990 + }, + { + "epoch": 0.32278889606197547, + "grad_norm": 0.4548890292644501, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 1000 + }, + { + "epoch": 0.32601678502259523, + "grad_norm": 0.41834497451782227, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 1010 + }, + { + "epoch": 0.329244673983215, + "grad_norm": 0.441092312335968, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 1020 + }, + { + "epoch": 0.33247256294383476, + "grad_norm": 0.637322187423706, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 1030 + }, + { + "epoch": 0.33570045190445447, + "grad_norm": 0.4374958574771881, + "learning_rate": 0.0002, + "loss": 0.8685, + "step": 1040 + }, + { + "epoch": 0.33892834086507423, + "grad_norm": 0.3935825824737549, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1050 + }, + { + "epoch": 0.342156229825694, + "grad_norm": 0.43526220321655273, + "learning_rate": 0.0002, + "loss": 0.8287, + "step": 1060 + }, + { + "epoch": 0.34538411878631375, + "grad_norm": 0.45327696204185486, + "learning_rate": 0.0002, + "loss": 0.8413, + "step": 1070 + }, + { + "epoch": 0.3486120077469335, + "grad_norm": 0.4126075506210327, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 1080 + }, + { + "epoch": 0.3518398967075533, + "grad_norm": 0.4714072048664093, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 1090 + }, + { + "epoch": 0.35506778566817304, + "grad_norm": 0.518127977848053, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 1100 + }, + { + "epoch": 0.35829567462879275, + "grad_norm": 0.43264099955558777, + "learning_rate": 0.0002, + "loss": 0.8479, + "step": 1110 + }, + { + "epoch": 0.3615235635894125, + "grad_norm": 0.4857400357723236, + "learning_rate": 0.0002, + "loss": 0.8724, + "step": 1120 + }, + { + "epoch": 0.3647514525500323, + "grad_norm": 0.37591469287872314, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 1130 + }, + { + "epoch": 0.36797934151065204, + "grad_norm": 0.4165478050708771, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 1140 + }, + { + "epoch": 0.3712072304712718, + "grad_norm": 0.42911383509635925, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 1150 + }, + { + "epoch": 0.37443511943189156, + "grad_norm": 0.44980287551879883, + "learning_rate": 0.0002, + "loss": 0.8722, + "step": 1160 + }, + { + "epoch": 0.3776630083925113, + "grad_norm": 0.4066573679447174, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 1170 + }, + { + "epoch": 0.38089089735313103, + "grad_norm": 0.5056195855140686, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 1180 + }, + { + "epoch": 0.3841187863137508, + "grad_norm": 0.4141536355018616, + "learning_rate": 0.0002, + "loss": 0.8387, + "step": 1190 + }, + { + "epoch": 0.38734667527437056, + "grad_norm": 0.4501924514770508, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 1200 + }, + { + "epoch": 0.3905745642349903, + "grad_norm": 0.43304240703582764, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 1210 + }, + { + "epoch": 0.3938024531956101, + "grad_norm": 0.475777804851532, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 1220 + }, + { + "epoch": 0.39703034215622984, + "grad_norm": 0.5846465826034546, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 1230 + }, + { + "epoch": 0.4002582311168496, + "grad_norm": 0.42899325489997864, + "learning_rate": 0.0002, + "loss": 0.8078, + "step": 1240 + }, + { + "epoch": 0.4034861200774693, + "grad_norm": 0.3980463147163391, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 1250 + }, + { + "epoch": 0.4067140090380891, + "grad_norm": 0.45769768953323364, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 1260 + }, + { + "epoch": 0.40994189799870884, + "grad_norm": 0.5101280212402344, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 1270 + }, + { + "epoch": 0.4131697869593286, + "grad_norm": 0.47374317049980164, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 1280 + }, + { + "epoch": 0.41639767591994836, + "grad_norm": 0.4261878728866577, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 1290 + }, + { + "epoch": 0.4196255648805681, + "grad_norm": 0.46954256296157837, + "learning_rate": 0.0002, + "loss": 0.9004, + "step": 1300 + }, + { + "epoch": 0.4228534538411879, + "grad_norm": 0.5205738544464111, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 1310 + }, + { + "epoch": 0.4260813428018076, + "grad_norm": 0.5176340937614441, + "learning_rate": 0.0002, + "loss": 0.8964, + "step": 1320 + }, + { + "epoch": 0.42930923176242736, + "grad_norm": 0.5155916810035706, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 1330 + }, + { + "epoch": 0.4325371207230471, + "grad_norm": 0.44548553228378296, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 1340 + }, + { + "epoch": 0.4357650096836669, + "grad_norm": 0.5633558630943298, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 1350 + }, + { + "epoch": 0.43899289864428664, + "grad_norm": 0.42444056272506714, + "learning_rate": 0.0002, + "loss": 0.7889, + "step": 1360 + }, + { + "epoch": 0.4422207876049064, + "grad_norm": 0.5226860642433167, + "learning_rate": 0.0002, + "loss": 0.8588, + "step": 1370 + }, + { + "epoch": 0.44544867656552617, + "grad_norm": 0.5354582071304321, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 1380 + }, + { + "epoch": 0.4486765655261459, + "grad_norm": 0.472646564245224, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 1390 + }, + { + "epoch": 0.45190445448676564, + "grad_norm": 0.6312310099601746, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 1400 + }, + { + "epoch": 0.4551323434473854, + "grad_norm": 0.4298408031463623, + "learning_rate": 0.0002, + "loss": 0.8212, + "step": 1410 + }, + { + "epoch": 0.45836023240800516, + "grad_norm": 0.43427202105522156, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 1420 + }, + { + "epoch": 0.4615881213686249, + "grad_norm": 0.44097861647605896, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 1430 + }, + { + "epoch": 0.4648160103292447, + "grad_norm": 0.5142693519592285, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1440 + }, + { + "epoch": 0.46804389928986445, + "grad_norm": 0.46416547894477844, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 1450 + }, + { + "epoch": 0.47127178825048416, + "grad_norm": 0.4858551025390625, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 1460 + }, + { + "epoch": 0.4744996772111039, + "grad_norm": 0.4709177315235138, + "learning_rate": 0.0002, + "loss": 0.8354, + "step": 1470 + }, + { + "epoch": 0.4777275661717237, + "grad_norm": 0.5500252842903137, + "learning_rate": 0.0002, + "loss": 0.8391, + "step": 1480 + }, + { + "epoch": 0.48095545513234345, + "grad_norm": 0.43364381790161133, + "learning_rate": 0.0002, + "loss": 0.8359, + "step": 1490 + }, + { + "epoch": 0.4841833440929632, + "grad_norm": 0.47712287306785583, + "learning_rate": 0.0002, + "loss": 0.8446, + "step": 1500 + }, + { + "epoch": 0.48741123305358297, + "grad_norm": 0.4518495202064514, + "learning_rate": 0.0002, + "loss": 0.8518, + "step": 1510 + }, + { + "epoch": 0.49063912201420273, + "grad_norm": 0.4539008140563965, + "learning_rate": 0.0002, + "loss": 0.819, + "step": 1520 + }, + { + "epoch": 0.49386701097482244, + "grad_norm": 0.4993067979812622, + "learning_rate": 0.0002, + "loss": 0.8276, + "step": 1530 + }, + { + "epoch": 0.4970948999354422, + "grad_norm": 0.6094803214073181, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 1540 + }, + { + "epoch": 0.500322788896062, + "grad_norm": 0.48602527379989624, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 1550 + }, + { + "epoch": 0.5035506778566817, + "grad_norm": 0.40245795249938965, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 1560 + }, + { + "epoch": 0.5067785668173015, + "grad_norm": 0.456787645816803, + "learning_rate": 0.0002, + "loss": 0.7907, + "step": 1570 + }, + { + "epoch": 0.5100064557779213, + "grad_norm": 0.43936216831207275, + "learning_rate": 0.0002, + "loss": 0.86, + "step": 1580 + }, + { + "epoch": 0.513234344738541, + "grad_norm": 0.549018144607544, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 1590 + }, + { + "epoch": 0.5164622336991608, + "grad_norm": 0.41746795177459717, + "learning_rate": 0.0002, + "loss": 0.8169, + "step": 1600 + }, + { + "epoch": 0.5196901226597805, + "grad_norm": 0.4217053949832916, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 1610 + }, + { + "epoch": 0.5229180116204003, + "grad_norm": 0.449913889169693, + "learning_rate": 0.0002, + "loss": 0.8161, + "step": 1620 + }, + { + "epoch": 0.5261459005810201, + "grad_norm": 0.5084872245788574, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 1630 + }, + { + "epoch": 0.5293737895416397, + "grad_norm": 0.46248653531074524, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 1640 + }, + { + "epoch": 0.5326016785022595, + "grad_norm": 0.4824236035346985, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 1650 + }, + { + "epoch": 0.5358295674628792, + "grad_norm": 0.6010985374450684, + "learning_rate": 0.0002, + "loss": 0.8711, + "step": 1660 + }, + { + "epoch": 0.539057456423499, + "grad_norm": 0.4757920801639557, + "learning_rate": 0.0002, + "loss": 0.8266, + "step": 1670 + }, + { + "epoch": 0.5422853453841188, + "grad_norm": 0.45161882042884827, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 1680 + }, + { + "epoch": 0.5455132343447385, + "grad_norm": 0.49314990639686584, + "learning_rate": 0.0002, + "loss": 0.8141, + "step": 1690 + }, + { + "epoch": 0.5487411233053583, + "grad_norm": 0.3918305039405823, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 1700 + }, + { + "epoch": 0.551969012265978, + "grad_norm": 0.5966728925704956, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 1710 + }, + { + "epoch": 0.5551969012265978, + "grad_norm": 0.4208986163139343, + "learning_rate": 0.0002, + "loss": 0.8438, + "step": 1720 + }, + { + "epoch": 0.5584247901872176, + "grad_norm": 0.43724218010902405, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 1730 + }, + { + "epoch": 0.5616526791478373, + "grad_norm": 0.5287272930145264, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 1740 + }, + { + "epoch": 0.5648805681084571, + "grad_norm": 0.4961899518966675, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 1750 + }, + { + "epoch": 0.5681084570690769, + "grad_norm": 0.4468635320663452, + "learning_rate": 0.0002, + "loss": 0.8029, + "step": 1760 + }, + { + "epoch": 0.5713363460296966, + "grad_norm": 0.6423530578613281, + "learning_rate": 0.0002, + "loss": 0.7968, + "step": 1770 + }, + { + "epoch": 0.5745642349903163, + "grad_norm": 0.4601971507072449, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 1780 + }, + { + "epoch": 0.577792123950936, + "grad_norm": 0.46514901518821716, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 1790 + }, + { + "epoch": 0.5810200129115558, + "grad_norm": 0.4771687388420105, + "learning_rate": 0.0002, + "loss": 0.8186, + "step": 1800 + }, + { + "epoch": 0.5842479018721756, + "grad_norm": 0.46514490246772766, + "learning_rate": 0.0002, + "loss": 0.856, + "step": 1810 + }, + { + "epoch": 0.5874757908327953, + "grad_norm": 0.5373936295509338, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 1820 + }, + { + "epoch": 0.5907036797934151, + "grad_norm": 0.5175791382789612, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 1830 + }, + { + "epoch": 0.5939315687540349, + "grad_norm": 0.4522802233695984, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 1840 + }, + { + "epoch": 0.5971594577146546, + "grad_norm": 0.42987772822380066, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 1850 + }, + { + "epoch": 0.6003873466752744, + "grad_norm": 0.5566838383674622, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 1860 + }, + { + "epoch": 0.6036152356358941, + "grad_norm": 0.42807698249816895, + "learning_rate": 0.0002, + "loss": 0.8312, + "step": 1870 + }, + { + "epoch": 0.6068431245965139, + "grad_norm": 0.4957767724990845, + "learning_rate": 0.0002, + "loss": 0.8035, + "step": 1880 + }, + { + "epoch": 0.6100710135571337, + "grad_norm": 0.4260980188846588, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 1890 + }, + { + "epoch": 0.6132989025177534, + "grad_norm": 0.4777357876300812, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 1900 + }, + { + "epoch": 0.6165267914783732, + "grad_norm": 0.4434216022491455, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 1910 + }, + { + "epoch": 0.6197546804389928, + "grad_norm": 0.5215433835983276, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 1920 + }, + { + "epoch": 0.6229825693996126, + "grad_norm": 0.5143248438835144, + "learning_rate": 0.0002, + "loss": 0.82, + "step": 1930 + }, + { + "epoch": 0.6262104583602324, + "grad_norm": 0.5213413238525391, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 1940 + }, + { + "epoch": 0.6294383473208521, + "grad_norm": 0.5408226251602173, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 1950 + }, + { + "epoch": 0.6326662362814719, + "grad_norm": 0.5479708909988403, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 1960 + }, + { + "epoch": 0.6358941252420917, + "grad_norm": 0.4490949809551239, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 1970 + }, + { + "epoch": 0.6391220142027114, + "grad_norm": 0.48815059661865234, + "learning_rate": 0.0002, + "loss": 0.854, + "step": 1980 + }, + { + "epoch": 0.6423499031633312, + "grad_norm": 0.46498045325279236, + "learning_rate": 0.0002, + "loss": 0.8568, + "step": 1990 + }, + { + "epoch": 0.6455777921239509, + "grad_norm": 0.5136561393737793, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 2000 + }, + { + "epoch": 0.6488056810845707, + "grad_norm": 0.5145719647407532, + "learning_rate": 0.0002, + "loss": 0.8503, + "step": 2010 + }, + { + "epoch": 0.6520335700451905, + "grad_norm": 0.5430373549461365, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 2020 + }, + { + "epoch": 0.6552614590058102, + "grad_norm": 0.46347954869270325, + "learning_rate": 0.0002, + "loss": 0.8115, + "step": 2030 + }, + { + "epoch": 0.65848934796643, + "grad_norm": 0.5189562439918518, + "learning_rate": 0.0002, + "loss": 0.8769, + "step": 2040 + }, + { + "epoch": 0.6617172369270498, + "grad_norm": 0.43843990564346313, + "learning_rate": 0.0002, + "loss": 0.8453, + "step": 2050 + }, + { + "epoch": 0.6649451258876695, + "grad_norm": 0.4654983580112457, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 2060 + }, + { + "epoch": 0.6681730148482892, + "grad_norm": 0.44835716485977173, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 2070 + }, + { + "epoch": 0.6714009038089089, + "grad_norm": 0.38811734318733215, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2080 + }, + { + "epoch": 0.6746287927695287, + "grad_norm": 0.5709853172302246, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 2090 + }, + { + "epoch": 0.6778566817301485, + "grad_norm": 0.49994757771492004, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 2100 + }, + { + "epoch": 0.6810845706907682, + "grad_norm": 0.5505402684211731, + "learning_rate": 0.0002, + "loss": 0.8, + "step": 2110 + }, + { + "epoch": 0.684312459651388, + "grad_norm": 0.48195120692253113, + "learning_rate": 0.0002, + "loss": 0.8227, + "step": 2120 + }, + { + "epoch": 0.6875403486120077, + "grad_norm": 0.4854775071144104, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 2130 + }, + { + "epoch": 0.6907682375726275, + "grad_norm": 0.6422494649887085, + "learning_rate": 0.0002, + "loss": 0.8231, + "step": 2140 + }, + { + "epoch": 0.6939961265332473, + "grad_norm": 0.3972536027431488, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 2150 + }, + { + "epoch": 0.697224015493867, + "grad_norm": 0.4297836422920227, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 2160 + }, + { + "epoch": 0.7004519044544868, + "grad_norm": 0.45486778020858765, + "learning_rate": 0.0002, + "loss": 0.8017, + "step": 2170 + }, + { + "epoch": 0.7036797934151066, + "grad_norm": 0.4706047773361206, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 2180 + }, + { + "epoch": 0.7069076823757263, + "grad_norm": 0.46426892280578613, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 2190 + }, + { + "epoch": 0.7101355713363461, + "grad_norm": 0.46333715319633484, + "learning_rate": 0.0002, + "loss": 0.8472, + "step": 2200 + }, + { + "epoch": 0.7133634602969657, + "grad_norm": 0.4632524251937866, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 2210 + }, + { + "epoch": 0.7165913492575855, + "grad_norm": 0.4610830843448639, + "learning_rate": 0.0002, + "loss": 0.8452, + "step": 2220 + }, + { + "epoch": 0.7198192382182053, + "grad_norm": 0.4905324876308441, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 2230 + }, + { + "epoch": 0.723047127178825, + "grad_norm": 0.4936263859272003, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 2240 + }, + { + "epoch": 0.7262750161394448, + "grad_norm": 0.40778425335884094, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 2250 + }, + { + "epoch": 0.7295029051000645, + "grad_norm": 0.50351482629776, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 2260 + }, + { + "epoch": 0.7327307940606843, + "grad_norm": 0.4894128143787384, + "learning_rate": 0.0002, + "loss": 0.8475, + "step": 2270 + }, + { + "epoch": 0.7359586830213041, + "grad_norm": 0.5580906271934509, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 2280 + }, + { + "epoch": 0.7391865719819238, + "grad_norm": 0.4655369520187378, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 2290 + }, + { + "epoch": 0.7424144609425436, + "grad_norm": 0.4666965901851654, + "learning_rate": 0.0002, + "loss": 0.8395, + "step": 2300 + }, + { + "epoch": 0.7456423499031634, + "grad_norm": 0.46259936690330505, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 2310 + }, + { + "epoch": 0.7488702388637831, + "grad_norm": 0.520706832408905, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 2320 + }, + { + "epoch": 0.7520981278244029, + "grad_norm": 0.5142408013343811, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 2330 + }, + { + "epoch": 0.7553260167850226, + "grad_norm": 0.5355164408683777, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 2340 + }, + { + "epoch": 0.7585539057456423, + "grad_norm": 0.5517185926437378, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 2350 + }, + { + "epoch": 0.7617817947062621, + "grad_norm": 0.7162677049636841, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 2360 + }, + { + "epoch": 0.7650096836668818, + "grad_norm": 0.42402133345603943, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 2370 + }, + { + "epoch": 0.7682375726275016, + "grad_norm": 0.47180113196372986, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 2380 + }, + { + "epoch": 0.7714654615881213, + "grad_norm": 0.6262288689613342, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 2390 + }, + { + "epoch": 0.7746933505487411, + "grad_norm": 0.5177528262138367, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 2400 + }, + { + "epoch": 0.7779212395093609, + "grad_norm": 0.555721640586853, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 2410 + }, + { + "epoch": 0.7811491284699806, + "grad_norm": 0.5592644810676575, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 2420 + }, + { + "epoch": 0.7843770174306004, + "grad_norm": 0.38025397062301636, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 2430 + }, + { + "epoch": 0.7876049063912202, + "grad_norm": 0.4597472548484802, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 2440 + }, + { + "epoch": 0.7908327953518399, + "grad_norm": 0.4929825961589813, + "learning_rate": 0.0002, + "loss": 0.8575, + "step": 2450 + }, + { + "epoch": 0.7940606843124597, + "grad_norm": 0.45277655124664307, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 2460 + }, + { + "epoch": 0.7972885732730794, + "grad_norm": 0.6224122643470764, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2470 + }, + { + "epoch": 0.8005164622336992, + "grad_norm": 0.5740901827812195, + "learning_rate": 0.0002, + "loss": 0.8449, + "step": 2480 + }, + { + "epoch": 0.8037443511943189, + "grad_norm": 0.41335329413414, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 2490 + }, + { + "epoch": 0.8069722401549386, + "grad_norm": 0.4738694131374359, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 2500 + }, + { + "epoch": 0.8102001291155584, + "grad_norm": 0.5288197994232178, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 2510 + }, + { + "epoch": 0.8134280180761781, + "grad_norm": 0.5404666066169739, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 2520 + }, + { + "epoch": 0.8166559070367979, + "grad_norm": 0.4444909691810608, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 2530 + }, + { + "epoch": 0.8198837959974177, + "grad_norm": 0.542061448097229, + "learning_rate": 0.0002, + "loss": 0.8683, + "step": 2540 + }, + { + "epoch": 0.8231116849580374, + "grad_norm": 0.4914741814136505, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 2550 + }, + { + "epoch": 0.8263395739186572, + "grad_norm": 0.41703441739082336, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 2560 + }, + { + "epoch": 0.829567462879277, + "grad_norm": 0.5489841103553772, + "learning_rate": 0.0002, + "loss": 0.824, + "step": 2570 + }, + { + "epoch": 0.8327953518398967, + "grad_norm": 0.5359883308410645, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 2580 + }, + { + "epoch": 0.8360232408005165, + "grad_norm": 0.5541019439697266, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 2590 + }, + { + "epoch": 0.8392511297611362, + "grad_norm": 0.4746638834476471, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 2600 + }, + { + "epoch": 0.842479018721756, + "grad_norm": 0.5243194103240967, + "learning_rate": 0.0002, + "loss": 0.8116, + "step": 2610 + }, + { + "epoch": 0.8457069076823758, + "grad_norm": 0.46824976801872253, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 2620 + }, + { + "epoch": 0.8489347966429954, + "grad_norm": 0.49487847089767456, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 2630 + }, + { + "epoch": 0.8521626856036152, + "grad_norm": 0.42180097103118896, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 2640 + }, + { + "epoch": 0.855390574564235, + "grad_norm": 0.5516560077667236, + "learning_rate": 0.0002, + "loss": 0.8304, + "step": 2650 + }, + { + "epoch": 0.8586184635248547, + "grad_norm": 0.4392191767692566, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 2660 + }, + { + "epoch": 0.8618463524854745, + "grad_norm": 0.5387210845947266, + "learning_rate": 0.0002, + "loss": 0.848, + "step": 2670 + }, + { + "epoch": 0.8650742414460942, + "grad_norm": 0.6232406497001648, + "learning_rate": 0.0002, + "loss": 0.8094, + "step": 2680 + }, + { + "epoch": 0.868302130406714, + "grad_norm": 0.53749018907547, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 2690 + }, + { + "epoch": 0.8715300193673338, + "grad_norm": 0.47480374574661255, + "learning_rate": 0.0002, + "loss": 0.8299, + "step": 2700 + }, + { + "epoch": 0.8747579083279535, + "grad_norm": 0.44618046283721924, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 2710 + }, + { + "epoch": 0.8779857972885733, + "grad_norm": 0.4173581302165985, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 2720 + }, + { + "epoch": 0.881213686249193, + "grad_norm": 0.524081289768219, + "learning_rate": 0.0002, + "loss": 0.7713, + "step": 2730 + }, + { + "epoch": 0.8844415752098128, + "grad_norm": 0.5608431100845337, + "learning_rate": 0.0002, + "loss": 0.8738, + "step": 2740 + }, + { + "epoch": 0.8876694641704326, + "grad_norm": 0.5212284922599792, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 2750 + }, + { + "epoch": 0.8908973531310523, + "grad_norm": 0.5601475834846497, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 2760 + }, + { + "epoch": 0.8941252420916721, + "grad_norm": 0.4499223828315735, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 2770 + }, + { + "epoch": 0.8973531310522918, + "grad_norm": 0.46945226192474365, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 2780 + }, + { + "epoch": 0.9005810200129115, + "grad_norm": 0.4837495684623718, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 2790 + }, + { + "epoch": 0.9038089089735313, + "grad_norm": 0.5059258937835693, + "learning_rate": 0.0002, + "loss": 0.7887, + "step": 2800 + }, + { + "epoch": 0.907036797934151, + "grad_norm": 0.4857945144176483, + "learning_rate": 0.0002, + "loss": 0.8571, + "step": 2810 + }, + { + "epoch": 0.9102646868947708, + "grad_norm": 0.5001962780952454, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 2820 + }, + { + "epoch": 0.9134925758553906, + "grad_norm": 0.5468648672103882, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 2830 + }, + { + "epoch": 0.9167204648160103, + "grad_norm": 0.5533056259155273, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 2840 + }, + { + "epoch": 0.9199483537766301, + "grad_norm": 0.5909785628318787, + "learning_rate": 0.0002, + "loss": 0.7895, + "step": 2850 + }, + { + "epoch": 0.9231762427372499, + "grad_norm": 0.47428104281425476, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 2860 + }, + { + "epoch": 0.9264041316978696, + "grad_norm": 0.548814058303833, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2870 + }, + { + "epoch": 0.9296320206584894, + "grad_norm": 0.5576745271682739, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 2880 + }, + { + "epoch": 0.9328599096191091, + "grad_norm": 0.47094792127609253, + "learning_rate": 0.0002, + "loss": 0.8399, + "step": 2890 + }, + { + "epoch": 0.9360877985797289, + "grad_norm": 0.5408539772033691, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 2900 + }, + { + "epoch": 0.9393156875403487, + "grad_norm": 0.5922889113426208, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 2910 + }, + { + "epoch": 0.9425435765009683, + "grad_norm": 0.45462584495544434, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2920 + }, + { + "epoch": 0.9457714654615881, + "grad_norm": 0.6864947080612183, + "learning_rate": 0.0002, + "loss": 0.8344, + "step": 2930 + }, + { + "epoch": 0.9489993544222078, + "grad_norm": 0.4706299304962158, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 2940 + }, + { + "epoch": 0.9522272433828276, + "grad_norm": 0.5583269596099854, + "learning_rate": 0.0002, + "loss": 0.8422, + "step": 2950 + }, + { + "epoch": 0.9554551323434474, + "grad_norm": 0.51015704870224, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 2960 + }, + { + "epoch": 0.9586830213040671, + "grad_norm": 0.5325582027435303, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 2970 + }, + { + "epoch": 0.9619109102646869, + "grad_norm": 0.49008598923683167, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 2980 + }, + { + "epoch": 0.9651387992253067, + "grad_norm": 0.4422132074832916, + "learning_rate": 0.0002, + "loss": 0.8093, + "step": 2990 + }, + { + "epoch": 0.9683666881859264, + "grad_norm": 0.5053589344024658, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 3000 + }, + { + "epoch": 0.9715945771465462, + "grad_norm": 0.46754521131515503, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 3010 + }, + { + "epoch": 0.9748224661071659, + "grad_norm": 0.5613434910774231, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 3020 + }, + { + "epoch": 0.9780503550677857, + "grad_norm": 0.5052843689918518, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 3030 + }, + { + "epoch": 0.9812782440284055, + "grad_norm": 0.4270972013473511, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 3040 + }, + { + "epoch": 0.9845061329890252, + "grad_norm": 0.4974991977214813, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 3050 + }, + { + "epoch": 0.9877340219496449, + "grad_norm": 0.4432311952114105, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 3060 + }, + { + "epoch": 0.9909619109102646, + "grad_norm": 0.466457724571228, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 3070 + }, + { + "epoch": 0.9941897998708844, + "grad_norm": 0.6438009142875671, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 3080 + }, + { + "epoch": 0.9974176888315042, + "grad_norm": 0.5593604445457458, + "learning_rate": 0.0002, + "loss": 0.8425, + "step": 3090 + }, + { + "epoch": 1.0, + "eval_loss": 1.0958120822906494, + "eval_runtime": 148.3273, + "eval_samples_per_second": 4.942, + "eval_steps_per_second": 0.62, + "step": 3098 + }, + { + "epoch": 1.000645577792124, + "grad_norm": 0.5701445937156677, + "learning_rate": 0.0002, + "loss": 0.8275, + "step": 3100 + }, + { + "epoch": 1.0038734667527438, + "grad_norm": 0.6089657545089722, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 3110 + }, + { + "epoch": 1.0071013557133635, + "grad_norm": 0.5619552135467529, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 3120 + }, + { + "epoch": 1.010329244673983, + "grad_norm": 0.5550283789634705, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 3130 + }, + { + "epoch": 1.013557133634603, + "grad_norm": 0.6221792101860046, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3140 + }, + { + "epoch": 1.0167850225952226, + "grad_norm": 0.5450758934020996, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 3150 + }, + { + "epoch": 1.0200129115558425, + "grad_norm": 0.4359588027000427, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 3160 + }, + { + "epoch": 1.0232408005164622, + "grad_norm": 0.5932239890098572, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 3170 + }, + { + "epoch": 1.026468689477082, + "grad_norm": 0.45478707551956177, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 3180 + }, + { + "epoch": 1.0296965784377017, + "grad_norm": 0.677615761756897, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 3190 + }, + { + "epoch": 1.0329244673983216, + "grad_norm": 0.6231790781021118, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3200 + }, + { + "epoch": 1.0361523563589412, + "grad_norm": 0.5074195861816406, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 3210 + }, + { + "epoch": 1.039380245319561, + "grad_norm": 0.4844142198562622, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 3220 + }, + { + "epoch": 1.0426081342801807, + "grad_norm": 0.5372750759124756, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 3230 + }, + { + "epoch": 1.0458360232408006, + "grad_norm": 0.46296265721321106, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 3240 + }, + { + "epoch": 1.0490639122014203, + "grad_norm": 0.5417148470878601, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 3250 + }, + { + "epoch": 1.0522918011620401, + "grad_norm": 0.5695074200630188, + "learning_rate": 0.0002, + "loss": 0.7637, + "step": 3260 + }, + { + "epoch": 1.0555196901226598, + "grad_norm": 0.5050092935562134, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 3270 + }, + { + "epoch": 1.0587475790832794, + "grad_norm": 0.5320752263069153, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 3280 + }, + { + "epoch": 1.0619754680438993, + "grad_norm": 0.5832052230834961, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 3290 + }, + { + "epoch": 1.065203357004519, + "grad_norm": 0.5228804349899292, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 3300 + }, + { + "epoch": 1.0684312459651388, + "grad_norm": 0.5819445252418518, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 3310 + }, + { + "epoch": 1.0716591349257585, + "grad_norm": 0.4201328754425049, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 3320 + }, + { + "epoch": 1.0748870238863784, + "grad_norm": 0.5424145460128784, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 3330 + }, + { + "epoch": 1.078114912846998, + "grad_norm": 0.6169946789741516, + "learning_rate": 0.0002, + "loss": 0.7828, + "step": 3340 + }, + { + "epoch": 1.0813428018076179, + "grad_norm": 0.607676088809967, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 3350 + }, + { + "epoch": 1.0845706907682375, + "grad_norm": 0.5191982388496399, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 3360 + }, + { + "epoch": 1.0877985797288574, + "grad_norm": 0.5728003978729248, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 3370 + }, + { + "epoch": 1.091026468689477, + "grad_norm": 0.5402643084526062, + "learning_rate": 0.0002, + "loss": 0.7381, + "step": 3380 + }, + { + "epoch": 1.094254357650097, + "grad_norm": 0.5377541780471802, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 3390 + }, + { + "epoch": 1.0974822466107166, + "grad_norm": 0.4751385748386383, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 3400 + }, + { + "epoch": 1.1007101355713362, + "grad_norm": 0.559158444404602, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 3410 + }, + { + "epoch": 1.103938024531956, + "grad_norm": 0.4917701482772827, + "learning_rate": 0.0002, + "loss": 0.7366, + "step": 3420 + }, + { + "epoch": 1.1071659134925758, + "grad_norm": 0.5507875084877014, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 3430 + }, + { + "epoch": 1.1103938024531956, + "grad_norm": 0.45458680391311646, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 3440 + }, + { + "epoch": 1.1136216914138153, + "grad_norm": 0.5721744894981384, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 3450 + }, + { + "epoch": 1.1168495803744352, + "grad_norm": 0.5776081681251526, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 3460 + }, + { + "epoch": 1.1200774693350548, + "grad_norm": 0.5261953473091125, + "learning_rate": 0.0002, + "loss": 0.7644, + "step": 3470 + }, + { + "epoch": 1.1233053582956747, + "grad_norm": 0.47759532928466797, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 3480 + }, + { + "epoch": 1.1265332472562943, + "grad_norm": 0.5697659850120544, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 3490 + }, + { + "epoch": 1.1297611362169142, + "grad_norm": 0.5643419623374939, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 3500 + }, + { + "epoch": 1.1329890251775339, + "grad_norm": 0.6502931118011475, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 3510 + }, + { + "epoch": 1.1362169141381537, + "grad_norm": 0.5236507654190063, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 3520 + }, + { + "epoch": 1.1394448030987734, + "grad_norm": 0.6521499156951904, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 3530 + }, + { + "epoch": 1.142672692059393, + "grad_norm": 0.5893217325210571, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 3540 + }, + { + "epoch": 1.145900581020013, + "grad_norm": 0.5300073027610779, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 3550 + }, + { + "epoch": 1.1491284699806328, + "grad_norm": 0.6794660091400146, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 3560 + }, + { + "epoch": 1.1523563589412524, + "grad_norm": 0.5420064926147461, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 3570 + }, + { + "epoch": 1.155584247901872, + "grad_norm": 0.5096590518951416, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 3580 + }, + { + "epoch": 1.158812136862492, + "grad_norm": 0.5726043581962585, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 3590 + }, + { + "epoch": 1.1620400258231116, + "grad_norm": 0.7388110160827637, + "learning_rate": 0.0002, + "loss": 0.7728, + "step": 3600 + }, + { + "epoch": 1.1652679147837315, + "grad_norm": 0.5597969889640808, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 3610 + }, + { + "epoch": 1.1684958037443511, + "grad_norm": 0.5067800283432007, + "learning_rate": 0.0002, + "loss": 0.7132, + "step": 3620 + }, + { + "epoch": 1.171723692704971, + "grad_norm": 0.6625118255615234, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 3630 + }, + { + "epoch": 1.1749515816655907, + "grad_norm": 0.5830849409103394, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 3640 + }, + { + "epoch": 1.1781794706262105, + "grad_norm": 0.6140692830085754, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 3650 + }, + { + "epoch": 1.1814073595868302, + "grad_norm": 0.714523434638977, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 3660 + }, + { + "epoch": 1.18463524854745, + "grad_norm": 0.5196696519851685, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 3670 + }, + { + "epoch": 1.1878631375080697, + "grad_norm": 0.6677889823913574, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 3680 + }, + { + "epoch": 1.1910910264686896, + "grad_norm": 0.47095245122909546, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 3690 + }, + { + "epoch": 1.1943189154293092, + "grad_norm": 0.5197778940200806, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 3700 + }, + { + "epoch": 1.1975468043899289, + "grad_norm": 0.5156530141830444, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 3710 + }, + { + "epoch": 1.2007746933505488, + "grad_norm": 0.6968549489974976, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 3720 + }, + { + "epoch": 1.2040025823111684, + "grad_norm": 0.48983848094940186, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 3730 + }, + { + "epoch": 1.2072304712717883, + "grad_norm": 0.6709973216056824, + "learning_rate": 0.0002, + "loss": 0.7163, + "step": 3740 + }, + { + "epoch": 1.210458360232408, + "grad_norm": 0.48681750893592834, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 3750 + }, + { + "epoch": 1.2136862491930278, + "grad_norm": 0.49475061893463135, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 3760 + }, + { + "epoch": 1.2169141381536475, + "grad_norm": 0.6163983345031738, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 3770 + }, + { + "epoch": 1.2201420271142673, + "grad_norm": 0.5481411218643188, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 3780 + }, + { + "epoch": 1.223369916074887, + "grad_norm": 0.620639979839325, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 3790 + }, + { + "epoch": 1.2265978050355069, + "grad_norm": 0.7017222046852112, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 3800 + }, + { + "epoch": 1.2298256939961265, + "grad_norm": 0.5872400403022766, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 3810 + }, + { + "epoch": 1.2330535829567464, + "grad_norm": 0.45765596628189087, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 3820 + }, + { + "epoch": 1.236281471917366, + "grad_norm": 0.5676377415657043, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 3830 + }, + { + "epoch": 1.2395093608779857, + "grad_norm": 0.4793425500392914, + "learning_rate": 0.0002, + "loss": 0.7696, + "step": 3840 + }, + { + "epoch": 1.2427372498386056, + "grad_norm": 0.5060022473335266, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 3850 + }, + { + "epoch": 1.2459651387992252, + "grad_norm": 0.6140682697296143, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 3860 + }, + { + "epoch": 1.249193027759845, + "grad_norm": 0.5030326843261719, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 3870 + }, + { + "epoch": 1.2524209167204647, + "grad_norm": 0.6609430909156799, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 3880 + }, + { + "epoch": 1.2556488056810846, + "grad_norm": 0.5459545850753784, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 3890 + }, + { + "epoch": 1.2588766946417043, + "grad_norm": 0.5328870415687561, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 3900 + }, + { + "epoch": 1.2621045836023241, + "grad_norm": 0.5840652585029602, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 3910 + }, + { + "epoch": 1.2653324725629438, + "grad_norm": 0.5587584376335144, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 3920 + }, + { + "epoch": 1.2685603615235637, + "grad_norm": 0.5886949896812439, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 3930 + }, + { + "epoch": 1.2717882504841833, + "grad_norm": 0.5128693580627441, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 3940 + }, + { + "epoch": 1.2750161394448032, + "grad_norm": 0.6207669377326965, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 3950 + }, + { + "epoch": 1.2782440284054228, + "grad_norm": 0.5789574384689331, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 3960 + }, + { + "epoch": 1.2814719173660425, + "grad_norm": 0.503162145614624, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 3970 + }, + { + "epoch": 1.2846998063266624, + "grad_norm": 0.6670064926147461, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 3980 + }, + { + "epoch": 1.2879276952872822, + "grad_norm": 0.5676213502883911, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 3990 + }, + { + "epoch": 1.2911555842479019, + "grad_norm": 0.5383169054985046, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 4000 + }, + { + "epoch": 1.2943834732085215, + "grad_norm": 0.714743971824646, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 4010 + }, + { + "epoch": 1.2976113621691414, + "grad_norm": 0.5740262269973755, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 4020 + }, + { + "epoch": 1.300839251129761, + "grad_norm": 0.6143045425415039, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 4030 + }, + { + "epoch": 1.304067140090381, + "grad_norm": 0.501025378704071, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 4040 + }, + { + "epoch": 1.3072950290510006, + "grad_norm": 0.5784100294113159, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 4050 + }, + { + "epoch": 1.3105229180116205, + "grad_norm": 0.6182606220245361, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 4060 + }, + { + "epoch": 1.3137508069722401, + "grad_norm": 0.5072231292724609, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 4070 + }, + { + "epoch": 1.31697869593286, + "grad_norm": 0.6841012835502625, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 4080 + }, + { + "epoch": 1.3202065848934796, + "grad_norm": 0.697257936000824, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 4090 + }, + { + "epoch": 1.3234344738540993, + "grad_norm": 0.5113214254379272, + "learning_rate": 0.0002, + "loss": 0.7401, + "step": 4100 + }, + { + "epoch": 1.3266623628147192, + "grad_norm": 0.6270561814308167, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 4110 + }, + { + "epoch": 1.329890251775339, + "grad_norm": 0.5525947213172913, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 4120 + }, + { + "epoch": 1.3331181407359587, + "grad_norm": 0.546071469783783, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 4130 + }, + { + "epoch": 1.3363460296965783, + "grad_norm": 0.6516721248626709, + "learning_rate": 0.0002, + "loss": 0.7884, + "step": 4140 + }, + { + "epoch": 1.3395739186571982, + "grad_norm": 0.6235111355781555, + "learning_rate": 0.0002, + "loss": 0.755, + "step": 4150 + }, + { + "epoch": 1.3428018076178179, + "grad_norm": 0.538649320602417, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 4160 + }, + { + "epoch": 1.3460296965784377, + "grad_norm": 0.5367001891136169, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 4170 + }, + { + "epoch": 1.3492575855390574, + "grad_norm": 0.6134631037712097, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 4180 + }, + { + "epoch": 1.3524854744996773, + "grad_norm": 0.5827262997627258, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 4190 + }, + { + "epoch": 1.355713363460297, + "grad_norm": 0.5706096291542053, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 4200 + }, + { + "epoch": 1.3589412524209168, + "grad_norm": 0.6422057151794434, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 4210 + }, + { + "epoch": 1.3621691413815364, + "grad_norm": 0.6316141486167908, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 4220 + }, + { + "epoch": 1.365397030342156, + "grad_norm": 0.6946983933448792, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 4230 + }, + { + "epoch": 1.368624919302776, + "grad_norm": 0.5381525754928589, + "learning_rate": 0.0002, + "loss": 0.7388, + "step": 4240 + }, + { + "epoch": 1.3718528082633958, + "grad_norm": 0.5484845638275146, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 4250 + }, + { + "epoch": 1.3750806972240155, + "grad_norm": 0.5961896777153015, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 4260 + }, + { + "epoch": 1.3783085861846351, + "grad_norm": 0.6041752696037292, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 4270 + }, + { + "epoch": 1.381536475145255, + "grad_norm": 0.6283464431762695, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 4280 + }, + { + "epoch": 1.384764364105875, + "grad_norm": 0.6761324405670166, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 4290 + }, + { + "epoch": 1.3879922530664945, + "grad_norm": 0.504311203956604, + "learning_rate": 0.0002, + "loss": 0.7381, + "step": 4300 + }, + { + "epoch": 1.3912201420271142, + "grad_norm": 0.6100395917892456, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 4310 + }, + { + "epoch": 1.394448030987734, + "grad_norm": 0.6245788335800171, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 4320 + }, + { + "epoch": 1.3976759199483537, + "grad_norm": 0.6074621081352234, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 4330 + }, + { + "epoch": 1.4009038089089736, + "grad_norm": 0.6683838963508606, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 4340 + }, + { + "epoch": 1.4041316978695932, + "grad_norm": 0.622998058795929, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 4350 + }, + { + "epoch": 1.4073595868302131, + "grad_norm": 0.6089423894882202, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 4360 + }, + { + "epoch": 1.4105874757908328, + "grad_norm": 0.6381658911705017, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 4370 + }, + { + "epoch": 1.4138153647514526, + "grad_norm": 0.5419308543205261, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4380 + }, + { + "epoch": 1.4170432537120723, + "grad_norm": 0.6026232242584229, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 4390 + }, + { + "epoch": 1.420271142672692, + "grad_norm": 0.4911101162433624, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 4400 + }, + { + "epoch": 1.4234990316333118, + "grad_norm": 0.6302908062934875, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 4410 + }, + { + "epoch": 1.4267269205939317, + "grad_norm": 0.6692768931388855, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 4420 + }, + { + "epoch": 1.4299548095545513, + "grad_norm": 0.46294572949409485, + "learning_rate": 0.0002, + "loss": 0.7312, + "step": 4430 + }, + { + "epoch": 1.433182698515171, + "grad_norm": 0.5452619194984436, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 4440 + }, + { + "epoch": 1.4364105874757909, + "grad_norm": 0.7809233069419861, + "learning_rate": 0.0002, + "loss": 0.7974, + "step": 4450 + }, + { + "epoch": 1.4396384764364105, + "grad_norm": 0.550088107585907, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 4460 + }, + { + "epoch": 1.4428663653970304, + "grad_norm": 0.7139151096343994, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 4470 + }, + { + "epoch": 1.44609425435765, + "grad_norm": 0.6187090873718262, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 4480 + }, + { + "epoch": 1.44932214331827, + "grad_norm": 0.5948249101638794, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 4490 + }, + { + "epoch": 1.4525500322788896, + "grad_norm": 0.6510892510414124, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 4500 + }, + { + "epoch": 1.4557779212395094, + "grad_norm": 0.6552293300628662, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 4510 + }, + { + "epoch": 1.459005810200129, + "grad_norm": 0.585574209690094, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 4520 + }, + { + "epoch": 1.4622336991607487, + "grad_norm": 0.4830162823200226, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 4530 + }, + { + "epoch": 1.4654615881213686, + "grad_norm": 0.5780223608016968, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 4540 + }, + { + "epoch": 1.4686894770819885, + "grad_norm": 0.5462607145309448, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 4550 + }, + { + "epoch": 1.4719173660426081, + "grad_norm": 0.5183546543121338, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 4560 + }, + { + "epoch": 1.4751452550032278, + "grad_norm": 0.676917552947998, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 4570 + }, + { + "epoch": 1.4783731439638477, + "grad_norm": 0.5772345066070557, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 4580 + }, + { + "epoch": 1.4816010329244673, + "grad_norm": 0.7320035696029663, + "learning_rate": 0.0002, + "loss": 0.7709, + "step": 4590 + }, + { + "epoch": 1.4848289218850872, + "grad_norm": 0.5024042129516602, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 4600 + }, + { + "epoch": 1.4880568108457068, + "grad_norm": 0.5482868552207947, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 4610 + }, + { + "epoch": 1.4912846998063267, + "grad_norm": 0.5447399616241455, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 4620 + }, + { + "epoch": 1.4945125887669464, + "grad_norm": 0.5953414440155029, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 4630 + }, + { + "epoch": 1.4977404777275662, + "grad_norm": 0.6983066201210022, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 4640 + }, + { + "epoch": 1.500968366688186, + "grad_norm": 0.586327075958252, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 4650 + }, + { + "epoch": 1.5041962556488055, + "grad_norm": 0.5839682221412659, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 4660 + }, + { + "epoch": 1.5074241446094254, + "grad_norm": 0.5959209203720093, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 4670 + }, + { + "epoch": 1.5106520335700453, + "grad_norm": 0.5073857307434082, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 4680 + }, + { + "epoch": 1.513879922530665, + "grad_norm": 0.5183001160621643, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 4690 + }, + { + "epoch": 1.5171078114912846, + "grad_norm": 0.593530535697937, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 4700 + }, + { + "epoch": 1.5203357004519045, + "grad_norm": 0.675993025302887, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 4710 + }, + { + "epoch": 1.5235635894125243, + "grad_norm": 0.5823286771774292, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 4720 + }, + { + "epoch": 1.526791478373144, + "grad_norm": 0.5825035572052002, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 4730 + }, + { + "epoch": 1.5300193673337636, + "grad_norm": 0.5689691305160522, + "learning_rate": 0.0002, + "loss": 0.8287, + "step": 4740 + }, + { + "epoch": 1.5332472562943835, + "grad_norm": 0.6037150621414185, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 4750 + }, + { + "epoch": 1.5364751452550034, + "grad_norm": 0.6393677592277527, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 4760 + }, + { + "epoch": 1.539703034215623, + "grad_norm": 0.5926381945610046, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 4770 + }, + { + "epoch": 1.5429309231762427, + "grad_norm": 0.9468599557876587, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 4780 + }, + { + "epoch": 1.5461588121368623, + "grad_norm": 0.7544237375259399, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 4790 + }, + { + "epoch": 1.5493867010974822, + "grad_norm": 0.5308566093444824, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 4800 + }, + { + "epoch": 1.552614590058102, + "grad_norm": 0.6590296030044556, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 4810 + }, + { + "epoch": 1.5558424790187217, + "grad_norm": 0.5630404353141785, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 4820 + }, + { + "epoch": 1.5590703679793414, + "grad_norm": 0.6800200939178467, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 4830 + }, + { + "epoch": 1.5622982569399613, + "grad_norm": 0.5463718175888062, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 4840 + }, + { + "epoch": 1.5655261459005811, + "grad_norm": 0.505135178565979, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 4850 + }, + { + "epoch": 1.5687540348612008, + "grad_norm": 0.5469676852226257, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 4860 + }, + { + "epoch": 1.5719819238218204, + "grad_norm": 0.5318337678909302, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 4870 + }, + { + "epoch": 1.5752098127824403, + "grad_norm": 0.7287914752960205, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 4880 + }, + { + "epoch": 1.5784377017430602, + "grad_norm": 0.7318989038467407, + "learning_rate": 0.0002, + "loss": 0.7532, + "step": 4890 + }, + { + "epoch": 1.5816655907036798, + "grad_norm": 0.6499921679496765, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 4900 + }, + { + "epoch": 1.5848934796642995, + "grad_norm": 0.47907355427742004, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 4910 + }, + { + "epoch": 1.5881213686249191, + "grad_norm": 0.7338833808898926, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 4920 + }, + { + "epoch": 1.591349257585539, + "grad_norm": 0.5800719261169434, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 4930 + }, + { + "epoch": 1.594577146546159, + "grad_norm": 0.5365763306617737, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 4940 + }, + { + "epoch": 1.5978050355067785, + "grad_norm": 0.5800772309303284, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 4950 + }, + { + "epoch": 1.6010329244673982, + "grad_norm": 0.7878010869026184, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 4960 + }, + { + "epoch": 1.604260813428018, + "grad_norm": 0.5919058918952942, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 4970 + }, + { + "epoch": 1.607488702388638, + "grad_norm": 0.5004435181617737, + "learning_rate": 0.0002, + "loss": 0.7762, + "step": 4980 + }, + { + "epoch": 1.6107165913492576, + "grad_norm": 0.6299242377281189, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 4990 + }, + { + "epoch": 1.6139444803098772, + "grad_norm": 0.6307242512702942, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 5000 + }, + { + "epoch": 1.6171723692704971, + "grad_norm": 0.7838703989982605, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 5010 + }, + { + "epoch": 1.620400258231117, + "grad_norm": 0.6454671621322632, + "learning_rate": 0.0002, + "loss": 0.7364, + "step": 5020 + }, + { + "epoch": 1.6236281471917366, + "grad_norm": 0.5907095670700073, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 5030 + }, + { + "epoch": 1.6268560361523563, + "grad_norm": 0.6053501963615417, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 5040 + }, + { + "epoch": 1.630083925112976, + "grad_norm": 0.5644670128822327, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 5050 + }, + { + "epoch": 1.6333118140735958, + "grad_norm": 0.6320949792861938, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 5060 + }, + { + "epoch": 1.6365397030342157, + "grad_norm": 0.6101489067077637, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 5070 + }, + { + "epoch": 1.6397675919948353, + "grad_norm": 0.9435283541679382, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 5080 + }, + { + "epoch": 1.642995480955455, + "grad_norm": 0.6668919324874878, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 5090 + }, + { + "epoch": 1.6462233699160749, + "grad_norm": 0.6160340905189514, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 5100 + }, + { + "epoch": 1.6494512588766947, + "grad_norm": 0.5999835729598999, + "learning_rate": 0.0002, + "loss": 0.7461, + "step": 5110 + }, + { + "epoch": 1.6526791478373144, + "grad_norm": 0.9378551840782166, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 5120 + }, + { + "epoch": 1.655907036797934, + "grad_norm": 0.4795055389404297, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 5130 + }, + { + "epoch": 1.659134925758554, + "grad_norm": 0.4878861606121063, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 5140 + }, + { + "epoch": 1.6623628147191738, + "grad_norm": 0.6042965054512024, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 5150 + }, + { + "epoch": 1.6655907036797934, + "grad_norm": 0.5829901695251465, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 5160 + }, + { + "epoch": 1.668818592640413, + "grad_norm": 0.5168480277061462, + "learning_rate": 0.0002, + "loss": 0.7498, + "step": 5170 + }, + { + "epoch": 1.672046481601033, + "grad_norm": 0.6489511132240295, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 5180 + }, + { + "epoch": 1.6752743705616526, + "grad_norm": 0.5955966114997864, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 5190 + }, + { + "epoch": 1.6785022595222725, + "grad_norm": 0.6228088140487671, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 5200 + }, + { + "epoch": 1.6817301484828922, + "grad_norm": 0.5726390480995178, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 5210 + }, + { + "epoch": 1.6849580374435118, + "grad_norm": 0.6116343140602112, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 5220 + }, + { + "epoch": 1.6881859264041317, + "grad_norm": 0.5483687520027161, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 5230 + }, + { + "epoch": 1.6914138153647515, + "grad_norm": 0.570941686630249, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 5240 + }, + { + "epoch": 1.6946417043253712, + "grad_norm": 0.6048086285591125, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 5250 + }, + { + "epoch": 1.6978695932859909, + "grad_norm": 0.6769003868103027, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 5260 + }, + { + "epoch": 1.7010974822466107, + "grad_norm": 0.5629057884216309, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 5270 + }, + { + "epoch": 1.7043253712072306, + "grad_norm": 0.657341480255127, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 5280 + }, + { + "epoch": 1.7075532601678503, + "grad_norm": 0.6256147623062134, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 5290 + }, + { + "epoch": 1.71078114912847, + "grad_norm": 0.5498088002204895, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 5300 + }, + { + "epoch": 1.7140090380890898, + "grad_norm": 0.5078358054161072, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 5310 + }, + { + "epoch": 1.7172369270497096, + "grad_norm": 0.6696692705154419, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 5320 + }, + { + "epoch": 1.7204648160103293, + "grad_norm": 0.6692847013473511, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 5330 + }, + { + "epoch": 1.723692704970949, + "grad_norm": 0.5415751934051514, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 5340 + }, + { + "epoch": 1.7269205939315686, + "grad_norm": 0.5367611050605774, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 5350 + }, + { + "epoch": 1.7301484828921885, + "grad_norm": 0.7321061491966248, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 5360 + }, + { + "epoch": 1.7333763718528084, + "grad_norm": 0.723972499370575, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5370 + }, + { + "epoch": 1.736604260813428, + "grad_norm": 0.7328100204467773, + "learning_rate": 0.0002, + "loss": 0.7077, + "step": 5380 + }, + { + "epoch": 1.7398321497740477, + "grad_norm": 0.5785264372825623, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 5390 + }, + { + "epoch": 1.7430600387346675, + "grad_norm": 0.7812932133674622, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 5400 + }, + { + "epoch": 1.7462879276952874, + "grad_norm": 0.6493327617645264, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 5410 + }, + { + "epoch": 1.749515816655907, + "grad_norm": 0.5825939774513245, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 5420 + }, + { + "epoch": 1.7527437056165267, + "grad_norm": 0.6969610452651978, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 5430 + }, + { + "epoch": 1.7559715945771466, + "grad_norm": 0.5558062195777893, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 5440 + }, + { + "epoch": 1.7591994835377665, + "grad_norm": 0.49222221970558167, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 5450 + }, + { + "epoch": 1.762427372498386, + "grad_norm": 0.5844656825065613, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 5460 + }, + { + "epoch": 1.7656552614590058, + "grad_norm": 0.8706597685813904, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 5470 + }, + { + "epoch": 1.7688831504196254, + "grad_norm": 0.6167706251144409, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 5480 + }, + { + "epoch": 1.7721110393802453, + "grad_norm": 0.5890011787414551, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 5490 + }, + { + "epoch": 1.7753389283408652, + "grad_norm": 0.6551728248596191, + "learning_rate": 0.0002, + "loss": 0.8319, + "step": 5500 + }, + { + "epoch": 1.7785668173014848, + "grad_norm": 0.5848751068115234, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 5510 + }, + { + "epoch": 1.7817947062621045, + "grad_norm": 0.6664014458656311, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 5520 + }, + { + "epoch": 1.7850225952227243, + "grad_norm": 0.5931693911552429, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 5530 + }, + { + "epoch": 1.7882504841833442, + "grad_norm": 0.5534724593162537, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 5540 + }, + { + "epoch": 1.7914783731439639, + "grad_norm": 0.5590878129005432, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 5550 + }, + { + "epoch": 1.7947062621045835, + "grad_norm": 0.6947470903396606, + "learning_rate": 0.0002, + "loss": 0.7406, + "step": 5560 + }, + { + "epoch": 1.7979341510652034, + "grad_norm": 0.6104130148887634, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 5570 + }, + { + "epoch": 1.8011620400258233, + "grad_norm": 0.6135714054107666, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 5580 + }, + { + "epoch": 1.804389928986443, + "grad_norm": 0.6626853346824646, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 5590 + }, + { + "epoch": 1.8076178179470626, + "grad_norm": 0.6977612972259521, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 5600 + }, + { + "epoch": 1.8108457069076824, + "grad_norm": 0.6275238394737244, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 5610 + }, + { + "epoch": 1.814073595868302, + "grad_norm": 0.5017505288124084, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 5620 + }, + { + "epoch": 1.817301484828922, + "grad_norm": 0.8314290642738342, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 5630 + }, + { + "epoch": 1.8205293737895416, + "grad_norm": 0.6863582134246826, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 5640 + }, + { + "epoch": 1.8237572627501613, + "grad_norm": 0.69544917345047, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 5650 + }, + { + "epoch": 1.8269851517107811, + "grad_norm": 0.515499472618103, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 5660 + }, + { + "epoch": 1.830213040671401, + "grad_norm": 0.6100873947143555, + "learning_rate": 0.0002, + "loss": 0.7166, + "step": 5670 + }, + { + "epoch": 1.8334409296320207, + "grad_norm": 0.67416912317276, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 5680 + }, + { + "epoch": 1.8366688185926403, + "grad_norm": 0.7057772278785706, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 5690 + }, + { + "epoch": 1.8398967075532602, + "grad_norm": 0.7374551892280579, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 5700 + }, + { + "epoch": 1.84312459651388, + "grad_norm": 0.6266297101974487, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 5710 + }, + { + "epoch": 1.8463524854744997, + "grad_norm": 0.5629227757453918, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 5720 + }, + { + "epoch": 1.8495803744351194, + "grad_norm": 0.6603655815124512, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 5730 + }, + { + "epoch": 1.8528082633957392, + "grad_norm": 0.8113715052604675, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 5740 + }, + { + "epoch": 1.856036152356359, + "grad_norm": 0.7143914103507996, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 5750 + }, + { + "epoch": 1.8592640413169788, + "grad_norm": 0.6273732781410217, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 5760 + }, + { + "epoch": 1.8624919302775984, + "grad_norm": 0.5428690910339355, + "learning_rate": 0.0002, + "loss": 0.7962, + "step": 5770 + }, + { + "epoch": 1.865719819238218, + "grad_norm": 0.6405037641525269, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 5780 + }, + { + "epoch": 1.868947708198838, + "grad_norm": 0.700873613357544, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 5790 + }, + { + "epoch": 1.8721755971594578, + "grad_norm": 0.5645238161087036, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 5800 + }, + { + "epoch": 1.8754034861200775, + "grad_norm": 0.8780353665351868, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 5810 + }, + { + "epoch": 1.878631375080697, + "grad_norm": 0.6295409798622131, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 5820 + }, + { + "epoch": 1.881859264041317, + "grad_norm": 0.678269624710083, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 5830 + }, + { + "epoch": 1.8850871530019369, + "grad_norm": 0.6464608907699585, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5840 + }, + { + "epoch": 1.8883150419625565, + "grad_norm": 0.6201048493385315, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 5850 + }, + { + "epoch": 1.8915429309231762, + "grad_norm": 0.6046274304389954, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 5860 + }, + { + "epoch": 1.894770819883796, + "grad_norm": 0.7532408833503723, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 5870 + }, + { + "epoch": 1.897998708844416, + "grad_norm": 0.6066767573356628, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 5880 + }, + { + "epoch": 1.9012265978050356, + "grad_norm": 0.6289830207824707, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 5890 + }, + { + "epoch": 1.9044544867656552, + "grad_norm": 0.5204319953918457, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 5900 + }, + { + "epoch": 1.9076823757262749, + "grad_norm": 0.6708219647407532, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 5910 + }, + { + "epoch": 1.9109102646868947, + "grad_norm": 0.4915677309036255, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 5920 + }, + { + "epoch": 1.9141381536475146, + "grad_norm": 0.652717113494873, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 5930 + }, + { + "epoch": 1.9173660426081343, + "grad_norm": 0.5446316003799438, + "learning_rate": 0.0002, + "loss": 0.7687, + "step": 5940 + }, + { + "epoch": 1.920593931568754, + "grad_norm": 0.4958149194717407, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 5950 + }, + { + "epoch": 1.9238218205293738, + "grad_norm": 0.5623434782028198, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 5960 + }, + { + "epoch": 1.9270497094899937, + "grad_norm": 0.6855450868606567, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 5970 + }, + { + "epoch": 1.9302775984506133, + "grad_norm": 0.5710492730140686, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 5980 + }, + { + "epoch": 1.933505487411233, + "grad_norm": 0.5379431843757629, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 5990 + }, + { + "epoch": 1.9367333763718528, + "grad_norm": 0.557129442691803, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 6000 + }, + { + "epoch": 1.9399612653324727, + "grad_norm": 0.6336663961410522, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 6010 + }, + { + "epoch": 1.9431891542930924, + "grad_norm": 0.5950582027435303, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 6020 + }, + { + "epoch": 1.946417043253712, + "grad_norm": 0.5905954837799072, + "learning_rate": 0.0002, + "loss": 0.7443, + "step": 6030 + }, + { + "epoch": 1.9496449322143317, + "grad_norm": 0.6688982844352722, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 6040 + }, + { + "epoch": 1.9528728211749515, + "grad_norm": 0.5440775752067566, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 6050 + }, + { + "epoch": 1.9561007101355714, + "grad_norm": 0.6207906603813171, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 6060 + }, + { + "epoch": 1.959328599096191, + "grad_norm": 0.6999374628067017, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 6070 + }, + { + "epoch": 1.9625564880568107, + "grad_norm": 0.6310848593711853, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 6080 + }, + { + "epoch": 1.9657843770174306, + "grad_norm": 0.5903388261795044, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 6090 + }, + { + "epoch": 1.9690122659780505, + "grad_norm": 0.6333889961242676, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 6100 + }, + { + "epoch": 1.97224015493867, + "grad_norm": 0.5604711174964905, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 6110 + }, + { + "epoch": 1.9754680438992898, + "grad_norm": 0.9234541654586792, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 6120 + }, + { + "epoch": 1.9786959328599096, + "grad_norm": 0.6149102449417114, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 6130 + }, + { + "epoch": 1.9819238218205295, + "grad_norm": 0.615446150302887, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 6140 + }, + { + "epoch": 1.9851517107811492, + "grad_norm": 0.5176635980606079, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 6150 + }, + { + "epoch": 1.9883795997417688, + "grad_norm": 0.7124109864234924, + "learning_rate": 0.0002, + "loss": 0.718, + "step": 6160 + }, + { + "epoch": 1.9916074887023887, + "grad_norm": 0.6317567825317383, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 6170 + }, + { + "epoch": 1.9948353776630086, + "grad_norm": 0.6855016350746155, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 6180 + }, + { + "epoch": 1.9980632666236282, + "grad_norm": 0.6423715353012085, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 6190 + }, + { + "epoch": 2.0, + "eval_loss": 1.1096643209457397, + "eval_runtime": 147.7997, + "eval_samples_per_second": 4.959, + "eval_steps_per_second": 0.622, + "step": 6196 + }, + { + "epoch": 2.001291155584248, + "grad_norm": 0.5322932600975037, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 6200 + }, + { + "epoch": 2.0045190445448675, + "grad_norm": 0.8152306079864502, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 6210 + }, + { + "epoch": 2.0077469335054876, + "grad_norm": 0.6215983033180237, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 6220 + }, + { + "epoch": 2.0109748224661073, + "grad_norm": 0.845498263835907, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 6230 + }, + { + "epoch": 2.014202711426727, + "grad_norm": 0.733559787273407, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 6240 + }, + { + "epoch": 2.0174306003873466, + "grad_norm": 0.51433926820755, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 6250 + }, + { + "epoch": 2.020658489347966, + "grad_norm": 0.6374049782752991, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 6260 + }, + { + "epoch": 2.0238863783085863, + "grad_norm": 0.7833638191223145, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 6270 + }, + { + "epoch": 2.027114267269206, + "grad_norm": 0.8929463028907776, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 6280 + }, + { + "epoch": 2.0303421562298256, + "grad_norm": 0.669731855392456, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 6290 + }, + { + "epoch": 2.0335700451904453, + "grad_norm": 0.5846071243286133, + "learning_rate": 0.0002, + "loss": 0.646, + "step": 6300 + }, + { + "epoch": 2.0367979341510654, + "grad_norm": 0.7087787985801697, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 6310 + }, + { + "epoch": 2.040025823111685, + "grad_norm": 0.6739160418510437, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 6320 + }, + { + "epoch": 2.0432537120723047, + "grad_norm": 0.4860886335372925, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 6330 + }, + { + "epoch": 2.0464816010329243, + "grad_norm": 0.7201244831085205, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 6340 + }, + { + "epoch": 2.0497094899935444, + "grad_norm": 0.7409170269966125, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 6350 + }, + { + "epoch": 2.052937378954164, + "grad_norm": 0.6843920350074768, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 6360 + }, + { + "epoch": 2.0561652679147837, + "grad_norm": 0.7519999742507935, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 6370 + }, + { + "epoch": 2.0593931568754034, + "grad_norm": 0.5732819437980652, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 6380 + }, + { + "epoch": 2.062621045836023, + "grad_norm": 0.7565118074417114, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 6390 + }, + { + "epoch": 2.065848934796643, + "grad_norm": 0.8147150278091431, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 6400 + }, + { + "epoch": 2.0690768237572628, + "grad_norm": 0.6941924691200256, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 6410 + }, + { + "epoch": 2.0723047127178824, + "grad_norm": 0.6549784541130066, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 6420 + }, + { + "epoch": 2.075532601678502, + "grad_norm": 0.7224905490875244, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 6430 + }, + { + "epoch": 2.078760490639122, + "grad_norm": 0.7754863500595093, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 6440 + }, + { + "epoch": 2.081988379599742, + "grad_norm": 0.691318154335022, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 6450 + }, + { + "epoch": 2.0852162685603615, + "grad_norm": 0.6009294986724854, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 6460 + }, + { + "epoch": 2.088444157520981, + "grad_norm": 0.6753945350646973, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 6470 + }, + { + "epoch": 2.091672046481601, + "grad_norm": 0.6899921298027039, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 6480 + }, + { + "epoch": 2.094899935442221, + "grad_norm": 0.846510648727417, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 6490 + }, + { + "epoch": 2.0981278244028405, + "grad_norm": 0.6432605981826782, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 6500 + }, + { + "epoch": 2.10135571336346, + "grad_norm": 0.8125239014625549, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 6510 + }, + { + "epoch": 2.1045836023240803, + "grad_norm": 0.628302812576294, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 6520 + }, + { + "epoch": 2.1078114912847, + "grad_norm": 0.7164334654808044, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 6530 + }, + { + "epoch": 2.1110393802453196, + "grad_norm": 0.7476949095726013, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 6540 + }, + { + "epoch": 2.114267269205939, + "grad_norm": 0.7577515840530396, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 6550 + }, + { + "epoch": 2.117495158166559, + "grad_norm": 0.5684467554092407, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 6560 + }, + { + "epoch": 2.120723047127179, + "grad_norm": 0.6121789216995239, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 6570 + }, + { + "epoch": 2.1239509360877986, + "grad_norm": 0.6095348596572876, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 6580 + }, + { + "epoch": 2.1271788250484183, + "grad_norm": 0.7803651690483093, + "learning_rate": 0.0002, + "loss": 0.6276, + "step": 6590 + }, + { + "epoch": 2.130406714009038, + "grad_norm": 0.5990583300590515, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 6600 + }, + { + "epoch": 2.133634602969658, + "grad_norm": 0.6569220423698425, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 6610 + }, + { + "epoch": 2.1368624919302777, + "grad_norm": 0.5961166620254517, + "learning_rate": 0.0002, + "loss": 0.7049, + "step": 6620 + }, + { + "epoch": 2.1400903808908973, + "grad_norm": 0.5860554575920105, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 6630 + }, + { + "epoch": 2.143318269851517, + "grad_norm": 0.5994001626968384, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 6640 + }, + { + "epoch": 2.146546158812137, + "grad_norm": 0.7723015546798706, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 6650 + }, + { + "epoch": 2.1497740477727567, + "grad_norm": 0.676355242729187, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 6660 + }, + { + "epoch": 2.1530019367333764, + "grad_norm": 0.5689092874526978, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 6670 + }, + { + "epoch": 2.156229825693996, + "grad_norm": 0.6933727264404297, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 6680 + }, + { + "epoch": 2.159457714654616, + "grad_norm": 0.8380527496337891, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 6690 + }, + { + "epoch": 2.1626856036152358, + "grad_norm": 0.6876497268676758, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 6700 + }, + { + "epoch": 2.1659134925758554, + "grad_norm": 0.6418334245681763, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 6710 + }, + { + "epoch": 2.169141381536475, + "grad_norm": 0.7169192433357239, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 6720 + }, + { + "epoch": 2.1723692704970947, + "grad_norm": 0.6664170622825623, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 6730 + }, + { + "epoch": 2.175597159457715, + "grad_norm": 0.6011993288993835, + "learning_rate": 0.0002, + "loss": 0.6751, + "step": 6740 + }, + { + "epoch": 2.1788250484183345, + "grad_norm": 0.5529947280883789, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 6750 + }, + { + "epoch": 2.182052937378954, + "grad_norm": 0.6879532933235168, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 6760 + }, + { + "epoch": 2.1852808263395738, + "grad_norm": 0.6426113843917847, + "learning_rate": 0.0002, + "loss": 0.6634, + "step": 6770 + }, + { + "epoch": 2.188508715300194, + "grad_norm": 0.6571047306060791, + "learning_rate": 0.0002, + "loss": 0.6592, + "step": 6780 + }, + { + "epoch": 2.1917366042608135, + "grad_norm": 0.6400564908981323, + "learning_rate": 0.0002, + "loss": 0.6494, + "step": 6790 + }, + { + "epoch": 2.194964493221433, + "grad_norm": 0.6509664058685303, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 6800 + }, + { + "epoch": 2.198192382182053, + "grad_norm": 0.6673197150230408, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 6810 + }, + { + "epoch": 2.2014202711426725, + "grad_norm": 0.48205727338790894, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 6820 + }, + { + "epoch": 2.2046481601032926, + "grad_norm": 0.849525511264801, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 6830 + }, + { + "epoch": 2.207876049063912, + "grad_norm": 0.6150892376899719, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 6840 + }, + { + "epoch": 2.211103938024532, + "grad_norm": 0.7826945781707764, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 6850 + }, + { + "epoch": 2.2143318269851515, + "grad_norm": 0.5711963772773743, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 6860 + }, + { + "epoch": 2.2175597159457716, + "grad_norm": 0.6017758846282959, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 6870 + }, + { + "epoch": 2.2207876049063913, + "grad_norm": 0.785434901714325, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 6880 + }, + { + "epoch": 2.224015493867011, + "grad_norm": 0.6251688599586487, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 6890 + }, + { + "epoch": 2.2272433828276306, + "grad_norm": 0.8242034316062927, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 6900 + }, + { + "epoch": 2.2304712717882507, + "grad_norm": 0.7272933125495911, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 6910 + }, + { + "epoch": 2.2336991607488703, + "grad_norm": 0.7159379720687866, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 6920 + }, + { + "epoch": 2.23692704970949, + "grad_norm": 0.6518042087554932, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 6930 + }, + { + "epoch": 2.2401549386701096, + "grad_norm": 0.7365370392799377, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 6940 + }, + { + "epoch": 2.2433828276307297, + "grad_norm": 0.5674061179161072, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 6950 + }, + { + "epoch": 2.2466107165913494, + "grad_norm": 0.669185996055603, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 6960 + }, + { + "epoch": 2.249838605551969, + "grad_norm": 0.6638304591178894, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 6970 + }, + { + "epoch": 2.2530664945125887, + "grad_norm": 0.757006824016571, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 6980 + }, + { + "epoch": 2.2562943834732083, + "grad_norm": 0.7574930787086487, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 6990 + }, + { + "epoch": 2.2595222724338284, + "grad_norm": 0.7819514870643616, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 7000 + }, + { + "epoch": 2.262750161394448, + "grad_norm": 0.6987583041191101, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 7010 + }, + { + "epoch": 2.2659780503550677, + "grad_norm": 0.6628551483154297, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 7020 + }, + { + "epoch": 2.2692059393156874, + "grad_norm": 0.7855866551399231, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 7030 + }, + { + "epoch": 2.2724338282763075, + "grad_norm": 0.6102892756462097, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 7040 + }, + { + "epoch": 2.275661717236927, + "grad_norm": 0.7844198942184448, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 7050 + }, + { + "epoch": 2.2788896061975468, + "grad_norm": 0.6209492087364197, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 7060 + }, + { + "epoch": 2.2821174951581664, + "grad_norm": 0.8351290225982666, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 7070 + }, + { + "epoch": 2.285345384118786, + "grad_norm": 0.6883546710014343, + "learning_rate": 0.0002, + "loss": 0.6648, + "step": 7080 + }, + { + "epoch": 2.288573273079406, + "grad_norm": 0.6626381874084473, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 7090 + }, + { + "epoch": 2.291801162040026, + "grad_norm": 0.7216270565986633, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 7100 + }, + { + "epoch": 2.2950290510006455, + "grad_norm": 0.8246777057647705, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 7110 + }, + { + "epoch": 2.2982569399612656, + "grad_norm": 0.614326000213623, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 7120 + }, + { + "epoch": 2.301484828921885, + "grad_norm": 0.8785578012466431, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 7130 + }, + { + "epoch": 2.304712717882505, + "grad_norm": 0.7021808624267578, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 7140 + }, + { + "epoch": 2.3079406068431245, + "grad_norm": 0.6999403238296509, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 7150 + }, + { + "epoch": 2.311168495803744, + "grad_norm": 0.8013143539428711, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 7160 + }, + { + "epoch": 2.3143963847643643, + "grad_norm": 0.6592583060264587, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 7170 + }, + { + "epoch": 2.317624273724984, + "grad_norm": 0.6260249018669128, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 7180 + }, + { + "epoch": 2.3208521626856036, + "grad_norm": 0.9352797269821167, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 7190 + }, + { + "epoch": 2.324080051646223, + "grad_norm": 0.6629612445831299, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 7200 + }, + { + "epoch": 2.3273079406068433, + "grad_norm": 0.7062810063362122, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 7210 + }, + { + "epoch": 2.330535829567463, + "grad_norm": 0.7236241102218628, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 7220 + }, + { + "epoch": 2.3337637185280826, + "grad_norm": 0.7528148293495178, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 7230 + }, + { + "epoch": 2.3369916074887023, + "grad_norm": 0.7604748606681824, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 7240 + }, + { + "epoch": 2.340219496449322, + "grad_norm": 0.5601189136505127, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 7250 + }, + { + "epoch": 2.343447385409942, + "grad_norm": 0.7099230885505676, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 7260 + }, + { + "epoch": 2.3466752743705617, + "grad_norm": 0.6699047684669495, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 7270 + }, + { + "epoch": 2.3499031633311813, + "grad_norm": 0.7315047979354858, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 7280 + }, + { + "epoch": 2.353131052291801, + "grad_norm": 0.632836103439331, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 7290 + }, + { + "epoch": 2.356358941252421, + "grad_norm": 0.9410115480422974, + "learning_rate": 0.0002, + "loss": 0.6458, + "step": 7300 + }, + { + "epoch": 2.3595868302130407, + "grad_norm": 0.626554012298584, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 7310 + }, + { + "epoch": 2.3628147191736604, + "grad_norm": 0.7538444399833679, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 7320 + }, + { + "epoch": 2.36604260813428, + "grad_norm": 0.6826626062393188, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 7330 + }, + { + "epoch": 2.3692704970949, + "grad_norm": 0.6739391088485718, + "learning_rate": 0.0002, + "loss": 0.6752, + "step": 7340 + }, + { + "epoch": 2.3724983860555198, + "grad_norm": 0.7518446445465088, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 7350 + }, + { + "epoch": 2.3757262750161394, + "grad_norm": 0.714133083820343, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 7360 + }, + { + "epoch": 2.378954163976759, + "grad_norm": 0.7144588232040405, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 7370 + }, + { + "epoch": 2.382182052937379, + "grad_norm": 0.6598120927810669, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 7380 + }, + { + "epoch": 2.385409941897999, + "grad_norm": 0.7079148292541504, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 7390 + }, + { + "epoch": 2.3886378308586185, + "grad_norm": 0.6750902533531189, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 7400 + }, + { + "epoch": 2.391865719819238, + "grad_norm": 0.7181967496871948, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 7410 + }, + { + "epoch": 2.3950936087798578, + "grad_norm": 0.7720552086830139, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 7420 + }, + { + "epoch": 2.398321497740478, + "grad_norm": 0.7592426538467407, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 7430 + }, + { + "epoch": 2.4015493867010975, + "grad_norm": 0.7161896824836731, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 7440 + }, + { + "epoch": 2.404777275661717, + "grad_norm": 0.8019260764122009, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 7450 + }, + { + "epoch": 2.408005164622337, + "grad_norm": 0.7093342542648315, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 7460 + }, + { + "epoch": 2.411233053582957, + "grad_norm": 0.8464207649230957, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 7470 + }, + { + "epoch": 2.4144609425435766, + "grad_norm": 0.773666501045227, + "learning_rate": 0.0002, + "loss": 0.6724, + "step": 7480 + }, + { + "epoch": 2.4176888315041962, + "grad_norm": 0.8451611995697021, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 7490 + }, + { + "epoch": 2.420916720464816, + "grad_norm": 0.656795084476471, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 7500 + }, + { + "epoch": 2.4241446094254355, + "grad_norm": 0.7129034996032715, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 7510 + }, + { + "epoch": 2.4273724983860556, + "grad_norm": 0.8325763940811157, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 7520 + }, + { + "epoch": 2.4306003873466753, + "grad_norm": 0.7806527614593506, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 7530 + }, + { + "epoch": 2.433828276307295, + "grad_norm": 0.6994536519050598, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 7540 + }, + { + "epoch": 2.437056165267915, + "grad_norm": 0.6898999214172363, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 7550 + }, + { + "epoch": 2.4402840542285347, + "grad_norm": 0.719490647315979, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 7560 + }, + { + "epoch": 2.4435119431891543, + "grad_norm": 0.6841562390327454, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 7570 + }, + { + "epoch": 2.446739832149774, + "grad_norm": 0.7573311924934387, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 7580 + }, + { + "epoch": 2.4499677211103936, + "grad_norm": 0.7295880317687988, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 7590 + }, + { + "epoch": 2.4531956100710137, + "grad_norm": 0.710136353969574, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 7600 + }, + { + "epoch": 2.4564234990316334, + "grad_norm": 0.6126235127449036, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 7610 + }, + { + "epoch": 2.459651387992253, + "grad_norm": 0.8025609850883484, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 7620 + }, + { + "epoch": 2.4628792769528727, + "grad_norm": 0.7839472889900208, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 7630 + }, + { + "epoch": 2.4661071659134928, + "grad_norm": 0.7253499031066895, + "learning_rate": 0.0002, + "loss": 0.6797, + "step": 7640 + }, + { + "epoch": 2.4693350548741124, + "grad_norm": 0.7918946743011475, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 7650 + }, + { + "epoch": 2.472562943834732, + "grad_norm": 0.7930178046226501, + "learning_rate": 0.0002, + "loss": 0.6646, + "step": 7660 + }, + { + "epoch": 2.4757908327953517, + "grad_norm": 0.6826170086860657, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 7670 + }, + { + "epoch": 2.4790187217559714, + "grad_norm": 0.6576805114746094, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 7680 + }, + { + "epoch": 2.4822466107165915, + "grad_norm": 0.7012448310852051, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 7690 + }, + { + "epoch": 2.485474499677211, + "grad_norm": 0.7774284482002258, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 7700 + }, + { + "epoch": 2.4887023886378308, + "grad_norm": 0.6502766013145447, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 7710 + }, + { + "epoch": 2.4919302775984504, + "grad_norm": 0.7638739347457886, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 7720 + }, + { + "epoch": 2.4951581665590705, + "grad_norm": 0.6217384338378906, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 7730 + }, + { + "epoch": 2.49838605551969, + "grad_norm": 0.7576302886009216, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 7740 + }, + { + "epoch": 2.50161394448031, + "grad_norm": 0.6877137422561646, + "learning_rate": 0.0002, + "loss": 0.6855, + "step": 7750 + }, + { + "epoch": 2.5048418334409295, + "grad_norm": 0.6998329162597656, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 7760 + }, + { + "epoch": 2.508069722401549, + "grad_norm": 0.7879213690757751, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 7770 + }, + { + "epoch": 2.5112976113621692, + "grad_norm": 0.7834980487823486, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 7780 + }, + { + "epoch": 2.514525500322789, + "grad_norm": 0.7789630889892578, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 7790 + }, + { + "epoch": 2.5177533892834085, + "grad_norm": 0.7403590083122253, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 7800 + }, + { + "epoch": 2.5209812782440286, + "grad_norm": 0.6029766201972961, + "learning_rate": 0.0002, + "loss": 0.6964, + "step": 7810 + }, + { + "epoch": 2.5242091672046483, + "grad_norm": 0.7061092257499695, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 7820 + }, + { + "epoch": 2.527437056165268, + "grad_norm": 0.7120763659477234, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 7830 + }, + { + "epoch": 2.5306649451258876, + "grad_norm": 0.6173675656318665, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 7840 + }, + { + "epoch": 2.5338928340865072, + "grad_norm": 0.9566813111305237, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 7850 + }, + { + "epoch": 2.5371207230471273, + "grad_norm": 0.8497620224952698, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 7860 + }, + { + "epoch": 2.540348612007747, + "grad_norm": 0.7663498520851135, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 7870 + }, + { + "epoch": 2.5435765009683666, + "grad_norm": 0.6329668760299683, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 7880 + }, + { + "epoch": 2.5468043899289863, + "grad_norm": 0.8128195405006409, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 7890 + }, + { + "epoch": 2.5500322788896064, + "grad_norm": 0.6622284650802612, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 7900 + }, + { + "epoch": 2.553260167850226, + "grad_norm": 0.8460057973861694, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 7910 + }, + { + "epoch": 2.5564880568108457, + "grad_norm": 0.6586956977844238, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 7920 + }, + { + "epoch": 2.5597159457714653, + "grad_norm": 0.7569382190704346, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 7930 + }, + { + "epoch": 2.562943834732085, + "grad_norm": 0.6409714221954346, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 7940 + }, + { + "epoch": 2.566171723692705, + "grad_norm": 0.7031713128089905, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 7950 + }, + { + "epoch": 2.5693996126533247, + "grad_norm": 0.7983605265617371, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 7960 + }, + { + "epoch": 2.5726275016139444, + "grad_norm": 0.7165433168411255, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 7970 + }, + { + "epoch": 2.5758553905745645, + "grad_norm": 0.6630598902702332, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 7980 + }, + { + "epoch": 2.579083279535184, + "grad_norm": 0.5883122086524963, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 7990 + }, + { + "epoch": 2.5823111684958038, + "grad_norm": 0.5928755402565002, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 8000 + }, + { + "epoch": 2.5855390574564234, + "grad_norm": 0.7843712568283081, + "learning_rate": 0.0002, + "loss": 0.6701, + "step": 8010 + }, + { + "epoch": 2.588766946417043, + "grad_norm": 0.7206324338912964, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 8020 + }, + { + "epoch": 2.5919948353776627, + "grad_norm": 0.812480092048645, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 8030 + }, + { + "epoch": 2.595222724338283, + "grad_norm": 0.9843078255653381, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 8040 + }, + { + "epoch": 2.5984506132989025, + "grad_norm": 0.7524392604827881, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 8050 + }, + { + "epoch": 2.601678502259522, + "grad_norm": 0.6220380067825317, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 8060 + }, + { + "epoch": 2.6049063912201422, + "grad_norm": 0.7461398243904114, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 8070 + }, + { + "epoch": 2.608134280180762, + "grad_norm": 0.720974326133728, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 8080 + }, + { + "epoch": 2.6113621691413815, + "grad_norm": 0.649509847164154, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 8090 + }, + { + "epoch": 2.614590058102001, + "grad_norm": 0.6894662976264954, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 8100 + }, + { + "epoch": 2.617817947062621, + "grad_norm": 0.734433114528656, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 8110 + }, + { + "epoch": 2.621045836023241, + "grad_norm": 0.7468628883361816, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 8120 + }, + { + "epoch": 2.6242737249838606, + "grad_norm": 0.6508180499076843, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 8130 + }, + { + "epoch": 2.6275016139444802, + "grad_norm": 0.8735209107398987, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 8140 + }, + { + "epoch": 2.6307295029051003, + "grad_norm": 0.8162857294082642, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 8150 + }, + { + "epoch": 2.63395739186572, + "grad_norm": 0.628872811794281, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 8160 + }, + { + "epoch": 2.6371852808263396, + "grad_norm": 0.8078708052635193, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 8170 + }, + { + "epoch": 2.6404131697869593, + "grad_norm": 0.7849429845809937, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 8180 + }, + { + "epoch": 2.643641058747579, + "grad_norm": 0.8115387558937073, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 8190 + }, + { + "epoch": 2.6468689477081986, + "grad_norm": 0.7462222576141357, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 8200 + }, + { + "epoch": 2.6500968366688187, + "grad_norm": 0.753662645816803, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 8210 + }, + { + "epoch": 2.6533247256294383, + "grad_norm": 0.6100404858589172, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 8220 + }, + { + "epoch": 2.656552614590058, + "grad_norm": 0.9084606766700745, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 8230 + }, + { + "epoch": 2.659780503550678, + "grad_norm": 0.6412538886070251, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 8240 + }, + { + "epoch": 2.6630083925112977, + "grad_norm": 0.7640451192855835, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 8250 + }, + { + "epoch": 2.6662362814719174, + "grad_norm": 0.5972344875335693, + "learning_rate": 0.0002, + "loss": 0.6846, + "step": 8260 + }, + { + "epoch": 2.669464170432537, + "grad_norm": 0.6935883164405823, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 8270 + }, + { + "epoch": 2.6726920593931567, + "grad_norm": 0.789399266242981, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 8280 + }, + { + "epoch": 2.675919948353777, + "grad_norm": 0.7143490314483643, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 8290 + }, + { + "epoch": 2.6791478373143964, + "grad_norm": 0.6670652627944946, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 8300 + }, + { + "epoch": 2.682375726275016, + "grad_norm": 0.687108039855957, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 8310 + }, + { + "epoch": 2.6856036152356357, + "grad_norm": 0.7914147973060608, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 8320 + }, + { + "epoch": 2.688831504196256, + "grad_norm": 0.8398420214653015, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 8330 + }, + { + "epoch": 2.6920593931568755, + "grad_norm": 0.6592720746994019, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 8340 + }, + { + "epoch": 2.695287282117495, + "grad_norm": 0.6888470649719238, + "learning_rate": 0.0002, + "loss": 0.6673, + "step": 8350 + }, + { + "epoch": 2.698515171078115, + "grad_norm": 0.7127556800842285, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 8360 + }, + { + "epoch": 2.7017430600387344, + "grad_norm": 0.6630286574363708, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 8370 + }, + { + "epoch": 2.7049709489993545, + "grad_norm": 0.8261964321136475, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 8380 + }, + { + "epoch": 2.708198837959974, + "grad_norm": 0.717339813709259, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 8390 + }, + { + "epoch": 2.711426726920594, + "grad_norm": 0.651637613773346, + "learning_rate": 0.0002, + "loss": 0.6929, + "step": 8400 + }, + { + "epoch": 2.714654615881214, + "grad_norm": 0.7936098575592041, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 8410 + }, + { + "epoch": 2.7178825048418336, + "grad_norm": 0.8761560320854187, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 8420 + }, + { + "epoch": 2.7211103938024532, + "grad_norm": 0.6768006086349487, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 8430 + }, + { + "epoch": 2.724338282763073, + "grad_norm": 0.7121055722236633, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 8440 + }, + { + "epoch": 2.7275661717236925, + "grad_norm": 0.6811696887016296, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 8450 + }, + { + "epoch": 2.730794060684312, + "grad_norm": 0.8168250918388367, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 8460 + }, + { + "epoch": 2.7340219496449323, + "grad_norm": 0.660682737827301, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 8470 + }, + { + "epoch": 2.737249838605552, + "grad_norm": 0.7369356155395508, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 8480 + }, + { + "epoch": 2.7404777275661716, + "grad_norm": 0.7545099854469299, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 8490 + }, + { + "epoch": 2.7437056165267917, + "grad_norm": 0.6991257667541504, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 8500 + }, + { + "epoch": 2.7469335054874113, + "grad_norm": 0.7195324301719666, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 8510 + }, + { + "epoch": 2.750161394448031, + "grad_norm": 0.8995378017425537, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 8520 + }, + { + "epoch": 2.7533892834086506, + "grad_norm": 0.6924123764038086, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 8530 + }, + { + "epoch": 2.7566171723692703, + "grad_norm": 0.6260585784912109, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 8540 + }, + { + "epoch": 2.7598450613298904, + "grad_norm": 0.7273091673851013, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 8550 + }, + { + "epoch": 2.76307295029051, + "grad_norm": 0.720562219619751, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 8560 + }, + { + "epoch": 2.7663008392511297, + "grad_norm": 0.6360004544258118, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 8570 + }, + { + "epoch": 2.76952872821175, + "grad_norm": 0.7634525895118713, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 8580 + }, + { + "epoch": 2.7727566171723694, + "grad_norm": 0.6586076021194458, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 8590 + }, + { + "epoch": 2.775984506132989, + "grad_norm": 0.6542639136314392, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 8600 + }, + { + "epoch": 2.7792123950936087, + "grad_norm": 0.7650290727615356, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 8610 + }, + { + "epoch": 2.7824402840542284, + "grad_norm": 0.6551542282104492, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 8620 + }, + { + "epoch": 2.785668173014848, + "grad_norm": 0.6915501952171326, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 8630 + }, + { + "epoch": 2.788896061975468, + "grad_norm": 0.8061493635177612, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 8640 + }, + { + "epoch": 2.792123950936088, + "grad_norm": 0.8403584957122803, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 8650 + }, + { + "epoch": 2.7953518398967074, + "grad_norm": 0.6455532312393188, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 8660 + }, + { + "epoch": 2.7985797288573275, + "grad_norm": 0.8296352028846741, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 8670 + }, + { + "epoch": 2.801807617817947, + "grad_norm": 0.7288752794265747, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 8680 + }, + { + "epoch": 2.805035506778567, + "grad_norm": 0.7628464102745056, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 8690 + }, + { + "epoch": 2.8082633957391865, + "grad_norm": 0.9993878602981567, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 8700 + }, + { + "epoch": 2.811491284699806, + "grad_norm": 0.6972465515136719, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 8710 + }, + { + "epoch": 2.8147191736604262, + "grad_norm": 0.645042896270752, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 8720 + }, + { + "epoch": 2.817947062621046, + "grad_norm": 0.6853853464126587, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 8730 + }, + { + "epoch": 2.8211749515816655, + "grad_norm": 0.5935067534446716, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 8740 + }, + { + "epoch": 2.824402840542285, + "grad_norm": 0.7336633205413818, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 8750 + }, + { + "epoch": 2.8276307295029053, + "grad_norm": 0.7074962854385376, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 8760 + }, + { + "epoch": 2.830858618463525, + "grad_norm": 0.6667559742927551, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 8770 + }, + { + "epoch": 2.8340865074241446, + "grad_norm": 0.8101205229759216, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 8780 + }, + { + "epoch": 2.8373143963847642, + "grad_norm": 0.8841480016708374, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 8790 + }, + { + "epoch": 2.840542285345384, + "grad_norm": 0.5891591310501099, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 8800 + }, + { + "epoch": 2.843770174306004, + "grad_norm": 0.667032778263092, + "learning_rate": 0.0002, + "loss": 0.7114, + "step": 8810 + }, + { + "epoch": 2.8469980632666236, + "grad_norm": 0.7629773020744324, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 8820 + }, + { + "epoch": 2.8502259522272433, + "grad_norm": 0.79471355676651, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 8830 + }, + { + "epoch": 2.8534538411878634, + "grad_norm": 0.7529178261756897, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 8840 + }, + { + "epoch": 2.856681730148483, + "grad_norm": 0.7014923691749573, + "learning_rate": 0.0002, + "loss": 0.7163, + "step": 8850 + }, + { + "epoch": 2.8599096191091027, + "grad_norm": 0.7996514439582825, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 8860 + }, + { + "epoch": 2.8631375080697223, + "grad_norm": 0.7044785618782043, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 8870 + }, + { + "epoch": 2.866365397030342, + "grad_norm": 0.6792093515396118, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 8880 + }, + { + "epoch": 2.8695932859909616, + "grad_norm": 0.69175124168396, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 8890 + }, + { + "epoch": 2.8728211749515817, + "grad_norm": 0.7499129176139832, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 8900 + }, + { + "epoch": 2.8760490639122014, + "grad_norm": 0.7678789496421814, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 8910 + }, + { + "epoch": 2.879276952872821, + "grad_norm": 0.7478128671646118, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 8920 + }, + { + "epoch": 2.882504841833441, + "grad_norm": 0.6767086386680603, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 8930 + }, + { + "epoch": 2.885732730794061, + "grad_norm": 0.7222196459770203, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 8940 + }, + { + "epoch": 2.8889606197546804, + "grad_norm": 0.6950580477714539, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 8950 + }, + { + "epoch": 2.8921885087153, + "grad_norm": 0.7759528160095215, + "learning_rate": 0.0002, + "loss": 0.7064, + "step": 8960 + }, + { + "epoch": 2.8954163976759197, + "grad_norm": 0.6686919927597046, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 8970 + }, + { + "epoch": 2.89864428663654, + "grad_norm": 0.9245954751968384, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 8980 + }, + { + "epoch": 2.9018721755971595, + "grad_norm": 0.8734814524650574, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 8990 + }, + { + "epoch": 2.905100064557779, + "grad_norm": 0.6056219339370728, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 9000 + }, + { + "epoch": 2.9083279535183992, + "grad_norm": 0.7364102005958557, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 9010 + }, + { + "epoch": 2.911555842479019, + "grad_norm": 0.6563605070114136, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 9020 + }, + { + "epoch": 2.9147837314396385, + "grad_norm": 0.659978985786438, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 9030 + }, + { + "epoch": 2.918011620400258, + "grad_norm": 0.8176041841506958, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 9040 + }, + { + "epoch": 2.921239509360878, + "grad_norm": 0.743677020072937, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 9050 + }, + { + "epoch": 2.9244673983214975, + "grad_norm": 0.7418383359909058, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 9060 + }, + { + "epoch": 2.9276952872821176, + "grad_norm": 0.6916524767875671, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 9070 + }, + { + "epoch": 2.9309231762427372, + "grad_norm": 0.6559975743293762, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 9080 + }, + { + "epoch": 2.934151065203357, + "grad_norm": 0.7431221008300781, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 9090 + }, + { + "epoch": 2.937378954163977, + "grad_norm": 0.7525941133499146, + "learning_rate": 0.0002, + "loss": 0.6829, + "step": 9100 + }, + { + "epoch": 2.9406068431245966, + "grad_norm": 0.6860167384147644, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 9110 + }, + { + "epoch": 2.9438347320852163, + "grad_norm": 0.6467666029930115, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 9120 + }, + { + "epoch": 2.947062621045836, + "grad_norm": 0.7595751285552979, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 9130 + }, + { + "epoch": 2.9502905100064556, + "grad_norm": 0.6558279991149902, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 9140 + }, + { + "epoch": 2.9535183989670757, + "grad_norm": 0.6818708181381226, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 9150 + }, + { + "epoch": 2.9567462879276953, + "grad_norm": 0.8387085795402527, + "learning_rate": 0.0002, + "loss": 0.6921, + "step": 9160 + }, + { + "epoch": 2.959974176888315, + "grad_norm": 0.7705109715461731, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 9170 + }, + { + "epoch": 2.9632020658489346, + "grad_norm": 0.688106894493103, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 9180 + }, + { + "epoch": 2.9664299548095547, + "grad_norm": 0.659532368183136, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 9190 + }, + { + "epoch": 2.9696578437701744, + "grad_norm": 0.6839388608932495, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 9200 + }, + { + "epoch": 2.972885732730794, + "grad_norm": 0.6927599310874939, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 9210 + }, + { + "epoch": 2.9761136216914137, + "grad_norm": 0.6902472972869873, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 9220 + }, + { + "epoch": 2.9793415106520333, + "grad_norm": 0.620399534702301, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 9230 + }, + { + "epoch": 2.9825693996126534, + "grad_norm": 0.6812364459037781, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 9240 + }, + { + "epoch": 2.985797288573273, + "grad_norm": 0.7681456208229065, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 9250 + }, + { + "epoch": 2.9890251775338927, + "grad_norm": 0.7621907591819763, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 9260 + }, + { + "epoch": 2.992253066494513, + "grad_norm": 0.6075740456581116, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 9270 + }, + { + "epoch": 2.9954809554551325, + "grad_norm": 0.7100434899330139, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 9280 + }, + { + "epoch": 2.998708844415752, + "grad_norm": 0.7314488887786865, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 9290 + }, + { + "epoch": 3.0, + "eval_loss": 1.1434104442596436, + "eval_runtime": 166.3732, + "eval_samples_per_second": 4.406, + "eval_steps_per_second": 0.553, + "step": 9294 + }, + { + "epoch": 3.001936733376372, + "grad_norm": 0.7408893704414368, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 9300 + }, + { + "epoch": 3.0051646223369914, + "grad_norm": 0.9773574471473694, + "learning_rate": 0.0002, + "loss": 0.5182, + "step": 9310 + }, + { + "epoch": 3.0083925112976115, + "grad_norm": 0.7919653058052063, + "learning_rate": 0.0002, + "loss": 0.5432, + "step": 9320 + }, + { + "epoch": 3.011620400258231, + "grad_norm": 0.9139202833175659, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 9330 + }, + { + "epoch": 3.014848289218851, + "grad_norm": 0.8296737670898438, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 9340 + }, + { + "epoch": 3.0180761781794705, + "grad_norm": 0.786868155002594, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 9350 + }, + { + "epoch": 3.0213040671400906, + "grad_norm": 0.5928055644035339, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 9360 + }, + { + "epoch": 3.0245319561007102, + "grad_norm": 0.8785701394081116, + "learning_rate": 0.0002, + "loss": 0.5376, + "step": 9370 + }, + { + "epoch": 3.02775984506133, + "grad_norm": 0.7978872060775757, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 9380 + }, + { + "epoch": 3.0309877340219495, + "grad_norm": 0.7160913348197937, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 9390 + }, + { + "epoch": 3.034215622982569, + "grad_norm": 0.904465913772583, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 9400 + }, + { + "epoch": 3.0374435119431893, + "grad_norm": 0.7082195281982422, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 9410 + }, + { + "epoch": 3.040671400903809, + "grad_norm": 0.9686778783798218, + "learning_rate": 0.0002, + "loss": 0.5434, + "step": 9420 + }, + { + "epoch": 3.0438992898644286, + "grad_norm": 0.8788613677024841, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 9430 + }, + { + "epoch": 3.0471271788250482, + "grad_norm": 0.8217582106590271, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 9440 + }, + { + "epoch": 3.0503550677856683, + "grad_norm": 0.7380914092063904, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 9450 + }, + { + "epoch": 3.053582956746288, + "grad_norm": 0.7339285612106323, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 9460 + }, + { + "epoch": 3.0568108457069076, + "grad_norm": 0.7175183296203613, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 9470 + }, + { + "epoch": 3.0600387346675273, + "grad_norm": 0.8275379538536072, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 9480 + }, + { + "epoch": 3.0632666236281474, + "grad_norm": 0.6544256806373596, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 9490 + }, + { + "epoch": 3.066494512588767, + "grad_norm": 0.8193472623825073, + "learning_rate": 0.0002, + "loss": 0.5365, + "step": 9500 + }, + { + "epoch": 3.0697224015493867, + "grad_norm": 0.7967836856842041, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 9510 + }, + { + "epoch": 3.0729502905100063, + "grad_norm": 0.8788684010505676, + "learning_rate": 0.0002, + "loss": 0.5629, + "step": 9520 + }, + { + "epoch": 3.0761781794706264, + "grad_norm": 0.9410629868507385, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 9530 + }, + { + "epoch": 3.079406068431246, + "grad_norm": 0.7448706030845642, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 9540 + }, + { + "epoch": 3.0826339573918657, + "grad_norm": 0.9149372577667236, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 9550 + }, + { + "epoch": 3.0858618463524854, + "grad_norm": 0.7265563607215881, + "learning_rate": 0.0002, + "loss": 0.5347, + "step": 9560 + }, + { + "epoch": 3.089089735313105, + "grad_norm": 1.0305068492889404, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 9570 + }, + { + "epoch": 3.092317624273725, + "grad_norm": 0.7987357974052429, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 9580 + }, + { + "epoch": 3.095545513234345, + "grad_norm": 0.7733123898506165, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 9590 + }, + { + "epoch": 3.0987734021949644, + "grad_norm": 1.0438069105148315, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 9600 + }, + { + "epoch": 3.102001291155584, + "grad_norm": 0.7951784729957581, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 9610 + }, + { + "epoch": 3.105229180116204, + "grad_norm": 0.7776783108711243, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 9620 + }, + { + "epoch": 3.108457069076824, + "grad_norm": 0.7060676217079163, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 9630 + }, + { + "epoch": 3.1116849580374435, + "grad_norm": 0.871569037437439, + "learning_rate": 0.0002, + "loss": 0.5731, + "step": 9640 + }, + { + "epoch": 3.114912846998063, + "grad_norm": 0.8873385787010193, + "learning_rate": 0.0002, + "loss": 0.5168, + "step": 9650 + }, + { + "epoch": 3.118140735958683, + "grad_norm": 0.750998318195343, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 9660 + }, + { + "epoch": 3.121368624919303, + "grad_norm": 0.8678529262542725, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 9670 + }, + { + "epoch": 3.1245965138799225, + "grad_norm": 0.7706599235534668, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 9680 + }, + { + "epoch": 3.127824402840542, + "grad_norm": 0.8317574858665466, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 9690 + }, + { + "epoch": 3.131052291801162, + "grad_norm": 0.801800012588501, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 9700 + }, + { + "epoch": 3.134280180761782, + "grad_norm": 0.8574623465538025, + "learning_rate": 0.0002, + "loss": 0.6044, + "step": 9710 + }, + { + "epoch": 3.1375080697224016, + "grad_norm": 0.6556540727615356, + "learning_rate": 0.0002, + "loss": 0.6072, + "step": 9720 + }, + { + "epoch": 3.1407359586830212, + "grad_norm": 0.8555161952972412, + "learning_rate": 0.0002, + "loss": 0.6058, + "step": 9730 + }, + { + "epoch": 3.143963847643641, + "grad_norm": 0.8825467824935913, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 9740 + }, + { + "epoch": 3.147191736604261, + "grad_norm": 0.8297156691551208, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 9750 + }, + { + "epoch": 3.1504196255648806, + "grad_norm": 0.7710384726524353, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 9760 + }, + { + "epoch": 3.1536475145255003, + "grad_norm": 0.8778039216995239, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 9770 + }, + { + "epoch": 3.15687540348612, + "grad_norm": 0.9014058113098145, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 9780 + }, + { + "epoch": 3.16010329244674, + "grad_norm": 0.6856890320777893, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 9790 + }, + { + "epoch": 3.1633311814073597, + "grad_norm": 0.6520644426345825, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 9800 + }, + { + "epoch": 3.1665590703679793, + "grad_norm": 0.7250499129295349, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 9810 + }, + { + "epoch": 3.169786959328599, + "grad_norm": 0.8331542015075684, + "learning_rate": 0.0002, + "loss": 0.5823, + "step": 9820 + }, + { + "epoch": 3.1730148482892186, + "grad_norm": 0.8531261682510376, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 9830 + }, + { + "epoch": 3.1762427372498387, + "grad_norm": 0.8997558355331421, + "learning_rate": 0.0002, + "loss": 0.57, + "step": 9840 + }, + { + "epoch": 3.1794706262104584, + "grad_norm": 0.708335280418396, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 9850 + }, + { + "epoch": 3.182698515171078, + "grad_norm": 1.0074886083602905, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 9860 + }, + { + "epoch": 3.1859264041316977, + "grad_norm": 1.0804681777954102, + "learning_rate": 0.0002, + "loss": 0.573, + "step": 9870 + }, + { + "epoch": 3.189154293092318, + "grad_norm": 0.9510730504989624, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 9880 + }, + { + "epoch": 3.1923821820529374, + "grad_norm": 0.7211061716079712, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 9890 + }, + { + "epoch": 3.195610071013557, + "grad_norm": 0.8767086267471313, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 9900 + }, + { + "epoch": 3.1988379599741767, + "grad_norm": 0.8388153314590454, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 9910 + }, + { + "epoch": 3.202065848934797, + "grad_norm": 0.8038473725318909, + "learning_rate": 0.0002, + "loss": 0.5681, + "step": 9920 + }, + { + "epoch": 3.2052937378954165, + "grad_norm": 0.8187747001647949, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 9930 + }, + { + "epoch": 3.208521626856036, + "grad_norm": 0.7427355051040649, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 9940 + }, + { + "epoch": 3.211749515816656, + "grad_norm": 0.8017025589942932, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 9950 + }, + { + "epoch": 3.214977404777276, + "grad_norm": 0.738595187664032, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 9960 + }, + { + "epoch": 3.2182052937378955, + "grad_norm": 0.7521342039108276, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 9970 + }, + { + "epoch": 3.221433182698515, + "grad_norm": 0.840329110622406, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 9980 + }, + { + "epoch": 3.224661071659135, + "grad_norm": 0.9809671640396118, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 9990 + }, + { + "epoch": 3.2278889606197545, + "grad_norm": 0.8456943035125732, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 10000 + }, + { + "epoch": 3.2311168495803746, + "grad_norm": 0.8962995409965515, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 10010 + }, + { + "epoch": 3.2343447385409942, + "grad_norm": 0.6492817401885986, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 10020 + }, + { + "epoch": 3.237572627501614, + "grad_norm": 1.0471255779266357, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 10030 + }, + { + "epoch": 3.2408005164622335, + "grad_norm": 0.7995471358299255, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 10040 + }, + { + "epoch": 3.2440284054228536, + "grad_norm": 0.7231964468955994, + "learning_rate": 0.0002, + "loss": 0.615, + "step": 10050 + }, + { + "epoch": 3.2472562943834733, + "grad_norm": 0.639630138874054, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 10060 + }, + { + "epoch": 3.250484183344093, + "grad_norm": 0.7957055568695068, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 10070 + }, + { + "epoch": 3.2537120723047126, + "grad_norm": 0.7735482454299927, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 10080 + }, + { + "epoch": 3.2569399612653323, + "grad_norm": 0.8139488101005554, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 10090 + }, + { + "epoch": 3.2601678502259523, + "grad_norm": 0.8113240003585815, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 10100 + }, + { + "epoch": 3.263395739186572, + "grad_norm": 0.7735909819602966, + "learning_rate": 0.0002, + "loss": 0.5617, + "step": 10110 + }, + { + "epoch": 3.2666236281471916, + "grad_norm": 0.7760744094848633, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 10120 + }, + { + "epoch": 3.2698515171078113, + "grad_norm": 0.8078505396842957, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 10130 + }, + { + "epoch": 3.2730794060684314, + "grad_norm": 0.983648955821991, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 10140 + }, + { + "epoch": 3.276307295029051, + "grad_norm": 0.7131832242012024, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 10150 + }, + { + "epoch": 3.2795351839896707, + "grad_norm": 0.924493134021759, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 10160 + }, + { + "epoch": 3.2827630729502904, + "grad_norm": 0.9371112585067749, + "learning_rate": 0.0002, + "loss": 0.5733, + "step": 10170 + }, + { + "epoch": 3.2859909619109104, + "grad_norm": 0.8989261388778687, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 10180 + }, + { + "epoch": 3.28921885087153, + "grad_norm": 0.8130394816398621, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 10190 + }, + { + "epoch": 3.2924467398321497, + "grad_norm": 0.9899941086769104, + "learning_rate": 0.0002, + "loss": 0.5555, + "step": 10200 + }, + { + "epoch": 3.2956746287927694, + "grad_norm": 1.007038950920105, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 10210 + }, + { + "epoch": 3.2989025177533895, + "grad_norm": 0.7465066313743591, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 10220 + }, + { + "epoch": 3.302130406714009, + "grad_norm": 0.7202590703964233, + "learning_rate": 0.0002, + "loss": 0.6307, + "step": 10230 + }, + { + "epoch": 3.305358295674629, + "grad_norm": 0.6258249282836914, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 10240 + }, + { + "epoch": 3.3085861846352485, + "grad_norm": 0.8996058702468872, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 10250 + }, + { + "epoch": 3.311814073595868, + "grad_norm": 0.9550982713699341, + "learning_rate": 0.0002, + "loss": 0.5825, + "step": 10260 + }, + { + "epoch": 3.315041962556488, + "grad_norm": 0.7010059952735901, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 10270 + }, + { + "epoch": 3.318269851517108, + "grad_norm": 0.9639869332313538, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 10280 + }, + { + "epoch": 3.3214977404777275, + "grad_norm": 1.0192502737045288, + "learning_rate": 0.0002, + "loss": 0.5362, + "step": 10290 + }, + { + "epoch": 3.324725629438347, + "grad_norm": 0.7953670024871826, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 10300 + }, + { + "epoch": 3.3279535183989672, + "grad_norm": 0.7436774969100952, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 10310 + }, + { + "epoch": 3.331181407359587, + "grad_norm": 0.7846777439117432, + "learning_rate": 0.0002, + "loss": 0.5823, + "step": 10320 + }, + { + "epoch": 3.3344092963202066, + "grad_norm": 0.8963494896888733, + "learning_rate": 0.0002, + "loss": 0.6119, + "step": 10330 + }, + { + "epoch": 3.337637185280826, + "grad_norm": 0.6876392364501953, + "learning_rate": 0.0002, + "loss": 0.5872, + "step": 10340 + }, + { + "epoch": 3.340865074241446, + "grad_norm": 0.9161638021469116, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 10350 + }, + { + "epoch": 3.344092963202066, + "grad_norm": 0.8964458107948303, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 10360 + }, + { + "epoch": 3.3473208521626856, + "grad_norm": 0.9052296280860901, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 10370 + }, + { + "epoch": 3.3505487411233053, + "grad_norm": 0.9292596578598022, + "learning_rate": 0.0002, + "loss": 0.5958, + "step": 10380 + }, + { + "epoch": 3.3537766300839253, + "grad_norm": 0.9605957269668579, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 10390 + }, + { + "epoch": 3.357004519044545, + "grad_norm": 1.0198872089385986, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 10400 + }, + { + "epoch": 3.3602324080051647, + "grad_norm": 0.7043630480766296, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 10410 + }, + { + "epoch": 3.3634602969657843, + "grad_norm": 1.0533326864242554, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 10420 + }, + { + "epoch": 3.366688185926404, + "grad_norm": 0.7552485466003418, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 10430 + }, + { + "epoch": 3.369916074887024, + "grad_norm": 0.692708432674408, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 10440 + }, + { + "epoch": 3.3731439638476437, + "grad_norm": 0.985952615737915, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 10450 + }, + { + "epoch": 3.3763718528082634, + "grad_norm": 0.6749676465988159, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 10460 + }, + { + "epoch": 3.379599741768883, + "grad_norm": 0.9514535665512085, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 10470 + }, + { + "epoch": 3.382827630729503, + "grad_norm": 1.2681142091751099, + "learning_rate": 0.0002, + "loss": 0.5982, + "step": 10480 + }, + { + "epoch": 3.3860555196901228, + "grad_norm": 1.031968355178833, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 10490 + }, + { + "epoch": 3.3892834086507424, + "grad_norm": 0.8061563968658447, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 10500 + }, + { + "epoch": 3.392511297611362, + "grad_norm": 1.0515062808990479, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 10510 + }, + { + "epoch": 3.3957391865719817, + "grad_norm": 0.9055540561676025, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 10520 + }, + { + "epoch": 3.398967075532602, + "grad_norm": 0.9318141341209412, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 10530 + }, + { + "epoch": 3.4021949644932215, + "grad_norm": 0.8266817331314087, + "learning_rate": 0.0002, + "loss": 0.5722, + "step": 10540 + }, + { + "epoch": 3.405422853453841, + "grad_norm": 1.2322112321853638, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 10550 + }, + { + "epoch": 3.4086507424144608, + "grad_norm": 0.9535136818885803, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 10560 + }, + { + "epoch": 3.411878631375081, + "grad_norm": 0.9243819117546082, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 10570 + }, + { + "epoch": 3.4151065203357005, + "grad_norm": 0.9011809825897217, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 10580 + }, + { + "epoch": 3.41833440929632, + "grad_norm": 0.9923036694526672, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 10590 + }, + { + "epoch": 3.42156229825694, + "grad_norm": 0.8903067111968994, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 10600 + }, + { + "epoch": 3.42479018721756, + "grad_norm": 0.7101534605026245, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 10610 + }, + { + "epoch": 3.4280180761781796, + "grad_norm": 0.8186570405960083, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 10620 + }, + { + "epoch": 3.431245965138799, + "grad_norm": 0.9480205774307251, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 10630 + }, + { + "epoch": 3.434473854099419, + "grad_norm": 1.1370961666107178, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 10640 + }, + { + "epoch": 3.437701743060039, + "grad_norm": 1.017669677734375, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 10650 + }, + { + "epoch": 3.4409296320206586, + "grad_norm": 0.7625100016593933, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 10660 + }, + { + "epoch": 3.4441575209812783, + "grad_norm": 0.9288196563720703, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 10670 + }, + { + "epoch": 3.447385409941898, + "grad_norm": 0.8800460696220398, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 10680 + }, + { + "epoch": 3.4506132989025176, + "grad_norm": 0.7499661445617676, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 10690 + }, + { + "epoch": 3.4538411878631377, + "grad_norm": 0.8254973292350769, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 10700 + }, + { + "epoch": 3.4570690768237573, + "grad_norm": 0.8735857605934143, + "learning_rate": 0.0002, + "loss": 0.5742, + "step": 10710 + }, + { + "epoch": 3.460296965784377, + "grad_norm": 0.9601819515228271, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 10720 + }, + { + "epoch": 3.4635248547449966, + "grad_norm": 0.8031058311462402, + "learning_rate": 0.0002, + "loss": 0.5574, + "step": 10730 + }, + { + "epoch": 3.4667527437056167, + "grad_norm": 0.8039247393608093, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 10740 + }, + { + "epoch": 3.4699806326662364, + "grad_norm": 0.8936953544616699, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 10750 + }, + { + "epoch": 3.473208521626856, + "grad_norm": 0.8201186060905457, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 10760 + }, + { + "epoch": 3.4764364105874757, + "grad_norm": 1.0064148902893066, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 10770 + }, + { + "epoch": 3.4796642995480953, + "grad_norm": 0.8617483377456665, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 10780 + }, + { + "epoch": 3.4828921885087154, + "grad_norm": 0.8532096147537231, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 10790 + }, + { + "epoch": 3.486120077469335, + "grad_norm": 0.8646879196166992, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 10800 + }, + { + "epoch": 3.4893479664299547, + "grad_norm": 0.7962660789489746, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 10810 + }, + { + "epoch": 3.492575855390575, + "grad_norm": 0.9560028314590454, + "learning_rate": 0.0002, + "loss": 0.5398, + "step": 10820 + }, + { + "epoch": 3.4958037443511945, + "grad_norm": 0.928439736366272, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 10830 + }, + { + "epoch": 3.499031633311814, + "grad_norm": 0.8219282627105713, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 10840 + }, + { + "epoch": 3.5022595222724338, + "grad_norm": 0.7918338179588318, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 10850 + }, + { + "epoch": 3.5054874112330534, + "grad_norm": 0.961295485496521, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 10860 + }, + { + "epoch": 3.5087153001936735, + "grad_norm": 1.0731624364852905, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 10870 + }, + { + "epoch": 3.511943189154293, + "grad_norm": 0.9551863074302673, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 10880 + }, + { + "epoch": 3.515171078114913, + "grad_norm": 0.8409819602966309, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 10890 + }, + { + "epoch": 3.5183989670755325, + "grad_norm": 0.7546320557594299, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 10900 + }, + { + "epoch": 3.5216268560361526, + "grad_norm": 0.7505252361297607, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 10910 + }, + { + "epoch": 3.524854744996772, + "grad_norm": 0.7505561113357544, + "learning_rate": 0.0002, + "loss": 0.5649, + "step": 10920 + }, + { + "epoch": 3.528082633957392, + "grad_norm": 1.086177945137024, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 10930 + }, + { + "epoch": 3.5313105229180115, + "grad_norm": 0.7721118330955505, + "learning_rate": 0.0002, + "loss": 0.5983, + "step": 10940 + }, + { + "epoch": 3.534538411878631, + "grad_norm": 0.9567878246307373, + "learning_rate": 0.0002, + "loss": 0.5919, + "step": 10950 + }, + { + "epoch": 3.5377663008392513, + "grad_norm": 0.8377360105514526, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 10960 + }, + { + "epoch": 3.540994189799871, + "grad_norm": 1.0174858570098877, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 10970 + }, + { + "epoch": 3.5442220787604906, + "grad_norm": 0.8164418935775757, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 10980 + }, + { + "epoch": 3.5474499677211107, + "grad_norm": 0.8959241509437561, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 10990 + }, + { + "epoch": 3.5506778566817303, + "grad_norm": 1.0154379606246948, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 11000 + }, + { + "epoch": 3.55390574564235, + "grad_norm": 0.7812292575836182, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 11010 + }, + { + "epoch": 3.5571336346029696, + "grad_norm": 0.9849029779434204, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 11020 + }, + { + "epoch": 3.5603615235635893, + "grad_norm": 0.8826184272766113, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 11030 + }, + { + "epoch": 3.563589412524209, + "grad_norm": 0.9039685726165771, + "learning_rate": 0.0002, + "loss": 0.601, + "step": 11040 + }, + { + "epoch": 3.566817301484829, + "grad_norm": 0.9585249423980713, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 11050 + }, + { + "epoch": 3.5700451904454487, + "grad_norm": 0.8083069324493408, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 11060 + }, + { + "epoch": 3.5732730794060683, + "grad_norm": 0.9528678059577942, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 11070 + }, + { + "epoch": 3.5765009683666884, + "grad_norm": 0.8297588229179382, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 11080 + }, + { + "epoch": 3.579728857327308, + "grad_norm": 0.8191716074943542, + "learning_rate": 0.0002, + "loss": 0.5919, + "step": 11090 + }, + { + "epoch": 3.5829567462879277, + "grad_norm": 0.8056275844573975, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 11100 + }, + { + "epoch": 3.5861846352485474, + "grad_norm": 0.701930582523346, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 11110 + }, + { + "epoch": 3.589412524209167, + "grad_norm": 0.7644643187522888, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 11120 + }, + { + "epoch": 3.592640413169787, + "grad_norm": 0.668004035949707, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 11130 + }, + { + "epoch": 3.5958683021304068, + "grad_norm": 0.8849539756774902, + "learning_rate": 0.0002, + "loss": 0.5735, + "step": 11140 + }, + { + "epoch": 3.5990961910910264, + "grad_norm": 0.8123571276664734, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 11150 + }, + { + "epoch": 3.602324080051646, + "grad_norm": 0.7591469287872314, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 11160 + }, + { + "epoch": 3.605551969012266, + "grad_norm": 0.776466965675354, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 11170 + }, + { + "epoch": 3.608779857972886, + "grad_norm": 0.9156150221824646, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 11180 + }, + { + "epoch": 3.6120077469335055, + "grad_norm": 0.7517618536949158, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 11190 + }, + { + "epoch": 3.615235635894125, + "grad_norm": 0.931239128112793, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 11200 + }, + { + "epoch": 3.6184635248547448, + "grad_norm": 0.9107872843742371, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 11210 + }, + { + "epoch": 3.621691413815365, + "grad_norm": 0.7624770998954773, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 11220 + }, + { + "epoch": 3.6249193027759845, + "grad_norm": 0.8129580616950989, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 11230 + }, + { + "epoch": 3.628147191736604, + "grad_norm": 0.7339836955070496, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 11240 + }, + { + "epoch": 3.6313750806972243, + "grad_norm": 0.8901296854019165, + "learning_rate": 0.0002, + "loss": 0.5976, + "step": 11250 + }, + { + "epoch": 3.634602969657844, + "grad_norm": 1.1374726295471191, + "learning_rate": 0.0002, + "loss": 0.5977, + "step": 11260 + }, + { + "epoch": 3.6378308586184636, + "grad_norm": 0.7438275218009949, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 11270 + }, + { + "epoch": 3.641058747579083, + "grad_norm": 0.808646559715271, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 11280 + }, + { + "epoch": 3.644286636539703, + "grad_norm": 1.091810941696167, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 11290 + }, + { + "epoch": 3.6475145255003225, + "grad_norm": 0.8439257144927979, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 11300 + }, + { + "epoch": 3.6507424144609426, + "grad_norm": 0.9720633029937744, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 11310 + }, + { + "epoch": 3.6539703034215623, + "grad_norm": 0.738571047782898, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 11320 + }, + { + "epoch": 3.657198192382182, + "grad_norm": 0.6961580514907837, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 11330 + }, + { + "epoch": 3.660426081342802, + "grad_norm": 0.8192131519317627, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 11340 + }, + { + "epoch": 3.6636539703034217, + "grad_norm": 0.8367205858230591, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 11350 + }, + { + "epoch": 3.6668818592640413, + "grad_norm": 0.7735666632652283, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 11360 + }, + { + "epoch": 3.670109748224661, + "grad_norm": 0.6507132649421692, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 11370 + }, + { + "epoch": 3.6733376371852806, + "grad_norm": 0.8271192312240601, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 11380 + }, + { + "epoch": 3.6765655261459007, + "grad_norm": 0.8724204301834106, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 11390 + }, + { + "epoch": 3.6797934151065204, + "grad_norm": 0.8448445200920105, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 11400 + }, + { + "epoch": 3.68302130406714, + "grad_norm": 0.6756882071495056, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 11410 + }, + { + "epoch": 3.68624919302776, + "grad_norm": 0.7859625816345215, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 11420 + }, + { + "epoch": 3.6894770819883798, + "grad_norm": 0.8929487466812134, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 11430 + }, + { + "epoch": 3.6927049709489994, + "grad_norm": 0.8163391351699829, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 11440 + }, + { + "epoch": 3.695932859909619, + "grad_norm": 0.8948464393615723, + "learning_rate": 0.0002, + "loss": 0.6467, + "step": 11450 + }, + { + "epoch": 3.6991607488702387, + "grad_norm": 0.8654782176017761, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 11460 + }, + { + "epoch": 3.7023886378308584, + "grad_norm": 0.9514864683151245, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 11470 + }, + { + "epoch": 3.7056165267914785, + "grad_norm": 0.7298579812049866, + "learning_rate": 0.0002, + "loss": 0.606, + "step": 11480 + }, + { + "epoch": 3.708844415752098, + "grad_norm": 0.9266309142112732, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 11490 + }, + { + "epoch": 3.7120723047127178, + "grad_norm": 0.8608686923980713, + "learning_rate": 0.0002, + "loss": 0.6122, + "step": 11500 + }, + { + "epoch": 3.715300193673338, + "grad_norm": 0.921788215637207, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 11510 + }, + { + "epoch": 3.7185280826339575, + "grad_norm": 0.8537021279335022, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 11520 + }, + { + "epoch": 3.721755971594577, + "grad_norm": 1.115194320678711, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 11530 + }, + { + "epoch": 3.724983860555197, + "grad_norm": 0.7614817023277283, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 11540 + }, + { + "epoch": 3.7282117495158165, + "grad_norm": 0.871999204158783, + "learning_rate": 0.0002, + "loss": 0.5776, + "step": 11550 + }, + { + "epoch": 3.7314396384764366, + "grad_norm": 0.9668049812316895, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 11560 + }, + { + "epoch": 3.734667527437056, + "grad_norm": 1.2185815572738647, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 11570 + }, + { + "epoch": 3.737895416397676, + "grad_norm": 0.8258453011512756, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 11580 + }, + { + "epoch": 3.7411233053582955, + "grad_norm": 0.8708966374397278, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 11590 + }, + { + "epoch": 3.7443511943189156, + "grad_norm": 0.7784267663955688, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 11600 + }, + { + "epoch": 3.7475790832795353, + "grad_norm": 0.7504425048828125, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 11610 + }, + { + "epoch": 3.750806972240155, + "grad_norm": 0.9144526124000549, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 11620 + }, + { + "epoch": 3.7540348612007746, + "grad_norm": 0.922581672668457, + "learning_rate": 0.0002, + "loss": 0.6425, + "step": 11630 + }, + { + "epoch": 3.757262750161394, + "grad_norm": 0.9348630905151367, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 11640 + }, + { + "epoch": 3.7604906391220143, + "grad_norm": 1.0740231275558472, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 11650 + }, + { + "epoch": 3.763718528082634, + "grad_norm": 0.884830117225647, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 11660 + }, + { + "epoch": 3.7669464170432536, + "grad_norm": 1.0256348848342896, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 11670 + }, + { + "epoch": 3.7701743060038737, + "grad_norm": 0.6795592904090881, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 11680 + }, + { + "epoch": 3.7734021949644934, + "grad_norm": 0.9381206631660461, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 11690 + }, + { + "epoch": 3.776630083925113, + "grad_norm": 0.7633092403411865, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 11700 + }, + { + "epoch": 3.7798579728857327, + "grad_norm": 0.7506213188171387, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 11710 + }, + { + "epoch": 3.7830858618463523, + "grad_norm": 0.8182913064956665, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 11720 + }, + { + "epoch": 3.786313750806972, + "grad_norm": 1.019322156906128, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 11730 + }, + { + "epoch": 3.789541639767592, + "grad_norm": 0.8895221948623657, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 11740 + }, + { + "epoch": 3.7927695287282117, + "grad_norm": 0.948847770690918, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 11750 + }, + { + "epoch": 3.7959974176888314, + "grad_norm": 0.9068999886512756, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 11760 + }, + { + "epoch": 3.7992253066494515, + "grad_norm": 0.7920539975166321, + "learning_rate": 0.0002, + "loss": 0.6163, + "step": 11770 + }, + { + "epoch": 3.802453195610071, + "grad_norm": 0.8441922068595886, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 11780 + }, + { + "epoch": 3.8056810845706908, + "grad_norm": 0.9258501529693604, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 11790 + }, + { + "epoch": 3.8089089735313104, + "grad_norm": 0.7354241609573364, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 11800 + }, + { + "epoch": 3.81213686249193, + "grad_norm": 0.9494872689247131, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 11810 + }, + { + "epoch": 3.81536475145255, + "grad_norm": 0.8266556859016418, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 11820 + }, + { + "epoch": 3.81859264041317, + "grad_norm": 0.7951219081878662, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 11830 + }, + { + "epoch": 3.8218205293737895, + "grad_norm": 0.7688382267951965, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 11840 + }, + { + "epoch": 3.8250484183344096, + "grad_norm": 1.0917940139770508, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 11850 + }, + { + "epoch": 3.828276307295029, + "grad_norm": 0.9880442023277283, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 11860 + }, + { + "epoch": 3.831504196255649, + "grad_norm": 0.8433151245117188, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 11870 + }, + { + "epoch": 3.8347320852162685, + "grad_norm": 0.8691204786300659, + "learning_rate": 0.0002, + "loss": 0.5876, + "step": 11880 + }, + { + "epoch": 3.837959974176888, + "grad_norm": 0.7698143124580383, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 11890 + }, + { + "epoch": 3.841187863137508, + "grad_norm": 0.8874883651733398, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 11900 + }, + { + "epoch": 3.844415752098128, + "grad_norm": 1.1209359169006348, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 11910 + }, + { + "epoch": 3.8476436410587476, + "grad_norm": 0.7723544239997864, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 11920 + }, + { + "epoch": 3.850871530019367, + "grad_norm": 0.8363937139511108, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 11930 + }, + { + "epoch": 3.8540994189799873, + "grad_norm": 0.9209707975387573, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 11940 + }, + { + "epoch": 3.857327307940607, + "grad_norm": 0.9456894993782043, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 11950 + }, + { + "epoch": 3.8605551969012266, + "grad_norm": 1.5748413801193237, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 11960 + }, + { + "epoch": 3.8637830858618463, + "grad_norm": 0.9083569049835205, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 11970 + }, + { + "epoch": 3.867010974822466, + "grad_norm": 0.7672823071479797, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 11980 + }, + { + "epoch": 3.870238863783086, + "grad_norm": 0.8647152185440063, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 11990 + }, + { + "epoch": 3.8734667527437057, + "grad_norm": 0.9564255475997925, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 12000 + }, + { + "epoch": 3.8766946417043253, + "grad_norm": 0.773267924785614, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 12010 + }, + { + "epoch": 3.879922530664945, + "grad_norm": 0.8030173182487488, + "learning_rate": 0.0002, + "loss": 0.6057, + "step": 12020 + }, + { + "epoch": 3.883150419625565, + "grad_norm": 0.8002150058746338, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 12030 + }, + { + "epoch": 3.8863783085861847, + "grad_norm": 0.98802250623703, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 12040 + }, + { + "epoch": 3.8896061975468044, + "grad_norm": 0.7868124842643738, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 12050 + }, + { + "epoch": 3.892834086507424, + "grad_norm": 0.932182788848877, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 12060 + }, + { + "epoch": 3.8960619754680437, + "grad_norm": 0.8576806783676147, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 12070 + }, + { + "epoch": 3.8992898644286638, + "grad_norm": 0.8985713124275208, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 12080 + }, + { + "epoch": 3.9025177533892834, + "grad_norm": 0.7876521944999695, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 12090 + }, + { + "epoch": 3.905745642349903, + "grad_norm": 0.773936927318573, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 12100 + }, + { + "epoch": 3.908973531310523, + "grad_norm": 0.7274761199951172, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 12110 + }, + { + "epoch": 3.912201420271143, + "grad_norm": 0.8625598549842834, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 12120 + }, + { + "epoch": 3.9154293092317625, + "grad_norm": 0.8702362179756165, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 12130 + }, + { + "epoch": 3.918657198192382, + "grad_norm": 0.912579357624054, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 12140 + }, + { + "epoch": 3.9218850871530018, + "grad_norm": 0.8697066903114319, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 12150 + }, + { + "epoch": 3.9251129761136214, + "grad_norm": 1.005232572555542, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 12160 + }, + { + "epoch": 3.9283408650742415, + "grad_norm": 0.793902575969696, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 12170 + }, + { + "epoch": 3.931568754034861, + "grad_norm": 0.7025905847549438, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 12180 + }, + { + "epoch": 3.934796642995481, + "grad_norm": 0.97635817527771, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 12190 + }, + { + "epoch": 3.938024531956101, + "grad_norm": 0.855417013168335, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 12200 + }, + { + "epoch": 3.9412524209167206, + "grad_norm": 0.8841291666030884, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 12210 + }, + { + "epoch": 3.94448030987734, + "grad_norm": 1.1762064695358276, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 12220 + }, + { + "epoch": 3.94770819883796, + "grad_norm": 0.8393193483352661, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 12230 + }, + { + "epoch": 3.9509360877985795, + "grad_norm": 0.9324905276298523, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 12240 + }, + { + "epoch": 3.9541639767591996, + "grad_norm": 0.8607982993125916, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 12250 + }, + { + "epoch": 3.9573918657198193, + "grad_norm": 0.8586681485176086, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 12260 + }, + { + "epoch": 3.960619754680439, + "grad_norm": 1.1082909107208252, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 12270 + }, + { + "epoch": 3.963847643641059, + "grad_norm": 1.065027117729187, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 12280 + }, + { + "epoch": 3.9670755326016787, + "grad_norm": 0.9544363021850586, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 12290 + }, + { + "epoch": 3.9703034215622983, + "grad_norm": 0.9008927345275879, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 12300 + }, + { + "epoch": 3.973531310522918, + "grad_norm": 0.8717467188835144, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 12310 + }, + { + "epoch": 3.9767591994835376, + "grad_norm": 0.9718339443206787, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 12320 + }, + { + "epoch": 3.9799870884441573, + "grad_norm": 1.0362015962600708, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 12330 + }, + { + "epoch": 3.9832149774047774, + "grad_norm": 1.0844318866729736, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 12340 + }, + { + "epoch": 3.986442866365397, + "grad_norm": 0.7506240606307983, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 12350 + }, + { + "epoch": 3.9896707553260167, + "grad_norm": 1.005982756614685, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 12360 + }, + { + "epoch": 3.9928986442866368, + "grad_norm": 0.7566431164741516, + "learning_rate": 0.0002, + "loss": 0.5926, + "step": 12370 + }, + { + "epoch": 3.9961265332472564, + "grad_norm": 0.8819181323051453, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 12380 + }, + { + "epoch": 3.999354422207876, + "grad_norm": 0.884497880935669, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 12390 + }, + { + "epoch": 4.0, + "eval_loss": 1.1907150745391846, + "eval_runtime": 161.5766, + "eval_samples_per_second": 4.537, + "eval_steps_per_second": 0.569, + "step": 12392 + }, + { + "epoch": 4.002582311168496, + "grad_norm": 1.0407241582870483, + "learning_rate": 0.0002, + "loss": 0.5203, + "step": 12400 + }, + { + "epoch": 4.005810200129115, + "grad_norm": 1.0199295282363892, + "learning_rate": 0.0002, + "loss": 0.4978, + "step": 12410 + }, + { + "epoch": 4.009038089089735, + "grad_norm": 0.8456302881240845, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 12420 + }, + { + "epoch": 4.012265978050355, + "grad_norm": 1.0621124505996704, + "learning_rate": 0.0002, + "loss": 0.4669, + "step": 12430 + }, + { + "epoch": 4.015493867010975, + "grad_norm": 0.8984712362289429, + "learning_rate": 0.0002, + "loss": 0.5277, + "step": 12440 + }, + { + "epoch": 4.018721755971595, + "grad_norm": 1.3785864114761353, + "learning_rate": 0.0002, + "loss": 0.5508, + "step": 12450 + }, + { + "epoch": 4.0219496449322145, + "grad_norm": 0.7911781668663025, + "learning_rate": 0.0002, + "loss": 0.5244, + "step": 12460 + }, + { + "epoch": 4.025177533892834, + "grad_norm": 1.0977907180786133, + "learning_rate": 0.0002, + "loss": 0.4746, + "step": 12470 + }, + { + "epoch": 4.028405422853454, + "grad_norm": 1.0664983987808228, + "learning_rate": 0.0002, + "loss": 0.4632, + "step": 12480 + }, + { + "epoch": 4.0316333118140735, + "grad_norm": 1.0807124376296997, + "learning_rate": 0.0002, + "loss": 0.5151, + "step": 12490 + }, + { + "epoch": 4.034861200774693, + "grad_norm": 1.2650192975997925, + "learning_rate": 0.0002, + "loss": 0.4712, + "step": 12500 + }, + { + "epoch": 4.038089089735313, + "grad_norm": 0.7164070010185242, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 12510 + }, + { + "epoch": 4.041316978695932, + "grad_norm": 1.0047489404678345, + "learning_rate": 0.0002, + "loss": 0.5015, + "step": 12520 + }, + { + "epoch": 4.044544867656553, + "grad_norm": 0.9303901791572571, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 12530 + }, + { + "epoch": 4.047772756617173, + "grad_norm": 1.0319702625274658, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 12540 + }, + { + "epoch": 4.051000645577792, + "grad_norm": 0.9549729228019714, + "learning_rate": 0.0002, + "loss": 0.4834, + "step": 12550 + }, + { + "epoch": 4.054228534538412, + "grad_norm": 0.7175564765930176, + "learning_rate": 0.0002, + "loss": 0.5235, + "step": 12560 + }, + { + "epoch": 4.057456423499032, + "grad_norm": 1.0622259378433228, + "learning_rate": 0.0002, + "loss": 0.5257, + "step": 12570 + }, + { + "epoch": 4.060684312459651, + "grad_norm": 1.172074556350708, + "learning_rate": 0.0002, + "loss": 0.5098, + "step": 12580 + }, + { + "epoch": 4.063912201420271, + "grad_norm": 0.9702366590499878, + "learning_rate": 0.0002, + "loss": 0.5112, + "step": 12590 + }, + { + "epoch": 4.0671400903808905, + "grad_norm": 0.741511344909668, + "learning_rate": 0.0002, + "loss": 0.5042, + "step": 12600 + }, + { + "epoch": 4.070367979341511, + "grad_norm": 0.8632621169090271, + "learning_rate": 0.0002, + "loss": 0.4996, + "step": 12610 + }, + { + "epoch": 4.073595868302131, + "grad_norm": 0.9695962071418762, + "learning_rate": 0.0002, + "loss": 0.4927, + "step": 12620 + }, + { + "epoch": 4.07682375726275, + "grad_norm": 0.9401052594184875, + "learning_rate": 0.0002, + "loss": 0.4618, + "step": 12630 + }, + { + "epoch": 4.08005164622337, + "grad_norm": 0.8068707585334778, + "learning_rate": 0.0002, + "loss": 0.4889, + "step": 12640 + }, + { + "epoch": 4.08327953518399, + "grad_norm": 0.9554762840270996, + "learning_rate": 0.0002, + "loss": 0.5046, + "step": 12650 + }, + { + "epoch": 4.086507424144609, + "grad_norm": 0.7637128233909607, + "learning_rate": 0.0002, + "loss": 0.5081, + "step": 12660 + }, + { + "epoch": 4.089735313105229, + "grad_norm": 0.6703744530677795, + "learning_rate": 0.0002, + "loss": 0.4997, + "step": 12670 + }, + { + "epoch": 4.092963202065849, + "grad_norm": 0.8623828887939453, + "learning_rate": 0.0002, + "loss": 0.4977, + "step": 12680 + }, + { + "epoch": 4.096191091026468, + "grad_norm": 0.8198223114013672, + "learning_rate": 0.0002, + "loss": 0.4616, + "step": 12690 + }, + { + "epoch": 4.099418979987089, + "grad_norm": 1.3449875116348267, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 12700 + }, + { + "epoch": 4.1026468689477085, + "grad_norm": 0.8333606123924255, + "learning_rate": 0.0002, + "loss": 0.4782, + "step": 12710 + }, + { + "epoch": 4.105874757908328, + "grad_norm": 1.1647733449935913, + "learning_rate": 0.0002, + "loss": 0.5135, + "step": 12720 + }, + { + "epoch": 4.109102646868948, + "grad_norm": 1.0560213327407837, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 12730 + }, + { + "epoch": 4.112330535829567, + "grad_norm": 0.9479449987411499, + "learning_rate": 0.0002, + "loss": 0.5244, + "step": 12740 + }, + { + "epoch": 4.115558424790187, + "grad_norm": 1.1634587049484253, + "learning_rate": 0.0002, + "loss": 0.4596, + "step": 12750 + }, + { + "epoch": 4.118786313750807, + "grad_norm": 0.813987672328949, + "learning_rate": 0.0002, + "loss": 0.4966, + "step": 12760 + }, + { + "epoch": 4.122014202711426, + "grad_norm": 0.968461275100708, + "learning_rate": 0.0002, + "loss": 0.5133, + "step": 12770 + }, + { + "epoch": 4.125242091672046, + "grad_norm": 0.9324830770492554, + "learning_rate": 0.0002, + "loss": 0.5113, + "step": 12780 + }, + { + "epoch": 4.128469980632667, + "grad_norm": 0.8313411474227905, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 12790 + }, + { + "epoch": 4.131697869593286, + "grad_norm": 1.0177634954452515, + "learning_rate": 0.0002, + "loss": 0.5169, + "step": 12800 + }, + { + "epoch": 4.134925758553906, + "grad_norm": 1.0890623331069946, + "learning_rate": 0.0002, + "loss": 0.4635, + "step": 12810 + }, + { + "epoch": 4.1381536475145255, + "grad_norm": 0.9131693840026855, + "learning_rate": 0.0002, + "loss": 0.519, + "step": 12820 + }, + { + "epoch": 4.141381536475145, + "grad_norm": 0.8400680422782898, + "learning_rate": 0.0002, + "loss": 0.5017, + "step": 12830 + }, + { + "epoch": 4.144609425435765, + "grad_norm": 0.8988795876502991, + "learning_rate": 0.0002, + "loss": 0.5195, + "step": 12840 + }, + { + "epoch": 4.1478373143963845, + "grad_norm": 0.9224025011062622, + "learning_rate": 0.0002, + "loss": 0.5052, + "step": 12850 + }, + { + "epoch": 4.151065203357004, + "grad_norm": 0.7453159689903259, + "learning_rate": 0.0002, + "loss": 0.5001, + "step": 12860 + }, + { + "epoch": 4.154293092317625, + "grad_norm": 0.9815868139266968, + "learning_rate": 0.0002, + "loss": 0.4874, + "step": 12870 + }, + { + "epoch": 4.157520981278244, + "grad_norm": 1.2542768716812134, + "learning_rate": 0.0002, + "loss": 0.5485, + "step": 12880 + }, + { + "epoch": 4.160748870238864, + "grad_norm": 1.0092132091522217, + "learning_rate": 0.0002, + "loss": 0.5287, + "step": 12890 + }, + { + "epoch": 4.163976759199484, + "grad_norm": 1.1836622953414917, + "learning_rate": 0.0002, + "loss": 0.5125, + "step": 12900 + }, + { + "epoch": 4.167204648160103, + "grad_norm": 0.7706810235977173, + "learning_rate": 0.0002, + "loss": 0.5089, + "step": 12910 + }, + { + "epoch": 4.170432537120723, + "grad_norm": 1.00058913230896, + "learning_rate": 0.0002, + "loss": 0.5123, + "step": 12920 + }, + { + "epoch": 4.173660426081343, + "grad_norm": 1.2326250076293945, + "learning_rate": 0.0002, + "loss": 0.5238, + "step": 12930 + }, + { + "epoch": 4.176888315041962, + "grad_norm": 0.8829123377799988, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 12940 + }, + { + "epoch": 4.180116204002582, + "grad_norm": 0.936042845249176, + "learning_rate": 0.0002, + "loss": 0.517, + "step": 12950 + }, + { + "epoch": 4.183344092963202, + "grad_norm": 0.9773517847061157, + "learning_rate": 0.0002, + "loss": 0.4991, + "step": 12960 + }, + { + "epoch": 4.186571981923822, + "grad_norm": 0.9786297678947449, + "learning_rate": 0.0002, + "loss": 0.5025, + "step": 12970 + }, + { + "epoch": 4.189799870884442, + "grad_norm": 0.7524558901786804, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 12980 + }, + { + "epoch": 4.193027759845061, + "grad_norm": 1.0107866525650024, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 12990 + }, + { + "epoch": 4.196255648805681, + "grad_norm": 1.0092947483062744, + "learning_rate": 0.0002, + "loss": 0.5304, + "step": 13000 + }, + { + "epoch": 4.199483537766301, + "grad_norm": 1.18181312084198, + "learning_rate": 0.0002, + "loss": 0.5061, + "step": 13010 + }, + { + "epoch": 4.20271142672692, + "grad_norm": 0.8845750093460083, + "learning_rate": 0.0002, + "loss": 0.512, + "step": 13020 + }, + { + "epoch": 4.20593931568754, + "grad_norm": 1.0789145231246948, + "learning_rate": 0.0002, + "loss": 0.5329, + "step": 13030 + }, + { + "epoch": 4.2091672046481605, + "grad_norm": 0.9562082886695862, + "learning_rate": 0.0002, + "loss": 0.5001, + "step": 13040 + }, + { + "epoch": 4.21239509360878, + "grad_norm": 0.875755786895752, + "learning_rate": 0.0002, + "loss": 0.5211, + "step": 13050 + }, + { + "epoch": 4.2156229825694, + "grad_norm": 1.0694596767425537, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 13060 + }, + { + "epoch": 4.2188508715300195, + "grad_norm": 1.0053378343582153, + "learning_rate": 0.0002, + "loss": 0.4917, + "step": 13070 + }, + { + "epoch": 4.222078760490639, + "grad_norm": 1.1628689765930176, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 13080 + }, + { + "epoch": 4.225306649451259, + "grad_norm": 0.9455991983413696, + "learning_rate": 0.0002, + "loss": 0.4796, + "step": 13090 + }, + { + "epoch": 4.228534538411878, + "grad_norm": 0.9736765623092651, + "learning_rate": 0.0002, + "loss": 0.4802, + "step": 13100 + }, + { + "epoch": 4.231762427372498, + "grad_norm": 0.8653560876846313, + "learning_rate": 0.0002, + "loss": 0.5411, + "step": 13110 + }, + { + "epoch": 4.234990316333118, + "grad_norm": 0.9335988163948059, + "learning_rate": 0.0002, + "loss": 0.5347, + "step": 13120 + }, + { + "epoch": 4.238218205293738, + "grad_norm": 0.9102661609649658, + "learning_rate": 0.0002, + "loss": 0.5217, + "step": 13130 + }, + { + "epoch": 4.241446094254358, + "grad_norm": 1.0595461130142212, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 13140 + }, + { + "epoch": 4.244673983214978, + "grad_norm": 0.8947662711143494, + "learning_rate": 0.0002, + "loss": 0.517, + "step": 13150 + }, + { + "epoch": 4.247901872175597, + "grad_norm": 1.0835723876953125, + "learning_rate": 0.0002, + "loss": 0.5116, + "step": 13160 + }, + { + "epoch": 4.251129761136217, + "grad_norm": 0.8496462106704712, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 13170 + }, + { + "epoch": 4.2543576500968365, + "grad_norm": 0.9395631551742554, + "learning_rate": 0.0002, + "loss": 0.5079, + "step": 13180 + }, + { + "epoch": 4.257585539057456, + "grad_norm": 1.2939592599868774, + "learning_rate": 0.0002, + "loss": 0.5076, + "step": 13190 + }, + { + "epoch": 4.260813428018076, + "grad_norm": 0.9325923919677734, + "learning_rate": 0.0002, + "loss": 0.5209, + "step": 13200 + }, + { + "epoch": 4.264041316978696, + "grad_norm": 0.9220664501190186, + "learning_rate": 0.0002, + "loss": 0.4984, + "step": 13210 + }, + { + "epoch": 4.267269205939316, + "grad_norm": 0.9505137205123901, + "learning_rate": 0.0002, + "loss": 0.5553, + "step": 13220 + }, + { + "epoch": 4.270497094899936, + "grad_norm": 1.0713751316070557, + "learning_rate": 0.0002, + "loss": 0.5238, + "step": 13230 + }, + { + "epoch": 4.273724983860555, + "grad_norm": 0.8390375971794128, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 13240 + }, + { + "epoch": 4.276952872821175, + "grad_norm": 0.8943426012992859, + "learning_rate": 0.0002, + "loss": 0.5217, + "step": 13250 + }, + { + "epoch": 4.280180761781795, + "grad_norm": 0.9175868630409241, + "learning_rate": 0.0002, + "loss": 0.5486, + "step": 13260 + }, + { + "epoch": 4.283408650742414, + "grad_norm": 0.9969881176948547, + "learning_rate": 0.0002, + "loss": 0.5208, + "step": 13270 + }, + { + "epoch": 4.286636539703034, + "grad_norm": 1.2271877527236938, + "learning_rate": 0.0002, + "loss": 0.5376, + "step": 13280 + }, + { + "epoch": 4.289864428663654, + "grad_norm": 0.9463263154029846, + "learning_rate": 0.0002, + "loss": 0.4811, + "step": 13290 + }, + { + "epoch": 4.293092317624274, + "grad_norm": 1.0306228399276733, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 13300 + }, + { + "epoch": 4.296320206584894, + "grad_norm": 0.8454763889312744, + "learning_rate": 0.0002, + "loss": 0.5092, + "step": 13310 + }, + { + "epoch": 4.299548095545513, + "grad_norm": 0.9843119978904724, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 13320 + }, + { + "epoch": 4.302775984506133, + "grad_norm": 1.0836851596832275, + "learning_rate": 0.0002, + "loss": 0.5407, + "step": 13330 + }, + { + "epoch": 4.306003873466753, + "grad_norm": 1.0719412565231323, + "learning_rate": 0.0002, + "loss": 0.5336, + "step": 13340 + }, + { + "epoch": 4.309231762427372, + "grad_norm": 0.9276487827301025, + "learning_rate": 0.0002, + "loss": 0.4798, + "step": 13350 + }, + { + "epoch": 4.312459651387992, + "grad_norm": 0.897072434425354, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 13360 + }, + { + "epoch": 4.315687540348612, + "grad_norm": 1.0493228435516357, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 13370 + }, + { + "epoch": 4.318915429309232, + "grad_norm": 0.9446353316307068, + "learning_rate": 0.0002, + "loss": 0.5218, + "step": 13380 + }, + { + "epoch": 4.322143318269852, + "grad_norm": 0.7765224575996399, + "learning_rate": 0.0002, + "loss": 0.4765, + "step": 13390 + }, + { + "epoch": 4.3253712072304715, + "grad_norm": 0.9100048542022705, + "learning_rate": 0.0002, + "loss": 0.5907, + "step": 13400 + }, + { + "epoch": 4.328599096191091, + "grad_norm": 1.0913089513778687, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 13410 + }, + { + "epoch": 4.331826985151711, + "grad_norm": 0.9607733488082886, + "learning_rate": 0.0002, + "loss": 0.494, + "step": 13420 + }, + { + "epoch": 4.3350548741123305, + "grad_norm": 0.8774219155311584, + "learning_rate": 0.0002, + "loss": 0.5273, + "step": 13430 + }, + { + "epoch": 4.33828276307295, + "grad_norm": 0.8366804122924805, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 13440 + }, + { + "epoch": 4.34151065203357, + "grad_norm": 1.034727931022644, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 13450 + }, + { + "epoch": 4.344738540994189, + "grad_norm": 0.942743182182312, + "learning_rate": 0.0002, + "loss": 0.4995, + "step": 13460 + }, + { + "epoch": 4.347966429954809, + "grad_norm": 0.7237029075622559, + "learning_rate": 0.0002, + "loss": 0.5222, + "step": 13470 + }, + { + "epoch": 4.35119431891543, + "grad_norm": 0.8216196894645691, + "learning_rate": 0.0002, + "loss": 0.5461, + "step": 13480 + }, + { + "epoch": 4.354422207876049, + "grad_norm": 1.031860113143921, + "learning_rate": 0.0002, + "loss": 0.5104, + "step": 13490 + }, + { + "epoch": 4.357650096836669, + "grad_norm": 0.8880493640899658, + "learning_rate": 0.0002, + "loss": 0.547, + "step": 13500 + }, + { + "epoch": 4.360877985797289, + "grad_norm": 0.8442490696907043, + "learning_rate": 0.0002, + "loss": 0.5259, + "step": 13510 + }, + { + "epoch": 4.364105874757908, + "grad_norm": 1.270971655845642, + "learning_rate": 0.0002, + "loss": 0.5176, + "step": 13520 + }, + { + "epoch": 4.367333763718528, + "grad_norm": 0.9657870531082153, + "learning_rate": 0.0002, + "loss": 0.5028, + "step": 13530 + }, + { + "epoch": 4.3705616526791475, + "grad_norm": 0.7477133870124817, + "learning_rate": 0.0002, + "loss": 0.5136, + "step": 13540 + }, + { + "epoch": 4.373789541639767, + "grad_norm": 1.0209243297576904, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 13550 + }, + { + "epoch": 4.377017430600388, + "grad_norm": 0.8714015483856201, + "learning_rate": 0.0002, + "loss": 0.4888, + "step": 13560 + }, + { + "epoch": 4.380245319561007, + "grad_norm": 1.0490189790725708, + "learning_rate": 0.0002, + "loss": 0.5428, + "step": 13570 + }, + { + "epoch": 4.383473208521627, + "grad_norm": 0.9454663991928101, + "learning_rate": 0.0002, + "loss": 0.5398, + "step": 13580 + }, + { + "epoch": 4.386701097482247, + "grad_norm": 1.154146432876587, + "learning_rate": 0.0002, + "loss": 0.5072, + "step": 13590 + }, + { + "epoch": 4.389928986442866, + "grad_norm": 1.155090570449829, + "learning_rate": 0.0002, + "loss": 0.5096, + "step": 13600 + }, + { + "epoch": 4.393156875403486, + "grad_norm": 0.9853842854499817, + "learning_rate": 0.0002, + "loss": 0.5679, + "step": 13610 + }, + { + "epoch": 4.396384764364106, + "grad_norm": 0.9265837669372559, + "learning_rate": 0.0002, + "loss": 0.4992, + "step": 13620 + }, + { + "epoch": 4.399612653324725, + "grad_norm": 0.8367540240287781, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 13630 + }, + { + "epoch": 4.402840542285345, + "grad_norm": 1.1453629732131958, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 13640 + }, + { + "epoch": 4.4060684312459655, + "grad_norm": 1.0856295824050903, + "learning_rate": 0.0002, + "loss": 0.573, + "step": 13650 + }, + { + "epoch": 4.409296320206585, + "grad_norm": 0.9284523129463196, + "learning_rate": 0.0002, + "loss": 0.5178, + "step": 13660 + }, + { + "epoch": 4.412524209167205, + "grad_norm": 0.9632299542427063, + "learning_rate": 0.0002, + "loss": 0.4862, + "step": 13670 + }, + { + "epoch": 4.415752098127824, + "grad_norm": 1.048524260520935, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 13680 + }, + { + "epoch": 4.418979987088444, + "grad_norm": 0.9787682294845581, + "learning_rate": 0.0002, + "loss": 0.5258, + "step": 13690 + }, + { + "epoch": 4.422207876049064, + "grad_norm": 1.0728684663772583, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 13700 + }, + { + "epoch": 4.425435765009683, + "grad_norm": 0.72867351770401, + "learning_rate": 0.0002, + "loss": 0.5243, + "step": 13710 + }, + { + "epoch": 4.428663653970303, + "grad_norm": 0.8932793736457825, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 13720 + }, + { + "epoch": 4.431891542930924, + "grad_norm": 1.098343849182129, + "learning_rate": 0.0002, + "loss": 0.5156, + "step": 13730 + }, + { + "epoch": 4.435119431891543, + "grad_norm": 0.9321235418319702, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 13740 + }, + { + "epoch": 4.438347320852163, + "grad_norm": 0.8868634104728699, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 13750 + }, + { + "epoch": 4.4415752098127825, + "grad_norm": 1.200064778327942, + "learning_rate": 0.0002, + "loss": 0.5284, + "step": 13760 + }, + { + "epoch": 4.444803098773402, + "grad_norm": 0.8968019485473633, + "learning_rate": 0.0002, + "loss": 0.5208, + "step": 13770 + }, + { + "epoch": 4.448030987734022, + "grad_norm": 0.9560935497283936, + "learning_rate": 0.0002, + "loss": 0.4979, + "step": 13780 + }, + { + "epoch": 4.4512588766946415, + "grad_norm": 0.7985701560974121, + "learning_rate": 0.0002, + "loss": 0.5134, + "step": 13790 + }, + { + "epoch": 4.454486765655261, + "grad_norm": 1.062540888786316, + "learning_rate": 0.0002, + "loss": 0.5113, + "step": 13800 + }, + { + "epoch": 4.457714654615881, + "grad_norm": 1.0827109813690186, + "learning_rate": 0.0002, + "loss": 0.525, + "step": 13810 + }, + { + "epoch": 4.460942543576501, + "grad_norm": 1.0853543281555176, + "learning_rate": 0.0002, + "loss": 0.5541, + "step": 13820 + }, + { + "epoch": 4.464170432537121, + "grad_norm": 1.0613641738891602, + "learning_rate": 0.0002, + "loss": 0.5381, + "step": 13830 + }, + { + "epoch": 4.467398321497741, + "grad_norm": 0.9037535190582275, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 13840 + }, + { + "epoch": 4.47062621045836, + "grad_norm": 0.9216223955154419, + "learning_rate": 0.0002, + "loss": 0.5112, + "step": 13850 + }, + { + "epoch": 4.47385409941898, + "grad_norm": 0.8952260613441467, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 13860 + }, + { + "epoch": 4.4770819883796, + "grad_norm": 0.9997953176498413, + "learning_rate": 0.0002, + "loss": 0.5026, + "step": 13870 + }, + { + "epoch": 4.480309877340219, + "grad_norm": 1.062458872795105, + "learning_rate": 0.0002, + "loss": 0.5107, + "step": 13880 + }, + { + "epoch": 4.483537766300839, + "grad_norm": 0.9185126423835754, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 13890 + }, + { + "epoch": 4.486765655261459, + "grad_norm": 1.2389954328536987, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 13900 + }, + { + "epoch": 4.489993544222079, + "grad_norm": 1.1632126569747925, + "learning_rate": 0.0002, + "loss": 0.5199, + "step": 13910 + }, + { + "epoch": 4.493221433182699, + "grad_norm": 1.0304487943649292, + "learning_rate": 0.0002, + "loss": 0.5128, + "step": 13920 + }, + { + "epoch": 4.496449322143318, + "grad_norm": 0.9144788384437561, + "learning_rate": 0.0002, + "loss": 0.5331, + "step": 13930 + }, + { + "epoch": 4.499677211103938, + "grad_norm": 1.0285682678222656, + "learning_rate": 0.0002, + "loss": 0.5312, + "step": 13940 + }, + { + "epoch": 4.502905100064558, + "grad_norm": 1.1187206506729126, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 13950 + }, + { + "epoch": 4.506132989025177, + "grad_norm": 0.7917197942733765, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 13960 + }, + { + "epoch": 4.509360877985797, + "grad_norm": 0.8495619297027588, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 13970 + }, + { + "epoch": 4.512588766946417, + "grad_norm": 1.0450760126113892, + "learning_rate": 0.0002, + "loss": 0.4971, + "step": 13980 + }, + { + "epoch": 4.515816655907037, + "grad_norm": 1.0061010122299194, + "learning_rate": 0.0002, + "loss": 0.5402, + "step": 13990 + }, + { + "epoch": 4.519044544867657, + "grad_norm": 1.0232428312301636, + "learning_rate": 0.0002, + "loss": 0.527, + "step": 14000 + }, + { + "epoch": 4.5222724338282765, + "grad_norm": 0.8734631538391113, + "learning_rate": 0.0002, + "loss": 0.5002, + "step": 14010 + }, + { + "epoch": 4.525500322788896, + "grad_norm": 1.1085621118545532, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 14020 + }, + { + "epoch": 4.528728211749516, + "grad_norm": 0.9178624749183655, + "learning_rate": 0.0002, + "loss": 0.5167, + "step": 14030 + }, + { + "epoch": 4.531956100710135, + "grad_norm": 1.0687317848205566, + "learning_rate": 0.0002, + "loss": 0.5589, + "step": 14040 + }, + { + "epoch": 4.535183989670755, + "grad_norm": 0.9237300157546997, + "learning_rate": 0.0002, + "loss": 0.5576, + "step": 14050 + }, + { + "epoch": 4.538411878631375, + "grad_norm": 0.9667123556137085, + "learning_rate": 0.0002, + "loss": 0.5062, + "step": 14060 + }, + { + "epoch": 4.541639767591995, + "grad_norm": 1.1286747455596924, + "learning_rate": 0.0002, + "loss": 0.5645, + "step": 14070 + }, + { + "epoch": 4.544867656552615, + "grad_norm": 1.055392861366272, + "learning_rate": 0.0002, + "loss": 0.5226, + "step": 14080 + }, + { + "epoch": 4.548095545513235, + "grad_norm": 0.9492936134338379, + "learning_rate": 0.0002, + "loss": 0.5428, + "step": 14090 + }, + { + "epoch": 4.551323434473854, + "grad_norm": 0.9881349802017212, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 14100 + }, + { + "epoch": 4.554551323434474, + "grad_norm": 0.9389023184776306, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 14110 + }, + { + "epoch": 4.5577792123950935, + "grad_norm": 0.8395606875419617, + "learning_rate": 0.0002, + "loss": 0.5511, + "step": 14120 + }, + { + "epoch": 4.561007101355713, + "grad_norm": 0.9019067287445068, + "learning_rate": 0.0002, + "loss": 0.5696, + "step": 14130 + }, + { + "epoch": 4.564234990316333, + "grad_norm": 1.1058136224746704, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 14140 + }, + { + "epoch": 4.5674628792769525, + "grad_norm": 1.0683821439743042, + "learning_rate": 0.0002, + "loss": 0.5323, + "step": 14150 + }, + { + "epoch": 4.570690768237572, + "grad_norm": 1.3398395776748657, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 14160 + }, + { + "epoch": 4.573918657198193, + "grad_norm": 0.7829096913337708, + "learning_rate": 0.0002, + "loss": 0.4713, + "step": 14170 + }, + { + "epoch": 4.577146546158812, + "grad_norm": 0.9636675119400024, + "learning_rate": 0.0002, + "loss": 0.525, + "step": 14180 + }, + { + "epoch": 4.580374435119432, + "grad_norm": 1.0291401147842407, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 14190 + }, + { + "epoch": 4.583602324080052, + "grad_norm": 1.0894310474395752, + "learning_rate": 0.0002, + "loss": 0.5366, + "step": 14200 + }, + { + "epoch": 4.586830213040671, + "grad_norm": 1.111573576927185, + "learning_rate": 0.0002, + "loss": 0.5125, + "step": 14210 + }, + { + "epoch": 4.590058102001291, + "grad_norm": 0.9345336556434631, + "learning_rate": 0.0002, + "loss": 0.5444, + "step": 14220 + }, + { + "epoch": 4.593285990961911, + "grad_norm": 1.3338757753372192, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 14230 + }, + { + "epoch": 4.596513879922531, + "grad_norm": 1.1146448850631714, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 14240 + }, + { + "epoch": 4.599741768883151, + "grad_norm": 1.1576755046844482, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 14250 + }, + { + "epoch": 4.60296965784377, + "grad_norm": 0.6851092576980591, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 14260 + }, + { + "epoch": 4.60619754680439, + "grad_norm": 0.9067938923835754, + "learning_rate": 0.0002, + "loss": 0.5027, + "step": 14270 + }, + { + "epoch": 4.60942543576501, + "grad_norm": 0.8767340183258057, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 14280 + }, + { + "epoch": 4.612653324725629, + "grad_norm": 1.024880290031433, + "learning_rate": 0.0002, + "loss": 0.5294, + "step": 14290 + }, + { + "epoch": 4.615881213686249, + "grad_norm": 0.9226394891738892, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 14300 + }, + { + "epoch": 4.619109102646869, + "grad_norm": 1.018187165260315, + "learning_rate": 0.0002, + "loss": 0.5281, + "step": 14310 + }, + { + "epoch": 4.622336991607488, + "grad_norm": 0.8851249814033508, + "learning_rate": 0.0002, + "loss": 0.5546, + "step": 14320 + }, + { + "epoch": 4.625564880568108, + "grad_norm": 0.745798647403717, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 14330 + }, + { + "epoch": 4.6287927695287285, + "grad_norm": 1.2082698345184326, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 14340 + }, + { + "epoch": 4.632020658489348, + "grad_norm": 0.901454508304596, + "learning_rate": 0.0002, + "loss": 0.5449, + "step": 14350 + }, + { + "epoch": 4.635248547449968, + "grad_norm": 0.9593124985694885, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 14360 + }, + { + "epoch": 4.6384764364105875, + "grad_norm": 1.1241410970687866, + "learning_rate": 0.0002, + "loss": 0.4939, + "step": 14370 + }, + { + "epoch": 4.641704325371207, + "grad_norm": 0.9221102595329285, + "learning_rate": 0.0002, + "loss": 0.5319, + "step": 14380 + }, + { + "epoch": 4.644932214331827, + "grad_norm": 1.0035039186477661, + "learning_rate": 0.0002, + "loss": 0.524, + "step": 14390 + }, + { + "epoch": 4.648160103292446, + "grad_norm": 1.1270662546157837, + "learning_rate": 0.0002, + "loss": 0.5617, + "step": 14400 + }, + { + "epoch": 4.651387992253067, + "grad_norm": 0.8631120324134827, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 14410 + }, + { + "epoch": 4.654615881213687, + "grad_norm": 1.0604606866836548, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 14420 + }, + { + "epoch": 4.657843770174306, + "grad_norm": 0.8002706170082092, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 14430 + }, + { + "epoch": 4.661071659134926, + "grad_norm": 1.0642075538635254, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 14440 + }, + { + "epoch": 4.664299548095546, + "grad_norm": 0.9315671324729919, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 14450 + }, + { + "epoch": 4.667527437056165, + "grad_norm": 0.8311864137649536, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 14460 + }, + { + "epoch": 4.670755326016785, + "grad_norm": 0.8900430202484131, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 14470 + }, + { + "epoch": 4.6739832149774045, + "grad_norm": 1.059267282485962, + "learning_rate": 0.0002, + "loss": 0.5086, + "step": 14480 + }, + { + "epoch": 4.677211103938024, + "grad_norm": 0.9864052534103394, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 14490 + }, + { + "epoch": 4.680438992898644, + "grad_norm": 1.210854411125183, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 14500 + }, + { + "epoch": 4.683666881859264, + "grad_norm": 1.030693769454956, + "learning_rate": 0.0002, + "loss": 0.536, + "step": 14510 + }, + { + "epoch": 4.686894770819884, + "grad_norm": 0.9809406995773315, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 14520 + }, + { + "epoch": 4.690122659780504, + "grad_norm": 1.0471004247665405, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 14530 + }, + { + "epoch": 4.693350548741123, + "grad_norm": 1.1583727598190308, + "learning_rate": 0.0002, + "loss": 0.5613, + "step": 14540 + }, + { + "epoch": 4.696578437701743, + "grad_norm": 0.9664418697357178, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 14550 + }, + { + "epoch": 4.699806326662363, + "grad_norm": 0.9511209726333618, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 14560 + }, + { + "epoch": 4.703034215622982, + "grad_norm": 1.0211684703826904, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 14570 + }, + { + "epoch": 4.706262104583602, + "grad_norm": 1.097276210784912, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 14580 + }, + { + "epoch": 4.7094899935442225, + "grad_norm": 0.9363943338394165, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 14590 + }, + { + "epoch": 4.712717882504842, + "grad_norm": 1.4700615406036377, + "learning_rate": 0.0002, + "loss": 0.5261, + "step": 14600 + }, + { + "epoch": 4.715945771465462, + "grad_norm": 1.0001553297042847, + "learning_rate": 0.0002, + "loss": 0.5489, + "step": 14610 + }, + { + "epoch": 4.719173660426081, + "grad_norm": 1.0489927530288696, + "learning_rate": 0.0002, + "loss": 0.5236, + "step": 14620 + }, + { + "epoch": 4.722401549386701, + "grad_norm": 1.0483676195144653, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 14630 + }, + { + "epoch": 4.725629438347321, + "grad_norm": 1.1501940488815308, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 14640 + }, + { + "epoch": 4.72885732730794, + "grad_norm": 1.1703146696090698, + "learning_rate": 0.0002, + "loss": 0.5059, + "step": 14650 + }, + { + "epoch": 4.73208521626856, + "grad_norm": 0.8842985033988953, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 14660 + }, + { + "epoch": 4.73531310522918, + "grad_norm": 0.9147908687591553, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 14670 + }, + { + "epoch": 4.7385409941898, + "grad_norm": 1.0391576290130615, + "learning_rate": 0.0002, + "loss": 0.5436, + "step": 14680 + }, + { + "epoch": 4.74176888315042, + "grad_norm": 0.9469179511070251, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 14690 + }, + { + "epoch": 4.7449967721110395, + "grad_norm": 1.0529530048370361, + "learning_rate": 0.0002, + "loss": 0.5201, + "step": 14700 + }, + { + "epoch": 4.748224661071659, + "grad_norm": 0.9645711183547974, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 14710 + }, + { + "epoch": 4.751452550032279, + "grad_norm": 0.8163343071937561, + "learning_rate": 0.0002, + "loss": 0.5123, + "step": 14720 + }, + { + "epoch": 4.7546804389928985, + "grad_norm": 1.0581341981887817, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 14730 + }, + { + "epoch": 4.757908327953518, + "grad_norm": 1.0913853645324707, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 14740 + }, + { + "epoch": 4.761136216914138, + "grad_norm": 1.1071174144744873, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 14750 + }, + { + "epoch": 4.764364105874758, + "grad_norm": 1.0060709714889526, + "learning_rate": 0.0002, + "loss": 0.5353, + "step": 14760 + }, + { + "epoch": 4.767591994835378, + "grad_norm": 1.012024164199829, + "learning_rate": 0.0002, + "loss": 0.5415, + "step": 14770 + }, + { + "epoch": 4.770819883795998, + "grad_norm": 0.8438148498535156, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 14780 + }, + { + "epoch": 4.774047772756617, + "grad_norm": 0.8136811256408691, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 14790 + }, + { + "epoch": 4.777275661717237, + "grad_norm": 1.0765691995620728, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 14800 + }, + { + "epoch": 4.780503550677857, + "grad_norm": 1.0582574605941772, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 14810 + }, + { + "epoch": 4.783731439638476, + "grad_norm": 0.9419516921043396, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 14820 + }, + { + "epoch": 4.786959328599096, + "grad_norm": 0.9626181721687317, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 14830 + }, + { + "epoch": 4.7901872175597155, + "grad_norm": 1.2552800178527832, + "learning_rate": 0.0002, + "loss": 0.565, + "step": 14840 + }, + { + "epoch": 4.793415106520336, + "grad_norm": 0.9379919171333313, + "learning_rate": 0.0002, + "loss": 0.5402, + "step": 14850 + }, + { + "epoch": 4.796642995480956, + "grad_norm": 0.8166947364807129, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 14860 + }, + { + "epoch": 4.799870884441575, + "grad_norm": 0.9008694887161255, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 14870 + }, + { + "epoch": 4.803098773402195, + "grad_norm": 1.0256156921386719, + "learning_rate": 0.0002, + "loss": 0.5049, + "step": 14880 + }, + { + "epoch": 4.806326662362815, + "grad_norm": 0.9486594200134277, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 14890 + }, + { + "epoch": 4.809554551323434, + "grad_norm": 0.955238401889801, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 14900 + }, + { + "epoch": 4.812782440284054, + "grad_norm": 1.03775954246521, + "learning_rate": 0.0002, + "loss": 0.5269, + "step": 14910 + }, + { + "epoch": 4.816010329244674, + "grad_norm": 1.1383405923843384, + "learning_rate": 0.0002, + "loss": 0.5445, + "step": 14920 + }, + { + "epoch": 4.819238218205294, + "grad_norm": 0.9411700963973999, + "learning_rate": 0.0002, + "loss": 0.5347, + "step": 14930 + }, + { + "epoch": 4.822466107165914, + "grad_norm": 0.8188554644584656, + "learning_rate": 0.0002, + "loss": 0.4899, + "step": 14940 + }, + { + "epoch": 4.8256939961265335, + "grad_norm": 1.1336265802383423, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 14950 + }, + { + "epoch": 4.828921885087153, + "grad_norm": 1.106121301651001, + "learning_rate": 0.0002, + "loss": 0.5578, + "step": 14960 + }, + { + "epoch": 4.832149774047773, + "grad_norm": 1.0206533670425415, + "learning_rate": 0.0002, + "loss": 0.5306, + "step": 14970 + }, + { + "epoch": 4.8353776630083924, + "grad_norm": 1.1123926639556885, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 14980 + }, + { + "epoch": 4.838605551969012, + "grad_norm": 0.7879418730735779, + "learning_rate": 0.0002, + "loss": 0.5208, + "step": 14990 + }, + { + "epoch": 4.841833440929632, + "grad_norm": 1.0171709060668945, + "learning_rate": 0.0002, + "loss": 0.5385, + "step": 15000 + }, + { + "epoch": 4.845061329890251, + "grad_norm": 1.010671615600586, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 15010 + }, + { + "epoch": 4.848289218850871, + "grad_norm": 1.0778919458389282, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 15020 + }, + { + "epoch": 4.851517107811492, + "grad_norm": 1.0479968786239624, + "learning_rate": 0.0002, + "loss": 0.5587, + "step": 15030 + }, + { + "epoch": 4.854744996772111, + "grad_norm": 1.0345100164413452, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 15040 + }, + { + "epoch": 4.857972885732731, + "grad_norm": 0.9539691805839539, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 15050 + }, + { + "epoch": 4.8612007746933505, + "grad_norm": 0.9914752840995789, + "learning_rate": 0.0002, + "loss": 0.5314, + "step": 15060 + }, + { + "epoch": 4.86442866365397, + "grad_norm": 1.1935476064682007, + "learning_rate": 0.0002, + "loss": 0.5277, + "step": 15070 + }, + { + "epoch": 4.86765655261459, + "grad_norm": 1.0065057277679443, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 15080 + }, + { + "epoch": 4.8708844415752095, + "grad_norm": 0.9320993423461914, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 15090 + }, + { + "epoch": 4.87411233053583, + "grad_norm": 1.0578069686889648, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 15100 + }, + { + "epoch": 4.87734021949645, + "grad_norm": 0.9666239023208618, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 15110 + }, + { + "epoch": 4.880568108457069, + "grad_norm": 1.1322687864303589, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 15120 + }, + { + "epoch": 4.883795997417689, + "grad_norm": 0.955674409866333, + "learning_rate": 0.0002, + "loss": 0.5381, + "step": 15130 + }, + { + "epoch": 4.887023886378309, + "grad_norm": 1.119413137435913, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 15140 + }, + { + "epoch": 4.890251775338928, + "grad_norm": 0.863646924495697, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 15150 + }, + { + "epoch": 4.893479664299548, + "grad_norm": 1.1823450326919556, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 15160 + }, + { + "epoch": 4.896707553260168, + "grad_norm": 0.8657588958740234, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 15170 + }, + { + "epoch": 4.899935442220787, + "grad_norm": 0.8575737476348877, + "learning_rate": 0.0002, + "loss": 0.5239, + "step": 15180 + }, + { + "epoch": 4.903163331181407, + "grad_norm": 0.9611830711364746, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 15190 + }, + { + "epoch": 4.906391220142027, + "grad_norm": 1.1981453895568848, + "learning_rate": 0.0002, + "loss": 0.5505, + "step": 15200 + }, + { + "epoch": 4.909619109102647, + "grad_norm": 0.9401199221611023, + "learning_rate": 0.0002, + "loss": 0.5582, + "step": 15210 + }, + { + "epoch": 4.912846998063267, + "grad_norm": 0.8420369625091553, + "learning_rate": 0.0002, + "loss": 0.5631, + "step": 15220 + }, + { + "epoch": 4.916074887023886, + "grad_norm": 0.7877969145774841, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 15230 + }, + { + "epoch": 4.919302775984506, + "grad_norm": 0.8988324403762817, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 15240 + }, + { + "epoch": 4.922530664945126, + "grad_norm": 1.1103752851486206, + "learning_rate": 0.0002, + "loss": 0.5274, + "step": 15250 + }, + { + "epoch": 4.925758553905745, + "grad_norm": 0.8874443173408508, + "learning_rate": 0.0002, + "loss": 0.5249, + "step": 15260 + }, + { + "epoch": 4.928986442866366, + "grad_norm": 1.1001752614974976, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 15270 + }, + { + "epoch": 4.9322143318269855, + "grad_norm": 0.9661307334899902, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 15280 + }, + { + "epoch": 4.935442220787605, + "grad_norm": 1.1738812923431396, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 15290 + }, + { + "epoch": 4.938670109748225, + "grad_norm": 0.9773507714271545, + "learning_rate": 0.0002, + "loss": 0.5057, + "step": 15300 + }, + { + "epoch": 4.9418979987088445, + "grad_norm": 1.0735599994659424, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 15310 + }, + { + "epoch": 4.945125887669464, + "grad_norm": 1.0552113056182861, + "learning_rate": 0.0002, + "loss": 0.4996, + "step": 15320 + }, + { + "epoch": 4.948353776630084, + "grad_norm": 1.0900797843933105, + "learning_rate": 0.0002, + "loss": 0.5201, + "step": 15330 + }, + { + "epoch": 4.9515816655907035, + "grad_norm": 1.0908405780792236, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 15340 + }, + { + "epoch": 4.954809554551323, + "grad_norm": 1.010221004486084, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 15350 + }, + { + "epoch": 4.958037443511943, + "grad_norm": 1.0321437120437622, + "learning_rate": 0.0002, + "loss": 0.5423, + "step": 15360 + }, + { + "epoch": 4.961265332472563, + "grad_norm": 0.8430278897285461, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 15370 + }, + { + "epoch": 4.964493221433183, + "grad_norm": 0.8775330185890198, + "learning_rate": 0.0002, + "loss": 0.538, + "step": 15380 + }, + { + "epoch": 4.967721110393803, + "grad_norm": 0.9796988368034363, + "learning_rate": 0.0002, + "loss": 0.5344, + "step": 15390 + }, + { + "epoch": 4.970948999354422, + "grad_norm": 0.8782257437705994, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 15400 + }, + { + "epoch": 4.974176888315042, + "grad_norm": 0.9959840774536133, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 15410 + }, + { + "epoch": 4.9774047772756616, + "grad_norm": 1.0730273723602295, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 15420 + }, + { + "epoch": 4.980632666236281, + "grad_norm": 0.8653680682182312, + "learning_rate": 0.0002, + "loss": 0.5277, + "step": 15430 + }, + { + "epoch": 4.983860555196901, + "grad_norm": 1.0769985914230347, + "learning_rate": 0.0002, + "loss": 0.5301, + "step": 15440 + }, + { + "epoch": 4.987088444157521, + "grad_norm": 1.1336040496826172, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 15450 + }, + { + "epoch": 4.990316333118141, + "grad_norm": 0.9844824075698853, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 15460 + }, + { + "epoch": 4.993544222078761, + "grad_norm": 0.8368769288063049, + "learning_rate": 0.0002, + "loss": 0.5316, + "step": 15470 + }, + { + "epoch": 4.99677211103938, + "grad_norm": 1.0238676071166992, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 15480 + }, + { + "epoch": 5.0, + "grad_norm": 1.064820408821106, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 15490 + }, + { + "epoch": 5.0, + "eval_loss": 1.241918921470642, + "eval_runtime": 158.4099, + "eval_samples_per_second": 4.627, + "eval_steps_per_second": 0.581, + "step": 15490 + }, + { + "epoch": 5.00322788896062, + "grad_norm": 1.1366689205169678, + "learning_rate": 0.0002, + "loss": 0.4554, + "step": 15500 + }, + { + "epoch": 5.006455777921239, + "grad_norm": 1.2548010349273682, + "learning_rate": 0.0002, + "loss": 0.4288, + "step": 15510 + }, + { + "epoch": 5.009683666881859, + "grad_norm": 1.3875139951705933, + "learning_rate": 0.0002, + "loss": 0.4276, + "step": 15520 + }, + { + "epoch": 5.012911555842479, + "grad_norm": 0.9834036231040955, + "learning_rate": 0.0002, + "loss": 0.4198, + "step": 15530 + }, + { + "epoch": 5.016139444803099, + "grad_norm": 1.0737303495407104, + "learning_rate": 0.0002, + "loss": 0.4531, + "step": 15540 + }, + { + "epoch": 5.019367333763719, + "grad_norm": 0.9877859950065613, + "learning_rate": 0.0002, + "loss": 0.4073, + "step": 15550 + }, + { + "epoch": 5.0225952227243384, + "grad_norm": 1.143268346786499, + "learning_rate": 0.0002, + "loss": 0.4459, + "step": 15560 + }, + { + "epoch": 5.025823111684958, + "grad_norm": 1.1206166744232178, + "learning_rate": 0.0002, + "loss": 0.4477, + "step": 15570 + }, + { + "epoch": 5.029051000645578, + "grad_norm": 0.9977272748947144, + "learning_rate": 0.0002, + "loss": 0.4593, + "step": 15580 + }, + { + "epoch": 5.032278889606197, + "grad_norm": 1.3193285465240479, + "learning_rate": 0.0002, + "loss": 0.436, + "step": 15590 + }, + { + "epoch": 5.035506778566817, + "grad_norm": 1.0761713981628418, + "learning_rate": 0.0002, + "loss": 0.4426, + "step": 15600 + }, + { + "epoch": 5.038734667527437, + "grad_norm": 1.1250759363174438, + "learning_rate": 0.0002, + "loss": 0.4701, + "step": 15610 + }, + { + "epoch": 5.041962556488057, + "grad_norm": 1.0414305925369263, + "learning_rate": 0.0002, + "loss": 0.3995, + "step": 15620 + }, + { + "epoch": 5.045190445448677, + "grad_norm": 1.0906853675842285, + "learning_rate": 0.0002, + "loss": 0.4244, + "step": 15630 + }, + { + "epoch": 5.0484183344092965, + "grad_norm": 0.9360867142677307, + "learning_rate": 0.0002, + "loss": 0.441, + "step": 15640 + }, + { + "epoch": 5.051646223369916, + "grad_norm": 0.9078057408332825, + "learning_rate": 0.0002, + "loss": 0.4146, + "step": 15650 + }, + { + "epoch": 5.054874112330536, + "grad_norm": 1.0054848194122314, + "learning_rate": 0.0002, + "loss": 0.4285, + "step": 15660 + }, + { + "epoch": 5.0581020012911555, + "grad_norm": 0.9538215398788452, + "learning_rate": 0.0002, + "loss": 0.417, + "step": 15670 + }, + { + "epoch": 5.061329890251775, + "grad_norm": 1.6312693357467651, + "learning_rate": 0.0002, + "loss": 0.4629, + "step": 15680 + }, + { + "epoch": 5.064557779212395, + "grad_norm": 1.2100921869277954, + "learning_rate": 0.0002, + "loss": 0.3996, + "step": 15690 + }, + { + "epoch": 5.0677856681730145, + "grad_norm": 1.2776238918304443, + "learning_rate": 0.0002, + "loss": 0.4489, + "step": 15700 + }, + { + "epoch": 5.071013557133635, + "grad_norm": 1.0110050439834595, + "learning_rate": 0.0002, + "loss": 0.4728, + "step": 15710 + }, + { + "epoch": 5.074241446094255, + "grad_norm": 1.0896575450897217, + "learning_rate": 0.0002, + "loss": 0.4916, + "step": 15720 + }, + { + "epoch": 5.077469335054874, + "grad_norm": 0.9989936947822571, + "learning_rate": 0.0002, + "loss": 0.4462, + "step": 15730 + }, + { + "epoch": 5.080697224015494, + "grad_norm": 1.0412228107452393, + "learning_rate": 0.0002, + "loss": 0.457, + "step": 15740 + }, + { + "epoch": 5.083925112976114, + "grad_norm": 1.0964457988739014, + "learning_rate": 0.0002, + "loss": 0.4525, + "step": 15750 + }, + { + "epoch": 5.087153001936733, + "grad_norm": 1.1700960397720337, + "learning_rate": 0.0002, + "loss": 0.4539, + "step": 15760 + }, + { + "epoch": 5.090380890897353, + "grad_norm": 0.9515631794929504, + "learning_rate": 0.0002, + "loss": 0.4517, + "step": 15770 + }, + { + "epoch": 5.093608779857973, + "grad_norm": 1.0895006656646729, + "learning_rate": 0.0002, + "loss": 0.4352, + "step": 15780 + }, + { + "epoch": 5.096836668818592, + "grad_norm": 1.041312575340271, + "learning_rate": 0.0002, + "loss": 0.4765, + "step": 15790 + }, + { + "epoch": 5.100064557779213, + "grad_norm": 0.9518465399742126, + "learning_rate": 0.0002, + "loss": 0.4532, + "step": 15800 + }, + { + "epoch": 5.103292446739832, + "grad_norm": 0.8317030668258667, + "learning_rate": 0.0002, + "loss": 0.4187, + "step": 15810 + }, + { + "epoch": 5.106520335700452, + "grad_norm": 1.0933761596679688, + "learning_rate": 0.0002, + "loss": 0.4523, + "step": 15820 + }, + { + "epoch": 5.109748224661072, + "grad_norm": 1.0069324970245361, + "learning_rate": 0.0002, + "loss": 0.4689, + "step": 15830 + }, + { + "epoch": 5.112976113621691, + "grad_norm": 1.1166068315505981, + "learning_rate": 0.0002, + "loss": 0.4773, + "step": 15840 + }, + { + "epoch": 5.116204002582311, + "grad_norm": 1.069992184638977, + "learning_rate": 0.0002, + "loss": 0.4635, + "step": 15850 + }, + { + "epoch": 5.119431891542931, + "grad_norm": 1.3728036880493164, + "learning_rate": 0.0002, + "loss": 0.445, + "step": 15860 + }, + { + "epoch": 5.12265978050355, + "grad_norm": 1.0625780820846558, + "learning_rate": 0.0002, + "loss": 0.4563, + "step": 15870 + }, + { + "epoch": 5.125887669464171, + "grad_norm": 1.090174913406372, + "learning_rate": 0.0002, + "loss": 0.426, + "step": 15880 + }, + { + "epoch": 5.1291155584247905, + "grad_norm": 0.8729526996612549, + "learning_rate": 0.0002, + "loss": 0.457, + "step": 15890 + }, + { + "epoch": 5.13234344738541, + "grad_norm": 0.9561540484428406, + "learning_rate": 0.0002, + "loss": 0.4686, + "step": 15900 + }, + { + "epoch": 5.13557133634603, + "grad_norm": 1.012120246887207, + "learning_rate": 0.0002, + "loss": 0.4266, + "step": 15910 + }, + { + "epoch": 5.1387992253066495, + "grad_norm": 1.1027921438217163, + "learning_rate": 0.0002, + "loss": 0.4484, + "step": 15920 + }, + { + "epoch": 5.142027114267269, + "grad_norm": 1.0878126621246338, + "learning_rate": 0.0002, + "loss": 0.4389, + "step": 15930 + }, + { + "epoch": 5.145255003227889, + "grad_norm": 0.9619103670120239, + "learning_rate": 0.0002, + "loss": 0.4716, + "step": 15940 + }, + { + "epoch": 5.148482892188508, + "grad_norm": 1.1684138774871826, + "learning_rate": 0.0002, + "loss": 0.4071, + "step": 15950 + }, + { + "epoch": 5.151710781149128, + "grad_norm": 1.3379510641098022, + "learning_rate": 0.0002, + "loss": 0.4292, + "step": 15960 + }, + { + "epoch": 5.154938670109749, + "grad_norm": 1.0427496433258057, + "learning_rate": 0.0002, + "loss": 0.4413, + "step": 15970 + }, + { + "epoch": 5.158166559070368, + "grad_norm": 0.9917148351669312, + "learning_rate": 0.0002, + "loss": 0.4665, + "step": 15980 + }, + { + "epoch": 5.161394448030988, + "grad_norm": 1.0899780988693237, + "learning_rate": 0.0002, + "loss": 0.4527, + "step": 15990 + }, + { + "epoch": 5.1646223369916076, + "grad_norm": 0.9251647591590881, + "learning_rate": 0.0002, + "loss": 0.4764, + "step": 16000 + }, + { + "epoch": 5.167850225952227, + "grad_norm": 1.1669172048568726, + "learning_rate": 0.0002, + "loss": 0.5043, + "step": 16010 + }, + { + "epoch": 5.171078114912847, + "grad_norm": 1.2285256385803223, + "learning_rate": 0.0002, + "loss": 0.4726, + "step": 16020 + }, + { + "epoch": 5.1743060038734665, + "grad_norm": 1.0504484176635742, + "learning_rate": 0.0002, + "loss": 0.4312, + "step": 16030 + }, + { + "epoch": 5.177533892834086, + "grad_norm": 1.2829089164733887, + "learning_rate": 0.0002, + "loss": 0.4507, + "step": 16040 + }, + { + "epoch": 5.180761781794706, + "grad_norm": 0.9332743287086487, + "learning_rate": 0.0002, + "loss": 0.4547, + "step": 16050 + }, + { + "epoch": 5.183989670755326, + "grad_norm": 1.0054426193237305, + "learning_rate": 0.0002, + "loss": 0.4211, + "step": 16060 + }, + { + "epoch": 5.187217559715946, + "grad_norm": 1.0049669742584229, + "learning_rate": 0.0002, + "loss": 0.4415, + "step": 16070 + }, + { + "epoch": 5.190445448676566, + "grad_norm": 1.0171366930007935, + "learning_rate": 0.0002, + "loss": 0.4462, + "step": 16080 + }, + { + "epoch": 5.193673337637185, + "grad_norm": 1.234966516494751, + "learning_rate": 0.0002, + "loss": 0.4725, + "step": 16090 + }, + { + "epoch": 5.196901226597805, + "grad_norm": 0.9127960205078125, + "learning_rate": 0.0002, + "loss": 0.4579, + "step": 16100 + }, + { + "epoch": 5.200129115558425, + "grad_norm": 1.153924822807312, + "learning_rate": 0.0002, + "loss": 0.4647, + "step": 16110 + }, + { + "epoch": 5.203357004519044, + "grad_norm": 1.26716947555542, + "learning_rate": 0.0002, + "loss": 0.4826, + "step": 16120 + }, + { + "epoch": 5.206584893479664, + "grad_norm": 1.2438743114471436, + "learning_rate": 0.0002, + "loss": 0.446, + "step": 16130 + }, + { + "epoch": 5.2098127824402845, + "grad_norm": 1.0888392925262451, + "learning_rate": 0.0002, + "loss": 0.4768, + "step": 16140 + }, + { + "epoch": 5.213040671400904, + "grad_norm": 1.1741917133331299, + "learning_rate": 0.0002, + "loss": 0.4508, + "step": 16150 + }, + { + "epoch": 5.216268560361524, + "grad_norm": 0.9508614540100098, + "learning_rate": 0.0002, + "loss": 0.4271, + "step": 16160 + }, + { + "epoch": 5.219496449322143, + "grad_norm": 0.9714716672897339, + "learning_rate": 0.0002, + "loss": 0.4577, + "step": 16170 + }, + { + "epoch": 5.222724338282763, + "grad_norm": 1.2681622505187988, + "learning_rate": 0.0002, + "loss": 0.4636, + "step": 16180 + }, + { + "epoch": 5.225952227243383, + "grad_norm": 1.045871376991272, + "learning_rate": 0.0002, + "loss": 0.4723, + "step": 16190 + }, + { + "epoch": 5.229180116204002, + "grad_norm": 1.0272563695907593, + "learning_rate": 0.0002, + "loss": 0.4467, + "step": 16200 + }, + { + "epoch": 5.232408005164622, + "grad_norm": 1.092901349067688, + "learning_rate": 0.0002, + "loss": 0.4353, + "step": 16210 + }, + { + "epoch": 5.235635894125242, + "grad_norm": 0.9332799315452576, + "learning_rate": 0.0002, + "loss": 0.4588, + "step": 16220 + }, + { + "epoch": 5.238863783085862, + "grad_norm": 1.1728498935699463, + "learning_rate": 0.0002, + "loss": 0.4594, + "step": 16230 + }, + { + "epoch": 5.242091672046482, + "grad_norm": 0.9932476878166199, + "learning_rate": 0.0002, + "loss": 0.4652, + "step": 16240 + }, + { + "epoch": 5.2453195610071015, + "grad_norm": 0.735236406326294, + "learning_rate": 0.0002, + "loss": 0.4469, + "step": 16250 + }, + { + "epoch": 5.248547449967721, + "grad_norm": 1.0289303064346313, + "learning_rate": 0.0002, + "loss": 0.4386, + "step": 16260 + }, + { + "epoch": 5.251775338928341, + "grad_norm": 0.9488231539726257, + "learning_rate": 0.0002, + "loss": 0.4303, + "step": 16270 + }, + { + "epoch": 5.2550032278889605, + "grad_norm": 0.8320055603981018, + "learning_rate": 0.0002, + "loss": 0.4495, + "step": 16280 + }, + { + "epoch": 5.25823111684958, + "grad_norm": 1.2013251781463623, + "learning_rate": 0.0002, + "loss": 0.4224, + "step": 16290 + }, + { + "epoch": 5.2614590058102, + "grad_norm": 1.0649845600128174, + "learning_rate": 0.0002, + "loss": 0.4666, + "step": 16300 + }, + { + "epoch": 5.26468689477082, + "grad_norm": 1.1674472093582153, + "learning_rate": 0.0002, + "loss": 0.4325, + "step": 16310 + }, + { + "epoch": 5.26791478373144, + "grad_norm": 1.3934763669967651, + "learning_rate": 0.0002, + "loss": 0.4482, + "step": 16320 + }, + { + "epoch": 5.27114267269206, + "grad_norm": 0.8427977561950684, + "learning_rate": 0.0002, + "loss": 0.4494, + "step": 16330 + }, + { + "epoch": 5.274370561652679, + "grad_norm": 1.0497093200683594, + "learning_rate": 0.0002, + "loss": 0.4234, + "step": 16340 + }, + { + "epoch": 5.277598450613299, + "grad_norm": 0.8562338352203369, + "learning_rate": 0.0002, + "loss": 0.4337, + "step": 16350 + }, + { + "epoch": 5.280826339573919, + "grad_norm": 1.043920874595642, + "learning_rate": 0.0002, + "loss": 0.4664, + "step": 16360 + }, + { + "epoch": 5.284054228534538, + "grad_norm": 1.0039188861846924, + "learning_rate": 0.0002, + "loss": 0.4463, + "step": 16370 + }, + { + "epoch": 5.287282117495158, + "grad_norm": 0.9414041638374329, + "learning_rate": 0.0002, + "loss": 0.4149, + "step": 16380 + }, + { + "epoch": 5.2905100064557775, + "grad_norm": 1.3346221446990967, + "learning_rate": 0.0002, + "loss": 0.5119, + "step": 16390 + }, + { + "epoch": 5.293737895416398, + "grad_norm": 1.0173962116241455, + "learning_rate": 0.0002, + "loss": 0.4479, + "step": 16400 + }, + { + "epoch": 5.296965784377018, + "grad_norm": 0.7756500244140625, + "learning_rate": 0.0002, + "loss": 0.4538, + "step": 16410 + }, + { + "epoch": 5.300193673337637, + "grad_norm": 1.1185362339019775, + "learning_rate": 0.0002, + "loss": 0.4306, + "step": 16420 + }, + { + "epoch": 5.303421562298257, + "grad_norm": 1.0904899835586548, + "learning_rate": 0.0002, + "loss": 0.5033, + "step": 16430 + }, + { + "epoch": 5.306649451258877, + "grad_norm": 1.0803170204162598, + "learning_rate": 0.0002, + "loss": 0.4887, + "step": 16440 + }, + { + "epoch": 5.309877340219496, + "grad_norm": 1.1492092609405518, + "learning_rate": 0.0002, + "loss": 0.4473, + "step": 16450 + }, + { + "epoch": 5.313105229180116, + "grad_norm": 1.1212135553359985, + "learning_rate": 0.0002, + "loss": 0.4696, + "step": 16460 + }, + { + "epoch": 5.316333118140736, + "grad_norm": 0.8274528980255127, + "learning_rate": 0.0002, + "loss": 0.4438, + "step": 16470 + }, + { + "epoch": 5.319561007101356, + "grad_norm": 1.118891716003418, + "learning_rate": 0.0002, + "loss": 0.468, + "step": 16480 + }, + { + "epoch": 5.322788896061976, + "grad_norm": 1.185945749282837, + "learning_rate": 0.0002, + "loss": 0.4403, + "step": 16490 + }, + { + "epoch": 5.3260167850225955, + "grad_norm": 1.0275214910507202, + "learning_rate": 0.0002, + "loss": 0.4946, + "step": 16500 + }, + { + "epoch": 5.329244673983215, + "grad_norm": 0.9346362352371216, + "learning_rate": 0.0002, + "loss": 0.4612, + "step": 16510 + }, + { + "epoch": 5.332472562943835, + "grad_norm": 0.9600600600242615, + "learning_rate": 0.0002, + "loss": 0.4722, + "step": 16520 + }, + { + "epoch": 5.335700451904454, + "grad_norm": 1.1238188743591309, + "learning_rate": 0.0002, + "loss": 0.4536, + "step": 16530 + }, + { + "epoch": 5.338928340865074, + "grad_norm": 0.8660476207733154, + "learning_rate": 0.0002, + "loss": 0.5025, + "step": 16540 + }, + { + "epoch": 5.342156229825694, + "grad_norm": 0.9869821071624756, + "learning_rate": 0.0002, + "loss": 0.4732, + "step": 16550 + }, + { + "epoch": 5.345384118786313, + "grad_norm": 1.1719090938568115, + "learning_rate": 0.0002, + "loss": 0.4967, + "step": 16560 + }, + { + "epoch": 5.348612007746934, + "grad_norm": 1.0122894048690796, + "learning_rate": 0.0002, + "loss": 0.4563, + "step": 16570 + }, + { + "epoch": 5.351839896707554, + "grad_norm": 1.2431079149246216, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 16580 + }, + { + "epoch": 5.355067785668173, + "grad_norm": 1.4178080558776855, + "learning_rate": 0.0002, + "loss": 0.4708, + "step": 16590 + }, + { + "epoch": 5.358295674628793, + "grad_norm": 1.1895726919174194, + "learning_rate": 0.0002, + "loss": 0.4686, + "step": 16600 + }, + { + "epoch": 5.3615235635894125, + "grad_norm": 1.154392123222351, + "learning_rate": 0.0002, + "loss": 0.475, + "step": 16610 + }, + { + "epoch": 5.364751452550032, + "grad_norm": 0.9207229018211365, + "learning_rate": 0.0002, + "loss": 0.4511, + "step": 16620 + }, + { + "epoch": 5.367979341510652, + "grad_norm": 1.0247414112091064, + "learning_rate": 0.0002, + "loss": 0.4606, + "step": 16630 + }, + { + "epoch": 5.3712072304712715, + "grad_norm": 1.0402202606201172, + "learning_rate": 0.0002, + "loss": 0.4886, + "step": 16640 + }, + { + "epoch": 5.374435119431892, + "grad_norm": 1.1902891397476196, + "learning_rate": 0.0002, + "loss": 0.4903, + "step": 16650 + }, + { + "epoch": 5.377663008392512, + "grad_norm": 0.9572759866714478, + "learning_rate": 0.0002, + "loss": 0.4583, + "step": 16660 + }, + { + "epoch": 5.380890897353131, + "grad_norm": 0.9968860149383545, + "learning_rate": 0.0002, + "loss": 0.4636, + "step": 16670 + }, + { + "epoch": 5.384118786313751, + "grad_norm": 1.2468547821044922, + "learning_rate": 0.0002, + "loss": 0.477, + "step": 16680 + }, + { + "epoch": 5.387346675274371, + "grad_norm": 1.154661774635315, + "learning_rate": 0.0002, + "loss": 0.5223, + "step": 16690 + }, + { + "epoch": 5.39057456423499, + "grad_norm": 0.8837044835090637, + "learning_rate": 0.0002, + "loss": 0.4637, + "step": 16700 + }, + { + "epoch": 5.39380245319561, + "grad_norm": 1.0317907333374023, + "learning_rate": 0.0002, + "loss": 0.4744, + "step": 16710 + }, + { + "epoch": 5.39703034215623, + "grad_norm": 0.9811587929725647, + "learning_rate": 0.0002, + "loss": 0.4831, + "step": 16720 + }, + { + "epoch": 5.400258231116849, + "grad_norm": 0.9487450122833252, + "learning_rate": 0.0002, + "loss": 0.4739, + "step": 16730 + }, + { + "epoch": 5.403486120077469, + "grad_norm": 1.0540274381637573, + "learning_rate": 0.0002, + "loss": 0.4574, + "step": 16740 + }, + { + "epoch": 5.406714009038089, + "grad_norm": 1.028363585472107, + "learning_rate": 0.0002, + "loss": 0.4709, + "step": 16750 + }, + { + "epoch": 5.409941897998709, + "grad_norm": 1.0200704336166382, + "learning_rate": 0.0002, + "loss": 0.468, + "step": 16760 + }, + { + "epoch": 5.413169786959329, + "grad_norm": 1.0330981016159058, + "learning_rate": 0.0002, + "loss": 0.4383, + "step": 16770 + }, + { + "epoch": 5.416397675919948, + "grad_norm": 1.320875644683838, + "learning_rate": 0.0002, + "loss": 0.4645, + "step": 16780 + }, + { + "epoch": 5.419625564880568, + "grad_norm": 0.9838143587112427, + "learning_rate": 0.0002, + "loss": 0.4601, + "step": 16790 + }, + { + "epoch": 5.422853453841188, + "grad_norm": 1.1006578207015991, + "learning_rate": 0.0002, + "loss": 0.4835, + "step": 16800 + }, + { + "epoch": 5.426081342801807, + "grad_norm": 1.099174976348877, + "learning_rate": 0.0002, + "loss": 0.4871, + "step": 16810 + }, + { + "epoch": 5.429309231762427, + "grad_norm": 1.0632189512252808, + "learning_rate": 0.0002, + "loss": 0.4773, + "step": 16820 + }, + { + "epoch": 5.4325371207230475, + "grad_norm": 0.9673194885253906, + "learning_rate": 0.0002, + "loss": 0.4732, + "step": 16830 + }, + { + "epoch": 5.435765009683667, + "grad_norm": 0.853013813495636, + "learning_rate": 0.0002, + "loss": 0.4731, + "step": 16840 + }, + { + "epoch": 5.438992898644287, + "grad_norm": 1.0261728763580322, + "learning_rate": 0.0002, + "loss": 0.4856, + "step": 16850 + }, + { + "epoch": 5.4422207876049065, + "grad_norm": 1.1642370223999023, + "learning_rate": 0.0002, + "loss": 0.4729, + "step": 16860 + }, + { + "epoch": 5.445448676565526, + "grad_norm": 0.8715673685073853, + "learning_rate": 0.0002, + "loss": 0.4751, + "step": 16870 + }, + { + "epoch": 5.448676565526146, + "grad_norm": 0.905746579170227, + "learning_rate": 0.0002, + "loss": 0.4566, + "step": 16880 + }, + { + "epoch": 5.451904454486765, + "grad_norm": 1.1051915884017944, + "learning_rate": 0.0002, + "loss": 0.4536, + "step": 16890 + }, + { + "epoch": 5.455132343447385, + "grad_norm": 1.0781478881835938, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 16900 + }, + { + "epoch": 5.458360232408005, + "grad_norm": 1.1168911457061768, + "learning_rate": 0.0002, + "loss": 0.4655, + "step": 16910 + }, + { + "epoch": 5.461588121368625, + "grad_norm": 1.1150046586990356, + "learning_rate": 0.0002, + "loss": 0.4624, + "step": 16920 + }, + { + "epoch": 5.464816010329245, + "grad_norm": 0.9862499833106995, + "learning_rate": 0.0002, + "loss": 0.4849, + "step": 16930 + }, + { + "epoch": 5.468043899289865, + "grad_norm": 1.5416640043258667, + "learning_rate": 0.0002, + "loss": 0.47, + "step": 16940 + }, + { + "epoch": 5.471271788250484, + "grad_norm": 0.8960899710655212, + "learning_rate": 0.0002, + "loss": 0.4508, + "step": 16950 + }, + { + "epoch": 5.474499677211104, + "grad_norm": 0.9796477556228638, + "learning_rate": 0.0002, + "loss": 0.5002, + "step": 16960 + }, + { + "epoch": 5.4777275661717235, + "grad_norm": 0.9526587128639221, + "learning_rate": 0.0002, + "loss": 0.4939, + "step": 16970 + }, + { + "epoch": 5.480955455132343, + "grad_norm": 1.2373039722442627, + "learning_rate": 0.0002, + "loss": 0.4807, + "step": 16980 + }, + { + "epoch": 5.484183344092963, + "grad_norm": 1.1860566139221191, + "learning_rate": 0.0002, + "loss": 0.4642, + "step": 16990 + }, + { + "epoch": 5.487411233053583, + "grad_norm": 1.477345585823059, + "learning_rate": 0.0002, + "loss": 0.4929, + "step": 17000 + }, + { + "epoch": 5.490639122014203, + "grad_norm": 1.1029295921325684, + "learning_rate": 0.0002, + "loss": 0.4566, + "step": 17010 + }, + { + "epoch": 5.493867010974823, + "grad_norm": 1.1416981220245361, + "learning_rate": 0.0002, + "loss": 0.487, + "step": 17020 + }, + { + "epoch": 5.497094899935442, + "grad_norm": 1.1647989749908447, + "learning_rate": 0.0002, + "loss": 0.475, + "step": 17030 + }, + { + "epoch": 5.500322788896062, + "grad_norm": 1.1297032833099365, + "learning_rate": 0.0002, + "loss": 0.4644, + "step": 17040 + }, + { + "epoch": 5.503550677856682, + "grad_norm": 0.9764689207077026, + "learning_rate": 0.0002, + "loss": 0.4885, + "step": 17050 + }, + { + "epoch": 5.506778566817301, + "grad_norm": 1.038161039352417, + "learning_rate": 0.0002, + "loss": 0.4789, + "step": 17060 + }, + { + "epoch": 5.510006455777921, + "grad_norm": 1.1417886018753052, + "learning_rate": 0.0002, + "loss": 0.4467, + "step": 17070 + }, + { + "epoch": 5.513234344738541, + "grad_norm": 0.9300898313522339, + "learning_rate": 0.0002, + "loss": 0.4782, + "step": 17080 + }, + { + "epoch": 5.516462233699161, + "grad_norm": 1.0295016765594482, + "learning_rate": 0.0002, + "loss": 0.4805, + "step": 17090 + }, + { + "epoch": 5.519690122659781, + "grad_norm": 1.1273008584976196, + "learning_rate": 0.0002, + "loss": 0.4663, + "step": 17100 + }, + { + "epoch": 5.5229180116204, + "grad_norm": 0.9542737007141113, + "learning_rate": 0.0002, + "loss": 0.4897, + "step": 17110 + }, + { + "epoch": 5.52614590058102, + "grad_norm": 1.34589421749115, + "learning_rate": 0.0002, + "loss": 0.51, + "step": 17120 + }, + { + "epoch": 5.52937378954164, + "grad_norm": 0.9889675378799438, + "learning_rate": 0.0002, + "loss": 0.467, + "step": 17130 + }, + { + "epoch": 5.532601678502259, + "grad_norm": 1.25719153881073, + "learning_rate": 0.0002, + "loss": 0.4752, + "step": 17140 + }, + { + "epoch": 5.535829567462879, + "grad_norm": 1.2511073350906372, + "learning_rate": 0.0002, + "loss": 0.4609, + "step": 17150 + }, + { + "epoch": 5.539057456423499, + "grad_norm": 1.1993521451950073, + "learning_rate": 0.0002, + "loss": 0.4992, + "step": 17160 + }, + { + "epoch": 5.542285345384119, + "grad_norm": 1.1394526958465576, + "learning_rate": 0.0002, + "loss": 0.4986, + "step": 17170 + }, + { + "epoch": 5.545513234344739, + "grad_norm": 1.0435349941253662, + "learning_rate": 0.0002, + "loss": 0.5284, + "step": 17180 + }, + { + "epoch": 5.5487411233053585, + "grad_norm": 1.120940089225769, + "learning_rate": 0.0002, + "loss": 0.4934, + "step": 17190 + }, + { + "epoch": 5.551969012265978, + "grad_norm": 1.0906445980072021, + "learning_rate": 0.0002, + "loss": 0.4704, + "step": 17200 + }, + { + "epoch": 5.555196901226598, + "grad_norm": 0.8883966207504272, + "learning_rate": 0.0002, + "loss": 0.4896, + "step": 17210 + }, + { + "epoch": 5.5584247901872175, + "grad_norm": 1.3078752756118774, + "learning_rate": 0.0002, + "loss": 0.4696, + "step": 17220 + }, + { + "epoch": 5.561652679147837, + "grad_norm": 1.0224416255950928, + "learning_rate": 0.0002, + "loss": 0.4805, + "step": 17230 + }, + { + "epoch": 5.564880568108457, + "grad_norm": 1.242518663406372, + "learning_rate": 0.0002, + "loss": 0.47, + "step": 17240 + }, + { + "epoch": 5.568108457069076, + "grad_norm": 1.2328250408172607, + "learning_rate": 0.0002, + "loss": 0.4708, + "step": 17250 + }, + { + "epoch": 5.571336346029697, + "grad_norm": 1.2186611890792847, + "learning_rate": 0.0002, + "loss": 0.4685, + "step": 17260 + }, + { + "epoch": 5.574564234990317, + "grad_norm": 1.0947459936141968, + "learning_rate": 0.0002, + "loss": 0.4688, + "step": 17270 + }, + { + "epoch": 5.577792123950936, + "grad_norm": 1.075279951095581, + "learning_rate": 0.0002, + "loss": 0.506, + "step": 17280 + }, + { + "epoch": 5.581020012911556, + "grad_norm": 1.0316804647445679, + "learning_rate": 0.0002, + "loss": 0.478, + "step": 17290 + }, + { + "epoch": 5.584247901872176, + "grad_norm": 1.1077373027801514, + "learning_rate": 0.0002, + "loss": 0.478, + "step": 17300 + }, + { + "epoch": 5.587475790832795, + "grad_norm": 1.219228744506836, + "learning_rate": 0.0002, + "loss": 0.4857, + "step": 17310 + }, + { + "epoch": 5.590703679793415, + "grad_norm": 1.026361346244812, + "learning_rate": 0.0002, + "loss": 0.4465, + "step": 17320 + }, + { + "epoch": 5.5939315687540345, + "grad_norm": 1.1621283292770386, + "learning_rate": 0.0002, + "loss": 0.4831, + "step": 17330 + }, + { + "epoch": 5.597159457714655, + "grad_norm": 1.0177470445632935, + "learning_rate": 0.0002, + "loss": 0.4706, + "step": 17340 + }, + { + "epoch": 5.600387346675275, + "grad_norm": 1.0625319480895996, + "learning_rate": 0.0002, + "loss": 0.4961, + "step": 17350 + }, + { + "epoch": 5.603615235635894, + "grad_norm": 1.148815393447876, + "learning_rate": 0.0002, + "loss": 0.484, + "step": 17360 + }, + { + "epoch": 5.606843124596514, + "grad_norm": 1.0571802854537964, + "learning_rate": 0.0002, + "loss": 0.4804, + "step": 17370 + }, + { + "epoch": 5.610071013557134, + "grad_norm": 1.2069389820098877, + "learning_rate": 0.0002, + "loss": 0.5202, + "step": 17380 + }, + { + "epoch": 5.613298902517753, + "grad_norm": 1.407530426979065, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 17390 + }, + { + "epoch": 5.616526791478373, + "grad_norm": 1.247060775756836, + "learning_rate": 0.0002, + "loss": 0.4688, + "step": 17400 + }, + { + "epoch": 5.619754680438993, + "grad_norm": 1.431684136390686, + "learning_rate": 0.0002, + "loss": 0.4359, + "step": 17410 + }, + { + "epoch": 5.622982569399612, + "grad_norm": 1.0520552396774292, + "learning_rate": 0.0002, + "loss": 0.5244, + "step": 17420 + }, + { + "epoch": 5.626210458360232, + "grad_norm": 1.0593537092208862, + "learning_rate": 0.0002, + "loss": 0.4993, + "step": 17430 + }, + { + "epoch": 5.6294383473208525, + "grad_norm": 1.4414515495300293, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 17440 + }, + { + "epoch": 5.632666236281472, + "grad_norm": 1.0902460813522339, + "learning_rate": 0.0002, + "loss": 0.4761, + "step": 17450 + }, + { + "epoch": 5.635894125242092, + "grad_norm": 0.890944242477417, + "learning_rate": 0.0002, + "loss": 0.4737, + "step": 17460 + }, + { + "epoch": 5.639122014202711, + "grad_norm": 1.035675287246704, + "learning_rate": 0.0002, + "loss": 0.4706, + "step": 17470 + }, + { + "epoch": 5.642349903163331, + "grad_norm": 0.9792264103889465, + "learning_rate": 0.0002, + "loss": 0.484, + "step": 17480 + }, + { + "epoch": 5.645577792123951, + "grad_norm": 1.1888220310211182, + "learning_rate": 0.0002, + "loss": 0.4753, + "step": 17490 + }, + { + "epoch": 5.64880568108457, + "grad_norm": 1.0169143676757812, + "learning_rate": 0.0002, + "loss": 0.5047, + "step": 17500 + }, + { + "epoch": 5.652033570045191, + "grad_norm": 0.9812449216842651, + "learning_rate": 0.0002, + "loss": 0.4919, + "step": 17510 + }, + { + "epoch": 5.655261459005811, + "grad_norm": 1.0509105920791626, + "learning_rate": 0.0002, + "loss": 0.4879, + "step": 17520 + }, + { + "epoch": 5.65848934796643, + "grad_norm": 0.9047426581382751, + "learning_rate": 0.0002, + "loss": 0.4695, + "step": 17530 + }, + { + "epoch": 5.66171723692705, + "grad_norm": 1.2393709421157837, + "learning_rate": 0.0002, + "loss": 0.4712, + "step": 17540 + }, + { + "epoch": 5.6649451258876695, + "grad_norm": 1.1098991632461548, + "learning_rate": 0.0002, + "loss": 0.5012, + "step": 17550 + }, + { + "epoch": 5.668173014848289, + "grad_norm": 0.8181570768356323, + "learning_rate": 0.0002, + "loss": 0.4499, + "step": 17560 + }, + { + "epoch": 5.671400903808909, + "grad_norm": 0.9676381945610046, + "learning_rate": 0.0002, + "loss": 0.4973, + "step": 17570 + }, + { + "epoch": 5.6746287927695285, + "grad_norm": 1.1225934028625488, + "learning_rate": 0.0002, + "loss": 0.5058, + "step": 17580 + }, + { + "epoch": 5.677856681730148, + "grad_norm": 1.6259925365447998, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 17590 + }, + { + "epoch": 5.681084570690768, + "grad_norm": 0.7751404643058777, + "learning_rate": 0.0002, + "loss": 0.4613, + "step": 17600 + }, + { + "epoch": 5.684312459651388, + "grad_norm": 0.8478589057922363, + "learning_rate": 0.0002, + "loss": 0.4895, + "step": 17610 + }, + { + "epoch": 5.687540348612008, + "grad_norm": 1.2887113094329834, + "learning_rate": 0.0002, + "loss": 0.4492, + "step": 17620 + }, + { + "epoch": 5.690768237572628, + "grad_norm": 1.1452652215957642, + "learning_rate": 0.0002, + "loss": 0.4792, + "step": 17630 + }, + { + "epoch": 5.693996126533247, + "grad_norm": 1.0370417833328247, + "learning_rate": 0.0002, + "loss": 0.4889, + "step": 17640 + }, + { + "epoch": 5.697224015493867, + "grad_norm": 1.1358870267868042, + "learning_rate": 0.0002, + "loss": 0.535, + "step": 17650 + }, + { + "epoch": 5.700451904454487, + "grad_norm": 1.2772479057312012, + "learning_rate": 0.0002, + "loss": 0.4753, + "step": 17660 + }, + { + "epoch": 5.703679793415106, + "grad_norm": 1.182812213897705, + "learning_rate": 0.0002, + "loss": 0.4492, + "step": 17670 + }, + { + "epoch": 5.706907682375727, + "grad_norm": 1.099074125289917, + "learning_rate": 0.0002, + "loss": 0.5025, + "step": 17680 + }, + { + "epoch": 5.710135571336346, + "grad_norm": 0.938634991645813, + "learning_rate": 0.0002, + "loss": 0.4945, + "step": 17690 + }, + { + "epoch": 5.713363460296966, + "grad_norm": 0.9385238885879517, + "learning_rate": 0.0002, + "loss": 0.491, + "step": 17700 + }, + { + "epoch": 5.716591349257586, + "grad_norm": 1.1486014127731323, + "learning_rate": 0.0002, + "loss": 0.4849, + "step": 17710 + }, + { + "epoch": 5.719819238218205, + "grad_norm": 0.9433078169822693, + "learning_rate": 0.0002, + "loss": 0.5043, + "step": 17720 + }, + { + "epoch": 5.723047127178825, + "grad_norm": 1.02472722530365, + "learning_rate": 0.0002, + "loss": 0.4543, + "step": 17730 + }, + { + "epoch": 5.726275016139445, + "grad_norm": 0.9360876679420471, + "learning_rate": 0.0002, + "loss": 0.4631, + "step": 17740 + }, + { + "epoch": 5.729502905100064, + "grad_norm": 1.0481483936309814, + "learning_rate": 0.0002, + "loss": 0.4947, + "step": 17750 + }, + { + "epoch": 5.732730794060684, + "grad_norm": 1.0032516717910767, + "learning_rate": 0.0002, + "loss": 0.4763, + "step": 17760 + }, + { + "epoch": 5.735958683021304, + "grad_norm": 0.8908069729804993, + "learning_rate": 0.0002, + "loss": 0.4819, + "step": 17770 + }, + { + "epoch": 5.739186571981924, + "grad_norm": 1.0679123401641846, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 17780 + }, + { + "epoch": 5.742414460942544, + "grad_norm": 1.0448014736175537, + "learning_rate": 0.0002, + "loss": 0.4818, + "step": 17790 + }, + { + "epoch": 5.7456423499031635, + "grad_norm": 1.0433847904205322, + "learning_rate": 0.0002, + "loss": 0.4869, + "step": 17800 + }, + { + "epoch": 5.748870238863783, + "grad_norm": 1.000291109085083, + "learning_rate": 0.0002, + "loss": 0.5243, + "step": 17810 + }, + { + "epoch": 5.752098127824403, + "grad_norm": 1.1238429546356201, + "learning_rate": 0.0002, + "loss": 0.4891, + "step": 17820 + }, + { + "epoch": 5.755326016785022, + "grad_norm": 1.09062659740448, + "learning_rate": 0.0002, + "loss": 0.4905, + "step": 17830 + }, + { + "epoch": 5.758553905745642, + "grad_norm": 0.8538689613342285, + "learning_rate": 0.0002, + "loss": 0.4883, + "step": 17840 + }, + { + "epoch": 5.761781794706262, + "grad_norm": 1.3872947692871094, + "learning_rate": 0.0002, + "loss": 0.4989, + "step": 17850 + }, + { + "epoch": 5.765009683666882, + "grad_norm": 1.0578876733779907, + "learning_rate": 0.0002, + "loss": 0.4707, + "step": 17860 + }, + { + "epoch": 5.768237572627502, + "grad_norm": 1.1761705875396729, + "learning_rate": 0.0002, + "loss": 0.5281, + "step": 17870 + }, + { + "epoch": 5.771465461588122, + "grad_norm": 1.1223368644714355, + "learning_rate": 0.0002, + "loss": 0.4802, + "step": 17880 + }, + { + "epoch": 5.774693350548741, + "grad_norm": 1.2484360933303833, + "learning_rate": 0.0002, + "loss": 0.505, + "step": 17890 + }, + { + "epoch": 5.777921239509361, + "grad_norm": 1.2461199760437012, + "learning_rate": 0.0002, + "loss": 0.4786, + "step": 17900 + }, + { + "epoch": 5.7811491284699805, + "grad_norm": 1.1718299388885498, + "learning_rate": 0.0002, + "loss": 0.4933, + "step": 17910 + }, + { + "epoch": 5.7843770174306, + "grad_norm": 0.9896837472915649, + "learning_rate": 0.0002, + "loss": 0.471, + "step": 17920 + }, + { + "epoch": 5.78760490639122, + "grad_norm": 1.3759760856628418, + "learning_rate": 0.0002, + "loss": 0.4808, + "step": 17930 + }, + { + "epoch": 5.7908327953518395, + "grad_norm": 1.0596622228622437, + "learning_rate": 0.0002, + "loss": 0.4847, + "step": 17940 + }, + { + "epoch": 5.79406068431246, + "grad_norm": 0.9292021989822388, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 17950 + }, + { + "epoch": 5.79728857327308, + "grad_norm": 0.8786653876304626, + "learning_rate": 0.0002, + "loss": 0.4783, + "step": 17960 + }, + { + "epoch": 5.800516462233699, + "grad_norm": 1.2087152004241943, + "learning_rate": 0.0002, + "loss": 0.4598, + "step": 17970 + }, + { + "epoch": 5.803744351194319, + "grad_norm": 1.1643104553222656, + "learning_rate": 0.0002, + "loss": 0.4953, + "step": 17980 + }, + { + "epoch": 5.806972240154939, + "grad_norm": 0.971613347530365, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 17990 + }, + { + "epoch": 5.810200129115558, + "grad_norm": 1.306227684020996, + "learning_rate": 0.0002, + "loss": 0.5094, + "step": 18000 + }, + { + "epoch": 5.813428018076178, + "grad_norm": 1.3665502071380615, + "learning_rate": 0.0002, + "loss": 0.5392, + "step": 18010 + }, + { + "epoch": 5.816655907036798, + "grad_norm": 1.2227312326431274, + "learning_rate": 0.0002, + "loss": 0.4887, + "step": 18020 + }, + { + "epoch": 5.819883795997418, + "grad_norm": 1.180694818496704, + "learning_rate": 0.0002, + "loss": 0.5203, + "step": 18030 + }, + { + "epoch": 5.823111684958038, + "grad_norm": 1.1045362949371338, + "learning_rate": 0.0002, + "loss": 0.4962, + "step": 18040 + }, + { + "epoch": 5.826339573918657, + "grad_norm": 1.3828954696655273, + "learning_rate": 0.0002, + "loss": 0.4969, + "step": 18050 + }, + { + "epoch": 5.829567462879277, + "grad_norm": 1.305102825164795, + "learning_rate": 0.0002, + "loss": 0.5493, + "step": 18060 + }, + { + "epoch": 5.832795351839897, + "grad_norm": 1.2708743810653687, + "learning_rate": 0.0002, + "loss": 0.4844, + "step": 18070 + }, + { + "epoch": 5.836023240800516, + "grad_norm": 1.0344188213348389, + "learning_rate": 0.0002, + "loss": 0.4834, + "step": 18080 + }, + { + "epoch": 5.839251129761136, + "grad_norm": 1.1321724653244019, + "learning_rate": 0.0002, + "loss": 0.5088, + "step": 18090 + }, + { + "epoch": 5.842479018721756, + "grad_norm": 1.2162611484527588, + "learning_rate": 0.0002, + "loss": 0.4888, + "step": 18100 + }, + { + "epoch": 5.845706907682375, + "grad_norm": 1.427612543106079, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 18110 + }, + { + "epoch": 5.848934796642995, + "grad_norm": 1.4391452074050903, + "learning_rate": 0.0002, + "loss": 0.5339, + "step": 18120 + }, + { + "epoch": 5.8521626856036155, + "grad_norm": 1.1548216342926025, + "learning_rate": 0.0002, + "loss": 0.528, + "step": 18130 + }, + { + "epoch": 5.855390574564235, + "grad_norm": 1.2336437702178955, + "learning_rate": 0.0002, + "loss": 0.4779, + "step": 18140 + }, + { + "epoch": 5.858618463524855, + "grad_norm": 1.254661202430725, + "learning_rate": 0.0002, + "loss": 0.4844, + "step": 18150 + }, + { + "epoch": 5.8618463524854745, + "grad_norm": 0.8326491117477417, + "learning_rate": 0.0002, + "loss": 0.5201, + "step": 18160 + }, + { + "epoch": 5.865074241446094, + "grad_norm": 1.0907988548278809, + "learning_rate": 0.0002, + "loss": 0.5076, + "step": 18170 + }, + { + "epoch": 5.868302130406714, + "grad_norm": 0.9896568655967712, + "learning_rate": 0.0002, + "loss": 0.48, + "step": 18180 + }, + { + "epoch": 5.871530019367333, + "grad_norm": 0.9440065026283264, + "learning_rate": 0.0002, + "loss": 0.4628, + "step": 18190 + }, + { + "epoch": 5.874757908327954, + "grad_norm": 1.09321129322052, + "learning_rate": 0.0002, + "loss": 0.5265, + "step": 18200 + }, + { + "epoch": 5.877985797288574, + "grad_norm": 1.2588142156600952, + "learning_rate": 0.0002, + "loss": 0.4737, + "step": 18210 + }, + { + "epoch": 5.881213686249193, + "grad_norm": 1.1731587648391724, + "learning_rate": 0.0002, + "loss": 0.475, + "step": 18220 + }, + { + "epoch": 5.884441575209813, + "grad_norm": 0.9904444217681885, + "learning_rate": 0.0002, + "loss": 0.504, + "step": 18230 + }, + { + "epoch": 5.887669464170433, + "grad_norm": 0.8985799551010132, + "learning_rate": 0.0002, + "loss": 0.4842, + "step": 18240 + }, + { + "epoch": 5.890897353131052, + "grad_norm": 1.0182441473007202, + "learning_rate": 0.0002, + "loss": 0.4878, + "step": 18250 + }, + { + "epoch": 5.894125242091672, + "grad_norm": 1.1574701070785522, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 18260 + }, + { + "epoch": 5.8973531310522915, + "grad_norm": 1.1776602268218994, + "learning_rate": 0.0002, + "loss": 0.5, + "step": 18270 + }, + { + "epoch": 5.900581020012911, + "grad_norm": 1.4951308965682983, + "learning_rate": 0.0002, + "loss": 0.5245, + "step": 18280 + }, + { + "epoch": 5.903808908973531, + "grad_norm": 1.1440261602401733, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 18290 + }, + { + "epoch": 5.907036797934151, + "grad_norm": 0.9925196170806885, + "learning_rate": 0.0002, + "loss": 0.4868, + "step": 18300 + }, + { + "epoch": 5.910264686894771, + "grad_norm": 1.098615288734436, + "learning_rate": 0.0002, + "loss": 0.5142, + "step": 18310 + }, + { + "epoch": 5.913492575855391, + "grad_norm": 1.0030080080032349, + "learning_rate": 0.0002, + "loss": 0.5184, + "step": 18320 + }, + { + "epoch": 5.91672046481601, + "grad_norm": 0.9890318512916565, + "learning_rate": 0.0002, + "loss": 0.474, + "step": 18330 + }, + { + "epoch": 5.91994835377663, + "grad_norm": 1.2209392786026, + "learning_rate": 0.0002, + "loss": 0.5125, + "step": 18340 + }, + { + "epoch": 5.92317624273725, + "grad_norm": 1.108933925628662, + "learning_rate": 0.0002, + "loss": 0.4634, + "step": 18350 + }, + { + "epoch": 5.926404131697869, + "grad_norm": 1.086024522781372, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 18360 + }, + { + "epoch": 5.92963202065849, + "grad_norm": 1.0061167478561401, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 18370 + }, + { + "epoch": 5.9328599096191095, + "grad_norm": 0.9445858597755432, + "learning_rate": 0.0002, + "loss": 0.4848, + "step": 18380 + }, + { + "epoch": 5.936087798579729, + "grad_norm": 0.9556859135627747, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 18390 + }, + { + "epoch": 5.939315687540349, + "grad_norm": 1.154168963432312, + "learning_rate": 0.0002, + "loss": 0.4966, + "step": 18400 + }, + { + "epoch": 5.942543576500968, + "grad_norm": 1.0495831966400146, + "learning_rate": 0.0002, + "loss": 0.4836, + "step": 18410 + }, + { + "epoch": 5.945771465461588, + "grad_norm": 1.0717304944992065, + "learning_rate": 0.0002, + "loss": 0.5021, + "step": 18420 + }, + { + "epoch": 5.948999354422208, + "grad_norm": 1.06618332862854, + "learning_rate": 0.0002, + "loss": 0.4794, + "step": 18430 + }, + { + "epoch": 5.952227243382827, + "grad_norm": 0.9567165374755859, + "learning_rate": 0.0002, + "loss": 0.5011, + "step": 18440 + }, + { + "epoch": 5.955455132343447, + "grad_norm": 1.0306249856948853, + "learning_rate": 0.0002, + "loss": 0.485, + "step": 18450 + }, + { + "epoch": 5.958683021304067, + "grad_norm": 1.1879968643188477, + "learning_rate": 0.0002, + "loss": 0.4948, + "step": 18460 + }, + { + "epoch": 5.961910910264687, + "grad_norm": 1.3177233934402466, + "learning_rate": 0.0002, + "loss": 0.5185, + "step": 18470 + }, + { + "epoch": 5.965138799225307, + "grad_norm": 1.0945817232131958, + "learning_rate": 0.0002, + "loss": 0.4966, + "step": 18480 + }, + { + "epoch": 5.9683666881859265, + "grad_norm": 1.029414415359497, + "learning_rate": 0.0002, + "loss": 0.5196, + "step": 18490 + }, + { + "epoch": 5.971594577146546, + "grad_norm": 1.2266209125518799, + "learning_rate": 0.0002, + "loss": 0.5154, + "step": 18500 + }, + { + "epoch": 5.974822466107166, + "grad_norm": 1.2167150974273682, + "learning_rate": 0.0002, + "loss": 0.4914, + "step": 18510 + }, + { + "epoch": 5.9780503550677855, + "grad_norm": 0.9941056966781616, + "learning_rate": 0.0002, + "loss": 0.466, + "step": 18520 + }, + { + "epoch": 5.981278244028405, + "grad_norm": 1.4244859218597412, + "learning_rate": 0.0002, + "loss": 0.5037, + "step": 18530 + }, + { + "epoch": 5.984506132989026, + "grad_norm": 0.8976260423660278, + "learning_rate": 0.0002, + "loss": 0.4902, + "step": 18540 + }, + { + "epoch": 5.987734021949645, + "grad_norm": 1.0162699222564697, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 18550 + }, + { + "epoch": 5.990961910910265, + "grad_norm": 1.196677803993225, + "learning_rate": 0.0002, + "loss": 0.5138, + "step": 18560 + }, + { + "epoch": 5.994189799870885, + "grad_norm": 1.163403868675232, + "learning_rate": 0.0002, + "loss": 0.4626, + "step": 18570 + }, + { + "epoch": 5.997417688831504, + "grad_norm": 1.010205626487732, + "learning_rate": 0.0002, + "loss": 0.5105, + "step": 18580 + }, + { + "epoch": 6.0, + "eval_loss": 1.2861483097076416, + "eval_runtime": 163.2683, + "eval_samples_per_second": 4.49, + "eval_steps_per_second": 0.563, + "step": 18588 + } + ], + "logging_steps": 10, + "max_steps": 24784, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.602110197822915e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f46f2b8e8752b125339f36f172c3878be4cdb152 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-18588/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfc2a69e44a51edf5586ebed4b7ee915a23244c18c1f59e580471e4c9becfa98 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aebdf5e01be09b15298016b4c240e1ae764d0dce --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb6f5d8ad56f754da442fc192ae8830419b9eb526b934098b0cd6f11c8d0a6b7 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c14c0a891aa8ca6fc141d5c237e556098e1d5d2 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ddf8d0dc12f2a503a4a69a5a1e1a2f55c1f6a856de10773c1ccf17007797010 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..fca0e9db3900354bf6f5075c59db5e4dbed1be48 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:735044262f5cd4c9212b070eba9e3b48f5956dc3bc0ad925eb42a1ab773660b0 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf054cc8634b23dcbbd3dbfc76cfa85aecea7a4f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d825207f025b24b4ad7fe2716ef9578180a9deca23258a6220df40fd47443aa2 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ccc1722eeedbe13c8de04c32fed0656d26307d34 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/trainer_state.json @@ -0,0 +1,15265 @@ +{ + "best_metric": 1.0958120822906494, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098", + "epoch": 7.0, + "eval_steps": 10, + "global_step": 21686, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032278889606197547, + "grad_norm": 0.7092075347900391, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 10 + }, + { + "epoch": 0.006455777921239509, + "grad_norm": 0.6900479793548584, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 20 + }, + { + "epoch": 0.009683666881859263, + "grad_norm": 0.6788288950920105, + "learning_rate": 0.0002, + "loss": 0.9807, + "step": 30 + }, + { + "epoch": 0.012911555842479019, + "grad_norm": 0.5590243339538574, + "learning_rate": 0.0002, + "loss": 0.9385, + "step": 40 + }, + { + "epoch": 0.016139444803098774, + "grad_norm": 0.5136010646820068, + "learning_rate": 0.0002, + "loss": 0.931, + "step": 50 + }, + { + "epoch": 0.019367333763718526, + "grad_norm": 0.45298320055007935, + "learning_rate": 0.0002, + "loss": 0.8896, + "step": 60 + }, + { + "epoch": 0.022595222724338282, + "grad_norm": 0.5917162299156189, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 70 + }, + { + "epoch": 0.025823111684958037, + "grad_norm": 0.4414856433868408, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 80 + }, + { + "epoch": 0.029051000645577793, + "grad_norm": 0.5547978281974792, + "learning_rate": 0.0002, + "loss": 0.8419, + "step": 90 + }, + { + "epoch": 0.03227888960619755, + "grad_norm": 0.5271288156509399, + "learning_rate": 0.0002, + "loss": 0.8987, + "step": 100 + }, + { + "epoch": 0.035506778566817304, + "grad_norm": 0.5506119728088379, + "learning_rate": 0.0002, + "loss": 0.8543, + "step": 110 + }, + { + "epoch": 0.03873466752743705, + "grad_norm": 0.5579327940940857, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 120 + }, + { + "epoch": 0.04196255648805681, + "grad_norm": 0.5099632740020752, + "learning_rate": 0.0002, + "loss": 0.8826, + "step": 130 + }, + { + "epoch": 0.045190445448676564, + "grad_norm": 0.40396833419799805, + "learning_rate": 0.0002, + "loss": 0.9239, + "step": 140 + }, + { + "epoch": 0.04841833440929632, + "grad_norm": 0.5008092522621155, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 150 + }, + { + "epoch": 0.051646223369916075, + "grad_norm": 0.4388776421546936, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 160 + }, + { + "epoch": 0.05487411233053583, + "grad_norm": 0.44138944149017334, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 170 + }, + { + "epoch": 0.058102001291155586, + "grad_norm": 0.358484148979187, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 180 + }, + { + "epoch": 0.06132989025177534, + "grad_norm": 0.457052081823349, + "learning_rate": 0.0002, + "loss": 0.8956, + "step": 190 + }, + { + "epoch": 0.0645577792123951, + "grad_norm": 0.5537622570991516, + "learning_rate": 0.0002, + "loss": 0.9138, + "step": 200 + }, + { + "epoch": 0.06778566817301485, + "grad_norm": 0.552631676197052, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 210 + }, + { + "epoch": 0.07101355713363461, + "grad_norm": 0.4414575397968292, + "learning_rate": 0.0002, + "loss": 0.8854, + "step": 220 + }, + { + "epoch": 0.07424144609425436, + "grad_norm": 0.4996664226055145, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 230 + }, + { + "epoch": 0.0774693350548741, + "grad_norm": 0.7321897149085999, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 240 + }, + { + "epoch": 0.08069722401549387, + "grad_norm": 0.4553901255130768, + "learning_rate": 0.0002, + "loss": 0.8848, + "step": 250 + }, + { + "epoch": 0.08392511297611362, + "grad_norm": 0.5039054751396179, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 260 + }, + { + "epoch": 0.08715300193673338, + "grad_norm": 0.4113094210624695, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 270 + }, + { + "epoch": 0.09038089089735313, + "grad_norm": 0.450436532497406, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 280 + }, + { + "epoch": 0.09360877985797289, + "grad_norm": 0.4548024535179138, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 290 + }, + { + "epoch": 0.09683666881859264, + "grad_norm": 0.4932962656021118, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 300 + }, + { + "epoch": 0.1000645577792124, + "grad_norm": 0.4005250334739685, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 310 + }, + { + "epoch": 0.10329244673983215, + "grad_norm": 1.8321624994277954, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 320 + }, + { + "epoch": 0.1065203357004519, + "grad_norm": 0.45815610885620117, + "learning_rate": 0.0002, + "loss": 0.8411, + "step": 330 + }, + { + "epoch": 0.10974822466107166, + "grad_norm": 0.39324095845222473, + "learning_rate": 0.0002, + "loss": 0.857, + "step": 340 + }, + { + "epoch": 0.11297611362169141, + "grad_norm": 0.546273946762085, + "learning_rate": 0.0002, + "loss": 0.8258, + "step": 350 + }, + { + "epoch": 0.11620400258231117, + "grad_norm": 0.497448593378067, + "learning_rate": 0.0002, + "loss": 0.882, + "step": 360 + }, + { + "epoch": 0.11943189154293092, + "grad_norm": 0.37508800625801086, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 370 + }, + { + "epoch": 0.12265978050355068, + "grad_norm": 0.45849609375, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 380 + }, + { + "epoch": 0.12588766946417043, + "grad_norm": 0.5488408803939819, + "learning_rate": 0.0002, + "loss": 0.8437, + "step": 390 + }, + { + "epoch": 0.1291155584247902, + "grad_norm": 0.4477061331272125, + "learning_rate": 0.0002, + "loss": 0.8349, + "step": 400 + }, + { + "epoch": 0.13234344738540993, + "grad_norm": 0.39227980375289917, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 410 + }, + { + "epoch": 0.1355713363460297, + "grad_norm": 0.3922233581542969, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 420 + }, + { + "epoch": 0.13879922530664945, + "grad_norm": 0.42901909351348877, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 430 + }, + { + "epoch": 0.14202711426726922, + "grad_norm": 0.4217798709869385, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 440 + }, + { + "epoch": 0.14525500322788895, + "grad_norm": 0.43470677733421326, + "learning_rate": 0.0002, + "loss": 0.8594, + "step": 450 + }, + { + "epoch": 0.1484828921885087, + "grad_norm": 0.5324403047561646, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 460 + }, + { + "epoch": 0.15171078114912848, + "grad_norm": 0.3999756872653961, + "learning_rate": 0.0002, + "loss": 0.8729, + "step": 470 + }, + { + "epoch": 0.1549386701097482, + "grad_norm": 0.404933363199234, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 480 + }, + { + "epoch": 0.15816655907036797, + "grad_norm": 0.44122636318206787, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 490 + }, + { + "epoch": 0.16139444803098774, + "grad_norm": 0.510166347026825, + "learning_rate": 0.0002, + "loss": 0.8457, + "step": 500 + }, + { + "epoch": 0.1646223369916075, + "grad_norm": 0.4549732506275177, + "learning_rate": 0.0002, + "loss": 0.8692, + "step": 510 + }, + { + "epoch": 0.16785022595222723, + "grad_norm": 0.5148182511329651, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 520 + }, + { + "epoch": 0.171078114912847, + "grad_norm": 0.3596806824207306, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 530 + }, + { + "epoch": 0.17430600387346676, + "grad_norm": 0.4388909339904785, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 540 + }, + { + "epoch": 0.17753389283408652, + "grad_norm": 0.5052742958068848, + "learning_rate": 0.0002, + "loss": 0.8322, + "step": 550 + }, + { + "epoch": 0.18076178179470626, + "grad_norm": 0.48248958587646484, + "learning_rate": 0.0002, + "loss": 0.791, + "step": 560 + }, + { + "epoch": 0.18398967075532602, + "grad_norm": 0.5360197424888611, + "learning_rate": 0.0002, + "loss": 0.8593, + "step": 570 + }, + { + "epoch": 0.18721755971594578, + "grad_norm": 0.43999341130256653, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 580 + }, + { + "epoch": 0.19044544867656552, + "grad_norm": 0.3685208261013031, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 590 + }, + { + "epoch": 0.19367333763718528, + "grad_norm": 0.4601275622844696, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 600 + }, + { + "epoch": 0.19690122659780504, + "grad_norm": 0.4778369665145874, + "learning_rate": 0.0002, + "loss": 0.8483, + "step": 610 + }, + { + "epoch": 0.2001291155584248, + "grad_norm": 0.4867003560066223, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 620 + }, + { + "epoch": 0.20335700451904454, + "grad_norm": 0.4583742916584015, + "learning_rate": 0.0002, + "loss": 0.8554, + "step": 630 + }, + { + "epoch": 0.2065848934796643, + "grad_norm": 0.47958165407180786, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 640 + }, + { + "epoch": 0.20981278244028406, + "grad_norm": 0.4526064097881317, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 650 + }, + { + "epoch": 0.2130406714009038, + "grad_norm": 0.45890581607818604, + "learning_rate": 0.0002, + "loss": 0.8313, + "step": 660 + }, + { + "epoch": 0.21626856036152356, + "grad_norm": 0.42725905776023865, + "learning_rate": 0.0002, + "loss": 0.8143, + "step": 670 + }, + { + "epoch": 0.21949644932214332, + "grad_norm": 0.40380963683128357, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 680 + }, + { + "epoch": 0.22272433828276308, + "grad_norm": 0.4372998774051666, + "learning_rate": 0.0002, + "loss": 0.9004, + "step": 690 + }, + { + "epoch": 0.22595222724338282, + "grad_norm": 0.4245864450931549, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 700 + }, + { + "epoch": 0.22918011620400258, + "grad_norm": 0.4061129689216614, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 710 + }, + { + "epoch": 0.23240800516462234, + "grad_norm": 0.474454790353775, + "learning_rate": 0.0002, + "loss": 0.8275, + "step": 720 + }, + { + "epoch": 0.23563589412524208, + "grad_norm": 0.4908486008644104, + "learning_rate": 0.0002, + "loss": 0.8346, + "step": 730 + }, + { + "epoch": 0.23886378308586184, + "grad_norm": 0.4284191429615021, + "learning_rate": 0.0002, + "loss": 0.8755, + "step": 740 + }, + { + "epoch": 0.2420916720464816, + "grad_norm": 0.44730308651924133, + "learning_rate": 0.0002, + "loss": 0.8387, + "step": 750 + }, + { + "epoch": 0.24531956100710137, + "grad_norm": 0.4433246850967407, + "learning_rate": 0.0002, + "loss": 0.8135, + "step": 760 + }, + { + "epoch": 0.2485474499677211, + "grad_norm": 0.43668854236602783, + "learning_rate": 0.0002, + "loss": 0.8644, + "step": 770 + }, + { + "epoch": 0.25177533892834086, + "grad_norm": 0.34324130415916443, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 780 + }, + { + "epoch": 0.2550032278889606, + "grad_norm": 0.46476295590400696, + "learning_rate": 0.0002, + "loss": 0.8725, + "step": 790 + }, + { + "epoch": 0.2582311168495804, + "grad_norm": 0.5047039985656738, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 800 + }, + { + "epoch": 0.26145900581020015, + "grad_norm": 0.4402127265930176, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 810 + }, + { + "epoch": 0.26468689477081986, + "grad_norm": 0.4642465114593506, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 820 + }, + { + "epoch": 0.2679147837314396, + "grad_norm": 0.40093424916267395, + "learning_rate": 0.0002, + "loss": 0.8836, + "step": 830 + }, + { + "epoch": 0.2711426726920594, + "grad_norm": 0.42501842975616455, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 840 + }, + { + "epoch": 0.27437056165267915, + "grad_norm": 0.43279722332954407, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 850 + }, + { + "epoch": 0.2775984506132989, + "grad_norm": 0.5991243720054626, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 860 + }, + { + "epoch": 0.28082633957391867, + "grad_norm": 0.4217848777770996, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 870 + }, + { + "epoch": 0.28405422853453843, + "grad_norm": 0.3933536410331726, + "learning_rate": 0.0002, + "loss": 0.8135, + "step": 880 + }, + { + "epoch": 0.28728211749515814, + "grad_norm": 0.5868505239486694, + "learning_rate": 0.0002, + "loss": 0.8846, + "step": 890 + }, + { + "epoch": 0.2905100064557779, + "grad_norm": 0.5209547877311707, + "learning_rate": 0.0002, + "loss": 0.8759, + "step": 900 + }, + { + "epoch": 0.29373789541639767, + "grad_norm": 0.49307361245155334, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 910 + }, + { + "epoch": 0.2969657843770174, + "grad_norm": 0.4288382828235626, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 920 + }, + { + "epoch": 0.3001936733376372, + "grad_norm": 0.33568474650382996, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 930 + }, + { + "epoch": 0.30342156229825695, + "grad_norm": 1.0915930271148682, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 940 + }, + { + "epoch": 0.3066494512588767, + "grad_norm": 0.5489798188209534, + "learning_rate": 0.0002, + "loss": 0.8535, + "step": 950 + }, + { + "epoch": 0.3098773402194964, + "grad_norm": 0.42971742153167725, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 960 + }, + { + "epoch": 0.3131052291801162, + "grad_norm": 0.43375834822654724, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 970 + }, + { + "epoch": 0.31633311814073595, + "grad_norm": 0.47488611936569214, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 980 + }, + { + "epoch": 0.3195610071013557, + "grad_norm": 0.46296775341033936, + "learning_rate": 0.0002, + "loss": 0.7906, + "step": 990 + }, + { + "epoch": 0.32278889606197547, + "grad_norm": 0.4548890292644501, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 1000 + }, + { + "epoch": 0.32601678502259523, + "grad_norm": 0.41834497451782227, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 1010 + }, + { + "epoch": 0.329244673983215, + "grad_norm": 0.441092312335968, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 1020 + }, + { + "epoch": 0.33247256294383476, + "grad_norm": 0.637322187423706, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 1030 + }, + { + "epoch": 0.33570045190445447, + "grad_norm": 0.4374958574771881, + "learning_rate": 0.0002, + "loss": 0.8685, + "step": 1040 + }, + { + "epoch": 0.33892834086507423, + "grad_norm": 0.3935825824737549, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1050 + }, + { + "epoch": 0.342156229825694, + "grad_norm": 0.43526220321655273, + "learning_rate": 0.0002, + "loss": 0.8287, + "step": 1060 + }, + { + "epoch": 0.34538411878631375, + "grad_norm": 0.45327696204185486, + "learning_rate": 0.0002, + "loss": 0.8413, + "step": 1070 + }, + { + "epoch": 0.3486120077469335, + "grad_norm": 0.4126075506210327, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 1080 + }, + { + "epoch": 0.3518398967075533, + "grad_norm": 0.4714072048664093, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 1090 + }, + { + "epoch": 0.35506778566817304, + "grad_norm": 0.518127977848053, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 1100 + }, + { + "epoch": 0.35829567462879275, + "grad_norm": 0.43264099955558777, + "learning_rate": 0.0002, + "loss": 0.8479, + "step": 1110 + }, + { + "epoch": 0.3615235635894125, + "grad_norm": 0.4857400357723236, + "learning_rate": 0.0002, + "loss": 0.8724, + "step": 1120 + }, + { + "epoch": 0.3647514525500323, + "grad_norm": 0.37591469287872314, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 1130 + }, + { + "epoch": 0.36797934151065204, + "grad_norm": 0.4165478050708771, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 1140 + }, + { + "epoch": 0.3712072304712718, + "grad_norm": 0.42911383509635925, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 1150 + }, + { + "epoch": 0.37443511943189156, + "grad_norm": 0.44980287551879883, + "learning_rate": 0.0002, + "loss": 0.8722, + "step": 1160 + }, + { + "epoch": 0.3776630083925113, + "grad_norm": 0.4066573679447174, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 1170 + }, + { + "epoch": 0.38089089735313103, + "grad_norm": 0.5056195855140686, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 1180 + }, + { + "epoch": 0.3841187863137508, + "grad_norm": 0.4141536355018616, + "learning_rate": 0.0002, + "loss": 0.8387, + "step": 1190 + }, + { + "epoch": 0.38734667527437056, + "grad_norm": 0.4501924514770508, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 1200 + }, + { + "epoch": 0.3905745642349903, + "grad_norm": 0.43304240703582764, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 1210 + }, + { + "epoch": 0.3938024531956101, + "grad_norm": 0.475777804851532, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 1220 + }, + { + "epoch": 0.39703034215622984, + "grad_norm": 0.5846465826034546, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 1230 + }, + { + "epoch": 0.4002582311168496, + "grad_norm": 0.42899325489997864, + "learning_rate": 0.0002, + "loss": 0.8078, + "step": 1240 + }, + { + "epoch": 0.4034861200774693, + "grad_norm": 0.3980463147163391, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 1250 + }, + { + "epoch": 0.4067140090380891, + "grad_norm": 0.45769768953323364, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 1260 + }, + { + "epoch": 0.40994189799870884, + "grad_norm": 0.5101280212402344, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 1270 + }, + { + "epoch": 0.4131697869593286, + "grad_norm": 0.47374317049980164, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 1280 + }, + { + "epoch": 0.41639767591994836, + "grad_norm": 0.4261878728866577, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 1290 + }, + { + "epoch": 0.4196255648805681, + "grad_norm": 0.46954256296157837, + "learning_rate": 0.0002, + "loss": 0.9004, + "step": 1300 + }, + { + "epoch": 0.4228534538411879, + "grad_norm": 0.5205738544464111, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 1310 + }, + { + "epoch": 0.4260813428018076, + "grad_norm": 0.5176340937614441, + "learning_rate": 0.0002, + "loss": 0.8964, + "step": 1320 + }, + { + "epoch": 0.42930923176242736, + "grad_norm": 0.5155916810035706, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 1330 + }, + { + "epoch": 0.4325371207230471, + "grad_norm": 0.44548553228378296, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 1340 + }, + { + "epoch": 0.4357650096836669, + "grad_norm": 0.5633558630943298, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 1350 + }, + { + "epoch": 0.43899289864428664, + "grad_norm": 0.42444056272506714, + "learning_rate": 0.0002, + "loss": 0.7889, + "step": 1360 + }, + { + "epoch": 0.4422207876049064, + "grad_norm": 0.5226860642433167, + "learning_rate": 0.0002, + "loss": 0.8588, + "step": 1370 + }, + { + "epoch": 0.44544867656552617, + "grad_norm": 0.5354582071304321, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 1380 + }, + { + "epoch": 0.4486765655261459, + "grad_norm": 0.472646564245224, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 1390 + }, + { + "epoch": 0.45190445448676564, + "grad_norm": 0.6312310099601746, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 1400 + }, + { + "epoch": 0.4551323434473854, + "grad_norm": 0.4298408031463623, + "learning_rate": 0.0002, + "loss": 0.8212, + "step": 1410 + }, + { + "epoch": 0.45836023240800516, + "grad_norm": 0.43427202105522156, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 1420 + }, + { + "epoch": 0.4615881213686249, + "grad_norm": 0.44097861647605896, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 1430 + }, + { + "epoch": 0.4648160103292447, + "grad_norm": 0.5142693519592285, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1440 + }, + { + "epoch": 0.46804389928986445, + "grad_norm": 0.46416547894477844, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 1450 + }, + { + "epoch": 0.47127178825048416, + "grad_norm": 0.4858551025390625, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 1460 + }, + { + "epoch": 0.4744996772111039, + "grad_norm": 0.4709177315235138, + "learning_rate": 0.0002, + "loss": 0.8354, + "step": 1470 + }, + { + "epoch": 0.4777275661717237, + "grad_norm": 0.5500252842903137, + "learning_rate": 0.0002, + "loss": 0.8391, + "step": 1480 + }, + { + "epoch": 0.48095545513234345, + "grad_norm": 0.43364381790161133, + "learning_rate": 0.0002, + "loss": 0.8359, + "step": 1490 + }, + { + "epoch": 0.4841833440929632, + "grad_norm": 0.47712287306785583, + "learning_rate": 0.0002, + "loss": 0.8446, + "step": 1500 + }, + { + "epoch": 0.48741123305358297, + "grad_norm": 0.4518495202064514, + "learning_rate": 0.0002, + "loss": 0.8518, + "step": 1510 + }, + { + "epoch": 0.49063912201420273, + "grad_norm": 0.4539008140563965, + "learning_rate": 0.0002, + "loss": 0.819, + "step": 1520 + }, + { + "epoch": 0.49386701097482244, + "grad_norm": 0.4993067979812622, + "learning_rate": 0.0002, + "loss": 0.8276, + "step": 1530 + }, + { + "epoch": 0.4970948999354422, + "grad_norm": 0.6094803214073181, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 1540 + }, + { + "epoch": 0.500322788896062, + "grad_norm": 0.48602527379989624, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 1550 + }, + { + "epoch": 0.5035506778566817, + "grad_norm": 0.40245795249938965, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 1560 + }, + { + "epoch": 0.5067785668173015, + "grad_norm": 0.456787645816803, + "learning_rate": 0.0002, + "loss": 0.7907, + "step": 1570 + }, + { + "epoch": 0.5100064557779213, + "grad_norm": 0.43936216831207275, + "learning_rate": 0.0002, + "loss": 0.86, + "step": 1580 + }, + { + "epoch": 0.513234344738541, + "grad_norm": 0.549018144607544, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 1590 + }, + { + "epoch": 0.5164622336991608, + "grad_norm": 0.41746795177459717, + "learning_rate": 0.0002, + "loss": 0.8169, + "step": 1600 + }, + { + "epoch": 0.5196901226597805, + "grad_norm": 0.4217053949832916, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 1610 + }, + { + "epoch": 0.5229180116204003, + "grad_norm": 0.449913889169693, + "learning_rate": 0.0002, + "loss": 0.8161, + "step": 1620 + }, + { + "epoch": 0.5261459005810201, + "grad_norm": 0.5084872245788574, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 1630 + }, + { + "epoch": 0.5293737895416397, + "grad_norm": 0.46248653531074524, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 1640 + }, + { + "epoch": 0.5326016785022595, + "grad_norm": 0.4824236035346985, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 1650 + }, + { + "epoch": 0.5358295674628792, + "grad_norm": 0.6010985374450684, + "learning_rate": 0.0002, + "loss": 0.8711, + "step": 1660 + }, + { + "epoch": 0.539057456423499, + "grad_norm": 0.4757920801639557, + "learning_rate": 0.0002, + "loss": 0.8266, + "step": 1670 + }, + { + "epoch": 0.5422853453841188, + "grad_norm": 0.45161882042884827, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 1680 + }, + { + "epoch": 0.5455132343447385, + "grad_norm": 0.49314990639686584, + "learning_rate": 0.0002, + "loss": 0.8141, + "step": 1690 + }, + { + "epoch": 0.5487411233053583, + "grad_norm": 0.3918305039405823, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 1700 + }, + { + "epoch": 0.551969012265978, + "grad_norm": 0.5966728925704956, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 1710 + }, + { + "epoch": 0.5551969012265978, + "grad_norm": 0.4208986163139343, + "learning_rate": 0.0002, + "loss": 0.8438, + "step": 1720 + }, + { + "epoch": 0.5584247901872176, + "grad_norm": 0.43724218010902405, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 1730 + }, + { + "epoch": 0.5616526791478373, + "grad_norm": 0.5287272930145264, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 1740 + }, + { + "epoch": 0.5648805681084571, + "grad_norm": 0.4961899518966675, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 1750 + }, + { + "epoch": 0.5681084570690769, + "grad_norm": 0.4468635320663452, + "learning_rate": 0.0002, + "loss": 0.8029, + "step": 1760 + }, + { + "epoch": 0.5713363460296966, + "grad_norm": 0.6423530578613281, + "learning_rate": 0.0002, + "loss": 0.7968, + "step": 1770 + }, + { + "epoch": 0.5745642349903163, + "grad_norm": 0.4601971507072449, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 1780 + }, + { + "epoch": 0.577792123950936, + "grad_norm": 0.46514901518821716, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 1790 + }, + { + "epoch": 0.5810200129115558, + "grad_norm": 0.4771687388420105, + "learning_rate": 0.0002, + "loss": 0.8186, + "step": 1800 + }, + { + "epoch": 0.5842479018721756, + "grad_norm": 0.46514490246772766, + "learning_rate": 0.0002, + "loss": 0.856, + "step": 1810 + }, + { + "epoch": 0.5874757908327953, + "grad_norm": 0.5373936295509338, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 1820 + }, + { + "epoch": 0.5907036797934151, + "grad_norm": 0.5175791382789612, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 1830 + }, + { + "epoch": 0.5939315687540349, + "grad_norm": 0.4522802233695984, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 1840 + }, + { + "epoch": 0.5971594577146546, + "grad_norm": 0.42987772822380066, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 1850 + }, + { + "epoch": 0.6003873466752744, + "grad_norm": 0.5566838383674622, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 1860 + }, + { + "epoch": 0.6036152356358941, + "grad_norm": 0.42807698249816895, + "learning_rate": 0.0002, + "loss": 0.8312, + "step": 1870 + }, + { + "epoch": 0.6068431245965139, + "grad_norm": 0.4957767724990845, + "learning_rate": 0.0002, + "loss": 0.8035, + "step": 1880 + }, + { + "epoch": 0.6100710135571337, + "grad_norm": 0.4260980188846588, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 1890 + }, + { + "epoch": 0.6132989025177534, + "grad_norm": 0.4777357876300812, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 1900 + }, + { + "epoch": 0.6165267914783732, + "grad_norm": 0.4434216022491455, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 1910 + }, + { + "epoch": 0.6197546804389928, + "grad_norm": 0.5215433835983276, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 1920 + }, + { + "epoch": 0.6229825693996126, + "grad_norm": 0.5143248438835144, + "learning_rate": 0.0002, + "loss": 0.82, + "step": 1930 + }, + { + "epoch": 0.6262104583602324, + "grad_norm": 0.5213413238525391, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 1940 + }, + { + "epoch": 0.6294383473208521, + "grad_norm": 0.5408226251602173, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 1950 + }, + { + "epoch": 0.6326662362814719, + "grad_norm": 0.5479708909988403, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 1960 + }, + { + "epoch": 0.6358941252420917, + "grad_norm": 0.4490949809551239, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 1970 + }, + { + "epoch": 0.6391220142027114, + "grad_norm": 0.48815059661865234, + "learning_rate": 0.0002, + "loss": 0.854, + "step": 1980 + }, + { + "epoch": 0.6423499031633312, + "grad_norm": 0.46498045325279236, + "learning_rate": 0.0002, + "loss": 0.8568, + "step": 1990 + }, + { + "epoch": 0.6455777921239509, + "grad_norm": 0.5136561393737793, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 2000 + }, + { + "epoch": 0.6488056810845707, + "grad_norm": 0.5145719647407532, + "learning_rate": 0.0002, + "loss": 0.8503, + "step": 2010 + }, + { + "epoch": 0.6520335700451905, + "grad_norm": 0.5430373549461365, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 2020 + }, + { + "epoch": 0.6552614590058102, + "grad_norm": 0.46347954869270325, + "learning_rate": 0.0002, + "loss": 0.8115, + "step": 2030 + }, + { + "epoch": 0.65848934796643, + "grad_norm": 0.5189562439918518, + "learning_rate": 0.0002, + "loss": 0.8769, + "step": 2040 + }, + { + "epoch": 0.6617172369270498, + "grad_norm": 0.43843990564346313, + "learning_rate": 0.0002, + "loss": 0.8453, + "step": 2050 + }, + { + "epoch": 0.6649451258876695, + "grad_norm": 0.4654983580112457, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 2060 + }, + { + "epoch": 0.6681730148482892, + "grad_norm": 0.44835716485977173, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 2070 + }, + { + "epoch": 0.6714009038089089, + "grad_norm": 0.38811734318733215, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2080 + }, + { + "epoch": 0.6746287927695287, + "grad_norm": 0.5709853172302246, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 2090 + }, + { + "epoch": 0.6778566817301485, + "grad_norm": 0.49994757771492004, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 2100 + }, + { + "epoch": 0.6810845706907682, + "grad_norm": 0.5505402684211731, + "learning_rate": 0.0002, + "loss": 0.8, + "step": 2110 + }, + { + "epoch": 0.684312459651388, + "grad_norm": 0.48195120692253113, + "learning_rate": 0.0002, + "loss": 0.8227, + "step": 2120 + }, + { + "epoch": 0.6875403486120077, + "grad_norm": 0.4854775071144104, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 2130 + }, + { + "epoch": 0.6907682375726275, + "grad_norm": 0.6422494649887085, + "learning_rate": 0.0002, + "loss": 0.8231, + "step": 2140 + }, + { + "epoch": 0.6939961265332473, + "grad_norm": 0.3972536027431488, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 2150 + }, + { + "epoch": 0.697224015493867, + "grad_norm": 0.4297836422920227, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 2160 + }, + { + "epoch": 0.7004519044544868, + "grad_norm": 0.45486778020858765, + "learning_rate": 0.0002, + "loss": 0.8017, + "step": 2170 + }, + { + "epoch": 0.7036797934151066, + "grad_norm": 0.4706047773361206, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 2180 + }, + { + "epoch": 0.7069076823757263, + "grad_norm": 0.46426892280578613, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 2190 + }, + { + "epoch": 0.7101355713363461, + "grad_norm": 0.46333715319633484, + "learning_rate": 0.0002, + "loss": 0.8472, + "step": 2200 + }, + { + "epoch": 0.7133634602969657, + "grad_norm": 0.4632524251937866, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 2210 + }, + { + "epoch": 0.7165913492575855, + "grad_norm": 0.4610830843448639, + "learning_rate": 0.0002, + "loss": 0.8452, + "step": 2220 + }, + { + "epoch": 0.7198192382182053, + "grad_norm": 0.4905324876308441, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 2230 + }, + { + "epoch": 0.723047127178825, + "grad_norm": 0.4936263859272003, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 2240 + }, + { + "epoch": 0.7262750161394448, + "grad_norm": 0.40778425335884094, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 2250 + }, + { + "epoch": 0.7295029051000645, + "grad_norm": 0.50351482629776, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 2260 + }, + { + "epoch": 0.7327307940606843, + "grad_norm": 0.4894128143787384, + "learning_rate": 0.0002, + "loss": 0.8475, + "step": 2270 + }, + { + "epoch": 0.7359586830213041, + "grad_norm": 0.5580906271934509, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 2280 + }, + { + "epoch": 0.7391865719819238, + "grad_norm": 0.4655369520187378, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 2290 + }, + { + "epoch": 0.7424144609425436, + "grad_norm": 0.4666965901851654, + "learning_rate": 0.0002, + "loss": 0.8395, + "step": 2300 + }, + { + "epoch": 0.7456423499031634, + "grad_norm": 0.46259936690330505, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 2310 + }, + { + "epoch": 0.7488702388637831, + "grad_norm": 0.520706832408905, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 2320 + }, + { + "epoch": 0.7520981278244029, + "grad_norm": 0.5142408013343811, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 2330 + }, + { + "epoch": 0.7553260167850226, + "grad_norm": 0.5355164408683777, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 2340 + }, + { + "epoch": 0.7585539057456423, + "grad_norm": 0.5517185926437378, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 2350 + }, + { + "epoch": 0.7617817947062621, + "grad_norm": 0.7162677049636841, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 2360 + }, + { + "epoch": 0.7650096836668818, + "grad_norm": 0.42402133345603943, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 2370 + }, + { + "epoch": 0.7682375726275016, + "grad_norm": 0.47180113196372986, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 2380 + }, + { + "epoch": 0.7714654615881213, + "grad_norm": 0.6262288689613342, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 2390 + }, + { + "epoch": 0.7746933505487411, + "grad_norm": 0.5177528262138367, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 2400 + }, + { + "epoch": 0.7779212395093609, + "grad_norm": 0.555721640586853, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 2410 + }, + { + "epoch": 0.7811491284699806, + "grad_norm": 0.5592644810676575, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 2420 + }, + { + "epoch": 0.7843770174306004, + "grad_norm": 0.38025397062301636, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 2430 + }, + { + "epoch": 0.7876049063912202, + "grad_norm": 0.4597472548484802, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 2440 + }, + { + "epoch": 0.7908327953518399, + "grad_norm": 0.4929825961589813, + "learning_rate": 0.0002, + "loss": 0.8575, + "step": 2450 + }, + { + "epoch": 0.7940606843124597, + "grad_norm": 0.45277655124664307, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 2460 + }, + { + "epoch": 0.7972885732730794, + "grad_norm": 0.6224122643470764, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2470 + }, + { + "epoch": 0.8005164622336992, + "grad_norm": 0.5740901827812195, + "learning_rate": 0.0002, + "loss": 0.8449, + "step": 2480 + }, + { + "epoch": 0.8037443511943189, + "grad_norm": 0.41335329413414, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 2490 + }, + { + "epoch": 0.8069722401549386, + "grad_norm": 0.4738694131374359, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 2500 + }, + { + "epoch": 0.8102001291155584, + "grad_norm": 0.5288197994232178, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 2510 + }, + { + "epoch": 0.8134280180761781, + "grad_norm": 0.5404666066169739, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 2520 + }, + { + "epoch": 0.8166559070367979, + "grad_norm": 0.4444909691810608, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 2530 + }, + { + "epoch": 0.8198837959974177, + "grad_norm": 0.542061448097229, + "learning_rate": 0.0002, + "loss": 0.8683, + "step": 2540 + }, + { + "epoch": 0.8231116849580374, + "grad_norm": 0.4914741814136505, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 2550 + }, + { + "epoch": 0.8263395739186572, + "grad_norm": 0.41703441739082336, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 2560 + }, + { + "epoch": 0.829567462879277, + "grad_norm": 0.5489841103553772, + "learning_rate": 0.0002, + "loss": 0.824, + "step": 2570 + }, + { + "epoch": 0.8327953518398967, + "grad_norm": 0.5359883308410645, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 2580 + }, + { + "epoch": 0.8360232408005165, + "grad_norm": 0.5541019439697266, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 2590 + }, + { + "epoch": 0.8392511297611362, + "grad_norm": 0.4746638834476471, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 2600 + }, + { + "epoch": 0.842479018721756, + "grad_norm": 0.5243194103240967, + "learning_rate": 0.0002, + "loss": 0.8116, + "step": 2610 + }, + { + "epoch": 0.8457069076823758, + "grad_norm": 0.46824976801872253, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 2620 + }, + { + "epoch": 0.8489347966429954, + "grad_norm": 0.49487847089767456, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 2630 + }, + { + "epoch": 0.8521626856036152, + "grad_norm": 0.42180097103118896, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 2640 + }, + { + "epoch": 0.855390574564235, + "grad_norm": 0.5516560077667236, + "learning_rate": 0.0002, + "loss": 0.8304, + "step": 2650 + }, + { + "epoch": 0.8586184635248547, + "grad_norm": 0.4392191767692566, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 2660 + }, + { + "epoch": 0.8618463524854745, + "grad_norm": 0.5387210845947266, + "learning_rate": 0.0002, + "loss": 0.848, + "step": 2670 + }, + { + "epoch": 0.8650742414460942, + "grad_norm": 0.6232406497001648, + "learning_rate": 0.0002, + "loss": 0.8094, + "step": 2680 + }, + { + "epoch": 0.868302130406714, + "grad_norm": 0.53749018907547, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 2690 + }, + { + "epoch": 0.8715300193673338, + "grad_norm": 0.47480374574661255, + "learning_rate": 0.0002, + "loss": 0.8299, + "step": 2700 + }, + { + "epoch": 0.8747579083279535, + "grad_norm": 0.44618046283721924, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 2710 + }, + { + "epoch": 0.8779857972885733, + "grad_norm": 0.4173581302165985, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 2720 + }, + { + "epoch": 0.881213686249193, + "grad_norm": 0.524081289768219, + "learning_rate": 0.0002, + "loss": 0.7713, + "step": 2730 + }, + { + "epoch": 0.8844415752098128, + "grad_norm": 0.5608431100845337, + "learning_rate": 0.0002, + "loss": 0.8738, + "step": 2740 + }, + { + "epoch": 0.8876694641704326, + "grad_norm": 0.5212284922599792, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 2750 + }, + { + "epoch": 0.8908973531310523, + "grad_norm": 0.5601475834846497, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 2760 + }, + { + "epoch": 0.8941252420916721, + "grad_norm": 0.4499223828315735, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 2770 + }, + { + "epoch": 0.8973531310522918, + "grad_norm": 0.46945226192474365, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 2780 + }, + { + "epoch": 0.9005810200129115, + "grad_norm": 0.4837495684623718, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 2790 + }, + { + "epoch": 0.9038089089735313, + "grad_norm": 0.5059258937835693, + "learning_rate": 0.0002, + "loss": 0.7887, + "step": 2800 + }, + { + "epoch": 0.907036797934151, + "grad_norm": 0.4857945144176483, + "learning_rate": 0.0002, + "loss": 0.8571, + "step": 2810 + }, + { + "epoch": 0.9102646868947708, + "grad_norm": 0.5001962780952454, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 2820 + }, + { + "epoch": 0.9134925758553906, + "grad_norm": 0.5468648672103882, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 2830 + }, + { + "epoch": 0.9167204648160103, + "grad_norm": 0.5533056259155273, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 2840 + }, + { + "epoch": 0.9199483537766301, + "grad_norm": 0.5909785628318787, + "learning_rate": 0.0002, + "loss": 0.7895, + "step": 2850 + }, + { + "epoch": 0.9231762427372499, + "grad_norm": 0.47428104281425476, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 2860 + }, + { + "epoch": 0.9264041316978696, + "grad_norm": 0.548814058303833, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2870 + }, + { + "epoch": 0.9296320206584894, + "grad_norm": 0.5576745271682739, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 2880 + }, + { + "epoch": 0.9328599096191091, + "grad_norm": 0.47094792127609253, + "learning_rate": 0.0002, + "loss": 0.8399, + "step": 2890 + }, + { + "epoch": 0.9360877985797289, + "grad_norm": 0.5408539772033691, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 2900 + }, + { + "epoch": 0.9393156875403487, + "grad_norm": 0.5922889113426208, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 2910 + }, + { + "epoch": 0.9425435765009683, + "grad_norm": 0.45462584495544434, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2920 + }, + { + "epoch": 0.9457714654615881, + "grad_norm": 0.6864947080612183, + "learning_rate": 0.0002, + "loss": 0.8344, + "step": 2930 + }, + { + "epoch": 0.9489993544222078, + "grad_norm": 0.4706299304962158, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 2940 + }, + { + "epoch": 0.9522272433828276, + "grad_norm": 0.5583269596099854, + "learning_rate": 0.0002, + "loss": 0.8422, + "step": 2950 + }, + { + "epoch": 0.9554551323434474, + "grad_norm": 0.51015704870224, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 2960 + }, + { + "epoch": 0.9586830213040671, + "grad_norm": 0.5325582027435303, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 2970 + }, + { + "epoch": 0.9619109102646869, + "grad_norm": 0.49008598923683167, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 2980 + }, + { + "epoch": 0.9651387992253067, + "grad_norm": 0.4422132074832916, + "learning_rate": 0.0002, + "loss": 0.8093, + "step": 2990 + }, + { + "epoch": 0.9683666881859264, + "grad_norm": 0.5053589344024658, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 3000 + }, + { + "epoch": 0.9715945771465462, + "grad_norm": 0.46754521131515503, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 3010 + }, + { + "epoch": 0.9748224661071659, + "grad_norm": 0.5613434910774231, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 3020 + }, + { + "epoch": 0.9780503550677857, + "grad_norm": 0.5052843689918518, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 3030 + }, + { + "epoch": 0.9812782440284055, + "grad_norm": 0.4270972013473511, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 3040 + }, + { + "epoch": 0.9845061329890252, + "grad_norm": 0.4974991977214813, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 3050 + }, + { + "epoch": 0.9877340219496449, + "grad_norm": 0.4432311952114105, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 3060 + }, + { + "epoch": 0.9909619109102646, + "grad_norm": 0.466457724571228, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 3070 + }, + { + "epoch": 0.9941897998708844, + "grad_norm": 0.6438009142875671, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 3080 + }, + { + "epoch": 0.9974176888315042, + "grad_norm": 0.5593604445457458, + "learning_rate": 0.0002, + "loss": 0.8425, + "step": 3090 + }, + { + "epoch": 1.0, + "eval_loss": 1.0958120822906494, + "eval_runtime": 148.3273, + "eval_samples_per_second": 4.942, + "eval_steps_per_second": 0.62, + "step": 3098 + }, + { + "epoch": 1.000645577792124, + "grad_norm": 0.5701445937156677, + "learning_rate": 0.0002, + "loss": 0.8275, + "step": 3100 + }, + { + "epoch": 1.0038734667527438, + "grad_norm": 0.6089657545089722, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 3110 + }, + { + "epoch": 1.0071013557133635, + "grad_norm": 0.5619552135467529, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 3120 + }, + { + "epoch": 1.010329244673983, + "grad_norm": 0.5550283789634705, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 3130 + }, + { + "epoch": 1.013557133634603, + "grad_norm": 0.6221792101860046, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3140 + }, + { + "epoch": 1.0167850225952226, + "grad_norm": 0.5450758934020996, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 3150 + }, + { + "epoch": 1.0200129115558425, + "grad_norm": 0.4359588027000427, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 3160 + }, + { + "epoch": 1.0232408005164622, + "grad_norm": 0.5932239890098572, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 3170 + }, + { + "epoch": 1.026468689477082, + "grad_norm": 0.45478707551956177, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 3180 + }, + { + "epoch": 1.0296965784377017, + "grad_norm": 0.677615761756897, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 3190 + }, + { + "epoch": 1.0329244673983216, + "grad_norm": 0.6231790781021118, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3200 + }, + { + "epoch": 1.0361523563589412, + "grad_norm": 0.5074195861816406, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 3210 + }, + { + "epoch": 1.039380245319561, + "grad_norm": 0.4844142198562622, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 3220 + }, + { + "epoch": 1.0426081342801807, + "grad_norm": 0.5372750759124756, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 3230 + }, + { + "epoch": 1.0458360232408006, + "grad_norm": 0.46296265721321106, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 3240 + }, + { + "epoch": 1.0490639122014203, + "grad_norm": 0.5417148470878601, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 3250 + }, + { + "epoch": 1.0522918011620401, + "grad_norm": 0.5695074200630188, + "learning_rate": 0.0002, + "loss": 0.7637, + "step": 3260 + }, + { + "epoch": 1.0555196901226598, + "grad_norm": 0.5050092935562134, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 3270 + }, + { + "epoch": 1.0587475790832794, + "grad_norm": 0.5320752263069153, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 3280 + }, + { + "epoch": 1.0619754680438993, + "grad_norm": 0.5832052230834961, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 3290 + }, + { + "epoch": 1.065203357004519, + "grad_norm": 0.5228804349899292, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 3300 + }, + { + "epoch": 1.0684312459651388, + "grad_norm": 0.5819445252418518, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 3310 + }, + { + "epoch": 1.0716591349257585, + "grad_norm": 0.4201328754425049, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 3320 + }, + { + "epoch": 1.0748870238863784, + "grad_norm": 0.5424145460128784, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 3330 + }, + { + "epoch": 1.078114912846998, + "grad_norm": 0.6169946789741516, + "learning_rate": 0.0002, + "loss": 0.7828, + "step": 3340 + }, + { + "epoch": 1.0813428018076179, + "grad_norm": 0.607676088809967, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 3350 + }, + { + "epoch": 1.0845706907682375, + "grad_norm": 0.5191982388496399, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 3360 + }, + { + "epoch": 1.0877985797288574, + "grad_norm": 0.5728003978729248, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 3370 + }, + { + "epoch": 1.091026468689477, + "grad_norm": 0.5402643084526062, + "learning_rate": 0.0002, + "loss": 0.7381, + "step": 3380 + }, + { + "epoch": 1.094254357650097, + "grad_norm": 0.5377541780471802, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 3390 + }, + { + "epoch": 1.0974822466107166, + "grad_norm": 0.4751385748386383, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 3400 + }, + { + "epoch": 1.1007101355713362, + "grad_norm": 0.559158444404602, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 3410 + }, + { + "epoch": 1.103938024531956, + "grad_norm": 0.4917701482772827, + "learning_rate": 0.0002, + "loss": 0.7366, + "step": 3420 + }, + { + "epoch": 1.1071659134925758, + "grad_norm": 0.5507875084877014, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 3430 + }, + { + "epoch": 1.1103938024531956, + "grad_norm": 0.45458680391311646, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 3440 + }, + { + "epoch": 1.1136216914138153, + "grad_norm": 0.5721744894981384, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 3450 + }, + { + "epoch": 1.1168495803744352, + "grad_norm": 0.5776081681251526, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 3460 + }, + { + "epoch": 1.1200774693350548, + "grad_norm": 0.5261953473091125, + "learning_rate": 0.0002, + "loss": 0.7644, + "step": 3470 + }, + { + "epoch": 1.1233053582956747, + "grad_norm": 0.47759532928466797, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 3480 + }, + { + "epoch": 1.1265332472562943, + "grad_norm": 0.5697659850120544, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 3490 + }, + { + "epoch": 1.1297611362169142, + "grad_norm": 0.5643419623374939, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 3500 + }, + { + "epoch": 1.1329890251775339, + "grad_norm": 0.6502931118011475, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 3510 + }, + { + "epoch": 1.1362169141381537, + "grad_norm": 0.5236507654190063, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 3520 + }, + { + "epoch": 1.1394448030987734, + "grad_norm": 0.6521499156951904, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 3530 + }, + { + "epoch": 1.142672692059393, + "grad_norm": 0.5893217325210571, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 3540 + }, + { + "epoch": 1.145900581020013, + "grad_norm": 0.5300073027610779, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 3550 + }, + { + "epoch": 1.1491284699806328, + "grad_norm": 0.6794660091400146, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 3560 + }, + { + "epoch": 1.1523563589412524, + "grad_norm": 0.5420064926147461, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 3570 + }, + { + "epoch": 1.155584247901872, + "grad_norm": 0.5096590518951416, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 3580 + }, + { + "epoch": 1.158812136862492, + "grad_norm": 0.5726043581962585, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 3590 + }, + { + "epoch": 1.1620400258231116, + "grad_norm": 0.7388110160827637, + "learning_rate": 0.0002, + "loss": 0.7728, + "step": 3600 + }, + { + "epoch": 1.1652679147837315, + "grad_norm": 0.5597969889640808, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 3610 + }, + { + "epoch": 1.1684958037443511, + "grad_norm": 0.5067800283432007, + "learning_rate": 0.0002, + "loss": 0.7132, + "step": 3620 + }, + { + "epoch": 1.171723692704971, + "grad_norm": 0.6625118255615234, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 3630 + }, + { + "epoch": 1.1749515816655907, + "grad_norm": 0.5830849409103394, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 3640 + }, + { + "epoch": 1.1781794706262105, + "grad_norm": 0.6140692830085754, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 3650 + }, + { + "epoch": 1.1814073595868302, + "grad_norm": 0.714523434638977, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 3660 + }, + { + "epoch": 1.18463524854745, + "grad_norm": 0.5196696519851685, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 3670 + }, + { + "epoch": 1.1878631375080697, + "grad_norm": 0.6677889823913574, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 3680 + }, + { + "epoch": 1.1910910264686896, + "grad_norm": 0.47095245122909546, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 3690 + }, + { + "epoch": 1.1943189154293092, + "grad_norm": 0.5197778940200806, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 3700 + }, + { + "epoch": 1.1975468043899289, + "grad_norm": 0.5156530141830444, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 3710 + }, + { + "epoch": 1.2007746933505488, + "grad_norm": 0.6968549489974976, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 3720 + }, + { + "epoch": 1.2040025823111684, + "grad_norm": 0.48983848094940186, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 3730 + }, + { + "epoch": 1.2072304712717883, + "grad_norm": 0.6709973216056824, + "learning_rate": 0.0002, + "loss": 0.7163, + "step": 3740 + }, + { + "epoch": 1.210458360232408, + "grad_norm": 0.48681750893592834, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 3750 + }, + { + "epoch": 1.2136862491930278, + "grad_norm": 0.49475061893463135, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 3760 + }, + { + "epoch": 1.2169141381536475, + "grad_norm": 0.6163983345031738, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 3770 + }, + { + "epoch": 1.2201420271142673, + "grad_norm": 0.5481411218643188, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 3780 + }, + { + "epoch": 1.223369916074887, + "grad_norm": 0.620639979839325, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 3790 + }, + { + "epoch": 1.2265978050355069, + "grad_norm": 0.7017222046852112, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 3800 + }, + { + "epoch": 1.2298256939961265, + "grad_norm": 0.5872400403022766, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 3810 + }, + { + "epoch": 1.2330535829567464, + "grad_norm": 0.45765596628189087, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 3820 + }, + { + "epoch": 1.236281471917366, + "grad_norm": 0.5676377415657043, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 3830 + }, + { + "epoch": 1.2395093608779857, + "grad_norm": 0.4793425500392914, + "learning_rate": 0.0002, + "loss": 0.7696, + "step": 3840 + }, + { + "epoch": 1.2427372498386056, + "grad_norm": 0.5060022473335266, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 3850 + }, + { + "epoch": 1.2459651387992252, + "grad_norm": 0.6140682697296143, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 3860 + }, + { + "epoch": 1.249193027759845, + "grad_norm": 0.5030326843261719, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 3870 + }, + { + "epoch": 1.2524209167204647, + "grad_norm": 0.6609430909156799, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 3880 + }, + { + "epoch": 1.2556488056810846, + "grad_norm": 0.5459545850753784, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 3890 + }, + { + "epoch": 1.2588766946417043, + "grad_norm": 0.5328870415687561, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 3900 + }, + { + "epoch": 1.2621045836023241, + "grad_norm": 0.5840652585029602, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 3910 + }, + { + "epoch": 1.2653324725629438, + "grad_norm": 0.5587584376335144, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 3920 + }, + { + "epoch": 1.2685603615235637, + "grad_norm": 0.5886949896812439, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 3930 + }, + { + "epoch": 1.2717882504841833, + "grad_norm": 0.5128693580627441, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 3940 + }, + { + "epoch": 1.2750161394448032, + "grad_norm": 0.6207669377326965, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 3950 + }, + { + "epoch": 1.2782440284054228, + "grad_norm": 0.5789574384689331, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 3960 + }, + { + "epoch": 1.2814719173660425, + "grad_norm": 0.503162145614624, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 3970 + }, + { + "epoch": 1.2846998063266624, + "grad_norm": 0.6670064926147461, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 3980 + }, + { + "epoch": 1.2879276952872822, + "grad_norm": 0.5676213502883911, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 3990 + }, + { + "epoch": 1.2911555842479019, + "grad_norm": 0.5383169054985046, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 4000 + }, + { + "epoch": 1.2943834732085215, + "grad_norm": 0.714743971824646, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 4010 + }, + { + "epoch": 1.2976113621691414, + "grad_norm": 0.5740262269973755, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 4020 + }, + { + "epoch": 1.300839251129761, + "grad_norm": 0.6143045425415039, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 4030 + }, + { + "epoch": 1.304067140090381, + "grad_norm": 0.501025378704071, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 4040 + }, + { + "epoch": 1.3072950290510006, + "grad_norm": 0.5784100294113159, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 4050 + }, + { + "epoch": 1.3105229180116205, + "grad_norm": 0.6182606220245361, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 4060 + }, + { + "epoch": 1.3137508069722401, + "grad_norm": 0.5072231292724609, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 4070 + }, + { + "epoch": 1.31697869593286, + "grad_norm": 0.6841012835502625, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 4080 + }, + { + "epoch": 1.3202065848934796, + "grad_norm": 0.697257936000824, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 4090 + }, + { + "epoch": 1.3234344738540993, + "grad_norm": 0.5113214254379272, + "learning_rate": 0.0002, + "loss": 0.7401, + "step": 4100 + }, + { + "epoch": 1.3266623628147192, + "grad_norm": 0.6270561814308167, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 4110 + }, + { + "epoch": 1.329890251775339, + "grad_norm": 0.5525947213172913, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 4120 + }, + { + "epoch": 1.3331181407359587, + "grad_norm": 0.546071469783783, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 4130 + }, + { + "epoch": 1.3363460296965783, + "grad_norm": 0.6516721248626709, + "learning_rate": 0.0002, + "loss": 0.7884, + "step": 4140 + }, + { + "epoch": 1.3395739186571982, + "grad_norm": 0.6235111355781555, + "learning_rate": 0.0002, + "loss": 0.755, + "step": 4150 + }, + { + "epoch": 1.3428018076178179, + "grad_norm": 0.538649320602417, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 4160 + }, + { + "epoch": 1.3460296965784377, + "grad_norm": 0.5367001891136169, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 4170 + }, + { + "epoch": 1.3492575855390574, + "grad_norm": 0.6134631037712097, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 4180 + }, + { + "epoch": 1.3524854744996773, + "grad_norm": 0.5827262997627258, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 4190 + }, + { + "epoch": 1.355713363460297, + "grad_norm": 0.5706096291542053, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 4200 + }, + { + "epoch": 1.3589412524209168, + "grad_norm": 0.6422057151794434, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 4210 + }, + { + "epoch": 1.3621691413815364, + "grad_norm": 0.6316141486167908, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 4220 + }, + { + "epoch": 1.365397030342156, + "grad_norm": 0.6946983933448792, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 4230 + }, + { + "epoch": 1.368624919302776, + "grad_norm": 0.5381525754928589, + "learning_rate": 0.0002, + "loss": 0.7388, + "step": 4240 + }, + { + "epoch": 1.3718528082633958, + "grad_norm": 0.5484845638275146, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 4250 + }, + { + "epoch": 1.3750806972240155, + "grad_norm": 0.5961896777153015, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 4260 + }, + { + "epoch": 1.3783085861846351, + "grad_norm": 0.6041752696037292, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 4270 + }, + { + "epoch": 1.381536475145255, + "grad_norm": 0.6283464431762695, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 4280 + }, + { + "epoch": 1.384764364105875, + "grad_norm": 0.6761324405670166, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 4290 + }, + { + "epoch": 1.3879922530664945, + "grad_norm": 0.504311203956604, + "learning_rate": 0.0002, + "loss": 0.7381, + "step": 4300 + }, + { + "epoch": 1.3912201420271142, + "grad_norm": 0.6100395917892456, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 4310 + }, + { + "epoch": 1.394448030987734, + "grad_norm": 0.6245788335800171, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 4320 + }, + { + "epoch": 1.3976759199483537, + "grad_norm": 0.6074621081352234, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 4330 + }, + { + "epoch": 1.4009038089089736, + "grad_norm": 0.6683838963508606, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 4340 + }, + { + "epoch": 1.4041316978695932, + "grad_norm": 0.622998058795929, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 4350 + }, + { + "epoch": 1.4073595868302131, + "grad_norm": 0.6089423894882202, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 4360 + }, + { + "epoch": 1.4105874757908328, + "grad_norm": 0.6381658911705017, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 4370 + }, + { + "epoch": 1.4138153647514526, + "grad_norm": 0.5419308543205261, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4380 + }, + { + "epoch": 1.4170432537120723, + "grad_norm": 0.6026232242584229, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 4390 + }, + { + "epoch": 1.420271142672692, + "grad_norm": 0.4911101162433624, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 4400 + }, + { + "epoch": 1.4234990316333118, + "grad_norm": 0.6302908062934875, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 4410 + }, + { + "epoch": 1.4267269205939317, + "grad_norm": 0.6692768931388855, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 4420 + }, + { + "epoch": 1.4299548095545513, + "grad_norm": 0.46294572949409485, + "learning_rate": 0.0002, + "loss": 0.7312, + "step": 4430 + }, + { + "epoch": 1.433182698515171, + "grad_norm": 0.5452619194984436, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 4440 + }, + { + "epoch": 1.4364105874757909, + "grad_norm": 0.7809233069419861, + "learning_rate": 0.0002, + "loss": 0.7974, + "step": 4450 + }, + { + "epoch": 1.4396384764364105, + "grad_norm": 0.550088107585907, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 4460 + }, + { + "epoch": 1.4428663653970304, + "grad_norm": 0.7139151096343994, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 4470 + }, + { + "epoch": 1.44609425435765, + "grad_norm": 0.6187090873718262, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 4480 + }, + { + "epoch": 1.44932214331827, + "grad_norm": 0.5948249101638794, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 4490 + }, + { + "epoch": 1.4525500322788896, + "grad_norm": 0.6510892510414124, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 4500 + }, + { + "epoch": 1.4557779212395094, + "grad_norm": 0.6552293300628662, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 4510 + }, + { + "epoch": 1.459005810200129, + "grad_norm": 0.585574209690094, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 4520 + }, + { + "epoch": 1.4622336991607487, + "grad_norm": 0.4830162823200226, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 4530 + }, + { + "epoch": 1.4654615881213686, + "grad_norm": 0.5780223608016968, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 4540 + }, + { + "epoch": 1.4686894770819885, + "grad_norm": 0.5462607145309448, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 4550 + }, + { + "epoch": 1.4719173660426081, + "grad_norm": 0.5183546543121338, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 4560 + }, + { + "epoch": 1.4751452550032278, + "grad_norm": 0.676917552947998, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 4570 + }, + { + "epoch": 1.4783731439638477, + "grad_norm": 0.5772345066070557, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 4580 + }, + { + "epoch": 1.4816010329244673, + "grad_norm": 0.7320035696029663, + "learning_rate": 0.0002, + "loss": 0.7709, + "step": 4590 + }, + { + "epoch": 1.4848289218850872, + "grad_norm": 0.5024042129516602, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 4600 + }, + { + "epoch": 1.4880568108457068, + "grad_norm": 0.5482868552207947, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 4610 + }, + { + "epoch": 1.4912846998063267, + "grad_norm": 0.5447399616241455, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 4620 + }, + { + "epoch": 1.4945125887669464, + "grad_norm": 0.5953414440155029, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 4630 + }, + { + "epoch": 1.4977404777275662, + "grad_norm": 0.6983066201210022, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 4640 + }, + { + "epoch": 1.500968366688186, + "grad_norm": 0.586327075958252, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 4650 + }, + { + "epoch": 1.5041962556488055, + "grad_norm": 0.5839682221412659, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 4660 + }, + { + "epoch": 1.5074241446094254, + "grad_norm": 0.5959209203720093, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 4670 + }, + { + "epoch": 1.5106520335700453, + "grad_norm": 0.5073857307434082, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 4680 + }, + { + "epoch": 1.513879922530665, + "grad_norm": 0.5183001160621643, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 4690 + }, + { + "epoch": 1.5171078114912846, + "grad_norm": 0.593530535697937, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 4700 + }, + { + "epoch": 1.5203357004519045, + "grad_norm": 0.675993025302887, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 4710 + }, + { + "epoch": 1.5235635894125243, + "grad_norm": 0.5823286771774292, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 4720 + }, + { + "epoch": 1.526791478373144, + "grad_norm": 0.5825035572052002, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 4730 + }, + { + "epoch": 1.5300193673337636, + "grad_norm": 0.5689691305160522, + "learning_rate": 0.0002, + "loss": 0.8287, + "step": 4740 + }, + { + "epoch": 1.5332472562943835, + "grad_norm": 0.6037150621414185, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 4750 + }, + { + "epoch": 1.5364751452550034, + "grad_norm": 0.6393677592277527, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 4760 + }, + { + "epoch": 1.539703034215623, + "grad_norm": 0.5926381945610046, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 4770 + }, + { + "epoch": 1.5429309231762427, + "grad_norm": 0.9468599557876587, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 4780 + }, + { + "epoch": 1.5461588121368623, + "grad_norm": 0.7544237375259399, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 4790 + }, + { + "epoch": 1.5493867010974822, + "grad_norm": 0.5308566093444824, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 4800 + }, + { + "epoch": 1.552614590058102, + "grad_norm": 0.6590296030044556, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 4810 + }, + { + "epoch": 1.5558424790187217, + "grad_norm": 0.5630404353141785, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 4820 + }, + { + "epoch": 1.5590703679793414, + "grad_norm": 0.6800200939178467, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 4830 + }, + { + "epoch": 1.5622982569399613, + "grad_norm": 0.5463718175888062, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 4840 + }, + { + "epoch": 1.5655261459005811, + "grad_norm": 0.505135178565979, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 4850 + }, + { + "epoch": 1.5687540348612008, + "grad_norm": 0.5469676852226257, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 4860 + }, + { + "epoch": 1.5719819238218204, + "grad_norm": 0.5318337678909302, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 4870 + }, + { + "epoch": 1.5752098127824403, + "grad_norm": 0.7287914752960205, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 4880 + }, + { + "epoch": 1.5784377017430602, + "grad_norm": 0.7318989038467407, + "learning_rate": 0.0002, + "loss": 0.7532, + "step": 4890 + }, + { + "epoch": 1.5816655907036798, + "grad_norm": 0.6499921679496765, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 4900 + }, + { + "epoch": 1.5848934796642995, + "grad_norm": 0.47907355427742004, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 4910 + }, + { + "epoch": 1.5881213686249191, + "grad_norm": 0.7338833808898926, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 4920 + }, + { + "epoch": 1.591349257585539, + "grad_norm": 0.5800719261169434, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 4930 + }, + { + "epoch": 1.594577146546159, + "grad_norm": 0.5365763306617737, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 4940 + }, + { + "epoch": 1.5978050355067785, + "grad_norm": 0.5800772309303284, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 4950 + }, + { + "epoch": 1.6010329244673982, + "grad_norm": 0.7878010869026184, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 4960 + }, + { + "epoch": 1.604260813428018, + "grad_norm": 0.5919058918952942, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 4970 + }, + { + "epoch": 1.607488702388638, + "grad_norm": 0.5004435181617737, + "learning_rate": 0.0002, + "loss": 0.7762, + "step": 4980 + }, + { + "epoch": 1.6107165913492576, + "grad_norm": 0.6299242377281189, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 4990 + }, + { + "epoch": 1.6139444803098772, + "grad_norm": 0.6307242512702942, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 5000 + }, + { + "epoch": 1.6171723692704971, + "grad_norm": 0.7838703989982605, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 5010 + }, + { + "epoch": 1.620400258231117, + "grad_norm": 0.6454671621322632, + "learning_rate": 0.0002, + "loss": 0.7364, + "step": 5020 + }, + { + "epoch": 1.6236281471917366, + "grad_norm": 0.5907095670700073, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 5030 + }, + { + "epoch": 1.6268560361523563, + "grad_norm": 0.6053501963615417, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 5040 + }, + { + "epoch": 1.630083925112976, + "grad_norm": 0.5644670128822327, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 5050 + }, + { + "epoch": 1.6333118140735958, + "grad_norm": 0.6320949792861938, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 5060 + }, + { + "epoch": 1.6365397030342157, + "grad_norm": 0.6101489067077637, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 5070 + }, + { + "epoch": 1.6397675919948353, + "grad_norm": 0.9435283541679382, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 5080 + }, + { + "epoch": 1.642995480955455, + "grad_norm": 0.6668919324874878, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 5090 + }, + { + "epoch": 1.6462233699160749, + "grad_norm": 0.6160340905189514, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 5100 + }, + { + "epoch": 1.6494512588766947, + "grad_norm": 0.5999835729598999, + "learning_rate": 0.0002, + "loss": 0.7461, + "step": 5110 + }, + { + "epoch": 1.6526791478373144, + "grad_norm": 0.9378551840782166, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 5120 + }, + { + "epoch": 1.655907036797934, + "grad_norm": 0.4795055389404297, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 5130 + }, + { + "epoch": 1.659134925758554, + "grad_norm": 0.4878861606121063, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 5140 + }, + { + "epoch": 1.6623628147191738, + "grad_norm": 0.6042965054512024, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 5150 + }, + { + "epoch": 1.6655907036797934, + "grad_norm": 0.5829901695251465, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 5160 + }, + { + "epoch": 1.668818592640413, + "grad_norm": 0.5168480277061462, + "learning_rate": 0.0002, + "loss": 0.7498, + "step": 5170 + }, + { + "epoch": 1.672046481601033, + "grad_norm": 0.6489511132240295, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 5180 + }, + { + "epoch": 1.6752743705616526, + "grad_norm": 0.5955966114997864, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 5190 + }, + { + "epoch": 1.6785022595222725, + "grad_norm": 0.6228088140487671, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 5200 + }, + { + "epoch": 1.6817301484828922, + "grad_norm": 0.5726390480995178, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 5210 + }, + { + "epoch": 1.6849580374435118, + "grad_norm": 0.6116343140602112, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 5220 + }, + { + "epoch": 1.6881859264041317, + "grad_norm": 0.5483687520027161, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 5230 + }, + { + "epoch": 1.6914138153647515, + "grad_norm": 0.570941686630249, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 5240 + }, + { + "epoch": 1.6946417043253712, + "grad_norm": 0.6048086285591125, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 5250 + }, + { + "epoch": 1.6978695932859909, + "grad_norm": 0.6769003868103027, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 5260 + }, + { + "epoch": 1.7010974822466107, + "grad_norm": 0.5629057884216309, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 5270 + }, + { + "epoch": 1.7043253712072306, + "grad_norm": 0.657341480255127, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 5280 + }, + { + "epoch": 1.7075532601678503, + "grad_norm": 0.6256147623062134, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 5290 + }, + { + "epoch": 1.71078114912847, + "grad_norm": 0.5498088002204895, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 5300 + }, + { + "epoch": 1.7140090380890898, + "grad_norm": 0.5078358054161072, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 5310 + }, + { + "epoch": 1.7172369270497096, + "grad_norm": 0.6696692705154419, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 5320 + }, + { + "epoch": 1.7204648160103293, + "grad_norm": 0.6692847013473511, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 5330 + }, + { + "epoch": 1.723692704970949, + "grad_norm": 0.5415751934051514, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 5340 + }, + { + "epoch": 1.7269205939315686, + "grad_norm": 0.5367611050605774, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 5350 + }, + { + "epoch": 1.7301484828921885, + "grad_norm": 0.7321061491966248, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 5360 + }, + { + "epoch": 1.7333763718528084, + "grad_norm": 0.723972499370575, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5370 + }, + { + "epoch": 1.736604260813428, + "grad_norm": 0.7328100204467773, + "learning_rate": 0.0002, + "loss": 0.7077, + "step": 5380 + }, + { + "epoch": 1.7398321497740477, + "grad_norm": 0.5785264372825623, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 5390 + }, + { + "epoch": 1.7430600387346675, + "grad_norm": 0.7812932133674622, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 5400 + }, + { + "epoch": 1.7462879276952874, + "grad_norm": 0.6493327617645264, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 5410 + }, + { + "epoch": 1.749515816655907, + "grad_norm": 0.5825939774513245, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 5420 + }, + { + "epoch": 1.7527437056165267, + "grad_norm": 0.6969610452651978, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 5430 + }, + { + "epoch": 1.7559715945771466, + "grad_norm": 0.5558062195777893, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 5440 + }, + { + "epoch": 1.7591994835377665, + "grad_norm": 0.49222221970558167, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 5450 + }, + { + "epoch": 1.762427372498386, + "grad_norm": 0.5844656825065613, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 5460 + }, + { + "epoch": 1.7656552614590058, + "grad_norm": 0.8706597685813904, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 5470 + }, + { + "epoch": 1.7688831504196254, + "grad_norm": 0.6167706251144409, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 5480 + }, + { + "epoch": 1.7721110393802453, + "grad_norm": 0.5890011787414551, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 5490 + }, + { + "epoch": 1.7753389283408652, + "grad_norm": 0.6551728248596191, + "learning_rate": 0.0002, + "loss": 0.8319, + "step": 5500 + }, + { + "epoch": 1.7785668173014848, + "grad_norm": 0.5848751068115234, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 5510 + }, + { + "epoch": 1.7817947062621045, + "grad_norm": 0.6664014458656311, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 5520 + }, + { + "epoch": 1.7850225952227243, + "grad_norm": 0.5931693911552429, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 5530 + }, + { + "epoch": 1.7882504841833442, + "grad_norm": 0.5534724593162537, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 5540 + }, + { + "epoch": 1.7914783731439639, + "grad_norm": 0.5590878129005432, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 5550 + }, + { + "epoch": 1.7947062621045835, + "grad_norm": 0.6947470903396606, + "learning_rate": 0.0002, + "loss": 0.7406, + "step": 5560 + }, + { + "epoch": 1.7979341510652034, + "grad_norm": 0.6104130148887634, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 5570 + }, + { + "epoch": 1.8011620400258233, + "grad_norm": 0.6135714054107666, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 5580 + }, + { + "epoch": 1.804389928986443, + "grad_norm": 0.6626853346824646, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 5590 + }, + { + "epoch": 1.8076178179470626, + "grad_norm": 0.6977612972259521, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 5600 + }, + { + "epoch": 1.8108457069076824, + "grad_norm": 0.6275238394737244, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 5610 + }, + { + "epoch": 1.814073595868302, + "grad_norm": 0.5017505288124084, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 5620 + }, + { + "epoch": 1.817301484828922, + "grad_norm": 0.8314290642738342, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 5630 + }, + { + "epoch": 1.8205293737895416, + "grad_norm": 0.6863582134246826, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 5640 + }, + { + "epoch": 1.8237572627501613, + "grad_norm": 0.69544917345047, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 5650 + }, + { + "epoch": 1.8269851517107811, + "grad_norm": 0.515499472618103, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 5660 + }, + { + "epoch": 1.830213040671401, + "grad_norm": 0.6100873947143555, + "learning_rate": 0.0002, + "loss": 0.7166, + "step": 5670 + }, + { + "epoch": 1.8334409296320207, + "grad_norm": 0.67416912317276, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 5680 + }, + { + "epoch": 1.8366688185926403, + "grad_norm": 0.7057772278785706, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 5690 + }, + { + "epoch": 1.8398967075532602, + "grad_norm": 0.7374551892280579, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 5700 + }, + { + "epoch": 1.84312459651388, + "grad_norm": 0.6266297101974487, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 5710 + }, + { + "epoch": 1.8463524854744997, + "grad_norm": 0.5629227757453918, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 5720 + }, + { + "epoch": 1.8495803744351194, + "grad_norm": 0.6603655815124512, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 5730 + }, + { + "epoch": 1.8528082633957392, + "grad_norm": 0.8113715052604675, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 5740 + }, + { + "epoch": 1.856036152356359, + "grad_norm": 0.7143914103507996, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 5750 + }, + { + "epoch": 1.8592640413169788, + "grad_norm": 0.6273732781410217, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 5760 + }, + { + "epoch": 1.8624919302775984, + "grad_norm": 0.5428690910339355, + "learning_rate": 0.0002, + "loss": 0.7962, + "step": 5770 + }, + { + "epoch": 1.865719819238218, + "grad_norm": 0.6405037641525269, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 5780 + }, + { + "epoch": 1.868947708198838, + "grad_norm": 0.700873613357544, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 5790 + }, + { + "epoch": 1.8721755971594578, + "grad_norm": 0.5645238161087036, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 5800 + }, + { + "epoch": 1.8754034861200775, + "grad_norm": 0.8780353665351868, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 5810 + }, + { + "epoch": 1.878631375080697, + "grad_norm": 0.6295409798622131, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 5820 + }, + { + "epoch": 1.881859264041317, + "grad_norm": 0.678269624710083, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 5830 + }, + { + "epoch": 1.8850871530019369, + "grad_norm": 0.6464608907699585, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5840 + }, + { + "epoch": 1.8883150419625565, + "grad_norm": 0.6201048493385315, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 5850 + }, + { + "epoch": 1.8915429309231762, + "grad_norm": 0.6046274304389954, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 5860 + }, + { + "epoch": 1.894770819883796, + "grad_norm": 0.7532408833503723, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 5870 + }, + { + "epoch": 1.897998708844416, + "grad_norm": 0.6066767573356628, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 5880 + }, + { + "epoch": 1.9012265978050356, + "grad_norm": 0.6289830207824707, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 5890 + }, + { + "epoch": 1.9044544867656552, + "grad_norm": 0.5204319953918457, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 5900 + }, + { + "epoch": 1.9076823757262749, + "grad_norm": 0.6708219647407532, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 5910 + }, + { + "epoch": 1.9109102646868947, + "grad_norm": 0.4915677309036255, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 5920 + }, + { + "epoch": 1.9141381536475146, + "grad_norm": 0.652717113494873, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 5930 + }, + { + "epoch": 1.9173660426081343, + "grad_norm": 0.5446316003799438, + "learning_rate": 0.0002, + "loss": 0.7687, + "step": 5940 + }, + { + "epoch": 1.920593931568754, + "grad_norm": 0.4958149194717407, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 5950 + }, + { + "epoch": 1.9238218205293738, + "grad_norm": 0.5623434782028198, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 5960 + }, + { + "epoch": 1.9270497094899937, + "grad_norm": 0.6855450868606567, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 5970 + }, + { + "epoch": 1.9302775984506133, + "grad_norm": 0.5710492730140686, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 5980 + }, + { + "epoch": 1.933505487411233, + "grad_norm": 0.5379431843757629, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 5990 + }, + { + "epoch": 1.9367333763718528, + "grad_norm": 0.557129442691803, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 6000 + }, + { + "epoch": 1.9399612653324727, + "grad_norm": 0.6336663961410522, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 6010 + }, + { + "epoch": 1.9431891542930924, + "grad_norm": 0.5950582027435303, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 6020 + }, + { + "epoch": 1.946417043253712, + "grad_norm": 0.5905954837799072, + "learning_rate": 0.0002, + "loss": 0.7443, + "step": 6030 + }, + { + "epoch": 1.9496449322143317, + "grad_norm": 0.6688982844352722, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 6040 + }, + { + "epoch": 1.9528728211749515, + "grad_norm": 0.5440775752067566, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 6050 + }, + { + "epoch": 1.9561007101355714, + "grad_norm": 0.6207906603813171, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 6060 + }, + { + "epoch": 1.959328599096191, + "grad_norm": 0.6999374628067017, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 6070 + }, + { + "epoch": 1.9625564880568107, + "grad_norm": 0.6310848593711853, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 6080 + }, + { + "epoch": 1.9657843770174306, + "grad_norm": 0.5903388261795044, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 6090 + }, + { + "epoch": 1.9690122659780505, + "grad_norm": 0.6333889961242676, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 6100 + }, + { + "epoch": 1.97224015493867, + "grad_norm": 0.5604711174964905, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 6110 + }, + { + "epoch": 1.9754680438992898, + "grad_norm": 0.9234541654586792, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 6120 + }, + { + "epoch": 1.9786959328599096, + "grad_norm": 0.6149102449417114, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 6130 + }, + { + "epoch": 1.9819238218205295, + "grad_norm": 0.615446150302887, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 6140 + }, + { + "epoch": 1.9851517107811492, + "grad_norm": 0.5176635980606079, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 6150 + }, + { + "epoch": 1.9883795997417688, + "grad_norm": 0.7124109864234924, + "learning_rate": 0.0002, + "loss": 0.718, + "step": 6160 + }, + { + "epoch": 1.9916074887023887, + "grad_norm": 0.6317567825317383, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 6170 + }, + { + "epoch": 1.9948353776630086, + "grad_norm": 0.6855016350746155, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 6180 + }, + { + "epoch": 1.9980632666236282, + "grad_norm": 0.6423715353012085, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 6190 + }, + { + "epoch": 2.0, + "eval_loss": 1.1096643209457397, + "eval_runtime": 147.7997, + "eval_samples_per_second": 4.959, + "eval_steps_per_second": 0.622, + "step": 6196 + }, + { + "epoch": 2.001291155584248, + "grad_norm": 0.5322932600975037, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 6200 + }, + { + "epoch": 2.0045190445448675, + "grad_norm": 0.8152306079864502, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 6210 + }, + { + "epoch": 2.0077469335054876, + "grad_norm": 0.6215983033180237, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 6220 + }, + { + "epoch": 2.0109748224661073, + "grad_norm": 0.845498263835907, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 6230 + }, + { + "epoch": 2.014202711426727, + "grad_norm": 0.733559787273407, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 6240 + }, + { + "epoch": 2.0174306003873466, + "grad_norm": 0.51433926820755, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 6250 + }, + { + "epoch": 2.020658489347966, + "grad_norm": 0.6374049782752991, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 6260 + }, + { + "epoch": 2.0238863783085863, + "grad_norm": 0.7833638191223145, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 6270 + }, + { + "epoch": 2.027114267269206, + "grad_norm": 0.8929463028907776, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 6280 + }, + { + "epoch": 2.0303421562298256, + "grad_norm": 0.669731855392456, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 6290 + }, + { + "epoch": 2.0335700451904453, + "grad_norm": 0.5846071243286133, + "learning_rate": 0.0002, + "loss": 0.646, + "step": 6300 + }, + { + "epoch": 2.0367979341510654, + "grad_norm": 0.7087787985801697, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 6310 + }, + { + "epoch": 2.040025823111685, + "grad_norm": 0.6739160418510437, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 6320 + }, + { + "epoch": 2.0432537120723047, + "grad_norm": 0.4860886335372925, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 6330 + }, + { + "epoch": 2.0464816010329243, + "grad_norm": 0.7201244831085205, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 6340 + }, + { + "epoch": 2.0497094899935444, + "grad_norm": 0.7409170269966125, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 6350 + }, + { + "epoch": 2.052937378954164, + "grad_norm": 0.6843920350074768, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 6360 + }, + { + "epoch": 2.0561652679147837, + "grad_norm": 0.7519999742507935, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 6370 + }, + { + "epoch": 2.0593931568754034, + "grad_norm": 0.5732819437980652, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 6380 + }, + { + "epoch": 2.062621045836023, + "grad_norm": 0.7565118074417114, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 6390 + }, + { + "epoch": 2.065848934796643, + "grad_norm": 0.8147150278091431, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 6400 + }, + { + "epoch": 2.0690768237572628, + "grad_norm": 0.6941924691200256, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 6410 + }, + { + "epoch": 2.0723047127178824, + "grad_norm": 0.6549784541130066, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 6420 + }, + { + "epoch": 2.075532601678502, + "grad_norm": 0.7224905490875244, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 6430 + }, + { + "epoch": 2.078760490639122, + "grad_norm": 0.7754863500595093, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 6440 + }, + { + "epoch": 2.081988379599742, + "grad_norm": 0.691318154335022, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 6450 + }, + { + "epoch": 2.0852162685603615, + "grad_norm": 0.6009294986724854, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 6460 + }, + { + "epoch": 2.088444157520981, + "grad_norm": 0.6753945350646973, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 6470 + }, + { + "epoch": 2.091672046481601, + "grad_norm": 0.6899921298027039, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 6480 + }, + { + "epoch": 2.094899935442221, + "grad_norm": 0.846510648727417, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 6490 + }, + { + "epoch": 2.0981278244028405, + "grad_norm": 0.6432605981826782, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 6500 + }, + { + "epoch": 2.10135571336346, + "grad_norm": 0.8125239014625549, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 6510 + }, + { + "epoch": 2.1045836023240803, + "grad_norm": 0.628302812576294, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 6520 + }, + { + "epoch": 2.1078114912847, + "grad_norm": 0.7164334654808044, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 6530 + }, + { + "epoch": 2.1110393802453196, + "grad_norm": 0.7476949095726013, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 6540 + }, + { + "epoch": 2.114267269205939, + "grad_norm": 0.7577515840530396, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 6550 + }, + { + "epoch": 2.117495158166559, + "grad_norm": 0.5684467554092407, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 6560 + }, + { + "epoch": 2.120723047127179, + "grad_norm": 0.6121789216995239, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 6570 + }, + { + "epoch": 2.1239509360877986, + "grad_norm": 0.6095348596572876, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 6580 + }, + { + "epoch": 2.1271788250484183, + "grad_norm": 0.7803651690483093, + "learning_rate": 0.0002, + "loss": 0.6276, + "step": 6590 + }, + { + "epoch": 2.130406714009038, + "grad_norm": 0.5990583300590515, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 6600 + }, + { + "epoch": 2.133634602969658, + "grad_norm": 0.6569220423698425, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 6610 + }, + { + "epoch": 2.1368624919302777, + "grad_norm": 0.5961166620254517, + "learning_rate": 0.0002, + "loss": 0.7049, + "step": 6620 + }, + { + "epoch": 2.1400903808908973, + "grad_norm": 0.5860554575920105, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 6630 + }, + { + "epoch": 2.143318269851517, + "grad_norm": 0.5994001626968384, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 6640 + }, + { + "epoch": 2.146546158812137, + "grad_norm": 0.7723015546798706, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 6650 + }, + { + "epoch": 2.1497740477727567, + "grad_norm": 0.676355242729187, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 6660 + }, + { + "epoch": 2.1530019367333764, + "grad_norm": 0.5689092874526978, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 6670 + }, + { + "epoch": 2.156229825693996, + "grad_norm": 0.6933727264404297, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 6680 + }, + { + "epoch": 2.159457714654616, + "grad_norm": 0.8380527496337891, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 6690 + }, + { + "epoch": 2.1626856036152358, + "grad_norm": 0.6876497268676758, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 6700 + }, + { + "epoch": 2.1659134925758554, + "grad_norm": 0.6418334245681763, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 6710 + }, + { + "epoch": 2.169141381536475, + "grad_norm": 0.7169192433357239, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 6720 + }, + { + "epoch": 2.1723692704970947, + "grad_norm": 0.6664170622825623, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 6730 + }, + { + "epoch": 2.175597159457715, + "grad_norm": 0.6011993288993835, + "learning_rate": 0.0002, + "loss": 0.6751, + "step": 6740 + }, + { + "epoch": 2.1788250484183345, + "grad_norm": 0.5529947280883789, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 6750 + }, + { + "epoch": 2.182052937378954, + "grad_norm": 0.6879532933235168, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 6760 + }, + { + "epoch": 2.1852808263395738, + "grad_norm": 0.6426113843917847, + "learning_rate": 0.0002, + "loss": 0.6634, + "step": 6770 + }, + { + "epoch": 2.188508715300194, + "grad_norm": 0.6571047306060791, + "learning_rate": 0.0002, + "loss": 0.6592, + "step": 6780 + }, + { + "epoch": 2.1917366042608135, + "grad_norm": 0.6400564908981323, + "learning_rate": 0.0002, + "loss": 0.6494, + "step": 6790 + }, + { + "epoch": 2.194964493221433, + "grad_norm": 0.6509664058685303, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 6800 + }, + { + "epoch": 2.198192382182053, + "grad_norm": 0.6673197150230408, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 6810 + }, + { + "epoch": 2.2014202711426725, + "grad_norm": 0.48205727338790894, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 6820 + }, + { + "epoch": 2.2046481601032926, + "grad_norm": 0.849525511264801, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 6830 + }, + { + "epoch": 2.207876049063912, + "grad_norm": 0.6150892376899719, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 6840 + }, + { + "epoch": 2.211103938024532, + "grad_norm": 0.7826945781707764, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 6850 + }, + { + "epoch": 2.2143318269851515, + "grad_norm": 0.5711963772773743, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 6860 + }, + { + "epoch": 2.2175597159457716, + "grad_norm": 0.6017758846282959, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 6870 + }, + { + "epoch": 2.2207876049063913, + "grad_norm": 0.785434901714325, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 6880 + }, + { + "epoch": 2.224015493867011, + "grad_norm": 0.6251688599586487, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 6890 + }, + { + "epoch": 2.2272433828276306, + "grad_norm": 0.8242034316062927, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 6900 + }, + { + "epoch": 2.2304712717882507, + "grad_norm": 0.7272933125495911, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 6910 + }, + { + "epoch": 2.2336991607488703, + "grad_norm": 0.7159379720687866, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 6920 + }, + { + "epoch": 2.23692704970949, + "grad_norm": 0.6518042087554932, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 6930 + }, + { + "epoch": 2.2401549386701096, + "grad_norm": 0.7365370392799377, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 6940 + }, + { + "epoch": 2.2433828276307297, + "grad_norm": 0.5674061179161072, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 6950 + }, + { + "epoch": 2.2466107165913494, + "grad_norm": 0.669185996055603, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 6960 + }, + { + "epoch": 2.249838605551969, + "grad_norm": 0.6638304591178894, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 6970 + }, + { + "epoch": 2.2530664945125887, + "grad_norm": 0.757006824016571, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 6980 + }, + { + "epoch": 2.2562943834732083, + "grad_norm": 0.7574930787086487, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 6990 + }, + { + "epoch": 2.2595222724338284, + "grad_norm": 0.7819514870643616, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 7000 + }, + { + "epoch": 2.262750161394448, + "grad_norm": 0.6987583041191101, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 7010 + }, + { + "epoch": 2.2659780503550677, + "grad_norm": 0.6628551483154297, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 7020 + }, + { + "epoch": 2.2692059393156874, + "grad_norm": 0.7855866551399231, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 7030 + }, + { + "epoch": 2.2724338282763075, + "grad_norm": 0.6102892756462097, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 7040 + }, + { + "epoch": 2.275661717236927, + "grad_norm": 0.7844198942184448, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 7050 + }, + { + "epoch": 2.2788896061975468, + "grad_norm": 0.6209492087364197, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 7060 + }, + { + "epoch": 2.2821174951581664, + "grad_norm": 0.8351290225982666, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 7070 + }, + { + "epoch": 2.285345384118786, + "grad_norm": 0.6883546710014343, + "learning_rate": 0.0002, + "loss": 0.6648, + "step": 7080 + }, + { + "epoch": 2.288573273079406, + "grad_norm": 0.6626381874084473, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 7090 + }, + { + "epoch": 2.291801162040026, + "grad_norm": 0.7216270565986633, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 7100 + }, + { + "epoch": 2.2950290510006455, + "grad_norm": 0.8246777057647705, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 7110 + }, + { + "epoch": 2.2982569399612656, + "grad_norm": 0.614326000213623, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 7120 + }, + { + "epoch": 2.301484828921885, + "grad_norm": 0.8785578012466431, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 7130 + }, + { + "epoch": 2.304712717882505, + "grad_norm": 0.7021808624267578, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 7140 + }, + { + "epoch": 2.3079406068431245, + "grad_norm": 0.6999403238296509, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 7150 + }, + { + "epoch": 2.311168495803744, + "grad_norm": 0.8013143539428711, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 7160 + }, + { + "epoch": 2.3143963847643643, + "grad_norm": 0.6592583060264587, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 7170 + }, + { + "epoch": 2.317624273724984, + "grad_norm": 0.6260249018669128, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 7180 + }, + { + "epoch": 2.3208521626856036, + "grad_norm": 0.9352797269821167, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 7190 + }, + { + "epoch": 2.324080051646223, + "grad_norm": 0.6629612445831299, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 7200 + }, + { + "epoch": 2.3273079406068433, + "grad_norm": 0.7062810063362122, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 7210 + }, + { + "epoch": 2.330535829567463, + "grad_norm": 0.7236241102218628, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 7220 + }, + { + "epoch": 2.3337637185280826, + "grad_norm": 0.7528148293495178, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 7230 + }, + { + "epoch": 2.3369916074887023, + "grad_norm": 0.7604748606681824, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 7240 + }, + { + "epoch": 2.340219496449322, + "grad_norm": 0.5601189136505127, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 7250 + }, + { + "epoch": 2.343447385409942, + "grad_norm": 0.7099230885505676, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 7260 + }, + { + "epoch": 2.3466752743705617, + "grad_norm": 0.6699047684669495, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 7270 + }, + { + "epoch": 2.3499031633311813, + "grad_norm": 0.7315047979354858, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 7280 + }, + { + "epoch": 2.353131052291801, + "grad_norm": 0.632836103439331, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 7290 + }, + { + "epoch": 2.356358941252421, + "grad_norm": 0.9410115480422974, + "learning_rate": 0.0002, + "loss": 0.6458, + "step": 7300 + }, + { + "epoch": 2.3595868302130407, + "grad_norm": 0.626554012298584, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 7310 + }, + { + "epoch": 2.3628147191736604, + "grad_norm": 0.7538444399833679, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 7320 + }, + { + "epoch": 2.36604260813428, + "grad_norm": 0.6826626062393188, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 7330 + }, + { + "epoch": 2.3692704970949, + "grad_norm": 0.6739391088485718, + "learning_rate": 0.0002, + "loss": 0.6752, + "step": 7340 + }, + { + "epoch": 2.3724983860555198, + "grad_norm": 0.7518446445465088, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 7350 + }, + { + "epoch": 2.3757262750161394, + "grad_norm": 0.714133083820343, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 7360 + }, + { + "epoch": 2.378954163976759, + "grad_norm": 0.7144588232040405, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 7370 + }, + { + "epoch": 2.382182052937379, + "grad_norm": 0.6598120927810669, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 7380 + }, + { + "epoch": 2.385409941897999, + "grad_norm": 0.7079148292541504, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 7390 + }, + { + "epoch": 2.3886378308586185, + "grad_norm": 0.6750902533531189, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 7400 + }, + { + "epoch": 2.391865719819238, + "grad_norm": 0.7181967496871948, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 7410 + }, + { + "epoch": 2.3950936087798578, + "grad_norm": 0.7720552086830139, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 7420 + }, + { + "epoch": 2.398321497740478, + "grad_norm": 0.7592426538467407, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 7430 + }, + { + "epoch": 2.4015493867010975, + "grad_norm": 0.7161896824836731, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 7440 + }, + { + "epoch": 2.404777275661717, + "grad_norm": 0.8019260764122009, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 7450 + }, + { + "epoch": 2.408005164622337, + "grad_norm": 0.7093342542648315, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 7460 + }, + { + "epoch": 2.411233053582957, + "grad_norm": 0.8464207649230957, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 7470 + }, + { + "epoch": 2.4144609425435766, + "grad_norm": 0.773666501045227, + "learning_rate": 0.0002, + "loss": 0.6724, + "step": 7480 + }, + { + "epoch": 2.4176888315041962, + "grad_norm": 0.8451611995697021, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 7490 + }, + { + "epoch": 2.420916720464816, + "grad_norm": 0.656795084476471, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 7500 + }, + { + "epoch": 2.4241446094254355, + "grad_norm": 0.7129034996032715, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 7510 + }, + { + "epoch": 2.4273724983860556, + "grad_norm": 0.8325763940811157, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 7520 + }, + { + "epoch": 2.4306003873466753, + "grad_norm": 0.7806527614593506, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 7530 + }, + { + "epoch": 2.433828276307295, + "grad_norm": 0.6994536519050598, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 7540 + }, + { + "epoch": 2.437056165267915, + "grad_norm": 0.6898999214172363, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 7550 + }, + { + "epoch": 2.4402840542285347, + "grad_norm": 0.719490647315979, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 7560 + }, + { + "epoch": 2.4435119431891543, + "grad_norm": 0.6841562390327454, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 7570 + }, + { + "epoch": 2.446739832149774, + "grad_norm": 0.7573311924934387, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 7580 + }, + { + "epoch": 2.4499677211103936, + "grad_norm": 0.7295880317687988, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 7590 + }, + { + "epoch": 2.4531956100710137, + "grad_norm": 0.710136353969574, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 7600 + }, + { + "epoch": 2.4564234990316334, + "grad_norm": 0.6126235127449036, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 7610 + }, + { + "epoch": 2.459651387992253, + "grad_norm": 0.8025609850883484, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 7620 + }, + { + "epoch": 2.4628792769528727, + "grad_norm": 0.7839472889900208, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 7630 + }, + { + "epoch": 2.4661071659134928, + "grad_norm": 0.7253499031066895, + "learning_rate": 0.0002, + "loss": 0.6797, + "step": 7640 + }, + { + "epoch": 2.4693350548741124, + "grad_norm": 0.7918946743011475, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 7650 + }, + { + "epoch": 2.472562943834732, + "grad_norm": 0.7930178046226501, + "learning_rate": 0.0002, + "loss": 0.6646, + "step": 7660 + }, + { + "epoch": 2.4757908327953517, + "grad_norm": 0.6826170086860657, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 7670 + }, + { + "epoch": 2.4790187217559714, + "grad_norm": 0.6576805114746094, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 7680 + }, + { + "epoch": 2.4822466107165915, + "grad_norm": 0.7012448310852051, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 7690 + }, + { + "epoch": 2.485474499677211, + "grad_norm": 0.7774284482002258, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 7700 + }, + { + "epoch": 2.4887023886378308, + "grad_norm": 0.6502766013145447, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 7710 + }, + { + "epoch": 2.4919302775984504, + "grad_norm": 0.7638739347457886, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 7720 + }, + { + "epoch": 2.4951581665590705, + "grad_norm": 0.6217384338378906, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 7730 + }, + { + "epoch": 2.49838605551969, + "grad_norm": 0.7576302886009216, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 7740 + }, + { + "epoch": 2.50161394448031, + "grad_norm": 0.6877137422561646, + "learning_rate": 0.0002, + "loss": 0.6855, + "step": 7750 + }, + { + "epoch": 2.5048418334409295, + "grad_norm": 0.6998329162597656, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 7760 + }, + { + "epoch": 2.508069722401549, + "grad_norm": 0.7879213690757751, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 7770 + }, + { + "epoch": 2.5112976113621692, + "grad_norm": 0.7834980487823486, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 7780 + }, + { + "epoch": 2.514525500322789, + "grad_norm": 0.7789630889892578, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 7790 + }, + { + "epoch": 2.5177533892834085, + "grad_norm": 0.7403590083122253, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 7800 + }, + { + "epoch": 2.5209812782440286, + "grad_norm": 0.6029766201972961, + "learning_rate": 0.0002, + "loss": 0.6964, + "step": 7810 + }, + { + "epoch": 2.5242091672046483, + "grad_norm": 0.7061092257499695, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 7820 + }, + { + "epoch": 2.527437056165268, + "grad_norm": 0.7120763659477234, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 7830 + }, + { + "epoch": 2.5306649451258876, + "grad_norm": 0.6173675656318665, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 7840 + }, + { + "epoch": 2.5338928340865072, + "grad_norm": 0.9566813111305237, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 7850 + }, + { + "epoch": 2.5371207230471273, + "grad_norm": 0.8497620224952698, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 7860 + }, + { + "epoch": 2.540348612007747, + "grad_norm": 0.7663498520851135, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 7870 + }, + { + "epoch": 2.5435765009683666, + "grad_norm": 0.6329668760299683, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 7880 + }, + { + "epoch": 2.5468043899289863, + "grad_norm": 0.8128195405006409, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 7890 + }, + { + "epoch": 2.5500322788896064, + "grad_norm": 0.6622284650802612, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 7900 + }, + { + "epoch": 2.553260167850226, + "grad_norm": 0.8460057973861694, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 7910 + }, + { + "epoch": 2.5564880568108457, + "grad_norm": 0.6586956977844238, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 7920 + }, + { + "epoch": 2.5597159457714653, + "grad_norm": 0.7569382190704346, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 7930 + }, + { + "epoch": 2.562943834732085, + "grad_norm": 0.6409714221954346, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 7940 + }, + { + "epoch": 2.566171723692705, + "grad_norm": 0.7031713128089905, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 7950 + }, + { + "epoch": 2.5693996126533247, + "grad_norm": 0.7983605265617371, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 7960 + }, + { + "epoch": 2.5726275016139444, + "grad_norm": 0.7165433168411255, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 7970 + }, + { + "epoch": 2.5758553905745645, + "grad_norm": 0.6630598902702332, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 7980 + }, + { + "epoch": 2.579083279535184, + "grad_norm": 0.5883122086524963, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 7990 + }, + { + "epoch": 2.5823111684958038, + "grad_norm": 0.5928755402565002, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 8000 + }, + { + "epoch": 2.5855390574564234, + "grad_norm": 0.7843712568283081, + "learning_rate": 0.0002, + "loss": 0.6701, + "step": 8010 + }, + { + "epoch": 2.588766946417043, + "grad_norm": 0.7206324338912964, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 8020 + }, + { + "epoch": 2.5919948353776627, + "grad_norm": 0.812480092048645, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 8030 + }, + { + "epoch": 2.595222724338283, + "grad_norm": 0.9843078255653381, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 8040 + }, + { + "epoch": 2.5984506132989025, + "grad_norm": 0.7524392604827881, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 8050 + }, + { + "epoch": 2.601678502259522, + "grad_norm": 0.6220380067825317, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 8060 + }, + { + "epoch": 2.6049063912201422, + "grad_norm": 0.7461398243904114, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 8070 + }, + { + "epoch": 2.608134280180762, + "grad_norm": 0.720974326133728, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 8080 + }, + { + "epoch": 2.6113621691413815, + "grad_norm": 0.649509847164154, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 8090 + }, + { + "epoch": 2.614590058102001, + "grad_norm": 0.6894662976264954, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 8100 + }, + { + "epoch": 2.617817947062621, + "grad_norm": 0.734433114528656, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 8110 + }, + { + "epoch": 2.621045836023241, + "grad_norm": 0.7468628883361816, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 8120 + }, + { + "epoch": 2.6242737249838606, + "grad_norm": 0.6508180499076843, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 8130 + }, + { + "epoch": 2.6275016139444802, + "grad_norm": 0.8735209107398987, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 8140 + }, + { + "epoch": 2.6307295029051003, + "grad_norm": 0.8162857294082642, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 8150 + }, + { + "epoch": 2.63395739186572, + "grad_norm": 0.628872811794281, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 8160 + }, + { + "epoch": 2.6371852808263396, + "grad_norm": 0.8078708052635193, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 8170 + }, + { + "epoch": 2.6404131697869593, + "grad_norm": 0.7849429845809937, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 8180 + }, + { + "epoch": 2.643641058747579, + "grad_norm": 0.8115387558937073, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 8190 + }, + { + "epoch": 2.6468689477081986, + "grad_norm": 0.7462222576141357, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 8200 + }, + { + "epoch": 2.6500968366688187, + "grad_norm": 0.753662645816803, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 8210 + }, + { + "epoch": 2.6533247256294383, + "grad_norm": 0.6100404858589172, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 8220 + }, + { + "epoch": 2.656552614590058, + "grad_norm": 0.9084606766700745, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 8230 + }, + { + "epoch": 2.659780503550678, + "grad_norm": 0.6412538886070251, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 8240 + }, + { + "epoch": 2.6630083925112977, + "grad_norm": 0.7640451192855835, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 8250 + }, + { + "epoch": 2.6662362814719174, + "grad_norm": 0.5972344875335693, + "learning_rate": 0.0002, + "loss": 0.6846, + "step": 8260 + }, + { + "epoch": 2.669464170432537, + "grad_norm": 0.6935883164405823, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 8270 + }, + { + "epoch": 2.6726920593931567, + "grad_norm": 0.789399266242981, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 8280 + }, + { + "epoch": 2.675919948353777, + "grad_norm": 0.7143490314483643, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 8290 + }, + { + "epoch": 2.6791478373143964, + "grad_norm": 0.6670652627944946, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 8300 + }, + { + "epoch": 2.682375726275016, + "grad_norm": 0.687108039855957, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 8310 + }, + { + "epoch": 2.6856036152356357, + "grad_norm": 0.7914147973060608, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 8320 + }, + { + "epoch": 2.688831504196256, + "grad_norm": 0.8398420214653015, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 8330 + }, + { + "epoch": 2.6920593931568755, + "grad_norm": 0.6592720746994019, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 8340 + }, + { + "epoch": 2.695287282117495, + "grad_norm": 0.6888470649719238, + "learning_rate": 0.0002, + "loss": 0.6673, + "step": 8350 + }, + { + "epoch": 2.698515171078115, + "grad_norm": 0.7127556800842285, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 8360 + }, + { + "epoch": 2.7017430600387344, + "grad_norm": 0.6630286574363708, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 8370 + }, + { + "epoch": 2.7049709489993545, + "grad_norm": 0.8261964321136475, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 8380 + }, + { + "epoch": 2.708198837959974, + "grad_norm": 0.717339813709259, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 8390 + }, + { + "epoch": 2.711426726920594, + "grad_norm": 0.651637613773346, + "learning_rate": 0.0002, + "loss": 0.6929, + "step": 8400 + }, + { + "epoch": 2.714654615881214, + "grad_norm": 0.7936098575592041, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 8410 + }, + { + "epoch": 2.7178825048418336, + "grad_norm": 0.8761560320854187, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 8420 + }, + { + "epoch": 2.7211103938024532, + "grad_norm": 0.6768006086349487, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 8430 + }, + { + "epoch": 2.724338282763073, + "grad_norm": 0.7121055722236633, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 8440 + }, + { + "epoch": 2.7275661717236925, + "grad_norm": 0.6811696887016296, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 8450 + }, + { + "epoch": 2.730794060684312, + "grad_norm": 0.8168250918388367, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 8460 + }, + { + "epoch": 2.7340219496449323, + "grad_norm": 0.660682737827301, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 8470 + }, + { + "epoch": 2.737249838605552, + "grad_norm": 0.7369356155395508, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 8480 + }, + { + "epoch": 2.7404777275661716, + "grad_norm": 0.7545099854469299, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 8490 + }, + { + "epoch": 2.7437056165267917, + "grad_norm": 0.6991257667541504, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 8500 + }, + { + "epoch": 2.7469335054874113, + "grad_norm": 0.7195324301719666, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 8510 + }, + { + "epoch": 2.750161394448031, + "grad_norm": 0.8995378017425537, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 8520 + }, + { + "epoch": 2.7533892834086506, + "grad_norm": 0.6924123764038086, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 8530 + }, + { + "epoch": 2.7566171723692703, + "grad_norm": 0.6260585784912109, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 8540 + }, + { + "epoch": 2.7598450613298904, + "grad_norm": 0.7273091673851013, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 8550 + }, + { + "epoch": 2.76307295029051, + "grad_norm": 0.720562219619751, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 8560 + }, + { + "epoch": 2.7663008392511297, + "grad_norm": 0.6360004544258118, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 8570 + }, + { + "epoch": 2.76952872821175, + "grad_norm": 0.7634525895118713, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 8580 + }, + { + "epoch": 2.7727566171723694, + "grad_norm": 0.6586076021194458, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 8590 + }, + { + "epoch": 2.775984506132989, + "grad_norm": 0.6542639136314392, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 8600 + }, + { + "epoch": 2.7792123950936087, + "grad_norm": 0.7650290727615356, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 8610 + }, + { + "epoch": 2.7824402840542284, + "grad_norm": 0.6551542282104492, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 8620 + }, + { + "epoch": 2.785668173014848, + "grad_norm": 0.6915501952171326, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 8630 + }, + { + "epoch": 2.788896061975468, + "grad_norm": 0.8061493635177612, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 8640 + }, + { + "epoch": 2.792123950936088, + "grad_norm": 0.8403584957122803, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 8650 + }, + { + "epoch": 2.7953518398967074, + "grad_norm": 0.6455532312393188, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 8660 + }, + { + "epoch": 2.7985797288573275, + "grad_norm": 0.8296352028846741, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 8670 + }, + { + "epoch": 2.801807617817947, + "grad_norm": 0.7288752794265747, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 8680 + }, + { + "epoch": 2.805035506778567, + "grad_norm": 0.7628464102745056, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 8690 + }, + { + "epoch": 2.8082633957391865, + "grad_norm": 0.9993878602981567, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 8700 + }, + { + "epoch": 2.811491284699806, + "grad_norm": 0.6972465515136719, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 8710 + }, + { + "epoch": 2.8147191736604262, + "grad_norm": 0.645042896270752, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 8720 + }, + { + "epoch": 2.817947062621046, + "grad_norm": 0.6853853464126587, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 8730 + }, + { + "epoch": 2.8211749515816655, + "grad_norm": 0.5935067534446716, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 8740 + }, + { + "epoch": 2.824402840542285, + "grad_norm": 0.7336633205413818, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 8750 + }, + { + "epoch": 2.8276307295029053, + "grad_norm": 0.7074962854385376, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 8760 + }, + { + "epoch": 2.830858618463525, + "grad_norm": 0.6667559742927551, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 8770 + }, + { + "epoch": 2.8340865074241446, + "grad_norm": 0.8101205229759216, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 8780 + }, + { + "epoch": 2.8373143963847642, + "grad_norm": 0.8841480016708374, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 8790 + }, + { + "epoch": 2.840542285345384, + "grad_norm": 0.5891591310501099, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 8800 + }, + { + "epoch": 2.843770174306004, + "grad_norm": 0.667032778263092, + "learning_rate": 0.0002, + "loss": 0.7114, + "step": 8810 + }, + { + "epoch": 2.8469980632666236, + "grad_norm": 0.7629773020744324, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 8820 + }, + { + "epoch": 2.8502259522272433, + "grad_norm": 0.79471355676651, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 8830 + }, + { + "epoch": 2.8534538411878634, + "grad_norm": 0.7529178261756897, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 8840 + }, + { + "epoch": 2.856681730148483, + "grad_norm": 0.7014923691749573, + "learning_rate": 0.0002, + "loss": 0.7163, + "step": 8850 + }, + { + "epoch": 2.8599096191091027, + "grad_norm": 0.7996514439582825, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 8860 + }, + { + "epoch": 2.8631375080697223, + "grad_norm": 0.7044785618782043, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 8870 + }, + { + "epoch": 2.866365397030342, + "grad_norm": 0.6792093515396118, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 8880 + }, + { + "epoch": 2.8695932859909616, + "grad_norm": 0.69175124168396, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 8890 + }, + { + "epoch": 2.8728211749515817, + "grad_norm": 0.7499129176139832, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 8900 + }, + { + "epoch": 2.8760490639122014, + "grad_norm": 0.7678789496421814, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 8910 + }, + { + "epoch": 2.879276952872821, + "grad_norm": 0.7478128671646118, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 8920 + }, + { + "epoch": 2.882504841833441, + "grad_norm": 0.6767086386680603, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 8930 + }, + { + "epoch": 2.885732730794061, + "grad_norm": 0.7222196459770203, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 8940 + }, + { + "epoch": 2.8889606197546804, + "grad_norm": 0.6950580477714539, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 8950 + }, + { + "epoch": 2.8921885087153, + "grad_norm": 0.7759528160095215, + "learning_rate": 0.0002, + "loss": 0.7064, + "step": 8960 + }, + { + "epoch": 2.8954163976759197, + "grad_norm": 0.6686919927597046, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 8970 + }, + { + "epoch": 2.89864428663654, + "grad_norm": 0.9245954751968384, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 8980 + }, + { + "epoch": 2.9018721755971595, + "grad_norm": 0.8734814524650574, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 8990 + }, + { + "epoch": 2.905100064557779, + "grad_norm": 0.6056219339370728, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 9000 + }, + { + "epoch": 2.9083279535183992, + "grad_norm": 0.7364102005958557, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 9010 + }, + { + "epoch": 2.911555842479019, + "grad_norm": 0.6563605070114136, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 9020 + }, + { + "epoch": 2.9147837314396385, + "grad_norm": 0.659978985786438, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 9030 + }, + { + "epoch": 2.918011620400258, + "grad_norm": 0.8176041841506958, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 9040 + }, + { + "epoch": 2.921239509360878, + "grad_norm": 0.743677020072937, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 9050 + }, + { + "epoch": 2.9244673983214975, + "grad_norm": 0.7418383359909058, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 9060 + }, + { + "epoch": 2.9276952872821176, + "grad_norm": 0.6916524767875671, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 9070 + }, + { + "epoch": 2.9309231762427372, + "grad_norm": 0.6559975743293762, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 9080 + }, + { + "epoch": 2.934151065203357, + "grad_norm": 0.7431221008300781, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 9090 + }, + { + "epoch": 2.937378954163977, + "grad_norm": 0.7525941133499146, + "learning_rate": 0.0002, + "loss": 0.6829, + "step": 9100 + }, + { + "epoch": 2.9406068431245966, + "grad_norm": 0.6860167384147644, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 9110 + }, + { + "epoch": 2.9438347320852163, + "grad_norm": 0.6467666029930115, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 9120 + }, + { + "epoch": 2.947062621045836, + "grad_norm": 0.7595751285552979, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 9130 + }, + { + "epoch": 2.9502905100064556, + "grad_norm": 0.6558279991149902, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 9140 + }, + { + "epoch": 2.9535183989670757, + "grad_norm": 0.6818708181381226, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 9150 + }, + { + "epoch": 2.9567462879276953, + "grad_norm": 0.8387085795402527, + "learning_rate": 0.0002, + "loss": 0.6921, + "step": 9160 + }, + { + "epoch": 2.959974176888315, + "grad_norm": 0.7705109715461731, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 9170 + }, + { + "epoch": 2.9632020658489346, + "grad_norm": 0.688106894493103, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 9180 + }, + { + "epoch": 2.9664299548095547, + "grad_norm": 0.659532368183136, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 9190 + }, + { + "epoch": 2.9696578437701744, + "grad_norm": 0.6839388608932495, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 9200 + }, + { + "epoch": 2.972885732730794, + "grad_norm": 0.6927599310874939, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 9210 + }, + { + "epoch": 2.9761136216914137, + "grad_norm": 0.6902472972869873, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 9220 + }, + { + "epoch": 2.9793415106520333, + "grad_norm": 0.620399534702301, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 9230 + }, + { + "epoch": 2.9825693996126534, + "grad_norm": 0.6812364459037781, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 9240 + }, + { + "epoch": 2.985797288573273, + "grad_norm": 0.7681456208229065, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 9250 + }, + { + "epoch": 2.9890251775338927, + "grad_norm": 0.7621907591819763, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 9260 + }, + { + "epoch": 2.992253066494513, + "grad_norm": 0.6075740456581116, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 9270 + }, + { + "epoch": 2.9954809554551325, + "grad_norm": 0.7100434899330139, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 9280 + }, + { + "epoch": 2.998708844415752, + "grad_norm": 0.7314488887786865, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 9290 + }, + { + "epoch": 3.0, + "eval_loss": 1.1434104442596436, + "eval_runtime": 166.3732, + "eval_samples_per_second": 4.406, + "eval_steps_per_second": 0.553, + "step": 9294 + }, + { + "epoch": 3.001936733376372, + "grad_norm": 0.7408893704414368, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 9300 + }, + { + "epoch": 3.0051646223369914, + "grad_norm": 0.9773574471473694, + "learning_rate": 0.0002, + "loss": 0.5182, + "step": 9310 + }, + { + "epoch": 3.0083925112976115, + "grad_norm": 0.7919653058052063, + "learning_rate": 0.0002, + "loss": 0.5432, + "step": 9320 + }, + { + "epoch": 3.011620400258231, + "grad_norm": 0.9139202833175659, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 9330 + }, + { + "epoch": 3.014848289218851, + "grad_norm": 0.8296737670898438, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 9340 + }, + { + "epoch": 3.0180761781794705, + "grad_norm": 0.786868155002594, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 9350 + }, + { + "epoch": 3.0213040671400906, + "grad_norm": 0.5928055644035339, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 9360 + }, + { + "epoch": 3.0245319561007102, + "grad_norm": 0.8785701394081116, + "learning_rate": 0.0002, + "loss": 0.5376, + "step": 9370 + }, + { + "epoch": 3.02775984506133, + "grad_norm": 0.7978872060775757, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 9380 + }, + { + "epoch": 3.0309877340219495, + "grad_norm": 0.7160913348197937, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 9390 + }, + { + "epoch": 3.034215622982569, + "grad_norm": 0.904465913772583, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 9400 + }, + { + "epoch": 3.0374435119431893, + "grad_norm": 0.7082195281982422, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 9410 + }, + { + "epoch": 3.040671400903809, + "grad_norm": 0.9686778783798218, + "learning_rate": 0.0002, + "loss": 0.5434, + "step": 9420 + }, + { + "epoch": 3.0438992898644286, + "grad_norm": 0.8788613677024841, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 9430 + }, + { + "epoch": 3.0471271788250482, + "grad_norm": 0.8217582106590271, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 9440 + }, + { + "epoch": 3.0503550677856683, + "grad_norm": 0.7380914092063904, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 9450 + }, + { + "epoch": 3.053582956746288, + "grad_norm": 0.7339285612106323, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 9460 + }, + { + "epoch": 3.0568108457069076, + "grad_norm": 0.7175183296203613, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 9470 + }, + { + "epoch": 3.0600387346675273, + "grad_norm": 0.8275379538536072, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 9480 + }, + { + "epoch": 3.0632666236281474, + "grad_norm": 0.6544256806373596, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 9490 + }, + { + "epoch": 3.066494512588767, + "grad_norm": 0.8193472623825073, + "learning_rate": 0.0002, + "loss": 0.5365, + "step": 9500 + }, + { + "epoch": 3.0697224015493867, + "grad_norm": 0.7967836856842041, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 9510 + }, + { + "epoch": 3.0729502905100063, + "grad_norm": 0.8788684010505676, + "learning_rate": 0.0002, + "loss": 0.5629, + "step": 9520 + }, + { + "epoch": 3.0761781794706264, + "grad_norm": 0.9410629868507385, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 9530 + }, + { + "epoch": 3.079406068431246, + "grad_norm": 0.7448706030845642, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 9540 + }, + { + "epoch": 3.0826339573918657, + "grad_norm": 0.9149372577667236, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 9550 + }, + { + "epoch": 3.0858618463524854, + "grad_norm": 0.7265563607215881, + "learning_rate": 0.0002, + "loss": 0.5347, + "step": 9560 + }, + { + "epoch": 3.089089735313105, + "grad_norm": 1.0305068492889404, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 9570 + }, + { + "epoch": 3.092317624273725, + "grad_norm": 0.7987357974052429, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 9580 + }, + { + "epoch": 3.095545513234345, + "grad_norm": 0.7733123898506165, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 9590 + }, + { + "epoch": 3.0987734021949644, + "grad_norm": 1.0438069105148315, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 9600 + }, + { + "epoch": 3.102001291155584, + "grad_norm": 0.7951784729957581, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 9610 + }, + { + "epoch": 3.105229180116204, + "grad_norm": 0.7776783108711243, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 9620 + }, + { + "epoch": 3.108457069076824, + "grad_norm": 0.7060676217079163, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 9630 + }, + { + "epoch": 3.1116849580374435, + "grad_norm": 0.871569037437439, + "learning_rate": 0.0002, + "loss": 0.5731, + "step": 9640 + }, + { + "epoch": 3.114912846998063, + "grad_norm": 0.8873385787010193, + "learning_rate": 0.0002, + "loss": 0.5168, + "step": 9650 + }, + { + "epoch": 3.118140735958683, + "grad_norm": 0.750998318195343, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 9660 + }, + { + "epoch": 3.121368624919303, + "grad_norm": 0.8678529262542725, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 9670 + }, + { + "epoch": 3.1245965138799225, + "grad_norm": 0.7706599235534668, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 9680 + }, + { + "epoch": 3.127824402840542, + "grad_norm": 0.8317574858665466, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 9690 + }, + { + "epoch": 3.131052291801162, + "grad_norm": 0.801800012588501, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 9700 + }, + { + "epoch": 3.134280180761782, + "grad_norm": 0.8574623465538025, + "learning_rate": 0.0002, + "loss": 0.6044, + "step": 9710 + }, + { + "epoch": 3.1375080697224016, + "grad_norm": 0.6556540727615356, + "learning_rate": 0.0002, + "loss": 0.6072, + "step": 9720 + }, + { + "epoch": 3.1407359586830212, + "grad_norm": 0.8555161952972412, + "learning_rate": 0.0002, + "loss": 0.6058, + "step": 9730 + }, + { + "epoch": 3.143963847643641, + "grad_norm": 0.8825467824935913, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 9740 + }, + { + "epoch": 3.147191736604261, + "grad_norm": 0.8297156691551208, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 9750 + }, + { + "epoch": 3.1504196255648806, + "grad_norm": 0.7710384726524353, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 9760 + }, + { + "epoch": 3.1536475145255003, + "grad_norm": 0.8778039216995239, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 9770 + }, + { + "epoch": 3.15687540348612, + "grad_norm": 0.9014058113098145, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 9780 + }, + { + "epoch": 3.16010329244674, + "grad_norm": 0.6856890320777893, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 9790 + }, + { + "epoch": 3.1633311814073597, + "grad_norm": 0.6520644426345825, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 9800 + }, + { + "epoch": 3.1665590703679793, + "grad_norm": 0.7250499129295349, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 9810 + }, + { + "epoch": 3.169786959328599, + "grad_norm": 0.8331542015075684, + "learning_rate": 0.0002, + "loss": 0.5823, + "step": 9820 + }, + { + "epoch": 3.1730148482892186, + "grad_norm": 0.8531261682510376, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 9830 + }, + { + "epoch": 3.1762427372498387, + "grad_norm": 0.8997558355331421, + "learning_rate": 0.0002, + "loss": 0.57, + "step": 9840 + }, + { + "epoch": 3.1794706262104584, + "grad_norm": 0.708335280418396, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 9850 + }, + { + "epoch": 3.182698515171078, + "grad_norm": 1.0074886083602905, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 9860 + }, + { + "epoch": 3.1859264041316977, + "grad_norm": 1.0804681777954102, + "learning_rate": 0.0002, + "loss": 0.573, + "step": 9870 + }, + { + "epoch": 3.189154293092318, + "grad_norm": 0.9510730504989624, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 9880 + }, + { + "epoch": 3.1923821820529374, + "grad_norm": 0.7211061716079712, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 9890 + }, + { + "epoch": 3.195610071013557, + "grad_norm": 0.8767086267471313, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 9900 + }, + { + "epoch": 3.1988379599741767, + "grad_norm": 0.8388153314590454, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 9910 + }, + { + "epoch": 3.202065848934797, + "grad_norm": 0.8038473725318909, + "learning_rate": 0.0002, + "loss": 0.5681, + "step": 9920 + }, + { + "epoch": 3.2052937378954165, + "grad_norm": 0.8187747001647949, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 9930 + }, + { + "epoch": 3.208521626856036, + "grad_norm": 0.7427355051040649, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 9940 + }, + { + "epoch": 3.211749515816656, + "grad_norm": 0.8017025589942932, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 9950 + }, + { + "epoch": 3.214977404777276, + "grad_norm": 0.738595187664032, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 9960 + }, + { + "epoch": 3.2182052937378955, + "grad_norm": 0.7521342039108276, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 9970 + }, + { + "epoch": 3.221433182698515, + "grad_norm": 0.840329110622406, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 9980 + }, + { + "epoch": 3.224661071659135, + "grad_norm": 0.9809671640396118, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 9990 + }, + { + "epoch": 3.2278889606197545, + "grad_norm": 0.8456943035125732, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 10000 + }, + { + "epoch": 3.2311168495803746, + "grad_norm": 0.8962995409965515, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 10010 + }, + { + "epoch": 3.2343447385409942, + "grad_norm": 0.6492817401885986, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 10020 + }, + { + "epoch": 3.237572627501614, + "grad_norm": 1.0471255779266357, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 10030 + }, + { + "epoch": 3.2408005164622335, + "grad_norm": 0.7995471358299255, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 10040 + }, + { + "epoch": 3.2440284054228536, + "grad_norm": 0.7231964468955994, + "learning_rate": 0.0002, + "loss": 0.615, + "step": 10050 + }, + { + "epoch": 3.2472562943834733, + "grad_norm": 0.639630138874054, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 10060 + }, + { + "epoch": 3.250484183344093, + "grad_norm": 0.7957055568695068, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 10070 + }, + { + "epoch": 3.2537120723047126, + "grad_norm": 0.7735482454299927, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 10080 + }, + { + "epoch": 3.2569399612653323, + "grad_norm": 0.8139488101005554, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 10090 + }, + { + "epoch": 3.2601678502259523, + "grad_norm": 0.8113240003585815, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 10100 + }, + { + "epoch": 3.263395739186572, + "grad_norm": 0.7735909819602966, + "learning_rate": 0.0002, + "loss": 0.5617, + "step": 10110 + }, + { + "epoch": 3.2666236281471916, + "grad_norm": 0.7760744094848633, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 10120 + }, + { + "epoch": 3.2698515171078113, + "grad_norm": 0.8078505396842957, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 10130 + }, + { + "epoch": 3.2730794060684314, + "grad_norm": 0.983648955821991, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 10140 + }, + { + "epoch": 3.276307295029051, + "grad_norm": 0.7131832242012024, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 10150 + }, + { + "epoch": 3.2795351839896707, + "grad_norm": 0.924493134021759, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 10160 + }, + { + "epoch": 3.2827630729502904, + "grad_norm": 0.9371112585067749, + "learning_rate": 0.0002, + "loss": 0.5733, + "step": 10170 + }, + { + "epoch": 3.2859909619109104, + "grad_norm": 0.8989261388778687, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 10180 + }, + { + "epoch": 3.28921885087153, + "grad_norm": 0.8130394816398621, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 10190 + }, + { + "epoch": 3.2924467398321497, + "grad_norm": 0.9899941086769104, + "learning_rate": 0.0002, + "loss": 0.5555, + "step": 10200 + }, + { + "epoch": 3.2956746287927694, + "grad_norm": 1.007038950920105, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 10210 + }, + { + "epoch": 3.2989025177533895, + "grad_norm": 0.7465066313743591, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 10220 + }, + { + "epoch": 3.302130406714009, + "grad_norm": 0.7202590703964233, + "learning_rate": 0.0002, + "loss": 0.6307, + "step": 10230 + }, + { + "epoch": 3.305358295674629, + "grad_norm": 0.6258249282836914, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 10240 + }, + { + "epoch": 3.3085861846352485, + "grad_norm": 0.8996058702468872, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 10250 + }, + { + "epoch": 3.311814073595868, + "grad_norm": 0.9550982713699341, + "learning_rate": 0.0002, + "loss": 0.5825, + "step": 10260 + }, + { + "epoch": 3.315041962556488, + "grad_norm": 0.7010059952735901, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 10270 + }, + { + "epoch": 3.318269851517108, + "grad_norm": 0.9639869332313538, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 10280 + }, + { + "epoch": 3.3214977404777275, + "grad_norm": 1.0192502737045288, + "learning_rate": 0.0002, + "loss": 0.5362, + "step": 10290 + }, + { + "epoch": 3.324725629438347, + "grad_norm": 0.7953670024871826, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 10300 + }, + { + "epoch": 3.3279535183989672, + "grad_norm": 0.7436774969100952, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 10310 + }, + { + "epoch": 3.331181407359587, + "grad_norm": 0.7846777439117432, + "learning_rate": 0.0002, + "loss": 0.5823, + "step": 10320 + }, + { + "epoch": 3.3344092963202066, + "grad_norm": 0.8963494896888733, + "learning_rate": 0.0002, + "loss": 0.6119, + "step": 10330 + }, + { + "epoch": 3.337637185280826, + "grad_norm": 0.6876392364501953, + "learning_rate": 0.0002, + "loss": 0.5872, + "step": 10340 + }, + { + "epoch": 3.340865074241446, + "grad_norm": 0.9161638021469116, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 10350 + }, + { + "epoch": 3.344092963202066, + "grad_norm": 0.8964458107948303, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 10360 + }, + { + "epoch": 3.3473208521626856, + "grad_norm": 0.9052296280860901, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 10370 + }, + { + "epoch": 3.3505487411233053, + "grad_norm": 0.9292596578598022, + "learning_rate": 0.0002, + "loss": 0.5958, + "step": 10380 + }, + { + "epoch": 3.3537766300839253, + "grad_norm": 0.9605957269668579, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 10390 + }, + { + "epoch": 3.357004519044545, + "grad_norm": 1.0198872089385986, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 10400 + }, + { + "epoch": 3.3602324080051647, + "grad_norm": 0.7043630480766296, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 10410 + }, + { + "epoch": 3.3634602969657843, + "grad_norm": 1.0533326864242554, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 10420 + }, + { + "epoch": 3.366688185926404, + "grad_norm": 0.7552485466003418, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 10430 + }, + { + "epoch": 3.369916074887024, + "grad_norm": 0.692708432674408, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 10440 + }, + { + "epoch": 3.3731439638476437, + "grad_norm": 0.985952615737915, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 10450 + }, + { + "epoch": 3.3763718528082634, + "grad_norm": 0.6749676465988159, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 10460 + }, + { + "epoch": 3.379599741768883, + "grad_norm": 0.9514535665512085, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 10470 + }, + { + "epoch": 3.382827630729503, + "grad_norm": 1.2681142091751099, + "learning_rate": 0.0002, + "loss": 0.5982, + "step": 10480 + }, + { + "epoch": 3.3860555196901228, + "grad_norm": 1.031968355178833, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 10490 + }, + { + "epoch": 3.3892834086507424, + "grad_norm": 0.8061563968658447, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 10500 + }, + { + "epoch": 3.392511297611362, + "grad_norm": 1.0515062808990479, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 10510 + }, + { + "epoch": 3.3957391865719817, + "grad_norm": 0.9055540561676025, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 10520 + }, + { + "epoch": 3.398967075532602, + "grad_norm": 0.9318141341209412, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 10530 + }, + { + "epoch": 3.4021949644932215, + "grad_norm": 0.8266817331314087, + "learning_rate": 0.0002, + "loss": 0.5722, + "step": 10540 + }, + { + "epoch": 3.405422853453841, + "grad_norm": 1.2322112321853638, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 10550 + }, + { + "epoch": 3.4086507424144608, + "grad_norm": 0.9535136818885803, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 10560 + }, + { + "epoch": 3.411878631375081, + "grad_norm": 0.9243819117546082, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 10570 + }, + { + "epoch": 3.4151065203357005, + "grad_norm": 0.9011809825897217, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 10580 + }, + { + "epoch": 3.41833440929632, + "grad_norm": 0.9923036694526672, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 10590 + }, + { + "epoch": 3.42156229825694, + "grad_norm": 0.8903067111968994, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 10600 + }, + { + "epoch": 3.42479018721756, + "grad_norm": 0.7101534605026245, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 10610 + }, + { + "epoch": 3.4280180761781796, + "grad_norm": 0.8186570405960083, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 10620 + }, + { + "epoch": 3.431245965138799, + "grad_norm": 0.9480205774307251, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 10630 + }, + { + "epoch": 3.434473854099419, + "grad_norm": 1.1370961666107178, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 10640 + }, + { + "epoch": 3.437701743060039, + "grad_norm": 1.017669677734375, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 10650 + }, + { + "epoch": 3.4409296320206586, + "grad_norm": 0.7625100016593933, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 10660 + }, + { + "epoch": 3.4441575209812783, + "grad_norm": 0.9288196563720703, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 10670 + }, + { + "epoch": 3.447385409941898, + "grad_norm": 0.8800460696220398, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 10680 + }, + { + "epoch": 3.4506132989025176, + "grad_norm": 0.7499661445617676, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 10690 + }, + { + "epoch": 3.4538411878631377, + "grad_norm": 0.8254973292350769, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 10700 + }, + { + "epoch": 3.4570690768237573, + "grad_norm": 0.8735857605934143, + "learning_rate": 0.0002, + "loss": 0.5742, + "step": 10710 + }, + { + "epoch": 3.460296965784377, + "grad_norm": 0.9601819515228271, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 10720 + }, + { + "epoch": 3.4635248547449966, + "grad_norm": 0.8031058311462402, + "learning_rate": 0.0002, + "loss": 0.5574, + "step": 10730 + }, + { + "epoch": 3.4667527437056167, + "grad_norm": 0.8039247393608093, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 10740 + }, + { + "epoch": 3.4699806326662364, + "grad_norm": 0.8936953544616699, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 10750 + }, + { + "epoch": 3.473208521626856, + "grad_norm": 0.8201186060905457, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 10760 + }, + { + "epoch": 3.4764364105874757, + "grad_norm": 1.0064148902893066, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 10770 + }, + { + "epoch": 3.4796642995480953, + "grad_norm": 0.8617483377456665, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 10780 + }, + { + "epoch": 3.4828921885087154, + "grad_norm": 0.8532096147537231, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 10790 + }, + { + "epoch": 3.486120077469335, + "grad_norm": 0.8646879196166992, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 10800 + }, + { + "epoch": 3.4893479664299547, + "grad_norm": 0.7962660789489746, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 10810 + }, + { + "epoch": 3.492575855390575, + "grad_norm": 0.9560028314590454, + "learning_rate": 0.0002, + "loss": 0.5398, + "step": 10820 + }, + { + "epoch": 3.4958037443511945, + "grad_norm": 0.928439736366272, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 10830 + }, + { + "epoch": 3.499031633311814, + "grad_norm": 0.8219282627105713, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 10840 + }, + { + "epoch": 3.5022595222724338, + "grad_norm": 0.7918338179588318, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 10850 + }, + { + "epoch": 3.5054874112330534, + "grad_norm": 0.961295485496521, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 10860 + }, + { + "epoch": 3.5087153001936735, + "grad_norm": 1.0731624364852905, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 10870 + }, + { + "epoch": 3.511943189154293, + "grad_norm": 0.9551863074302673, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 10880 + }, + { + "epoch": 3.515171078114913, + "grad_norm": 0.8409819602966309, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 10890 + }, + { + "epoch": 3.5183989670755325, + "grad_norm": 0.7546320557594299, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 10900 + }, + { + "epoch": 3.5216268560361526, + "grad_norm": 0.7505252361297607, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 10910 + }, + { + "epoch": 3.524854744996772, + "grad_norm": 0.7505561113357544, + "learning_rate": 0.0002, + "loss": 0.5649, + "step": 10920 + }, + { + "epoch": 3.528082633957392, + "grad_norm": 1.086177945137024, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 10930 + }, + { + "epoch": 3.5313105229180115, + "grad_norm": 0.7721118330955505, + "learning_rate": 0.0002, + "loss": 0.5983, + "step": 10940 + }, + { + "epoch": 3.534538411878631, + "grad_norm": 0.9567878246307373, + "learning_rate": 0.0002, + "loss": 0.5919, + "step": 10950 + }, + { + "epoch": 3.5377663008392513, + "grad_norm": 0.8377360105514526, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 10960 + }, + { + "epoch": 3.540994189799871, + "grad_norm": 1.0174858570098877, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 10970 + }, + { + "epoch": 3.5442220787604906, + "grad_norm": 0.8164418935775757, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 10980 + }, + { + "epoch": 3.5474499677211107, + "grad_norm": 0.8959241509437561, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 10990 + }, + { + "epoch": 3.5506778566817303, + "grad_norm": 1.0154379606246948, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 11000 + }, + { + "epoch": 3.55390574564235, + "grad_norm": 0.7812292575836182, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 11010 + }, + { + "epoch": 3.5571336346029696, + "grad_norm": 0.9849029779434204, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 11020 + }, + { + "epoch": 3.5603615235635893, + "grad_norm": 0.8826184272766113, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 11030 + }, + { + "epoch": 3.563589412524209, + "grad_norm": 0.9039685726165771, + "learning_rate": 0.0002, + "loss": 0.601, + "step": 11040 + }, + { + "epoch": 3.566817301484829, + "grad_norm": 0.9585249423980713, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 11050 + }, + { + "epoch": 3.5700451904454487, + "grad_norm": 0.8083069324493408, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 11060 + }, + { + "epoch": 3.5732730794060683, + "grad_norm": 0.9528678059577942, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 11070 + }, + { + "epoch": 3.5765009683666884, + "grad_norm": 0.8297588229179382, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 11080 + }, + { + "epoch": 3.579728857327308, + "grad_norm": 0.8191716074943542, + "learning_rate": 0.0002, + "loss": 0.5919, + "step": 11090 + }, + { + "epoch": 3.5829567462879277, + "grad_norm": 0.8056275844573975, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 11100 + }, + { + "epoch": 3.5861846352485474, + "grad_norm": 0.701930582523346, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 11110 + }, + { + "epoch": 3.589412524209167, + "grad_norm": 0.7644643187522888, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 11120 + }, + { + "epoch": 3.592640413169787, + "grad_norm": 0.668004035949707, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 11130 + }, + { + "epoch": 3.5958683021304068, + "grad_norm": 0.8849539756774902, + "learning_rate": 0.0002, + "loss": 0.5735, + "step": 11140 + }, + { + "epoch": 3.5990961910910264, + "grad_norm": 0.8123571276664734, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 11150 + }, + { + "epoch": 3.602324080051646, + "grad_norm": 0.7591469287872314, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 11160 + }, + { + "epoch": 3.605551969012266, + "grad_norm": 0.776466965675354, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 11170 + }, + { + "epoch": 3.608779857972886, + "grad_norm": 0.9156150221824646, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 11180 + }, + { + "epoch": 3.6120077469335055, + "grad_norm": 0.7517618536949158, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 11190 + }, + { + "epoch": 3.615235635894125, + "grad_norm": 0.931239128112793, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 11200 + }, + { + "epoch": 3.6184635248547448, + "grad_norm": 0.9107872843742371, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 11210 + }, + { + "epoch": 3.621691413815365, + "grad_norm": 0.7624770998954773, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 11220 + }, + { + "epoch": 3.6249193027759845, + "grad_norm": 0.8129580616950989, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 11230 + }, + { + "epoch": 3.628147191736604, + "grad_norm": 0.7339836955070496, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 11240 + }, + { + "epoch": 3.6313750806972243, + "grad_norm": 0.8901296854019165, + "learning_rate": 0.0002, + "loss": 0.5976, + "step": 11250 + }, + { + "epoch": 3.634602969657844, + "grad_norm": 1.1374726295471191, + "learning_rate": 0.0002, + "loss": 0.5977, + "step": 11260 + }, + { + "epoch": 3.6378308586184636, + "grad_norm": 0.7438275218009949, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 11270 + }, + { + "epoch": 3.641058747579083, + "grad_norm": 0.808646559715271, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 11280 + }, + { + "epoch": 3.644286636539703, + "grad_norm": 1.091810941696167, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 11290 + }, + { + "epoch": 3.6475145255003225, + "grad_norm": 0.8439257144927979, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 11300 + }, + { + "epoch": 3.6507424144609426, + "grad_norm": 0.9720633029937744, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 11310 + }, + { + "epoch": 3.6539703034215623, + "grad_norm": 0.738571047782898, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 11320 + }, + { + "epoch": 3.657198192382182, + "grad_norm": 0.6961580514907837, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 11330 + }, + { + "epoch": 3.660426081342802, + "grad_norm": 0.8192131519317627, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 11340 + }, + { + "epoch": 3.6636539703034217, + "grad_norm": 0.8367205858230591, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 11350 + }, + { + "epoch": 3.6668818592640413, + "grad_norm": 0.7735666632652283, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 11360 + }, + { + "epoch": 3.670109748224661, + "grad_norm": 0.6507132649421692, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 11370 + }, + { + "epoch": 3.6733376371852806, + "grad_norm": 0.8271192312240601, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 11380 + }, + { + "epoch": 3.6765655261459007, + "grad_norm": 0.8724204301834106, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 11390 + }, + { + "epoch": 3.6797934151065204, + "grad_norm": 0.8448445200920105, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 11400 + }, + { + "epoch": 3.68302130406714, + "grad_norm": 0.6756882071495056, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 11410 + }, + { + "epoch": 3.68624919302776, + "grad_norm": 0.7859625816345215, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 11420 + }, + { + "epoch": 3.6894770819883798, + "grad_norm": 0.8929487466812134, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 11430 + }, + { + "epoch": 3.6927049709489994, + "grad_norm": 0.8163391351699829, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 11440 + }, + { + "epoch": 3.695932859909619, + "grad_norm": 0.8948464393615723, + "learning_rate": 0.0002, + "loss": 0.6467, + "step": 11450 + }, + { + "epoch": 3.6991607488702387, + "grad_norm": 0.8654782176017761, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 11460 + }, + { + "epoch": 3.7023886378308584, + "grad_norm": 0.9514864683151245, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 11470 + }, + { + "epoch": 3.7056165267914785, + "grad_norm": 0.7298579812049866, + "learning_rate": 0.0002, + "loss": 0.606, + "step": 11480 + }, + { + "epoch": 3.708844415752098, + "grad_norm": 0.9266309142112732, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 11490 + }, + { + "epoch": 3.7120723047127178, + "grad_norm": 0.8608686923980713, + "learning_rate": 0.0002, + "loss": 0.6122, + "step": 11500 + }, + { + "epoch": 3.715300193673338, + "grad_norm": 0.921788215637207, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 11510 + }, + { + "epoch": 3.7185280826339575, + "grad_norm": 0.8537021279335022, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 11520 + }, + { + "epoch": 3.721755971594577, + "grad_norm": 1.115194320678711, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 11530 + }, + { + "epoch": 3.724983860555197, + "grad_norm": 0.7614817023277283, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 11540 + }, + { + "epoch": 3.7282117495158165, + "grad_norm": 0.871999204158783, + "learning_rate": 0.0002, + "loss": 0.5776, + "step": 11550 + }, + { + "epoch": 3.7314396384764366, + "grad_norm": 0.9668049812316895, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 11560 + }, + { + "epoch": 3.734667527437056, + "grad_norm": 1.2185815572738647, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 11570 + }, + { + "epoch": 3.737895416397676, + "grad_norm": 0.8258453011512756, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 11580 + }, + { + "epoch": 3.7411233053582955, + "grad_norm": 0.8708966374397278, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 11590 + }, + { + "epoch": 3.7443511943189156, + "grad_norm": 0.7784267663955688, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 11600 + }, + { + "epoch": 3.7475790832795353, + "grad_norm": 0.7504425048828125, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 11610 + }, + { + "epoch": 3.750806972240155, + "grad_norm": 0.9144526124000549, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 11620 + }, + { + "epoch": 3.7540348612007746, + "grad_norm": 0.922581672668457, + "learning_rate": 0.0002, + "loss": 0.6425, + "step": 11630 + }, + { + "epoch": 3.757262750161394, + "grad_norm": 0.9348630905151367, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 11640 + }, + { + "epoch": 3.7604906391220143, + "grad_norm": 1.0740231275558472, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 11650 + }, + { + "epoch": 3.763718528082634, + "grad_norm": 0.884830117225647, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 11660 + }, + { + "epoch": 3.7669464170432536, + "grad_norm": 1.0256348848342896, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 11670 + }, + { + "epoch": 3.7701743060038737, + "grad_norm": 0.6795592904090881, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 11680 + }, + { + "epoch": 3.7734021949644934, + "grad_norm": 0.9381206631660461, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 11690 + }, + { + "epoch": 3.776630083925113, + "grad_norm": 0.7633092403411865, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 11700 + }, + { + "epoch": 3.7798579728857327, + "grad_norm": 0.7506213188171387, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 11710 + }, + { + "epoch": 3.7830858618463523, + "grad_norm": 0.8182913064956665, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 11720 + }, + { + "epoch": 3.786313750806972, + "grad_norm": 1.019322156906128, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 11730 + }, + { + "epoch": 3.789541639767592, + "grad_norm": 0.8895221948623657, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 11740 + }, + { + "epoch": 3.7927695287282117, + "grad_norm": 0.948847770690918, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 11750 + }, + { + "epoch": 3.7959974176888314, + "grad_norm": 0.9068999886512756, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 11760 + }, + { + "epoch": 3.7992253066494515, + "grad_norm": 0.7920539975166321, + "learning_rate": 0.0002, + "loss": 0.6163, + "step": 11770 + }, + { + "epoch": 3.802453195610071, + "grad_norm": 0.8441922068595886, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 11780 + }, + { + "epoch": 3.8056810845706908, + "grad_norm": 0.9258501529693604, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 11790 + }, + { + "epoch": 3.8089089735313104, + "grad_norm": 0.7354241609573364, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 11800 + }, + { + "epoch": 3.81213686249193, + "grad_norm": 0.9494872689247131, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 11810 + }, + { + "epoch": 3.81536475145255, + "grad_norm": 0.8266556859016418, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 11820 + }, + { + "epoch": 3.81859264041317, + "grad_norm": 0.7951219081878662, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 11830 + }, + { + "epoch": 3.8218205293737895, + "grad_norm": 0.7688382267951965, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 11840 + }, + { + "epoch": 3.8250484183344096, + "grad_norm": 1.0917940139770508, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 11850 + }, + { + "epoch": 3.828276307295029, + "grad_norm": 0.9880442023277283, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 11860 + }, + { + "epoch": 3.831504196255649, + "grad_norm": 0.8433151245117188, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 11870 + }, + { + "epoch": 3.8347320852162685, + "grad_norm": 0.8691204786300659, + "learning_rate": 0.0002, + "loss": 0.5876, + "step": 11880 + }, + { + "epoch": 3.837959974176888, + "grad_norm": 0.7698143124580383, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 11890 + }, + { + "epoch": 3.841187863137508, + "grad_norm": 0.8874883651733398, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 11900 + }, + { + "epoch": 3.844415752098128, + "grad_norm": 1.1209359169006348, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 11910 + }, + { + "epoch": 3.8476436410587476, + "grad_norm": 0.7723544239997864, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 11920 + }, + { + "epoch": 3.850871530019367, + "grad_norm": 0.8363937139511108, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 11930 + }, + { + "epoch": 3.8540994189799873, + "grad_norm": 0.9209707975387573, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 11940 + }, + { + "epoch": 3.857327307940607, + "grad_norm": 0.9456894993782043, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 11950 + }, + { + "epoch": 3.8605551969012266, + "grad_norm": 1.5748413801193237, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 11960 + }, + { + "epoch": 3.8637830858618463, + "grad_norm": 0.9083569049835205, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 11970 + }, + { + "epoch": 3.867010974822466, + "grad_norm": 0.7672823071479797, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 11980 + }, + { + "epoch": 3.870238863783086, + "grad_norm": 0.8647152185440063, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 11990 + }, + { + "epoch": 3.8734667527437057, + "grad_norm": 0.9564255475997925, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 12000 + }, + { + "epoch": 3.8766946417043253, + "grad_norm": 0.773267924785614, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 12010 + }, + { + "epoch": 3.879922530664945, + "grad_norm": 0.8030173182487488, + "learning_rate": 0.0002, + "loss": 0.6057, + "step": 12020 + }, + { + "epoch": 3.883150419625565, + "grad_norm": 0.8002150058746338, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 12030 + }, + { + "epoch": 3.8863783085861847, + "grad_norm": 0.98802250623703, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 12040 + }, + { + "epoch": 3.8896061975468044, + "grad_norm": 0.7868124842643738, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 12050 + }, + { + "epoch": 3.892834086507424, + "grad_norm": 0.932182788848877, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 12060 + }, + { + "epoch": 3.8960619754680437, + "grad_norm": 0.8576806783676147, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 12070 + }, + { + "epoch": 3.8992898644286638, + "grad_norm": 0.8985713124275208, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 12080 + }, + { + "epoch": 3.9025177533892834, + "grad_norm": 0.7876521944999695, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 12090 + }, + { + "epoch": 3.905745642349903, + "grad_norm": 0.773936927318573, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 12100 + }, + { + "epoch": 3.908973531310523, + "grad_norm": 0.7274761199951172, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 12110 + }, + { + "epoch": 3.912201420271143, + "grad_norm": 0.8625598549842834, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 12120 + }, + { + "epoch": 3.9154293092317625, + "grad_norm": 0.8702362179756165, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 12130 + }, + { + "epoch": 3.918657198192382, + "grad_norm": 0.912579357624054, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 12140 + }, + { + "epoch": 3.9218850871530018, + "grad_norm": 0.8697066903114319, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 12150 + }, + { + "epoch": 3.9251129761136214, + "grad_norm": 1.005232572555542, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 12160 + }, + { + "epoch": 3.9283408650742415, + "grad_norm": 0.793902575969696, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 12170 + }, + { + "epoch": 3.931568754034861, + "grad_norm": 0.7025905847549438, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 12180 + }, + { + "epoch": 3.934796642995481, + "grad_norm": 0.97635817527771, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 12190 + }, + { + "epoch": 3.938024531956101, + "grad_norm": 0.855417013168335, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 12200 + }, + { + "epoch": 3.9412524209167206, + "grad_norm": 0.8841291666030884, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 12210 + }, + { + "epoch": 3.94448030987734, + "grad_norm": 1.1762064695358276, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 12220 + }, + { + "epoch": 3.94770819883796, + "grad_norm": 0.8393193483352661, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 12230 + }, + { + "epoch": 3.9509360877985795, + "grad_norm": 0.9324905276298523, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 12240 + }, + { + "epoch": 3.9541639767591996, + "grad_norm": 0.8607982993125916, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 12250 + }, + { + "epoch": 3.9573918657198193, + "grad_norm": 0.8586681485176086, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 12260 + }, + { + "epoch": 3.960619754680439, + "grad_norm": 1.1082909107208252, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 12270 + }, + { + "epoch": 3.963847643641059, + "grad_norm": 1.065027117729187, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 12280 + }, + { + "epoch": 3.9670755326016787, + "grad_norm": 0.9544363021850586, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 12290 + }, + { + "epoch": 3.9703034215622983, + "grad_norm": 0.9008927345275879, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 12300 + }, + { + "epoch": 3.973531310522918, + "grad_norm": 0.8717467188835144, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 12310 + }, + { + "epoch": 3.9767591994835376, + "grad_norm": 0.9718339443206787, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 12320 + }, + { + "epoch": 3.9799870884441573, + "grad_norm": 1.0362015962600708, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 12330 + }, + { + "epoch": 3.9832149774047774, + "grad_norm": 1.0844318866729736, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 12340 + }, + { + "epoch": 3.986442866365397, + "grad_norm": 0.7506240606307983, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 12350 + }, + { + "epoch": 3.9896707553260167, + "grad_norm": 1.005982756614685, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 12360 + }, + { + "epoch": 3.9928986442866368, + "grad_norm": 0.7566431164741516, + "learning_rate": 0.0002, + "loss": 0.5926, + "step": 12370 + }, + { + "epoch": 3.9961265332472564, + "grad_norm": 0.8819181323051453, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 12380 + }, + { + "epoch": 3.999354422207876, + "grad_norm": 0.884497880935669, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 12390 + }, + { + "epoch": 4.0, + "eval_loss": 1.1907150745391846, + "eval_runtime": 161.5766, + "eval_samples_per_second": 4.537, + "eval_steps_per_second": 0.569, + "step": 12392 + }, + { + "epoch": 4.002582311168496, + "grad_norm": 1.0407241582870483, + "learning_rate": 0.0002, + "loss": 0.5203, + "step": 12400 + }, + { + "epoch": 4.005810200129115, + "grad_norm": 1.0199295282363892, + "learning_rate": 0.0002, + "loss": 0.4978, + "step": 12410 + }, + { + "epoch": 4.009038089089735, + "grad_norm": 0.8456302881240845, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 12420 + }, + { + "epoch": 4.012265978050355, + "grad_norm": 1.0621124505996704, + "learning_rate": 0.0002, + "loss": 0.4669, + "step": 12430 + }, + { + "epoch": 4.015493867010975, + "grad_norm": 0.8984712362289429, + "learning_rate": 0.0002, + "loss": 0.5277, + "step": 12440 + }, + { + "epoch": 4.018721755971595, + "grad_norm": 1.3785864114761353, + "learning_rate": 0.0002, + "loss": 0.5508, + "step": 12450 + }, + { + "epoch": 4.0219496449322145, + "grad_norm": 0.7911781668663025, + "learning_rate": 0.0002, + "loss": 0.5244, + "step": 12460 + }, + { + "epoch": 4.025177533892834, + "grad_norm": 1.0977907180786133, + "learning_rate": 0.0002, + "loss": 0.4746, + "step": 12470 + }, + { + "epoch": 4.028405422853454, + "grad_norm": 1.0664983987808228, + "learning_rate": 0.0002, + "loss": 0.4632, + "step": 12480 + }, + { + "epoch": 4.0316333118140735, + "grad_norm": 1.0807124376296997, + "learning_rate": 0.0002, + "loss": 0.5151, + "step": 12490 + }, + { + "epoch": 4.034861200774693, + "grad_norm": 1.2650192975997925, + "learning_rate": 0.0002, + "loss": 0.4712, + "step": 12500 + }, + { + "epoch": 4.038089089735313, + "grad_norm": 0.7164070010185242, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 12510 + }, + { + "epoch": 4.041316978695932, + "grad_norm": 1.0047489404678345, + "learning_rate": 0.0002, + "loss": 0.5015, + "step": 12520 + }, + { + "epoch": 4.044544867656553, + "grad_norm": 0.9303901791572571, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 12530 + }, + { + "epoch": 4.047772756617173, + "grad_norm": 1.0319702625274658, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 12540 + }, + { + "epoch": 4.051000645577792, + "grad_norm": 0.9549729228019714, + "learning_rate": 0.0002, + "loss": 0.4834, + "step": 12550 + }, + { + "epoch": 4.054228534538412, + "grad_norm": 0.7175564765930176, + "learning_rate": 0.0002, + "loss": 0.5235, + "step": 12560 + }, + { + "epoch": 4.057456423499032, + "grad_norm": 1.0622259378433228, + "learning_rate": 0.0002, + "loss": 0.5257, + "step": 12570 + }, + { + "epoch": 4.060684312459651, + "grad_norm": 1.172074556350708, + "learning_rate": 0.0002, + "loss": 0.5098, + "step": 12580 + }, + { + "epoch": 4.063912201420271, + "grad_norm": 0.9702366590499878, + "learning_rate": 0.0002, + "loss": 0.5112, + "step": 12590 + }, + { + "epoch": 4.0671400903808905, + "grad_norm": 0.741511344909668, + "learning_rate": 0.0002, + "loss": 0.5042, + "step": 12600 + }, + { + "epoch": 4.070367979341511, + "grad_norm": 0.8632621169090271, + "learning_rate": 0.0002, + "loss": 0.4996, + "step": 12610 + }, + { + "epoch": 4.073595868302131, + "grad_norm": 0.9695962071418762, + "learning_rate": 0.0002, + "loss": 0.4927, + "step": 12620 + }, + { + "epoch": 4.07682375726275, + "grad_norm": 0.9401052594184875, + "learning_rate": 0.0002, + "loss": 0.4618, + "step": 12630 + }, + { + "epoch": 4.08005164622337, + "grad_norm": 0.8068707585334778, + "learning_rate": 0.0002, + "loss": 0.4889, + "step": 12640 + }, + { + "epoch": 4.08327953518399, + "grad_norm": 0.9554762840270996, + "learning_rate": 0.0002, + "loss": 0.5046, + "step": 12650 + }, + { + "epoch": 4.086507424144609, + "grad_norm": 0.7637128233909607, + "learning_rate": 0.0002, + "loss": 0.5081, + "step": 12660 + }, + { + "epoch": 4.089735313105229, + "grad_norm": 0.6703744530677795, + "learning_rate": 0.0002, + "loss": 0.4997, + "step": 12670 + }, + { + "epoch": 4.092963202065849, + "grad_norm": 0.8623828887939453, + "learning_rate": 0.0002, + "loss": 0.4977, + "step": 12680 + }, + { + "epoch": 4.096191091026468, + "grad_norm": 0.8198223114013672, + "learning_rate": 0.0002, + "loss": 0.4616, + "step": 12690 + }, + { + "epoch": 4.099418979987089, + "grad_norm": 1.3449875116348267, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 12700 + }, + { + "epoch": 4.1026468689477085, + "grad_norm": 0.8333606123924255, + "learning_rate": 0.0002, + "loss": 0.4782, + "step": 12710 + }, + { + "epoch": 4.105874757908328, + "grad_norm": 1.1647733449935913, + "learning_rate": 0.0002, + "loss": 0.5135, + "step": 12720 + }, + { + "epoch": 4.109102646868948, + "grad_norm": 1.0560213327407837, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 12730 + }, + { + "epoch": 4.112330535829567, + "grad_norm": 0.9479449987411499, + "learning_rate": 0.0002, + "loss": 0.5244, + "step": 12740 + }, + { + "epoch": 4.115558424790187, + "grad_norm": 1.1634587049484253, + "learning_rate": 0.0002, + "loss": 0.4596, + "step": 12750 + }, + { + "epoch": 4.118786313750807, + "grad_norm": 0.813987672328949, + "learning_rate": 0.0002, + "loss": 0.4966, + "step": 12760 + }, + { + "epoch": 4.122014202711426, + "grad_norm": 0.968461275100708, + "learning_rate": 0.0002, + "loss": 0.5133, + "step": 12770 + }, + { + "epoch": 4.125242091672046, + "grad_norm": 0.9324830770492554, + "learning_rate": 0.0002, + "loss": 0.5113, + "step": 12780 + }, + { + "epoch": 4.128469980632667, + "grad_norm": 0.8313411474227905, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 12790 + }, + { + "epoch": 4.131697869593286, + "grad_norm": 1.0177634954452515, + "learning_rate": 0.0002, + "loss": 0.5169, + "step": 12800 + }, + { + "epoch": 4.134925758553906, + "grad_norm": 1.0890623331069946, + "learning_rate": 0.0002, + "loss": 0.4635, + "step": 12810 + }, + { + "epoch": 4.1381536475145255, + "grad_norm": 0.9131693840026855, + "learning_rate": 0.0002, + "loss": 0.519, + "step": 12820 + }, + { + "epoch": 4.141381536475145, + "grad_norm": 0.8400680422782898, + "learning_rate": 0.0002, + "loss": 0.5017, + "step": 12830 + }, + { + "epoch": 4.144609425435765, + "grad_norm": 0.8988795876502991, + "learning_rate": 0.0002, + "loss": 0.5195, + "step": 12840 + }, + { + "epoch": 4.1478373143963845, + "grad_norm": 0.9224025011062622, + "learning_rate": 0.0002, + "loss": 0.5052, + "step": 12850 + }, + { + "epoch": 4.151065203357004, + "grad_norm": 0.7453159689903259, + "learning_rate": 0.0002, + "loss": 0.5001, + "step": 12860 + }, + { + "epoch": 4.154293092317625, + "grad_norm": 0.9815868139266968, + "learning_rate": 0.0002, + "loss": 0.4874, + "step": 12870 + }, + { + "epoch": 4.157520981278244, + "grad_norm": 1.2542768716812134, + "learning_rate": 0.0002, + "loss": 0.5485, + "step": 12880 + }, + { + "epoch": 4.160748870238864, + "grad_norm": 1.0092132091522217, + "learning_rate": 0.0002, + "loss": 0.5287, + "step": 12890 + }, + { + "epoch": 4.163976759199484, + "grad_norm": 1.1836622953414917, + "learning_rate": 0.0002, + "loss": 0.5125, + "step": 12900 + }, + { + "epoch": 4.167204648160103, + "grad_norm": 0.7706810235977173, + "learning_rate": 0.0002, + "loss": 0.5089, + "step": 12910 + }, + { + "epoch": 4.170432537120723, + "grad_norm": 1.00058913230896, + "learning_rate": 0.0002, + "loss": 0.5123, + "step": 12920 + }, + { + "epoch": 4.173660426081343, + "grad_norm": 1.2326250076293945, + "learning_rate": 0.0002, + "loss": 0.5238, + "step": 12930 + }, + { + "epoch": 4.176888315041962, + "grad_norm": 0.8829123377799988, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 12940 + }, + { + "epoch": 4.180116204002582, + "grad_norm": 0.936042845249176, + "learning_rate": 0.0002, + "loss": 0.517, + "step": 12950 + }, + { + "epoch": 4.183344092963202, + "grad_norm": 0.9773517847061157, + "learning_rate": 0.0002, + "loss": 0.4991, + "step": 12960 + }, + { + "epoch": 4.186571981923822, + "grad_norm": 0.9786297678947449, + "learning_rate": 0.0002, + "loss": 0.5025, + "step": 12970 + }, + { + "epoch": 4.189799870884442, + "grad_norm": 0.7524558901786804, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 12980 + }, + { + "epoch": 4.193027759845061, + "grad_norm": 1.0107866525650024, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 12990 + }, + { + "epoch": 4.196255648805681, + "grad_norm": 1.0092947483062744, + "learning_rate": 0.0002, + "loss": 0.5304, + "step": 13000 + }, + { + "epoch": 4.199483537766301, + "grad_norm": 1.18181312084198, + "learning_rate": 0.0002, + "loss": 0.5061, + "step": 13010 + }, + { + "epoch": 4.20271142672692, + "grad_norm": 0.8845750093460083, + "learning_rate": 0.0002, + "loss": 0.512, + "step": 13020 + }, + { + "epoch": 4.20593931568754, + "grad_norm": 1.0789145231246948, + "learning_rate": 0.0002, + "loss": 0.5329, + "step": 13030 + }, + { + "epoch": 4.2091672046481605, + "grad_norm": 0.9562082886695862, + "learning_rate": 0.0002, + "loss": 0.5001, + "step": 13040 + }, + { + "epoch": 4.21239509360878, + "grad_norm": 0.875755786895752, + "learning_rate": 0.0002, + "loss": 0.5211, + "step": 13050 + }, + { + "epoch": 4.2156229825694, + "grad_norm": 1.0694596767425537, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 13060 + }, + { + "epoch": 4.2188508715300195, + "grad_norm": 1.0053378343582153, + "learning_rate": 0.0002, + "loss": 0.4917, + "step": 13070 + }, + { + "epoch": 4.222078760490639, + "grad_norm": 1.1628689765930176, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 13080 + }, + { + "epoch": 4.225306649451259, + "grad_norm": 0.9455991983413696, + "learning_rate": 0.0002, + "loss": 0.4796, + "step": 13090 + }, + { + "epoch": 4.228534538411878, + "grad_norm": 0.9736765623092651, + "learning_rate": 0.0002, + "loss": 0.4802, + "step": 13100 + }, + { + "epoch": 4.231762427372498, + "grad_norm": 0.8653560876846313, + "learning_rate": 0.0002, + "loss": 0.5411, + "step": 13110 + }, + { + "epoch": 4.234990316333118, + "grad_norm": 0.9335988163948059, + "learning_rate": 0.0002, + "loss": 0.5347, + "step": 13120 + }, + { + "epoch": 4.238218205293738, + "grad_norm": 0.9102661609649658, + "learning_rate": 0.0002, + "loss": 0.5217, + "step": 13130 + }, + { + "epoch": 4.241446094254358, + "grad_norm": 1.0595461130142212, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 13140 + }, + { + "epoch": 4.244673983214978, + "grad_norm": 0.8947662711143494, + "learning_rate": 0.0002, + "loss": 0.517, + "step": 13150 + }, + { + "epoch": 4.247901872175597, + "grad_norm": 1.0835723876953125, + "learning_rate": 0.0002, + "loss": 0.5116, + "step": 13160 + }, + { + "epoch": 4.251129761136217, + "grad_norm": 0.8496462106704712, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 13170 + }, + { + "epoch": 4.2543576500968365, + "grad_norm": 0.9395631551742554, + "learning_rate": 0.0002, + "loss": 0.5079, + "step": 13180 + }, + { + "epoch": 4.257585539057456, + "grad_norm": 1.2939592599868774, + "learning_rate": 0.0002, + "loss": 0.5076, + "step": 13190 + }, + { + "epoch": 4.260813428018076, + "grad_norm": 0.9325923919677734, + "learning_rate": 0.0002, + "loss": 0.5209, + "step": 13200 + }, + { + "epoch": 4.264041316978696, + "grad_norm": 0.9220664501190186, + "learning_rate": 0.0002, + "loss": 0.4984, + "step": 13210 + }, + { + "epoch": 4.267269205939316, + "grad_norm": 0.9505137205123901, + "learning_rate": 0.0002, + "loss": 0.5553, + "step": 13220 + }, + { + "epoch": 4.270497094899936, + "grad_norm": 1.0713751316070557, + "learning_rate": 0.0002, + "loss": 0.5238, + "step": 13230 + }, + { + "epoch": 4.273724983860555, + "grad_norm": 0.8390375971794128, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 13240 + }, + { + "epoch": 4.276952872821175, + "grad_norm": 0.8943426012992859, + "learning_rate": 0.0002, + "loss": 0.5217, + "step": 13250 + }, + { + "epoch": 4.280180761781795, + "grad_norm": 0.9175868630409241, + "learning_rate": 0.0002, + "loss": 0.5486, + "step": 13260 + }, + { + "epoch": 4.283408650742414, + "grad_norm": 0.9969881176948547, + "learning_rate": 0.0002, + "loss": 0.5208, + "step": 13270 + }, + { + "epoch": 4.286636539703034, + "grad_norm": 1.2271877527236938, + "learning_rate": 0.0002, + "loss": 0.5376, + "step": 13280 + }, + { + "epoch": 4.289864428663654, + "grad_norm": 0.9463263154029846, + "learning_rate": 0.0002, + "loss": 0.4811, + "step": 13290 + }, + { + "epoch": 4.293092317624274, + "grad_norm": 1.0306228399276733, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 13300 + }, + { + "epoch": 4.296320206584894, + "grad_norm": 0.8454763889312744, + "learning_rate": 0.0002, + "loss": 0.5092, + "step": 13310 + }, + { + "epoch": 4.299548095545513, + "grad_norm": 0.9843119978904724, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 13320 + }, + { + "epoch": 4.302775984506133, + "grad_norm": 1.0836851596832275, + "learning_rate": 0.0002, + "loss": 0.5407, + "step": 13330 + }, + { + "epoch": 4.306003873466753, + "grad_norm": 1.0719412565231323, + "learning_rate": 0.0002, + "loss": 0.5336, + "step": 13340 + }, + { + "epoch": 4.309231762427372, + "grad_norm": 0.9276487827301025, + "learning_rate": 0.0002, + "loss": 0.4798, + "step": 13350 + }, + { + "epoch": 4.312459651387992, + "grad_norm": 0.897072434425354, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 13360 + }, + { + "epoch": 4.315687540348612, + "grad_norm": 1.0493228435516357, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 13370 + }, + { + "epoch": 4.318915429309232, + "grad_norm": 0.9446353316307068, + "learning_rate": 0.0002, + "loss": 0.5218, + "step": 13380 + }, + { + "epoch": 4.322143318269852, + "grad_norm": 0.7765224575996399, + "learning_rate": 0.0002, + "loss": 0.4765, + "step": 13390 + }, + { + "epoch": 4.3253712072304715, + "grad_norm": 0.9100048542022705, + "learning_rate": 0.0002, + "loss": 0.5907, + "step": 13400 + }, + { + "epoch": 4.328599096191091, + "grad_norm": 1.0913089513778687, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 13410 + }, + { + "epoch": 4.331826985151711, + "grad_norm": 0.9607733488082886, + "learning_rate": 0.0002, + "loss": 0.494, + "step": 13420 + }, + { + "epoch": 4.3350548741123305, + "grad_norm": 0.8774219155311584, + "learning_rate": 0.0002, + "loss": 0.5273, + "step": 13430 + }, + { + "epoch": 4.33828276307295, + "grad_norm": 0.8366804122924805, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 13440 + }, + { + "epoch": 4.34151065203357, + "grad_norm": 1.034727931022644, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 13450 + }, + { + "epoch": 4.344738540994189, + "grad_norm": 0.942743182182312, + "learning_rate": 0.0002, + "loss": 0.4995, + "step": 13460 + }, + { + "epoch": 4.347966429954809, + "grad_norm": 0.7237029075622559, + "learning_rate": 0.0002, + "loss": 0.5222, + "step": 13470 + }, + { + "epoch": 4.35119431891543, + "grad_norm": 0.8216196894645691, + "learning_rate": 0.0002, + "loss": 0.5461, + "step": 13480 + }, + { + "epoch": 4.354422207876049, + "grad_norm": 1.031860113143921, + "learning_rate": 0.0002, + "loss": 0.5104, + "step": 13490 + }, + { + "epoch": 4.357650096836669, + "grad_norm": 0.8880493640899658, + "learning_rate": 0.0002, + "loss": 0.547, + "step": 13500 + }, + { + "epoch": 4.360877985797289, + "grad_norm": 0.8442490696907043, + "learning_rate": 0.0002, + "loss": 0.5259, + "step": 13510 + }, + { + "epoch": 4.364105874757908, + "grad_norm": 1.270971655845642, + "learning_rate": 0.0002, + "loss": 0.5176, + "step": 13520 + }, + { + "epoch": 4.367333763718528, + "grad_norm": 0.9657870531082153, + "learning_rate": 0.0002, + "loss": 0.5028, + "step": 13530 + }, + { + "epoch": 4.3705616526791475, + "grad_norm": 0.7477133870124817, + "learning_rate": 0.0002, + "loss": 0.5136, + "step": 13540 + }, + { + "epoch": 4.373789541639767, + "grad_norm": 1.0209243297576904, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 13550 + }, + { + "epoch": 4.377017430600388, + "grad_norm": 0.8714015483856201, + "learning_rate": 0.0002, + "loss": 0.4888, + "step": 13560 + }, + { + "epoch": 4.380245319561007, + "grad_norm": 1.0490189790725708, + "learning_rate": 0.0002, + "loss": 0.5428, + "step": 13570 + }, + { + "epoch": 4.383473208521627, + "grad_norm": 0.9454663991928101, + "learning_rate": 0.0002, + "loss": 0.5398, + "step": 13580 + }, + { + "epoch": 4.386701097482247, + "grad_norm": 1.154146432876587, + "learning_rate": 0.0002, + "loss": 0.5072, + "step": 13590 + }, + { + "epoch": 4.389928986442866, + "grad_norm": 1.155090570449829, + "learning_rate": 0.0002, + "loss": 0.5096, + "step": 13600 + }, + { + "epoch": 4.393156875403486, + "grad_norm": 0.9853842854499817, + "learning_rate": 0.0002, + "loss": 0.5679, + "step": 13610 + }, + { + "epoch": 4.396384764364106, + "grad_norm": 0.9265837669372559, + "learning_rate": 0.0002, + "loss": 0.4992, + "step": 13620 + }, + { + "epoch": 4.399612653324725, + "grad_norm": 0.8367540240287781, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 13630 + }, + { + "epoch": 4.402840542285345, + "grad_norm": 1.1453629732131958, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 13640 + }, + { + "epoch": 4.4060684312459655, + "grad_norm": 1.0856295824050903, + "learning_rate": 0.0002, + "loss": 0.573, + "step": 13650 + }, + { + "epoch": 4.409296320206585, + "grad_norm": 0.9284523129463196, + "learning_rate": 0.0002, + "loss": 0.5178, + "step": 13660 + }, + { + "epoch": 4.412524209167205, + "grad_norm": 0.9632299542427063, + "learning_rate": 0.0002, + "loss": 0.4862, + "step": 13670 + }, + { + "epoch": 4.415752098127824, + "grad_norm": 1.048524260520935, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 13680 + }, + { + "epoch": 4.418979987088444, + "grad_norm": 0.9787682294845581, + "learning_rate": 0.0002, + "loss": 0.5258, + "step": 13690 + }, + { + "epoch": 4.422207876049064, + "grad_norm": 1.0728684663772583, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 13700 + }, + { + "epoch": 4.425435765009683, + "grad_norm": 0.72867351770401, + "learning_rate": 0.0002, + "loss": 0.5243, + "step": 13710 + }, + { + "epoch": 4.428663653970303, + "grad_norm": 0.8932793736457825, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 13720 + }, + { + "epoch": 4.431891542930924, + "grad_norm": 1.098343849182129, + "learning_rate": 0.0002, + "loss": 0.5156, + "step": 13730 + }, + { + "epoch": 4.435119431891543, + "grad_norm": 0.9321235418319702, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 13740 + }, + { + "epoch": 4.438347320852163, + "grad_norm": 0.8868634104728699, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 13750 + }, + { + "epoch": 4.4415752098127825, + "grad_norm": 1.200064778327942, + "learning_rate": 0.0002, + "loss": 0.5284, + "step": 13760 + }, + { + "epoch": 4.444803098773402, + "grad_norm": 0.8968019485473633, + "learning_rate": 0.0002, + "loss": 0.5208, + "step": 13770 + }, + { + "epoch": 4.448030987734022, + "grad_norm": 0.9560935497283936, + "learning_rate": 0.0002, + "loss": 0.4979, + "step": 13780 + }, + { + "epoch": 4.4512588766946415, + "grad_norm": 0.7985701560974121, + "learning_rate": 0.0002, + "loss": 0.5134, + "step": 13790 + }, + { + "epoch": 4.454486765655261, + "grad_norm": 1.062540888786316, + "learning_rate": 0.0002, + "loss": 0.5113, + "step": 13800 + }, + { + "epoch": 4.457714654615881, + "grad_norm": 1.0827109813690186, + "learning_rate": 0.0002, + "loss": 0.525, + "step": 13810 + }, + { + "epoch": 4.460942543576501, + "grad_norm": 1.0853543281555176, + "learning_rate": 0.0002, + "loss": 0.5541, + "step": 13820 + }, + { + "epoch": 4.464170432537121, + "grad_norm": 1.0613641738891602, + "learning_rate": 0.0002, + "loss": 0.5381, + "step": 13830 + }, + { + "epoch": 4.467398321497741, + "grad_norm": 0.9037535190582275, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 13840 + }, + { + "epoch": 4.47062621045836, + "grad_norm": 0.9216223955154419, + "learning_rate": 0.0002, + "loss": 0.5112, + "step": 13850 + }, + { + "epoch": 4.47385409941898, + "grad_norm": 0.8952260613441467, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 13860 + }, + { + "epoch": 4.4770819883796, + "grad_norm": 0.9997953176498413, + "learning_rate": 0.0002, + "loss": 0.5026, + "step": 13870 + }, + { + "epoch": 4.480309877340219, + "grad_norm": 1.062458872795105, + "learning_rate": 0.0002, + "loss": 0.5107, + "step": 13880 + }, + { + "epoch": 4.483537766300839, + "grad_norm": 0.9185126423835754, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 13890 + }, + { + "epoch": 4.486765655261459, + "grad_norm": 1.2389954328536987, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 13900 + }, + { + "epoch": 4.489993544222079, + "grad_norm": 1.1632126569747925, + "learning_rate": 0.0002, + "loss": 0.5199, + "step": 13910 + }, + { + "epoch": 4.493221433182699, + "grad_norm": 1.0304487943649292, + "learning_rate": 0.0002, + "loss": 0.5128, + "step": 13920 + }, + { + "epoch": 4.496449322143318, + "grad_norm": 0.9144788384437561, + "learning_rate": 0.0002, + "loss": 0.5331, + "step": 13930 + }, + { + "epoch": 4.499677211103938, + "grad_norm": 1.0285682678222656, + "learning_rate": 0.0002, + "loss": 0.5312, + "step": 13940 + }, + { + "epoch": 4.502905100064558, + "grad_norm": 1.1187206506729126, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 13950 + }, + { + "epoch": 4.506132989025177, + "grad_norm": 0.7917197942733765, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 13960 + }, + { + "epoch": 4.509360877985797, + "grad_norm": 0.8495619297027588, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 13970 + }, + { + "epoch": 4.512588766946417, + "grad_norm": 1.0450760126113892, + "learning_rate": 0.0002, + "loss": 0.4971, + "step": 13980 + }, + { + "epoch": 4.515816655907037, + "grad_norm": 1.0061010122299194, + "learning_rate": 0.0002, + "loss": 0.5402, + "step": 13990 + }, + { + "epoch": 4.519044544867657, + "grad_norm": 1.0232428312301636, + "learning_rate": 0.0002, + "loss": 0.527, + "step": 14000 + }, + { + "epoch": 4.5222724338282765, + "grad_norm": 0.8734631538391113, + "learning_rate": 0.0002, + "loss": 0.5002, + "step": 14010 + }, + { + "epoch": 4.525500322788896, + "grad_norm": 1.1085621118545532, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 14020 + }, + { + "epoch": 4.528728211749516, + "grad_norm": 0.9178624749183655, + "learning_rate": 0.0002, + "loss": 0.5167, + "step": 14030 + }, + { + "epoch": 4.531956100710135, + "grad_norm": 1.0687317848205566, + "learning_rate": 0.0002, + "loss": 0.5589, + "step": 14040 + }, + { + "epoch": 4.535183989670755, + "grad_norm": 0.9237300157546997, + "learning_rate": 0.0002, + "loss": 0.5576, + "step": 14050 + }, + { + "epoch": 4.538411878631375, + "grad_norm": 0.9667123556137085, + "learning_rate": 0.0002, + "loss": 0.5062, + "step": 14060 + }, + { + "epoch": 4.541639767591995, + "grad_norm": 1.1286747455596924, + "learning_rate": 0.0002, + "loss": 0.5645, + "step": 14070 + }, + { + "epoch": 4.544867656552615, + "grad_norm": 1.055392861366272, + "learning_rate": 0.0002, + "loss": 0.5226, + "step": 14080 + }, + { + "epoch": 4.548095545513235, + "grad_norm": 0.9492936134338379, + "learning_rate": 0.0002, + "loss": 0.5428, + "step": 14090 + }, + { + "epoch": 4.551323434473854, + "grad_norm": 0.9881349802017212, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 14100 + }, + { + "epoch": 4.554551323434474, + "grad_norm": 0.9389023184776306, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 14110 + }, + { + "epoch": 4.5577792123950935, + "grad_norm": 0.8395606875419617, + "learning_rate": 0.0002, + "loss": 0.5511, + "step": 14120 + }, + { + "epoch": 4.561007101355713, + "grad_norm": 0.9019067287445068, + "learning_rate": 0.0002, + "loss": 0.5696, + "step": 14130 + }, + { + "epoch": 4.564234990316333, + "grad_norm": 1.1058136224746704, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 14140 + }, + { + "epoch": 4.5674628792769525, + "grad_norm": 1.0683821439743042, + "learning_rate": 0.0002, + "loss": 0.5323, + "step": 14150 + }, + { + "epoch": 4.570690768237572, + "grad_norm": 1.3398395776748657, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 14160 + }, + { + "epoch": 4.573918657198193, + "grad_norm": 0.7829096913337708, + "learning_rate": 0.0002, + "loss": 0.4713, + "step": 14170 + }, + { + "epoch": 4.577146546158812, + "grad_norm": 0.9636675119400024, + "learning_rate": 0.0002, + "loss": 0.525, + "step": 14180 + }, + { + "epoch": 4.580374435119432, + "grad_norm": 1.0291401147842407, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 14190 + }, + { + "epoch": 4.583602324080052, + "grad_norm": 1.0894310474395752, + "learning_rate": 0.0002, + "loss": 0.5366, + "step": 14200 + }, + { + "epoch": 4.586830213040671, + "grad_norm": 1.111573576927185, + "learning_rate": 0.0002, + "loss": 0.5125, + "step": 14210 + }, + { + "epoch": 4.590058102001291, + "grad_norm": 0.9345336556434631, + "learning_rate": 0.0002, + "loss": 0.5444, + "step": 14220 + }, + { + "epoch": 4.593285990961911, + "grad_norm": 1.3338757753372192, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 14230 + }, + { + "epoch": 4.596513879922531, + "grad_norm": 1.1146448850631714, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 14240 + }, + { + "epoch": 4.599741768883151, + "grad_norm": 1.1576755046844482, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 14250 + }, + { + "epoch": 4.60296965784377, + "grad_norm": 0.6851092576980591, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 14260 + }, + { + "epoch": 4.60619754680439, + "grad_norm": 0.9067938923835754, + "learning_rate": 0.0002, + "loss": 0.5027, + "step": 14270 + }, + { + "epoch": 4.60942543576501, + "grad_norm": 0.8767340183258057, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 14280 + }, + { + "epoch": 4.612653324725629, + "grad_norm": 1.024880290031433, + "learning_rate": 0.0002, + "loss": 0.5294, + "step": 14290 + }, + { + "epoch": 4.615881213686249, + "grad_norm": 0.9226394891738892, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 14300 + }, + { + "epoch": 4.619109102646869, + "grad_norm": 1.018187165260315, + "learning_rate": 0.0002, + "loss": 0.5281, + "step": 14310 + }, + { + "epoch": 4.622336991607488, + "grad_norm": 0.8851249814033508, + "learning_rate": 0.0002, + "loss": 0.5546, + "step": 14320 + }, + { + "epoch": 4.625564880568108, + "grad_norm": 0.745798647403717, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 14330 + }, + { + "epoch": 4.6287927695287285, + "grad_norm": 1.2082698345184326, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 14340 + }, + { + "epoch": 4.632020658489348, + "grad_norm": 0.901454508304596, + "learning_rate": 0.0002, + "loss": 0.5449, + "step": 14350 + }, + { + "epoch": 4.635248547449968, + "grad_norm": 0.9593124985694885, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 14360 + }, + { + "epoch": 4.6384764364105875, + "grad_norm": 1.1241410970687866, + "learning_rate": 0.0002, + "loss": 0.4939, + "step": 14370 + }, + { + "epoch": 4.641704325371207, + "grad_norm": 0.9221102595329285, + "learning_rate": 0.0002, + "loss": 0.5319, + "step": 14380 + }, + { + "epoch": 4.644932214331827, + "grad_norm": 1.0035039186477661, + "learning_rate": 0.0002, + "loss": 0.524, + "step": 14390 + }, + { + "epoch": 4.648160103292446, + "grad_norm": 1.1270662546157837, + "learning_rate": 0.0002, + "loss": 0.5617, + "step": 14400 + }, + { + "epoch": 4.651387992253067, + "grad_norm": 0.8631120324134827, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 14410 + }, + { + "epoch": 4.654615881213687, + "grad_norm": 1.0604606866836548, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 14420 + }, + { + "epoch": 4.657843770174306, + "grad_norm": 0.8002706170082092, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 14430 + }, + { + "epoch": 4.661071659134926, + "grad_norm": 1.0642075538635254, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 14440 + }, + { + "epoch": 4.664299548095546, + "grad_norm": 0.9315671324729919, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 14450 + }, + { + "epoch": 4.667527437056165, + "grad_norm": 0.8311864137649536, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 14460 + }, + { + "epoch": 4.670755326016785, + "grad_norm": 0.8900430202484131, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 14470 + }, + { + "epoch": 4.6739832149774045, + "grad_norm": 1.059267282485962, + "learning_rate": 0.0002, + "loss": 0.5086, + "step": 14480 + }, + { + "epoch": 4.677211103938024, + "grad_norm": 0.9864052534103394, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 14490 + }, + { + "epoch": 4.680438992898644, + "grad_norm": 1.210854411125183, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 14500 + }, + { + "epoch": 4.683666881859264, + "grad_norm": 1.030693769454956, + "learning_rate": 0.0002, + "loss": 0.536, + "step": 14510 + }, + { + "epoch": 4.686894770819884, + "grad_norm": 0.9809406995773315, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 14520 + }, + { + "epoch": 4.690122659780504, + "grad_norm": 1.0471004247665405, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 14530 + }, + { + "epoch": 4.693350548741123, + "grad_norm": 1.1583727598190308, + "learning_rate": 0.0002, + "loss": 0.5613, + "step": 14540 + }, + { + "epoch": 4.696578437701743, + "grad_norm": 0.9664418697357178, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 14550 + }, + { + "epoch": 4.699806326662363, + "grad_norm": 0.9511209726333618, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 14560 + }, + { + "epoch": 4.703034215622982, + "grad_norm": 1.0211684703826904, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 14570 + }, + { + "epoch": 4.706262104583602, + "grad_norm": 1.097276210784912, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 14580 + }, + { + "epoch": 4.7094899935442225, + "grad_norm": 0.9363943338394165, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 14590 + }, + { + "epoch": 4.712717882504842, + "grad_norm": 1.4700615406036377, + "learning_rate": 0.0002, + "loss": 0.5261, + "step": 14600 + }, + { + "epoch": 4.715945771465462, + "grad_norm": 1.0001553297042847, + "learning_rate": 0.0002, + "loss": 0.5489, + "step": 14610 + }, + { + "epoch": 4.719173660426081, + "grad_norm": 1.0489927530288696, + "learning_rate": 0.0002, + "loss": 0.5236, + "step": 14620 + }, + { + "epoch": 4.722401549386701, + "grad_norm": 1.0483676195144653, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 14630 + }, + { + "epoch": 4.725629438347321, + "grad_norm": 1.1501940488815308, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 14640 + }, + { + "epoch": 4.72885732730794, + "grad_norm": 1.1703146696090698, + "learning_rate": 0.0002, + "loss": 0.5059, + "step": 14650 + }, + { + "epoch": 4.73208521626856, + "grad_norm": 0.8842985033988953, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 14660 + }, + { + "epoch": 4.73531310522918, + "grad_norm": 0.9147908687591553, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 14670 + }, + { + "epoch": 4.7385409941898, + "grad_norm": 1.0391576290130615, + "learning_rate": 0.0002, + "loss": 0.5436, + "step": 14680 + }, + { + "epoch": 4.74176888315042, + "grad_norm": 0.9469179511070251, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 14690 + }, + { + "epoch": 4.7449967721110395, + "grad_norm": 1.0529530048370361, + "learning_rate": 0.0002, + "loss": 0.5201, + "step": 14700 + }, + { + "epoch": 4.748224661071659, + "grad_norm": 0.9645711183547974, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 14710 + }, + { + "epoch": 4.751452550032279, + "grad_norm": 0.8163343071937561, + "learning_rate": 0.0002, + "loss": 0.5123, + "step": 14720 + }, + { + "epoch": 4.7546804389928985, + "grad_norm": 1.0581341981887817, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 14730 + }, + { + "epoch": 4.757908327953518, + "grad_norm": 1.0913853645324707, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 14740 + }, + { + "epoch": 4.761136216914138, + "grad_norm": 1.1071174144744873, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 14750 + }, + { + "epoch": 4.764364105874758, + "grad_norm": 1.0060709714889526, + "learning_rate": 0.0002, + "loss": 0.5353, + "step": 14760 + }, + { + "epoch": 4.767591994835378, + "grad_norm": 1.012024164199829, + "learning_rate": 0.0002, + "loss": 0.5415, + "step": 14770 + }, + { + "epoch": 4.770819883795998, + "grad_norm": 0.8438148498535156, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 14780 + }, + { + "epoch": 4.774047772756617, + "grad_norm": 0.8136811256408691, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 14790 + }, + { + "epoch": 4.777275661717237, + "grad_norm": 1.0765691995620728, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 14800 + }, + { + "epoch": 4.780503550677857, + "grad_norm": 1.0582574605941772, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 14810 + }, + { + "epoch": 4.783731439638476, + "grad_norm": 0.9419516921043396, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 14820 + }, + { + "epoch": 4.786959328599096, + "grad_norm": 0.9626181721687317, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 14830 + }, + { + "epoch": 4.7901872175597155, + "grad_norm": 1.2552800178527832, + "learning_rate": 0.0002, + "loss": 0.565, + "step": 14840 + }, + { + "epoch": 4.793415106520336, + "grad_norm": 0.9379919171333313, + "learning_rate": 0.0002, + "loss": 0.5402, + "step": 14850 + }, + { + "epoch": 4.796642995480956, + "grad_norm": 0.8166947364807129, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 14860 + }, + { + "epoch": 4.799870884441575, + "grad_norm": 0.9008694887161255, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 14870 + }, + { + "epoch": 4.803098773402195, + "grad_norm": 1.0256156921386719, + "learning_rate": 0.0002, + "loss": 0.5049, + "step": 14880 + }, + { + "epoch": 4.806326662362815, + "grad_norm": 0.9486594200134277, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 14890 + }, + { + "epoch": 4.809554551323434, + "grad_norm": 0.955238401889801, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 14900 + }, + { + "epoch": 4.812782440284054, + "grad_norm": 1.03775954246521, + "learning_rate": 0.0002, + "loss": 0.5269, + "step": 14910 + }, + { + "epoch": 4.816010329244674, + "grad_norm": 1.1383405923843384, + "learning_rate": 0.0002, + "loss": 0.5445, + "step": 14920 + }, + { + "epoch": 4.819238218205294, + "grad_norm": 0.9411700963973999, + "learning_rate": 0.0002, + "loss": 0.5347, + "step": 14930 + }, + { + "epoch": 4.822466107165914, + "grad_norm": 0.8188554644584656, + "learning_rate": 0.0002, + "loss": 0.4899, + "step": 14940 + }, + { + "epoch": 4.8256939961265335, + "grad_norm": 1.1336265802383423, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 14950 + }, + { + "epoch": 4.828921885087153, + "grad_norm": 1.106121301651001, + "learning_rate": 0.0002, + "loss": 0.5578, + "step": 14960 + }, + { + "epoch": 4.832149774047773, + "grad_norm": 1.0206533670425415, + "learning_rate": 0.0002, + "loss": 0.5306, + "step": 14970 + }, + { + "epoch": 4.8353776630083924, + "grad_norm": 1.1123926639556885, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 14980 + }, + { + "epoch": 4.838605551969012, + "grad_norm": 0.7879418730735779, + "learning_rate": 0.0002, + "loss": 0.5208, + "step": 14990 + }, + { + "epoch": 4.841833440929632, + "grad_norm": 1.0171709060668945, + "learning_rate": 0.0002, + "loss": 0.5385, + "step": 15000 + }, + { + "epoch": 4.845061329890251, + "grad_norm": 1.010671615600586, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 15010 + }, + { + "epoch": 4.848289218850871, + "grad_norm": 1.0778919458389282, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 15020 + }, + { + "epoch": 4.851517107811492, + "grad_norm": 1.0479968786239624, + "learning_rate": 0.0002, + "loss": 0.5587, + "step": 15030 + }, + { + "epoch": 4.854744996772111, + "grad_norm": 1.0345100164413452, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 15040 + }, + { + "epoch": 4.857972885732731, + "grad_norm": 0.9539691805839539, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 15050 + }, + { + "epoch": 4.8612007746933505, + "grad_norm": 0.9914752840995789, + "learning_rate": 0.0002, + "loss": 0.5314, + "step": 15060 + }, + { + "epoch": 4.86442866365397, + "grad_norm": 1.1935476064682007, + "learning_rate": 0.0002, + "loss": 0.5277, + "step": 15070 + }, + { + "epoch": 4.86765655261459, + "grad_norm": 1.0065057277679443, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 15080 + }, + { + "epoch": 4.8708844415752095, + "grad_norm": 0.9320993423461914, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 15090 + }, + { + "epoch": 4.87411233053583, + "grad_norm": 1.0578069686889648, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 15100 + }, + { + "epoch": 4.87734021949645, + "grad_norm": 0.9666239023208618, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 15110 + }, + { + "epoch": 4.880568108457069, + "grad_norm": 1.1322687864303589, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 15120 + }, + { + "epoch": 4.883795997417689, + "grad_norm": 0.955674409866333, + "learning_rate": 0.0002, + "loss": 0.5381, + "step": 15130 + }, + { + "epoch": 4.887023886378309, + "grad_norm": 1.119413137435913, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 15140 + }, + { + "epoch": 4.890251775338928, + "grad_norm": 0.863646924495697, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 15150 + }, + { + "epoch": 4.893479664299548, + "grad_norm": 1.1823450326919556, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 15160 + }, + { + "epoch": 4.896707553260168, + "grad_norm": 0.8657588958740234, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 15170 + }, + { + "epoch": 4.899935442220787, + "grad_norm": 0.8575737476348877, + "learning_rate": 0.0002, + "loss": 0.5239, + "step": 15180 + }, + { + "epoch": 4.903163331181407, + "grad_norm": 0.9611830711364746, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 15190 + }, + { + "epoch": 4.906391220142027, + "grad_norm": 1.1981453895568848, + "learning_rate": 0.0002, + "loss": 0.5505, + "step": 15200 + }, + { + "epoch": 4.909619109102647, + "grad_norm": 0.9401199221611023, + "learning_rate": 0.0002, + "loss": 0.5582, + "step": 15210 + }, + { + "epoch": 4.912846998063267, + "grad_norm": 0.8420369625091553, + "learning_rate": 0.0002, + "loss": 0.5631, + "step": 15220 + }, + { + "epoch": 4.916074887023886, + "grad_norm": 0.7877969145774841, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 15230 + }, + { + "epoch": 4.919302775984506, + "grad_norm": 0.8988324403762817, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 15240 + }, + { + "epoch": 4.922530664945126, + "grad_norm": 1.1103752851486206, + "learning_rate": 0.0002, + "loss": 0.5274, + "step": 15250 + }, + { + "epoch": 4.925758553905745, + "grad_norm": 0.8874443173408508, + "learning_rate": 0.0002, + "loss": 0.5249, + "step": 15260 + }, + { + "epoch": 4.928986442866366, + "grad_norm": 1.1001752614974976, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 15270 + }, + { + "epoch": 4.9322143318269855, + "grad_norm": 0.9661307334899902, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 15280 + }, + { + "epoch": 4.935442220787605, + "grad_norm": 1.1738812923431396, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 15290 + }, + { + "epoch": 4.938670109748225, + "grad_norm": 0.9773507714271545, + "learning_rate": 0.0002, + "loss": 0.5057, + "step": 15300 + }, + { + "epoch": 4.9418979987088445, + "grad_norm": 1.0735599994659424, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 15310 + }, + { + "epoch": 4.945125887669464, + "grad_norm": 1.0552113056182861, + "learning_rate": 0.0002, + "loss": 0.4996, + "step": 15320 + }, + { + "epoch": 4.948353776630084, + "grad_norm": 1.0900797843933105, + "learning_rate": 0.0002, + "loss": 0.5201, + "step": 15330 + }, + { + "epoch": 4.9515816655907035, + "grad_norm": 1.0908405780792236, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 15340 + }, + { + "epoch": 4.954809554551323, + "grad_norm": 1.010221004486084, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 15350 + }, + { + "epoch": 4.958037443511943, + "grad_norm": 1.0321437120437622, + "learning_rate": 0.0002, + "loss": 0.5423, + "step": 15360 + }, + { + "epoch": 4.961265332472563, + "grad_norm": 0.8430278897285461, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 15370 + }, + { + "epoch": 4.964493221433183, + "grad_norm": 0.8775330185890198, + "learning_rate": 0.0002, + "loss": 0.538, + "step": 15380 + }, + { + "epoch": 4.967721110393803, + "grad_norm": 0.9796988368034363, + "learning_rate": 0.0002, + "loss": 0.5344, + "step": 15390 + }, + { + "epoch": 4.970948999354422, + "grad_norm": 0.8782257437705994, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 15400 + }, + { + "epoch": 4.974176888315042, + "grad_norm": 0.9959840774536133, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 15410 + }, + { + "epoch": 4.9774047772756616, + "grad_norm": 1.0730273723602295, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 15420 + }, + { + "epoch": 4.980632666236281, + "grad_norm": 0.8653680682182312, + "learning_rate": 0.0002, + "loss": 0.5277, + "step": 15430 + }, + { + "epoch": 4.983860555196901, + "grad_norm": 1.0769985914230347, + "learning_rate": 0.0002, + "loss": 0.5301, + "step": 15440 + }, + { + "epoch": 4.987088444157521, + "grad_norm": 1.1336040496826172, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 15450 + }, + { + "epoch": 4.990316333118141, + "grad_norm": 0.9844824075698853, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 15460 + }, + { + "epoch": 4.993544222078761, + "grad_norm": 0.8368769288063049, + "learning_rate": 0.0002, + "loss": 0.5316, + "step": 15470 + }, + { + "epoch": 4.99677211103938, + "grad_norm": 1.0238676071166992, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 15480 + }, + { + "epoch": 5.0, + "grad_norm": 1.064820408821106, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 15490 + }, + { + "epoch": 5.0, + "eval_loss": 1.241918921470642, + "eval_runtime": 158.4099, + "eval_samples_per_second": 4.627, + "eval_steps_per_second": 0.581, + "step": 15490 + }, + { + "epoch": 5.00322788896062, + "grad_norm": 1.1366689205169678, + "learning_rate": 0.0002, + "loss": 0.4554, + "step": 15500 + }, + { + "epoch": 5.006455777921239, + "grad_norm": 1.2548010349273682, + "learning_rate": 0.0002, + "loss": 0.4288, + "step": 15510 + }, + { + "epoch": 5.009683666881859, + "grad_norm": 1.3875139951705933, + "learning_rate": 0.0002, + "loss": 0.4276, + "step": 15520 + }, + { + "epoch": 5.012911555842479, + "grad_norm": 0.9834036231040955, + "learning_rate": 0.0002, + "loss": 0.4198, + "step": 15530 + }, + { + "epoch": 5.016139444803099, + "grad_norm": 1.0737303495407104, + "learning_rate": 0.0002, + "loss": 0.4531, + "step": 15540 + }, + { + "epoch": 5.019367333763719, + "grad_norm": 0.9877859950065613, + "learning_rate": 0.0002, + "loss": 0.4073, + "step": 15550 + }, + { + "epoch": 5.0225952227243384, + "grad_norm": 1.143268346786499, + "learning_rate": 0.0002, + "loss": 0.4459, + "step": 15560 + }, + { + "epoch": 5.025823111684958, + "grad_norm": 1.1206166744232178, + "learning_rate": 0.0002, + "loss": 0.4477, + "step": 15570 + }, + { + "epoch": 5.029051000645578, + "grad_norm": 0.9977272748947144, + "learning_rate": 0.0002, + "loss": 0.4593, + "step": 15580 + }, + { + "epoch": 5.032278889606197, + "grad_norm": 1.3193285465240479, + "learning_rate": 0.0002, + "loss": 0.436, + "step": 15590 + }, + { + "epoch": 5.035506778566817, + "grad_norm": 1.0761713981628418, + "learning_rate": 0.0002, + "loss": 0.4426, + "step": 15600 + }, + { + "epoch": 5.038734667527437, + "grad_norm": 1.1250759363174438, + "learning_rate": 0.0002, + "loss": 0.4701, + "step": 15610 + }, + { + "epoch": 5.041962556488057, + "grad_norm": 1.0414305925369263, + "learning_rate": 0.0002, + "loss": 0.3995, + "step": 15620 + }, + { + "epoch": 5.045190445448677, + "grad_norm": 1.0906853675842285, + "learning_rate": 0.0002, + "loss": 0.4244, + "step": 15630 + }, + { + "epoch": 5.0484183344092965, + "grad_norm": 0.9360867142677307, + "learning_rate": 0.0002, + "loss": 0.441, + "step": 15640 + }, + { + "epoch": 5.051646223369916, + "grad_norm": 0.9078057408332825, + "learning_rate": 0.0002, + "loss": 0.4146, + "step": 15650 + }, + { + "epoch": 5.054874112330536, + "grad_norm": 1.0054848194122314, + "learning_rate": 0.0002, + "loss": 0.4285, + "step": 15660 + }, + { + "epoch": 5.0581020012911555, + "grad_norm": 0.9538215398788452, + "learning_rate": 0.0002, + "loss": 0.417, + "step": 15670 + }, + { + "epoch": 5.061329890251775, + "grad_norm": 1.6312693357467651, + "learning_rate": 0.0002, + "loss": 0.4629, + "step": 15680 + }, + { + "epoch": 5.064557779212395, + "grad_norm": 1.2100921869277954, + "learning_rate": 0.0002, + "loss": 0.3996, + "step": 15690 + }, + { + "epoch": 5.0677856681730145, + "grad_norm": 1.2776238918304443, + "learning_rate": 0.0002, + "loss": 0.4489, + "step": 15700 + }, + { + "epoch": 5.071013557133635, + "grad_norm": 1.0110050439834595, + "learning_rate": 0.0002, + "loss": 0.4728, + "step": 15710 + }, + { + "epoch": 5.074241446094255, + "grad_norm": 1.0896575450897217, + "learning_rate": 0.0002, + "loss": 0.4916, + "step": 15720 + }, + { + "epoch": 5.077469335054874, + "grad_norm": 0.9989936947822571, + "learning_rate": 0.0002, + "loss": 0.4462, + "step": 15730 + }, + { + "epoch": 5.080697224015494, + "grad_norm": 1.0412228107452393, + "learning_rate": 0.0002, + "loss": 0.457, + "step": 15740 + }, + { + "epoch": 5.083925112976114, + "grad_norm": 1.0964457988739014, + "learning_rate": 0.0002, + "loss": 0.4525, + "step": 15750 + }, + { + "epoch": 5.087153001936733, + "grad_norm": 1.1700960397720337, + "learning_rate": 0.0002, + "loss": 0.4539, + "step": 15760 + }, + { + "epoch": 5.090380890897353, + "grad_norm": 0.9515631794929504, + "learning_rate": 0.0002, + "loss": 0.4517, + "step": 15770 + }, + { + "epoch": 5.093608779857973, + "grad_norm": 1.0895006656646729, + "learning_rate": 0.0002, + "loss": 0.4352, + "step": 15780 + }, + { + "epoch": 5.096836668818592, + "grad_norm": 1.041312575340271, + "learning_rate": 0.0002, + "loss": 0.4765, + "step": 15790 + }, + { + "epoch": 5.100064557779213, + "grad_norm": 0.9518465399742126, + "learning_rate": 0.0002, + "loss": 0.4532, + "step": 15800 + }, + { + "epoch": 5.103292446739832, + "grad_norm": 0.8317030668258667, + "learning_rate": 0.0002, + "loss": 0.4187, + "step": 15810 + }, + { + "epoch": 5.106520335700452, + "grad_norm": 1.0933761596679688, + "learning_rate": 0.0002, + "loss": 0.4523, + "step": 15820 + }, + { + "epoch": 5.109748224661072, + "grad_norm": 1.0069324970245361, + "learning_rate": 0.0002, + "loss": 0.4689, + "step": 15830 + }, + { + "epoch": 5.112976113621691, + "grad_norm": 1.1166068315505981, + "learning_rate": 0.0002, + "loss": 0.4773, + "step": 15840 + }, + { + "epoch": 5.116204002582311, + "grad_norm": 1.069992184638977, + "learning_rate": 0.0002, + "loss": 0.4635, + "step": 15850 + }, + { + "epoch": 5.119431891542931, + "grad_norm": 1.3728036880493164, + "learning_rate": 0.0002, + "loss": 0.445, + "step": 15860 + }, + { + "epoch": 5.12265978050355, + "grad_norm": 1.0625780820846558, + "learning_rate": 0.0002, + "loss": 0.4563, + "step": 15870 + }, + { + "epoch": 5.125887669464171, + "grad_norm": 1.090174913406372, + "learning_rate": 0.0002, + "loss": 0.426, + "step": 15880 + }, + { + "epoch": 5.1291155584247905, + "grad_norm": 0.8729526996612549, + "learning_rate": 0.0002, + "loss": 0.457, + "step": 15890 + }, + { + "epoch": 5.13234344738541, + "grad_norm": 0.9561540484428406, + "learning_rate": 0.0002, + "loss": 0.4686, + "step": 15900 + }, + { + "epoch": 5.13557133634603, + "grad_norm": 1.012120246887207, + "learning_rate": 0.0002, + "loss": 0.4266, + "step": 15910 + }, + { + "epoch": 5.1387992253066495, + "grad_norm": 1.1027921438217163, + "learning_rate": 0.0002, + "loss": 0.4484, + "step": 15920 + }, + { + "epoch": 5.142027114267269, + "grad_norm": 1.0878126621246338, + "learning_rate": 0.0002, + "loss": 0.4389, + "step": 15930 + }, + { + "epoch": 5.145255003227889, + "grad_norm": 0.9619103670120239, + "learning_rate": 0.0002, + "loss": 0.4716, + "step": 15940 + }, + { + "epoch": 5.148482892188508, + "grad_norm": 1.1684138774871826, + "learning_rate": 0.0002, + "loss": 0.4071, + "step": 15950 + }, + { + "epoch": 5.151710781149128, + "grad_norm": 1.3379510641098022, + "learning_rate": 0.0002, + "loss": 0.4292, + "step": 15960 + }, + { + "epoch": 5.154938670109749, + "grad_norm": 1.0427496433258057, + "learning_rate": 0.0002, + "loss": 0.4413, + "step": 15970 + }, + { + "epoch": 5.158166559070368, + "grad_norm": 0.9917148351669312, + "learning_rate": 0.0002, + "loss": 0.4665, + "step": 15980 + }, + { + "epoch": 5.161394448030988, + "grad_norm": 1.0899780988693237, + "learning_rate": 0.0002, + "loss": 0.4527, + "step": 15990 + }, + { + "epoch": 5.1646223369916076, + "grad_norm": 0.9251647591590881, + "learning_rate": 0.0002, + "loss": 0.4764, + "step": 16000 + }, + { + "epoch": 5.167850225952227, + "grad_norm": 1.1669172048568726, + "learning_rate": 0.0002, + "loss": 0.5043, + "step": 16010 + }, + { + "epoch": 5.171078114912847, + "grad_norm": 1.2285256385803223, + "learning_rate": 0.0002, + "loss": 0.4726, + "step": 16020 + }, + { + "epoch": 5.1743060038734665, + "grad_norm": 1.0504484176635742, + "learning_rate": 0.0002, + "loss": 0.4312, + "step": 16030 + }, + { + "epoch": 5.177533892834086, + "grad_norm": 1.2829089164733887, + "learning_rate": 0.0002, + "loss": 0.4507, + "step": 16040 + }, + { + "epoch": 5.180761781794706, + "grad_norm": 0.9332743287086487, + "learning_rate": 0.0002, + "loss": 0.4547, + "step": 16050 + }, + { + "epoch": 5.183989670755326, + "grad_norm": 1.0054426193237305, + "learning_rate": 0.0002, + "loss": 0.4211, + "step": 16060 + }, + { + "epoch": 5.187217559715946, + "grad_norm": 1.0049669742584229, + "learning_rate": 0.0002, + "loss": 0.4415, + "step": 16070 + }, + { + "epoch": 5.190445448676566, + "grad_norm": 1.0171366930007935, + "learning_rate": 0.0002, + "loss": 0.4462, + "step": 16080 + }, + { + "epoch": 5.193673337637185, + "grad_norm": 1.234966516494751, + "learning_rate": 0.0002, + "loss": 0.4725, + "step": 16090 + }, + { + "epoch": 5.196901226597805, + "grad_norm": 0.9127960205078125, + "learning_rate": 0.0002, + "loss": 0.4579, + "step": 16100 + }, + { + "epoch": 5.200129115558425, + "grad_norm": 1.153924822807312, + "learning_rate": 0.0002, + "loss": 0.4647, + "step": 16110 + }, + { + "epoch": 5.203357004519044, + "grad_norm": 1.26716947555542, + "learning_rate": 0.0002, + "loss": 0.4826, + "step": 16120 + }, + { + "epoch": 5.206584893479664, + "grad_norm": 1.2438743114471436, + "learning_rate": 0.0002, + "loss": 0.446, + "step": 16130 + }, + { + "epoch": 5.2098127824402845, + "grad_norm": 1.0888392925262451, + "learning_rate": 0.0002, + "loss": 0.4768, + "step": 16140 + }, + { + "epoch": 5.213040671400904, + "grad_norm": 1.1741917133331299, + "learning_rate": 0.0002, + "loss": 0.4508, + "step": 16150 + }, + { + "epoch": 5.216268560361524, + "grad_norm": 0.9508614540100098, + "learning_rate": 0.0002, + "loss": 0.4271, + "step": 16160 + }, + { + "epoch": 5.219496449322143, + "grad_norm": 0.9714716672897339, + "learning_rate": 0.0002, + "loss": 0.4577, + "step": 16170 + }, + { + "epoch": 5.222724338282763, + "grad_norm": 1.2681622505187988, + "learning_rate": 0.0002, + "loss": 0.4636, + "step": 16180 + }, + { + "epoch": 5.225952227243383, + "grad_norm": 1.045871376991272, + "learning_rate": 0.0002, + "loss": 0.4723, + "step": 16190 + }, + { + "epoch": 5.229180116204002, + "grad_norm": 1.0272563695907593, + "learning_rate": 0.0002, + "loss": 0.4467, + "step": 16200 + }, + { + "epoch": 5.232408005164622, + "grad_norm": 1.092901349067688, + "learning_rate": 0.0002, + "loss": 0.4353, + "step": 16210 + }, + { + "epoch": 5.235635894125242, + "grad_norm": 0.9332799315452576, + "learning_rate": 0.0002, + "loss": 0.4588, + "step": 16220 + }, + { + "epoch": 5.238863783085862, + "grad_norm": 1.1728498935699463, + "learning_rate": 0.0002, + "loss": 0.4594, + "step": 16230 + }, + { + "epoch": 5.242091672046482, + "grad_norm": 0.9932476878166199, + "learning_rate": 0.0002, + "loss": 0.4652, + "step": 16240 + }, + { + "epoch": 5.2453195610071015, + "grad_norm": 0.735236406326294, + "learning_rate": 0.0002, + "loss": 0.4469, + "step": 16250 + }, + { + "epoch": 5.248547449967721, + "grad_norm": 1.0289303064346313, + "learning_rate": 0.0002, + "loss": 0.4386, + "step": 16260 + }, + { + "epoch": 5.251775338928341, + "grad_norm": 0.9488231539726257, + "learning_rate": 0.0002, + "loss": 0.4303, + "step": 16270 + }, + { + "epoch": 5.2550032278889605, + "grad_norm": 0.8320055603981018, + "learning_rate": 0.0002, + "loss": 0.4495, + "step": 16280 + }, + { + "epoch": 5.25823111684958, + "grad_norm": 1.2013251781463623, + "learning_rate": 0.0002, + "loss": 0.4224, + "step": 16290 + }, + { + "epoch": 5.2614590058102, + "grad_norm": 1.0649845600128174, + "learning_rate": 0.0002, + "loss": 0.4666, + "step": 16300 + }, + { + "epoch": 5.26468689477082, + "grad_norm": 1.1674472093582153, + "learning_rate": 0.0002, + "loss": 0.4325, + "step": 16310 + }, + { + "epoch": 5.26791478373144, + "grad_norm": 1.3934763669967651, + "learning_rate": 0.0002, + "loss": 0.4482, + "step": 16320 + }, + { + "epoch": 5.27114267269206, + "grad_norm": 0.8427977561950684, + "learning_rate": 0.0002, + "loss": 0.4494, + "step": 16330 + }, + { + "epoch": 5.274370561652679, + "grad_norm": 1.0497093200683594, + "learning_rate": 0.0002, + "loss": 0.4234, + "step": 16340 + }, + { + "epoch": 5.277598450613299, + "grad_norm": 0.8562338352203369, + "learning_rate": 0.0002, + "loss": 0.4337, + "step": 16350 + }, + { + "epoch": 5.280826339573919, + "grad_norm": 1.043920874595642, + "learning_rate": 0.0002, + "loss": 0.4664, + "step": 16360 + }, + { + "epoch": 5.284054228534538, + "grad_norm": 1.0039188861846924, + "learning_rate": 0.0002, + "loss": 0.4463, + "step": 16370 + }, + { + "epoch": 5.287282117495158, + "grad_norm": 0.9414041638374329, + "learning_rate": 0.0002, + "loss": 0.4149, + "step": 16380 + }, + { + "epoch": 5.2905100064557775, + "grad_norm": 1.3346221446990967, + "learning_rate": 0.0002, + "loss": 0.5119, + "step": 16390 + }, + { + "epoch": 5.293737895416398, + "grad_norm": 1.0173962116241455, + "learning_rate": 0.0002, + "loss": 0.4479, + "step": 16400 + }, + { + "epoch": 5.296965784377018, + "grad_norm": 0.7756500244140625, + "learning_rate": 0.0002, + "loss": 0.4538, + "step": 16410 + }, + { + "epoch": 5.300193673337637, + "grad_norm": 1.1185362339019775, + "learning_rate": 0.0002, + "loss": 0.4306, + "step": 16420 + }, + { + "epoch": 5.303421562298257, + "grad_norm": 1.0904899835586548, + "learning_rate": 0.0002, + "loss": 0.5033, + "step": 16430 + }, + { + "epoch": 5.306649451258877, + "grad_norm": 1.0803170204162598, + "learning_rate": 0.0002, + "loss": 0.4887, + "step": 16440 + }, + { + "epoch": 5.309877340219496, + "grad_norm": 1.1492092609405518, + "learning_rate": 0.0002, + "loss": 0.4473, + "step": 16450 + }, + { + "epoch": 5.313105229180116, + "grad_norm": 1.1212135553359985, + "learning_rate": 0.0002, + "loss": 0.4696, + "step": 16460 + }, + { + "epoch": 5.316333118140736, + "grad_norm": 0.8274528980255127, + "learning_rate": 0.0002, + "loss": 0.4438, + "step": 16470 + }, + { + "epoch": 5.319561007101356, + "grad_norm": 1.118891716003418, + "learning_rate": 0.0002, + "loss": 0.468, + "step": 16480 + }, + { + "epoch": 5.322788896061976, + "grad_norm": 1.185945749282837, + "learning_rate": 0.0002, + "loss": 0.4403, + "step": 16490 + }, + { + "epoch": 5.3260167850225955, + "grad_norm": 1.0275214910507202, + "learning_rate": 0.0002, + "loss": 0.4946, + "step": 16500 + }, + { + "epoch": 5.329244673983215, + "grad_norm": 0.9346362352371216, + "learning_rate": 0.0002, + "loss": 0.4612, + "step": 16510 + }, + { + "epoch": 5.332472562943835, + "grad_norm": 0.9600600600242615, + "learning_rate": 0.0002, + "loss": 0.4722, + "step": 16520 + }, + { + "epoch": 5.335700451904454, + "grad_norm": 1.1238188743591309, + "learning_rate": 0.0002, + "loss": 0.4536, + "step": 16530 + }, + { + "epoch": 5.338928340865074, + "grad_norm": 0.8660476207733154, + "learning_rate": 0.0002, + "loss": 0.5025, + "step": 16540 + }, + { + "epoch": 5.342156229825694, + "grad_norm": 0.9869821071624756, + "learning_rate": 0.0002, + "loss": 0.4732, + "step": 16550 + }, + { + "epoch": 5.345384118786313, + "grad_norm": 1.1719090938568115, + "learning_rate": 0.0002, + "loss": 0.4967, + "step": 16560 + }, + { + "epoch": 5.348612007746934, + "grad_norm": 1.0122894048690796, + "learning_rate": 0.0002, + "loss": 0.4563, + "step": 16570 + }, + { + "epoch": 5.351839896707554, + "grad_norm": 1.2431079149246216, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 16580 + }, + { + "epoch": 5.355067785668173, + "grad_norm": 1.4178080558776855, + "learning_rate": 0.0002, + "loss": 0.4708, + "step": 16590 + }, + { + "epoch": 5.358295674628793, + "grad_norm": 1.1895726919174194, + "learning_rate": 0.0002, + "loss": 0.4686, + "step": 16600 + }, + { + "epoch": 5.3615235635894125, + "grad_norm": 1.154392123222351, + "learning_rate": 0.0002, + "loss": 0.475, + "step": 16610 + }, + { + "epoch": 5.364751452550032, + "grad_norm": 0.9207229018211365, + "learning_rate": 0.0002, + "loss": 0.4511, + "step": 16620 + }, + { + "epoch": 5.367979341510652, + "grad_norm": 1.0247414112091064, + "learning_rate": 0.0002, + "loss": 0.4606, + "step": 16630 + }, + { + "epoch": 5.3712072304712715, + "grad_norm": 1.0402202606201172, + "learning_rate": 0.0002, + "loss": 0.4886, + "step": 16640 + }, + { + "epoch": 5.374435119431892, + "grad_norm": 1.1902891397476196, + "learning_rate": 0.0002, + "loss": 0.4903, + "step": 16650 + }, + { + "epoch": 5.377663008392512, + "grad_norm": 0.9572759866714478, + "learning_rate": 0.0002, + "loss": 0.4583, + "step": 16660 + }, + { + "epoch": 5.380890897353131, + "grad_norm": 0.9968860149383545, + "learning_rate": 0.0002, + "loss": 0.4636, + "step": 16670 + }, + { + "epoch": 5.384118786313751, + "grad_norm": 1.2468547821044922, + "learning_rate": 0.0002, + "loss": 0.477, + "step": 16680 + }, + { + "epoch": 5.387346675274371, + "grad_norm": 1.154661774635315, + "learning_rate": 0.0002, + "loss": 0.5223, + "step": 16690 + }, + { + "epoch": 5.39057456423499, + "grad_norm": 0.8837044835090637, + "learning_rate": 0.0002, + "loss": 0.4637, + "step": 16700 + }, + { + "epoch": 5.39380245319561, + "grad_norm": 1.0317907333374023, + "learning_rate": 0.0002, + "loss": 0.4744, + "step": 16710 + }, + { + "epoch": 5.39703034215623, + "grad_norm": 0.9811587929725647, + "learning_rate": 0.0002, + "loss": 0.4831, + "step": 16720 + }, + { + "epoch": 5.400258231116849, + "grad_norm": 0.9487450122833252, + "learning_rate": 0.0002, + "loss": 0.4739, + "step": 16730 + }, + { + "epoch": 5.403486120077469, + "grad_norm": 1.0540274381637573, + "learning_rate": 0.0002, + "loss": 0.4574, + "step": 16740 + }, + { + "epoch": 5.406714009038089, + "grad_norm": 1.028363585472107, + "learning_rate": 0.0002, + "loss": 0.4709, + "step": 16750 + }, + { + "epoch": 5.409941897998709, + "grad_norm": 1.0200704336166382, + "learning_rate": 0.0002, + "loss": 0.468, + "step": 16760 + }, + { + "epoch": 5.413169786959329, + "grad_norm": 1.0330981016159058, + "learning_rate": 0.0002, + "loss": 0.4383, + "step": 16770 + }, + { + "epoch": 5.416397675919948, + "grad_norm": 1.320875644683838, + "learning_rate": 0.0002, + "loss": 0.4645, + "step": 16780 + }, + { + "epoch": 5.419625564880568, + "grad_norm": 0.9838143587112427, + "learning_rate": 0.0002, + "loss": 0.4601, + "step": 16790 + }, + { + "epoch": 5.422853453841188, + "grad_norm": 1.1006578207015991, + "learning_rate": 0.0002, + "loss": 0.4835, + "step": 16800 + }, + { + "epoch": 5.426081342801807, + "grad_norm": 1.099174976348877, + "learning_rate": 0.0002, + "loss": 0.4871, + "step": 16810 + }, + { + "epoch": 5.429309231762427, + "grad_norm": 1.0632189512252808, + "learning_rate": 0.0002, + "loss": 0.4773, + "step": 16820 + }, + { + "epoch": 5.4325371207230475, + "grad_norm": 0.9673194885253906, + "learning_rate": 0.0002, + "loss": 0.4732, + "step": 16830 + }, + { + "epoch": 5.435765009683667, + "grad_norm": 0.853013813495636, + "learning_rate": 0.0002, + "loss": 0.4731, + "step": 16840 + }, + { + "epoch": 5.438992898644287, + "grad_norm": 1.0261728763580322, + "learning_rate": 0.0002, + "loss": 0.4856, + "step": 16850 + }, + { + "epoch": 5.4422207876049065, + "grad_norm": 1.1642370223999023, + "learning_rate": 0.0002, + "loss": 0.4729, + "step": 16860 + }, + { + "epoch": 5.445448676565526, + "grad_norm": 0.8715673685073853, + "learning_rate": 0.0002, + "loss": 0.4751, + "step": 16870 + }, + { + "epoch": 5.448676565526146, + "grad_norm": 0.905746579170227, + "learning_rate": 0.0002, + "loss": 0.4566, + "step": 16880 + }, + { + "epoch": 5.451904454486765, + "grad_norm": 1.1051915884017944, + "learning_rate": 0.0002, + "loss": 0.4536, + "step": 16890 + }, + { + "epoch": 5.455132343447385, + "grad_norm": 1.0781478881835938, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 16900 + }, + { + "epoch": 5.458360232408005, + "grad_norm": 1.1168911457061768, + "learning_rate": 0.0002, + "loss": 0.4655, + "step": 16910 + }, + { + "epoch": 5.461588121368625, + "grad_norm": 1.1150046586990356, + "learning_rate": 0.0002, + "loss": 0.4624, + "step": 16920 + }, + { + "epoch": 5.464816010329245, + "grad_norm": 0.9862499833106995, + "learning_rate": 0.0002, + "loss": 0.4849, + "step": 16930 + }, + { + "epoch": 5.468043899289865, + "grad_norm": 1.5416640043258667, + "learning_rate": 0.0002, + "loss": 0.47, + "step": 16940 + }, + { + "epoch": 5.471271788250484, + "grad_norm": 0.8960899710655212, + "learning_rate": 0.0002, + "loss": 0.4508, + "step": 16950 + }, + { + "epoch": 5.474499677211104, + "grad_norm": 0.9796477556228638, + "learning_rate": 0.0002, + "loss": 0.5002, + "step": 16960 + }, + { + "epoch": 5.4777275661717235, + "grad_norm": 0.9526587128639221, + "learning_rate": 0.0002, + "loss": 0.4939, + "step": 16970 + }, + { + "epoch": 5.480955455132343, + "grad_norm": 1.2373039722442627, + "learning_rate": 0.0002, + "loss": 0.4807, + "step": 16980 + }, + { + "epoch": 5.484183344092963, + "grad_norm": 1.1860566139221191, + "learning_rate": 0.0002, + "loss": 0.4642, + "step": 16990 + }, + { + "epoch": 5.487411233053583, + "grad_norm": 1.477345585823059, + "learning_rate": 0.0002, + "loss": 0.4929, + "step": 17000 + }, + { + "epoch": 5.490639122014203, + "grad_norm": 1.1029295921325684, + "learning_rate": 0.0002, + "loss": 0.4566, + "step": 17010 + }, + { + "epoch": 5.493867010974823, + "grad_norm": 1.1416981220245361, + "learning_rate": 0.0002, + "loss": 0.487, + "step": 17020 + }, + { + "epoch": 5.497094899935442, + "grad_norm": 1.1647989749908447, + "learning_rate": 0.0002, + "loss": 0.475, + "step": 17030 + }, + { + "epoch": 5.500322788896062, + "grad_norm": 1.1297032833099365, + "learning_rate": 0.0002, + "loss": 0.4644, + "step": 17040 + }, + { + "epoch": 5.503550677856682, + "grad_norm": 0.9764689207077026, + "learning_rate": 0.0002, + "loss": 0.4885, + "step": 17050 + }, + { + "epoch": 5.506778566817301, + "grad_norm": 1.038161039352417, + "learning_rate": 0.0002, + "loss": 0.4789, + "step": 17060 + }, + { + "epoch": 5.510006455777921, + "grad_norm": 1.1417886018753052, + "learning_rate": 0.0002, + "loss": 0.4467, + "step": 17070 + }, + { + "epoch": 5.513234344738541, + "grad_norm": 0.9300898313522339, + "learning_rate": 0.0002, + "loss": 0.4782, + "step": 17080 + }, + { + "epoch": 5.516462233699161, + "grad_norm": 1.0295016765594482, + "learning_rate": 0.0002, + "loss": 0.4805, + "step": 17090 + }, + { + "epoch": 5.519690122659781, + "grad_norm": 1.1273008584976196, + "learning_rate": 0.0002, + "loss": 0.4663, + "step": 17100 + }, + { + "epoch": 5.5229180116204, + "grad_norm": 0.9542737007141113, + "learning_rate": 0.0002, + "loss": 0.4897, + "step": 17110 + }, + { + "epoch": 5.52614590058102, + "grad_norm": 1.34589421749115, + "learning_rate": 0.0002, + "loss": 0.51, + "step": 17120 + }, + { + "epoch": 5.52937378954164, + "grad_norm": 0.9889675378799438, + "learning_rate": 0.0002, + "loss": 0.467, + "step": 17130 + }, + { + "epoch": 5.532601678502259, + "grad_norm": 1.25719153881073, + "learning_rate": 0.0002, + "loss": 0.4752, + "step": 17140 + }, + { + "epoch": 5.535829567462879, + "grad_norm": 1.2511073350906372, + "learning_rate": 0.0002, + "loss": 0.4609, + "step": 17150 + }, + { + "epoch": 5.539057456423499, + "grad_norm": 1.1993521451950073, + "learning_rate": 0.0002, + "loss": 0.4992, + "step": 17160 + }, + { + "epoch": 5.542285345384119, + "grad_norm": 1.1394526958465576, + "learning_rate": 0.0002, + "loss": 0.4986, + "step": 17170 + }, + { + "epoch": 5.545513234344739, + "grad_norm": 1.0435349941253662, + "learning_rate": 0.0002, + "loss": 0.5284, + "step": 17180 + }, + { + "epoch": 5.5487411233053585, + "grad_norm": 1.120940089225769, + "learning_rate": 0.0002, + "loss": 0.4934, + "step": 17190 + }, + { + "epoch": 5.551969012265978, + "grad_norm": 1.0906445980072021, + "learning_rate": 0.0002, + "loss": 0.4704, + "step": 17200 + }, + { + "epoch": 5.555196901226598, + "grad_norm": 0.8883966207504272, + "learning_rate": 0.0002, + "loss": 0.4896, + "step": 17210 + }, + { + "epoch": 5.5584247901872175, + "grad_norm": 1.3078752756118774, + "learning_rate": 0.0002, + "loss": 0.4696, + "step": 17220 + }, + { + "epoch": 5.561652679147837, + "grad_norm": 1.0224416255950928, + "learning_rate": 0.0002, + "loss": 0.4805, + "step": 17230 + }, + { + "epoch": 5.564880568108457, + "grad_norm": 1.242518663406372, + "learning_rate": 0.0002, + "loss": 0.47, + "step": 17240 + }, + { + "epoch": 5.568108457069076, + "grad_norm": 1.2328250408172607, + "learning_rate": 0.0002, + "loss": 0.4708, + "step": 17250 + }, + { + "epoch": 5.571336346029697, + "grad_norm": 1.2186611890792847, + "learning_rate": 0.0002, + "loss": 0.4685, + "step": 17260 + }, + { + "epoch": 5.574564234990317, + "grad_norm": 1.0947459936141968, + "learning_rate": 0.0002, + "loss": 0.4688, + "step": 17270 + }, + { + "epoch": 5.577792123950936, + "grad_norm": 1.075279951095581, + "learning_rate": 0.0002, + "loss": 0.506, + "step": 17280 + }, + { + "epoch": 5.581020012911556, + "grad_norm": 1.0316804647445679, + "learning_rate": 0.0002, + "loss": 0.478, + "step": 17290 + }, + { + "epoch": 5.584247901872176, + "grad_norm": 1.1077373027801514, + "learning_rate": 0.0002, + "loss": 0.478, + "step": 17300 + }, + { + "epoch": 5.587475790832795, + "grad_norm": 1.219228744506836, + "learning_rate": 0.0002, + "loss": 0.4857, + "step": 17310 + }, + { + "epoch": 5.590703679793415, + "grad_norm": 1.026361346244812, + "learning_rate": 0.0002, + "loss": 0.4465, + "step": 17320 + }, + { + "epoch": 5.5939315687540345, + "grad_norm": 1.1621283292770386, + "learning_rate": 0.0002, + "loss": 0.4831, + "step": 17330 + }, + { + "epoch": 5.597159457714655, + "grad_norm": 1.0177470445632935, + "learning_rate": 0.0002, + "loss": 0.4706, + "step": 17340 + }, + { + "epoch": 5.600387346675275, + "grad_norm": 1.0625319480895996, + "learning_rate": 0.0002, + "loss": 0.4961, + "step": 17350 + }, + { + "epoch": 5.603615235635894, + "grad_norm": 1.148815393447876, + "learning_rate": 0.0002, + "loss": 0.484, + "step": 17360 + }, + { + "epoch": 5.606843124596514, + "grad_norm": 1.0571802854537964, + "learning_rate": 0.0002, + "loss": 0.4804, + "step": 17370 + }, + { + "epoch": 5.610071013557134, + "grad_norm": 1.2069389820098877, + "learning_rate": 0.0002, + "loss": 0.5202, + "step": 17380 + }, + { + "epoch": 5.613298902517753, + "grad_norm": 1.407530426979065, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 17390 + }, + { + "epoch": 5.616526791478373, + "grad_norm": 1.247060775756836, + "learning_rate": 0.0002, + "loss": 0.4688, + "step": 17400 + }, + { + "epoch": 5.619754680438993, + "grad_norm": 1.431684136390686, + "learning_rate": 0.0002, + "loss": 0.4359, + "step": 17410 + }, + { + "epoch": 5.622982569399612, + "grad_norm": 1.0520552396774292, + "learning_rate": 0.0002, + "loss": 0.5244, + "step": 17420 + }, + { + "epoch": 5.626210458360232, + "grad_norm": 1.0593537092208862, + "learning_rate": 0.0002, + "loss": 0.4993, + "step": 17430 + }, + { + "epoch": 5.6294383473208525, + "grad_norm": 1.4414515495300293, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 17440 + }, + { + "epoch": 5.632666236281472, + "grad_norm": 1.0902460813522339, + "learning_rate": 0.0002, + "loss": 0.4761, + "step": 17450 + }, + { + "epoch": 5.635894125242092, + "grad_norm": 0.890944242477417, + "learning_rate": 0.0002, + "loss": 0.4737, + "step": 17460 + }, + { + "epoch": 5.639122014202711, + "grad_norm": 1.035675287246704, + "learning_rate": 0.0002, + "loss": 0.4706, + "step": 17470 + }, + { + "epoch": 5.642349903163331, + "grad_norm": 0.9792264103889465, + "learning_rate": 0.0002, + "loss": 0.484, + "step": 17480 + }, + { + "epoch": 5.645577792123951, + "grad_norm": 1.1888220310211182, + "learning_rate": 0.0002, + "loss": 0.4753, + "step": 17490 + }, + { + "epoch": 5.64880568108457, + "grad_norm": 1.0169143676757812, + "learning_rate": 0.0002, + "loss": 0.5047, + "step": 17500 + }, + { + "epoch": 5.652033570045191, + "grad_norm": 0.9812449216842651, + "learning_rate": 0.0002, + "loss": 0.4919, + "step": 17510 + }, + { + "epoch": 5.655261459005811, + "grad_norm": 1.0509105920791626, + "learning_rate": 0.0002, + "loss": 0.4879, + "step": 17520 + }, + { + "epoch": 5.65848934796643, + "grad_norm": 0.9047426581382751, + "learning_rate": 0.0002, + "loss": 0.4695, + "step": 17530 + }, + { + "epoch": 5.66171723692705, + "grad_norm": 1.2393709421157837, + "learning_rate": 0.0002, + "loss": 0.4712, + "step": 17540 + }, + { + "epoch": 5.6649451258876695, + "grad_norm": 1.1098991632461548, + "learning_rate": 0.0002, + "loss": 0.5012, + "step": 17550 + }, + { + "epoch": 5.668173014848289, + "grad_norm": 0.8181570768356323, + "learning_rate": 0.0002, + "loss": 0.4499, + "step": 17560 + }, + { + "epoch": 5.671400903808909, + "grad_norm": 0.9676381945610046, + "learning_rate": 0.0002, + "loss": 0.4973, + "step": 17570 + }, + { + "epoch": 5.6746287927695285, + "grad_norm": 1.1225934028625488, + "learning_rate": 0.0002, + "loss": 0.5058, + "step": 17580 + }, + { + "epoch": 5.677856681730148, + "grad_norm": 1.6259925365447998, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 17590 + }, + { + "epoch": 5.681084570690768, + "grad_norm": 0.7751404643058777, + "learning_rate": 0.0002, + "loss": 0.4613, + "step": 17600 + }, + { + "epoch": 5.684312459651388, + "grad_norm": 0.8478589057922363, + "learning_rate": 0.0002, + "loss": 0.4895, + "step": 17610 + }, + { + "epoch": 5.687540348612008, + "grad_norm": 1.2887113094329834, + "learning_rate": 0.0002, + "loss": 0.4492, + "step": 17620 + }, + { + "epoch": 5.690768237572628, + "grad_norm": 1.1452652215957642, + "learning_rate": 0.0002, + "loss": 0.4792, + "step": 17630 + }, + { + "epoch": 5.693996126533247, + "grad_norm": 1.0370417833328247, + "learning_rate": 0.0002, + "loss": 0.4889, + "step": 17640 + }, + { + "epoch": 5.697224015493867, + "grad_norm": 1.1358870267868042, + "learning_rate": 0.0002, + "loss": 0.535, + "step": 17650 + }, + { + "epoch": 5.700451904454487, + "grad_norm": 1.2772479057312012, + "learning_rate": 0.0002, + "loss": 0.4753, + "step": 17660 + }, + { + "epoch": 5.703679793415106, + "grad_norm": 1.182812213897705, + "learning_rate": 0.0002, + "loss": 0.4492, + "step": 17670 + }, + { + "epoch": 5.706907682375727, + "grad_norm": 1.099074125289917, + "learning_rate": 0.0002, + "loss": 0.5025, + "step": 17680 + }, + { + "epoch": 5.710135571336346, + "grad_norm": 0.938634991645813, + "learning_rate": 0.0002, + "loss": 0.4945, + "step": 17690 + }, + { + "epoch": 5.713363460296966, + "grad_norm": 0.9385238885879517, + "learning_rate": 0.0002, + "loss": 0.491, + "step": 17700 + }, + { + "epoch": 5.716591349257586, + "grad_norm": 1.1486014127731323, + "learning_rate": 0.0002, + "loss": 0.4849, + "step": 17710 + }, + { + "epoch": 5.719819238218205, + "grad_norm": 0.9433078169822693, + "learning_rate": 0.0002, + "loss": 0.5043, + "step": 17720 + }, + { + "epoch": 5.723047127178825, + "grad_norm": 1.02472722530365, + "learning_rate": 0.0002, + "loss": 0.4543, + "step": 17730 + }, + { + "epoch": 5.726275016139445, + "grad_norm": 0.9360876679420471, + "learning_rate": 0.0002, + "loss": 0.4631, + "step": 17740 + }, + { + "epoch": 5.729502905100064, + "grad_norm": 1.0481483936309814, + "learning_rate": 0.0002, + "loss": 0.4947, + "step": 17750 + }, + { + "epoch": 5.732730794060684, + "grad_norm": 1.0032516717910767, + "learning_rate": 0.0002, + "loss": 0.4763, + "step": 17760 + }, + { + "epoch": 5.735958683021304, + "grad_norm": 0.8908069729804993, + "learning_rate": 0.0002, + "loss": 0.4819, + "step": 17770 + }, + { + "epoch": 5.739186571981924, + "grad_norm": 1.0679123401641846, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 17780 + }, + { + "epoch": 5.742414460942544, + "grad_norm": 1.0448014736175537, + "learning_rate": 0.0002, + "loss": 0.4818, + "step": 17790 + }, + { + "epoch": 5.7456423499031635, + "grad_norm": 1.0433847904205322, + "learning_rate": 0.0002, + "loss": 0.4869, + "step": 17800 + }, + { + "epoch": 5.748870238863783, + "grad_norm": 1.000291109085083, + "learning_rate": 0.0002, + "loss": 0.5243, + "step": 17810 + }, + { + "epoch": 5.752098127824403, + "grad_norm": 1.1238429546356201, + "learning_rate": 0.0002, + "loss": 0.4891, + "step": 17820 + }, + { + "epoch": 5.755326016785022, + "grad_norm": 1.09062659740448, + "learning_rate": 0.0002, + "loss": 0.4905, + "step": 17830 + }, + { + "epoch": 5.758553905745642, + "grad_norm": 0.8538689613342285, + "learning_rate": 0.0002, + "loss": 0.4883, + "step": 17840 + }, + { + "epoch": 5.761781794706262, + "grad_norm": 1.3872947692871094, + "learning_rate": 0.0002, + "loss": 0.4989, + "step": 17850 + }, + { + "epoch": 5.765009683666882, + "grad_norm": 1.0578876733779907, + "learning_rate": 0.0002, + "loss": 0.4707, + "step": 17860 + }, + { + "epoch": 5.768237572627502, + "grad_norm": 1.1761705875396729, + "learning_rate": 0.0002, + "loss": 0.5281, + "step": 17870 + }, + { + "epoch": 5.771465461588122, + "grad_norm": 1.1223368644714355, + "learning_rate": 0.0002, + "loss": 0.4802, + "step": 17880 + }, + { + "epoch": 5.774693350548741, + "grad_norm": 1.2484360933303833, + "learning_rate": 0.0002, + "loss": 0.505, + "step": 17890 + }, + { + "epoch": 5.777921239509361, + "grad_norm": 1.2461199760437012, + "learning_rate": 0.0002, + "loss": 0.4786, + "step": 17900 + }, + { + "epoch": 5.7811491284699805, + "grad_norm": 1.1718299388885498, + "learning_rate": 0.0002, + "loss": 0.4933, + "step": 17910 + }, + { + "epoch": 5.7843770174306, + "grad_norm": 0.9896837472915649, + "learning_rate": 0.0002, + "loss": 0.471, + "step": 17920 + }, + { + "epoch": 5.78760490639122, + "grad_norm": 1.3759760856628418, + "learning_rate": 0.0002, + "loss": 0.4808, + "step": 17930 + }, + { + "epoch": 5.7908327953518395, + "grad_norm": 1.0596622228622437, + "learning_rate": 0.0002, + "loss": 0.4847, + "step": 17940 + }, + { + "epoch": 5.79406068431246, + "grad_norm": 0.9292021989822388, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 17950 + }, + { + "epoch": 5.79728857327308, + "grad_norm": 0.8786653876304626, + "learning_rate": 0.0002, + "loss": 0.4783, + "step": 17960 + }, + { + "epoch": 5.800516462233699, + "grad_norm": 1.2087152004241943, + "learning_rate": 0.0002, + "loss": 0.4598, + "step": 17970 + }, + { + "epoch": 5.803744351194319, + "grad_norm": 1.1643104553222656, + "learning_rate": 0.0002, + "loss": 0.4953, + "step": 17980 + }, + { + "epoch": 5.806972240154939, + "grad_norm": 0.971613347530365, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 17990 + }, + { + "epoch": 5.810200129115558, + "grad_norm": 1.306227684020996, + "learning_rate": 0.0002, + "loss": 0.5094, + "step": 18000 + }, + { + "epoch": 5.813428018076178, + "grad_norm": 1.3665502071380615, + "learning_rate": 0.0002, + "loss": 0.5392, + "step": 18010 + }, + { + "epoch": 5.816655907036798, + "grad_norm": 1.2227312326431274, + "learning_rate": 0.0002, + "loss": 0.4887, + "step": 18020 + }, + { + "epoch": 5.819883795997418, + "grad_norm": 1.180694818496704, + "learning_rate": 0.0002, + "loss": 0.5203, + "step": 18030 + }, + { + "epoch": 5.823111684958038, + "grad_norm": 1.1045362949371338, + "learning_rate": 0.0002, + "loss": 0.4962, + "step": 18040 + }, + { + "epoch": 5.826339573918657, + "grad_norm": 1.3828954696655273, + "learning_rate": 0.0002, + "loss": 0.4969, + "step": 18050 + }, + { + "epoch": 5.829567462879277, + "grad_norm": 1.305102825164795, + "learning_rate": 0.0002, + "loss": 0.5493, + "step": 18060 + }, + { + "epoch": 5.832795351839897, + "grad_norm": 1.2708743810653687, + "learning_rate": 0.0002, + "loss": 0.4844, + "step": 18070 + }, + { + "epoch": 5.836023240800516, + "grad_norm": 1.0344188213348389, + "learning_rate": 0.0002, + "loss": 0.4834, + "step": 18080 + }, + { + "epoch": 5.839251129761136, + "grad_norm": 1.1321724653244019, + "learning_rate": 0.0002, + "loss": 0.5088, + "step": 18090 + }, + { + "epoch": 5.842479018721756, + "grad_norm": 1.2162611484527588, + "learning_rate": 0.0002, + "loss": 0.4888, + "step": 18100 + }, + { + "epoch": 5.845706907682375, + "grad_norm": 1.427612543106079, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 18110 + }, + { + "epoch": 5.848934796642995, + "grad_norm": 1.4391452074050903, + "learning_rate": 0.0002, + "loss": 0.5339, + "step": 18120 + }, + { + "epoch": 5.8521626856036155, + "grad_norm": 1.1548216342926025, + "learning_rate": 0.0002, + "loss": 0.528, + "step": 18130 + }, + { + "epoch": 5.855390574564235, + "grad_norm": 1.2336437702178955, + "learning_rate": 0.0002, + "loss": 0.4779, + "step": 18140 + }, + { + "epoch": 5.858618463524855, + "grad_norm": 1.254661202430725, + "learning_rate": 0.0002, + "loss": 0.4844, + "step": 18150 + }, + { + "epoch": 5.8618463524854745, + "grad_norm": 0.8326491117477417, + "learning_rate": 0.0002, + "loss": 0.5201, + "step": 18160 + }, + { + "epoch": 5.865074241446094, + "grad_norm": 1.0907988548278809, + "learning_rate": 0.0002, + "loss": 0.5076, + "step": 18170 + }, + { + "epoch": 5.868302130406714, + "grad_norm": 0.9896568655967712, + "learning_rate": 0.0002, + "loss": 0.48, + "step": 18180 + }, + { + "epoch": 5.871530019367333, + "grad_norm": 0.9440065026283264, + "learning_rate": 0.0002, + "loss": 0.4628, + "step": 18190 + }, + { + "epoch": 5.874757908327954, + "grad_norm": 1.09321129322052, + "learning_rate": 0.0002, + "loss": 0.5265, + "step": 18200 + }, + { + "epoch": 5.877985797288574, + "grad_norm": 1.2588142156600952, + "learning_rate": 0.0002, + "loss": 0.4737, + "step": 18210 + }, + { + "epoch": 5.881213686249193, + "grad_norm": 1.1731587648391724, + "learning_rate": 0.0002, + "loss": 0.475, + "step": 18220 + }, + { + "epoch": 5.884441575209813, + "grad_norm": 0.9904444217681885, + "learning_rate": 0.0002, + "loss": 0.504, + "step": 18230 + }, + { + "epoch": 5.887669464170433, + "grad_norm": 0.8985799551010132, + "learning_rate": 0.0002, + "loss": 0.4842, + "step": 18240 + }, + { + "epoch": 5.890897353131052, + "grad_norm": 1.0182441473007202, + "learning_rate": 0.0002, + "loss": 0.4878, + "step": 18250 + }, + { + "epoch": 5.894125242091672, + "grad_norm": 1.1574701070785522, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 18260 + }, + { + "epoch": 5.8973531310522915, + "grad_norm": 1.1776602268218994, + "learning_rate": 0.0002, + "loss": 0.5, + "step": 18270 + }, + { + "epoch": 5.900581020012911, + "grad_norm": 1.4951308965682983, + "learning_rate": 0.0002, + "loss": 0.5245, + "step": 18280 + }, + { + "epoch": 5.903808908973531, + "grad_norm": 1.1440261602401733, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 18290 + }, + { + "epoch": 5.907036797934151, + "grad_norm": 0.9925196170806885, + "learning_rate": 0.0002, + "loss": 0.4868, + "step": 18300 + }, + { + "epoch": 5.910264686894771, + "grad_norm": 1.098615288734436, + "learning_rate": 0.0002, + "loss": 0.5142, + "step": 18310 + }, + { + "epoch": 5.913492575855391, + "grad_norm": 1.0030080080032349, + "learning_rate": 0.0002, + "loss": 0.5184, + "step": 18320 + }, + { + "epoch": 5.91672046481601, + "grad_norm": 0.9890318512916565, + "learning_rate": 0.0002, + "loss": 0.474, + "step": 18330 + }, + { + "epoch": 5.91994835377663, + "grad_norm": 1.2209392786026, + "learning_rate": 0.0002, + "loss": 0.5125, + "step": 18340 + }, + { + "epoch": 5.92317624273725, + "grad_norm": 1.108933925628662, + "learning_rate": 0.0002, + "loss": 0.4634, + "step": 18350 + }, + { + "epoch": 5.926404131697869, + "grad_norm": 1.086024522781372, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 18360 + }, + { + "epoch": 5.92963202065849, + "grad_norm": 1.0061167478561401, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 18370 + }, + { + "epoch": 5.9328599096191095, + "grad_norm": 0.9445858597755432, + "learning_rate": 0.0002, + "loss": 0.4848, + "step": 18380 + }, + { + "epoch": 5.936087798579729, + "grad_norm": 0.9556859135627747, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 18390 + }, + { + "epoch": 5.939315687540349, + "grad_norm": 1.154168963432312, + "learning_rate": 0.0002, + "loss": 0.4966, + "step": 18400 + }, + { + "epoch": 5.942543576500968, + "grad_norm": 1.0495831966400146, + "learning_rate": 0.0002, + "loss": 0.4836, + "step": 18410 + }, + { + "epoch": 5.945771465461588, + "grad_norm": 1.0717304944992065, + "learning_rate": 0.0002, + "loss": 0.5021, + "step": 18420 + }, + { + "epoch": 5.948999354422208, + "grad_norm": 1.06618332862854, + "learning_rate": 0.0002, + "loss": 0.4794, + "step": 18430 + }, + { + "epoch": 5.952227243382827, + "grad_norm": 0.9567165374755859, + "learning_rate": 0.0002, + "loss": 0.5011, + "step": 18440 + }, + { + "epoch": 5.955455132343447, + "grad_norm": 1.0306249856948853, + "learning_rate": 0.0002, + "loss": 0.485, + "step": 18450 + }, + { + "epoch": 5.958683021304067, + "grad_norm": 1.1879968643188477, + "learning_rate": 0.0002, + "loss": 0.4948, + "step": 18460 + }, + { + "epoch": 5.961910910264687, + "grad_norm": 1.3177233934402466, + "learning_rate": 0.0002, + "loss": 0.5185, + "step": 18470 + }, + { + "epoch": 5.965138799225307, + "grad_norm": 1.0945817232131958, + "learning_rate": 0.0002, + "loss": 0.4966, + "step": 18480 + }, + { + "epoch": 5.9683666881859265, + "grad_norm": 1.029414415359497, + "learning_rate": 0.0002, + "loss": 0.5196, + "step": 18490 + }, + { + "epoch": 5.971594577146546, + "grad_norm": 1.2266209125518799, + "learning_rate": 0.0002, + "loss": 0.5154, + "step": 18500 + }, + { + "epoch": 5.974822466107166, + "grad_norm": 1.2167150974273682, + "learning_rate": 0.0002, + "loss": 0.4914, + "step": 18510 + }, + { + "epoch": 5.9780503550677855, + "grad_norm": 0.9941056966781616, + "learning_rate": 0.0002, + "loss": 0.466, + "step": 18520 + }, + { + "epoch": 5.981278244028405, + "grad_norm": 1.4244859218597412, + "learning_rate": 0.0002, + "loss": 0.5037, + "step": 18530 + }, + { + "epoch": 5.984506132989026, + "grad_norm": 0.8976260423660278, + "learning_rate": 0.0002, + "loss": 0.4902, + "step": 18540 + }, + { + "epoch": 5.987734021949645, + "grad_norm": 1.0162699222564697, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 18550 + }, + { + "epoch": 5.990961910910265, + "grad_norm": 1.196677803993225, + "learning_rate": 0.0002, + "loss": 0.5138, + "step": 18560 + }, + { + "epoch": 5.994189799870885, + "grad_norm": 1.163403868675232, + "learning_rate": 0.0002, + "loss": 0.4626, + "step": 18570 + }, + { + "epoch": 5.997417688831504, + "grad_norm": 1.010205626487732, + "learning_rate": 0.0002, + "loss": 0.5105, + "step": 18580 + }, + { + "epoch": 6.0, + "eval_loss": 1.2861483097076416, + "eval_runtime": 163.2683, + "eval_samples_per_second": 4.49, + "eval_steps_per_second": 0.563, + "step": 18588 + }, + { + "epoch": 6.000645577792124, + "grad_norm": 0.7334756255149841, + "learning_rate": 0.0002, + "loss": 0.4557, + "step": 18590 + }, + { + "epoch": 6.003873466752744, + "grad_norm": 1.093945026397705, + "learning_rate": 0.0002, + "loss": 0.4201, + "step": 18600 + }, + { + "epoch": 6.007101355713363, + "grad_norm": 1.2327148914337158, + "learning_rate": 0.0002, + "loss": 0.4235, + "step": 18610 + }, + { + "epoch": 6.010329244673983, + "grad_norm": 1.3238836526870728, + "learning_rate": 0.0002, + "loss": 0.377, + "step": 18620 + }, + { + "epoch": 6.0135571336346025, + "grad_norm": 1.2364031076431274, + "learning_rate": 0.0002, + "loss": 0.3883, + "step": 18630 + }, + { + "epoch": 6.016785022595223, + "grad_norm": 0.902474045753479, + "learning_rate": 0.0002, + "loss": 0.3958, + "step": 18640 + }, + { + "epoch": 6.020012911555843, + "grad_norm": 1.273280382156372, + "learning_rate": 0.0002, + "loss": 0.4077, + "step": 18650 + }, + { + "epoch": 6.023240800516462, + "grad_norm": 1.2470760345458984, + "learning_rate": 0.0002, + "loss": 0.4224, + "step": 18660 + }, + { + "epoch": 6.026468689477082, + "grad_norm": 1.2360138893127441, + "learning_rate": 0.0002, + "loss": 0.3752, + "step": 18670 + }, + { + "epoch": 6.029696578437702, + "grad_norm": 1.467140793800354, + "learning_rate": 0.0002, + "loss": 0.3653, + "step": 18680 + }, + { + "epoch": 6.032924467398321, + "grad_norm": 1.123871088027954, + "learning_rate": 0.0002, + "loss": 0.3883, + "step": 18690 + }, + { + "epoch": 6.036152356358941, + "grad_norm": 0.9732550978660583, + "learning_rate": 0.0002, + "loss": 0.3812, + "step": 18700 + }, + { + "epoch": 6.039380245319561, + "grad_norm": 1.170860767364502, + "learning_rate": 0.0002, + "loss": 0.4163, + "step": 18710 + }, + { + "epoch": 6.042608134280181, + "grad_norm": 1.2599345445632935, + "learning_rate": 0.0002, + "loss": 0.3836, + "step": 18720 + }, + { + "epoch": 6.045836023240801, + "grad_norm": 1.0808286666870117, + "learning_rate": 0.0002, + "loss": 0.3881, + "step": 18730 + }, + { + "epoch": 6.0490639122014205, + "grad_norm": 0.9799565076828003, + "learning_rate": 0.0002, + "loss": 0.386, + "step": 18740 + }, + { + "epoch": 6.05229180116204, + "grad_norm": 0.8425611853599548, + "learning_rate": 0.0002, + "loss": 0.3833, + "step": 18750 + }, + { + "epoch": 6.05551969012266, + "grad_norm": 0.9762344360351562, + "learning_rate": 0.0002, + "loss": 0.3765, + "step": 18760 + }, + { + "epoch": 6.058747579083279, + "grad_norm": 1.1290913820266724, + "learning_rate": 0.0002, + "loss": 0.3878, + "step": 18770 + }, + { + "epoch": 6.061975468043899, + "grad_norm": 1.2240493297576904, + "learning_rate": 0.0002, + "loss": 0.4061, + "step": 18780 + }, + { + "epoch": 6.065203357004519, + "grad_norm": 1.3422439098358154, + "learning_rate": 0.0002, + "loss": 0.3894, + "step": 18790 + }, + { + "epoch": 6.068431245965138, + "grad_norm": 1.0391879081726074, + "learning_rate": 0.0002, + "loss": 0.3885, + "step": 18800 + }, + { + "epoch": 6.071659134925759, + "grad_norm": 1.0910760164260864, + "learning_rate": 0.0002, + "loss": 0.409, + "step": 18810 + }, + { + "epoch": 6.074887023886379, + "grad_norm": 1.280098557472229, + "learning_rate": 0.0002, + "loss": 0.3905, + "step": 18820 + }, + { + "epoch": 6.078114912846998, + "grad_norm": 1.2102673053741455, + "learning_rate": 0.0002, + "loss": 0.3892, + "step": 18830 + }, + { + "epoch": 6.081342801807618, + "grad_norm": 1.3735624551773071, + "learning_rate": 0.0002, + "loss": 0.3757, + "step": 18840 + }, + { + "epoch": 6.0845706907682375, + "grad_norm": 1.039419412612915, + "learning_rate": 0.0002, + "loss": 0.4057, + "step": 18850 + }, + { + "epoch": 6.087798579728857, + "grad_norm": 1.175872802734375, + "learning_rate": 0.0002, + "loss": 0.4093, + "step": 18860 + }, + { + "epoch": 6.091026468689477, + "grad_norm": 1.4287301301956177, + "learning_rate": 0.0002, + "loss": 0.3933, + "step": 18870 + }, + { + "epoch": 6.0942543576500965, + "grad_norm": 1.110627293586731, + "learning_rate": 0.0002, + "loss": 0.4029, + "step": 18880 + }, + { + "epoch": 6.097482246610717, + "grad_norm": 1.1495535373687744, + "learning_rate": 0.0002, + "loss": 0.4195, + "step": 18890 + }, + { + "epoch": 6.100710135571337, + "grad_norm": 0.9764134287834167, + "learning_rate": 0.0002, + "loss": 0.4022, + "step": 18900 + }, + { + "epoch": 6.103938024531956, + "grad_norm": 1.0792596340179443, + "learning_rate": 0.0002, + "loss": 0.4097, + "step": 18910 + }, + { + "epoch": 6.107165913492576, + "grad_norm": 1.2520235776901245, + "learning_rate": 0.0002, + "loss": 0.402, + "step": 18920 + }, + { + "epoch": 6.110393802453196, + "grad_norm": 0.857008695602417, + "learning_rate": 0.0002, + "loss": 0.4091, + "step": 18930 + }, + { + "epoch": 6.113621691413815, + "grad_norm": 1.745723009109497, + "learning_rate": 0.0002, + "loss": 0.4046, + "step": 18940 + }, + { + "epoch": 6.116849580374435, + "grad_norm": 1.099941611289978, + "learning_rate": 0.0002, + "loss": 0.4245, + "step": 18950 + }, + { + "epoch": 6.120077469335055, + "grad_norm": 1.1402947902679443, + "learning_rate": 0.0002, + "loss": 0.3708, + "step": 18960 + }, + { + "epoch": 6.123305358295674, + "grad_norm": 1.0565131902694702, + "learning_rate": 0.0002, + "loss": 0.4022, + "step": 18970 + }, + { + "epoch": 6.126533247256295, + "grad_norm": 1.1511917114257812, + "learning_rate": 0.0002, + "loss": 0.3973, + "step": 18980 + }, + { + "epoch": 6.129761136216914, + "grad_norm": 0.9029410481452942, + "learning_rate": 0.0002, + "loss": 0.395, + "step": 18990 + }, + { + "epoch": 6.132989025177534, + "grad_norm": 1.03252375125885, + "learning_rate": 0.0002, + "loss": 0.393, + "step": 19000 + }, + { + "epoch": 6.136216914138154, + "grad_norm": 1.2058522701263428, + "learning_rate": 0.0002, + "loss": 0.3923, + "step": 19010 + }, + { + "epoch": 6.139444803098773, + "grad_norm": 1.2274953126907349, + "learning_rate": 0.0002, + "loss": 0.3963, + "step": 19020 + }, + { + "epoch": 6.142672692059393, + "grad_norm": 1.3196226358413696, + "learning_rate": 0.0002, + "loss": 0.3999, + "step": 19030 + }, + { + "epoch": 6.145900581020013, + "grad_norm": 0.8030686378479004, + "learning_rate": 0.0002, + "loss": 0.4176, + "step": 19040 + }, + { + "epoch": 6.149128469980632, + "grad_norm": 1.1762639284133911, + "learning_rate": 0.0002, + "loss": 0.3886, + "step": 19050 + }, + { + "epoch": 6.152356358941253, + "grad_norm": 1.0247628688812256, + "learning_rate": 0.0002, + "loss": 0.429, + "step": 19060 + }, + { + "epoch": 6.1555842479018725, + "grad_norm": 0.99031662940979, + "learning_rate": 0.0002, + "loss": 0.3876, + "step": 19070 + }, + { + "epoch": 6.158812136862492, + "grad_norm": 1.334445834159851, + "learning_rate": 0.0002, + "loss": 0.3818, + "step": 19080 + }, + { + "epoch": 6.162040025823112, + "grad_norm": 1.1160423755645752, + "learning_rate": 0.0002, + "loss": 0.4038, + "step": 19090 + }, + { + "epoch": 6.1652679147837315, + "grad_norm": 1.2579560279846191, + "learning_rate": 0.0002, + "loss": 0.4081, + "step": 19100 + }, + { + "epoch": 6.168495803744351, + "grad_norm": 0.9372721910476685, + "learning_rate": 0.0002, + "loss": 0.4092, + "step": 19110 + }, + { + "epoch": 6.171723692704971, + "grad_norm": 0.7995722889900208, + "learning_rate": 0.0002, + "loss": 0.3905, + "step": 19120 + }, + { + "epoch": 6.17495158166559, + "grad_norm": 1.0074360370635986, + "learning_rate": 0.0002, + "loss": 0.3896, + "step": 19130 + }, + { + "epoch": 6.17817947062621, + "grad_norm": 0.9821600914001465, + "learning_rate": 0.0002, + "loss": 0.4328, + "step": 19140 + }, + { + "epoch": 6.181407359586831, + "grad_norm": 1.1252691745758057, + "learning_rate": 0.0002, + "loss": 0.3845, + "step": 19150 + }, + { + "epoch": 6.18463524854745, + "grad_norm": 1.316981554031372, + "learning_rate": 0.0002, + "loss": 0.3918, + "step": 19160 + }, + { + "epoch": 6.18786313750807, + "grad_norm": 1.0131299495697021, + "learning_rate": 0.0002, + "loss": 0.3893, + "step": 19170 + }, + { + "epoch": 6.19109102646869, + "grad_norm": 1.3530288934707642, + "learning_rate": 0.0002, + "loss": 0.4111, + "step": 19180 + }, + { + "epoch": 6.194318915429309, + "grad_norm": 1.148247480392456, + "learning_rate": 0.0002, + "loss": 0.416, + "step": 19190 + }, + { + "epoch": 6.197546804389929, + "grad_norm": 1.5510036945343018, + "learning_rate": 0.0002, + "loss": 0.4191, + "step": 19200 + }, + { + "epoch": 6.2007746933505485, + "grad_norm": 1.3048018217086792, + "learning_rate": 0.0002, + "loss": 0.423, + "step": 19210 + }, + { + "epoch": 6.204002582311168, + "grad_norm": 1.186187982559204, + "learning_rate": 0.0002, + "loss": 0.397, + "step": 19220 + }, + { + "epoch": 6.207230471271788, + "grad_norm": 1.5199471712112427, + "learning_rate": 0.0002, + "loss": 0.4164, + "step": 19230 + }, + { + "epoch": 6.210458360232408, + "grad_norm": 1.1311423778533936, + "learning_rate": 0.0002, + "loss": 0.4322, + "step": 19240 + }, + { + "epoch": 6.213686249193028, + "grad_norm": 1.2345898151397705, + "learning_rate": 0.0002, + "loss": 0.4086, + "step": 19250 + }, + { + "epoch": 6.216914138153648, + "grad_norm": 1.0261863470077515, + "learning_rate": 0.0002, + "loss": 0.4122, + "step": 19260 + }, + { + "epoch": 6.220142027114267, + "grad_norm": 0.8985416293144226, + "learning_rate": 0.0002, + "loss": 0.4315, + "step": 19270 + }, + { + "epoch": 6.223369916074887, + "grad_norm": 1.3136980533599854, + "learning_rate": 0.0002, + "loss": 0.4052, + "step": 19280 + }, + { + "epoch": 6.226597805035507, + "grad_norm": 1.1949185132980347, + "learning_rate": 0.0002, + "loss": 0.4232, + "step": 19290 + }, + { + "epoch": 6.229825693996126, + "grad_norm": 0.9668909907341003, + "learning_rate": 0.0002, + "loss": 0.4255, + "step": 19300 + }, + { + "epoch": 6.233053582956746, + "grad_norm": 0.8858964443206787, + "learning_rate": 0.0002, + "loss": 0.3917, + "step": 19310 + }, + { + "epoch": 6.236281471917366, + "grad_norm": 1.4254822731018066, + "learning_rate": 0.0002, + "loss": 0.4087, + "step": 19320 + }, + { + "epoch": 6.239509360877986, + "grad_norm": 1.0455392599105835, + "learning_rate": 0.0002, + "loss": 0.426, + "step": 19330 + }, + { + "epoch": 6.242737249838606, + "grad_norm": 1.1690824031829834, + "learning_rate": 0.0002, + "loss": 0.3894, + "step": 19340 + }, + { + "epoch": 6.245965138799225, + "grad_norm": 1.0347497463226318, + "learning_rate": 0.0002, + "loss": 0.3777, + "step": 19350 + }, + { + "epoch": 6.249193027759845, + "grad_norm": 1.0790464878082275, + "learning_rate": 0.0002, + "loss": 0.3972, + "step": 19360 + }, + { + "epoch": 6.252420916720465, + "grad_norm": 1.1294453144073486, + "learning_rate": 0.0002, + "loss": 0.4393, + "step": 19370 + }, + { + "epoch": 6.255648805681084, + "grad_norm": 1.5094330310821533, + "learning_rate": 0.0002, + "loss": 0.4055, + "step": 19380 + }, + { + "epoch": 6.258876694641704, + "grad_norm": 1.1122944355010986, + "learning_rate": 0.0002, + "loss": 0.4228, + "step": 19390 + }, + { + "epoch": 6.262104583602324, + "grad_norm": 1.3123422861099243, + "learning_rate": 0.0002, + "loss": 0.4341, + "step": 19400 + }, + { + "epoch": 6.265332472562944, + "grad_norm": 1.0585907697677612, + "learning_rate": 0.0002, + "loss": 0.4206, + "step": 19410 + }, + { + "epoch": 6.268560361523564, + "grad_norm": 0.8711239099502563, + "learning_rate": 0.0002, + "loss": 0.4001, + "step": 19420 + }, + { + "epoch": 6.2717882504841835, + "grad_norm": 1.2772116661071777, + "learning_rate": 0.0002, + "loss": 0.4201, + "step": 19430 + }, + { + "epoch": 6.275016139444803, + "grad_norm": 1.0035508871078491, + "learning_rate": 0.0002, + "loss": 0.4298, + "step": 19440 + }, + { + "epoch": 6.278244028405423, + "grad_norm": 0.7933974862098694, + "learning_rate": 0.0002, + "loss": 0.4234, + "step": 19450 + }, + { + "epoch": 6.2814719173660425, + "grad_norm": 1.2455826997756958, + "learning_rate": 0.0002, + "loss": 0.4144, + "step": 19460 + }, + { + "epoch": 6.284699806326662, + "grad_norm": 1.2735545635223389, + "learning_rate": 0.0002, + "loss": 0.4171, + "step": 19470 + }, + { + "epoch": 6.287927695287282, + "grad_norm": 0.9773174524307251, + "learning_rate": 0.0002, + "loss": 0.3956, + "step": 19480 + }, + { + "epoch": 6.2911555842479014, + "grad_norm": 1.2341974973678589, + "learning_rate": 0.0002, + "loss": 0.4264, + "step": 19490 + }, + { + "epoch": 6.294383473208522, + "grad_norm": 1.286138653755188, + "learning_rate": 0.0002, + "loss": 0.4068, + "step": 19500 + }, + { + "epoch": 6.297611362169142, + "grad_norm": 1.052889108657837, + "learning_rate": 0.0002, + "loss": 0.439, + "step": 19510 + }, + { + "epoch": 6.300839251129761, + "grad_norm": 1.1955385208129883, + "learning_rate": 0.0002, + "loss": 0.4199, + "step": 19520 + }, + { + "epoch": 6.304067140090381, + "grad_norm": 1.2792452573776245, + "learning_rate": 0.0002, + "loss": 0.4242, + "step": 19530 + }, + { + "epoch": 6.307295029051001, + "grad_norm": 0.9077931046485901, + "learning_rate": 0.0002, + "loss": 0.3989, + "step": 19540 + }, + { + "epoch": 6.31052291801162, + "grad_norm": 1.2492976188659668, + "learning_rate": 0.0002, + "loss": 0.388, + "step": 19550 + }, + { + "epoch": 6.31375080697224, + "grad_norm": 1.1097182035446167, + "learning_rate": 0.0002, + "loss": 0.3828, + "step": 19560 + }, + { + "epoch": 6.3169786959328595, + "grad_norm": 1.271609902381897, + "learning_rate": 0.0002, + "loss": 0.4482, + "step": 19570 + }, + { + "epoch": 6.32020658489348, + "grad_norm": 1.4262897968292236, + "learning_rate": 0.0002, + "loss": 0.3851, + "step": 19580 + }, + { + "epoch": 6.3234344738541, + "grad_norm": 1.057338833808899, + "learning_rate": 0.0002, + "loss": 0.4133, + "step": 19590 + }, + { + "epoch": 6.326662362814719, + "grad_norm": 1.323028326034546, + "learning_rate": 0.0002, + "loss": 0.4366, + "step": 19600 + }, + { + "epoch": 6.329890251775339, + "grad_norm": 1.0991673469543457, + "learning_rate": 0.0002, + "loss": 0.4186, + "step": 19610 + }, + { + "epoch": 6.333118140735959, + "grad_norm": 1.1600234508514404, + "learning_rate": 0.0002, + "loss": 0.4132, + "step": 19620 + }, + { + "epoch": 6.336346029696578, + "grad_norm": 1.2986212968826294, + "learning_rate": 0.0002, + "loss": 0.4689, + "step": 19630 + }, + { + "epoch": 6.339573918657198, + "grad_norm": 1.2117934226989746, + "learning_rate": 0.0002, + "loss": 0.3914, + "step": 19640 + }, + { + "epoch": 6.342801807617818, + "grad_norm": 0.9747948050498962, + "learning_rate": 0.0002, + "loss": 0.3939, + "step": 19650 + }, + { + "epoch": 6.346029696578437, + "grad_norm": 1.2380492687225342, + "learning_rate": 0.0002, + "loss": 0.4517, + "step": 19660 + }, + { + "epoch": 6.349257585539058, + "grad_norm": 1.2475087642669678, + "learning_rate": 0.0002, + "loss": 0.4344, + "step": 19670 + }, + { + "epoch": 6.3524854744996775, + "grad_norm": 1.022084355354309, + "learning_rate": 0.0002, + "loss": 0.4253, + "step": 19680 + }, + { + "epoch": 6.355713363460297, + "grad_norm": 1.2422059774398804, + "learning_rate": 0.0002, + "loss": 0.4227, + "step": 19690 + }, + { + "epoch": 6.358941252420917, + "grad_norm": 1.5015275478363037, + "learning_rate": 0.0002, + "loss": 0.4205, + "step": 19700 + }, + { + "epoch": 6.362169141381536, + "grad_norm": 1.068727970123291, + "learning_rate": 0.0002, + "loss": 0.414, + "step": 19710 + }, + { + "epoch": 6.365397030342156, + "grad_norm": 1.3718897104263306, + "learning_rate": 0.0002, + "loss": 0.4054, + "step": 19720 + }, + { + "epoch": 6.368624919302776, + "grad_norm": 1.3437764644622803, + "learning_rate": 0.0002, + "loss": 0.4399, + "step": 19730 + }, + { + "epoch": 6.371852808263395, + "grad_norm": 0.9128499031066895, + "learning_rate": 0.0002, + "loss": 0.4187, + "step": 19740 + }, + { + "epoch": 6.375080697224016, + "grad_norm": 1.0678889751434326, + "learning_rate": 0.0002, + "loss": 0.4346, + "step": 19750 + }, + { + "epoch": 6.378308586184636, + "grad_norm": 1.0432878732681274, + "learning_rate": 0.0002, + "loss": 0.4103, + "step": 19760 + }, + { + "epoch": 6.381536475145255, + "grad_norm": 1.4033927917480469, + "learning_rate": 0.0002, + "loss": 0.4304, + "step": 19770 + }, + { + "epoch": 6.384764364105875, + "grad_norm": 1.2773922681808472, + "learning_rate": 0.0002, + "loss": 0.4225, + "step": 19780 + }, + { + "epoch": 6.3879922530664945, + "grad_norm": 1.257847547531128, + "learning_rate": 0.0002, + "loss": 0.4246, + "step": 19790 + }, + { + "epoch": 6.391220142027114, + "grad_norm": 0.8424118757247925, + "learning_rate": 0.0002, + "loss": 0.4261, + "step": 19800 + }, + { + "epoch": 6.394448030987734, + "grad_norm": 1.3387986421585083, + "learning_rate": 0.0002, + "loss": 0.4145, + "step": 19810 + }, + { + "epoch": 6.3976759199483535, + "grad_norm": 1.1277328729629517, + "learning_rate": 0.0002, + "loss": 0.4268, + "step": 19820 + }, + { + "epoch": 6.400903808908973, + "grad_norm": 1.264283537864685, + "learning_rate": 0.0002, + "loss": 0.4213, + "step": 19830 + }, + { + "epoch": 6.404131697869594, + "grad_norm": 1.1770991086959839, + "learning_rate": 0.0002, + "loss": 0.4506, + "step": 19840 + }, + { + "epoch": 6.407359586830213, + "grad_norm": 0.9695967435836792, + "learning_rate": 0.0002, + "loss": 0.4385, + "step": 19850 + }, + { + "epoch": 6.410587475790833, + "grad_norm": 1.3394994735717773, + "learning_rate": 0.0002, + "loss": 0.4258, + "step": 19860 + }, + { + "epoch": 6.413815364751453, + "grad_norm": 1.0515536069869995, + "learning_rate": 0.0002, + "loss": 0.4017, + "step": 19870 + }, + { + "epoch": 6.417043253712072, + "grad_norm": 1.3238868713378906, + "learning_rate": 0.0002, + "loss": 0.4555, + "step": 19880 + }, + { + "epoch": 6.420271142672692, + "grad_norm": 1.0801814794540405, + "learning_rate": 0.0002, + "loss": 0.4385, + "step": 19890 + }, + { + "epoch": 6.423499031633312, + "grad_norm": 1.1391135454177856, + "learning_rate": 0.0002, + "loss": 0.4135, + "step": 19900 + }, + { + "epoch": 6.426726920593931, + "grad_norm": 1.13046133518219, + "learning_rate": 0.0002, + "loss": 0.4376, + "step": 19910 + }, + { + "epoch": 6.429954809554552, + "grad_norm": 1.1657520532608032, + "learning_rate": 0.0002, + "loss": 0.4251, + "step": 19920 + }, + { + "epoch": 6.433182698515171, + "grad_norm": 1.3315341472625732, + "learning_rate": 0.0002, + "loss": 0.3951, + "step": 19930 + }, + { + "epoch": 6.436410587475791, + "grad_norm": 1.1806831359863281, + "learning_rate": 0.0002, + "loss": 0.4254, + "step": 19940 + }, + { + "epoch": 6.439638476436411, + "grad_norm": 1.1581867933273315, + "learning_rate": 0.0002, + "loss": 0.3988, + "step": 19950 + }, + { + "epoch": 6.44286636539703, + "grad_norm": 1.2601206302642822, + "learning_rate": 0.0002, + "loss": 0.4194, + "step": 19960 + }, + { + "epoch": 6.44609425435765, + "grad_norm": 1.1163229942321777, + "learning_rate": 0.0002, + "loss": 0.4505, + "step": 19970 + }, + { + "epoch": 6.44932214331827, + "grad_norm": 0.9959462285041809, + "learning_rate": 0.0002, + "loss": 0.4295, + "step": 19980 + }, + { + "epoch": 6.452550032278889, + "grad_norm": 1.1213586330413818, + "learning_rate": 0.0002, + "loss": 0.421, + "step": 19990 + }, + { + "epoch": 6.455777921239509, + "grad_norm": 1.1345361471176147, + "learning_rate": 0.0002, + "loss": 0.4354, + "step": 20000 + }, + { + "epoch": 6.459005810200129, + "grad_norm": 1.245871901512146, + "learning_rate": 0.0002, + "loss": 0.429, + "step": 20010 + }, + { + "epoch": 6.462233699160749, + "grad_norm": 1.0894919633865356, + "learning_rate": 0.0002, + "loss": 0.4395, + "step": 20020 + }, + { + "epoch": 6.465461588121369, + "grad_norm": 1.030206322669983, + "learning_rate": 0.0002, + "loss": 0.4365, + "step": 20030 + }, + { + "epoch": 6.4686894770819885, + "grad_norm": 1.262133002281189, + "learning_rate": 0.0002, + "loss": 0.4225, + "step": 20040 + }, + { + "epoch": 6.471917366042608, + "grad_norm": 1.167641043663025, + "learning_rate": 0.0002, + "loss": 0.4301, + "step": 20050 + }, + { + "epoch": 6.475145255003228, + "grad_norm": 1.1125705242156982, + "learning_rate": 0.0002, + "loss": 0.4438, + "step": 20060 + }, + { + "epoch": 6.4783731439638474, + "grad_norm": 1.3777440786361694, + "learning_rate": 0.0002, + "loss": 0.4205, + "step": 20070 + }, + { + "epoch": 6.481601032924467, + "grad_norm": 1.1771081686019897, + "learning_rate": 0.0002, + "loss": 0.424, + "step": 20080 + }, + { + "epoch": 6.484828921885087, + "grad_norm": 1.0414351224899292, + "learning_rate": 0.0002, + "loss": 0.4187, + "step": 20090 + }, + { + "epoch": 6.488056810845707, + "grad_norm": 1.2103244066238403, + "learning_rate": 0.0002, + "loss": 0.4419, + "step": 20100 + }, + { + "epoch": 6.491284699806327, + "grad_norm": 1.4153836965560913, + "learning_rate": 0.0002, + "loss": 0.4502, + "step": 20110 + }, + { + "epoch": 6.494512588766947, + "grad_norm": 1.2718676328659058, + "learning_rate": 0.0002, + "loss": 0.4524, + "step": 20120 + }, + { + "epoch": 6.497740477727566, + "grad_norm": 1.1040351390838623, + "learning_rate": 0.0002, + "loss": 0.4546, + "step": 20130 + }, + { + "epoch": 6.500968366688186, + "grad_norm": 0.9804210662841797, + "learning_rate": 0.0002, + "loss": 0.4105, + "step": 20140 + }, + { + "epoch": 6.5041962556488055, + "grad_norm": 1.028836965560913, + "learning_rate": 0.0002, + "loss": 0.4165, + "step": 20150 + }, + { + "epoch": 6.507424144609425, + "grad_norm": 1.1773076057434082, + "learning_rate": 0.0002, + "loss": 0.4106, + "step": 20160 + }, + { + "epoch": 6.510652033570045, + "grad_norm": 0.8597512245178223, + "learning_rate": 0.0002, + "loss": 0.4364, + "step": 20170 + }, + { + "epoch": 6.5138799225306645, + "grad_norm": 1.4290635585784912, + "learning_rate": 0.0002, + "loss": 0.4346, + "step": 20180 + }, + { + "epoch": 6.517107811491285, + "grad_norm": 0.9842908382415771, + "learning_rate": 0.0002, + "loss": 0.4057, + "step": 20190 + }, + { + "epoch": 6.520335700451905, + "grad_norm": 1.0254372358322144, + "learning_rate": 0.0002, + "loss": 0.4562, + "step": 20200 + }, + { + "epoch": 6.523563589412524, + "grad_norm": 1.1869125366210938, + "learning_rate": 0.0002, + "loss": 0.433, + "step": 20210 + }, + { + "epoch": 6.526791478373144, + "grad_norm": 1.0994106531143188, + "learning_rate": 0.0002, + "loss": 0.4247, + "step": 20220 + }, + { + "epoch": 6.530019367333764, + "grad_norm": 1.03111732006073, + "learning_rate": 0.0002, + "loss": 0.416, + "step": 20230 + }, + { + "epoch": 6.533247256294383, + "grad_norm": 1.5421077013015747, + "learning_rate": 0.0002, + "loss": 0.4202, + "step": 20240 + }, + { + "epoch": 6.536475145255003, + "grad_norm": 1.4383527040481567, + "learning_rate": 0.0002, + "loss": 0.4309, + "step": 20250 + }, + { + "epoch": 6.539703034215623, + "grad_norm": 1.0252864360809326, + "learning_rate": 0.0002, + "loss": 0.4086, + "step": 20260 + }, + { + "epoch": 6.542930923176243, + "grad_norm": 1.2504689693450928, + "learning_rate": 0.0002, + "loss": 0.4391, + "step": 20270 + }, + { + "epoch": 6.546158812136863, + "grad_norm": 1.2130976915359497, + "learning_rate": 0.0002, + "loss": 0.4294, + "step": 20280 + }, + { + "epoch": 6.549386701097482, + "grad_norm": 1.1186957359313965, + "learning_rate": 0.0002, + "loss": 0.4432, + "step": 20290 + }, + { + "epoch": 6.552614590058102, + "grad_norm": 1.0373939275741577, + "learning_rate": 0.0002, + "loss": 0.4225, + "step": 20300 + }, + { + "epoch": 6.555842479018722, + "grad_norm": 0.9950923323631287, + "learning_rate": 0.0002, + "loss": 0.3874, + "step": 20310 + }, + { + "epoch": 6.559070367979341, + "grad_norm": 1.1479439735412598, + "learning_rate": 0.0002, + "loss": 0.4257, + "step": 20320 + }, + { + "epoch": 6.562298256939961, + "grad_norm": 1.2426027059555054, + "learning_rate": 0.0002, + "loss": 0.4418, + "step": 20330 + }, + { + "epoch": 6.565526145900581, + "grad_norm": 1.3021808862686157, + "learning_rate": 0.0002, + "loss": 0.4274, + "step": 20340 + }, + { + "epoch": 6.5687540348612, + "grad_norm": 1.203259825706482, + "learning_rate": 0.0002, + "loss": 0.4423, + "step": 20350 + }, + { + "epoch": 6.571981923821821, + "grad_norm": 2.1131186485290527, + "learning_rate": 0.0002, + "loss": 0.4568, + "step": 20360 + }, + { + "epoch": 6.5752098127824405, + "grad_norm": 1.1588627099990845, + "learning_rate": 0.0002, + "loss": 0.4272, + "step": 20370 + }, + { + "epoch": 6.57843770174306, + "grad_norm": 1.0151054859161377, + "learning_rate": 0.0002, + "loss": 0.4727, + "step": 20380 + }, + { + "epoch": 6.58166559070368, + "grad_norm": 1.323155403137207, + "learning_rate": 0.0002, + "loss": 0.4592, + "step": 20390 + }, + { + "epoch": 6.5848934796642995, + "grad_norm": 1.0907572507858276, + "learning_rate": 0.0002, + "loss": 0.4075, + "step": 20400 + }, + { + "epoch": 6.588121368624919, + "grad_norm": 1.2375017404556274, + "learning_rate": 0.0002, + "loss": 0.4127, + "step": 20410 + }, + { + "epoch": 6.591349257585539, + "grad_norm": 1.0491245985031128, + "learning_rate": 0.0002, + "loss": 0.4483, + "step": 20420 + }, + { + "epoch": 6.5945771465461585, + "grad_norm": 1.50575852394104, + "learning_rate": 0.0002, + "loss": 0.4476, + "step": 20430 + }, + { + "epoch": 6.597805035506779, + "grad_norm": 0.9893020987510681, + "learning_rate": 0.0002, + "loss": 0.4235, + "step": 20440 + }, + { + "epoch": 6.601032924467399, + "grad_norm": 1.258591651916504, + "learning_rate": 0.0002, + "loss": 0.4384, + "step": 20450 + }, + { + "epoch": 6.604260813428018, + "grad_norm": 1.3949081897735596, + "learning_rate": 0.0002, + "loss": 0.4458, + "step": 20460 + }, + { + "epoch": 6.607488702388638, + "grad_norm": 1.152513861656189, + "learning_rate": 0.0002, + "loss": 0.3885, + "step": 20470 + }, + { + "epoch": 6.610716591349258, + "grad_norm": 1.218362808227539, + "learning_rate": 0.0002, + "loss": 0.4257, + "step": 20480 + }, + { + "epoch": 6.613944480309877, + "grad_norm": 1.3538687229156494, + "learning_rate": 0.0002, + "loss": 0.4448, + "step": 20490 + }, + { + "epoch": 6.617172369270497, + "grad_norm": 1.2896782159805298, + "learning_rate": 0.0002, + "loss": 0.4348, + "step": 20500 + }, + { + "epoch": 6.6204002582311166, + "grad_norm": 1.0762150287628174, + "learning_rate": 0.0002, + "loss": 0.4287, + "step": 20510 + }, + { + "epoch": 6.623628147191736, + "grad_norm": 1.1561447381973267, + "learning_rate": 0.0002, + "loss": 0.4529, + "step": 20520 + }, + { + "epoch": 6.626856036152357, + "grad_norm": 1.0553218126296997, + "learning_rate": 0.0002, + "loss": 0.4017, + "step": 20530 + }, + { + "epoch": 6.630083925112976, + "grad_norm": 1.1378765106201172, + "learning_rate": 0.0002, + "loss": 0.4321, + "step": 20540 + }, + { + "epoch": 6.633311814073596, + "grad_norm": 1.2299952507019043, + "learning_rate": 0.0002, + "loss": 0.4351, + "step": 20550 + }, + { + "epoch": 6.636539703034216, + "grad_norm": 1.4158518314361572, + "learning_rate": 0.0002, + "loss": 0.4406, + "step": 20560 + }, + { + "epoch": 6.639767591994835, + "grad_norm": 1.058830738067627, + "learning_rate": 0.0002, + "loss": 0.4334, + "step": 20570 + }, + { + "epoch": 6.642995480955455, + "grad_norm": 1.1069598197937012, + "learning_rate": 0.0002, + "loss": 0.4248, + "step": 20580 + }, + { + "epoch": 6.646223369916075, + "grad_norm": 1.3859037160873413, + "learning_rate": 0.0002, + "loss": 0.4651, + "step": 20590 + }, + { + "epoch": 6.649451258876694, + "grad_norm": 1.300588607788086, + "learning_rate": 0.0002, + "loss": 0.4324, + "step": 20600 + }, + { + "epoch": 6.652679147837315, + "grad_norm": 1.3861193656921387, + "learning_rate": 0.0002, + "loss": 0.4581, + "step": 20610 + }, + { + "epoch": 6.6559070367979345, + "grad_norm": 1.2356518507003784, + "learning_rate": 0.0002, + "loss": 0.4198, + "step": 20620 + }, + { + "epoch": 6.659134925758554, + "grad_norm": 1.1698070764541626, + "learning_rate": 0.0002, + "loss": 0.4578, + "step": 20630 + }, + { + "epoch": 6.662362814719174, + "grad_norm": 1.270707607269287, + "learning_rate": 0.0002, + "loss": 0.4513, + "step": 20640 + }, + { + "epoch": 6.6655907036797934, + "grad_norm": 0.984618067741394, + "learning_rate": 0.0002, + "loss": 0.4552, + "step": 20650 + }, + { + "epoch": 6.668818592640413, + "grad_norm": 1.2335834503173828, + "learning_rate": 0.0002, + "loss": 0.4648, + "step": 20660 + }, + { + "epoch": 6.672046481601033, + "grad_norm": 0.9497392773628235, + "learning_rate": 0.0002, + "loss": 0.4541, + "step": 20670 + }, + { + "epoch": 6.675274370561652, + "grad_norm": 1.011144757270813, + "learning_rate": 0.0002, + "loss": 0.4176, + "step": 20680 + }, + { + "epoch": 6.678502259522272, + "grad_norm": 1.1605948209762573, + "learning_rate": 0.0002, + "loss": 0.4424, + "step": 20690 + }, + { + "epoch": 6.681730148482892, + "grad_norm": 1.2136812210083008, + "learning_rate": 0.0002, + "loss": 0.4613, + "step": 20700 + }, + { + "epoch": 6.684958037443512, + "grad_norm": 1.0823525190353394, + "learning_rate": 0.0002, + "loss": 0.4287, + "step": 20710 + }, + { + "epoch": 6.688185926404132, + "grad_norm": 1.1929140090942383, + "learning_rate": 0.0002, + "loss": 0.4307, + "step": 20720 + }, + { + "epoch": 6.6914138153647515, + "grad_norm": 1.2468219995498657, + "learning_rate": 0.0002, + "loss": 0.4453, + "step": 20730 + }, + { + "epoch": 6.694641704325371, + "grad_norm": 1.2653573751449585, + "learning_rate": 0.0002, + "loss": 0.4262, + "step": 20740 + }, + { + "epoch": 6.697869593285991, + "grad_norm": 1.2253094911575317, + "learning_rate": 0.0002, + "loss": 0.4716, + "step": 20750 + }, + { + "epoch": 6.7010974822466105, + "grad_norm": 1.103179931640625, + "learning_rate": 0.0002, + "loss": 0.4462, + "step": 20760 + }, + { + "epoch": 6.70432537120723, + "grad_norm": 0.9180657863616943, + "learning_rate": 0.0002, + "loss": 0.4179, + "step": 20770 + }, + { + "epoch": 6.707553260167851, + "grad_norm": 1.1830929517745972, + "learning_rate": 0.0002, + "loss": 0.4712, + "step": 20780 + }, + { + "epoch": 6.71078114912847, + "grad_norm": 1.1052136421203613, + "learning_rate": 0.0002, + "loss": 0.4304, + "step": 20790 + }, + { + "epoch": 6.71400903808909, + "grad_norm": 1.1268569231033325, + "learning_rate": 0.0002, + "loss": 0.436, + "step": 20800 + }, + { + "epoch": 6.71723692704971, + "grad_norm": 1.0753320455551147, + "learning_rate": 0.0002, + "loss": 0.4109, + "step": 20810 + }, + { + "epoch": 6.720464816010329, + "grad_norm": 1.1100133657455444, + "learning_rate": 0.0002, + "loss": 0.4471, + "step": 20820 + }, + { + "epoch": 6.723692704970949, + "grad_norm": 0.7498472929000854, + "learning_rate": 0.0002, + "loss": 0.447, + "step": 20830 + }, + { + "epoch": 6.726920593931569, + "grad_norm": 1.1006664037704468, + "learning_rate": 0.0002, + "loss": 0.4182, + "step": 20840 + }, + { + "epoch": 6.730148482892188, + "grad_norm": 1.4599690437316895, + "learning_rate": 0.0002, + "loss": 0.4348, + "step": 20850 + }, + { + "epoch": 6.733376371852808, + "grad_norm": 1.324700951576233, + "learning_rate": 0.0002, + "loss": 0.4596, + "step": 20860 + }, + { + "epoch": 6.736604260813428, + "grad_norm": 1.1128668785095215, + "learning_rate": 0.0002, + "loss": 0.4373, + "step": 20870 + }, + { + "epoch": 6.739832149774048, + "grad_norm": 1.0438026189804077, + "learning_rate": 0.0002, + "loss": 0.4267, + "step": 20880 + }, + { + "epoch": 6.743060038734668, + "grad_norm": 1.1934672594070435, + "learning_rate": 0.0002, + "loss": 0.4366, + "step": 20890 + }, + { + "epoch": 6.746287927695287, + "grad_norm": 1.2108192443847656, + "learning_rate": 0.0002, + "loss": 0.4264, + "step": 20900 + }, + { + "epoch": 6.749515816655907, + "grad_norm": 1.1514620780944824, + "learning_rate": 0.0002, + "loss": 0.4327, + "step": 20910 + }, + { + "epoch": 6.752743705616527, + "grad_norm": 1.1723405122756958, + "learning_rate": 0.0002, + "loss": 0.4774, + "step": 20920 + }, + { + "epoch": 6.755971594577146, + "grad_norm": 1.1136211156845093, + "learning_rate": 0.0002, + "loss": 0.4458, + "step": 20930 + }, + { + "epoch": 6.759199483537766, + "grad_norm": 1.297601342201233, + "learning_rate": 0.0002, + "loss": 0.4363, + "step": 20940 + }, + { + "epoch": 6.7624273724983865, + "grad_norm": 1.139397144317627, + "learning_rate": 0.0002, + "loss": 0.4389, + "step": 20950 + }, + { + "epoch": 6.765655261459006, + "grad_norm": 1.2873362302780151, + "learning_rate": 0.0002, + "loss": 0.4344, + "step": 20960 + }, + { + "epoch": 6.768883150419626, + "grad_norm": 1.1499544382095337, + "learning_rate": 0.0002, + "loss": 0.4204, + "step": 20970 + }, + { + "epoch": 6.7721110393802455, + "grad_norm": 1.3687032461166382, + "learning_rate": 0.0002, + "loss": 0.4279, + "step": 20980 + }, + { + "epoch": 6.775338928340865, + "grad_norm": 1.2877939939498901, + "learning_rate": 0.0002, + "loss": 0.4621, + "step": 20990 + }, + { + "epoch": 6.778566817301485, + "grad_norm": 1.232993483543396, + "learning_rate": 0.0002, + "loss": 0.4629, + "step": 21000 + }, + { + "epoch": 6.7817947062621045, + "grad_norm": 1.1765092611312866, + "learning_rate": 0.0002, + "loss": 0.4697, + "step": 21010 + }, + { + "epoch": 6.785022595222724, + "grad_norm": 1.4695899486541748, + "learning_rate": 0.0002, + "loss": 0.431, + "step": 21020 + }, + { + "epoch": 6.788250484183344, + "grad_norm": 1.2325087785720825, + "learning_rate": 0.0002, + "loss": 0.4348, + "step": 21030 + }, + { + "epoch": 6.791478373143963, + "grad_norm": 1.3475068807601929, + "learning_rate": 0.0002, + "loss": 0.4595, + "step": 21040 + }, + { + "epoch": 6.794706262104584, + "grad_norm": 1.5654256343841553, + "learning_rate": 0.0002, + "loss": 0.4555, + "step": 21050 + }, + { + "epoch": 6.797934151065204, + "grad_norm": 1.4210680723190308, + "learning_rate": 0.0002, + "loss": 0.4672, + "step": 21060 + }, + { + "epoch": 6.801162040025823, + "grad_norm": 1.167878270149231, + "learning_rate": 0.0002, + "loss": 0.4491, + "step": 21070 + }, + { + "epoch": 6.804389928986443, + "grad_norm": 1.1643486022949219, + "learning_rate": 0.0002, + "loss": 0.4524, + "step": 21080 + }, + { + "epoch": 6.8076178179470626, + "grad_norm": 1.1976310014724731, + "learning_rate": 0.0002, + "loss": 0.4467, + "step": 21090 + }, + { + "epoch": 6.810845706907682, + "grad_norm": 1.1392749547958374, + "learning_rate": 0.0002, + "loss": 0.4449, + "step": 21100 + }, + { + "epoch": 6.814073595868302, + "grad_norm": 1.2456704378128052, + "learning_rate": 0.0002, + "loss": 0.4567, + "step": 21110 + }, + { + "epoch": 6.8173014848289215, + "grad_norm": 1.0030150413513184, + "learning_rate": 0.0002, + "loss": 0.4271, + "step": 21120 + }, + { + "epoch": 6.820529373789542, + "grad_norm": 1.4715943336486816, + "learning_rate": 0.0002, + "loss": 0.4258, + "step": 21130 + }, + { + "epoch": 6.823757262750162, + "grad_norm": 1.1307374238967896, + "learning_rate": 0.0002, + "loss": 0.4615, + "step": 21140 + }, + { + "epoch": 6.826985151710781, + "grad_norm": 1.37498140335083, + "learning_rate": 0.0002, + "loss": 0.4643, + "step": 21150 + }, + { + "epoch": 6.830213040671401, + "grad_norm": 1.2791364192962646, + "learning_rate": 0.0002, + "loss": 0.4447, + "step": 21160 + }, + { + "epoch": 6.833440929632021, + "grad_norm": 1.0518016815185547, + "learning_rate": 0.0002, + "loss": 0.4778, + "step": 21170 + }, + { + "epoch": 6.83666881859264, + "grad_norm": 1.1237729787826538, + "learning_rate": 0.0002, + "loss": 0.448, + "step": 21180 + }, + { + "epoch": 6.83989670755326, + "grad_norm": 1.0360032320022583, + "learning_rate": 0.0002, + "loss": 0.4299, + "step": 21190 + }, + { + "epoch": 6.84312459651388, + "grad_norm": 0.8733281493186951, + "learning_rate": 0.0002, + "loss": 0.4336, + "step": 21200 + }, + { + "epoch": 6.846352485474499, + "grad_norm": 1.3178322315216064, + "learning_rate": 0.0002, + "loss": 0.4495, + "step": 21210 + }, + { + "epoch": 6.84958037443512, + "grad_norm": 1.0884978771209717, + "learning_rate": 0.0002, + "loss": 0.4548, + "step": 21220 + }, + { + "epoch": 6.8528082633957395, + "grad_norm": 1.213229775428772, + "learning_rate": 0.0002, + "loss": 0.4543, + "step": 21230 + }, + { + "epoch": 6.856036152356359, + "grad_norm": 1.0828464031219482, + "learning_rate": 0.0002, + "loss": 0.4628, + "step": 21240 + }, + { + "epoch": 6.859264041316979, + "grad_norm": 1.2298113107681274, + "learning_rate": 0.0002, + "loss": 0.4353, + "step": 21250 + }, + { + "epoch": 6.862491930277598, + "grad_norm": 1.4773930311203003, + "learning_rate": 0.0002, + "loss": 0.4088, + "step": 21260 + }, + { + "epoch": 6.865719819238218, + "grad_norm": 0.992661714553833, + "learning_rate": 0.0002, + "loss": 0.4529, + "step": 21270 + }, + { + "epoch": 6.868947708198838, + "grad_norm": 1.25167715549469, + "learning_rate": 0.0002, + "loss": 0.474, + "step": 21280 + }, + { + "epoch": 6.872175597159457, + "grad_norm": 1.1554399728775024, + "learning_rate": 0.0002, + "loss": 0.4466, + "step": 21290 + }, + { + "epoch": 6.875403486120078, + "grad_norm": 1.2587701082229614, + "learning_rate": 0.0002, + "loss": 0.4375, + "step": 21300 + }, + { + "epoch": 6.8786313750806976, + "grad_norm": 1.392392635345459, + "learning_rate": 0.0002, + "loss": 0.4507, + "step": 21310 + }, + { + "epoch": 6.881859264041317, + "grad_norm": 1.2159595489501953, + "learning_rate": 0.0002, + "loss": 0.4432, + "step": 21320 + }, + { + "epoch": 6.885087153001937, + "grad_norm": 1.3811182975769043, + "learning_rate": 0.0002, + "loss": 0.4255, + "step": 21330 + }, + { + "epoch": 6.8883150419625565, + "grad_norm": 1.2652684450149536, + "learning_rate": 0.0002, + "loss": 0.4437, + "step": 21340 + }, + { + "epoch": 6.891542930923176, + "grad_norm": 1.1906380653381348, + "learning_rate": 0.0002, + "loss": 0.4797, + "step": 21350 + }, + { + "epoch": 6.894770819883796, + "grad_norm": 1.0525990724563599, + "learning_rate": 0.0002, + "loss": 0.423, + "step": 21360 + }, + { + "epoch": 6.8979987088444155, + "grad_norm": 0.910491406917572, + "learning_rate": 0.0002, + "loss": 0.4414, + "step": 21370 + }, + { + "epoch": 6.901226597805035, + "grad_norm": 1.366865634918213, + "learning_rate": 0.0002, + "loss": 0.4882, + "step": 21380 + }, + { + "epoch": 6.904454486765655, + "grad_norm": 1.1270265579223633, + "learning_rate": 0.0002, + "loss": 0.4648, + "step": 21390 + }, + { + "epoch": 6.907682375726275, + "grad_norm": 1.1745691299438477, + "learning_rate": 0.0002, + "loss": 0.4529, + "step": 21400 + }, + { + "epoch": 6.910910264686895, + "grad_norm": 1.1036182641983032, + "learning_rate": 0.0002, + "loss": 0.4504, + "step": 21410 + }, + { + "epoch": 6.914138153647515, + "grad_norm": 1.0906540155410767, + "learning_rate": 0.0002, + "loss": 0.4612, + "step": 21420 + }, + { + "epoch": 6.917366042608134, + "grad_norm": 1.1176798343658447, + "learning_rate": 0.0002, + "loss": 0.4408, + "step": 21430 + }, + { + "epoch": 6.920593931568754, + "grad_norm": 1.525869607925415, + "learning_rate": 0.0002, + "loss": 0.477, + "step": 21440 + }, + { + "epoch": 6.923821820529374, + "grad_norm": 1.2466827630996704, + "learning_rate": 0.0002, + "loss": 0.4473, + "step": 21450 + }, + { + "epoch": 6.927049709489993, + "grad_norm": 1.0200796127319336, + "learning_rate": 0.0002, + "loss": 0.4256, + "step": 21460 + }, + { + "epoch": 6.930277598450614, + "grad_norm": 1.2133489847183228, + "learning_rate": 0.0002, + "loss": 0.4601, + "step": 21470 + }, + { + "epoch": 6.933505487411233, + "grad_norm": 1.2100290060043335, + "learning_rate": 0.0002, + "loss": 0.44, + "step": 21480 + }, + { + "epoch": 6.936733376371853, + "grad_norm": 1.1833131313323975, + "learning_rate": 0.0002, + "loss": 0.468, + "step": 21490 + }, + { + "epoch": 6.939961265332473, + "grad_norm": 1.2262470722198486, + "learning_rate": 0.0002, + "loss": 0.4529, + "step": 21500 + }, + { + "epoch": 6.943189154293092, + "grad_norm": 1.0496156215667725, + "learning_rate": 0.0002, + "loss": 0.4612, + "step": 21510 + }, + { + "epoch": 6.946417043253712, + "grad_norm": 1.050690770149231, + "learning_rate": 0.0002, + "loss": 0.4417, + "step": 21520 + }, + { + "epoch": 6.949644932214332, + "grad_norm": 1.2035698890686035, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 21530 + }, + { + "epoch": 6.952872821174951, + "grad_norm": 1.408007025718689, + "learning_rate": 0.0002, + "loss": 0.4349, + "step": 21540 + }, + { + "epoch": 6.956100710135571, + "grad_norm": 1.2247556447982788, + "learning_rate": 0.0002, + "loss": 0.4391, + "step": 21550 + }, + { + "epoch": 6.959328599096191, + "grad_norm": 1.1727497577667236, + "learning_rate": 0.0002, + "loss": 0.4526, + "step": 21560 + }, + { + "epoch": 6.962556488056811, + "grad_norm": 1.2948925495147705, + "learning_rate": 0.0002, + "loss": 0.4566, + "step": 21570 + }, + { + "epoch": 6.965784377017431, + "grad_norm": 1.3374950885772705, + "learning_rate": 0.0002, + "loss": 0.4672, + "step": 21580 + }, + { + "epoch": 6.9690122659780505, + "grad_norm": 1.164650559425354, + "learning_rate": 0.0002, + "loss": 0.4515, + "step": 21590 + }, + { + "epoch": 6.97224015493867, + "grad_norm": 1.2682108879089355, + "learning_rate": 0.0002, + "loss": 0.4704, + "step": 21600 + }, + { + "epoch": 6.97546804389929, + "grad_norm": 1.195971131324768, + "learning_rate": 0.0002, + "loss": 0.4557, + "step": 21610 + }, + { + "epoch": 6.978695932859909, + "grad_norm": 1.1988017559051514, + "learning_rate": 0.0002, + "loss": 0.4194, + "step": 21620 + }, + { + "epoch": 6.981923821820529, + "grad_norm": 1.0981930494308472, + "learning_rate": 0.0002, + "loss": 0.4524, + "step": 21630 + }, + { + "epoch": 6.98515171078115, + "grad_norm": 1.307260274887085, + "learning_rate": 0.0002, + "loss": 0.4808, + "step": 21640 + }, + { + "epoch": 6.988379599741769, + "grad_norm": 1.2798160314559937, + "learning_rate": 0.0002, + "loss": 0.4936, + "step": 21650 + }, + { + "epoch": 6.991607488702389, + "grad_norm": 1.0053848028182983, + "learning_rate": 0.0002, + "loss": 0.4615, + "step": 21660 + }, + { + "epoch": 6.994835377663009, + "grad_norm": 1.2257840633392334, + "learning_rate": 0.0002, + "loss": 0.4496, + "step": 21670 + }, + { + "epoch": 6.998063266623628, + "grad_norm": 1.3769378662109375, + "learning_rate": 0.0002, + "loss": 0.4449, + "step": 21680 + }, + { + "epoch": 7.0, + "eval_loss": 1.3414524793624878, + "eval_runtime": 162.0091, + "eval_samples_per_second": 4.524, + "eval_steps_per_second": 0.568, + "step": 21686 + } + ], + "logging_steps": 10, + "max_steps": 24784, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.00357952307934e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f46f2b8e8752b125339f36f172c3878be4cdb152 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-21686/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfc2a69e44a51edf5586ebed4b7ee915a23244c18c1f59e580471e4c9becfa98 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d147b3ff1714820591c8fb3e396d2cd37be7def5 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00d7dffbf407eccd15a565f3ec7e444e955a9bd87a94e6ea9b7f5c5e7d59efbb +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5840a8b09bccb86b6a6f20c3d350f3e4a484327 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0605d35b122536da1134806b84e16c5b31ce86176b914fed8f43c21d734835ed +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..76c4d47aa58a36158827858ac271bbbdd8cb2089 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c071cd9bc26f6dd87a59cdf4eb39613db89e029fc3957f700d468eccf4be25a2 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fdb74a03a3c96ba30c9ae42b5425b400cc9866b1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:898ef7996d34132f853e4f9a14ec02188a296e9d2355244fc0293288b34e8a75 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f3bfa5a92ed9e545590a0a1146e9da2510fc72f4 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/trainer_state.json @@ -0,0 +1,17443 @@ +{ + "best_metric": 1.0958120822906494, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098", + "epoch": 8.0, + "eval_steps": 10, + "global_step": 24784, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032278889606197547, + "grad_norm": 0.7092075347900391, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 10 + }, + { + "epoch": 0.006455777921239509, + "grad_norm": 0.6900479793548584, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 20 + }, + { + "epoch": 0.009683666881859263, + "grad_norm": 0.6788288950920105, + "learning_rate": 0.0002, + "loss": 0.9807, + "step": 30 + }, + { + "epoch": 0.012911555842479019, + "grad_norm": 0.5590243339538574, + "learning_rate": 0.0002, + "loss": 0.9385, + "step": 40 + }, + { + "epoch": 0.016139444803098774, + "grad_norm": 0.5136010646820068, + "learning_rate": 0.0002, + "loss": 0.931, + "step": 50 + }, + { + "epoch": 0.019367333763718526, + "grad_norm": 0.45298320055007935, + "learning_rate": 0.0002, + "loss": 0.8896, + "step": 60 + }, + { + "epoch": 0.022595222724338282, + "grad_norm": 0.5917162299156189, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 70 + }, + { + "epoch": 0.025823111684958037, + "grad_norm": 0.4414856433868408, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 80 + }, + { + "epoch": 0.029051000645577793, + "grad_norm": 0.5547978281974792, + "learning_rate": 0.0002, + "loss": 0.8419, + "step": 90 + }, + { + "epoch": 0.03227888960619755, + "grad_norm": 0.5271288156509399, + "learning_rate": 0.0002, + "loss": 0.8987, + "step": 100 + }, + { + "epoch": 0.035506778566817304, + "grad_norm": 0.5506119728088379, + "learning_rate": 0.0002, + "loss": 0.8543, + "step": 110 + }, + { + "epoch": 0.03873466752743705, + "grad_norm": 0.5579327940940857, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 120 + }, + { + "epoch": 0.04196255648805681, + "grad_norm": 0.5099632740020752, + "learning_rate": 0.0002, + "loss": 0.8826, + "step": 130 + }, + { + "epoch": 0.045190445448676564, + "grad_norm": 0.40396833419799805, + "learning_rate": 0.0002, + "loss": 0.9239, + "step": 140 + }, + { + "epoch": 0.04841833440929632, + "grad_norm": 0.5008092522621155, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 150 + }, + { + "epoch": 0.051646223369916075, + "grad_norm": 0.4388776421546936, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 160 + }, + { + "epoch": 0.05487411233053583, + "grad_norm": 0.44138944149017334, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 170 + }, + { + "epoch": 0.058102001291155586, + "grad_norm": 0.358484148979187, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 180 + }, + { + "epoch": 0.06132989025177534, + "grad_norm": 0.457052081823349, + "learning_rate": 0.0002, + "loss": 0.8956, + "step": 190 + }, + { + "epoch": 0.0645577792123951, + "grad_norm": 0.5537622570991516, + "learning_rate": 0.0002, + "loss": 0.9138, + "step": 200 + }, + { + "epoch": 0.06778566817301485, + "grad_norm": 0.552631676197052, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 210 + }, + { + "epoch": 0.07101355713363461, + "grad_norm": 0.4414575397968292, + "learning_rate": 0.0002, + "loss": 0.8854, + "step": 220 + }, + { + "epoch": 0.07424144609425436, + "grad_norm": 0.4996664226055145, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 230 + }, + { + "epoch": 0.0774693350548741, + "grad_norm": 0.7321897149085999, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 240 + }, + { + "epoch": 0.08069722401549387, + "grad_norm": 0.4553901255130768, + "learning_rate": 0.0002, + "loss": 0.8848, + "step": 250 + }, + { + "epoch": 0.08392511297611362, + "grad_norm": 0.5039054751396179, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 260 + }, + { + "epoch": 0.08715300193673338, + "grad_norm": 0.4113094210624695, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 270 + }, + { + "epoch": 0.09038089089735313, + "grad_norm": 0.450436532497406, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 280 + }, + { + "epoch": 0.09360877985797289, + "grad_norm": 0.4548024535179138, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 290 + }, + { + "epoch": 0.09683666881859264, + "grad_norm": 0.4932962656021118, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 300 + }, + { + "epoch": 0.1000645577792124, + "grad_norm": 0.4005250334739685, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 310 + }, + { + "epoch": 0.10329244673983215, + "grad_norm": 1.8321624994277954, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 320 + }, + { + "epoch": 0.1065203357004519, + "grad_norm": 0.45815610885620117, + "learning_rate": 0.0002, + "loss": 0.8411, + "step": 330 + }, + { + "epoch": 0.10974822466107166, + "grad_norm": 0.39324095845222473, + "learning_rate": 0.0002, + "loss": 0.857, + "step": 340 + }, + { + "epoch": 0.11297611362169141, + "grad_norm": 0.546273946762085, + "learning_rate": 0.0002, + "loss": 0.8258, + "step": 350 + }, + { + "epoch": 0.11620400258231117, + "grad_norm": 0.497448593378067, + "learning_rate": 0.0002, + "loss": 0.882, + "step": 360 + }, + { + "epoch": 0.11943189154293092, + "grad_norm": 0.37508800625801086, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 370 + }, + { + "epoch": 0.12265978050355068, + "grad_norm": 0.45849609375, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 380 + }, + { + "epoch": 0.12588766946417043, + "grad_norm": 0.5488408803939819, + "learning_rate": 0.0002, + "loss": 0.8437, + "step": 390 + }, + { + "epoch": 0.1291155584247902, + "grad_norm": 0.4477061331272125, + "learning_rate": 0.0002, + "loss": 0.8349, + "step": 400 + }, + { + "epoch": 0.13234344738540993, + "grad_norm": 0.39227980375289917, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 410 + }, + { + "epoch": 0.1355713363460297, + "grad_norm": 0.3922233581542969, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 420 + }, + { + "epoch": 0.13879922530664945, + "grad_norm": 0.42901909351348877, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 430 + }, + { + "epoch": 0.14202711426726922, + "grad_norm": 0.4217798709869385, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 440 + }, + { + "epoch": 0.14525500322788895, + "grad_norm": 0.43470677733421326, + "learning_rate": 0.0002, + "loss": 0.8594, + "step": 450 + }, + { + "epoch": 0.1484828921885087, + "grad_norm": 0.5324403047561646, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 460 + }, + { + "epoch": 0.15171078114912848, + "grad_norm": 0.3999756872653961, + "learning_rate": 0.0002, + "loss": 0.8729, + "step": 470 + }, + { + "epoch": 0.1549386701097482, + "grad_norm": 0.404933363199234, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 480 + }, + { + "epoch": 0.15816655907036797, + "grad_norm": 0.44122636318206787, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 490 + }, + { + "epoch": 0.16139444803098774, + "grad_norm": 0.510166347026825, + "learning_rate": 0.0002, + "loss": 0.8457, + "step": 500 + }, + { + "epoch": 0.1646223369916075, + "grad_norm": 0.4549732506275177, + "learning_rate": 0.0002, + "loss": 0.8692, + "step": 510 + }, + { + "epoch": 0.16785022595222723, + "grad_norm": 0.5148182511329651, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 520 + }, + { + "epoch": 0.171078114912847, + "grad_norm": 0.3596806824207306, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 530 + }, + { + "epoch": 0.17430600387346676, + "grad_norm": 0.4388909339904785, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 540 + }, + { + "epoch": 0.17753389283408652, + "grad_norm": 0.5052742958068848, + "learning_rate": 0.0002, + "loss": 0.8322, + "step": 550 + }, + { + "epoch": 0.18076178179470626, + "grad_norm": 0.48248958587646484, + "learning_rate": 0.0002, + "loss": 0.791, + "step": 560 + }, + { + "epoch": 0.18398967075532602, + "grad_norm": 0.5360197424888611, + "learning_rate": 0.0002, + "loss": 0.8593, + "step": 570 + }, + { + "epoch": 0.18721755971594578, + "grad_norm": 0.43999341130256653, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 580 + }, + { + "epoch": 0.19044544867656552, + "grad_norm": 0.3685208261013031, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 590 + }, + { + "epoch": 0.19367333763718528, + "grad_norm": 0.4601275622844696, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 600 + }, + { + "epoch": 0.19690122659780504, + "grad_norm": 0.4778369665145874, + "learning_rate": 0.0002, + "loss": 0.8483, + "step": 610 + }, + { + "epoch": 0.2001291155584248, + "grad_norm": 0.4867003560066223, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 620 + }, + { + "epoch": 0.20335700451904454, + "grad_norm": 0.4583742916584015, + "learning_rate": 0.0002, + "loss": 0.8554, + "step": 630 + }, + { + "epoch": 0.2065848934796643, + "grad_norm": 0.47958165407180786, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 640 + }, + { + "epoch": 0.20981278244028406, + "grad_norm": 0.4526064097881317, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 650 + }, + { + "epoch": 0.2130406714009038, + "grad_norm": 0.45890581607818604, + "learning_rate": 0.0002, + "loss": 0.8313, + "step": 660 + }, + { + "epoch": 0.21626856036152356, + "grad_norm": 0.42725905776023865, + "learning_rate": 0.0002, + "loss": 0.8143, + "step": 670 + }, + { + "epoch": 0.21949644932214332, + "grad_norm": 0.40380963683128357, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 680 + }, + { + "epoch": 0.22272433828276308, + "grad_norm": 0.4372998774051666, + "learning_rate": 0.0002, + "loss": 0.9004, + "step": 690 + }, + { + "epoch": 0.22595222724338282, + "grad_norm": 0.4245864450931549, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 700 + }, + { + "epoch": 0.22918011620400258, + "grad_norm": 0.4061129689216614, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 710 + }, + { + "epoch": 0.23240800516462234, + "grad_norm": 0.474454790353775, + "learning_rate": 0.0002, + "loss": 0.8275, + "step": 720 + }, + { + "epoch": 0.23563589412524208, + "grad_norm": 0.4908486008644104, + "learning_rate": 0.0002, + "loss": 0.8346, + "step": 730 + }, + { + "epoch": 0.23886378308586184, + "grad_norm": 0.4284191429615021, + "learning_rate": 0.0002, + "loss": 0.8755, + "step": 740 + }, + { + "epoch": 0.2420916720464816, + "grad_norm": 0.44730308651924133, + "learning_rate": 0.0002, + "loss": 0.8387, + "step": 750 + }, + { + "epoch": 0.24531956100710137, + "grad_norm": 0.4433246850967407, + "learning_rate": 0.0002, + "loss": 0.8135, + "step": 760 + }, + { + "epoch": 0.2485474499677211, + "grad_norm": 0.43668854236602783, + "learning_rate": 0.0002, + "loss": 0.8644, + "step": 770 + }, + { + "epoch": 0.25177533892834086, + "grad_norm": 0.34324130415916443, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 780 + }, + { + "epoch": 0.2550032278889606, + "grad_norm": 0.46476295590400696, + "learning_rate": 0.0002, + "loss": 0.8725, + "step": 790 + }, + { + "epoch": 0.2582311168495804, + "grad_norm": 0.5047039985656738, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 800 + }, + { + "epoch": 0.26145900581020015, + "grad_norm": 0.4402127265930176, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 810 + }, + { + "epoch": 0.26468689477081986, + "grad_norm": 0.4642465114593506, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 820 + }, + { + "epoch": 0.2679147837314396, + "grad_norm": 0.40093424916267395, + "learning_rate": 0.0002, + "loss": 0.8836, + "step": 830 + }, + { + "epoch": 0.2711426726920594, + "grad_norm": 0.42501842975616455, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 840 + }, + { + "epoch": 0.27437056165267915, + "grad_norm": 0.43279722332954407, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 850 + }, + { + "epoch": 0.2775984506132989, + "grad_norm": 0.5991243720054626, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 860 + }, + { + "epoch": 0.28082633957391867, + "grad_norm": 0.4217848777770996, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 870 + }, + { + "epoch": 0.28405422853453843, + "grad_norm": 0.3933536410331726, + "learning_rate": 0.0002, + "loss": 0.8135, + "step": 880 + }, + { + "epoch": 0.28728211749515814, + "grad_norm": 0.5868505239486694, + "learning_rate": 0.0002, + "loss": 0.8846, + "step": 890 + }, + { + "epoch": 0.2905100064557779, + "grad_norm": 0.5209547877311707, + "learning_rate": 0.0002, + "loss": 0.8759, + "step": 900 + }, + { + "epoch": 0.29373789541639767, + "grad_norm": 0.49307361245155334, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 910 + }, + { + "epoch": 0.2969657843770174, + "grad_norm": 0.4288382828235626, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 920 + }, + { + "epoch": 0.3001936733376372, + "grad_norm": 0.33568474650382996, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 930 + }, + { + "epoch": 0.30342156229825695, + "grad_norm": 1.0915930271148682, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 940 + }, + { + "epoch": 0.3066494512588767, + "grad_norm": 0.5489798188209534, + "learning_rate": 0.0002, + "loss": 0.8535, + "step": 950 + }, + { + "epoch": 0.3098773402194964, + "grad_norm": 0.42971742153167725, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 960 + }, + { + "epoch": 0.3131052291801162, + "grad_norm": 0.43375834822654724, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 970 + }, + { + "epoch": 0.31633311814073595, + "grad_norm": 0.47488611936569214, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 980 + }, + { + "epoch": 0.3195610071013557, + "grad_norm": 0.46296775341033936, + "learning_rate": 0.0002, + "loss": 0.7906, + "step": 990 + }, + { + "epoch": 0.32278889606197547, + "grad_norm": 0.4548890292644501, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 1000 + }, + { + "epoch": 0.32601678502259523, + "grad_norm": 0.41834497451782227, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 1010 + }, + { + "epoch": 0.329244673983215, + "grad_norm": 0.441092312335968, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 1020 + }, + { + "epoch": 0.33247256294383476, + "grad_norm": 0.637322187423706, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 1030 + }, + { + "epoch": 0.33570045190445447, + "grad_norm": 0.4374958574771881, + "learning_rate": 0.0002, + "loss": 0.8685, + "step": 1040 + }, + { + "epoch": 0.33892834086507423, + "grad_norm": 0.3935825824737549, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1050 + }, + { + "epoch": 0.342156229825694, + "grad_norm": 0.43526220321655273, + "learning_rate": 0.0002, + "loss": 0.8287, + "step": 1060 + }, + { + "epoch": 0.34538411878631375, + "grad_norm": 0.45327696204185486, + "learning_rate": 0.0002, + "loss": 0.8413, + "step": 1070 + }, + { + "epoch": 0.3486120077469335, + "grad_norm": 0.4126075506210327, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 1080 + }, + { + "epoch": 0.3518398967075533, + "grad_norm": 0.4714072048664093, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 1090 + }, + { + "epoch": 0.35506778566817304, + "grad_norm": 0.518127977848053, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 1100 + }, + { + "epoch": 0.35829567462879275, + "grad_norm": 0.43264099955558777, + "learning_rate": 0.0002, + "loss": 0.8479, + "step": 1110 + }, + { + "epoch": 0.3615235635894125, + "grad_norm": 0.4857400357723236, + "learning_rate": 0.0002, + "loss": 0.8724, + "step": 1120 + }, + { + "epoch": 0.3647514525500323, + "grad_norm": 0.37591469287872314, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 1130 + }, + { + "epoch": 0.36797934151065204, + "grad_norm": 0.4165478050708771, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 1140 + }, + { + "epoch": 0.3712072304712718, + "grad_norm": 0.42911383509635925, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 1150 + }, + { + "epoch": 0.37443511943189156, + "grad_norm": 0.44980287551879883, + "learning_rate": 0.0002, + "loss": 0.8722, + "step": 1160 + }, + { + "epoch": 0.3776630083925113, + "grad_norm": 0.4066573679447174, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 1170 + }, + { + "epoch": 0.38089089735313103, + "grad_norm": 0.5056195855140686, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 1180 + }, + { + "epoch": 0.3841187863137508, + "grad_norm": 0.4141536355018616, + "learning_rate": 0.0002, + "loss": 0.8387, + "step": 1190 + }, + { + "epoch": 0.38734667527437056, + "grad_norm": 0.4501924514770508, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 1200 + }, + { + "epoch": 0.3905745642349903, + "grad_norm": 0.43304240703582764, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 1210 + }, + { + "epoch": 0.3938024531956101, + "grad_norm": 0.475777804851532, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 1220 + }, + { + "epoch": 0.39703034215622984, + "grad_norm": 0.5846465826034546, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 1230 + }, + { + "epoch": 0.4002582311168496, + "grad_norm": 0.42899325489997864, + "learning_rate": 0.0002, + "loss": 0.8078, + "step": 1240 + }, + { + "epoch": 0.4034861200774693, + "grad_norm": 0.3980463147163391, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 1250 + }, + { + "epoch": 0.4067140090380891, + "grad_norm": 0.45769768953323364, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 1260 + }, + { + "epoch": 0.40994189799870884, + "grad_norm": 0.5101280212402344, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 1270 + }, + { + "epoch": 0.4131697869593286, + "grad_norm": 0.47374317049980164, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 1280 + }, + { + "epoch": 0.41639767591994836, + "grad_norm": 0.4261878728866577, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 1290 + }, + { + "epoch": 0.4196255648805681, + "grad_norm": 0.46954256296157837, + "learning_rate": 0.0002, + "loss": 0.9004, + "step": 1300 + }, + { + "epoch": 0.4228534538411879, + "grad_norm": 0.5205738544464111, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 1310 + }, + { + "epoch": 0.4260813428018076, + "grad_norm": 0.5176340937614441, + "learning_rate": 0.0002, + "loss": 0.8964, + "step": 1320 + }, + { + "epoch": 0.42930923176242736, + "grad_norm": 0.5155916810035706, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 1330 + }, + { + "epoch": 0.4325371207230471, + "grad_norm": 0.44548553228378296, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 1340 + }, + { + "epoch": 0.4357650096836669, + "grad_norm": 0.5633558630943298, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 1350 + }, + { + "epoch": 0.43899289864428664, + "grad_norm": 0.42444056272506714, + "learning_rate": 0.0002, + "loss": 0.7889, + "step": 1360 + }, + { + "epoch": 0.4422207876049064, + "grad_norm": 0.5226860642433167, + "learning_rate": 0.0002, + "loss": 0.8588, + "step": 1370 + }, + { + "epoch": 0.44544867656552617, + "grad_norm": 0.5354582071304321, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 1380 + }, + { + "epoch": 0.4486765655261459, + "grad_norm": 0.472646564245224, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 1390 + }, + { + "epoch": 0.45190445448676564, + "grad_norm": 0.6312310099601746, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 1400 + }, + { + "epoch": 0.4551323434473854, + "grad_norm": 0.4298408031463623, + "learning_rate": 0.0002, + "loss": 0.8212, + "step": 1410 + }, + { + "epoch": 0.45836023240800516, + "grad_norm": 0.43427202105522156, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 1420 + }, + { + "epoch": 0.4615881213686249, + "grad_norm": 0.44097861647605896, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 1430 + }, + { + "epoch": 0.4648160103292447, + "grad_norm": 0.5142693519592285, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1440 + }, + { + "epoch": 0.46804389928986445, + "grad_norm": 0.46416547894477844, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 1450 + }, + { + "epoch": 0.47127178825048416, + "grad_norm": 0.4858551025390625, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 1460 + }, + { + "epoch": 0.4744996772111039, + "grad_norm": 0.4709177315235138, + "learning_rate": 0.0002, + "loss": 0.8354, + "step": 1470 + }, + { + "epoch": 0.4777275661717237, + "grad_norm": 0.5500252842903137, + "learning_rate": 0.0002, + "loss": 0.8391, + "step": 1480 + }, + { + "epoch": 0.48095545513234345, + "grad_norm": 0.43364381790161133, + "learning_rate": 0.0002, + "loss": 0.8359, + "step": 1490 + }, + { + "epoch": 0.4841833440929632, + "grad_norm": 0.47712287306785583, + "learning_rate": 0.0002, + "loss": 0.8446, + "step": 1500 + }, + { + "epoch": 0.48741123305358297, + "grad_norm": 0.4518495202064514, + "learning_rate": 0.0002, + "loss": 0.8518, + "step": 1510 + }, + { + "epoch": 0.49063912201420273, + "grad_norm": 0.4539008140563965, + "learning_rate": 0.0002, + "loss": 0.819, + "step": 1520 + }, + { + "epoch": 0.49386701097482244, + "grad_norm": 0.4993067979812622, + "learning_rate": 0.0002, + "loss": 0.8276, + "step": 1530 + }, + { + "epoch": 0.4970948999354422, + "grad_norm": 0.6094803214073181, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 1540 + }, + { + "epoch": 0.500322788896062, + "grad_norm": 0.48602527379989624, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 1550 + }, + { + "epoch": 0.5035506778566817, + "grad_norm": 0.40245795249938965, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 1560 + }, + { + "epoch": 0.5067785668173015, + "grad_norm": 0.456787645816803, + "learning_rate": 0.0002, + "loss": 0.7907, + "step": 1570 + }, + { + "epoch": 0.5100064557779213, + "grad_norm": 0.43936216831207275, + "learning_rate": 0.0002, + "loss": 0.86, + "step": 1580 + }, + { + "epoch": 0.513234344738541, + "grad_norm": 0.549018144607544, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 1590 + }, + { + "epoch": 0.5164622336991608, + "grad_norm": 0.41746795177459717, + "learning_rate": 0.0002, + "loss": 0.8169, + "step": 1600 + }, + { + "epoch": 0.5196901226597805, + "grad_norm": 0.4217053949832916, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 1610 + }, + { + "epoch": 0.5229180116204003, + "grad_norm": 0.449913889169693, + "learning_rate": 0.0002, + "loss": 0.8161, + "step": 1620 + }, + { + "epoch": 0.5261459005810201, + "grad_norm": 0.5084872245788574, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 1630 + }, + { + "epoch": 0.5293737895416397, + "grad_norm": 0.46248653531074524, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 1640 + }, + { + "epoch": 0.5326016785022595, + "grad_norm": 0.4824236035346985, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 1650 + }, + { + "epoch": 0.5358295674628792, + "grad_norm": 0.6010985374450684, + "learning_rate": 0.0002, + "loss": 0.8711, + "step": 1660 + }, + { + "epoch": 0.539057456423499, + "grad_norm": 0.4757920801639557, + "learning_rate": 0.0002, + "loss": 0.8266, + "step": 1670 + }, + { + "epoch": 0.5422853453841188, + "grad_norm": 0.45161882042884827, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 1680 + }, + { + "epoch": 0.5455132343447385, + "grad_norm": 0.49314990639686584, + "learning_rate": 0.0002, + "loss": 0.8141, + "step": 1690 + }, + { + "epoch": 0.5487411233053583, + "grad_norm": 0.3918305039405823, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 1700 + }, + { + "epoch": 0.551969012265978, + "grad_norm": 0.5966728925704956, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 1710 + }, + { + "epoch": 0.5551969012265978, + "grad_norm": 0.4208986163139343, + "learning_rate": 0.0002, + "loss": 0.8438, + "step": 1720 + }, + { + "epoch": 0.5584247901872176, + "grad_norm": 0.43724218010902405, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 1730 + }, + { + "epoch": 0.5616526791478373, + "grad_norm": 0.5287272930145264, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 1740 + }, + { + "epoch": 0.5648805681084571, + "grad_norm": 0.4961899518966675, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 1750 + }, + { + "epoch": 0.5681084570690769, + "grad_norm": 0.4468635320663452, + "learning_rate": 0.0002, + "loss": 0.8029, + "step": 1760 + }, + { + "epoch": 0.5713363460296966, + "grad_norm": 0.6423530578613281, + "learning_rate": 0.0002, + "loss": 0.7968, + "step": 1770 + }, + { + "epoch": 0.5745642349903163, + "grad_norm": 0.4601971507072449, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 1780 + }, + { + "epoch": 0.577792123950936, + "grad_norm": 0.46514901518821716, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 1790 + }, + { + "epoch": 0.5810200129115558, + "grad_norm": 0.4771687388420105, + "learning_rate": 0.0002, + "loss": 0.8186, + "step": 1800 + }, + { + "epoch": 0.5842479018721756, + "grad_norm": 0.46514490246772766, + "learning_rate": 0.0002, + "loss": 0.856, + "step": 1810 + }, + { + "epoch": 0.5874757908327953, + "grad_norm": 0.5373936295509338, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 1820 + }, + { + "epoch": 0.5907036797934151, + "grad_norm": 0.5175791382789612, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 1830 + }, + { + "epoch": 0.5939315687540349, + "grad_norm": 0.4522802233695984, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 1840 + }, + { + "epoch": 0.5971594577146546, + "grad_norm": 0.42987772822380066, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 1850 + }, + { + "epoch": 0.6003873466752744, + "grad_norm": 0.5566838383674622, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 1860 + }, + { + "epoch": 0.6036152356358941, + "grad_norm": 0.42807698249816895, + "learning_rate": 0.0002, + "loss": 0.8312, + "step": 1870 + }, + { + "epoch": 0.6068431245965139, + "grad_norm": 0.4957767724990845, + "learning_rate": 0.0002, + "loss": 0.8035, + "step": 1880 + }, + { + "epoch": 0.6100710135571337, + "grad_norm": 0.4260980188846588, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 1890 + }, + { + "epoch": 0.6132989025177534, + "grad_norm": 0.4777357876300812, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 1900 + }, + { + "epoch": 0.6165267914783732, + "grad_norm": 0.4434216022491455, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 1910 + }, + { + "epoch": 0.6197546804389928, + "grad_norm": 0.5215433835983276, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 1920 + }, + { + "epoch": 0.6229825693996126, + "grad_norm": 0.5143248438835144, + "learning_rate": 0.0002, + "loss": 0.82, + "step": 1930 + }, + { + "epoch": 0.6262104583602324, + "grad_norm": 0.5213413238525391, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 1940 + }, + { + "epoch": 0.6294383473208521, + "grad_norm": 0.5408226251602173, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 1950 + }, + { + "epoch": 0.6326662362814719, + "grad_norm": 0.5479708909988403, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 1960 + }, + { + "epoch": 0.6358941252420917, + "grad_norm": 0.4490949809551239, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 1970 + }, + { + "epoch": 0.6391220142027114, + "grad_norm": 0.48815059661865234, + "learning_rate": 0.0002, + "loss": 0.854, + "step": 1980 + }, + { + "epoch": 0.6423499031633312, + "grad_norm": 0.46498045325279236, + "learning_rate": 0.0002, + "loss": 0.8568, + "step": 1990 + }, + { + "epoch": 0.6455777921239509, + "grad_norm": 0.5136561393737793, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 2000 + }, + { + "epoch": 0.6488056810845707, + "grad_norm": 0.5145719647407532, + "learning_rate": 0.0002, + "loss": 0.8503, + "step": 2010 + }, + { + "epoch": 0.6520335700451905, + "grad_norm": 0.5430373549461365, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 2020 + }, + { + "epoch": 0.6552614590058102, + "grad_norm": 0.46347954869270325, + "learning_rate": 0.0002, + "loss": 0.8115, + "step": 2030 + }, + { + "epoch": 0.65848934796643, + "grad_norm": 0.5189562439918518, + "learning_rate": 0.0002, + "loss": 0.8769, + "step": 2040 + }, + { + "epoch": 0.6617172369270498, + "grad_norm": 0.43843990564346313, + "learning_rate": 0.0002, + "loss": 0.8453, + "step": 2050 + }, + { + "epoch": 0.6649451258876695, + "grad_norm": 0.4654983580112457, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 2060 + }, + { + "epoch": 0.6681730148482892, + "grad_norm": 0.44835716485977173, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 2070 + }, + { + "epoch": 0.6714009038089089, + "grad_norm": 0.38811734318733215, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2080 + }, + { + "epoch": 0.6746287927695287, + "grad_norm": 0.5709853172302246, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 2090 + }, + { + "epoch": 0.6778566817301485, + "grad_norm": 0.49994757771492004, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 2100 + }, + { + "epoch": 0.6810845706907682, + "grad_norm": 0.5505402684211731, + "learning_rate": 0.0002, + "loss": 0.8, + "step": 2110 + }, + { + "epoch": 0.684312459651388, + "grad_norm": 0.48195120692253113, + "learning_rate": 0.0002, + "loss": 0.8227, + "step": 2120 + }, + { + "epoch": 0.6875403486120077, + "grad_norm": 0.4854775071144104, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 2130 + }, + { + "epoch": 0.6907682375726275, + "grad_norm": 0.6422494649887085, + "learning_rate": 0.0002, + "loss": 0.8231, + "step": 2140 + }, + { + "epoch": 0.6939961265332473, + "grad_norm": 0.3972536027431488, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 2150 + }, + { + "epoch": 0.697224015493867, + "grad_norm": 0.4297836422920227, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 2160 + }, + { + "epoch": 0.7004519044544868, + "grad_norm": 0.45486778020858765, + "learning_rate": 0.0002, + "loss": 0.8017, + "step": 2170 + }, + { + "epoch": 0.7036797934151066, + "grad_norm": 0.4706047773361206, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 2180 + }, + { + "epoch": 0.7069076823757263, + "grad_norm": 0.46426892280578613, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 2190 + }, + { + "epoch": 0.7101355713363461, + "grad_norm": 0.46333715319633484, + "learning_rate": 0.0002, + "loss": 0.8472, + "step": 2200 + }, + { + "epoch": 0.7133634602969657, + "grad_norm": 0.4632524251937866, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 2210 + }, + { + "epoch": 0.7165913492575855, + "grad_norm": 0.4610830843448639, + "learning_rate": 0.0002, + "loss": 0.8452, + "step": 2220 + }, + { + "epoch": 0.7198192382182053, + "grad_norm": 0.4905324876308441, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 2230 + }, + { + "epoch": 0.723047127178825, + "grad_norm": 0.4936263859272003, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 2240 + }, + { + "epoch": 0.7262750161394448, + "grad_norm": 0.40778425335884094, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 2250 + }, + { + "epoch": 0.7295029051000645, + "grad_norm": 0.50351482629776, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 2260 + }, + { + "epoch": 0.7327307940606843, + "grad_norm": 0.4894128143787384, + "learning_rate": 0.0002, + "loss": 0.8475, + "step": 2270 + }, + { + "epoch": 0.7359586830213041, + "grad_norm": 0.5580906271934509, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 2280 + }, + { + "epoch": 0.7391865719819238, + "grad_norm": 0.4655369520187378, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 2290 + }, + { + "epoch": 0.7424144609425436, + "grad_norm": 0.4666965901851654, + "learning_rate": 0.0002, + "loss": 0.8395, + "step": 2300 + }, + { + "epoch": 0.7456423499031634, + "grad_norm": 0.46259936690330505, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 2310 + }, + { + "epoch": 0.7488702388637831, + "grad_norm": 0.520706832408905, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 2320 + }, + { + "epoch": 0.7520981278244029, + "grad_norm": 0.5142408013343811, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 2330 + }, + { + "epoch": 0.7553260167850226, + "grad_norm": 0.5355164408683777, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 2340 + }, + { + "epoch": 0.7585539057456423, + "grad_norm": 0.5517185926437378, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 2350 + }, + { + "epoch": 0.7617817947062621, + "grad_norm": 0.7162677049636841, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 2360 + }, + { + "epoch": 0.7650096836668818, + "grad_norm": 0.42402133345603943, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 2370 + }, + { + "epoch": 0.7682375726275016, + "grad_norm": 0.47180113196372986, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 2380 + }, + { + "epoch": 0.7714654615881213, + "grad_norm": 0.6262288689613342, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 2390 + }, + { + "epoch": 0.7746933505487411, + "grad_norm": 0.5177528262138367, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 2400 + }, + { + "epoch": 0.7779212395093609, + "grad_norm": 0.555721640586853, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 2410 + }, + { + "epoch": 0.7811491284699806, + "grad_norm": 0.5592644810676575, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 2420 + }, + { + "epoch": 0.7843770174306004, + "grad_norm": 0.38025397062301636, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 2430 + }, + { + "epoch": 0.7876049063912202, + "grad_norm": 0.4597472548484802, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 2440 + }, + { + "epoch": 0.7908327953518399, + "grad_norm": 0.4929825961589813, + "learning_rate": 0.0002, + "loss": 0.8575, + "step": 2450 + }, + { + "epoch": 0.7940606843124597, + "grad_norm": 0.45277655124664307, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 2460 + }, + { + "epoch": 0.7972885732730794, + "grad_norm": 0.6224122643470764, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2470 + }, + { + "epoch": 0.8005164622336992, + "grad_norm": 0.5740901827812195, + "learning_rate": 0.0002, + "loss": 0.8449, + "step": 2480 + }, + { + "epoch": 0.8037443511943189, + "grad_norm": 0.41335329413414, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 2490 + }, + { + "epoch": 0.8069722401549386, + "grad_norm": 0.4738694131374359, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 2500 + }, + { + "epoch": 0.8102001291155584, + "grad_norm": 0.5288197994232178, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 2510 + }, + { + "epoch": 0.8134280180761781, + "grad_norm": 0.5404666066169739, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 2520 + }, + { + "epoch": 0.8166559070367979, + "grad_norm": 0.4444909691810608, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 2530 + }, + { + "epoch": 0.8198837959974177, + "grad_norm": 0.542061448097229, + "learning_rate": 0.0002, + "loss": 0.8683, + "step": 2540 + }, + { + "epoch": 0.8231116849580374, + "grad_norm": 0.4914741814136505, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 2550 + }, + { + "epoch": 0.8263395739186572, + "grad_norm": 0.41703441739082336, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 2560 + }, + { + "epoch": 0.829567462879277, + "grad_norm": 0.5489841103553772, + "learning_rate": 0.0002, + "loss": 0.824, + "step": 2570 + }, + { + "epoch": 0.8327953518398967, + "grad_norm": 0.5359883308410645, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 2580 + }, + { + "epoch": 0.8360232408005165, + "grad_norm": 0.5541019439697266, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 2590 + }, + { + "epoch": 0.8392511297611362, + "grad_norm": 0.4746638834476471, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 2600 + }, + { + "epoch": 0.842479018721756, + "grad_norm": 0.5243194103240967, + "learning_rate": 0.0002, + "loss": 0.8116, + "step": 2610 + }, + { + "epoch": 0.8457069076823758, + "grad_norm": 0.46824976801872253, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 2620 + }, + { + "epoch": 0.8489347966429954, + "grad_norm": 0.49487847089767456, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 2630 + }, + { + "epoch": 0.8521626856036152, + "grad_norm": 0.42180097103118896, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 2640 + }, + { + "epoch": 0.855390574564235, + "grad_norm": 0.5516560077667236, + "learning_rate": 0.0002, + "loss": 0.8304, + "step": 2650 + }, + { + "epoch": 0.8586184635248547, + "grad_norm": 0.4392191767692566, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 2660 + }, + { + "epoch": 0.8618463524854745, + "grad_norm": 0.5387210845947266, + "learning_rate": 0.0002, + "loss": 0.848, + "step": 2670 + }, + { + "epoch": 0.8650742414460942, + "grad_norm": 0.6232406497001648, + "learning_rate": 0.0002, + "loss": 0.8094, + "step": 2680 + }, + { + "epoch": 0.868302130406714, + "grad_norm": 0.53749018907547, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 2690 + }, + { + "epoch": 0.8715300193673338, + "grad_norm": 0.47480374574661255, + "learning_rate": 0.0002, + "loss": 0.8299, + "step": 2700 + }, + { + "epoch": 0.8747579083279535, + "grad_norm": 0.44618046283721924, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 2710 + }, + { + "epoch": 0.8779857972885733, + "grad_norm": 0.4173581302165985, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 2720 + }, + { + "epoch": 0.881213686249193, + "grad_norm": 0.524081289768219, + "learning_rate": 0.0002, + "loss": 0.7713, + "step": 2730 + }, + { + "epoch": 0.8844415752098128, + "grad_norm": 0.5608431100845337, + "learning_rate": 0.0002, + "loss": 0.8738, + "step": 2740 + }, + { + "epoch": 0.8876694641704326, + "grad_norm": 0.5212284922599792, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 2750 + }, + { + "epoch": 0.8908973531310523, + "grad_norm": 0.5601475834846497, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 2760 + }, + { + "epoch": 0.8941252420916721, + "grad_norm": 0.4499223828315735, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 2770 + }, + { + "epoch": 0.8973531310522918, + "grad_norm": 0.46945226192474365, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 2780 + }, + { + "epoch": 0.9005810200129115, + "grad_norm": 0.4837495684623718, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 2790 + }, + { + "epoch": 0.9038089089735313, + "grad_norm": 0.5059258937835693, + "learning_rate": 0.0002, + "loss": 0.7887, + "step": 2800 + }, + { + "epoch": 0.907036797934151, + "grad_norm": 0.4857945144176483, + "learning_rate": 0.0002, + "loss": 0.8571, + "step": 2810 + }, + { + "epoch": 0.9102646868947708, + "grad_norm": 0.5001962780952454, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 2820 + }, + { + "epoch": 0.9134925758553906, + "grad_norm": 0.5468648672103882, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 2830 + }, + { + "epoch": 0.9167204648160103, + "grad_norm": 0.5533056259155273, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 2840 + }, + { + "epoch": 0.9199483537766301, + "grad_norm": 0.5909785628318787, + "learning_rate": 0.0002, + "loss": 0.7895, + "step": 2850 + }, + { + "epoch": 0.9231762427372499, + "grad_norm": 0.47428104281425476, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 2860 + }, + { + "epoch": 0.9264041316978696, + "grad_norm": 0.548814058303833, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2870 + }, + { + "epoch": 0.9296320206584894, + "grad_norm": 0.5576745271682739, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 2880 + }, + { + "epoch": 0.9328599096191091, + "grad_norm": 0.47094792127609253, + "learning_rate": 0.0002, + "loss": 0.8399, + "step": 2890 + }, + { + "epoch": 0.9360877985797289, + "grad_norm": 0.5408539772033691, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 2900 + }, + { + "epoch": 0.9393156875403487, + "grad_norm": 0.5922889113426208, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 2910 + }, + { + "epoch": 0.9425435765009683, + "grad_norm": 0.45462584495544434, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2920 + }, + { + "epoch": 0.9457714654615881, + "grad_norm": 0.6864947080612183, + "learning_rate": 0.0002, + "loss": 0.8344, + "step": 2930 + }, + { + "epoch": 0.9489993544222078, + "grad_norm": 0.4706299304962158, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 2940 + }, + { + "epoch": 0.9522272433828276, + "grad_norm": 0.5583269596099854, + "learning_rate": 0.0002, + "loss": 0.8422, + "step": 2950 + }, + { + "epoch": 0.9554551323434474, + "grad_norm": 0.51015704870224, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 2960 + }, + { + "epoch": 0.9586830213040671, + "grad_norm": 0.5325582027435303, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 2970 + }, + { + "epoch": 0.9619109102646869, + "grad_norm": 0.49008598923683167, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 2980 + }, + { + "epoch": 0.9651387992253067, + "grad_norm": 0.4422132074832916, + "learning_rate": 0.0002, + "loss": 0.8093, + "step": 2990 + }, + { + "epoch": 0.9683666881859264, + "grad_norm": 0.5053589344024658, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 3000 + }, + { + "epoch": 0.9715945771465462, + "grad_norm": 0.46754521131515503, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 3010 + }, + { + "epoch": 0.9748224661071659, + "grad_norm": 0.5613434910774231, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 3020 + }, + { + "epoch": 0.9780503550677857, + "grad_norm": 0.5052843689918518, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 3030 + }, + { + "epoch": 0.9812782440284055, + "grad_norm": 0.4270972013473511, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 3040 + }, + { + "epoch": 0.9845061329890252, + "grad_norm": 0.4974991977214813, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 3050 + }, + { + "epoch": 0.9877340219496449, + "grad_norm": 0.4432311952114105, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 3060 + }, + { + "epoch": 0.9909619109102646, + "grad_norm": 0.466457724571228, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 3070 + }, + { + "epoch": 0.9941897998708844, + "grad_norm": 0.6438009142875671, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 3080 + }, + { + "epoch": 0.9974176888315042, + "grad_norm": 0.5593604445457458, + "learning_rate": 0.0002, + "loss": 0.8425, + "step": 3090 + }, + { + "epoch": 1.0, + "eval_loss": 1.0958120822906494, + "eval_runtime": 148.3273, + "eval_samples_per_second": 4.942, + "eval_steps_per_second": 0.62, + "step": 3098 + }, + { + "epoch": 1.000645577792124, + "grad_norm": 0.5701445937156677, + "learning_rate": 0.0002, + "loss": 0.8275, + "step": 3100 + }, + { + "epoch": 1.0038734667527438, + "grad_norm": 0.6089657545089722, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 3110 + }, + { + "epoch": 1.0071013557133635, + "grad_norm": 0.5619552135467529, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 3120 + }, + { + "epoch": 1.010329244673983, + "grad_norm": 0.5550283789634705, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 3130 + }, + { + "epoch": 1.013557133634603, + "grad_norm": 0.6221792101860046, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3140 + }, + { + "epoch": 1.0167850225952226, + "grad_norm": 0.5450758934020996, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 3150 + }, + { + "epoch": 1.0200129115558425, + "grad_norm": 0.4359588027000427, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 3160 + }, + { + "epoch": 1.0232408005164622, + "grad_norm": 0.5932239890098572, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 3170 + }, + { + "epoch": 1.026468689477082, + "grad_norm": 0.45478707551956177, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 3180 + }, + { + "epoch": 1.0296965784377017, + "grad_norm": 0.677615761756897, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 3190 + }, + { + "epoch": 1.0329244673983216, + "grad_norm": 0.6231790781021118, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3200 + }, + { + "epoch": 1.0361523563589412, + "grad_norm": 0.5074195861816406, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 3210 + }, + { + "epoch": 1.039380245319561, + "grad_norm": 0.4844142198562622, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 3220 + }, + { + "epoch": 1.0426081342801807, + "grad_norm": 0.5372750759124756, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 3230 + }, + { + "epoch": 1.0458360232408006, + "grad_norm": 0.46296265721321106, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 3240 + }, + { + "epoch": 1.0490639122014203, + "grad_norm": 0.5417148470878601, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 3250 + }, + { + "epoch": 1.0522918011620401, + "grad_norm": 0.5695074200630188, + "learning_rate": 0.0002, + "loss": 0.7637, + "step": 3260 + }, + { + "epoch": 1.0555196901226598, + "grad_norm": 0.5050092935562134, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 3270 + }, + { + "epoch": 1.0587475790832794, + "grad_norm": 0.5320752263069153, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 3280 + }, + { + "epoch": 1.0619754680438993, + "grad_norm": 0.5832052230834961, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 3290 + }, + { + "epoch": 1.065203357004519, + "grad_norm": 0.5228804349899292, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 3300 + }, + { + "epoch": 1.0684312459651388, + "grad_norm": 0.5819445252418518, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 3310 + }, + { + "epoch": 1.0716591349257585, + "grad_norm": 0.4201328754425049, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 3320 + }, + { + "epoch": 1.0748870238863784, + "grad_norm": 0.5424145460128784, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 3330 + }, + { + "epoch": 1.078114912846998, + "grad_norm": 0.6169946789741516, + "learning_rate": 0.0002, + "loss": 0.7828, + "step": 3340 + }, + { + "epoch": 1.0813428018076179, + "grad_norm": 0.607676088809967, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 3350 + }, + { + "epoch": 1.0845706907682375, + "grad_norm": 0.5191982388496399, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 3360 + }, + { + "epoch": 1.0877985797288574, + "grad_norm": 0.5728003978729248, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 3370 + }, + { + "epoch": 1.091026468689477, + "grad_norm": 0.5402643084526062, + "learning_rate": 0.0002, + "loss": 0.7381, + "step": 3380 + }, + { + "epoch": 1.094254357650097, + "grad_norm": 0.5377541780471802, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 3390 + }, + { + "epoch": 1.0974822466107166, + "grad_norm": 0.4751385748386383, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 3400 + }, + { + "epoch": 1.1007101355713362, + "grad_norm": 0.559158444404602, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 3410 + }, + { + "epoch": 1.103938024531956, + "grad_norm": 0.4917701482772827, + "learning_rate": 0.0002, + "loss": 0.7366, + "step": 3420 + }, + { + "epoch": 1.1071659134925758, + "grad_norm": 0.5507875084877014, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 3430 + }, + { + "epoch": 1.1103938024531956, + "grad_norm": 0.45458680391311646, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 3440 + }, + { + "epoch": 1.1136216914138153, + "grad_norm": 0.5721744894981384, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 3450 + }, + { + "epoch": 1.1168495803744352, + "grad_norm": 0.5776081681251526, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 3460 + }, + { + "epoch": 1.1200774693350548, + "grad_norm": 0.5261953473091125, + "learning_rate": 0.0002, + "loss": 0.7644, + "step": 3470 + }, + { + "epoch": 1.1233053582956747, + "grad_norm": 0.47759532928466797, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 3480 + }, + { + "epoch": 1.1265332472562943, + "grad_norm": 0.5697659850120544, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 3490 + }, + { + "epoch": 1.1297611362169142, + "grad_norm": 0.5643419623374939, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 3500 + }, + { + "epoch": 1.1329890251775339, + "grad_norm": 0.6502931118011475, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 3510 + }, + { + "epoch": 1.1362169141381537, + "grad_norm": 0.5236507654190063, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 3520 + }, + { + "epoch": 1.1394448030987734, + "grad_norm": 0.6521499156951904, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 3530 + }, + { + "epoch": 1.142672692059393, + "grad_norm": 0.5893217325210571, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 3540 + }, + { + "epoch": 1.145900581020013, + "grad_norm": 0.5300073027610779, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 3550 + }, + { + "epoch": 1.1491284699806328, + "grad_norm": 0.6794660091400146, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 3560 + }, + { + "epoch": 1.1523563589412524, + "grad_norm": 0.5420064926147461, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 3570 + }, + { + "epoch": 1.155584247901872, + "grad_norm": 0.5096590518951416, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 3580 + }, + { + "epoch": 1.158812136862492, + "grad_norm": 0.5726043581962585, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 3590 + }, + { + "epoch": 1.1620400258231116, + "grad_norm": 0.7388110160827637, + "learning_rate": 0.0002, + "loss": 0.7728, + "step": 3600 + }, + { + "epoch": 1.1652679147837315, + "grad_norm": 0.5597969889640808, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 3610 + }, + { + "epoch": 1.1684958037443511, + "grad_norm": 0.5067800283432007, + "learning_rate": 0.0002, + "loss": 0.7132, + "step": 3620 + }, + { + "epoch": 1.171723692704971, + "grad_norm": 0.6625118255615234, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 3630 + }, + { + "epoch": 1.1749515816655907, + "grad_norm": 0.5830849409103394, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 3640 + }, + { + "epoch": 1.1781794706262105, + "grad_norm": 0.6140692830085754, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 3650 + }, + { + "epoch": 1.1814073595868302, + "grad_norm": 0.714523434638977, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 3660 + }, + { + "epoch": 1.18463524854745, + "grad_norm": 0.5196696519851685, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 3670 + }, + { + "epoch": 1.1878631375080697, + "grad_norm": 0.6677889823913574, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 3680 + }, + { + "epoch": 1.1910910264686896, + "grad_norm": 0.47095245122909546, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 3690 + }, + { + "epoch": 1.1943189154293092, + "grad_norm": 0.5197778940200806, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 3700 + }, + { + "epoch": 1.1975468043899289, + "grad_norm": 0.5156530141830444, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 3710 + }, + { + "epoch": 1.2007746933505488, + "grad_norm": 0.6968549489974976, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 3720 + }, + { + "epoch": 1.2040025823111684, + "grad_norm": 0.48983848094940186, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 3730 + }, + { + "epoch": 1.2072304712717883, + "grad_norm": 0.6709973216056824, + "learning_rate": 0.0002, + "loss": 0.7163, + "step": 3740 + }, + { + "epoch": 1.210458360232408, + "grad_norm": 0.48681750893592834, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 3750 + }, + { + "epoch": 1.2136862491930278, + "grad_norm": 0.49475061893463135, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 3760 + }, + { + "epoch": 1.2169141381536475, + "grad_norm": 0.6163983345031738, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 3770 + }, + { + "epoch": 1.2201420271142673, + "grad_norm": 0.5481411218643188, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 3780 + }, + { + "epoch": 1.223369916074887, + "grad_norm": 0.620639979839325, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 3790 + }, + { + "epoch": 1.2265978050355069, + "grad_norm": 0.7017222046852112, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 3800 + }, + { + "epoch": 1.2298256939961265, + "grad_norm": 0.5872400403022766, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 3810 + }, + { + "epoch": 1.2330535829567464, + "grad_norm": 0.45765596628189087, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 3820 + }, + { + "epoch": 1.236281471917366, + "grad_norm": 0.5676377415657043, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 3830 + }, + { + "epoch": 1.2395093608779857, + "grad_norm": 0.4793425500392914, + "learning_rate": 0.0002, + "loss": 0.7696, + "step": 3840 + }, + { + "epoch": 1.2427372498386056, + "grad_norm": 0.5060022473335266, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 3850 + }, + { + "epoch": 1.2459651387992252, + "grad_norm": 0.6140682697296143, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 3860 + }, + { + "epoch": 1.249193027759845, + "grad_norm": 0.5030326843261719, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 3870 + }, + { + "epoch": 1.2524209167204647, + "grad_norm": 0.6609430909156799, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 3880 + }, + { + "epoch": 1.2556488056810846, + "grad_norm": 0.5459545850753784, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 3890 + }, + { + "epoch": 1.2588766946417043, + "grad_norm": 0.5328870415687561, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 3900 + }, + { + "epoch": 1.2621045836023241, + "grad_norm": 0.5840652585029602, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 3910 + }, + { + "epoch": 1.2653324725629438, + "grad_norm": 0.5587584376335144, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 3920 + }, + { + "epoch": 1.2685603615235637, + "grad_norm": 0.5886949896812439, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 3930 + }, + { + "epoch": 1.2717882504841833, + "grad_norm": 0.5128693580627441, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 3940 + }, + { + "epoch": 1.2750161394448032, + "grad_norm": 0.6207669377326965, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 3950 + }, + { + "epoch": 1.2782440284054228, + "grad_norm": 0.5789574384689331, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 3960 + }, + { + "epoch": 1.2814719173660425, + "grad_norm": 0.503162145614624, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 3970 + }, + { + "epoch": 1.2846998063266624, + "grad_norm": 0.6670064926147461, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 3980 + }, + { + "epoch": 1.2879276952872822, + "grad_norm": 0.5676213502883911, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 3990 + }, + { + "epoch": 1.2911555842479019, + "grad_norm": 0.5383169054985046, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 4000 + }, + { + "epoch": 1.2943834732085215, + "grad_norm": 0.714743971824646, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 4010 + }, + { + "epoch": 1.2976113621691414, + "grad_norm": 0.5740262269973755, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 4020 + }, + { + "epoch": 1.300839251129761, + "grad_norm": 0.6143045425415039, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 4030 + }, + { + "epoch": 1.304067140090381, + "grad_norm": 0.501025378704071, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 4040 + }, + { + "epoch": 1.3072950290510006, + "grad_norm": 0.5784100294113159, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 4050 + }, + { + "epoch": 1.3105229180116205, + "grad_norm": 0.6182606220245361, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 4060 + }, + { + "epoch": 1.3137508069722401, + "grad_norm": 0.5072231292724609, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 4070 + }, + { + "epoch": 1.31697869593286, + "grad_norm": 0.6841012835502625, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 4080 + }, + { + "epoch": 1.3202065848934796, + "grad_norm": 0.697257936000824, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 4090 + }, + { + "epoch": 1.3234344738540993, + "grad_norm": 0.5113214254379272, + "learning_rate": 0.0002, + "loss": 0.7401, + "step": 4100 + }, + { + "epoch": 1.3266623628147192, + "grad_norm": 0.6270561814308167, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 4110 + }, + { + "epoch": 1.329890251775339, + "grad_norm": 0.5525947213172913, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 4120 + }, + { + "epoch": 1.3331181407359587, + "grad_norm": 0.546071469783783, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 4130 + }, + { + "epoch": 1.3363460296965783, + "grad_norm": 0.6516721248626709, + "learning_rate": 0.0002, + "loss": 0.7884, + "step": 4140 + }, + { + "epoch": 1.3395739186571982, + "grad_norm": 0.6235111355781555, + "learning_rate": 0.0002, + "loss": 0.755, + "step": 4150 + }, + { + "epoch": 1.3428018076178179, + "grad_norm": 0.538649320602417, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 4160 + }, + { + "epoch": 1.3460296965784377, + "grad_norm": 0.5367001891136169, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 4170 + }, + { + "epoch": 1.3492575855390574, + "grad_norm": 0.6134631037712097, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 4180 + }, + { + "epoch": 1.3524854744996773, + "grad_norm": 0.5827262997627258, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 4190 + }, + { + "epoch": 1.355713363460297, + "grad_norm": 0.5706096291542053, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 4200 + }, + { + "epoch": 1.3589412524209168, + "grad_norm": 0.6422057151794434, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 4210 + }, + { + "epoch": 1.3621691413815364, + "grad_norm": 0.6316141486167908, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 4220 + }, + { + "epoch": 1.365397030342156, + "grad_norm": 0.6946983933448792, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 4230 + }, + { + "epoch": 1.368624919302776, + "grad_norm": 0.5381525754928589, + "learning_rate": 0.0002, + "loss": 0.7388, + "step": 4240 + }, + { + "epoch": 1.3718528082633958, + "grad_norm": 0.5484845638275146, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 4250 + }, + { + "epoch": 1.3750806972240155, + "grad_norm": 0.5961896777153015, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 4260 + }, + { + "epoch": 1.3783085861846351, + "grad_norm": 0.6041752696037292, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 4270 + }, + { + "epoch": 1.381536475145255, + "grad_norm": 0.6283464431762695, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 4280 + }, + { + "epoch": 1.384764364105875, + "grad_norm": 0.6761324405670166, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 4290 + }, + { + "epoch": 1.3879922530664945, + "grad_norm": 0.504311203956604, + "learning_rate": 0.0002, + "loss": 0.7381, + "step": 4300 + }, + { + "epoch": 1.3912201420271142, + "grad_norm": 0.6100395917892456, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 4310 + }, + { + "epoch": 1.394448030987734, + "grad_norm": 0.6245788335800171, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 4320 + }, + { + "epoch": 1.3976759199483537, + "grad_norm": 0.6074621081352234, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 4330 + }, + { + "epoch": 1.4009038089089736, + "grad_norm": 0.6683838963508606, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 4340 + }, + { + "epoch": 1.4041316978695932, + "grad_norm": 0.622998058795929, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 4350 + }, + { + "epoch": 1.4073595868302131, + "grad_norm": 0.6089423894882202, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 4360 + }, + { + "epoch": 1.4105874757908328, + "grad_norm": 0.6381658911705017, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 4370 + }, + { + "epoch": 1.4138153647514526, + "grad_norm": 0.5419308543205261, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4380 + }, + { + "epoch": 1.4170432537120723, + "grad_norm": 0.6026232242584229, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 4390 + }, + { + "epoch": 1.420271142672692, + "grad_norm": 0.4911101162433624, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 4400 + }, + { + "epoch": 1.4234990316333118, + "grad_norm": 0.6302908062934875, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 4410 + }, + { + "epoch": 1.4267269205939317, + "grad_norm": 0.6692768931388855, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 4420 + }, + { + "epoch": 1.4299548095545513, + "grad_norm": 0.46294572949409485, + "learning_rate": 0.0002, + "loss": 0.7312, + "step": 4430 + }, + { + "epoch": 1.433182698515171, + "grad_norm": 0.5452619194984436, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 4440 + }, + { + "epoch": 1.4364105874757909, + "grad_norm": 0.7809233069419861, + "learning_rate": 0.0002, + "loss": 0.7974, + "step": 4450 + }, + { + "epoch": 1.4396384764364105, + "grad_norm": 0.550088107585907, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 4460 + }, + { + "epoch": 1.4428663653970304, + "grad_norm": 0.7139151096343994, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 4470 + }, + { + "epoch": 1.44609425435765, + "grad_norm": 0.6187090873718262, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 4480 + }, + { + "epoch": 1.44932214331827, + "grad_norm": 0.5948249101638794, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 4490 + }, + { + "epoch": 1.4525500322788896, + "grad_norm": 0.6510892510414124, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 4500 + }, + { + "epoch": 1.4557779212395094, + "grad_norm": 0.6552293300628662, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 4510 + }, + { + "epoch": 1.459005810200129, + "grad_norm": 0.585574209690094, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 4520 + }, + { + "epoch": 1.4622336991607487, + "grad_norm": 0.4830162823200226, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 4530 + }, + { + "epoch": 1.4654615881213686, + "grad_norm": 0.5780223608016968, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 4540 + }, + { + "epoch": 1.4686894770819885, + "grad_norm": 0.5462607145309448, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 4550 + }, + { + "epoch": 1.4719173660426081, + "grad_norm": 0.5183546543121338, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 4560 + }, + { + "epoch": 1.4751452550032278, + "grad_norm": 0.676917552947998, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 4570 + }, + { + "epoch": 1.4783731439638477, + "grad_norm": 0.5772345066070557, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 4580 + }, + { + "epoch": 1.4816010329244673, + "grad_norm": 0.7320035696029663, + "learning_rate": 0.0002, + "loss": 0.7709, + "step": 4590 + }, + { + "epoch": 1.4848289218850872, + "grad_norm": 0.5024042129516602, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 4600 + }, + { + "epoch": 1.4880568108457068, + "grad_norm": 0.5482868552207947, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 4610 + }, + { + "epoch": 1.4912846998063267, + "grad_norm": 0.5447399616241455, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 4620 + }, + { + "epoch": 1.4945125887669464, + "grad_norm": 0.5953414440155029, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 4630 + }, + { + "epoch": 1.4977404777275662, + "grad_norm": 0.6983066201210022, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 4640 + }, + { + "epoch": 1.500968366688186, + "grad_norm": 0.586327075958252, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 4650 + }, + { + "epoch": 1.5041962556488055, + "grad_norm": 0.5839682221412659, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 4660 + }, + { + "epoch": 1.5074241446094254, + "grad_norm": 0.5959209203720093, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 4670 + }, + { + "epoch": 1.5106520335700453, + "grad_norm": 0.5073857307434082, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 4680 + }, + { + "epoch": 1.513879922530665, + "grad_norm": 0.5183001160621643, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 4690 + }, + { + "epoch": 1.5171078114912846, + "grad_norm": 0.593530535697937, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 4700 + }, + { + "epoch": 1.5203357004519045, + "grad_norm": 0.675993025302887, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 4710 + }, + { + "epoch": 1.5235635894125243, + "grad_norm": 0.5823286771774292, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 4720 + }, + { + "epoch": 1.526791478373144, + "grad_norm": 0.5825035572052002, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 4730 + }, + { + "epoch": 1.5300193673337636, + "grad_norm": 0.5689691305160522, + "learning_rate": 0.0002, + "loss": 0.8287, + "step": 4740 + }, + { + "epoch": 1.5332472562943835, + "grad_norm": 0.6037150621414185, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 4750 + }, + { + "epoch": 1.5364751452550034, + "grad_norm": 0.6393677592277527, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 4760 + }, + { + "epoch": 1.539703034215623, + "grad_norm": 0.5926381945610046, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 4770 + }, + { + "epoch": 1.5429309231762427, + "grad_norm": 0.9468599557876587, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 4780 + }, + { + "epoch": 1.5461588121368623, + "grad_norm": 0.7544237375259399, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 4790 + }, + { + "epoch": 1.5493867010974822, + "grad_norm": 0.5308566093444824, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 4800 + }, + { + "epoch": 1.552614590058102, + "grad_norm": 0.6590296030044556, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 4810 + }, + { + "epoch": 1.5558424790187217, + "grad_norm": 0.5630404353141785, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 4820 + }, + { + "epoch": 1.5590703679793414, + "grad_norm": 0.6800200939178467, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 4830 + }, + { + "epoch": 1.5622982569399613, + "grad_norm": 0.5463718175888062, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 4840 + }, + { + "epoch": 1.5655261459005811, + "grad_norm": 0.505135178565979, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 4850 + }, + { + "epoch": 1.5687540348612008, + "grad_norm": 0.5469676852226257, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 4860 + }, + { + "epoch": 1.5719819238218204, + "grad_norm": 0.5318337678909302, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 4870 + }, + { + "epoch": 1.5752098127824403, + "grad_norm": 0.7287914752960205, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 4880 + }, + { + "epoch": 1.5784377017430602, + "grad_norm": 0.7318989038467407, + "learning_rate": 0.0002, + "loss": 0.7532, + "step": 4890 + }, + { + "epoch": 1.5816655907036798, + "grad_norm": 0.6499921679496765, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 4900 + }, + { + "epoch": 1.5848934796642995, + "grad_norm": 0.47907355427742004, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 4910 + }, + { + "epoch": 1.5881213686249191, + "grad_norm": 0.7338833808898926, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 4920 + }, + { + "epoch": 1.591349257585539, + "grad_norm": 0.5800719261169434, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 4930 + }, + { + "epoch": 1.594577146546159, + "grad_norm": 0.5365763306617737, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 4940 + }, + { + "epoch": 1.5978050355067785, + "grad_norm": 0.5800772309303284, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 4950 + }, + { + "epoch": 1.6010329244673982, + "grad_norm": 0.7878010869026184, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 4960 + }, + { + "epoch": 1.604260813428018, + "grad_norm": 0.5919058918952942, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 4970 + }, + { + "epoch": 1.607488702388638, + "grad_norm": 0.5004435181617737, + "learning_rate": 0.0002, + "loss": 0.7762, + "step": 4980 + }, + { + "epoch": 1.6107165913492576, + "grad_norm": 0.6299242377281189, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 4990 + }, + { + "epoch": 1.6139444803098772, + "grad_norm": 0.6307242512702942, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 5000 + }, + { + "epoch": 1.6171723692704971, + "grad_norm": 0.7838703989982605, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 5010 + }, + { + "epoch": 1.620400258231117, + "grad_norm": 0.6454671621322632, + "learning_rate": 0.0002, + "loss": 0.7364, + "step": 5020 + }, + { + "epoch": 1.6236281471917366, + "grad_norm": 0.5907095670700073, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 5030 + }, + { + "epoch": 1.6268560361523563, + "grad_norm": 0.6053501963615417, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 5040 + }, + { + "epoch": 1.630083925112976, + "grad_norm": 0.5644670128822327, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 5050 + }, + { + "epoch": 1.6333118140735958, + "grad_norm": 0.6320949792861938, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 5060 + }, + { + "epoch": 1.6365397030342157, + "grad_norm": 0.6101489067077637, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 5070 + }, + { + "epoch": 1.6397675919948353, + "grad_norm": 0.9435283541679382, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 5080 + }, + { + "epoch": 1.642995480955455, + "grad_norm": 0.6668919324874878, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 5090 + }, + { + "epoch": 1.6462233699160749, + "grad_norm": 0.6160340905189514, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 5100 + }, + { + "epoch": 1.6494512588766947, + "grad_norm": 0.5999835729598999, + "learning_rate": 0.0002, + "loss": 0.7461, + "step": 5110 + }, + { + "epoch": 1.6526791478373144, + "grad_norm": 0.9378551840782166, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 5120 + }, + { + "epoch": 1.655907036797934, + "grad_norm": 0.4795055389404297, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 5130 + }, + { + "epoch": 1.659134925758554, + "grad_norm": 0.4878861606121063, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 5140 + }, + { + "epoch": 1.6623628147191738, + "grad_norm": 0.6042965054512024, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 5150 + }, + { + "epoch": 1.6655907036797934, + "grad_norm": 0.5829901695251465, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 5160 + }, + { + "epoch": 1.668818592640413, + "grad_norm": 0.5168480277061462, + "learning_rate": 0.0002, + "loss": 0.7498, + "step": 5170 + }, + { + "epoch": 1.672046481601033, + "grad_norm": 0.6489511132240295, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 5180 + }, + { + "epoch": 1.6752743705616526, + "grad_norm": 0.5955966114997864, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 5190 + }, + { + "epoch": 1.6785022595222725, + "grad_norm": 0.6228088140487671, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 5200 + }, + { + "epoch": 1.6817301484828922, + "grad_norm": 0.5726390480995178, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 5210 + }, + { + "epoch": 1.6849580374435118, + "grad_norm": 0.6116343140602112, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 5220 + }, + { + "epoch": 1.6881859264041317, + "grad_norm": 0.5483687520027161, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 5230 + }, + { + "epoch": 1.6914138153647515, + "grad_norm": 0.570941686630249, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 5240 + }, + { + "epoch": 1.6946417043253712, + "grad_norm": 0.6048086285591125, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 5250 + }, + { + "epoch": 1.6978695932859909, + "grad_norm": 0.6769003868103027, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 5260 + }, + { + "epoch": 1.7010974822466107, + "grad_norm": 0.5629057884216309, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 5270 + }, + { + "epoch": 1.7043253712072306, + "grad_norm": 0.657341480255127, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 5280 + }, + { + "epoch": 1.7075532601678503, + "grad_norm": 0.6256147623062134, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 5290 + }, + { + "epoch": 1.71078114912847, + "grad_norm": 0.5498088002204895, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 5300 + }, + { + "epoch": 1.7140090380890898, + "grad_norm": 0.5078358054161072, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 5310 + }, + { + "epoch": 1.7172369270497096, + "grad_norm": 0.6696692705154419, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 5320 + }, + { + "epoch": 1.7204648160103293, + "grad_norm": 0.6692847013473511, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 5330 + }, + { + "epoch": 1.723692704970949, + "grad_norm": 0.5415751934051514, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 5340 + }, + { + "epoch": 1.7269205939315686, + "grad_norm": 0.5367611050605774, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 5350 + }, + { + "epoch": 1.7301484828921885, + "grad_norm": 0.7321061491966248, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 5360 + }, + { + "epoch": 1.7333763718528084, + "grad_norm": 0.723972499370575, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5370 + }, + { + "epoch": 1.736604260813428, + "grad_norm": 0.7328100204467773, + "learning_rate": 0.0002, + "loss": 0.7077, + "step": 5380 + }, + { + "epoch": 1.7398321497740477, + "grad_norm": 0.5785264372825623, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 5390 + }, + { + "epoch": 1.7430600387346675, + "grad_norm": 0.7812932133674622, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 5400 + }, + { + "epoch": 1.7462879276952874, + "grad_norm": 0.6493327617645264, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 5410 + }, + { + "epoch": 1.749515816655907, + "grad_norm": 0.5825939774513245, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 5420 + }, + { + "epoch": 1.7527437056165267, + "grad_norm": 0.6969610452651978, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 5430 + }, + { + "epoch": 1.7559715945771466, + "grad_norm": 0.5558062195777893, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 5440 + }, + { + "epoch": 1.7591994835377665, + "grad_norm": 0.49222221970558167, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 5450 + }, + { + "epoch": 1.762427372498386, + "grad_norm": 0.5844656825065613, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 5460 + }, + { + "epoch": 1.7656552614590058, + "grad_norm": 0.8706597685813904, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 5470 + }, + { + "epoch": 1.7688831504196254, + "grad_norm": 0.6167706251144409, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 5480 + }, + { + "epoch": 1.7721110393802453, + "grad_norm": 0.5890011787414551, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 5490 + }, + { + "epoch": 1.7753389283408652, + "grad_norm": 0.6551728248596191, + "learning_rate": 0.0002, + "loss": 0.8319, + "step": 5500 + }, + { + "epoch": 1.7785668173014848, + "grad_norm": 0.5848751068115234, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 5510 + }, + { + "epoch": 1.7817947062621045, + "grad_norm": 0.6664014458656311, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 5520 + }, + { + "epoch": 1.7850225952227243, + "grad_norm": 0.5931693911552429, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 5530 + }, + { + "epoch": 1.7882504841833442, + "grad_norm": 0.5534724593162537, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 5540 + }, + { + "epoch": 1.7914783731439639, + "grad_norm": 0.5590878129005432, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 5550 + }, + { + "epoch": 1.7947062621045835, + "grad_norm": 0.6947470903396606, + "learning_rate": 0.0002, + "loss": 0.7406, + "step": 5560 + }, + { + "epoch": 1.7979341510652034, + "grad_norm": 0.6104130148887634, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 5570 + }, + { + "epoch": 1.8011620400258233, + "grad_norm": 0.6135714054107666, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 5580 + }, + { + "epoch": 1.804389928986443, + "grad_norm": 0.6626853346824646, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 5590 + }, + { + "epoch": 1.8076178179470626, + "grad_norm": 0.6977612972259521, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 5600 + }, + { + "epoch": 1.8108457069076824, + "grad_norm": 0.6275238394737244, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 5610 + }, + { + "epoch": 1.814073595868302, + "grad_norm": 0.5017505288124084, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 5620 + }, + { + "epoch": 1.817301484828922, + "grad_norm": 0.8314290642738342, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 5630 + }, + { + "epoch": 1.8205293737895416, + "grad_norm": 0.6863582134246826, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 5640 + }, + { + "epoch": 1.8237572627501613, + "grad_norm": 0.69544917345047, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 5650 + }, + { + "epoch": 1.8269851517107811, + "grad_norm": 0.515499472618103, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 5660 + }, + { + "epoch": 1.830213040671401, + "grad_norm": 0.6100873947143555, + "learning_rate": 0.0002, + "loss": 0.7166, + "step": 5670 + }, + { + "epoch": 1.8334409296320207, + "grad_norm": 0.67416912317276, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 5680 + }, + { + "epoch": 1.8366688185926403, + "grad_norm": 0.7057772278785706, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 5690 + }, + { + "epoch": 1.8398967075532602, + "grad_norm": 0.7374551892280579, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 5700 + }, + { + "epoch": 1.84312459651388, + "grad_norm": 0.6266297101974487, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 5710 + }, + { + "epoch": 1.8463524854744997, + "grad_norm": 0.5629227757453918, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 5720 + }, + { + "epoch": 1.8495803744351194, + "grad_norm": 0.6603655815124512, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 5730 + }, + { + "epoch": 1.8528082633957392, + "grad_norm": 0.8113715052604675, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 5740 + }, + { + "epoch": 1.856036152356359, + "grad_norm": 0.7143914103507996, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 5750 + }, + { + "epoch": 1.8592640413169788, + "grad_norm": 0.6273732781410217, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 5760 + }, + { + "epoch": 1.8624919302775984, + "grad_norm": 0.5428690910339355, + "learning_rate": 0.0002, + "loss": 0.7962, + "step": 5770 + }, + { + "epoch": 1.865719819238218, + "grad_norm": 0.6405037641525269, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 5780 + }, + { + "epoch": 1.868947708198838, + "grad_norm": 0.700873613357544, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 5790 + }, + { + "epoch": 1.8721755971594578, + "grad_norm": 0.5645238161087036, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 5800 + }, + { + "epoch": 1.8754034861200775, + "grad_norm": 0.8780353665351868, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 5810 + }, + { + "epoch": 1.878631375080697, + "grad_norm": 0.6295409798622131, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 5820 + }, + { + "epoch": 1.881859264041317, + "grad_norm": 0.678269624710083, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 5830 + }, + { + "epoch": 1.8850871530019369, + "grad_norm": 0.6464608907699585, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5840 + }, + { + "epoch": 1.8883150419625565, + "grad_norm": 0.6201048493385315, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 5850 + }, + { + "epoch": 1.8915429309231762, + "grad_norm": 0.6046274304389954, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 5860 + }, + { + "epoch": 1.894770819883796, + "grad_norm": 0.7532408833503723, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 5870 + }, + { + "epoch": 1.897998708844416, + "grad_norm": 0.6066767573356628, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 5880 + }, + { + "epoch": 1.9012265978050356, + "grad_norm": 0.6289830207824707, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 5890 + }, + { + "epoch": 1.9044544867656552, + "grad_norm": 0.5204319953918457, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 5900 + }, + { + "epoch": 1.9076823757262749, + "grad_norm": 0.6708219647407532, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 5910 + }, + { + "epoch": 1.9109102646868947, + "grad_norm": 0.4915677309036255, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 5920 + }, + { + "epoch": 1.9141381536475146, + "grad_norm": 0.652717113494873, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 5930 + }, + { + "epoch": 1.9173660426081343, + "grad_norm": 0.5446316003799438, + "learning_rate": 0.0002, + "loss": 0.7687, + "step": 5940 + }, + { + "epoch": 1.920593931568754, + "grad_norm": 0.4958149194717407, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 5950 + }, + { + "epoch": 1.9238218205293738, + "grad_norm": 0.5623434782028198, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 5960 + }, + { + "epoch": 1.9270497094899937, + "grad_norm": 0.6855450868606567, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 5970 + }, + { + "epoch": 1.9302775984506133, + "grad_norm": 0.5710492730140686, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 5980 + }, + { + "epoch": 1.933505487411233, + "grad_norm": 0.5379431843757629, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 5990 + }, + { + "epoch": 1.9367333763718528, + "grad_norm": 0.557129442691803, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 6000 + }, + { + "epoch": 1.9399612653324727, + "grad_norm": 0.6336663961410522, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 6010 + }, + { + "epoch": 1.9431891542930924, + "grad_norm": 0.5950582027435303, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 6020 + }, + { + "epoch": 1.946417043253712, + "grad_norm": 0.5905954837799072, + "learning_rate": 0.0002, + "loss": 0.7443, + "step": 6030 + }, + { + "epoch": 1.9496449322143317, + "grad_norm": 0.6688982844352722, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 6040 + }, + { + "epoch": 1.9528728211749515, + "grad_norm": 0.5440775752067566, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 6050 + }, + { + "epoch": 1.9561007101355714, + "grad_norm": 0.6207906603813171, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 6060 + }, + { + "epoch": 1.959328599096191, + "grad_norm": 0.6999374628067017, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 6070 + }, + { + "epoch": 1.9625564880568107, + "grad_norm": 0.6310848593711853, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 6080 + }, + { + "epoch": 1.9657843770174306, + "grad_norm": 0.5903388261795044, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 6090 + }, + { + "epoch": 1.9690122659780505, + "grad_norm": 0.6333889961242676, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 6100 + }, + { + "epoch": 1.97224015493867, + "grad_norm": 0.5604711174964905, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 6110 + }, + { + "epoch": 1.9754680438992898, + "grad_norm": 0.9234541654586792, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 6120 + }, + { + "epoch": 1.9786959328599096, + "grad_norm": 0.6149102449417114, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 6130 + }, + { + "epoch": 1.9819238218205295, + "grad_norm": 0.615446150302887, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 6140 + }, + { + "epoch": 1.9851517107811492, + "grad_norm": 0.5176635980606079, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 6150 + }, + { + "epoch": 1.9883795997417688, + "grad_norm": 0.7124109864234924, + "learning_rate": 0.0002, + "loss": 0.718, + "step": 6160 + }, + { + "epoch": 1.9916074887023887, + "grad_norm": 0.6317567825317383, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 6170 + }, + { + "epoch": 1.9948353776630086, + "grad_norm": 0.6855016350746155, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 6180 + }, + { + "epoch": 1.9980632666236282, + "grad_norm": 0.6423715353012085, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 6190 + }, + { + "epoch": 2.0, + "eval_loss": 1.1096643209457397, + "eval_runtime": 147.7997, + "eval_samples_per_second": 4.959, + "eval_steps_per_second": 0.622, + "step": 6196 + }, + { + "epoch": 2.001291155584248, + "grad_norm": 0.5322932600975037, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 6200 + }, + { + "epoch": 2.0045190445448675, + "grad_norm": 0.8152306079864502, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 6210 + }, + { + "epoch": 2.0077469335054876, + "grad_norm": 0.6215983033180237, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 6220 + }, + { + "epoch": 2.0109748224661073, + "grad_norm": 0.845498263835907, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 6230 + }, + { + "epoch": 2.014202711426727, + "grad_norm": 0.733559787273407, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 6240 + }, + { + "epoch": 2.0174306003873466, + "grad_norm": 0.51433926820755, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 6250 + }, + { + "epoch": 2.020658489347966, + "grad_norm": 0.6374049782752991, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 6260 + }, + { + "epoch": 2.0238863783085863, + "grad_norm": 0.7833638191223145, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 6270 + }, + { + "epoch": 2.027114267269206, + "grad_norm": 0.8929463028907776, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 6280 + }, + { + "epoch": 2.0303421562298256, + "grad_norm": 0.669731855392456, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 6290 + }, + { + "epoch": 2.0335700451904453, + "grad_norm": 0.5846071243286133, + "learning_rate": 0.0002, + "loss": 0.646, + "step": 6300 + }, + { + "epoch": 2.0367979341510654, + "grad_norm": 0.7087787985801697, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 6310 + }, + { + "epoch": 2.040025823111685, + "grad_norm": 0.6739160418510437, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 6320 + }, + { + "epoch": 2.0432537120723047, + "grad_norm": 0.4860886335372925, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 6330 + }, + { + "epoch": 2.0464816010329243, + "grad_norm": 0.7201244831085205, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 6340 + }, + { + "epoch": 2.0497094899935444, + "grad_norm": 0.7409170269966125, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 6350 + }, + { + "epoch": 2.052937378954164, + "grad_norm": 0.6843920350074768, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 6360 + }, + { + "epoch": 2.0561652679147837, + "grad_norm": 0.7519999742507935, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 6370 + }, + { + "epoch": 2.0593931568754034, + "grad_norm": 0.5732819437980652, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 6380 + }, + { + "epoch": 2.062621045836023, + "grad_norm": 0.7565118074417114, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 6390 + }, + { + "epoch": 2.065848934796643, + "grad_norm": 0.8147150278091431, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 6400 + }, + { + "epoch": 2.0690768237572628, + "grad_norm": 0.6941924691200256, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 6410 + }, + { + "epoch": 2.0723047127178824, + "grad_norm": 0.6549784541130066, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 6420 + }, + { + "epoch": 2.075532601678502, + "grad_norm": 0.7224905490875244, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 6430 + }, + { + "epoch": 2.078760490639122, + "grad_norm": 0.7754863500595093, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 6440 + }, + { + "epoch": 2.081988379599742, + "grad_norm": 0.691318154335022, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 6450 + }, + { + "epoch": 2.0852162685603615, + "grad_norm": 0.6009294986724854, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 6460 + }, + { + "epoch": 2.088444157520981, + "grad_norm": 0.6753945350646973, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 6470 + }, + { + "epoch": 2.091672046481601, + "grad_norm": 0.6899921298027039, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 6480 + }, + { + "epoch": 2.094899935442221, + "grad_norm": 0.846510648727417, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 6490 + }, + { + "epoch": 2.0981278244028405, + "grad_norm": 0.6432605981826782, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 6500 + }, + { + "epoch": 2.10135571336346, + "grad_norm": 0.8125239014625549, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 6510 + }, + { + "epoch": 2.1045836023240803, + "grad_norm": 0.628302812576294, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 6520 + }, + { + "epoch": 2.1078114912847, + "grad_norm": 0.7164334654808044, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 6530 + }, + { + "epoch": 2.1110393802453196, + "grad_norm": 0.7476949095726013, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 6540 + }, + { + "epoch": 2.114267269205939, + "grad_norm": 0.7577515840530396, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 6550 + }, + { + "epoch": 2.117495158166559, + "grad_norm": 0.5684467554092407, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 6560 + }, + { + "epoch": 2.120723047127179, + "grad_norm": 0.6121789216995239, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 6570 + }, + { + "epoch": 2.1239509360877986, + "grad_norm": 0.6095348596572876, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 6580 + }, + { + "epoch": 2.1271788250484183, + "grad_norm": 0.7803651690483093, + "learning_rate": 0.0002, + "loss": 0.6276, + "step": 6590 + }, + { + "epoch": 2.130406714009038, + "grad_norm": 0.5990583300590515, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 6600 + }, + { + "epoch": 2.133634602969658, + "grad_norm": 0.6569220423698425, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 6610 + }, + { + "epoch": 2.1368624919302777, + "grad_norm": 0.5961166620254517, + "learning_rate": 0.0002, + "loss": 0.7049, + "step": 6620 + }, + { + "epoch": 2.1400903808908973, + "grad_norm": 0.5860554575920105, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 6630 + }, + { + "epoch": 2.143318269851517, + "grad_norm": 0.5994001626968384, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 6640 + }, + { + "epoch": 2.146546158812137, + "grad_norm": 0.7723015546798706, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 6650 + }, + { + "epoch": 2.1497740477727567, + "grad_norm": 0.676355242729187, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 6660 + }, + { + "epoch": 2.1530019367333764, + "grad_norm": 0.5689092874526978, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 6670 + }, + { + "epoch": 2.156229825693996, + "grad_norm": 0.6933727264404297, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 6680 + }, + { + "epoch": 2.159457714654616, + "grad_norm": 0.8380527496337891, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 6690 + }, + { + "epoch": 2.1626856036152358, + "grad_norm": 0.6876497268676758, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 6700 + }, + { + "epoch": 2.1659134925758554, + "grad_norm": 0.6418334245681763, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 6710 + }, + { + "epoch": 2.169141381536475, + "grad_norm": 0.7169192433357239, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 6720 + }, + { + "epoch": 2.1723692704970947, + "grad_norm": 0.6664170622825623, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 6730 + }, + { + "epoch": 2.175597159457715, + "grad_norm": 0.6011993288993835, + "learning_rate": 0.0002, + "loss": 0.6751, + "step": 6740 + }, + { + "epoch": 2.1788250484183345, + "grad_norm": 0.5529947280883789, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 6750 + }, + { + "epoch": 2.182052937378954, + "grad_norm": 0.6879532933235168, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 6760 + }, + { + "epoch": 2.1852808263395738, + "grad_norm": 0.6426113843917847, + "learning_rate": 0.0002, + "loss": 0.6634, + "step": 6770 + }, + { + "epoch": 2.188508715300194, + "grad_norm": 0.6571047306060791, + "learning_rate": 0.0002, + "loss": 0.6592, + "step": 6780 + }, + { + "epoch": 2.1917366042608135, + "grad_norm": 0.6400564908981323, + "learning_rate": 0.0002, + "loss": 0.6494, + "step": 6790 + }, + { + "epoch": 2.194964493221433, + "grad_norm": 0.6509664058685303, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 6800 + }, + { + "epoch": 2.198192382182053, + "grad_norm": 0.6673197150230408, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 6810 + }, + { + "epoch": 2.2014202711426725, + "grad_norm": 0.48205727338790894, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 6820 + }, + { + "epoch": 2.2046481601032926, + "grad_norm": 0.849525511264801, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 6830 + }, + { + "epoch": 2.207876049063912, + "grad_norm": 0.6150892376899719, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 6840 + }, + { + "epoch": 2.211103938024532, + "grad_norm": 0.7826945781707764, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 6850 + }, + { + "epoch": 2.2143318269851515, + "grad_norm": 0.5711963772773743, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 6860 + }, + { + "epoch": 2.2175597159457716, + "grad_norm": 0.6017758846282959, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 6870 + }, + { + "epoch": 2.2207876049063913, + "grad_norm": 0.785434901714325, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 6880 + }, + { + "epoch": 2.224015493867011, + "grad_norm": 0.6251688599586487, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 6890 + }, + { + "epoch": 2.2272433828276306, + "grad_norm": 0.8242034316062927, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 6900 + }, + { + "epoch": 2.2304712717882507, + "grad_norm": 0.7272933125495911, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 6910 + }, + { + "epoch": 2.2336991607488703, + "grad_norm": 0.7159379720687866, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 6920 + }, + { + "epoch": 2.23692704970949, + "grad_norm": 0.6518042087554932, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 6930 + }, + { + "epoch": 2.2401549386701096, + "grad_norm": 0.7365370392799377, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 6940 + }, + { + "epoch": 2.2433828276307297, + "grad_norm": 0.5674061179161072, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 6950 + }, + { + "epoch": 2.2466107165913494, + "grad_norm": 0.669185996055603, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 6960 + }, + { + "epoch": 2.249838605551969, + "grad_norm": 0.6638304591178894, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 6970 + }, + { + "epoch": 2.2530664945125887, + "grad_norm": 0.757006824016571, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 6980 + }, + { + "epoch": 2.2562943834732083, + "grad_norm": 0.7574930787086487, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 6990 + }, + { + "epoch": 2.2595222724338284, + "grad_norm": 0.7819514870643616, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 7000 + }, + { + "epoch": 2.262750161394448, + "grad_norm": 0.6987583041191101, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 7010 + }, + { + "epoch": 2.2659780503550677, + "grad_norm": 0.6628551483154297, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 7020 + }, + { + "epoch": 2.2692059393156874, + "grad_norm": 0.7855866551399231, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 7030 + }, + { + "epoch": 2.2724338282763075, + "grad_norm": 0.6102892756462097, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 7040 + }, + { + "epoch": 2.275661717236927, + "grad_norm": 0.7844198942184448, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 7050 + }, + { + "epoch": 2.2788896061975468, + "grad_norm": 0.6209492087364197, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 7060 + }, + { + "epoch": 2.2821174951581664, + "grad_norm": 0.8351290225982666, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 7070 + }, + { + "epoch": 2.285345384118786, + "grad_norm": 0.6883546710014343, + "learning_rate": 0.0002, + "loss": 0.6648, + "step": 7080 + }, + { + "epoch": 2.288573273079406, + "grad_norm": 0.6626381874084473, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 7090 + }, + { + "epoch": 2.291801162040026, + "grad_norm": 0.7216270565986633, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 7100 + }, + { + "epoch": 2.2950290510006455, + "grad_norm": 0.8246777057647705, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 7110 + }, + { + "epoch": 2.2982569399612656, + "grad_norm": 0.614326000213623, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 7120 + }, + { + "epoch": 2.301484828921885, + "grad_norm": 0.8785578012466431, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 7130 + }, + { + "epoch": 2.304712717882505, + "grad_norm": 0.7021808624267578, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 7140 + }, + { + "epoch": 2.3079406068431245, + "grad_norm": 0.6999403238296509, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 7150 + }, + { + "epoch": 2.311168495803744, + "grad_norm": 0.8013143539428711, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 7160 + }, + { + "epoch": 2.3143963847643643, + "grad_norm": 0.6592583060264587, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 7170 + }, + { + "epoch": 2.317624273724984, + "grad_norm": 0.6260249018669128, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 7180 + }, + { + "epoch": 2.3208521626856036, + "grad_norm": 0.9352797269821167, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 7190 + }, + { + "epoch": 2.324080051646223, + "grad_norm": 0.6629612445831299, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 7200 + }, + { + "epoch": 2.3273079406068433, + "grad_norm": 0.7062810063362122, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 7210 + }, + { + "epoch": 2.330535829567463, + "grad_norm": 0.7236241102218628, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 7220 + }, + { + "epoch": 2.3337637185280826, + "grad_norm": 0.7528148293495178, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 7230 + }, + { + "epoch": 2.3369916074887023, + "grad_norm": 0.7604748606681824, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 7240 + }, + { + "epoch": 2.340219496449322, + "grad_norm": 0.5601189136505127, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 7250 + }, + { + "epoch": 2.343447385409942, + "grad_norm": 0.7099230885505676, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 7260 + }, + { + "epoch": 2.3466752743705617, + "grad_norm": 0.6699047684669495, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 7270 + }, + { + "epoch": 2.3499031633311813, + "grad_norm": 0.7315047979354858, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 7280 + }, + { + "epoch": 2.353131052291801, + "grad_norm": 0.632836103439331, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 7290 + }, + { + "epoch": 2.356358941252421, + "grad_norm": 0.9410115480422974, + "learning_rate": 0.0002, + "loss": 0.6458, + "step": 7300 + }, + { + "epoch": 2.3595868302130407, + "grad_norm": 0.626554012298584, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 7310 + }, + { + "epoch": 2.3628147191736604, + "grad_norm": 0.7538444399833679, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 7320 + }, + { + "epoch": 2.36604260813428, + "grad_norm": 0.6826626062393188, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 7330 + }, + { + "epoch": 2.3692704970949, + "grad_norm": 0.6739391088485718, + "learning_rate": 0.0002, + "loss": 0.6752, + "step": 7340 + }, + { + "epoch": 2.3724983860555198, + "grad_norm": 0.7518446445465088, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 7350 + }, + { + "epoch": 2.3757262750161394, + "grad_norm": 0.714133083820343, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 7360 + }, + { + "epoch": 2.378954163976759, + "grad_norm": 0.7144588232040405, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 7370 + }, + { + "epoch": 2.382182052937379, + "grad_norm": 0.6598120927810669, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 7380 + }, + { + "epoch": 2.385409941897999, + "grad_norm": 0.7079148292541504, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 7390 + }, + { + "epoch": 2.3886378308586185, + "grad_norm": 0.6750902533531189, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 7400 + }, + { + "epoch": 2.391865719819238, + "grad_norm": 0.7181967496871948, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 7410 + }, + { + "epoch": 2.3950936087798578, + "grad_norm": 0.7720552086830139, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 7420 + }, + { + "epoch": 2.398321497740478, + "grad_norm": 0.7592426538467407, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 7430 + }, + { + "epoch": 2.4015493867010975, + "grad_norm": 0.7161896824836731, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 7440 + }, + { + "epoch": 2.404777275661717, + "grad_norm": 0.8019260764122009, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 7450 + }, + { + "epoch": 2.408005164622337, + "grad_norm": 0.7093342542648315, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 7460 + }, + { + "epoch": 2.411233053582957, + "grad_norm": 0.8464207649230957, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 7470 + }, + { + "epoch": 2.4144609425435766, + "grad_norm": 0.773666501045227, + "learning_rate": 0.0002, + "loss": 0.6724, + "step": 7480 + }, + { + "epoch": 2.4176888315041962, + "grad_norm": 0.8451611995697021, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 7490 + }, + { + "epoch": 2.420916720464816, + "grad_norm": 0.656795084476471, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 7500 + }, + { + "epoch": 2.4241446094254355, + "grad_norm": 0.7129034996032715, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 7510 + }, + { + "epoch": 2.4273724983860556, + "grad_norm": 0.8325763940811157, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 7520 + }, + { + "epoch": 2.4306003873466753, + "grad_norm": 0.7806527614593506, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 7530 + }, + { + "epoch": 2.433828276307295, + "grad_norm": 0.6994536519050598, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 7540 + }, + { + "epoch": 2.437056165267915, + "grad_norm": 0.6898999214172363, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 7550 + }, + { + "epoch": 2.4402840542285347, + "grad_norm": 0.719490647315979, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 7560 + }, + { + "epoch": 2.4435119431891543, + "grad_norm": 0.6841562390327454, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 7570 + }, + { + "epoch": 2.446739832149774, + "grad_norm": 0.7573311924934387, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 7580 + }, + { + "epoch": 2.4499677211103936, + "grad_norm": 0.7295880317687988, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 7590 + }, + { + "epoch": 2.4531956100710137, + "grad_norm": 0.710136353969574, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 7600 + }, + { + "epoch": 2.4564234990316334, + "grad_norm": 0.6126235127449036, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 7610 + }, + { + "epoch": 2.459651387992253, + "grad_norm": 0.8025609850883484, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 7620 + }, + { + "epoch": 2.4628792769528727, + "grad_norm": 0.7839472889900208, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 7630 + }, + { + "epoch": 2.4661071659134928, + "grad_norm": 0.7253499031066895, + "learning_rate": 0.0002, + "loss": 0.6797, + "step": 7640 + }, + { + "epoch": 2.4693350548741124, + "grad_norm": 0.7918946743011475, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 7650 + }, + { + "epoch": 2.472562943834732, + "grad_norm": 0.7930178046226501, + "learning_rate": 0.0002, + "loss": 0.6646, + "step": 7660 + }, + { + "epoch": 2.4757908327953517, + "grad_norm": 0.6826170086860657, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 7670 + }, + { + "epoch": 2.4790187217559714, + "grad_norm": 0.6576805114746094, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 7680 + }, + { + "epoch": 2.4822466107165915, + "grad_norm": 0.7012448310852051, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 7690 + }, + { + "epoch": 2.485474499677211, + "grad_norm": 0.7774284482002258, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 7700 + }, + { + "epoch": 2.4887023886378308, + "grad_norm": 0.6502766013145447, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 7710 + }, + { + "epoch": 2.4919302775984504, + "grad_norm": 0.7638739347457886, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 7720 + }, + { + "epoch": 2.4951581665590705, + "grad_norm": 0.6217384338378906, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 7730 + }, + { + "epoch": 2.49838605551969, + "grad_norm": 0.7576302886009216, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 7740 + }, + { + "epoch": 2.50161394448031, + "grad_norm": 0.6877137422561646, + "learning_rate": 0.0002, + "loss": 0.6855, + "step": 7750 + }, + { + "epoch": 2.5048418334409295, + "grad_norm": 0.6998329162597656, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 7760 + }, + { + "epoch": 2.508069722401549, + "grad_norm": 0.7879213690757751, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 7770 + }, + { + "epoch": 2.5112976113621692, + "grad_norm": 0.7834980487823486, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 7780 + }, + { + "epoch": 2.514525500322789, + "grad_norm": 0.7789630889892578, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 7790 + }, + { + "epoch": 2.5177533892834085, + "grad_norm": 0.7403590083122253, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 7800 + }, + { + "epoch": 2.5209812782440286, + "grad_norm": 0.6029766201972961, + "learning_rate": 0.0002, + "loss": 0.6964, + "step": 7810 + }, + { + "epoch": 2.5242091672046483, + "grad_norm": 0.7061092257499695, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 7820 + }, + { + "epoch": 2.527437056165268, + "grad_norm": 0.7120763659477234, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 7830 + }, + { + "epoch": 2.5306649451258876, + "grad_norm": 0.6173675656318665, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 7840 + }, + { + "epoch": 2.5338928340865072, + "grad_norm": 0.9566813111305237, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 7850 + }, + { + "epoch": 2.5371207230471273, + "grad_norm": 0.8497620224952698, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 7860 + }, + { + "epoch": 2.540348612007747, + "grad_norm": 0.7663498520851135, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 7870 + }, + { + "epoch": 2.5435765009683666, + "grad_norm": 0.6329668760299683, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 7880 + }, + { + "epoch": 2.5468043899289863, + "grad_norm": 0.8128195405006409, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 7890 + }, + { + "epoch": 2.5500322788896064, + "grad_norm": 0.6622284650802612, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 7900 + }, + { + "epoch": 2.553260167850226, + "grad_norm": 0.8460057973861694, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 7910 + }, + { + "epoch": 2.5564880568108457, + "grad_norm": 0.6586956977844238, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 7920 + }, + { + "epoch": 2.5597159457714653, + "grad_norm": 0.7569382190704346, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 7930 + }, + { + "epoch": 2.562943834732085, + "grad_norm": 0.6409714221954346, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 7940 + }, + { + "epoch": 2.566171723692705, + "grad_norm": 0.7031713128089905, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 7950 + }, + { + "epoch": 2.5693996126533247, + "grad_norm": 0.7983605265617371, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 7960 + }, + { + "epoch": 2.5726275016139444, + "grad_norm": 0.7165433168411255, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 7970 + }, + { + "epoch": 2.5758553905745645, + "grad_norm": 0.6630598902702332, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 7980 + }, + { + "epoch": 2.579083279535184, + "grad_norm": 0.5883122086524963, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 7990 + }, + { + "epoch": 2.5823111684958038, + "grad_norm": 0.5928755402565002, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 8000 + }, + { + "epoch": 2.5855390574564234, + "grad_norm": 0.7843712568283081, + "learning_rate": 0.0002, + "loss": 0.6701, + "step": 8010 + }, + { + "epoch": 2.588766946417043, + "grad_norm": 0.7206324338912964, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 8020 + }, + { + "epoch": 2.5919948353776627, + "grad_norm": 0.812480092048645, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 8030 + }, + { + "epoch": 2.595222724338283, + "grad_norm": 0.9843078255653381, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 8040 + }, + { + "epoch": 2.5984506132989025, + "grad_norm": 0.7524392604827881, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 8050 + }, + { + "epoch": 2.601678502259522, + "grad_norm": 0.6220380067825317, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 8060 + }, + { + "epoch": 2.6049063912201422, + "grad_norm": 0.7461398243904114, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 8070 + }, + { + "epoch": 2.608134280180762, + "grad_norm": 0.720974326133728, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 8080 + }, + { + "epoch": 2.6113621691413815, + "grad_norm": 0.649509847164154, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 8090 + }, + { + "epoch": 2.614590058102001, + "grad_norm": 0.6894662976264954, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 8100 + }, + { + "epoch": 2.617817947062621, + "grad_norm": 0.734433114528656, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 8110 + }, + { + "epoch": 2.621045836023241, + "grad_norm": 0.7468628883361816, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 8120 + }, + { + "epoch": 2.6242737249838606, + "grad_norm": 0.6508180499076843, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 8130 + }, + { + "epoch": 2.6275016139444802, + "grad_norm": 0.8735209107398987, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 8140 + }, + { + "epoch": 2.6307295029051003, + "grad_norm": 0.8162857294082642, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 8150 + }, + { + "epoch": 2.63395739186572, + "grad_norm": 0.628872811794281, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 8160 + }, + { + "epoch": 2.6371852808263396, + "grad_norm": 0.8078708052635193, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 8170 + }, + { + "epoch": 2.6404131697869593, + "grad_norm": 0.7849429845809937, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 8180 + }, + { + "epoch": 2.643641058747579, + "grad_norm": 0.8115387558937073, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 8190 + }, + { + "epoch": 2.6468689477081986, + "grad_norm": 0.7462222576141357, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 8200 + }, + { + "epoch": 2.6500968366688187, + "grad_norm": 0.753662645816803, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 8210 + }, + { + "epoch": 2.6533247256294383, + "grad_norm": 0.6100404858589172, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 8220 + }, + { + "epoch": 2.656552614590058, + "grad_norm": 0.9084606766700745, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 8230 + }, + { + "epoch": 2.659780503550678, + "grad_norm": 0.6412538886070251, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 8240 + }, + { + "epoch": 2.6630083925112977, + "grad_norm": 0.7640451192855835, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 8250 + }, + { + "epoch": 2.6662362814719174, + "grad_norm": 0.5972344875335693, + "learning_rate": 0.0002, + "loss": 0.6846, + "step": 8260 + }, + { + "epoch": 2.669464170432537, + "grad_norm": 0.6935883164405823, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 8270 + }, + { + "epoch": 2.6726920593931567, + "grad_norm": 0.789399266242981, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 8280 + }, + { + "epoch": 2.675919948353777, + "grad_norm": 0.7143490314483643, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 8290 + }, + { + "epoch": 2.6791478373143964, + "grad_norm": 0.6670652627944946, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 8300 + }, + { + "epoch": 2.682375726275016, + "grad_norm": 0.687108039855957, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 8310 + }, + { + "epoch": 2.6856036152356357, + "grad_norm": 0.7914147973060608, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 8320 + }, + { + "epoch": 2.688831504196256, + "grad_norm": 0.8398420214653015, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 8330 + }, + { + "epoch": 2.6920593931568755, + "grad_norm": 0.6592720746994019, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 8340 + }, + { + "epoch": 2.695287282117495, + "grad_norm": 0.6888470649719238, + "learning_rate": 0.0002, + "loss": 0.6673, + "step": 8350 + }, + { + "epoch": 2.698515171078115, + "grad_norm": 0.7127556800842285, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 8360 + }, + { + "epoch": 2.7017430600387344, + "grad_norm": 0.6630286574363708, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 8370 + }, + { + "epoch": 2.7049709489993545, + "grad_norm": 0.8261964321136475, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 8380 + }, + { + "epoch": 2.708198837959974, + "grad_norm": 0.717339813709259, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 8390 + }, + { + "epoch": 2.711426726920594, + "grad_norm": 0.651637613773346, + "learning_rate": 0.0002, + "loss": 0.6929, + "step": 8400 + }, + { + "epoch": 2.714654615881214, + "grad_norm": 0.7936098575592041, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 8410 + }, + { + "epoch": 2.7178825048418336, + "grad_norm": 0.8761560320854187, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 8420 + }, + { + "epoch": 2.7211103938024532, + "grad_norm": 0.6768006086349487, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 8430 + }, + { + "epoch": 2.724338282763073, + "grad_norm": 0.7121055722236633, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 8440 + }, + { + "epoch": 2.7275661717236925, + "grad_norm": 0.6811696887016296, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 8450 + }, + { + "epoch": 2.730794060684312, + "grad_norm": 0.8168250918388367, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 8460 + }, + { + "epoch": 2.7340219496449323, + "grad_norm": 0.660682737827301, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 8470 + }, + { + "epoch": 2.737249838605552, + "grad_norm": 0.7369356155395508, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 8480 + }, + { + "epoch": 2.7404777275661716, + "grad_norm": 0.7545099854469299, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 8490 + }, + { + "epoch": 2.7437056165267917, + "grad_norm": 0.6991257667541504, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 8500 + }, + { + "epoch": 2.7469335054874113, + "grad_norm": 0.7195324301719666, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 8510 + }, + { + "epoch": 2.750161394448031, + "grad_norm": 0.8995378017425537, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 8520 + }, + { + "epoch": 2.7533892834086506, + "grad_norm": 0.6924123764038086, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 8530 + }, + { + "epoch": 2.7566171723692703, + "grad_norm": 0.6260585784912109, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 8540 + }, + { + "epoch": 2.7598450613298904, + "grad_norm": 0.7273091673851013, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 8550 + }, + { + "epoch": 2.76307295029051, + "grad_norm": 0.720562219619751, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 8560 + }, + { + "epoch": 2.7663008392511297, + "grad_norm": 0.6360004544258118, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 8570 + }, + { + "epoch": 2.76952872821175, + "grad_norm": 0.7634525895118713, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 8580 + }, + { + "epoch": 2.7727566171723694, + "grad_norm": 0.6586076021194458, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 8590 + }, + { + "epoch": 2.775984506132989, + "grad_norm": 0.6542639136314392, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 8600 + }, + { + "epoch": 2.7792123950936087, + "grad_norm": 0.7650290727615356, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 8610 + }, + { + "epoch": 2.7824402840542284, + "grad_norm": 0.6551542282104492, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 8620 + }, + { + "epoch": 2.785668173014848, + "grad_norm": 0.6915501952171326, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 8630 + }, + { + "epoch": 2.788896061975468, + "grad_norm": 0.8061493635177612, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 8640 + }, + { + "epoch": 2.792123950936088, + "grad_norm": 0.8403584957122803, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 8650 + }, + { + "epoch": 2.7953518398967074, + "grad_norm": 0.6455532312393188, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 8660 + }, + { + "epoch": 2.7985797288573275, + "grad_norm": 0.8296352028846741, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 8670 + }, + { + "epoch": 2.801807617817947, + "grad_norm": 0.7288752794265747, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 8680 + }, + { + "epoch": 2.805035506778567, + "grad_norm": 0.7628464102745056, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 8690 + }, + { + "epoch": 2.8082633957391865, + "grad_norm": 0.9993878602981567, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 8700 + }, + { + "epoch": 2.811491284699806, + "grad_norm": 0.6972465515136719, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 8710 + }, + { + "epoch": 2.8147191736604262, + "grad_norm": 0.645042896270752, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 8720 + }, + { + "epoch": 2.817947062621046, + "grad_norm": 0.6853853464126587, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 8730 + }, + { + "epoch": 2.8211749515816655, + "grad_norm": 0.5935067534446716, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 8740 + }, + { + "epoch": 2.824402840542285, + "grad_norm": 0.7336633205413818, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 8750 + }, + { + "epoch": 2.8276307295029053, + "grad_norm": 0.7074962854385376, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 8760 + }, + { + "epoch": 2.830858618463525, + "grad_norm": 0.6667559742927551, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 8770 + }, + { + "epoch": 2.8340865074241446, + "grad_norm": 0.8101205229759216, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 8780 + }, + { + "epoch": 2.8373143963847642, + "grad_norm": 0.8841480016708374, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 8790 + }, + { + "epoch": 2.840542285345384, + "grad_norm": 0.5891591310501099, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 8800 + }, + { + "epoch": 2.843770174306004, + "grad_norm": 0.667032778263092, + "learning_rate": 0.0002, + "loss": 0.7114, + "step": 8810 + }, + { + "epoch": 2.8469980632666236, + "grad_norm": 0.7629773020744324, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 8820 + }, + { + "epoch": 2.8502259522272433, + "grad_norm": 0.79471355676651, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 8830 + }, + { + "epoch": 2.8534538411878634, + "grad_norm": 0.7529178261756897, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 8840 + }, + { + "epoch": 2.856681730148483, + "grad_norm": 0.7014923691749573, + "learning_rate": 0.0002, + "loss": 0.7163, + "step": 8850 + }, + { + "epoch": 2.8599096191091027, + "grad_norm": 0.7996514439582825, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 8860 + }, + { + "epoch": 2.8631375080697223, + "grad_norm": 0.7044785618782043, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 8870 + }, + { + "epoch": 2.866365397030342, + "grad_norm": 0.6792093515396118, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 8880 + }, + { + "epoch": 2.8695932859909616, + "grad_norm": 0.69175124168396, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 8890 + }, + { + "epoch": 2.8728211749515817, + "grad_norm": 0.7499129176139832, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 8900 + }, + { + "epoch": 2.8760490639122014, + "grad_norm": 0.7678789496421814, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 8910 + }, + { + "epoch": 2.879276952872821, + "grad_norm": 0.7478128671646118, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 8920 + }, + { + "epoch": 2.882504841833441, + "grad_norm": 0.6767086386680603, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 8930 + }, + { + "epoch": 2.885732730794061, + "grad_norm": 0.7222196459770203, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 8940 + }, + { + "epoch": 2.8889606197546804, + "grad_norm": 0.6950580477714539, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 8950 + }, + { + "epoch": 2.8921885087153, + "grad_norm": 0.7759528160095215, + "learning_rate": 0.0002, + "loss": 0.7064, + "step": 8960 + }, + { + "epoch": 2.8954163976759197, + "grad_norm": 0.6686919927597046, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 8970 + }, + { + "epoch": 2.89864428663654, + "grad_norm": 0.9245954751968384, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 8980 + }, + { + "epoch": 2.9018721755971595, + "grad_norm": 0.8734814524650574, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 8990 + }, + { + "epoch": 2.905100064557779, + "grad_norm": 0.6056219339370728, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 9000 + }, + { + "epoch": 2.9083279535183992, + "grad_norm": 0.7364102005958557, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 9010 + }, + { + "epoch": 2.911555842479019, + "grad_norm": 0.6563605070114136, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 9020 + }, + { + "epoch": 2.9147837314396385, + "grad_norm": 0.659978985786438, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 9030 + }, + { + "epoch": 2.918011620400258, + "grad_norm": 0.8176041841506958, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 9040 + }, + { + "epoch": 2.921239509360878, + "grad_norm": 0.743677020072937, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 9050 + }, + { + "epoch": 2.9244673983214975, + "grad_norm": 0.7418383359909058, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 9060 + }, + { + "epoch": 2.9276952872821176, + "grad_norm": 0.6916524767875671, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 9070 + }, + { + "epoch": 2.9309231762427372, + "grad_norm": 0.6559975743293762, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 9080 + }, + { + "epoch": 2.934151065203357, + "grad_norm": 0.7431221008300781, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 9090 + }, + { + "epoch": 2.937378954163977, + "grad_norm": 0.7525941133499146, + "learning_rate": 0.0002, + "loss": 0.6829, + "step": 9100 + }, + { + "epoch": 2.9406068431245966, + "grad_norm": 0.6860167384147644, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 9110 + }, + { + "epoch": 2.9438347320852163, + "grad_norm": 0.6467666029930115, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 9120 + }, + { + "epoch": 2.947062621045836, + "grad_norm": 0.7595751285552979, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 9130 + }, + { + "epoch": 2.9502905100064556, + "grad_norm": 0.6558279991149902, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 9140 + }, + { + "epoch": 2.9535183989670757, + "grad_norm": 0.6818708181381226, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 9150 + }, + { + "epoch": 2.9567462879276953, + "grad_norm": 0.8387085795402527, + "learning_rate": 0.0002, + "loss": 0.6921, + "step": 9160 + }, + { + "epoch": 2.959974176888315, + "grad_norm": 0.7705109715461731, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 9170 + }, + { + "epoch": 2.9632020658489346, + "grad_norm": 0.688106894493103, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 9180 + }, + { + "epoch": 2.9664299548095547, + "grad_norm": 0.659532368183136, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 9190 + }, + { + "epoch": 2.9696578437701744, + "grad_norm": 0.6839388608932495, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 9200 + }, + { + "epoch": 2.972885732730794, + "grad_norm": 0.6927599310874939, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 9210 + }, + { + "epoch": 2.9761136216914137, + "grad_norm": 0.6902472972869873, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 9220 + }, + { + "epoch": 2.9793415106520333, + "grad_norm": 0.620399534702301, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 9230 + }, + { + "epoch": 2.9825693996126534, + "grad_norm": 0.6812364459037781, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 9240 + }, + { + "epoch": 2.985797288573273, + "grad_norm": 0.7681456208229065, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 9250 + }, + { + "epoch": 2.9890251775338927, + "grad_norm": 0.7621907591819763, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 9260 + }, + { + "epoch": 2.992253066494513, + "grad_norm": 0.6075740456581116, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 9270 + }, + { + "epoch": 2.9954809554551325, + "grad_norm": 0.7100434899330139, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 9280 + }, + { + "epoch": 2.998708844415752, + "grad_norm": 0.7314488887786865, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 9290 + }, + { + "epoch": 3.0, + "eval_loss": 1.1434104442596436, + "eval_runtime": 166.3732, + "eval_samples_per_second": 4.406, + "eval_steps_per_second": 0.553, + "step": 9294 + }, + { + "epoch": 3.001936733376372, + "grad_norm": 0.7408893704414368, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 9300 + }, + { + "epoch": 3.0051646223369914, + "grad_norm": 0.9773574471473694, + "learning_rate": 0.0002, + "loss": 0.5182, + "step": 9310 + }, + { + "epoch": 3.0083925112976115, + "grad_norm": 0.7919653058052063, + "learning_rate": 0.0002, + "loss": 0.5432, + "step": 9320 + }, + { + "epoch": 3.011620400258231, + "grad_norm": 0.9139202833175659, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 9330 + }, + { + "epoch": 3.014848289218851, + "grad_norm": 0.8296737670898438, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 9340 + }, + { + "epoch": 3.0180761781794705, + "grad_norm": 0.786868155002594, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 9350 + }, + { + "epoch": 3.0213040671400906, + "grad_norm": 0.5928055644035339, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 9360 + }, + { + "epoch": 3.0245319561007102, + "grad_norm": 0.8785701394081116, + "learning_rate": 0.0002, + "loss": 0.5376, + "step": 9370 + }, + { + "epoch": 3.02775984506133, + "grad_norm": 0.7978872060775757, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 9380 + }, + { + "epoch": 3.0309877340219495, + "grad_norm": 0.7160913348197937, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 9390 + }, + { + "epoch": 3.034215622982569, + "grad_norm": 0.904465913772583, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 9400 + }, + { + "epoch": 3.0374435119431893, + "grad_norm": 0.7082195281982422, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 9410 + }, + { + "epoch": 3.040671400903809, + "grad_norm": 0.9686778783798218, + "learning_rate": 0.0002, + "loss": 0.5434, + "step": 9420 + }, + { + "epoch": 3.0438992898644286, + "grad_norm": 0.8788613677024841, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 9430 + }, + { + "epoch": 3.0471271788250482, + "grad_norm": 0.8217582106590271, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 9440 + }, + { + "epoch": 3.0503550677856683, + "grad_norm": 0.7380914092063904, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 9450 + }, + { + "epoch": 3.053582956746288, + "grad_norm": 0.7339285612106323, + "learning_rate": 0.0002, + "loss": 0.6258, + "step": 9460 + }, + { + "epoch": 3.0568108457069076, + "grad_norm": 0.7175183296203613, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 9470 + }, + { + "epoch": 3.0600387346675273, + "grad_norm": 0.8275379538536072, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 9480 + }, + { + "epoch": 3.0632666236281474, + "grad_norm": 0.6544256806373596, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 9490 + }, + { + "epoch": 3.066494512588767, + "grad_norm": 0.8193472623825073, + "learning_rate": 0.0002, + "loss": 0.5365, + "step": 9500 + }, + { + "epoch": 3.0697224015493867, + "grad_norm": 0.7967836856842041, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 9510 + }, + { + "epoch": 3.0729502905100063, + "grad_norm": 0.8788684010505676, + "learning_rate": 0.0002, + "loss": 0.5629, + "step": 9520 + }, + { + "epoch": 3.0761781794706264, + "grad_norm": 0.9410629868507385, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 9530 + }, + { + "epoch": 3.079406068431246, + "grad_norm": 0.7448706030845642, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 9540 + }, + { + "epoch": 3.0826339573918657, + "grad_norm": 0.9149372577667236, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 9550 + }, + { + "epoch": 3.0858618463524854, + "grad_norm": 0.7265563607215881, + "learning_rate": 0.0002, + "loss": 0.5347, + "step": 9560 + }, + { + "epoch": 3.089089735313105, + "grad_norm": 1.0305068492889404, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 9570 + }, + { + "epoch": 3.092317624273725, + "grad_norm": 0.7987357974052429, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 9580 + }, + { + "epoch": 3.095545513234345, + "grad_norm": 0.7733123898506165, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 9590 + }, + { + "epoch": 3.0987734021949644, + "grad_norm": 1.0438069105148315, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 9600 + }, + { + "epoch": 3.102001291155584, + "grad_norm": 0.7951784729957581, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 9610 + }, + { + "epoch": 3.105229180116204, + "grad_norm": 0.7776783108711243, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 9620 + }, + { + "epoch": 3.108457069076824, + "grad_norm": 0.7060676217079163, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 9630 + }, + { + "epoch": 3.1116849580374435, + "grad_norm": 0.871569037437439, + "learning_rate": 0.0002, + "loss": 0.5731, + "step": 9640 + }, + { + "epoch": 3.114912846998063, + "grad_norm": 0.8873385787010193, + "learning_rate": 0.0002, + "loss": 0.5168, + "step": 9650 + }, + { + "epoch": 3.118140735958683, + "grad_norm": 0.750998318195343, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 9660 + }, + { + "epoch": 3.121368624919303, + "grad_norm": 0.8678529262542725, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 9670 + }, + { + "epoch": 3.1245965138799225, + "grad_norm": 0.7706599235534668, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 9680 + }, + { + "epoch": 3.127824402840542, + "grad_norm": 0.8317574858665466, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 9690 + }, + { + "epoch": 3.131052291801162, + "grad_norm": 0.801800012588501, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 9700 + }, + { + "epoch": 3.134280180761782, + "grad_norm": 0.8574623465538025, + "learning_rate": 0.0002, + "loss": 0.6044, + "step": 9710 + }, + { + "epoch": 3.1375080697224016, + "grad_norm": 0.6556540727615356, + "learning_rate": 0.0002, + "loss": 0.6072, + "step": 9720 + }, + { + "epoch": 3.1407359586830212, + "grad_norm": 0.8555161952972412, + "learning_rate": 0.0002, + "loss": 0.6058, + "step": 9730 + }, + { + "epoch": 3.143963847643641, + "grad_norm": 0.8825467824935913, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 9740 + }, + { + "epoch": 3.147191736604261, + "grad_norm": 0.8297156691551208, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 9750 + }, + { + "epoch": 3.1504196255648806, + "grad_norm": 0.7710384726524353, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 9760 + }, + { + "epoch": 3.1536475145255003, + "grad_norm": 0.8778039216995239, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 9770 + }, + { + "epoch": 3.15687540348612, + "grad_norm": 0.9014058113098145, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 9780 + }, + { + "epoch": 3.16010329244674, + "grad_norm": 0.6856890320777893, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 9790 + }, + { + "epoch": 3.1633311814073597, + "grad_norm": 0.6520644426345825, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 9800 + }, + { + "epoch": 3.1665590703679793, + "grad_norm": 0.7250499129295349, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 9810 + }, + { + "epoch": 3.169786959328599, + "grad_norm": 0.8331542015075684, + "learning_rate": 0.0002, + "loss": 0.5823, + "step": 9820 + }, + { + "epoch": 3.1730148482892186, + "grad_norm": 0.8531261682510376, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 9830 + }, + { + "epoch": 3.1762427372498387, + "grad_norm": 0.8997558355331421, + "learning_rate": 0.0002, + "loss": 0.57, + "step": 9840 + }, + { + "epoch": 3.1794706262104584, + "grad_norm": 0.708335280418396, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 9850 + }, + { + "epoch": 3.182698515171078, + "grad_norm": 1.0074886083602905, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 9860 + }, + { + "epoch": 3.1859264041316977, + "grad_norm": 1.0804681777954102, + "learning_rate": 0.0002, + "loss": 0.573, + "step": 9870 + }, + { + "epoch": 3.189154293092318, + "grad_norm": 0.9510730504989624, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 9880 + }, + { + "epoch": 3.1923821820529374, + "grad_norm": 0.7211061716079712, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 9890 + }, + { + "epoch": 3.195610071013557, + "grad_norm": 0.8767086267471313, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 9900 + }, + { + "epoch": 3.1988379599741767, + "grad_norm": 0.8388153314590454, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 9910 + }, + { + "epoch": 3.202065848934797, + "grad_norm": 0.8038473725318909, + "learning_rate": 0.0002, + "loss": 0.5681, + "step": 9920 + }, + { + "epoch": 3.2052937378954165, + "grad_norm": 0.8187747001647949, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 9930 + }, + { + "epoch": 3.208521626856036, + "grad_norm": 0.7427355051040649, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 9940 + }, + { + "epoch": 3.211749515816656, + "grad_norm": 0.8017025589942932, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 9950 + }, + { + "epoch": 3.214977404777276, + "grad_norm": 0.738595187664032, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 9960 + }, + { + "epoch": 3.2182052937378955, + "grad_norm": 0.7521342039108276, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 9970 + }, + { + "epoch": 3.221433182698515, + "grad_norm": 0.840329110622406, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 9980 + }, + { + "epoch": 3.224661071659135, + "grad_norm": 0.9809671640396118, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 9990 + }, + { + "epoch": 3.2278889606197545, + "grad_norm": 0.8456943035125732, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 10000 + }, + { + "epoch": 3.2311168495803746, + "grad_norm": 0.8962995409965515, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 10010 + }, + { + "epoch": 3.2343447385409942, + "grad_norm": 0.6492817401885986, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 10020 + }, + { + "epoch": 3.237572627501614, + "grad_norm": 1.0471255779266357, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 10030 + }, + { + "epoch": 3.2408005164622335, + "grad_norm": 0.7995471358299255, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 10040 + }, + { + "epoch": 3.2440284054228536, + "grad_norm": 0.7231964468955994, + "learning_rate": 0.0002, + "loss": 0.615, + "step": 10050 + }, + { + "epoch": 3.2472562943834733, + "grad_norm": 0.639630138874054, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 10060 + }, + { + "epoch": 3.250484183344093, + "grad_norm": 0.7957055568695068, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 10070 + }, + { + "epoch": 3.2537120723047126, + "grad_norm": 0.7735482454299927, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 10080 + }, + { + "epoch": 3.2569399612653323, + "grad_norm": 0.8139488101005554, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 10090 + }, + { + "epoch": 3.2601678502259523, + "grad_norm": 0.8113240003585815, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 10100 + }, + { + "epoch": 3.263395739186572, + "grad_norm": 0.7735909819602966, + "learning_rate": 0.0002, + "loss": 0.5617, + "step": 10110 + }, + { + "epoch": 3.2666236281471916, + "grad_norm": 0.7760744094848633, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 10120 + }, + { + "epoch": 3.2698515171078113, + "grad_norm": 0.8078505396842957, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 10130 + }, + { + "epoch": 3.2730794060684314, + "grad_norm": 0.983648955821991, + "learning_rate": 0.0002, + "loss": 0.5904, + "step": 10140 + }, + { + "epoch": 3.276307295029051, + "grad_norm": 0.7131832242012024, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 10150 + }, + { + "epoch": 3.2795351839896707, + "grad_norm": 0.924493134021759, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 10160 + }, + { + "epoch": 3.2827630729502904, + "grad_norm": 0.9371112585067749, + "learning_rate": 0.0002, + "loss": 0.5733, + "step": 10170 + }, + { + "epoch": 3.2859909619109104, + "grad_norm": 0.8989261388778687, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 10180 + }, + { + "epoch": 3.28921885087153, + "grad_norm": 0.8130394816398621, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 10190 + }, + { + "epoch": 3.2924467398321497, + "grad_norm": 0.9899941086769104, + "learning_rate": 0.0002, + "loss": 0.5555, + "step": 10200 + }, + { + "epoch": 3.2956746287927694, + "grad_norm": 1.007038950920105, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 10210 + }, + { + "epoch": 3.2989025177533895, + "grad_norm": 0.7465066313743591, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 10220 + }, + { + "epoch": 3.302130406714009, + "grad_norm": 0.7202590703964233, + "learning_rate": 0.0002, + "loss": 0.6307, + "step": 10230 + }, + { + "epoch": 3.305358295674629, + "grad_norm": 0.6258249282836914, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 10240 + }, + { + "epoch": 3.3085861846352485, + "grad_norm": 0.8996058702468872, + "learning_rate": 0.0002, + "loss": 0.5869, + "step": 10250 + }, + { + "epoch": 3.311814073595868, + "grad_norm": 0.9550982713699341, + "learning_rate": 0.0002, + "loss": 0.5825, + "step": 10260 + }, + { + "epoch": 3.315041962556488, + "grad_norm": 0.7010059952735901, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 10270 + }, + { + "epoch": 3.318269851517108, + "grad_norm": 0.9639869332313538, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 10280 + }, + { + "epoch": 3.3214977404777275, + "grad_norm": 1.0192502737045288, + "learning_rate": 0.0002, + "loss": 0.5362, + "step": 10290 + }, + { + "epoch": 3.324725629438347, + "grad_norm": 0.7953670024871826, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 10300 + }, + { + "epoch": 3.3279535183989672, + "grad_norm": 0.7436774969100952, + "learning_rate": 0.0002, + "loss": 0.6386, + "step": 10310 + }, + { + "epoch": 3.331181407359587, + "grad_norm": 0.7846777439117432, + "learning_rate": 0.0002, + "loss": 0.5823, + "step": 10320 + }, + { + "epoch": 3.3344092963202066, + "grad_norm": 0.8963494896888733, + "learning_rate": 0.0002, + "loss": 0.6119, + "step": 10330 + }, + { + "epoch": 3.337637185280826, + "grad_norm": 0.6876392364501953, + "learning_rate": 0.0002, + "loss": 0.5872, + "step": 10340 + }, + { + "epoch": 3.340865074241446, + "grad_norm": 0.9161638021469116, + "learning_rate": 0.0002, + "loss": 0.6291, + "step": 10350 + }, + { + "epoch": 3.344092963202066, + "grad_norm": 0.8964458107948303, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 10360 + }, + { + "epoch": 3.3473208521626856, + "grad_norm": 0.9052296280860901, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 10370 + }, + { + "epoch": 3.3505487411233053, + "grad_norm": 0.9292596578598022, + "learning_rate": 0.0002, + "loss": 0.5958, + "step": 10380 + }, + { + "epoch": 3.3537766300839253, + "grad_norm": 0.9605957269668579, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 10390 + }, + { + "epoch": 3.357004519044545, + "grad_norm": 1.0198872089385986, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 10400 + }, + { + "epoch": 3.3602324080051647, + "grad_norm": 0.7043630480766296, + "learning_rate": 0.0002, + "loss": 0.6053, + "step": 10410 + }, + { + "epoch": 3.3634602969657843, + "grad_norm": 1.0533326864242554, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 10420 + }, + { + "epoch": 3.366688185926404, + "grad_norm": 0.7552485466003418, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 10430 + }, + { + "epoch": 3.369916074887024, + "grad_norm": 0.692708432674408, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 10440 + }, + { + "epoch": 3.3731439638476437, + "grad_norm": 0.985952615737915, + "learning_rate": 0.0002, + "loss": 0.631, + "step": 10450 + }, + { + "epoch": 3.3763718528082634, + "grad_norm": 0.6749676465988159, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 10460 + }, + { + "epoch": 3.379599741768883, + "grad_norm": 0.9514535665512085, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 10470 + }, + { + "epoch": 3.382827630729503, + "grad_norm": 1.2681142091751099, + "learning_rate": 0.0002, + "loss": 0.5982, + "step": 10480 + }, + { + "epoch": 3.3860555196901228, + "grad_norm": 1.031968355178833, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 10490 + }, + { + "epoch": 3.3892834086507424, + "grad_norm": 0.8061563968658447, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 10500 + }, + { + "epoch": 3.392511297611362, + "grad_norm": 1.0515062808990479, + "learning_rate": 0.0002, + "loss": 0.6094, + "step": 10510 + }, + { + "epoch": 3.3957391865719817, + "grad_norm": 0.9055540561676025, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 10520 + }, + { + "epoch": 3.398967075532602, + "grad_norm": 0.9318141341209412, + "learning_rate": 0.0002, + "loss": 0.6148, + "step": 10530 + }, + { + "epoch": 3.4021949644932215, + "grad_norm": 0.8266817331314087, + "learning_rate": 0.0002, + "loss": 0.5722, + "step": 10540 + }, + { + "epoch": 3.405422853453841, + "grad_norm": 1.2322112321853638, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 10550 + }, + { + "epoch": 3.4086507424144608, + "grad_norm": 0.9535136818885803, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 10560 + }, + { + "epoch": 3.411878631375081, + "grad_norm": 0.9243819117546082, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 10570 + }, + { + "epoch": 3.4151065203357005, + "grad_norm": 0.9011809825897217, + "learning_rate": 0.0002, + "loss": 0.5844, + "step": 10580 + }, + { + "epoch": 3.41833440929632, + "grad_norm": 0.9923036694526672, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 10590 + }, + { + "epoch": 3.42156229825694, + "grad_norm": 0.8903067111968994, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 10600 + }, + { + "epoch": 3.42479018721756, + "grad_norm": 0.7101534605026245, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 10610 + }, + { + "epoch": 3.4280180761781796, + "grad_norm": 0.8186570405960083, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 10620 + }, + { + "epoch": 3.431245965138799, + "grad_norm": 0.9480205774307251, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 10630 + }, + { + "epoch": 3.434473854099419, + "grad_norm": 1.1370961666107178, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 10640 + }, + { + "epoch": 3.437701743060039, + "grad_norm": 1.017669677734375, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 10650 + }, + { + "epoch": 3.4409296320206586, + "grad_norm": 0.7625100016593933, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 10660 + }, + { + "epoch": 3.4441575209812783, + "grad_norm": 0.9288196563720703, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 10670 + }, + { + "epoch": 3.447385409941898, + "grad_norm": 0.8800460696220398, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 10680 + }, + { + "epoch": 3.4506132989025176, + "grad_norm": 0.7499661445617676, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 10690 + }, + { + "epoch": 3.4538411878631377, + "grad_norm": 0.8254973292350769, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 10700 + }, + { + "epoch": 3.4570690768237573, + "grad_norm": 0.8735857605934143, + "learning_rate": 0.0002, + "loss": 0.5742, + "step": 10710 + }, + { + "epoch": 3.460296965784377, + "grad_norm": 0.9601819515228271, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 10720 + }, + { + "epoch": 3.4635248547449966, + "grad_norm": 0.8031058311462402, + "learning_rate": 0.0002, + "loss": 0.5574, + "step": 10730 + }, + { + "epoch": 3.4667527437056167, + "grad_norm": 0.8039247393608093, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 10740 + }, + { + "epoch": 3.4699806326662364, + "grad_norm": 0.8936953544616699, + "learning_rate": 0.0002, + "loss": 0.593, + "step": 10750 + }, + { + "epoch": 3.473208521626856, + "grad_norm": 0.8201186060905457, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 10760 + }, + { + "epoch": 3.4764364105874757, + "grad_norm": 1.0064148902893066, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 10770 + }, + { + "epoch": 3.4796642995480953, + "grad_norm": 0.8617483377456665, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 10780 + }, + { + "epoch": 3.4828921885087154, + "grad_norm": 0.8532096147537231, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 10790 + }, + { + "epoch": 3.486120077469335, + "grad_norm": 0.8646879196166992, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 10800 + }, + { + "epoch": 3.4893479664299547, + "grad_norm": 0.7962660789489746, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 10810 + }, + { + "epoch": 3.492575855390575, + "grad_norm": 0.9560028314590454, + "learning_rate": 0.0002, + "loss": 0.5398, + "step": 10820 + }, + { + "epoch": 3.4958037443511945, + "grad_norm": 0.928439736366272, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 10830 + }, + { + "epoch": 3.499031633311814, + "grad_norm": 0.8219282627105713, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 10840 + }, + { + "epoch": 3.5022595222724338, + "grad_norm": 0.7918338179588318, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 10850 + }, + { + "epoch": 3.5054874112330534, + "grad_norm": 0.961295485496521, + "learning_rate": 0.0002, + "loss": 0.6164, + "step": 10860 + }, + { + "epoch": 3.5087153001936735, + "grad_norm": 1.0731624364852905, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 10870 + }, + { + "epoch": 3.511943189154293, + "grad_norm": 0.9551863074302673, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 10880 + }, + { + "epoch": 3.515171078114913, + "grad_norm": 0.8409819602966309, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 10890 + }, + { + "epoch": 3.5183989670755325, + "grad_norm": 0.7546320557594299, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 10900 + }, + { + "epoch": 3.5216268560361526, + "grad_norm": 0.7505252361297607, + "learning_rate": 0.0002, + "loss": 0.6184, + "step": 10910 + }, + { + "epoch": 3.524854744996772, + "grad_norm": 0.7505561113357544, + "learning_rate": 0.0002, + "loss": 0.5649, + "step": 10920 + }, + { + "epoch": 3.528082633957392, + "grad_norm": 1.086177945137024, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 10930 + }, + { + "epoch": 3.5313105229180115, + "grad_norm": 0.7721118330955505, + "learning_rate": 0.0002, + "loss": 0.5983, + "step": 10940 + }, + { + "epoch": 3.534538411878631, + "grad_norm": 0.9567878246307373, + "learning_rate": 0.0002, + "loss": 0.5919, + "step": 10950 + }, + { + "epoch": 3.5377663008392513, + "grad_norm": 0.8377360105514526, + "learning_rate": 0.0002, + "loss": 0.6261, + "step": 10960 + }, + { + "epoch": 3.540994189799871, + "grad_norm": 1.0174858570098877, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 10970 + }, + { + "epoch": 3.5442220787604906, + "grad_norm": 0.8164418935775757, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 10980 + }, + { + "epoch": 3.5474499677211107, + "grad_norm": 0.8959241509437561, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 10990 + }, + { + "epoch": 3.5506778566817303, + "grad_norm": 1.0154379606246948, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 11000 + }, + { + "epoch": 3.55390574564235, + "grad_norm": 0.7812292575836182, + "learning_rate": 0.0002, + "loss": 0.5835, + "step": 11010 + }, + { + "epoch": 3.5571336346029696, + "grad_norm": 0.9849029779434204, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 11020 + }, + { + "epoch": 3.5603615235635893, + "grad_norm": 0.8826184272766113, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 11030 + }, + { + "epoch": 3.563589412524209, + "grad_norm": 0.9039685726165771, + "learning_rate": 0.0002, + "loss": 0.601, + "step": 11040 + }, + { + "epoch": 3.566817301484829, + "grad_norm": 0.9585249423980713, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 11050 + }, + { + "epoch": 3.5700451904454487, + "grad_norm": 0.8083069324493408, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 11060 + }, + { + "epoch": 3.5732730794060683, + "grad_norm": 0.9528678059577942, + "learning_rate": 0.0002, + "loss": 0.6317, + "step": 11070 + }, + { + "epoch": 3.5765009683666884, + "grad_norm": 0.8297588229179382, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 11080 + }, + { + "epoch": 3.579728857327308, + "grad_norm": 0.8191716074943542, + "learning_rate": 0.0002, + "loss": 0.5919, + "step": 11090 + }, + { + "epoch": 3.5829567462879277, + "grad_norm": 0.8056275844573975, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 11100 + }, + { + "epoch": 3.5861846352485474, + "grad_norm": 0.701930582523346, + "learning_rate": 0.0002, + "loss": 0.6325, + "step": 11110 + }, + { + "epoch": 3.589412524209167, + "grad_norm": 0.7644643187522888, + "learning_rate": 0.0002, + "loss": 0.6088, + "step": 11120 + }, + { + "epoch": 3.592640413169787, + "grad_norm": 0.668004035949707, + "learning_rate": 0.0002, + "loss": 0.605, + "step": 11130 + }, + { + "epoch": 3.5958683021304068, + "grad_norm": 0.8849539756774902, + "learning_rate": 0.0002, + "loss": 0.5735, + "step": 11140 + }, + { + "epoch": 3.5990961910910264, + "grad_norm": 0.8123571276664734, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 11150 + }, + { + "epoch": 3.602324080051646, + "grad_norm": 0.7591469287872314, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 11160 + }, + { + "epoch": 3.605551969012266, + "grad_norm": 0.776466965675354, + "learning_rate": 0.0002, + "loss": 0.5668, + "step": 11170 + }, + { + "epoch": 3.608779857972886, + "grad_norm": 0.9156150221824646, + "learning_rate": 0.0002, + "loss": 0.6631, + "step": 11180 + }, + { + "epoch": 3.6120077469335055, + "grad_norm": 0.7517618536949158, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 11190 + }, + { + "epoch": 3.615235635894125, + "grad_norm": 0.931239128112793, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 11200 + }, + { + "epoch": 3.6184635248547448, + "grad_norm": 0.9107872843742371, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 11210 + }, + { + "epoch": 3.621691413815365, + "grad_norm": 0.7624770998954773, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 11220 + }, + { + "epoch": 3.6249193027759845, + "grad_norm": 0.8129580616950989, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 11230 + }, + { + "epoch": 3.628147191736604, + "grad_norm": 0.7339836955070496, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 11240 + }, + { + "epoch": 3.6313750806972243, + "grad_norm": 0.8901296854019165, + "learning_rate": 0.0002, + "loss": 0.5976, + "step": 11250 + }, + { + "epoch": 3.634602969657844, + "grad_norm": 1.1374726295471191, + "learning_rate": 0.0002, + "loss": 0.5977, + "step": 11260 + }, + { + "epoch": 3.6378308586184636, + "grad_norm": 0.7438275218009949, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 11270 + }, + { + "epoch": 3.641058747579083, + "grad_norm": 0.808646559715271, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 11280 + }, + { + "epoch": 3.644286636539703, + "grad_norm": 1.091810941696167, + "learning_rate": 0.0002, + "loss": 0.6244, + "step": 11290 + }, + { + "epoch": 3.6475145255003225, + "grad_norm": 0.8439257144927979, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 11300 + }, + { + "epoch": 3.6507424144609426, + "grad_norm": 0.9720633029937744, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 11310 + }, + { + "epoch": 3.6539703034215623, + "grad_norm": 0.738571047782898, + "learning_rate": 0.0002, + "loss": 0.5942, + "step": 11320 + }, + { + "epoch": 3.657198192382182, + "grad_norm": 0.6961580514907837, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 11330 + }, + { + "epoch": 3.660426081342802, + "grad_norm": 0.8192131519317627, + "learning_rate": 0.0002, + "loss": 0.6226, + "step": 11340 + }, + { + "epoch": 3.6636539703034217, + "grad_norm": 0.8367205858230591, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 11350 + }, + { + "epoch": 3.6668818592640413, + "grad_norm": 0.7735666632652283, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 11360 + }, + { + "epoch": 3.670109748224661, + "grad_norm": 0.6507132649421692, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 11370 + }, + { + "epoch": 3.6733376371852806, + "grad_norm": 0.8271192312240601, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 11380 + }, + { + "epoch": 3.6765655261459007, + "grad_norm": 0.8724204301834106, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 11390 + }, + { + "epoch": 3.6797934151065204, + "grad_norm": 0.8448445200920105, + "learning_rate": 0.0002, + "loss": 0.6131, + "step": 11400 + }, + { + "epoch": 3.68302130406714, + "grad_norm": 0.6756882071495056, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 11410 + }, + { + "epoch": 3.68624919302776, + "grad_norm": 0.7859625816345215, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 11420 + }, + { + "epoch": 3.6894770819883798, + "grad_norm": 0.8929487466812134, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 11430 + }, + { + "epoch": 3.6927049709489994, + "grad_norm": 0.8163391351699829, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 11440 + }, + { + "epoch": 3.695932859909619, + "grad_norm": 0.8948464393615723, + "learning_rate": 0.0002, + "loss": 0.6467, + "step": 11450 + }, + { + "epoch": 3.6991607488702387, + "grad_norm": 0.8654782176017761, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 11460 + }, + { + "epoch": 3.7023886378308584, + "grad_norm": 0.9514864683151245, + "learning_rate": 0.0002, + "loss": 0.6142, + "step": 11470 + }, + { + "epoch": 3.7056165267914785, + "grad_norm": 0.7298579812049866, + "learning_rate": 0.0002, + "loss": 0.606, + "step": 11480 + }, + { + "epoch": 3.708844415752098, + "grad_norm": 0.9266309142112732, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 11490 + }, + { + "epoch": 3.7120723047127178, + "grad_norm": 0.8608686923980713, + "learning_rate": 0.0002, + "loss": 0.6122, + "step": 11500 + }, + { + "epoch": 3.715300193673338, + "grad_norm": 0.921788215637207, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 11510 + }, + { + "epoch": 3.7185280826339575, + "grad_norm": 0.8537021279335022, + "learning_rate": 0.0002, + "loss": 0.6191, + "step": 11520 + }, + { + "epoch": 3.721755971594577, + "grad_norm": 1.115194320678711, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 11530 + }, + { + "epoch": 3.724983860555197, + "grad_norm": 0.7614817023277283, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 11540 + }, + { + "epoch": 3.7282117495158165, + "grad_norm": 0.871999204158783, + "learning_rate": 0.0002, + "loss": 0.5776, + "step": 11550 + }, + { + "epoch": 3.7314396384764366, + "grad_norm": 0.9668049812316895, + "learning_rate": 0.0002, + "loss": 0.5962, + "step": 11560 + }, + { + "epoch": 3.734667527437056, + "grad_norm": 1.2185815572738647, + "learning_rate": 0.0002, + "loss": 0.5534, + "step": 11570 + }, + { + "epoch": 3.737895416397676, + "grad_norm": 0.8258453011512756, + "learning_rate": 0.0002, + "loss": 0.5936, + "step": 11580 + }, + { + "epoch": 3.7411233053582955, + "grad_norm": 0.8708966374397278, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 11590 + }, + { + "epoch": 3.7443511943189156, + "grad_norm": 0.7784267663955688, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 11600 + }, + { + "epoch": 3.7475790832795353, + "grad_norm": 0.7504425048828125, + "learning_rate": 0.0002, + "loss": 0.6404, + "step": 11610 + }, + { + "epoch": 3.750806972240155, + "grad_norm": 0.9144526124000549, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 11620 + }, + { + "epoch": 3.7540348612007746, + "grad_norm": 0.922581672668457, + "learning_rate": 0.0002, + "loss": 0.6425, + "step": 11630 + }, + { + "epoch": 3.757262750161394, + "grad_norm": 0.9348630905151367, + "learning_rate": 0.0002, + "loss": 0.6402, + "step": 11640 + }, + { + "epoch": 3.7604906391220143, + "grad_norm": 1.0740231275558472, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 11650 + }, + { + "epoch": 3.763718528082634, + "grad_norm": 0.884830117225647, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 11660 + }, + { + "epoch": 3.7669464170432536, + "grad_norm": 1.0256348848342896, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 11670 + }, + { + "epoch": 3.7701743060038737, + "grad_norm": 0.6795592904090881, + "learning_rate": 0.0002, + "loss": 0.626, + "step": 11680 + }, + { + "epoch": 3.7734021949644934, + "grad_norm": 0.9381206631660461, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 11690 + }, + { + "epoch": 3.776630083925113, + "grad_norm": 0.7633092403411865, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 11700 + }, + { + "epoch": 3.7798579728857327, + "grad_norm": 0.7506213188171387, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 11710 + }, + { + "epoch": 3.7830858618463523, + "grad_norm": 0.8182913064956665, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 11720 + }, + { + "epoch": 3.786313750806972, + "grad_norm": 1.019322156906128, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 11730 + }, + { + "epoch": 3.789541639767592, + "grad_norm": 0.8895221948623657, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 11740 + }, + { + "epoch": 3.7927695287282117, + "grad_norm": 0.948847770690918, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 11750 + }, + { + "epoch": 3.7959974176888314, + "grad_norm": 0.9068999886512756, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 11760 + }, + { + "epoch": 3.7992253066494515, + "grad_norm": 0.7920539975166321, + "learning_rate": 0.0002, + "loss": 0.6163, + "step": 11770 + }, + { + "epoch": 3.802453195610071, + "grad_norm": 0.8441922068595886, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 11780 + }, + { + "epoch": 3.8056810845706908, + "grad_norm": 0.9258501529693604, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 11790 + }, + { + "epoch": 3.8089089735313104, + "grad_norm": 0.7354241609573364, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 11800 + }, + { + "epoch": 3.81213686249193, + "grad_norm": 0.9494872689247131, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 11810 + }, + { + "epoch": 3.81536475145255, + "grad_norm": 0.8266556859016418, + "learning_rate": 0.0002, + "loss": 0.5931, + "step": 11820 + }, + { + "epoch": 3.81859264041317, + "grad_norm": 0.7951219081878662, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 11830 + }, + { + "epoch": 3.8218205293737895, + "grad_norm": 0.7688382267951965, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 11840 + }, + { + "epoch": 3.8250484183344096, + "grad_norm": 1.0917940139770508, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 11850 + }, + { + "epoch": 3.828276307295029, + "grad_norm": 0.9880442023277283, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 11860 + }, + { + "epoch": 3.831504196255649, + "grad_norm": 0.8433151245117188, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 11870 + }, + { + "epoch": 3.8347320852162685, + "grad_norm": 0.8691204786300659, + "learning_rate": 0.0002, + "loss": 0.5876, + "step": 11880 + }, + { + "epoch": 3.837959974176888, + "grad_norm": 0.7698143124580383, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 11890 + }, + { + "epoch": 3.841187863137508, + "grad_norm": 0.8874883651733398, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 11900 + }, + { + "epoch": 3.844415752098128, + "grad_norm": 1.1209359169006348, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 11910 + }, + { + "epoch": 3.8476436410587476, + "grad_norm": 0.7723544239997864, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 11920 + }, + { + "epoch": 3.850871530019367, + "grad_norm": 0.8363937139511108, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 11930 + }, + { + "epoch": 3.8540994189799873, + "grad_norm": 0.9209707975387573, + "learning_rate": 0.0002, + "loss": 0.6498, + "step": 11940 + }, + { + "epoch": 3.857327307940607, + "grad_norm": 0.9456894993782043, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 11950 + }, + { + "epoch": 3.8605551969012266, + "grad_norm": 1.5748413801193237, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 11960 + }, + { + "epoch": 3.8637830858618463, + "grad_norm": 0.9083569049835205, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 11970 + }, + { + "epoch": 3.867010974822466, + "grad_norm": 0.7672823071479797, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 11980 + }, + { + "epoch": 3.870238863783086, + "grad_norm": 0.8647152185440063, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 11990 + }, + { + "epoch": 3.8734667527437057, + "grad_norm": 0.9564255475997925, + "learning_rate": 0.0002, + "loss": 0.5755, + "step": 12000 + }, + { + "epoch": 3.8766946417043253, + "grad_norm": 0.773267924785614, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 12010 + }, + { + "epoch": 3.879922530664945, + "grad_norm": 0.8030173182487488, + "learning_rate": 0.0002, + "loss": 0.6057, + "step": 12020 + }, + { + "epoch": 3.883150419625565, + "grad_norm": 0.8002150058746338, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 12030 + }, + { + "epoch": 3.8863783085861847, + "grad_norm": 0.98802250623703, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 12040 + }, + { + "epoch": 3.8896061975468044, + "grad_norm": 0.7868124842643738, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 12050 + }, + { + "epoch": 3.892834086507424, + "grad_norm": 0.932182788848877, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 12060 + }, + { + "epoch": 3.8960619754680437, + "grad_norm": 0.8576806783676147, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 12070 + }, + { + "epoch": 3.8992898644286638, + "grad_norm": 0.8985713124275208, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 12080 + }, + { + "epoch": 3.9025177533892834, + "grad_norm": 0.7876521944999695, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 12090 + }, + { + "epoch": 3.905745642349903, + "grad_norm": 0.773936927318573, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 12100 + }, + { + "epoch": 3.908973531310523, + "grad_norm": 0.7274761199951172, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 12110 + }, + { + "epoch": 3.912201420271143, + "grad_norm": 0.8625598549842834, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 12120 + }, + { + "epoch": 3.9154293092317625, + "grad_norm": 0.8702362179756165, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 12130 + }, + { + "epoch": 3.918657198192382, + "grad_norm": 0.912579357624054, + "learning_rate": 0.0002, + "loss": 0.6493, + "step": 12140 + }, + { + "epoch": 3.9218850871530018, + "grad_norm": 0.8697066903114319, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 12150 + }, + { + "epoch": 3.9251129761136214, + "grad_norm": 1.005232572555542, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 12160 + }, + { + "epoch": 3.9283408650742415, + "grad_norm": 0.793902575969696, + "learning_rate": 0.0002, + "loss": 0.621, + "step": 12170 + }, + { + "epoch": 3.931568754034861, + "grad_norm": 0.7025905847549438, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 12180 + }, + { + "epoch": 3.934796642995481, + "grad_norm": 0.97635817527771, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 12190 + }, + { + "epoch": 3.938024531956101, + "grad_norm": 0.855417013168335, + "learning_rate": 0.0002, + "loss": 0.6416, + "step": 12200 + }, + { + "epoch": 3.9412524209167206, + "grad_norm": 0.8841291666030884, + "learning_rate": 0.0002, + "loss": 0.5979, + "step": 12210 + }, + { + "epoch": 3.94448030987734, + "grad_norm": 1.1762064695358276, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 12220 + }, + { + "epoch": 3.94770819883796, + "grad_norm": 0.8393193483352661, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 12230 + }, + { + "epoch": 3.9509360877985795, + "grad_norm": 0.9324905276298523, + "learning_rate": 0.0002, + "loss": 0.5738, + "step": 12240 + }, + { + "epoch": 3.9541639767591996, + "grad_norm": 0.8607982993125916, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 12250 + }, + { + "epoch": 3.9573918657198193, + "grad_norm": 0.8586681485176086, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 12260 + }, + { + "epoch": 3.960619754680439, + "grad_norm": 1.1082909107208252, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 12270 + }, + { + "epoch": 3.963847643641059, + "grad_norm": 1.065027117729187, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 12280 + }, + { + "epoch": 3.9670755326016787, + "grad_norm": 0.9544363021850586, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 12290 + }, + { + "epoch": 3.9703034215622983, + "grad_norm": 0.9008927345275879, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 12300 + }, + { + "epoch": 3.973531310522918, + "grad_norm": 0.8717467188835144, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 12310 + }, + { + "epoch": 3.9767591994835376, + "grad_norm": 0.9718339443206787, + "learning_rate": 0.0002, + "loss": 0.6465, + "step": 12320 + }, + { + "epoch": 3.9799870884441573, + "grad_norm": 1.0362015962600708, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 12330 + }, + { + "epoch": 3.9832149774047774, + "grad_norm": 1.0844318866729736, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 12340 + }, + { + "epoch": 3.986442866365397, + "grad_norm": 0.7506240606307983, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 12350 + }, + { + "epoch": 3.9896707553260167, + "grad_norm": 1.005982756614685, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 12360 + }, + { + "epoch": 3.9928986442866368, + "grad_norm": 0.7566431164741516, + "learning_rate": 0.0002, + "loss": 0.5926, + "step": 12370 + }, + { + "epoch": 3.9961265332472564, + "grad_norm": 0.8819181323051453, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 12380 + }, + { + "epoch": 3.999354422207876, + "grad_norm": 0.884497880935669, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 12390 + }, + { + "epoch": 4.0, + "eval_loss": 1.1907150745391846, + "eval_runtime": 161.5766, + "eval_samples_per_second": 4.537, + "eval_steps_per_second": 0.569, + "step": 12392 + }, + { + "epoch": 4.002582311168496, + "grad_norm": 1.0407241582870483, + "learning_rate": 0.0002, + "loss": 0.5203, + "step": 12400 + }, + { + "epoch": 4.005810200129115, + "grad_norm": 1.0199295282363892, + "learning_rate": 0.0002, + "loss": 0.4978, + "step": 12410 + }, + { + "epoch": 4.009038089089735, + "grad_norm": 0.8456302881240845, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 12420 + }, + { + "epoch": 4.012265978050355, + "grad_norm": 1.0621124505996704, + "learning_rate": 0.0002, + "loss": 0.4669, + "step": 12430 + }, + { + "epoch": 4.015493867010975, + "grad_norm": 0.8984712362289429, + "learning_rate": 0.0002, + "loss": 0.5277, + "step": 12440 + }, + { + "epoch": 4.018721755971595, + "grad_norm": 1.3785864114761353, + "learning_rate": 0.0002, + "loss": 0.5508, + "step": 12450 + }, + { + "epoch": 4.0219496449322145, + "grad_norm": 0.7911781668663025, + "learning_rate": 0.0002, + "loss": 0.5244, + "step": 12460 + }, + { + "epoch": 4.025177533892834, + "grad_norm": 1.0977907180786133, + "learning_rate": 0.0002, + "loss": 0.4746, + "step": 12470 + }, + { + "epoch": 4.028405422853454, + "grad_norm": 1.0664983987808228, + "learning_rate": 0.0002, + "loss": 0.4632, + "step": 12480 + }, + { + "epoch": 4.0316333118140735, + "grad_norm": 1.0807124376296997, + "learning_rate": 0.0002, + "loss": 0.5151, + "step": 12490 + }, + { + "epoch": 4.034861200774693, + "grad_norm": 1.2650192975997925, + "learning_rate": 0.0002, + "loss": 0.4712, + "step": 12500 + }, + { + "epoch": 4.038089089735313, + "grad_norm": 0.7164070010185242, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 12510 + }, + { + "epoch": 4.041316978695932, + "grad_norm": 1.0047489404678345, + "learning_rate": 0.0002, + "loss": 0.5015, + "step": 12520 + }, + { + "epoch": 4.044544867656553, + "grad_norm": 0.9303901791572571, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 12530 + }, + { + "epoch": 4.047772756617173, + "grad_norm": 1.0319702625274658, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 12540 + }, + { + "epoch": 4.051000645577792, + "grad_norm": 0.9549729228019714, + "learning_rate": 0.0002, + "loss": 0.4834, + "step": 12550 + }, + { + "epoch": 4.054228534538412, + "grad_norm": 0.7175564765930176, + "learning_rate": 0.0002, + "loss": 0.5235, + "step": 12560 + }, + { + "epoch": 4.057456423499032, + "grad_norm": 1.0622259378433228, + "learning_rate": 0.0002, + "loss": 0.5257, + "step": 12570 + }, + { + "epoch": 4.060684312459651, + "grad_norm": 1.172074556350708, + "learning_rate": 0.0002, + "loss": 0.5098, + "step": 12580 + }, + { + "epoch": 4.063912201420271, + "grad_norm": 0.9702366590499878, + "learning_rate": 0.0002, + "loss": 0.5112, + "step": 12590 + }, + { + "epoch": 4.0671400903808905, + "grad_norm": 0.741511344909668, + "learning_rate": 0.0002, + "loss": 0.5042, + "step": 12600 + }, + { + "epoch": 4.070367979341511, + "grad_norm": 0.8632621169090271, + "learning_rate": 0.0002, + "loss": 0.4996, + "step": 12610 + }, + { + "epoch": 4.073595868302131, + "grad_norm": 0.9695962071418762, + "learning_rate": 0.0002, + "loss": 0.4927, + "step": 12620 + }, + { + "epoch": 4.07682375726275, + "grad_norm": 0.9401052594184875, + "learning_rate": 0.0002, + "loss": 0.4618, + "step": 12630 + }, + { + "epoch": 4.08005164622337, + "grad_norm": 0.8068707585334778, + "learning_rate": 0.0002, + "loss": 0.4889, + "step": 12640 + }, + { + "epoch": 4.08327953518399, + "grad_norm": 0.9554762840270996, + "learning_rate": 0.0002, + "loss": 0.5046, + "step": 12650 + }, + { + "epoch": 4.086507424144609, + "grad_norm": 0.7637128233909607, + "learning_rate": 0.0002, + "loss": 0.5081, + "step": 12660 + }, + { + "epoch": 4.089735313105229, + "grad_norm": 0.6703744530677795, + "learning_rate": 0.0002, + "loss": 0.4997, + "step": 12670 + }, + { + "epoch": 4.092963202065849, + "grad_norm": 0.8623828887939453, + "learning_rate": 0.0002, + "loss": 0.4977, + "step": 12680 + }, + { + "epoch": 4.096191091026468, + "grad_norm": 0.8198223114013672, + "learning_rate": 0.0002, + "loss": 0.4616, + "step": 12690 + }, + { + "epoch": 4.099418979987089, + "grad_norm": 1.3449875116348267, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 12700 + }, + { + "epoch": 4.1026468689477085, + "grad_norm": 0.8333606123924255, + "learning_rate": 0.0002, + "loss": 0.4782, + "step": 12710 + }, + { + "epoch": 4.105874757908328, + "grad_norm": 1.1647733449935913, + "learning_rate": 0.0002, + "loss": 0.5135, + "step": 12720 + }, + { + "epoch": 4.109102646868948, + "grad_norm": 1.0560213327407837, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 12730 + }, + { + "epoch": 4.112330535829567, + "grad_norm": 0.9479449987411499, + "learning_rate": 0.0002, + "loss": 0.5244, + "step": 12740 + }, + { + "epoch": 4.115558424790187, + "grad_norm": 1.1634587049484253, + "learning_rate": 0.0002, + "loss": 0.4596, + "step": 12750 + }, + { + "epoch": 4.118786313750807, + "grad_norm": 0.813987672328949, + "learning_rate": 0.0002, + "loss": 0.4966, + "step": 12760 + }, + { + "epoch": 4.122014202711426, + "grad_norm": 0.968461275100708, + "learning_rate": 0.0002, + "loss": 0.5133, + "step": 12770 + }, + { + "epoch": 4.125242091672046, + "grad_norm": 0.9324830770492554, + "learning_rate": 0.0002, + "loss": 0.5113, + "step": 12780 + }, + { + "epoch": 4.128469980632667, + "grad_norm": 0.8313411474227905, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 12790 + }, + { + "epoch": 4.131697869593286, + "grad_norm": 1.0177634954452515, + "learning_rate": 0.0002, + "loss": 0.5169, + "step": 12800 + }, + { + "epoch": 4.134925758553906, + "grad_norm": 1.0890623331069946, + "learning_rate": 0.0002, + "loss": 0.4635, + "step": 12810 + }, + { + "epoch": 4.1381536475145255, + "grad_norm": 0.9131693840026855, + "learning_rate": 0.0002, + "loss": 0.519, + "step": 12820 + }, + { + "epoch": 4.141381536475145, + "grad_norm": 0.8400680422782898, + "learning_rate": 0.0002, + "loss": 0.5017, + "step": 12830 + }, + { + "epoch": 4.144609425435765, + "grad_norm": 0.8988795876502991, + "learning_rate": 0.0002, + "loss": 0.5195, + "step": 12840 + }, + { + "epoch": 4.1478373143963845, + "grad_norm": 0.9224025011062622, + "learning_rate": 0.0002, + "loss": 0.5052, + "step": 12850 + }, + { + "epoch": 4.151065203357004, + "grad_norm": 0.7453159689903259, + "learning_rate": 0.0002, + "loss": 0.5001, + "step": 12860 + }, + { + "epoch": 4.154293092317625, + "grad_norm": 0.9815868139266968, + "learning_rate": 0.0002, + "loss": 0.4874, + "step": 12870 + }, + { + "epoch": 4.157520981278244, + "grad_norm": 1.2542768716812134, + "learning_rate": 0.0002, + "loss": 0.5485, + "step": 12880 + }, + { + "epoch": 4.160748870238864, + "grad_norm": 1.0092132091522217, + "learning_rate": 0.0002, + "loss": 0.5287, + "step": 12890 + }, + { + "epoch": 4.163976759199484, + "grad_norm": 1.1836622953414917, + "learning_rate": 0.0002, + "loss": 0.5125, + "step": 12900 + }, + { + "epoch": 4.167204648160103, + "grad_norm": 0.7706810235977173, + "learning_rate": 0.0002, + "loss": 0.5089, + "step": 12910 + }, + { + "epoch": 4.170432537120723, + "grad_norm": 1.00058913230896, + "learning_rate": 0.0002, + "loss": 0.5123, + "step": 12920 + }, + { + "epoch": 4.173660426081343, + "grad_norm": 1.2326250076293945, + "learning_rate": 0.0002, + "loss": 0.5238, + "step": 12930 + }, + { + "epoch": 4.176888315041962, + "grad_norm": 0.8829123377799988, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 12940 + }, + { + "epoch": 4.180116204002582, + "grad_norm": 0.936042845249176, + "learning_rate": 0.0002, + "loss": 0.517, + "step": 12950 + }, + { + "epoch": 4.183344092963202, + "grad_norm": 0.9773517847061157, + "learning_rate": 0.0002, + "loss": 0.4991, + "step": 12960 + }, + { + "epoch": 4.186571981923822, + "grad_norm": 0.9786297678947449, + "learning_rate": 0.0002, + "loss": 0.5025, + "step": 12970 + }, + { + "epoch": 4.189799870884442, + "grad_norm": 0.7524558901786804, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 12980 + }, + { + "epoch": 4.193027759845061, + "grad_norm": 1.0107866525650024, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 12990 + }, + { + "epoch": 4.196255648805681, + "grad_norm": 1.0092947483062744, + "learning_rate": 0.0002, + "loss": 0.5304, + "step": 13000 + }, + { + "epoch": 4.199483537766301, + "grad_norm": 1.18181312084198, + "learning_rate": 0.0002, + "loss": 0.5061, + "step": 13010 + }, + { + "epoch": 4.20271142672692, + "grad_norm": 0.8845750093460083, + "learning_rate": 0.0002, + "loss": 0.512, + "step": 13020 + }, + { + "epoch": 4.20593931568754, + "grad_norm": 1.0789145231246948, + "learning_rate": 0.0002, + "loss": 0.5329, + "step": 13030 + }, + { + "epoch": 4.2091672046481605, + "grad_norm": 0.9562082886695862, + "learning_rate": 0.0002, + "loss": 0.5001, + "step": 13040 + }, + { + "epoch": 4.21239509360878, + "grad_norm": 0.875755786895752, + "learning_rate": 0.0002, + "loss": 0.5211, + "step": 13050 + }, + { + "epoch": 4.2156229825694, + "grad_norm": 1.0694596767425537, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 13060 + }, + { + "epoch": 4.2188508715300195, + "grad_norm": 1.0053378343582153, + "learning_rate": 0.0002, + "loss": 0.4917, + "step": 13070 + }, + { + "epoch": 4.222078760490639, + "grad_norm": 1.1628689765930176, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 13080 + }, + { + "epoch": 4.225306649451259, + "grad_norm": 0.9455991983413696, + "learning_rate": 0.0002, + "loss": 0.4796, + "step": 13090 + }, + { + "epoch": 4.228534538411878, + "grad_norm": 0.9736765623092651, + "learning_rate": 0.0002, + "loss": 0.4802, + "step": 13100 + }, + { + "epoch": 4.231762427372498, + "grad_norm": 0.8653560876846313, + "learning_rate": 0.0002, + "loss": 0.5411, + "step": 13110 + }, + { + "epoch": 4.234990316333118, + "grad_norm": 0.9335988163948059, + "learning_rate": 0.0002, + "loss": 0.5347, + "step": 13120 + }, + { + "epoch": 4.238218205293738, + "grad_norm": 0.9102661609649658, + "learning_rate": 0.0002, + "loss": 0.5217, + "step": 13130 + }, + { + "epoch": 4.241446094254358, + "grad_norm": 1.0595461130142212, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 13140 + }, + { + "epoch": 4.244673983214978, + "grad_norm": 0.8947662711143494, + "learning_rate": 0.0002, + "loss": 0.517, + "step": 13150 + }, + { + "epoch": 4.247901872175597, + "grad_norm": 1.0835723876953125, + "learning_rate": 0.0002, + "loss": 0.5116, + "step": 13160 + }, + { + "epoch": 4.251129761136217, + "grad_norm": 0.8496462106704712, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 13170 + }, + { + "epoch": 4.2543576500968365, + "grad_norm": 0.9395631551742554, + "learning_rate": 0.0002, + "loss": 0.5079, + "step": 13180 + }, + { + "epoch": 4.257585539057456, + "grad_norm": 1.2939592599868774, + "learning_rate": 0.0002, + "loss": 0.5076, + "step": 13190 + }, + { + "epoch": 4.260813428018076, + "grad_norm": 0.9325923919677734, + "learning_rate": 0.0002, + "loss": 0.5209, + "step": 13200 + }, + { + "epoch": 4.264041316978696, + "grad_norm": 0.9220664501190186, + "learning_rate": 0.0002, + "loss": 0.4984, + "step": 13210 + }, + { + "epoch": 4.267269205939316, + "grad_norm": 0.9505137205123901, + "learning_rate": 0.0002, + "loss": 0.5553, + "step": 13220 + }, + { + "epoch": 4.270497094899936, + "grad_norm": 1.0713751316070557, + "learning_rate": 0.0002, + "loss": 0.5238, + "step": 13230 + }, + { + "epoch": 4.273724983860555, + "grad_norm": 0.8390375971794128, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 13240 + }, + { + "epoch": 4.276952872821175, + "grad_norm": 0.8943426012992859, + "learning_rate": 0.0002, + "loss": 0.5217, + "step": 13250 + }, + { + "epoch": 4.280180761781795, + "grad_norm": 0.9175868630409241, + "learning_rate": 0.0002, + "loss": 0.5486, + "step": 13260 + }, + { + "epoch": 4.283408650742414, + "grad_norm": 0.9969881176948547, + "learning_rate": 0.0002, + "loss": 0.5208, + "step": 13270 + }, + { + "epoch": 4.286636539703034, + "grad_norm": 1.2271877527236938, + "learning_rate": 0.0002, + "loss": 0.5376, + "step": 13280 + }, + { + "epoch": 4.289864428663654, + "grad_norm": 0.9463263154029846, + "learning_rate": 0.0002, + "loss": 0.4811, + "step": 13290 + }, + { + "epoch": 4.293092317624274, + "grad_norm": 1.0306228399276733, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 13300 + }, + { + "epoch": 4.296320206584894, + "grad_norm": 0.8454763889312744, + "learning_rate": 0.0002, + "loss": 0.5092, + "step": 13310 + }, + { + "epoch": 4.299548095545513, + "grad_norm": 0.9843119978904724, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 13320 + }, + { + "epoch": 4.302775984506133, + "grad_norm": 1.0836851596832275, + "learning_rate": 0.0002, + "loss": 0.5407, + "step": 13330 + }, + { + "epoch": 4.306003873466753, + "grad_norm": 1.0719412565231323, + "learning_rate": 0.0002, + "loss": 0.5336, + "step": 13340 + }, + { + "epoch": 4.309231762427372, + "grad_norm": 0.9276487827301025, + "learning_rate": 0.0002, + "loss": 0.4798, + "step": 13350 + }, + { + "epoch": 4.312459651387992, + "grad_norm": 0.897072434425354, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 13360 + }, + { + "epoch": 4.315687540348612, + "grad_norm": 1.0493228435516357, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 13370 + }, + { + "epoch": 4.318915429309232, + "grad_norm": 0.9446353316307068, + "learning_rate": 0.0002, + "loss": 0.5218, + "step": 13380 + }, + { + "epoch": 4.322143318269852, + "grad_norm": 0.7765224575996399, + "learning_rate": 0.0002, + "loss": 0.4765, + "step": 13390 + }, + { + "epoch": 4.3253712072304715, + "grad_norm": 0.9100048542022705, + "learning_rate": 0.0002, + "loss": 0.5907, + "step": 13400 + }, + { + "epoch": 4.328599096191091, + "grad_norm": 1.0913089513778687, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 13410 + }, + { + "epoch": 4.331826985151711, + "grad_norm": 0.9607733488082886, + "learning_rate": 0.0002, + "loss": 0.494, + "step": 13420 + }, + { + "epoch": 4.3350548741123305, + "grad_norm": 0.8774219155311584, + "learning_rate": 0.0002, + "loss": 0.5273, + "step": 13430 + }, + { + "epoch": 4.33828276307295, + "grad_norm": 0.8366804122924805, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 13440 + }, + { + "epoch": 4.34151065203357, + "grad_norm": 1.034727931022644, + "learning_rate": 0.0002, + "loss": 0.5487, + "step": 13450 + }, + { + "epoch": 4.344738540994189, + "grad_norm": 0.942743182182312, + "learning_rate": 0.0002, + "loss": 0.4995, + "step": 13460 + }, + { + "epoch": 4.347966429954809, + "grad_norm": 0.7237029075622559, + "learning_rate": 0.0002, + "loss": 0.5222, + "step": 13470 + }, + { + "epoch": 4.35119431891543, + "grad_norm": 0.8216196894645691, + "learning_rate": 0.0002, + "loss": 0.5461, + "step": 13480 + }, + { + "epoch": 4.354422207876049, + "grad_norm": 1.031860113143921, + "learning_rate": 0.0002, + "loss": 0.5104, + "step": 13490 + }, + { + "epoch": 4.357650096836669, + "grad_norm": 0.8880493640899658, + "learning_rate": 0.0002, + "loss": 0.547, + "step": 13500 + }, + { + "epoch": 4.360877985797289, + "grad_norm": 0.8442490696907043, + "learning_rate": 0.0002, + "loss": 0.5259, + "step": 13510 + }, + { + "epoch": 4.364105874757908, + "grad_norm": 1.270971655845642, + "learning_rate": 0.0002, + "loss": 0.5176, + "step": 13520 + }, + { + "epoch": 4.367333763718528, + "grad_norm": 0.9657870531082153, + "learning_rate": 0.0002, + "loss": 0.5028, + "step": 13530 + }, + { + "epoch": 4.3705616526791475, + "grad_norm": 0.7477133870124817, + "learning_rate": 0.0002, + "loss": 0.5136, + "step": 13540 + }, + { + "epoch": 4.373789541639767, + "grad_norm": 1.0209243297576904, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 13550 + }, + { + "epoch": 4.377017430600388, + "grad_norm": 0.8714015483856201, + "learning_rate": 0.0002, + "loss": 0.4888, + "step": 13560 + }, + { + "epoch": 4.380245319561007, + "grad_norm": 1.0490189790725708, + "learning_rate": 0.0002, + "loss": 0.5428, + "step": 13570 + }, + { + "epoch": 4.383473208521627, + "grad_norm": 0.9454663991928101, + "learning_rate": 0.0002, + "loss": 0.5398, + "step": 13580 + }, + { + "epoch": 4.386701097482247, + "grad_norm": 1.154146432876587, + "learning_rate": 0.0002, + "loss": 0.5072, + "step": 13590 + }, + { + "epoch": 4.389928986442866, + "grad_norm": 1.155090570449829, + "learning_rate": 0.0002, + "loss": 0.5096, + "step": 13600 + }, + { + "epoch": 4.393156875403486, + "grad_norm": 0.9853842854499817, + "learning_rate": 0.0002, + "loss": 0.5679, + "step": 13610 + }, + { + "epoch": 4.396384764364106, + "grad_norm": 0.9265837669372559, + "learning_rate": 0.0002, + "loss": 0.4992, + "step": 13620 + }, + { + "epoch": 4.399612653324725, + "grad_norm": 0.8367540240287781, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 13630 + }, + { + "epoch": 4.402840542285345, + "grad_norm": 1.1453629732131958, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 13640 + }, + { + "epoch": 4.4060684312459655, + "grad_norm": 1.0856295824050903, + "learning_rate": 0.0002, + "loss": 0.573, + "step": 13650 + }, + { + "epoch": 4.409296320206585, + "grad_norm": 0.9284523129463196, + "learning_rate": 0.0002, + "loss": 0.5178, + "step": 13660 + }, + { + "epoch": 4.412524209167205, + "grad_norm": 0.9632299542427063, + "learning_rate": 0.0002, + "loss": 0.4862, + "step": 13670 + }, + { + "epoch": 4.415752098127824, + "grad_norm": 1.048524260520935, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 13680 + }, + { + "epoch": 4.418979987088444, + "grad_norm": 0.9787682294845581, + "learning_rate": 0.0002, + "loss": 0.5258, + "step": 13690 + }, + { + "epoch": 4.422207876049064, + "grad_norm": 1.0728684663772583, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 13700 + }, + { + "epoch": 4.425435765009683, + "grad_norm": 0.72867351770401, + "learning_rate": 0.0002, + "loss": 0.5243, + "step": 13710 + }, + { + "epoch": 4.428663653970303, + "grad_norm": 0.8932793736457825, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 13720 + }, + { + "epoch": 4.431891542930924, + "grad_norm": 1.098343849182129, + "learning_rate": 0.0002, + "loss": 0.5156, + "step": 13730 + }, + { + "epoch": 4.435119431891543, + "grad_norm": 0.9321235418319702, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 13740 + }, + { + "epoch": 4.438347320852163, + "grad_norm": 0.8868634104728699, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 13750 + }, + { + "epoch": 4.4415752098127825, + "grad_norm": 1.200064778327942, + "learning_rate": 0.0002, + "loss": 0.5284, + "step": 13760 + }, + { + "epoch": 4.444803098773402, + "grad_norm": 0.8968019485473633, + "learning_rate": 0.0002, + "loss": 0.5208, + "step": 13770 + }, + { + "epoch": 4.448030987734022, + "grad_norm": 0.9560935497283936, + "learning_rate": 0.0002, + "loss": 0.4979, + "step": 13780 + }, + { + "epoch": 4.4512588766946415, + "grad_norm": 0.7985701560974121, + "learning_rate": 0.0002, + "loss": 0.5134, + "step": 13790 + }, + { + "epoch": 4.454486765655261, + "grad_norm": 1.062540888786316, + "learning_rate": 0.0002, + "loss": 0.5113, + "step": 13800 + }, + { + "epoch": 4.457714654615881, + "grad_norm": 1.0827109813690186, + "learning_rate": 0.0002, + "loss": 0.525, + "step": 13810 + }, + { + "epoch": 4.460942543576501, + "grad_norm": 1.0853543281555176, + "learning_rate": 0.0002, + "loss": 0.5541, + "step": 13820 + }, + { + "epoch": 4.464170432537121, + "grad_norm": 1.0613641738891602, + "learning_rate": 0.0002, + "loss": 0.5381, + "step": 13830 + }, + { + "epoch": 4.467398321497741, + "grad_norm": 0.9037535190582275, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 13840 + }, + { + "epoch": 4.47062621045836, + "grad_norm": 0.9216223955154419, + "learning_rate": 0.0002, + "loss": 0.5112, + "step": 13850 + }, + { + "epoch": 4.47385409941898, + "grad_norm": 0.8952260613441467, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 13860 + }, + { + "epoch": 4.4770819883796, + "grad_norm": 0.9997953176498413, + "learning_rate": 0.0002, + "loss": 0.5026, + "step": 13870 + }, + { + "epoch": 4.480309877340219, + "grad_norm": 1.062458872795105, + "learning_rate": 0.0002, + "loss": 0.5107, + "step": 13880 + }, + { + "epoch": 4.483537766300839, + "grad_norm": 0.9185126423835754, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 13890 + }, + { + "epoch": 4.486765655261459, + "grad_norm": 1.2389954328536987, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 13900 + }, + { + "epoch": 4.489993544222079, + "grad_norm": 1.1632126569747925, + "learning_rate": 0.0002, + "loss": 0.5199, + "step": 13910 + }, + { + "epoch": 4.493221433182699, + "grad_norm": 1.0304487943649292, + "learning_rate": 0.0002, + "loss": 0.5128, + "step": 13920 + }, + { + "epoch": 4.496449322143318, + "grad_norm": 0.9144788384437561, + "learning_rate": 0.0002, + "loss": 0.5331, + "step": 13930 + }, + { + "epoch": 4.499677211103938, + "grad_norm": 1.0285682678222656, + "learning_rate": 0.0002, + "loss": 0.5312, + "step": 13940 + }, + { + "epoch": 4.502905100064558, + "grad_norm": 1.1187206506729126, + "learning_rate": 0.0002, + "loss": 0.554, + "step": 13950 + }, + { + "epoch": 4.506132989025177, + "grad_norm": 0.7917197942733765, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 13960 + }, + { + "epoch": 4.509360877985797, + "grad_norm": 0.8495619297027588, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 13970 + }, + { + "epoch": 4.512588766946417, + "grad_norm": 1.0450760126113892, + "learning_rate": 0.0002, + "loss": 0.4971, + "step": 13980 + }, + { + "epoch": 4.515816655907037, + "grad_norm": 1.0061010122299194, + "learning_rate": 0.0002, + "loss": 0.5402, + "step": 13990 + }, + { + "epoch": 4.519044544867657, + "grad_norm": 1.0232428312301636, + "learning_rate": 0.0002, + "loss": 0.527, + "step": 14000 + }, + { + "epoch": 4.5222724338282765, + "grad_norm": 0.8734631538391113, + "learning_rate": 0.0002, + "loss": 0.5002, + "step": 14010 + }, + { + "epoch": 4.525500322788896, + "grad_norm": 1.1085621118545532, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 14020 + }, + { + "epoch": 4.528728211749516, + "grad_norm": 0.9178624749183655, + "learning_rate": 0.0002, + "loss": 0.5167, + "step": 14030 + }, + { + "epoch": 4.531956100710135, + "grad_norm": 1.0687317848205566, + "learning_rate": 0.0002, + "loss": 0.5589, + "step": 14040 + }, + { + "epoch": 4.535183989670755, + "grad_norm": 0.9237300157546997, + "learning_rate": 0.0002, + "loss": 0.5576, + "step": 14050 + }, + { + "epoch": 4.538411878631375, + "grad_norm": 0.9667123556137085, + "learning_rate": 0.0002, + "loss": 0.5062, + "step": 14060 + }, + { + "epoch": 4.541639767591995, + "grad_norm": 1.1286747455596924, + "learning_rate": 0.0002, + "loss": 0.5645, + "step": 14070 + }, + { + "epoch": 4.544867656552615, + "grad_norm": 1.055392861366272, + "learning_rate": 0.0002, + "loss": 0.5226, + "step": 14080 + }, + { + "epoch": 4.548095545513235, + "grad_norm": 0.9492936134338379, + "learning_rate": 0.0002, + "loss": 0.5428, + "step": 14090 + }, + { + "epoch": 4.551323434473854, + "grad_norm": 0.9881349802017212, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 14100 + }, + { + "epoch": 4.554551323434474, + "grad_norm": 0.9389023184776306, + "learning_rate": 0.0002, + "loss": 0.5572, + "step": 14110 + }, + { + "epoch": 4.5577792123950935, + "grad_norm": 0.8395606875419617, + "learning_rate": 0.0002, + "loss": 0.5511, + "step": 14120 + }, + { + "epoch": 4.561007101355713, + "grad_norm": 0.9019067287445068, + "learning_rate": 0.0002, + "loss": 0.5696, + "step": 14130 + }, + { + "epoch": 4.564234990316333, + "grad_norm": 1.1058136224746704, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 14140 + }, + { + "epoch": 4.5674628792769525, + "grad_norm": 1.0683821439743042, + "learning_rate": 0.0002, + "loss": 0.5323, + "step": 14150 + }, + { + "epoch": 4.570690768237572, + "grad_norm": 1.3398395776748657, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 14160 + }, + { + "epoch": 4.573918657198193, + "grad_norm": 0.7829096913337708, + "learning_rate": 0.0002, + "loss": 0.4713, + "step": 14170 + }, + { + "epoch": 4.577146546158812, + "grad_norm": 0.9636675119400024, + "learning_rate": 0.0002, + "loss": 0.525, + "step": 14180 + }, + { + "epoch": 4.580374435119432, + "grad_norm": 1.0291401147842407, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 14190 + }, + { + "epoch": 4.583602324080052, + "grad_norm": 1.0894310474395752, + "learning_rate": 0.0002, + "loss": 0.5366, + "step": 14200 + }, + { + "epoch": 4.586830213040671, + "grad_norm": 1.111573576927185, + "learning_rate": 0.0002, + "loss": 0.5125, + "step": 14210 + }, + { + "epoch": 4.590058102001291, + "grad_norm": 0.9345336556434631, + "learning_rate": 0.0002, + "loss": 0.5444, + "step": 14220 + }, + { + "epoch": 4.593285990961911, + "grad_norm": 1.3338757753372192, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 14230 + }, + { + "epoch": 4.596513879922531, + "grad_norm": 1.1146448850631714, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 14240 + }, + { + "epoch": 4.599741768883151, + "grad_norm": 1.1576755046844482, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 14250 + }, + { + "epoch": 4.60296965784377, + "grad_norm": 0.6851092576980591, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 14260 + }, + { + "epoch": 4.60619754680439, + "grad_norm": 0.9067938923835754, + "learning_rate": 0.0002, + "loss": 0.5027, + "step": 14270 + }, + { + "epoch": 4.60942543576501, + "grad_norm": 0.8767340183258057, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 14280 + }, + { + "epoch": 4.612653324725629, + "grad_norm": 1.024880290031433, + "learning_rate": 0.0002, + "loss": 0.5294, + "step": 14290 + }, + { + "epoch": 4.615881213686249, + "grad_norm": 0.9226394891738892, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 14300 + }, + { + "epoch": 4.619109102646869, + "grad_norm": 1.018187165260315, + "learning_rate": 0.0002, + "loss": 0.5281, + "step": 14310 + }, + { + "epoch": 4.622336991607488, + "grad_norm": 0.8851249814033508, + "learning_rate": 0.0002, + "loss": 0.5546, + "step": 14320 + }, + { + "epoch": 4.625564880568108, + "grad_norm": 0.745798647403717, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 14330 + }, + { + "epoch": 4.6287927695287285, + "grad_norm": 1.2082698345184326, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 14340 + }, + { + "epoch": 4.632020658489348, + "grad_norm": 0.901454508304596, + "learning_rate": 0.0002, + "loss": 0.5449, + "step": 14350 + }, + { + "epoch": 4.635248547449968, + "grad_norm": 0.9593124985694885, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 14360 + }, + { + "epoch": 4.6384764364105875, + "grad_norm": 1.1241410970687866, + "learning_rate": 0.0002, + "loss": 0.4939, + "step": 14370 + }, + { + "epoch": 4.641704325371207, + "grad_norm": 0.9221102595329285, + "learning_rate": 0.0002, + "loss": 0.5319, + "step": 14380 + }, + { + "epoch": 4.644932214331827, + "grad_norm": 1.0035039186477661, + "learning_rate": 0.0002, + "loss": 0.524, + "step": 14390 + }, + { + "epoch": 4.648160103292446, + "grad_norm": 1.1270662546157837, + "learning_rate": 0.0002, + "loss": 0.5617, + "step": 14400 + }, + { + "epoch": 4.651387992253067, + "grad_norm": 0.8631120324134827, + "learning_rate": 0.0002, + "loss": 0.5663, + "step": 14410 + }, + { + "epoch": 4.654615881213687, + "grad_norm": 1.0604606866836548, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 14420 + }, + { + "epoch": 4.657843770174306, + "grad_norm": 0.8002706170082092, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 14430 + }, + { + "epoch": 4.661071659134926, + "grad_norm": 1.0642075538635254, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 14440 + }, + { + "epoch": 4.664299548095546, + "grad_norm": 0.9315671324729919, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 14450 + }, + { + "epoch": 4.667527437056165, + "grad_norm": 0.8311864137649536, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 14460 + }, + { + "epoch": 4.670755326016785, + "grad_norm": 0.8900430202484131, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 14470 + }, + { + "epoch": 4.6739832149774045, + "grad_norm": 1.059267282485962, + "learning_rate": 0.0002, + "loss": 0.5086, + "step": 14480 + }, + { + "epoch": 4.677211103938024, + "grad_norm": 0.9864052534103394, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 14490 + }, + { + "epoch": 4.680438992898644, + "grad_norm": 1.210854411125183, + "learning_rate": 0.0002, + "loss": 0.5737, + "step": 14500 + }, + { + "epoch": 4.683666881859264, + "grad_norm": 1.030693769454956, + "learning_rate": 0.0002, + "loss": 0.536, + "step": 14510 + }, + { + "epoch": 4.686894770819884, + "grad_norm": 0.9809406995773315, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 14520 + }, + { + "epoch": 4.690122659780504, + "grad_norm": 1.0471004247665405, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 14530 + }, + { + "epoch": 4.693350548741123, + "grad_norm": 1.1583727598190308, + "learning_rate": 0.0002, + "loss": 0.5613, + "step": 14540 + }, + { + "epoch": 4.696578437701743, + "grad_norm": 0.9664418697357178, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 14550 + }, + { + "epoch": 4.699806326662363, + "grad_norm": 0.9511209726333618, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 14560 + }, + { + "epoch": 4.703034215622982, + "grad_norm": 1.0211684703826904, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 14570 + }, + { + "epoch": 4.706262104583602, + "grad_norm": 1.097276210784912, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 14580 + }, + { + "epoch": 4.7094899935442225, + "grad_norm": 0.9363943338394165, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 14590 + }, + { + "epoch": 4.712717882504842, + "grad_norm": 1.4700615406036377, + "learning_rate": 0.0002, + "loss": 0.5261, + "step": 14600 + }, + { + "epoch": 4.715945771465462, + "grad_norm": 1.0001553297042847, + "learning_rate": 0.0002, + "loss": 0.5489, + "step": 14610 + }, + { + "epoch": 4.719173660426081, + "grad_norm": 1.0489927530288696, + "learning_rate": 0.0002, + "loss": 0.5236, + "step": 14620 + }, + { + "epoch": 4.722401549386701, + "grad_norm": 1.0483676195144653, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 14630 + }, + { + "epoch": 4.725629438347321, + "grad_norm": 1.1501940488815308, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 14640 + }, + { + "epoch": 4.72885732730794, + "grad_norm": 1.1703146696090698, + "learning_rate": 0.0002, + "loss": 0.5059, + "step": 14650 + }, + { + "epoch": 4.73208521626856, + "grad_norm": 0.8842985033988953, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 14660 + }, + { + "epoch": 4.73531310522918, + "grad_norm": 0.9147908687591553, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 14670 + }, + { + "epoch": 4.7385409941898, + "grad_norm": 1.0391576290130615, + "learning_rate": 0.0002, + "loss": 0.5436, + "step": 14680 + }, + { + "epoch": 4.74176888315042, + "grad_norm": 0.9469179511070251, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 14690 + }, + { + "epoch": 4.7449967721110395, + "grad_norm": 1.0529530048370361, + "learning_rate": 0.0002, + "loss": 0.5201, + "step": 14700 + }, + { + "epoch": 4.748224661071659, + "grad_norm": 0.9645711183547974, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 14710 + }, + { + "epoch": 4.751452550032279, + "grad_norm": 0.8163343071937561, + "learning_rate": 0.0002, + "loss": 0.5123, + "step": 14720 + }, + { + "epoch": 4.7546804389928985, + "grad_norm": 1.0581341981887817, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 14730 + }, + { + "epoch": 4.757908327953518, + "grad_norm": 1.0913853645324707, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 14740 + }, + { + "epoch": 4.761136216914138, + "grad_norm": 1.1071174144744873, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 14750 + }, + { + "epoch": 4.764364105874758, + "grad_norm": 1.0060709714889526, + "learning_rate": 0.0002, + "loss": 0.5353, + "step": 14760 + }, + { + "epoch": 4.767591994835378, + "grad_norm": 1.012024164199829, + "learning_rate": 0.0002, + "loss": 0.5415, + "step": 14770 + }, + { + "epoch": 4.770819883795998, + "grad_norm": 0.8438148498535156, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 14780 + }, + { + "epoch": 4.774047772756617, + "grad_norm": 0.8136811256408691, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 14790 + }, + { + "epoch": 4.777275661717237, + "grad_norm": 1.0765691995620728, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 14800 + }, + { + "epoch": 4.780503550677857, + "grad_norm": 1.0582574605941772, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 14810 + }, + { + "epoch": 4.783731439638476, + "grad_norm": 0.9419516921043396, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 14820 + }, + { + "epoch": 4.786959328599096, + "grad_norm": 0.9626181721687317, + "learning_rate": 0.0002, + "loss": 0.5499, + "step": 14830 + }, + { + "epoch": 4.7901872175597155, + "grad_norm": 1.2552800178527832, + "learning_rate": 0.0002, + "loss": 0.565, + "step": 14840 + }, + { + "epoch": 4.793415106520336, + "grad_norm": 0.9379919171333313, + "learning_rate": 0.0002, + "loss": 0.5402, + "step": 14850 + }, + { + "epoch": 4.796642995480956, + "grad_norm": 0.8166947364807129, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 14860 + }, + { + "epoch": 4.799870884441575, + "grad_norm": 0.9008694887161255, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 14870 + }, + { + "epoch": 4.803098773402195, + "grad_norm": 1.0256156921386719, + "learning_rate": 0.0002, + "loss": 0.5049, + "step": 14880 + }, + { + "epoch": 4.806326662362815, + "grad_norm": 0.9486594200134277, + "learning_rate": 0.0002, + "loss": 0.5531, + "step": 14890 + }, + { + "epoch": 4.809554551323434, + "grad_norm": 0.955238401889801, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 14900 + }, + { + "epoch": 4.812782440284054, + "grad_norm": 1.03775954246521, + "learning_rate": 0.0002, + "loss": 0.5269, + "step": 14910 + }, + { + "epoch": 4.816010329244674, + "grad_norm": 1.1383405923843384, + "learning_rate": 0.0002, + "loss": 0.5445, + "step": 14920 + }, + { + "epoch": 4.819238218205294, + "grad_norm": 0.9411700963973999, + "learning_rate": 0.0002, + "loss": 0.5347, + "step": 14930 + }, + { + "epoch": 4.822466107165914, + "grad_norm": 0.8188554644584656, + "learning_rate": 0.0002, + "loss": 0.4899, + "step": 14940 + }, + { + "epoch": 4.8256939961265335, + "grad_norm": 1.1336265802383423, + "learning_rate": 0.0002, + "loss": 0.5618, + "step": 14950 + }, + { + "epoch": 4.828921885087153, + "grad_norm": 1.106121301651001, + "learning_rate": 0.0002, + "loss": 0.5578, + "step": 14960 + }, + { + "epoch": 4.832149774047773, + "grad_norm": 1.0206533670425415, + "learning_rate": 0.0002, + "loss": 0.5306, + "step": 14970 + }, + { + "epoch": 4.8353776630083924, + "grad_norm": 1.1123926639556885, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 14980 + }, + { + "epoch": 4.838605551969012, + "grad_norm": 0.7879418730735779, + "learning_rate": 0.0002, + "loss": 0.5208, + "step": 14990 + }, + { + "epoch": 4.841833440929632, + "grad_norm": 1.0171709060668945, + "learning_rate": 0.0002, + "loss": 0.5385, + "step": 15000 + }, + { + "epoch": 4.845061329890251, + "grad_norm": 1.010671615600586, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 15010 + }, + { + "epoch": 4.848289218850871, + "grad_norm": 1.0778919458389282, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 15020 + }, + { + "epoch": 4.851517107811492, + "grad_norm": 1.0479968786239624, + "learning_rate": 0.0002, + "loss": 0.5587, + "step": 15030 + }, + { + "epoch": 4.854744996772111, + "grad_norm": 1.0345100164413452, + "learning_rate": 0.0002, + "loss": 0.5637, + "step": 15040 + }, + { + "epoch": 4.857972885732731, + "grad_norm": 0.9539691805839539, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 15050 + }, + { + "epoch": 4.8612007746933505, + "grad_norm": 0.9914752840995789, + "learning_rate": 0.0002, + "loss": 0.5314, + "step": 15060 + }, + { + "epoch": 4.86442866365397, + "grad_norm": 1.1935476064682007, + "learning_rate": 0.0002, + "loss": 0.5277, + "step": 15070 + }, + { + "epoch": 4.86765655261459, + "grad_norm": 1.0065057277679443, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 15080 + }, + { + "epoch": 4.8708844415752095, + "grad_norm": 0.9320993423461914, + "learning_rate": 0.0002, + "loss": 0.5563, + "step": 15090 + }, + { + "epoch": 4.87411233053583, + "grad_norm": 1.0578069686889648, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 15100 + }, + { + "epoch": 4.87734021949645, + "grad_norm": 0.9666239023208618, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 15110 + }, + { + "epoch": 4.880568108457069, + "grad_norm": 1.1322687864303589, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 15120 + }, + { + "epoch": 4.883795997417689, + "grad_norm": 0.955674409866333, + "learning_rate": 0.0002, + "loss": 0.5381, + "step": 15130 + }, + { + "epoch": 4.887023886378309, + "grad_norm": 1.119413137435913, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 15140 + }, + { + "epoch": 4.890251775338928, + "grad_norm": 0.863646924495697, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 15150 + }, + { + "epoch": 4.893479664299548, + "grad_norm": 1.1823450326919556, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 15160 + }, + { + "epoch": 4.896707553260168, + "grad_norm": 0.8657588958740234, + "learning_rate": 0.0002, + "loss": 0.5654, + "step": 15170 + }, + { + "epoch": 4.899935442220787, + "grad_norm": 0.8575737476348877, + "learning_rate": 0.0002, + "loss": 0.5239, + "step": 15180 + }, + { + "epoch": 4.903163331181407, + "grad_norm": 0.9611830711364746, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 15190 + }, + { + "epoch": 4.906391220142027, + "grad_norm": 1.1981453895568848, + "learning_rate": 0.0002, + "loss": 0.5505, + "step": 15200 + }, + { + "epoch": 4.909619109102647, + "grad_norm": 0.9401199221611023, + "learning_rate": 0.0002, + "loss": 0.5582, + "step": 15210 + }, + { + "epoch": 4.912846998063267, + "grad_norm": 0.8420369625091553, + "learning_rate": 0.0002, + "loss": 0.5631, + "step": 15220 + }, + { + "epoch": 4.916074887023886, + "grad_norm": 0.7877969145774841, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 15230 + }, + { + "epoch": 4.919302775984506, + "grad_norm": 0.8988324403762817, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 15240 + }, + { + "epoch": 4.922530664945126, + "grad_norm": 1.1103752851486206, + "learning_rate": 0.0002, + "loss": 0.5274, + "step": 15250 + }, + { + "epoch": 4.925758553905745, + "grad_norm": 0.8874443173408508, + "learning_rate": 0.0002, + "loss": 0.5249, + "step": 15260 + }, + { + "epoch": 4.928986442866366, + "grad_norm": 1.1001752614974976, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 15270 + }, + { + "epoch": 4.9322143318269855, + "grad_norm": 0.9661307334899902, + "learning_rate": 0.0002, + "loss": 0.5596, + "step": 15280 + }, + { + "epoch": 4.935442220787605, + "grad_norm": 1.1738812923431396, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 15290 + }, + { + "epoch": 4.938670109748225, + "grad_norm": 0.9773507714271545, + "learning_rate": 0.0002, + "loss": 0.5057, + "step": 15300 + }, + { + "epoch": 4.9418979987088445, + "grad_norm": 1.0735599994659424, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 15310 + }, + { + "epoch": 4.945125887669464, + "grad_norm": 1.0552113056182861, + "learning_rate": 0.0002, + "loss": 0.4996, + "step": 15320 + }, + { + "epoch": 4.948353776630084, + "grad_norm": 1.0900797843933105, + "learning_rate": 0.0002, + "loss": 0.5201, + "step": 15330 + }, + { + "epoch": 4.9515816655907035, + "grad_norm": 1.0908405780792236, + "learning_rate": 0.0002, + "loss": 0.552, + "step": 15340 + }, + { + "epoch": 4.954809554551323, + "grad_norm": 1.010221004486084, + "learning_rate": 0.0002, + "loss": 0.6208, + "step": 15350 + }, + { + "epoch": 4.958037443511943, + "grad_norm": 1.0321437120437622, + "learning_rate": 0.0002, + "loss": 0.5423, + "step": 15360 + }, + { + "epoch": 4.961265332472563, + "grad_norm": 0.8430278897285461, + "learning_rate": 0.0002, + "loss": 0.5903, + "step": 15370 + }, + { + "epoch": 4.964493221433183, + "grad_norm": 0.8775330185890198, + "learning_rate": 0.0002, + "loss": 0.538, + "step": 15380 + }, + { + "epoch": 4.967721110393803, + "grad_norm": 0.9796988368034363, + "learning_rate": 0.0002, + "loss": 0.5344, + "step": 15390 + }, + { + "epoch": 4.970948999354422, + "grad_norm": 0.8782257437705994, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 15400 + }, + { + "epoch": 4.974176888315042, + "grad_norm": 0.9959840774536133, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 15410 + }, + { + "epoch": 4.9774047772756616, + "grad_norm": 1.0730273723602295, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 15420 + }, + { + "epoch": 4.980632666236281, + "grad_norm": 0.8653680682182312, + "learning_rate": 0.0002, + "loss": 0.5277, + "step": 15430 + }, + { + "epoch": 4.983860555196901, + "grad_norm": 1.0769985914230347, + "learning_rate": 0.0002, + "loss": 0.5301, + "step": 15440 + }, + { + "epoch": 4.987088444157521, + "grad_norm": 1.1336040496826172, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 15450 + }, + { + "epoch": 4.990316333118141, + "grad_norm": 0.9844824075698853, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 15460 + }, + { + "epoch": 4.993544222078761, + "grad_norm": 0.8368769288063049, + "learning_rate": 0.0002, + "loss": 0.5316, + "step": 15470 + }, + { + "epoch": 4.99677211103938, + "grad_norm": 1.0238676071166992, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 15480 + }, + { + "epoch": 5.0, + "grad_norm": 1.064820408821106, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 15490 + }, + { + "epoch": 5.0, + "eval_loss": 1.241918921470642, + "eval_runtime": 158.4099, + "eval_samples_per_second": 4.627, + "eval_steps_per_second": 0.581, + "step": 15490 + }, + { + "epoch": 5.00322788896062, + "grad_norm": 1.1366689205169678, + "learning_rate": 0.0002, + "loss": 0.4554, + "step": 15500 + }, + { + "epoch": 5.006455777921239, + "grad_norm": 1.2548010349273682, + "learning_rate": 0.0002, + "loss": 0.4288, + "step": 15510 + }, + { + "epoch": 5.009683666881859, + "grad_norm": 1.3875139951705933, + "learning_rate": 0.0002, + "loss": 0.4276, + "step": 15520 + }, + { + "epoch": 5.012911555842479, + "grad_norm": 0.9834036231040955, + "learning_rate": 0.0002, + "loss": 0.4198, + "step": 15530 + }, + { + "epoch": 5.016139444803099, + "grad_norm": 1.0737303495407104, + "learning_rate": 0.0002, + "loss": 0.4531, + "step": 15540 + }, + { + "epoch": 5.019367333763719, + "grad_norm": 0.9877859950065613, + "learning_rate": 0.0002, + "loss": 0.4073, + "step": 15550 + }, + { + "epoch": 5.0225952227243384, + "grad_norm": 1.143268346786499, + "learning_rate": 0.0002, + "loss": 0.4459, + "step": 15560 + }, + { + "epoch": 5.025823111684958, + "grad_norm": 1.1206166744232178, + "learning_rate": 0.0002, + "loss": 0.4477, + "step": 15570 + }, + { + "epoch": 5.029051000645578, + "grad_norm": 0.9977272748947144, + "learning_rate": 0.0002, + "loss": 0.4593, + "step": 15580 + }, + { + "epoch": 5.032278889606197, + "grad_norm": 1.3193285465240479, + "learning_rate": 0.0002, + "loss": 0.436, + "step": 15590 + }, + { + "epoch": 5.035506778566817, + "grad_norm": 1.0761713981628418, + "learning_rate": 0.0002, + "loss": 0.4426, + "step": 15600 + }, + { + "epoch": 5.038734667527437, + "grad_norm": 1.1250759363174438, + "learning_rate": 0.0002, + "loss": 0.4701, + "step": 15610 + }, + { + "epoch": 5.041962556488057, + "grad_norm": 1.0414305925369263, + "learning_rate": 0.0002, + "loss": 0.3995, + "step": 15620 + }, + { + "epoch": 5.045190445448677, + "grad_norm": 1.0906853675842285, + "learning_rate": 0.0002, + "loss": 0.4244, + "step": 15630 + }, + { + "epoch": 5.0484183344092965, + "grad_norm": 0.9360867142677307, + "learning_rate": 0.0002, + "loss": 0.441, + "step": 15640 + }, + { + "epoch": 5.051646223369916, + "grad_norm": 0.9078057408332825, + "learning_rate": 0.0002, + "loss": 0.4146, + "step": 15650 + }, + { + "epoch": 5.054874112330536, + "grad_norm": 1.0054848194122314, + "learning_rate": 0.0002, + "loss": 0.4285, + "step": 15660 + }, + { + "epoch": 5.0581020012911555, + "grad_norm": 0.9538215398788452, + "learning_rate": 0.0002, + "loss": 0.417, + "step": 15670 + }, + { + "epoch": 5.061329890251775, + "grad_norm": 1.6312693357467651, + "learning_rate": 0.0002, + "loss": 0.4629, + "step": 15680 + }, + { + "epoch": 5.064557779212395, + "grad_norm": 1.2100921869277954, + "learning_rate": 0.0002, + "loss": 0.3996, + "step": 15690 + }, + { + "epoch": 5.0677856681730145, + "grad_norm": 1.2776238918304443, + "learning_rate": 0.0002, + "loss": 0.4489, + "step": 15700 + }, + { + "epoch": 5.071013557133635, + "grad_norm": 1.0110050439834595, + "learning_rate": 0.0002, + "loss": 0.4728, + "step": 15710 + }, + { + "epoch": 5.074241446094255, + "grad_norm": 1.0896575450897217, + "learning_rate": 0.0002, + "loss": 0.4916, + "step": 15720 + }, + { + "epoch": 5.077469335054874, + "grad_norm": 0.9989936947822571, + "learning_rate": 0.0002, + "loss": 0.4462, + "step": 15730 + }, + { + "epoch": 5.080697224015494, + "grad_norm": 1.0412228107452393, + "learning_rate": 0.0002, + "loss": 0.457, + "step": 15740 + }, + { + "epoch": 5.083925112976114, + "grad_norm": 1.0964457988739014, + "learning_rate": 0.0002, + "loss": 0.4525, + "step": 15750 + }, + { + "epoch": 5.087153001936733, + "grad_norm": 1.1700960397720337, + "learning_rate": 0.0002, + "loss": 0.4539, + "step": 15760 + }, + { + "epoch": 5.090380890897353, + "grad_norm": 0.9515631794929504, + "learning_rate": 0.0002, + "loss": 0.4517, + "step": 15770 + }, + { + "epoch": 5.093608779857973, + "grad_norm": 1.0895006656646729, + "learning_rate": 0.0002, + "loss": 0.4352, + "step": 15780 + }, + { + "epoch": 5.096836668818592, + "grad_norm": 1.041312575340271, + "learning_rate": 0.0002, + "loss": 0.4765, + "step": 15790 + }, + { + "epoch": 5.100064557779213, + "grad_norm": 0.9518465399742126, + "learning_rate": 0.0002, + "loss": 0.4532, + "step": 15800 + }, + { + "epoch": 5.103292446739832, + "grad_norm": 0.8317030668258667, + "learning_rate": 0.0002, + "loss": 0.4187, + "step": 15810 + }, + { + "epoch": 5.106520335700452, + "grad_norm": 1.0933761596679688, + "learning_rate": 0.0002, + "loss": 0.4523, + "step": 15820 + }, + { + "epoch": 5.109748224661072, + "grad_norm": 1.0069324970245361, + "learning_rate": 0.0002, + "loss": 0.4689, + "step": 15830 + }, + { + "epoch": 5.112976113621691, + "grad_norm": 1.1166068315505981, + "learning_rate": 0.0002, + "loss": 0.4773, + "step": 15840 + }, + { + "epoch": 5.116204002582311, + "grad_norm": 1.069992184638977, + "learning_rate": 0.0002, + "loss": 0.4635, + "step": 15850 + }, + { + "epoch": 5.119431891542931, + "grad_norm": 1.3728036880493164, + "learning_rate": 0.0002, + "loss": 0.445, + "step": 15860 + }, + { + "epoch": 5.12265978050355, + "grad_norm": 1.0625780820846558, + "learning_rate": 0.0002, + "loss": 0.4563, + "step": 15870 + }, + { + "epoch": 5.125887669464171, + "grad_norm": 1.090174913406372, + "learning_rate": 0.0002, + "loss": 0.426, + "step": 15880 + }, + { + "epoch": 5.1291155584247905, + "grad_norm": 0.8729526996612549, + "learning_rate": 0.0002, + "loss": 0.457, + "step": 15890 + }, + { + "epoch": 5.13234344738541, + "grad_norm": 0.9561540484428406, + "learning_rate": 0.0002, + "loss": 0.4686, + "step": 15900 + }, + { + "epoch": 5.13557133634603, + "grad_norm": 1.012120246887207, + "learning_rate": 0.0002, + "loss": 0.4266, + "step": 15910 + }, + { + "epoch": 5.1387992253066495, + "grad_norm": 1.1027921438217163, + "learning_rate": 0.0002, + "loss": 0.4484, + "step": 15920 + }, + { + "epoch": 5.142027114267269, + "grad_norm": 1.0878126621246338, + "learning_rate": 0.0002, + "loss": 0.4389, + "step": 15930 + }, + { + "epoch": 5.145255003227889, + "grad_norm": 0.9619103670120239, + "learning_rate": 0.0002, + "loss": 0.4716, + "step": 15940 + }, + { + "epoch": 5.148482892188508, + "grad_norm": 1.1684138774871826, + "learning_rate": 0.0002, + "loss": 0.4071, + "step": 15950 + }, + { + "epoch": 5.151710781149128, + "grad_norm": 1.3379510641098022, + "learning_rate": 0.0002, + "loss": 0.4292, + "step": 15960 + }, + { + "epoch": 5.154938670109749, + "grad_norm": 1.0427496433258057, + "learning_rate": 0.0002, + "loss": 0.4413, + "step": 15970 + }, + { + "epoch": 5.158166559070368, + "grad_norm": 0.9917148351669312, + "learning_rate": 0.0002, + "loss": 0.4665, + "step": 15980 + }, + { + "epoch": 5.161394448030988, + "grad_norm": 1.0899780988693237, + "learning_rate": 0.0002, + "loss": 0.4527, + "step": 15990 + }, + { + "epoch": 5.1646223369916076, + "grad_norm": 0.9251647591590881, + "learning_rate": 0.0002, + "loss": 0.4764, + "step": 16000 + }, + { + "epoch": 5.167850225952227, + "grad_norm": 1.1669172048568726, + "learning_rate": 0.0002, + "loss": 0.5043, + "step": 16010 + }, + { + "epoch": 5.171078114912847, + "grad_norm": 1.2285256385803223, + "learning_rate": 0.0002, + "loss": 0.4726, + "step": 16020 + }, + { + "epoch": 5.1743060038734665, + "grad_norm": 1.0504484176635742, + "learning_rate": 0.0002, + "loss": 0.4312, + "step": 16030 + }, + { + "epoch": 5.177533892834086, + "grad_norm": 1.2829089164733887, + "learning_rate": 0.0002, + "loss": 0.4507, + "step": 16040 + }, + { + "epoch": 5.180761781794706, + "grad_norm": 0.9332743287086487, + "learning_rate": 0.0002, + "loss": 0.4547, + "step": 16050 + }, + { + "epoch": 5.183989670755326, + "grad_norm": 1.0054426193237305, + "learning_rate": 0.0002, + "loss": 0.4211, + "step": 16060 + }, + { + "epoch": 5.187217559715946, + "grad_norm": 1.0049669742584229, + "learning_rate": 0.0002, + "loss": 0.4415, + "step": 16070 + }, + { + "epoch": 5.190445448676566, + "grad_norm": 1.0171366930007935, + "learning_rate": 0.0002, + "loss": 0.4462, + "step": 16080 + }, + { + "epoch": 5.193673337637185, + "grad_norm": 1.234966516494751, + "learning_rate": 0.0002, + "loss": 0.4725, + "step": 16090 + }, + { + "epoch": 5.196901226597805, + "grad_norm": 0.9127960205078125, + "learning_rate": 0.0002, + "loss": 0.4579, + "step": 16100 + }, + { + "epoch": 5.200129115558425, + "grad_norm": 1.153924822807312, + "learning_rate": 0.0002, + "loss": 0.4647, + "step": 16110 + }, + { + "epoch": 5.203357004519044, + "grad_norm": 1.26716947555542, + "learning_rate": 0.0002, + "loss": 0.4826, + "step": 16120 + }, + { + "epoch": 5.206584893479664, + "grad_norm": 1.2438743114471436, + "learning_rate": 0.0002, + "loss": 0.446, + "step": 16130 + }, + { + "epoch": 5.2098127824402845, + "grad_norm": 1.0888392925262451, + "learning_rate": 0.0002, + "loss": 0.4768, + "step": 16140 + }, + { + "epoch": 5.213040671400904, + "grad_norm": 1.1741917133331299, + "learning_rate": 0.0002, + "loss": 0.4508, + "step": 16150 + }, + { + "epoch": 5.216268560361524, + "grad_norm": 0.9508614540100098, + "learning_rate": 0.0002, + "loss": 0.4271, + "step": 16160 + }, + { + "epoch": 5.219496449322143, + "grad_norm": 0.9714716672897339, + "learning_rate": 0.0002, + "loss": 0.4577, + "step": 16170 + }, + { + "epoch": 5.222724338282763, + "grad_norm": 1.2681622505187988, + "learning_rate": 0.0002, + "loss": 0.4636, + "step": 16180 + }, + { + "epoch": 5.225952227243383, + "grad_norm": 1.045871376991272, + "learning_rate": 0.0002, + "loss": 0.4723, + "step": 16190 + }, + { + "epoch": 5.229180116204002, + "grad_norm": 1.0272563695907593, + "learning_rate": 0.0002, + "loss": 0.4467, + "step": 16200 + }, + { + "epoch": 5.232408005164622, + "grad_norm": 1.092901349067688, + "learning_rate": 0.0002, + "loss": 0.4353, + "step": 16210 + }, + { + "epoch": 5.235635894125242, + "grad_norm": 0.9332799315452576, + "learning_rate": 0.0002, + "loss": 0.4588, + "step": 16220 + }, + { + "epoch": 5.238863783085862, + "grad_norm": 1.1728498935699463, + "learning_rate": 0.0002, + "loss": 0.4594, + "step": 16230 + }, + { + "epoch": 5.242091672046482, + "grad_norm": 0.9932476878166199, + "learning_rate": 0.0002, + "loss": 0.4652, + "step": 16240 + }, + { + "epoch": 5.2453195610071015, + "grad_norm": 0.735236406326294, + "learning_rate": 0.0002, + "loss": 0.4469, + "step": 16250 + }, + { + "epoch": 5.248547449967721, + "grad_norm": 1.0289303064346313, + "learning_rate": 0.0002, + "loss": 0.4386, + "step": 16260 + }, + { + "epoch": 5.251775338928341, + "grad_norm": 0.9488231539726257, + "learning_rate": 0.0002, + "loss": 0.4303, + "step": 16270 + }, + { + "epoch": 5.2550032278889605, + "grad_norm": 0.8320055603981018, + "learning_rate": 0.0002, + "loss": 0.4495, + "step": 16280 + }, + { + "epoch": 5.25823111684958, + "grad_norm": 1.2013251781463623, + "learning_rate": 0.0002, + "loss": 0.4224, + "step": 16290 + }, + { + "epoch": 5.2614590058102, + "grad_norm": 1.0649845600128174, + "learning_rate": 0.0002, + "loss": 0.4666, + "step": 16300 + }, + { + "epoch": 5.26468689477082, + "grad_norm": 1.1674472093582153, + "learning_rate": 0.0002, + "loss": 0.4325, + "step": 16310 + }, + { + "epoch": 5.26791478373144, + "grad_norm": 1.3934763669967651, + "learning_rate": 0.0002, + "loss": 0.4482, + "step": 16320 + }, + { + "epoch": 5.27114267269206, + "grad_norm": 0.8427977561950684, + "learning_rate": 0.0002, + "loss": 0.4494, + "step": 16330 + }, + { + "epoch": 5.274370561652679, + "grad_norm": 1.0497093200683594, + "learning_rate": 0.0002, + "loss": 0.4234, + "step": 16340 + }, + { + "epoch": 5.277598450613299, + "grad_norm": 0.8562338352203369, + "learning_rate": 0.0002, + "loss": 0.4337, + "step": 16350 + }, + { + "epoch": 5.280826339573919, + "grad_norm": 1.043920874595642, + "learning_rate": 0.0002, + "loss": 0.4664, + "step": 16360 + }, + { + "epoch": 5.284054228534538, + "grad_norm": 1.0039188861846924, + "learning_rate": 0.0002, + "loss": 0.4463, + "step": 16370 + }, + { + "epoch": 5.287282117495158, + "grad_norm": 0.9414041638374329, + "learning_rate": 0.0002, + "loss": 0.4149, + "step": 16380 + }, + { + "epoch": 5.2905100064557775, + "grad_norm": 1.3346221446990967, + "learning_rate": 0.0002, + "loss": 0.5119, + "step": 16390 + }, + { + "epoch": 5.293737895416398, + "grad_norm": 1.0173962116241455, + "learning_rate": 0.0002, + "loss": 0.4479, + "step": 16400 + }, + { + "epoch": 5.296965784377018, + "grad_norm": 0.7756500244140625, + "learning_rate": 0.0002, + "loss": 0.4538, + "step": 16410 + }, + { + "epoch": 5.300193673337637, + "grad_norm": 1.1185362339019775, + "learning_rate": 0.0002, + "loss": 0.4306, + "step": 16420 + }, + { + "epoch": 5.303421562298257, + "grad_norm": 1.0904899835586548, + "learning_rate": 0.0002, + "loss": 0.5033, + "step": 16430 + }, + { + "epoch": 5.306649451258877, + "grad_norm": 1.0803170204162598, + "learning_rate": 0.0002, + "loss": 0.4887, + "step": 16440 + }, + { + "epoch": 5.309877340219496, + "grad_norm": 1.1492092609405518, + "learning_rate": 0.0002, + "loss": 0.4473, + "step": 16450 + }, + { + "epoch": 5.313105229180116, + "grad_norm": 1.1212135553359985, + "learning_rate": 0.0002, + "loss": 0.4696, + "step": 16460 + }, + { + "epoch": 5.316333118140736, + "grad_norm": 0.8274528980255127, + "learning_rate": 0.0002, + "loss": 0.4438, + "step": 16470 + }, + { + "epoch": 5.319561007101356, + "grad_norm": 1.118891716003418, + "learning_rate": 0.0002, + "loss": 0.468, + "step": 16480 + }, + { + "epoch": 5.322788896061976, + "grad_norm": 1.185945749282837, + "learning_rate": 0.0002, + "loss": 0.4403, + "step": 16490 + }, + { + "epoch": 5.3260167850225955, + "grad_norm": 1.0275214910507202, + "learning_rate": 0.0002, + "loss": 0.4946, + "step": 16500 + }, + { + "epoch": 5.329244673983215, + "grad_norm": 0.9346362352371216, + "learning_rate": 0.0002, + "loss": 0.4612, + "step": 16510 + }, + { + "epoch": 5.332472562943835, + "grad_norm": 0.9600600600242615, + "learning_rate": 0.0002, + "loss": 0.4722, + "step": 16520 + }, + { + "epoch": 5.335700451904454, + "grad_norm": 1.1238188743591309, + "learning_rate": 0.0002, + "loss": 0.4536, + "step": 16530 + }, + { + "epoch": 5.338928340865074, + "grad_norm": 0.8660476207733154, + "learning_rate": 0.0002, + "loss": 0.5025, + "step": 16540 + }, + { + "epoch": 5.342156229825694, + "grad_norm": 0.9869821071624756, + "learning_rate": 0.0002, + "loss": 0.4732, + "step": 16550 + }, + { + "epoch": 5.345384118786313, + "grad_norm": 1.1719090938568115, + "learning_rate": 0.0002, + "loss": 0.4967, + "step": 16560 + }, + { + "epoch": 5.348612007746934, + "grad_norm": 1.0122894048690796, + "learning_rate": 0.0002, + "loss": 0.4563, + "step": 16570 + }, + { + "epoch": 5.351839896707554, + "grad_norm": 1.2431079149246216, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 16580 + }, + { + "epoch": 5.355067785668173, + "grad_norm": 1.4178080558776855, + "learning_rate": 0.0002, + "loss": 0.4708, + "step": 16590 + }, + { + "epoch": 5.358295674628793, + "grad_norm": 1.1895726919174194, + "learning_rate": 0.0002, + "loss": 0.4686, + "step": 16600 + }, + { + "epoch": 5.3615235635894125, + "grad_norm": 1.154392123222351, + "learning_rate": 0.0002, + "loss": 0.475, + "step": 16610 + }, + { + "epoch": 5.364751452550032, + "grad_norm": 0.9207229018211365, + "learning_rate": 0.0002, + "loss": 0.4511, + "step": 16620 + }, + { + "epoch": 5.367979341510652, + "grad_norm": 1.0247414112091064, + "learning_rate": 0.0002, + "loss": 0.4606, + "step": 16630 + }, + { + "epoch": 5.3712072304712715, + "grad_norm": 1.0402202606201172, + "learning_rate": 0.0002, + "loss": 0.4886, + "step": 16640 + }, + { + "epoch": 5.374435119431892, + "grad_norm": 1.1902891397476196, + "learning_rate": 0.0002, + "loss": 0.4903, + "step": 16650 + }, + { + "epoch": 5.377663008392512, + "grad_norm": 0.9572759866714478, + "learning_rate": 0.0002, + "loss": 0.4583, + "step": 16660 + }, + { + "epoch": 5.380890897353131, + "grad_norm": 0.9968860149383545, + "learning_rate": 0.0002, + "loss": 0.4636, + "step": 16670 + }, + { + "epoch": 5.384118786313751, + "grad_norm": 1.2468547821044922, + "learning_rate": 0.0002, + "loss": 0.477, + "step": 16680 + }, + { + "epoch": 5.387346675274371, + "grad_norm": 1.154661774635315, + "learning_rate": 0.0002, + "loss": 0.5223, + "step": 16690 + }, + { + "epoch": 5.39057456423499, + "grad_norm": 0.8837044835090637, + "learning_rate": 0.0002, + "loss": 0.4637, + "step": 16700 + }, + { + "epoch": 5.39380245319561, + "grad_norm": 1.0317907333374023, + "learning_rate": 0.0002, + "loss": 0.4744, + "step": 16710 + }, + { + "epoch": 5.39703034215623, + "grad_norm": 0.9811587929725647, + "learning_rate": 0.0002, + "loss": 0.4831, + "step": 16720 + }, + { + "epoch": 5.400258231116849, + "grad_norm": 0.9487450122833252, + "learning_rate": 0.0002, + "loss": 0.4739, + "step": 16730 + }, + { + "epoch": 5.403486120077469, + "grad_norm": 1.0540274381637573, + "learning_rate": 0.0002, + "loss": 0.4574, + "step": 16740 + }, + { + "epoch": 5.406714009038089, + "grad_norm": 1.028363585472107, + "learning_rate": 0.0002, + "loss": 0.4709, + "step": 16750 + }, + { + "epoch": 5.409941897998709, + "grad_norm": 1.0200704336166382, + "learning_rate": 0.0002, + "loss": 0.468, + "step": 16760 + }, + { + "epoch": 5.413169786959329, + "grad_norm": 1.0330981016159058, + "learning_rate": 0.0002, + "loss": 0.4383, + "step": 16770 + }, + { + "epoch": 5.416397675919948, + "grad_norm": 1.320875644683838, + "learning_rate": 0.0002, + "loss": 0.4645, + "step": 16780 + }, + { + "epoch": 5.419625564880568, + "grad_norm": 0.9838143587112427, + "learning_rate": 0.0002, + "loss": 0.4601, + "step": 16790 + }, + { + "epoch": 5.422853453841188, + "grad_norm": 1.1006578207015991, + "learning_rate": 0.0002, + "loss": 0.4835, + "step": 16800 + }, + { + "epoch": 5.426081342801807, + "grad_norm": 1.099174976348877, + "learning_rate": 0.0002, + "loss": 0.4871, + "step": 16810 + }, + { + "epoch": 5.429309231762427, + "grad_norm": 1.0632189512252808, + "learning_rate": 0.0002, + "loss": 0.4773, + "step": 16820 + }, + { + "epoch": 5.4325371207230475, + "grad_norm": 0.9673194885253906, + "learning_rate": 0.0002, + "loss": 0.4732, + "step": 16830 + }, + { + "epoch": 5.435765009683667, + "grad_norm": 0.853013813495636, + "learning_rate": 0.0002, + "loss": 0.4731, + "step": 16840 + }, + { + "epoch": 5.438992898644287, + "grad_norm": 1.0261728763580322, + "learning_rate": 0.0002, + "loss": 0.4856, + "step": 16850 + }, + { + "epoch": 5.4422207876049065, + "grad_norm": 1.1642370223999023, + "learning_rate": 0.0002, + "loss": 0.4729, + "step": 16860 + }, + { + "epoch": 5.445448676565526, + "grad_norm": 0.8715673685073853, + "learning_rate": 0.0002, + "loss": 0.4751, + "step": 16870 + }, + { + "epoch": 5.448676565526146, + "grad_norm": 0.905746579170227, + "learning_rate": 0.0002, + "loss": 0.4566, + "step": 16880 + }, + { + "epoch": 5.451904454486765, + "grad_norm": 1.1051915884017944, + "learning_rate": 0.0002, + "loss": 0.4536, + "step": 16890 + }, + { + "epoch": 5.455132343447385, + "grad_norm": 1.0781478881835938, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 16900 + }, + { + "epoch": 5.458360232408005, + "grad_norm": 1.1168911457061768, + "learning_rate": 0.0002, + "loss": 0.4655, + "step": 16910 + }, + { + "epoch": 5.461588121368625, + "grad_norm": 1.1150046586990356, + "learning_rate": 0.0002, + "loss": 0.4624, + "step": 16920 + }, + { + "epoch": 5.464816010329245, + "grad_norm": 0.9862499833106995, + "learning_rate": 0.0002, + "loss": 0.4849, + "step": 16930 + }, + { + "epoch": 5.468043899289865, + "grad_norm": 1.5416640043258667, + "learning_rate": 0.0002, + "loss": 0.47, + "step": 16940 + }, + { + "epoch": 5.471271788250484, + "grad_norm": 0.8960899710655212, + "learning_rate": 0.0002, + "loss": 0.4508, + "step": 16950 + }, + { + "epoch": 5.474499677211104, + "grad_norm": 0.9796477556228638, + "learning_rate": 0.0002, + "loss": 0.5002, + "step": 16960 + }, + { + "epoch": 5.4777275661717235, + "grad_norm": 0.9526587128639221, + "learning_rate": 0.0002, + "loss": 0.4939, + "step": 16970 + }, + { + "epoch": 5.480955455132343, + "grad_norm": 1.2373039722442627, + "learning_rate": 0.0002, + "loss": 0.4807, + "step": 16980 + }, + { + "epoch": 5.484183344092963, + "grad_norm": 1.1860566139221191, + "learning_rate": 0.0002, + "loss": 0.4642, + "step": 16990 + }, + { + "epoch": 5.487411233053583, + "grad_norm": 1.477345585823059, + "learning_rate": 0.0002, + "loss": 0.4929, + "step": 17000 + }, + { + "epoch": 5.490639122014203, + "grad_norm": 1.1029295921325684, + "learning_rate": 0.0002, + "loss": 0.4566, + "step": 17010 + }, + { + "epoch": 5.493867010974823, + "grad_norm": 1.1416981220245361, + "learning_rate": 0.0002, + "loss": 0.487, + "step": 17020 + }, + { + "epoch": 5.497094899935442, + "grad_norm": 1.1647989749908447, + "learning_rate": 0.0002, + "loss": 0.475, + "step": 17030 + }, + { + "epoch": 5.500322788896062, + "grad_norm": 1.1297032833099365, + "learning_rate": 0.0002, + "loss": 0.4644, + "step": 17040 + }, + { + "epoch": 5.503550677856682, + "grad_norm": 0.9764689207077026, + "learning_rate": 0.0002, + "loss": 0.4885, + "step": 17050 + }, + { + "epoch": 5.506778566817301, + "grad_norm": 1.038161039352417, + "learning_rate": 0.0002, + "loss": 0.4789, + "step": 17060 + }, + { + "epoch": 5.510006455777921, + "grad_norm": 1.1417886018753052, + "learning_rate": 0.0002, + "loss": 0.4467, + "step": 17070 + }, + { + "epoch": 5.513234344738541, + "grad_norm": 0.9300898313522339, + "learning_rate": 0.0002, + "loss": 0.4782, + "step": 17080 + }, + { + "epoch": 5.516462233699161, + "grad_norm": 1.0295016765594482, + "learning_rate": 0.0002, + "loss": 0.4805, + "step": 17090 + }, + { + "epoch": 5.519690122659781, + "grad_norm": 1.1273008584976196, + "learning_rate": 0.0002, + "loss": 0.4663, + "step": 17100 + }, + { + "epoch": 5.5229180116204, + "grad_norm": 0.9542737007141113, + "learning_rate": 0.0002, + "loss": 0.4897, + "step": 17110 + }, + { + "epoch": 5.52614590058102, + "grad_norm": 1.34589421749115, + "learning_rate": 0.0002, + "loss": 0.51, + "step": 17120 + }, + { + "epoch": 5.52937378954164, + "grad_norm": 0.9889675378799438, + "learning_rate": 0.0002, + "loss": 0.467, + "step": 17130 + }, + { + "epoch": 5.532601678502259, + "grad_norm": 1.25719153881073, + "learning_rate": 0.0002, + "loss": 0.4752, + "step": 17140 + }, + { + "epoch": 5.535829567462879, + "grad_norm": 1.2511073350906372, + "learning_rate": 0.0002, + "loss": 0.4609, + "step": 17150 + }, + { + "epoch": 5.539057456423499, + "grad_norm": 1.1993521451950073, + "learning_rate": 0.0002, + "loss": 0.4992, + "step": 17160 + }, + { + "epoch": 5.542285345384119, + "grad_norm": 1.1394526958465576, + "learning_rate": 0.0002, + "loss": 0.4986, + "step": 17170 + }, + { + "epoch": 5.545513234344739, + "grad_norm": 1.0435349941253662, + "learning_rate": 0.0002, + "loss": 0.5284, + "step": 17180 + }, + { + "epoch": 5.5487411233053585, + "grad_norm": 1.120940089225769, + "learning_rate": 0.0002, + "loss": 0.4934, + "step": 17190 + }, + { + "epoch": 5.551969012265978, + "grad_norm": 1.0906445980072021, + "learning_rate": 0.0002, + "loss": 0.4704, + "step": 17200 + }, + { + "epoch": 5.555196901226598, + "grad_norm": 0.8883966207504272, + "learning_rate": 0.0002, + "loss": 0.4896, + "step": 17210 + }, + { + "epoch": 5.5584247901872175, + "grad_norm": 1.3078752756118774, + "learning_rate": 0.0002, + "loss": 0.4696, + "step": 17220 + }, + { + "epoch": 5.561652679147837, + "grad_norm": 1.0224416255950928, + "learning_rate": 0.0002, + "loss": 0.4805, + "step": 17230 + }, + { + "epoch": 5.564880568108457, + "grad_norm": 1.242518663406372, + "learning_rate": 0.0002, + "loss": 0.47, + "step": 17240 + }, + { + "epoch": 5.568108457069076, + "grad_norm": 1.2328250408172607, + "learning_rate": 0.0002, + "loss": 0.4708, + "step": 17250 + }, + { + "epoch": 5.571336346029697, + "grad_norm": 1.2186611890792847, + "learning_rate": 0.0002, + "loss": 0.4685, + "step": 17260 + }, + { + "epoch": 5.574564234990317, + "grad_norm": 1.0947459936141968, + "learning_rate": 0.0002, + "loss": 0.4688, + "step": 17270 + }, + { + "epoch": 5.577792123950936, + "grad_norm": 1.075279951095581, + "learning_rate": 0.0002, + "loss": 0.506, + "step": 17280 + }, + { + "epoch": 5.581020012911556, + "grad_norm": 1.0316804647445679, + "learning_rate": 0.0002, + "loss": 0.478, + "step": 17290 + }, + { + "epoch": 5.584247901872176, + "grad_norm": 1.1077373027801514, + "learning_rate": 0.0002, + "loss": 0.478, + "step": 17300 + }, + { + "epoch": 5.587475790832795, + "grad_norm": 1.219228744506836, + "learning_rate": 0.0002, + "loss": 0.4857, + "step": 17310 + }, + { + "epoch": 5.590703679793415, + "grad_norm": 1.026361346244812, + "learning_rate": 0.0002, + "loss": 0.4465, + "step": 17320 + }, + { + "epoch": 5.5939315687540345, + "grad_norm": 1.1621283292770386, + "learning_rate": 0.0002, + "loss": 0.4831, + "step": 17330 + }, + { + "epoch": 5.597159457714655, + "grad_norm": 1.0177470445632935, + "learning_rate": 0.0002, + "loss": 0.4706, + "step": 17340 + }, + { + "epoch": 5.600387346675275, + "grad_norm": 1.0625319480895996, + "learning_rate": 0.0002, + "loss": 0.4961, + "step": 17350 + }, + { + "epoch": 5.603615235635894, + "grad_norm": 1.148815393447876, + "learning_rate": 0.0002, + "loss": 0.484, + "step": 17360 + }, + { + "epoch": 5.606843124596514, + "grad_norm": 1.0571802854537964, + "learning_rate": 0.0002, + "loss": 0.4804, + "step": 17370 + }, + { + "epoch": 5.610071013557134, + "grad_norm": 1.2069389820098877, + "learning_rate": 0.0002, + "loss": 0.5202, + "step": 17380 + }, + { + "epoch": 5.613298902517753, + "grad_norm": 1.407530426979065, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 17390 + }, + { + "epoch": 5.616526791478373, + "grad_norm": 1.247060775756836, + "learning_rate": 0.0002, + "loss": 0.4688, + "step": 17400 + }, + { + "epoch": 5.619754680438993, + "grad_norm": 1.431684136390686, + "learning_rate": 0.0002, + "loss": 0.4359, + "step": 17410 + }, + { + "epoch": 5.622982569399612, + "grad_norm": 1.0520552396774292, + "learning_rate": 0.0002, + "loss": 0.5244, + "step": 17420 + }, + { + "epoch": 5.626210458360232, + "grad_norm": 1.0593537092208862, + "learning_rate": 0.0002, + "loss": 0.4993, + "step": 17430 + }, + { + "epoch": 5.6294383473208525, + "grad_norm": 1.4414515495300293, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 17440 + }, + { + "epoch": 5.632666236281472, + "grad_norm": 1.0902460813522339, + "learning_rate": 0.0002, + "loss": 0.4761, + "step": 17450 + }, + { + "epoch": 5.635894125242092, + "grad_norm": 0.890944242477417, + "learning_rate": 0.0002, + "loss": 0.4737, + "step": 17460 + }, + { + "epoch": 5.639122014202711, + "grad_norm": 1.035675287246704, + "learning_rate": 0.0002, + "loss": 0.4706, + "step": 17470 + }, + { + "epoch": 5.642349903163331, + "grad_norm": 0.9792264103889465, + "learning_rate": 0.0002, + "loss": 0.484, + "step": 17480 + }, + { + "epoch": 5.645577792123951, + "grad_norm": 1.1888220310211182, + "learning_rate": 0.0002, + "loss": 0.4753, + "step": 17490 + }, + { + "epoch": 5.64880568108457, + "grad_norm": 1.0169143676757812, + "learning_rate": 0.0002, + "loss": 0.5047, + "step": 17500 + }, + { + "epoch": 5.652033570045191, + "grad_norm": 0.9812449216842651, + "learning_rate": 0.0002, + "loss": 0.4919, + "step": 17510 + }, + { + "epoch": 5.655261459005811, + "grad_norm": 1.0509105920791626, + "learning_rate": 0.0002, + "loss": 0.4879, + "step": 17520 + }, + { + "epoch": 5.65848934796643, + "grad_norm": 0.9047426581382751, + "learning_rate": 0.0002, + "loss": 0.4695, + "step": 17530 + }, + { + "epoch": 5.66171723692705, + "grad_norm": 1.2393709421157837, + "learning_rate": 0.0002, + "loss": 0.4712, + "step": 17540 + }, + { + "epoch": 5.6649451258876695, + "grad_norm": 1.1098991632461548, + "learning_rate": 0.0002, + "loss": 0.5012, + "step": 17550 + }, + { + "epoch": 5.668173014848289, + "grad_norm": 0.8181570768356323, + "learning_rate": 0.0002, + "loss": 0.4499, + "step": 17560 + }, + { + "epoch": 5.671400903808909, + "grad_norm": 0.9676381945610046, + "learning_rate": 0.0002, + "loss": 0.4973, + "step": 17570 + }, + { + "epoch": 5.6746287927695285, + "grad_norm": 1.1225934028625488, + "learning_rate": 0.0002, + "loss": 0.5058, + "step": 17580 + }, + { + "epoch": 5.677856681730148, + "grad_norm": 1.6259925365447998, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 17590 + }, + { + "epoch": 5.681084570690768, + "grad_norm": 0.7751404643058777, + "learning_rate": 0.0002, + "loss": 0.4613, + "step": 17600 + }, + { + "epoch": 5.684312459651388, + "grad_norm": 0.8478589057922363, + "learning_rate": 0.0002, + "loss": 0.4895, + "step": 17610 + }, + { + "epoch": 5.687540348612008, + "grad_norm": 1.2887113094329834, + "learning_rate": 0.0002, + "loss": 0.4492, + "step": 17620 + }, + { + "epoch": 5.690768237572628, + "grad_norm": 1.1452652215957642, + "learning_rate": 0.0002, + "loss": 0.4792, + "step": 17630 + }, + { + "epoch": 5.693996126533247, + "grad_norm": 1.0370417833328247, + "learning_rate": 0.0002, + "loss": 0.4889, + "step": 17640 + }, + { + "epoch": 5.697224015493867, + "grad_norm": 1.1358870267868042, + "learning_rate": 0.0002, + "loss": 0.535, + "step": 17650 + }, + { + "epoch": 5.700451904454487, + "grad_norm": 1.2772479057312012, + "learning_rate": 0.0002, + "loss": 0.4753, + "step": 17660 + }, + { + "epoch": 5.703679793415106, + "grad_norm": 1.182812213897705, + "learning_rate": 0.0002, + "loss": 0.4492, + "step": 17670 + }, + { + "epoch": 5.706907682375727, + "grad_norm": 1.099074125289917, + "learning_rate": 0.0002, + "loss": 0.5025, + "step": 17680 + }, + { + "epoch": 5.710135571336346, + "grad_norm": 0.938634991645813, + "learning_rate": 0.0002, + "loss": 0.4945, + "step": 17690 + }, + { + "epoch": 5.713363460296966, + "grad_norm": 0.9385238885879517, + "learning_rate": 0.0002, + "loss": 0.491, + "step": 17700 + }, + { + "epoch": 5.716591349257586, + "grad_norm": 1.1486014127731323, + "learning_rate": 0.0002, + "loss": 0.4849, + "step": 17710 + }, + { + "epoch": 5.719819238218205, + "grad_norm": 0.9433078169822693, + "learning_rate": 0.0002, + "loss": 0.5043, + "step": 17720 + }, + { + "epoch": 5.723047127178825, + "grad_norm": 1.02472722530365, + "learning_rate": 0.0002, + "loss": 0.4543, + "step": 17730 + }, + { + "epoch": 5.726275016139445, + "grad_norm": 0.9360876679420471, + "learning_rate": 0.0002, + "loss": 0.4631, + "step": 17740 + }, + { + "epoch": 5.729502905100064, + "grad_norm": 1.0481483936309814, + "learning_rate": 0.0002, + "loss": 0.4947, + "step": 17750 + }, + { + "epoch": 5.732730794060684, + "grad_norm": 1.0032516717910767, + "learning_rate": 0.0002, + "loss": 0.4763, + "step": 17760 + }, + { + "epoch": 5.735958683021304, + "grad_norm": 0.8908069729804993, + "learning_rate": 0.0002, + "loss": 0.4819, + "step": 17770 + }, + { + "epoch": 5.739186571981924, + "grad_norm": 1.0679123401641846, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 17780 + }, + { + "epoch": 5.742414460942544, + "grad_norm": 1.0448014736175537, + "learning_rate": 0.0002, + "loss": 0.4818, + "step": 17790 + }, + { + "epoch": 5.7456423499031635, + "grad_norm": 1.0433847904205322, + "learning_rate": 0.0002, + "loss": 0.4869, + "step": 17800 + }, + { + "epoch": 5.748870238863783, + "grad_norm": 1.000291109085083, + "learning_rate": 0.0002, + "loss": 0.5243, + "step": 17810 + }, + { + "epoch": 5.752098127824403, + "grad_norm": 1.1238429546356201, + "learning_rate": 0.0002, + "loss": 0.4891, + "step": 17820 + }, + { + "epoch": 5.755326016785022, + "grad_norm": 1.09062659740448, + "learning_rate": 0.0002, + "loss": 0.4905, + "step": 17830 + }, + { + "epoch": 5.758553905745642, + "grad_norm": 0.8538689613342285, + "learning_rate": 0.0002, + "loss": 0.4883, + "step": 17840 + }, + { + "epoch": 5.761781794706262, + "grad_norm": 1.3872947692871094, + "learning_rate": 0.0002, + "loss": 0.4989, + "step": 17850 + }, + { + "epoch": 5.765009683666882, + "grad_norm": 1.0578876733779907, + "learning_rate": 0.0002, + "loss": 0.4707, + "step": 17860 + }, + { + "epoch": 5.768237572627502, + "grad_norm": 1.1761705875396729, + "learning_rate": 0.0002, + "loss": 0.5281, + "step": 17870 + }, + { + "epoch": 5.771465461588122, + "grad_norm": 1.1223368644714355, + "learning_rate": 0.0002, + "loss": 0.4802, + "step": 17880 + }, + { + "epoch": 5.774693350548741, + "grad_norm": 1.2484360933303833, + "learning_rate": 0.0002, + "loss": 0.505, + "step": 17890 + }, + { + "epoch": 5.777921239509361, + "grad_norm": 1.2461199760437012, + "learning_rate": 0.0002, + "loss": 0.4786, + "step": 17900 + }, + { + "epoch": 5.7811491284699805, + "grad_norm": 1.1718299388885498, + "learning_rate": 0.0002, + "loss": 0.4933, + "step": 17910 + }, + { + "epoch": 5.7843770174306, + "grad_norm": 0.9896837472915649, + "learning_rate": 0.0002, + "loss": 0.471, + "step": 17920 + }, + { + "epoch": 5.78760490639122, + "grad_norm": 1.3759760856628418, + "learning_rate": 0.0002, + "loss": 0.4808, + "step": 17930 + }, + { + "epoch": 5.7908327953518395, + "grad_norm": 1.0596622228622437, + "learning_rate": 0.0002, + "loss": 0.4847, + "step": 17940 + }, + { + "epoch": 5.79406068431246, + "grad_norm": 0.9292021989822388, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 17950 + }, + { + "epoch": 5.79728857327308, + "grad_norm": 0.8786653876304626, + "learning_rate": 0.0002, + "loss": 0.4783, + "step": 17960 + }, + { + "epoch": 5.800516462233699, + "grad_norm": 1.2087152004241943, + "learning_rate": 0.0002, + "loss": 0.4598, + "step": 17970 + }, + { + "epoch": 5.803744351194319, + "grad_norm": 1.1643104553222656, + "learning_rate": 0.0002, + "loss": 0.4953, + "step": 17980 + }, + { + "epoch": 5.806972240154939, + "grad_norm": 0.971613347530365, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 17990 + }, + { + "epoch": 5.810200129115558, + "grad_norm": 1.306227684020996, + "learning_rate": 0.0002, + "loss": 0.5094, + "step": 18000 + }, + { + "epoch": 5.813428018076178, + "grad_norm": 1.3665502071380615, + "learning_rate": 0.0002, + "loss": 0.5392, + "step": 18010 + }, + { + "epoch": 5.816655907036798, + "grad_norm": 1.2227312326431274, + "learning_rate": 0.0002, + "loss": 0.4887, + "step": 18020 + }, + { + "epoch": 5.819883795997418, + "grad_norm": 1.180694818496704, + "learning_rate": 0.0002, + "loss": 0.5203, + "step": 18030 + }, + { + "epoch": 5.823111684958038, + "grad_norm": 1.1045362949371338, + "learning_rate": 0.0002, + "loss": 0.4962, + "step": 18040 + }, + { + "epoch": 5.826339573918657, + "grad_norm": 1.3828954696655273, + "learning_rate": 0.0002, + "loss": 0.4969, + "step": 18050 + }, + { + "epoch": 5.829567462879277, + "grad_norm": 1.305102825164795, + "learning_rate": 0.0002, + "loss": 0.5493, + "step": 18060 + }, + { + "epoch": 5.832795351839897, + "grad_norm": 1.2708743810653687, + "learning_rate": 0.0002, + "loss": 0.4844, + "step": 18070 + }, + { + "epoch": 5.836023240800516, + "grad_norm": 1.0344188213348389, + "learning_rate": 0.0002, + "loss": 0.4834, + "step": 18080 + }, + { + "epoch": 5.839251129761136, + "grad_norm": 1.1321724653244019, + "learning_rate": 0.0002, + "loss": 0.5088, + "step": 18090 + }, + { + "epoch": 5.842479018721756, + "grad_norm": 1.2162611484527588, + "learning_rate": 0.0002, + "loss": 0.4888, + "step": 18100 + }, + { + "epoch": 5.845706907682375, + "grad_norm": 1.427612543106079, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 18110 + }, + { + "epoch": 5.848934796642995, + "grad_norm": 1.4391452074050903, + "learning_rate": 0.0002, + "loss": 0.5339, + "step": 18120 + }, + { + "epoch": 5.8521626856036155, + "grad_norm": 1.1548216342926025, + "learning_rate": 0.0002, + "loss": 0.528, + "step": 18130 + }, + { + "epoch": 5.855390574564235, + "grad_norm": 1.2336437702178955, + "learning_rate": 0.0002, + "loss": 0.4779, + "step": 18140 + }, + { + "epoch": 5.858618463524855, + "grad_norm": 1.254661202430725, + "learning_rate": 0.0002, + "loss": 0.4844, + "step": 18150 + }, + { + "epoch": 5.8618463524854745, + "grad_norm": 0.8326491117477417, + "learning_rate": 0.0002, + "loss": 0.5201, + "step": 18160 + }, + { + "epoch": 5.865074241446094, + "grad_norm": 1.0907988548278809, + "learning_rate": 0.0002, + "loss": 0.5076, + "step": 18170 + }, + { + "epoch": 5.868302130406714, + "grad_norm": 0.9896568655967712, + "learning_rate": 0.0002, + "loss": 0.48, + "step": 18180 + }, + { + "epoch": 5.871530019367333, + "grad_norm": 0.9440065026283264, + "learning_rate": 0.0002, + "loss": 0.4628, + "step": 18190 + }, + { + "epoch": 5.874757908327954, + "grad_norm": 1.09321129322052, + "learning_rate": 0.0002, + "loss": 0.5265, + "step": 18200 + }, + { + "epoch": 5.877985797288574, + "grad_norm": 1.2588142156600952, + "learning_rate": 0.0002, + "loss": 0.4737, + "step": 18210 + }, + { + "epoch": 5.881213686249193, + "grad_norm": 1.1731587648391724, + "learning_rate": 0.0002, + "loss": 0.475, + "step": 18220 + }, + { + "epoch": 5.884441575209813, + "grad_norm": 0.9904444217681885, + "learning_rate": 0.0002, + "loss": 0.504, + "step": 18230 + }, + { + "epoch": 5.887669464170433, + "grad_norm": 0.8985799551010132, + "learning_rate": 0.0002, + "loss": 0.4842, + "step": 18240 + }, + { + "epoch": 5.890897353131052, + "grad_norm": 1.0182441473007202, + "learning_rate": 0.0002, + "loss": 0.4878, + "step": 18250 + }, + { + "epoch": 5.894125242091672, + "grad_norm": 1.1574701070785522, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 18260 + }, + { + "epoch": 5.8973531310522915, + "grad_norm": 1.1776602268218994, + "learning_rate": 0.0002, + "loss": 0.5, + "step": 18270 + }, + { + "epoch": 5.900581020012911, + "grad_norm": 1.4951308965682983, + "learning_rate": 0.0002, + "loss": 0.5245, + "step": 18280 + }, + { + "epoch": 5.903808908973531, + "grad_norm": 1.1440261602401733, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 18290 + }, + { + "epoch": 5.907036797934151, + "grad_norm": 0.9925196170806885, + "learning_rate": 0.0002, + "loss": 0.4868, + "step": 18300 + }, + { + "epoch": 5.910264686894771, + "grad_norm": 1.098615288734436, + "learning_rate": 0.0002, + "loss": 0.5142, + "step": 18310 + }, + { + "epoch": 5.913492575855391, + "grad_norm": 1.0030080080032349, + "learning_rate": 0.0002, + "loss": 0.5184, + "step": 18320 + }, + { + "epoch": 5.91672046481601, + "grad_norm": 0.9890318512916565, + "learning_rate": 0.0002, + "loss": 0.474, + "step": 18330 + }, + { + "epoch": 5.91994835377663, + "grad_norm": 1.2209392786026, + "learning_rate": 0.0002, + "loss": 0.5125, + "step": 18340 + }, + { + "epoch": 5.92317624273725, + "grad_norm": 1.108933925628662, + "learning_rate": 0.0002, + "loss": 0.4634, + "step": 18350 + }, + { + "epoch": 5.926404131697869, + "grad_norm": 1.086024522781372, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 18360 + }, + { + "epoch": 5.92963202065849, + "grad_norm": 1.0061167478561401, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 18370 + }, + { + "epoch": 5.9328599096191095, + "grad_norm": 0.9445858597755432, + "learning_rate": 0.0002, + "loss": 0.4848, + "step": 18380 + }, + { + "epoch": 5.936087798579729, + "grad_norm": 0.9556859135627747, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 18390 + }, + { + "epoch": 5.939315687540349, + "grad_norm": 1.154168963432312, + "learning_rate": 0.0002, + "loss": 0.4966, + "step": 18400 + }, + { + "epoch": 5.942543576500968, + "grad_norm": 1.0495831966400146, + "learning_rate": 0.0002, + "loss": 0.4836, + "step": 18410 + }, + { + "epoch": 5.945771465461588, + "grad_norm": 1.0717304944992065, + "learning_rate": 0.0002, + "loss": 0.5021, + "step": 18420 + }, + { + "epoch": 5.948999354422208, + "grad_norm": 1.06618332862854, + "learning_rate": 0.0002, + "loss": 0.4794, + "step": 18430 + }, + { + "epoch": 5.952227243382827, + "grad_norm": 0.9567165374755859, + "learning_rate": 0.0002, + "loss": 0.5011, + "step": 18440 + }, + { + "epoch": 5.955455132343447, + "grad_norm": 1.0306249856948853, + "learning_rate": 0.0002, + "loss": 0.485, + "step": 18450 + }, + { + "epoch": 5.958683021304067, + "grad_norm": 1.1879968643188477, + "learning_rate": 0.0002, + "loss": 0.4948, + "step": 18460 + }, + { + "epoch": 5.961910910264687, + "grad_norm": 1.3177233934402466, + "learning_rate": 0.0002, + "loss": 0.5185, + "step": 18470 + }, + { + "epoch": 5.965138799225307, + "grad_norm": 1.0945817232131958, + "learning_rate": 0.0002, + "loss": 0.4966, + "step": 18480 + }, + { + "epoch": 5.9683666881859265, + "grad_norm": 1.029414415359497, + "learning_rate": 0.0002, + "loss": 0.5196, + "step": 18490 + }, + { + "epoch": 5.971594577146546, + "grad_norm": 1.2266209125518799, + "learning_rate": 0.0002, + "loss": 0.5154, + "step": 18500 + }, + { + "epoch": 5.974822466107166, + "grad_norm": 1.2167150974273682, + "learning_rate": 0.0002, + "loss": 0.4914, + "step": 18510 + }, + { + "epoch": 5.9780503550677855, + "grad_norm": 0.9941056966781616, + "learning_rate": 0.0002, + "loss": 0.466, + "step": 18520 + }, + { + "epoch": 5.981278244028405, + "grad_norm": 1.4244859218597412, + "learning_rate": 0.0002, + "loss": 0.5037, + "step": 18530 + }, + { + "epoch": 5.984506132989026, + "grad_norm": 0.8976260423660278, + "learning_rate": 0.0002, + "loss": 0.4902, + "step": 18540 + }, + { + "epoch": 5.987734021949645, + "grad_norm": 1.0162699222564697, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 18550 + }, + { + "epoch": 5.990961910910265, + "grad_norm": 1.196677803993225, + "learning_rate": 0.0002, + "loss": 0.5138, + "step": 18560 + }, + { + "epoch": 5.994189799870885, + "grad_norm": 1.163403868675232, + "learning_rate": 0.0002, + "loss": 0.4626, + "step": 18570 + }, + { + "epoch": 5.997417688831504, + "grad_norm": 1.010205626487732, + "learning_rate": 0.0002, + "loss": 0.5105, + "step": 18580 + }, + { + "epoch": 6.0, + "eval_loss": 1.2861483097076416, + "eval_runtime": 163.2683, + "eval_samples_per_second": 4.49, + "eval_steps_per_second": 0.563, + "step": 18588 + }, + { + "epoch": 6.000645577792124, + "grad_norm": 0.7334756255149841, + "learning_rate": 0.0002, + "loss": 0.4557, + "step": 18590 + }, + { + "epoch": 6.003873466752744, + "grad_norm": 1.093945026397705, + "learning_rate": 0.0002, + "loss": 0.4201, + "step": 18600 + }, + { + "epoch": 6.007101355713363, + "grad_norm": 1.2327148914337158, + "learning_rate": 0.0002, + "loss": 0.4235, + "step": 18610 + }, + { + "epoch": 6.010329244673983, + "grad_norm": 1.3238836526870728, + "learning_rate": 0.0002, + "loss": 0.377, + "step": 18620 + }, + { + "epoch": 6.0135571336346025, + "grad_norm": 1.2364031076431274, + "learning_rate": 0.0002, + "loss": 0.3883, + "step": 18630 + }, + { + "epoch": 6.016785022595223, + "grad_norm": 0.902474045753479, + "learning_rate": 0.0002, + "loss": 0.3958, + "step": 18640 + }, + { + "epoch": 6.020012911555843, + "grad_norm": 1.273280382156372, + "learning_rate": 0.0002, + "loss": 0.4077, + "step": 18650 + }, + { + "epoch": 6.023240800516462, + "grad_norm": 1.2470760345458984, + "learning_rate": 0.0002, + "loss": 0.4224, + "step": 18660 + }, + { + "epoch": 6.026468689477082, + "grad_norm": 1.2360138893127441, + "learning_rate": 0.0002, + "loss": 0.3752, + "step": 18670 + }, + { + "epoch": 6.029696578437702, + "grad_norm": 1.467140793800354, + "learning_rate": 0.0002, + "loss": 0.3653, + "step": 18680 + }, + { + "epoch": 6.032924467398321, + "grad_norm": 1.123871088027954, + "learning_rate": 0.0002, + "loss": 0.3883, + "step": 18690 + }, + { + "epoch": 6.036152356358941, + "grad_norm": 0.9732550978660583, + "learning_rate": 0.0002, + "loss": 0.3812, + "step": 18700 + }, + { + "epoch": 6.039380245319561, + "grad_norm": 1.170860767364502, + "learning_rate": 0.0002, + "loss": 0.4163, + "step": 18710 + }, + { + "epoch": 6.042608134280181, + "grad_norm": 1.2599345445632935, + "learning_rate": 0.0002, + "loss": 0.3836, + "step": 18720 + }, + { + "epoch": 6.045836023240801, + "grad_norm": 1.0808286666870117, + "learning_rate": 0.0002, + "loss": 0.3881, + "step": 18730 + }, + { + "epoch": 6.0490639122014205, + "grad_norm": 0.9799565076828003, + "learning_rate": 0.0002, + "loss": 0.386, + "step": 18740 + }, + { + "epoch": 6.05229180116204, + "grad_norm": 0.8425611853599548, + "learning_rate": 0.0002, + "loss": 0.3833, + "step": 18750 + }, + { + "epoch": 6.05551969012266, + "grad_norm": 0.9762344360351562, + "learning_rate": 0.0002, + "loss": 0.3765, + "step": 18760 + }, + { + "epoch": 6.058747579083279, + "grad_norm": 1.1290913820266724, + "learning_rate": 0.0002, + "loss": 0.3878, + "step": 18770 + }, + { + "epoch": 6.061975468043899, + "grad_norm": 1.2240493297576904, + "learning_rate": 0.0002, + "loss": 0.4061, + "step": 18780 + }, + { + "epoch": 6.065203357004519, + "grad_norm": 1.3422439098358154, + "learning_rate": 0.0002, + "loss": 0.3894, + "step": 18790 + }, + { + "epoch": 6.068431245965138, + "grad_norm": 1.0391879081726074, + "learning_rate": 0.0002, + "loss": 0.3885, + "step": 18800 + }, + { + "epoch": 6.071659134925759, + "grad_norm": 1.0910760164260864, + "learning_rate": 0.0002, + "loss": 0.409, + "step": 18810 + }, + { + "epoch": 6.074887023886379, + "grad_norm": 1.280098557472229, + "learning_rate": 0.0002, + "loss": 0.3905, + "step": 18820 + }, + { + "epoch": 6.078114912846998, + "grad_norm": 1.2102673053741455, + "learning_rate": 0.0002, + "loss": 0.3892, + "step": 18830 + }, + { + "epoch": 6.081342801807618, + "grad_norm": 1.3735624551773071, + "learning_rate": 0.0002, + "loss": 0.3757, + "step": 18840 + }, + { + "epoch": 6.0845706907682375, + "grad_norm": 1.039419412612915, + "learning_rate": 0.0002, + "loss": 0.4057, + "step": 18850 + }, + { + "epoch": 6.087798579728857, + "grad_norm": 1.175872802734375, + "learning_rate": 0.0002, + "loss": 0.4093, + "step": 18860 + }, + { + "epoch": 6.091026468689477, + "grad_norm": 1.4287301301956177, + "learning_rate": 0.0002, + "loss": 0.3933, + "step": 18870 + }, + { + "epoch": 6.0942543576500965, + "grad_norm": 1.110627293586731, + "learning_rate": 0.0002, + "loss": 0.4029, + "step": 18880 + }, + { + "epoch": 6.097482246610717, + "grad_norm": 1.1495535373687744, + "learning_rate": 0.0002, + "loss": 0.4195, + "step": 18890 + }, + { + "epoch": 6.100710135571337, + "grad_norm": 0.9764134287834167, + "learning_rate": 0.0002, + "loss": 0.4022, + "step": 18900 + }, + { + "epoch": 6.103938024531956, + "grad_norm": 1.0792596340179443, + "learning_rate": 0.0002, + "loss": 0.4097, + "step": 18910 + }, + { + "epoch": 6.107165913492576, + "grad_norm": 1.2520235776901245, + "learning_rate": 0.0002, + "loss": 0.402, + "step": 18920 + }, + { + "epoch": 6.110393802453196, + "grad_norm": 0.857008695602417, + "learning_rate": 0.0002, + "loss": 0.4091, + "step": 18930 + }, + { + "epoch": 6.113621691413815, + "grad_norm": 1.745723009109497, + "learning_rate": 0.0002, + "loss": 0.4046, + "step": 18940 + }, + { + "epoch": 6.116849580374435, + "grad_norm": 1.099941611289978, + "learning_rate": 0.0002, + "loss": 0.4245, + "step": 18950 + }, + { + "epoch": 6.120077469335055, + "grad_norm": 1.1402947902679443, + "learning_rate": 0.0002, + "loss": 0.3708, + "step": 18960 + }, + { + "epoch": 6.123305358295674, + "grad_norm": 1.0565131902694702, + "learning_rate": 0.0002, + "loss": 0.4022, + "step": 18970 + }, + { + "epoch": 6.126533247256295, + "grad_norm": 1.1511917114257812, + "learning_rate": 0.0002, + "loss": 0.3973, + "step": 18980 + }, + { + "epoch": 6.129761136216914, + "grad_norm": 0.9029410481452942, + "learning_rate": 0.0002, + "loss": 0.395, + "step": 18990 + }, + { + "epoch": 6.132989025177534, + "grad_norm": 1.03252375125885, + "learning_rate": 0.0002, + "loss": 0.393, + "step": 19000 + }, + { + "epoch": 6.136216914138154, + "grad_norm": 1.2058522701263428, + "learning_rate": 0.0002, + "loss": 0.3923, + "step": 19010 + }, + { + "epoch": 6.139444803098773, + "grad_norm": 1.2274953126907349, + "learning_rate": 0.0002, + "loss": 0.3963, + "step": 19020 + }, + { + "epoch": 6.142672692059393, + "grad_norm": 1.3196226358413696, + "learning_rate": 0.0002, + "loss": 0.3999, + "step": 19030 + }, + { + "epoch": 6.145900581020013, + "grad_norm": 0.8030686378479004, + "learning_rate": 0.0002, + "loss": 0.4176, + "step": 19040 + }, + { + "epoch": 6.149128469980632, + "grad_norm": 1.1762639284133911, + "learning_rate": 0.0002, + "loss": 0.3886, + "step": 19050 + }, + { + "epoch": 6.152356358941253, + "grad_norm": 1.0247628688812256, + "learning_rate": 0.0002, + "loss": 0.429, + "step": 19060 + }, + { + "epoch": 6.1555842479018725, + "grad_norm": 0.99031662940979, + "learning_rate": 0.0002, + "loss": 0.3876, + "step": 19070 + }, + { + "epoch": 6.158812136862492, + "grad_norm": 1.334445834159851, + "learning_rate": 0.0002, + "loss": 0.3818, + "step": 19080 + }, + { + "epoch": 6.162040025823112, + "grad_norm": 1.1160423755645752, + "learning_rate": 0.0002, + "loss": 0.4038, + "step": 19090 + }, + { + "epoch": 6.1652679147837315, + "grad_norm": 1.2579560279846191, + "learning_rate": 0.0002, + "loss": 0.4081, + "step": 19100 + }, + { + "epoch": 6.168495803744351, + "grad_norm": 0.9372721910476685, + "learning_rate": 0.0002, + "loss": 0.4092, + "step": 19110 + }, + { + "epoch": 6.171723692704971, + "grad_norm": 0.7995722889900208, + "learning_rate": 0.0002, + "loss": 0.3905, + "step": 19120 + }, + { + "epoch": 6.17495158166559, + "grad_norm": 1.0074360370635986, + "learning_rate": 0.0002, + "loss": 0.3896, + "step": 19130 + }, + { + "epoch": 6.17817947062621, + "grad_norm": 0.9821600914001465, + "learning_rate": 0.0002, + "loss": 0.4328, + "step": 19140 + }, + { + "epoch": 6.181407359586831, + "grad_norm": 1.1252691745758057, + "learning_rate": 0.0002, + "loss": 0.3845, + "step": 19150 + }, + { + "epoch": 6.18463524854745, + "grad_norm": 1.316981554031372, + "learning_rate": 0.0002, + "loss": 0.3918, + "step": 19160 + }, + { + "epoch": 6.18786313750807, + "grad_norm": 1.0131299495697021, + "learning_rate": 0.0002, + "loss": 0.3893, + "step": 19170 + }, + { + "epoch": 6.19109102646869, + "grad_norm": 1.3530288934707642, + "learning_rate": 0.0002, + "loss": 0.4111, + "step": 19180 + }, + { + "epoch": 6.194318915429309, + "grad_norm": 1.148247480392456, + "learning_rate": 0.0002, + "loss": 0.416, + "step": 19190 + }, + { + "epoch": 6.197546804389929, + "grad_norm": 1.5510036945343018, + "learning_rate": 0.0002, + "loss": 0.4191, + "step": 19200 + }, + { + "epoch": 6.2007746933505485, + "grad_norm": 1.3048018217086792, + "learning_rate": 0.0002, + "loss": 0.423, + "step": 19210 + }, + { + "epoch": 6.204002582311168, + "grad_norm": 1.186187982559204, + "learning_rate": 0.0002, + "loss": 0.397, + "step": 19220 + }, + { + "epoch": 6.207230471271788, + "grad_norm": 1.5199471712112427, + "learning_rate": 0.0002, + "loss": 0.4164, + "step": 19230 + }, + { + "epoch": 6.210458360232408, + "grad_norm": 1.1311423778533936, + "learning_rate": 0.0002, + "loss": 0.4322, + "step": 19240 + }, + { + "epoch": 6.213686249193028, + "grad_norm": 1.2345898151397705, + "learning_rate": 0.0002, + "loss": 0.4086, + "step": 19250 + }, + { + "epoch": 6.216914138153648, + "grad_norm": 1.0261863470077515, + "learning_rate": 0.0002, + "loss": 0.4122, + "step": 19260 + }, + { + "epoch": 6.220142027114267, + "grad_norm": 0.8985416293144226, + "learning_rate": 0.0002, + "loss": 0.4315, + "step": 19270 + }, + { + "epoch": 6.223369916074887, + "grad_norm": 1.3136980533599854, + "learning_rate": 0.0002, + "loss": 0.4052, + "step": 19280 + }, + { + "epoch": 6.226597805035507, + "grad_norm": 1.1949185132980347, + "learning_rate": 0.0002, + "loss": 0.4232, + "step": 19290 + }, + { + "epoch": 6.229825693996126, + "grad_norm": 0.9668909907341003, + "learning_rate": 0.0002, + "loss": 0.4255, + "step": 19300 + }, + { + "epoch": 6.233053582956746, + "grad_norm": 0.8858964443206787, + "learning_rate": 0.0002, + "loss": 0.3917, + "step": 19310 + }, + { + "epoch": 6.236281471917366, + "grad_norm": 1.4254822731018066, + "learning_rate": 0.0002, + "loss": 0.4087, + "step": 19320 + }, + { + "epoch": 6.239509360877986, + "grad_norm": 1.0455392599105835, + "learning_rate": 0.0002, + "loss": 0.426, + "step": 19330 + }, + { + "epoch": 6.242737249838606, + "grad_norm": 1.1690824031829834, + "learning_rate": 0.0002, + "loss": 0.3894, + "step": 19340 + }, + { + "epoch": 6.245965138799225, + "grad_norm": 1.0347497463226318, + "learning_rate": 0.0002, + "loss": 0.3777, + "step": 19350 + }, + { + "epoch": 6.249193027759845, + "grad_norm": 1.0790464878082275, + "learning_rate": 0.0002, + "loss": 0.3972, + "step": 19360 + }, + { + "epoch": 6.252420916720465, + "grad_norm": 1.1294453144073486, + "learning_rate": 0.0002, + "loss": 0.4393, + "step": 19370 + }, + { + "epoch": 6.255648805681084, + "grad_norm": 1.5094330310821533, + "learning_rate": 0.0002, + "loss": 0.4055, + "step": 19380 + }, + { + "epoch": 6.258876694641704, + "grad_norm": 1.1122944355010986, + "learning_rate": 0.0002, + "loss": 0.4228, + "step": 19390 + }, + { + "epoch": 6.262104583602324, + "grad_norm": 1.3123422861099243, + "learning_rate": 0.0002, + "loss": 0.4341, + "step": 19400 + }, + { + "epoch": 6.265332472562944, + "grad_norm": 1.0585907697677612, + "learning_rate": 0.0002, + "loss": 0.4206, + "step": 19410 + }, + { + "epoch": 6.268560361523564, + "grad_norm": 0.8711239099502563, + "learning_rate": 0.0002, + "loss": 0.4001, + "step": 19420 + }, + { + "epoch": 6.2717882504841835, + "grad_norm": 1.2772116661071777, + "learning_rate": 0.0002, + "loss": 0.4201, + "step": 19430 + }, + { + "epoch": 6.275016139444803, + "grad_norm": 1.0035508871078491, + "learning_rate": 0.0002, + "loss": 0.4298, + "step": 19440 + }, + { + "epoch": 6.278244028405423, + "grad_norm": 0.7933974862098694, + "learning_rate": 0.0002, + "loss": 0.4234, + "step": 19450 + }, + { + "epoch": 6.2814719173660425, + "grad_norm": 1.2455826997756958, + "learning_rate": 0.0002, + "loss": 0.4144, + "step": 19460 + }, + { + "epoch": 6.284699806326662, + "grad_norm": 1.2735545635223389, + "learning_rate": 0.0002, + "loss": 0.4171, + "step": 19470 + }, + { + "epoch": 6.287927695287282, + "grad_norm": 0.9773174524307251, + "learning_rate": 0.0002, + "loss": 0.3956, + "step": 19480 + }, + { + "epoch": 6.2911555842479014, + "grad_norm": 1.2341974973678589, + "learning_rate": 0.0002, + "loss": 0.4264, + "step": 19490 + }, + { + "epoch": 6.294383473208522, + "grad_norm": 1.286138653755188, + "learning_rate": 0.0002, + "loss": 0.4068, + "step": 19500 + }, + { + "epoch": 6.297611362169142, + "grad_norm": 1.052889108657837, + "learning_rate": 0.0002, + "loss": 0.439, + "step": 19510 + }, + { + "epoch": 6.300839251129761, + "grad_norm": 1.1955385208129883, + "learning_rate": 0.0002, + "loss": 0.4199, + "step": 19520 + }, + { + "epoch": 6.304067140090381, + "grad_norm": 1.2792452573776245, + "learning_rate": 0.0002, + "loss": 0.4242, + "step": 19530 + }, + { + "epoch": 6.307295029051001, + "grad_norm": 0.9077931046485901, + "learning_rate": 0.0002, + "loss": 0.3989, + "step": 19540 + }, + { + "epoch": 6.31052291801162, + "grad_norm": 1.2492976188659668, + "learning_rate": 0.0002, + "loss": 0.388, + "step": 19550 + }, + { + "epoch": 6.31375080697224, + "grad_norm": 1.1097182035446167, + "learning_rate": 0.0002, + "loss": 0.3828, + "step": 19560 + }, + { + "epoch": 6.3169786959328595, + "grad_norm": 1.271609902381897, + "learning_rate": 0.0002, + "loss": 0.4482, + "step": 19570 + }, + { + "epoch": 6.32020658489348, + "grad_norm": 1.4262897968292236, + "learning_rate": 0.0002, + "loss": 0.3851, + "step": 19580 + }, + { + "epoch": 6.3234344738541, + "grad_norm": 1.057338833808899, + "learning_rate": 0.0002, + "loss": 0.4133, + "step": 19590 + }, + { + "epoch": 6.326662362814719, + "grad_norm": 1.323028326034546, + "learning_rate": 0.0002, + "loss": 0.4366, + "step": 19600 + }, + { + "epoch": 6.329890251775339, + "grad_norm": 1.0991673469543457, + "learning_rate": 0.0002, + "loss": 0.4186, + "step": 19610 + }, + { + "epoch": 6.333118140735959, + "grad_norm": 1.1600234508514404, + "learning_rate": 0.0002, + "loss": 0.4132, + "step": 19620 + }, + { + "epoch": 6.336346029696578, + "grad_norm": 1.2986212968826294, + "learning_rate": 0.0002, + "loss": 0.4689, + "step": 19630 + }, + { + "epoch": 6.339573918657198, + "grad_norm": 1.2117934226989746, + "learning_rate": 0.0002, + "loss": 0.3914, + "step": 19640 + }, + { + "epoch": 6.342801807617818, + "grad_norm": 0.9747948050498962, + "learning_rate": 0.0002, + "loss": 0.3939, + "step": 19650 + }, + { + "epoch": 6.346029696578437, + "grad_norm": 1.2380492687225342, + "learning_rate": 0.0002, + "loss": 0.4517, + "step": 19660 + }, + { + "epoch": 6.349257585539058, + "grad_norm": 1.2475087642669678, + "learning_rate": 0.0002, + "loss": 0.4344, + "step": 19670 + }, + { + "epoch": 6.3524854744996775, + "grad_norm": 1.022084355354309, + "learning_rate": 0.0002, + "loss": 0.4253, + "step": 19680 + }, + { + "epoch": 6.355713363460297, + "grad_norm": 1.2422059774398804, + "learning_rate": 0.0002, + "loss": 0.4227, + "step": 19690 + }, + { + "epoch": 6.358941252420917, + "grad_norm": 1.5015275478363037, + "learning_rate": 0.0002, + "loss": 0.4205, + "step": 19700 + }, + { + "epoch": 6.362169141381536, + "grad_norm": 1.068727970123291, + "learning_rate": 0.0002, + "loss": 0.414, + "step": 19710 + }, + { + "epoch": 6.365397030342156, + "grad_norm": 1.3718897104263306, + "learning_rate": 0.0002, + "loss": 0.4054, + "step": 19720 + }, + { + "epoch": 6.368624919302776, + "grad_norm": 1.3437764644622803, + "learning_rate": 0.0002, + "loss": 0.4399, + "step": 19730 + }, + { + "epoch": 6.371852808263395, + "grad_norm": 0.9128499031066895, + "learning_rate": 0.0002, + "loss": 0.4187, + "step": 19740 + }, + { + "epoch": 6.375080697224016, + "grad_norm": 1.0678889751434326, + "learning_rate": 0.0002, + "loss": 0.4346, + "step": 19750 + }, + { + "epoch": 6.378308586184636, + "grad_norm": 1.0432878732681274, + "learning_rate": 0.0002, + "loss": 0.4103, + "step": 19760 + }, + { + "epoch": 6.381536475145255, + "grad_norm": 1.4033927917480469, + "learning_rate": 0.0002, + "loss": 0.4304, + "step": 19770 + }, + { + "epoch": 6.384764364105875, + "grad_norm": 1.2773922681808472, + "learning_rate": 0.0002, + "loss": 0.4225, + "step": 19780 + }, + { + "epoch": 6.3879922530664945, + "grad_norm": 1.257847547531128, + "learning_rate": 0.0002, + "loss": 0.4246, + "step": 19790 + }, + { + "epoch": 6.391220142027114, + "grad_norm": 0.8424118757247925, + "learning_rate": 0.0002, + "loss": 0.4261, + "step": 19800 + }, + { + "epoch": 6.394448030987734, + "grad_norm": 1.3387986421585083, + "learning_rate": 0.0002, + "loss": 0.4145, + "step": 19810 + }, + { + "epoch": 6.3976759199483535, + "grad_norm": 1.1277328729629517, + "learning_rate": 0.0002, + "loss": 0.4268, + "step": 19820 + }, + { + "epoch": 6.400903808908973, + "grad_norm": 1.264283537864685, + "learning_rate": 0.0002, + "loss": 0.4213, + "step": 19830 + }, + { + "epoch": 6.404131697869594, + "grad_norm": 1.1770991086959839, + "learning_rate": 0.0002, + "loss": 0.4506, + "step": 19840 + }, + { + "epoch": 6.407359586830213, + "grad_norm": 0.9695967435836792, + "learning_rate": 0.0002, + "loss": 0.4385, + "step": 19850 + }, + { + "epoch": 6.410587475790833, + "grad_norm": 1.3394994735717773, + "learning_rate": 0.0002, + "loss": 0.4258, + "step": 19860 + }, + { + "epoch": 6.413815364751453, + "grad_norm": 1.0515536069869995, + "learning_rate": 0.0002, + "loss": 0.4017, + "step": 19870 + }, + { + "epoch": 6.417043253712072, + "grad_norm": 1.3238868713378906, + "learning_rate": 0.0002, + "loss": 0.4555, + "step": 19880 + }, + { + "epoch": 6.420271142672692, + "grad_norm": 1.0801814794540405, + "learning_rate": 0.0002, + "loss": 0.4385, + "step": 19890 + }, + { + "epoch": 6.423499031633312, + "grad_norm": 1.1391135454177856, + "learning_rate": 0.0002, + "loss": 0.4135, + "step": 19900 + }, + { + "epoch": 6.426726920593931, + "grad_norm": 1.13046133518219, + "learning_rate": 0.0002, + "loss": 0.4376, + "step": 19910 + }, + { + "epoch": 6.429954809554552, + "grad_norm": 1.1657520532608032, + "learning_rate": 0.0002, + "loss": 0.4251, + "step": 19920 + }, + { + "epoch": 6.433182698515171, + "grad_norm": 1.3315341472625732, + "learning_rate": 0.0002, + "loss": 0.3951, + "step": 19930 + }, + { + "epoch": 6.436410587475791, + "grad_norm": 1.1806831359863281, + "learning_rate": 0.0002, + "loss": 0.4254, + "step": 19940 + }, + { + "epoch": 6.439638476436411, + "grad_norm": 1.1581867933273315, + "learning_rate": 0.0002, + "loss": 0.3988, + "step": 19950 + }, + { + "epoch": 6.44286636539703, + "grad_norm": 1.2601206302642822, + "learning_rate": 0.0002, + "loss": 0.4194, + "step": 19960 + }, + { + "epoch": 6.44609425435765, + "grad_norm": 1.1163229942321777, + "learning_rate": 0.0002, + "loss": 0.4505, + "step": 19970 + }, + { + "epoch": 6.44932214331827, + "grad_norm": 0.9959462285041809, + "learning_rate": 0.0002, + "loss": 0.4295, + "step": 19980 + }, + { + "epoch": 6.452550032278889, + "grad_norm": 1.1213586330413818, + "learning_rate": 0.0002, + "loss": 0.421, + "step": 19990 + }, + { + "epoch": 6.455777921239509, + "grad_norm": 1.1345361471176147, + "learning_rate": 0.0002, + "loss": 0.4354, + "step": 20000 + }, + { + "epoch": 6.459005810200129, + "grad_norm": 1.245871901512146, + "learning_rate": 0.0002, + "loss": 0.429, + "step": 20010 + }, + { + "epoch": 6.462233699160749, + "grad_norm": 1.0894919633865356, + "learning_rate": 0.0002, + "loss": 0.4395, + "step": 20020 + }, + { + "epoch": 6.465461588121369, + "grad_norm": 1.030206322669983, + "learning_rate": 0.0002, + "loss": 0.4365, + "step": 20030 + }, + { + "epoch": 6.4686894770819885, + "grad_norm": 1.262133002281189, + "learning_rate": 0.0002, + "loss": 0.4225, + "step": 20040 + }, + { + "epoch": 6.471917366042608, + "grad_norm": 1.167641043663025, + "learning_rate": 0.0002, + "loss": 0.4301, + "step": 20050 + }, + { + "epoch": 6.475145255003228, + "grad_norm": 1.1125705242156982, + "learning_rate": 0.0002, + "loss": 0.4438, + "step": 20060 + }, + { + "epoch": 6.4783731439638474, + "grad_norm": 1.3777440786361694, + "learning_rate": 0.0002, + "loss": 0.4205, + "step": 20070 + }, + { + "epoch": 6.481601032924467, + "grad_norm": 1.1771081686019897, + "learning_rate": 0.0002, + "loss": 0.424, + "step": 20080 + }, + { + "epoch": 6.484828921885087, + "grad_norm": 1.0414351224899292, + "learning_rate": 0.0002, + "loss": 0.4187, + "step": 20090 + }, + { + "epoch": 6.488056810845707, + "grad_norm": 1.2103244066238403, + "learning_rate": 0.0002, + "loss": 0.4419, + "step": 20100 + }, + { + "epoch": 6.491284699806327, + "grad_norm": 1.4153836965560913, + "learning_rate": 0.0002, + "loss": 0.4502, + "step": 20110 + }, + { + "epoch": 6.494512588766947, + "grad_norm": 1.2718676328659058, + "learning_rate": 0.0002, + "loss": 0.4524, + "step": 20120 + }, + { + "epoch": 6.497740477727566, + "grad_norm": 1.1040351390838623, + "learning_rate": 0.0002, + "loss": 0.4546, + "step": 20130 + }, + { + "epoch": 6.500968366688186, + "grad_norm": 0.9804210662841797, + "learning_rate": 0.0002, + "loss": 0.4105, + "step": 20140 + }, + { + "epoch": 6.5041962556488055, + "grad_norm": 1.028836965560913, + "learning_rate": 0.0002, + "loss": 0.4165, + "step": 20150 + }, + { + "epoch": 6.507424144609425, + "grad_norm": 1.1773076057434082, + "learning_rate": 0.0002, + "loss": 0.4106, + "step": 20160 + }, + { + "epoch": 6.510652033570045, + "grad_norm": 0.8597512245178223, + "learning_rate": 0.0002, + "loss": 0.4364, + "step": 20170 + }, + { + "epoch": 6.5138799225306645, + "grad_norm": 1.4290635585784912, + "learning_rate": 0.0002, + "loss": 0.4346, + "step": 20180 + }, + { + "epoch": 6.517107811491285, + "grad_norm": 0.9842908382415771, + "learning_rate": 0.0002, + "loss": 0.4057, + "step": 20190 + }, + { + "epoch": 6.520335700451905, + "grad_norm": 1.0254372358322144, + "learning_rate": 0.0002, + "loss": 0.4562, + "step": 20200 + }, + { + "epoch": 6.523563589412524, + "grad_norm": 1.1869125366210938, + "learning_rate": 0.0002, + "loss": 0.433, + "step": 20210 + }, + { + "epoch": 6.526791478373144, + "grad_norm": 1.0994106531143188, + "learning_rate": 0.0002, + "loss": 0.4247, + "step": 20220 + }, + { + "epoch": 6.530019367333764, + "grad_norm": 1.03111732006073, + "learning_rate": 0.0002, + "loss": 0.416, + "step": 20230 + }, + { + "epoch": 6.533247256294383, + "grad_norm": 1.5421077013015747, + "learning_rate": 0.0002, + "loss": 0.4202, + "step": 20240 + }, + { + "epoch": 6.536475145255003, + "grad_norm": 1.4383527040481567, + "learning_rate": 0.0002, + "loss": 0.4309, + "step": 20250 + }, + { + "epoch": 6.539703034215623, + "grad_norm": 1.0252864360809326, + "learning_rate": 0.0002, + "loss": 0.4086, + "step": 20260 + }, + { + "epoch": 6.542930923176243, + "grad_norm": 1.2504689693450928, + "learning_rate": 0.0002, + "loss": 0.4391, + "step": 20270 + }, + { + "epoch": 6.546158812136863, + "grad_norm": 1.2130976915359497, + "learning_rate": 0.0002, + "loss": 0.4294, + "step": 20280 + }, + { + "epoch": 6.549386701097482, + "grad_norm": 1.1186957359313965, + "learning_rate": 0.0002, + "loss": 0.4432, + "step": 20290 + }, + { + "epoch": 6.552614590058102, + "grad_norm": 1.0373939275741577, + "learning_rate": 0.0002, + "loss": 0.4225, + "step": 20300 + }, + { + "epoch": 6.555842479018722, + "grad_norm": 0.9950923323631287, + "learning_rate": 0.0002, + "loss": 0.3874, + "step": 20310 + }, + { + "epoch": 6.559070367979341, + "grad_norm": 1.1479439735412598, + "learning_rate": 0.0002, + "loss": 0.4257, + "step": 20320 + }, + { + "epoch": 6.562298256939961, + "grad_norm": 1.2426027059555054, + "learning_rate": 0.0002, + "loss": 0.4418, + "step": 20330 + }, + { + "epoch": 6.565526145900581, + "grad_norm": 1.3021808862686157, + "learning_rate": 0.0002, + "loss": 0.4274, + "step": 20340 + }, + { + "epoch": 6.5687540348612, + "grad_norm": 1.203259825706482, + "learning_rate": 0.0002, + "loss": 0.4423, + "step": 20350 + }, + { + "epoch": 6.571981923821821, + "grad_norm": 2.1131186485290527, + "learning_rate": 0.0002, + "loss": 0.4568, + "step": 20360 + }, + { + "epoch": 6.5752098127824405, + "grad_norm": 1.1588627099990845, + "learning_rate": 0.0002, + "loss": 0.4272, + "step": 20370 + }, + { + "epoch": 6.57843770174306, + "grad_norm": 1.0151054859161377, + "learning_rate": 0.0002, + "loss": 0.4727, + "step": 20380 + }, + { + "epoch": 6.58166559070368, + "grad_norm": 1.323155403137207, + "learning_rate": 0.0002, + "loss": 0.4592, + "step": 20390 + }, + { + "epoch": 6.5848934796642995, + "grad_norm": 1.0907572507858276, + "learning_rate": 0.0002, + "loss": 0.4075, + "step": 20400 + }, + { + "epoch": 6.588121368624919, + "grad_norm": 1.2375017404556274, + "learning_rate": 0.0002, + "loss": 0.4127, + "step": 20410 + }, + { + "epoch": 6.591349257585539, + "grad_norm": 1.0491245985031128, + "learning_rate": 0.0002, + "loss": 0.4483, + "step": 20420 + }, + { + "epoch": 6.5945771465461585, + "grad_norm": 1.50575852394104, + "learning_rate": 0.0002, + "loss": 0.4476, + "step": 20430 + }, + { + "epoch": 6.597805035506779, + "grad_norm": 0.9893020987510681, + "learning_rate": 0.0002, + "loss": 0.4235, + "step": 20440 + }, + { + "epoch": 6.601032924467399, + "grad_norm": 1.258591651916504, + "learning_rate": 0.0002, + "loss": 0.4384, + "step": 20450 + }, + { + "epoch": 6.604260813428018, + "grad_norm": 1.3949081897735596, + "learning_rate": 0.0002, + "loss": 0.4458, + "step": 20460 + }, + { + "epoch": 6.607488702388638, + "grad_norm": 1.152513861656189, + "learning_rate": 0.0002, + "loss": 0.3885, + "step": 20470 + }, + { + "epoch": 6.610716591349258, + "grad_norm": 1.218362808227539, + "learning_rate": 0.0002, + "loss": 0.4257, + "step": 20480 + }, + { + "epoch": 6.613944480309877, + "grad_norm": 1.3538687229156494, + "learning_rate": 0.0002, + "loss": 0.4448, + "step": 20490 + }, + { + "epoch": 6.617172369270497, + "grad_norm": 1.2896782159805298, + "learning_rate": 0.0002, + "loss": 0.4348, + "step": 20500 + }, + { + "epoch": 6.6204002582311166, + "grad_norm": 1.0762150287628174, + "learning_rate": 0.0002, + "loss": 0.4287, + "step": 20510 + }, + { + "epoch": 6.623628147191736, + "grad_norm": 1.1561447381973267, + "learning_rate": 0.0002, + "loss": 0.4529, + "step": 20520 + }, + { + "epoch": 6.626856036152357, + "grad_norm": 1.0553218126296997, + "learning_rate": 0.0002, + "loss": 0.4017, + "step": 20530 + }, + { + "epoch": 6.630083925112976, + "grad_norm": 1.1378765106201172, + "learning_rate": 0.0002, + "loss": 0.4321, + "step": 20540 + }, + { + "epoch": 6.633311814073596, + "grad_norm": 1.2299952507019043, + "learning_rate": 0.0002, + "loss": 0.4351, + "step": 20550 + }, + { + "epoch": 6.636539703034216, + "grad_norm": 1.4158518314361572, + "learning_rate": 0.0002, + "loss": 0.4406, + "step": 20560 + }, + { + "epoch": 6.639767591994835, + "grad_norm": 1.058830738067627, + "learning_rate": 0.0002, + "loss": 0.4334, + "step": 20570 + }, + { + "epoch": 6.642995480955455, + "grad_norm": 1.1069598197937012, + "learning_rate": 0.0002, + "loss": 0.4248, + "step": 20580 + }, + { + "epoch": 6.646223369916075, + "grad_norm": 1.3859037160873413, + "learning_rate": 0.0002, + "loss": 0.4651, + "step": 20590 + }, + { + "epoch": 6.649451258876694, + "grad_norm": 1.300588607788086, + "learning_rate": 0.0002, + "loss": 0.4324, + "step": 20600 + }, + { + "epoch": 6.652679147837315, + "grad_norm": 1.3861193656921387, + "learning_rate": 0.0002, + "loss": 0.4581, + "step": 20610 + }, + { + "epoch": 6.6559070367979345, + "grad_norm": 1.2356518507003784, + "learning_rate": 0.0002, + "loss": 0.4198, + "step": 20620 + }, + { + "epoch": 6.659134925758554, + "grad_norm": 1.1698070764541626, + "learning_rate": 0.0002, + "loss": 0.4578, + "step": 20630 + }, + { + "epoch": 6.662362814719174, + "grad_norm": 1.270707607269287, + "learning_rate": 0.0002, + "loss": 0.4513, + "step": 20640 + }, + { + "epoch": 6.6655907036797934, + "grad_norm": 0.984618067741394, + "learning_rate": 0.0002, + "loss": 0.4552, + "step": 20650 + }, + { + "epoch": 6.668818592640413, + "grad_norm": 1.2335834503173828, + "learning_rate": 0.0002, + "loss": 0.4648, + "step": 20660 + }, + { + "epoch": 6.672046481601033, + "grad_norm": 0.9497392773628235, + "learning_rate": 0.0002, + "loss": 0.4541, + "step": 20670 + }, + { + "epoch": 6.675274370561652, + "grad_norm": 1.011144757270813, + "learning_rate": 0.0002, + "loss": 0.4176, + "step": 20680 + }, + { + "epoch": 6.678502259522272, + "grad_norm": 1.1605948209762573, + "learning_rate": 0.0002, + "loss": 0.4424, + "step": 20690 + }, + { + "epoch": 6.681730148482892, + "grad_norm": 1.2136812210083008, + "learning_rate": 0.0002, + "loss": 0.4613, + "step": 20700 + }, + { + "epoch": 6.684958037443512, + "grad_norm": 1.0823525190353394, + "learning_rate": 0.0002, + "loss": 0.4287, + "step": 20710 + }, + { + "epoch": 6.688185926404132, + "grad_norm": 1.1929140090942383, + "learning_rate": 0.0002, + "loss": 0.4307, + "step": 20720 + }, + { + "epoch": 6.6914138153647515, + "grad_norm": 1.2468219995498657, + "learning_rate": 0.0002, + "loss": 0.4453, + "step": 20730 + }, + { + "epoch": 6.694641704325371, + "grad_norm": 1.2653573751449585, + "learning_rate": 0.0002, + "loss": 0.4262, + "step": 20740 + }, + { + "epoch": 6.697869593285991, + "grad_norm": 1.2253094911575317, + "learning_rate": 0.0002, + "loss": 0.4716, + "step": 20750 + }, + { + "epoch": 6.7010974822466105, + "grad_norm": 1.103179931640625, + "learning_rate": 0.0002, + "loss": 0.4462, + "step": 20760 + }, + { + "epoch": 6.70432537120723, + "grad_norm": 0.9180657863616943, + "learning_rate": 0.0002, + "loss": 0.4179, + "step": 20770 + }, + { + "epoch": 6.707553260167851, + "grad_norm": 1.1830929517745972, + "learning_rate": 0.0002, + "loss": 0.4712, + "step": 20780 + }, + { + "epoch": 6.71078114912847, + "grad_norm": 1.1052136421203613, + "learning_rate": 0.0002, + "loss": 0.4304, + "step": 20790 + }, + { + "epoch": 6.71400903808909, + "grad_norm": 1.1268569231033325, + "learning_rate": 0.0002, + "loss": 0.436, + "step": 20800 + }, + { + "epoch": 6.71723692704971, + "grad_norm": 1.0753320455551147, + "learning_rate": 0.0002, + "loss": 0.4109, + "step": 20810 + }, + { + "epoch": 6.720464816010329, + "grad_norm": 1.1100133657455444, + "learning_rate": 0.0002, + "loss": 0.4471, + "step": 20820 + }, + { + "epoch": 6.723692704970949, + "grad_norm": 0.7498472929000854, + "learning_rate": 0.0002, + "loss": 0.447, + "step": 20830 + }, + { + "epoch": 6.726920593931569, + "grad_norm": 1.1006664037704468, + "learning_rate": 0.0002, + "loss": 0.4182, + "step": 20840 + }, + { + "epoch": 6.730148482892188, + "grad_norm": 1.4599690437316895, + "learning_rate": 0.0002, + "loss": 0.4348, + "step": 20850 + }, + { + "epoch": 6.733376371852808, + "grad_norm": 1.324700951576233, + "learning_rate": 0.0002, + "loss": 0.4596, + "step": 20860 + }, + { + "epoch": 6.736604260813428, + "grad_norm": 1.1128668785095215, + "learning_rate": 0.0002, + "loss": 0.4373, + "step": 20870 + }, + { + "epoch": 6.739832149774048, + "grad_norm": 1.0438026189804077, + "learning_rate": 0.0002, + "loss": 0.4267, + "step": 20880 + }, + { + "epoch": 6.743060038734668, + "grad_norm": 1.1934672594070435, + "learning_rate": 0.0002, + "loss": 0.4366, + "step": 20890 + }, + { + "epoch": 6.746287927695287, + "grad_norm": 1.2108192443847656, + "learning_rate": 0.0002, + "loss": 0.4264, + "step": 20900 + }, + { + "epoch": 6.749515816655907, + "grad_norm": 1.1514620780944824, + "learning_rate": 0.0002, + "loss": 0.4327, + "step": 20910 + }, + { + "epoch": 6.752743705616527, + "grad_norm": 1.1723405122756958, + "learning_rate": 0.0002, + "loss": 0.4774, + "step": 20920 + }, + { + "epoch": 6.755971594577146, + "grad_norm": 1.1136211156845093, + "learning_rate": 0.0002, + "loss": 0.4458, + "step": 20930 + }, + { + "epoch": 6.759199483537766, + "grad_norm": 1.297601342201233, + "learning_rate": 0.0002, + "loss": 0.4363, + "step": 20940 + }, + { + "epoch": 6.7624273724983865, + "grad_norm": 1.139397144317627, + "learning_rate": 0.0002, + "loss": 0.4389, + "step": 20950 + }, + { + "epoch": 6.765655261459006, + "grad_norm": 1.2873362302780151, + "learning_rate": 0.0002, + "loss": 0.4344, + "step": 20960 + }, + { + "epoch": 6.768883150419626, + "grad_norm": 1.1499544382095337, + "learning_rate": 0.0002, + "loss": 0.4204, + "step": 20970 + }, + { + "epoch": 6.7721110393802455, + "grad_norm": 1.3687032461166382, + "learning_rate": 0.0002, + "loss": 0.4279, + "step": 20980 + }, + { + "epoch": 6.775338928340865, + "grad_norm": 1.2877939939498901, + "learning_rate": 0.0002, + "loss": 0.4621, + "step": 20990 + }, + { + "epoch": 6.778566817301485, + "grad_norm": 1.232993483543396, + "learning_rate": 0.0002, + "loss": 0.4629, + "step": 21000 + }, + { + "epoch": 6.7817947062621045, + "grad_norm": 1.1765092611312866, + "learning_rate": 0.0002, + "loss": 0.4697, + "step": 21010 + }, + { + "epoch": 6.785022595222724, + "grad_norm": 1.4695899486541748, + "learning_rate": 0.0002, + "loss": 0.431, + "step": 21020 + }, + { + "epoch": 6.788250484183344, + "grad_norm": 1.2325087785720825, + "learning_rate": 0.0002, + "loss": 0.4348, + "step": 21030 + }, + { + "epoch": 6.791478373143963, + "grad_norm": 1.3475068807601929, + "learning_rate": 0.0002, + "loss": 0.4595, + "step": 21040 + }, + { + "epoch": 6.794706262104584, + "grad_norm": 1.5654256343841553, + "learning_rate": 0.0002, + "loss": 0.4555, + "step": 21050 + }, + { + "epoch": 6.797934151065204, + "grad_norm": 1.4210680723190308, + "learning_rate": 0.0002, + "loss": 0.4672, + "step": 21060 + }, + { + "epoch": 6.801162040025823, + "grad_norm": 1.167878270149231, + "learning_rate": 0.0002, + "loss": 0.4491, + "step": 21070 + }, + { + "epoch": 6.804389928986443, + "grad_norm": 1.1643486022949219, + "learning_rate": 0.0002, + "loss": 0.4524, + "step": 21080 + }, + { + "epoch": 6.8076178179470626, + "grad_norm": 1.1976310014724731, + "learning_rate": 0.0002, + "loss": 0.4467, + "step": 21090 + }, + { + "epoch": 6.810845706907682, + "grad_norm": 1.1392749547958374, + "learning_rate": 0.0002, + "loss": 0.4449, + "step": 21100 + }, + { + "epoch": 6.814073595868302, + "grad_norm": 1.2456704378128052, + "learning_rate": 0.0002, + "loss": 0.4567, + "step": 21110 + }, + { + "epoch": 6.8173014848289215, + "grad_norm": 1.0030150413513184, + "learning_rate": 0.0002, + "loss": 0.4271, + "step": 21120 + }, + { + "epoch": 6.820529373789542, + "grad_norm": 1.4715943336486816, + "learning_rate": 0.0002, + "loss": 0.4258, + "step": 21130 + }, + { + "epoch": 6.823757262750162, + "grad_norm": 1.1307374238967896, + "learning_rate": 0.0002, + "loss": 0.4615, + "step": 21140 + }, + { + "epoch": 6.826985151710781, + "grad_norm": 1.37498140335083, + "learning_rate": 0.0002, + "loss": 0.4643, + "step": 21150 + }, + { + "epoch": 6.830213040671401, + "grad_norm": 1.2791364192962646, + "learning_rate": 0.0002, + "loss": 0.4447, + "step": 21160 + }, + { + "epoch": 6.833440929632021, + "grad_norm": 1.0518016815185547, + "learning_rate": 0.0002, + "loss": 0.4778, + "step": 21170 + }, + { + "epoch": 6.83666881859264, + "grad_norm": 1.1237729787826538, + "learning_rate": 0.0002, + "loss": 0.448, + "step": 21180 + }, + { + "epoch": 6.83989670755326, + "grad_norm": 1.0360032320022583, + "learning_rate": 0.0002, + "loss": 0.4299, + "step": 21190 + }, + { + "epoch": 6.84312459651388, + "grad_norm": 0.8733281493186951, + "learning_rate": 0.0002, + "loss": 0.4336, + "step": 21200 + }, + { + "epoch": 6.846352485474499, + "grad_norm": 1.3178322315216064, + "learning_rate": 0.0002, + "loss": 0.4495, + "step": 21210 + }, + { + "epoch": 6.84958037443512, + "grad_norm": 1.0884978771209717, + "learning_rate": 0.0002, + "loss": 0.4548, + "step": 21220 + }, + { + "epoch": 6.8528082633957395, + "grad_norm": 1.213229775428772, + "learning_rate": 0.0002, + "loss": 0.4543, + "step": 21230 + }, + { + "epoch": 6.856036152356359, + "grad_norm": 1.0828464031219482, + "learning_rate": 0.0002, + "loss": 0.4628, + "step": 21240 + }, + { + "epoch": 6.859264041316979, + "grad_norm": 1.2298113107681274, + "learning_rate": 0.0002, + "loss": 0.4353, + "step": 21250 + }, + { + "epoch": 6.862491930277598, + "grad_norm": 1.4773930311203003, + "learning_rate": 0.0002, + "loss": 0.4088, + "step": 21260 + }, + { + "epoch": 6.865719819238218, + "grad_norm": 0.992661714553833, + "learning_rate": 0.0002, + "loss": 0.4529, + "step": 21270 + }, + { + "epoch": 6.868947708198838, + "grad_norm": 1.25167715549469, + "learning_rate": 0.0002, + "loss": 0.474, + "step": 21280 + }, + { + "epoch": 6.872175597159457, + "grad_norm": 1.1554399728775024, + "learning_rate": 0.0002, + "loss": 0.4466, + "step": 21290 + }, + { + "epoch": 6.875403486120078, + "grad_norm": 1.2587701082229614, + "learning_rate": 0.0002, + "loss": 0.4375, + "step": 21300 + }, + { + "epoch": 6.8786313750806976, + "grad_norm": 1.392392635345459, + "learning_rate": 0.0002, + "loss": 0.4507, + "step": 21310 + }, + { + "epoch": 6.881859264041317, + "grad_norm": 1.2159595489501953, + "learning_rate": 0.0002, + "loss": 0.4432, + "step": 21320 + }, + { + "epoch": 6.885087153001937, + "grad_norm": 1.3811182975769043, + "learning_rate": 0.0002, + "loss": 0.4255, + "step": 21330 + }, + { + "epoch": 6.8883150419625565, + "grad_norm": 1.2652684450149536, + "learning_rate": 0.0002, + "loss": 0.4437, + "step": 21340 + }, + { + "epoch": 6.891542930923176, + "grad_norm": 1.1906380653381348, + "learning_rate": 0.0002, + "loss": 0.4797, + "step": 21350 + }, + { + "epoch": 6.894770819883796, + "grad_norm": 1.0525990724563599, + "learning_rate": 0.0002, + "loss": 0.423, + "step": 21360 + }, + { + "epoch": 6.8979987088444155, + "grad_norm": 0.910491406917572, + "learning_rate": 0.0002, + "loss": 0.4414, + "step": 21370 + }, + { + "epoch": 6.901226597805035, + "grad_norm": 1.366865634918213, + "learning_rate": 0.0002, + "loss": 0.4882, + "step": 21380 + }, + { + "epoch": 6.904454486765655, + "grad_norm": 1.1270265579223633, + "learning_rate": 0.0002, + "loss": 0.4648, + "step": 21390 + }, + { + "epoch": 6.907682375726275, + "grad_norm": 1.1745691299438477, + "learning_rate": 0.0002, + "loss": 0.4529, + "step": 21400 + }, + { + "epoch": 6.910910264686895, + "grad_norm": 1.1036182641983032, + "learning_rate": 0.0002, + "loss": 0.4504, + "step": 21410 + }, + { + "epoch": 6.914138153647515, + "grad_norm": 1.0906540155410767, + "learning_rate": 0.0002, + "loss": 0.4612, + "step": 21420 + }, + { + "epoch": 6.917366042608134, + "grad_norm": 1.1176798343658447, + "learning_rate": 0.0002, + "loss": 0.4408, + "step": 21430 + }, + { + "epoch": 6.920593931568754, + "grad_norm": 1.525869607925415, + "learning_rate": 0.0002, + "loss": 0.477, + "step": 21440 + }, + { + "epoch": 6.923821820529374, + "grad_norm": 1.2466827630996704, + "learning_rate": 0.0002, + "loss": 0.4473, + "step": 21450 + }, + { + "epoch": 6.927049709489993, + "grad_norm": 1.0200796127319336, + "learning_rate": 0.0002, + "loss": 0.4256, + "step": 21460 + }, + { + "epoch": 6.930277598450614, + "grad_norm": 1.2133489847183228, + "learning_rate": 0.0002, + "loss": 0.4601, + "step": 21470 + }, + { + "epoch": 6.933505487411233, + "grad_norm": 1.2100290060043335, + "learning_rate": 0.0002, + "loss": 0.44, + "step": 21480 + }, + { + "epoch": 6.936733376371853, + "grad_norm": 1.1833131313323975, + "learning_rate": 0.0002, + "loss": 0.468, + "step": 21490 + }, + { + "epoch": 6.939961265332473, + "grad_norm": 1.2262470722198486, + "learning_rate": 0.0002, + "loss": 0.4529, + "step": 21500 + }, + { + "epoch": 6.943189154293092, + "grad_norm": 1.0496156215667725, + "learning_rate": 0.0002, + "loss": 0.4612, + "step": 21510 + }, + { + "epoch": 6.946417043253712, + "grad_norm": 1.050690770149231, + "learning_rate": 0.0002, + "loss": 0.4417, + "step": 21520 + }, + { + "epoch": 6.949644932214332, + "grad_norm": 1.2035698890686035, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 21530 + }, + { + "epoch": 6.952872821174951, + "grad_norm": 1.408007025718689, + "learning_rate": 0.0002, + "loss": 0.4349, + "step": 21540 + }, + { + "epoch": 6.956100710135571, + "grad_norm": 1.2247556447982788, + "learning_rate": 0.0002, + "loss": 0.4391, + "step": 21550 + }, + { + "epoch": 6.959328599096191, + "grad_norm": 1.1727497577667236, + "learning_rate": 0.0002, + "loss": 0.4526, + "step": 21560 + }, + { + "epoch": 6.962556488056811, + "grad_norm": 1.2948925495147705, + "learning_rate": 0.0002, + "loss": 0.4566, + "step": 21570 + }, + { + "epoch": 6.965784377017431, + "grad_norm": 1.3374950885772705, + "learning_rate": 0.0002, + "loss": 0.4672, + "step": 21580 + }, + { + "epoch": 6.9690122659780505, + "grad_norm": 1.164650559425354, + "learning_rate": 0.0002, + "loss": 0.4515, + "step": 21590 + }, + { + "epoch": 6.97224015493867, + "grad_norm": 1.2682108879089355, + "learning_rate": 0.0002, + "loss": 0.4704, + "step": 21600 + }, + { + "epoch": 6.97546804389929, + "grad_norm": 1.195971131324768, + "learning_rate": 0.0002, + "loss": 0.4557, + "step": 21610 + }, + { + "epoch": 6.978695932859909, + "grad_norm": 1.1988017559051514, + "learning_rate": 0.0002, + "loss": 0.4194, + "step": 21620 + }, + { + "epoch": 6.981923821820529, + "grad_norm": 1.0981930494308472, + "learning_rate": 0.0002, + "loss": 0.4524, + "step": 21630 + }, + { + "epoch": 6.98515171078115, + "grad_norm": 1.307260274887085, + "learning_rate": 0.0002, + "loss": 0.4808, + "step": 21640 + }, + { + "epoch": 6.988379599741769, + "grad_norm": 1.2798160314559937, + "learning_rate": 0.0002, + "loss": 0.4936, + "step": 21650 + }, + { + "epoch": 6.991607488702389, + "grad_norm": 1.0053848028182983, + "learning_rate": 0.0002, + "loss": 0.4615, + "step": 21660 + }, + { + "epoch": 6.994835377663009, + "grad_norm": 1.2257840633392334, + "learning_rate": 0.0002, + "loss": 0.4496, + "step": 21670 + }, + { + "epoch": 6.998063266623628, + "grad_norm": 1.3769378662109375, + "learning_rate": 0.0002, + "loss": 0.4449, + "step": 21680 + }, + { + "epoch": 7.0, + "eval_loss": 1.3414524793624878, + "eval_runtime": 162.0091, + "eval_samples_per_second": 4.524, + "eval_steps_per_second": 0.568, + "step": 21686 + }, + { + "epoch": 7.001291155584248, + "grad_norm": 0.834328830242157, + "learning_rate": 0.0002, + "loss": 0.4148, + "step": 21690 + }, + { + "epoch": 7.0045190445448675, + "grad_norm": 1.0984957218170166, + "learning_rate": 0.0002, + "loss": 0.3444, + "step": 21700 + }, + { + "epoch": 7.007746933505487, + "grad_norm": 1.0821330547332764, + "learning_rate": 0.0002, + "loss": 0.3456, + "step": 21710 + }, + { + "epoch": 7.010974822466107, + "grad_norm": 1.1686056852340698, + "learning_rate": 0.0002, + "loss": 0.3698, + "step": 21720 + }, + { + "epoch": 7.014202711426727, + "grad_norm": 1.0800853967666626, + "learning_rate": 0.0002, + "loss": 0.3425, + "step": 21730 + }, + { + "epoch": 7.017430600387347, + "grad_norm": 1.0158464908599854, + "learning_rate": 0.0002, + "loss": 0.3518, + "step": 21740 + }, + { + "epoch": 7.020658489347967, + "grad_norm": 1.1526305675506592, + "learning_rate": 0.0002, + "loss": 0.3388, + "step": 21750 + }, + { + "epoch": 7.023886378308586, + "grad_norm": 0.9431301951408386, + "learning_rate": 0.0002, + "loss": 0.3549, + "step": 21760 + }, + { + "epoch": 7.027114267269206, + "grad_norm": 1.2625824213027954, + "learning_rate": 0.0002, + "loss": 0.3756, + "step": 21770 + }, + { + "epoch": 7.030342156229826, + "grad_norm": 1.2469223737716675, + "learning_rate": 0.0002, + "loss": 0.3513, + "step": 21780 + }, + { + "epoch": 7.033570045190445, + "grad_norm": 1.0981431007385254, + "learning_rate": 0.0002, + "loss": 0.3756, + "step": 21790 + }, + { + "epoch": 7.036797934151065, + "grad_norm": 1.147852897644043, + "learning_rate": 0.0002, + "loss": 0.3543, + "step": 21800 + }, + { + "epoch": 7.040025823111685, + "grad_norm": 1.368754506111145, + "learning_rate": 0.0002, + "loss": 0.3706, + "step": 21810 + }, + { + "epoch": 7.043253712072305, + "grad_norm": 0.7324210405349731, + "learning_rate": 0.0002, + "loss": 0.3446, + "step": 21820 + }, + { + "epoch": 7.046481601032925, + "grad_norm": 1.264591932296753, + "learning_rate": 0.0002, + "loss": 0.3493, + "step": 21830 + }, + { + "epoch": 7.049709489993544, + "grad_norm": 1.080914855003357, + "learning_rate": 0.0002, + "loss": 0.3368, + "step": 21840 + }, + { + "epoch": 7.052937378954164, + "grad_norm": 0.8814678192138672, + "learning_rate": 0.0002, + "loss": 0.3676, + "step": 21850 + }, + { + "epoch": 7.056165267914784, + "grad_norm": 1.0538815259933472, + "learning_rate": 0.0002, + "loss": 0.3537, + "step": 21860 + }, + { + "epoch": 7.059393156875403, + "grad_norm": 1.0479655265808105, + "learning_rate": 0.0002, + "loss": 0.3436, + "step": 21870 + }, + { + "epoch": 7.062621045836023, + "grad_norm": 1.260636329650879, + "learning_rate": 0.0002, + "loss": 0.3482, + "step": 21880 + }, + { + "epoch": 7.065848934796643, + "grad_norm": 1.0623047351837158, + "learning_rate": 0.0002, + "loss": 0.3442, + "step": 21890 + }, + { + "epoch": 7.069076823757262, + "grad_norm": 1.083094835281372, + "learning_rate": 0.0002, + "loss": 0.3841, + "step": 21900 + }, + { + "epoch": 7.072304712717883, + "grad_norm": 1.1972185373306274, + "learning_rate": 0.0002, + "loss": 0.3517, + "step": 21910 + }, + { + "epoch": 7.0755326016785025, + "grad_norm": 1.217283844947815, + "learning_rate": 0.0002, + "loss": 0.3642, + "step": 21920 + }, + { + "epoch": 7.078760490639122, + "grad_norm": 1.7448943853378296, + "learning_rate": 0.0002, + "loss": 0.3709, + "step": 21930 + }, + { + "epoch": 7.081988379599742, + "grad_norm": 0.7799133062362671, + "learning_rate": 0.0002, + "loss": 0.3705, + "step": 21940 + }, + { + "epoch": 7.0852162685603615, + "grad_norm": 1.0691521167755127, + "learning_rate": 0.0002, + "loss": 0.3658, + "step": 21950 + }, + { + "epoch": 7.088444157520981, + "grad_norm": 1.4790667295455933, + "learning_rate": 0.0002, + "loss": 0.3879, + "step": 21960 + }, + { + "epoch": 7.091672046481601, + "grad_norm": 1.0977898836135864, + "learning_rate": 0.0002, + "loss": 0.3432, + "step": 21970 + }, + { + "epoch": 7.09489993544222, + "grad_norm": 2.204333543777466, + "learning_rate": 0.0002, + "loss": 0.3636, + "step": 21980 + }, + { + "epoch": 7.098127824402841, + "grad_norm": 1.1866867542266846, + "learning_rate": 0.0002, + "loss": 0.3561, + "step": 21990 + }, + { + "epoch": 7.101355713363461, + "grad_norm": 1.2251238822937012, + "learning_rate": 0.0002, + "loss": 0.3678, + "step": 22000 + }, + { + "epoch": 7.10458360232408, + "grad_norm": 1.1271567344665527, + "learning_rate": 0.0002, + "loss": 0.3819, + "step": 22010 + }, + { + "epoch": 7.1078114912847, + "grad_norm": 0.8748073577880859, + "learning_rate": 0.0002, + "loss": 0.3434, + "step": 22020 + }, + { + "epoch": 7.11103938024532, + "grad_norm": 1.1254602670669556, + "learning_rate": 0.0002, + "loss": 0.3628, + "step": 22030 + }, + { + "epoch": 7.114267269205939, + "grad_norm": 1.2542496919631958, + "learning_rate": 0.0002, + "loss": 0.3604, + "step": 22040 + }, + { + "epoch": 7.117495158166559, + "grad_norm": 1.059043526649475, + "learning_rate": 0.0002, + "loss": 0.3761, + "step": 22050 + }, + { + "epoch": 7.1207230471271785, + "grad_norm": 1.054980993270874, + "learning_rate": 0.0002, + "loss": 0.3717, + "step": 22060 + }, + { + "epoch": 7.123950936087798, + "grad_norm": 1.5040231943130493, + "learning_rate": 0.0002, + "loss": 0.3849, + "step": 22070 + }, + { + "epoch": 7.127178825048419, + "grad_norm": 1.089801549911499, + "learning_rate": 0.0002, + "loss": 0.387, + "step": 22080 + }, + { + "epoch": 7.130406714009038, + "grad_norm": 0.8638873100280762, + "learning_rate": 0.0002, + "loss": 0.3474, + "step": 22090 + }, + { + "epoch": 7.133634602969658, + "grad_norm": 1.0746978521347046, + "learning_rate": 0.0002, + "loss": 0.3738, + "step": 22100 + }, + { + "epoch": 7.136862491930278, + "grad_norm": 0.875741720199585, + "learning_rate": 0.0002, + "loss": 0.3605, + "step": 22110 + }, + { + "epoch": 7.140090380890897, + "grad_norm": 1.0179301500320435, + "learning_rate": 0.0002, + "loss": 0.3465, + "step": 22120 + }, + { + "epoch": 7.143318269851517, + "grad_norm": 1.18764066696167, + "learning_rate": 0.0002, + "loss": 0.3779, + "step": 22130 + }, + { + "epoch": 7.146546158812137, + "grad_norm": 1.1021716594696045, + "learning_rate": 0.0002, + "loss": 0.392, + "step": 22140 + }, + { + "epoch": 7.149774047772756, + "grad_norm": 1.3633701801300049, + "learning_rate": 0.0002, + "loss": 0.3763, + "step": 22150 + }, + { + "epoch": 7.153001936733377, + "grad_norm": 1.124321460723877, + "learning_rate": 0.0002, + "loss": 0.3595, + "step": 22160 + }, + { + "epoch": 7.1562298256939965, + "grad_norm": 1.1838600635528564, + "learning_rate": 0.0002, + "loss": 0.3668, + "step": 22170 + }, + { + "epoch": 7.159457714654616, + "grad_norm": 1.1565297842025757, + "learning_rate": 0.0002, + "loss": 0.367, + "step": 22180 + }, + { + "epoch": 7.162685603615236, + "grad_norm": 1.1444414854049683, + "learning_rate": 0.0002, + "loss": 0.3728, + "step": 22190 + }, + { + "epoch": 7.165913492575855, + "grad_norm": 1.4376155138015747, + "learning_rate": 0.0002, + "loss": 0.3679, + "step": 22200 + }, + { + "epoch": 7.169141381536475, + "grad_norm": 1.154999852180481, + "learning_rate": 0.0002, + "loss": 0.4011, + "step": 22210 + }, + { + "epoch": 7.172369270497095, + "grad_norm": 1.2167491912841797, + "learning_rate": 0.0002, + "loss": 0.3719, + "step": 22220 + }, + { + "epoch": 7.175597159457714, + "grad_norm": 1.1870360374450684, + "learning_rate": 0.0002, + "loss": 0.398, + "step": 22230 + }, + { + "epoch": 7.178825048418334, + "grad_norm": 1.269687294960022, + "learning_rate": 0.0002, + "loss": 0.3751, + "step": 22240 + }, + { + "epoch": 7.182052937378955, + "grad_norm": 1.2174620628356934, + "learning_rate": 0.0002, + "loss": 0.3468, + "step": 22250 + }, + { + "epoch": 7.185280826339574, + "grad_norm": 0.8996151685714722, + "learning_rate": 0.0002, + "loss": 0.3833, + "step": 22260 + }, + { + "epoch": 7.188508715300194, + "grad_norm": 1.1364930868148804, + "learning_rate": 0.0002, + "loss": 0.3802, + "step": 22270 + }, + { + "epoch": 7.1917366042608135, + "grad_norm": 1.2437993288040161, + "learning_rate": 0.0002, + "loss": 0.3655, + "step": 22280 + }, + { + "epoch": 7.194964493221433, + "grad_norm": 1.3526612520217896, + "learning_rate": 0.0002, + "loss": 0.3633, + "step": 22290 + }, + { + "epoch": 7.198192382182053, + "grad_norm": 0.9819979071617126, + "learning_rate": 0.0002, + "loss": 0.3644, + "step": 22300 + }, + { + "epoch": 7.2014202711426725, + "grad_norm": 1.3902596235275269, + "learning_rate": 0.0002, + "loss": 0.3532, + "step": 22310 + }, + { + "epoch": 7.204648160103292, + "grad_norm": 1.2565160989761353, + "learning_rate": 0.0002, + "loss": 0.3896, + "step": 22320 + }, + { + "epoch": 7.207876049063912, + "grad_norm": 1.2485729455947876, + "learning_rate": 0.0002, + "loss": 0.3535, + "step": 22330 + }, + { + "epoch": 7.211103938024532, + "grad_norm": 1.1691182851791382, + "learning_rate": 0.0002, + "loss": 0.3615, + "step": 22340 + }, + { + "epoch": 7.214331826985152, + "grad_norm": 1.0192445516586304, + "learning_rate": 0.0002, + "loss": 0.3543, + "step": 22350 + }, + { + "epoch": 7.217559715945772, + "grad_norm": 1.2632675170898438, + "learning_rate": 0.0002, + "loss": 0.3774, + "step": 22360 + }, + { + "epoch": 7.220787604906391, + "grad_norm": 1.4515255689620972, + "learning_rate": 0.0002, + "loss": 0.3668, + "step": 22370 + }, + { + "epoch": 7.224015493867011, + "grad_norm": 1.3013306856155396, + "learning_rate": 0.0002, + "loss": 0.3605, + "step": 22380 + }, + { + "epoch": 7.227243382827631, + "grad_norm": 0.9696382284164429, + "learning_rate": 0.0002, + "loss": 0.4009, + "step": 22390 + }, + { + "epoch": 7.23047127178825, + "grad_norm": 1.2517571449279785, + "learning_rate": 0.0002, + "loss": 0.3669, + "step": 22400 + }, + { + "epoch": 7.23369916074887, + "grad_norm": 1.275736689567566, + "learning_rate": 0.0002, + "loss": 0.385, + "step": 22410 + }, + { + "epoch": 7.23692704970949, + "grad_norm": 1.2981343269348145, + "learning_rate": 0.0002, + "loss": 0.3859, + "step": 22420 + }, + { + "epoch": 7.24015493867011, + "grad_norm": 1.1113612651824951, + "learning_rate": 0.0002, + "loss": 0.3617, + "step": 22430 + }, + { + "epoch": 7.24338282763073, + "grad_norm": 1.1843012571334839, + "learning_rate": 0.0002, + "loss": 0.3681, + "step": 22440 + }, + { + "epoch": 7.246610716591349, + "grad_norm": 1.2983063459396362, + "learning_rate": 0.0002, + "loss": 0.4053, + "step": 22450 + }, + { + "epoch": 7.249838605551969, + "grad_norm": 1.1059116125106812, + "learning_rate": 0.0002, + "loss": 0.3691, + "step": 22460 + }, + { + "epoch": 7.253066494512589, + "grad_norm": 1.4968358278274536, + "learning_rate": 0.0002, + "loss": 0.4125, + "step": 22470 + }, + { + "epoch": 7.256294383473208, + "grad_norm": 1.2906030416488647, + "learning_rate": 0.0002, + "loss": 0.3664, + "step": 22480 + }, + { + "epoch": 7.259522272433828, + "grad_norm": 1.1108627319335938, + "learning_rate": 0.0002, + "loss": 0.3896, + "step": 22490 + }, + { + "epoch": 7.262750161394448, + "grad_norm": 0.9844270348548889, + "learning_rate": 0.0002, + "loss": 0.3779, + "step": 22500 + }, + { + "epoch": 7.265978050355068, + "grad_norm": 1.0623210668563843, + "learning_rate": 0.0002, + "loss": 0.3779, + "step": 22510 + }, + { + "epoch": 7.269205939315688, + "grad_norm": 1.2726962566375732, + "learning_rate": 0.0002, + "loss": 0.3862, + "step": 22520 + }, + { + "epoch": 7.2724338282763075, + "grad_norm": 1.1712630987167358, + "learning_rate": 0.0002, + "loss": 0.3889, + "step": 22530 + }, + { + "epoch": 7.275661717236927, + "grad_norm": 1.0604515075683594, + "learning_rate": 0.0002, + "loss": 0.3744, + "step": 22540 + }, + { + "epoch": 7.278889606197547, + "grad_norm": 1.1781001091003418, + "learning_rate": 0.0002, + "loss": 0.3878, + "step": 22550 + }, + { + "epoch": 7.282117495158166, + "grad_norm": 1.2568641901016235, + "learning_rate": 0.0002, + "loss": 0.3806, + "step": 22560 + }, + { + "epoch": 7.285345384118786, + "grad_norm": 1.2375072240829468, + "learning_rate": 0.0002, + "loss": 0.4032, + "step": 22570 + }, + { + "epoch": 7.288573273079406, + "grad_norm": 1.2701354026794434, + "learning_rate": 0.0002, + "loss": 0.41, + "step": 22580 + }, + { + "epoch": 7.291801162040025, + "grad_norm": 1.2957371473312378, + "learning_rate": 0.0002, + "loss": 0.3716, + "step": 22590 + }, + { + "epoch": 7.295029051000646, + "grad_norm": 1.1555131673812866, + "learning_rate": 0.0002, + "loss": 0.3564, + "step": 22600 + }, + { + "epoch": 7.298256939961266, + "grad_norm": 1.1809004545211792, + "learning_rate": 0.0002, + "loss": 0.3887, + "step": 22610 + }, + { + "epoch": 7.301484828921885, + "grad_norm": 1.156985878944397, + "learning_rate": 0.0002, + "loss": 0.3521, + "step": 22620 + }, + { + "epoch": 7.304712717882505, + "grad_norm": 1.3241633176803589, + "learning_rate": 0.0002, + "loss": 0.3648, + "step": 22630 + }, + { + "epoch": 7.3079406068431245, + "grad_norm": 1.3285194635391235, + "learning_rate": 0.0002, + "loss": 0.4075, + "step": 22640 + }, + { + "epoch": 7.311168495803744, + "grad_norm": 1.0388010740280151, + "learning_rate": 0.0002, + "loss": 0.3802, + "step": 22650 + }, + { + "epoch": 7.314396384764364, + "grad_norm": 1.1035511493682861, + "learning_rate": 0.0002, + "loss": 0.3895, + "step": 22660 + }, + { + "epoch": 7.3176242737249835, + "grad_norm": 1.1168203353881836, + "learning_rate": 0.0002, + "loss": 0.3607, + "step": 22670 + }, + { + "epoch": 7.320852162685604, + "grad_norm": 1.0566749572753906, + "learning_rate": 0.0002, + "loss": 0.3785, + "step": 22680 + }, + { + "epoch": 7.324080051646224, + "grad_norm": 1.0538207292556763, + "learning_rate": 0.0002, + "loss": 0.3833, + "step": 22690 + }, + { + "epoch": 7.327307940606843, + "grad_norm": 1.0754560232162476, + "learning_rate": 0.0002, + "loss": 0.3691, + "step": 22700 + }, + { + "epoch": 7.330535829567463, + "grad_norm": 1.036759614944458, + "learning_rate": 0.0002, + "loss": 0.3503, + "step": 22710 + }, + { + "epoch": 7.333763718528083, + "grad_norm": 1.1662222146987915, + "learning_rate": 0.0002, + "loss": 0.3821, + "step": 22720 + }, + { + "epoch": 7.336991607488702, + "grad_norm": 1.1255900859832764, + "learning_rate": 0.0002, + "loss": 0.376, + "step": 22730 + }, + { + "epoch": 7.340219496449322, + "grad_norm": 1.4802581071853638, + "learning_rate": 0.0002, + "loss": 0.4036, + "step": 22740 + }, + { + "epoch": 7.343447385409942, + "grad_norm": 1.1963917016983032, + "learning_rate": 0.0002, + "loss": 0.3889, + "step": 22750 + }, + { + "epoch": 7.346675274370561, + "grad_norm": 1.0769098997116089, + "learning_rate": 0.0002, + "loss": 0.3732, + "step": 22760 + }, + { + "epoch": 7.349903163331182, + "grad_norm": 1.5818109512329102, + "learning_rate": 0.0002, + "loss": 0.3914, + "step": 22770 + }, + { + "epoch": 7.353131052291801, + "grad_norm": 1.5089726448059082, + "learning_rate": 0.0002, + "loss": 0.3577, + "step": 22780 + }, + { + "epoch": 7.356358941252421, + "grad_norm": 1.0024120807647705, + "learning_rate": 0.0002, + "loss": 0.3788, + "step": 22790 + }, + { + "epoch": 7.359586830213041, + "grad_norm": 1.2956844568252563, + "learning_rate": 0.0002, + "loss": 0.3867, + "step": 22800 + }, + { + "epoch": 7.36281471917366, + "grad_norm": 1.0113978385925293, + "learning_rate": 0.0002, + "loss": 0.3612, + "step": 22810 + }, + { + "epoch": 7.36604260813428, + "grad_norm": 1.4180196523666382, + "learning_rate": 0.0002, + "loss": 0.3548, + "step": 22820 + }, + { + "epoch": 7.3692704970949, + "grad_norm": 0.9611803293228149, + "learning_rate": 0.0002, + "loss": 0.3817, + "step": 22830 + }, + { + "epoch": 7.372498386055519, + "grad_norm": 1.2668812274932861, + "learning_rate": 0.0002, + "loss": 0.3755, + "step": 22840 + }, + { + "epoch": 7.37572627501614, + "grad_norm": 1.2809178829193115, + "learning_rate": 0.0002, + "loss": 0.4001, + "step": 22850 + }, + { + "epoch": 7.3789541639767595, + "grad_norm": 1.4618953466415405, + "learning_rate": 0.0002, + "loss": 0.3859, + "step": 22860 + }, + { + "epoch": 7.382182052937379, + "grad_norm": 1.0964281558990479, + "learning_rate": 0.0002, + "loss": 0.3796, + "step": 22870 + }, + { + "epoch": 7.385409941897999, + "grad_norm": 1.2329200506210327, + "learning_rate": 0.0002, + "loss": 0.369, + "step": 22880 + }, + { + "epoch": 7.3886378308586185, + "grad_norm": 1.0750329494476318, + "learning_rate": 0.0002, + "loss": 0.3762, + "step": 22890 + }, + { + "epoch": 7.391865719819238, + "grad_norm": 0.9547448754310608, + "learning_rate": 0.0002, + "loss": 0.3762, + "step": 22900 + }, + { + "epoch": 7.395093608779858, + "grad_norm": 1.146202802658081, + "learning_rate": 0.0002, + "loss": 0.3741, + "step": 22910 + }, + { + "epoch": 7.398321497740477, + "grad_norm": 1.1540607213974, + "learning_rate": 0.0002, + "loss": 0.4, + "step": 22920 + }, + { + "epoch": 7.401549386701097, + "grad_norm": 1.1683391332626343, + "learning_rate": 0.0002, + "loss": 0.3714, + "step": 22930 + }, + { + "epoch": 7.404777275661718, + "grad_norm": 1.2653683423995972, + "learning_rate": 0.0002, + "loss": 0.3786, + "step": 22940 + }, + { + "epoch": 7.408005164622337, + "grad_norm": 1.1355576515197754, + "learning_rate": 0.0002, + "loss": 0.3835, + "step": 22950 + }, + { + "epoch": 7.411233053582957, + "grad_norm": 1.2306767702102661, + "learning_rate": 0.0002, + "loss": 0.3958, + "step": 22960 + }, + { + "epoch": 7.414460942543577, + "grad_norm": 1.2526071071624756, + "learning_rate": 0.0002, + "loss": 0.3752, + "step": 22970 + }, + { + "epoch": 7.417688831504196, + "grad_norm": 1.3868485689163208, + "learning_rate": 0.0002, + "loss": 0.3931, + "step": 22980 + }, + { + "epoch": 7.420916720464816, + "grad_norm": 1.257453203201294, + "learning_rate": 0.0002, + "loss": 0.3899, + "step": 22990 + }, + { + "epoch": 7.4241446094254355, + "grad_norm": 1.1610639095306396, + "learning_rate": 0.0002, + "loss": 0.3758, + "step": 23000 + }, + { + "epoch": 7.427372498386055, + "grad_norm": 1.3744033575057983, + "learning_rate": 0.0002, + "loss": 0.3639, + "step": 23010 + }, + { + "epoch": 7.430600387346676, + "grad_norm": 1.0811532735824585, + "learning_rate": 0.0002, + "loss": 0.3885, + "step": 23020 + }, + { + "epoch": 7.433828276307295, + "grad_norm": 1.170789122581482, + "learning_rate": 0.0002, + "loss": 0.3914, + "step": 23030 + }, + { + "epoch": 7.437056165267915, + "grad_norm": 1.2688828706741333, + "learning_rate": 0.0002, + "loss": 0.4192, + "step": 23040 + }, + { + "epoch": 7.440284054228535, + "grad_norm": 1.1140133142471313, + "learning_rate": 0.0002, + "loss": 0.3859, + "step": 23050 + }, + { + "epoch": 7.443511943189154, + "grad_norm": 1.525015950202942, + "learning_rate": 0.0002, + "loss": 0.3856, + "step": 23060 + }, + { + "epoch": 7.446739832149774, + "grad_norm": 1.120497226715088, + "learning_rate": 0.0002, + "loss": 0.3775, + "step": 23070 + }, + { + "epoch": 7.449967721110394, + "grad_norm": 1.298614740371704, + "learning_rate": 0.0002, + "loss": 0.3917, + "step": 23080 + }, + { + "epoch": 7.453195610071013, + "grad_norm": 1.096987247467041, + "learning_rate": 0.0002, + "loss": 0.3662, + "step": 23090 + }, + { + "epoch": 7.456423499031633, + "grad_norm": 1.2544305324554443, + "learning_rate": 0.0002, + "loss": 0.3898, + "step": 23100 + }, + { + "epoch": 7.4596513879922535, + "grad_norm": 1.4809341430664062, + "learning_rate": 0.0002, + "loss": 0.4021, + "step": 23110 + }, + { + "epoch": 7.462879276952873, + "grad_norm": 0.9224157333374023, + "learning_rate": 0.0002, + "loss": 0.3775, + "step": 23120 + }, + { + "epoch": 7.466107165913493, + "grad_norm": 1.4894850254058838, + "learning_rate": 0.0002, + "loss": 0.3644, + "step": 23130 + }, + { + "epoch": 7.469335054874112, + "grad_norm": 1.1947047710418701, + "learning_rate": 0.0002, + "loss": 0.3804, + "step": 23140 + }, + { + "epoch": 7.472562943834732, + "grad_norm": 1.5348929166793823, + "learning_rate": 0.0002, + "loss": 0.3843, + "step": 23150 + }, + { + "epoch": 7.475790832795352, + "grad_norm": 1.0486136674880981, + "learning_rate": 0.0002, + "loss": 0.3941, + "step": 23160 + }, + { + "epoch": 7.479018721755971, + "grad_norm": 1.6460468769073486, + "learning_rate": 0.0002, + "loss": 0.3935, + "step": 23170 + }, + { + "epoch": 7.482246610716591, + "grad_norm": 0.9416976571083069, + "learning_rate": 0.0002, + "loss": 0.3755, + "step": 23180 + }, + { + "epoch": 7.485474499677212, + "grad_norm": 1.3972517251968384, + "learning_rate": 0.0002, + "loss": 0.4044, + "step": 23190 + }, + { + "epoch": 7.488702388637831, + "grad_norm": 1.3033207654953003, + "learning_rate": 0.0002, + "loss": 0.3869, + "step": 23200 + }, + { + "epoch": 7.491930277598451, + "grad_norm": 1.1479045152664185, + "learning_rate": 0.0002, + "loss": 0.3896, + "step": 23210 + }, + { + "epoch": 7.4951581665590705, + "grad_norm": 1.108995795249939, + "learning_rate": 0.0002, + "loss": 0.3746, + "step": 23220 + }, + { + "epoch": 7.49838605551969, + "grad_norm": 1.2081542015075684, + "learning_rate": 0.0002, + "loss": 0.3802, + "step": 23230 + }, + { + "epoch": 7.50161394448031, + "grad_norm": 1.227265477180481, + "learning_rate": 0.0002, + "loss": 0.3782, + "step": 23240 + }, + { + "epoch": 7.5048418334409295, + "grad_norm": 1.3606903553009033, + "learning_rate": 0.0002, + "loss": 0.3999, + "step": 23250 + }, + { + "epoch": 7.508069722401549, + "grad_norm": 1.4457145929336548, + "learning_rate": 0.0002, + "loss": 0.3845, + "step": 23260 + }, + { + "epoch": 7.511297611362169, + "grad_norm": 1.071205496788025, + "learning_rate": 0.0002, + "loss": 0.3809, + "step": 23270 + }, + { + "epoch": 7.514525500322788, + "grad_norm": 1.0113176107406616, + "learning_rate": 0.0002, + "loss": 0.3707, + "step": 23280 + }, + { + "epoch": 7.517753389283409, + "grad_norm": 1.2792452573776245, + "learning_rate": 0.0002, + "loss": 0.3815, + "step": 23290 + }, + { + "epoch": 7.520981278244029, + "grad_norm": 1.16257643699646, + "learning_rate": 0.0002, + "loss": 0.3945, + "step": 23300 + }, + { + "epoch": 7.524209167204648, + "grad_norm": 1.4449529647827148, + "learning_rate": 0.0002, + "loss": 0.4063, + "step": 23310 + }, + { + "epoch": 7.527437056165268, + "grad_norm": 1.0467441082000732, + "learning_rate": 0.0002, + "loss": 0.3693, + "step": 23320 + }, + { + "epoch": 7.530664945125888, + "grad_norm": 1.2062382698059082, + "learning_rate": 0.0002, + "loss": 0.3925, + "step": 23330 + }, + { + "epoch": 7.533892834086507, + "grad_norm": 1.3828591108322144, + "learning_rate": 0.0002, + "loss": 0.404, + "step": 23340 + }, + { + "epoch": 7.537120723047127, + "grad_norm": 1.1746373176574707, + "learning_rate": 0.0002, + "loss": 0.3694, + "step": 23350 + }, + { + "epoch": 7.540348612007747, + "grad_norm": 1.1252634525299072, + "learning_rate": 0.0002, + "loss": 0.3803, + "step": 23360 + }, + { + "epoch": 7.543576500968367, + "grad_norm": 1.1146548986434937, + "learning_rate": 0.0002, + "loss": 0.3979, + "step": 23370 + }, + { + "epoch": 7.546804389928987, + "grad_norm": 1.2049988508224487, + "learning_rate": 0.0002, + "loss": 0.4093, + "step": 23380 + }, + { + "epoch": 7.550032278889606, + "grad_norm": 1.211979866027832, + "learning_rate": 0.0002, + "loss": 0.419, + "step": 23390 + }, + { + "epoch": 7.553260167850226, + "grad_norm": 1.1158992052078247, + "learning_rate": 0.0002, + "loss": 0.3793, + "step": 23400 + }, + { + "epoch": 7.556488056810846, + "grad_norm": 1.0987670421600342, + "learning_rate": 0.0002, + "loss": 0.3748, + "step": 23410 + }, + { + "epoch": 7.559715945771465, + "grad_norm": 1.2179386615753174, + "learning_rate": 0.0002, + "loss": 0.3835, + "step": 23420 + }, + { + "epoch": 7.562943834732085, + "grad_norm": 1.2416619062423706, + "learning_rate": 0.0002, + "loss": 0.3934, + "step": 23430 + }, + { + "epoch": 7.566171723692705, + "grad_norm": 0.7858901023864746, + "learning_rate": 0.0002, + "loss": 0.3951, + "step": 23440 + }, + { + "epoch": 7.569399612653324, + "grad_norm": 1.4219504594802856, + "learning_rate": 0.0002, + "loss": 0.3938, + "step": 23450 + }, + { + "epoch": 7.572627501613945, + "grad_norm": 0.9971513152122498, + "learning_rate": 0.0002, + "loss": 0.3811, + "step": 23460 + }, + { + "epoch": 7.5758553905745645, + "grad_norm": 1.2463445663452148, + "learning_rate": 0.0002, + "loss": 0.3846, + "step": 23470 + }, + { + "epoch": 7.579083279535184, + "grad_norm": 0.9103072881698608, + "learning_rate": 0.0002, + "loss": 0.391, + "step": 23480 + }, + { + "epoch": 7.582311168495804, + "grad_norm": 1.296644687652588, + "learning_rate": 0.0002, + "loss": 0.4219, + "step": 23490 + }, + { + "epoch": 7.585539057456423, + "grad_norm": 1.2630009651184082, + "learning_rate": 0.0002, + "loss": 0.4191, + "step": 23500 + }, + { + "epoch": 7.588766946417043, + "grad_norm": 1.1580113172531128, + "learning_rate": 0.0002, + "loss": 0.3822, + "step": 23510 + }, + { + "epoch": 7.591994835377663, + "grad_norm": 1.3033956289291382, + "learning_rate": 0.0002, + "loss": 0.4366, + "step": 23520 + }, + { + "epoch": 7.595222724338282, + "grad_norm": 1.1394670009613037, + "learning_rate": 0.0002, + "loss": 0.3951, + "step": 23530 + }, + { + "epoch": 7.598450613298903, + "grad_norm": 1.1448818445205688, + "learning_rate": 0.0002, + "loss": 0.379, + "step": 23540 + }, + { + "epoch": 7.601678502259523, + "grad_norm": 1.3899340629577637, + "learning_rate": 0.0002, + "loss": 0.3967, + "step": 23550 + }, + { + "epoch": 7.604906391220142, + "grad_norm": 1.2759299278259277, + "learning_rate": 0.0002, + "loss": 0.3844, + "step": 23560 + }, + { + "epoch": 7.608134280180762, + "grad_norm": 1.0882219076156616, + "learning_rate": 0.0002, + "loss": 0.4017, + "step": 23570 + }, + { + "epoch": 7.6113621691413815, + "grad_norm": 1.189413070678711, + "learning_rate": 0.0002, + "loss": 0.3926, + "step": 23580 + }, + { + "epoch": 7.614590058102001, + "grad_norm": 1.1257762908935547, + "learning_rate": 0.0002, + "loss": 0.41, + "step": 23590 + }, + { + "epoch": 7.617817947062621, + "grad_norm": 1.2915645837783813, + "learning_rate": 0.0002, + "loss": 0.4264, + "step": 23600 + }, + { + "epoch": 7.6210458360232405, + "grad_norm": 1.3340779542922974, + "learning_rate": 0.0002, + "loss": 0.401, + "step": 23610 + }, + { + "epoch": 7.62427372498386, + "grad_norm": 1.3149892091751099, + "learning_rate": 0.0002, + "loss": 0.4148, + "step": 23620 + }, + { + "epoch": 7.627501613944481, + "grad_norm": 1.4316612482070923, + "learning_rate": 0.0002, + "loss": 0.3946, + "step": 23630 + }, + { + "epoch": 7.6307295029051, + "grad_norm": 1.024850606918335, + "learning_rate": 0.0002, + "loss": 0.3893, + "step": 23640 + }, + { + "epoch": 7.63395739186572, + "grad_norm": 1.193853735923767, + "learning_rate": 0.0002, + "loss": 0.4275, + "step": 23650 + }, + { + "epoch": 7.63718528082634, + "grad_norm": 1.1436676979064941, + "learning_rate": 0.0002, + "loss": 0.4064, + "step": 23660 + }, + { + "epoch": 7.640413169786959, + "grad_norm": 1.231313705444336, + "learning_rate": 0.0002, + "loss": 0.4051, + "step": 23670 + }, + { + "epoch": 7.643641058747579, + "grad_norm": 1.370025634765625, + "learning_rate": 0.0002, + "loss": 0.4088, + "step": 23680 + }, + { + "epoch": 7.646868947708199, + "grad_norm": 1.4087916612625122, + "learning_rate": 0.0002, + "loss": 0.3881, + "step": 23690 + }, + { + "epoch": 7.650096836668818, + "grad_norm": 1.143715500831604, + "learning_rate": 0.0002, + "loss": 0.3767, + "step": 23700 + }, + { + "epoch": 7.653324725629439, + "grad_norm": 1.0907450914382935, + "learning_rate": 0.0002, + "loss": 0.3976, + "step": 23710 + }, + { + "epoch": 7.656552614590058, + "grad_norm": 1.1993663311004639, + "learning_rate": 0.0002, + "loss": 0.423, + "step": 23720 + }, + { + "epoch": 7.659780503550678, + "grad_norm": 1.5836968421936035, + "learning_rate": 0.0002, + "loss": 0.3833, + "step": 23730 + }, + { + "epoch": 7.663008392511298, + "grad_norm": 1.1070377826690674, + "learning_rate": 0.0002, + "loss": 0.4029, + "step": 23740 + }, + { + "epoch": 7.666236281471917, + "grad_norm": 1.0333292484283447, + "learning_rate": 0.0002, + "loss": 0.3889, + "step": 23750 + }, + { + "epoch": 7.669464170432537, + "grad_norm": 1.293520450592041, + "learning_rate": 0.0002, + "loss": 0.3862, + "step": 23760 + }, + { + "epoch": 7.672692059393157, + "grad_norm": 1.164291262626648, + "learning_rate": 0.0002, + "loss": 0.393, + "step": 23770 + }, + { + "epoch": 7.675919948353776, + "grad_norm": 1.1913787126541138, + "learning_rate": 0.0002, + "loss": 0.4133, + "step": 23780 + }, + { + "epoch": 7.679147837314396, + "grad_norm": 0.9081819653511047, + "learning_rate": 0.0002, + "loss": 0.3839, + "step": 23790 + }, + { + "epoch": 7.6823757262750165, + "grad_norm": 1.2931487560272217, + "learning_rate": 0.0002, + "loss": 0.449, + "step": 23800 + }, + { + "epoch": 7.685603615235636, + "grad_norm": 1.2466086149215698, + "learning_rate": 0.0002, + "loss": 0.3958, + "step": 23810 + }, + { + "epoch": 7.688831504196256, + "grad_norm": 1.2980233430862427, + "learning_rate": 0.0002, + "loss": 0.4183, + "step": 23820 + }, + { + "epoch": 7.6920593931568755, + "grad_norm": 1.357170581817627, + "learning_rate": 0.0002, + "loss": 0.4035, + "step": 23830 + }, + { + "epoch": 7.695287282117495, + "grad_norm": 1.0869120359420776, + "learning_rate": 0.0002, + "loss": 0.385, + "step": 23840 + }, + { + "epoch": 7.698515171078115, + "grad_norm": 0.9358172416687012, + "learning_rate": 0.0002, + "loss": 0.4135, + "step": 23850 + }, + { + "epoch": 7.701743060038734, + "grad_norm": 1.4435080289840698, + "learning_rate": 0.0002, + "loss": 0.403, + "step": 23860 + }, + { + "epoch": 7.704970948999354, + "grad_norm": 1.0344315767288208, + "learning_rate": 0.0002, + "loss": 0.3964, + "step": 23870 + }, + { + "epoch": 7.708198837959975, + "grad_norm": 1.2128890752792358, + "learning_rate": 0.0002, + "loss": 0.4093, + "step": 23880 + }, + { + "epoch": 7.711426726920594, + "grad_norm": 1.239585280418396, + "learning_rate": 0.0002, + "loss": 0.3924, + "step": 23890 + }, + { + "epoch": 7.714654615881214, + "grad_norm": 1.1732957363128662, + "learning_rate": 0.0002, + "loss": 0.3966, + "step": 23900 + }, + { + "epoch": 7.717882504841834, + "grad_norm": 1.2434546947479248, + "learning_rate": 0.0002, + "loss": 0.3917, + "step": 23910 + }, + { + "epoch": 7.721110393802453, + "grad_norm": 1.2031792402267456, + "learning_rate": 0.0002, + "loss": 0.3876, + "step": 23920 + }, + { + "epoch": 7.724338282763073, + "grad_norm": 1.1401077508926392, + "learning_rate": 0.0002, + "loss": 0.3948, + "step": 23930 + }, + { + "epoch": 7.7275661717236925, + "grad_norm": 1.3985689878463745, + "learning_rate": 0.0002, + "loss": 0.4178, + "step": 23940 + }, + { + "epoch": 7.730794060684312, + "grad_norm": 1.3179208040237427, + "learning_rate": 0.0002, + "loss": 0.3933, + "step": 23950 + }, + { + "epoch": 7.734021949644932, + "grad_norm": 1.071332335472107, + "learning_rate": 0.0002, + "loss": 0.4043, + "step": 23960 + }, + { + "epoch": 7.7372498386055515, + "grad_norm": 1.169771671295166, + "learning_rate": 0.0002, + "loss": 0.4217, + "step": 23970 + }, + { + "epoch": 7.740477727566172, + "grad_norm": 1.2893975973129272, + "learning_rate": 0.0002, + "loss": 0.4149, + "step": 23980 + }, + { + "epoch": 7.743705616526792, + "grad_norm": 1.424354076385498, + "learning_rate": 0.0002, + "loss": 0.4136, + "step": 23990 + }, + { + "epoch": 7.746933505487411, + "grad_norm": 1.3814094066619873, + "learning_rate": 0.0002, + "loss": 0.403, + "step": 24000 + }, + { + "epoch": 7.750161394448031, + "grad_norm": 1.04098641872406, + "learning_rate": 0.0002, + "loss": 0.3875, + "step": 24010 + }, + { + "epoch": 7.753389283408651, + "grad_norm": 1.2493431568145752, + "learning_rate": 0.0002, + "loss": 0.3875, + "step": 24020 + }, + { + "epoch": 7.75661717236927, + "grad_norm": 1.20700204372406, + "learning_rate": 0.0002, + "loss": 0.3948, + "step": 24030 + }, + { + "epoch": 7.75984506132989, + "grad_norm": 1.0956356525421143, + "learning_rate": 0.0002, + "loss": 0.3946, + "step": 24040 + }, + { + "epoch": 7.7630729502905105, + "grad_norm": 1.0404914617538452, + "learning_rate": 0.0002, + "loss": 0.4026, + "step": 24050 + }, + { + "epoch": 7.76630083925113, + "grad_norm": 1.1474649906158447, + "learning_rate": 0.0002, + "loss": 0.4263, + "step": 24060 + }, + { + "epoch": 7.76952872821175, + "grad_norm": 1.5770092010498047, + "learning_rate": 0.0002, + "loss": 0.408, + "step": 24070 + }, + { + "epoch": 7.772756617172369, + "grad_norm": 1.1962103843688965, + "learning_rate": 0.0002, + "loss": 0.406, + "step": 24080 + }, + { + "epoch": 7.775984506132989, + "grad_norm": 1.2712551355361938, + "learning_rate": 0.0002, + "loss": 0.4168, + "step": 24090 + }, + { + "epoch": 7.779212395093609, + "grad_norm": 1.0740753412246704, + "learning_rate": 0.0002, + "loss": 0.4081, + "step": 24100 + }, + { + "epoch": 7.782440284054228, + "grad_norm": 1.2754921913146973, + "learning_rate": 0.0002, + "loss": 0.3736, + "step": 24110 + }, + { + "epoch": 7.785668173014848, + "grad_norm": 1.2397977113723755, + "learning_rate": 0.0002, + "loss": 0.4068, + "step": 24120 + }, + { + "epoch": 7.788896061975468, + "grad_norm": 1.6444467306137085, + "learning_rate": 0.0002, + "loss": 0.4099, + "step": 24130 + }, + { + "epoch": 7.792123950936087, + "grad_norm": 1.1543670892715454, + "learning_rate": 0.0002, + "loss": 0.4206, + "step": 24140 + }, + { + "epoch": 7.795351839896708, + "grad_norm": 1.284700870513916, + "learning_rate": 0.0002, + "loss": 0.4201, + "step": 24150 + }, + { + "epoch": 7.7985797288573275, + "grad_norm": 1.3647849559783936, + "learning_rate": 0.0002, + "loss": 0.4014, + "step": 24160 + }, + { + "epoch": 7.801807617817947, + "grad_norm": 1.3251831531524658, + "learning_rate": 0.0002, + "loss": 0.3868, + "step": 24170 + }, + { + "epoch": 7.805035506778567, + "grad_norm": 0.9937632083892822, + "learning_rate": 0.0002, + "loss": 0.3926, + "step": 24180 + }, + { + "epoch": 7.8082633957391865, + "grad_norm": 1.2740001678466797, + "learning_rate": 0.0002, + "loss": 0.4154, + "step": 24190 + }, + { + "epoch": 7.811491284699806, + "grad_norm": 1.2092649936676025, + "learning_rate": 0.0002, + "loss": 0.4144, + "step": 24200 + }, + { + "epoch": 7.814719173660426, + "grad_norm": 1.363057255744934, + "learning_rate": 0.0002, + "loss": 0.4359, + "step": 24210 + }, + { + "epoch": 7.817947062621046, + "grad_norm": 1.2452268600463867, + "learning_rate": 0.0002, + "loss": 0.4006, + "step": 24220 + }, + { + "epoch": 7.821174951581666, + "grad_norm": 1.2593066692352295, + "learning_rate": 0.0002, + "loss": 0.4297, + "step": 24230 + }, + { + "epoch": 7.824402840542286, + "grad_norm": 1.3587749004364014, + "learning_rate": 0.0002, + "loss": 0.4023, + "step": 24240 + }, + { + "epoch": 7.827630729502905, + "grad_norm": 1.2257705926895142, + "learning_rate": 0.0002, + "loss": 0.4152, + "step": 24250 + }, + { + "epoch": 7.830858618463525, + "grad_norm": 1.257444977760315, + "learning_rate": 0.0002, + "loss": 0.3872, + "step": 24260 + }, + { + "epoch": 7.834086507424145, + "grad_norm": 1.3570739030838013, + "learning_rate": 0.0002, + "loss": 0.3883, + "step": 24270 + }, + { + "epoch": 7.837314396384764, + "grad_norm": 1.2873027324676514, + "learning_rate": 0.0002, + "loss": 0.418, + "step": 24280 + }, + { + "epoch": 7.840542285345384, + "grad_norm": 1.078808069229126, + "learning_rate": 0.0002, + "loss": 0.3813, + "step": 24290 + }, + { + "epoch": 7.8437701743060035, + "grad_norm": 1.409043788909912, + "learning_rate": 0.0002, + "loss": 0.4167, + "step": 24300 + }, + { + "epoch": 7.846998063266623, + "grad_norm": 1.113909363746643, + "learning_rate": 0.0002, + "loss": 0.4394, + "step": 24310 + }, + { + "epoch": 7.850225952227244, + "grad_norm": 1.432429313659668, + "learning_rate": 0.0002, + "loss": 0.4063, + "step": 24320 + }, + { + "epoch": 7.853453841187863, + "grad_norm": 1.1753697395324707, + "learning_rate": 0.0002, + "loss": 0.4368, + "step": 24330 + }, + { + "epoch": 7.856681730148483, + "grad_norm": 1.4771350622177124, + "learning_rate": 0.0002, + "loss": 0.4368, + "step": 24340 + }, + { + "epoch": 7.859909619109103, + "grad_norm": 1.0278029441833496, + "learning_rate": 0.0002, + "loss": 0.432, + "step": 24350 + }, + { + "epoch": 7.863137508069722, + "grad_norm": 1.064161777496338, + "learning_rate": 0.0002, + "loss": 0.408, + "step": 24360 + }, + { + "epoch": 7.866365397030342, + "grad_norm": 1.4824532270431519, + "learning_rate": 0.0002, + "loss": 0.4023, + "step": 24370 + }, + { + "epoch": 7.869593285990962, + "grad_norm": 1.3403675556182861, + "learning_rate": 0.0002, + "loss": 0.4283, + "step": 24380 + }, + { + "epoch": 7.872821174951581, + "grad_norm": 1.3019866943359375, + "learning_rate": 0.0002, + "loss": 0.418, + "step": 24390 + }, + { + "epoch": 7.876049063912202, + "grad_norm": 1.3158677816390991, + "learning_rate": 0.0002, + "loss": 0.4295, + "step": 24400 + }, + { + "epoch": 7.8792769528728215, + "grad_norm": 1.3224833011627197, + "learning_rate": 0.0002, + "loss": 0.4371, + "step": 24410 + }, + { + "epoch": 7.882504841833441, + "grad_norm": 1.158711314201355, + "learning_rate": 0.0002, + "loss": 0.4193, + "step": 24420 + }, + { + "epoch": 7.885732730794061, + "grad_norm": 1.5012301206588745, + "learning_rate": 0.0002, + "loss": 0.3888, + "step": 24430 + }, + { + "epoch": 7.88896061975468, + "grad_norm": 1.0743858814239502, + "learning_rate": 0.0002, + "loss": 0.3872, + "step": 24440 + }, + { + "epoch": 7.8921885087153, + "grad_norm": 1.1748833656311035, + "learning_rate": 0.0002, + "loss": 0.3838, + "step": 24450 + }, + { + "epoch": 7.89541639767592, + "grad_norm": 1.2368545532226562, + "learning_rate": 0.0002, + "loss": 0.4151, + "step": 24460 + }, + { + "epoch": 7.898644286636539, + "grad_norm": 1.339815378189087, + "learning_rate": 0.0002, + "loss": 0.4292, + "step": 24470 + }, + { + "epoch": 7.901872175597159, + "grad_norm": 1.106711983680725, + "learning_rate": 0.0002, + "loss": 0.3871, + "step": 24480 + }, + { + "epoch": 7.90510006455778, + "grad_norm": 1.082188367843628, + "learning_rate": 0.0002, + "loss": 0.4038, + "step": 24490 + }, + { + "epoch": 7.908327953518399, + "grad_norm": 1.2585617303848267, + "learning_rate": 0.0002, + "loss": 0.4296, + "step": 24500 + }, + { + "epoch": 7.911555842479019, + "grad_norm": 1.2435230016708374, + "learning_rate": 0.0002, + "loss": 0.4063, + "step": 24510 + }, + { + "epoch": 7.9147837314396385, + "grad_norm": 1.6732012033462524, + "learning_rate": 0.0002, + "loss": 0.4008, + "step": 24520 + }, + { + "epoch": 7.918011620400258, + "grad_norm": 1.1985243558883667, + "learning_rate": 0.0002, + "loss": 0.392, + "step": 24530 + }, + { + "epoch": 7.921239509360878, + "grad_norm": 1.255313515663147, + "learning_rate": 0.0002, + "loss": 0.3927, + "step": 24540 + }, + { + "epoch": 7.9244673983214975, + "grad_norm": 1.2786425352096558, + "learning_rate": 0.0002, + "loss": 0.4229, + "step": 24550 + }, + { + "epoch": 7.927695287282117, + "grad_norm": 1.1514666080474854, + "learning_rate": 0.0002, + "loss": 0.4087, + "step": 24560 + }, + { + "epoch": 7.930923176242738, + "grad_norm": 1.3536173105239868, + "learning_rate": 0.0002, + "loss": 0.4315, + "step": 24570 + }, + { + "epoch": 7.934151065203357, + "grad_norm": 1.3156218528747559, + "learning_rate": 0.0002, + "loss": 0.4172, + "step": 24580 + }, + { + "epoch": 7.937378954163977, + "grad_norm": 1.465572476387024, + "learning_rate": 0.0002, + "loss": 0.4088, + "step": 24590 + }, + { + "epoch": 7.940606843124597, + "grad_norm": 1.0745478868484497, + "learning_rate": 0.0002, + "loss": 0.4161, + "step": 24600 + }, + { + "epoch": 7.943834732085216, + "grad_norm": 1.2898974418640137, + "learning_rate": 0.0002, + "loss": 0.4084, + "step": 24610 + }, + { + "epoch": 7.947062621045836, + "grad_norm": 0.9425821900367737, + "learning_rate": 0.0002, + "loss": 0.4066, + "step": 24620 + }, + { + "epoch": 7.950290510006456, + "grad_norm": 1.238996148109436, + "learning_rate": 0.0002, + "loss": 0.4281, + "step": 24630 + }, + { + "epoch": 7.953518398967075, + "grad_norm": 1.5326380729675293, + "learning_rate": 0.0002, + "loss": 0.4093, + "step": 24640 + }, + { + "epoch": 7.956746287927695, + "grad_norm": 0.8708599209785461, + "learning_rate": 0.0002, + "loss": 0.3992, + "step": 24650 + }, + { + "epoch": 7.9599741768883145, + "grad_norm": 1.45661461353302, + "learning_rate": 0.0002, + "loss": 0.4215, + "step": 24660 + }, + { + "epoch": 7.963202065848935, + "grad_norm": 1.204917073249817, + "learning_rate": 0.0002, + "loss": 0.404, + "step": 24670 + }, + { + "epoch": 7.966429954809555, + "grad_norm": 1.2509328126907349, + "learning_rate": 0.0002, + "loss": 0.4095, + "step": 24680 + }, + { + "epoch": 7.969657843770174, + "grad_norm": 1.3137809038162231, + "learning_rate": 0.0002, + "loss": 0.4102, + "step": 24690 + }, + { + "epoch": 7.972885732730794, + "grad_norm": 1.0418064594268799, + "learning_rate": 0.0002, + "loss": 0.416, + "step": 24700 + }, + { + "epoch": 7.976113621691414, + "grad_norm": 1.4729000329971313, + "learning_rate": 0.0002, + "loss": 0.423, + "step": 24710 + }, + { + "epoch": 7.979341510652033, + "grad_norm": 1.1795575618743896, + "learning_rate": 0.0002, + "loss": 0.4104, + "step": 24720 + }, + { + "epoch": 7.982569399612653, + "grad_norm": 1.7517948150634766, + "learning_rate": 0.0002, + "loss": 0.39, + "step": 24730 + }, + { + "epoch": 7.9857972885732735, + "grad_norm": 1.0974000692367554, + "learning_rate": 0.0002, + "loss": 0.4214, + "step": 24740 + }, + { + "epoch": 7.989025177533893, + "grad_norm": 1.1564710140228271, + "learning_rate": 0.0002, + "loss": 0.4426, + "step": 24750 + }, + { + "epoch": 7.992253066494513, + "grad_norm": 1.1639856100082397, + "learning_rate": 0.0002, + "loss": 0.4022, + "step": 24760 + }, + { + "epoch": 7.9954809554551325, + "grad_norm": 1.2776424884796143, + "learning_rate": 0.0002, + "loss": 0.4392, + "step": 24770 + }, + { + "epoch": 7.998708844415752, + "grad_norm": 1.084326148033142, + "learning_rate": 0.0002, + "loss": 0.4118, + "step": 24780 + }, + { + "epoch": 8.0, + "eval_loss": 1.3813514709472656, + "eval_runtime": 157.787, + "eval_samples_per_second": 4.646, + "eval_steps_per_second": 0.583, + "step": 24784 + } + ], + "logging_steps": 10, + "max_steps": 24784, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1469480263763886e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f46f2b8e8752b125339f36f172c3878be4cdb152 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-24784/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfc2a69e44a51edf5586ebed4b7ee915a23244c18c1f59e580471e4c9becfa98 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..805cd021178a92a5f6aee3f744a941cf0ca13916 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a4b931f6963f7a4f6edb095ef574385c121f99f4cf59b5fdda7531d710a5271 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..15da2c8f3081cd71188b80eb769a2f62b77b2f93 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a8783af8b68c6b4a7406ee233c56898a25a0b43139364ff0cbefd351c203a30 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b9c829624a53d285684c3774aa3bdd51360503c2 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89888a3e754e347bfa5944108e2cf82ce6a01224e7a1045c364578543947b3f7 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e95c9853effa917037fc0a528d2647fca4095734 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a3080889edee322371e6151bd75add736a2028b456db2f00f0a4a1dda56d1be +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cc5e74e41ff9a516a626f5cecb9b8f641012822b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/trainer_state.json @@ -0,0 +1,2204 @@ +{ + "best_metric": 1.0958120822906494, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098", + "epoch": 1.0, + "eval_steps": 10, + "global_step": 3098, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032278889606197547, + "grad_norm": 0.7092075347900391, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 10 + }, + { + "epoch": 0.006455777921239509, + "grad_norm": 0.6900479793548584, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 20 + }, + { + "epoch": 0.009683666881859263, + "grad_norm": 0.6788288950920105, + "learning_rate": 0.0002, + "loss": 0.9807, + "step": 30 + }, + { + "epoch": 0.012911555842479019, + "grad_norm": 0.5590243339538574, + "learning_rate": 0.0002, + "loss": 0.9385, + "step": 40 + }, + { + "epoch": 0.016139444803098774, + "grad_norm": 0.5136010646820068, + "learning_rate": 0.0002, + "loss": 0.931, + "step": 50 + }, + { + "epoch": 0.019367333763718526, + "grad_norm": 0.45298320055007935, + "learning_rate": 0.0002, + "loss": 0.8896, + "step": 60 + }, + { + "epoch": 0.022595222724338282, + "grad_norm": 0.5917162299156189, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 70 + }, + { + "epoch": 0.025823111684958037, + "grad_norm": 0.4414856433868408, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 80 + }, + { + "epoch": 0.029051000645577793, + "grad_norm": 0.5547978281974792, + "learning_rate": 0.0002, + "loss": 0.8419, + "step": 90 + }, + { + "epoch": 0.03227888960619755, + "grad_norm": 0.5271288156509399, + "learning_rate": 0.0002, + "loss": 0.8987, + "step": 100 + }, + { + "epoch": 0.035506778566817304, + "grad_norm": 0.5506119728088379, + "learning_rate": 0.0002, + "loss": 0.8543, + "step": 110 + }, + { + "epoch": 0.03873466752743705, + "grad_norm": 0.5579327940940857, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 120 + }, + { + "epoch": 0.04196255648805681, + "grad_norm": 0.5099632740020752, + "learning_rate": 0.0002, + "loss": 0.8826, + "step": 130 + }, + { + "epoch": 0.045190445448676564, + "grad_norm": 0.40396833419799805, + "learning_rate": 0.0002, + "loss": 0.9239, + "step": 140 + }, + { + "epoch": 0.04841833440929632, + "grad_norm": 0.5008092522621155, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 150 + }, + { + "epoch": 0.051646223369916075, + "grad_norm": 0.4388776421546936, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 160 + }, + { + "epoch": 0.05487411233053583, + "grad_norm": 0.44138944149017334, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 170 + }, + { + "epoch": 0.058102001291155586, + "grad_norm": 0.358484148979187, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 180 + }, + { + "epoch": 0.06132989025177534, + "grad_norm": 0.457052081823349, + "learning_rate": 0.0002, + "loss": 0.8956, + "step": 190 + }, + { + "epoch": 0.0645577792123951, + "grad_norm": 0.5537622570991516, + "learning_rate": 0.0002, + "loss": 0.9138, + "step": 200 + }, + { + "epoch": 0.06778566817301485, + "grad_norm": 0.552631676197052, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 210 + }, + { + "epoch": 0.07101355713363461, + "grad_norm": 0.4414575397968292, + "learning_rate": 0.0002, + "loss": 0.8854, + "step": 220 + }, + { + "epoch": 0.07424144609425436, + "grad_norm": 0.4996664226055145, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 230 + }, + { + "epoch": 0.0774693350548741, + "grad_norm": 0.7321897149085999, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 240 + }, + { + "epoch": 0.08069722401549387, + "grad_norm": 0.4553901255130768, + "learning_rate": 0.0002, + "loss": 0.8848, + "step": 250 + }, + { + "epoch": 0.08392511297611362, + "grad_norm": 0.5039054751396179, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 260 + }, + { + "epoch": 0.08715300193673338, + "grad_norm": 0.4113094210624695, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 270 + }, + { + "epoch": 0.09038089089735313, + "grad_norm": 0.450436532497406, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 280 + }, + { + "epoch": 0.09360877985797289, + "grad_norm": 0.4548024535179138, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 290 + }, + { + "epoch": 0.09683666881859264, + "grad_norm": 0.4932962656021118, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 300 + }, + { + "epoch": 0.1000645577792124, + "grad_norm": 0.4005250334739685, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 310 + }, + { + "epoch": 0.10329244673983215, + "grad_norm": 1.8321624994277954, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 320 + }, + { + "epoch": 0.1065203357004519, + "grad_norm": 0.45815610885620117, + "learning_rate": 0.0002, + "loss": 0.8411, + "step": 330 + }, + { + "epoch": 0.10974822466107166, + "grad_norm": 0.39324095845222473, + "learning_rate": 0.0002, + "loss": 0.857, + "step": 340 + }, + { + "epoch": 0.11297611362169141, + "grad_norm": 0.546273946762085, + "learning_rate": 0.0002, + "loss": 0.8258, + "step": 350 + }, + { + "epoch": 0.11620400258231117, + "grad_norm": 0.497448593378067, + "learning_rate": 0.0002, + "loss": 0.882, + "step": 360 + }, + { + "epoch": 0.11943189154293092, + "grad_norm": 0.37508800625801086, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 370 + }, + { + "epoch": 0.12265978050355068, + "grad_norm": 0.45849609375, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 380 + }, + { + "epoch": 0.12588766946417043, + "grad_norm": 0.5488408803939819, + "learning_rate": 0.0002, + "loss": 0.8437, + "step": 390 + }, + { + "epoch": 0.1291155584247902, + "grad_norm": 0.4477061331272125, + "learning_rate": 0.0002, + "loss": 0.8349, + "step": 400 + }, + { + "epoch": 0.13234344738540993, + "grad_norm": 0.39227980375289917, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 410 + }, + { + "epoch": 0.1355713363460297, + "grad_norm": 0.3922233581542969, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 420 + }, + { + "epoch": 0.13879922530664945, + "grad_norm": 0.42901909351348877, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 430 + }, + { + "epoch": 0.14202711426726922, + "grad_norm": 0.4217798709869385, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 440 + }, + { + "epoch": 0.14525500322788895, + "grad_norm": 0.43470677733421326, + "learning_rate": 0.0002, + "loss": 0.8594, + "step": 450 + }, + { + "epoch": 0.1484828921885087, + "grad_norm": 0.5324403047561646, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 460 + }, + { + "epoch": 0.15171078114912848, + "grad_norm": 0.3999756872653961, + "learning_rate": 0.0002, + "loss": 0.8729, + "step": 470 + }, + { + "epoch": 0.1549386701097482, + "grad_norm": 0.404933363199234, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 480 + }, + { + "epoch": 0.15816655907036797, + "grad_norm": 0.44122636318206787, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 490 + }, + { + "epoch": 0.16139444803098774, + "grad_norm": 0.510166347026825, + "learning_rate": 0.0002, + "loss": 0.8457, + "step": 500 + }, + { + "epoch": 0.1646223369916075, + "grad_norm": 0.4549732506275177, + "learning_rate": 0.0002, + "loss": 0.8692, + "step": 510 + }, + { + "epoch": 0.16785022595222723, + "grad_norm": 0.5148182511329651, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 520 + }, + { + "epoch": 0.171078114912847, + "grad_norm": 0.3596806824207306, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 530 + }, + { + "epoch": 0.17430600387346676, + "grad_norm": 0.4388909339904785, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 540 + }, + { + "epoch": 0.17753389283408652, + "grad_norm": 0.5052742958068848, + "learning_rate": 0.0002, + "loss": 0.8322, + "step": 550 + }, + { + "epoch": 0.18076178179470626, + "grad_norm": 0.48248958587646484, + "learning_rate": 0.0002, + "loss": 0.791, + "step": 560 + }, + { + "epoch": 0.18398967075532602, + "grad_norm": 0.5360197424888611, + "learning_rate": 0.0002, + "loss": 0.8593, + "step": 570 + }, + { + "epoch": 0.18721755971594578, + "grad_norm": 0.43999341130256653, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 580 + }, + { + "epoch": 0.19044544867656552, + "grad_norm": 0.3685208261013031, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 590 + }, + { + "epoch": 0.19367333763718528, + "grad_norm": 0.4601275622844696, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 600 + }, + { + "epoch": 0.19690122659780504, + "grad_norm": 0.4778369665145874, + "learning_rate": 0.0002, + "loss": 0.8483, + "step": 610 + }, + { + "epoch": 0.2001291155584248, + "grad_norm": 0.4867003560066223, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 620 + }, + { + "epoch": 0.20335700451904454, + "grad_norm": 0.4583742916584015, + "learning_rate": 0.0002, + "loss": 0.8554, + "step": 630 + }, + { + "epoch": 0.2065848934796643, + "grad_norm": 0.47958165407180786, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 640 + }, + { + "epoch": 0.20981278244028406, + "grad_norm": 0.4526064097881317, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 650 + }, + { + "epoch": 0.2130406714009038, + "grad_norm": 0.45890581607818604, + "learning_rate": 0.0002, + "loss": 0.8313, + "step": 660 + }, + { + "epoch": 0.21626856036152356, + "grad_norm": 0.42725905776023865, + "learning_rate": 0.0002, + "loss": 0.8143, + "step": 670 + }, + { + "epoch": 0.21949644932214332, + "grad_norm": 0.40380963683128357, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 680 + }, + { + "epoch": 0.22272433828276308, + "grad_norm": 0.4372998774051666, + "learning_rate": 0.0002, + "loss": 0.9004, + "step": 690 + }, + { + "epoch": 0.22595222724338282, + "grad_norm": 0.4245864450931549, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 700 + }, + { + "epoch": 0.22918011620400258, + "grad_norm": 0.4061129689216614, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 710 + }, + { + "epoch": 0.23240800516462234, + "grad_norm": 0.474454790353775, + "learning_rate": 0.0002, + "loss": 0.8275, + "step": 720 + }, + { + "epoch": 0.23563589412524208, + "grad_norm": 0.4908486008644104, + "learning_rate": 0.0002, + "loss": 0.8346, + "step": 730 + }, + { + "epoch": 0.23886378308586184, + "grad_norm": 0.4284191429615021, + "learning_rate": 0.0002, + "loss": 0.8755, + "step": 740 + }, + { + "epoch": 0.2420916720464816, + "grad_norm": 0.44730308651924133, + "learning_rate": 0.0002, + "loss": 0.8387, + "step": 750 + }, + { + "epoch": 0.24531956100710137, + "grad_norm": 0.4433246850967407, + "learning_rate": 0.0002, + "loss": 0.8135, + "step": 760 + }, + { + "epoch": 0.2485474499677211, + "grad_norm": 0.43668854236602783, + "learning_rate": 0.0002, + "loss": 0.8644, + "step": 770 + }, + { + "epoch": 0.25177533892834086, + "grad_norm": 0.34324130415916443, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 780 + }, + { + "epoch": 0.2550032278889606, + "grad_norm": 0.46476295590400696, + "learning_rate": 0.0002, + "loss": 0.8725, + "step": 790 + }, + { + "epoch": 0.2582311168495804, + "grad_norm": 0.5047039985656738, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 800 + }, + { + "epoch": 0.26145900581020015, + "grad_norm": 0.4402127265930176, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 810 + }, + { + "epoch": 0.26468689477081986, + "grad_norm": 0.4642465114593506, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 820 + }, + { + "epoch": 0.2679147837314396, + "grad_norm": 0.40093424916267395, + "learning_rate": 0.0002, + "loss": 0.8836, + "step": 830 + }, + { + "epoch": 0.2711426726920594, + "grad_norm": 0.42501842975616455, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 840 + }, + { + "epoch": 0.27437056165267915, + "grad_norm": 0.43279722332954407, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 850 + }, + { + "epoch": 0.2775984506132989, + "grad_norm": 0.5991243720054626, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 860 + }, + { + "epoch": 0.28082633957391867, + "grad_norm": 0.4217848777770996, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 870 + }, + { + "epoch": 0.28405422853453843, + "grad_norm": 0.3933536410331726, + "learning_rate": 0.0002, + "loss": 0.8135, + "step": 880 + }, + { + "epoch": 0.28728211749515814, + "grad_norm": 0.5868505239486694, + "learning_rate": 0.0002, + "loss": 0.8846, + "step": 890 + }, + { + "epoch": 0.2905100064557779, + "grad_norm": 0.5209547877311707, + "learning_rate": 0.0002, + "loss": 0.8759, + "step": 900 + }, + { + "epoch": 0.29373789541639767, + "grad_norm": 0.49307361245155334, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 910 + }, + { + "epoch": 0.2969657843770174, + "grad_norm": 0.4288382828235626, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 920 + }, + { + "epoch": 0.3001936733376372, + "grad_norm": 0.33568474650382996, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 930 + }, + { + "epoch": 0.30342156229825695, + "grad_norm": 1.0915930271148682, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 940 + }, + { + "epoch": 0.3066494512588767, + "grad_norm": 0.5489798188209534, + "learning_rate": 0.0002, + "loss": 0.8535, + "step": 950 + }, + { + "epoch": 0.3098773402194964, + "grad_norm": 0.42971742153167725, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 960 + }, + { + "epoch": 0.3131052291801162, + "grad_norm": 0.43375834822654724, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 970 + }, + { + "epoch": 0.31633311814073595, + "grad_norm": 0.47488611936569214, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 980 + }, + { + "epoch": 0.3195610071013557, + "grad_norm": 0.46296775341033936, + "learning_rate": 0.0002, + "loss": 0.7906, + "step": 990 + }, + { + "epoch": 0.32278889606197547, + "grad_norm": 0.4548890292644501, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 1000 + }, + { + "epoch": 0.32601678502259523, + "grad_norm": 0.41834497451782227, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 1010 + }, + { + "epoch": 0.329244673983215, + "grad_norm": 0.441092312335968, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 1020 + }, + { + "epoch": 0.33247256294383476, + "grad_norm": 0.637322187423706, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 1030 + }, + { + "epoch": 0.33570045190445447, + "grad_norm": 0.4374958574771881, + "learning_rate": 0.0002, + "loss": 0.8685, + "step": 1040 + }, + { + "epoch": 0.33892834086507423, + "grad_norm": 0.3935825824737549, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1050 + }, + { + "epoch": 0.342156229825694, + "grad_norm": 0.43526220321655273, + "learning_rate": 0.0002, + "loss": 0.8287, + "step": 1060 + }, + { + "epoch": 0.34538411878631375, + "grad_norm": 0.45327696204185486, + "learning_rate": 0.0002, + "loss": 0.8413, + "step": 1070 + }, + { + "epoch": 0.3486120077469335, + "grad_norm": 0.4126075506210327, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 1080 + }, + { + "epoch": 0.3518398967075533, + "grad_norm": 0.4714072048664093, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 1090 + }, + { + "epoch": 0.35506778566817304, + "grad_norm": 0.518127977848053, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 1100 + }, + { + "epoch": 0.35829567462879275, + "grad_norm": 0.43264099955558777, + "learning_rate": 0.0002, + "loss": 0.8479, + "step": 1110 + }, + { + "epoch": 0.3615235635894125, + "grad_norm": 0.4857400357723236, + "learning_rate": 0.0002, + "loss": 0.8724, + "step": 1120 + }, + { + "epoch": 0.3647514525500323, + "grad_norm": 0.37591469287872314, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 1130 + }, + { + "epoch": 0.36797934151065204, + "grad_norm": 0.4165478050708771, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 1140 + }, + { + "epoch": 0.3712072304712718, + "grad_norm": 0.42911383509635925, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 1150 + }, + { + "epoch": 0.37443511943189156, + "grad_norm": 0.44980287551879883, + "learning_rate": 0.0002, + "loss": 0.8722, + "step": 1160 + }, + { + "epoch": 0.3776630083925113, + "grad_norm": 0.4066573679447174, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 1170 + }, + { + "epoch": 0.38089089735313103, + "grad_norm": 0.5056195855140686, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 1180 + }, + { + "epoch": 0.3841187863137508, + "grad_norm": 0.4141536355018616, + "learning_rate": 0.0002, + "loss": 0.8387, + "step": 1190 + }, + { + "epoch": 0.38734667527437056, + "grad_norm": 0.4501924514770508, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 1200 + }, + { + "epoch": 0.3905745642349903, + "grad_norm": 0.43304240703582764, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 1210 + }, + { + "epoch": 0.3938024531956101, + "grad_norm": 0.475777804851532, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 1220 + }, + { + "epoch": 0.39703034215622984, + "grad_norm": 0.5846465826034546, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 1230 + }, + { + "epoch": 0.4002582311168496, + "grad_norm": 0.42899325489997864, + "learning_rate": 0.0002, + "loss": 0.8078, + "step": 1240 + }, + { + "epoch": 0.4034861200774693, + "grad_norm": 0.3980463147163391, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 1250 + }, + { + "epoch": 0.4067140090380891, + "grad_norm": 0.45769768953323364, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 1260 + }, + { + "epoch": 0.40994189799870884, + "grad_norm": 0.5101280212402344, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 1270 + }, + { + "epoch": 0.4131697869593286, + "grad_norm": 0.47374317049980164, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 1280 + }, + { + "epoch": 0.41639767591994836, + "grad_norm": 0.4261878728866577, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 1290 + }, + { + "epoch": 0.4196255648805681, + "grad_norm": 0.46954256296157837, + "learning_rate": 0.0002, + "loss": 0.9004, + "step": 1300 + }, + { + "epoch": 0.4228534538411879, + "grad_norm": 0.5205738544464111, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 1310 + }, + { + "epoch": 0.4260813428018076, + "grad_norm": 0.5176340937614441, + "learning_rate": 0.0002, + "loss": 0.8964, + "step": 1320 + }, + { + "epoch": 0.42930923176242736, + "grad_norm": 0.5155916810035706, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 1330 + }, + { + "epoch": 0.4325371207230471, + "grad_norm": 0.44548553228378296, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 1340 + }, + { + "epoch": 0.4357650096836669, + "grad_norm": 0.5633558630943298, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 1350 + }, + { + "epoch": 0.43899289864428664, + "grad_norm": 0.42444056272506714, + "learning_rate": 0.0002, + "loss": 0.7889, + "step": 1360 + }, + { + "epoch": 0.4422207876049064, + "grad_norm": 0.5226860642433167, + "learning_rate": 0.0002, + "loss": 0.8588, + "step": 1370 + }, + { + "epoch": 0.44544867656552617, + "grad_norm": 0.5354582071304321, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 1380 + }, + { + "epoch": 0.4486765655261459, + "grad_norm": 0.472646564245224, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 1390 + }, + { + "epoch": 0.45190445448676564, + "grad_norm": 0.6312310099601746, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 1400 + }, + { + "epoch": 0.4551323434473854, + "grad_norm": 0.4298408031463623, + "learning_rate": 0.0002, + "loss": 0.8212, + "step": 1410 + }, + { + "epoch": 0.45836023240800516, + "grad_norm": 0.43427202105522156, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 1420 + }, + { + "epoch": 0.4615881213686249, + "grad_norm": 0.44097861647605896, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 1430 + }, + { + "epoch": 0.4648160103292447, + "grad_norm": 0.5142693519592285, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1440 + }, + { + "epoch": 0.46804389928986445, + "grad_norm": 0.46416547894477844, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 1450 + }, + { + "epoch": 0.47127178825048416, + "grad_norm": 0.4858551025390625, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 1460 + }, + { + "epoch": 0.4744996772111039, + "grad_norm": 0.4709177315235138, + "learning_rate": 0.0002, + "loss": 0.8354, + "step": 1470 + }, + { + "epoch": 0.4777275661717237, + "grad_norm": 0.5500252842903137, + "learning_rate": 0.0002, + "loss": 0.8391, + "step": 1480 + }, + { + "epoch": 0.48095545513234345, + "grad_norm": 0.43364381790161133, + "learning_rate": 0.0002, + "loss": 0.8359, + "step": 1490 + }, + { + "epoch": 0.4841833440929632, + "grad_norm": 0.47712287306785583, + "learning_rate": 0.0002, + "loss": 0.8446, + "step": 1500 + }, + { + "epoch": 0.48741123305358297, + "grad_norm": 0.4518495202064514, + "learning_rate": 0.0002, + "loss": 0.8518, + "step": 1510 + }, + { + "epoch": 0.49063912201420273, + "grad_norm": 0.4539008140563965, + "learning_rate": 0.0002, + "loss": 0.819, + "step": 1520 + }, + { + "epoch": 0.49386701097482244, + "grad_norm": 0.4993067979812622, + "learning_rate": 0.0002, + "loss": 0.8276, + "step": 1530 + }, + { + "epoch": 0.4970948999354422, + "grad_norm": 0.6094803214073181, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 1540 + }, + { + "epoch": 0.500322788896062, + "grad_norm": 0.48602527379989624, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 1550 + }, + { + "epoch": 0.5035506778566817, + "grad_norm": 0.40245795249938965, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 1560 + }, + { + "epoch": 0.5067785668173015, + "grad_norm": 0.456787645816803, + "learning_rate": 0.0002, + "loss": 0.7907, + "step": 1570 + }, + { + "epoch": 0.5100064557779213, + "grad_norm": 0.43936216831207275, + "learning_rate": 0.0002, + "loss": 0.86, + "step": 1580 + }, + { + "epoch": 0.513234344738541, + "grad_norm": 0.549018144607544, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 1590 + }, + { + "epoch": 0.5164622336991608, + "grad_norm": 0.41746795177459717, + "learning_rate": 0.0002, + "loss": 0.8169, + "step": 1600 + }, + { + "epoch": 0.5196901226597805, + "grad_norm": 0.4217053949832916, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 1610 + }, + { + "epoch": 0.5229180116204003, + "grad_norm": 0.449913889169693, + "learning_rate": 0.0002, + "loss": 0.8161, + "step": 1620 + }, + { + "epoch": 0.5261459005810201, + "grad_norm": 0.5084872245788574, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 1630 + }, + { + "epoch": 0.5293737895416397, + "grad_norm": 0.46248653531074524, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 1640 + }, + { + "epoch": 0.5326016785022595, + "grad_norm": 0.4824236035346985, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 1650 + }, + { + "epoch": 0.5358295674628792, + "grad_norm": 0.6010985374450684, + "learning_rate": 0.0002, + "loss": 0.8711, + "step": 1660 + }, + { + "epoch": 0.539057456423499, + "grad_norm": 0.4757920801639557, + "learning_rate": 0.0002, + "loss": 0.8266, + "step": 1670 + }, + { + "epoch": 0.5422853453841188, + "grad_norm": 0.45161882042884827, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 1680 + }, + { + "epoch": 0.5455132343447385, + "grad_norm": 0.49314990639686584, + "learning_rate": 0.0002, + "loss": 0.8141, + "step": 1690 + }, + { + "epoch": 0.5487411233053583, + "grad_norm": 0.3918305039405823, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 1700 + }, + { + "epoch": 0.551969012265978, + "grad_norm": 0.5966728925704956, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 1710 + }, + { + "epoch": 0.5551969012265978, + "grad_norm": 0.4208986163139343, + "learning_rate": 0.0002, + "loss": 0.8438, + "step": 1720 + }, + { + "epoch": 0.5584247901872176, + "grad_norm": 0.43724218010902405, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 1730 + }, + { + "epoch": 0.5616526791478373, + "grad_norm": 0.5287272930145264, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 1740 + }, + { + "epoch": 0.5648805681084571, + "grad_norm": 0.4961899518966675, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 1750 + }, + { + "epoch": 0.5681084570690769, + "grad_norm": 0.4468635320663452, + "learning_rate": 0.0002, + "loss": 0.8029, + "step": 1760 + }, + { + "epoch": 0.5713363460296966, + "grad_norm": 0.6423530578613281, + "learning_rate": 0.0002, + "loss": 0.7968, + "step": 1770 + }, + { + "epoch": 0.5745642349903163, + "grad_norm": 0.4601971507072449, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 1780 + }, + { + "epoch": 0.577792123950936, + "grad_norm": 0.46514901518821716, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 1790 + }, + { + "epoch": 0.5810200129115558, + "grad_norm": 0.4771687388420105, + "learning_rate": 0.0002, + "loss": 0.8186, + "step": 1800 + }, + { + "epoch": 0.5842479018721756, + "grad_norm": 0.46514490246772766, + "learning_rate": 0.0002, + "loss": 0.856, + "step": 1810 + }, + { + "epoch": 0.5874757908327953, + "grad_norm": 0.5373936295509338, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 1820 + }, + { + "epoch": 0.5907036797934151, + "grad_norm": 0.5175791382789612, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 1830 + }, + { + "epoch": 0.5939315687540349, + "grad_norm": 0.4522802233695984, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 1840 + }, + { + "epoch": 0.5971594577146546, + "grad_norm": 0.42987772822380066, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 1850 + }, + { + "epoch": 0.6003873466752744, + "grad_norm": 0.5566838383674622, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 1860 + }, + { + "epoch": 0.6036152356358941, + "grad_norm": 0.42807698249816895, + "learning_rate": 0.0002, + "loss": 0.8312, + "step": 1870 + }, + { + "epoch": 0.6068431245965139, + "grad_norm": 0.4957767724990845, + "learning_rate": 0.0002, + "loss": 0.8035, + "step": 1880 + }, + { + "epoch": 0.6100710135571337, + "grad_norm": 0.4260980188846588, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 1890 + }, + { + "epoch": 0.6132989025177534, + "grad_norm": 0.4777357876300812, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 1900 + }, + { + "epoch": 0.6165267914783732, + "grad_norm": 0.4434216022491455, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 1910 + }, + { + "epoch": 0.6197546804389928, + "grad_norm": 0.5215433835983276, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 1920 + }, + { + "epoch": 0.6229825693996126, + "grad_norm": 0.5143248438835144, + "learning_rate": 0.0002, + "loss": 0.82, + "step": 1930 + }, + { + "epoch": 0.6262104583602324, + "grad_norm": 0.5213413238525391, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 1940 + }, + { + "epoch": 0.6294383473208521, + "grad_norm": 0.5408226251602173, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 1950 + }, + { + "epoch": 0.6326662362814719, + "grad_norm": 0.5479708909988403, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 1960 + }, + { + "epoch": 0.6358941252420917, + "grad_norm": 0.4490949809551239, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 1970 + }, + { + "epoch": 0.6391220142027114, + "grad_norm": 0.48815059661865234, + "learning_rate": 0.0002, + "loss": 0.854, + "step": 1980 + }, + { + "epoch": 0.6423499031633312, + "grad_norm": 0.46498045325279236, + "learning_rate": 0.0002, + "loss": 0.8568, + "step": 1990 + }, + { + "epoch": 0.6455777921239509, + "grad_norm": 0.5136561393737793, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 2000 + }, + { + "epoch": 0.6488056810845707, + "grad_norm": 0.5145719647407532, + "learning_rate": 0.0002, + "loss": 0.8503, + "step": 2010 + }, + { + "epoch": 0.6520335700451905, + "grad_norm": 0.5430373549461365, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 2020 + }, + { + "epoch": 0.6552614590058102, + "grad_norm": 0.46347954869270325, + "learning_rate": 0.0002, + "loss": 0.8115, + "step": 2030 + }, + { + "epoch": 0.65848934796643, + "grad_norm": 0.5189562439918518, + "learning_rate": 0.0002, + "loss": 0.8769, + "step": 2040 + }, + { + "epoch": 0.6617172369270498, + "grad_norm": 0.43843990564346313, + "learning_rate": 0.0002, + "loss": 0.8453, + "step": 2050 + }, + { + "epoch": 0.6649451258876695, + "grad_norm": 0.4654983580112457, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 2060 + }, + { + "epoch": 0.6681730148482892, + "grad_norm": 0.44835716485977173, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 2070 + }, + { + "epoch": 0.6714009038089089, + "grad_norm": 0.38811734318733215, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2080 + }, + { + "epoch": 0.6746287927695287, + "grad_norm": 0.5709853172302246, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 2090 + }, + { + "epoch": 0.6778566817301485, + "grad_norm": 0.49994757771492004, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 2100 + }, + { + "epoch": 0.6810845706907682, + "grad_norm": 0.5505402684211731, + "learning_rate": 0.0002, + "loss": 0.8, + "step": 2110 + }, + { + "epoch": 0.684312459651388, + "grad_norm": 0.48195120692253113, + "learning_rate": 0.0002, + "loss": 0.8227, + "step": 2120 + }, + { + "epoch": 0.6875403486120077, + "grad_norm": 0.4854775071144104, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 2130 + }, + { + "epoch": 0.6907682375726275, + "grad_norm": 0.6422494649887085, + "learning_rate": 0.0002, + "loss": 0.8231, + "step": 2140 + }, + { + "epoch": 0.6939961265332473, + "grad_norm": 0.3972536027431488, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 2150 + }, + { + "epoch": 0.697224015493867, + "grad_norm": 0.4297836422920227, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 2160 + }, + { + "epoch": 0.7004519044544868, + "grad_norm": 0.45486778020858765, + "learning_rate": 0.0002, + "loss": 0.8017, + "step": 2170 + }, + { + "epoch": 0.7036797934151066, + "grad_norm": 0.4706047773361206, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 2180 + }, + { + "epoch": 0.7069076823757263, + "grad_norm": 0.46426892280578613, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 2190 + }, + { + "epoch": 0.7101355713363461, + "grad_norm": 0.46333715319633484, + "learning_rate": 0.0002, + "loss": 0.8472, + "step": 2200 + }, + { + "epoch": 0.7133634602969657, + "grad_norm": 0.4632524251937866, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 2210 + }, + { + "epoch": 0.7165913492575855, + "grad_norm": 0.4610830843448639, + "learning_rate": 0.0002, + "loss": 0.8452, + "step": 2220 + }, + { + "epoch": 0.7198192382182053, + "grad_norm": 0.4905324876308441, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 2230 + }, + { + "epoch": 0.723047127178825, + "grad_norm": 0.4936263859272003, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 2240 + }, + { + "epoch": 0.7262750161394448, + "grad_norm": 0.40778425335884094, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 2250 + }, + { + "epoch": 0.7295029051000645, + "grad_norm": 0.50351482629776, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 2260 + }, + { + "epoch": 0.7327307940606843, + "grad_norm": 0.4894128143787384, + "learning_rate": 0.0002, + "loss": 0.8475, + "step": 2270 + }, + { + "epoch": 0.7359586830213041, + "grad_norm": 0.5580906271934509, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 2280 + }, + { + "epoch": 0.7391865719819238, + "grad_norm": 0.4655369520187378, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 2290 + }, + { + "epoch": 0.7424144609425436, + "grad_norm": 0.4666965901851654, + "learning_rate": 0.0002, + "loss": 0.8395, + "step": 2300 + }, + { + "epoch": 0.7456423499031634, + "grad_norm": 0.46259936690330505, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 2310 + }, + { + "epoch": 0.7488702388637831, + "grad_norm": 0.520706832408905, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 2320 + }, + { + "epoch": 0.7520981278244029, + "grad_norm": 0.5142408013343811, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 2330 + }, + { + "epoch": 0.7553260167850226, + "grad_norm": 0.5355164408683777, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 2340 + }, + { + "epoch": 0.7585539057456423, + "grad_norm": 0.5517185926437378, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 2350 + }, + { + "epoch": 0.7617817947062621, + "grad_norm": 0.7162677049636841, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 2360 + }, + { + "epoch": 0.7650096836668818, + "grad_norm": 0.42402133345603943, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 2370 + }, + { + "epoch": 0.7682375726275016, + "grad_norm": 0.47180113196372986, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 2380 + }, + { + "epoch": 0.7714654615881213, + "grad_norm": 0.6262288689613342, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 2390 + }, + { + "epoch": 0.7746933505487411, + "grad_norm": 0.5177528262138367, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 2400 + }, + { + "epoch": 0.7779212395093609, + "grad_norm": 0.555721640586853, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 2410 + }, + { + "epoch": 0.7811491284699806, + "grad_norm": 0.5592644810676575, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 2420 + }, + { + "epoch": 0.7843770174306004, + "grad_norm": 0.38025397062301636, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 2430 + }, + { + "epoch": 0.7876049063912202, + "grad_norm": 0.4597472548484802, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 2440 + }, + { + "epoch": 0.7908327953518399, + "grad_norm": 0.4929825961589813, + "learning_rate": 0.0002, + "loss": 0.8575, + "step": 2450 + }, + { + "epoch": 0.7940606843124597, + "grad_norm": 0.45277655124664307, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 2460 + }, + { + "epoch": 0.7972885732730794, + "grad_norm": 0.6224122643470764, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2470 + }, + { + "epoch": 0.8005164622336992, + "grad_norm": 0.5740901827812195, + "learning_rate": 0.0002, + "loss": 0.8449, + "step": 2480 + }, + { + "epoch": 0.8037443511943189, + "grad_norm": 0.41335329413414, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 2490 + }, + { + "epoch": 0.8069722401549386, + "grad_norm": 0.4738694131374359, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 2500 + }, + { + "epoch": 0.8102001291155584, + "grad_norm": 0.5288197994232178, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 2510 + }, + { + "epoch": 0.8134280180761781, + "grad_norm": 0.5404666066169739, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 2520 + }, + { + "epoch": 0.8166559070367979, + "grad_norm": 0.4444909691810608, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 2530 + }, + { + "epoch": 0.8198837959974177, + "grad_norm": 0.542061448097229, + "learning_rate": 0.0002, + "loss": 0.8683, + "step": 2540 + }, + { + "epoch": 0.8231116849580374, + "grad_norm": 0.4914741814136505, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 2550 + }, + { + "epoch": 0.8263395739186572, + "grad_norm": 0.41703441739082336, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 2560 + }, + { + "epoch": 0.829567462879277, + "grad_norm": 0.5489841103553772, + "learning_rate": 0.0002, + "loss": 0.824, + "step": 2570 + }, + { + "epoch": 0.8327953518398967, + "grad_norm": 0.5359883308410645, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 2580 + }, + { + "epoch": 0.8360232408005165, + "grad_norm": 0.5541019439697266, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 2590 + }, + { + "epoch": 0.8392511297611362, + "grad_norm": 0.4746638834476471, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 2600 + }, + { + "epoch": 0.842479018721756, + "grad_norm": 0.5243194103240967, + "learning_rate": 0.0002, + "loss": 0.8116, + "step": 2610 + }, + { + "epoch": 0.8457069076823758, + "grad_norm": 0.46824976801872253, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 2620 + }, + { + "epoch": 0.8489347966429954, + "grad_norm": 0.49487847089767456, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 2630 + }, + { + "epoch": 0.8521626856036152, + "grad_norm": 0.42180097103118896, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 2640 + }, + { + "epoch": 0.855390574564235, + "grad_norm": 0.5516560077667236, + "learning_rate": 0.0002, + "loss": 0.8304, + "step": 2650 + }, + { + "epoch": 0.8586184635248547, + "grad_norm": 0.4392191767692566, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 2660 + }, + { + "epoch": 0.8618463524854745, + "grad_norm": 0.5387210845947266, + "learning_rate": 0.0002, + "loss": 0.848, + "step": 2670 + }, + { + "epoch": 0.8650742414460942, + "grad_norm": 0.6232406497001648, + "learning_rate": 0.0002, + "loss": 0.8094, + "step": 2680 + }, + { + "epoch": 0.868302130406714, + "grad_norm": 0.53749018907547, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 2690 + }, + { + "epoch": 0.8715300193673338, + "grad_norm": 0.47480374574661255, + "learning_rate": 0.0002, + "loss": 0.8299, + "step": 2700 + }, + { + "epoch": 0.8747579083279535, + "grad_norm": 0.44618046283721924, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 2710 + }, + { + "epoch": 0.8779857972885733, + "grad_norm": 0.4173581302165985, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 2720 + }, + { + "epoch": 0.881213686249193, + "grad_norm": 0.524081289768219, + "learning_rate": 0.0002, + "loss": 0.7713, + "step": 2730 + }, + { + "epoch": 0.8844415752098128, + "grad_norm": 0.5608431100845337, + "learning_rate": 0.0002, + "loss": 0.8738, + "step": 2740 + }, + { + "epoch": 0.8876694641704326, + "grad_norm": 0.5212284922599792, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 2750 + }, + { + "epoch": 0.8908973531310523, + "grad_norm": 0.5601475834846497, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 2760 + }, + { + "epoch": 0.8941252420916721, + "grad_norm": 0.4499223828315735, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 2770 + }, + { + "epoch": 0.8973531310522918, + "grad_norm": 0.46945226192474365, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 2780 + }, + { + "epoch": 0.9005810200129115, + "grad_norm": 0.4837495684623718, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 2790 + }, + { + "epoch": 0.9038089089735313, + "grad_norm": 0.5059258937835693, + "learning_rate": 0.0002, + "loss": 0.7887, + "step": 2800 + }, + { + "epoch": 0.907036797934151, + "grad_norm": 0.4857945144176483, + "learning_rate": 0.0002, + "loss": 0.8571, + "step": 2810 + }, + { + "epoch": 0.9102646868947708, + "grad_norm": 0.5001962780952454, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 2820 + }, + { + "epoch": 0.9134925758553906, + "grad_norm": 0.5468648672103882, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 2830 + }, + { + "epoch": 0.9167204648160103, + "grad_norm": 0.5533056259155273, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 2840 + }, + { + "epoch": 0.9199483537766301, + "grad_norm": 0.5909785628318787, + "learning_rate": 0.0002, + "loss": 0.7895, + "step": 2850 + }, + { + "epoch": 0.9231762427372499, + "grad_norm": 0.47428104281425476, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 2860 + }, + { + "epoch": 0.9264041316978696, + "grad_norm": 0.548814058303833, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2870 + }, + { + "epoch": 0.9296320206584894, + "grad_norm": 0.5576745271682739, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 2880 + }, + { + "epoch": 0.9328599096191091, + "grad_norm": 0.47094792127609253, + "learning_rate": 0.0002, + "loss": 0.8399, + "step": 2890 + }, + { + "epoch": 0.9360877985797289, + "grad_norm": 0.5408539772033691, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 2900 + }, + { + "epoch": 0.9393156875403487, + "grad_norm": 0.5922889113426208, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 2910 + }, + { + "epoch": 0.9425435765009683, + "grad_norm": 0.45462584495544434, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2920 + }, + { + "epoch": 0.9457714654615881, + "grad_norm": 0.6864947080612183, + "learning_rate": 0.0002, + "loss": 0.8344, + "step": 2930 + }, + { + "epoch": 0.9489993544222078, + "grad_norm": 0.4706299304962158, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 2940 + }, + { + "epoch": 0.9522272433828276, + "grad_norm": 0.5583269596099854, + "learning_rate": 0.0002, + "loss": 0.8422, + "step": 2950 + }, + { + "epoch": 0.9554551323434474, + "grad_norm": 0.51015704870224, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 2960 + }, + { + "epoch": 0.9586830213040671, + "grad_norm": 0.5325582027435303, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 2970 + }, + { + "epoch": 0.9619109102646869, + "grad_norm": 0.49008598923683167, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 2980 + }, + { + "epoch": 0.9651387992253067, + "grad_norm": 0.4422132074832916, + "learning_rate": 0.0002, + "loss": 0.8093, + "step": 2990 + }, + { + "epoch": 0.9683666881859264, + "grad_norm": 0.5053589344024658, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 3000 + }, + { + "epoch": 0.9715945771465462, + "grad_norm": 0.46754521131515503, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 3010 + }, + { + "epoch": 0.9748224661071659, + "grad_norm": 0.5613434910774231, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 3020 + }, + { + "epoch": 0.9780503550677857, + "grad_norm": 0.5052843689918518, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 3030 + }, + { + "epoch": 0.9812782440284055, + "grad_norm": 0.4270972013473511, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 3040 + }, + { + "epoch": 0.9845061329890252, + "grad_norm": 0.4974991977214813, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 3050 + }, + { + "epoch": 0.9877340219496449, + "grad_norm": 0.4432311952114105, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 3060 + }, + { + "epoch": 0.9909619109102646, + "grad_norm": 0.466457724571228, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 3070 + }, + { + "epoch": 0.9941897998708844, + "grad_norm": 0.6438009142875671, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 3080 + }, + { + "epoch": 0.9974176888315042, + "grad_norm": 0.5593604445457458, + "learning_rate": 0.0002, + "loss": 0.8425, + "step": 3090 + }, + { + "epoch": 1.0, + "eval_loss": 1.0958120822906494, + "eval_runtime": 148.3273, + "eval_samples_per_second": 4.942, + "eval_steps_per_second": 0.62, + "step": 3098 + } + ], + "logging_steps": 10, + "max_steps": 24784, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4336850329704858e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f46f2b8e8752b125339f36f172c3878be4cdb152 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfc2a69e44a51edf5586ebed4b7ee915a23244c18c1f59e580471e4c9becfa98 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..05da3659324621f6b9e55356d621e68a0f531532 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9fdc7542c3140db0ec0e36b95a2f2720480264bcfdd0f3260e96f1fb3c64c09 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bab0f7e7fa2a7792815c4180b705a5a987e3873e --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d3bc504d4cedf343f18092a9c7470e0d591807026fbaf538abc1041fac205ea +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e1c51048f65e2744070f15d834c4adc4f5c2c3ba --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c09ba047da851b8f58a65be69dccbf67296fcd2fd53d20769d72c8ef3f0b003 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e3d1684e20cdc4c94d45b8a66f289259cef839de --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9017aa2a45dc562deab13e5021d9312539848c412104787ceb0f0b7db2d93b77 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f013b18553ac350bc800f5bf7b34e71e750b16ca --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/trainer_state.json @@ -0,0 +1,4382 @@ +{ + "best_metric": 1.0958120822906494, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 6196, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032278889606197547, + "grad_norm": 0.7092075347900391, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 10 + }, + { + "epoch": 0.006455777921239509, + "grad_norm": 0.6900479793548584, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 20 + }, + { + "epoch": 0.009683666881859263, + "grad_norm": 0.6788288950920105, + "learning_rate": 0.0002, + "loss": 0.9807, + "step": 30 + }, + { + "epoch": 0.012911555842479019, + "grad_norm": 0.5590243339538574, + "learning_rate": 0.0002, + "loss": 0.9385, + "step": 40 + }, + { + "epoch": 0.016139444803098774, + "grad_norm": 0.5136010646820068, + "learning_rate": 0.0002, + "loss": 0.931, + "step": 50 + }, + { + "epoch": 0.019367333763718526, + "grad_norm": 0.45298320055007935, + "learning_rate": 0.0002, + "loss": 0.8896, + "step": 60 + }, + { + "epoch": 0.022595222724338282, + "grad_norm": 0.5917162299156189, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 70 + }, + { + "epoch": 0.025823111684958037, + "grad_norm": 0.4414856433868408, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 80 + }, + { + "epoch": 0.029051000645577793, + "grad_norm": 0.5547978281974792, + "learning_rate": 0.0002, + "loss": 0.8419, + "step": 90 + }, + { + "epoch": 0.03227888960619755, + "grad_norm": 0.5271288156509399, + "learning_rate": 0.0002, + "loss": 0.8987, + "step": 100 + }, + { + "epoch": 0.035506778566817304, + "grad_norm": 0.5506119728088379, + "learning_rate": 0.0002, + "loss": 0.8543, + "step": 110 + }, + { + "epoch": 0.03873466752743705, + "grad_norm": 0.5579327940940857, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 120 + }, + { + "epoch": 0.04196255648805681, + "grad_norm": 0.5099632740020752, + "learning_rate": 0.0002, + "loss": 0.8826, + "step": 130 + }, + { + "epoch": 0.045190445448676564, + "grad_norm": 0.40396833419799805, + "learning_rate": 0.0002, + "loss": 0.9239, + "step": 140 + }, + { + "epoch": 0.04841833440929632, + "grad_norm": 0.5008092522621155, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 150 + }, + { + "epoch": 0.051646223369916075, + "grad_norm": 0.4388776421546936, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 160 + }, + { + "epoch": 0.05487411233053583, + "grad_norm": 0.44138944149017334, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 170 + }, + { + "epoch": 0.058102001291155586, + "grad_norm": 0.358484148979187, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 180 + }, + { + "epoch": 0.06132989025177534, + "grad_norm": 0.457052081823349, + "learning_rate": 0.0002, + "loss": 0.8956, + "step": 190 + }, + { + "epoch": 0.0645577792123951, + "grad_norm": 0.5537622570991516, + "learning_rate": 0.0002, + "loss": 0.9138, + "step": 200 + }, + { + "epoch": 0.06778566817301485, + "grad_norm": 0.552631676197052, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 210 + }, + { + "epoch": 0.07101355713363461, + "grad_norm": 0.4414575397968292, + "learning_rate": 0.0002, + "loss": 0.8854, + "step": 220 + }, + { + "epoch": 0.07424144609425436, + "grad_norm": 0.4996664226055145, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 230 + }, + { + "epoch": 0.0774693350548741, + "grad_norm": 0.7321897149085999, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 240 + }, + { + "epoch": 0.08069722401549387, + "grad_norm": 0.4553901255130768, + "learning_rate": 0.0002, + "loss": 0.8848, + "step": 250 + }, + { + "epoch": 0.08392511297611362, + "grad_norm": 0.5039054751396179, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 260 + }, + { + "epoch": 0.08715300193673338, + "grad_norm": 0.4113094210624695, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 270 + }, + { + "epoch": 0.09038089089735313, + "grad_norm": 0.450436532497406, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 280 + }, + { + "epoch": 0.09360877985797289, + "grad_norm": 0.4548024535179138, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 290 + }, + { + "epoch": 0.09683666881859264, + "grad_norm": 0.4932962656021118, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 300 + }, + { + "epoch": 0.1000645577792124, + "grad_norm": 0.4005250334739685, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 310 + }, + { + "epoch": 0.10329244673983215, + "grad_norm": 1.8321624994277954, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 320 + }, + { + "epoch": 0.1065203357004519, + "grad_norm": 0.45815610885620117, + "learning_rate": 0.0002, + "loss": 0.8411, + "step": 330 + }, + { + "epoch": 0.10974822466107166, + "grad_norm": 0.39324095845222473, + "learning_rate": 0.0002, + "loss": 0.857, + "step": 340 + }, + { + "epoch": 0.11297611362169141, + "grad_norm": 0.546273946762085, + "learning_rate": 0.0002, + "loss": 0.8258, + "step": 350 + }, + { + "epoch": 0.11620400258231117, + "grad_norm": 0.497448593378067, + "learning_rate": 0.0002, + "loss": 0.882, + "step": 360 + }, + { + "epoch": 0.11943189154293092, + "grad_norm": 0.37508800625801086, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 370 + }, + { + "epoch": 0.12265978050355068, + "grad_norm": 0.45849609375, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 380 + }, + { + "epoch": 0.12588766946417043, + "grad_norm": 0.5488408803939819, + "learning_rate": 0.0002, + "loss": 0.8437, + "step": 390 + }, + { + "epoch": 0.1291155584247902, + "grad_norm": 0.4477061331272125, + "learning_rate": 0.0002, + "loss": 0.8349, + "step": 400 + }, + { + "epoch": 0.13234344738540993, + "grad_norm": 0.39227980375289917, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 410 + }, + { + "epoch": 0.1355713363460297, + "grad_norm": 0.3922233581542969, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 420 + }, + { + "epoch": 0.13879922530664945, + "grad_norm": 0.42901909351348877, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 430 + }, + { + "epoch": 0.14202711426726922, + "grad_norm": 0.4217798709869385, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 440 + }, + { + "epoch": 0.14525500322788895, + "grad_norm": 0.43470677733421326, + "learning_rate": 0.0002, + "loss": 0.8594, + "step": 450 + }, + { + "epoch": 0.1484828921885087, + "grad_norm": 0.5324403047561646, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 460 + }, + { + "epoch": 0.15171078114912848, + "grad_norm": 0.3999756872653961, + "learning_rate": 0.0002, + "loss": 0.8729, + "step": 470 + }, + { + "epoch": 0.1549386701097482, + "grad_norm": 0.404933363199234, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 480 + }, + { + "epoch": 0.15816655907036797, + "grad_norm": 0.44122636318206787, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 490 + }, + { + "epoch": 0.16139444803098774, + "grad_norm": 0.510166347026825, + "learning_rate": 0.0002, + "loss": 0.8457, + "step": 500 + }, + { + "epoch": 0.1646223369916075, + "grad_norm": 0.4549732506275177, + "learning_rate": 0.0002, + "loss": 0.8692, + "step": 510 + }, + { + "epoch": 0.16785022595222723, + "grad_norm": 0.5148182511329651, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 520 + }, + { + "epoch": 0.171078114912847, + "grad_norm": 0.3596806824207306, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 530 + }, + { + "epoch": 0.17430600387346676, + "grad_norm": 0.4388909339904785, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 540 + }, + { + "epoch": 0.17753389283408652, + "grad_norm": 0.5052742958068848, + "learning_rate": 0.0002, + "loss": 0.8322, + "step": 550 + }, + { + "epoch": 0.18076178179470626, + "grad_norm": 0.48248958587646484, + "learning_rate": 0.0002, + "loss": 0.791, + "step": 560 + }, + { + "epoch": 0.18398967075532602, + "grad_norm": 0.5360197424888611, + "learning_rate": 0.0002, + "loss": 0.8593, + "step": 570 + }, + { + "epoch": 0.18721755971594578, + "grad_norm": 0.43999341130256653, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 580 + }, + { + "epoch": 0.19044544867656552, + "grad_norm": 0.3685208261013031, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 590 + }, + { + "epoch": 0.19367333763718528, + "grad_norm": 0.4601275622844696, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 600 + }, + { + "epoch": 0.19690122659780504, + "grad_norm": 0.4778369665145874, + "learning_rate": 0.0002, + "loss": 0.8483, + "step": 610 + }, + { + "epoch": 0.2001291155584248, + "grad_norm": 0.4867003560066223, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 620 + }, + { + "epoch": 0.20335700451904454, + "grad_norm": 0.4583742916584015, + "learning_rate": 0.0002, + "loss": 0.8554, + "step": 630 + }, + { + "epoch": 0.2065848934796643, + "grad_norm": 0.47958165407180786, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 640 + }, + { + "epoch": 0.20981278244028406, + "grad_norm": 0.4526064097881317, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 650 + }, + { + "epoch": 0.2130406714009038, + "grad_norm": 0.45890581607818604, + "learning_rate": 0.0002, + "loss": 0.8313, + "step": 660 + }, + { + "epoch": 0.21626856036152356, + "grad_norm": 0.42725905776023865, + "learning_rate": 0.0002, + "loss": 0.8143, + "step": 670 + }, + { + "epoch": 0.21949644932214332, + "grad_norm": 0.40380963683128357, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 680 + }, + { + "epoch": 0.22272433828276308, + "grad_norm": 0.4372998774051666, + "learning_rate": 0.0002, + "loss": 0.9004, + "step": 690 + }, + { + "epoch": 0.22595222724338282, + "grad_norm": 0.4245864450931549, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 700 + }, + { + "epoch": 0.22918011620400258, + "grad_norm": 0.4061129689216614, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 710 + }, + { + "epoch": 0.23240800516462234, + "grad_norm": 0.474454790353775, + "learning_rate": 0.0002, + "loss": 0.8275, + "step": 720 + }, + { + "epoch": 0.23563589412524208, + "grad_norm": 0.4908486008644104, + "learning_rate": 0.0002, + "loss": 0.8346, + "step": 730 + }, + { + "epoch": 0.23886378308586184, + "grad_norm": 0.4284191429615021, + "learning_rate": 0.0002, + "loss": 0.8755, + "step": 740 + }, + { + "epoch": 0.2420916720464816, + "grad_norm": 0.44730308651924133, + "learning_rate": 0.0002, + "loss": 0.8387, + "step": 750 + }, + { + "epoch": 0.24531956100710137, + "grad_norm": 0.4433246850967407, + "learning_rate": 0.0002, + "loss": 0.8135, + "step": 760 + }, + { + "epoch": 0.2485474499677211, + "grad_norm": 0.43668854236602783, + "learning_rate": 0.0002, + "loss": 0.8644, + "step": 770 + }, + { + "epoch": 0.25177533892834086, + "grad_norm": 0.34324130415916443, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 780 + }, + { + "epoch": 0.2550032278889606, + "grad_norm": 0.46476295590400696, + "learning_rate": 0.0002, + "loss": 0.8725, + "step": 790 + }, + { + "epoch": 0.2582311168495804, + "grad_norm": 0.5047039985656738, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 800 + }, + { + "epoch": 0.26145900581020015, + "grad_norm": 0.4402127265930176, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 810 + }, + { + "epoch": 0.26468689477081986, + "grad_norm": 0.4642465114593506, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 820 + }, + { + "epoch": 0.2679147837314396, + "grad_norm": 0.40093424916267395, + "learning_rate": 0.0002, + "loss": 0.8836, + "step": 830 + }, + { + "epoch": 0.2711426726920594, + "grad_norm": 0.42501842975616455, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 840 + }, + { + "epoch": 0.27437056165267915, + "grad_norm": 0.43279722332954407, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 850 + }, + { + "epoch": 0.2775984506132989, + "grad_norm": 0.5991243720054626, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 860 + }, + { + "epoch": 0.28082633957391867, + "grad_norm": 0.4217848777770996, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 870 + }, + { + "epoch": 0.28405422853453843, + "grad_norm": 0.3933536410331726, + "learning_rate": 0.0002, + "loss": 0.8135, + "step": 880 + }, + { + "epoch": 0.28728211749515814, + "grad_norm": 0.5868505239486694, + "learning_rate": 0.0002, + "loss": 0.8846, + "step": 890 + }, + { + "epoch": 0.2905100064557779, + "grad_norm": 0.5209547877311707, + "learning_rate": 0.0002, + "loss": 0.8759, + "step": 900 + }, + { + "epoch": 0.29373789541639767, + "grad_norm": 0.49307361245155334, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 910 + }, + { + "epoch": 0.2969657843770174, + "grad_norm": 0.4288382828235626, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 920 + }, + { + "epoch": 0.3001936733376372, + "grad_norm": 0.33568474650382996, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 930 + }, + { + "epoch": 0.30342156229825695, + "grad_norm": 1.0915930271148682, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 940 + }, + { + "epoch": 0.3066494512588767, + "grad_norm": 0.5489798188209534, + "learning_rate": 0.0002, + "loss": 0.8535, + "step": 950 + }, + { + "epoch": 0.3098773402194964, + "grad_norm": 0.42971742153167725, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 960 + }, + { + "epoch": 0.3131052291801162, + "grad_norm": 0.43375834822654724, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 970 + }, + { + "epoch": 0.31633311814073595, + "grad_norm": 0.47488611936569214, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 980 + }, + { + "epoch": 0.3195610071013557, + "grad_norm": 0.46296775341033936, + "learning_rate": 0.0002, + "loss": 0.7906, + "step": 990 + }, + { + "epoch": 0.32278889606197547, + "grad_norm": 0.4548890292644501, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 1000 + }, + { + "epoch": 0.32601678502259523, + "grad_norm": 0.41834497451782227, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 1010 + }, + { + "epoch": 0.329244673983215, + "grad_norm": 0.441092312335968, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 1020 + }, + { + "epoch": 0.33247256294383476, + "grad_norm": 0.637322187423706, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 1030 + }, + { + "epoch": 0.33570045190445447, + "grad_norm": 0.4374958574771881, + "learning_rate": 0.0002, + "loss": 0.8685, + "step": 1040 + }, + { + "epoch": 0.33892834086507423, + "grad_norm": 0.3935825824737549, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1050 + }, + { + "epoch": 0.342156229825694, + "grad_norm": 0.43526220321655273, + "learning_rate": 0.0002, + "loss": 0.8287, + "step": 1060 + }, + { + "epoch": 0.34538411878631375, + "grad_norm": 0.45327696204185486, + "learning_rate": 0.0002, + "loss": 0.8413, + "step": 1070 + }, + { + "epoch": 0.3486120077469335, + "grad_norm": 0.4126075506210327, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 1080 + }, + { + "epoch": 0.3518398967075533, + "grad_norm": 0.4714072048664093, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 1090 + }, + { + "epoch": 0.35506778566817304, + "grad_norm": 0.518127977848053, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 1100 + }, + { + "epoch": 0.35829567462879275, + "grad_norm": 0.43264099955558777, + "learning_rate": 0.0002, + "loss": 0.8479, + "step": 1110 + }, + { + "epoch": 0.3615235635894125, + "grad_norm": 0.4857400357723236, + "learning_rate": 0.0002, + "loss": 0.8724, + "step": 1120 + }, + { + "epoch": 0.3647514525500323, + "grad_norm": 0.37591469287872314, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 1130 + }, + { + "epoch": 0.36797934151065204, + "grad_norm": 0.4165478050708771, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 1140 + }, + { + "epoch": 0.3712072304712718, + "grad_norm": 0.42911383509635925, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 1150 + }, + { + "epoch": 0.37443511943189156, + "grad_norm": 0.44980287551879883, + "learning_rate": 0.0002, + "loss": 0.8722, + "step": 1160 + }, + { + "epoch": 0.3776630083925113, + "grad_norm": 0.4066573679447174, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 1170 + }, + { + "epoch": 0.38089089735313103, + "grad_norm": 0.5056195855140686, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 1180 + }, + { + "epoch": 0.3841187863137508, + "grad_norm": 0.4141536355018616, + "learning_rate": 0.0002, + "loss": 0.8387, + "step": 1190 + }, + { + "epoch": 0.38734667527437056, + "grad_norm": 0.4501924514770508, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 1200 + }, + { + "epoch": 0.3905745642349903, + "grad_norm": 0.43304240703582764, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 1210 + }, + { + "epoch": 0.3938024531956101, + "grad_norm": 0.475777804851532, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 1220 + }, + { + "epoch": 0.39703034215622984, + "grad_norm": 0.5846465826034546, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 1230 + }, + { + "epoch": 0.4002582311168496, + "grad_norm": 0.42899325489997864, + "learning_rate": 0.0002, + "loss": 0.8078, + "step": 1240 + }, + { + "epoch": 0.4034861200774693, + "grad_norm": 0.3980463147163391, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 1250 + }, + { + "epoch": 0.4067140090380891, + "grad_norm": 0.45769768953323364, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 1260 + }, + { + "epoch": 0.40994189799870884, + "grad_norm": 0.5101280212402344, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 1270 + }, + { + "epoch": 0.4131697869593286, + "grad_norm": 0.47374317049980164, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 1280 + }, + { + "epoch": 0.41639767591994836, + "grad_norm": 0.4261878728866577, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 1290 + }, + { + "epoch": 0.4196255648805681, + "grad_norm": 0.46954256296157837, + "learning_rate": 0.0002, + "loss": 0.9004, + "step": 1300 + }, + { + "epoch": 0.4228534538411879, + "grad_norm": 0.5205738544464111, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 1310 + }, + { + "epoch": 0.4260813428018076, + "grad_norm": 0.5176340937614441, + "learning_rate": 0.0002, + "loss": 0.8964, + "step": 1320 + }, + { + "epoch": 0.42930923176242736, + "grad_norm": 0.5155916810035706, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 1330 + }, + { + "epoch": 0.4325371207230471, + "grad_norm": 0.44548553228378296, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 1340 + }, + { + "epoch": 0.4357650096836669, + "grad_norm": 0.5633558630943298, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 1350 + }, + { + "epoch": 0.43899289864428664, + "grad_norm": 0.42444056272506714, + "learning_rate": 0.0002, + "loss": 0.7889, + "step": 1360 + }, + { + "epoch": 0.4422207876049064, + "grad_norm": 0.5226860642433167, + "learning_rate": 0.0002, + "loss": 0.8588, + "step": 1370 + }, + { + "epoch": 0.44544867656552617, + "grad_norm": 0.5354582071304321, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 1380 + }, + { + "epoch": 0.4486765655261459, + "grad_norm": 0.472646564245224, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 1390 + }, + { + "epoch": 0.45190445448676564, + "grad_norm": 0.6312310099601746, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 1400 + }, + { + "epoch": 0.4551323434473854, + "grad_norm": 0.4298408031463623, + "learning_rate": 0.0002, + "loss": 0.8212, + "step": 1410 + }, + { + "epoch": 0.45836023240800516, + "grad_norm": 0.43427202105522156, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 1420 + }, + { + "epoch": 0.4615881213686249, + "grad_norm": 0.44097861647605896, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 1430 + }, + { + "epoch": 0.4648160103292447, + "grad_norm": 0.5142693519592285, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1440 + }, + { + "epoch": 0.46804389928986445, + "grad_norm": 0.46416547894477844, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 1450 + }, + { + "epoch": 0.47127178825048416, + "grad_norm": 0.4858551025390625, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 1460 + }, + { + "epoch": 0.4744996772111039, + "grad_norm": 0.4709177315235138, + "learning_rate": 0.0002, + "loss": 0.8354, + "step": 1470 + }, + { + "epoch": 0.4777275661717237, + "grad_norm": 0.5500252842903137, + "learning_rate": 0.0002, + "loss": 0.8391, + "step": 1480 + }, + { + "epoch": 0.48095545513234345, + "grad_norm": 0.43364381790161133, + "learning_rate": 0.0002, + "loss": 0.8359, + "step": 1490 + }, + { + "epoch": 0.4841833440929632, + "grad_norm": 0.47712287306785583, + "learning_rate": 0.0002, + "loss": 0.8446, + "step": 1500 + }, + { + "epoch": 0.48741123305358297, + "grad_norm": 0.4518495202064514, + "learning_rate": 0.0002, + "loss": 0.8518, + "step": 1510 + }, + { + "epoch": 0.49063912201420273, + "grad_norm": 0.4539008140563965, + "learning_rate": 0.0002, + "loss": 0.819, + "step": 1520 + }, + { + "epoch": 0.49386701097482244, + "grad_norm": 0.4993067979812622, + "learning_rate": 0.0002, + "loss": 0.8276, + "step": 1530 + }, + { + "epoch": 0.4970948999354422, + "grad_norm": 0.6094803214073181, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 1540 + }, + { + "epoch": 0.500322788896062, + "grad_norm": 0.48602527379989624, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 1550 + }, + { + "epoch": 0.5035506778566817, + "grad_norm": 0.40245795249938965, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 1560 + }, + { + "epoch": 0.5067785668173015, + "grad_norm": 0.456787645816803, + "learning_rate": 0.0002, + "loss": 0.7907, + "step": 1570 + }, + { + "epoch": 0.5100064557779213, + "grad_norm": 0.43936216831207275, + "learning_rate": 0.0002, + "loss": 0.86, + "step": 1580 + }, + { + "epoch": 0.513234344738541, + "grad_norm": 0.549018144607544, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 1590 + }, + { + "epoch": 0.5164622336991608, + "grad_norm": 0.41746795177459717, + "learning_rate": 0.0002, + "loss": 0.8169, + "step": 1600 + }, + { + "epoch": 0.5196901226597805, + "grad_norm": 0.4217053949832916, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 1610 + }, + { + "epoch": 0.5229180116204003, + "grad_norm": 0.449913889169693, + "learning_rate": 0.0002, + "loss": 0.8161, + "step": 1620 + }, + { + "epoch": 0.5261459005810201, + "grad_norm": 0.5084872245788574, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 1630 + }, + { + "epoch": 0.5293737895416397, + "grad_norm": 0.46248653531074524, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 1640 + }, + { + "epoch": 0.5326016785022595, + "grad_norm": 0.4824236035346985, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 1650 + }, + { + "epoch": 0.5358295674628792, + "grad_norm": 0.6010985374450684, + "learning_rate": 0.0002, + "loss": 0.8711, + "step": 1660 + }, + { + "epoch": 0.539057456423499, + "grad_norm": 0.4757920801639557, + "learning_rate": 0.0002, + "loss": 0.8266, + "step": 1670 + }, + { + "epoch": 0.5422853453841188, + "grad_norm": 0.45161882042884827, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 1680 + }, + { + "epoch": 0.5455132343447385, + "grad_norm": 0.49314990639686584, + "learning_rate": 0.0002, + "loss": 0.8141, + "step": 1690 + }, + { + "epoch": 0.5487411233053583, + "grad_norm": 0.3918305039405823, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 1700 + }, + { + "epoch": 0.551969012265978, + "grad_norm": 0.5966728925704956, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 1710 + }, + { + "epoch": 0.5551969012265978, + "grad_norm": 0.4208986163139343, + "learning_rate": 0.0002, + "loss": 0.8438, + "step": 1720 + }, + { + "epoch": 0.5584247901872176, + "grad_norm": 0.43724218010902405, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 1730 + }, + { + "epoch": 0.5616526791478373, + "grad_norm": 0.5287272930145264, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 1740 + }, + { + "epoch": 0.5648805681084571, + "grad_norm": 0.4961899518966675, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 1750 + }, + { + "epoch": 0.5681084570690769, + "grad_norm": 0.4468635320663452, + "learning_rate": 0.0002, + "loss": 0.8029, + "step": 1760 + }, + { + "epoch": 0.5713363460296966, + "grad_norm": 0.6423530578613281, + "learning_rate": 0.0002, + "loss": 0.7968, + "step": 1770 + }, + { + "epoch": 0.5745642349903163, + "grad_norm": 0.4601971507072449, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 1780 + }, + { + "epoch": 0.577792123950936, + "grad_norm": 0.46514901518821716, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 1790 + }, + { + "epoch": 0.5810200129115558, + "grad_norm": 0.4771687388420105, + "learning_rate": 0.0002, + "loss": 0.8186, + "step": 1800 + }, + { + "epoch": 0.5842479018721756, + "grad_norm": 0.46514490246772766, + "learning_rate": 0.0002, + "loss": 0.856, + "step": 1810 + }, + { + "epoch": 0.5874757908327953, + "grad_norm": 0.5373936295509338, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 1820 + }, + { + "epoch": 0.5907036797934151, + "grad_norm": 0.5175791382789612, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 1830 + }, + { + "epoch": 0.5939315687540349, + "grad_norm": 0.4522802233695984, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 1840 + }, + { + "epoch": 0.5971594577146546, + "grad_norm": 0.42987772822380066, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 1850 + }, + { + "epoch": 0.6003873466752744, + "grad_norm": 0.5566838383674622, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 1860 + }, + { + "epoch": 0.6036152356358941, + "grad_norm": 0.42807698249816895, + "learning_rate": 0.0002, + "loss": 0.8312, + "step": 1870 + }, + { + "epoch": 0.6068431245965139, + "grad_norm": 0.4957767724990845, + "learning_rate": 0.0002, + "loss": 0.8035, + "step": 1880 + }, + { + "epoch": 0.6100710135571337, + "grad_norm": 0.4260980188846588, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 1890 + }, + { + "epoch": 0.6132989025177534, + "grad_norm": 0.4777357876300812, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 1900 + }, + { + "epoch": 0.6165267914783732, + "grad_norm": 0.4434216022491455, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 1910 + }, + { + "epoch": 0.6197546804389928, + "grad_norm": 0.5215433835983276, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 1920 + }, + { + "epoch": 0.6229825693996126, + "grad_norm": 0.5143248438835144, + "learning_rate": 0.0002, + "loss": 0.82, + "step": 1930 + }, + { + "epoch": 0.6262104583602324, + "grad_norm": 0.5213413238525391, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 1940 + }, + { + "epoch": 0.6294383473208521, + "grad_norm": 0.5408226251602173, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 1950 + }, + { + "epoch": 0.6326662362814719, + "grad_norm": 0.5479708909988403, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 1960 + }, + { + "epoch": 0.6358941252420917, + "grad_norm": 0.4490949809551239, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 1970 + }, + { + "epoch": 0.6391220142027114, + "grad_norm": 0.48815059661865234, + "learning_rate": 0.0002, + "loss": 0.854, + "step": 1980 + }, + { + "epoch": 0.6423499031633312, + "grad_norm": 0.46498045325279236, + "learning_rate": 0.0002, + "loss": 0.8568, + "step": 1990 + }, + { + "epoch": 0.6455777921239509, + "grad_norm": 0.5136561393737793, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 2000 + }, + { + "epoch": 0.6488056810845707, + "grad_norm": 0.5145719647407532, + "learning_rate": 0.0002, + "loss": 0.8503, + "step": 2010 + }, + { + "epoch": 0.6520335700451905, + "grad_norm": 0.5430373549461365, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 2020 + }, + { + "epoch": 0.6552614590058102, + "grad_norm": 0.46347954869270325, + "learning_rate": 0.0002, + "loss": 0.8115, + "step": 2030 + }, + { + "epoch": 0.65848934796643, + "grad_norm": 0.5189562439918518, + "learning_rate": 0.0002, + "loss": 0.8769, + "step": 2040 + }, + { + "epoch": 0.6617172369270498, + "grad_norm": 0.43843990564346313, + "learning_rate": 0.0002, + "loss": 0.8453, + "step": 2050 + }, + { + "epoch": 0.6649451258876695, + "grad_norm": 0.4654983580112457, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 2060 + }, + { + "epoch": 0.6681730148482892, + "grad_norm": 0.44835716485977173, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 2070 + }, + { + "epoch": 0.6714009038089089, + "grad_norm": 0.38811734318733215, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2080 + }, + { + "epoch": 0.6746287927695287, + "grad_norm": 0.5709853172302246, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 2090 + }, + { + "epoch": 0.6778566817301485, + "grad_norm": 0.49994757771492004, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 2100 + }, + { + "epoch": 0.6810845706907682, + "grad_norm": 0.5505402684211731, + "learning_rate": 0.0002, + "loss": 0.8, + "step": 2110 + }, + { + "epoch": 0.684312459651388, + "grad_norm": 0.48195120692253113, + "learning_rate": 0.0002, + "loss": 0.8227, + "step": 2120 + }, + { + "epoch": 0.6875403486120077, + "grad_norm": 0.4854775071144104, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 2130 + }, + { + "epoch": 0.6907682375726275, + "grad_norm": 0.6422494649887085, + "learning_rate": 0.0002, + "loss": 0.8231, + "step": 2140 + }, + { + "epoch": 0.6939961265332473, + "grad_norm": 0.3972536027431488, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 2150 + }, + { + "epoch": 0.697224015493867, + "grad_norm": 0.4297836422920227, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 2160 + }, + { + "epoch": 0.7004519044544868, + "grad_norm": 0.45486778020858765, + "learning_rate": 0.0002, + "loss": 0.8017, + "step": 2170 + }, + { + "epoch": 0.7036797934151066, + "grad_norm": 0.4706047773361206, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 2180 + }, + { + "epoch": 0.7069076823757263, + "grad_norm": 0.46426892280578613, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 2190 + }, + { + "epoch": 0.7101355713363461, + "grad_norm": 0.46333715319633484, + "learning_rate": 0.0002, + "loss": 0.8472, + "step": 2200 + }, + { + "epoch": 0.7133634602969657, + "grad_norm": 0.4632524251937866, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 2210 + }, + { + "epoch": 0.7165913492575855, + "grad_norm": 0.4610830843448639, + "learning_rate": 0.0002, + "loss": 0.8452, + "step": 2220 + }, + { + "epoch": 0.7198192382182053, + "grad_norm": 0.4905324876308441, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 2230 + }, + { + "epoch": 0.723047127178825, + "grad_norm": 0.4936263859272003, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 2240 + }, + { + "epoch": 0.7262750161394448, + "grad_norm": 0.40778425335884094, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 2250 + }, + { + "epoch": 0.7295029051000645, + "grad_norm": 0.50351482629776, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 2260 + }, + { + "epoch": 0.7327307940606843, + "grad_norm": 0.4894128143787384, + "learning_rate": 0.0002, + "loss": 0.8475, + "step": 2270 + }, + { + "epoch": 0.7359586830213041, + "grad_norm": 0.5580906271934509, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 2280 + }, + { + "epoch": 0.7391865719819238, + "grad_norm": 0.4655369520187378, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 2290 + }, + { + "epoch": 0.7424144609425436, + "grad_norm": 0.4666965901851654, + "learning_rate": 0.0002, + "loss": 0.8395, + "step": 2300 + }, + { + "epoch": 0.7456423499031634, + "grad_norm": 0.46259936690330505, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 2310 + }, + { + "epoch": 0.7488702388637831, + "grad_norm": 0.520706832408905, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 2320 + }, + { + "epoch": 0.7520981278244029, + "grad_norm": 0.5142408013343811, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 2330 + }, + { + "epoch": 0.7553260167850226, + "grad_norm": 0.5355164408683777, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 2340 + }, + { + "epoch": 0.7585539057456423, + "grad_norm": 0.5517185926437378, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 2350 + }, + { + "epoch": 0.7617817947062621, + "grad_norm": 0.7162677049636841, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 2360 + }, + { + "epoch": 0.7650096836668818, + "grad_norm": 0.42402133345603943, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 2370 + }, + { + "epoch": 0.7682375726275016, + "grad_norm": 0.47180113196372986, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 2380 + }, + { + "epoch": 0.7714654615881213, + "grad_norm": 0.6262288689613342, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 2390 + }, + { + "epoch": 0.7746933505487411, + "grad_norm": 0.5177528262138367, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 2400 + }, + { + "epoch": 0.7779212395093609, + "grad_norm": 0.555721640586853, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 2410 + }, + { + "epoch": 0.7811491284699806, + "grad_norm": 0.5592644810676575, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 2420 + }, + { + "epoch": 0.7843770174306004, + "grad_norm": 0.38025397062301636, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 2430 + }, + { + "epoch": 0.7876049063912202, + "grad_norm": 0.4597472548484802, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 2440 + }, + { + "epoch": 0.7908327953518399, + "grad_norm": 0.4929825961589813, + "learning_rate": 0.0002, + "loss": 0.8575, + "step": 2450 + }, + { + "epoch": 0.7940606843124597, + "grad_norm": 0.45277655124664307, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 2460 + }, + { + "epoch": 0.7972885732730794, + "grad_norm": 0.6224122643470764, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2470 + }, + { + "epoch": 0.8005164622336992, + "grad_norm": 0.5740901827812195, + "learning_rate": 0.0002, + "loss": 0.8449, + "step": 2480 + }, + { + "epoch": 0.8037443511943189, + "grad_norm": 0.41335329413414, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 2490 + }, + { + "epoch": 0.8069722401549386, + "grad_norm": 0.4738694131374359, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 2500 + }, + { + "epoch": 0.8102001291155584, + "grad_norm": 0.5288197994232178, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 2510 + }, + { + "epoch": 0.8134280180761781, + "grad_norm": 0.5404666066169739, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 2520 + }, + { + "epoch": 0.8166559070367979, + "grad_norm": 0.4444909691810608, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 2530 + }, + { + "epoch": 0.8198837959974177, + "grad_norm": 0.542061448097229, + "learning_rate": 0.0002, + "loss": 0.8683, + "step": 2540 + }, + { + "epoch": 0.8231116849580374, + "grad_norm": 0.4914741814136505, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 2550 + }, + { + "epoch": 0.8263395739186572, + "grad_norm": 0.41703441739082336, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 2560 + }, + { + "epoch": 0.829567462879277, + "grad_norm": 0.5489841103553772, + "learning_rate": 0.0002, + "loss": 0.824, + "step": 2570 + }, + { + "epoch": 0.8327953518398967, + "grad_norm": 0.5359883308410645, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 2580 + }, + { + "epoch": 0.8360232408005165, + "grad_norm": 0.5541019439697266, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 2590 + }, + { + "epoch": 0.8392511297611362, + "grad_norm": 0.4746638834476471, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 2600 + }, + { + "epoch": 0.842479018721756, + "grad_norm": 0.5243194103240967, + "learning_rate": 0.0002, + "loss": 0.8116, + "step": 2610 + }, + { + "epoch": 0.8457069076823758, + "grad_norm": 0.46824976801872253, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 2620 + }, + { + "epoch": 0.8489347966429954, + "grad_norm": 0.49487847089767456, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 2630 + }, + { + "epoch": 0.8521626856036152, + "grad_norm": 0.42180097103118896, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 2640 + }, + { + "epoch": 0.855390574564235, + "grad_norm": 0.5516560077667236, + "learning_rate": 0.0002, + "loss": 0.8304, + "step": 2650 + }, + { + "epoch": 0.8586184635248547, + "grad_norm": 0.4392191767692566, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 2660 + }, + { + "epoch": 0.8618463524854745, + "grad_norm": 0.5387210845947266, + "learning_rate": 0.0002, + "loss": 0.848, + "step": 2670 + }, + { + "epoch": 0.8650742414460942, + "grad_norm": 0.6232406497001648, + "learning_rate": 0.0002, + "loss": 0.8094, + "step": 2680 + }, + { + "epoch": 0.868302130406714, + "grad_norm": 0.53749018907547, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 2690 + }, + { + "epoch": 0.8715300193673338, + "grad_norm": 0.47480374574661255, + "learning_rate": 0.0002, + "loss": 0.8299, + "step": 2700 + }, + { + "epoch": 0.8747579083279535, + "grad_norm": 0.44618046283721924, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 2710 + }, + { + "epoch": 0.8779857972885733, + "grad_norm": 0.4173581302165985, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 2720 + }, + { + "epoch": 0.881213686249193, + "grad_norm": 0.524081289768219, + "learning_rate": 0.0002, + "loss": 0.7713, + "step": 2730 + }, + { + "epoch": 0.8844415752098128, + "grad_norm": 0.5608431100845337, + "learning_rate": 0.0002, + "loss": 0.8738, + "step": 2740 + }, + { + "epoch": 0.8876694641704326, + "grad_norm": 0.5212284922599792, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 2750 + }, + { + "epoch": 0.8908973531310523, + "grad_norm": 0.5601475834846497, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 2760 + }, + { + "epoch": 0.8941252420916721, + "grad_norm": 0.4499223828315735, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 2770 + }, + { + "epoch": 0.8973531310522918, + "grad_norm": 0.46945226192474365, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 2780 + }, + { + "epoch": 0.9005810200129115, + "grad_norm": 0.4837495684623718, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 2790 + }, + { + "epoch": 0.9038089089735313, + "grad_norm": 0.5059258937835693, + "learning_rate": 0.0002, + "loss": 0.7887, + "step": 2800 + }, + { + "epoch": 0.907036797934151, + "grad_norm": 0.4857945144176483, + "learning_rate": 0.0002, + "loss": 0.8571, + "step": 2810 + }, + { + "epoch": 0.9102646868947708, + "grad_norm": 0.5001962780952454, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 2820 + }, + { + "epoch": 0.9134925758553906, + "grad_norm": 0.5468648672103882, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 2830 + }, + { + "epoch": 0.9167204648160103, + "grad_norm": 0.5533056259155273, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 2840 + }, + { + "epoch": 0.9199483537766301, + "grad_norm": 0.5909785628318787, + "learning_rate": 0.0002, + "loss": 0.7895, + "step": 2850 + }, + { + "epoch": 0.9231762427372499, + "grad_norm": 0.47428104281425476, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 2860 + }, + { + "epoch": 0.9264041316978696, + "grad_norm": 0.548814058303833, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2870 + }, + { + "epoch": 0.9296320206584894, + "grad_norm": 0.5576745271682739, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 2880 + }, + { + "epoch": 0.9328599096191091, + "grad_norm": 0.47094792127609253, + "learning_rate": 0.0002, + "loss": 0.8399, + "step": 2890 + }, + { + "epoch": 0.9360877985797289, + "grad_norm": 0.5408539772033691, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 2900 + }, + { + "epoch": 0.9393156875403487, + "grad_norm": 0.5922889113426208, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 2910 + }, + { + "epoch": 0.9425435765009683, + "grad_norm": 0.45462584495544434, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2920 + }, + { + "epoch": 0.9457714654615881, + "grad_norm": 0.6864947080612183, + "learning_rate": 0.0002, + "loss": 0.8344, + "step": 2930 + }, + { + "epoch": 0.9489993544222078, + "grad_norm": 0.4706299304962158, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 2940 + }, + { + "epoch": 0.9522272433828276, + "grad_norm": 0.5583269596099854, + "learning_rate": 0.0002, + "loss": 0.8422, + "step": 2950 + }, + { + "epoch": 0.9554551323434474, + "grad_norm": 0.51015704870224, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 2960 + }, + { + "epoch": 0.9586830213040671, + "grad_norm": 0.5325582027435303, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 2970 + }, + { + "epoch": 0.9619109102646869, + "grad_norm": 0.49008598923683167, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 2980 + }, + { + "epoch": 0.9651387992253067, + "grad_norm": 0.4422132074832916, + "learning_rate": 0.0002, + "loss": 0.8093, + "step": 2990 + }, + { + "epoch": 0.9683666881859264, + "grad_norm": 0.5053589344024658, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 3000 + }, + { + "epoch": 0.9715945771465462, + "grad_norm": 0.46754521131515503, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 3010 + }, + { + "epoch": 0.9748224661071659, + "grad_norm": 0.5613434910774231, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 3020 + }, + { + "epoch": 0.9780503550677857, + "grad_norm": 0.5052843689918518, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 3030 + }, + { + "epoch": 0.9812782440284055, + "grad_norm": 0.4270972013473511, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 3040 + }, + { + "epoch": 0.9845061329890252, + "grad_norm": 0.4974991977214813, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 3050 + }, + { + "epoch": 0.9877340219496449, + "grad_norm": 0.4432311952114105, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 3060 + }, + { + "epoch": 0.9909619109102646, + "grad_norm": 0.466457724571228, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 3070 + }, + { + "epoch": 0.9941897998708844, + "grad_norm": 0.6438009142875671, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 3080 + }, + { + "epoch": 0.9974176888315042, + "grad_norm": 0.5593604445457458, + "learning_rate": 0.0002, + "loss": 0.8425, + "step": 3090 + }, + { + "epoch": 1.0, + "eval_loss": 1.0958120822906494, + "eval_runtime": 148.3273, + "eval_samples_per_second": 4.942, + "eval_steps_per_second": 0.62, + "step": 3098 + }, + { + "epoch": 1.000645577792124, + "grad_norm": 0.5701445937156677, + "learning_rate": 0.0002, + "loss": 0.8275, + "step": 3100 + }, + { + "epoch": 1.0038734667527438, + "grad_norm": 0.6089657545089722, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 3110 + }, + { + "epoch": 1.0071013557133635, + "grad_norm": 0.5619552135467529, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 3120 + }, + { + "epoch": 1.010329244673983, + "grad_norm": 0.5550283789634705, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 3130 + }, + { + "epoch": 1.013557133634603, + "grad_norm": 0.6221792101860046, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3140 + }, + { + "epoch": 1.0167850225952226, + "grad_norm": 0.5450758934020996, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 3150 + }, + { + "epoch": 1.0200129115558425, + "grad_norm": 0.4359588027000427, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 3160 + }, + { + "epoch": 1.0232408005164622, + "grad_norm": 0.5932239890098572, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 3170 + }, + { + "epoch": 1.026468689477082, + "grad_norm": 0.45478707551956177, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 3180 + }, + { + "epoch": 1.0296965784377017, + "grad_norm": 0.677615761756897, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 3190 + }, + { + "epoch": 1.0329244673983216, + "grad_norm": 0.6231790781021118, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3200 + }, + { + "epoch": 1.0361523563589412, + "grad_norm": 0.5074195861816406, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 3210 + }, + { + "epoch": 1.039380245319561, + "grad_norm": 0.4844142198562622, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 3220 + }, + { + "epoch": 1.0426081342801807, + "grad_norm": 0.5372750759124756, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 3230 + }, + { + "epoch": 1.0458360232408006, + "grad_norm": 0.46296265721321106, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 3240 + }, + { + "epoch": 1.0490639122014203, + "grad_norm": 0.5417148470878601, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 3250 + }, + { + "epoch": 1.0522918011620401, + "grad_norm": 0.5695074200630188, + "learning_rate": 0.0002, + "loss": 0.7637, + "step": 3260 + }, + { + "epoch": 1.0555196901226598, + "grad_norm": 0.5050092935562134, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 3270 + }, + { + "epoch": 1.0587475790832794, + "grad_norm": 0.5320752263069153, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 3280 + }, + { + "epoch": 1.0619754680438993, + "grad_norm": 0.5832052230834961, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 3290 + }, + { + "epoch": 1.065203357004519, + "grad_norm": 0.5228804349899292, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 3300 + }, + { + "epoch": 1.0684312459651388, + "grad_norm": 0.5819445252418518, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 3310 + }, + { + "epoch": 1.0716591349257585, + "grad_norm": 0.4201328754425049, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 3320 + }, + { + "epoch": 1.0748870238863784, + "grad_norm": 0.5424145460128784, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 3330 + }, + { + "epoch": 1.078114912846998, + "grad_norm": 0.6169946789741516, + "learning_rate": 0.0002, + "loss": 0.7828, + "step": 3340 + }, + { + "epoch": 1.0813428018076179, + "grad_norm": 0.607676088809967, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 3350 + }, + { + "epoch": 1.0845706907682375, + "grad_norm": 0.5191982388496399, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 3360 + }, + { + "epoch": 1.0877985797288574, + "grad_norm": 0.5728003978729248, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 3370 + }, + { + "epoch": 1.091026468689477, + "grad_norm": 0.5402643084526062, + "learning_rate": 0.0002, + "loss": 0.7381, + "step": 3380 + }, + { + "epoch": 1.094254357650097, + "grad_norm": 0.5377541780471802, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 3390 + }, + { + "epoch": 1.0974822466107166, + "grad_norm": 0.4751385748386383, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 3400 + }, + { + "epoch": 1.1007101355713362, + "grad_norm": 0.559158444404602, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 3410 + }, + { + "epoch": 1.103938024531956, + "grad_norm": 0.4917701482772827, + "learning_rate": 0.0002, + "loss": 0.7366, + "step": 3420 + }, + { + "epoch": 1.1071659134925758, + "grad_norm": 0.5507875084877014, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 3430 + }, + { + "epoch": 1.1103938024531956, + "grad_norm": 0.45458680391311646, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 3440 + }, + { + "epoch": 1.1136216914138153, + "grad_norm": 0.5721744894981384, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 3450 + }, + { + "epoch": 1.1168495803744352, + "grad_norm": 0.5776081681251526, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 3460 + }, + { + "epoch": 1.1200774693350548, + "grad_norm": 0.5261953473091125, + "learning_rate": 0.0002, + "loss": 0.7644, + "step": 3470 + }, + { + "epoch": 1.1233053582956747, + "grad_norm": 0.47759532928466797, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 3480 + }, + { + "epoch": 1.1265332472562943, + "grad_norm": 0.5697659850120544, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 3490 + }, + { + "epoch": 1.1297611362169142, + "grad_norm": 0.5643419623374939, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 3500 + }, + { + "epoch": 1.1329890251775339, + "grad_norm": 0.6502931118011475, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 3510 + }, + { + "epoch": 1.1362169141381537, + "grad_norm": 0.5236507654190063, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 3520 + }, + { + "epoch": 1.1394448030987734, + "grad_norm": 0.6521499156951904, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 3530 + }, + { + "epoch": 1.142672692059393, + "grad_norm": 0.5893217325210571, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 3540 + }, + { + "epoch": 1.145900581020013, + "grad_norm": 0.5300073027610779, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 3550 + }, + { + "epoch": 1.1491284699806328, + "grad_norm": 0.6794660091400146, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 3560 + }, + { + "epoch": 1.1523563589412524, + "grad_norm": 0.5420064926147461, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 3570 + }, + { + "epoch": 1.155584247901872, + "grad_norm": 0.5096590518951416, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 3580 + }, + { + "epoch": 1.158812136862492, + "grad_norm": 0.5726043581962585, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 3590 + }, + { + "epoch": 1.1620400258231116, + "grad_norm": 0.7388110160827637, + "learning_rate": 0.0002, + "loss": 0.7728, + "step": 3600 + }, + { + "epoch": 1.1652679147837315, + "grad_norm": 0.5597969889640808, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 3610 + }, + { + "epoch": 1.1684958037443511, + "grad_norm": 0.5067800283432007, + "learning_rate": 0.0002, + "loss": 0.7132, + "step": 3620 + }, + { + "epoch": 1.171723692704971, + "grad_norm": 0.6625118255615234, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 3630 + }, + { + "epoch": 1.1749515816655907, + "grad_norm": 0.5830849409103394, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 3640 + }, + { + "epoch": 1.1781794706262105, + "grad_norm": 0.6140692830085754, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 3650 + }, + { + "epoch": 1.1814073595868302, + "grad_norm": 0.714523434638977, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 3660 + }, + { + "epoch": 1.18463524854745, + "grad_norm": 0.5196696519851685, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 3670 + }, + { + "epoch": 1.1878631375080697, + "grad_norm": 0.6677889823913574, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 3680 + }, + { + "epoch": 1.1910910264686896, + "grad_norm": 0.47095245122909546, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 3690 + }, + { + "epoch": 1.1943189154293092, + "grad_norm": 0.5197778940200806, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 3700 + }, + { + "epoch": 1.1975468043899289, + "grad_norm": 0.5156530141830444, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 3710 + }, + { + "epoch": 1.2007746933505488, + "grad_norm": 0.6968549489974976, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 3720 + }, + { + "epoch": 1.2040025823111684, + "grad_norm": 0.48983848094940186, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 3730 + }, + { + "epoch": 1.2072304712717883, + "grad_norm": 0.6709973216056824, + "learning_rate": 0.0002, + "loss": 0.7163, + "step": 3740 + }, + { + "epoch": 1.210458360232408, + "grad_norm": 0.48681750893592834, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 3750 + }, + { + "epoch": 1.2136862491930278, + "grad_norm": 0.49475061893463135, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 3760 + }, + { + "epoch": 1.2169141381536475, + "grad_norm": 0.6163983345031738, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 3770 + }, + { + "epoch": 1.2201420271142673, + "grad_norm": 0.5481411218643188, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 3780 + }, + { + "epoch": 1.223369916074887, + "grad_norm": 0.620639979839325, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 3790 + }, + { + "epoch": 1.2265978050355069, + "grad_norm": 0.7017222046852112, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 3800 + }, + { + "epoch": 1.2298256939961265, + "grad_norm": 0.5872400403022766, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 3810 + }, + { + "epoch": 1.2330535829567464, + "grad_norm": 0.45765596628189087, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 3820 + }, + { + "epoch": 1.236281471917366, + "grad_norm": 0.5676377415657043, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 3830 + }, + { + "epoch": 1.2395093608779857, + "grad_norm": 0.4793425500392914, + "learning_rate": 0.0002, + "loss": 0.7696, + "step": 3840 + }, + { + "epoch": 1.2427372498386056, + "grad_norm": 0.5060022473335266, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 3850 + }, + { + "epoch": 1.2459651387992252, + "grad_norm": 0.6140682697296143, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 3860 + }, + { + "epoch": 1.249193027759845, + "grad_norm": 0.5030326843261719, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 3870 + }, + { + "epoch": 1.2524209167204647, + "grad_norm": 0.6609430909156799, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 3880 + }, + { + "epoch": 1.2556488056810846, + "grad_norm": 0.5459545850753784, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 3890 + }, + { + "epoch": 1.2588766946417043, + "grad_norm": 0.5328870415687561, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 3900 + }, + { + "epoch": 1.2621045836023241, + "grad_norm": 0.5840652585029602, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 3910 + }, + { + "epoch": 1.2653324725629438, + "grad_norm": 0.5587584376335144, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 3920 + }, + { + "epoch": 1.2685603615235637, + "grad_norm": 0.5886949896812439, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 3930 + }, + { + "epoch": 1.2717882504841833, + "grad_norm": 0.5128693580627441, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 3940 + }, + { + "epoch": 1.2750161394448032, + "grad_norm": 0.6207669377326965, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 3950 + }, + { + "epoch": 1.2782440284054228, + "grad_norm": 0.5789574384689331, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 3960 + }, + { + "epoch": 1.2814719173660425, + "grad_norm": 0.503162145614624, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 3970 + }, + { + "epoch": 1.2846998063266624, + "grad_norm": 0.6670064926147461, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 3980 + }, + { + "epoch": 1.2879276952872822, + "grad_norm": 0.5676213502883911, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 3990 + }, + { + "epoch": 1.2911555842479019, + "grad_norm": 0.5383169054985046, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 4000 + }, + { + "epoch": 1.2943834732085215, + "grad_norm": 0.714743971824646, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 4010 + }, + { + "epoch": 1.2976113621691414, + "grad_norm": 0.5740262269973755, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 4020 + }, + { + "epoch": 1.300839251129761, + "grad_norm": 0.6143045425415039, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 4030 + }, + { + "epoch": 1.304067140090381, + "grad_norm": 0.501025378704071, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 4040 + }, + { + "epoch": 1.3072950290510006, + "grad_norm": 0.5784100294113159, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 4050 + }, + { + "epoch": 1.3105229180116205, + "grad_norm": 0.6182606220245361, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 4060 + }, + { + "epoch": 1.3137508069722401, + "grad_norm": 0.5072231292724609, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 4070 + }, + { + "epoch": 1.31697869593286, + "grad_norm": 0.6841012835502625, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 4080 + }, + { + "epoch": 1.3202065848934796, + "grad_norm": 0.697257936000824, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 4090 + }, + { + "epoch": 1.3234344738540993, + "grad_norm": 0.5113214254379272, + "learning_rate": 0.0002, + "loss": 0.7401, + "step": 4100 + }, + { + "epoch": 1.3266623628147192, + "grad_norm": 0.6270561814308167, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 4110 + }, + { + "epoch": 1.329890251775339, + "grad_norm": 0.5525947213172913, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 4120 + }, + { + "epoch": 1.3331181407359587, + "grad_norm": 0.546071469783783, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 4130 + }, + { + "epoch": 1.3363460296965783, + "grad_norm": 0.6516721248626709, + "learning_rate": 0.0002, + "loss": 0.7884, + "step": 4140 + }, + { + "epoch": 1.3395739186571982, + "grad_norm": 0.6235111355781555, + "learning_rate": 0.0002, + "loss": 0.755, + "step": 4150 + }, + { + "epoch": 1.3428018076178179, + "grad_norm": 0.538649320602417, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 4160 + }, + { + "epoch": 1.3460296965784377, + "grad_norm": 0.5367001891136169, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 4170 + }, + { + "epoch": 1.3492575855390574, + "grad_norm": 0.6134631037712097, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 4180 + }, + { + "epoch": 1.3524854744996773, + "grad_norm": 0.5827262997627258, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 4190 + }, + { + "epoch": 1.355713363460297, + "grad_norm": 0.5706096291542053, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 4200 + }, + { + "epoch": 1.3589412524209168, + "grad_norm": 0.6422057151794434, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 4210 + }, + { + "epoch": 1.3621691413815364, + "grad_norm": 0.6316141486167908, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 4220 + }, + { + "epoch": 1.365397030342156, + "grad_norm": 0.6946983933448792, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 4230 + }, + { + "epoch": 1.368624919302776, + "grad_norm": 0.5381525754928589, + "learning_rate": 0.0002, + "loss": 0.7388, + "step": 4240 + }, + { + "epoch": 1.3718528082633958, + "grad_norm": 0.5484845638275146, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 4250 + }, + { + "epoch": 1.3750806972240155, + "grad_norm": 0.5961896777153015, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 4260 + }, + { + "epoch": 1.3783085861846351, + "grad_norm": 0.6041752696037292, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 4270 + }, + { + "epoch": 1.381536475145255, + "grad_norm": 0.6283464431762695, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 4280 + }, + { + "epoch": 1.384764364105875, + "grad_norm": 0.6761324405670166, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 4290 + }, + { + "epoch": 1.3879922530664945, + "grad_norm": 0.504311203956604, + "learning_rate": 0.0002, + "loss": 0.7381, + "step": 4300 + }, + { + "epoch": 1.3912201420271142, + "grad_norm": 0.6100395917892456, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 4310 + }, + { + "epoch": 1.394448030987734, + "grad_norm": 0.6245788335800171, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 4320 + }, + { + "epoch": 1.3976759199483537, + "grad_norm": 0.6074621081352234, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 4330 + }, + { + "epoch": 1.4009038089089736, + "grad_norm": 0.6683838963508606, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 4340 + }, + { + "epoch": 1.4041316978695932, + "grad_norm": 0.622998058795929, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 4350 + }, + { + "epoch": 1.4073595868302131, + "grad_norm": 0.6089423894882202, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 4360 + }, + { + "epoch": 1.4105874757908328, + "grad_norm": 0.6381658911705017, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 4370 + }, + { + "epoch": 1.4138153647514526, + "grad_norm": 0.5419308543205261, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4380 + }, + { + "epoch": 1.4170432537120723, + "grad_norm": 0.6026232242584229, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 4390 + }, + { + "epoch": 1.420271142672692, + "grad_norm": 0.4911101162433624, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 4400 + }, + { + "epoch": 1.4234990316333118, + "grad_norm": 0.6302908062934875, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 4410 + }, + { + "epoch": 1.4267269205939317, + "grad_norm": 0.6692768931388855, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 4420 + }, + { + "epoch": 1.4299548095545513, + "grad_norm": 0.46294572949409485, + "learning_rate": 0.0002, + "loss": 0.7312, + "step": 4430 + }, + { + "epoch": 1.433182698515171, + "grad_norm": 0.5452619194984436, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 4440 + }, + { + "epoch": 1.4364105874757909, + "grad_norm": 0.7809233069419861, + "learning_rate": 0.0002, + "loss": 0.7974, + "step": 4450 + }, + { + "epoch": 1.4396384764364105, + "grad_norm": 0.550088107585907, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 4460 + }, + { + "epoch": 1.4428663653970304, + "grad_norm": 0.7139151096343994, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 4470 + }, + { + "epoch": 1.44609425435765, + "grad_norm": 0.6187090873718262, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 4480 + }, + { + "epoch": 1.44932214331827, + "grad_norm": 0.5948249101638794, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 4490 + }, + { + "epoch": 1.4525500322788896, + "grad_norm": 0.6510892510414124, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 4500 + }, + { + "epoch": 1.4557779212395094, + "grad_norm": 0.6552293300628662, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 4510 + }, + { + "epoch": 1.459005810200129, + "grad_norm": 0.585574209690094, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 4520 + }, + { + "epoch": 1.4622336991607487, + "grad_norm": 0.4830162823200226, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 4530 + }, + { + "epoch": 1.4654615881213686, + "grad_norm": 0.5780223608016968, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 4540 + }, + { + "epoch": 1.4686894770819885, + "grad_norm": 0.5462607145309448, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 4550 + }, + { + "epoch": 1.4719173660426081, + "grad_norm": 0.5183546543121338, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 4560 + }, + { + "epoch": 1.4751452550032278, + "grad_norm": 0.676917552947998, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 4570 + }, + { + "epoch": 1.4783731439638477, + "grad_norm": 0.5772345066070557, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 4580 + }, + { + "epoch": 1.4816010329244673, + "grad_norm": 0.7320035696029663, + "learning_rate": 0.0002, + "loss": 0.7709, + "step": 4590 + }, + { + "epoch": 1.4848289218850872, + "grad_norm": 0.5024042129516602, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 4600 + }, + { + "epoch": 1.4880568108457068, + "grad_norm": 0.5482868552207947, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 4610 + }, + { + "epoch": 1.4912846998063267, + "grad_norm": 0.5447399616241455, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 4620 + }, + { + "epoch": 1.4945125887669464, + "grad_norm": 0.5953414440155029, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 4630 + }, + { + "epoch": 1.4977404777275662, + "grad_norm": 0.6983066201210022, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 4640 + }, + { + "epoch": 1.500968366688186, + "grad_norm": 0.586327075958252, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 4650 + }, + { + "epoch": 1.5041962556488055, + "grad_norm": 0.5839682221412659, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 4660 + }, + { + "epoch": 1.5074241446094254, + "grad_norm": 0.5959209203720093, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 4670 + }, + { + "epoch": 1.5106520335700453, + "grad_norm": 0.5073857307434082, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 4680 + }, + { + "epoch": 1.513879922530665, + "grad_norm": 0.5183001160621643, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 4690 + }, + { + "epoch": 1.5171078114912846, + "grad_norm": 0.593530535697937, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 4700 + }, + { + "epoch": 1.5203357004519045, + "grad_norm": 0.675993025302887, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 4710 + }, + { + "epoch": 1.5235635894125243, + "grad_norm": 0.5823286771774292, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 4720 + }, + { + "epoch": 1.526791478373144, + "grad_norm": 0.5825035572052002, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 4730 + }, + { + "epoch": 1.5300193673337636, + "grad_norm": 0.5689691305160522, + "learning_rate": 0.0002, + "loss": 0.8287, + "step": 4740 + }, + { + "epoch": 1.5332472562943835, + "grad_norm": 0.6037150621414185, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 4750 + }, + { + "epoch": 1.5364751452550034, + "grad_norm": 0.6393677592277527, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 4760 + }, + { + "epoch": 1.539703034215623, + "grad_norm": 0.5926381945610046, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 4770 + }, + { + "epoch": 1.5429309231762427, + "grad_norm": 0.9468599557876587, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 4780 + }, + { + "epoch": 1.5461588121368623, + "grad_norm": 0.7544237375259399, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 4790 + }, + { + "epoch": 1.5493867010974822, + "grad_norm": 0.5308566093444824, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 4800 + }, + { + "epoch": 1.552614590058102, + "grad_norm": 0.6590296030044556, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 4810 + }, + { + "epoch": 1.5558424790187217, + "grad_norm": 0.5630404353141785, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 4820 + }, + { + "epoch": 1.5590703679793414, + "grad_norm": 0.6800200939178467, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 4830 + }, + { + "epoch": 1.5622982569399613, + "grad_norm": 0.5463718175888062, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 4840 + }, + { + "epoch": 1.5655261459005811, + "grad_norm": 0.505135178565979, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 4850 + }, + { + "epoch": 1.5687540348612008, + "grad_norm": 0.5469676852226257, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 4860 + }, + { + "epoch": 1.5719819238218204, + "grad_norm": 0.5318337678909302, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 4870 + }, + { + "epoch": 1.5752098127824403, + "grad_norm": 0.7287914752960205, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 4880 + }, + { + "epoch": 1.5784377017430602, + "grad_norm": 0.7318989038467407, + "learning_rate": 0.0002, + "loss": 0.7532, + "step": 4890 + }, + { + "epoch": 1.5816655907036798, + "grad_norm": 0.6499921679496765, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 4900 + }, + { + "epoch": 1.5848934796642995, + "grad_norm": 0.47907355427742004, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 4910 + }, + { + "epoch": 1.5881213686249191, + "grad_norm": 0.7338833808898926, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 4920 + }, + { + "epoch": 1.591349257585539, + "grad_norm": 0.5800719261169434, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 4930 + }, + { + "epoch": 1.594577146546159, + "grad_norm": 0.5365763306617737, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 4940 + }, + { + "epoch": 1.5978050355067785, + "grad_norm": 0.5800772309303284, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 4950 + }, + { + "epoch": 1.6010329244673982, + "grad_norm": 0.7878010869026184, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 4960 + }, + { + "epoch": 1.604260813428018, + "grad_norm": 0.5919058918952942, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 4970 + }, + { + "epoch": 1.607488702388638, + "grad_norm": 0.5004435181617737, + "learning_rate": 0.0002, + "loss": 0.7762, + "step": 4980 + }, + { + "epoch": 1.6107165913492576, + "grad_norm": 0.6299242377281189, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 4990 + }, + { + "epoch": 1.6139444803098772, + "grad_norm": 0.6307242512702942, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 5000 + }, + { + "epoch": 1.6171723692704971, + "grad_norm": 0.7838703989982605, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 5010 + }, + { + "epoch": 1.620400258231117, + "grad_norm": 0.6454671621322632, + "learning_rate": 0.0002, + "loss": 0.7364, + "step": 5020 + }, + { + "epoch": 1.6236281471917366, + "grad_norm": 0.5907095670700073, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 5030 + }, + { + "epoch": 1.6268560361523563, + "grad_norm": 0.6053501963615417, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 5040 + }, + { + "epoch": 1.630083925112976, + "grad_norm": 0.5644670128822327, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 5050 + }, + { + "epoch": 1.6333118140735958, + "grad_norm": 0.6320949792861938, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 5060 + }, + { + "epoch": 1.6365397030342157, + "grad_norm": 0.6101489067077637, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 5070 + }, + { + "epoch": 1.6397675919948353, + "grad_norm": 0.9435283541679382, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 5080 + }, + { + "epoch": 1.642995480955455, + "grad_norm": 0.6668919324874878, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 5090 + }, + { + "epoch": 1.6462233699160749, + "grad_norm": 0.6160340905189514, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 5100 + }, + { + "epoch": 1.6494512588766947, + "grad_norm": 0.5999835729598999, + "learning_rate": 0.0002, + "loss": 0.7461, + "step": 5110 + }, + { + "epoch": 1.6526791478373144, + "grad_norm": 0.9378551840782166, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 5120 + }, + { + "epoch": 1.655907036797934, + "grad_norm": 0.4795055389404297, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 5130 + }, + { + "epoch": 1.659134925758554, + "grad_norm": 0.4878861606121063, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 5140 + }, + { + "epoch": 1.6623628147191738, + "grad_norm": 0.6042965054512024, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 5150 + }, + { + "epoch": 1.6655907036797934, + "grad_norm": 0.5829901695251465, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 5160 + }, + { + "epoch": 1.668818592640413, + "grad_norm": 0.5168480277061462, + "learning_rate": 0.0002, + "loss": 0.7498, + "step": 5170 + }, + { + "epoch": 1.672046481601033, + "grad_norm": 0.6489511132240295, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 5180 + }, + { + "epoch": 1.6752743705616526, + "grad_norm": 0.5955966114997864, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 5190 + }, + { + "epoch": 1.6785022595222725, + "grad_norm": 0.6228088140487671, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 5200 + }, + { + "epoch": 1.6817301484828922, + "grad_norm": 0.5726390480995178, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 5210 + }, + { + "epoch": 1.6849580374435118, + "grad_norm": 0.6116343140602112, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 5220 + }, + { + "epoch": 1.6881859264041317, + "grad_norm": 0.5483687520027161, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 5230 + }, + { + "epoch": 1.6914138153647515, + "grad_norm": 0.570941686630249, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 5240 + }, + { + "epoch": 1.6946417043253712, + "grad_norm": 0.6048086285591125, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 5250 + }, + { + "epoch": 1.6978695932859909, + "grad_norm": 0.6769003868103027, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 5260 + }, + { + "epoch": 1.7010974822466107, + "grad_norm": 0.5629057884216309, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 5270 + }, + { + "epoch": 1.7043253712072306, + "grad_norm": 0.657341480255127, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 5280 + }, + { + "epoch": 1.7075532601678503, + "grad_norm": 0.6256147623062134, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 5290 + }, + { + "epoch": 1.71078114912847, + "grad_norm": 0.5498088002204895, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 5300 + }, + { + "epoch": 1.7140090380890898, + "grad_norm": 0.5078358054161072, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 5310 + }, + { + "epoch": 1.7172369270497096, + "grad_norm": 0.6696692705154419, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 5320 + }, + { + "epoch": 1.7204648160103293, + "grad_norm": 0.6692847013473511, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 5330 + }, + { + "epoch": 1.723692704970949, + "grad_norm": 0.5415751934051514, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 5340 + }, + { + "epoch": 1.7269205939315686, + "grad_norm": 0.5367611050605774, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 5350 + }, + { + "epoch": 1.7301484828921885, + "grad_norm": 0.7321061491966248, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 5360 + }, + { + "epoch": 1.7333763718528084, + "grad_norm": 0.723972499370575, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5370 + }, + { + "epoch": 1.736604260813428, + "grad_norm": 0.7328100204467773, + "learning_rate": 0.0002, + "loss": 0.7077, + "step": 5380 + }, + { + "epoch": 1.7398321497740477, + "grad_norm": 0.5785264372825623, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 5390 + }, + { + "epoch": 1.7430600387346675, + "grad_norm": 0.7812932133674622, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 5400 + }, + { + "epoch": 1.7462879276952874, + "grad_norm": 0.6493327617645264, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 5410 + }, + { + "epoch": 1.749515816655907, + "grad_norm": 0.5825939774513245, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 5420 + }, + { + "epoch": 1.7527437056165267, + "grad_norm": 0.6969610452651978, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 5430 + }, + { + "epoch": 1.7559715945771466, + "grad_norm": 0.5558062195777893, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 5440 + }, + { + "epoch": 1.7591994835377665, + "grad_norm": 0.49222221970558167, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 5450 + }, + { + "epoch": 1.762427372498386, + "grad_norm": 0.5844656825065613, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 5460 + }, + { + "epoch": 1.7656552614590058, + "grad_norm": 0.8706597685813904, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 5470 + }, + { + "epoch": 1.7688831504196254, + "grad_norm": 0.6167706251144409, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 5480 + }, + { + "epoch": 1.7721110393802453, + "grad_norm": 0.5890011787414551, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 5490 + }, + { + "epoch": 1.7753389283408652, + "grad_norm": 0.6551728248596191, + "learning_rate": 0.0002, + "loss": 0.8319, + "step": 5500 + }, + { + "epoch": 1.7785668173014848, + "grad_norm": 0.5848751068115234, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 5510 + }, + { + "epoch": 1.7817947062621045, + "grad_norm": 0.6664014458656311, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 5520 + }, + { + "epoch": 1.7850225952227243, + "grad_norm": 0.5931693911552429, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 5530 + }, + { + "epoch": 1.7882504841833442, + "grad_norm": 0.5534724593162537, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 5540 + }, + { + "epoch": 1.7914783731439639, + "grad_norm": 0.5590878129005432, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 5550 + }, + { + "epoch": 1.7947062621045835, + "grad_norm": 0.6947470903396606, + "learning_rate": 0.0002, + "loss": 0.7406, + "step": 5560 + }, + { + "epoch": 1.7979341510652034, + "grad_norm": 0.6104130148887634, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 5570 + }, + { + "epoch": 1.8011620400258233, + "grad_norm": 0.6135714054107666, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 5580 + }, + { + "epoch": 1.804389928986443, + "grad_norm": 0.6626853346824646, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 5590 + }, + { + "epoch": 1.8076178179470626, + "grad_norm": 0.6977612972259521, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 5600 + }, + { + "epoch": 1.8108457069076824, + "grad_norm": 0.6275238394737244, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 5610 + }, + { + "epoch": 1.814073595868302, + "grad_norm": 0.5017505288124084, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 5620 + }, + { + "epoch": 1.817301484828922, + "grad_norm": 0.8314290642738342, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 5630 + }, + { + "epoch": 1.8205293737895416, + "grad_norm": 0.6863582134246826, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 5640 + }, + { + "epoch": 1.8237572627501613, + "grad_norm": 0.69544917345047, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 5650 + }, + { + "epoch": 1.8269851517107811, + "grad_norm": 0.515499472618103, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 5660 + }, + { + "epoch": 1.830213040671401, + "grad_norm": 0.6100873947143555, + "learning_rate": 0.0002, + "loss": 0.7166, + "step": 5670 + }, + { + "epoch": 1.8334409296320207, + "grad_norm": 0.67416912317276, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 5680 + }, + { + "epoch": 1.8366688185926403, + "grad_norm": 0.7057772278785706, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 5690 + }, + { + "epoch": 1.8398967075532602, + "grad_norm": 0.7374551892280579, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 5700 + }, + { + "epoch": 1.84312459651388, + "grad_norm": 0.6266297101974487, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 5710 + }, + { + "epoch": 1.8463524854744997, + "grad_norm": 0.5629227757453918, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 5720 + }, + { + "epoch": 1.8495803744351194, + "grad_norm": 0.6603655815124512, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 5730 + }, + { + "epoch": 1.8528082633957392, + "grad_norm": 0.8113715052604675, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 5740 + }, + { + "epoch": 1.856036152356359, + "grad_norm": 0.7143914103507996, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 5750 + }, + { + "epoch": 1.8592640413169788, + "grad_norm": 0.6273732781410217, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 5760 + }, + { + "epoch": 1.8624919302775984, + "grad_norm": 0.5428690910339355, + "learning_rate": 0.0002, + "loss": 0.7962, + "step": 5770 + }, + { + "epoch": 1.865719819238218, + "grad_norm": 0.6405037641525269, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 5780 + }, + { + "epoch": 1.868947708198838, + "grad_norm": 0.700873613357544, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 5790 + }, + { + "epoch": 1.8721755971594578, + "grad_norm": 0.5645238161087036, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 5800 + }, + { + "epoch": 1.8754034861200775, + "grad_norm": 0.8780353665351868, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 5810 + }, + { + "epoch": 1.878631375080697, + "grad_norm": 0.6295409798622131, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 5820 + }, + { + "epoch": 1.881859264041317, + "grad_norm": 0.678269624710083, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 5830 + }, + { + "epoch": 1.8850871530019369, + "grad_norm": 0.6464608907699585, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5840 + }, + { + "epoch": 1.8883150419625565, + "grad_norm": 0.6201048493385315, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 5850 + }, + { + "epoch": 1.8915429309231762, + "grad_norm": 0.6046274304389954, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 5860 + }, + { + "epoch": 1.894770819883796, + "grad_norm": 0.7532408833503723, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 5870 + }, + { + "epoch": 1.897998708844416, + "grad_norm": 0.6066767573356628, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 5880 + }, + { + "epoch": 1.9012265978050356, + "grad_norm": 0.6289830207824707, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 5890 + }, + { + "epoch": 1.9044544867656552, + "grad_norm": 0.5204319953918457, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 5900 + }, + { + "epoch": 1.9076823757262749, + "grad_norm": 0.6708219647407532, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 5910 + }, + { + "epoch": 1.9109102646868947, + "grad_norm": 0.4915677309036255, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 5920 + }, + { + "epoch": 1.9141381536475146, + "grad_norm": 0.652717113494873, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 5930 + }, + { + "epoch": 1.9173660426081343, + "grad_norm": 0.5446316003799438, + "learning_rate": 0.0002, + "loss": 0.7687, + "step": 5940 + }, + { + "epoch": 1.920593931568754, + "grad_norm": 0.4958149194717407, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 5950 + }, + { + "epoch": 1.9238218205293738, + "grad_norm": 0.5623434782028198, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 5960 + }, + { + "epoch": 1.9270497094899937, + "grad_norm": 0.6855450868606567, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 5970 + }, + { + "epoch": 1.9302775984506133, + "grad_norm": 0.5710492730140686, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 5980 + }, + { + "epoch": 1.933505487411233, + "grad_norm": 0.5379431843757629, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 5990 + }, + { + "epoch": 1.9367333763718528, + "grad_norm": 0.557129442691803, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 6000 + }, + { + "epoch": 1.9399612653324727, + "grad_norm": 0.6336663961410522, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 6010 + }, + { + "epoch": 1.9431891542930924, + "grad_norm": 0.5950582027435303, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 6020 + }, + { + "epoch": 1.946417043253712, + "grad_norm": 0.5905954837799072, + "learning_rate": 0.0002, + "loss": 0.7443, + "step": 6030 + }, + { + "epoch": 1.9496449322143317, + "grad_norm": 0.6688982844352722, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 6040 + }, + { + "epoch": 1.9528728211749515, + "grad_norm": 0.5440775752067566, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 6050 + }, + { + "epoch": 1.9561007101355714, + "grad_norm": 0.6207906603813171, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 6060 + }, + { + "epoch": 1.959328599096191, + "grad_norm": 0.6999374628067017, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 6070 + }, + { + "epoch": 1.9625564880568107, + "grad_norm": 0.6310848593711853, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 6080 + }, + { + "epoch": 1.9657843770174306, + "grad_norm": 0.5903388261795044, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 6090 + }, + { + "epoch": 1.9690122659780505, + "grad_norm": 0.6333889961242676, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 6100 + }, + { + "epoch": 1.97224015493867, + "grad_norm": 0.5604711174964905, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 6110 + }, + { + "epoch": 1.9754680438992898, + "grad_norm": 0.9234541654586792, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 6120 + }, + { + "epoch": 1.9786959328599096, + "grad_norm": 0.6149102449417114, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 6130 + }, + { + "epoch": 1.9819238218205295, + "grad_norm": 0.615446150302887, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 6140 + }, + { + "epoch": 1.9851517107811492, + "grad_norm": 0.5176635980606079, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 6150 + }, + { + "epoch": 1.9883795997417688, + "grad_norm": 0.7124109864234924, + "learning_rate": 0.0002, + "loss": 0.718, + "step": 6160 + }, + { + "epoch": 1.9916074887023887, + "grad_norm": 0.6317567825317383, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 6170 + }, + { + "epoch": 1.9948353776630086, + "grad_norm": 0.6855016350746155, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 6180 + }, + { + "epoch": 1.9980632666236282, + "grad_norm": 0.6423715353012085, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 6190 + }, + { + "epoch": 2.0, + "eval_loss": 1.1096643209457397, + "eval_runtime": 147.7997, + "eval_samples_per_second": 4.959, + "eval_steps_per_second": 0.622, + "step": 6196 + } + ], + "logging_steps": 10, + "max_steps": 24784, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.8673700659409715e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f46f2b8e8752b125339f36f172c3878be4cdb152 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-6196/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfc2a69e44a51edf5586ebed4b7ee915a23244c18c1f59e580471e4c9becfa98 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5243c193c058d38ada47da6117744c7d7b9d006a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69a8057394e6454d1eb794c68150a01bbf1c565c43dc28eb154388db4598d78c +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2acfd466ae61ea13490cea6b540e058405d6dd4 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c91d6ead55f732c4cafdb6735f223888b90bc416647053914661b92e693da46e +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4961b6b204e2310c1d046e4fc59efc010d5a9ede --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7b5fb77371ede66bbac88e239d0c4c4b7a191c33aa02aa81170f551f5cc26c5 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..51bb1ba34d7e5fe81899b3b3018c256339202904 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ee815efbbf88afd92d459eb09349892fafd01b7ead30c5a15ff54834da1a6c6 +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..853e8dc7371c362ca2fcc104e0c45059658f809b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/trainer_state.json @@ -0,0 +1,6560 @@ +{ + "best_metric": 1.0958120822906494, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098", + "epoch": 3.0, + "eval_steps": 10, + "global_step": 9294, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032278889606197547, + "grad_norm": 0.7092075347900391, + "learning_rate": 0.0002, + "loss": 1.593, + "step": 10 + }, + { + "epoch": 0.006455777921239509, + "grad_norm": 0.6900479793548584, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 20 + }, + { + "epoch": 0.009683666881859263, + "grad_norm": 0.6788288950920105, + "learning_rate": 0.0002, + "loss": 0.9807, + "step": 30 + }, + { + "epoch": 0.012911555842479019, + "grad_norm": 0.5590243339538574, + "learning_rate": 0.0002, + "loss": 0.9385, + "step": 40 + }, + { + "epoch": 0.016139444803098774, + "grad_norm": 0.5136010646820068, + "learning_rate": 0.0002, + "loss": 0.931, + "step": 50 + }, + { + "epoch": 0.019367333763718526, + "grad_norm": 0.45298320055007935, + "learning_rate": 0.0002, + "loss": 0.8896, + "step": 60 + }, + { + "epoch": 0.022595222724338282, + "grad_norm": 0.5917162299156189, + "learning_rate": 0.0002, + "loss": 0.9184, + "step": 70 + }, + { + "epoch": 0.025823111684958037, + "grad_norm": 0.4414856433868408, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 80 + }, + { + "epoch": 0.029051000645577793, + "grad_norm": 0.5547978281974792, + "learning_rate": 0.0002, + "loss": 0.8419, + "step": 90 + }, + { + "epoch": 0.03227888960619755, + "grad_norm": 0.5271288156509399, + "learning_rate": 0.0002, + "loss": 0.8987, + "step": 100 + }, + { + "epoch": 0.035506778566817304, + "grad_norm": 0.5506119728088379, + "learning_rate": 0.0002, + "loss": 0.8543, + "step": 110 + }, + { + "epoch": 0.03873466752743705, + "grad_norm": 0.5579327940940857, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 120 + }, + { + "epoch": 0.04196255648805681, + "grad_norm": 0.5099632740020752, + "learning_rate": 0.0002, + "loss": 0.8826, + "step": 130 + }, + { + "epoch": 0.045190445448676564, + "grad_norm": 0.40396833419799805, + "learning_rate": 0.0002, + "loss": 0.9239, + "step": 140 + }, + { + "epoch": 0.04841833440929632, + "grad_norm": 0.5008092522621155, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 150 + }, + { + "epoch": 0.051646223369916075, + "grad_norm": 0.4388776421546936, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 160 + }, + { + "epoch": 0.05487411233053583, + "grad_norm": 0.44138944149017334, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 170 + }, + { + "epoch": 0.058102001291155586, + "grad_norm": 0.358484148979187, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 180 + }, + { + "epoch": 0.06132989025177534, + "grad_norm": 0.457052081823349, + "learning_rate": 0.0002, + "loss": 0.8956, + "step": 190 + }, + { + "epoch": 0.0645577792123951, + "grad_norm": 0.5537622570991516, + "learning_rate": 0.0002, + "loss": 0.9138, + "step": 200 + }, + { + "epoch": 0.06778566817301485, + "grad_norm": 0.552631676197052, + "learning_rate": 0.0002, + "loss": 0.8701, + "step": 210 + }, + { + "epoch": 0.07101355713363461, + "grad_norm": 0.4414575397968292, + "learning_rate": 0.0002, + "loss": 0.8854, + "step": 220 + }, + { + "epoch": 0.07424144609425436, + "grad_norm": 0.4996664226055145, + "learning_rate": 0.0002, + "loss": 0.8581, + "step": 230 + }, + { + "epoch": 0.0774693350548741, + "grad_norm": 0.7321897149085999, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 240 + }, + { + "epoch": 0.08069722401549387, + "grad_norm": 0.4553901255130768, + "learning_rate": 0.0002, + "loss": 0.8848, + "step": 250 + }, + { + "epoch": 0.08392511297611362, + "grad_norm": 0.5039054751396179, + "learning_rate": 0.0002, + "loss": 0.868, + "step": 260 + }, + { + "epoch": 0.08715300193673338, + "grad_norm": 0.4113094210624695, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 270 + }, + { + "epoch": 0.09038089089735313, + "grad_norm": 0.450436532497406, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 280 + }, + { + "epoch": 0.09360877985797289, + "grad_norm": 0.4548024535179138, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 290 + }, + { + "epoch": 0.09683666881859264, + "grad_norm": 0.4932962656021118, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 300 + }, + { + "epoch": 0.1000645577792124, + "grad_norm": 0.4005250334739685, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 310 + }, + { + "epoch": 0.10329244673983215, + "grad_norm": 1.8321624994277954, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 320 + }, + { + "epoch": 0.1065203357004519, + "grad_norm": 0.45815610885620117, + "learning_rate": 0.0002, + "loss": 0.8411, + "step": 330 + }, + { + "epoch": 0.10974822466107166, + "grad_norm": 0.39324095845222473, + "learning_rate": 0.0002, + "loss": 0.857, + "step": 340 + }, + { + "epoch": 0.11297611362169141, + "grad_norm": 0.546273946762085, + "learning_rate": 0.0002, + "loss": 0.8258, + "step": 350 + }, + { + "epoch": 0.11620400258231117, + "grad_norm": 0.497448593378067, + "learning_rate": 0.0002, + "loss": 0.882, + "step": 360 + }, + { + "epoch": 0.11943189154293092, + "grad_norm": 0.37508800625801086, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 370 + }, + { + "epoch": 0.12265978050355068, + "grad_norm": 0.45849609375, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 380 + }, + { + "epoch": 0.12588766946417043, + "grad_norm": 0.5488408803939819, + "learning_rate": 0.0002, + "loss": 0.8437, + "step": 390 + }, + { + "epoch": 0.1291155584247902, + "grad_norm": 0.4477061331272125, + "learning_rate": 0.0002, + "loss": 0.8349, + "step": 400 + }, + { + "epoch": 0.13234344738540993, + "grad_norm": 0.39227980375289917, + "learning_rate": 0.0002, + "loss": 0.8306, + "step": 410 + }, + { + "epoch": 0.1355713363460297, + "grad_norm": 0.3922233581542969, + "learning_rate": 0.0002, + "loss": 0.7933, + "step": 420 + }, + { + "epoch": 0.13879922530664945, + "grad_norm": 0.42901909351348877, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 430 + }, + { + "epoch": 0.14202711426726922, + "grad_norm": 0.4217798709869385, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 440 + }, + { + "epoch": 0.14525500322788895, + "grad_norm": 0.43470677733421326, + "learning_rate": 0.0002, + "loss": 0.8594, + "step": 450 + }, + { + "epoch": 0.1484828921885087, + "grad_norm": 0.5324403047561646, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 460 + }, + { + "epoch": 0.15171078114912848, + "grad_norm": 0.3999756872653961, + "learning_rate": 0.0002, + "loss": 0.8729, + "step": 470 + }, + { + "epoch": 0.1549386701097482, + "grad_norm": 0.404933363199234, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 480 + }, + { + "epoch": 0.15816655907036797, + "grad_norm": 0.44122636318206787, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 490 + }, + { + "epoch": 0.16139444803098774, + "grad_norm": 0.510166347026825, + "learning_rate": 0.0002, + "loss": 0.8457, + "step": 500 + }, + { + "epoch": 0.1646223369916075, + "grad_norm": 0.4549732506275177, + "learning_rate": 0.0002, + "loss": 0.8692, + "step": 510 + }, + { + "epoch": 0.16785022595222723, + "grad_norm": 0.5148182511329651, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 520 + }, + { + "epoch": 0.171078114912847, + "grad_norm": 0.3596806824207306, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 530 + }, + { + "epoch": 0.17430600387346676, + "grad_norm": 0.4388909339904785, + "learning_rate": 0.0002, + "loss": 0.844, + "step": 540 + }, + { + "epoch": 0.17753389283408652, + "grad_norm": 0.5052742958068848, + "learning_rate": 0.0002, + "loss": 0.8322, + "step": 550 + }, + { + "epoch": 0.18076178179470626, + "grad_norm": 0.48248958587646484, + "learning_rate": 0.0002, + "loss": 0.791, + "step": 560 + }, + { + "epoch": 0.18398967075532602, + "grad_norm": 0.5360197424888611, + "learning_rate": 0.0002, + "loss": 0.8593, + "step": 570 + }, + { + "epoch": 0.18721755971594578, + "grad_norm": 0.43999341130256653, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 580 + }, + { + "epoch": 0.19044544867656552, + "grad_norm": 0.3685208261013031, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 590 + }, + { + "epoch": 0.19367333763718528, + "grad_norm": 0.4601275622844696, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 600 + }, + { + "epoch": 0.19690122659780504, + "grad_norm": 0.4778369665145874, + "learning_rate": 0.0002, + "loss": 0.8483, + "step": 610 + }, + { + "epoch": 0.2001291155584248, + "grad_norm": 0.4867003560066223, + "learning_rate": 0.0002, + "loss": 0.8653, + "step": 620 + }, + { + "epoch": 0.20335700451904454, + "grad_norm": 0.4583742916584015, + "learning_rate": 0.0002, + "loss": 0.8554, + "step": 630 + }, + { + "epoch": 0.2065848934796643, + "grad_norm": 0.47958165407180786, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 640 + }, + { + "epoch": 0.20981278244028406, + "grad_norm": 0.4526064097881317, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 650 + }, + { + "epoch": 0.2130406714009038, + "grad_norm": 0.45890581607818604, + "learning_rate": 0.0002, + "loss": 0.8313, + "step": 660 + }, + { + "epoch": 0.21626856036152356, + "grad_norm": 0.42725905776023865, + "learning_rate": 0.0002, + "loss": 0.8143, + "step": 670 + }, + { + "epoch": 0.21949644932214332, + "grad_norm": 0.40380963683128357, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 680 + }, + { + "epoch": 0.22272433828276308, + "grad_norm": 0.4372998774051666, + "learning_rate": 0.0002, + "loss": 0.9004, + "step": 690 + }, + { + "epoch": 0.22595222724338282, + "grad_norm": 0.4245864450931549, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 700 + }, + { + "epoch": 0.22918011620400258, + "grad_norm": 0.4061129689216614, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 710 + }, + { + "epoch": 0.23240800516462234, + "grad_norm": 0.474454790353775, + "learning_rate": 0.0002, + "loss": 0.8275, + "step": 720 + }, + { + "epoch": 0.23563589412524208, + "grad_norm": 0.4908486008644104, + "learning_rate": 0.0002, + "loss": 0.8346, + "step": 730 + }, + { + "epoch": 0.23886378308586184, + "grad_norm": 0.4284191429615021, + "learning_rate": 0.0002, + "loss": 0.8755, + "step": 740 + }, + { + "epoch": 0.2420916720464816, + "grad_norm": 0.44730308651924133, + "learning_rate": 0.0002, + "loss": 0.8387, + "step": 750 + }, + { + "epoch": 0.24531956100710137, + "grad_norm": 0.4433246850967407, + "learning_rate": 0.0002, + "loss": 0.8135, + "step": 760 + }, + { + "epoch": 0.2485474499677211, + "grad_norm": 0.43668854236602783, + "learning_rate": 0.0002, + "loss": 0.8644, + "step": 770 + }, + { + "epoch": 0.25177533892834086, + "grad_norm": 0.34324130415916443, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 780 + }, + { + "epoch": 0.2550032278889606, + "grad_norm": 0.46476295590400696, + "learning_rate": 0.0002, + "loss": 0.8725, + "step": 790 + }, + { + "epoch": 0.2582311168495804, + "grad_norm": 0.5047039985656738, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 800 + }, + { + "epoch": 0.26145900581020015, + "grad_norm": 0.4402127265930176, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 810 + }, + { + "epoch": 0.26468689477081986, + "grad_norm": 0.4642465114593506, + "learning_rate": 0.0002, + "loss": 0.8025, + "step": 820 + }, + { + "epoch": 0.2679147837314396, + "grad_norm": 0.40093424916267395, + "learning_rate": 0.0002, + "loss": 0.8836, + "step": 830 + }, + { + "epoch": 0.2711426726920594, + "grad_norm": 0.42501842975616455, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 840 + }, + { + "epoch": 0.27437056165267915, + "grad_norm": 0.43279722332954407, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 850 + }, + { + "epoch": 0.2775984506132989, + "grad_norm": 0.5991243720054626, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 860 + }, + { + "epoch": 0.28082633957391867, + "grad_norm": 0.4217848777770996, + "learning_rate": 0.0002, + "loss": 0.7981, + "step": 870 + }, + { + "epoch": 0.28405422853453843, + "grad_norm": 0.3933536410331726, + "learning_rate": 0.0002, + "loss": 0.8135, + "step": 880 + }, + { + "epoch": 0.28728211749515814, + "grad_norm": 0.5868505239486694, + "learning_rate": 0.0002, + "loss": 0.8846, + "step": 890 + }, + { + "epoch": 0.2905100064557779, + "grad_norm": 0.5209547877311707, + "learning_rate": 0.0002, + "loss": 0.8759, + "step": 900 + }, + { + "epoch": 0.29373789541639767, + "grad_norm": 0.49307361245155334, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 910 + }, + { + "epoch": 0.2969657843770174, + "grad_norm": 0.4288382828235626, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 920 + }, + { + "epoch": 0.3001936733376372, + "grad_norm": 0.33568474650382996, + "learning_rate": 0.0002, + "loss": 0.8431, + "step": 930 + }, + { + "epoch": 0.30342156229825695, + "grad_norm": 1.0915930271148682, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 940 + }, + { + "epoch": 0.3066494512588767, + "grad_norm": 0.5489798188209534, + "learning_rate": 0.0002, + "loss": 0.8535, + "step": 950 + }, + { + "epoch": 0.3098773402194964, + "grad_norm": 0.42971742153167725, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 960 + }, + { + "epoch": 0.3131052291801162, + "grad_norm": 0.43375834822654724, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 970 + }, + { + "epoch": 0.31633311814073595, + "grad_norm": 0.47488611936569214, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 980 + }, + { + "epoch": 0.3195610071013557, + "grad_norm": 0.46296775341033936, + "learning_rate": 0.0002, + "loss": 0.7906, + "step": 990 + }, + { + "epoch": 0.32278889606197547, + "grad_norm": 0.4548890292644501, + "learning_rate": 0.0002, + "loss": 0.7948, + "step": 1000 + }, + { + "epoch": 0.32601678502259523, + "grad_norm": 0.41834497451782227, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 1010 + }, + { + "epoch": 0.329244673983215, + "grad_norm": 0.441092312335968, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 1020 + }, + { + "epoch": 0.33247256294383476, + "grad_norm": 0.637322187423706, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 1030 + }, + { + "epoch": 0.33570045190445447, + "grad_norm": 0.4374958574771881, + "learning_rate": 0.0002, + "loss": 0.8685, + "step": 1040 + }, + { + "epoch": 0.33892834086507423, + "grad_norm": 0.3935825824737549, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1050 + }, + { + "epoch": 0.342156229825694, + "grad_norm": 0.43526220321655273, + "learning_rate": 0.0002, + "loss": 0.8287, + "step": 1060 + }, + { + "epoch": 0.34538411878631375, + "grad_norm": 0.45327696204185486, + "learning_rate": 0.0002, + "loss": 0.8413, + "step": 1070 + }, + { + "epoch": 0.3486120077469335, + "grad_norm": 0.4126075506210327, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 1080 + }, + { + "epoch": 0.3518398967075533, + "grad_norm": 0.4714072048664093, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 1090 + }, + { + "epoch": 0.35506778566817304, + "grad_norm": 0.518127977848053, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 1100 + }, + { + "epoch": 0.35829567462879275, + "grad_norm": 0.43264099955558777, + "learning_rate": 0.0002, + "loss": 0.8479, + "step": 1110 + }, + { + "epoch": 0.3615235635894125, + "grad_norm": 0.4857400357723236, + "learning_rate": 0.0002, + "loss": 0.8724, + "step": 1120 + }, + { + "epoch": 0.3647514525500323, + "grad_norm": 0.37591469287872314, + "learning_rate": 0.0002, + "loss": 0.7735, + "step": 1130 + }, + { + "epoch": 0.36797934151065204, + "grad_norm": 0.4165478050708771, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 1140 + }, + { + "epoch": 0.3712072304712718, + "grad_norm": 0.42911383509635925, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 1150 + }, + { + "epoch": 0.37443511943189156, + "grad_norm": 0.44980287551879883, + "learning_rate": 0.0002, + "loss": 0.8722, + "step": 1160 + }, + { + "epoch": 0.3776630083925113, + "grad_norm": 0.4066573679447174, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 1170 + }, + { + "epoch": 0.38089089735313103, + "grad_norm": 0.5056195855140686, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 1180 + }, + { + "epoch": 0.3841187863137508, + "grad_norm": 0.4141536355018616, + "learning_rate": 0.0002, + "loss": 0.8387, + "step": 1190 + }, + { + "epoch": 0.38734667527437056, + "grad_norm": 0.4501924514770508, + "learning_rate": 0.0002, + "loss": 0.8019, + "step": 1200 + }, + { + "epoch": 0.3905745642349903, + "grad_norm": 0.43304240703582764, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 1210 + }, + { + "epoch": 0.3938024531956101, + "grad_norm": 0.475777804851532, + "learning_rate": 0.0002, + "loss": 0.8905, + "step": 1220 + }, + { + "epoch": 0.39703034215622984, + "grad_norm": 0.5846465826034546, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 1230 + }, + { + "epoch": 0.4002582311168496, + "grad_norm": 0.42899325489997864, + "learning_rate": 0.0002, + "loss": 0.8078, + "step": 1240 + }, + { + "epoch": 0.4034861200774693, + "grad_norm": 0.3980463147163391, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 1250 + }, + { + "epoch": 0.4067140090380891, + "grad_norm": 0.45769768953323364, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 1260 + }, + { + "epoch": 0.40994189799870884, + "grad_norm": 0.5101280212402344, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 1270 + }, + { + "epoch": 0.4131697869593286, + "grad_norm": 0.47374317049980164, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 1280 + }, + { + "epoch": 0.41639767591994836, + "grad_norm": 0.4261878728866577, + "learning_rate": 0.0002, + "loss": 0.8172, + "step": 1290 + }, + { + "epoch": 0.4196255648805681, + "grad_norm": 0.46954256296157837, + "learning_rate": 0.0002, + "loss": 0.9004, + "step": 1300 + }, + { + "epoch": 0.4228534538411879, + "grad_norm": 0.5205738544464111, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 1310 + }, + { + "epoch": 0.4260813428018076, + "grad_norm": 0.5176340937614441, + "learning_rate": 0.0002, + "loss": 0.8964, + "step": 1320 + }, + { + "epoch": 0.42930923176242736, + "grad_norm": 0.5155916810035706, + "learning_rate": 0.0002, + "loss": 0.8764, + "step": 1330 + }, + { + "epoch": 0.4325371207230471, + "grad_norm": 0.44548553228378296, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 1340 + }, + { + "epoch": 0.4357650096836669, + "grad_norm": 0.5633558630943298, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 1350 + }, + { + "epoch": 0.43899289864428664, + "grad_norm": 0.42444056272506714, + "learning_rate": 0.0002, + "loss": 0.7889, + "step": 1360 + }, + { + "epoch": 0.4422207876049064, + "grad_norm": 0.5226860642433167, + "learning_rate": 0.0002, + "loss": 0.8588, + "step": 1370 + }, + { + "epoch": 0.44544867656552617, + "grad_norm": 0.5354582071304321, + "learning_rate": 0.0002, + "loss": 0.8232, + "step": 1380 + }, + { + "epoch": 0.4486765655261459, + "grad_norm": 0.472646564245224, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 1390 + }, + { + "epoch": 0.45190445448676564, + "grad_norm": 0.6312310099601746, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 1400 + }, + { + "epoch": 0.4551323434473854, + "grad_norm": 0.4298408031463623, + "learning_rate": 0.0002, + "loss": 0.8212, + "step": 1410 + }, + { + "epoch": 0.45836023240800516, + "grad_norm": 0.43427202105522156, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 1420 + }, + { + "epoch": 0.4615881213686249, + "grad_norm": 0.44097861647605896, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 1430 + }, + { + "epoch": 0.4648160103292447, + "grad_norm": 0.5142693519592285, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1440 + }, + { + "epoch": 0.46804389928986445, + "grad_norm": 0.46416547894477844, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 1450 + }, + { + "epoch": 0.47127178825048416, + "grad_norm": 0.4858551025390625, + "learning_rate": 0.0002, + "loss": 0.8342, + "step": 1460 + }, + { + "epoch": 0.4744996772111039, + "grad_norm": 0.4709177315235138, + "learning_rate": 0.0002, + "loss": 0.8354, + "step": 1470 + }, + { + "epoch": 0.4777275661717237, + "grad_norm": 0.5500252842903137, + "learning_rate": 0.0002, + "loss": 0.8391, + "step": 1480 + }, + { + "epoch": 0.48095545513234345, + "grad_norm": 0.43364381790161133, + "learning_rate": 0.0002, + "loss": 0.8359, + "step": 1490 + }, + { + "epoch": 0.4841833440929632, + "grad_norm": 0.47712287306785583, + "learning_rate": 0.0002, + "loss": 0.8446, + "step": 1500 + }, + { + "epoch": 0.48741123305358297, + "grad_norm": 0.4518495202064514, + "learning_rate": 0.0002, + "loss": 0.8518, + "step": 1510 + }, + { + "epoch": 0.49063912201420273, + "grad_norm": 0.4539008140563965, + "learning_rate": 0.0002, + "loss": 0.819, + "step": 1520 + }, + { + "epoch": 0.49386701097482244, + "grad_norm": 0.4993067979812622, + "learning_rate": 0.0002, + "loss": 0.8276, + "step": 1530 + }, + { + "epoch": 0.4970948999354422, + "grad_norm": 0.6094803214073181, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 1540 + }, + { + "epoch": 0.500322788896062, + "grad_norm": 0.48602527379989624, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 1550 + }, + { + "epoch": 0.5035506778566817, + "grad_norm": 0.40245795249938965, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 1560 + }, + { + "epoch": 0.5067785668173015, + "grad_norm": 0.456787645816803, + "learning_rate": 0.0002, + "loss": 0.7907, + "step": 1570 + }, + { + "epoch": 0.5100064557779213, + "grad_norm": 0.43936216831207275, + "learning_rate": 0.0002, + "loss": 0.86, + "step": 1580 + }, + { + "epoch": 0.513234344738541, + "grad_norm": 0.549018144607544, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 1590 + }, + { + "epoch": 0.5164622336991608, + "grad_norm": 0.41746795177459717, + "learning_rate": 0.0002, + "loss": 0.8169, + "step": 1600 + }, + { + "epoch": 0.5196901226597805, + "grad_norm": 0.4217053949832916, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 1610 + }, + { + "epoch": 0.5229180116204003, + "grad_norm": 0.449913889169693, + "learning_rate": 0.0002, + "loss": 0.8161, + "step": 1620 + }, + { + "epoch": 0.5261459005810201, + "grad_norm": 0.5084872245788574, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 1630 + }, + { + "epoch": 0.5293737895416397, + "grad_norm": 0.46248653531074524, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 1640 + }, + { + "epoch": 0.5326016785022595, + "grad_norm": 0.4824236035346985, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 1650 + }, + { + "epoch": 0.5358295674628792, + "grad_norm": 0.6010985374450684, + "learning_rate": 0.0002, + "loss": 0.8711, + "step": 1660 + }, + { + "epoch": 0.539057456423499, + "grad_norm": 0.4757920801639557, + "learning_rate": 0.0002, + "loss": 0.8266, + "step": 1670 + }, + { + "epoch": 0.5422853453841188, + "grad_norm": 0.45161882042884827, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 1680 + }, + { + "epoch": 0.5455132343447385, + "grad_norm": 0.49314990639686584, + "learning_rate": 0.0002, + "loss": 0.8141, + "step": 1690 + }, + { + "epoch": 0.5487411233053583, + "grad_norm": 0.3918305039405823, + "learning_rate": 0.0002, + "loss": 0.8091, + "step": 1700 + }, + { + "epoch": 0.551969012265978, + "grad_norm": 0.5966728925704956, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 1710 + }, + { + "epoch": 0.5551969012265978, + "grad_norm": 0.4208986163139343, + "learning_rate": 0.0002, + "loss": 0.8438, + "step": 1720 + }, + { + "epoch": 0.5584247901872176, + "grad_norm": 0.43724218010902405, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 1730 + }, + { + "epoch": 0.5616526791478373, + "grad_norm": 0.5287272930145264, + "learning_rate": 0.0002, + "loss": 0.7956, + "step": 1740 + }, + { + "epoch": 0.5648805681084571, + "grad_norm": 0.4961899518966675, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 1750 + }, + { + "epoch": 0.5681084570690769, + "grad_norm": 0.4468635320663452, + "learning_rate": 0.0002, + "loss": 0.8029, + "step": 1760 + }, + { + "epoch": 0.5713363460296966, + "grad_norm": 0.6423530578613281, + "learning_rate": 0.0002, + "loss": 0.7968, + "step": 1770 + }, + { + "epoch": 0.5745642349903163, + "grad_norm": 0.4601971507072449, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 1780 + }, + { + "epoch": 0.577792123950936, + "grad_norm": 0.46514901518821716, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 1790 + }, + { + "epoch": 0.5810200129115558, + "grad_norm": 0.4771687388420105, + "learning_rate": 0.0002, + "loss": 0.8186, + "step": 1800 + }, + { + "epoch": 0.5842479018721756, + "grad_norm": 0.46514490246772766, + "learning_rate": 0.0002, + "loss": 0.856, + "step": 1810 + }, + { + "epoch": 0.5874757908327953, + "grad_norm": 0.5373936295509338, + "learning_rate": 0.0002, + "loss": 0.84, + "step": 1820 + }, + { + "epoch": 0.5907036797934151, + "grad_norm": 0.5175791382789612, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 1830 + }, + { + "epoch": 0.5939315687540349, + "grad_norm": 0.4522802233695984, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 1840 + }, + { + "epoch": 0.5971594577146546, + "grad_norm": 0.42987772822380066, + "learning_rate": 0.0002, + "loss": 0.8633, + "step": 1850 + }, + { + "epoch": 0.6003873466752744, + "grad_norm": 0.5566838383674622, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 1860 + }, + { + "epoch": 0.6036152356358941, + "grad_norm": 0.42807698249816895, + "learning_rate": 0.0002, + "loss": 0.8312, + "step": 1870 + }, + { + "epoch": 0.6068431245965139, + "grad_norm": 0.4957767724990845, + "learning_rate": 0.0002, + "loss": 0.8035, + "step": 1880 + }, + { + "epoch": 0.6100710135571337, + "grad_norm": 0.4260980188846588, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 1890 + }, + { + "epoch": 0.6132989025177534, + "grad_norm": 0.4777357876300812, + "learning_rate": 0.0002, + "loss": 0.8363, + "step": 1900 + }, + { + "epoch": 0.6165267914783732, + "grad_norm": 0.4434216022491455, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 1910 + }, + { + "epoch": 0.6197546804389928, + "grad_norm": 0.5215433835983276, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 1920 + }, + { + "epoch": 0.6229825693996126, + "grad_norm": 0.5143248438835144, + "learning_rate": 0.0002, + "loss": 0.82, + "step": 1930 + }, + { + "epoch": 0.6262104583602324, + "grad_norm": 0.5213413238525391, + "learning_rate": 0.0002, + "loss": 0.8107, + "step": 1940 + }, + { + "epoch": 0.6294383473208521, + "grad_norm": 0.5408226251602173, + "learning_rate": 0.0002, + "loss": 0.7549, + "step": 1950 + }, + { + "epoch": 0.6326662362814719, + "grad_norm": 0.5479708909988403, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 1960 + }, + { + "epoch": 0.6358941252420917, + "grad_norm": 0.4490949809551239, + "learning_rate": 0.0002, + "loss": 0.8138, + "step": 1970 + }, + { + "epoch": 0.6391220142027114, + "grad_norm": 0.48815059661865234, + "learning_rate": 0.0002, + "loss": 0.854, + "step": 1980 + }, + { + "epoch": 0.6423499031633312, + "grad_norm": 0.46498045325279236, + "learning_rate": 0.0002, + "loss": 0.8568, + "step": 1990 + }, + { + "epoch": 0.6455777921239509, + "grad_norm": 0.5136561393737793, + "learning_rate": 0.0002, + "loss": 0.8263, + "step": 2000 + }, + { + "epoch": 0.6488056810845707, + "grad_norm": 0.5145719647407532, + "learning_rate": 0.0002, + "loss": 0.8503, + "step": 2010 + }, + { + "epoch": 0.6520335700451905, + "grad_norm": 0.5430373549461365, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 2020 + }, + { + "epoch": 0.6552614590058102, + "grad_norm": 0.46347954869270325, + "learning_rate": 0.0002, + "loss": 0.8115, + "step": 2030 + }, + { + "epoch": 0.65848934796643, + "grad_norm": 0.5189562439918518, + "learning_rate": 0.0002, + "loss": 0.8769, + "step": 2040 + }, + { + "epoch": 0.6617172369270498, + "grad_norm": 0.43843990564346313, + "learning_rate": 0.0002, + "loss": 0.8453, + "step": 2050 + }, + { + "epoch": 0.6649451258876695, + "grad_norm": 0.4654983580112457, + "learning_rate": 0.0002, + "loss": 0.7951, + "step": 2060 + }, + { + "epoch": 0.6681730148482892, + "grad_norm": 0.44835716485977173, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 2070 + }, + { + "epoch": 0.6714009038089089, + "grad_norm": 0.38811734318733215, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2080 + }, + { + "epoch": 0.6746287927695287, + "grad_norm": 0.5709853172302246, + "learning_rate": 0.0002, + "loss": 0.762, + "step": 2090 + }, + { + "epoch": 0.6778566817301485, + "grad_norm": 0.49994757771492004, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 2100 + }, + { + "epoch": 0.6810845706907682, + "grad_norm": 0.5505402684211731, + "learning_rate": 0.0002, + "loss": 0.8, + "step": 2110 + }, + { + "epoch": 0.684312459651388, + "grad_norm": 0.48195120692253113, + "learning_rate": 0.0002, + "loss": 0.8227, + "step": 2120 + }, + { + "epoch": 0.6875403486120077, + "grad_norm": 0.4854775071144104, + "learning_rate": 0.0002, + "loss": 0.7879, + "step": 2130 + }, + { + "epoch": 0.6907682375726275, + "grad_norm": 0.6422494649887085, + "learning_rate": 0.0002, + "loss": 0.8231, + "step": 2140 + }, + { + "epoch": 0.6939961265332473, + "grad_norm": 0.3972536027431488, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 2150 + }, + { + "epoch": 0.697224015493867, + "grad_norm": 0.4297836422920227, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 2160 + }, + { + "epoch": 0.7004519044544868, + "grad_norm": 0.45486778020858765, + "learning_rate": 0.0002, + "loss": 0.8017, + "step": 2170 + }, + { + "epoch": 0.7036797934151066, + "grad_norm": 0.4706047773361206, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 2180 + }, + { + "epoch": 0.7069076823757263, + "grad_norm": 0.46426892280578613, + "learning_rate": 0.0002, + "loss": 0.8234, + "step": 2190 + }, + { + "epoch": 0.7101355713363461, + "grad_norm": 0.46333715319633484, + "learning_rate": 0.0002, + "loss": 0.8472, + "step": 2200 + }, + { + "epoch": 0.7133634602969657, + "grad_norm": 0.4632524251937866, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 2210 + }, + { + "epoch": 0.7165913492575855, + "grad_norm": 0.4610830843448639, + "learning_rate": 0.0002, + "loss": 0.8452, + "step": 2220 + }, + { + "epoch": 0.7198192382182053, + "grad_norm": 0.4905324876308441, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 2230 + }, + { + "epoch": 0.723047127178825, + "grad_norm": 0.4936263859272003, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 2240 + }, + { + "epoch": 0.7262750161394448, + "grad_norm": 0.40778425335884094, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 2250 + }, + { + "epoch": 0.7295029051000645, + "grad_norm": 0.50351482629776, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 2260 + }, + { + "epoch": 0.7327307940606843, + "grad_norm": 0.4894128143787384, + "learning_rate": 0.0002, + "loss": 0.8475, + "step": 2270 + }, + { + "epoch": 0.7359586830213041, + "grad_norm": 0.5580906271934509, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 2280 + }, + { + "epoch": 0.7391865719819238, + "grad_norm": 0.4655369520187378, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 2290 + }, + { + "epoch": 0.7424144609425436, + "grad_norm": 0.4666965901851654, + "learning_rate": 0.0002, + "loss": 0.8395, + "step": 2300 + }, + { + "epoch": 0.7456423499031634, + "grad_norm": 0.46259936690330505, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 2310 + }, + { + "epoch": 0.7488702388637831, + "grad_norm": 0.520706832408905, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 2320 + }, + { + "epoch": 0.7520981278244029, + "grad_norm": 0.5142408013343811, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 2330 + }, + { + "epoch": 0.7553260167850226, + "grad_norm": 0.5355164408683777, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 2340 + }, + { + "epoch": 0.7585539057456423, + "grad_norm": 0.5517185926437378, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 2350 + }, + { + "epoch": 0.7617817947062621, + "grad_norm": 0.7162677049636841, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 2360 + }, + { + "epoch": 0.7650096836668818, + "grad_norm": 0.42402133345603943, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 2370 + }, + { + "epoch": 0.7682375726275016, + "grad_norm": 0.47180113196372986, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 2380 + }, + { + "epoch": 0.7714654615881213, + "grad_norm": 0.6262288689613342, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 2390 + }, + { + "epoch": 0.7746933505487411, + "grad_norm": 0.5177528262138367, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 2400 + }, + { + "epoch": 0.7779212395093609, + "grad_norm": 0.555721640586853, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 2410 + }, + { + "epoch": 0.7811491284699806, + "grad_norm": 0.5592644810676575, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 2420 + }, + { + "epoch": 0.7843770174306004, + "grad_norm": 0.38025397062301636, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 2430 + }, + { + "epoch": 0.7876049063912202, + "grad_norm": 0.4597472548484802, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 2440 + }, + { + "epoch": 0.7908327953518399, + "grad_norm": 0.4929825961589813, + "learning_rate": 0.0002, + "loss": 0.8575, + "step": 2450 + }, + { + "epoch": 0.7940606843124597, + "grad_norm": 0.45277655124664307, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 2460 + }, + { + "epoch": 0.7972885732730794, + "grad_norm": 0.6224122643470764, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2470 + }, + { + "epoch": 0.8005164622336992, + "grad_norm": 0.5740901827812195, + "learning_rate": 0.0002, + "loss": 0.8449, + "step": 2480 + }, + { + "epoch": 0.8037443511943189, + "grad_norm": 0.41335329413414, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 2490 + }, + { + "epoch": 0.8069722401549386, + "grad_norm": 0.4738694131374359, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 2500 + }, + { + "epoch": 0.8102001291155584, + "grad_norm": 0.5288197994232178, + "learning_rate": 0.0002, + "loss": 0.7927, + "step": 2510 + }, + { + "epoch": 0.8134280180761781, + "grad_norm": 0.5404666066169739, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 2520 + }, + { + "epoch": 0.8166559070367979, + "grad_norm": 0.4444909691810608, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 2530 + }, + { + "epoch": 0.8198837959974177, + "grad_norm": 0.542061448097229, + "learning_rate": 0.0002, + "loss": 0.8683, + "step": 2540 + }, + { + "epoch": 0.8231116849580374, + "grad_norm": 0.4914741814136505, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 2550 + }, + { + "epoch": 0.8263395739186572, + "grad_norm": 0.41703441739082336, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 2560 + }, + { + "epoch": 0.829567462879277, + "grad_norm": 0.5489841103553772, + "learning_rate": 0.0002, + "loss": 0.824, + "step": 2570 + }, + { + "epoch": 0.8327953518398967, + "grad_norm": 0.5359883308410645, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 2580 + }, + { + "epoch": 0.8360232408005165, + "grad_norm": 0.5541019439697266, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 2590 + }, + { + "epoch": 0.8392511297611362, + "grad_norm": 0.4746638834476471, + "learning_rate": 0.0002, + "loss": 0.797, + "step": 2600 + }, + { + "epoch": 0.842479018721756, + "grad_norm": 0.5243194103240967, + "learning_rate": 0.0002, + "loss": 0.8116, + "step": 2610 + }, + { + "epoch": 0.8457069076823758, + "grad_norm": 0.46824976801872253, + "learning_rate": 0.0002, + "loss": 0.8173, + "step": 2620 + }, + { + "epoch": 0.8489347966429954, + "grad_norm": 0.49487847089767456, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 2630 + }, + { + "epoch": 0.8521626856036152, + "grad_norm": 0.42180097103118896, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 2640 + }, + { + "epoch": 0.855390574564235, + "grad_norm": 0.5516560077667236, + "learning_rate": 0.0002, + "loss": 0.8304, + "step": 2650 + }, + { + "epoch": 0.8586184635248547, + "grad_norm": 0.4392191767692566, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 2660 + }, + { + "epoch": 0.8618463524854745, + "grad_norm": 0.5387210845947266, + "learning_rate": 0.0002, + "loss": 0.848, + "step": 2670 + }, + { + "epoch": 0.8650742414460942, + "grad_norm": 0.6232406497001648, + "learning_rate": 0.0002, + "loss": 0.8094, + "step": 2680 + }, + { + "epoch": 0.868302130406714, + "grad_norm": 0.53749018907547, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 2690 + }, + { + "epoch": 0.8715300193673338, + "grad_norm": 0.47480374574661255, + "learning_rate": 0.0002, + "loss": 0.8299, + "step": 2700 + }, + { + "epoch": 0.8747579083279535, + "grad_norm": 0.44618046283721924, + "learning_rate": 0.0002, + "loss": 0.8055, + "step": 2710 + }, + { + "epoch": 0.8779857972885733, + "grad_norm": 0.4173581302165985, + "learning_rate": 0.0002, + "loss": 0.8015, + "step": 2720 + }, + { + "epoch": 0.881213686249193, + "grad_norm": 0.524081289768219, + "learning_rate": 0.0002, + "loss": 0.7713, + "step": 2730 + }, + { + "epoch": 0.8844415752098128, + "grad_norm": 0.5608431100845337, + "learning_rate": 0.0002, + "loss": 0.8738, + "step": 2740 + }, + { + "epoch": 0.8876694641704326, + "grad_norm": 0.5212284922599792, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 2750 + }, + { + "epoch": 0.8908973531310523, + "grad_norm": 0.5601475834846497, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 2760 + }, + { + "epoch": 0.8941252420916721, + "grad_norm": 0.4499223828315735, + "learning_rate": 0.0002, + "loss": 0.7947, + "step": 2770 + }, + { + "epoch": 0.8973531310522918, + "grad_norm": 0.46945226192474365, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 2780 + }, + { + "epoch": 0.9005810200129115, + "grad_norm": 0.4837495684623718, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 2790 + }, + { + "epoch": 0.9038089089735313, + "grad_norm": 0.5059258937835693, + "learning_rate": 0.0002, + "loss": 0.7887, + "step": 2800 + }, + { + "epoch": 0.907036797934151, + "grad_norm": 0.4857945144176483, + "learning_rate": 0.0002, + "loss": 0.8571, + "step": 2810 + }, + { + "epoch": 0.9102646868947708, + "grad_norm": 0.5001962780952454, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 2820 + }, + { + "epoch": 0.9134925758553906, + "grad_norm": 0.5468648672103882, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 2830 + }, + { + "epoch": 0.9167204648160103, + "grad_norm": 0.5533056259155273, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 2840 + }, + { + "epoch": 0.9199483537766301, + "grad_norm": 0.5909785628318787, + "learning_rate": 0.0002, + "loss": 0.7895, + "step": 2850 + }, + { + "epoch": 0.9231762427372499, + "grad_norm": 0.47428104281425476, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 2860 + }, + { + "epoch": 0.9264041316978696, + "grad_norm": 0.548814058303833, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2870 + }, + { + "epoch": 0.9296320206584894, + "grad_norm": 0.5576745271682739, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 2880 + }, + { + "epoch": 0.9328599096191091, + "grad_norm": 0.47094792127609253, + "learning_rate": 0.0002, + "loss": 0.8399, + "step": 2890 + }, + { + "epoch": 0.9360877985797289, + "grad_norm": 0.5408539772033691, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 2900 + }, + { + "epoch": 0.9393156875403487, + "grad_norm": 0.5922889113426208, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 2910 + }, + { + "epoch": 0.9425435765009683, + "grad_norm": 0.45462584495544434, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 2920 + }, + { + "epoch": 0.9457714654615881, + "grad_norm": 0.6864947080612183, + "learning_rate": 0.0002, + "loss": 0.8344, + "step": 2930 + }, + { + "epoch": 0.9489993544222078, + "grad_norm": 0.4706299304962158, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 2940 + }, + { + "epoch": 0.9522272433828276, + "grad_norm": 0.5583269596099854, + "learning_rate": 0.0002, + "loss": 0.8422, + "step": 2950 + }, + { + "epoch": 0.9554551323434474, + "grad_norm": 0.51015704870224, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 2960 + }, + { + "epoch": 0.9586830213040671, + "grad_norm": 0.5325582027435303, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 2970 + }, + { + "epoch": 0.9619109102646869, + "grad_norm": 0.49008598923683167, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 2980 + }, + { + "epoch": 0.9651387992253067, + "grad_norm": 0.4422132074832916, + "learning_rate": 0.0002, + "loss": 0.8093, + "step": 2990 + }, + { + "epoch": 0.9683666881859264, + "grad_norm": 0.5053589344024658, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 3000 + }, + { + "epoch": 0.9715945771465462, + "grad_norm": 0.46754521131515503, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 3010 + }, + { + "epoch": 0.9748224661071659, + "grad_norm": 0.5613434910774231, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 3020 + }, + { + "epoch": 0.9780503550677857, + "grad_norm": 0.5052843689918518, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 3030 + }, + { + "epoch": 0.9812782440284055, + "grad_norm": 0.4270972013473511, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 3040 + }, + { + "epoch": 0.9845061329890252, + "grad_norm": 0.4974991977214813, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 3050 + }, + { + "epoch": 0.9877340219496449, + "grad_norm": 0.4432311952114105, + "learning_rate": 0.0002, + "loss": 0.8415, + "step": 3060 + }, + { + "epoch": 0.9909619109102646, + "grad_norm": 0.466457724571228, + "learning_rate": 0.0002, + "loss": 0.7764, + "step": 3070 + }, + { + "epoch": 0.9941897998708844, + "grad_norm": 0.6438009142875671, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 3080 + }, + { + "epoch": 0.9974176888315042, + "grad_norm": 0.5593604445457458, + "learning_rate": 0.0002, + "loss": 0.8425, + "step": 3090 + }, + { + "epoch": 1.0, + "eval_loss": 1.0958120822906494, + "eval_runtime": 148.3273, + "eval_samples_per_second": 4.942, + "eval_steps_per_second": 0.62, + "step": 3098 + }, + { + "epoch": 1.000645577792124, + "grad_norm": 0.5701445937156677, + "learning_rate": 0.0002, + "loss": 0.8275, + "step": 3100 + }, + { + "epoch": 1.0038734667527438, + "grad_norm": 0.6089657545089722, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 3110 + }, + { + "epoch": 1.0071013557133635, + "grad_norm": 0.5619552135467529, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 3120 + }, + { + "epoch": 1.010329244673983, + "grad_norm": 0.5550283789634705, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 3130 + }, + { + "epoch": 1.013557133634603, + "grad_norm": 0.6221792101860046, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3140 + }, + { + "epoch": 1.0167850225952226, + "grad_norm": 0.5450758934020996, + "learning_rate": 0.0002, + "loss": 0.7603, + "step": 3150 + }, + { + "epoch": 1.0200129115558425, + "grad_norm": 0.4359588027000427, + "learning_rate": 0.0002, + "loss": 0.7021, + "step": 3160 + }, + { + "epoch": 1.0232408005164622, + "grad_norm": 0.5932239890098572, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 3170 + }, + { + "epoch": 1.026468689477082, + "grad_norm": 0.45478707551956177, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 3180 + }, + { + "epoch": 1.0296965784377017, + "grad_norm": 0.677615761756897, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 3190 + }, + { + "epoch": 1.0329244673983216, + "grad_norm": 0.6231790781021118, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 3200 + }, + { + "epoch": 1.0361523563589412, + "grad_norm": 0.5074195861816406, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 3210 + }, + { + "epoch": 1.039380245319561, + "grad_norm": 0.4844142198562622, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 3220 + }, + { + "epoch": 1.0426081342801807, + "grad_norm": 0.5372750759124756, + "learning_rate": 0.0002, + "loss": 0.7655, + "step": 3230 + }, + { + "epoch": 1.0458360232408006, + "grad_norm": 0.46296265721321106, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 3240 + }, + { + "epoch": 1.0490639122014203, + "grad_norm": 0.5417148470878601, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 3250 + }, + { + "epoch": 1.0522918011620401, + "grad_norm": 0.5695074200630188, + "learning_rate": 0.0002, + "loss": 0.7637, + "step": 3260 + }, + { + "epoch": 1.0555196901226598, + "grad_norm": 0.5050092935562134, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 3270 + }, + { + "epoch": 1.0587475790832794, + "grad_norm": 0.5320752263069153, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 3280 + }, + { + "epoch": 1.0619754680438993, + "grad_norm": 0.5832052230834961, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 3290 + }, + { + "epoch": 1.065203357004519, + "grad_norm": 0.5228804349899292, + "learning_rate": 0.0002, + "loss": 0.7656, + "step": 3300 + }, + { + "epoch": 1.0684312459651388, + "grad_norm": 0.5819445252418518, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 3310 + }, + { + "epoch": 1.0716591349257585, + "grad_norm": 0.4201328754425049, + "learning_rate": 0.0002, + "loss": 0.7093, + "step": 3320 + }, + { + "epoch": 1.0748870238863784, + "grad_norm": 0.5424145460128784, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 3330 + }, + { + "epoch": 1.078114912846998, + "grad_norm": 0.6169946789741516, + "learning_rate": 0.0002, + "loss": 0.7828, + "step": 3340 + }, + { + "epoch": 1.0813428018076179, + "grad_norm": 0.607676088809967, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 3350 + }, + { + "epoch": 1.0845706907682375, + "grad_norm": 0.5191982388496399, + "learning_rate": 0.0002, + "loss": 0.7315, + "step": 3360 + }, + { + "epoch": 1.0877985797288574, + "grad_norm": 0.5728003978729248, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 3370 + }, + { + "epoch": 1.091026468689477, + "grad_norm": 0.5402643084526062, + "learning_rate": 0.0002, + "loss": 0.7381, + "step": 3380 + }, + { + "epoch": 1.094254357650097, + "grad_norm": 0.5377541780471802, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 3390 + }, + { + "epoch": 1.0974822466107166, + "grad_norm": 0.4751385748386383, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 3400 + }, + { + "epoch": 1.1007101355713362, + "grad_norm": 0.559158444404602, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 3410 + }, + { + "epoch": 1.103938024531956, + "grad_norm": 0.4917701482772827, + "learning_rate": 0.0002, + "loss": 0.7366, + "step": 3420 + }, + { + "epoch": 1.1071659134925758, + "grad_norm": 0.5507875084877014, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 3430 + }, + { + "epoch": 1.1103938024531956, + "grad_norm": 0.45458680391311646, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 3440 + }, + { + "epoch": 1.1136216914138153, + "grad_norm": 0.5721744894981384, + "learning_rate": 0.0002, + "loss": 0.7234, + "step": 3450 + }, + { + "epoch": 1.1168495803744352, + "grad_norm": 0.5776081681251526, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 3460 + }, + { + "epoch": 1.1200774693350548, + "grad_norm": 0.5261953473091125, + "learning_rate": 0.0002, + "loss": 0.7644, + "step": 3470 + }, + { + "epoch": 1.1233053582956747, + "grad_norm": 0.47759532928466797, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 3480 + }, + { + "epoch": 1.1265332472562943, + "grad_norm": 0.5697659850120544, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 3490 + }, + { + "epoch": 1.1297611362169142, + "grad_norm": 0.5643419623374939, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 3500 + }, + { + "epoch": 1.1329890251775339, + "grad_norm": 0.6502931118011475, + "learning_rate": 0.0002, + "loss": 0.7235, + "step": 3510 + }, + { + "epoch": 1.1362169141381537, + "grad_norm": 0.5236507654190063, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 3520 + }, + { + "epoch": 1.1394448030987734, + "grad_norm": 0.6521499156951904, + "learning_rate": 0.0002, + "loss": 0.7571, + "step": 3530 + }, + { + "epoch": 1.142672692059393, + "grad_norm": 0.5893217325210571, + "learning_rate": 0.0002, + "loss": 0.7304, + "step": 3540 + }, + { + "epoch": 1.145900581020013, + "grad_norm": 0.5300073027610779, + "learning_rate": 0.0002, + "loss": 0.7508, + "step": 3550 + }, + { + "epoch": 1.1491284699806328, + "grad_norm": 0.6794660091400146, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 3560 + }, + { + "epoch": 1.1523563589412524, + "grad_norm": 0.5420064926147461, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 3570 + }, + { + "epoch": 1.155584247901872, + "grad_norm": 0.5096590518951416, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 3580 + }, + { + "epoch": 1.158812136862492, + "grad_norm": 0.5726043581962585, + "learning_rate": 0.0002, + "loss": 0.7436, + "step": 3590 + }, + { + "epoch": 1.1620400258231116, + "grad_norm": 0.7388110160827637, + "learning_rate": 0.0002, + "loss": 0.7728, + "step": 3600 + }, + { + "epoch": 1.1652679147837315, + "grad_norm": 0.5597969889640808, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 3610 + }, + { + "epoch": 1.1684958037443511, + "grad_norm": 0.5067800283432007, + "learning_rate": 0.0002, + "loss": 0.7132, + "step": 3620 + }, + { + "epoch": 1.171723692704971, + "grad_norm": 0.6625118255615234, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 3630 + }, + { + "epoch": 1.1749515816655907, + "grad_norm": 0.5830849409103394, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 3640 + }, + { + "epoch": 1.1781794706262105, + "grad_norm": 0.6140692830085754, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 3650 + }, + { + "epoch": 1.1814073595868302, + "grad_norm": 0.714523434638977, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 3660 + }, + { + "epoch": 1.18463524854745, + "grad_norm": 0.5196696519851685, + "learning_rate": 0.0002, + "loss": 0.7092, + "step": 3670 + }, + { + "epoch": 1.1878631375080697, + "grad_norm": 0.6677889823913574, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 3680 + }, + { + "epoch": 1.1910910264686896, + "grad_norm": 0.47095245122909546, + "learning_rate": 0.0002, + "loss": 0.7813, + "step": 3690 + }, + { + "epoch": 1.1943189154293092, + "grad_norm": 0.5197778940200806, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 3700 + }, + { + "epoch": 1.1975468043899289, + "grad_norm": 0.5156530141830444, + "learning_rate": 0.0002, + "loss": 0.7349, + "step": 3710 + }, + { + "epoch": 1.2007746933505488, + "grad_norm": 0.6968549489974976, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 3720 + }, + { + "epoch": 1.2040025823111684, + "grad_norm": 0.48983848094940186, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 3730 + }, + { + "epoch": 1.2072304712717883, + "grad_norm": 0.6709973216056824, + "learning_rate": 0.0002, + "loss": 0.7163, + "step": 3740 + }, + { + "epoch": 1.210458360232408, + "grad_norm": 0.48681750893592834, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 3750 + }, + { + "epoch": 1.2136862491930278, + "grad_norm": 0.49475061893463135, + "learning_rate": 0.0002, + "loss": 0.7039, + "step": 3760 + }, + { + "epoch": 1.2169141381536475, + "grad_norm": 0.6163983345031738, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 3770 + }, + { + "epoch": 1.2201420271142673, + "grad_norm": 0.5481411218643188, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 3780 + }, + { + "epoch": 1.223369916074887, + "grad_norm": 0.620639979839325, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 3790 + }, + { + "epoch": 1.2265978050355069, + "grad_norm": 0.7017222046852112, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 3800 + }, + { + "epoch": 1.2298256939961265, + "grad_norm": 0.5872400403022766, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 3810 + }, + { + "epoch": 1.2330535829567464, + "grad_norm": 0.45765596628189087, + "learning_rate": 0.0002, + "loss": 0.7854, + "step": 3820 + }, + { + "epoch": 1.236281471917366, + "grad_norm": 0.5676377415657043, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 3830 + }, + { + "epoch": 1.2395093608779857, + "grad_norm": 0.4793425500392914, + "learning_rate": 0.0002, + "loss": 0.7696, + "step": 3840 + }, + { + "epoch": 1.2427372498386056, + "grad_norm": 0.5060022473335266, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 3850 + }, + { + "epoch": 1.2459651387992252, + "grad_norm": 0.6140682697296143, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 3860 + }, + { + "epoch": 1.249193027759845, + "grad_norm": 0.5030326843261719, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 3870 + }, + { + "epoch": 1.2524209167204647, + "grad_norm": 0.6609430909156799, + "learning_rate": 0.0002, + "loss": 0.7226, + "step": 3880 + }, + { + "epoch": 1.2556488056810846, + "grad_norm": 0.5459545850753784, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 3890 + }, + { + "epoch": 1.2588766946417043, + "grad_norm": 0.5328870415687561, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 3900 + }, + { + "epoch": 1.2621045836023241, + "grad_norm": 0.5840652585029602, + "learning_rate": 0.0002, + "loss": 0.7572, + "step": 3910 + }, + { + "epoch": 1.2653324725629438, + "grad_norm": 0.5587584376335144, + "learning_rate": 0.0002, + "loss": 0.7624, + "step": 3920 + }, + { + "epoch": 1.2685603615235637, + "grad_norm": 0.5886949896812439, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 3930 + }, + { + "epoch": 1.2717882504841833, + "grad_norm": 0.5128693580627441, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 3940 + }, + { + "epoch": 1.2750161394448032, + "grad_norm": 0.6207669377326965, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 3950 + }, + { + "epoch": 1.2782440284054228, + "grad_norm": 0.5789574384689331, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 3960 + }, + { + "epoch": 1.2814719173660425, + "grad_norm": 0.503162145614624, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 3970 + }, + { + "epoch": 1.2846998063266624, + "grad_norm": 0.6670064926147461, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 3980 + }, + { + "epoch": 1.2879276952872822, + "grad_norm": 0.5676213502883911, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 3990 + }, + { + "epoch": 1.2911555842479019, + "grad_norm": 0.5383169054985046, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 4000 + }, + { + "epoch": 1.2943834732085215, + "grad_norm": 0.714743971824646, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 4010 + }, + { + "epoch": 1.2976113621691414, + "grad_norm": 0.5740262269973755, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 4020 + }, + { + "epoch": 1.300839251129761, + "grad_norm": 0.6143045425415039, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 4030 + }, + { + "epoch": 1.304067140090381, + "grad_norm": 0.501025378704071, + "learning_rate": 0.0002, + "loss": 0.7181, + "step": 4040 + }, + { + "epoch": 1.3072950290510006, + "grad_norm": 0.5784100294113159, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 4050 + }, + { + "epoch": 1.3105229180116205, + "grad_norm": 0.6182606220245361, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 4060 + }, + { + "epoch": 1.3137508069722401, + "grad_norm": 0.5072231292724609, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 4070 + }, + { + "epoch": 1.31697869593286, + "grad_norm": 0.6841012835502625, + "learning_rate": 0.0002, + "loss": 0.7451, + "step": 4080 + }, + { + "epoch": 1.3202065848934796, + "grad_norm": 0.697257936000824, + "learning_rate": 0.0002, + "loss": 0.7395, + "step": 4090 + }, + { + "epoch": 1.3234344738540993, + "grad_norm": 0.5113214254379272, + "learning_rate": 0.0002, + "loss": 0.7401, + "step": 4100 + }, + { + "epoch": 1.3266623628147192, + "grad_norm": 0.6270561814308167, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 4110 + }, + { + "epoch": 1.329890251775339, + "grad_norm": 0.5525947213172913, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 4120 + }, + { + "epoch": 1.3331181407359587, + "grad_norm": 0.546071469783783, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 4130 + }, + { + "epoch": 1.3363460296965783, + "grad_norm": 0.6516721248626709, + "learning_rate": 0.0002, + "loss": 0.7884, + "step": 4140 + }, + { + "epoch": 1.3395739186571982, + "grad_norm": 0.6235111355781555, + "learning_rate": 0.0002, + "loss": 0.755, + "step": 4150 + }, + { + "epoch": 1.3428018076178179, + "grad_norm": 0.538649320602417, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 4160 + }, + { + "epoch": 1.3460296965784377, + "grad_norm": 0.5367001891136169, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 4170 + }, + { + "epoch": 1.3492575855390574, + "grad_norm": 0.6134631037712097, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 4180 + }, + { + "epoch": 1.3524854744996773, + "grad_norm": 0.5827262997627258, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 4190 + }, + { + "epoch": 1.355713363460297, + "grad_norm": 0.5706096291542053, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 4200 + }, + { + "epoch": 1.3589412524209168, + "grad_norm": 0.6422057151794434, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 4210 + }, + { + "epoch": 1.3621691413815364, + "grad_norm": 0.6316141486167908, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 4220 + }, + { + "epoch": 1.365397030342156, + "grad_norm": 0.6946983933448792, + "learning_rate": 0.0002, + "loss": 0.7457, + "step": 4230 + }, + { + "epoch": 1.368624919302776, + "grad_norm": 0.5381525754928589, + "learning_rate": 0.0002, + "loss": 0.7388, + "step": 4240 + }, + { + "epoch": 1.3718528082633958, + "grad_norm": 0.5484845638275146, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 4250 + }, + { + "epoch": 1.3750806972240155, + "grad_norm": 0.5961896777153015, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 4260 + }, + { + "epoch": 1.3783085861846351, + "grad_norm": 0.6041752696037292, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 4270 + }, + { + "epoch": 1.381536475145255, + "grad_norm": 0.6283464431762695, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 4280 + }, + { + "epoch": 1.384764364105875, + "grad_norm": 0.6761324405670166, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 4290 + }, + { + "epoch": 1.3879922530664945, + "grad_norm": 0.504311203956604, + "learning_rate": 0.0002, + "loss": 0.7381, + "step": 4300 + }, + { + "epoch": 1.3912201420271142, + "grad_norm": 0.6100395917892456, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 4310 + }, + { + "epoch": 1.394448030987734, + "grad_norm": 0.6245788335800171, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 4320 + }, + { + "epoch": 1.3976759199483537, + "grad_norm": 0.6074621081352234, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 4330 + }, + { + "epoch": 1.4009038089089736, + "grad_norm": 0.6683838963508606, + "learning_rate": 0.0002, + "loss": 0.752, + "step": 4340 + }, + { + "epoch": 1.4041316978695932, + "grad_norm": 0.622998058795929, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 4350 + }, + { + "epoch": 1.4073595868302131, + "grad_norm": 0.6089423894882202, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 4360 + }, + { + "epoch": 1.4105874757908328, + "grad_norm": 0.6381658911705017, + "learning_rate": 0.0002, + "loss": 0.7715, + "step": 4370 + }, + { + "epoch": 1.4138153647514526, + "grad_norm": 0.5419308543205261, + "learning_rate": 0.0002, + "loss": 0.7871, + "step": 4380 + }, + { + "epoch": 1.4170432537120723, + "grad_norm": 0.6026232242584229, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 4390 + }, + { + "epoch": 1.420271142672692, + "grad_norm": 0.4911101162433624, + "learning_rate": 0.0002, + "loss": 0.7529, + "step": 4400 + }, + { + "epoch": 1.4234990316333118, + "grad_norm": 0.6302908062934875, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 4410 + }, + { + "epoch": 1.4267269205939317, + "grad_norm": 0.6692768931388855, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 4420 + }, + { + "epoch": 1.4299548095545513, + "grad_norm": 0.46294572949409485, + "learning_rate": 0.0002, + "loss": 0.7312, + "step": 4430 + }, + { + "epoch": 1.433182698515171, + "grad_norm": 0.5452619194984436, + "learning_rate": 0.0002, + "loss": 0.7255, + "step": 4440 + }, + { + "epoch": 1.4364105874757909, + "grad_norm": 0.7809233069419861, + "learning_rate": 0.0002, + "loss": 0.7974, + "step": 4450 + }, + { + "epoch": 1.4396384764364105, + "grad_norm": 0.550088107585907, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 4460 + }, + { + "epoch": 1.4428663653970304, + "grad_norm": 0.7139151096343994, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 4470 + }, + { + "epoch": 1.44609425435765, + "grad_norm": 0.6187090873718262, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 4480 + }, + { + "epoch": 1.44932214331827, + "grad_norm": 0.5948249101638794, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 4490 + }, + { + "epoch": 1.4525500322788896, + "grad_norm": 0.6510892510414124, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 4500 + }, + { + "epoch": 1.4557779212395094, + "grad_norm": 0.6552293300628662, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 4510 + }, + { + "epoch": 1.459005810200129, + "grad_norm": 0.585574209690094, + "learning_rate": 0.0002, + "loss": 0.7965, + "step": 4520 + }, + { + "epoch": 1.4622336991607487, + "grad_norm": 0.4830162823200226, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 4530 + }, + { + "epoch": 1.4654615881213686, + "grad_norm": 0.5780223608016968, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 4540 + }, + { + "epoch": 1.4686894770819885, + "grad_norm": 0.5462607145309448, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 4550 + }, + { + "epoch": 1.4719173660426081, + "grad_norm": 0.5183546543121338, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 4560 + }, + { + "epoch": 1.4751452550032278, + "grad_norm": 0.676917552947998, + "learning_rate": 0.0002, + "loss": 0.71, + "step": 4570 + }, + { + "epoch": 1.4783731439638477, + "grad_norm": 0.5772345066070557, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 4580 + }, + { + "epoch": 1.4816010329244673, + "grad_norm": 0.7320035696029663, + "learning_rate": 0.0002, + "loss": 0.7709, + "step": 4590 + }, + { + "epoch": 1.4848289218850872, + "grad_norm": 0.5024042129516602, + "learning_rate": 0.0002, + "loss": 0.7601, + "step": 4600 + }, + { + "epoch": 1.4880568108457068, + "grad_norm": 0.5482868552207947, + "learning_rate": 0.0002, + "loss": 0.8061, + "step": 4610 + }, + { + "epoch": 1.4912846998063267, + "grad_norm": 0.5447399616241455, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 4620 + }, + { + "epoch": 1.4945125887669464, + "grad_norm": 0.5953414440155029, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 4630 + }, + { + "epoch": 1.4977404777275662, + "grad_norm": 0.6983066201210022, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 4640 + }, + { + "epoch": 1.500968366688186, + "grad_norm": 0.586327075958252, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 4650 + }, + { + "epoch": 1.5041962556488055, + "grad_norm": 0.5839682221412659, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 4660 + }, + { + "epoch": 1.5074241446094254, + "grad_norm": 0.5959209203720093, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 4670 + }, + { + "epoch": 1.5106520335700453, + "grad_norm": 0.5073857307434082, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 4680 + }, + { + "epoch": 1.513879922530665, + "grad_norm": 0.5183001160621643, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 4690 + }, + { + "epoch": 1.5171078114912846, + "grad_norm": 0.593530535697937, + "learning_rate": 0.0002, + "loss": 0.784, + "step": 4700 + }, + { + "epoch": 1.5203357004519045, + "grad_norm": 0.675993025302887, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 4710 + }, + { + "epoch": 1.5235635894125243, + "grad_norm": 0.5823286771774292, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 4720 + }, + { + "epoch": 1.526791478373144, + "grad_norm": 0.5825035572052002, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 4730 + }, + { + "epoch": 1.5300193673337636, + "grad_norm": 0.5689691305160522, + "learning_rate": 0.0002, + "loss": 0.8287, + "step": 4740 + }, + { + "epoch": 1.5332472562943835, + "grad_norm": 0.6037150621414185, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 4750 + }, + { + "epoch": 1.5364751452550034, + "grad_norm": 0.6393677592277527, + "learning_rate": 0.0002, + "loss": 0.7865, + "step": 4760 + }, + { + "epoch": 1.539703034215623, + "grad_norm": 0.5926381945610046, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 4770 + }, + { + "epoch": 1.5429309231762427, + "grad_norm": 0.9468599557876587, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 4780 + }, + { + "epoch": 1.5461588121368623, + "grad_norm": 0.7544237375259399, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 4790 + }, + { + "epoch": 1.5493867010974822, + "grad_norm": 0.5308566093444824, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 4800 + }, + { + "epoch": 1.552614590058102, + "grad_norm": 0.6590296030044556, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 4810 + }, + { + "epoch": 1.5558424790187217, + "grad_norm": 0.5630404353141785, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 4820 + }, + { + "epoch": 1.5590703679793414, + "grad_norm": 0.6800200939178467, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 4830 + }, + { + "epoch": 1.5622982569399613, + "grad_norm": 0.5463718175888062, + "learning_rate": 0.0002, + "loss": 0.7373, + "step": 4840 + }, + { + "epoch": 1.5655261459005811, + "grad_norm": 0.505135178565979, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 4850 + }, + { + "epoch": 1.5687540348612008, + "grad_norm": 0.5469676852226257, + "learning_rate": 0.0002, + "loss": 0.8122, + "step": 4860 + }, + { + "epoch": 1.5719819238218204, + "grad_norm": 0.5318337678909302, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 4870 + }, + { + "epoch": 1.5752098127824403, + "grad_norm": 0.7287914752960205, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 4880 + }, + { + "epoch": 1.5784377017430602, + "grad_norm": 0.7318989038467407, + "learning_rate": 0.0002, + "loss": 0.7532, + "step": 4890 + }, + { + "epoch": 1.5816655907036798, + "grad_norm": 0.6499921679496765, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 4900 + }, + { + "epoch": 1.5848934796642995, + "grad_norm": 0.47907355427742004, + "learning_rate": 0.0002, + "loss": 0.753, + "step": 4910 + }, + { + "epoch": 1.5881213686249191, + "grad_norm": 0.7338833808898926, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 4920 + }, + { + "epoch": 1.591349257585539, + "grad_norm": 0.5800719261169434, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 4930 + }, + { + "epoch": 1.594577146546159, + "grad_norm": 0.5365763306617737, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 4940 + }, + { + "epoch": 1.5978050355067785, + "grad_norm": 0.5800772309303284, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 4950 + }, + { + "epoch": 1.6010329244673982, + "grad_norm": 0.7878010869026184, + "learning_rate": 0.0002, + "loss": 0.8027, + "step": 4960 + }, + { + "epoch": 1.604260813428018, + "grad_norm": 0.5919058918952942, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 4970 + }, + { + "epoch": 1.607488702388638, + "grad_norm": 0.5004435181617737, + "learning_rate": 0.0002, + "loss": 0.7762, + "step": 4980 + }, + { + "epoch": 1.6107165913492576, + "grad_norm": 0.6299242377281189, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 4990 + }, + { + "epoch": 1.6139444803098772, + "grad_norm": 0.6307242512702942, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 5000 + }, + { + "epoch": 1.6171723692704971, + "grad_norm": 0.7838703989982605, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 5010 + }, + { + "epoch": 1.620400258231117, + "grad_norm": 0.6454671621322632, + "learning_rate": 0.0002, + "loss": 0.7364, + "step": 5020 + }, + { + "epoch": 1.6236281471917366, + "grad_norm": 0.5907095670700073, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 5030 + }, + { + "epoch": 1.6268560361523563, + "grad_norm": 0.6053501963615417, + "learning_rate": 0.0002, + "loss": 0.7331, + "step": 5040 + }, + { + "epoch": 1.630083925112976, + "grad_norm": 0.5644670128822327, + "learning_rate": 0.0002, + "loss": 0.6987, + "step": 5050 + }, + { + "epoch": 1.6333118140735958, + "grad_norm": 0.6320949792861938, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 5060 + }, + { + "epoch": 1.6365397030342157, + "grad_norm": 0.6101489067077637, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 5070 + }, + { + "epoch": 1.6397675919948353, + "grad_norm": 0.9435283541679382, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 5080 + }, + { + "epoch": 1.642995480955455, + "grad_norm": 0.6668919324874878, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 5090 + }, + { + "epoch": 1.6462233699160749, + "grad_norm": 0.6160340905189514, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 5100 + }, + { + "epoch": 1.6494512588766947, + "grad_norm": 0.5999835729598999, + "learning_rate": 0.0002, + "loss": 0.7461, + "step": 5110 + }, + { + "epoch": 1.6526791478373144, + "grad_norm": 0.9378551840782166, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 5120 + }, + { + "epoch": 1.655907036797934, + "grad_norm": 0.4795055389404297, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 5130 + }, + { + "epoch": 1.659134925758554, + "grad_norm": 0.4878861606121063, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 5140 + }, + { + "epoch": 1.6623628147191738, + "grad_norm": 0.6042965054512024, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 5150 + }, + { + "epoch": 1.6655907036797934, + "grad_norm": 0.5829901695251465, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 5160 + }, + { + "epoch": 1.668818592640413, + "grad_norm": 0.5168480277061462, + "learning_rate": 0.0002, + "loss": 0.7498, + "step": 5170 + }, + { + "epoch": 1.672046481601033, + "grad_norm": 0.6489511132240295, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 5180 + }, + { + "epoch": 1.6752743705616526, + "grad_norm": 0.5955966114997864, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 5190 + }, + { + "epoch": 1.6785022595222725, + "grad_norm": 0.6228088140487671, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 5200 + }, + { + "epoch": 1.6817301484828922, + "grad_norm": 0.5726390480995178, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 5210 + }, + { + "epoch": 1.6849580374435118, + "grad_norm": 0.6116343140602112, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 5220 + }, + { + "epoch": 1.6881859264041317, + "grad_norm": 0.5483687520027161, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 5230 + }, + { + "epoch": 1.6914138153647515, + "grad_norm": 0.570941686630249, + "learning_rate": 0.0002, + "loss": 0.7293, + "step": 5240 + }, + { + "epoch": 1.6946417043253712, + "grad_norm": 0.6048086285591125, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 5250 + }, + { + "epoch": 1.6978695932859909, + "grad_norm": 0.6769003868103027, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 5260 + }, + { + "epoch": 1.7010974822466107, + "grad_norm": 0.5629057884216309, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 5270 + }, + { + "epoch": 1.7043253712072306, + "grad_norm": 0.657341480255127, + "learning_rate": 0.0002, + "loss": 0.7693, + "step": 5280 + }, + { + "epoch": 1.7075532601678503, + "grad_norm": 0.6256147623062134, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 5290 + }, + { + "epoch": 1.71078114912847, + "grad_norm": 0.5498088002204895, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 5300 + }, + { + "epoch": 1.7140090380890898, + "grad_norm": 0.5078358054161072, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 5310 + }, + { + "epoch": 1.7172369270497096, + "grad_norm": 0.6696692705154419, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 5320 + }, + { + "epoch": 1.7204648160103293, + "grad_norm": 0.6692847013473511, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 5330 + }, + { + "epoch": 1.723692704970949, + "grad_norm": 0.5415751934051514, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 5340 + }, + { + "epoch": 1.7269205939315686, + "grad_norm": 0.5367611050605774, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 5350 + }, + { + "epoch": 1.7301484828921885, + "grad_norm": 0.7321061491966248, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 5360 + }, + { + "epoch": 1.7333763718528084, + "grad_norm": 0.723972499370575, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5370 + }, + { + "epoch": 1.736604260813428, + "grad_norm": 0.7328100204467773, + "learning_rate": 0.0002, + "loss": 0.7077, + "step": 5380 + }, + { + "epoch": 1.7398321497740477, + "grad_norm": 0.5785264372825623, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 5390 + }, + { + "epoch": 1.7430600387346675, + "grad_norm": 0.7812932133674622, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 5400 + }, + { + "epoch": 1.7462879276952874, + "grad_norm": 0.6493327617645264, + "learning_rate": 0.0002, + "loss": 0.7386, + "step": 5410 + }, + { + "epoch": 1.749515816655907, + "grad_norm": 0.5825939774513245, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 5420 + }, + { + "epoch": 1.7527437056165267, + "grad_norm": 0.6969610452651978, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 5430 + }, + { + "epoch": 1.7559715945771466, + "grad_norm": 0.5558062195777893, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 5440 + }, + { + "epoch": 1.7591994835377665, + "grad_norm": 0.49222221970558167, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 5450 + }, + { + "epoch": 1.762427372498386, + "grad_norm": 0.5844656825065613, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 5460 + }, + { + "epoch": 1.7656552614590058, + "grad_norm": 0.8706597685813904, + "learning_rate": 0.0002, + "loss": 0.7695, + "step": 5470 + }, + { + "epoch": 1.7688831504196254, + "grad_norm": 0.6167706251144409, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 5480 + }, + { + "epoch": 1.7721110393802453, + "grad_norm": 0.5890011787414551, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 5490 + }, + { + "epoch": 1.7753389283408652, + "grad_norm": 0.6551728248596191, + "learning_rate": 0.0002, + "loss": 0.8319, + "step": 5500 + }, + { + "epoch": 1.7785668173014848, + "grad_norm": 0.5848751068115234, + "learning_rate": 0.0002, + "loss": 0.7615, + "step": 5510 + }, + { + "epoch": 1.7817947062621045, + "grad_norm": 0.6664014458656311, + "learning_rate": 0.0002, + "loss": 0.7622, + "step": 5520 + }, + { + "epoch": 1.7850225952227243, + "grad_norm": 0.5931693911552429, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 5530 + }, + { + "epoch": 1.7882504841833442, + "grad_norm": 0.5534724593162537, + "learning_rate": 0.0002, + "loss": 0.7992, + "step": 5540 + }, + { + "epoch": 1.7914783731439639, + "grad_norm": 0.5590878129005432, + "learning_rate": 0.0002, + "loss": 0.7967, + "step": 5550 + }, + { + "epoch": 1.7947062621045835, + "grad_norm": 0.6947470903396606, + "learning_rate": 0.0002, + "loss": 0.7406, + "step": 5560 + }, + { + "epoch": 1.7979341510652034, + "grad_norm": 0.6104130148887634, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 5570 + }, + { + "epoch": 1.8011620400258233, + "grad_norm": 0.6135714054107666, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 5580 + }, + { + "epoch": 1.804389928986443, + "grad_norm": 0.6626853346824646, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 5590 + }, + { + "epoch": 1.8076178179470626, + "grad_norm": 0.6977612972259521, + "learning_rate": 0.0002, + "loss": 0.7746, + "step": 5600 + }, + { + "epoch": 1.8108457069076824, + "grad_norm": 0.6275238394737244, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 5610 + }, + { + "epoch": 1.814073595868302, + "grad_norm": 0.5017505288124084, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 5620 + }, + { + "epoch": 1.817301484828922, + "grad_norm": 0.8314290642738342, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 5630 + }, + { + "epoch": 1.8205293737895416, + "grad_norm": 0.6863582134246826, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 5640 + }, + { + "epoch": 1.8237572627501613, + "grad_norm": 0.69544917345047, + "learning_rate": 0.0002, + "loss": 0.743, + "step": 5650 + }, + { + "epoch": 1.8269851517107811, + "grad_norm": 0.515499472618103, + "learning_rate": 0.0002, + "loss": 0.7277, + "step": 5660 + }, + { + "epoch": 1.830213040671401, + "grad_norm": 0.6100873947143555, + "learning_rate": 0.0002, + "loss": 0.7166, + "step": 5670 + }, + { + "epoch": 1.8334409296320207, + "grad_norm": 0.67416912317276, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 5680 + }, + { + "epoch": 1.8366688185926403, + "grad_norm": 0.7057772278785706, + "learning_rate": 0.0002, + "loss": 0.7575, + "step": 5690 + }, + { + "epoch": 1.8398967075532602, + "grad_norm": 0.7374551892280579, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 5700 + }, + { + "epoch": 1.84312459651388, + "grad_norm": 0.6266297101974487, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 5710 + }, + { + "epoch": 1.8463524854744997, + "grad_norm": 0.5629227757453918, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 5720 + }, + { + "epoch": 1.8495803744351194, + "grad_norm": 0.6603655815124512, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 5730 + }, + { + "epoch": 1.8528082633957392, + "grad_norm": 0.8113715052604675, + "learning_rate": 0.0002, + "loss": 0.7587, + "step": 5740 + }, + { + "epoch": 1.856036152356359, + "grad_norm": 0.7143914103507996, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 5750 + }, + { + "epoch": 1.8592640413169788, + "grad_norm": 0.6273732781410217, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 5760 + }, + { + "epoch": 1.8624919302775984, + "grad_norm": 0.5428690910339355, + "learning_rate": 0.0002, + "loss": 0.7962, + "step": 5770 + }, + { + "epoch": 1.865719819238218, + "grad_norm": 0.6405037641525269, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 5780 + }, + { + "epoch": 1.868947708198838, + "grad_norm": 0.700873613357544, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 5790 + }, + { + "epoch": 1.8721755971594578, + "grad_norm": 0.5645238161087036, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 5800 + }, + { + "epoch": 1.8754034861200775, + "grad_norm": 0.8780353665351868, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 5810 + }, + { + "epoch": 1.878631375080697, + "grad_norm": 0.6295409798622131, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 5820 + }, + { + "epoch": 1.881859264041317, + "grad_norm": 0.678269624710083, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 5830 + }, + { + "epoch": 1.8850871530019369, + "grad_norm": 0.6464608907699585, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 5840 + }, + { + "epoch": 1.8883150419625565, + "grad_norm": 0.6201048493385315, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 5850 + }, + { + "epoch": 1.8915429309231762, + "grad_norm": 0.6046274304389954, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 5860 + }, + { + "epoch": 1.894770819883796, + "grad_norm": 0.7532408833503723, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 5870 + }, + { + "epoch": 1.897998708844416, + "grad_norm": 0.6066767573356628, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 5880 + }, + { + "epoch": 1.9012265978050356, + "grad_norm": 0.6289830207824707, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 5890 + }, + { + "epoch": 1.9044544867656552, + "grad_norm": 0.5204319953918457, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 5900 + }, + { + "epoch": 1.9076823757262749, + "grad_norm": 0.6708219647407532, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 5910 + }, + { + "epoch": 1.9109102646868947, + "grad_norm": 0.4915677309036255, + "learning_rate": 0.0002, + "loss": 0.7455, + "step": 5920 + }, + { + "epoch": 1.9141381536475146, + "grad_norm": 0.652717113494873, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 5930 + }, + { + "epoch": 1.9173660426081343, + "grad_norm": 0.5446316003799438, + "learning_rate": 0.0002, + "loss": 0.7687, + "step": 5940 + }, + { + "epoch": 1.920593931568754, + "grad_norm": 0.4958149194717407, + "learning_rate": 0.0002, + "loss": 0.7424, + "step": 5950 + }, + { + "epoch": 1.9238218205293738, + "grad_norm": 0.5623434782028198, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 5960 + }, + { + "epoch": 1.9270497094899937, + "grad_norm": 0.6855450868606567, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 5970 + }, + { + "epoch": 1.9302775984506133, + "grad_norm": 0.5710492730140686, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 5980 + }, + { + "epoch": 1.933505487411233, + "grad_norm": 0.5379431843757629, + "learning_rate": 0.0002, + "loss": 0.7245, + "step": 5990 + }, + { + "epoch": 1.9367333763718528, + "grad_norm": 0.557129442691803, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 6000 + }, + { + "epoch": 1.9399612653324727, + "grad_norm": 0.6336663961410522, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 6010 + }, + { + "epoch": 1.9431891542930924, + "grad_norm": 0.5950582027435303, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 6020 + }, + { + "epoch": 1.946417043253712, + "grad_norm": 0.5905954837799072, + "learning_rate": 0.0002, + "loss": 0.7443, + "step": 6030 + }, + { + "epoch": 1.9496449322143317, + "grad_norm": 0.6688982844352722, + "learning_rate": 0.0002, + "loss": 0.7127, + "step": 6040 + }, + { + "epoch": 1.9528728211749515, + "grad_norm": 0.5440775752067566, + "learning_rate": 0.0002, + "loss": 0.79, + "step": 6050 + }, + { + "epoch": 1.9561007101355714, + "grad_norm": 0.6207906603813171, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 6060 + }, + { + "epoch": 1.959328599096191, + "grad_norm": 0.6999374628067017, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 6070 + }, + { + "epoch": 1.9625564880568107, + "grad_norm": 0.6310848593711853, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 6080 + }, + { + "epoch": 1.9657843770174306, + "grad_norm": 0.5903388261795044, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 6090 + }, + { + "epoch": 1.9690122659780505, + "grad_norm": 0.6333889961242676, + "learning_rate": 0.0002, + "loss": 0.7103, + "step": 6100 + }, + { + "epoch": 1.97224015493867, + "grad_norm": 0.5604711174964905, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 6110 + }, + { + "epoch": 1.9754680438992898, + "grad_norm": 0.9234541654586792, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 6120 + }, + { + "epoch": 1.9786959328599096, + "grad_norm": 0.6149102449417114, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 6130 + }, + { + "epoch": 1.9819238218205295, + "grad_norm": 0.615446150302887, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 6140 + }, + { + "epoch": 1.9851517107811492, + "grad_norm": 0.5176635980606079, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 6150 + }, + { + "epoch": 1.9883795997417688, + "grad_norm": 0.7124109864234924, + "learning_rate": 0.0002, + "loss": 0.718, + "step": 6160 + }, + { + "epoch": 1.9916074887023887, + "grad_norm": 0.6317567825317383, + "learning_rate": 0.0002, + "loss": 0.7669, + "step": 6170 + }, + { + "epoch": 1.9948353776630086, + "grad_norm": 0.6855016350746155, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 6180 + }, + { + "epoch": 1.9980632666236282, + "grad_norm": 0.6423715353012085, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 6190 + }, + { + "epoch": 2.0, + "eval_loss": 1.1096643209457397, + "eval_runtime": 147.7997, + "eval_samples_per_second": 4.959, + "eval_steps_per_second": 0.622, + "step": 6196 + }, + { + "epoch": 2.001291155584248, + "grad_norm": 0.5322932600975037, + "learning_rate": 0.0002, + "loss": 0.7131, + "step": 6200 + }, + { + "epoch": 2.0045190445448675, + "grad_norm": 0.8152306079864502, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 6210 + }, + { + "epoch": 2.0077469335054876, + "grad_norm": 0.6215983033180237, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 6220 + }, + { + "epoch": 2.0109748224661073, + "grad_norm": 0.845498263835907, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 6230 + }, + { + "epoch": 2.014202711426727, + "grad_norm": 0.733559787273407, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 6240 + }, + { + "epoch": 2.0174306003873466, + "grad_norm": 0.51433926820755, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 6250 + }, + { + "epoch": 2.020658489347966, + "grad_norm": 0.6374049782752991, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 6260 + }, + { + "epoch": 2.0238863783085863, + "grad_norm": 0.7833638191223145, + "learning_rate": 0.0002, + "loss": 0.6831, + "step": 6270 + }, + { + "epoch": 2.027114267269206, + "grad_norm": 0.8929463028907776, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 6280 + }, + { + "epoch": 2.0303421562298256, + "grad_norm": 0.669731855392456, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 6290 + }, + { + "epoch": 2.0335700451904453, + "grad_norm": 0.5846071243286133, + "learning_rate": 0.0002, + "loss": 0.646, + "step": 6300 + }, + { + "epoch": 2.0367979341510654, + "grad_norm": 0.7087787985801697, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 6310 + }, + { + "epoch": 2.040025823111685, + "grad_norm": 0.6739160418510437, + "learning_rate": 0.0002, + "loss": 0.6433, + "step": 6320 + }, + { + "epoch": 2.0432537120723047, + "grad_norm": 0.4860886335372925, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 6330 + }, + { + "epoch": 2.0464816010329243, + "grad_norm": 0.7201244831085205, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 6340 + }, + { + "epoch": 2.0497094899935444, + "grad_norm": 0.7409170269966125, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 6350 + }, + { + "epoch": 2.052937378954164, + "grad_norm": 0.6843920350074768, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 6360 + }, + { + "epoch": 2.0561652679147837, + "grad_norm": 0.7519999742507935, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 6370 + }, + { + "epoch": 2.0593931568754034, + "grad_norm": 0.5732819437980652, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 6380 + }, + { + "epoch": 2.062621045836023, + "grad_norm": 0.7565118074417114, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 6390 + }, + { + "epoch": 2.065848934796643, + "grad_norm": 0.8147150278091431, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 6400 + }, + { + "epoch": 2.0690768237572628, + "grad_norm": 0.6941924691200256, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 6410 + }, + { + "epoch": 2.0723047127178824, + "grad_norm": 0.6549784541130066, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 6420 + }, + { + "epoch": 2.075532601678502, + "grad_norm": 0.7224905490875244, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 6430 + }, + { + "epoch": 2.078760490639122, + "grad_norm": 0.7754863500595093, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 6440 + }, + { + "epoch": 2.081988379599742, + "grad_norm": 0.691318154335022, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 6450 + }, + { + "epoch": 2.0852162685603615, + "grad_norm": 0.6009294986724854, + "learning_rate": 0.0002, + "loss": 0.6233, + "step": 6460 + }, + { + "epoch": 2.088444157520981, + "grad_norm": 0.6753945350646973, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 6470 + }, + { + "epoch": 2.091672046481601, + "grad_norm": 0.6899921298027039, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 6480 + }, + { + "epoch": 2.094899935442221, + "grad_norm": 0.846510648727417, + "learning_rate": 0.0002, + "loss": 0.6918, + "step": 6490 + }, + { + "epoch": 2.0981278244028405, + "grad_norm": 0.6432605981826782, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 6500 + }, + { + "epoch": 2.10135571336346, + "grad_norm": 0.8125239014625549, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 6510 + }, + { + "epoch": 2.1045836023240803, + "grad_norm": 0.628302812576294, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 6520 + }, + { + "epoch": 2.1078114912847, + "grad_norm": 0.7164334654808044, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 6530 + }, + { + "epoch": 2.1110393802453196, + "grad_norm": 0.7476949095726013, + "learning_rate": 0.0002, + "loss": 0.6578, + "step": 6540 + }, + { + "epoch": 2.114267269205939, + "grad_norm": 0.7577515840530396, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 6550 + }, + { + "epoch": 2.117495158166559, + "grad_norm": 0.5684467554092407, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 6560 + }, + { + "epoch": 2.120723047127179, + "grad_norm": 0.6121789216995239, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 6570 + }, + { + "epoch": 2.1239509360877986, + "grad_norm": 0.6095348596572876, + "learning_rate": 0.0002, + "loss": 0.6314, + "step": 6580 + }, + { + "epoch": 2.1271788250484183, + "grad_norm": 0.7803651690483093, + "learning_rate": 0.0002, + "loss": 0.6276, + "step": 6590 + }, + { + "epoch": 2.130406714009038, + "grad_norm": 0.5990583300590515, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 6600 + }, + { + "epoch": 2.133634602969658, + "grad_norm": 0.6569220423698425, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 6610 + }, + { + "epoch": 2.1368624919302777, + "grad_norm": 0.5961166620254517, + "learning_rate": 0.0002, + "loss": 0.7049, + "step": 6620 + }, + { + "epoch": 2.1400903808908973, + "grad_norm": 0.5860554575920105, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 6630 + }, + { + "epoch": 2.143318269851517, + "grad_norm": 0.5994001626968384, + "learning_rate": 0.0002, + "loss": 0.6651, + "step": 6640 + }, + { + "epoch": 2.146546158812137, + "grad_norm": 0.7723015546798706, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 6650 + }, + { + "epoch": 2.1497740477727567, + "grad_norm": 0.676355242729187, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 6660 + }, + { + "epoch": 2.1530019367333764, + "grad_norm": 0.5689092874526978, + "learning_rate": 0.0002, + "loss": 0.6826, + "step": 6670 + }, + { + "epoch": 2.156229825693996, + "grad_norm": 0.6933727264404297, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 6680 + }, + { + "epoch": 2.159457714654616, + "grad_norm": 0.8380527496337891, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 6690 + }, + { + "epoch": 2.1626856036152358, + "grad_norm": 0.6876497268676758, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 6700 + }, + { + "epoch": 2.1659134925758554, + "grad_norm": 0.6418334245681763, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 6710 + }, + { + "epoch": 2.169141381536475, + "grad_norm": 0.7169192433357239, + "learning_rate": 0.0002, + "loss": 0.6357, + "step": 6720 + }, + { + "epoch": 2.1723692704970947, + "grad_norm": 0.6664170622825623, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 6730 + }, + { + "epoch": 2.175597159457715, + "grad_norm": 0.6011993288993835, + "learning_rate": 0.0002, + "loss": 0.6751, + "step": 6740 + }, + { + "epoch": 2.1788250484183345, + "grad_norm": 0.5529947280883789, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 6750 + }, + { + "epoch": 2.182052937378954, + "grad_norm": 0.6879532933235168, + "learning_rate": 0.0002, + "loss": 0.671, + "step": 6760 + }, + { + "epoch": 2.1852808263395738, + "grad_norm": 0.6426113843917847, + "learning_rate": 0.0002, + "loss": 0.6634, + "step": 6770 + }, + { + "epoch": 2.188508715300194, + "grad_norm": 0.6571047306060791, + "learning_rate": 0.0002, + "loss": 0.6592, + "step": 6780 + }, + { + "epoch": 2.1917366042608135, + "grad_norm": 0.6400564908981323, + "learning_rate": 0.0002, + "loss": 0.6494, + "step": 6790 + }, + { + "epoch": 2.194964493221433, + "grad_norm": 0.6509664058685303, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 6800 + }, + { + "epoch": 2.198192382182053, + "grad_norm": 0.6673197150230408, + "learning_rate": 0.0002, + "loss": 0.6771, + "step": 6810 + }, + { + "epoch": 2.2014202711426725, + "grad_norm": 0.48205727338790894, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 6820 + }, + { + "epoch": 2.2046481601032926, + "grad_norm": 0.849525511264801, + "learning_rate": 0.0002, + "loss": 0.6894, + "step": 6830 + }, + { + "epoch": 2.207876049063912, + "grad_norm": 0.6150892376899719, + "learning_rate": 0.0002, + "loss": 0.6977, + "step": 6840 + }, + { + "epoch": 2.211103938024532, + "grad_norm": 0.7826945781707764, + "learning_rate": 0.0002, + "loss": 0.6843, + "step": 6850 + }, + { + "epoch": 2.2143318269851515, + "grad_norm": 0.5711963772773743, + "learning_rate": 0.0002, + "loss": 0.6338, + "step": 6860 + }, + { + "epoch": 2.2175597159457716, + "grad_norm": 0.6017758846282959, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 6870 + }, + { + "epoch": 2.2207876049063913, + "grad_norm": 0.785434901714325, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 6880 + }, + { + "epoch": 2.224015493867011, + "grad_norm": 0.6251688599586487, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 6890 + }, + { + "epoch": 2.2272433828276306, + "grad_norm": 0.8242034316062927, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 6900 + }, + { + "epoch": 2.2304712717882507, + "grad_norm": 0.7272933125495911, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 6910 + }, + { + "epoch": 2.2336991607488703, + "grad_norm": 0.7159379720687866, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 6920 + }, + { + "epoch": 2.23692704970949, + "grad_norm": 0.6518042087554932, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 6930 + }, + { + "epoch": 2.2401549386701096, + "grad_norm": 0.7365370392799377, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 6940 + }, + { + "epoch": 2.2433828276307297, + "grad_norm": 0.5674061179161072, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 6950 + }, + { + "epoch": 2.2466107165913494, + "grad_norm": 0.669185996055603, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 6960 + }, + { + "epoch": 2.249838605551969, + "grad_norm": 0.6638304591178894, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 6970 + }, + { + "epoch": 2.2530664945125887, + "grad_norm": 0.757006824016571, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 6980 + }, + { + "epoch": 2.2562943834732083, + "grad_norm": 0.7574930787086487, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 6990 + }, + { + "epoch": 2.2595222724338284, + "grad_norm": 0.7819514870643616, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 7000 + }, + { + "epoch": 2.262750161394448, + "grad_norm": 0.6987583041191101, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 7010 + }, + { + "epoch": 2.2659780503550677, + "grad_norm": 0.6628551483154297, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 7020 + }, + { + "epoch": 2.2692059393156874, + "grad_norm": 0.7855866551399231, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 7030 + }, + { + "epoch": 2.2724338282763075, + "grad_norm": 0.6102892756462097, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 7040 + }, + { + "epoch": 2.275661717236927, + "grad_norm": 0.7844198942184448, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 7050 + }, + { + "epoch": 2.2788896061975468, + "grad_norm": 0.6209492087364197, + "learning_rate": 0.0002, + "loss": 0.63, + "step": 7060 + }, + { + "epoch": 2.2821174951581664, + "grad_norm": 0.8351290225982666, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 7070 + }, + { + "epoch": 2.285345384118786, + "grad_norm": 0.6883546710014343, + "learning_rate": 0.0002, + "loss": 0.6648, + "step": 7080 + }, + { + "epoch": 2.288573273079406, + "grad_norm": 0.6626381874084473, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 7090 + }, + { + "epoch": 2.291801162040026, + "grad_norm": 0.7216270565986633, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 7100 + }, + { + "epoch": 2.2950290510006455, + "grad_norm": 0.8246777057647705, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 7110 + }, + { + "epoch": 2.2982569399612656, + "grad_norm": 0.614326000213623, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 7120 + }, + { + "epoch": 2.301484828921885, + "grad_norm": 0.8785578012466431, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 7130 + }, + { + "epoch": 2.304712717882505, + "grad_norm": 0.7021808624267578, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 7140 + }, + { + "epoch": 2.3079406068431245, + "grad_norm": 0.6999403238296509, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 7150 + }, + { + "epoch": 2.311168495803744, + "grad_norm": 0.8013143539428711, + "learning_rate": 0.0002, + "loss": 0.6547, + "step": 7160 + }, + { + "epoch": 2.3143963847643643, + "grad_norm": 0.6592583060264587, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 7170 + }, + { + "epoch": 2.317624273724984, + "grad_norm": 0.6260249018669128, + "learning_rate": 0.0002, + "loss": 0.6369, + "step": 7180 + }, + { + "epoch": 2.3208521626856036, + "grad_norm": 0.9352797269821167, + "learning_rate": 0.0002, + "loss": 0.6647, + "step": 7190 + }, + { + "epoch": 2.324080051646223, + "grad_norm": 0.6629612445831299, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 7200 + }, + { + "epoch": 2.3273079406068433, + "grad_norm": 0.7062810063362122, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 7210 + }, + { + "epoch": 2.330535829567463, + "grad_norm": 0.7236241102218628, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 7220 + }, + { + "epoch": 2.3337637185280826, + "grad_norm": 0.7528148293495178, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 7230 + }, + { + "epoch": 2.3369916074887023, + "grad_norm": 0.7604748606681824, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 7240 + }, + { + "epoch": 2.340219496449322, + "grad_norm": 0.5601189136505127, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 7250 + }, + { + "epoch": 2.343447385409942, + "grad_norm": 0.7099230885505676, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 7260 + }, + { + "epoch": 2.3466752743705617, + "grad_norm": 0.6699047684669495, + "learning_rate": 0.0002, + "loss": 0.6333, + "step": 7270 + }, + { + "epoch": 2.3499031633311813, + "grad_norm": 0.7315047979354858, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 7280 + }, + { + "epoch": 2.353131052291801, + "grad_norm": 0.632836103439331, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 7290 + }, + { + "epoch": 2.356358941252421, + "grad_norm": 0.9410115480422974, + "learning_rate": 0.0002, + "loss": 0.6458, + "step": 7300 + }, + { + "epoch": 2.3595868302130407, + "grad_norm": 0.626554012298584, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 7310 + }, + { + "epoch": 2.3628147191736604, + "grad_norm": 0.7538444399833679, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 7320 + }, + { + "epoch": 2.36604260813428, + "grad_norm": 0.6826626062393188, + "learning_rate": 0.0002, + "loss": 0.6321, + "step": 7330 + }, + { + "epoch": 2.3692704970949, + "grad_norm": 0.6739391088485718, + "learning_rate": 0.0002, + "loss": 0.6752, + "step": 7340 + }, + { + "epoch": 2.3724983860555198, + "grad_norm": 0.7518446445465088, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 7350 + }, + { + "epoch": 2.3757262750161394, + "grad_norm": 0.714133083820343, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 7360 + }, + { + "epoch": 2.378954163976759, + "grad_norm": 0.7144588232040405, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 7370 + }, + { + "epoch": 2.382182052937379, + "grad_norm": 0.6598120927810669, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 7380 + }, + { + "epoch": 2.385409941897999, + "grad_norm": 0.7079148292541504, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 7390 + }, + { + "epoch": 2.3886378308586185, + "grad_norm": 0.6750902533531189, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 7400 + }, + { + "epoch": 2.391865719819238, + "grad_norm": 0.7181967496871948, + "learning_rate": 0.0002, + "loss": 0.6398, + "step": 7410 + }, + { + "epoch": 2.3950936087798578, + "grad_norm": 0.7720552086830139, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 7420 + }, + { + "epoch": 2.398321497740478, + "grad_norm": 0.7592426538467407, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 7430 + }, + { + "epoch": 2.4015493867010975, + "grad_norm": 0.7161896824836731, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 7440 + }, + { + "epoch": 2.404777275661717, + "grad_norm": 0.8019260764122009, + "learning_rate": 0.0002, + "loss": 0.6891, + "step": 7450 + }, + { + "epoch": 2.408005164622337, + "grad_norm": 0.7093342542648315, + "learning_rate": 0.0002, + "loss": 0.6864, + "step": 7460 + }, + { + "epoch": 2.411233053582957, + "grad_norm": 0.8464207649230957, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 7470 + }, + { + "epoch": 2.4144609425435766, + "grad_norm": 0.773666501045227, + "learning_rate": 0.0002, + "loss": 0.6724, + "step": 7480 + }, + { + "epoch": 2.4176888315041962, + "grad_norm": 0.8451611995697021, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 7490 + }, + { + "epoch": 2.420916720464816, + "grad_norm": 0.656795084476471, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 7500 + }, + { + "epoch": 2.4241446094254355, + "grad_norm": 0.7129034996032715, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 7510 + }, + { + "epoch": 2.4273724983860556, + "grad_norm": 0.8325763940811157, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 7520 + }, + { + "epoch": 2.4306003873466753, + "grad_norm": 0.7806527614593506, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 7530 + }, + { + "epoch": 2.433828276307295, + "grad_norm": 0.6994536519050598, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 7540 + }, + { + "epoch": 2.437056165267915, + "grad_norm": 0.6898999214172363, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 7550 + }, + { + "epoch": 2.4402840542285347, + "grad_norm": 0.719490647315979, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 7560 + }, + { + "epoch": 2.4435119431891543, + "grad_norm": 0.6841562390327454, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 7570 + }, + { + "epoch": 2.446739832149774, + "grad_norm": 0.7573311924934387, + "learning_rate": 0.0002, + "loss": 0.6504, + "step": 7580 + }, + { + "epoch": 2.4499677211103936, + "grad_norm": 0.7295880317687988, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 7590 + }, + { + "epoch": 2.4531956100710137, + "grad_norm": 0.710136353969574, + "learning_rate": 0.0002, + "loss": 0.6593, + "step": 7600 + }, + { + "epoch": 2.4564234990316334, + "grad_norm": 0.6126235127449036, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 7610 + }, + { + "epoch": 2.459651387992253, + "grad_norm": 0.8025609850883484, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 7620 + }, + { + "epoch": 2.4628792769528727, + "grad_norm": 0.7839472889900208, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 7630 + }, + { + "epoch": 2.4661071659134928, + "grad_norm": 0.7253499031066895, + "learning_rate": 0.0002, + "loss": 0.6797, + "step": 7640 + }, + { + "epoch": 2.4693350548741124, + "grad_norm": 0.7918946743011475, + "learning_rate": 0.0002, + "loss": 0.7341, + "step": 7650 + }, + { + "epoch": 2.472562943834732, + "grad_norm": 0.7930178046226501, + "learning_rate": 0.0002, + "loss": 0.6646, + "step": 7660 + }, + { + "epoch": 2.4757908327953517, + "grad_norm": 0.6826170086860657, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 7670 + }, + { + "epoch": 2.4790187217559714, + "grad_norm": 0.6576805114746094, + "learning_rate": 0.0002, + "loss": 0.6697, + "step": 7680 + }, + { + "epoch": 2.4822466107165915, + "grad_norm": 0.7012448310852051, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 7690 + }, + { + "epoch": 2.485474499677211, + "grad_norm": 0.7774284482002258, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 7700 + }, + { + "epoch": 2.4887023886378308, + "grad_norm": 0.6502766013145447, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 7710 + }, + { + "epoch": 2.4919302775984504, + "grad_norm": 0.7638739347457886, + "learning_rate": 0.0002, + "loss": 0.6965, + "step": 7720 + }, + { + "epoch": 2.4951581665590705, + "grad_norm": 0.6217384338378906, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 7730 + }, + { + "epoch": 2.49838605551969, + "grad_norm": 0.7576302886009216, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 7740 + }, + { + "epoch": 2.50161394448031, + "grad_norm": 0.6877137422561646, + "learning_rate": 0.0002, + "loss": 0.6855, + "step": 7750 + }, + { + "epoch": 2.5048418334409295, + "grad_norm": 0.6998329162597656, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 7760 + }, + { + "epoch": 2.508069722401549, + "grad_norm": 0.7879213690757751, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 7770 + }, + { + "epoch": 2.5112976113621692, + "grad_norm": 0.7834980487823486, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 7780 + }, + { + "epoch": 2.514525500322789, + "grad_norm": 0.7789630889892578, + "learning_rate": 0.0002, + "loss": 0.6954, + "step": 7790 + }, + { + "epoch": 2.5177533892834085, + "grad_norm": 0.7403590083122253, + "learning_rate": 0.0002, + "loss": 0.6979, + "step": 7800 + }, + { + "epoch": 2.5209812782440286, + "grad_norm": 0.6029766201972961, + "learning_rate": 0.0002, + "loss": 0.6964, + "step": 7810 + }, + { + "epoch": 2.5242091672046483, + "grad_norm": 0.7061092257499695, + "learning_rate": 0.0002, + "loss": 0.6887, + "step": 7820 + }, + { + "epoch": 2.527437056165268, + "grad_norm": 0.7120763659477234, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 7830 + }, + { + "epoch": 2.5306649451258876, + "grad_norm": 0.6173675656318665, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 7840 + }, + { + "epoch": 2.5338928340865072, + "grad_norm": 0.9566813111305237, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 7850 + }, + { + "epoch": 2.5371207230471273, + "grad_norm": 0.8497620224952698, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 7860 + }, + { + "epoch": 2.540348612007747, + "grad_norm": 0.7663498520851135, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 7870 + }, + { + "epoch": 2.5435765009683666, + "grad_norm": 0.6329668760299683, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 7880 + }, + { + "epoch": 2.5468043899289863, + "grad_norm": 0.8128195405006409, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 7890 + }, + { + "epoch": 2.5500322788896064, + "grad_norm": 0.6622284650802612, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 7900 + }, + { + "epoch": 2.553260167850226, + "grad_norm": 0.8460057973861694, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 7910 + }, + { + "epoch": 2.5564880568108457, + "grad_norm": 0.6586956977844238, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 7920 + }, + { + "epoch": 2.5597159457714653, + "grad_norm": 0.7569382190704346, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 7930 + }, + { + "epoch": 2.562943834732085, + "grad_norm": 0.6409714221954346, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 7940 + }, + { + "epoch": 2.566171723692705, + "grad_norm": 0.7031713128089905, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 7950 + }, + { + "epoch": 2.5693996126533247, + "grad_norm": 0.7983605265617371, + "learning_rate": 0.0002, + "loss": 0.6344, + "step": 7960 + }, + { + "epoch": 2.5726275016139444, + "grad_norm": 0.7165433168411255, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 7970 + }, + { + "epoch": 2.5758553905745645, + "grad_norm": 0.6630598902702332, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 7980 + }, + { + "epoch": 2.579083279535184, + "grad_norm": 0.5883122086524963, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 7990 + }, + { + "epoch": 2.5823111684958038, + "grad_norm": 0.5928755402565002, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 8000 + }, + { + "epoch": 2.5855390574564234, + "grad_norm": 0.7843712568283081, + "learning_rate": 0.0002, + "loss": 0.6701, + "step": 8010 + }, + { + "epoch": 2.588766946417043, + "grad_norm": 0.7206324338912964, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 8020 + }, + { + "epoch": 2.5919948353776627, + "grad_norm": 0.812480092048645, + "learning_rate": 0.0002, + "loss": 0.6968, + "step": 8030 + }, + { + "epoch": 2.595222724338283, + "grad_norm": 0.9843078255653381, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 8040 + }, + { + "epoch": 2.5984506132989025, + "grad_norm": 0.7524392604827881, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 8050 + }, + { + "epoch": 2.601678502259522, + "grad_norm": 0.6220380067825317, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 8060 + }, + { + "epoch": 2.6049063912201422, + "grad_norm": 0.7461398243904114, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 8070 + }, + { + "epoch": 2.608134280180762, + "grad_norm": 0.720974326133728, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 8080 + }, + { + "epoch": 2.6113621691413815, + "grad_norm": 0.649509847164154, + "learning_rate": 0.0002, + "loss": 0.6756, + "step": 8090 + }, + { + "epoch": 2.614590058102001, + "grad_norm": 0.6894662976264954, + "learning_rate": 0.0002, + "loss": 0.6394, + "step": 8100 + }, + { + "epoch": 2.617817947062621, + "grad_norm": 0.734433114528656, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 8110 + }, + { + "epoch": 2.621045836023241, + "grad_norm": 0.7468628883361816, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 8120 + }, + { + "epoch": 2.6242737249838606, + "grad_norm": 0.6508180499076843, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 8130 + }, + { + "epoch": 2.6275016139444802, + "grad_norm": 0.8735209107398987, + "learning_rate": 0.0002, + "loss": 0.6619, + "step": 8140 + }, + { + "epoch": 2.6307295029051003, + "grad_norm": 0.8162857294082642, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 8150 + }, + { + "epoch": 2.63395739186572, + "grad_norm": 0.628872811794281, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 8160 + }, + { + "epoch": 2.6371852808263396, + "grad_norm": 0.8078708052635193, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 8170 + }, + { + "epoch": 2.6404131697869593, + "grad_norm": 0.7849429845809937, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 8180 + }, + { + "epoch": 2.643641058747579, + "grad_norm": 0.8115387558937073, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 8190 + }, + { + "epoch": 2.6468689477081986, + "grad_norm": 0.7462222576141357, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 8200 + }, + { + "epoch": 2.6500968366688187, + "grad_norm": 0.753662645816803, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 8210 + }, + { + "epoch": 2.6533247256294383, + "grad_norm": 0.6100404858589172, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 8220 + }, + { + "epoch": 2.656552614590058, + "grad_norm": 0.9084606766700745, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 8230 + }, + { + "epoch": 2.659780503550678, + "grad_norm": 0.6412538886070251, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 8240 + }, + { + "epoch": 2.6630083925112977, + "grad_norm": 0.7640451192855835, + "learning_rate": 0.0002, + "loss": 0.7048, + "step": 8250 + }, + { + "epoch": 2.6662362814719174, + "grad_norm": 0.5972344875335693, + "learning_rate": 0.0002, + "loss": 0.6846, + "step": 8260 + }, + { + "epoch": 2.669464170432537, + "grad_norm": 0.6935883164405823, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 8270 + }, + { + "epoch": 2.6726920593931567, + "grad_norm": 0.789399266242981, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 8280 + }, + { + "epoch": 2.675919948353777, + "grad_norm": 0.7143490314483643, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 8290 + }, + { + "epoch": 2.6791478373143964, + "grad_norm": 0.6670652627944946, + "learning_rate": 0.0002, + "loss": 0.6741, + "step": 8300 + }, + { + "epoch": 2.682375726275016, + "grad_norm": 0.687108039855957, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 8310 + }, + { + "epoch": 2.6856036152356357, + "grad_norm": 0.7914147973060608, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 8320 + }, + { + "epoch": 2.688831504196256, + "grad_norm": 0.8398420214653015, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 8330 + }, + { + "epoch": 2.6920593931568755, + "grad_norm": 0.6592720746994019, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 8340 + }, + { + "epoch": 2.695287282117495, + "grad_norm": 0.6888470649719238, + "learning_rate": 0.0002, + "loss": 0.6673, + "step": 8350 + }, + { + "epoch": 2.698515171078115, + "grad_norm": 0.7127556800842285, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 8360 + }, + { + "epoch": 2.7017430600387344, + "grad_norm": 0.6630286574363708, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 8370 + }, + { + "epoch": 2.7049709489993545, + "grad_norm": 0.8261964321136475, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 8380 + }, + { + "epoch": 2.708198837959974, + "grad_norm": 0.717339813709259, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 8390 + }, + { + "epoch": 2.711426726920594, + "grad_norm": 0.651637613773346, + "learning_rate": 0.0002, + "loss": 0.6929, + "step": 8400 + }, + { + "epoch": 2.714654615881214, + "grad_norm": 0.7936098575592041, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 8410 + }, + { + "epoch": 2.7178825048418336, + "grad_norm": 0.8761560320854187, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 8420 + }, + { + "epoch": 2.7211103938024532, + "grad_norm": 0.6768006086349487, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 8430 + }, + { + "epoch": 2.724338282763073, + "grad_norm": 0.7121055722236633, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 8440 + }, + { + "epoch": 2.7275661717236925, + "grad_norm": 0.6811696887016296, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 8450 + }, + { + "epoch": 2.730794060684312, + "grad_norm": 0.8168250918388367, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 8460 + }, + { + "epoch": 2.7340219496449323, + "grad_norm": 0.660682737827301, + "learning_rate": 0.0002, + "loss": 0.6809, + "step": 8470 + }, + { + "epoch": 2.737249838605552, + "grad_norm": 0.7369356155395508, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 8480 + }, + { + "epoch": 2.7404777275661716, + "grad_norm": 0.7545099854469299, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 8490 + }, + { + "epoch": 2.7437056165267917, + "grad_norm": 0.6991257667541504, + "learning_rate": 0.0002, + "loss": 0.6917, + "step": 8500 + }, + { + "epoch": 2.7469335054874113, + "grad_norm": 0.7195324301719666, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 8510 + }, + { + "epoch": 2.750161394448031, + "grad_norm": 0.8995378017425537, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 8520 + }, + { + "epoch": 2.7533892834086506, + "grad_norm": 0.6924123764038086, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 8530 + }, + { + "epoch": 2.7566171723692703, + "grad_norm": 0.6260585784912109, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 8540 + }, + { + "epoch": 2.7598450613298904, + "grad_norm": 0.7273091673851013, + "learning_rate": 0.0002, + "loss": 0.6613, + "step": 8550 + }, + { + "epoch": 2.76307295029051, + "grad_norm": 0.720562219619751, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 8560 + }, + { + "epoch": 2.7663008392511297, + "grad_norm": 0.6360004544258118, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 8570 + }, + { + "epoch": 2.76952872821175, + "grad_norm": 0.7634525895118713, + "learning_rate": 0.0002, + "loss": 0.6118, + "step": 8580 + }, + { + "epoch": 2.7727566171723694, + "grad_norm": 0.6586076021194458, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 8590 + }, + { + "epoch": 2.775984506132989, + "grad_norm": 0.6542639136314392, + "learning_rate": 0.0002, + "loss": 0.7072, + "step": 8600 + }, + { + "epoch": 2.7792123950936087, + "grad_norm": 0.7650290727615356, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 8610 + }, + { + "epoch": 2.7824402840542284, + "grad_norm": 0.6551542282104492, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 8620 + }, + { + "epoch": 2.785668173014848, + "grad_norm": 0.6915501952171326, + "learning_rate": 0.0002, + "loss": 0.6937, + "step": 8630 + }, + { + "epoch": 2.788896061975468, + "grad_norm": 0.8061493635177612, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 8640 + }, + { + "epoch": 2.792123950936088, + "grad_norm": 0.8403584957122803, + "learning_rate": 0.0002, + "loss": 0.6853, + "step": 8650 + }, + { + "epoch": 2.7953518398967074, + "grad_norm": 0.6455532312393188, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 8660 + }, + { + "epoch": 2.7985797288573275, + "grad_norm": 0.8296352028846741, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 8670 + }, + { + "epoch": 2.801807617817947, + "grad_norm": 0.7288752794265747, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 8680 + }, + { + "epoch": 2.805035506778567, + "grad_norm": 0.7628464102745056, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 8690 + }, + { + "epoch": 2.8082633957391865, + "grad_norm": 0.9993878602981567, + "learning_rate": 0.0002, + "loss": 0.7176, + "step": 8700 + }, + { + "epoch": 2.811491284699806, + "grad_norm": 0.6972465515136719, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 8710 + }, + { + "epoch": 2.8147191736604262, + "grad_norm": 0.645042896270752, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 8720 + }, + { + "epoch": 2.817947062621046, + "grad_norm": 0.6853853464126587, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 8730 + }, + { + "epoch": 2.8211749515816655, + "grad_norm": 0.5935067534446716, + "learning_rate": 0.0002, + "loss": 0.6405, + "step": 8740 + }, + { + "epoch": 2.824402840542285, + "grad_norm": 0.7336633205413818, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 8750 + }, + { + "epoch": 2.8276307295029053, + "grad_norm": 0.7074962854385376, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 8760 + }, + { + "epoch": 2.830858618463525, + "grad_norm": 0.6667559742927551, + "learning_rate": 0.0002, + "loss": 0.6744, + "step": 8770 + }, + { + "epoch": 2.8340865074241446, + "grad_norm": 0.8101205229759216, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 8780 + }, + { + "epoch": 2.8373143963847642, + "grad_norm": 0.8841480016708374, + "learning_rate": 0.0002, + "loss": 0.6727, + "step": 8790 + }, + { + "epoch": 2.840542285345384, + "grad_norm": 0.5891591310501099, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 8800 + }, + { + "epoch": 2.843770174306004, + "grad_norm": 0.667032778263092, + "learning_rate": 0.0002, + "loss": 0.7114, + "step": 8810 + }, + { + "epoch": 2.8469980632666236, + "grad_norm": 0.7629773020744324, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 8820 + }, + { + "epoch": 2.8502259522272433, + "grad_norm": 0.79471355676651, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 8830 + }, + { + "epoch": 2.8534538411878634, + "grad_norm": 0.7529178261756897, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 8840 + }, + { + "epoch": 2.856681730148483, + "grad_norm": 0.7014923691749573, + "learning_rate": 0.0002, + "loss": 0.7163, + "step": 8850 + }, + { + "epoch": 2.8599096191091027, + "grad_norm": 0.7996514439582825, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 8860 + }, + { + "epoch": 2.8631375080697223, + "grad_norm": 0.7044785618782043, + "learning_rate": 0.0002, + "loss": 0.6562, + "step": 8870 + }, + { + "epoch": 2.866365397030342, + "grad_norm": 0.6792093515396118, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 8880 + }, + { + "epoch": 2.8695932859909616, + "grad_norm": 0.69175124168396, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 8890 + }, + { + "epoch": 2.8728211749515817, + "grad_norm": 0.7499129176139832, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 8900 + }, + { + "epoch": 2.8760490639122014, + "grad_norm": 0.7678789496421814, + "learning_rate": 0.0002, + "loss": 0.6922, + "step": 8910 + }, + { + "epoch": 2.879276952872821, + "grad_norm": 0.7478128671646118, + "learning_rate": 0.0002, + "loss": 0.6803, + "step": 8920 + }, + { + "epoch": 2.882504841833441, + "grad_norm": 0.6767086386680603, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 8930 + }, + { + "epoch": 2.885732730794061, + "grad_norm": 0.7222196459770203, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 8940 + }, + { + "epoch": 2.8889606197546804, + "grad_norm": 0.6950580477714539, + "learning_rate": 0.0002, + "loss": 0.6472, + "step": 8950 + }, + { + "epoch": 2.8921885087153, + "grad_norm": 0.7759528160095215, + "learning_rate": 0.0002, + "loss": 0.7064, + "step": 8960 + }, + { + "epoch": 2.8954163976759197, + "grad_norm": 0.6686919927597046, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 8970 + }, + { + "epoch": 2.89864428663654, + "grad_norm": 0.9245954751968384, + "learning_rate": 0.0002, + "loss": 0.6801, + "step": 8980 + }, + { + "epoch": 2.9018721755971595, + "grad_norm": 0.8734814524650574, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 8990 + }, + { + "epoch": 2.905100064557779, + "grad_norm": 0.6056219339370728, + "learning_rate": 0.0002, + "loss": 0.6716, + "step": 9000 + }, + { + "epoch": 2.9083279535183992, + "grad_norm": 0.7364102005958557, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 9010 + }, + { + "epoch": 2.911555842479019, + "grad_norm": 0.6563605070114136, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 9020 + }, + { + "epoch": 2.9147837314396385, + "grad_norm": 0.659978985786438, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 9030 + }, + { + "epoch": 2.918011620400258, + "grad_norm": 0.8176041841506958, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 9040 + }, + { + "epoch": 2.921239509360878, + "grad_norm": 0.743677020072937, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 9050 + }, + { + "epoch": 2.9244673983214975, + "grad_norm": 0.7418383359909058, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 9060 + }, + { + "epoch": 2.9276952872821176, + "grad_norm": 0.6916524767875671, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 9070 + }, + { + "epoch": 2.9309231762427372, + "grad_norm": 0.6559975743293762, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 9080 + }, + { + "epoch": 2.934151065203357, + "grad_norm": 0.7431221008300781, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 9090 + }, + { + "epoch": 2.937378954163977, + "grad_norm": 0.7525941133499146, + "learning_rate": 0.0002, + "loss": 0.6829, + "step": 9100 + }, + { + "epoch": 2.9406068431245966, + "grad_norm": 0.6860167384147644, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 9110 + }, + { + "epoch": 2.9438347320852163, + "grad_norm": 0.6467666029930115, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 9120 + }, + { + "epoch": 2.947062621045836, + "grad_norm": 0.7595751285552979, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 9130 + }, + { + "epoch": 2.9502905100064556, + "grad_norm": 0.6558279991149902, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 9140 + }, + { + "epoch": 2.9535183989670757, + "grad_norm": 0.6818708181381226, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 9150 + }, + { + "epoch": 2.9567462879276953, + "grad_norm": 0.8387085795402527, + "learning_rate": 0.0002, + "loss": 0.6921, + "step": 9160 + }, + { + "epoch": 2.959974176888315, + "grad_norm": 0.7705109715461731, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 9170 + }, + { + "epoch": 2.9632020658489346, + "grad_norm": 0.688106894493103, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 9180 + }, + { + "epoch": 2.9664299548095547, + "grad_norm": 0.659532368183136, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 9190 + }, + { + "epoch": 2.9696578437701744, + "grad_norm": 0.6839388608932495, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 9200 + }, + { + "epoch": 2.972885732730794, + "grad_norm": 0.6927599310874939, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 9210 + }, + { + "epoch": 2.9761136216914137, + "grad_norm": 0.6902472972869873, + "learning_rate": 0.0002, + "loss": 0.7338, + "step": 9220 + }, + { + "epoch": 2.9793415106520333, + "grad_norm": 0.620399534702301, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 9230 + }, + { + "epoch": 2.9825693996126534, + "grad_norm": 0.6812364459037781, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 9240 + }, + { + "epoch": 2.985797288573273, + "grad_norm": 0.7681456208229065, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 9250 + }, + { + "epoch": 2.9890251775338927, + "grad_norm": 0.7621907591819763, + "learning_rate": 0.0002, + "loss": 0.7113, + "step": 9260 + }, + { + "epoch": 2.992253066494513, + "grad_norm": 0.6075740456581116, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 9270 + }, + { + "epoch": 2.9954809554551325, + "grad_norm": 0.7100434899330139, + "learning_rate": 0.0002, + "loss": 0.6758, + "step": 9280 + }, + { + "epoch": 2.998708844415752, + "grad_norm": 0.7314488887786865, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 9290 + }, + { + "epoch": 3.0, + "eval_loss": 1.1434104442596436, + "eval_runtime": 166.3732, + "eval_samples_per_second": 4.406, + "eval_steps_per_second": 0.553, + "step": 9294 + } + ], + "logging_steps": 10, + "max_steps": 24784, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.301055098911457e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f46f2b8e8752b125339f36f172c3878be4cdb152 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-9294/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfc2a69e44a51edf5586ebed4b7ee915a23244c18c1f59e580471e4c9becfa98 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..f46f2b8e8752b125339f36f172c3878be4cdb152 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfc2a69e44a51edf5586ebed4b7ee915a23244c18c1f59e580471e4c9becfa98 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..53d6916d261d15d76df88b13d0bb08c9467cfd31 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 1.0, "step": 3098, "epoch_duration": 10469.675919294357, "total_accumulated_duration": 10469.675919294357, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.593, "grad_norm": 0.7092075347900391, "learning_rate": 0.0002, "epoch": 0.0032278889606197547, "step": 10}, {"loss": 1.0956, "grad_norm": 0.6900479793548584, "learning_rate": 0.0002, "epoch": 0.006455777921239509, "step": 20}, {"loss": 0.9807, "grad_norm": 0.6788288950920105, "learning_rate": 0.0002, "epoch": 0.009683666881859263, "step": 30}, {"loss": 0.9385, "grad_norm": 0.5590243339538574, "learning_rate": 0.0002, "epoch": 0.012911555842479019, "step": 40}, {"loss": 0.931, "grad_norm": 0.5136010646820068, "learning_rate": 0.0002, "epoch": 0.016139444803098774, "step": 50}, {"loss": 0.8896, "grad_norm": 0.45298320055007935, "learning_rate": 0.0002, "epoch": 0.019367333763718526, "step": 60}, {"loss": 0.9184, "grad_norm": 0.5917162299156189, "learning_rate": 0.0002, "epoch": 0.022595222724338282, "step": 70}, {"loss": 0.8705, "grad_norm": 0.4414856433868408, "learning_rate": 0.0002, "epoch": 0.025823111684958037, "step": 80}, {"loss": 0.8419, "grad_norm": 0.5547978281974792, "learning_rate": 0.0002, "epoch": 0.029051000645577793, "step": 90}, {"loss": 0.8987, "grad_norm": 0.5271288156509399, "learning_rate": 0.0002, "epoch": 0.03227888960619755, "step": 100}, {"loss": 0.8543, "grad_norm": 0.5506119728088379, "learning_rate": 0.0002, "epoch": 0.035506778566817304, "step": 110}, {"loss": 0.8373, "grad_norm": 0.5579327940940857, "learning_rate": 0.0002, "epoch": 0.03873466752743705, "step": 120}, {"loss": 0.8826, "grad_norm": 0.5099632740020752, "learning_rate": 0.0002, "epoch": 0.04196255648805681, "step": 130}, {"loss": 0.9239, "grad_norm": 0.40396833419799805, "learning_rate": 0.0002, "epoch": 0.045190445448676564, "step": 140}, {"loss": 0.846, "grad_norm": 0.5008092522621155, "learning_rate": 0.0002, "epoch": 0.04841833440929632, "step": 150}, {"loss": 0.8564, "grad_norm": 0.4388776421546936, "learning_rate": 0.0002, "epoch": 0.051646223369916075, "step": 160}, {"loss": 0.8829, "grad_norm": 0.44138944149017334, "learning_rate": 0.0002, "epoch": 0.05487411233053583, "step": 170}, {"loss": 0.8061, "grad_norm": 0.358484148979187, "learning_rate": 0.0002, "epoch": 0.058102001291155586, "step": 180}, {"loss": 0.8956, "grad_norm": 0.457052081823349, "learning_rate": 0.0002, "epoch": 0.06132989025177534, "step": 190}, {"loss": 0.9138, "grad_norm": 0.5537622570991516, "learning_rate": 0.0002, "epoch": 0.0645577792123951, "step": 200}, {"loss": 0.8701, "grad_norm": 0.552631676197052, "learning_rate": 0.0002, "epoch": 0.06778566817301485, "step": 210}, {"loss": 0.8854, "grad_norm": 0.4414575397968292, "learning_rate": 0.0002, "epoch": 0.07101355713363461, "step": 220}, {"loss": 0.8581, "grad_norm": 0.4996664226055145, "learning_rate": 0.0002, "epoch": 0.07424144609425436, "step": 230}, {"loss": 0.8675, "grad_norm": 0.7321897149085999, "learning_rate": 0.0002, "epoch": 0.0774693350548741, "step": 240}, {"loss": 0.8848, "grad_norm": 0.4553901255130768, "learning_rate": 0.0002, "epoch": 0.08069722401549387, "step": 250}, {"loss": 0.868, "grad_norm": 0.5039054751396179, "learning_rate": 0.0002, "epoch": 0.08392511297611362, "step": 260}, {"loss": 0.8317, "grad_norm": 0.4113094210624695, "learning_rate": 0.0002, "epoch": 0.08715300193673338, "step": 270}, {"loss": 0.8074, "grad_norm": 0.450436532497406, "learning_rate": 0.0002, "epoch": 0.09038089089735313, "step": 280}, {"loss": 0.8105, "grad_norm": 0.4548024535179138, "learning_rate": 0.0002, "epoch": 0.09360877985797289, "step": 290}, {"loss": 0.8325, "grad_norm": 0.4932962656021118, "learning_rate": 0.0002, "epoch": 0.09683666881859264, "step": 300}, {"loss": 0.8105, "grad_norm": 0.4005250334739685, "learning_rate": 0.0002, "epoch": 0.1000645577792124, "step": 310}, {"loss": 0.8083, "grad_norm": 1.8321624994277954, "learning_rate": 0.0002, "epoch": 0.10329244673983215, "step": 320}, {"loss": 0.8411, "grad_norm": 0.45815610885620117, "learning_rate": 0.0002, "epoch": 0.1065203357004519, "step": 330}, {"loss": 0.857, "grad_norm": 0.39324095845222473, "learning_rate": 0.0002, "epoch": 0.10974822466107166, "step": 340}, {"loss": 0.8258, "grad_norm": 0.546273946762085, "learning_rate": 0.0002, "epoch": 0.11297611362169141, "step": 350}, {"loss": 0.882, "grad_norm": 0.497448593378067, "learning_rate": 0.0002, "epoch": 0.11620400258231117, "step": 360}, {"loss": 0.7608, "grad_norm": 0.37508800625801086, "learning_rate": 0.0002, "epoch": 0.11943189154293092, "step": 370}, {"loss": 0.852, "grad_norm": 0.45849609375, "learning_rate": 0.0002, "epoch": 0.12265978050355068, "step": 380}, {"loss": 0.8437, "grad_norm": 0.5488408803939819, "learning_rate": 0.0002, "epoch": 0.12588766946417043, "step": 390}, {"loss": 0.8349, "grad_norm": 0.4477061331272125, "learning_rate": 0.0002, "epoch": 0.1291155584247902, "step": 400}, {"loss": 0.8306, "grad_norm": 0.39227980375289917, "learning_rate": 0.0002, "epoch": 0.13234344738540993, "step": 410}, {"loss": 0.7933, "grad_norm": 0.3922233581542969, "learning_rate": 0.0002, "epoch": 0.1355713363460297, "step": 420}, {"loss": 0.8134, "grad_norm": 0.42901909351348877, "learning_rate": 0.0002, "epoch": 0.13879922530664945, "step": 430}, {"loss": 0.8271, "grad_norm": 0.4217798709869385, "learning_rate": 0.0002, "epoch": 0.14202711426726922, "step": 440}, {"loss": 0.8594, "grad_norm": 0.43470677733421326, "learning_rate": 0.0002, "epoch": 0.14525500322788895, "step": 450}, {"loss": 0.8106, "grad_norm": 0.5324403047561646, "learning_rate": 0.0002, "epoch": 0.1484828921885087, "step": 460}, {"loss": 0.8729, "grad_norm": 0.3999756872653961, "learning_rate": 0.0002, "epoch": 0.15171078114912848, "step": 470}, {"loss": 0.7702, "grad_norm": 0.404933363199234, "learning_rate": 0.0002, "epoch": 0.1549386701097482, "step": 480}, {"loss": 0.8151, "grad_norm": 0.44122636318206787, "learning_rate": 0.0002, "epoch": 0.15816655907036797, "step": 490}, {"loss": 0.8457, "grad_norm": 0.510166347026825, "learning_rate": 0.0002, "epoch": 0.16139444803098774, "step": 500}, {"loss": 0.8692, "grad_norm": 0.4549732506275177, "learning_rate": 0.0002, "epoch": 0.1646223369916075, "step": 510}, {"loss": 0.8466, "grad_norm": 0.5148182511329651, "learning_rate": 0.0002, "epoch": 0.16785022595222723, "step": 520}, {"loss": 0.8317, "grad_norm": 0.3596806824207306, "learning_rate": 0.0002, "epoch": 0.171078114912847, "step": 530}, {"loss": 0.844, "grad_norm": 0.4388909339904785, "learning_rate": 0.0002, "epoch": 0.17430600387346676, "step": 540}, {"loss": 0.8322, "grad_norm": 0.5052742958068848, "learning_rate": 0.0002, "epoch": 0.17753389283408652, "step": 550}, {"loss": 0.791, "grad_norm": 0.48248958587646484, "learning_rate": 0.0002, "epoch": 0.18076178179470626, "step": 560}, {"loss": 0.8593, "grad_norm": 0.5360197424888611, "learning_rate": 0.0002, "epoch": 0.18398967075532602, "step": 570}, {"loss": 0.817, "grad_norm": 0.43999341130256653, "learning_rate": 0.0002, "epoch": 0.18721755971594578, "step": 580}, {"loss": 0.8311, "grad_norm": 0.3685208261013031, "learning_rate": 0.0002, "epoch": 0.19044544867656552, "step": 590}, {"loss": 0.8341, "grad_norm": 0.4601275622844696, "learning_rate": 0.0002, "epoch": 0.19367333763718528, "step": 600}, {"loss": 0.8483, "grad_norm": 0.4778369665145874, "learning_rate": 0.0002, "epoch": 0.19690122659780504, "step": 610}, {"loss": 0.8653, "grad_norm": 0.4867003560066223, "learning_rate": 0.0002, "epoch": 0.2001291155584248, "step": 620}, {"loss": 0.8554, "grad_norm": 0.4583742916584015, "learning_rate": 0.0002, "epoch": 0.20335700451904454, "step": 630}, {"loss": 0.8698, "grad_norm": 0.47958165407180786, "learning_rate": 0.0002, "epoch": 0.2065848934796643, "step": 640}, {"loss": 0.8213, "grad_norm": 0.4526064097881317, "learning_rate": 0.0002, "epoch": 0.20981278244028406, "step": 650}, {"loss": 0.8313, "grad_norm": 0.45890581607818604, "learning_rate": 0.0002, "epoch": 0.2130406714009038, "step": 660}, {"loss": 0.8143, "grad_norm": 0.42725905776023865, "learning_rate": 0.0002, "epoch": 0.21626856036152356, "step": 670}, {"loss": 0.8675, "grad_norm": 0.40380963683128357, "learning_rate": 0.0002, "epoch": 0.21949644932214332, "step": 680}, {"loss": 0.9004, "grad_norm": 0.4372998774051666, "learning_rate": 0.0002, "epoch": 0.22272433828276308, "step": 690}, {"loss": 0.8208, "grad_norm": 0.4245864450931549, "learning_rate": 0.0002, "epoch": 0.22595222724338282, "step": 700}, {"loss": 0.8564, "grad_norm": 0.4061129689216614, "learning_rate": 0.0002, "epoch": 0.22918011620400258, "step": 710}, {"loss": 0.8275, "grad_norm": 0.474454790353775, "learning_rate": 0.0002, "epoch": 0.23240800516462234, "step": 720}, {"loss": 0.8346, "grad_norm": 0.4908486008644104, "learning_rate": 0.0002, "epoch": 0.23563589412524208, "step": 730}, {"loss": 0.8755, "grad_norm": 0.4284191429615021, "learning_rate": 0.0002, "epoch": 0.23886378308586184, "step": 740}, {"loss": 0.8387, "grad_norm": 0.44730308651924133, "learning_rate": 0.0002, "epoch": 0.2420916720464816, "step": 750}, {"loss": 0.8135, "grad_norm": 0.4433246850967407, "learning_rate": 0.0002, "epoch": 0.24531956100710137, "step": 760}, {"loss": 0.8644, "grad_norm": 0.43668854236602783, "learning_rate": 0.0002, "epoch": 0.2485474499677211, "step": 770}, {"loss": 0.8025, "grad_norm": 0.34324130415916443, "learning_rate": 0.0002, "epoch": 0.25177533892834086, "step": 780}, {"loss": 0.8725, "grad_norm": 0.46476295590400696, "learning_rate": 0.0002, "epoch": 0.2550032278889606, "step": 790}, {"loss": 0.8157, "grad_norm": 0.5047039985656738, "learning_rate": 0.0002, "epoch": 0.2582311168495804, "step": 800}, {"loss": 0.8643, "grad_norm": 0.4402127265930176, "learning_rate": 0.0002, "epoch": 0.26145900581020015, "step": 810}, {"loss": 0.8025, "grad_norm": 0.4642465114593506, "learning_rate": 0.0002, "epoch": 0.26468689477081986, "step": 820}, {"loss": 0.8836, "grad_norm": 0.40093424916267395, "learning_rate": 0.0002, "epoch": 0.2679147837314396, "step": 830}, {"loss": 0.83, "grad_norm": 0.42501842975616455, "learning_rate": 0.0002, "epoch": 0.2711426726920594, "step": 840}, {"loss": 0.8573, "grad_norm": 0.43279722332954407, "learning_rate": 0.0002, "epoch": 0.27437056165267915, "step": 850}, {"loss": 0.817, "grad_norm": 0.5991243720054626, "learning_rate": 0.0002, "epoch": 0.2775984506132989, "step": 860}, {"loss": 0.7981, "grad_norm": 0.4217848777770996, "learning_rate": 0.0002, "epoch": 0.28082633957391867, "step": 870}, {"loss": 0.8135, "grad_norm": 0.3933536410331726, "learning_rate": 0.0002, "epoch": 0.28405422853453843, "step": 880}, {"loss": 0.8846, "grad_norm": 0.5868505239486694, "learning_rate": 0.0002, "epoch": 0.28728211749515814, "step": 890}, {"loss": 0.8759, "grad_norm": 0.5209547877311707, "learning_rate": 0.0002, "epoch": 0.2905100064557779, "step": 900}, {"loss": 0.815, "grad_norm": 0.49307361245155334, "learning_rate": 0.0002, "epoch": 0.29373789541639767, "step": 910}, {"loss": 0.7813, "grad_norm": 0.4288382828235626, "learning_rate": 0.0002, "epoch": 0.2969657843770174, "step": 920}, {"loss": 0.8431, "grad_norm": 0.33568474650382996, "learning_rate": 0.0002, "epoch": 0.3001936733376372, "step": 930}, {"loss": 0.8455, "grad_norm": 1.0915930271148682, "learning_rate": 0.0002, "epoch": 0.30342156229825695, "step": 940}, {"loss": 0.8535, "grad_norm": 0.5489798188209534, "learning_rate": 0.0002, "epoch": 0.3066494512588767, "step": 950}, {"loss": 0.8031, "grad_norm": 0.42971742153167725, "learning_rate": 0.0002, "epoch": 0.3098773402194964, "step": 960}, {"loss": 0.8253, "grad_norm": 0.43375834822654724, "learning_rate": 0.0002, "epoch": 0.3131052291801162, "step": 970}, {"loss": 0.7747, "grad_norm": 0.47488611936569214, "learning_rate": 0.0002, "epoch": 0.31633311814073595, "step": 980}, {"loss": 0.7906, "grad_norm": 0.46296775341033936, "learning_rate": 0.0002, "epoch": 0.3195610071013557, "step": 990}, {"loss": 0.7948, "grad_norm": 0.4548890292644501, "learning_rate": 0.0002, "epoch": 0.32278889606197547, "step": 1000}, {"loss": 0.8856, "grad_norm": 0.41834497451782227, "learning_rate": 0.0002, "epoch": 0.32601678502259523, "step": 1010}, {"loss": 0.7791, "grad_norm": 0.441092312335968, "learning_rate": 0.0002, "epoch": 0.329244673983215, "step": 1020}, {"loss": 0.8191, "grad_norm": 0.637322187423706, "learning_rate": 0.0002, "epoch": 0.33247256294383476, "step": 1030}, {"loss": 0.8685, "grad_norm": 0.4374958574771881, "learning_rate": 0.0002, "epoch": 0.33570045190445447, "step": 1040}, {"loss": 0.8423, "grad_norm": 0.3935825824737549, "learning_rate": 0.0002, "epoch": 0.33892834086507423, "step": 1050}, {"loss": 0.8287, "grad_norm": 0.43526220321655273, "learning_rate": 0.0002, "epoch": 0.342156229825694, "step": 1060}, {"loss": 0.8413, "grad_norm": 0.45327696204185486, "learning_rate": 0.0002, "epoch": 0.34538411878631375, "step": 1070}, {"loss": 0.7421, "grad_norm": 0.4126075506210327, "learning_rate": 0.0002, "epoch": 0.3486120077469335, "step": 1080}, {"loss": 0.8427, "grad_norm": 0.4714072048664093, "learning_rate": 0.0002, "epoch": 0.3518398967075533, "step": 1090}, {"loss": 0.8028, "grad_norm": 0.518127977848053, "learning_rate": 0.0002, "epoch": 0.35506778566817304, "step": 1100}, {"loss": 0.8479, "grad_norm": 0.43264099955558777, "learning_rate": 0.0002, "epoch": 0.35829567462879275, "step": 1110}, {"loss": 0.8724, "grad_norm": 0.4857400357723236, "learning_rate": 0.0002, "epoch": 0.3615235635894125, "step": 1120}, {"loss": 0.7735, "grad_norm": 0.37591469287872314, "learning_rate": 0.0002, "epoch": 0.3647514525500323, "step": 1130}, {"loss": 0.8531, "grad_norm": 0.4165478050708771, "learning_rate": 0.0002, "epoch": 0.36797934151065204, "step": 1140}, {"loss": 0.8151, "grad_norm": 0.42911383509635925, "learning_rate": 0.0002, "epoch": 0.3712072304712718, "step": 1150}, {"loss": 0.8722, "grad_norm": 0.44980287551879883, "learning_rate": 0.0002, "epoch": 0.37443511943189156, "step": 1160}, {"loss": 0.7961, "grad_norm": 0.4066573679447174, "learning_rate": 0.0002, "epoch": 0.3776630083925113, "step": 1170}, {"loss": 0.8317, "grad_norm": 0.5056195855140686, "learning_rate": 0.0002, "epoch": 0.38089089735313103, "step": 1180}, {"loss": 0.8387, "grad_norm": 0.4141536355018616, "learning_rate": 0.0002, "epoch": 0.3841187863137508, "step": 1190}, {"loss": 0.8019, "grad_norm": 0.4501924514770508, "learning_rate": 0.0002, "epoch": 0.38734667527437056, "step": 1200}, {"loss": 0.8528, "grad_norm": 0.43304240703582764, "learning_rate": 0.0002, "epoch": 0.3905745642349903, "step": 1210}, {"loss": 0.8905, "grad_norm": 0.475777804851532, "learning_rate": 0.0002, "epoch": 0.3938024531956101, "step": 1220}, {"loss": 0.8643, "grad_norm": 0.5846465826034546, "learning_rate": 0.0002, "epoch": 0.39703034215622984, "step": 1230}, {"loss": 0.8078, "grad_norm": 0.42899325489997864, "learning_rate": 0.0002, "epoch": 0.4002582311168496, "step": 1240}, {"loss": 0.8415, "grad_norm": 0.3980463147163391, "learning_rate": 0.0002, "epoch": 0.4034861200774693, "step": 1250}, {"loss": 0.8026, "grad_norm": 0.45769768953323364, "learning_rate": 0.0002, "epoch": 0.4067140090380891, "step": 1260}, {"loss": 0.8377, "grad_norm": 0.5101280212402344, "learning_rate": 0.0002, "epoch": 0.40994189799870884, "step": 1270}, {"loss": 0.7905, "grad_norm": 0.47374317049980164, "learning_rate": 0.0002, "epoch": 0.4131697869593286, "step": 1280}, {"loss": 0.8172, "grad_norm": 0.4261878728866577, "learning_rate": 0.0002, "epoch": 0.41639767591994836, "step": 1290}, {"loss": 0.9004, "grad_norm": 0.46954256296157837, "learning_rate": 0.0002, "epoch": 0.4196255648805681, "step": 1300}, {"loss": 0.7868, "grad_norm": 0.5205738544464111, "learning_rate": 0.0002, "epoch": 0.4228534538411879, "step": 1310}, {"loss": 0.8964, "grad_norm": 0.5176340937614441, "learning_rate": 0.0002, "epoch": 0.4260813428018076, "step": 1320}, {"loss": 0.8764, "grad_norm": 0.5155916810035706, "learning_rate": 0.0002, "epoch": 0.42930923176242736, "step": 1330}, {"loss": 0.8197, "grad_norm": 0.44548553228378296, "learning_rate": 0.0002, "epoch": 0.4325371207230471, "step": 1340}, {"loss": 0.7873, "grad_norm": 0.5633558630943298, "learning_rate": 0.0002, "epoch": 0.4357650096836669, "step": 1350}, {"loss": 0.7889, "grad_norm": 0.42444056272506714, "learning_rate": 0.0002, "epoch": 0.43899289864428664, "step": 1360}, {"loss": 0.8588, "grad_norm": 0.5226860642433167, "learning_rate": 0.0002, "epoch": 0.4422207876049064, "step": 1370}, {"loss": 0.8232, "grad_norm": 0.5354582071304321, "learning_rate": 0.0002, "epoch": 0.44544867656552617, "step": 1380}, {"loss": 0.816, "grad_norm": 0.472646564245224, "learning_rate": 0.0002, "epoch": 0.4486765655261459, "step": 1390}, {"loss": 0.7953, "grad_norm": 0.6312310099601746, "learning_rate": 0.0002, "epoch": 0.45190445448676564, "step": 1400}, {"loss": 0.8212, "grad_norm": 0.4298408031463623, "learning_rate": 0.0002, "epoch": 0.4551323434473854, "step": 1410}, {"loss": 0.8447, "grad_norm": 0.43427202105522156, "learning_rate": 0.0002, "epoch": 0.45836023240800516, "step": 1420}, {"loss": 0.8342, "grad_norm": 0.44097861647605896, "learning_rate": 0.0002, "epoch": 0.4615881213686249, "step": 1430}, {"loss": 0.8301, "grad_norm": 0.5142693519592285, "learning_rate": 0.0002, "epoch": 0.4648160103292447, "step": 1440}, {"loss": 0.8144, "grad_norm": 0.46416547894477844, "learning_rate": 0.0002, "epoch": 0.46804389928986445, "step": 1450}, {"loss": 0.8342, "grad_norm": 0.4858551025390625, "learning_rate": 0.0002, "epoch": 0.47127178825048416, "step": 1460}, {"loss": 0.8354, "grad_norm": 0.4709177315235138, "learning_rate": 0.0002, "epoch": 0.4744996772111039, "step": 1470}, {"loss": 0.8391, "grad_norm": 0.5500252842903137, "learning_rate": 0.0002, "epoch": 0.4777275661717237, "step": 1480}, {"loss": 0.8359, "grad_norm": 0.43364381790161133, "learning_rate": 0.0002, "epoch": 0.48095545513234345, "step": 1490}, {"loss": 0.8446, "grad_norm": 0.47712287306785583, "learning_rate": 0.0002, "epoch": 0.4841833440929632, "step": 1500}, {"loss": 0.8518, "grad_norm": 0.4518495202064514, "learning_rate": 0.0002, "epoch": 0.48741123305358297, "step": 1510}, {"loss": 0.819, "grad_norm": 0.4539008140563965, "learning_rate": 0.0002, "epoch": 0.49063912201420273, "step": 1520}, {"loss": 0.8276, "grad_norm": 0.4993067979812622, "learning_rate": 0.0002, "epoch": 0.49386701097482244, "step": 1530}, {"loss": 0.8297, "grad_norm": 0.6094803214073181, "learning_rate": 0.0002, "epoch": 0.4970948999354422, "step": 1540}, {"loss": 0.8263, "grad_norm": 0.48602527379989624, "learning_rate": 0.0002, "epoch": 0.500322788896062, "step": 1550}, {"loss": 0.8182, "grad_norm": 0.40245795249938965, "learning_rate": 0.0002, "epoch": 0.5035506778566817, "step": 1560}, {"loss": 0.7907, "grad_norm": 0.456787645816803, "learning_rate": 0.0002, "epoch": 0.5067785668173015, "step": 1570}, {"loss": 0.86, "grad_norm": 0.43936216831207275, "learning_rate": 0.0002, "epoch": 0.5100064557779213, "step": 1580}, {"loss": 0.7928, "grad_norm": 0.549018144607544, "learning_rate": 0.0002, "epoch": 0.513234344738541, "step": 1590}, {"loss": 0.8169, "grad_norm": 0.41746795177459717, "learning_rate": 0.0002, "epoch": 0.5164622336991608, "step": 1600}, {"loss": 0.7868, "grad_norm": 0.4217053949832916, "learning_rate": 0.0002, "epoch": 0.5196901226597805, "step": 1610}, {"loss": 0.8161, "grad_norm": 0.449913889169693, "learning_rate": 0.0002, "epoch": 0.5229180116204003, "step": 1620}, {"loss": 0.7938, "grad_norm": 0.5084872245788574, "learning_rate": 0.0002, "epoch": 0.5261459005810201, "step": 1630}, {"loss": 0.8295, "grad_norm": 0.46248653531074524, "learning_rate": 0.0002, "epoch": 0.5293737895416397, "step": 1640}, {"loss": 0.7993, "grad_norm": 0.4824236035346985, "learning_rate": 0.0002, "epoch": 0.5326016785022595, "step": 1650}, {"loss": 0.8711, "grad_norm": 0.6010985374450684, "learning_rate": 0.0002, "epoch": 0.5358295674628792, "step": 1660}, {"loss": 0.8266, "grad_norm": 0.4757920801639557, "learning_rate": 0.0002, "epoch": 0.539057456423499, "step": 1670}, {"loss": 0.8182, "grad_norm": 0.45161882042884827, "learning_rate": 0.0002, "epoch": 0.5422853453841188, "step": 1680}, {"loss": 0.8141, "grad_norm": 0.49314990639686584, "learning_rate": 0.0002, "epoch": 0.5455132343447385, "step": 1690}, {"loss": 0.8091, "grad_norm": 0.3918305039405823, "learning_rate": 0.0002, "epoch": 0.5487411233053583, "step": 1700}, {"loss": 0.8177, "grad_norm": 0.5966728925704956, "learning_rate": 0.0002, "epoch": 0.551969012265978, "step": 1710}, {"loss": 0.8438, "grad_norm": 0.4208986163139343, "learning_rate": 0.0002, "epoch": 0.5551969012265978, "step": 1720}, {"loss": 0.817, "grad_norm": 0.43724218010902405, "learning_rate": 0.0002, "epoch": 0.5584247901872176, "step": 1730}, {"loss": 0.7956, "grad_norm": 0.5287272930145264, "learning_rate": 0.0002, "epoch": 0.5616526791478373, "step": 1740}, {"loss": 0.8557, "grad_norm": 0.4961899518966675, "learning_rate": 0.0002, "epoch": 0.5648805681084571, "step": 1750}, {"loss": 0.8029, "grad_norm": 0.4468635320663452, "learning_rate": 0.0002, "epoch": 0.5681084570690769, "step": 1760}, {"loss": 0.7968, "grad_norm": 0.6423530578613281, "learning_rate": 0.0002, "epoch": 0.5713363460296966, "step": 1770}, {"loss": 0.8324, "grad_norm": 0.4601971507072449, "learning_rate": 0.0002, "epoch": 0.5745642349903163, "step": 1780}, {"loss": 0.8171, "grad_norm": 0.46514901518821716, "learning_rate": 0.0002, "epoch": 0.577792123950936, "step": 1790}, {"loss": 0.8186, "grad_norm": 0.4771687388420105, "learning_rate": 0.0002, "epoch": 0.5810200129115558, "step": 1800}, {"loss": 0.856, "grad_norm": 0.46514490246772766, "learning_rate": 0.0002, "epoch": 0.5842479018721756, "step": 1810}, {"loss": 0.84, "grad_norm": 0.5373936295509338, "learning_rate": 0.0002, "epoch": 0.5874757908327953, "step": 1820}, {"loss": 0.8456, "grad_norm": 0.5175791382789612, "learning_rate": 0.0002, "epoch": 0.5907036797934151, "step": 1830}, {"loss": 0.7957, "grad_norm": 0.4522802233695984, "learning_rate": 0.0002, "epoch": 0.5939315687540349, "step": 1840}, {"loss": 0.8633, "grad_norm": 0.42987772822380066, "learning_rate": 0.0002, "epoch": 0.5971594577146546, "step": 1850}, {"loss": 0.7871, "grad_norm": 0.5566838383674622, "learning_rate": 0.0002, "epoch": 0.6003873466752744, "step": 1860}, {"loss": 0.8312, "grad_norm": 0.42807698249816895, "learning_rate": 0.0002, "epoch": 0.6036152356358941, "step": 1870}, {"loss": 0.8035, "grad_norm": 0.4957767724990845, "learning_rate": 0.0002, "epoch": 0.6068431245965139, "step": 1880}, {"loss": 0.8145, "grad_norm": 0.4260980188846588, "learning_rate": 0.0002, "epoch": 0.6100710135571337, "step": 1890}, {"loss": 0.8363, "grad_norm": 0.4777357876300812, "learning_rate": 0.0002, "epoch": 0.6132989025177534, "step": 1900}, {"loss": 0.8404, "grad_norm": 0.4434216022491455, "learning_rate": 0.0002, "epoch": 0.6165267914783732, "step": 1910}, {"loss": 0.8057, "grad_norm": 0.5215433835983276, "learning_rate": 0.0002, "epoch": 0.6197546804389928, "step": 1920}, {"loss": 0.82, "grad_norm": 0.5143248438835144, "learning_rate": 0.0002, "epoch": 0.6229825693996126, "step": 1930}, {"loss": 0.8107, "grad_norm": 0.5213413238525391, "learning_rate": 0.0002, "epoch": 0.6262104583602324, "step": 1940}, {"loss": 0.7549, "grad_norm": 0.5408226251602173, "learning_rate": 0.0002, "epoch": 0.6294383473208521, "step": 1950}, {"loss": 0.8405, "grad_norm": 0.5479708909988403, "learning_rate": 0.0002, "epoch": 0.6326662362814719, "step": 1960}, {"loss": 0.8138, "grad_norm": 0.4490949809551239, "learning_rate": 0.0002, "epoch": 0.6358941252420917, "step": 1970}, {"loss": 0.854, "grad_norm": 0.48815059661865234, "learning_rate": 0.0002, "epoch": 0.6391220142027114, "step": 1980}, {"loss": 0.8568, "grad_norm": 0.46498045325279236, "learning_rate": 0.0002, "epoch": 0.6423499031633312, "step": 1990}, {"loss": 0.8263, "grad_norm": 0.5136561393737793, "learning_rate": 0.0002, "epoch": 0.6455777921239509, "step": 2000}, {"loss": 0.8503, "grad_norm": 0.5145719647407532, "learning_rate": 0.0002, "epoch": 0.6488056810845707, "step": 2010}, {"loss": 0.8456, "grad_norm": 0.5430373549461365, "learning_rate": 0.0002, "epoch": 0.6520335700451905, "step": 2020}, {"loss": 0.8115, "grad_norm": 0.46347954869270325, "learning_rate": 0.0002, "epoch": 0.6552614590058102, "step": 2030}, {"loss": 0.8769, "grad_norm": 0.5189562439918518, "learning_rate": 0.0002, "epoch": 0.65848934796643, "step": 2040}, {"loss": 0.8453, "grad_norm": 0.43843990564346313, "learning_rate": 0.0002, "epoch": 0.6617172369270498, "step": 2050}, {"loss": 0.7951, "grad_norm": 0.4654983580112457, "learning_rate": 0.0002, "epoch": 0.6649451258876695, "step": 2060}, {"loss": 0.8308, "grad_norm": 0.44835716485977173, "learning_rate": 0.0002, "epoch": 0.6681730148482892, "step": 2070}, {"loss": 0.8181, "grad_norm": 0.38811734318733215, "learning_rate": 0.0002, "epoch": 0.6714009038089089, "step": 2080}, {"loss": 0.762, "grad_norm": 0.5709853172302246, "learning_rate": 0.0002, "epoch": 0.6746287927695287, "step": 2090}, {"loss": 0.8334, "grad_norm": 0.49994757771492004, "learning_rate": 0.0002, "epoch": 0.6778566817301485, "step": 2100}, {"loss": 0.8, "grad_norm": 0.5505402684211731, "learning_rate": 0.0002, "epoch": 0.6810845706907682, "step": 2110}, {"loss": 0.8227, "grad_norm": 0.48195120692253113, "learning_rate": 0.0002, "epoch": 0.684312459651388, "step": 2120}, {"loss": 0.7879, "grad_norm": 0.4854775071144104, "learning_rate": 0.0002, "epoch": 0.6875403486120077, "step": 2130}, {"loss": 0.8231, "grad_norm": 0.6422494649887085, "learning_rate": 0.0002, "epoch": 0.6907682375726275, "step": 2140}, {"loss": 0.8353, "grad_norm": 0.3972536027431488, "learning_rate": 0.0002, "epoch": 0.6939961265332473, "step": 2150}, {"loss": 0.8068, "grad_norm": 0.4297836422920227, "learning_rate": 0.0002, "epoch": 0.697224015493867, "step": 2160}, {"loss": 0.8017, "grad_norm": 0.45486778020858765, "learning_rate": 0.0002, "epoch": 0.7004519044544868, "step": 2170}, {"loss": 0.8507, "grad_norm": 0.4706047773361206, "learning_rate": 0.0002, "epoch": 0.7036797934151066, "step": 2180}, {"loss": 0.8234, "grad_norm": 0.46426892280578613, "learning_rate": 0.0002, "epoch": 0.7069076823757263, "step": 2190}, {"loss": 0.8472, "grad_norm": 0.46333715319633484, "learning_rate": 0.0002, "epoch": 0.7101355713363461, "step": 2200}, {"loss": 0.8247, "grad_norm": 0.4632524251937866, "learning_rate": 0.0002, "epoch": 0.7133634602969657, "step": 2210}, {"loss": 0.8452, "grad_norm": 0.4610830843448639, "learning_rate": 0.0002, "epoch": 0.7165913492575855, "step": 2220}, {"loss": 0.7338, "grad_norm": 0.4905324876308441, "learning_rate": 0.0002, "epoch": 0.7198192382182053, "step": 2230}, {"loss": 0.7715, "grad_norm": 0.4936263859272003, "learning_rate": 0.0002, "epoch": 0.723047127178825, "step": 2240}, {"loss": 0.8162, "grad_norm": 0.40778425335884094, "learning_rate": 0.0002, "epoch": 0.7262750161394448, "step": 2250}, {"loss": 0.828, "grad_norm": 0.50351482629776, "learning_rate": 0.0002, "epoch": 0.7295029051000645, "step": 2260}, {"loss": 0.8475, "grad_norm": 0.4894128143787384, "learning_rate": 0.0002, "epoch": 0.7327307940606843, "step": 2270}, {"loss": 0.8087, "grad_norm": 0.5580906271934509, "learning_rate": 0.0002, "epoch": 0.7359586830213041, "step": 2280}, {"loss": 0.8157, "grad_norm": 0.4655369520187378, "learning_rate": 0.0002, "epoch": 0.7391865719819238, "step": 2290}, {"loss": 0.8395, "grad_norm": 0.4666965901851654, "learning_rate": 0.0002, "epoch": 0.7424144609425436, "step": 2300}, {"loss": 0.7605, "grad_norm": 0.46259936690330505, "learning_rate": 0.0002, "epoch": 0.7456423499031634, "step": 2310}, {"loss": 0.7849, "grad_norm": 0.520706832408905, "learning_rate": 0.0002, "epoch": 0.7488702388637831, "step": 2320}, {"loss": 0.8173, "grad_norm": 0.5142408013343811, "learning_rate": 0.0002, "epoch": 0.7520981278244029, "step": 2330}, {"loss": 0.7782, "grad_norm": 0.5355164408683777, "learning_rate": 0.0002, "epoch": 0.7553260167850226, "step": 2340}, {"loss": 0.8242, "grad_norm": 0.5517185926437378, "learning_rate": 0.0002, "epoch": 0.7585539057456423, "step": 2350}, {"loss": 0.8404, "grad_norm": 0.7162677049636841, "learning_rate": 0.0002, "epoch": 0.7617817947062621, "step": 2360}, {"loss": 0.8455, "grad_norm": 0.42402133345603943, "learning_rate": 0.0002, "epoch": 0.7650096836668818, "step": 2370}, {"loss": 0.8214, "grad_norm": 0.47180113196372986, "learning_rate": 0.0002, "epoch": 0.7682375726275016, "step": 2380}, {"loss": 0.8274, "grad_norm": 0.6262288689613342, "learning_rate": 0.0002, "epoch": 0.7714654615881213, "step": 2390}, {"loss": 0.7915, "grad_norm": 0.5177528262138367, "learning_rate": 0.0002, "epoch": 0.7746933505487411, "step": 2400}, {"loss": 0.7631, "grad_norm": 0.555721640586853, "learning_rate": 0.0002, "epoch": 0.7779212395093609, "step": 2410}, {"loss": 0.795, "grad_norm": 0.5592644810676575, "learning_rate": 0.0002, "epoch": 0.7811491284699806, "step": 2420}, {"loss": 0.8081, "grad_norm": 0.38025397062301636, "learning_rate": 0.0002, "epoch": 0.7843770174306004, "step": 2430}, {"loss": 0.7851, "grad_norm": 0.4597472548484802, "learning_rate": 0.0002, "epoch": 0.7876049063912202, "step": 2440}, {"loss": 0.8575, "grad_norm": 0.4929825961589813, "learning_rate": 0.0002, "epoch": 0.7908327953518399, "step": 2450}, {"loss": 0.7584, "grad_norm": 0.45277655124664307, "learning_rate": 0.0002, "epoch": 0.7940606843124597, "step": 2460}, {"loss": 0.8208, "grad_norm": 0.6224122643470764, "learning_rate": 0.0002, "epoch": 0.7972885732730794, "step": 2470}, {"loss": 0.8449, "grad_norm": 0.5740901827812195, "learning_rate": 0.0002, "epoch": 0.8005164622336992, "step": 2480}, {"loss": 0.7834, "grad_norm": 0.41335329413414, "learning_rate": 0.0002, "epoch": 0.8037443511943189, "step": 2490}, {"loss": 0.7768, "grad_norm": 0.4738694131374359, "learning_rate": 0.0002, "epoch": 0.8069722401549386, "step": 2500}, {"loss": 0.7927, "grad_norm": 0.5288197994232178, "learning_rate": 0.0002, "epoch": 0.8102001291155584, "step": 2510}, {"loss": 0.8334, "grad_norm": 0.5404666066169739, "learning_rate": 0.0002, "epoch": 0.8134280180761781, "step": 2520}, {"loss": 0.7998, "grad_norm": 0.4444909691810608, "learning_rate": 0.0002, "epoch": 0.8166559070367979, "step": 2530}, {"loss": 0.8683, "grad_norm": 0.542061448097229, "learning_rate": 0.0002, "epoch": 0.8198837959974177, "step": 2540}, {"loss": 0.8038, "grad_norm": 0.4914741814136505, "learning_rate": 0.0002, "epoch": 0.8231116849580374, "step": 2550}, {"loss": 0.7899, "grad_norm": 0.41703441739082336, "learning_rate": 0.0002, "epoch": 0.8263395739186572, "step": 2560}, {"loss": 0.824, "grad_norm": 0.5489841103553772, "learning_rate": 0.0002, "epoch": 0.829567462879277, "step": 2570}, {"loss": 0.8157, "grad_norm": 0.5359883308410645, "learning_rate": 0.0002, "epoch": 0.8327953518398967, "step": 2580}, {"loss": 0.8122, "grad_norm": 0.5541019439697266, "learning_rate": 0.0002, "epoch": 0.8360232408005165, "step": 2590}, {"loss": 0.797, "grad_norm": 0.4746638834476471, "learning_rate": 0.0002, "epoch": 0.8392511297611362, "step": 2600}, {"loss": 0.8116, "grad_norm": 0.5243194103240967, "learning_rate": 0.0002, "epoch": 0.842479018721756, "step": 2610}, {"loss": 0.8173, "grad_norm": 0.46824976801872253, "learning_rate": 0.0002, "epoch": 0.8457069076823758, "step": 2620}, {"loss": 0.7525, "grad_norm": 0.49487847089767456, "learning_rate": 0.0002, "epoch": 0.8489347966429954, "step": 2630}, {"loss": 0.8296, "grad_norm": 0.42180097103118896, "learning_rate": 0.0002, "epoch": 0.8521626856036152, "step": 2640}, {"loss": 0.8304, "grad_norm": 0.5516560077667236, "learning_rate": 0.0002, "epoch": 0.855390574564235, "step": 2650}, {"loss": 0.7882, "grad_norm": 0.4392191767692566, "learning_rate": 0.0002, "epoch": 0.8586184635248547, "step": 2660}, {"loss": 0.848, "grad_norm": 0.5387210845947266, "learning_rate": 0.0002, "epoch": 0.8618463524854745, "step": 2670}, {"loss": 0.8094, "grad_norm": 0.6232406497001648, "learning_rate": 0.0002, "epoch": 0.8650742414460942, "step": 2680}, {"loss": 0.768, "grad_norm": 0.53749018907547, "learning_rate": 0.0002, "epoch": 0.868302130406714, "step": 2690}, {"loss": 0.8299, "grad_norm": 0.47480374574661255, "learning_rate": 0.0002, "epoch": 0.8715300193673338, "step": 2700}, {"loss": 0.8055, "grad_norm": 0.44618046283721924, "learning_rate": 0.0002, "epoch": 0.8747579083279535, "step": 2710}, {"loss": 0.8015, "grad_norm": 0.4173581302165985, "learning_rate": 0.0002, "epoch": 0.8779857972885733, "step": 2720}, {"loss": 0.7713, "grad_norm": 0.524081289768219, "learning_rate": 0.0002, "epoch": 0.881213686249193, "step": 2730}, {"loss": 0.8738, "grad_norm": 0.5608431100845337, "learning_rate": 0.0002, "epoch": 0.8844415752098128, "step": 2740}, {"loss": 0.8513, "grad_norm": 0.5212284922599792, "learning_rate": 0.0002, "epoch": 0.8876694641704326, "step": 2750}, {"loss": 0.8139, "grad_norm": 0.5601475834846497, "learning_rate": 0.0002, "epoch": 0.8908973531310523, "step": 2760}, {"loss": 0.7947, "grad_norm": 0.4499223828315735, "learning_rate": 0.0002, "epoch": 0.8941252420916721, "step": 2770}, {"loss": 0.8559, "grad_norm": 0.46945226192474365, "learning_rate": 0.0002, "epoch": 0.8973531310522918, "step": 2780}, {"loss": 0.801, "grad_norm": 0.4837495684623718, "learning_rate": 0.0002, "epoch": 0.9005810200129115, "step": 2790}, {"loss": 0.7887, "grad_norm": 0.5059258937835693, "learning_rate": 0.0002, "epoch": 0.9038089089735313, "step": 2800}, {"loss": 0.8571, "grad_norm": 0.4857945144176483, "learning_rate": 0.0002, "epoch": 0.907036797934151, "step": 2810}, {"loss": 0.8301, "grad_norm": 0.5001962780952454, "learning_rate": 0.0002, "epoch": 0.9102646868947708, "step": 2820}, {"loss": 0.8236, "grad_norm": 0.5468648672103882, "learning_rate": 0.0002, "epoch": 0.9134925758553906, "step": 2830}, {"loss": 0.8071, "grad_norm": 0.5533056259155273, "learning_rate": 0.0002, "epoch": 0.9167204648160103, "step": 2840}, {"loss": 0.7895, "grad_norm": 0.5909785628318787, "learning_rate": 0.0002, "epoch": 0.9199483537766301, "step": 2850}, {"loss": 0.796, "grad_norm": 0.47428104281425476, "learning_rate": 0.0002, "epoch": 0.9231762427372499, "step": 2860}, {"loss": 0.7845, "grad_norm": 0.548814058303833, "learning_rate": 0.0002, "epoch": 0.9264041316978696, "step": 2870}, {"loss": 0.7871, "grad_norm": 0.5576745271682739, "learning_rate": 0.0002, "epoch": 0.9296320206584894, "step": 2880}, {"loss": 0.8399, "grad_norm": 0.47094792127609253, "learning_rate": 0.0002, "epoch": 0.9328599096191091, "step": 2890}, {"loss": 0.805, "grad_norm": 0.5408539772033691, "learning_rate": 0.0002, "epoch": 0.9360877985797289, "step": 2900}, {"loss": 0.785, "grad_norm": 0.5922889113426208, "learning_rate": 0.0002, "epoch": 0.9393156875403487, "step": 2910}, {"loss": 0.8043, "grad_norm": 0.45462584495544434, "learning_rate": 0.0002, "epoch": 0.9425435765009683, "step": 2920}, {"loss": 0.8344, "grad_norm": 0.6864947080612183, "learning_rate": 0.0002, "epoch": 0.9457714654615881, "step": 2930}, {"loss": 0.8166, "grad_norm": 0.4706299304962158, "learning_rate": 0.0002, "epoch": 0.9489993544222078, "step": 2940}, {"loss": 0.8422, "grad_norm": 0.5583269596099854, "learning_rate": 0.0002, "epoch": 0.9522272433828276, "step": 2950}, {"loss": 0.836, "grad_norm": 0.51015704870224, "learning_rate": 0.0002, "epoch": 0.9554551323434474, "step": 2960}, {"loss": 0.8371, "grad_norm": 0.5325582027435303, "learning_rate": 0.0002, "epoch": 0.9586830213040671, "step": 2970}, {"loss": 0.7593, "grad_norm": 0.49008598923683167, "learning_rate": 0.0002, "epoch": 0.9619109102646869, "step": 2980}, {"loss": 0.8093, "grad_norm": 0.4422132074832916, "learning_rate": 0.0002, "epoch": 0.9651387992253067, "step": 2990}, {"loss": 0.7966, "grad_norm": 0.5053589344024658, "learning_rate": 0.0002, "epoch": 0.9683666881859264, "step": 3000}, {"loss": 0.8081, "grad_norm": 0.46754521131515503, "learning_rate": 0.0002, "epoch": 0.9715945771465462, "step": 3010}, {"loss": 0.8377, "grad_norm": 0.5613434910774231, "learning_rate": 0.0002, "epoch": 0.9748224661071659, "step": 3020}, {"loss": 0.7856, "grad_norm": 0.5052843689918518, "learning_rate": 0.0002, "epoch": 0.9780503550677857, "step": 3030}, {"loss": 0.8412, "grad_norm": 0.4270972013473511, "learning_rate": 0.0002, "epoch": 0.9812782440284055, "step": 3040}, {"loss": 0.8353, "grad_norm": 0.4974991977214813, "learning_rate": 0.0002, "epoch": 0.9845061329890252, "step": 3050}, {"loss": 0.8415, "grad_norm": 0.4432311952114105, "learning_rate": 0.0002, "epoch": 0.9877340219496449, "step": 3060}, {"loss": 0.7764, "grad_norm": 0.466457724571228, "learning_rate": 0.0002, "epoch": 0.9909619109102646, "step": 3070}, {"loss": 0.8067, "grad_norm": 0.6438009142875671, "learning_rate": 0.0002, "epoch": 0.9941897998708844, "step": 3080}, {"loss": 0.8425, "grad_norm": 0.5593604445457458, "learning_rate": 0.0002, "epoch": 0.9974176888315042, "step": 3090}]} +{"epoch": 2.0, "step": 6196, "epoch_duration": 10365.820917844772, "total_accumulated_duration": 20835.49683713913, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.593, "grad_norm": 0.7092075347900391, "learning_rate": 0.0002, "epoch": 0.0032278889606197547, "step": 10}, {"loss": 1.0956, "grad_norm": 0.6900479793548584, "learning_rate": 0.0002, "epoch": 0.006455777921239509, "step": 20}, {"loss": 0.9807, "grad_norm": 0.6788288950920105, "learning_rate": 0.0002, "epoch": 0.009683666881859263, "step": 30}, {"loss": 0.9385, "grad_norm": 0.5590243339538574, "learning_rate": 0.0002, "epoch": 0.012911555842479019, "step": 40}, {"loss": 0.931, "grad_norm": 0.5136010646820068, "learning_rate": 0.0002, "epoch": 0.016139444803098774, "step": 50}, {"loss": 0.8896, "grad_norm": 0.45298320055007935, "learning_rate": 0.0002, "epoch": 0.019367333763718526, "step": 60}, {"loss": 0.9184, "grad_norm": 0.5917162299156189, "learning_rate": 0.0002, "epoch": 0.022595222724338282, "step": 70}, {"loss": 0.8705, "grad_norm": 0.4414856433868408, "learning_rate": 0.0002, "epoch": 0.025823111684958037, "step": 80}, {"loss": 0.8419, "grad_norm": 0.5547978281974792, "learning_rate": 0.0002, "epoch": 0.029051000645577793, "step": 90}, {"loss": 0.8987, "grad_norm": 0.5271288156509399, "learning_rate": 0.0002, "epoch": 0.03227888960619755, "step": 100}, {"loss": 0.8543, "grad_norm": 0.5506119728088379, "learning_rate": 0.0002, "epoch": 0.035506778566817304, "step": 110}, {"loss": 0.8373, "grad_norm": 0.5579327940940857, "learning_rate": 0.0002, "epoch": 0.03873466752743705, "step": 120}, {"loss": 0.8826, "grad_norm": 0.5099632740020752, "learning_rate": 0.0002, "epoch": 0.04196255648805681, "step": 130}, {"loss": 0.9239, "grad_norm": 0.40396833419799805, "learning_rate": 0.0002, "epoch": 0.045190445448676564, "step": 140}, {"loss": 0.846, "grad_norm": 0.5008092522621155, "learning_rate": 0.0002, "epoch": 0.04841833440929632, "step": 150}, {"loss": 0.8564, "grad_norm": 0.4388776421546936, "learning_rate": 0.0002, "epoch": 0.051646223369916075, "step": 160}, {"loss": 0.8829, "grad_norm": 0.44138944149017334, "learning_rate": 0.0002, "epoch": 0.05487411233053583, "step": 170}, {"loss": 0.8061, "grad_norm": 0.358484148979187, "learning_rate": 0.0002, "epoch": 0.058102001291155586, "step": 180}, {"loss": 0.8956, "grad_norm": 0.457052081823349, "learning_rate": 0.0002, "epoch": 0.06132989025177534, "step": 190}, {"loss": 0.9138, "grad_norm": 0.5537622570991516, "learning_rate": 0.0002, "epoch": 0.0645577792123951, "step": 200}, {"loss": 0.8701, "grad_norm": 0.552631676197052, "learning_rate": 0.0002, "epoch": 0.06778566817301485, "step": 210}, {"loss": 0.8854, "grad_norm": 0.4414575397968292, "learning_rate": 0.0002, "epoch": 0.07101355713363461, "step": 220}, {"loss": 0.8581, "grad_norm": 0.4996664226055145, "learning_rate": 0.0002, "epoch": 0.07424144609425436, "step": 230}, {"loss": 0.8675, "grad_norm": 0.7321897149085999, "learning_rate": 0.0002, "epoch": 0.0774693350548741, "step": 240}, {"loss": 0.8848, "grad_norm": 0.4553901255130768, "learning_rate": 0.0002, "epoch": 0.08069722401549387, "step": 250}, {"loss": 0.868, "grad_norm": 0.5039054751396179, "learning_rate": 0.0002, "epoch": 0.08392511297611362, "step": 260}, {"loss": 0.8317, "grad_norm": 0.4113094210624695, "learning_rate": 0.0002, "epoch": 0.08715300193673338, "step": 270}, {"loss": 0.8074, "grad_norm": 0.450436532497406, "learning_rate": 0.0002, "epoch": 0.09038089089735313, "step": 280}, {"loss": 0.8105, "grad_norm": 0.4548024535179138, "learning_rate": 0.0002, "epoch": 0.09360877985797289, "step": 290}, {"loss": 0.8325, "grad_norm": 0.4932962656021118, "learning_rate": 0.0002, "epoch": 0.09683666881859264, "step": 300}, {"loss": 0.8105, "grad_norm": 0.4005250334739685, "learning_rate": 0.0002, "epoch": 0.1000645577792124, "step": 310}, {"loss": 0.8083, "grad_norm": 1.8321624994277954, "learning_rate": 0.0002, "epoch": 0.10329244673983215, "step": 320}, {"loss": 0.8411, "grad_norm": 0.45815610885620117, "learning_rate": 0.0002, "epoch": 0.1065203357004519, "step": 330}, {"loss": 0.857, "grad_norm": 0.39324095845222473, "learning_rate": 0.0002, "epoch": 0.10974822466107166, "step": 340}, {"loss": 0.8258, "grad_norm": 0.546273946762085, "learning_rate": 0.0002, "epoch": 0.11297611362169141, "step": 350}, {"loss": 0.882, "grad_norm": 0.497448593378067, "learning_rate": 0.0002, "epoch": 0.11620400258231117, "step": 360}, {"loss": 0.7608, "grad_norm": 0.37508800625801086, "learning_rate": 0.0002, "epoch": 0.11943189154293092, "step": 370}, {"loss": 0.852, "grad_norm": 0.45849609375, "learning_rate": 0.0002, "epoch": 0.12265978050355068, "step": 380}, {"loss": 0.8437, "grad_norm": 0.5488408803939819, "learning_rate": 0.0002, "epoch": 0.12588766946417043, "step": 390}, {"loss": 0.8349, "grad_norm": 0.4477061331272125, "learning_rate": 0.0002, "epoch": 0.1291155584247902, "step": 400}, {"loss": 0.8306, "grad_norm": 0.39227980375289917, "learning_rate": 0.0002, "epoch": 0.13234344738540993, "step": 410}, {"loss": 0.7933, "grad_norm": 0.3922233581542969, "learning_rate": 0.0002, "epoch": 0.1355713363460297, "step": 420}, {"loss": 0.8134, "grad_norm": 0.42901909351348877, "learning_rate": 0.0002, "epoch": 0.13879922530664945, "step": 430}, {"loss": 0.8271, "grad_norm": 0.4217798709869385, "learning_rate": 0.0002, "epoch": 0.14202711426726922, "step": 440}, {"loss": 0.8594, "grad_norm": 0.43470677733421326, "learning_rate": 0.0002, "epoch": 0.14525500322788895, "step": 450}, {"loss": 0.8106, "grad_norm": 0.5324403047561646, "learning_rate": 0.0002, "epoch": 0.1484828921885087, "step": 460}, {"loss": 0.8729, "grad_norm": 0.3999756872653961, "learning_rate": 0.0002, "epoch": 0.15171078114912848, "step": 470}, {"loss": 0.7702, "grad_norm": 0.404933363199234, "learning_rate": 0.0002, "epoch": 0.1549386701097482, "step": 480}, {"loss": 0.8151, "grad_norm": 0.44122636318206787, "learning_rate": 0.0002, "epoch": 0.15816655907036797, "step": 490}, {"loss": 0.8457, "grad_norm": 0.510166347026825, "learning_rate": 0.0002, "epoch": 0.16139444803098774, "step": 500}, {"loss": 0.8692, "grad_norm": 0.4549732506275177, "learning_rate": 0.0002, "epoch": 0.1646223369916075, "step": 510}, {"loss": 0.8466, "grad_norm": 0.5148182511329651, "learning_rate": 0.0002, "epoch": 0.16785022595222723, "step": 520}, {"loss": 0.8317, "grad_norm": 0.3596806824207306, "learning_rate": 0.0002, "epoch": 0.171078114912847, "step": 530}, {"loss": 0.844, "grad_norm": 0.4388909339904785, "learning_rate": 0.0002, "epoch": 0.17430600387346676, "step": 540}, {"loss": 0.8322, "grad_norm": 0.5052742958068848, "learning_rate": 0.0002, "epoch": 0.17753389283408652, "step": 550}, {"loss": 0.791, "grad_norm": 0.48248958587646484, "learning_rate": 0.0002, "epoch": 0.18076178179470626, "step": 560}, {"loss": 0.8593, "grad_norm": 0.5360197424888611, "learning_rate": 0.0002, "epoch": 0.18398967075532602, "step": 570}, {"loss": 0.817, "grad_norm": 0.43999341130256653, "learning_rate": 0.0002, "epoch": 0.18721755971594578, "step": 580}, {"loss": 0.8311, "grad_norm": 0.3685208261013031, "learning_rate": 0.0002, "epoch": 0.19044544867656552, "step": 590}, {"loss": 0.8341, "grad_norm": 0.4601275622844696, "learning_rate": 0.0002, "epoch": 0.19367333763718528, "step": 600}, {"loss": 0.8483, "grad_norm": 0.4778369665145874, "learning_rate": 0.0002, "epoch": 0.19690122659780504, "step": 610}, {"loss": 0.8653, "grad_norm": 0.4867003560066223, "learning_rate": 0.0002, "epoch": 0.2001291155584248, "step": 620}, {"loss": 0.8554, "grad_norm": 0.4583742916584015, "learning_rate": 0.0002, "epoch": 0.20335700451904454, "step": 630}, {"loss": 0.8698, "grad_norm": 0.47958165407180786, "learning_rate": 0.0002, "epoch": 0.2065848934796643, "step": 640}, {"loss": 0.8213, "grad_norm": 0.4526064097881317, "learning_rate": 0.0002, "epoch": 0.20981278244028406, "step": 650}, {"loss": 0.8313, "grad_norm": 0.45890581607818604, "learning_rate": 0.0002, "epoch": 0.2130406714009038, "step": 660}, {"loss": 0.8143, "grad_norm": 0.42725905776023865, "learning_rate": 0.0002, "epoch": 0.21626856036152356, "step": 670}, {"loss": 0.8675, "grad_norm": 0.40380963683128357, "learning_rate": 0.0002, "epoch": 0.21949644932214332, "step": 680}, {"loss": 0.9004, "grad_norm": 0.4372998774051666, "learning_rate": 0.0002, "epoch": 0.22272433828276308, "step": 690}, {"loss": 0.8208, "grad_norm": 0.4245864450931549, "learning_rate": 0.0002, "epoch": 0.22595222724338282, "step": 700}, {"loss": 0.8564, "grad_norm": 0.4061129689216614, "learning_rate": 0.0002, "epoch": 0.22918011620400258, "step": 710}, {"loss": 0.8275, "grad_norm": 0.474454790353775, "learning_rate": 0.0002, "epoch": 0.23240800516462234, "step": 720}, {"loss": 0.8346, "grad_norm": 0.4908486008644104, "learning_rate": 0.0002, "epoch": 0.23563589412524208, "step": 730}, {"loss": 0.8755, "grad_norm": 0.4284191429615021, "learning_rate": 0.0002, "epoch": 0.23886378308586184, "step": 740}, {"loss": 0.8387, "grad_norm": 0.44730308651924133, "learning_rate": 0.0002, "epoch": 0.2420916720464816, "step": 750}, {"loss": 0.8135, "grad_norm": 0.4433246850967407, "learning_rate": 0.0002, "epoch": 0.24531956100710137, "step": 760}, {"loss": 0.8644, "grad_norm": 0.43668854236602783, "learning_rate": 0.0002, "epoch": 0.2485474499677211, "step": 770}, {"loss": 0.8025, "grad_norm": 0.34324130415916443, "learning_rate": 0.0002, "epoch": 0.25177533892834086, "step": 780}, {"loss": 0.8725, "grad_norm": 0.46476295590400696, "learning_rate": 0.0002, "epoch": 0.2550032278889606, "step": 790}, {"loss": 0.8157, "grad_norm": 0.5047039985656738, "learning_rate": 0.0002, "epoch": 0.2582311168495804, "step": 800}, {"loss": 0.8643, "grad_norm": 0.4402127265930176, "learning_rate": 0.0002, "epoch": 0.26145900581020015, "step": 810}, {"loss": 0.8025, "grad_norm": 0.4642465114593506, "learning_rate": 0.0002, "epoch": 0.26468689477081986, "step": 820}, {"loss": 0.8836, "grad_norm": 0.40093424916267395, "learning_rate": 0.0002, "epoch": 0.2679147837314396, "step": 830}, {"loss": 0.83, "grad_norm": 0.42501842975616455, "learning_rate": 0.0002, "epoch": 0.2711426726920594, "step": 840}, {"loss": 0.8573, "grad_norm": 0.43279722332954407, "learning_rate": 0.0002, "epoch": 0.27437056165267915, "step": 850}, {"loss": 0.817, "grad_norm": 0.5991243720054626, "learning_rate": 0.0002, "epoch": 0.2775984506132989, "step": 860}, {"loss": 0.7981, "grad_norm": 0.4217848777770996, "learning_rate": 0.0002, "epoch": 0.28082633957391867, "step": 870}, {"loss": 0.8135, "grad_norm": 0.3933536410331726, "learning_rate": 0.0002, "epoch": 0.28405422853453843, "step": 880}, {"loss": 0.8846, "grad_norm": 0.5868505239486694, "learning_rate": 0.0002, "epoch": 0.28728211749515814, "step": 890}, {"loss": 0.8759, "grad_norm": 0.5209547877311707, "learning_rate": 0.0002, "epoch": 0.2905100064557779, "step": 900}, {"loss": 0.815, "grad_norm": 0.49307361245155334, "learning_rate": 0.0002, "epoch": 0.29373789541639767, "step": 910}, {"loss": 0.7813, "grad_norm": 0.4288382828235626, "learning_rate": 0.0002, "epoch": 0.2969657843770174, "step": 920}, {"loss": 0.8431, "grad_norm": 0.33568474650382996, "learning_rate": 0.0002, "epoch": 0.3001936733376372, "step": 930}, {"loss": 0.8455, "grad_norm": 1.0915930271148682, "learning_rate": 0.0002, "epoch": 0.30342156229825695, "step": 940}, {"loss": 0.8535, "grad_norm": 0.5489798188209534, "learning_rate": 0.0002, "epoch": 0.3066494512588767, "step": 950}, {"loss": 0.8031, "grad_norm": 0.42971742153167725, "learning_rate": 0.0002, "epoch": 0.3098773402194964, "step": 960}, {"loss": 0.8253, "grad_norm": 0.43375834822654724, "learning_rate": 0.0002, "epoch": 0.3131052291801162, "step": 970}, {"loss": 0.7747, "grad_norm": 0.47488611936569214, "learning_rate": 0.0002, "epoch": 0.31633311814073595, "step": 980}, {"loss": 0.7906, "grad_norm": 0.46296775341033936, "learning_rate": 0.0002, "epoch": 0.3195610071013557, "step": 990}, {"loss": 0.7948, "grad_norm": 0.4548890292644501, "learning_rate": 0.0002, "epoch": 0.32278889606197547, "step": 1000}, {"loss": 0.8856, "grad_norm": 0.41834497451782227, "learning_rate": 0.0002, "epoch": 0.32601678502259523, "step": 1010}, {"loss": 0.7791, "grad_norm": 0.441092312335968, "learning_rate": 0.0002, "epoch": 0.329244673983215, "step": 1020}, {"loss": 0.8191, "grad_norm": 0.637322187423706, "learning_rate": 0.0002, "epoch": 0.33247256294383476, "step": 1030}, {"loss": 0.8685, "grad_norm": 0.4374958574771881, "learning_rate": 0.0002, "epoch": 0.33570045190445447, "step": 1040}, {"loss": 0.8423, "grad_norm": 0.3935825824737549, "learning_rate": 0.0002, "epoch": 0.33892834086507423, "step": 1050}, {"loss": 0.8287, "grad_norm": 0.43526220321655273, "learning_rate": 0.0002, "epoch": 0.342156229825694, "step": 1060}, {"loss": 0.8413, "grad_norm": 0.45327696204185486, "learning_rate": 0.0002, "epoch": 0.34538411878631375, "step": 1070}, {"loss": 0.7421, "grad_norm": 0.4126075506210327, "learning_rate": 0.0002, "epoch": 0.3486120077469335, "step": 1080}, {"loss": 0.8427, "grad_norm": 0.4714072048664093, "learning_rate": 0.0002, "epoch": 0.3518398967075533, "step": 1090}, {"loss": 0.8028, "grad_norm": 0.518127977848053, "learning_rate": 0.0002, "epoch": 0.35506778566817304, "step": 1100}, {"loss": 0.8479, "grad_norm": 0.43264099955558777, "learning_rate": 0.0002, "epoch": 0.35829567462879275, "step": 1110}, {"loss": 0.8724, "grad_norm": 0.4857400357723236, "learning_rate": 0.0002, "epoch": 0.3615235635894125, "step": 1120}, {"loss": 0.7735, "grad_norm": 0.37591469287872314, "learning_rate": 0.0002, "epoch": 0.3647514525500323, "step": 1130}, {"loss": 0.8531, "grad_norm": 0.4165478050708771, "learning_rate": 0.0002, "epoch": 0.36797934151065204, "step": 1140}, {"loss": 0.8151, "grad_norm": 0.42911383509635925, "learning_rate": 0.0002, "epoch": 0.3712072304712718, "step": 1150}, {"loss": 0.8722, "grad_norm": 0.44980287551879883, "learning_rate": 0.0002, "epoch": 0.37443511943189156, "step": 1160}, {"loss": 0.7961, "grad_norm": 0.4066573679447174, "learning_rate": 0.0002, "epoch": 0.3776630083925113, "step": 1170}, {"loss": 0.8317, "grad_norm": 0.5056195855140686, "learning_rate": 0.0002, "epoch": 0.38089089735313103, "step": 1180}, {"loss": 0.8387, "grad_norm": 0.4141536355018616, "learning_rate": 0.0002, "epoch": 0.3841187863137508, "step": 1190}, {"loss": 0.8019, "grad_norm": 0.4501924514770508, "learning_rate": 0.0002, "epoch": 0.38734667527437056, "step": 1200}, {"loss": 0.8528, "grad_norm": 0.43304240703582764, "learning_rate": 0.0002, "epoch": 0.3905745642349903, "step": 1210}, {"loss": 0.8905, "grad_norm": 0.475777804851532, "learning_rate": 0.0002, "epoch": 0.3938024531956101, "step": 1220}, {"loss": 0.8643, "grad_norm": 0.5846465826034546, "learning_rate": 0.0002, "epoch": 0.39703034215622984, "step": 1230}, {"loss": 0.8078, "grad_norm": 0.42899325489997864, "learning_rate": 0.0002, "epoch": 0.4002582311168496, "step": 1240}, {"loss": 0.8415, "grad_norm": 0.3980463147163391, "learning_rate": 0.0002, "epoch": 0.4034861200774693, "step": 1250}, {"loss": 0.8026, "grad_norm": 0.45769768953323364, "learning_rate": 0.0002, "epoch": 0.4067140090380891, "step": 1260}, {"loss": 0.8377, "grad_norm": 0.5101280212402344, "learning_rate": 0.0002, "epoch": 0.40994189799870884, "step": 1270}, {"loss": 0.7905, "grad_norm": 0.47374317049980164, "learning_rate": 0.0002, "epoch": 0.4131697869593286, "step": 1280}, {"loss": 0.8172, "grad_norm": 0.4261878728866577, "learning_rate": 0.0002, "epoch": 0.41639767591994836, "step": 1290}, {"loss": 0.9004, "grad_norm": 0.46954256296157837, "learning_rate": 0.0002, "epoch": 0.4196255648805681, "step": 1300}, {"loss": 0.7868, "grad_norm": 0.5205738544464111, "learning_rate": 0.0002, "epoch": 0.4228534538411879, "step": 1310}, {"loss": 0.8964, "grad_norm": 0.5176340937614441, "learning_rate": 0.0002, "epoch": 0.4260813428018076, "step": 1320}, {"loss": 0.8764, "grad_norm": 0.5155916810035706, "learning_rate": 0.0002, "epoch": 0.42930923176242736, "step": 1330}, {"loss": 0.8197, "grad_norm": 0.44548553228378296, "learning_rate": 0.0002, "epoch": 0.4325371207230471, "step": 1340}, {"loss": 0.7873, "grad_norm": 0.5633558630943298, "learning_rate": 0.0002, "epoch": 0.4357650096836669, "step": 1350}, {"loss": 0.7889, "grad_norm": 0.42444056272506714, "learning_rate": 0.0002, "epoch": 0.43899289864428664, "step": 1360}, {"loss": 0.8588, "grad_norm": 0.5226860642433167, "learning_rate": 0.0002, "epoch": 0.4422207876049064, "step": 1370}, {"loss": 0.8232, "grad_norm": 0.5354582071304321, "learning_rate": 0.0002, "epoch": 0.44544867656552617, "step": 1380}, {"loss": 0.816, "grad_norm": 0.472646564245224, "learning_rate": 0.0002, "epoch": 0.4486765655261459, "step": 1390}, {"loss": 0.7953, "grad_norm": 0.6312310099601746, "learning_rate": 0.0002, "epoch": 0.45190445448676564, "step": 1400}, {"loss": 0.8212, "grad_norm": 0.4298408031463623, "learning_rate": 0.0002, "epoch": 0.4551323434473854, "step": 1410}, {"loss": 0.8447, "grad_norm": 0.43427202105522156, "learning_rate": 0.0002, "epoch": 0.45836023240800516, "step": 1420}, {"loss": 0.8342, "grad_norm": 0.44097861647605896, "learning_rate": 0.0002, "epoch": 0.4615881213686249, "step": 1430}, {"loss": 0.8301, "grad_norm": 0.5142693519592285, "learning_rate": 0.0002, "epoch": 0.4648160103292447, "step": 1440}, {"loss": 0.8144, "grad_norm": 0.46416547894477844, "learning_rate": 0.0002, "epoch": 0.46804389928986445, "step": 1450}, {"loss": 0.8342, "grad_norm": 0.4858551025390625, "learning_rate": 0.0002, "epoch": 0.47127178825048416, "step": 1460}, {"loss": 0.8354, "grad_norm": 0.4709177315235138, "learning_rate": 0.0002, "epoch": 0.4744996772111039, "step": 1470}, {"loss": 0.8391, "grad_norm": 0.5500252842903137, "learning_rate": 0.0002, "epoch": 0.4777275661717237, "step": 1480}, {"loss": 0.8359, "grad_norm": 0.43364381790161133, "learning_rate": 0.0002, "epoch": 0.48095545513234345, "step": 1490}, {"loss": 0.8446, "grad_norm": 0.47712287306785583, "learning_rate": 0.0002, "epoch": 0.4841833440929632, "step": 1500}, {"loss": 0.8518, "grad_norm": 0.4518495202064514, "learning_rate": 0.0002, "epoch": 0.48741123305358297, "step": 1510}, {"loss": 0.819, "grad_norm": 0.4539008140563965, "learning_rate": 0.0002, "epoch": 0.49063912201420273, "step": 1520}, {"loss": 0.8276, "grad_norm": 0.4993067979812622, "learning_rate": 0.0002, "epoch": 0.49386701097482244, "step": 1530}, {"loss": 0.8297, "grad_norm": 0.6094803214073181, "learning_rate": 0.0002, "epoch": 0.4970948999354422, "step": 1540}, {"loss": 0.8263, "grad_norm": 0.48602527379989624, "learning_rate": 0.0002, "epoch": 0.500322788896062, "step": 1550}, {"loss": 0.8182, "grad_norm": 0.40245795249938965, "learning_rate": 0.0002, "epoch": 0.5035506778566817, "step": 1560}, {"loss": 0.7907, "grad_norm": 0.456787645816803, "learning_rate": 0.0002, "epoch": 0.5067785668173015, "step": 1570}, {"loss": 0.86, "grad_norm": 0.43936216831207275, "learning_rate": 0.0002, "epoch": 0.5100064557779213, "step": 1580}, {"loss": 0.7928, "grad_norm": 0.549018144607544, "learning_rate": 0.0002, "epoch": 0.513234344738541, "step": 1590}, {"loss": 0.8169, "grad_norm": 0.41746795177459717, "learning_rate": 0.0002, "epoch": 0.5164622336991608, "step": 1600}, {"loss": 0.7868, "grad_norm": 0.4217053949832916, "learning_rate": 0.0002, "epoch": 0.5196901226597805, "step": 1610}, {"loss": 0.8161, "grad_norm": 0.449913889169693, "learning_rate": 0.0002, "epoch": 0.5229180116204003, "step": 1620}, {"loss": 0.7938, "grad_norm": 0.5084872245788574, "learning_rate": 0.0002, "epoch": 0.5261459005810201, "step": 1630}, {"loss": 0.8295, "grad_norm": 0.46248653531074524, "learning_rate": 0.0002, "epoch": 0.5293737895416397, "step": 1640}, {"loss": 0.7993, "grad_norm": 0.4824236035346985, "learning_rate": 0.0002, "epoch": 0.5326016785022595, "step": 1650}, {"loss": 0.8711, "grad_norm": 0.6010985374450684, "learning_rate": 0.0002, "epoch": 0.5358295674628792, "step": 1660}, {"loss": 0.8266, "grad_norm": 0.4757920801639557, "learning_rate": 0.0002, "epoch": 0.539057456423499, "step": 1670}, {"loss": 0.8182, "grad_norm": 0.45161882042884827, "learning_rate": 0.0002, "epoch": 0.5422853453841188, "step": 1680}, {"loss": 0.8141, "grad_norm": 0.49314990639686584, "learning_rate": 0.0002, "epoch": 0.5455132343447385, "step": 1690}, {"loss": 0.8091, "grad_norm": 0.3918305039405823, "learning_rate": 0.0002, "epoch": 0.5487411233053583, "step": 1700}, {"loss": 0.8177, "grad_norm": 0.5966728925704956, "learning_rate": 0.0002, "epoch": 0.551969012265978, "step": 1710}, {"loss": 0.8438, "grad_norm": 0.4208986163139343, "learning_rate": 0.0002, "epoch": 0.5551969012265978, "step": 1720}, {"loss": 0.817, "grad_norm": 0.43724218010902405, "learning_rate": 0.0002, "epoch": 0.5584247901872176, "step": 1730}, {"loss": 0.7956, "grad_norm": 0.5287272930145264, "learning_rate": 0.0002, "epoch": 0.5616526791478373, "step": 1740}, {"loss": 0.8557, "grad_norm": 0.4961899518966675, "learning_rate": 0.0002, "epoch": 0.5648805681084571, "step": 1750}, {"loss": 0.8029, "grad_norm": 0.4468635320663452, "learning_rate": 0.0002, "epoch": 0.5681084570690769, "step": 1760}, {"loss": 0.7968, "grad_norm": 0.6423530578613281, "learning_rate": 0.0002, "epoch": 0.5713363460296966, "step": 1770}, {"loss": 0.8324, "grad_norm": 0.4601971507072449, "learning_rate": 0.0002, "epoch": 0.5745642349903163, "step": 1780}, {"loss": 0.8171, "grad_norm": 0.46514901518821716, "learning_rate": 0.0002, "epoch": 0.577792123950936, "step": 1790}, {"loss": 0.8186, "grad_norm": 0.4771687388420105, "learning_rate": 0.0002, "epoch": 0.5810200129115558, "step": 1800}, {"loss": 0.856, "grad_norm": 0.46514490246772766, "learning_rate": 0.0002, "epoch": 0.5842479018721756, "step": 1810}, {"loss": 0.84, "grad_norm": 0.5373936295509338, "learning_rate": 0.0002, "epoch": 0.5874757908327953, "step": 1820}, {"loss": 0.8456, "grad_norm": 0.5175791382789612, "learning_rate": 0.0002, "epoch": 0.5907036797934151, "step": 1830}, {"loss": 0.7957, "grad_norm": 0.4522802233695984, "learning_rate": 0.0002, "epoch": 0.5939315687540349, "step": 1840}, {"loss": 0.8633, "grad_norm": 0.42987772822380066, "learning_rate": 0.0002, "epoch": 0.5971594577146546, "step": 1850}, {"loss": 0.7871, "grad_norm": 0.5566838383674622, "learning_rate": 0.0002, "epoch": 0.6003873466752744, "step": 1860}, {"loss": 0.8312, "grad_norm": 0.42807698249816895, "learning_rate": 0.0002, "epoch": 0.6036152356358941, "step": 1870}, {"loss": 0.8035, "grad_norm": 0.4957767724990845, "learning_rate": 0.0002, "epoch": 0.6068431245965139, "step": 1880}, {"loss": 0.8145, "grad_norm": 0.4260980188846588, "learning_rate": 0.0002, "epoch": 0.6100710135571337, "step": 1890}, {"loss": 0.8363, "grad_norm": 0.4777357876300812, "learning_rate": 0.0002, "epoch": 0.6132989025177534, "step": 1900}, {"loss": 0.8404, "grad_norm": 0.4434216022491455, "learning_rate": 0.0002, "epoch": 0.6165267914783732, "step": 1910}, {"loss": 0.8057, "grad_norm": 0.5215433835983276, "learning_rate": 0.0002, "epoch": 0.6197546804389928, "step": 1920}, {"loss": 0.82, "grad_norm": 0.5143248438835144, "learning_rate": 0.0002, "epoch": 0.6229825693996126, "step": 1930}, {"loss": 0.8107, "grad_norm": 0.5213413238525391, "learning_rate": 0.0002, "epoch": 0.6262104583602324, "step": 1940}, {"loss": 0.7549, "grad_norm": 0.5408226251602173, "learning_rate": 0.0002, "epoch": 0.6294383473208521, "step": 1950}, {"loss": 0.8405, "grad_norm": 0.5479708909988403, "learning_rate": 0.0002, "epoch": 0.6326662362814719, "step": 1960}, {"loss": 0.8138, "grad_norm": 0.4490949809551239, "learning_rate": 0.0002, "epoch": 0.6358941252420917, "step": 1970}, {"loss": 0.854, "grad_norm": 0.48815059661865234, "learning_rate": 0.0002, "epoch": 0.6391220142027114, "step": 1980}, {"loss": 0.8568, "grad_norm": 0.46498045325279236, "learning_rate": 0.0002, "epoch": 0.6423499031633312, "step": 1990}, {"loss": 0.8263, "grad_norm": 0.5136561393737793, "learning_rate": 0.0002, "epoch": 0.6455777921239509, "step": 2000}, {"loss": 0.8503, "grad_norm": 0.5145719647407532, "learning_rate": 0.0002, "epoch": 0.6488056810845707, "step": 2010}, {"loss": 0.8456, "grad_norm": 0.5430373549461365, "learning_rate": 0.0002, "epoch": 0.6520335700451905, "step": 2020}, {"loss": 0.8115, "grad_norm": 0.46347954869270325, "learning_rate": 0.0002, "epoch": 0.6552614590058102, "step": 2030}, {"loss": 0.8769, "grad_norm": 0.5189562439918518, "learning_rate": 0.0002, "epoch": 0.65848934796643, "step": 2040}, {"loss": 0.8453, "grad_norm": 0.43843990564346313, "learning_rate": 0.0002, "epoch": 0.6617172369270498, "step": 2050}, {"loss": 0.7951, "grad_norm": 0.4654983580112457, "learning_rate": 0.0002, "epoch": 0.6649451258876695, "step": 2060}, {"loss": 0.8308, "grad_norm": 0.44835716485977173, "learning_rate": 0.0002, "epoch": 0.6681730148482892, "step": 2070}, {"loss": 0.8181, "grad_norm": 0.38811734318733215, "learning_rate": 0.0002, "epoch": 0.6714009038089089, "step": 2080}, {"loss": 0.762, "grad_norm": 0.5709853172302246, "learning_rate": 0.0002, "epoch": 0.6746287927695287, "step": 2090}, {"loss": 0.8334, "grad_norm": 0.49994757771492004, "learning_rate": 0.0002, "epoch": 0.6778566817301485, "step": 2100}, {"loss": 0.8, "grad_norm": 0.5505402684211731, "learning_rate": 0.0002, "epoch": 0.6810845706907682, "step": 2110}, {"loss": 0.8227, "grad_norm": 0.48195120692253113, "learning_rate": 0.0002, "epoch": 0.684312459651388, "step": 2120}, {"loss": 0.7879, "grad_norm": 0.4854775071144104, "learning_rate": 0.0002, "epoch": 0.6875403486120077, "step": 2130}, {"loss": 0.8231, "grad_norm": 0.6422494649887085, "learning_rate": 0.0002, "epoch": 0.6907682375726275, "step": 2140}, {"loss": 0.8353, "grad_norm": 0.3972536027431488, "learning_rate": 0.0002, "epoch": 0.6939961265332473, "step": 2150}, {"loss": 0.8068, "grad_norm": 0.4297836422920227, "learning_rate": 0.0002, "epoch": 0.697224015493867, "step": 2160}, {"loss": 0.8017, "grad_norm": 0.45486778020858765, "learning_rate": 0.0002, "epoch": 0.7004519044544868, "step": 2170}, {"loss": 0.8507, "grad_norm": 0.4706047773361206, "learning_rate": 0.0002, "epoch": 0.7036797934151066, "step": 2180}, {"loss": 0.8234, "grad_norm": 0.46426892280578613, "learning_rate": 0.0002, "epoch": 0.7069076823757263, "step": 2190}, {"loss": 0.8472, "grad_norm": 0.46333715319633484, "learning_rate": 0.0002, "epoch": 0.7101355713363461, "step": 2200}, {"loss": 0.8247, "grad_norm": 0.4632524251937866, "learning_rate": 0.0002, "epoch": 0.7133634602969657, "step": 2210}, {"loss": 0.8452, "grad_norm": 0.4610830843448639, "learning_rate": 0.0002, "epoch": 0.7165913492575855, "step": 2220}, {"loss": 0.7338, "grad_norm": 0.4905324876308441, "learning_rate": 0.0002, "epoch": 0.7198192382182053, "step": 2230}, {"loss": 0.7715, "grad_norm": 0.4936263859272003, "learning_rate": 0.0002, "epoch": 0.723047127178825, "step": 2240}, {"loss": 0.8162, "grad_norm": 0.40778425335884094, "learning_rate": 0.0002, "epoch": 0.7262750161394448, "step": 2250}, {"loss": 0.828, "grad_norm": 0.50351482629776, "learning_rate": 0.0002, "epoch": 0.7295029051000645, "step": 2260}, {"loss": 0.8475, "grad_norm": 0.4894128143787384, "learning_rate": 0.0002, "epoch": 0.7327307940606843, "step": 2270}, {"loss": 0.8087, "grad_norm": 0.5580906271934509, "learning_rate": 0.0002, "epoch": 0.7359586830213041, "step": 2280}, {"loss": 0.8157, "grad_norm": 0.4655369520187378, "learning_rate": 0.0002, "epoch": 0.7391865719819238, "step": 2290}, {"loss": 0.8395, "grad_norm": 0.4666965901851654, "learning_rate": 0.0002, "epoch": 0.7424144609425436, "step": 2300}, {"loss": 0.7605, "grad_norm": 0.46259936690330505, "learning_rate": 0.0002, "epoch": 0.7456423499031634, "step": 2310}, {"loss": 0.7849, "grad_norm": 0.520706832408905, "learning_rate": 0.0002, "epoch": 0.7488702388637831, "step": 2320}, {"loss": 0.8173, "grad_norm": 0.5142408013343811, "learning_rate": 0.0002, "epoch": 0.7520981278244029, "step": 2330}, {"loss": 0.7782, "grad_norm": 0.5355164408683777, "learning_rate": 0.0002, "epoch": 0.7553260167850226, "step": 2340}, {"loss": 0.8242, "grad_norm": 0.5517185926437378, "learning_rate": 0.0002, "epoch": 0.7585539057456423, "step": 2350}, {"loss": 0.8404, "grad_norm": 0.7162677049636841, "learning_rate": 0.0002, "epoch": 0.7617817947062621, "step": 2360}, {"loss": 0.8455, "grad_norm": 0.42402133345603943, "learning_rate": 0.0002, "epoch": 0.7650096836668818, "step": 2370}, {"loss": 0.8214, "grad_norm": 0.47180113196372986, "learning_rate": 0.0002, "epoch": 0.7682375726275016, "step": 2380}, {"loss": 0.8274, "grad_norm": 0.6262288689613342, "learning_rate": 0.0002, "epoch": 0.7714654615881213, "step": 2390}, {"loss": 0.7915, "grad_norm": 0.5177528262138367, "learning_rate": 0.0002, "epoch": 0.7746933505487411, "step": 2400}, {"loss": 0.7631, "grad_norm": 0.555721640586853, "learning_rate": 0.0002, "epoch": 0.7779212395093609, "step": 2410}, {"loss": 0.795, "grad_norm": 0.5592644810676575, "learning_rate": 0.0002, "epoch": 0.7811491284699806, "step": 2420}, {"loss": 0.8081, "grad_norm": 0.38025397062301636, "learning_rate": 0.0002, "epoch": 0.7843770174306004, "step": 2430}, {"loss": 0.7851, "grad_norm": 0.4597472548484802, "learning_rate": 0.0002, "epoch": 0.7876049063912202, "step": 2440}, {"loss": 0.8575, "grad_norm": 0.4929825961589813, "learning_rate": 0.0002, "epoch": 0.7908327953518399, "step": 2450}, {"loss": 0.7584, "grad_norm": 0.45277655124664307, "learning_rate": 0.0002, "epoch": 0.7940606843124597, "step": 2460}, {"loss": 0.8208, "grad_norm": 0.6224122643470764, "learning_rate": 0.0002, "epoch": 0.7972885732730794, "step": 2470}, {"loss": 0.8449, "grad_norm": 0.5740901827812195, "learning_rate": 0.0002, "epoch": 0.8005164622336992, "step": 2480}, {"loss": 0.7834, "grad_norm": 0.41335329413414, "learning_rate": 0.0002, "epoch": 0.8037443511943189, "step": 2490}, {"loss": 0.7768, "grad_norm": 0.4738694131374359, "learning_rate": 0.0002, "epoch": 0.8069722401549386, "step": 2500}, {"loss": 0.7927, "grad_norm": 0.5288197994232178, "learning_rate": 0.0002, "epoch": 0.8102001291155584, "step": 2510}, {"loss": 0.8334, "grad_norm": 0.5404666066169739, "learning_rate": 0.0002, "epoch": 0.8134280180761781, "step": 2520}, {"loss": 0.7998, "grad_norm": 0.4444909691810608, "learning_rate": 0.0002, "epoch": 0.8166559070367979, "step": 2530}, {"loss": 0.8683, "grad_norm": 0.542061448097229, "learning_rate": 0.0002, "epoch": 0.8198837959974177, "step": 2540}, {"loss": 0.8038, "grad_norm": 0.4914741814136505, "learning_rate": 0.0002, "epoch": 0.8231116849580374, "step": 2550}, {"loss": 0.7899, "grad_norm": 0.41703441739082336, "learning_rate": 0.0002, "epoch": 0.8263395739186572, "step": 2560}, {"loss": 0.824, "grad_norm": 0.5489841103553772, "learning_rate": 0.0002, "epoch": 0.829567462879277, "step": 2570}, {"loss": 0.8157, "grad_norm": 0.5359883308410645, "learning_rate": 0.0002, "epoch": 0.8327953518398967, "step": 2580}, {"loss": 0.8122, "grad_norm": 0.5541019439697266, "learning_rate": 0.0002, "epoch": 0.8360232408005165, "step": 2590}, {"loss": 0.797, "grad_norm": 0.4746638834476471, "learning_rate": 0.0002, "epoch": 0.8392511297611362, "step": 2600}, {"loss": 0.8116, "grad_norm": 0.5243194103240967, "learning_rate": 0.0002, "epoch": 0.842479018721756, "step": 2610}, {"loss": 0.8173, "grad_norm": 0.46824976801872253, "learning_rate": 0.0002, "epoch": 0.8457069076823758, "step": 2620}, {"loss": 0.7525, "grad_norm": 0.49487847089767456, "learning_rate": 0.0002, "epoch": 0.8489347966429954, "step": 2630}, {"loss": 0.8296, "grad_norm": 0.42180097103118896, "learning_rate": 0.0002, "epoch": 0.8521626856036152, "step": 2640}, {"loss": 0.8304, "grad_norm": 0.5516560077667236, "learning_rate": 0.0002, "epoch": 0.855390574564235, "step": 2650}, {"loss": 0.7882, "grad_norm": 0.4392191767692566, "learning_rate": 0.0002, "epoch": 0.8586184635248547, "step": 2660}, {"loss": 0.848, "grad_norm": 0.5387210845947266, "learning_rate": 0.0002, "epoch": 0.8618463524854745, "step": 2670}, {"loss": 0.8094, "grad_norm": 0.6232406497001648, "learning_rate": 0.0002, "epoch": 0.8650742414460942, "step": 2680}, {"loss": 0.768, "grad_norm": 0.53749018907547, "learning_rate": 0.0002, "epoch": 0.868302130406714, "step": 2690}, {"loss": 0.8299, "grad_norm": 0.47480374574661255, "learning_rate": 0.0002, "epoch": 0.8715300193673338, "step": 2700}, {"loss": 0.8055, "grad_norm": 0.44618046283721924, "learning_rate": 0.0002, "epoch": 0.8747579083279535, "step": 2710}, {"loss": 0.8015, "grad_norm": 0.4173581302165985, "learning_rate": 0.0002, "epoch": 0.8779857972885733, "step": 2720}, {"loss": 0.7713, "grad_norm": 0.524081289768219, "learning_rate": 0.0002, "epoch": 0.881213686249193, "step": 2730}, {"loss": 0.8738, "grad_norm": 0.5608431100845337, "learning_rate": 0.0002, "epoch": 0.8844415752098128, "step": 2740}, {"loss": 0.8513, "grad_norm": 0.5212284922599792, "learning_rate": 0.0002, "epoch": 0.8876694641704326, "step": 2750}, {"loss": 0.8139, "grad_norm": 0.5601475834846497, "learning_rate": 0.0002, "epoch": 0.8908973531310523, "step": 2760}, {"loss": 0.7947, "grad_norm": 0.4499223828315735, "learning_rate": 0.0002, "epoch": 0.8941252420916721, "step": 2770}, {"loss": 0.8559, "grad_norm": 0.46945226192474365, "learning_rate": 0.0002, "epoch": 0.8973531310522918, "step": 2780}, {"loss": 0.801, "grad_norm": 0.4837495684623718, "learning_rate": 0.0002, "epoch": 0.9005810200129115, "step": 2790}, {"loss": 0.7887, "grad_norm": 0.5059258937835693, "learning_rate": 0.0002, "epoch": 0.9038089089735313, "step": 2800}, {"loss": 0.8571, "grad_norm": 0.4857945144176483, "learning_rate": 0.0002, "epoch": 0.907036797934151, "step": 2810}, {"loss": 0.8301, "grad_norm": 0.5001962780952454, "learning_rate": 0.0002, "epoch": 0.9102646868947708, "step": 2820}, {"loss": 0.8236, "grad_norm": 0.5468648672103882, "learning_rate": 0.0002, "epoch": 0.9134925758553906, "step": 2830}, {"loss": 0.8071, "grad_norm": 0.5533056259155273, "learning_rate": 0.0002, "epoch": 0.9167204648160103, "step": 2840}, {"loss": 0.7895, "grad_norm": 0.5909785628318787, "learning_rate": 0.0002, "epoch": 0.9199483537766301, "step": 2850}, {"loss": 0.796, "grad_norm": 0.47428104281425476, "learning_rate": 0.0002, "epoch": 0.9231762427372499, "step": 2860}, {"loss": 0.7845, "grad_norm": 0.548814058303833, "learning_rate": 0.0002, "epoch": 0.9264041316978696, "step": 2870}, {"loss": 0.7871, "grad_norm": 0.5576745271682739, "learning_rate": 0.0002, "epoch": 0.9296320206584894, "step": 2880}, {"loss": 0.8399, "grad_norm": 0.47094792127609253, "learning_rate": 0.0002, "epoch": 0.9328599096191091, "step": 2890}, {"loss": 0.805, "grad_norm": 0.5408539772033691, "learning_rate": 0.0002, "epoch": 0.9360877985797289, "step": 2900}, {"loss": 0.785, "grad_norm": 0.5922889113426208, "learning_rate": 0.0002, "epoch": 0.9393156875403487, "step": 2910}, {"loss": 0.8043, "grad_norm": 0.45462584495544434, "learning_rate": 0.0002, "epoch": 0.9425435765009683, "step": 2920}, {"loss": 0.8344, "grad_norm": 0.6864947080612183, "learning_rate": 0.0002, "epoch": 0.9457714654615881, "step": 2930}, {"loss": 0.8166, "grad_norm": 0.4706299304962158, "learning_rate": 0.0002, "epoch": 0.9489993544222078, "step": 2940}, {"loss": 0.8422, "grad_norm": 0.5583269596099854, "learning_rate": 0.0002, "epoch": 0.9522272433828276, "step": 2950}, {"loss": 0.836, "grad_norm": 0.51015704870224, "learning_rate": 0.0002, "epoch": 0.9554551323434474, "step": 2960}, {"loss": 0.8371, "grad_norm": 0.5325582027435303, "learning_rate": 0.0002, "epoch": 0.9586830213040671, "step": 2970}, {"loss": 0.7593, "grad_norm": 0.49008598923683167, "learning_rate": 0.0002, "epoch": 0.9619109102646869, "step": 2980}, {"loss": 0.8093, "grad_norm": 0.4422132074832916, "learning_rate": 0.0002, "epoch": 0.9651387992253067, "step": 2990}, {"loss": 0.7966, "grad_norm": 0.5053589344024658, "learning_rate": 0.0002, "epoch": 0.9683666881859264, "step": 3000}, {"loss": 0.8081, "grad_norm": 0.46754521131515503, "learning_rate": 0.0002, "epoch": 0.9715945771465462, "step": 3010}, {"loss": 0.8377, "grad_norm": 0.5613434910774231, "learning_rate": 0.0002, "epoch": 0.9748224661071659, "step": 3020}, {"loss": 0.7856, "grad_norm": 0.5052843689918518, "learning_rate": 0.0002, "epoch": 0.9780503550677857, "step": 3030}, {"loss": 0.8412, "grad_norm": 0.4270972013473511, "learning_rate": 0.0002, "epoch": 0.9812782440284055, "step": 3040}, {"loss": 0.8353, "grad_norm": 0.4974991977214813, "learning_rate": 0.0002, "epoch": 0.9845061329890252, "step": 3050}, {"loss": 0.8415, "grad_norm": 0.4432311952114105, "learning_rate": 0.0002, "epoch": 0.9877340219496449, "step": 3060}, {"loss": 0.7764, "grad_norm": 0.466457724571228, "learning_rate": 0.0002, "epoch": 0.9909619109102646, "step": 3070}, {"loss": 0.8067, "grad_norm": 0.6438009142875671, "learning_rate": 0.0002, "epoch": 0.9941897998708844, "step": 3080}, {"loss": 0.8425, "grad_norm": 0.5593604445457458, "learning_rate": 0.0002, "epoch": 0.9974176888315042, "step": 3090}, {"eval_loss": 1.0958120822906494, "eval_runtime": 148.3273, "eval_samples_per_second": 4.942, "eval_steps_per_second": 0.62, "epoch": 1.0, "step": 3098}, {"loss": 0.8275, "grad_norm": 0.5701445937156677, "learning_rate": 0.0002, "epoch": 1.000645577792124, "step": 3100}, {"loss": 0.7756, "grad_norm": 0.6089657545089722, "learning_rate": 0.0002, "epoch": 1.0038734667527438, "step": 3110}, {"loss": 0.7492, "grad_norm": 0.5619552135467529, "learning_rate": 0.0002, "epoch": 1.0071013557133635, "step": 3120}, {"loss": 0.7544, "grad_norm": 0.5550283789634705, "learning_rate": 0.0002, "epoch": 1.010329244673983, "step": 3130}, {"loss": 0.8006, "grad_norm": 0.6221792101860046, "learning_rate": 0.0002, "epoch": 1.013557133634603, "step": 3140}, {"loss": 0.7603, "grad_norm": 0.5450758934020996, "learning_rate": 0.0002, "epoch": 1.0167850225952226, "step": 3150}, {"loss": 0.7021, "grad_norm": 0.4359588027000427, "learning_rate": 0.0002, "epoch": 1.0200129115558425, "step": 3160}, {"loss": 0.7468, "grad_norm": 0.5932239890098572, "learning_rate": 0.0002, "epoch": 1.0232408005164622, "step": 3170}, {"loss": 0.7649, "grad_norm": 0.45478707551956177, "learning_rate": 0.0002, "epoch": 1.026468689477082, "step": 3180}, {"loss": 0.7355, "grad_norm": 0.677615761756897, "learning_rate": 0.0002, "epoch": 1.0296965784377017, "step": 3190}, {"loss": 0.6928, "grad_norm": 0.6231790781021118, "learning_rate": 0.0002, "epoch": 1.0329244673983216, "step": 3200}, {"loss": 0.7471, "grad_norm": 0.5074195861816406, "learning_rate": 0.0002, "epoch": 1.0361523563589412, "step": 3210}, {"loss": 0.6864, "grad_norm": 0.4844142198562622, "learning_rate": 0.0002, "epoch": 1.039380245319561, "step": 3220}, {"loss": 0.7655, "grad_norm": 0.5372750759124756, "learning_rate": 0.0002, "epoch": 1.0426081342801807, "step": 3230}, {"loss": 0.7384, "grad_norm": 0.46296265721321106, "learning_rate": 0.0002, "epoch": 1.0458360232408006, "step": 3240}, {"loss": 0.7894, "grad_norm": 0.5417148470878601, "learning_rate": 0.0002, "epoch": 1.0490639122014203, "step": 3250}, {"loss": 0.7637, "grad_norm": 0.5695074200630188, "learning_rate": 0.0002, "epoch": 1.0522918011620401, "step": 3260}, {"loss": 0.7456, "grad_norm": 0.5050092935562134, "learning_rate": 0.0002, "epoch": 1.0555196901226598, "step": 3270}, {"loss": 0.6805, "grad_norm": 0.5320752263069153, "learning_rate": 0.0002, "epoch": 1.0587475790832794, "step": 3280}, {"loss": 0.7419, "grad_norm": 0.5832052230834961, "learning_rate": 0.0002, "epoch": 1.0619754680438993, "step": 3290}, {"loss": 0.7656, "grad_norm": 0.5228804349899292, "learning_rate": 0.0002, "epoch": 1.065203357004519, "step": 3300}, {"loss": 0.6834, "grad_norm": 0.5819445252418518, "learning_rate": 0.0002, "epoch": 1.0684312459651388, "step": 3310}, {"loss": 0.7093, "grad_norm": 0.4201328754425049, "learning_rate": 0.0002, "epoch": 1.0716591349257585, "step": 3320}, {"loss": 0.7494, "grad_norm": 0.5424145460128784, "learning_rate": 0.0002, "epoch": 1.0748870238863784, "step": 3330}, {"loss": 0.7828, "grad_norm": 0.6169946789741516, "learning_rate": 0.0002, "epoch": 1.078114912846998, "step": 3340}, {"loss": 0.7505, "grad_norm": 0.607676088809967, "learning_rate": 0.0002, "epoch": 1.0813428018076179, "step": 3350}, {"loss": 0.7315, "grad_norm": 0.5191982388496399, "learning_rate": 0.0002, "epoch": 1.0845706907682375, "step": 3360}, {"loss": 0.7699, "grad_norm": 0.5728003978729248, "learning_rate": 0.0002, "epoch": 1.0877985797288574, "step": 3370}, {"loss": 0.7381, "grad_norm": 0.5402643084526062, "learning_rate": 0.0002, "epoch": 1.091026468689477, "step": 3380}, {"loss": 0.7208, "grad_norm": 0.5377541780471802, "learning_rate": 0.0002, "epoch": 1.094254357650097, "step": 3390}, {"loss": 0.7672, "grad_norm": 0.4751385748386383, "learning_rate": 0.0002, "epoch": 1.0974822466107166, "step": 3400}, {"loss": 0.7326, "grad_norm": 0.559158444404602, "learning_rate": 0.0002, "epoch": 1.1007101355713362, "step": 3410}, {"loss": 0.7366, "grad_norm": 0.4917701482772827, "learning_rate": 0.0002, "epoch": 1.103938024531956, "step": 3420}, {"loss": 0.7593, "grad_norm": 0.5507875084877014, "learning_rate": 0.0002, "epoch": 1.1071659134925758, "step": 3430}, {"loss": 0.7424, "grad_norm": 0.45458680391311646, "learning_rate": 0.0002, "epoch": 1.1103938024531956, "step": 3440}, {"loss": 0.7234, "grad_norm": 0.5721744894981384, "learning_rate": 0.0002, "epoch": 1.1136216914138153, "step": 3450}, {"loss": 0.7219, "grad_norm": 0.5776081681251526, "learning_rate": 0.0002, "epoch": 1.1168495803744352, "step": 3460}, {"loss": 0.7644, "grad_norm": 0.5261953473091125, "learning_rate": 0.0002, "epoch": 1.1200774693350548, "step": 3470}, {"loss": 0.6586, "grad_norm": 0.47759532928466797, "learning_rate": 0.0002, "epoch": 1.1233053582956747, "step": 3480}, {"loss": 0.7641, "grad_norm": 0.5697659850120544, "learning_rate": 0.0002, "epoch": 1.1265332472562943, "step": 3490}, {"loss": 0.7017, "grad_norm": 0.5643419623374939, "learning_rate": 0.0002, "epoch": 1.1297611362169142, "step": 3500}, {"loss": 0.7235, "grad_norm": 0.6502931118011475, "learning_rate": 0.0002, "epoch": 1.1329890251775339, "step": 3510}, {"loss": 0.7662, "grad_norm": 0.5236507654190063, "learning_rate": 0.0002, "epoch": 1.1362169141381537, "step": 3520}, {"loss": 0.7571, "grad_norm": 0.6521499156951904, "learning_rate": 0.0002, "epoch": 1.1394448030987734, "step": 3530}, {"loss": 0.7304, "grad_norm": 0.5893217325210571, "learning_rate": 0.0002, "epoch": 1.142672692059393, "step": 3540}, {"loss": 0.7508, "grad_norm": 0.5300073027610779, "learning_rate": 0.0002, "epoch": 1.145900581020013, "step": 3550}, {"loss": 0.6937, "grad_norm": 0.6794660091400146, "learning_rate": 0.0002, "epoch": 1.1491284699806328, "step": 3560}, {"loss": 0.7614, "grad_norm": 0.5420064926147461, "learning_rate": 0.0002, "epoch": 1.1523563589412524, "step": 3570}, {"loss": 0.7648, "grad_norm": 0.5096590518951416, "learning_rate": 0.0002, "epoch": 1.155584247901872, "step": 3580}, {"loss": 0.7436, "grad_norm": 0.5726043581962585, "learning_rate": 0.0002, "epoch": 1.158812136862492, "step": 3590}, {"loss": 0.7728, "grad_norm": 0.7388110160827637, "learning_rate": 0.0002, "epoch": 1.1620400258231116, "step": 3600}, {"loss": 0.7421, "grad_norm": 0.5597969889640808, "learning_rate": 0.0002, "epoch": 1.1652679147837315, "step": 3610}, {"loss": 0.7132, "grad_norm": 0.5067800283432007, "learning_rate": 0.0002, "epoch": 1.1684958037443511, "step": 3620}, {"loss": 0.7893, "grad_norm": 0.6625118255615234, "learning_rate": 0.0002, "epoch": 1.171723692704971, "step": 3630}, {"loss": 0.7611, "grad_norm": 0.5830849409103394, "learning_rate": 0.0002, "epoch": 1.1749515816655907, "step": 3640}, {"loss": 0.7973, "grad_norm": 0.6140692830085754, "learning_rate": 0.0002, "epoch": 1.1781794706262105, "step": 3650}, {"loss": 0.7617, "grad_norm": 0.714523434638977, "learning_rate": 0.0002, "epoch": 1.1814073595868302, "step": 3660}, {"loss": 0.7092, "grad_norm": 0.5196696519851685, "learning_rate": 0.0002, "epoch": 1.18463524854745, "step": 3670}, {"loss": 0.7821, "grad_norm": 0.6677889823913574, "learning_rate": 0.0002, "epoch": 1.1878631375080697, "step": 3680}, {"loss": 0.7813, "grad_norm": 0.47095245122909546, "learning_rate": 0.0002, "epoch": 1.1910910264686896, "step": 3690}, {"loss": 0.7702, "grad_norm": 0.5197778940200806, "learning_rate": 0.0002, "epoch": 1.1943189154293092, "step": 3700}, {"loss": 0.7349, "grad_norm": 0.5156530141830444, "learning_rate": 0.0002, "epoch": 1.1975468043899289, "step": 3710}, {"loss": 0.7738, "grad_norm": 0.6968549489974976, "learning_rate": 0.0002, "epoch": 1.2007746933505488, "step": 3720}, {"loss": 0.7599, "grad_norm": 0.48983848094940186, "learning_rate": 0.0002, "epoch": 1.2040025823111684, "step": 3730}, {"loss": 0.7163, "grad_norm": 0.6709973216056824, "learning_rate": 0.0002, "epoch": 1.2072304712717883, "step": 3740}, {"loss": 0.7632, "grad_norm": 0.48681750893592834, "learning_rate": 0.0002, "epoch": 1.210458360232408, "step": 3750}, {"loss": 0.7039, "grad_norm": 0.49475061893463135, "learning_rate": 0.0002, "epoch": 1.2136862491930278, "step": 3760}, {"loss": 0.7372, "grad_norm": 0.6163983345031738, "learning_rate": 0.0002, "epoch": 1.2169141381536475, "step": 3770}, {"loss": 0.757, "grad_norm": 0.5481411218643188, "learning_rate": 0.0002, "epoch": 1.2201420271142673, "step": 3780}, {"loss": 0.7601, "grad_norm": 0.620639979839325, "learning_rate": 0.0002, "epoch": 1.223369916074887, "step": 3790}, {"loss": 0.7738, "grad_norm": 0.7017222046852112, "learning_rate": 0.0002, "epoch": 1.2265978050355069, "step": 3800}, {"loss": 0.7468, "grad_norm": 0.5872400403022766, "learning_rate": 0.0002, "epoch": 1.2298256939961265, "step": 3810}, {"loss": 0.7854, "grad_norm": 0.45765596628189087, "learning_rate": 0.0002, "epoch": 1.2330535829567464, "step": 3820}, {"loss": 0.7865, "grad_norm": 0.5676377415657043, "learning_rate": 0.0002, "epoch": 1.236281471917366, "step": 3830}, {"loss": 0.7696, "grad_norm": 0.4793425500392914, "learning_rate": 0.0002, "epoch": 1.2395093608779857, "step": 3840}, {"loss": 0.7065, "grad_norm": 0.5060022473335266, "learning_rate": 0.0002, "epoch": 1.2427372498386056, "step": 3850}, {"loss": 0.7333, "grad_norm": 0.6140682697296143, "learning_rate": 0.0002, "epoch": 1.2459651387992252, "step": 3860}, {"loss": 0.7496, "grad_norm": 0.5030326843261719, "learning_rate": 0.0002, "epoch": 1.249193027759845, "step": 3870}, {"loss": 0.7226, "grad_norm": 0.6609430909156799, "learning_rate": 0.0002, "epoch": 1.2524209167204647, "step": 3880}, {"loss": 0.7212, "grad_norm": 0.5459545850753784, "learning_rate": 0.0002, "epoch": 1.2556488056810846, "step": 3890}, {"loss": 0.7145, "grad_norm": 0.5328870415687561, "learning_rate": 0.0002, "epoch": 1.2588766946417043, "step": 3900}, {"loss": 0.7572, "grad_norm": 0.5840652585029602, "learning_rate": 0.0002, "epoch": 1.2621045836023241, "step": 3910}, {"loss": 0.7624, "grad_norm": 0.5587584376335144, "learning_rate": 0.0002, "epoch": 1.2653324725629438, "step": 3920}, {"loss": 0.7846, "grad_norm": 0.5886949896812439, "learning_rate": 0.0002, "epoch": 1.2685603615235637, "step": 3930}, {"loss": 0.7251, "grad_norm": 0.5128693580627441, "learning_rate": 0.0002, "epoch": 1.2717882504841833, "step": 3940}, {"loss": 0.7032, "grad_norm": 0.6207669377326965, "learning_rate": 0.0002, "epoch": 1.2750161394448032, "step": 3950}, {"loss": 0.7506, "grad_norm": 0.5789574384689331, "learning_rate": 0.0002, "epoch": 1.2782440284054228, "step": 3960}, {"loss": 0.7574, "grad_norm": 0.503162145614624, "learning_rate": 0.0002, "epoch": 1.2814719173660425, "step": 3970}, {"loss": 0.7489, "grad_norm": 0.6670064926147461, "learning_rate": 0.0002, "epoch": 1.2846998063266624, "step": 3980}, {"loss": 0.7198, "grad_norm": 0.5676213502883911, "learning_rate": 0.0002, "epoch": 1.2879276952872822, "step": 3990}, {"loss": 0.7892, "grad_norm": 0.5383169054985046, "learning_rate": 0.0002, "epoch": 1.2911555842479019, "step": 4000}, {"loss": 0.7432, "grad_norm": 0.714743971824646, "learning_rate": 0.0002, "epoch": 1.2943834732085215, "step": 4010}, {"loss": 0.7594, "grad_norm": 0.5740262269973755, "learning_rate": 0.0002, "epoch": 1.2976113621691414, "step": 4020}, {"loss": 0.7564, "grad_norm": 0.6143045425415039, "learning_rate": 0.0002, "epoch": 1.300839251129761, "step": 4030}, {"loss": 0.7181, "grad_norm": 0.501025378704071, "learning_rate": 0.0002, "epoch": 1.304067140090381, "step": 4040}, {"loss": 0.7099, "grad_norm": 0.5784100294113159, "learning_rate": 0.0002, "epoch": 1.3072950290510006, "step": 4050}, {"loss": 0.7403, "grad_norm": 0.6182606220245361, "learning_rate": 0.0002, "epoch": 1.3105229180116205, "step": 4060}, {"loss": 0.7249, "grad_norm": 0.5072231292724609, "learning_rate": 0.0002, "epoch": 1.3137508069722401, "step": 4070}, {"loss": 0.7451, "grad_norm": 0.6841012835502625, "learning_rate": 0.0002, "epoch": 1.31697869593286, "step": 4080}, {"loss": 0.7395, "grad_norm": 0.697257936000824, "learning_rate": 0.0002, "epoch": 1.3202065848934796, "step": 4090}, {"loss": 0.7401, "grad_norm": 0.5113214254379272, "learning_rate": 0.0002, "epoch": 1.3234344738540993, "step": 4100}, {"loss": 0.7336, "grad_norm": 0.6270561814308167, "learning_rate": 0.0002, "epoch": 1.3266623628147192, "step": 4110}, {"loss": 0.7535, "grad_norm": 0.5525947213172913, "learning_rate": 0.0002, "epoch": 1.329890251775339, "step": 4120}, {"loss": 0.6999, "grad_norm": 0.546071469783783, "learning_rate": 0.0002, "epoch": 1.3331181407359587, "step": 4130}, {"loss": 0.7884, "grad_norm": 0.6516721248626709, "learning_rate": 0.0002, "epoch": 1.3363460296965783, "step": 4140}, {"loss": 0.755, "grad_norm": 0.6235111355781555, "learning_rate": 0.0002, "epoch": 1.3395739186571982, "step": 4150}, {"loss": 0.7467, "grad_norm": 0.538649320602417, "learning_rate": 0.0002, "epoch": 1.3428018076178179, "step": 4160}, {"loss": 0.7368, "grad_norm": 0.5367001891136169, "learning_rate": 0.0002, "epoch": 1.3460296965784377, "step": 4170}, {"loss": 0.7536, "grad_norm": 0.6134631037712097, "learning_rate": 0.0002, "epoch": 1.3492575855390574, "step": 4180}, {"loss": 0.8245, "grad_norm": 0.5827262997627258, "learning_rate": 0.0002, "epoch": 1.3524854744996773, "step": 4190}, {"loss": 0.7288, "grad_norm": 0.5706096291542053, "learning_rate": 0.0002, "epoch": 1.355713363460297, "step": 4200}, {"loss": 0.7302, "grad_norm": 0.6422057151794434, "learning_rate": 0.0002, "epoch": 1.3589412524209168, "step": 4210}, {"loss": 0.7303, "grad_norm": 0.6316141486167908, "learning_rate": 0.0002, "epoch": 1.3621691413815364, "step": 4220}, {"loss": 0.7457, "grad_norm": 0.6946983933448792, "learning_rate": 0.0002, "epoch": 1.365397030342156, "step": 4230}, {"loss": 0.7388, "grad_norm": 0.5381525754928589, "learning_rate": 0.0002, "epoch": 1.368624919302776, "step": 4240}, {"loss": 0.73, "grad_norm": 0.5484845638275146, "learning_rate": 0.0002, "epoch": 1.3718528082633958, "step": 4250}, {"loss": 0.7584, "grad_norm": 0.5961896777153015, "learning_rate": 0.0002, "epoch": 1.3750806972240155, "step": 4260}, {"loss": 0.8006, "grad_norm": 0.6041752696037292, "learning_rate": 0.0002, "epoch": 1.3783085861846351, "step": 4270}, {"loss": 0.7276, "grad_norm": 0.6283464431762695, "learning_rate": 0.0002, "epoch": 1.381536475145255, "step": 4280}, {"loss": 0.757, "grad_norm": 0.6761324405670166, "learning_rate": 0.0002, "epoch": 1.384764364105875, "step": 4290}, {"loss": 0.7381, "grad_norm": 0.504311203956604, "learning_rate": 0.0002, "epoch": 1.3879922530664945, "step": 4300}, {"loss": 0.7536, "grad_norm": 0.6100395917892456, "learning_rate": 0.0002, "epoch": 1.3912201420271142, "step": 4310}, {"loss": 0.7103, "grad_norm": 0.6245788335800171, "learning_rate": 0.0002, "epoch": 1.394448030987734, "step": 4320}, {"loss": 0.7505, "grad_norm": 0.6074621081352234, "learning_rate": 0.0002, "epoch": 1.3976759199483537, "step": 4330}, {"loss": 0.752, "grad_norm": 0.6683838963508606, "learning_rate": 0.0002, "epoch": 1.4009038089089736, "step": 4340}, {"loss": 0.7537, "grad_norm": 0.622998058795929, "learning_rate": 0.0002, "epoch": 1.4041316978695932, "step": 4350}, {"loss": 0.8148, "grad_norm": 0.6089423894882202, "learning_rate": 0.0002, "epoch": 1.4073595868302131, "step": 4360}, {"loss": 0.7715, "grad_norm": 0.6381658911705017, "learning_rate": 0.0002, "epoch": 1.4105874757908328, "step": 4370}, {"loss": 0.7871, "grad_norm": 0.5419308543205261, "learning_rate": 0.0002, "epoch": 1.4138153647514526, "step": 4380}, {"loss": 0.7386, "grad_norm": 0.6026232242584229, "learning_rate": 0.0002, "epoch": 1.4170432537120723, "step": 4390}, {"loss": 0.7529, "grad_norm": 0.4911101162433624, "learning_rate": 0.0002, "epoch": 1.420271142672692, "step": 4400}, {"loss": 0.7495, "grad_norm": 0.6302908062934875, "learning_rate": 0.0002, "epoch": 1.4234990316333118, "step": 4410}, {"loss": 0.7446, "grad_norm": 0.6692768931388855, "learning_rate": 0.0002, "epoch": 1.4267269205939317, "step": 4420}, {"loss": 0.7312, "grad_norm": 0.46294572949409485, "learning_rate": 0.0002, "epoch": 1.4299548095545513, "step": 4430}, {"loss": 0.7255, "grad_norm": 0.5452619194984436, "learning_rate": 0.0002, "epoch": 1.433182698515171, "step": 4440}, {"loss": 0.7974, "grad_norm": 0.7809233069419861, "learning_rate": 0.0002, "epoch": 1.4364105874757909, "step": 4450}, {"loss": 0.7103, "grad_norm": 0.550088107585907, "learning_rate": 0.0002, "epoch": 1.4396384764364105, "step": 4460}, {"loss": 0.7088, "grad_norm": 0.7139151096343994, "learning_rate": 0.0002, "epoch": 1.4428663653970304, "step": 4470}, {"loss": 0.7358, "grad_norm": 0.6187090873718262, "learning_rate": 0.0002, "epoch": 1.44609425435765, "step": 4480}, {"loss": 0.7608, "grad_norm": 0.5948249101638794, "learning_rate": 0.0002, "epoch": 1.44932214331827, "step": 4490}, {"loss": 0.7582, "grad_norm": 0.6510892510414124, "learning_rate": 0.0002, "epoch": 1.4525500322788896, "step": 4500}, {"loss": 0.7105, "grad_norm": 0.6552293300628662, "learning_rate": 0.0002, "epoch": 1.4557779212395094, "step": 4510}, {"loss": 0.7965, "grad_norm": 0.585574209690094, "learning_rate": 0.0002, "epoch": 1.459005810200129, "step": 4520}, {"loss": 0.761, "grad_norm": 0.4830162823200226, "learning_rate": 0.0002, "epoch": 1.4622336991607487, "step": 4530}, {"loss": 0.7424, "grad_norm": 0.5780223608016968, "learning_rate": 0.0002, "epoch": 1.4654615881213686, "step": 4540}, {"loss": 0.7518, "grad_norm": 0.5462607145309448, "learning_rate": 0.0002, "epoch": 1.4686894770819885, "step": 4550}, {"loss": 0.7342, "grad_norm": 0.5183546543121338, "learning_rate": 0.0002, "epoch": 1.4719173660426081, "step": 4560}, {"loss": 0.71, "grad_norm": 0.676917552947998, "learning_rate": 0.0002, "epoch": 1.4751452550032278, "step": 4570}, {"loss": 0.7875, "grad_norm": 0.5772345066070557, "learning_rate": 0.0002, "epoch": 1.4783731439638477, "step": 4580}, {"loss": 0.7709, "grad_norm": 0.7320035696029663, "learning_rate": 0.0002, "epoch": 1.4816010329244673, "step": 4590}, {"loss": 0.7601, "grad_norm": 0.5024042129516602, "learning_rate": 0.0002, "epoch": 1.4848289218850872, "step": 4600}, {"loss": 0.8061, "grad_norm": 0.5482868552207947, "learning_rate": 0.0002, "epoch": 1.4880568108457068, "step": 4610}, {"loss": 0.714, "grad_norm": 0.5447399616241455, "learning_rate": 0.0002, "epoch": 1.4912846998063267, "step": 4620}, {"loss": 0.7959, "grad_norm": 0.5953414440155029, "learning_rate": 0.0002, "epoch": 1.4945125887669464, "step": 4630}, {"loss": 0.7463, "grad_norm": 0.6983066201210022, "learning_rate": 0.0002, "epoch": 1.4977404777275662, "step": 4640}, {"loss": 0.7877, "grad_norm": 0.586327075958252, "learning_rate": 0.0002, "epoch": 1.500968366688186, "step": 4650}, {"loss": 0.7169, "grad_norm": 0.5839682221412659, "learning_rate": 0.0002, "epoch": 1.5041962556488055, "step": 4660}, {"loss": 0.7524, "grad_norm": 0.5959209203720093, "learning_rate": 0.0002, "epoch": 1.5074241446094254, "step": 4670}, {"loss": 0.7615, "grad_norm": 0.5073857307434082, "learning_rate": 0.0002, "epoch": 1.5106520335700453, "step": 4680}, {"loss": 0.7258, "grad_norm": 0.5183001160621643, "learning_rate": 0.0002, "epoch": 1.513879922530665, "step": 4690}, {"loss": 0.784, "grad_norm": 0.593530535697937, "learning_rate": 0.0002, "epoch": 1.5171078114912846, "step": 4700}, {"loss": 0.7722, "grad_norm": 0.675993025302887, "learning_rate": 0.0002, "epoch": 1.5203357004519045, "step": 4710}, {"loss": 0.7485, "grad_norm": 0.5823286771774292, "learning_rate": 0.0002, "epoch": 1.5235635894125243, "step": 4720}, {"loss": 0.7474, "grad_norm": 0.5825035572052002, "learning_rate": 0.0002, "epoch": 1.526791478373144, "step": 4730}, {"loss": 0.8287, "grad_norm": 0.5689691305160522, "learning_rate": 0.0002, "epoch": 1.5300193673337636, "step": 4740}, {"loss": 0.7279, "grad_norm": 0.6037150621414185, "learning_rate": 0.0002, "epoch": 1.5332472562943835, "step": 4750}, {"loss": 0.7865, "grad_norm": 0.6393677592277527, "learning_rate": 0.0002, "epoch": 1.5364751452550034, "step": 4760}, {"loss": 0.805, "grad_norm": 0.5926381945610046, "learning_rate": 0.0002, "epoch": 1.539703034215623, "step": 4770}, {"loss": 0.7425, "grad_norm": 0.9468599557876587, "learning_rate": 0.0002, "epoch": 1.5429309231762427, "step": 4780}, {"loss": 0.7565, "grad_norm": 0.7544237375259399, "learning_rate": 0.0002, "epoch": 1.5461588121368623, "step": 4790}, {"loss": 0.7398, "grad_norm": 0.5308566093444824, "learning_rate": 0.0002, "epoch": 1.5493867010974822, "step": 4800}, {"loss": 0.7756, "grad_norm": 0.6590296030044556, "learning_rate": 0.0002, "epoch": 1.552614590058102, "step": 4810}, {"loss": 0.7212, "grad_norm": 0.5630404353141785, "learning_rate": 0.0002, "epoch": 1.5558424790187217, "step": 4820}, {"loss": 0.7593, "grad_norm": 0.6800200939178467, "learning_rate": 0.0002, "epoch": 1.5590703679793414, "step": 4830}, {"loss": 0.7373, "grad_norm": 0.5463718175888062, "learning_rate": 0.0002, "epoch": 1.5622982569399613, "step": 4840}, {"loss": 0.7519, "grad_norm": 0.505135178565979, "learning_rate": 0.0002, "epoch": 1.5655261459005811, "step": 4850}, {"loss": 0.8122, "grad_norm": 0.5469676852226257, "learning_rate": 0.0002, "epoch": 1.5687540348612008, "step": 4860}, {"loss": 0.7185, "grad_norm": 0.5318337678909302, "learning_rate": 0.0002, "epoch": 1.5719819238218204, "step": 4870}, {"loss": 0.7324, "grad_norm": 0.7287914752960205, "learning_rate": 0.0002, "epoch": 1.5752098127824403, "step": 4880}, {"loss": 0.7532, "grad_norm": 0.7318989038467407, "learning_rate": 0.0002, "epoch": 1.5784377017430602, "step": 4890}, {"loss": 0.7851, "grad_norm": 0.6499921679496765, "learning_rate": 0.0002, "epoch": 1.5816655907036798, "step": 4900}, {"loss": 0.753, "grad_norm": 0.47907355427742004, "learning_rate": 0.0002, "epoch": 1.5848934796642995, "step": 4910}, {"loss": 0.7699, "grad_norm": 0.7338833808898926, "learning_rate": 0.0002, "epoch": 1.5881213686249191, "step": 4920}, {"loss": 0.7592, "grad_norm": 0.5800719261169434, "learning_rate": 0.0002, "epoch": 1.591349257585539, "step": 4930}, {"loss": 0.7211, "grad_norm": 0.5365763306617737, "learning_rate": 0.0002, "epoch": 1.594577146546159, "step": 4940}, {"loss": 0.777, "grad_norm": 0.5800772309303284, "learning_rate": 0.0002, "epoch": 1.5978050355067785, "step": 4950}, {"loss": 0.8027, "grad_norm": 0.7878010869026184, "learning_rate": 0.0002, "epoch": 1.6010329244673982, "step": 4960}, {"loss": 0.7894, "grad_norm": 0.5919058918952942, "learning_rate": 0.0002, "epoch": 1.604260813428018, "step": 4970}, {"loss": 0.7762, "grad_norm": 0.5004435181617737, "learning_rate": 0.0002, "epoch": 1.607488702388638, "step": 4980}, {"loss": 0.7447, "grad_norm": 0.6299242377281189, "learning_rate": 0.0002, "epoch": 1.6107165913492576, "step": 4990}, {"loss": 0.7149, "grad_norm": 0.6307242512702942, "learning_rate": 0.0002, "epoch": 1.6139444803098772, "step": 5000}, {"loss": 0.7693, "grad_norm": 0.7838703989982605, "learning_rate": 0.0002, "epoch": 1.6171723692704971, "step": 5010}, {"loss": 0.7364, "grad_norm": 0.6454671621322632, "learning_rate": 0.0002, "epoch": 1.620400258231117, "step": 5020}, {"loss": 0.74, "grad_norm": 0.5907095670700073, "learning_rate": 0.0002, "epoch": 1.6236281471917366, "step": 5030}, {"loss": 0.7331, "grad_norm": 0.6053501963615417, "learning_rate": 0.0002, "epoch": 1.6268560361523563, "step": 5040}, {"loss": 0.6987, "grad_norm": 0.5644670128822327, "learning_rate": 0.0002, "epoch": 1.630083925112976, "step": 5050}, {"loss": 0.7886, "grad_norm": 0.6320949792861938, "learning_rate": 0.0002, "epoch": 1.6333118140735958, "step": 5060}, {"loss": 0.7109, "grad_norm": 0.6101489067077637, "learning_rate": 0.0002, "epoch": 1.6365397030342157, "step": 5070}, {"loss": 0.6922, "grad_norm": 0.9435283541679382, "learning_rate": 0.0002, "epoch": 1.6397675919948353, "step": 5080}, {"loss": 0.729, "grad_norm": 0.6668919324874878, "learning_rate": 0.0002, "epoch": 1.642995480955455, "step": 5090}, {"loss": 0.7402, "grad_norm": 0.6160340905189514, "learning_rate": 0.0002, "epoch": 1.6462233699160749, "step": 5100}, {"loss": 0.7461, "grad_norm": 0.5999835729598999, "learning_rate": 0.0002, "epoch": 1.6494512588766947, "step": 5110}, {"loss": 0.7661, "grad_norm": 0.9378551840782166, "learning_rate": 0.0002, "epoch": 1.6526791478373144, "step": 5120}, {"loss": 0.7586, "grad_norm": 0.4795055389404297, "learning_rate": 0.0002, "epoch": 1.655907036797934, "step": 5130}, {"loss": 0.7342, "grad_norm": 0.4878861606121063, "learning_rate": 0.0002, "epoch": 1.659134925758554, "step": 5140}, {"loss": 0.7362, "grad_norm": 0.6042965054512024, "learning_rate": 0.0002, "epoch": 1.6623628147191738, "step": 5150}, {"loss": 0.7863, "grad_norm": 0.5829901695251465, "learning_rate": 0.0002, "epoch": 1.6655907036797934, "step": 5160}, {"loss": 0.7498, "grad_norm": 0.5168480277061462, "learning_rate": 0.0002, "epoch": 1.668818592640413, "step": 5170}, {"loss": 0.7333, "grad_norm": 0.6489511132240295, "learning_rate": 0.0002, "epoch": 1.672046481601033, "step": 5180}, {"loss": 0.7257, "grad_norm": 0.5955966114997864, "learning_rate": 0.0002, "epoch": 1.6752743705616526, "step": 5190}, {"loss": 0.7938, "grad_norm": 0.6228088140487671, "learning_rate": 0.0002, "epoch": 1.6785022595222725, "step": 5200}, {"loss": 0.7626, "grad_norm": 0.5726390480995178, "learning_rate": 0.0002, "epoch": 1.6817301484828922, "step": 5210}, {"loss": 0.7479, "grad_norm": 0.6116343140602112, "learning_rate": 0.0002, "epoch": 1.6849580374435118, "step": 5220}, {"loss": 0.7169, "grad_norm": 0.5483687520027161, "learning_rate": 0.0002, "epoch": 1.6881859264041317, "step": 5230}, {"loss": 0.7293, "grad_norm": 0.570941686630249, "learning_rate": 0.0002, "epoch": 1.6914138153647515, "step": 5240}, {"loss": 0.723, "grad_norm": 0.6048086285591125, "learning_rate": 0.0002, "epoch": 1.6946417043253712, "step": 5250}, {"loss": 0.7861, "grad_norm": 0.6769003868103027, "learning_rate": 0.0002, "epoch": 1.6978695932859909, "step": 5260}, {"loss": 0.7885, "grad_norm": 0.5629057884216309, "learning_rate": 0.0002, "epoch": 1.7010974822466107, "step": 5270}, {"loss": 0.7693, "grad_norm": 0.657341480255127, "learning_rate": 0.0002, "epoch": 1.7043253712072306, "step": 5280}, {"loss": 0.7357, "grad_norm": 0.6256147623062134, "learning_rate": 0.0002, "epoch": 1.7075532601678503, "step": 5290}, {"loss": 0.714, "grad_norm": 0.5498088002204895, "learning_rate": 0.0002, "epoch": 1.71078114912847, "step": 5300}, {"loss": 0.7669, "grad_norm": 0.5078358054161072, "learning_rate": 0.0002, "epoch": 1.7140090380890898, "step": 5310}, {"loss": 0.7872, "grad_norm": 0.6696692705154419, "learning_rate": 0.0002, "epoch": 1.7172369270497096, "step": 5320}, {"loss": 0.8205, "grad_norm": 0.6692847013473511, "learning_rate": 0.0002, "epoch": 1.7204648160103293, "step": 5330}, {"loss": 0.7432, "grad_norm": 0.5415751934051514, "learning_rate": 0.0002, "epoch": 1.723692704970949, "step": 5340}, {"loss": 0.7499, "grad_norm": 0.5367611050605774, "learning_rate": 0.0002, "epoch": 1.7269205939315686, "step": 5350}, {"loss": 0.7631, "grad_norm": 0.7321061491966248, "learning_rate": 0.0002, "epoch": 1.7301484828921885, "step": 5360}, {"loss": 0.7827, "grad_norm": 0.723972499370575, "learning_rate": 0.0002, "epoch": 1.7333763718528084, "step": 5370}, {"loss": 0.7077, "grad_norm": 0.7328100204467773, "learning_rate": 0.0002, "epoch": 1.736604260813428, "step": 5380}, {"loss": 0.7503, "grad_norm": 0.5785264372825623, "learning_rate": 0.0002, "epoch": 1.7398321497740477, "step": 5390}, {"loss": 0.7188, "grad_norm": 0.7812932133674622, "learning_rate": 0.0002, "epoch": 1.7430600387346675, "step": 5400}, {"loss": 0.7386, "grad_norm": 0.6493327617645264, "learning_rate": 0.0002, "epoch": 1.7462879276952874, "step": 5410}, {"loss": 0.7487, "grad_norm": 0.5825939774513245, "learning_rate": 0.0002, "epoch": 1.749515816655907, "step": 5420}, {"loss": 0.7625, "grad_norm": 0.6969610452651978, "learning_rate": 0.0002, "epoch": 1.7527437056165267, "step": 5430}, {"loss": 0.7512, "grad_norm": 0.5558062195777893, "learning_rate": 0.0002, "epoch": 1.7559715945771466, "step": 5440}, {"loss": 0.7256, "grad_norm": 0.49222221970558167, "learning_rate": 0.0002, "epoch": 1.7591994835377665, "step": 5450}, {"loss": 0.7477, "grad_norm": 0.5844656825065613, "learning_rate": 0.0002, "epoch": 1.762427372498386, "step": 5460}, {"loss": 0.7695, "grad_norm": 0.8706597685813904, "learning_rate": 0.0002, "epoch": 1.7656552614590058, "step": 5470}, {"loss": 0.7582, "grad_norm": 0.6167706251144409, "learning_rate": 0.0002, "epoch": 1.7688831504196254, "step": 5480}, {"loss": 0.7521, "grad_norm": 0.5890011787414551, "learning_rate": 0.0002, "epoch": 1.7721110393802453, "step": 5490}, {"loss": 0.8319, "grad_norm": 0.6551728248596191, "learning_rate": 0.0002, "epoch": 1.7753389283408652, "step": 5500}, {"loss": 0.7615, "grad_norm": 0.5848751068115234, "learning_rate": 0.0002, "epoch": 1.7785668173014848, "step": 5510}, {"loss": 0.7622, "grad_norm": 0.6664014458656311, "learning_rate": 0.0002, "epoch": 1.7817947062621045, "step": 5520}, {"loss": 0.7544, "grad_norm": 0.5931693911552429, "learning_rate": 0.0002, "epoch": 1.7850225952227243, "step": 5530}, {"loss": 0.7992, "grad_norm": 0.5534724593162537, "learning_rate": 0.0002, "epoch": 1.7882504841833442, "step": 5540}, {"loss": 0.7967, "grad_norm": 0.5590878129005432, "learning_rate": 0.0002, "epoch": 1.7914783731439639, "step": 5550}, {"loss": 0.7406, "grad_norm": 0.6947470903396606, "learning_rate": 0.0002, "epoch": 1.7947062621045835, "step": 5560}, {"loss": 0.7614, "grad_norm": 0.6104130148887634, "learning_rate": 0.0002, "epoch": 1.7979341510652034, "step": 5570}, {"loss": 0.8032, "grad_norm": 0.6135714054107666, "learning_rate": 0.0002, "epoch": 1.8011620400258233, "step": 5580}, {"loss": 0.7403, "grad_norm": 0.6626853346824646, "learning_rate": 0.0002, "epoch": 1.804389928986443, "step": 5590}, {"loss": 0.7746, "grad_norm": 0.6977612972259521, "learning_rate": 0.0002, "epoch": 1.8076178179470626, "step": 5600}, {"loss": 0.7899, "grad_norm": 0.6275238394737244, "learning_rate": 0.0002, "epoch": 1.8108457069076824, "step": 5610}, {"loss": 0.7392, "grad_norm": 0.5017505288124084, "learning_rate": 0.0002, "epoch": 1.814073595868302, "step": 5620}, {"loss": 0.7669, "grad_norm": 0.8314290642738342, "learning_rate": 0.0002, "epoch": 1.817301484828922, "step": 5630}, {"loss": 0.7031, "grad_norm": 0.6863582134246826, "learning_rate": 0.0002, "epoch": 1.8205293737895416, "step": 5640}, {"loss": 0.743, "grad_norm": 0.69544917345047, "learning_rate": 0.0002, "epoch": 1.8237572627501613, "step": 5650}, {"loss": 0.7277, "grad_norm": 0.515499472618103, "learning_rate": 0.0002, "epoch": 1.8269851517107811, "step": 5660}, {"loss": 0.7166, "grad_norm": 0.6100873947143555, "learning_rate": 0.0002, "epoch": 1.830213040671401, "step": 5670}, {"loss": 0.7217, "grad_norm": 0.67416912317276, "learning_rate": 0.0002, "epoch": 1.8334409296320207, "step": 5680}, {"loss": 0.7575, "grad_norm": 0.7057772278785706, "learning_rate": 0.0002, "epoch": 1.8366688185926403, "step": 5690}, {"loss": 0.7483, "grad_norm": 0.7374551892280579, "learning_rate": 0.0002, "epoch": 1.8398967075532602, "step": 5700}, {"loss": 0.81, "grad_norm": 0.6266297101974487, "learning_rate": 0.0002, "epoch": 1.84312459651388, "step": 5710}, {"loss": 0.728, "grad_norm": 0.5629227757453918, "learning_rate": 0.0002, "epoch": 1.8463524854744997, "step": 5720}, {"loss": 0.8043, "grad_norm": 0.6603655815124512, "learning_rate": 0.0002, "epoch": 1.8495803744351194, "step": 5730}, {"loss": 0.7587, "grad_norm": 0.8113715052604675, "learning_rate": 0.0002, "epoch": 1.8528082633957392, "step": 5740}, {"loss": 0.7486, "grad_norm": 0.7143914103507996, "learning_rate": 0.0002, "epoch": 1.856036152356359, "step": 5750}, {"loss": 0.7619, "grad_norm": 0.6273732781410217, "learning_rate": 0.0002, "epoch": 1.8592640413169788, "step": 5760}, {"loss": 0.7962, "grad_norm": 0.5428690910339355, "learning_rate": 0.0002, "epoch": 1.8624919302775984, "step": 5770}, {"loss": 0.7581, "grad_norm": 0.6405037641525269, "learning_rate": 0.0002, "epoch": 1.865719819238218, "step": 5780}, {"loss": 0.7569, "grad_norm": 0.700873613357544, "learning_rate": 0.0002, "epoch": 1.868947708198838, "step": 5790}, {"loss": 0.7353, "grad_norm": 0.5645238161087036, "learning_rate": 0.0002, "epoch": 1.8721755971594578, "step": 5800}, {"loss": 0.8037, "grad_norm": 0.8780353665351868, "learning_rate": 0.0002, "epoch": 1.8754034861200775, "step": 5810}, {"loss": 0.7686, "grad_norm": 0.6295409798622131, "learning_rate": 0.0002, "epoch": 1.878631375080697, "step": 5820}, {"loss": 0.8067, "grad_norm": 0.678269624710083, "learning_rate": 0.0002, "epoch": 1.881859264041317, "step": 5830}, {"loss": 0.7537, "grad_norm": 0.6464608907699585, "learning_rate": 0.0002, "epoch": 1.8850871530019369, "step": 5840}, {"loss": 0.7423, "grad_norm": 0.6201048493385315, "learning_rate": 0.0002, "epoch": 1.8883150419625565, "step": 5850}, {"loss": 0.7694, "grad_norm": 0.6046274304389954, "learning_rate": 0.0002, "epoch": 1.8915429309231762, "step": 5860}, {"loss": 0.781, "grad_norm": 0.7532408833503723, "learning_rate": 0.0002, "epoch": 1.894770819883796, "step": 5870}, {"loss": 0.6885, "grad_norm": 0.6066767573356628, "learning_rate": 0.0002, "epoch": 1.897998708844416, "step": 5880}, {"loss": 0.7631, "grad_norm": 0.6289830207824707, "learning_rate": 0.0002, "epoch": 1.9012265978050356, "step": 5890}, {"loss": 0.7501, "grad_norm": 0.5204319953918457, "learning_rate": 0.0002, "epoch": 1.9044544867656552, "step": 5900}, {"loss": 0.7335, "grad_norm": 0.6708219647407532, "learning_rate": 0.0002, "epoch": 1.9076823757262749, "step": 5910}, {"loss": 0.7455, "grad_norm": 0.4915677309036255, "learning_rate": 0.0002, "epoch": 1.9109102646868947, "step": 5920}, {"loss": 0.7464, "grad_norm": 0.652717113494873, "learning_rate": 0.0002, "epoch": 1.9141381536475146, "step": 5930}, {"loss": 0.7687, "grad_norm": 0.5446316003799438, "learning_rate": 0.0002, "epoch": 1.9173660426081343, "step": 5940}, {"loss": 0.7424, "grad_norm": 0.4958149194717407, "learning_rate": 0.0002, "epoch": 1.920593931568754, "step": 5950}, {"loss": 0.757, "grad_norm": 0.5623434782028198, "learning_rate": 0.0002, "epoch": 1.9238218205293738, "step": 5960}, {"loss": 0.7446, "grad_norm": 0.6855450868606567, "learning_rate": 0.0002, "epoch": 1.9270497094899937, "step": 5970}, {"loss": 0.827, "grad_norm": 0.5710492730140686, "learning_rate": 0.0002, "epoch": 1.9302775984506133, "step": 5980}, {"loss": 0.7245, "grad_norm": 0.5379431843757629, "learning_rate": 0.0002, "epoch": 1.933505487411233, "step": 5990}, {"loss": 0.77, "grad_norm": 0.557129442691803, "learning_rate": 0.0002, "epoch": 1.9367333763718528, "step": 6000}, {"loss": 0.6988, "grad_norm": 0.6336663961410522, "learning_rate": 0.0002, "epoch": 1.9399612653324727, "step": 6010}, {"loss": 0.7316, "grad_norm": 0.5950582027435303, "learning_rate": 0.0002, "epoch": 1.9431891542930924, "step": 6020}, {"loss": 0.7443, "grad_norm": 0.5905954837799072, "learning_rate": 0.0002, "epoch": 1.946417043253712, "step": 6030}, {"loss": 0.7127, "grad_norm": 0.6688982844352722, "learning_rate": 0.0002, "epoch": 1.9496449322143317, "step": 6040}, {"loss": 0.79, "grad_norm": 0.5440775752067566, "learning_rate": 0.0002, "epoch": 1.9528728211749515, "step": 6050}, {"loss": 0.7221, "grad_norm": 0.6207906603813171, "learning_rate": 0.0002, "epoch": 1.9561007101355714, "step": 6060}, {"loss": 0.738, "grad_norm": 0.6999374628067017, "learning_rate": 0.0002, "epoch": 1.959328599096191, "step": 6070}, {"loss": 0.7372, "grad_norm": 0.6310848593711853, "learning_rate": 0.0002, "epoch": 1.9625564880568107, "step": 6080}, {"loss": 0.7198, "grad_norm": 0.5903388261795044, "learning_rate": 0.0002, "epoch": 1.9657843770174306, "step": 6090}, {"loss": 0.7103, "grad_norm": 0.6333889961242676, "learning_rate": 0.0002, "epoch": 1.9690122659780505, "step": 6100}, {"loss": 0.7246, "grad_norm": 0.5604711174964905, "learning_rate": 0.0002, "epoch": 1.97224015493867, "step": 6110}, {"loss": 0.761, "grad_norm": 0.9234541654586792, "learning_rate": 0.0002, "epoch": 1.9754680438992898, "step": 6120}, {"loss": 0.7375, "grad_norm": 0.6149102449417114, "learning_rate": 0.0002, "epoch": 1.9786959328599096, "step": 6130}, {"loss": 0.7286, "grad_norm": 0.615446150302887, "learning_rate": 0.0002, "epoch": 1.9819238218205295, "step": 6140}, {"loss": 0.7333, "grad_norm": 0.5176635980606079, "learning_rate": 0.0002, "epoch": 1.9851517107811492, "step": 6150}, {"loss": 0.718, "grad_norm": 0.7124109864234924, "learning_rate": 0.0002, "epoch": 1.9883795997417688, "step": 6160}, {"loss": 0.7669, "grad_norm": 0.6317567825317383, "learning_rate": 0.0002, "epoch": 1.9916074887023887, "step": 6170}, {"loss": 0.8012, "grad_norm": 0.6855016350746155, "learning_rate": 0.0002, "epoch": 1.9948353776630086, "step": 6180}, {"loss": 0.7376, "grad_norm": 0.6423715353012085, "learning_rate": 0.0002, "epoch": 1.9980632666236282, "step": 6190}]} +{"epoch": 3.0, "step": 9294, "epoch_duration": 11254.594346523285, "total_accumulated_duration": 32090.091183662415, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.593, "grad_norm": 0.7092075347900391, "learning_rate": 0.0002, "epoch": 0.0032278889606197547, "step": 10}, {"loss": 1.0956, "grad_norm": 0.6900479793548584, "learning_rate": 0.0002, "epoch": 0.006455777921239509, "step": 20}, {"loss": 0.9807, "grad_norm": 0.6788288950920105, "learning_rate": 0.0002, "epoch": 0.009683666881859263, "step": 30}, {"loss": 0.9385, "grad_norm": 0.5590243339538574, "learning_rate": 0.0002, "epoch": 0.012911555842479019, "step": 40}, {"loss": 0.931, "grad_norm": 0.5136010646820068, "learning_rate": 0.0002, "epoch": 0.016139444803098774, "step": 50}, {"loss": 0.8896, "grad_norm": 0.45298320055007935, "learning_rate": 0.0002, "epoch": 0.019367333763718526, "step": 60}, {"loss": 0.9184, "grad_norm": 0.5917162299156189, "learning_rate": 0.0002, "epoch": 0.022595222724338282, "step": 70}, {"loss": 0.8705, "grad_norm": 0.4414856433868408, "learning_rate": 0.0002, "epoch": 0.025823111684958037, "step": 80}, {"loss": 0.8419, "grad_norm": 0.5547978281974792, "learning_rate": 0.0002, "epoch": 0.029051000645577793, "step": 90}, {"loss": 0.8987, "grad_norm": 0.5271288156509399, "learning_rate": 0.0002, "epoch": 0.03227888960619755, "step": 100}, {"loss": 0.8543, "grad_norm": 0.5506119728088379, "learning_rate": 0.0002, "epoch": 0.035506778566817304, "step": 110}, {"loss": 0.8373, "grad_norm": 0.5579327940940857, "learning_rate": 0.0002, "epoch": 0.03873466752743705, "step": 120}, {"loss": 0.8826, "grad_norm": 0.5099632740020752, "learning_rate": 0.0002, "epoch": 0.04196255648805681, "step": 130}, {"loss": 0.9239, "grad_norm": 0.40396833419799805, "learning_rate": 0.0002, "epoch": 0.045190445448676564, "step": 140}, {"loss": 0.846, "grad_norm": 0.5008092522621155, "learning_rate": 0.0002, "epoch": 0.04841833440929632, "step": 150}, {"loss": 0.8564, "grad_norm": 0.4388776421546936, "learning_rate": 0.0002, "epoch": 0.051646223369916075, "step": 160}, {"loss": 0.8829, "grad_norm": 0.44138944149017334, "learning_rate": 0.0002, "epoch": 0.05487411233053583, "step": 170}, {"loss": 0.8061, "grad_norm": 0.358484148979187, "learning_rate": 0.0002, "epoch": 0.058102001291155586, "step": 180}, {"loss": 0.8956, "grad_norm": 0.457052081823349, "learning_rate": 0.0002, "epoch": 0.06132989025177534, "step": 190}, {"loss": 0.9138, "grad_norm": 0.5537622570991516, "learning_rate": 0.0002, "epoch": 0.0645577792123951, "step": 200}, {"loss": 0.8701, "grad_norm": 0.552631676197052, "learning_rate": 0.0002, "epoch": 0.06778566817301485, "step": 210}, {"loss": 0.8854, "grad_norm": 0.4414575397968292, "learning_rate": 0.0002, "epoch": 0.07101355713363461, "step": 220}, {"loss": 0.8581, "grad_norm": 0.4996664226055145, "learning_rate": 0.0002, "epoch": 0.07424144609425436, "step": 230}, {"loss": 0.8675, "grad_norm": 0.7321897149085999, "learning_rate": 0.0002, "epoch": 0.0774693350548741, "step": 240}, {"loss": 0.8848, "grad_norm": 0.4553901255130768, "learning_rate": 0.0002, "epoch": 0.08069722401549387, "step": 250}, {"loss": 0.868, "grad_norm": 0.5039054751396179, "learning_rate": 0.0002, "epoch": 0.08392511297611362, "step": 260}, {"loss": 0.8317, "grad_norm": 0.4113094210624695, "learning_rate": 0.0002, "epoch": 0.08715300193673338, "step": 270}, {"loss": 0.8074, "grad_norm": 0.450436532497406, "learning_rate": 0.0002, "epoch": 0.09038089089735313, "step": 280}, {"loss": 0.8105, "grad_norm": 0.4548024535179138, "learning_rate": 0.0002, "epoch": 0.09360877985797289, "step": 290}, {"loss": 0.8325, "grad_norm": 0.4932962656021118, "learning_rate": 0.0002, "epoch": 0.09683666881859264, "step": 300}, {"loss": 0.8105, "grad_norm": 0.4005250334739685, "learning_rate": 0.0002, "epoch": 0.1000645577792124, "step": 310}, {"loss": 0.8083, "grad_norm": 1.8321624994277954, "learning_rate": 0.0002, "epoch": 0.10329244673983215, "step": 320}, {"loss": 0.8411, "grad_norm": 0.45815610885620117, "learning_rate": 0.0002, "epoch": 0.1065203357004519, "step": 330}, {"loss": 0.857, "grad_norm": 0.39324095845222473, "learning_rate": 0.0002, "epoch": 0.10974822466107166, "step": 340}, {"loss": 0.8258, "grad_norm": 0.546273946762085, "learning_rate": 0.0002, "epoch": 0.11297611362169141, "step": 350}, {"loss": 0.882, "grad_norm": 0.497448593378067, "learning_rate": 0.0002, "epoch": 0.11620400258231117, "step": 360}, {"loss": 0.7608, "grad_norm": 0.37508800625801086, "learning_rate": 0.0002, "epoch": 0.11943189154293092, "step": 370}, {"loss": 0.852, "grad_norm": 0.45849609375, "learning_rate": 0.0002, "epoch": 0.12265978050355068, "step": 380}, {"loss": 0.8437, "grad_norm": 0.5488408803939819, "learning_rate": 0.0002, "epoch": 0.12588766946417043, "step": 390}, {"loss": 0.8349, "grad_norm": 0.4477061331272125, "learning_rate": 0.0002, "epoch": 0.1291155584247902, "step": 400}, {"loss": 0.8306, "grad_norm": 0.39227980375289917, "learning_rate": 0.0002, "epoch": 0.13234344738540993, "step": 410}, {"loss": 0.7933, "grad_norm": 0.3922233581542969, "learning_rate": 0.0002, "epoch": 0.1355713363460297, "step": 420}, {"loss": 0.8134, "grad_norm": 0.42901909351348877, "learning_rate": 0.0002, "epoch": 0.13879922530664945, "step": 430}, {"loss": 0.8271, "grad_norm": 0.4217798709869385, "learning_rate": 0.0002, "epoch": 0.14202711426726922, "step": 440}, {"loss": 0.8594, "grad_norm": 0.43470677733421326, "learning_rate": 0.0002, "epoch": 0.14525500322788895, "step": 450}, {"loss": 0.8106, "grad_norm": 0.5324403047561646, "learning_rate": 0.0002, "epoch": 0.1484828921885087, "step": 460}, {"loss": 0.8729, "grad_norm": 0.3999756872653961, "learning_rate": 0.0002, "epoch": 0.15171078114912848, "step": 470}, {"loss": 0.7702, "grad_norm": 0.404933363199234, "learning_rate": 0.0002, "epoch": 0.1549386701097482, "step": 480}, {"loss": 0.8151, "grad_norm": 0.44122636318206787, "learning_rate": 0.0002, "epoch": 0.15816655907036797, "step": 490}, {"loss": 0.8457, "grad_norm": 0.510166347026825, "learning_rate": 0.0002, "epoch": 0.16139444803098774, "step": 500}, {"loss": 0.8692, "grad_norm": 0.4549732506275177, "learning_rate": 0.0002, "epoch": 0.1646223369916075, "step": 510}, {"loss": 0.8466, "grad_norm": 0.5148182511329651, "learning_rate": 0.0002, "epoch": 0.16785022595222723, "step": 520}, {"loss": 0.8317, "grad_norm": 0.3596806824207306, "learning_rate": 0.0002, "epoch": 0.171078114912847, "step": 530}, {"loss": 0.844, "grad_norm": 0.4388909339904785, "learning_rate": 0.0002, "epoch": 0.17430600387346676, "step": 540}, {"loss": 0.8322, "grad_norm": 0.5052742958068848, "learning_rate": 0.0002, "epoch": 0.17753389283408652, "step": 550}, {"loss": 0.791, "grad_norm": 0.48248958587646484, "learning_rate": 0.0002, "epoch": 0.18076178179470626, "step": 560}, {"loss": 0.8593, "grad_norm": 0.5360197424888611, "learning_rate": 0.0002, "epoch": 0.18398967075532602, "step": 570}, {"loss": 0.817, "grad_norm": 0.43999341130256653, "learning_rate": 0.0002, "epoch": 0.18721755971594578, "step": 580}, {"loss": 0.8311, "grad_norm": 0.3685208261013031, "learning_rate": 0.0002, "epoch": 0.19044544867656552, "step": 590}, {"loss": 0.8341, "grad_norm": 0.4601275622844696, "learning_rate": 0.0002, "epoch": 0.19367333763718528, "step": 600}, {"loss": 0.8483, "grad_norm": 0.4778369665145874, "learning_rate": 0.0002, "epoch": 0.19690122659780504, "step": 610}, {"loss": 0.8653, "grad_norm": 0.4867003560066223, "learning_rate": 0.0002, "epoch": 0.2001291155584248, "step": 620}, {"loss": 0.8554, "grad_norm": 0.4583742916584015, "learning_rate": 0.0002, "epoch": 0.20335700451904454, "step": 630}, {"loss": 0.8698, "grad_norm": 0.47958165407180786, "learning_rate": 0.0002, "epoch": 0.2065848934796643, "step": 640}, {"loss": 0.8213, "grad_norm": 0.4526064097881317, "learning_rate": 0.0002, "epoch": 0.20981278244028406, "step": 650}, {"loss": 0.8313, "grad_norm": 0.45890581607818604, "learning_rate": 0.0002, "epoch": 0.2130406714009038, "step": 660}, {"loss": 0.8143, "grad_norm": 0.42725905776023865, "learning_rate": 0.0002, "epoch": 0.21626856036152356, "step": 670}, {"loss": 0.8675, "grad_norm": 0.40380963683128357, "learning_rate": 0.0002, "epoch": 0.21949644932214332, "step": 680}, {"loss": 0.9004, "grad_norm": 0.4372998774051666, "learning_rate": 0.0002, "epoch": 0.22272433828276308, "step": 690}, {"loss": 0.8208, "grad_norm": 0.4245864450931549, "learning_rate": 0.0002, "epoch": 0.22595222724338282, "step": 700}, {"loss": 0.8564, "grad_norm": 0.4061129689216614, "learning_rate": 0.0002, "epoch": 0.22918011620400258, "step": 710}, {"loss": 0.8275, "grad_norm": 0.474454790353775, "learning_rate": 0.0002, "epoch": 0.23240800516462234, "step": 720}, {"loss": 0.8346, "grad_norm": 0.4908486008644104, "learning_rate": 0.0002, "epoch": 0.23563589412524208, "step": 730}, {"loss": 0.8755, "grad_norm": 0.4284191429615021, "learning_rate": 0.0002, "epoch": 0.23886378308586184, "step": 740}, {"loss": 0.8387, "grad_norm": 0.44730308651924133, "learning_rate": 0.0002, "epoch": 0.2420916720464816, "step": 750}, {"loss": 0.8135, "grad_norm": 0.4433246850967407, "learning_rate": 0.0002, "epoch": 0.24531956100710137, "step": 760}, {"loss": 0.8644, "grad_norm": 0.43668854236602783, "learning_rate": 0.0002, "epoch": 0.2485474499677211, "step": 770}, {"loss": 0.8025, "grad_norm": 0.34324130415916443, "learning_rate": 0.0002, "epoch": 0.25177533892834086, "step": 780}, {"loss": 0.8725, "grad_norm": 0.46476295590400696, "learning_rate": 0.0002, "epoch": 0.2550032278889606, "step": 790}, {"loss": 0.8157, "grad_norm": 0.5047039985656738, "learning_rate": 0.0002, "epoch": 0.2582311168495804, "step": 800}, {"loss": 0.8643, "grad_norm": 0.4402127265930176, "learning_rate": 0.0002, "epoch": 0.26145900581020015, "step": 810}, {"loss": 0.8025, "grad_norm": 0.4642465114593506, "learning_rate": 0.0002, "epoch": 0.26468689477081986, "step": 820}, {"loss": 0.8836, "grad_norm": 0.40093424916267395, "learning_rate": 0.0002, "epoch": 0.2679147837314396, "step": 830}, {"loss": 0.83, "grad_norm": 0.42501842975616455, "learning_rate": 0.0002, "epoch": 0.2711426726920594, "step": 840}, {"loss": 0.8573, "grad_norm": 0.43279722332954407, "learning_rate": 0.0002, "epoch": 0.27437056165267915, "step": 850}, {"loss": 0.817, "grad_norm": 0.5991243720054626, "learning_rate": 0.0002, "epoch": 0.2775984506132989, "step": 860}, {"loss": 0.7981, "grad_norm": 0.4217848777770996, "learning_rate": 0.0002, "epoch": 0.28082633957391867, "step": 870}, {"loss": 0.8135, "grad_norm": 0.3933536410331726, "learning_rate": 0.0002, "epoch": 0.28405422853453843, "step": 880}, {"loss": 0.8846, "grad_norm": 0.5868505239486694, "learning_rate": 0.0002, "epoch": 0.28728211749515814, "step": 890}, {"loss": 0.8759, "grad_norm": 0.5209547877311707, "learning_rate": 0.0002, "epoch": 0.2905100064557779, "step": 900}, {"loss": 0.815, "grad_norm": 0.49307361245155334, "learning_rate": 0.0002, "epoch": 0.29373789541639767, "step": 910}, {"loss": 0.7813, "grad_norm": 0.4288382828235626, "learning_rate": 0.0002, "epoch": 0.2969657843770174, "step": 920}, {"loss": 0.8431, "grad_norm": 0.33568474650382996, "learning_rate": 0.0002, "epoch": 0.3001936733376372, "step": 930}, {"loss": 0.8455, "grad_norm": 1.0915930271148682, "learning_rate": 0.0002, "epoch": 0.30342156229825695, "step": 940}, {"loss": 0.8535, "grad_norm": 0.5489798188209534, "learning_rate": 0.0002, "epoch": 0.3066494512588767, "step": 950}, {"loss": 0.8031, "grad_norm": 0.42971742153167725, "learning_rate": 0.0002, "epoch": 0.3098773402194964, "step": 960}, {"loss": 0.8253, "grad_norm": 0.43375834822654724, "learning_rate": 0.0002, "epoch": 0.3131052291801162, "step": 970}, {"loss": 0.7747, "grad_norm": 0.47488611936569214, "learning_rate": 0.0002, "epoch": 0.31633311814073595, "step": 980}, {"loss": 0.7906, "grad_norm": 0.46296775341033936, "learning_rate": 0.0002, "epoch": 0.3195610071013557, "step": 990}, {"loss": 0.7948, "grad_norm": 0.4548890292644501, "learning_rate": 0.0002, "epoch": 0.32278889606197547, "step": 1000}, {"loss": 0.8856, "grad_norm": 0.41834497451782227, "learning_rate": 0.0002, "epoch": 0.32601678502259523, "step": 1010}, {"loss": 0.7791, "grad_norm": 0.441092312335968, "learning_rate": 0.0002, "epoch": 0.329244673983215, "step": 1020}, {"loss": 0.8191, "grad_norm": 0.637322187423706, "learning_rate": 0.0002, "epoch": 0.33247256294383476, "step": 1030}, {"loss": 0.8685, "grad_norm": 0.4374958574771881, "learning_rate": 0.0002, "epoch": 0.33570045190445447, "step": 1040}, {"loss": 0.8423, "grad_norm": 0.3935825824737549, "learning_rate": 0.0002, "epoch": 0.33892834086507423, "step": 1050}, {"loss": 0.8287, "grad_norm": 0.43526220321655273, "learning_rate": 0.0002, "epoch": 0.342156229825694, "step": 1060}, {"loss": 0.8413, "grad_norm": 0.45327696204185486, "learning_rate": 0.0002, "epoch": 0.34538411878631375, "step": 1070}, {"loss": 0.7421, "grad_norm": 0.4126075506210327, "learning_rate": 0.0002, "epoch": 0.3486120077469335, "step": 1080}, {"loss": 0.8427, "grad_norm": 0.4714072048664093, "learning_rate": 0.0002, "epoch": 0.3518398967075533, "step": 1090}, {"loss": 0.8028, "grad_norm": 0.518127977848053, "learning_rate": 0.0002, "epoch": 0.35506778566817304, "step": 1100}, {"loss": 0.8479, "grad_norm": 0.43264099955558777, "learning_rate": 0.0002, "epoch": 0.35829567462879275, "step": 1110}, {"loss": 0.8724, "grad_norm": 0.4857400357723236, "learning_rate": 0.0002, "epoch": 0.3615235635894125, "step": 1120}, {"loss": 0.7735, "grad_norm": 0.37591469287872314, "learning_rate": 0.0002, "epoch": 0.3647514525500323, "step": 1130}, {"loss": 0.8531, "grad_norm": 0.4165478050708771, "learning_rate": 0.0002, "epoch": 0.36797934151065204, "step": 1140}, {"loss": 0.8151, "grad_norm": 0.42911383509635925, "learning_rate": 0.0002, "epoch": 0.3712072304712718, "step": 1150}, {"loss": 0.8722, "grad_norm": 0.44980287551879883, "learning_rate": 0.0002, "epoch": 0.37443511943189156, "step": 1160}, {"loss": 0.7961, "grad_norm": 0.4066573679447174, "learning_rate": 0.0002, "epoch": 0.3776630083925113, "step": 1170}, {"loss": 0.8317, "grad_norm": 0.5056195855140686, "learning_rate": 0.0002, "epoch": 0.38089089735313103, "step": 1180}, {"loss": 0.8387, "grad_norm": 0.4141536355018616, "learning_rate": 0.0002, "epoch": 0.3841187863137508, "step": 1190}, {"loss": 0.8019, "grad_norm": 0.4501924514770508, "learning_rate": 0.0002, "epoch": 0.38734667527437056, "step": 1200}, {"loss": 0.8528, "grad_norm": 0.43304240703582764, "learning_rate": 0.0002, "epoch": 0.3905745642349903, "step": 1210}, {"loss": 0.8905, "grad_norm": 0.475777804851532, "learning_rate": 0.0002, "epoch": 0.3938024531956101, "step": 1220}, {"loss": 0.8643, "grad_norm": 0.5846465826034546, "learning_rate": 0.0002, "epoch": 0.39703034215622984, "step": 1230}, {"loss": 0.8078, "grad_norm": 0.42899325489997864, "learning_rate": 0.0002, "epoch": 0.4002582311168496, "step": 1240}, {"loss": 0.8415, "grad_norm": 0.3980463147163391, "learning_rate": 0.0002, "epoch": 0.4034861200774693, "step": 1250}, {"loss": 0.8026, "grad_norm": 0.45769768953323364, "learning_rate": 0.0002, "epoch": 0.4067140090380891, "step": 1260}, {"loss": 0.8377, "grad_norm": 0.5101280212402344, "learning_rate": 0.0002, "epoch": 0.40994189799870884, "step": 1270}, {"loss": 0.7905, "grad_norm": 0.47374317049980164, "learning_rate": 0.0002, "epoch": 0.4131697869593286, "step": 1280}, {"loss": 0.8172, "grad_norm": 0.4261878728866577, "learning_rate": 0.0002, "epoch": 0.41639767591994836, "step": 1290}, {"loss": 0.9004, "grad_norm": 0.46954256296157837, "learning_rate": 0.0002, "epoch": 0.4196255648805681, "step": 1300}, {"loss": 0.7868, "grad_norm": 0.5205738544464111, "learning_rate": 0.0002, "epoch": 0.4228534538411879, "step": 1310}, {"loss": 0.8964, "grad_norm": 0.5176340937614441, "learning_rate": 0.0002, "epoch": 0.4260813428018076, "step": 1320}, {"loss": 0.8764, "grad_norm": 0.5155916810035706, "learning_rate": 0.0002, "epoch": 0.42930923176242736, "step": 1330}, {"loss": 0.8197, "grad_norm": 0.44548553228378296, "learning_rate": 0.0002, "epoch": 0.4325371207230471, "step": 1340}, {"loss": 0.7873, "grad_norm": 0.5633558630943298, "learning_rate": 0.0002, "epoch": 0.4357650096836669, "step": 1350}, {"loss": 0.7889, "grad_norm": 0.42444056272506714, "learning_rate": 0.0002, "epoch": 0.43899289864428664, "step": 1360}, {"loss": 0.8588, "grad_norm": 0.5226860642433167, "learning_rate": 0.0002, "epoch": 0.4422207876049064, "step": 1370}, {"loss": 0.8232, "grad_norm": 0.5354582071304321, "learning_rate": 0.0002, "epoch": 0.44544867656552617, "step": 1380}, {"loss": 0.816, "grad_norm": 0.472646564245224, "learning_rate": 0.0002, "epoch": 0.4486765655261459, "step": 1390}, {"loss": 0.7953, "grad_norm": 0.6312310099601746, "learning_rate": 0.0002, "epoch": 0.45190445448676564, "step": 1400}, {"loss": 0.8212, "grad_norm": 0.4298408031463623, "learning_rate": 0.0002, "epoch": 0.4551323434473854, "step": 1410}, {"loss": 0.8447, "grad_norm": 0.43427202105522156, "learning_rate": 0.0002, "epoch": 0.45836023240800516, "step": 1420}, {"loss": 0.8342, "grad_norm": 0.44097861647605896, "learning_rate": 0.0002, "epoch": 0.4615881213686249, "step": 1430}, {"loss": 0.8301, "grad_norm": 0.5142693519592285, "learning_rate": 0.0002, "epoch": 0.4648160103292447, "step": 1440}, {"loss": 0.8144, "grad_norm": 0.46416547894477844, "learning_rate": 0.0002, "epoch": 0.46804389928986445, "step": 1450}, {"loss": 0.8342, "grad_norm": 0.4858551025390625, "learning_rate": 0.0002, "epoch": 0.47127178825048416, "step": 1460}, {"loss": 0.8354, "grad_norm": 0.4709177315235138, "learning_rate": 0.0002, "epoch": 0.4744996772111039, "step": 1470}, {"loss": 0.8391, "grad_norm": 0.5500252842903137, "learning_rate": 0.0002, "epoch": 0.4777275661717237, "step": 1480}, {"loss": 0.8359, "grad_norm": 0.43364381790161133, "learning_rate": 0.0002, "epoch": 0.48095545513234345, "step": 1490}, {"loss": 0.8446, "grad_norm": 0.47712287306785583, "learning_rate": 0.0002, "epoch": 0.4841833440929632, "step": 1500}, {"loss": 0.8518, "grad_norm": 0.4518495202064514, "learning_rate": 0.0002, "epoch": 0.48741123305358297, "step": 1510}, {"loss": 0.819, "grad_norm": 0.4539008140563965, "learning_rate": 0.0002, "epoch": 0.49063912201420273, "step": 1520}, {"loss": 0.8276, "grad_norm": 0.4993067979812622, "learning_rate": 0.0002, "epoch": 0.49386701097482244, "step": 1530}, {"loss": 0.8297, "grad_norm": 0.6094803214073181, "learning_rate": 0.0002, "epoch": 0.4970948999354422, "step": 1540}, {"loss": 0.8263, "grad_norm": 0.48602527379989624, "learning_rate": 0.0002, "epoch": 0.500322788896062, "step": 1550}, {"loss": 0.8182, "grad_norm": 0.40245795249938965, "learning_rate": 0.0002, "epoch": 0.5035506778566817, "step": 1560}, {"loss": 0.7907, "grad_norm": 0.456787645816803, "learning_rate": 0.0002, "epoch": 0.5067785668173015, "step": 1570}, {"loss": 0.86, "grad_norm": 0.43936216831207275, "learning_rate": 0.0002, "epoch": 0.5100064557779213, "step": 1580}, {"loss": 0.7928, "grad_norm": 0.549018144607544, "learning_rate": 0.0002, "epoch": 0.513234344738541, "step": 1590}, {"loss": 0.8169, "grad_norm": 0.41746795177459717, "learning_rate": 0.0002, "epoch": 0.5164622336991608, "step": 1600}, {"loss": 0.7868, "grad_norm": 0.4217053949832916, "learning_rate": 0.0002, "epoch": 0.5196901226597805, "step": 1610}, {"loss": 0.8161, "grad_norm": 0.449913889169693, "learning_rate": 0.0002, "epoch": 0.5229180116204003, "step": 1620}, {"loss": 0.7938, "grad_norm": 0.5084872245788574, "learning_rate": 0.0002, "epoch": 0.5261459005810201, "step": 1630}, {"loss": 0.8295, "grad_norm": 0.46248653531074524, "learning_rate": 0.0002, "epoch": 0.5293737895416397, "step": 1640}, {"loss": 0.7993, "grad_norm": 0.4824236035346985, "learning_rate": 0.0002, "epoch": 0.5326016785022595, "step": 1650}, {"loss": 0.8711, "grad_norm": 0.6010985374450684, "learning_rate": 0.0002, "epoch": 0.5358295674628792, "step": 1660}, {"loss": 0.8266, "grad_norm": 0.4757920801639557, "learning_rate": 0.0002, "epoch": 0.539057456423499, "step": 1670}, {"loss": 0.8182, "grad_norm": 0.45161882042884827, "learning_rate": 0.0002, "epoch": 0.5422853453841188, "step": 1680}, {"loss": 0.8141, "grad_norm": 0.49314990639686584, "learning_rate": 0.0002, "epoch": 0.5455132343447385, "step": 1690}, {"loss": 0.8091, "grad_norm": 0.3918305039405823, "learning_rate": 0.0002, "epoch": 0.5487411233053583, "step": 1700}, {"loss": 0.8177, "grad_norm": 0.5966728925704956, "learning_rate": 0.0002, "epoch": 0.551969012265978, "step": 1710}, {"loss": 0.8438, "grad_norm": 0.4208986163139343, "learning_rate": 0.0002, "epoch": 0.5551969012265978, "step": 1720}, {"loss": 0.817, "grad_norm": 0.43724218010902405, "learning_rate": 0.0002, "epoch": 0.5584247901872176, "step": 1730}, {"loss": 0.7956, "grad_norm": 0.5287272930145264, "learning_rate": 0.0002, "epoch": 0.5616526791478373, "step": 1740}, {"loss": 0.8557, "grad_norm": 0.4961899518966675, "learning_rate": 0.0002, "epoch": 0.5648805681084571, "step": 1750}, {"loss": 0.8029, "grad_norm": 0.4468635320663452, "learning_rate": 0.0002, "epoch": 0.5681084570690769, "step": 1760}, {"loss": 0.7968, "grad_norm": 0.6423530578613281, "learning_rate": 0.0002, "epoch": 0.5713363460296966, "step": 1770}, {"loss": 0.8324, "grad_norm": 0.4601971507072449, "learning_rate": 0.0002, "epoch": 0.5745642349903163, "step": 1780}, {"loss": 0.8171, "grad_norm": 0.46514901518821716, "learning_rate": 0.0002, "epoch": 0.577792123950936, "step": 1790}, {"loss": 0.8186, "grad_norm": 0.4771687388420105, "learning_rate": 0.0002, "epoch": 0.5810200129115558, "step": 1800}, {"loss": 0.856, "grad_norm": 0.46514490246772766, "learning_rate": 0.0002, "epoch": 0.5842479018721756, "step": 1810}, {"loss": 0.84, "grad_norm": 0.5373936295509338, "learning_rate": 0.0002, "epoch": 0.5874757908327953, "step": 1820}, {"loss": 0.8456, "grad_norm": 0.5175791382789612, "learning_rate": 0.0002, "epoch": 0.5907036797934151, "step": 1830}, {"loss": 0.7957, "grad_norm": 0.4522802233695984, "learning_rate": 0.0002, "epoch": 0.5939315687540349, "step": 1840}, {"loss": 0.8633, "grad_norm": 0.42987772822380066, "learning_rate": 0.0002, "epoch": 0.5971594577146546, "step": 1850}, {"loss": 0.7871, "grad_norm": 0.5566838383674622, "learning_rate": 0.0002, "epoch": 0.6003873466752744, "step": 1860}, {"loss": 0.8312, "grad_norm": 0.42807698249816895, "learning_rate": 0.0002, "epoch": 0.6036152356358941, "step": 1870}, {"loss": 0.8035, "grad_norm": 0.4957767724990845, "learning_rate": 0.0002, "epoch": 0.6068431245965139, "step": 1880}, {"loss": 0.8145, "grad_norm": 0.4260980188846588, "learning_rate": 0.0002, "epoch": 0.6100710135571337, "step": 1890}, {"loss": 0.8363, "grad_norm": 0.4777357876300812, "learning_rate": 0.0002, "epoch": 0.6132989025177534, "step": 1900}, {"loss": 0.8404, "grad_norm": 0.4434216022491455, "learning_rate": 0.0002, "epoch": 0.6165267914783732, "step": 1910}, {"loss": 0.8057, "grad_norm": 0.5215433835983276, "learning_rate": 0.0002, "epoch": 0.6197546804389928, "step": 1920}, {"loss": 0.82, "grad_norm": 0.5143248438835144, "learning_rate": 0.0002, "epoch": 0.6229825693996126, "step": 1930}, {"loss": 0.8107, "grad_norm": 0.5213413238525391, "learning_rate": 0.0002, "epoch": 0.6262104583602324, "step": 1940}, {"loss": 0.7549, "grad_norm": 0.5408226251602173, "learning_rate": 0.0002, "epoch": 0.6294383473208521, "step": 1950}, {"loss": 0.8405, "grad_norm": 0.5479708909988403, "learning_rate": 0.0002, "epoch": 0.6326662362814719, "step": 1960}, {"loss": 0.8138, "grad_norm": 0.4490949809551239, "learning_rate": 0.0002, "epoch": 0.6358941252420917, "step": 1970}, {"loss": 0.854, "grad_norm": 0.48815059661865234, "learning_rate": 0.0002, "epoch": 0.6391220142027114, "step": 1980}, {"loss": 0.8568, "grad_norm": 0.46498045325279236, "learning_rate": 0.0002, "epoch": 0.6423499031633312, "step": 1990}, {"loss": 0.8263, "grad_norm": 0.5136561393737793, "learning_rate": 0.0002, "epoch": 0.6455777921239509, "step": 2000}, {"loss": 0.8503, "grad_norm": 0.5145719647407532, "learning_rate": 0.0002, "epoch": 0.6488056810845707, "step": 2010}, {"loss": 0.8456, "grad_norm": 0.5430373549461365, "learning_rate": 0.0002, "epoch": 0.6520335700451905, "step": 2020}, {"loss": 0.8115, "grad_norm": 0.46347954869270325, "learning_rate": 0.0002, "epoch": 0.6552614590058102, "step": 2030}, {"loss": 0.8769, "grad_norm": 0.5189562439918518, "learning_rate": 0.0002, "epoch": 0.65848934796643, "step": 2040}, {"loss": 0.8453, "grad_norm": 0.43843990564346313, "learning_rate": 0.0002, "epoch": 0.6617172369270498, "step": 2050}, {"loss": 0.7951, "grad_norm": 0.4654983580112457, "learning_rate": 0.0002, "epoch": 0.6649451258876695, "step": 2060}, {"loss": 0.8308, "grad_norm": 0.44835716485977173, "learning_rate": 0.0002, "epoch": 0.6681730148482892, "step": 2070}, {"loss": 0.8181, "grad_norm": 0.38811734318733215, "learning_rate": 0.0002, "epoch": 0.6714009038089089, "step": 2080}, {"loss": 0.762, "grad_norm": 0.5709853172302246, "learning_rate": 0.0002, "epoch": 0.6746287927695287, "step": 2090}, {"loss": 0.8334, "grad_norm": 0.49994757771492004, "learning_rate": 0.0002, "epoch": 0.6778566817301485, "step": 2100}, {"loss": 0.8, "grad_norm": 0.5505402684211731, "learning_rate": 0.0002, "epoch": 0.6810845706907682, "step": 2110}, {"loss": 0.8227, "grad_norm": 0.48195120692253113, "learning_rate": 0.0002, "epoch": 0.684312459651388, "step": 2120}, {"loss": 0.7879, "grad_norm": 0.4854775071144104, "learning_rate": 0.0002, "epoch": 0.6875403486120077, "step": 2130}, {"loss": 0.8231, "grad_norm": 0.6422494649887085, "learning_rate": 0.0002, "epoch": 0.6907682375726275, "step": 2140}, {"loss": 0.8353, "grad_norm": 0.3972536027431488, "learning_rate": 0.0002, "epoch": 0.6939961265332473, "step": 2150}, {"loss": 0.8068, "grad_norm": 0.4297836422920227, "learning_rate": 0.0002, "epoch": 0.697224015493867, "step": 2160}, {"loss": 0.8017, "grad_norm": 0.45486778020858765, "learning_rate": 0.0002, "epoch": 0.7004519044544868, "step": 2170}, {"loss": 0.8507, "grad_norm": 0.4706047773361206, "learning_rate": 0.0002, "epoch": 0.7036797934151066, "step": 2180}, {"loss": 0.8234, "grad_norm": 0.46426892280578613, "learning_rate": 0.0002, "epoch": 0.7069076823757263, "step": 2190}, {"loss": 0.8472, "grad_norm": 0.46333715319633484, "learning_rate": 0.0002, "epoch": 0.7101355713363461, "step": 2200}, {"loss": 0.8247, "grad_norm": 0.4632524251937866, "learning_rate": 0.0002, "epoch": 0.7133634602969657, "step": 2210}, {"loss": 0.8452, "grad_norm": 0.4610830843448639, "learning_rate": 0.0002, "epoch": 0.7165913492575855, "step": 2220}, {"loss": 0.7338, "grad_norm": 0.4905324876308441, "learning_rate": 0.0002, "epoch": 0.7198192382182053, "step": 2230}, {"loss": 0.7715, "grad_norm": 0.4936263859272003, "learning_rate": 0.0002, "epoch": 0.723047127178825, "step": 2240}, {"loss": 0.8162, "grad_norm": 0.40778425335884094, "learning_rate": 0.0002, "epoch": 0.7262750161394448, "step": 2250}, {"loss": 0.828, "grad_norm": 0.50351482629776, "learning_rate": 0.0002, "epoch": 0.7295029051000645, "step": 2260}, {"loss": 0.8475, "grad_norm": 0.4894128143787384, "learning_rate": 0.0002, "epoch": 0.7327307940606843, "step": 2270}, {"loss": 0.8087, "grad_norm": 0.5580906271934509, "learning_rate": 0.0002, "epoch": 0.7359586830213041, "step": 2280}, {"loss": 0.8157, "grad_norm": 0.4655369520187378, "learning_rate": 0.0002, "epoch": 0.7391865719819238, "step": 2290}, {"loss": 0.8395, "grad_norm": 0.4666965901851654, "learning_rate": 0.0002, "epoch": 0.7424144609425436, "step": 2300}, {"loss": 0.7605, "grad_norm": 0.46259936690330505, "learning_rate": 0.0002, "epoch": 0.7456423499031634, "step": 2310}, {"loss": 0.7849, "grad_norm": 0.520706832408905, "learning_rate": 0.0002, "epoch": 0.7488702388637831, "step": 2320}, {"loss": 0.8173, "grad_norm": 0.5142408013343811, "learning_rate": 0.0002, "epoch": 0.7520981278244029, "step": 2330}, {"loss": 0.7782, "grad_norm": 0.5355164408683777, "learning_rate": 0.0002, "epoch": 0.7553260167850226, "step": 2340}, {"loss": 0.8242, "grad_norm": 0.5517185926437378, "learning_rate": 0.0002, "epoch": 0.7585539057456423, "step": 2350}, {"loss": 0.8404, "grad_norm": 0.7162677049636841, "learning_rate": 0.0002, "epoch": 0.7617817947062621, "step": 2360}, {"loss": 0.8455, "grad_norm": 0.42402133345603943, "learning_rate": 0.0002, "epoch": 0.7650096836668818, "step": 2370}, {"loss": 0.8214, "grad_norm": 0.47180113196372986, "learning_rate": 0.0002, "epoch": 0.7682375726275016, "step": 2380}, {"loss": 0.8274, "grad_norm": 0.6262288689613342, "learning_rate": 0.0002, "epoch": 0.7714654615881213, "step": 2390}, {"loss": 0.7915, "grad_norm": 0.5177528262138367, "learning_rate": 0.0002, "epoch": 0.7746933505487411, "step": 2400}, {"loss": 0.7631, "grad_norm": 0.555721640586853, "learning_rate": 0.0002, "epoch": 0.7779212395093609, "step": 2410}, {"loss": 0.795, "grad_norm": 0.5592644810676575, "learning_rate": 0.0002, "epoch": 0.7811491284699806, "step": 2420}, {"loss": 0.8081, "grad_norm": 0.38025397062301636, "learning_rate": 0.0002, "epoch": 0.7843770174306004, "step": 2430}, {"loss": 0.7851, "grad_norm": 0.4597472548484802, "learning_rate": 0.0002, "epoch": 0.7876049063912202, "step": 2440}, {"loss": 0.8575, "grad_norm": 0.4929825961589813, "learning_rate": 0.0002, "epoch": 0.7908327953518399, "step": 2450}, {"loss": 0.7584, "grad_norm": 0.45277655124664307, "learning_rate": 0.0002, "epoch": 0.7940606843124597, "step": 2460}, {"loss": 0.8208, "grad_norm": 0.6224122643470764, "learning_rate": 0.0002, "epoch": 0.7972885732730794, "step": 2470}, {"loss": 0.8449, "grad_norm": 0.5740901827812195, "learning_rate": 0.0002, "epoch": 0.8005164622336992, "step": 2480}, {"loss": 0.7834, "grad_norm": 0.41335329413414, "learning_rate": 0.0002, "epoch": 0.8037443511943189, "step": 2490}, {"loss": 0.7768, "grad_norm": 0.4738694131374359, "learning_rate": 0.0002, "epoch": 0.8069722401549386, "step": 2500}, {"loss": 0.7927, "grad_norm": 0.5288197994232178, "learning_rate": 0.0002, "epoch": 0.8102001291155584, "step": 2510}, {"loss": 0.8334, "grad_norm": 0.5404666066169739, "learning_rate": 0.0002, "epoch": 0.8134280180761781, "step": 2520}, {"loss": 0.7998, "grad_norm": 0.4444909691810608, "learning_rate": 0.0002, "epoch": 0.8166559070367979, "step": 2530}, {"loss": 0.8683, "grad_norm": 0.542061448097229, "learning_rate": 0.0002, "epoch": 0.8198837959974177, "step": 2540}, {"loss": 0.8038, "grad_norm": 0.4914741814136505, "learning_rate": 0.0002, "epoch": 0.8231116849580374, "step": 2550}, {"loss": 0.7899, "grad_norm": 0.41703441739082336, "learning_rate": 0.0002, "epoch": 0.8263395739186572, "step": 2560}, {"loss": 0.824, "grad_norm": 0.5489841103553772, "learning_rate": 0.0002, "epoch": 0.829567462879277, "step": 2570}, {"loss": 0.8157, "grad_norm": 0.5359883308410645, "learning_rate": 0.0002, "epoch": 0.8327953518398967, "step": 2580}, {"loss": 0.8122, "grad_norm": 0.5541019439697266, "learning_rate": 0.0002, "epoch": 0.8360232408005165, "step": 2590}, {"loss": 0.797, "grad_norm": 0.4746638834476471, "learning_rate": 0.0002, "epoch": 0.8392511297611362, "step": 2600}, {"loss": 0.8116, "grad_norm": 0.5243194103240967, "learning_rate": 0.0002, "epoch": 0.842479018721756, "step": 2610}, {"loss": 0.8173, "grad_norm": 0.46824976801872253, "learning_rate": 0.0002, "epoch": 0.8457069076823758, "step": 2620}, {"loss": 0.7525, "grad_norm": 0.49487847089767456, "learning_rate": 0.0002, "epoch": 0.8489347966429954, "step": 2630}, {"loss": 0.8296, "grad_norm": 0.42180097103118896, "learning_rate": 0.0002, "epoch": 0.8521626856036152, "step": 2640}, {"loss": 0.8304, "grad_norm": 0.5516560077667236, "learning_rate": 0.0002, "epoch": 0.855390574564235, "step": 2650}, {"loss": 0.7882, "grad_norm": 0.4392191767692566, "learning_rate": 0.0002, "epoch": 0.8586184635248547, "step": 2660}, {"loss": 0.848, "grad_norm": 0.5387210845947266, "learning_rate": 0.0002, "epoch": 0.8618463524854745, "step": 2670}, {"loss": 0.8094, "grad_norm": 0.6232406497001648, "learning_rate": 0.0002, "epoch": 0.8650742414460942, "step": 2680}, {"loss": 0.768, "grad_norm": 0.53749018907547, "learning_rate": 0.0002, "epoch": 0.868302130406714, "step": 2690}, {"loss": 0.8299, "grad_norm": 0.47480374574661255, "learning_rate": 0.0002, "epoch": 0.8715300193673338, "step": 2700}, {"loss": 0.8055, "grad_norm": 0.44618046283721924, "learning_rate": 0.0002, "epoch": 0.8747579083279535, "step": 2710}, {"loss": 0.8015, "grad_norm": 0.4173581302165985, "learning_rate": 0.0002, "epoch": 0.8779857972885733, "step": 2720}, {"loss": 0.7713, "grad_norm": 0.524081289768219, "learning_rate": 0.0002, "epoch": 0.881213686249193, "step": 2730}, {"loss": 0.8738, "grad_norm": 0.5608431100845337, "learning_rate": 0.0002, "epoch": 0.8844415752098128, "step": 2740}, {"loss": 0.8513, "grad_norm": 0.5212284922599792, "learning_rate": 0.0002, "epoch": 0.8876694641704326, "step": 2750}, {"loss": 0.8139, "grad_norm": 0.5601475834846497, "learning_rate": 0.0002, "epoch": 0.8908973531310523, "step": 2760}, {"loss": 0.7947, "grad_norm": 0.4499223828315735, "learning_rate": 0.0002, "epoch": 0.8941252420916721, "step": 2770}, {"loss": 0.8559, "grad_norm": 0.46945226192474365, "learning_rate": 0.0002, "epoch": 0.8973531310522918, "step": 2780}, {"loss": 0.801, "grad_norm": 0.4837495684623718, "learning_rate": 0.0002, "epoch": 0.9005810200129115, "step": 2790}, {"loss": 0.7887, "grad_norm": 0.5059258937835693, "learning_rate": 0.0002, "epoch": 0.9038089089735313, "step": 2800}, {"loss": 0.8571, "grad_norm": 0.4857945144176483, "learning_rate": 0.0002, "epoch": 0.907036797934151, "step": 2810}, {"loss": 0.8301, "grad_norm": 0.5001962780952454, "learning_rate": 0.0002, "epoch": 0.9102646868947708, "step": 2820}, {"loss": 0.8236, "grad_norm": 0.5468648672103882, "learning_rate": 0.0002, "epoch": 0.9134925758553906, "step": 2830}, {"loss": 0.8071, "grad_norm": 0.5533056259155273, "learning_rate": 0.0002, "epoch": 0.9167204648160103, "step": 2840}, {"loss": 0.7895, "grad_norm": 0.5909785628318787, "learning_rate": 0.0002, "epoch": 0.9199483537766301, "step": 2850}, {"loss": 0.796, "grad_norm": 0.47428104281425476, "learning_rate": 0.0002, "epoch": 0.9231762427372499, "step": 2860}, {"loss": 0.7845, "grad_norm": 0.548814058303833, "learning_rate": 0.0002, "epoch": 0.9264041316978696, "step": 2870}, {"loss": 0.7871, "grad_norm": 0.5576745271682739, "learning_rate": 0.0002, "epoch": 0.9296320206584894, "step": 2880}, {"loss": 0.8399, "grad_norm": 0.47094792127609253, "learning_rate": 0.0002, "epoch": 0.9328599096191091, "step": 2890}, {"loss": 0.805, "grad_norm": 0.5408539772033691, "learning_rate": 0.0002, "epoch": 0.9360877985797289, "step": 2900}, {"loss": 0.785, "grad_norm": 0.5922889113426208, "learning_rate": 0.0002, "epoch": 0.9393156875403487, "step": 2910}, {"loss": 0.8043, "grad_norm": 0.45462584495544434, "learning_rate": 0.0002, "epoch": 0.9425435765009683, "step": 2920}, {"loss": 0.8344, "grad_norm": 0.6864947080612183, "learning_rate": 0.0002, "epoch": 0.9457714654615881, "step": 2930}, {"loss": 0.8166, "grad_norm": 0.4706299304962158, "learning_rate": 0.0002, "epoch": 0.9489993544222078, "step": 2940}, {"loss": 0.8422, "grad_norm": 0.5583269596099854, "learning_rate": 0.0002, "epoch": 0.9522272433828276, "step": 2950}, {"loss": 0.836, "grad_norm": 0.51015704870224, "learning_rate": 0.0002, "epoch": 0.9554551323434474, "step": 2960}, {"loss": 0.8371, "grad_norm": 0.5325582027435303, "learning_rate": 0.0002, "epoch": 0.9586830213040671, "step": 2970}, {"loss": 0.7593, "grad_norm": 0.49008598923683167, "learning_rate": 0.0002, "epoch": 0.9619109102646869, "step": 2980}, {"loss": 0.8093, "grad_norm": 0.4422132074832916, "learning_rate": 0.0002, "epoch": 0.9651387992253067, "step": 2990}, {"loss": 0.7966, "grad_norm": 0.5053589344024658, "learning_rate": 0.0002, "epoch": 0.9683666881859264, "step": 3000}, {"loss": 0.8081, "grad_norm": 0.46754521131515503, "learning_rate": 0.0002, "epoch": 0.9715945771465462, "step": 3010}, {"loss": 0.8377, "grad_norm": 0.5613434910774231, "learning_rate": 0.0002, "epoch": 0.9748224661071659, "step": 3020}, {"loss": 0.7856, "grad_norm": 0.5052843689918518, "learning_rate": 0.0002, "epoch": 0.9780503550677857, "step": 3030}, {"loss": 0.8412, "grad_norm": 0.4270972013473511, "learning_rate": 0.0002, "epoch": 0.9812782440284055, "step": 3040}, {"loss": 0.8353, "grad_norm": 0.4974991977214813, "learning_rate": 0.0002, "epoch": 0.9845061329890252, "step": 3050}, {"loss": 0.8415, "grad_norm": 0.4432311952114105, "learning_rate": 0.0002, "epoch": 0.9877340219496449, "step": 3060}, {"loss": 0.7764, "grad_norm": 0.466457724571228, "learning_rate": 0.0002, "epoch": 0.9909619109102646, "step": 3070}, {"loss": 0.8067, "grad_norm": 0.6438009142875671, "learning_rate": 0.0002, "epoch": 0.9941897998708844, "step": 3080}, {"loss": 0.8425, "grad_norm": 0.5593604445457458, "learning_rate": 0.0002, "epoch": 0.9974176888315042, "step": 3090}, {"eval_loss": 1.0958120822906494, "eval_runtime": 148.3273, "eval_samples_per_second": 4.942, "eval_steps_per_second": 0.62, "epoch": 1.0, "step": 3098}, {"loss": 0.8275, "grad_norm": 0.5701445937156677, "learning_rate": 0.0002, "epoch": 1.000645577792124, "step": 3100}, {"loss": 0.7756, "grad_norm": 0.6089657545089722, "learning_rate": 0.0002, "epoch": 1.0038734667527438, "step": 3110}, {"loss": 0.7492, "grad_norm": 0.5619552135467529, "learning_rate": 0.0002, "epoch": 1.0071013557133635, "step": 3120}, {"loss": 0.7544, "grad_norm": 0.5550283789634705, "learning_rate": 0.0002, "epoch": 1.010329244673983, "step": 3130}, {"loss": 0.8006, "grad_norm": 0.6221792101860046, "learning_rate": 0.0002, "epoch": 1.013557133634603, "step": 3140}, {"loss": 0.7603, "grad_norm": 0.5450758934020996, "learning_rate": 0.0002, "epoch": 1.0167850225952226, "step": 3150}, {"loss": 0.7021, "grad_norm": 0.4359588027000427, "learning_rate": 0.0002, "epoch": 1.0200129115558425, "step": 3160}, {"loss": 0.7468, "grad_norm": 0.5932239890098572, "learning_rate": 0.0002, "epoch": 1.0232408005164622, "step": 3170}, {"loss": 0.7649, "grad_norm": 0.45478707551956177, "learning_rate": 0.0002, "epoch": 1.026468689477082, "step": 3180}, {"loss": 0.7355, "grad_norm": 0.677615761756897, "learning_rate": 0.0002, "epoch": 1.0296965784377017, "step": 3190}, {"loss": 0.6928, "grad_norm": 0.6231790781021118, "learning_rate": 0.0002, "epoch": 1.0329244673983216, "step": 3200}, {"loss": 0.7471, "grad_norm": 0.5074195861816406, "learning_rate": 0.0002, "epoch": 1.0361523563589412, "step": 3210}, {"loss": 0.6864, "grad_norm": 0.4844142198562622, "learning_rate": 0.0002, "epoch": 1.039380245319561, "step": 3220}, {"loss": 0.7655, "grad_norm": 0.5372750759124756, "learning_rate": 0.0002, "epoch": 1.0426081342801807, "step": 3230}, {"loss": 0.7384, "grad_norm": 0.46296265721321106, "learning_rate": 0.0002, "epoch": 1.0458360232408006, "step": 3240}, {"loss": 0.7894, "grad_norm": 0.5417148470878601, "learning_rate": 0.0002, "epoch": 1.0490639122014203, "step": 3250}, {"loss": 0.7637, "grad_norm": 0.5695074200630188, "learning_rate": 0.0002, "epoch": 1.0522918011620401, "step": 3260}, {"loss": 0.7456, "grad_norm": 0.5050092935562134, "learning_rate": 0.0002, "epoch": 1.0555196901226598, "step": 3270}, {"loss": 0.6805, "grad_norm": 0.5320752263069153, "learning_rate": 0.0002, "epoch": 1.0587475790832794, "step": 3280}, {"loss": 0.7419, "grad_norm": 0.5832052230834961, "learning_rate": 0.0002, "epoch": 1.0619754680438993, "step": 3290}, {"loss": 0.7656, "grad_norm": 0.5228804349899292, "learning_rate": 0.0002, "epoch": 1.065203357004519, "step": 3300}, {"loss": 0.6834, "grad_norm": 0.5819445252418518, "learning_rate": 0.0002, "epoch": 1.0684312459651388, "step": 3310}, {"loss": 0.7093, "grad_norm": 0.4201328754425049, "learning_rate": 0.0002, "epoch": 1.0716591349257585, "step": 3320}, {"loss": 0.7494, "grad_norm": 0.5424145460128784, "learning_rate": 0.0002, "epoch": 1.0748870238863784, "step": 3330}, {"loss": 0.7828, "grad_norm": 0.6169946789741516, "learning_rate": 0.0002, "epoch": 1.078114912846998, "step": 3340}, {"loss": 0.7505, "grad_norm": 0.607676088809967, "learning_rate": 0.0002, "epoch": 1.0813428018076179, "step": 3350}, {"loss": 0.7315, "grad_norm": 0.5191982388496399, "learning_rate": 0.0002, "epoch": 1.0845706907682375, "step": 3360}, {"loss": 0.7699, "grad_norm": 0.5728003978729248, "learning_rate": 0.0002, "epoch": 1.0877985797288574, "step": 3370}, {"loss": 0.7381, "grad_norm": 0.5402643084526062, "learning_rate": 0.0002, "epoch": 1.091026468689477, "step": 3380}, {"loss": 0.7208, "grad_norm": 0.5377541780471802, "learning_rate": 0.0002, "epoch": 1.094254357650097, "step": 3390}, {"loss": 0.7672, "grad_norm": 0.4751385748386383, "learning_rate": 0.0002, "epoch": 1.0974822466107166, "step": 3400}, {"loss": 0.7326, "grad_norm": 0.559158444404602, "learning_rate": 0.0002, "epoch": 1.1007101355713362, "step": 3410}, {"loss": 0.7366, "grad_norm": 0.4917701482772827, "learning_rate": 0.0002, "epoch": 1.103938024531956, "step": 3420}, {"loss": 0.7593, "grad_norm": 0.5507875084877014, "learning_rate": 0.0002, "epoch": 1.1071659134925758, "step": 3430}, {"loss": 0.7424, "grad_norm": 0.45458680391311646, "learning_rate": 0.0002, "epoch": 1.1103938024531956, "step": 3440}, {"loss": 0.7234, "grad_norm": 0.5721744894981384, "learning_rate": 0.0002, "epoch": 1.1136216914138153, "step": 3450}, {"loss": 0.7219, "grad_norm": 0.5776081681251526, "learning_rate": 0.0002, "epoch": 1.1168495803744352, "step": 3460}, {"loss": 0.7644, "grad_norm": 0.5261953473091125, "learning_rate": 0.0002, "epoch": 1.1200774693350548, "step": 3470}, {"loss": 0.6586, "grad_norm": 0.47759532928466797, "learning_rate": 0.0002, "epoch": 1.1233053582956747, "step": 3480}, {"loss": 0.7641, "grad_norm": 0.5697659850120544, "learning_rate": 0.0002, "epoch": 1.1265332472562943, "step": 3490}, {"loss": 0.7017, "grad_norm": 0.5643419623374939, "learning_rate": 0.0002, "epoch": 1.1297611362169142, "step": 3500}, {"loss": 0.7235, "grad_norm": 0.6502931118011475, "learning_rate": 0.0002, "epoch": 1.1329890251775339, "step": 3510}, {"loss": 0.7662, "grad_norm": 0.5236507654190063, "learning_rate": 0.0002, "epoch": 1.1362169141381537, "step": 3520}, {"loss": 0.7571, "grad_norm": 0.6521499156951904, "learning_rate": 0.0002, "epoch": 1.1394448030987734, "step": 3530}, {"loss": 0.7304, "grad_norm": 0.5893217325210571, "learning_rate": 0.0002, "epoch": 1.142672692059393, "step": 3540}, {"loss": 0.7508, "grad_norm": 0.5300073027610779, "learning_rate": 0.0002, "epoch": 1.145900581020013, "step": 3550}, {"loss": 0.6937, "grad_norm": 0.6794660091400146, "learning_rate": 0.0002, "epoch": 1.1491284699806328, "step": 3560}, {"loss": 0.7614, "grad_norm": 0.5420064926147461, "learning_rate": 0.0002, "epoch": 1.1523563589412524, "step": 3570}, {"loss": 0.7648, "grad_norm": 0.5096590518951416, "learning_rate": 0.0002, "epoch": 1.155584247901872, "step": 3580}, {"loss": 0.7436, "grad_norm": 0.5726043581962585, "learning_rate": 0.0002, "epoch": 1.158812136862492, "step": 3590}, {"loss": 0.7728, "grad_norm": 0.7388110160827637, "learning_rate": 0.0002, "epoch": 1.1620400258231116, "step": 3600}, {"loss": 0.7421, "grad_norm": 0.5597969889640808, "learning_rate": 0.0002, "epoch": 1.1652679147837315, "step": 3610}, {"loss": 0.7132, "grad_norm": 0.5067800283432007, "learning_rate": 0.0002, "epoch": 1.1684958037443511, "step": 3620}, {"loss": 0.7893, "grad_norm": 0.6625118255615234, "learning_rate": 0.0002, "epoch": 1.171723692704971, "step": 3630}, {"loss": 0.7611, "grad_norm": 0.5830849409103394, "learning_rate": 0.0002, "epoch": 1.1749515816655907, "step": 3640}, {"loss": 0.7973, "grad_norm": 0.6140692830085754, "learning_rate": 0.0002, "epoch": 1.1781794706262105, "step": 3650}, {"loss": 0.7617, "grad_norm": 0.714523434638977, "learning_rate": 0.0002, "epoch": 1.1814073595868302, "step": 3660}, {"loss": 0.7092, "grad_norm": 0.5196696519851685, "learning_rate": 0.0002, "epoch": 1.18463524854745, "step": 3670}, {"loss": 0.7821, "grad_norm": 0.6677889823913574, "learning_rate": 0.0002, "epoch": 1.1878631375080697, "step": 3680}, {"loss": 0.7813, "grad_norm": 0.47095245122909546, "learning_rate": 0.0002, "epoch": 1.1910910264686896, "step": 3690}, {"loss": 0.7702, "grad_norm": 0.5197778940200806, "learning_rate": 0.0002, "epoch": 1.1943189154293092, "step": 3700}, {"loss": 0.7349, "grad_norm": 0.5156530141830444, "learning_rate": 0.0002, "epoch": 1.1975468043899289, "step": 3710}, {"loss": 0.7738, "grad_norm": 0.6968549489974976, "learning_rate": 0.0002, "epoch": 1.2007746933505488, "step": 3720}, {"loss": 0.7599, "grad_norm": 0.48983848094940186, "learning_rate": 0.0002, "epoch": 1.2040025823111684, "step": 3730}, {"loss": 0.7163, "grad_norm": 0.6709973216056824, "learning_rate": 0.0002, "epoch": 1.2072304712717883, "step": 3740}, {"loss": 0.7632, "grad_norm": 0.48681750893592834, "learning_rate": 0.0002, "epoch": 1.210458360232408, "step": 3750}, {"loss": 0.7039, "grad_norm": 0.49475061893463135, "learning_rate": 0.0002, "epoch": 1.2136862491930278, "step": 3760}, {"loss": 0.7372, "grad_norm": 0.6163983345031738, "learning_rate": 0.0002, "epoch": 1.2169141381536475, "step": 3770}, {"loss": 0.757, "grad_norm": 0.5481411218643188, "learning_rate": 0.0002, "epoch": 1.2201420271142673, "step": 3780}, {"loss": 0.7601, "grad_norm": 0.620639979839325, "learning_rate": 0.0002, "epoch": 1.223369916074887, "step": 3790}, {"loss": 0.7738, "grad_norm": 0.7017222046852112, "learning_rate": 0.0002, "epoch": 1.2265978050355069, "step": 3800}, {"loss": 0.7468, "grad_norm": 0.5872400403022766, "learning_rate": 0.0002, "epoch": 1.2298256939961265, "step": 3810}, {"loss": 0.7854, "grad_norm": 0.45765596628189087, "learning_rate": 0.0002, "epoch": 1.2330535829567464, "step": 3820}, {"loss": 0.7865, "grad_norm": 0.5676377415657043, "learning_rate": 0.0002, "epoch": 1.236281471917366, "step": 3830}, {"loss": 0.7696, "grad_norm": 0.4793425500392914, "learning_rate": 0.0002, "epoch": 1.2395093608779857, "step": 3840}, {"loss": 0.7065, "grad_norm": 0.5060022473335266, "learning_rate": 0.0002, "epoch": 1.2427372498386056, "step": 3850}, {"loss": 0.7333, "grad_norm": 0.6140682697296143, "learning_rate": 0.0002, "epoch": 1.2459651387992252, "step": 3860}, {"loss": 0.7496, "grad_norm": 0.5030326843261719, "learning_rate": 0.0002, "epoch": 1.249193027759845, "step": 3870}, {"loss": 0.7226, "grad_norm": 0.6609430909156799, "learning_rate": 0.0002, "epoch": 1.2524209167204647, "step": 3880}, {"loss": 0.7212, "grad_norm": 0.5459545850753784, "learning_rate": 0.0002, "epoch": 1.2556488056810846, "step": 3890}, {"loss": 0.7145, "grad_norm": 0.5328870415687561, "learning_rate": 0.0002, "epoch": 1.2588766946417043, "step": 3900}, {"loss": 0.7572, "grad_norm": 0.5840652585029602, "learning_rate": 0.0002, "epoch": 1.2621045836023241, "step": 3910}, {"loss": 0.7624, "grad_norm": 0.5587584376335144, "learning_rate": 0.0002, "epoch": 1.2653324725629438, "step": 3920}, {"loss": 0.7846, "grad_norm": 0.5886949896812439, "learning_rate": 0.0002, "epoch": 1.2685603615235637, "step": 3930}, {"loss": 0.7251, "grad_norm": 0.5128693580627441, "learning_rate": 0.0002, "epoch": 1.2717882504841833, "step": 3940}, {"loss": 0.7032, "grad_norm": 0.6207669377326965, "learning_rate": 0.0002, "epoch": 1.2750161394448032, "step": 3950}, {"loss": 0.7506, "grad_norm": 0.5789574384689331, "learning_rate": 0.0002, "epoch": 1.2782440284054228, "step": 3960}, {"loss": 0.7574, "grad_norm": 0.503162145614624, "learning_rate": 0.0002, "epoch": 1.2814719173660425, "step": 3970}, {"loss": 0.7489, "grad_norm": 0.6670064926147461, "learning_rate": 0.0002, "epoch": 1.2846998063266624, "step": 3980}, {"loss": 0.7198, "grad_norm": 0.5676213502883911, "learning_rate": 0.0002, "epoch": 1.2879276952872822, "step": 3990}, {"loss": 0.7892, "grad_norm": 0.5383169054985046, "learning_rate": 0.0002, "epoch": 1.2911555842479019, "step": 4000}, {"loss": 0.7432, "grad_norm": 0.714743971824646, "learning_rate": 0.0002, "epoch": 1.2943834732085215, "step": 4010}, {"loss": 0.7594, "grad_norm": 0.5740262269973755, "learning_rate": 0.0002, "epoch": 1.2976113621691414, "step": 4020}, {"loss": 0.7564, "grad_norm": 0.6143045425415039, "learning_rate": 0.0002, "epoch": 1.300839251129761, "step": 4030}, {"loss": 0.7181, "grad_norm": 0.501025378704071, "learning_rate": 0.0002, "epoch": 1.304067140090381, "step": 4040}, {"loss": 0.7099, "grad_norm": 0.5784100294113159, "learning_rate": 0.0002, "epoch": 1.3072950290510006, "step": 4050}, {"loss": 0.7403, "grad_norm": 0.6182606220245361, "learning_rate": 0.0002, "epoch": 1.3105229180116205, "step": 4060}, {"loss": 0.7249, "grad_norm": 0.5072231292724609, "learning_rate": 0.0002, "epoch": 1.3137508069722401, "step": 4070}, {"loss": 0.7451, "grad_norm": 0.6841012835502625, "learning_rate": 0.0002, "epoch": 1.31697869593286, "step": 4080}, {"loss": 0.7395, "grad_norm": 0.697257936000824, "learning_rate": 0.0002, "epoch": 1.3202065848934796, "step": 4090}, {"loss": 0.7401, "grad_norm": 0.5113214254379272, "learning_rate": 0.0002, "epoch": 1.3234344738540993, "step": 4100}, {"loss": 0.7336, "grad_norm": 0.6270561814308167, "learning_rate": 0.0002, "epoch": 1.3266623628147192, "step": 4110}, {"loss": 0.7535, "grad_norm": 0.5525947213172913, "learning_rate": 0.0002, "epoch": 1.329890251775339, "step": 4120}, {"loss": 0.6999, "grad_norm": 0.546071469783783, "learning_rate": 0.0002, "epoch": 1.3331181407359587, "step": 4130}, {"loss": 0.7884, "grad_norm": 0.6516721248626709, "learning_rate": 0.0002, "epoch": 1.3363460296965783, "step": 4140}, {"loss": 0.755, "grad_norm": 0.6235111355781555, "learning_rate": 0.0002, "epoch": 1.3395739186571982, "step": 4150}, {"loss": 0.7467, "grad_norm": 0.538649320602417, "learning_rate": 0.0002, "epoch": 1.3428018076178179, "step": 4160}, {"loss": 0.7368, "grad_norm": 0.5367001891136169, "learning_rate": 0.0002, "epoch": 1.3460296965784377, "step": 4170}, {"loss": 0.7536, "grad_norm": 0.6134631037712097, "learning_rate": 0.0002, "epoch": 1.3492575855390574, "step": 4180}, {"loss": 0.8245, "grad_norm": 0.5827262997627258, "learning_rate": 0.0002, "epoch": 1.3524854744996773, "step": 4190}, {"loss": 0.7288, "grad_norm": 0.5706096291542053, "learning_rate": 0.0002, "epoch": 1.355713363460297, "step": 4200}, {"loss": 0.7302, "grad_norm": 0.6422057151794434, "learning_rate": 0.0002, "epoch": 1.3589412524209168, "step": 4210}, {"loss": 0.7303, "grad_norm": 0.6316141486167908, "learning_rate": 0.0002, "epoch": 1.3621691413815364, "step": 4220}, {"loss": 0.7457, "grad_norm": 0.6946983933448792, "learning_rate": 0.0002, "epoch": 1.365397030342156, "step": 4230}, {"loss": 0.7388, "grad_norm": 0.5381525754928589, "learning_rate": 0.0002, "epoch": 1.368624919302776, "step": 4240}, {"loss": 0.73, "grad_norm": 0.5484845638275146, "learning_rate": 0.0002, "epoch": 1.3718528082633958, "step": 4250}, {"loss": 0.7584, "grad_norm": 0.5961896777153015, "learning_rate": 0.0002, "epoch": 1.3750806972240155, "step": 4260}, {"loss": 0.8006, "grad_norm": 0.6041752696037292, "learning_rate": 0.0002, "epoch": 1.3783085861846351, "step": 4270}, {"loss": 0.7276, "grad_norm": 0.6283464431762695, "learning_rate": 0.0002, "epoch": 1.381536475145255, "step": 4280}, {"loss": 0.757, "grad_norm": 0.6761324405670166, "learning_rate": 0.0002, "epoch": 1.384764364105875, "step": 4290}, {"loss": 0.7381, "grad_norm": 0.504311203956604, "learning_rate": 0.0002, "epoch": 1.3879922530664945, "step": 4300}, {"loss": 0.7536, "grad_norm": 0.6100395917892456, "learning_rate": 0.0002, "epoch": 1.3912201420271142, "step": 4310}, {"loss": 0.7103, "grad_norm": 0.6245788335800171, "learning_rate": 0.0002, "epoch": 1.394448030987734, "step": 4320}, {"loss": 0.7505, "grad_norm": 0.6074621081352234, "learning_rate": 0.0002, "epoch": 1.3976759199483537, "step": 4330}, {"loss": 0.752, "grad_norm": 0.6683838963508606, "learning_rate": 0.0002, "epoch": 1.4009038089089736, "step": 4340}, {"loss": 0.7537, "grad_norm": 0.622998058795929, "learning_rate": 0.0002, "epoch": 1.4041316978695932, "step": 4350}, {"loss": 0.8148, "grad_norm": 0.6089423894882202, "learning_rate": 0.0002, "epoch": 1.4073595868302131, "step": 4360}, {"loss": 0.7715, "grad_norm": 0.6381658911705017, "learning_rate": 0.0002, "epoch": 1.4105874757908328, "step": 4370}, {"loss": 0.7871, "grad_norm": 0.5419308543205261, "learning_rate": 0.0002, "epoch": 1.4138153647514526, "step": 4380}, {"loss": 0.7386, "grad_norm": 0.6026232242584229, "learning_rate": 0.0002, "epoch": 1.4170432537120723, "step": 4390}, {"loss": 0.7529, "grad_norm": 0.4911101162433624, "learning_rate": 0.0002, "epoch": 1.420271142672692, "step": 4400}, {"loss": 0.7495, "grad_norm": 0.6302908062934875, "learning_rate": 0.0002, "epoch": 1.4234990316333118, "step": 4410}, {"loss": 0.7446, "grad_norm": 0.6692768931388855, "learning_rate": 0.0002, "epoch": 1.4267269205939317, "step": 4420}, {"loss": 0.7312, "grad_norm": 0.46294572949409485, "learning_rate": 0.0002, "epoch": 1.4299548095545513, "step": 4430}, {"loss": 0.7255, "grad_norm": 0.5452619194984436, "learning_rate": 0.0002, "epoch": 1.433182698515171, "step": 4440}, {"loss": 0.7974, "grad_norm": 0.7809233069419861, "learning_rate": 0.0002, "epoch": 1.4364105874757909, "step": 4450}, {"loss": 0.7103, "grad_norm": 0.550088107585907, "learning_rate": 0.0002, "epoch": 1.4396384764364105, "step": 4460}, {"loss": 0.7088, "grad_norm": 0.7139151096343994, "learning_rate": 0.0002, "epoch": 1.4428663653970304, "step": 4470}, {"loss": 0.7358, "grad_norm": 0.6187090873718262, "learning_rate": 0.0002, "epoch": 1.44609425435765, "step": 4480}, {"loss": 0.7608, "grad_norm": 0.5948249101638794, "learning_rate": 0.0002, "epoch": 1.44932214331827, "step": 4490}, {"loss": 0.7582, "grad_norm": 0.6510892510414124, "learning_rate": 0.0002, "epoch": 1.4525500322788896, "step": 4500}, {"loss": 0.7105, "grad_norm": 0.6552293300628662, "learning_rate": 0.0002, "epoch": 1.4557779212395094, "step": 4510}, {"loss": 0.7965, "grad_norm": 0.585574209690094, "learning_rate": 0.0002, "epoch": 1.459005810200129, "step": 4520}, {"loss": 0.761, "grad_norm": 0.4830162823200226, "learning_rate": 0.0002, "epoch": 1.4622336991607487, "step": 4530}, {"loss": 0.7424, "grad_norm": 0.5780223608016968, "learning_rate": 0.0002, "epoch": 1.4654615881213686, "step": 4540}, {"loss": 0.7518, "grad_norm": 0.5462607145309448, "learning_rate": 0.0002, "epoch": 1.4686894770819885, "step": 4550}, {"loss": 0.7342, "grad_norm": 0.5183546543121338, "learning_rate": 0.0002, "epoch": 1.4719173660426081, "step": 4560}, {"loss": 0.71, "grad_norm": 0.676917552947998, "learning_rate": 0.0002, "epoch": 1.4751452550032278, "step": 4570}, {"loss": 0.7875, "grad_norm": 0.5772345066070557, "learning_rate": 0.0002, "epoch": 1.4783731439638477, "step": 4580}, {"loss": 0.7709, "grad_norm": 0.7320035696029663, "learning_rate": 0.0002, "epoch": 1.4816010329244673, "step": 4590}, {"loss": 0.7601, "grad_norm": 0.5024042129516602, "learning_rate": 0.0002, "epoch": 1.4848289218850872, "step": 4600}, {"loss": 0.8061, "grad_norm": 0.5482868552207947, "learning_rate": 0.0002, "epoch": 1.4880568108457068, "step": 4610}, {"loss": 0.714, "grad_norm": 0.5447399616241455, "learning_rate": 0.0002, "epoch": 1.4912846998063267, "step": 4620}, {"loss": 0.7959, "grad_norm": 0.5953414440155029, "learning_rate": 0.0002, "epoch": 1.4945125887669464, "step": 4630}, {"loss": 0.7463, "grad_norm": 0.6983066201210022, "learning_rate": 0.0002, "epoch": 1.4977404777275662, "step": 4640}, {"loss": 0.7877, "grad_norm": 0.586327075958252, "learning_rate": 0.0002, "epoch": 1.500968366688186, "step": 4650}, {"loss": 0.7169, "grad_norm": 0.5839682221412659, "learning_rate": 0.0002, "epoch": 1.5041962556488055, "step": 4660}, {"loss": 0.7524, "grad_norm": 0.5959209203720093, "learning_rate": 0.0002, "epoch": 1.5074241446094254, "step": 4670}, {"loss": 0.7615, "grad_norm": 0.5073857307434082, "learning_rate": 0.0002, "epoch": 1.5106520335700453, "step": 4680}, {"loss": 0.7258, "grad_norm": 0.5183001160621643, "learning_rate": 0.0002, "epoch": 1.513879922530665, "step": 4690}, {"loss": 0.784, "grad_norm": 0.593530535697937, "learning_rate": 0.0002, "epoch": 1.5171078114912846, "step": 4700}, {"loss": 0.7722, "grad_norm": 0.675993025302887, "learning_rate": 0.0002, "epoch": 1.5203357004519045, "step": 4710}, {"loss": 0.7485, "grad_norm": 0.5823286771774292, "learning_rate": 0.0002, "epoch": 1.5235635894125243, "step": 4720}, {"loss": 0.7474, "grad_norm": 0.5825035572052002, "learning_rate": 0.0002, "epoch": 1.526791478373144, "step": 4730}, {"loss": 0.8287, "grad_norm": 0.5689691305160522, "learning_rate": 0.0002, "epoch": 1.5300193673337636, "step": 4740}, {"loss": 0.7279, "grad_norm": 0.6037150621414185, "learning_rate": 0.0002, "epoch": 1.5332472562943835, "step": 4750}, {"loss": 0.7865, "grad_norm": 0.6393677592277527, "learning_rate": 0.0002, "epoch": 1.5364751452550034, "step": 4760}, {"loss": 0.805, "grad_norm": 0.5926381945610046, "learning_rate": 0.0002, "epoch": 1.539703034215623, "step": 4770}, {"loss": 0.7425, "grad_norm": 0.9468599557876587, "learning_rate": 0.0002, "epoch": 1.5429309231762427, "step": 4780}, {"loss": 0.7565, "grad_norm": 0.7544237375259399, "learning_rate": 0.0002, "epoch": 1.5461588121368623, "step": 4790}, {"loss": 0.7398, "grad_norm": 0.5308566093444824, "learning_rate": 0.0002, "epoch": 1.5493867010974822, "step": 4800}, {"loss": 0.7756, "grad_norm": 0.6590296030044556, "learning_rate": 0.0002, "epoch": 1.552614590058102, "step": 4810}, {"loss": 0.7212, "grad_norm": 0.5630404353141785, "learning_rate": 0.0002, "epoch": 1.5558424790187217, "step": 4820}, {"loss": 0.7593, "grad_norm": 0.6800200939178467, "learning_rate": 0.0002, "epoch": 1.5590703679793414, "step": 4830}, {"loss": 0.7373, "grad_norm": 0.5463718175888062, "learning_rate": 0.0002, "epoch": 1.5622982569399613, "step": 4840}, {"loss": 0.7519, "grad_norm": 0.505135178565979, "learning_rate": 0.0002, "epoch": 1.5655261459005811, "step": 4850}, {"loss": 0.8122, "grad_norm": 0.5469676852226257, "learning_rate": 0.0002, "epoch": 1.5687540348612008, "step": 4860}, {"loss": 0.7185, "grad_norm": 0.5318337678909302, "learning_rate": 0.0002, "epoch": 1.5719819238218204, "step": 4870}, {"loss": 0.7324, "grad_norm": 0.7287914752960205, "learning_rate": 0.0002, "epoch": 1.5752098127824403, "step": 4880}, {"loss": 0.7532, "grad_norm": 0.7318989038467407, "learning_rate": 0.0002, "epoch": 1.5784377017430602, "step": 4890}, {"loss": 0.7851, "grad_norm": 0.6499921679496765, "learning_rate": 0.0002, "epoch": 1.5816655907036798, "step": 4900}, {"loss": 0.753, "grad_norm": 0.47907355427742004, "learning_rate": 0.0002, "epoch": 1.5848934796642995, "step": 4910}, {"loss": 0.7699, "grad_norm": 0.7338833808898926, "learning_rate": 0.0002, "epoch": 1.5881213686249191, "step": 4920}, {"loss": 0.7592, "grad_norm": 0.5800719261169434, "learning_rate": 0.0002, "epoch": 1.591349257585539, "step": 4930}, {"loss": 0.7211, "grad_norm": 0.5365763306617737, "learning_rate": 0.0002, "epoch": 1.594577146546159, "step": 4940}, {"loss": 0.777, "grad_norm": 0.5800772309303284, "learning_rate": 0.0002, "epoch": 1.5978050355067785, "step": 4950}, {"loss": 0.8027, "grad_norm": 0.7878010869026184, "learning_rate": 0.0002, "epoch": 1.6010329244673982, "step": 4960}, {"loss": 0.7894, "grad_norm": 0.5919058918952942, "learning_rate": 0.0002, "epoch": 1.604260813428018, "step": 4970}, {"loss": 0.7762, "grad_norm": 0.5004435181617737, "learning_rate": 0.0002, "epoch": 1.607488702388638, "step": 4980}, {"loss": 0.7447, "grad_norm": 0.6299242377281189, "learning_rate": 0.0002, "epoch": 1.6107165913492576, "step": 4990}, {"loss": 0.7149, "grad_norm": 0.6307242512702942, "learning_rate": 0.0002, "epoch": 1.6139444803098772, "step": 5000}, {"loss": 0.7693, "grad_norm": 0.7838703989982605, "learning_rate": 0.0002, "epoch": 1.6171723692704971, "step": 5010}, {"loss": 0.7364, "grad_norm": 0.6454671621322632, "learning_rate": 0.0002, "epoch": 1.620400258231117, "step": 5020}, {"loss": 0.74, "grad_norm": 0.5907095670700073, "learning_rate": 0.0002, "epoch": 1.6236281471917366, "step": 5030}, {"loss": 0.7331, "grad_norm": 0.6053501963615417, "learning_rate": 0.0002, "epoch": 1.6268560361523563, "step": 5040}, {"loss": 0.6987, "grad_norm": 0.5644670128822327, "learning_rate": 0.0002, "epoch": 1.630083925112976, "step": 5050}, {"loss": 0.7886, "grad_norm": 0.6320949792861938, "learning_rate": 0.0002, "epoch": 1.6333118140735958, "step": 5060}, {"loss": 0.7109, "grad_norm": 0.6101489067077637, "learning_rate": 0.0002, "epoch": 1.6365397030342157, "step": 5070}, {"loss": 0.6922, "grad_norm": 0.9435283541679382, "learning_rate": 0.0002, "epoch": 1.6397675919948353, "step": 5080}, {"loss": 0.729, "grad_norm": 0.6668919324874878, "learning_rate": 0.0002, "epoch": 1.642995480955455, "step": 5090}, {"loss": 0.7402, "grad_norm": 0.6160340905189514, "learning_rate": 0.0002, "epoch": 1.6462233699160749, "step": 5100}, {"loss": 0.7461, "grad_norm": 0.5999835729598999, "learning_rate": 0.0002, "epoch": 1.6494512588766947, "step": 5110}, {"loss": 0.7661, "grad_norm": 0.9378551840782166, "learning_rate": 0.0002, "epoch": 1.6526791478373144, "step": 5120}, {"loss": 0.7586, "grad_norm": 0.4795055389404297, "learning_rate": 0.0002, "epoch": 1.655907036797934, "step": 5130}, {"loss": 0.7342, "grad_norm": 0.4878861606121063, "learning_rate": 0.0002, "epoch": 1.659134925758554, "step": 5140}, {"loss": 0.7362, "grad_norm": 0.6042965054512024, "learning_rate": 0.0002, "epoch": 1.6623628147191738, "step": 5150}, {"loss": 0.7863, "grad_norm": 0.5829901695251465, "learning_rate": 0.0002, "epoch": 1.6655907036797934, "step": 5160}, {"loss": 0.7498, "grad_norm": 0.5168480277061462, "learning_rate": 0.0002, "epoch": 1.668818592640413, "step": 5170}, {"loss": 0.7333, "grad_norm": 0.6489511132240295, "learning_rate": 0.0002, "epoch": 1.672046481601033, "step": 5180}, {"loss": 0.7257, "grad_norm": 0.5955966114997864, "learning_rate": 0.0002, "epoch": 1.6752743705616526, "step": 5190}, {"loss": 0.7938, "grad_norm": 0.6228088140487671, "learning_rate": 0.0002, "epoch": 1.6785022595222725, "step": 5200}, {"loss": 0.7626, "grad_norm": 0.5726390480995178, "learning_rate": 0.0002, "epoch": 1.6817301484828922, "step": 5210}, {"loss": 0.7479, "grad_norm": 0.6116343140602112, "learning_rate": 0.0002, "epoch": 1.6849580374435118, "step": 5220}, {"loss": 0.7169, "grad_norm": 0.5483687520027161, "learning_rate": 0.0002, "epoch": 1.6881859264041317, "step": 5230}, {"loss": 0.7293, "grad_norm": 0.570941686630249, "learning_rate": 0.0002, "epoch": 1.6914138153647515, "step": 5240}, {"loss": 0.723, "grad_norm": 0.6048086285591125, "learning_rate": 0.0002, "epoch": 1.6946417043253712, "step": 5250}, {"loss": 0.7861, "grad_norm": 0.6769003868103027, "learning_rate": 0.0002, "epoch": 1.6978695932859909, "step": 5260}, {"loss": 0.7885, "grad_norm": 0.5629057884216309, "learning_rate": 0.0002, "epoch": 1.7010974822466107, "step": 5270}, {"loss": 0.7693, "grad_norm": 0.657341480255127, "learning_rate": 0.0002, "epoch": 1.7043253712072306, "step": 5280}, {"loss": 0.7357, "grad_norm": 0.6256147623062134, "learning_rate": 0.0002, "epoch": 1.7075532601678503, "step": 5290}, {"loss": 0.714, "grad_norm": 0.5498088002204895, "learning_rate": 0.0002, "epoch": 1.71078114912847, "step": 5300}, {"loss": 0.7669, "grad_norm": 0.5078358054161072, "learning_rate": 0.0002, "epoch": 1.7140090380890898, "step": 5310}, {"loss": 0.7872, "grad_norm": 0.6696692705154419, "learning_rate": 0.0002, "epoch": 1.7172369270497096, "step": 5320}, {"loss": 0.8205, "grad_norm": 0.6692847013473511, "learning_rate": 0.0002, "epoch": 1.7204648160103293, "step": 5330}, {"loss": 0.7432, "grad_norm": 0.5415751934051514, "learning_rate": 0.0002, "epoch": 1.723692704970949, "step": 5340}, {"loss": 0.7499, "grad_norm": 0.5367611050605774, "learning_rate": 0.0002, "epoch": 1.7269205939315686, "step": 5350}, {"loss": 0.7631, "grad_norm": 0.7321061491966248, "learning_rate": 0.0002, "epoch": 1.7301484828921885, "step": 5360}, {"loss": 0.7827, "grad_norm": 0.723972499370575, "learning_rate": 0.0002, "epoch": 1.7333763718528084, "step": 5370}, {"loss": 0.7077, "grad_norm": 0.7328100204467773, "learning_rate": 0.0002, "epoch": 1.736604260813428, "step": 5380}, {"loss": 0.7503, "grad_norm": 0.5785264372825623, "learning_rate": 0.0002, "epoch": 1.7398321497740477, "step": 5390}, {"loss": 0.7188, "grad_norm": 0.7812932133674622, "learning_rate": 0.0002, "epoch": 1.7430600387346675, "step": 5400}, {"loss": 0.7386, "grad_norm": 0.6493327617645264, "learning_rate": 0.0002, "epoch": 1.7462879276952874, "step": 5410}, {"loss": 0.7487, "grad_norm": 0.5825939774513245, "learning_rate": 0.0002, "epoch": 1.749515816655907, "step": 5420}, {"loss": 0.7625, "grad_norm": 0.6969610452651978, "learning_rate": 0.0002, "epoch": 1.7527437056165267, "step": 5430}, {"loss": 0.7512, "grad_norm": 0.5558062195777893, "learning_rate": 0.0002, "epoch": 1.7559715945771466, "step": 5440}, {"loss": 0.7256, "grad_norm": 0.49222221970558167, "learning_rate": 0.0002, "epoch": 1.7591994835377665, "step": 5450}, {"loss": 0.7477, "grad_norm": 0.5844656825065613, "learning_rate": 0.0002, "epoch": 1.762427372498386, "step": 5460}, {"loss": 0.7695, "grad_norm": 0.8706597685813904, "learning_rate": 0.0002, "epoch": 1.7656552614590058, "step": 5470}, {"loss": 0.7582, "grad_norm": 0.6167706251144409, "learning_rate": 0.0002, "epoch": 1.7688831504196254, "step": 5480}, {"loss": 0.7521, "grad_norm": 0.5890011787414551, "learning_rate": 0.0002, "epoch": 1.7721110393802453, "step": 5490}, {"loss": 0.8319, "grad_norm": 0.6551728248596191, "learning_rate": 0.0002, "epoch": 1.7753389283408652, "step": 5500}, {"loss": 0.7615, "grad_norm": 0.5848751068115234, "learning_rate": 0.0002, "epoch": 1.7785668173014848, "step": 5510}, {"loss": 0.7622, "grad_norm": 0.6664014458656311, "learning_rate": 0.0002, "epoch": 1.7817947062621045, "step": 5520}, {"loss": 0.7544, "grad_norm": 0.5931693911552429, "learning_rate": 0.0002, "epoch": 1.7850225952227243, "step": 5530}, {"loss": 0.7992, "grad_norm": 0.5534724593162537, "learning_rate": 0.0002, "epoch": 1.7882504841833442, "step": 5540}, {"loss": 0.7967, "grad_norm": 0.5590878129005432, "learning_rate": 0.0002, "epoch": 1.7914783731439639, "step": 5550}, {"loss": 0.7406, "grad_norm": 0.6947470903396606, "learning_rate": 0.0002, "epoch": 1.7947062621045835, "step": 5560}, {"loss": 0.7614, "grad_norm": 0.6104130148887634, "learning_rate": 0.0002, "epoch": 1.7979341510652034, "step": 5570}, {"loss": 0.8032, "grad_norm": 0.6135714054107666, "learning_rate": 0.0002, "epoch": 1.8011620400258233, "step": 5580}, {"loss": 0.7403, "grad_norm": 0.6626853346824646, "learning_rate": 0.0002, "epoch": 1.804389928986443, "step": 5590}, {"loss": 0.7746, "grad_norm": 0.6977612972259521, "learning_rate": 0.0002, "epoch": 1.8076178179470626, "step": 5600}, {"loss": 0.7899, "grad_norm": 0.6275238394737244, "learning_rate": 0.0002, "epoch": 1.8108457069076824, "step": 5610}, {"loss": 0.7392, "grad_norm": 0.5017505288124084, "learning_rate": 0.0002, "epoch": 1.814073595868302, "step": 5620}, {"loss": 0.7669, "grad_norm": 0.8314290642738342, "learning_rate": 0.0002, "epoch": 1.817301484828922, "step": 5630}, {"loss": 0.7031, "grad_norm": 0.6863582134246826, "learning_rate": 0.0002, "epoch": 1.8205293737895416, "step": 5640}, {"loss": 0.743, "grad_norm": 0.69544917345047, "learning_rate": 0.0002, "epoch": 1.8237572627501613, "step": 5650}, {"loss": 0.7277, "grad_norm": 0.515499472618103, "learning_rate": 0.0002, "epoch": 1.8269851517107811, "step": 5660}, {"loss": 0.7166, "grad_norm": 0.6100873947143555, "learning_rate": 0.0002, "epoch": 1.830213040671401, "step": 5670}, {"loss": 0.7217, "grad_norm": 0.67416912317276, "learning_rate": 0.0002, "epoch": 1.8334409296320207, "step": 5680}, {"loss": 0.7575, "grad_norm": 0.7057772278785706, "learning_rate": 0.0002, "epoch": 1.8366688185926403, "step": 5690}, {"loss": 0.7483, "grad_norm": 0.7374551892280579, "learning_rate": 0.0002, "epoch": 1.8398967075532602, "step": 5700}, {"loss": 0.81, "grad_norm": 0.6266297101974487, "learning_rate": 0.0002, "epoch": 1.84312459651388, "step": 5710}, {"loss": 0.728, "grad_norm": 0.5629227757453918, "learning_rate": 0.0002, "epoch": 1.8463524854744997, "step": 5720}, {"loss": 0.8043, "grad_norm": 0.6603655815124512, "learning_rate": 0.0002, "epoch": 1.8495803744351194, "step": 5730}, {"loss": 0.7587, "grad_norm": 0.8113715052604675, "learning_rate": 0.0002, "epoch": 1.8528082633957392, "step": 5740}, {"loss": 0.7486, "grad_norm": 0.7143914103507996, "learning_rate": 0.0002, "epoch": 1.856036152356359, "step": 5750}, {"loss": 0.7619, "grad_norm": 0.6273732781410217, "learning_rate": 0.0002, "epoch": 1.8592640413169788, "step": 5760}, {"loss": 0.7962, "grad_norm": 0.5428690910339355, "learning_rate": 0.0002, "epoch": 1.8624919302775984, "step": 5770}, {"loss": 0.7581, "grad_norm": 0.6405037641525269, "learning_rate": 0.0002, "epoch": 1.865719819238218, "step": 5780}, {"loss": 0.7569, "grad_norm": 0.700873613357544, "learning_rate": 0.0002, "epoch": 1.868947708198838, "step": 5790}, {"loss": 0.7353, "grad_norm": 0.5645238161087036, "learning_rate": 0.0002, "epoch": 1.8721755971594578, "step": 5800}, {"loss": 0.8037, "grad_norm": 0.8780353665351868, "learning_rate": 0.0002, "epoch": 1.8754034861200775, "step": 5810}, {"loss": 0.7686, "grad_norm": 0.6295409798622131, "learning_rate": 0.0002, "epoch": 1.878631375080697, "step": 5820}, {"loss": 0.8067, "grad_norm": 0.678269624710083, "learning_rate": 0.0002, "epoch": 1.881859264041317, "step": 5830}, {"loss": 0.7537, "grad_norm": 0.6464608907699585, "learning_rate": 0.0002, "epoch": 1.8850871530019369, "step": 5840}, {"loss": 0.7423, "grad_norm": 0.6201048493385315, "learning_rate": 0.0002, "epoch": 1.8883150419625565, "step": 5850}, {"loss": 0.7694, "grad_norm": 0.6046274304389954, "learning_rate": 0.0002, "epoch": 1.8915429309231762, "step": 5860}, {"loss": 0.781, "grad_norm": 0.7532408833503723, "learning_rate": 0.0002, "epoch": 1.894770819883796, "step": 5870}, {"loss": 0.6885, "grad_norm": 0.6066767573356628, "learning_rate": 0.0002, "epoch": 1.897998708844416, "step": 5880}, {"loss": 0.7631, "grad_norm": 0.6289830207824707, "learning_rate": 0.0002, "epoch": 1.9012265978050356, "step": 5890}, {"loss": 0.7501, "grad_norm": 0.5204319953918457, "learning_rate": 0.0002, "epoch": 1.9044544867656552, "step": 5900}, {"loss": 0.7335, "grad_norm": 0.6708219647407532, "learning_rate": 0.0002, "epoch": 1.9076823757262749, "step": 5910}, {"loss": 0.7455, "grad_norm": 0.4915677309036255, "learning_rate": 0.0002, "epoch": 1.9109102646868947, "step": 5920}, {"loss": 0.7464, "grad_norm": 0.652717113494873, "learning_rate": 0.0002, "epoch": 1.9141381536475146, "step": 5930}, {"loss": 0.7687, "grad_norm": 0.5446316003799438, "learning_rate": 0.0002, "epoch": 1.9173660426081343, "step": 5940}, {"loss": 0.7424, "grad_norm": 0.4958149194717407, "learning_rate": 0.0002, "epoch": 1.920593931568754, "step": 5950}, {"loss": 0.757, "grad_norm": 0.5623434782028198, "learning_rate": 0.0002, "epoch": 1.9238218205293738, "step": 5960}, {"loss": 0.7446, "grad_norm": 0.6855450868606567, "learning_rate": 0.0002, "epoch": 1.9270497094899937, "step": 5970}, {"loss": 0.827, "grad_norm": 0.5710492730140686, "learning_rate": 0.0002, "epoch": 1.9302775984506133, "step": 5980}, {"loss": 0.7245, "grad_norm": 0.5379431843757629, "learning_rate": 0.0002, "epoch": 1.933505487411233, "step": 5990}, {"loss": 0.77, "grad_norm": 0.557129442691803, "learning_rate": 0.0002, "epoch": 1.9367333763718528, "step": 6000}, {"loss": 0.6988, "grad_norm": 0.6336663961410522, "learning_rate": 0.0002, "epoch": 1.9399612653324727, "step": 6010}, {"loss": 0.7316, "grad_norm": 0.5950582027435303, "learning_rate": 0.0002, "epoch": 1.9431891542930924, "step": 6020}, {"loss": 0.7443, "grad_norm": 0.5905954837799072, "learning_rate": 0.0002, "epoch": 1.946417043253712, "step": 6030}, {"loss": 0.7127, "grad_norm": 0.6688982844352722, "learning_rate": 0.0002, "epoch": 1.9496449322143317, "step": 6040}, {"loss": 0.79, "grad_norm": 0.5440775752067566, "learning_rate": 0.0002, "epoch": 1.9528728211749515, "step": 6050}, {"loss": 0.7221, "grad_norm": 0.6207906603813171, "learning_rate": 0.0002, "epoch": 1.9561007101355714, "step": 6060}, {"loss": 0.738, "grad_norm": 0.6999374628067017, "learning_rate": 0.0002, "epoch": 1.959328599096191, "step": 6070}, {"loss": 0.7372, "grad_norm": 0.6310848593711853, "learning_rate": 0.0002, "epoch": 1.9625564880568107, "step": 6080}, {"loss": 0.7198, "grad_norm": 0.5903388261795044, "learning_rate": 0.0002, "epoch": 1.9657843770174306, "step": 6090}, {"loss": 0.7103, "grad_norm": 0.6333889961242676, "learning_rate": 0.0002, "epoch": 1.9690122659780505, "step": 6100}, {"loss": 0.7246, "grad_norm": 0.5604711174964905, "learning_rate": 0.0002, "epoch": 1.97224015493867, "step": 6110}, {"loss": 0.761, "grad_norm": 0.9234541654586792, "learning_rate": 0.0002, "epoch": 1.9754680438992898, "step": 6120}, {"loss": 0.7375, "grad_norm": 0.6149102449417114, "learning_rate": 0.0002, "epoch": 1.9786959328599096, "step": 6130}, {"loss": 0.7286, "grad_norm": 0.615446150302887, "learning_rate": 0.0002, "epoch": 1.9819238218205295, "step": 6140}, {"loss": 0.7333, "grad_norm": 0.5176635980606079, "learning_rate": 0.0002, "epoch": 1.9851517107811492, "step": 6150}, {"loss": 0.718, "grad_norm": 0.7124109864234924, "learning_rate": 0.0002, "epoch": 1.9883795997417688, "step": 6160}, {"loss": 0.7669, "grad_norm": 0.6317567825317383, "learning_rate": 0.0002, "epoch": 1.9916074887023887, "step": 6170}, {"loss": 0.8012, "grad_norm": 0.6855016350746155, "learning_rate": 0.0002, "epoch": 1.9948353776630086, "step": 6180}, {"loss": 0.7376, "grad_norm": 0.6423715353012085, "learning_rate": 0.0002, "epoch": 1.9980632666236282, "step": 6190}, {"eval_loss": 1.1096643209457397, "eval_runtime": 147.7997, "eval_samples_per_second": 4.959, "eval_steps_per_second": 0.622, "epoch": 2.0, "step": 6196}, {"loss": 0.7131, "grad_norm": 0.5322932600975037, "learning_rate": 0.0002, "epoch": 2.001291155584248, "step": 6200}, {"loss": 0.6619, "grad_norm": 0.8152306079864502, "learning_rate": 0.0002, "epoch": 2.0045190445448675, "step": 6210}, {"loss": 0.6731, "grad_norm": 0.6215983033180237, "learning_rate": 0.0002, "epoch": 2.0077469335054876, "step": 6220}, {"loss": 0.658, "grad_norm": 0.845498263835907, "learning_rate": 0.0002, "epoch": 2.0109748224661073, "step": 6230}, {"loss": 0.6954, "grad_norm": 0.733559787273407, "learning_rate": 0.0002, "epoch": 2.014202711426727, "step": 6240}, {"loss": 0.6707, "grad_norm": 0.51433926820755, "learning_rate": 0.0002, "epoch": 2.0174306003873466, "step": 6250}, {"loss": 0.6304, "grad_norm": 0.6374049782752991, "learning_rate": 0.0002, "epoch": 2.020658489347966, "step": 6260}, {"loss": 0.6831, "grad_norm": 0.7833638191223145, "learning_rate": 0.0002, "epoch": 2.0238863783085863, "step": 6270}, {"loss": 0.6672, "grad_norm": 0.8929463028907776, "learning_rate": 0.0002, "epoch": 2.027114267269206, "step": 6280}, {"loss": 0.637, "grad_norm": 0.669731855392456, "learning_rate": 0.0002, "epoch": 2.0303421562298256, "step": 6290}, {"loss": 0.646, "grad_norm": 0.5846071243286133, "learning_rate": 0.0002, "epoch": 2.0335700451904453, "step": 6300}, {"loss": 0.6647, "grad_norm": 0.7087787985801697, "learning_rate": 0.0002, "epoch": 2.0367979341510654, "step": 6310}, {"loss": 0.6433, "grad_norm": 0.6739160418510437, "learning_rate": 0.0002, "epoch": 2.040025823111685, "step": 6320}, {"loss": 0.6301, "grad_norm": 0.4860886335372925, "learning_rate": 0.0002, "epoch": 2.0432537120723047, "step": 6330}, {"loss": 0.6439, "grad_norm": 0.7201244831085205, "learning_rate": 0.0002, "epoch": 2.0464816010329243, "step": 6340}, {"loss": 0.6676, "grad_norm": 0.7409170269966125, "learning_rate": 0.0002, "epoch": 2.0497094899935444, "step": 6350}, {"loss": 0.6153, "grad_norm": 0.6843920350074768, "learning_rate": 0.0002, "epoch": 2.052937378954164, "step": 6360}, {"loss": 0.6674, "grad_norm": 0.7519999742507935, "learning_rate": 0.0002, "epoch": 2.0561652679147837, "step": 6370}, {"loss": 0.6928, "grad_norm": 0.5732819437980652, "learning_rate": 0.0002, "epoch": 2.0593931568754034, "step": 6380}, {"loss": 0.6496, "grad_norm": 0.7565118074417114, "learning_rate": 0.0002, "epoch": 2.062621045836023, "step": 6390}, {"loss": 0.6354, "grad_norm": 0.8147150278091431, "learning_rate": 0.0002, "epoch": 2.065848934796643, "step": 6400}, {"loss": 0.6593, "grad_norm": 0.6941924691200256, "learning_rate": 0.0002, "epoch": 2.0690768237572628, "step": 6410}, {"loss": 0.6698, "grad_norm": 0.6549784541130066, "learning_rate": 0.0002, "epoch": 2.0723047127178824, "step": 6420}, {"loss": 0.6927, "grad_norm": 0.7224905490875244, "learning_rate": 0.0002, "epoch": 2.075532601678502, "step": 6430}, {"loss": 0.6755, "grad_norm": 0.7754863500595093, "learning_rate": 0.0002, "epoch": 2.078760490639122, "step": 6440}, {"loss": 0.6738, "grad_norm": 0.691318154335022, "learning_rate": 0.0002, "epoch": 2.081988379599742, "step": 6450}, {"loss": 0.6233, "grad_norm": 0.6009294986724854, "learning_rate": 0.0002, "epoch": 2.0852162685603615, "step": 6460}, {"loss": 0.6691, "grad_norm": 0.6753945350646973, "learning_rate": 0.0002, "epoch": 2.088444157520981, "step": 6470}, {"loss": 0.6935, "grad_norm": 0.6899921298027039, "learning_rate": 0.0002, "epoch": 2.091672046481601, "step": 6480}, {"loss": 0.6918, "grad_norm": 0.846510648727417, "learning_rate": 0.0002, "epoch": 2.094899935442221, "step": 6490}, {"loss": 0.6084, "grad_norm": 0.6432605981826782, "learning_rate": 0.0002, "epoch": 2.0981278244028405, "step": 6500}, {"loss": 0.6867, "grad_norm": 0.8125239014625549, "learning_rate": 0.0002, "epoch": 2.10135571336346, "step": 6510}, {"loss": 0.6939, "grad_norm": 0.628302812576294, "learning_rate": 0.0002, "epoch": 2.1045836023240803, "step": 6520}, {"loss": 0.5909, "grad_norm": 0.7164334654808044, "learning_rate": 0.0002, "epoch": 2.1078114912847, "step": 6530}, {"loss": 0.6578, "grad_norm": 0.7476949095726013, "learning_rate": 0.0002, "epoch": 2.1110393802453196, "step": 6540}, {"loss": 0.6351, "grad_norm": 0.7577515840530396, "learning_rate": 0.0002, "epoch": 2.114267269205939, "step": 6550}, {"loss": 0.6669, "grad_norm": 0.5684467554092407, "learning_rate": 0.0002, "epoch": 2.117495158166559, "step": 6560}, {"loss": 0.6343, "grad_norm": 0.6121789216995239, "learning_rate": 0.0002, "epoch": 2.120723047127179, "step": 6570}, {"loss": 0.6314, "grad_norm": 0.6095348596572876, "learning_rate": 0.0002, "epoch": 2.1239509360877986, "step": 6580}, {"loss": 0.6276, "grad_norm": 0.7803651690483093, "learning_rate": 0.0002, "epoch": 2.1271788250484183, "step": 6590}, {"loss": 0.6579, "grad_norm": 0.5990583300590515, "learning_rate": 0.0002, "epoch": 2.130406714009038, "step": 6600}, {"loss": 0.6228, "grad_norm": 0.6569220423698425, "learning_rate": 0.0002, "epoch": 2.133634602969658, "step": 6610}, {"loss": 0.7049, "grad_norm": 0.5961166620254517, "learning_rate": 0.0002, "epoch": 2.1368624919302777, "step": 6620}, {"loss": 0.6359, "grad_norm": 0.5860554575920105, "learning_rate": 0.0002, "epoch": 2.1400903808908973, "step": 6630}, {"loss": 0.6651, "grad_norm": 0.5994001626968384, "learning_rate": 0.0002, "epoch": 2.143318269851517, "step": 6640}, {"loss": 0.6421, "grad_norm": 0.7723015546798706, "learning_rate": 0.0002, "epoch": 2.146546158812137, "step": 6650}, {"loss": 0.6723, "grad_norm": 0.676355242729187, "learning_rate": 0.0002, "epoch": 2.1497740477727567, "step": 6660}, {"loss": 0.6826, "grad_norm": 0.5689092874526978, "learning_rate": 0.0002, "epoch": 2.1530019367333764, "step": 6670}, {"loss": 0.6613, "grad_norm": 0.6933727264404297, "learning_rate": 0.0002, "epoch": 2.156229825693996, "step": 6680}, {"loss": 0.6957, "grad_norm": 0.8380527496337891, "learning_rate": 0.0002, "epoch": 2.159457714654616, "step": 6690}, {"loss": 0.6705, "grad_norm": 0.6876497268676758, "learning_rate": 0.0002, "epoch": 2.1626856036152358, "step": 6700}, {"loss": 0.6112, "grad_norm": 0.6418334245681763, "learning_rate": 0.0002, "epoch": 2.1659134925758554, "step": 6710}, {"loss": 0.6357, "grad_norm": 0.7169192433357239, "learning_rate": 0.0002, "epoch": 2.169141381536475, "step": 6720}, {"loss": 0.6492, "grad_norm": 0.6664170622825623, "learning_rate": 0.0002, "epoch": 2.1723692704970947, "step": 6730}, {"loss": 0.6751, "grad_norm": 0.6011993288993835, "learning_rate": 0.0002, "epoch": 2.175597159457715, "step": 6740}, {"loss": 0.696, "grad_norm": 0.5529947280883789, "learning_rate": 0.0002, "epoch": 2.1788250484183345, "step": 6750}, {"loss": 0.671, "grad_norm": 0.6879532933235168, "learning_rate": 0.0002, "epoch": 2.182052937378954, "step": 6760}, {"loss": 0.6634, "grad_norm": 0.6426113843917847, "learning_rate": 0.0002, "epoch": 2.1852808263395738, "step": 6770}, {"loss": 0.6592, "grad_norm": 0.6571047306060791, "learning_rate": 0.0002, "epoch": 2.188508715300194, "step": 6780}, {"loss": 0.6494, "grad_norm": 0.6400564908981323, "learning_rate": 0.0002, "epoch": 2.1917366042608135, "step": 6790}, {"loss": 0.6369, "grad_norm": 0.6509664058685303, "learning_rate": 0.0002, "epoch": 2.194964493221433, "step": 6800}, {"loss": 0.6771, "grad_norm": 0.6673197150230408, "learning_rate": 0.0002, "epoch": 2.198192382182053, "step": 6810}, {"loss": 0.6491, "grad_norm": 0.48205727338790894, "learning_rate": 0.0002, "epoch": 2.2014202711426725, "step": 6820}, {"loss": 0.6894, "grad_norm": 0.849525511264801, "learning_rate": 0.0002, "epoch": 2.2046481601032926, "step": 6830}, {"loss": 0.6977, "grad_norm": 0.6150892376899719, "learning_rate": 0.0002, "epoch": 2.207876049063912, "step": 6840}, {"loss": 0.6843, "grad_norm": 0.7826945781707764, "learning_rate": 0.0002, "epoch": 2.211103938024532, "step": 6850}, {"loss": 0.6338, "grad_norm": 0.5711963772773743, "learning_rate": 0.0002, "epoch": 2.2143318269851515, "step": 6860}, {"loss": 0.6585, "grad_norm": 0.6017758846282959, "learning_rate": 0.0002, "epoch": 2.2175597159457716, "step": 6870}, {"loss": 0.6657, "grad_norm": 0.785434901714325, "learning_rate": 0.0002, "epoch": 2.2207876049063913, "step": 6880}, {"loss": 0.7075, "grad_norm": 0.6251688599586487, "learning_rate": 0.0002, "epoch": 2.224015493867011, "step": 6890}, {"loss": 0.6564, "grad_norm": 0.8242034316062927, "learning_rate": 0.0002, "epoch": 2.2272433828276306, "step": 6900}, {"loss": 0.672, "grad_norm": 0.7272933125495911, "learning_rate": 0.0002, "epoch": 2.2304712717882507, "step": 6910}, {"loss": 0.6541, "grad_norm": 0.7159379720687866, "learning_rate": 0.0002, "epoch": 2.2336991607488703, "step": 6920}, {"loss": 0.6859, "grad_norm": 0.6518042087554932, "learning_rate": 0.0002, "epoch": 2.23692704970949, "step": 6930}, {"loss": 0.5987, "grad_norm": 0.7365370392799377, "learning_rate": 0.0002, "epoch": 2.2401549386701096, "step": 6940}, {"loss": 0.6511, "grad_norm": 0.5674061179161072, "learning_rate": 0.0002, "epoch": 2.2433828276307297, "step": 6950}, {"loss": 0.6748, "grad_norm": 0.669185996055603, "learning_rate": 0.0002, "epoch": 2.2466107165913494, "step": 6960}, {"loss": 0.656, "grad_norm": 0.6638304591178894, "learning_rate": 0.0002, "epoch": 2.249838605551969, "step": 6970}, {"loss": 0.636, "grad_norm": 0.757006824016571, "learning_rate": 0.0002, "epoch": 2.2530664945125887, "step": 6980}, {"loss": 0.6597, "grad_norm": 0.7574930787086487, "learning_rate": 0.0002, "epoch": 2.2562943834732083, "step": 6990}, {"loss": 0.6859, "grad_norm": 0.7819514870643616, "learning_rate": 0.0002, "epoch": 2.2595222724338284, "step": 7000}, {"loss": 0.6238, "grad_norm": 0.6987583041191101, "learning_rate": 0.0002, "epoch": 2.262750161394448, "step": 7010}, {"loss": 0.661, "grad_norm": 0.6628551483154297, "learning_rate": 0.0002, "epoch": 2.2659780503550677, "step": 7020}, {"loss": 0.6254, "grad_norm": 0.7855866551399231, "learning_rate": 0.0002, "epoch": 2.2692059393156874, "step": 7030}, {"loss": 0.6679, "grad_norm": 0.6102892756462097, "learning_rate": 0.0002, "epoch": 2.2724338282763075, "step": 7040}, {"loss": 0.694, "grad_norm": 0.7844198942184448, "learning_rate": 0.0002, "epoch": 2.275661717236927, "step": 7050}, {"loss": 0.63, "grad_norm": 0.6209492087364197, "learning_rate": 0.0002, "epoch": 2.2788896061975468, "step": 7060}, {"loss": 0.6418, "grad_norm": 0.8351290225982666, "learning_rate": 0.0002, "epoch": 2.2821174951581664, "step": 7070}, {"loss": 0.6648, "grad_norm": 0.6883546710014343, "learning_rate": 0.0002, "epoch": 2.285345384118786, "step": 7080}, {"loss": 0.7046, "grad_norm": 0.6626381874084473, "learning_rate": 0.0002, "epoch": 2.288573273079406, "step": 7090}, {"loss": 0.6535, "grad_norm": 0.7216270565986633, "learning_rate": 0.0002, "epoch": 2.291801162040026, "step": 7100}, {"loss": 0.6414, "grad_norm": 0.8246777057647705, "learning_rate": 0.0002, "epoch": 2.2950290510006455, "step": 7110}, {"loss": 0.6315, "grad_norm": 0.614326000213623, "learning_rate": 0.0002, "epoch": 2.2982569399612656, "step": 7120}, {"loss": 0.6303, "grad_norm": 0.8785578012466431, "learning_rate": 0.0002, "epoch": 2.301484828921885, "step": 7130}, {"loss": 0.6348, "grad_norm": 0.7021808624267578, "learning_rate": 0.0002, "epoch": 2.304712717882505, "step": 7140}, {"loss": 0.6738, "grad_norm": 0.6999403238296509, "learning_rate": 0.0002, "epoch": 2.3079406068431245, "step": 7150}, {"loss": 0.6547, "grad_norm": 0.8013143539428711, "learning_rate": 0.0002, "epoch": 2.311168495803744, "step": 7160}, {"loss": 0.6461, "grad_norm": 0.6592583060264587, "learning_rate": 0.0002, "epoch": 2.3143963847643643, "step": 7170}, {"loss": 0.6369, "grad_norm": 0.6260249018669128, "learning_rate": 0.0002, "epoch": 2.317624273724984, "step": 7180}, {"loss": 0.6647, "grad_norm": 0.9352797269821167, "learning_rate": 0.0002, "epoch": 2.3208521626856036, "step": 7190}, {"loss": 0.6543, "grad_norm": 0.6629612445831299, "learning_rate": 0.0002, "epoch": 2.324080051646223, "step": 7200}, {"loss": 0.6811, "grad_norm": 0.7062810063362122, "learning_rate": 0.0002, "epoch": 2.3273079406068433, "step": 7210}, {"loss": 0.67, "grad_norm": 0.7236241102218628, "learning_rate": 0.0002, "epoch": 2.330535829567463, "step": 7220}, {"loss": 0.6462, "grad_norm": 0.7528148293495178, "learning_rate": 0.0002, "epoch": 2.3337637185280826, "step": 7230}, {"loss": 0.694, "grad_norm": 0.7604748606681824, "learning_rate": 0.0002, "epoch": 2.3369916074887023, "step": 7240}, {"loss": 0.6475, "grad_norm": 0.5601189136505127, "learning_rate": 0.0002, "epoch": 2.340219496449322, "step": 7250}, {"loss": 0.6925, "grad_norm": 0.7099230885505676, "learning_rate": 0.0002, "epoch": 2.343447385409942, "step": 7260}, {"loss": 0.6333, "grad_norm": 0.6699047684669495, "learning_rate": 0.0002, "epoch": 2.3466752743705617, "step": 7270}, {"loss": 0.6434, "grad_norm": 0.7315047979354858, "learning_rate": 0.0002, "epoch": 2.3499031633311813, "step": 7280}, {"loss": 0.6927, "grad_norm": 0.632836103439331, "learning_rate": 0.0002, "epoch": 2.353131052291801, "step": 7290}, {"loss": 0.6458, "grad_norm": 0.9410115480422974, "learning_rate": 0.0002, "epoch": 2.356358941252421, "step": 7300}, {"loss": 0.6699, "grad_norm": 0.626554012298584, "learning_rate": 0.0002, "epoch": 2.3595868302130407, "step": 7310}, {"loss": 0.6495, "grad_norm": 0.7538444399833679, "learning_rate": 0.0002, "epoch": 2.3628147191736604, "step": 7320}, {"loss": 0.6321, "grad_norm": 0.6826626062393188, "learning_rate": 0.0002, "epoch": 2.36604260813428, "step": 7330}, {"loss": 0.6752, "grad_norm": 0.6739391088485718, "learning_rate": 0.0002, "epoch": 2.3692704970949, "step": 7340}, {"loss": 0.6518, "grad_norm": 0.7518446445465088, "learning_rate": 0.0002, "epoch": 2.3724983860555198, "step": 7350}, {"loss": 0.7142, "grad_norm": 0.714133083820343, "learning_rate": 0.0002, "epoch": 2.3757262750161394, "step": 7360}, {"loss": 0.6794, "grad_norm": 0.7144588232040405, "learning_rate": 0.0002, "epoch": 2.378954163976759, "step": 7370}, {"loss": 0.6922, "grad_norm": 0.6598120927810669, "learning_rate": 0.0002, "epoch": 2.382182052937379, "step": 7380}, {"loss": 0.6562, "grad_norm": 0.7079148292541504, "learning_rate": 0.0002, "epoch": 2.385409941897999, "step": 7390}, {"loss": 0.6492, "grad_norm": 0.6750902533531189, "learning_rate": 0.0002, "epoch": 2.3886378308586185, "step": 7400}, {"loss": 0.6398, "grad_norm": 0.7181967496871948, "learning_rate": 0.0002, "epoch": 2.391865719819238, "step": 7410}, {"loss": 0.6793, "grad_norm": 0.7720552086830139, "learning_rate": 0.0002, "epoch": 2.3950936087798578, "step": 7420}, {"loss": 0.6804, "grad_norm": 0.7592426538467407, "learning_rate": 0.0002, "epoch": 2.398321497740478, "step": 7430}, {"loss": 0.6667, "grad_norm": 0.7161896824836731, "learning_rate": 0.0002, "epoch": 2.4015493867010975, "step": 7440}, {"loss": 0.6891, "grad_norm": 0.8019260764122009, "learning_rate": 0.0002, "epoch": 2.404777275661717, "step": 7450}, {"loss": 0.6864, "grad_norm": 0.7093342542648315, "learning_rate": 0.0002, "epoch": 2.408005164622337, "step": 7460}, {"loss": 0.6445, "grad_norm": 0.8464207649230957, "learning_rate": 0.0002, "epoch": 2.411233053582957, "step": 7470}, {"loss": 0.6724, "grad_norm": 0.773666501045227, "learning_rate": 0.0002, "epoch": 2.4144609425435766, "step": 7480}, {"loss": 0.6774, "grad_norm": 0.8451611995697021, "learning_rate": 0.0002, "epoch": 2.4176888315041962, "step": 7490}, {"loss": 0.694, "grad_norm": 0.656795084476471, "learning_rate": 0.0002, "epoch": 2.420916720464816, "step": 7500}, {"loss": 0.6824, "grad_norm": 0.7129034996032715, "learning_rate": 0.0002, "epoch": 2.4241446094254355, "step": 7510}, {"loss": 0.711, "grad_norm": 0.8325763940811157, "learning_rate": 0.0002, "epoch": 2.4273724983860556, "step": 7520}, {"loss": 0.6238, "grad_norm": 0.7806527614593506, "learning_rate": 0.0002, "epoch": 2.4306003873466753, "step": 7530}, {"loss": 0.6972, "grad_norm": 0.6994536519050598, "learning_rate": 0.0002, "epoch": 2.433828276307295, "step": 7540}, {"loss": 0.6615, "grad_norm": 0.6898999214172363, "learning_rate": 0.0002, "epoch": 2.437056165267915, "step": 7550}, {"loss": 0.7108, "grad_norm": 0.719490647315979, "learning_rate": 0.0002, "epoch": 2.4402840542285347, "step": 7560}, {"loss": 0.668, "grad_norm": 0.6841562390327454, "learning_rate": 0.0002, "epoch": 2.4435119431891543, "step": 7570}, {"loss": 0.6504, "grad_norm": 0.7573311924934387, "learning_rate": 0.0002, "epoch": 2.446739832149774, "step": 7580}, {"loss": 0.6607, "grad_norm": 0.7295880317687988, "learning_rate": 0.0002, "epoch": 2.4499677211103936, "step": 7590}, {"loss": 0.6593, "grad_norm": 0.710136353969574, "learning_rate": 0.0002, "epoch": 2.4531956100710137, "step": 7600}, {"loss": 0.7137, "grad_norm": 0.6126235127449036, "learning_rate": 0.0002, "epoch": 2.4564234990316334, "step": 7610}, {"loss": 0.6562, "grad_norm": 0.8025609850883484, "learning_rate": 0.0002, "epoch": 2.459651387992253, "step": 7620}, {"loss": 0.6464, "grad_norm": 0.7839472889900208, "learning_rate": 0.0002, "epoch": 2.4628792769528727, "step": 7630}, {"loss": 0.6797, "grad_norm": 0.7253499031066895, "learning_rate": 0.0002, "epoch": 2.4661071659134928, "step": 7640}, {"loss": 0.7341, "grad_norm": 0.7918946743011475, "learning_rate": 0.0002, "epoch": 2.4693350548741124, "step": 7650}, {"loss": 0.6646, "grad_norm": 0.7930178046226501, "learning_rate": 0.0002, "epoch": 2.472562943834732, "step": 7660}, {"loss": 0.6294, "grad_norm": 0.6826170086860657, "learning_rate": 0.0002, "epoch": 2.4757908327953517, "step": 7670}, {"loss": 0.6697, "grad_norm": 0.6576805114746094, "learning_rate": 0.0002, "epoch": 2.4790187217559714, "step": 7680}, {"loss": 0.682, "grad_norm": 0.7012448310852051, "learning_rate": 0.0002, "epoch": 2.4822466107165915, "step": 7690}, {"loss": 0.6418, "grad_norm": 0.7774284482002258, "learning_rate": 0.0002, "epoch": 2.485474499677211, "step": 7700}, {"loss": 0.6566, "grad_norm": 0.6502766013145447, "learning_rate": 0.0002, "epoch": 2.4887023886378308, "step": 7710}, {"loss": 0.6965, "grad_norm": 0.7638739347457886, "learning_rate": 0.0002, "epoch": 2.4919302775984504, "step": 7720}, {"loss": 0.6454, "grad_norm": 0.6217384338378906, "learning_rate": 0.0002, "epoch": 2.4951581665590705, "step": 7730}, {"loss": 0.6837, "grad_norm": 0.7576302886009216, "learning_rate": 0.0002, "epoch": 2.49838605551969, "step": 7740}, {"loss": 0.6855, "grad_norm": 0.6877137422561646, "learning_rate": 0.0002, "epoch": 2.50161394448031, "step": 7750}, {"loss": 0.6604, "grad_norm": 0.6998329162597656, "learning_rate": 0.0002, "epoch": 2.5048418334409295, "step": 7760}, {"loss": 0.6666, "grad_norm": 0.7879213690757751, "learning_rate": 0.0002, "epoch": 2.508069722401549, "step": 7770}, {"loss": 0.715, "grad_norm": 0.7834980487823486, "learning_rate": 0.0002, "epoch": 2.5112976113621692, "step": 7780}, {"loss": 0.6954, "grad_norm": 0.7789630889892578, "learning_rate": 0.0002, "epoch": 2.514525500322789, "step": 7790}, {"loss": 0.6979, "grad_norm": 0.7403590083122253, "learning_rate": 0.0002, "epoch": 2.5177533892834085, "step": 7800}, {"loss": 0.6964, "grad_norm": 0.6029766201972961, "learning_rate": 0.0002, "epoch": 2.5209812782440286, "step": 7810}, {"loss": 0.6887, "grad_norm": 0.7061092257499695, "learning_rate": 0.0002, "epoch": 2.5242091672046483, "step": 7820}, {"loss": 0.6628, "grad_norm": 0.7120763659477234, "learning_rate": 0.0002, "epoch": 2.527437056165268, "step": 7830}, {"loss": 0.6876, "grad_norm": 0.6173675656318665, "learning_rate": 0.0002, "epoch": 2.5306649451258876, "step": 7840}, {"loss": 0.6635, "grad_norm": 0.9566813111305237, "learning_rate": 0.0002, "epoch": 2.5338928340865072, "step": 7850}, {"loss": 0.654, "grad_norm": 0.8497620224952698, "learning_rate": 0.0002, "epoch": 2.5371207230471273, "step": 7860}, {"loss": 0.644, "grad_norm": 0.7663498520851135, "learning_rate": 0.0002, "epoch": 2.540348612007747, "step": 7870}, {"loss": 0.6292, "grad_norm": 0.6329668760299683, "learning_rate": 0.0002, "epoch": 2.5435765009683666, "step": 7880}, {"loss": 0.686, "grad_norm": 0.8128195405006409, "learning_rate": 0.0002, "epoch": 2.5468043899289863, "step": 7890}, {"loss": 0.6619, "grad_norm": 0.6622284650802612, "learning_rate": 0.0002, "epoch": 2.5500322788896064, "step": 7900}, {"loss": 0.693, "grad_norm": 0.8460057973861694, "learning_rate": 0.0002, "epoch": 2.553260167850226, "step": 7910}, {"loss": 0.6619, "grad_norm": 0.6586956977844238, "learning_rate": 0.0002, "epoch": 2.5564880568108457, "step": 7920}, {"loss": 0.6976, "grad_norm": 0.7569382190704346, "learning_rate": 0.0002, "epoch": 2.5597159457714653, "step": 7930}, {"loss": 0.6235, "grad_norm": 0.6409714221954346, "learning_rate": 0.0002, "epoch": 2.562943834732085, "step": 7940}, {"loss": 0.6663, "grad_norm": 0.7031713128089905, "learning_rate": 0.0002, "epoch": 2.566171723692705, "step": 7950}, {"loss": 0.6344, "grad_norm": 0.7983605265617371, "learning_rate": 0.0002, "epoch": 2.5693996126533247, "step": 7960}, {"loss": 0.6834, "grad_norm": 0.7165433168411255, "learning_rate": 0.0002, "epoch": 2.5726275016139444, "step": 7970}, {"loss": 0.6517, "grad_norm": 0.6630598902702332, "learning_rate": 0.0002, "epoch": 2.5758553905745645, "step": 7980}, {"loss": 0.7164, "grad_norm": 0.5883122086524963, "learning_rate": 0.0002, "epoch": 2.579083279535184, "step": 7990}, {"loss": 0.6715, "grad_norm": 0.5928755402565002, "learning_rate": 0.0002, "epoch": 2.5823111684958038, "step": 8000}, {"loss": 0.6701, "grad_norm": 0.7843712568283081, "learning_rate": 0.0002, "epoch": 2.5855390574564234, "step": 8010}, {"loss": 0.6617, "grad_norm": 0.7206324338912964, "learning_rate": 0.0002, "epoch": 2.588766946417043, "step": 8020}, {"loss": 0.6968, "grad_norm": 0.812480092048645, "learning_rate": 0.0002, "epoch": 2.5919948353776627, "step": 8030}, {"loss": 0.6735, "grad_norm": 0.9843078255653381, "learning_rate": 0.0002, "epoch": 2.595222724338283, "step": 8040}, {"loss": 0.6877, "grad_norm": 0.7524392604827881, "learning_rate": 0.0002, "epoch": 2.5984506132989025, "step": 8050}, {"loss": 0.7188, "grad_norm": 0.6220380067825317, "learning_rate": 0.0002, "epoch": 2.601678502259522, "step": 8060}, {"loss": 0.6878, "grad_norm": 0.7461398243904114, "learning_rate": 0.0002, "epoch": 2.6049063912201422, "step": 8070}, {"loss": 0.6626, "grad_norm": 0.720974326133728, "learning_rate": 0.0002, "epoch": 2.608134280180762, "step": 8080}, {"loss": 0.6756, "grad_norm": 0.649509847164154, "learning_rate": 0.0002, "epoch": 2.6113621691413815, "step": 8090}, {"loss": 0.6394, "grad_norm": 0.6894662976264954, "learning_rate": 0.0002, "epoch": 2.614590058102001, "step": 8100}, {"loss": 0.6329, "grad_norm": 0.734433114528656, "learning_rate": 0.0002, "epoch": 2.617817947062621, "step": 8110}, {"loss": 0.6698, "grad_norm": 0.7468628883361816, "learning_rate": 0.0002, "epoch": 2.621045836023241, "step": 8120}, {"loss": 0.658, "grad_norm": 0.6508180499076843, "learning_rate": 0.0002, "epoch": 2.6242737249838606, "step": 8130}, {"loss": 0.6619, "grad_norm": 0.8735209107398987, "learning_rate": 0.0002, "epoch": 2.6275016139444802, "step": 8140}, {"loss": 0.6717, "grad_norm": 0.8162857294082642, "learning_rate": 0.0002, "epoch": 2.6307295029051003, "step": 8150}, {"loss": 0.6496, "grad_norm": 0.628872811794281, "learning_rate": 0.0002, "epoch": 2.63395739186572, "step": 8160}, {"loss": 0.6608, "grad_norm": 0.8078708052635193, "learning_rate": 0.0002, "epoch": 2.6371852808263396, "step": 8170}, {"loss": 0.6916, "grad_norm": 0.7849429845809937, "learning_rate": 0.0002, "epoch": 2.6404131697869593, "step": 8180}, {"loss": 0.6671, "grad_norm": 0.8115387558937073, "learning_rate": 0.0002, "epoch": 2.643641058747579, "step": 8190}, {"loss": 0.6761, "grad_norm": 0.7462222576141357, "learning_rate": 0.0002, "epoch": 2.6468689477081986, "step": 8200}, {"loss": 0.6923, "grad_norm": 0.753662645816803, "learning_rate": 0.0002, "epoch": 2.6500968366688187, "step": 8210}, {"loss": 0.6666, "grad_norm": 0.6100404858589172, "learning_rate": 0.0002, "epoch": 2.6533247256294383, "step": 8220}, {"loss": 0.7256, "grad_norm": 0.9084606766700745, "learning_rate": 0.0002, "epoch": 2.656552614590058, "step": 8230}, {"loss": 0.6385, "grad_norm": 0.6412538886070251, "learning_rate": 0.0002, "epoch": 2.659780503550678, "step": 8240}, {"loss": 0.7048, "grad_norm": 0.7640451192855835, "learning_rate": 0.0002, "epoch": 2.6630083925112977, "step": 8250}, {"loss": 0.6846, "grad_norm": 0.5972344875335693, "learning_rate": 0.0002, "epoch": 2.6662362814719174, "step": 8260}, {"loss": 0.682, "grad_norm": 0.6935883164405823, "learning_rate": 0.0002, "epoch": 2.669464170432537, "step": 8270}, {"loss": 0.6625, "grad_norm": 0.789399266242981, "learning_rate": 0.0002, "epoch": 2.6726920593931567, "step": 8280}, {"loss": 0.6541, "grad_norm": 0.7143490314483643, "learning_rate": 0.0002, "epoch": 2.675919948353777, "step": 8290}, {"loss": 0.6741, "grad_norm": 0.6670652627944946, "learning_rate": 0.0002, "epoch": 2.6791478373143964, "step": 8300}, {"loss": 0.6936, "grad_norm": 0.687108039855957, "learning_rate": 0.0002, "epoch": 2.682375726275016, "step": 8310}, {"loss": 0.7124, "grad_norm": 0.7914147973060608, "learning_rate": 0.0002, "epoch": 2.6856036152356357, "step": 8320}, {"loss": 0.6584, "grad_norm": 0.8398420214653015, "learning_rate": 0.0002, "epoch": 2.688831504196256, "step": 8330}, {"loss": 0.6679, "grad_norm": 0.6592720746994019, "learning_rate": 0.0002, "epoch": 2.6920593931568755, "step": 8340}, {"loss": 0.6673, "grad_norm": 0.6888470649719238, "learning_rate": 0.0002, "epoch": 2.695287282117495, "step": 8350}, {"loss": 0.6483, "grad_norm": 0.7127556800842285, "learning_rate": 0.0002, "epoch": 2.698515171078115, "step": 8360}, {"loss": 0.7013, "grad_norm": 0.6630286574363708, "learning_rate": 0.0002, "epoch": 2.7017430600387344, "step": 8370}, {"loss": 0.6842, "grad_norm": 0.8261964321136475, "learning_rate": 0.0002, "epoch": 2.7049709489993545, "step": 8380}, {"loss": 0.6613, "grad_norm": 0.717339813709259, "learning_rate": 0.0002, "epoch": 2.708198837959974, "step": 8390}, {"loss": 0.6929, "grad_norm": 0.651637613773346, "learning_rate": 0.0002, "epoch": 2.711426726920594, "step": 8400}, {"loss": 0.6796, "grad_norm": 0.7936098575592041, "learning_rate": 0.0002, "epoch": 2.714654615881214, "step": 8410}, {"loss": 0.696, "grad_norm": 0.8761560320854187, "learning_rate": 0.0002, "epoch": 2.7178825048418336, "step": 8420}, {"loss": 0.6889, "grad_norm": 0.6768006086349487, "learning_rate": 0.0002, "epoch": 2.7211103938024532, "step": 8430}, {"loss": 0.6844, "grad_norm": 0.7121055722236633, "learning_rate": 0.0002, "epoch": 2.724338282763073, "step": 8440}, {"loss": 0.6608, "grad_norm": 0.6811696887016296, "learning_rate": 0.0002, "epoch": 2.7275661717236925, "step": 8450}, {"loss": 0.7046, "grad_norm": 0.8168250918388367, "learning_rate": 0.0002, "epoch": 2.730794060684312, "step": 8460}, {"loss": 0.6809, "grad_norm": 0.660682737827301, "learning_rate": 0.0002, "epoch": 2.7340219496449323, "step": 8470}, {"loss": 0.6916, "grad_norm": 0.7369356155395508, "learning_rate": 0.0002, "epoch": 2.737249838605552, "step": 8480}, {"loss": 0.6383, "grad_norm": 0.7545099854469299, "learning_rate": 0.0002, "epoch": 2.7404777275661716, "step": 8490}, {"loss": 0.6917, "grad_norm": 0.6991257667541504, "learning_rate": 0.0002, "epoch": 2.7437056165267917, "step": 8500}, {"loss": 0.6953, "grad_norm": 0.7195324301719666, "learning_rate": 0.0002, "epoch": 2.7469335054874113, "step": 8510}, {"loss": 0.6955, "grad_norm": 0.8995378017425537, "learning_rate": 0.0002, "epoch": 2.750161394448031, "step": 8520}, {"loss": 0.684, "grad_norm": 0.6924123764038086, "learning_rate": 0.0002, "epoch": 2.7533892834086506, "step": 8530}, {"loss": 0.6675, "grad_norm": 0.6260585784912109, "learning_rate": 0.0002, "epoch": 2.7566171723692703, "step": 8540}, {"loss": 0.6613, "grad_norm": 0.7273091673851013, "learning_rate": 0.0002, "epoch": 2.7598450613298904, "step": 8550}, {"loss": 0.6853, "grad_norm": 0.720562219619751, "learning_rate": 0.0002, "epoch": 2.76307295029051, "step": 8560}, {"loss": 0.6452, "grad_norm": 0.6360004544258118, "learning_rate": 0.0002, "epoch": 2.7663008392511297, "step": 8570}, {"loss": 0.6118, "grad_norm": 0.7634525895118713, "learning_rate": 0.0002, "epoch": 2.76952872821175, "step": 8580}, {"loss": 0.686, "grad_norm": 0.6586076021194458, "learning_rate": 0.0002, "epoch": 2.7727566171723694, "step": 8590}, {"loss": 0.7072, "grad_norm": 0.6542639136314392, "learning_rate": 0.0002, "epoch": 2.775984506132989, "step": 8600}, {"loss": 0.7126, "grad_norm": 0.7650290727615356, "learning_rate": 0.0002, "epoch": 2.7792123950936087, "step": 8610}, {"loss": 0.6923, "grad_norm": 0.6551542282104492, "learning_rate": 0.0002, "epoch": 2.7824402840542284, "step": 8620}, {"loss": 0.6937, "grad_norm": 0.6915501952171326, "learning_rate": 0.0002, "epoch": 2.785668173014848, "step": 8630}, {"loss": 0.6586, "grad_norm": 0.8061493635177612, "learning_rate": 0.0002, "epoch": 2.788896061975468, "step": 8640}, {"loss": 0.6853, "grad_norm": 0.8403584957122803, "learning_rate": 0.0002, "epoch": 2.792123950936088, "step": 8650}, {"loss": 0.6616, "grad_norm": 0.6455532312393188, "learning_rate": 0.0002, "epoch": 2.7953518398967074, "step": 8660}, {"loss": 0.6819, "grad_norm": 0.8296352028846741, "learning_rate": 0.0002, "epoch": 2.7985797288573275, "step": 8670}, {"loss": 0.6678, "grad_norm": 0.7288752794265747, "learning_rate": 0.0002, "epoch": 2.801807617817947, "step": 8680}, {"loss": 0.6778, "grad_norm": 0.7628464102745056, "learning_rate": 0.0002, "epoch": 2.805035506778567, "step": 8690}, {"loss": 0.7176, "grad_norm": 0.9993878602981567, "learning_rate": 0.0002, "epoch": 2.8082633957391865, "step": 8700}, {"loss": 0.6414, "grad_norm": 0.6972465515136719, "learning_rate": 0.0002, "epoch": 2.811491284699806, "step": 8710}, {"loss": 0.6777, "grad_norm": 0.645042896270752, "learning_rate": 0.0002, "epoch": 2.8147191736604262, "step": 8720}, {"loss": 0.6587, "grad_norm": 0.6853853464126587, "learning_rate": 0.0002, "epoch": 2.817947062621046, "step": 8730}, {"loss": 0.6405, "grad_norm": 0.5935067534446716, "learning_rate": 0.0002, "epoch": 2.8211749515816655, "step": 8740}, {"loss": 0.6674, "grad_norm": 0.7336633205413818, "learning_rate": 0.0002, "epoch": 2.824402840542285, "step": 8750}, {"loss": 0.6662, "grad_norm": 0.7074962854385376, "learning_rate": 0.0002, "epoch": 2.8276307295029053, "step": 8760}, {"loss": 0.6744, "grad_norm": 0.6667559742927551, "learning_rate": 0.0002, "epoch": 2.830858618463525, "step": 8770}, {"loss": 0.7142, "grad_norm": 0.8101205229759216, "learning_rate": 0.0002, "epoch": 2.8340865074241446, "step": 8780}, {"loss": 0.6727, "grad_norm": 0.8841480016708374, "learning_rate": 0.0002, "epoch": 2.8373143963847642, "step": 8790}, {"loss": 0.6601, "grad_norm": 0.5891591310501099, "learning_rate": 0.0002, "epoch": 2.840542285345384, "step": 8800}, {"loss": 0.7114, "grad_norm": 0.667032778263092, "learning_rate": 0.0002, "epoch": 2.843770174306004, "step": 8810}, {"loss": 0.7295, "grad_norm": 0.7629773020744324, "learning_rate": 0.0002, "epoch": 2.8469980632666236, "step": 8820}, {"loss": 0.703, "grad_norm": 0.79471355676651, "learning_rate": 0.0002, "epoch": 2.8502259522272433, "step": 8830}, {"loss": 0.7278, "grad_norm": 0.7529178261756897, "learning_rate": 0.0002, "epoch": 2.8534538411878634, "step": 8840}, {"loss": 0.7163, "grad_norm": 0.7014923691749573, "learning_rate": 0.0002, "epoch": 2.856681730148483, "step": 8850}, {"loss": 0.6803, "grad_norm": 0.7996514439582825, "learning_rate": 0.0002, "epoch": 2.8599096191091027, "step": 8860}, {"loss": 0.6562, "grad_norm": 0.7044785618782043, "learning_rate": 0.0002, "epoch": 2.8631375080697223, "step": 8870}, {"loss": 0.6966, "grad_norm": 0.6792093515396118, "learning_rate": 0.0002, "epoch": 2.866365397030342, "step": 8880}, {"loss": 0.685, "grad_norm": 0.69175124168396, "learning_rate": 0.0002, "epoch": 2.8695932859909616, "step": 8890}, {"loss": 0.7225, "grad_norm": 0.7499129176139832, "learning_rate": 0.0002, "epoch": 2.8728211749515817, "step": 8900}, {"loss": 0.6922, "grad_norm": 0.7678789496421814, "learning_rate": 0.0002, "epoch": 2.8760490639122014, "step": 8910}, {"loss": 0.6803, "grad_norm": 0.7478128671646118, "learning_rate": 0.0002, "epoch": 2.879276952872821, "step": 8920}, {"loss": 0.6689, "grad_norm": 0.6767086386680603, "learning_rate": 0.0002, "epoch": 2.882504841833441, "step": 8930}, {"loss": 0.6587, "grad_norm": 0.7222196459770203, "learning_rate": 0.0002, "epoch": 2.885732730794061, "step": 8940}, {"loss": 0.6472, "grad_norm": 0.6950580477714539, "learning_rate": 0.0002, "epoch": 2.8889606197546804, "step": 8950}, {"loss": 0.7064, "grad_norm": 0.7759528160095215, "learning_rate": 0.0002, "epoch": 2.8921885087153, "step": 8960}, {"loss": 0.6349, "grad_norm": 0.6686919927597046, "learning_rate": 0.0002, "epoch": 2.8954163976759197, "step": 8970}, {"loss": 0.6801, "grad_norm": 0.9245954751968384, "learning_rate": 0.0002, "epoch": 2.89864428663654, "step": 8980}, {"loss": 0.6703, "grad_norm": 0.8734814524650574, "learning_rate": 0.0002, "epoch": 2.9018721755971595, "step": 8990}, {"loss": 0.6716, "grad_norm": 0.6056219339370728, "learning_rate": 0.0002, "epoch": 2.905100064557779, "step": 9000}, {"loss": 0.6535, "grad_norm": 0.7364102005958557, "learning_rate": 0.0002, "epoch": 2.9083279535183992, "step": 9010}, {"loss": 0.707, "grad_norm": 0.6563605070114136, "learning_rate": 0.0002, "epoch": 2.911555842479019, "step": 9020}, {"loss": 0.6564, "grad_norm": 0.659978985786438, "learning_rate": 0.0002, "epoch": 2.9147837314396385, "step": 9030}, {"loss": 0.7154, "grad_norm": 0.8176041841506958, "learning_rate": 0.0002, "epoch": 2.918011620400258, "step": 9040}, {"loss": 0.72, "grad_norm": 0.743677020072937, "learning_rate": 0.0002, "epoch": 2.921239509360878, "step": 9050}, {"loss": 0.7017, "grad_norm": 0.7418383359909058, "learning_rate": 0.0002, "epoch": 2.9244673983214975, "step": 9060}, {"loss": 0.6635, "grad_norm": 0.6916524767875671, "learning_rate": 0.0002, "epoch": 2.9276952872821176, "step": 9070}, {"loss": 0.6502, "grad_norm": 0.6559975743293762, "learning_rate": 0.0002, "epoch": 2.9309231762427372, "step": 9080}, {"loss": 0.7016, "grad_norm": 0.7431221008300781, "learning_rate": 0.0002, "epoch": 2.934151065203357, "step": 9090}, {"loss": 0.6829, "grad_norm": 0.7525941133499146, "learning_rate": 0.0002, "epoch": 2.937378954163977, "step": 9100}, {"loss": 0.7073, "grad_norm": 0.6860167384147644, "learning_rate": 0.0002, "epoch": 2.9406068431245966, "step": 9110}, {"loss": 0.6912, "grad_norm": 0.6467666029930115, "learning_rate": 0.0002, "epoch": 2.9438347320852163, "step": 9120}, {"loss": 0.7122, "grad_norm": 0.7595751285552979, "learning_rate": 0.0002, "epoch": 2.947062621045836, "step": 9130}, {"loss": 0.6951, "grad_norm": 0.6558279991149902, "learning_rate": 0.0002, "epoch": 2.9502905100064556, "step": 9140}, {"loss": 0.7081, "grad_norm": 0.6818708181381226, "learning_rate": 0.0002, "epoch": 2.9535183989670757, "step": 9150}, {"loss": 0.6921, "grad_norm": 0.8387085795402527, "learning_rate": 0.0002, "epoch": 2.9567462879276953, "step": 9160}, {"loss": 0.6914, "grad_norm": 0.7705109715461731, "learning_rate": 0.0002, "epoch": 2.959974176888315, "step": 9170}, {"loss": 0.6849, "grad_norm": 0.688106894493103, "learning_rate": 0.0002, "epoch": 2.9632020658489346, "step": 9180}, {"loss": 0.6833, "grad_norm": 0.659532368183136, "learning_rate": 0.0002, "epoch": 2.9664299548095547, "step": 9190}, {"loss": 0.6383, "grad_norm": 0.6839388608932495, "learning_rate": 0.0002, "epoch": 2.9696578437701744, "step": 9200}, {"loss": 0.6952, "grad_norm": 0.6927599310874939, "learning_rate": 0.0002, "epoch": 2.972885732730794, "step": 9210}, {"loss": 0.7338, "grad_norm": 0.6902472972869873, "learning_rate": 0.0002, "epoch": 2.9761136216914137, "step": 9220}, {"loss": 0.6671, "grad_norm": 0.620399534702301, "learning_rate": 0.0002, "epoch": 2.9793415106520333, "step": 9230}, {"loss": 0.6588, "grad_norm": 0.6812364459037781, "learning_rate": 0.0002, "epoch": 2.9825693996126534, "step": 9240}, {"loss": 0.6957, "grad_norm": 0.7681456208229065, "learning_rate": 0.0002, "epoch": 2.985797288573273, "step": 9250}, {"loss": 0.7113, "grad_norm": 0.7621907591819763, "learning_rate": 0.0002, "epoch": 2.9890251775338927, "step": 9260}, {"loss": 0.6601, "grad_norm": 0.6075740456581116, "learning_rate": 0.0002, "epoch": 2.992253066494513, "step": 9270}, {"loss": 0.6758, "grad_norm": 0.7100434899330139, "learning_rate": 0.0002, "epoch": 2.9954809554551325, "step": 9280}, {"loss": 0.73, "grad_norm": 0.7314488887786865, "learning_rate": 0.0002, "epoch": 2.998708844415752, "step": 9290}]} +{"epoch": 4.0, "step": 12392, "epoch_duration": 11941.852810144424, "total_accumulated_duration": 44031.94399380684, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.593, "grad_norm": 0.7092075347900391, "learning_rate": 0.0002, "epoch": 0.0032278889606197547, "step": 10}, {"loss": 1.0956, "grad_norm": 0.6900479793548584, "learning_rate": 0.0002, "epoch": 0.006455777921239509, "step": 20}, {"loss": 0.9807, "grad_norm": 0.6788288950920105, "learning_rate": 0.0002, "epoch": 0.009683666881859263, "step": 30}, {"loss": 0.9385, "grad_norm": 0.5590243339538574, "learning_rate": 0.0002, "epoch": 0.012911555842479019, "step": 40}, {"loss": 0.931, "grad_norm": 0.5136010646820068, "learning_rate": 0.0002, "epoch": 0.016139444803098774, "step": 50}, {"loss": 0.8896, "grad_norm": 0.45298320055007935, "learning_rate": 0.0002, "epoch": 0.019367333763718526, "step": 60}, {"loss": 0.9184, "grad_norm": 0.5917162299156189, "learning_rate": 0.0002, "epoch": 0.022595222724338282, "step": 70}, {"loss": 0.8705, "grad_norm": 0.4414856433868408, "learning_rate": 0.0002, "epoch": 0.025823111684958037, "step": 80}, {"loss": 0.8419, "grad_norm": 0.5547978281974792, "learning_rate": 0.0002, "epoch": 0.029051000645577793, "step": 90}, {"loss": 0.8987, "grad_norm": 0.5271288156509399, "learning_rate": 0.0002, "epoch": 0.03227888960619755, "step": 100}, {"loss": 0.8543, "grad_norm": 0.5506119728088379, "learning_rate": 0.0002, "epoch": 0.035506778566817304, "step": 110}, {"loss": 0.8373, "grad_norm": 0.5579327940940857, "learning_rate": 0.0002, "epoch": 0.03873466752743705, "step": 120}, {"loss": 0.8826, "grad_norm": 0.5099632740020752, "learning_rate": 0.0002, "epoch": 0.04196255648805681, "step": 130}, {"loss": 0.9239, "grad_norm": 0.40396833419799805, "learning_rate": 0.0002, "epoch": 0.045190445448676564, "step": 140}, {"loss": 0.846, "grad_norm": 0.5008092522621155, "learning_rate": 0.0002, "epoch": 0.04841833440929632, "step": 150}, {"loss": 0.8564, "grad_norm": 0.4388776421546936, "learning_rate": 0.0002, "epoch": 0.051646223369916075, "step": 160}, {"loss": 0.8829, "grad_norm": 0.44138944149017334, "learning_rate": 0.0002, "epoch": 0.05487411233053583, "step": 170}, {"loss": 0.8061, "grad_norm": 0.358484148979187, "learning_rate": 0.0002, "epoch": 0.058102001291155586, "step": 180}, {"loss": 0.8956, "grad_norm": 0.457052081823349, "learning_rate": 0.0002, "epoch": 0.06132989025177534, "step": 190}, {"loss": 0.9138, "grad_norm": 0.5537622570991516, "learning_rate": 0.0002, "epoch": 0.0645577792123951, "step": 200}, {"loss": 0.8701, "grad_norm": 0.552631676197052, "learning_rate": 0.0002, "epoch": 0.06778566817301485, "step": 210}, {"loss": 0.8854, "grad_norm": 0.4414575397968292, "learning_rate": 0.0002, "epoch": 0.07101355713363461, "step": 220}, {"loss": 0.8581, "grad_norm": 0.4996664226055145, "learning_rate": 0.0002, "epoch": 0.07424144609425436, "step": 230}, {"loss": 0.8675, "grad_norm": 0.7321897149085999, "learning_rate": 0.0002, "epoch": 0.0774693350548741, "step": 240}, {"loss": 0.8848, "grad_norm": 0.4553901255130768, "learning_rate": 0.0002, "epoch": 0.08069722401549387, "step": 250}, {"loss": 0.868, "grad_norm": 0.5039054751396179, "learning_rate": 0.0002, "epoch": 0.08392511297611362, "step": 260}, {"loss": 0.8317, "grad_norm": 0.4113094210624695, "learning_rate": 0.0002, "epoch": 0.08715300193673338, "step": 270}, {"loss": 0.8074, "grad_norm": 0.450436532497406, "learning_rate": 0.0002, "epoch": 0.09038089089735313, "step": 280}, {"loss": 0.8105, "grad_norm": 0.4548024535179138, "learning_rate": 0.0002, "epoch": 0.09360877985797289, "step": 290}, {"loss": 0.8325, "grad_norm": 0.4932962656021118, "learning_rate": 0.0002, "epoch": 0.09683666881859264, "step": 300}, {"loss": 0.8105, "grad_norm": 0.4005250334739685, "learning_rate": 0.0002, "epoch": 0.1000645577792124, "step": 310}, {"loss": 0.8083, "grad_norm": 1.8321624994277954, "learning_rate": 0.0002, "epoch": 0.10329244673983215, "step": 320}, {"loss": 0.8411, "grad_norm": 0.45815610885620117, "learning_rate": 0.0002, "epoch": 0.1065203357004519, "step": 330}, {"loss": 0.857, "grad_norm": 0.39324095845222473, "learning_rate": 0.0002, "epoch": 0.10974822466107166, "step": 340}, {"loss": 0.8258, "grad_norm": 0.546273946762085, "learning_rate": 0.0002, "epoch": 0.11297611362169141, "step": 350}, {"loss": 0.882, "grad_norm": 0.497448593378067, "learning_rate": 0.0002, "epoch": 0.11620400258231117, "step": 360}, {"loss": 0.7608, "grad_norm": 0.37508800625801086, "learning_rate": 0.0002, "epoch": 0.11943189154293092, "step": 370}, {"loss": 0.852, "grad_norm": 0.45849609375, "learning_rate": 0.0002, "epoch": 0.12265978050355068, "step": 380}, {"loss": 0.8437, "grad_norm": 0.5488408803939819, "learning_rate": 0.0002, "epoch": 0.12588766946417043, "step": 390}, {"loss": 0.8349, "grad_norm": 0.4477061331272125, "learning_rate": 0.0002, "epoch": 0.1291155584247902, "step": 400}, {"loss": 0.8306, "grad_norm": 0.39227980375289917, "learning_rate": 0.0002, "epoch": 0.13234344738540993, "step": 410}, {"loss": 0.7933, "grad_norm": 0.3922233581542969, "learning_rate": 0.0002, "epoch": 0.1355713363460297, "step": 420}, {"loss": 0.8134, "grad_norm": 0.42901909351348877, "learning_rate": 0.0002, "epoch": 0.13879922530664945, "step": 430}, {"loss": 0.8271, "grad_norm": 0.4217798709869385, "learning_rate": 0.0002, "epoch": 0.14202711426726922, "step": 440}, {"loss": 0.8594, "grad_norm": 0.43470677733421326, "learning_rate": 0.0002, "epoch": 0.14525500322788895, "step": 450}, {"loss": 0.8106, "grad_norm": 0.5324403047561646, "learning_rate": 0.0002, "epoch": 0.1484828921885087, "step": 460}, {"loss": 0.8729, "grad_norm": 0.3999756872653961, "learning_rate": 0.0002, "epoch": 0.15171078114912848, "step": 470}, {"loss": 0.7702, "grad_norm": 0.404933363199234, "learning_rate": 0.0002, "epoch": 0.1549386701097482, "step": 480}, {"loss": 0.8151, "grad_norm": 0.44122636318206787, "learning_rate": 0.0002, "epoch": 0.15816655907036797, "step": 490}, {"loss": 0.8457, "grad_norm": 0.510166347026825, "learning_rate": 0.0002, "epoch": 0.16139444803098774, "step": 500}, {"loss": 0.8692, "grad_norm": 0.4549732506275177, "learning_rate": 0.0002, "epoch": 0.1646223369916075, "step": 510}, {"loss": 0.8466, "grad_norm": 0.5148182511329651, "learning_rate": 0.0002, "epoch": 0.16785022595222723, "step": 520}, {"loss": 0.8317, "grad_norm": 0.3596806824207306, "learning_rate": 0.0002, "epoch": 0.171078114912847, "step": 530}, {"loss": 0.844, "grad_norm": 0.4388909339904785, "learning_rate": 0.0002, "epoch": 0.17430600387346676, "step": 540}, {"loss": 0.8322, "grad_norm": 0.5052742958068848, "learning_rate": 0.0002, "epoch": 0.17753389283408652, "step": 550}, {"loss": 0.791, "grad_norm": 0.48248958587646484, "learning_rate": 0.0002, "epoch": 0.18076178179470626, "step": 560}, {"loss": 0.8593, "grad_norm": 0.5360197424888611, "learning_rate": 0.0002, "epoch": 0.18398967075532602, "step": 570}, {"loss": 0.817, "grad_norm": 0.43999341130256653, "learning_rate": 0.0002, "epoch": 0.18721755971594578, "step": 580}, {"loss": 0.8311, "grad_norm": 0.3685208261013031, "learning_rate": 0.0002, "epoch": 0.19044544867656552, "step": 590}, {"loss": 0.8341, "grad_norm": 0.4601275622844696, "learning_rate": 0.0002, "epoch": 0.19367333763718528, "step": 600}, {"loss": 0.8483, "grad_norm": 0.4778369665145874, "learning_rate": 0.0002, "epoch": 0.19690122659780504, "step": 610}, {"loss": 0.8653, "grad_norm": 0.4867003560066223, "learning_rate": 0.0002, "epoch": 0.2001291155584248, "step": 620}, {"loss": 0.8554, "grad_norm": 0.4583742916584015, "learning_rate": 0.0002, "epoch": 0.20335700451904454, "step": 630}, {"loss": 0.8698, "grad_norm": 0.47958165407180786, "learning_rate": 0.0002, "epoch": 0.2065848934796643, "step": 640}, {"loss": 0.8213, "grad_norm": 0.4526064097881317, "learning_rate": 0.0002, "epoch": 0.20981278244028406, "step": 650}, {"loss": 0.8313, "grad_norm": 0.45890581607818604, "learning_rate": 0.0002, "epoch": 0.2130406714009038, "step": 660}, {"loss": 0.8143, "grad_norm": 0.42725905776023865, "learning_rate": 0.0002, "epoch": 0.21626856036152356, "step": 670}, {"loss": 0.8675, "grad_norm": 0.40380963683128357, "learning_rate": 0.0002, "epoch": 0.21949644932214332, "step": 680}, {"loss": 0.9004, "grad_norm": 0.4372998774051666, "learning_rate": 0.0002, "epoch": 0.22272433828276308, "step": 690}, {"loss": 0.8208, "grad_norm": 0.4245864450931549, "learning_rate": 0.0002, "epoch": 0.22595222724338282, "step": 700}, {"loss": 0.8564, "grad_norm": 0.4061129689216614, "learning_rate": 0.0002, "epoch": 0.22918011620400258, "step": 710}, {"loss": 0.8275, "grad_norm": 0.474454790353775, "learning_rate": 0.0002, "epoch": 0.23240800516462234, "step": 720}, {"loss": 0.8346, "grad_norm": 0.4908486008644104, "learning_rate": 0.0002, "epoch": 0.23563589412524208, "step": 730}, {"loss": 0.8755, "grad_norm": 0.4284191429615021, "learning_rate": 0.0002, "epoch": 0.23886378308586184, "step": 740}, {"loss": 0.8387, "grad_norm": 0.44730308651924133, "learning_rate": 0.0002, "epoch": 0.2420916720464816, "step": 750}, {"loss": 0.8135, "grad_norm": 0.4433246850967407, "learning_rate": 0.0002, "epoch": 0.24531956100710137, "step": 760}, {"loss": 0.8644, "grad_norm": 0.43668854236602783, "learning_rate": 0.0002, "epoch": 0.2485474499677211, "step": 770}, {"loss": 0.8025, "grad_norm": 0.34324130415916443, "learning_rate": 0.0002, "epoch": 0.25177533892834086, "step": 780}, {"loss": 0.8725, "grad_norm": 0.46476295590400696, "learning_rate": 0.0002, "epoch": 0.2550032278889606, "step": 790}, {"loss": 0.8157, "grad_norm": 0.5047039985656738, "learning_rate": 0.0002, "epoch": 0.2582311168495804, "step": 800}, {"loss": 0.8643, "grad_norm": 0.4402127265930176, "learning_rate": 0.0002, "epoch": 0.26145900581020015, "step": 810}, {"loss": 0.8025, "grad_norm": 0.4642465114593506, "learning_rate": 0.0002, "epoch": 0.26468689477081986, "step": 820}, {"loss": 0.8836, "grad_norm": 0.40093424916267395, "learning_rate": 0.0002, "epoch": 0.2679147837314396, "step": 830}, {"loss": 0.83, "grad_norm": 0.42501842975616455, "learning_rate": 0.0002, "epoch": 0.2711426726920594, "step": 840}, {"loss": 0.8573, "grad_norm": 0.43279722332954407, "learning_rate": 0.0002, "epoch": 0.27437056165267915, "step": 850}, {"loss": 0.817, "grad_norm": 0.5991243720054626, "learning_rate": 0.0002, "epoch": 0.2775984506132989, "step": 860}, {"loss": 0.7981, "grad_norm": 0.4217848777770996, "learning_rate": 0.0002, "epoch": 0.28082633957391867, "step": 870}, {"loss": 0.8135, "grad_norm": 0.3933536410331726, "learning_rate": 0.0002, "epoch": 0.28405422853453843, "step": 880}, {"loss": 0.8846, "grad_norm": 0.5868505239486694, "learning_rate": 0.0002, "epoch": 0.28728211749515814, "step": 890}, {"loss": 0.8759, "grad_norm": 0.5209547877311707, "learning_rate": 0.0002, "epoch": 0.2905100064557779, "step": 900}, {"loss": 0.815, "grad_norm": 0.49307361245155334, "learning_rate": 0.0002, "epoch": 0.29373789541639767, "step": 910}, {"loss": 0.7813, "grad_norm": 0.4288382828235626, "learning_rate": 0.0002, "epoch": 0.2969657843770174, "step": 920}, {"loss": 0.8431, "grad_norm": 0.33568474650382996, "learning_rate": 0.0002, "epoch": 0.3001936733376372, "step": 930}, {"loss": 0.8455, "grad_norm": 1.0915930271148682, "learning_rate": 0.0002, "epoch": 0.30342156229825695, "step": 940}, {"loss": 0.8535, "grad_norm": 0.5489798188209534, "learning_rate": 0.0002, "epoch": 0.3066494512588767, "step": 950}, {"loss": 0.8031, "grad_norm": 0.42971742153167725, "learning_rate": 0.0002, "epoch": 0.3098773402194964, "step": 960}, {"loss": 0.8253, "grad_norm": 0.43375834822654724, "learning_rate": 0.0002, "epoch": 0.3131052291801162, "step": 970}, {"loss": 0.7747, "grad_norm": 0.47488611936569214, "learning_rate": 0.0002, "epoch": 0.31633311814073595, "step": 980}, {"loss": 0.7906, "grad_norm": 0.46296775341033936, "learning_rate": 0.0002, "epoch": 0.3195610071013557, "step": 990}, {"loss": 0.7948, "grad_norm": 0.4548890292644501, "learning_rate": 0.0002, "epoch": 0.32278889606197547, "step": 1000}, {"loss": 0.8856, "grad_norm": 0.41834497451782227, "learning_rate": 0.0002, "epoch": 0.32601678502259523, "step": 1010}, {"loss": 0.7791, "grad_norm": 0.441092312335968, "learning_rate": 0.0002, "epoch": 0.329244673983215, "step": 1020}, {"loss": 0.8191, "grad_norm": 0.637322187423706, "learning_rate": 0.0002, "epoch": 0.33247256294383476, "step": 1030}, {"loss": 0.8685, "grad_norm": 0.4374958574771881, "learning_rate": 0.0002, "epoch": 0.33570045190445447, "step": 1040}, {"loss": 0.8423, "grad_norm": 0.3935825824737549, "learning_rate": 0.0002, "epoch": 0.33892834086507423, "step": 1050}, {"loss": 0.8287, "grad_norm": 0.43526220321655273, "learning_rate": 0.0002, "epoch": 0.342156229825694, "step": 1060}, {"loss": 0.8413, "grad_norm": 0.45327696204185486, "learning_rate": 0.0002, "epoch": 0.34538411878631375, "step": 1070}, {"loss": 0.7421, "grad_norm": 0.4126075506210327, "learning_rate": 0.0002, "epoch": 0.3486120077469335, "step": 1080}, {"loss": 0.8427, "grad_norm": 0.4714072048664093, "learning_rate": 0.0002, "epoch": 0.3518398967075533, "step": 1090}, {"loss": 0.8028, "grad_norm": 0.518127977848053, "learning_rate": 0.0002, "epoch": 0.35506778566817304, "step": 1100}, {"loss": 0.8479, "grad_norm": 0.43264099955558777, "learning_rate": 0.0002, "epoch": 0.35829567462879275, "step": 1110}, {"loss": 0.8724, "grad_norm": 0.4857400357723236, "learning_rate": 0.0002, "epoch": 0.3615235635894125, "step": 1120}, {"loss": 0.7735, "grad_norm": 0.37591469287872314, "learning_rate": 0.0002, "epoch": 0.3647514525500323, "step": 1130}, {"loss": 0.8531, "grad_norm": 0.4165478050708771, "learning_rate": 0.0002, "epoch": 0.36797934151065204, "step": 1140}, {"loss": 0.8151, "grad_norm": 0.42911383509635925, "learning_rate": 0.0002, "epoch": 0.3712072304712718, "step": 1150}, {"loss": 0.8722, "grad_norm": 0.44980287551879883, "learning_rate": 0.0002, "epoch": 0.37443511943189156, "step": 1160}, {"loss": 0.7961, "grad_norm": 0.4066573679447174, "learning_rate": 0.0002, "epoch": 0.3776630083925113, "step": 1170}, {"loss": 0.8317, "grad_norm": 0.5056195855140686, "learning_rate": 0.0002, "epoch": 0.38089089735313103, "step": 1180}, {"loss": 0.8387, "grad_norm": 0.4141536355018616, "learning_rate": 0.0002, "epoch": 0.3841187863137508, "step": 1190}, {"loss": 0.8019, "grad_norm": 0.4501924514770508, "learning_rate": 0.0002, "epoch": 0.38734667527437056, "step": 1200}, {"loss": 0.8528, "grad_norm": 0.43304240703582764, "learning_rate": 0.0002, "epoch": 0.3905745642349903, "step": 1210}, {"loss": 0.8905, "grad_norm": 0.475777804851532, "learning_rate": 0.0002, "epoch": 0.3938024531956101, "step": 1220}, {"loss": 0.8643, "grad_norm": 0.5846465826034546, "learning_rate": 0.0002, "epoch": 0.39703034215622984, "step": 1230}, {"loss": 0.8078, "grad_norm": 0.42899325489997864, "learning_rate": 0.0002, "epoch": 0.4002582311168496, "step": 1240}, {"loss": 0.8415, "grad_norm": 0.3980463147163391, "learning_rate": 0.0002, "epoch": 0.4034861200774693, "step": 1250}, {"loss": 0.8026, "grad_norm": 0.45769768953323364, "learning_rate": 0.0002, "epoch": 0.4067140090380891, "step": 1260}, {"loss": 0.8377, "grad_norm": 0.5101280212402344, "learning_rate": 0.0002, "epoch": 0.40994189799870884, "step": 1270}, {"loss": 0.7905, "grad_norm": 0.47374317049980164, "learning_rate": 0.0002, "epoch": 0.4131697869593286, "step": 1280}, {"loss": 0.8172, "grad_norm": 0.4261878728866577, "learning_rate": 0.0002, "epoch": 0.41639767591994836, "step": 1290}, {"loss": 0.9004, "grad_norm": 0.46954256296157837, "learning_rate": 0.0002, "epoch": 0.4196255648805681, "step": 1300}, {"loss": 0.7868, "grad_norm": 0.5205738544464111, "learning_rate": 0.0002, "epoch": 0.4228534538411879, "step": 1310}, {"loss": 0.8964, "grad_norm": 0.5176340937614441, "learning_rate": 0.0002, "epoch": 0.4260813428018076, "step": 1320}, {"loss": 0.8764, "grad_norm": 0.5155916810035706, "learning_rate": 0.0002, "epoch": 0.42930923176242736, "step": 1330}, {"loss": 0.8197, "grad_norm": 0.44548553228378296, "learning_rate": 0.0002, "epoch": 0.4325371207230471, "step": 1340}, {"loss": 0.7873, "grad_norm": 0.5633558630943298, "learning_rate": 0.0002, "epoch": 0.4357650096836669, "step": 1350}, {"loss": 0.7889, "grad_norm": 0.42444056272506714, "learning_rate": 0.0002, "epoch": 0.43899289864428664, "step": 1360}, {"loss": 0.8588, "grad_norm": 0.5226860642433167, "learning_rate": 0.0002, "epoch": 0.4422207876049064, "step": 1370}, {"loss": 0.8232, "grad_norm": 0.5354582071304321, "learning_rate": 0.0002, "epoch": 0.44544867656552617, "step": 1380}, {"loss": 0.816, "grad_norm": 0.472646564245224, "learning_rate": 0.0002, "epoch": 0.4486765655261459, "step": 1390}, {"loss": 0.7953, "grad_norm": 0.6312310099601746, "learning_rate": 0.0002, "epoch": 0.45190445448676564, "step": 1400}, {"loss": 0.8212, "grad_norm": 0.4298408031463623, "learning_rate": 0.0002, "epoch": 0.4551323434473854, "step": 1410}, {"loss": 0.8447, "grad_norm": 0.43427202105522156, "learning_rate": 0.0002, "epoch": 0.45836023240800516, "step": 1420}, {"loss": 0.8342, "grad_norm": 0.44097861647605896, "learning_rate": 0.0002, "epoch": 0.4615881213686249, "step": 1430}, {"loss": 0.8301, "grad_norm": 0.5142693519592285, "learning_rate": 0.0002, "epoch": 0.4648160103292447, "step": 1440}, {"loss": 0.8144, "grad_norm": 0.46416547894477844, "learning_rate": 0.0002, "epoch": 0.46804389928986445, "step": 1450}, {"loss": 0.8342, "grad_norm": 0.4858551025390625, "learning_rate": 0.0002, "epoch": 0.47127178825048416, "step": 1460}, {"loss": 0.8354, "grad_norm": 0.4709177315235138, "learning_rate": 0.0002, "epoch": 0.4744996772111039, "step": 1470}, {"loss": 0.8391, "grad_norm": 0.5500252842903137, "learning_rate": 0.0002, "epoch": 0.4777275661717237, "step": 1480}, {"loss": 0.8359, "grad_norm": 0.43364381790161133, "learning_rate": 0.0002, "epoch": 0.48095545513234345, "step": 1490}, {"loss": 0.8446, "grad_norm": 0.47712287306785583, "learning_rate": 0.0002, "epoch": 0.4841833440929632, "step": 1500}, {"loss": 0.8518, "grad_norm": 0.4518495202064514, "learning_rate": 0.0002, "epoch": 0.48741123305358297, "step": 1510}, {"loss": 0.819, "grad_norm": 0.4539008140563965, "learning_rate": 0.0002, "epoch": 0.49063912201420273, "step": 1520}, {"loss": 0.8276, "grad_norm": 0.4993067979812622, "learning_rate": 0.0002, "epoch": 0.49386701097482244, "step": 1530}, {"loss": 0.8297, "grad_norm": 0.6094803214073181, "learning_rate": 0.0002, "epoch": 0.4970948999354422, "step": 1540}, {"loss": 0.8263, "grad_norm": 0.48602527379989624, "learning_rate": 0.0002, "epoch": 0.500322788896062, "step": 1550}, {"loss": 0.8182, "grad_norm": 0.40245795249938965, "learning_rate": 0.0002, "epoch": 0.5035506778566817, "step": 1560}, {"loss": 0.7907, "grad_norm": 0.456787645816803, "learning_rate": 0.0002, "epoch": 0.5067785668173015, "step": 1570}, {"loss": 0.86, "grad_norm": 0.43936216831207275, "learning_rate": 0.0002, "epoch": 0.5100064557779213, "step": 1580}, {"loss": 0.7928, "grad_norm": 0.549018144607544, "learning_rate": 0.0002, "epoch": 0.513234344738541, "step": 1590}, {"loss": 0.8169, "grad_norm": 0.41746795177459717, "learning_rate": 0.0002, "epoch": 0.5164622336991608, "step": 1600}, {"loss": 0.7868, "grad_norm": 0.4217053949832916, "learning_rate": 0.0002, "epoch": 0.5196901226597805, "step": 1610}, {"loss": 0.8161, "grad_norm": 0.449913889169693, "learning_rate": 0.0002, "epoch": 0.5229180116204003, "step": 1620}, {"loss": 0.7938, "grad_norm": 0.5084872245788574, "learning_rate": 0.0002, "epoch": 0.5261459005810201, "step": 1630}, {"loss": 0.8295, "grad_norm": 0.46248653531074524, "learning_rate": 0.0002, "epoch": 0.5293737895416397, "step": 1640}, {"loss": 0.7993, "grad_norm": 0.4824236035346985, "learning_rate": 0.0002, "epoch": 0.5326016785022595, "step": 1650}, {"loss": 0.8711, "grad_norm": 0.6010985374450684, "learning_rate": 0.0002, "epoch": 0.5358295674628792, "step": 1660}, {"loss": 0.8266, "grad_norm": 0.4757920801639557, "learning_rate": 0.0002, "epoch": 0.539057456423499, "step": 1670}, {"loss": 0.8182, "grad_norm": 0.45161882042884827, "learning_rate": 0.0002, "epoch": 0.5422853453841188, "step": 1680}, {"loss": 0.8141, "grad_norm": 0.49314990639686584, "learning_rate": 0.0002, "epoch": 0.5455132343447385, "step": 1690}, {"loss": 0.8091, "grad_norm": 0.3918305039405823, "learning_rate": 0.0002, "epoch": 0.5487411233053583, "step": 1700}, {"loss": 0.8177, "grad_norm": 0.5966728925704956, "learning_rate": 0.0002, "epoch": 0.551969012265978, "step": 1710}, {"loss": 0.8438, "grad_norm": 0.4208986163139343, "learning_rate": 0.0002, "epoch": 0.5551969012265978, "step": 1720}, {"loss": 0.817, "grad_norm": 0.43724218010902405, "learning_rate": 0.0002, "epoch": 0.5584247901872176, "step": 1730}, {"loss": 0.7956, "grad_norm": 0.5287272930145264, "learning_rate": 0.0002, "epoch": 0.5616526791478373, "step": 1740}, {"loss": 0.8557, "grad_norm": 0.4961899518966675, "learning_rate": 0.0002, "epoch": 0.5648805681084571, "step": 1750}, {"loss": 0.8029, "grad_norm": 0.4468635320663452, "learning_rate": 0.0002, "epoch": 0.5681084570690769, "step": 1760}, {"loss": 0.7968, "grad_norm": 0.6423530578613281, "learning_rate": 0.0002, "epoch": 0.5713363460296966, "step": 1770}, {"loss": 0.8324, "grad_norm": 0.4601971507072449, "learning_rate": 0.0002, "epoch": 0.5745642349903163, "step": 1780}, {"loss": 0.8171, "grad_norm": 0.46514901518821716, "learning_rate": 0.0002, "epoch": 0.577792123950936, "step": 1790}, {"loss": 0.8186, "grad_norm": 0.4771687388420105, "learning_rate": 0.0002, "epoch": 0.5810200129115558, "step": 1800}, {"loss": 0.856, "grad_norm": 0.46514490246772766, "learning_rate": 0.0002, "epoch": 0.5842479018721756, "step": 1810}, {"loss": 0.84, "grad_norm": 0.5373936295509338, "learning_rate": 0.0002, "epoch": 0.5874757908327953, "step": 1820}, {"loss": 0.8456, "grad_norm": 0.5175791382789612, "learning_rate": 0.0002, "epoch": 0.5907036797934151, "step": 1830}, {"loss": 0.7957, "grad_norm": 0.4522802233695984, "learning_rate": 0.0002, "epoch": 0.5939315687540349, "step": 1840}, {"loss": 0.8633, "grad_norm": 0.42987772822380066, "learning_rate": 0.0002, "epoch": 0.5971594577146546, "step": 1850}, {"loss": 0.7871, "grad_norm": 0.5566838383674622, "learning_rate": 0.0002, "epoch": 0.6003873466752744, "step": 1860}, {"loss": 0.8312, "grad_norm": 0.42807698249816895, "learning_rate": 0.0002, "epoch": 0.6036152356358941, "step": 1870}, {"loss": 0.8035, "grad_norm": 0.4957767724990845, "learning_rate": 0.0002, "epoch": 0.6068431245965139, "step": 1880}, {"loss": 0.8145, "grad_norm": 0.4260980188846588, "learning_rate": 0.0002, "epoch": 0.6100710135571337, "step": 1890}, {"loss": 0.8363, "grad_norm": 0.4777357876300812, "learning_rate": 0.0002, "epoch": 0.6132989025177534, "step": 1900}, {"loss": 0.8404, "grad_norm": 0.4434216022491455, "learning_rate": 0.0002, "epoch": 0.6165267914783732, "step": 1910}, {"loss": 0.8057, "grad_norm": 0.5215433835983276, "learning_rate": 0.0002, "epoch": 0.6197546804389928, "step": 1920}, {"loss": 0.82, "grad_norm": 0.5143248438835144, "learning_rate": 0.0002, "epoch": 0.6229825693996126, "step": 1930}, {"loss": 0.8107, "grad_norm": 0.5213413238525391, "learning_rate": 0.0002, "epoch": 0.6262104583602324, "step": 1940}, {"loss": 0.7549, "grad_norm": 0.5408226251602173, "learning_rate": 0.0002, "epoch": 0.6294383473208521, "step": 1950}, {"loss": 0.8405, "grad_norm": 0.5479708909988403, "learning_rate": 0.0002, "epoch": 0.6326662362814719, "step": 1960}, {"loss": 0.8138, "grad_norm": 0.4490949809551239, "learning_rate": 0.0002, "epoch": 0.6358941252420917, "step": 1970}, {"loss": 0.854, "grad_norm": 0.48815059661865234, "learning_rate": 0.0002, "epoch": 0.6391220142027114, "step": 1980}, {"loss": 0.8568, "grad_norm": 0.46498045325279236, "learning_rate": 0.0002, "epoch": 0.6423499031633312, "step": 1990}, {"loss": 0.8263, "grad_norm": 0.5136561393737793, "learning_rate": 0.0002, "epoch": 0.6455777921239509, "step": 2000}, {"loss": 0.8503, "grad_norm": 0.5145719647407532, "learning_rate": 0.0002, "epoch": 0.6488056810845707, "step": 2010}, {"loss": 0.8456, "grad_norm": 0.5430373549461365, "learning_rate": 0.0002, "epoch": 0.6520335700451905, "step": 2020}, {"loss": 0.8115, "grad_norm": 0.46347954869270325, "learning_rate": 0.0002, "epoch": 0.6552614590058102, "step": 2030}, {"loss": 0.8769, "grad_norm": 0.5189562439918518, "learning_rate": 0.0002, "epoch": 0.65848934796643, "step": 2040}, {"loss": 0.8453, "grad_norm": 0.43843990564346313, "learning_rate": 0.0002, "epoch": 0.6617172369270498, "step": 2050}, {"loss": 0.7951, "grad_norm": 0.4654983580112457, "learning_rate": 0.0002, "epoch": 0.6649451258876695, "step": 2060}, {"loss": 0.8308, "grad_norm": 0.44835716485977173, "learning_rate": 0.0002, "epoch": 0.6681730148482892, "step": 2070}, {"loss": 0.8181, "grad_norm": 0.38811734318733215, "learning_rate": 0.0002, "epoch": 0.6714009038089089, "step": 2080}, {"loss": 0.762, "grad_norm": 0.5709853172302246, "learning_rate": 0.0002, "epoch": 0.6746287927695287, "step": 2090}, {"loss": 0.8334, "grad_norm": 0.49994757771492004, "learning_rate": 0.0002, "epoch": 0.6778566817301485, "step": 2100}, {"loss": 0.8, "grad_norm": 0.5505402684211731, "learning_rate": 0.0002, "epoch": 0.6810845706907682, "step": 2110}, {"loss": 0.8227, "grad_norm": 0.48195120692253113, "learning_rate": 0.0002, "epoch": 0.684312459651388, "step": 2120}, {"loss": 0.7879, "grad_norm": 0.4854775071144104, "learning_rate": 0.0002, "epoch": 0.6875403486120077, "step": 2130}, {"loss": 0.8231, "grad_norm": 0.6422494649887085, "learning_rate": 0.0002, "epoch": 0.6907682375726275, "step": 2140}, {"loss": 0.8353, "grad_norm": 0.3972536027431488, "learning_rate": 0.0002, "epoch": 0.6939961265332473, "step": 2150}, {"loss": 0.8068, "grad_norm": 0.4297836422920227, "learning_rate": 0.0002, "epoch": 0.697224015493867, "step": 2160}, {"loss": 0.8017, "grad_norm": 0.45486778020858765, "learning_rate": 0.0002, "epoch": 0.7004519044544868, "step": 2170}, {"loss": 0.8507, "grad_norm": 0.4706047773361206, "learning_rate": 0.0002, "epoch": 0.7036797934151066, "step": 2180}, {"loss": 0.8234, "grad_norm": 0.46426892280578613, "learning_rate": 0.0002, "epoch": 0.7069076823757263, "step": 2190}, {"loss": 0.8472, "grad_norm": 0.46333715319633484, "learning_rate": 0.0002, "epoch": 0.7101355713363461, "step": 2200}, {"loss": 0.8247, "grad_norm": 0.4632524251937866, "learning_rate": 0.0002, "epoch": 0.7133634602969657, "step": 2210}, {"loss": 0.8452, "grad_norm": 0.4610830843448639, "learning_rate": 0.0002, "epoch": 0.7165913492575855, "step": 2220}, {"loss": 0.7338, "grad_norm": 0.4905324876308441, "learning_rate": 0.0002, "epoch": 0.7198192382182053, "step": 2230}, {"loss": 0.7715, "grad_norm": 0.4936263859272003, "learning_rate": 0.0002, "epoch": 0.723047127178825, "step": 2240}, {"loss": 0.8162, "grad_norm": 0.40778425335884094, "learning_rate": 0.0002, "epoch": 0.7262750161394448, "step": 2250}, {"loss": 0.828, "grad_norm": 0.50351482629776, "learning_rate": 0.0002, "epoch": 0.7295029051000645, "step": 2260}, {"loss": 0.8475, "grad_norm": 0.4894128143787384, "learning_rate": 0.0002, "epoch": 0.7327307940606843, "step": 2270}, {"loss": 0.8087, "grad_norm": 0.5580906271934509, "learning_rate": 0.0002, "epoch": 0.7359586830213041, "step": 2280}, {"loss": 0.8157, "grad_norm": 0.4655369520187378, "learning_rate": 0.0002, "epoch": 0.7391865719819238, "step": 2290}, {"loss": 0.8395, "grad_norm": 0.4666965901851654, "learning_rate": 0.0002, "epoch": 0.7424144609425436, "step": 2300}, {"loss": 0.7605, "grad_norm": 0.46259936690330505, "learning_rate": 0.0002, "epoch": 0.7456423499031634, "step": 2310}, {"loss": 0.7849, "grad_norm": 0.520706832408905, "learning_rate": 0.0002, "epoch": 0.7488702388637831, "step": 2320}, {"loss": 0.8173, "grad_norm": 0.5142408013343811, "learning_rate": 0.0002, "epoch": 0.7520981278244029, "step": 2330}, {"loss": 0.7782, "grad_norm": 0.5355164408683777, "learning_rate": 0.0002, "epoch": 0.7553260167850226, "step": 2340}, {"loss": 0.8242, "grad_norm": 0.5517185926437378, "learning_rate": 0.0002, "epoch": 0.7585539057456423, "step": 2350}, {"loss": 0.8404, "grad_norm": 0.7162677049636841, "learning_rate": 0.0002, "epoch": 0.7617817947062621, "step": 2360}, {"loss": 0.8455, "grad_norm": 0.42402133345603943, "learning_rate": 0.0002, "epoch": 0.7650096836668818, "step": 2370}, {"loss": 0.8214, "grad_norm": 0.47180113196372986, "learning_rate": 0.0002, "epoch": 0.7682375726275016, "step": 2380}, {"loss": 0.8274, "grad_norm": 0.6262288689613342, "learning_rate": 0.0002, "epoch": 0.7714654615881213, "step": 2390}, {"loss": 0.7915, "grad_norm": 0.5177528262138367, "learning_rate": 0.0002, "epoch": 0.7746933505487411, "step": 2400}, {"loss": 0.7631, "grad_norm": 0.555721640586853, "learning_rate": 0.0002, "epoch": 0.7779212395093609, "step": 2410}, {"loss": 0.795, "grad_norm": 0.5592644810676575, "learning_rate": 0.0002, "epoch": 0.7811491284699806, "step": 2420}, {"loss": 0.8081, "grad_norm": 0.38025397062301636, "learning_rate": 0.0002, "epoch": 0.7843770174306004, "step": 2430}, {"loss": 0.7851, "grad_norm": 0.4597472548484802, "learning_rate": 0.0002, "epoch": 0.7876049063912202, "step": 2440}, {"loss": 0.8575, "grad_norm": 0.4929825961589813, "learning_rate": 0.0002, "epoch": 0.7908327953518399, "step": 2450}, {"loss": 0.7584, "grad_norm": 0.45277655124664307, "learning_rate": 0.0002, "epoch": 0.7940606843124597, "step": 2460}, {"loss": 0.8208, "grad_norm": 0.6224122643470764, "learning_rate": 0.0002, "epoch": 0.7972885732730794, "step": 2470}, {"loss": 0.8449, "grad_norm": 0.5740901827812195, "learning_rate": 0.0002, "epoch": 0.8005164622336992, "step": 2480}, {"loss": 0.7834, "grad_norm": 0.41335329413414, "learning_rate": 0.0002, "epoch": 0.8037443511943189, "step": 2490}, {"loss": 0.7768, "grad_norm": 0.4738694131374359, "learning_rate": 0.0002, "epoch": 0.8069722401549386, "step": 2500}, {"loss": 0.7927, "grad_norm": 0.5288197994232178, "learning_rate": 0.0002, "epoch": 0.8102001291155584, "step": 2510}, {"loss": 0.8334, "grad_norm": 0.5404666066169739, "learning_rate": 0.0002, "epoch": 0.8134280180761781, "step": 2520}, {"loss": 0.7998, "grad_norm": 0.4444909691810608, "learning_rate": 0.0002, "epoch": 0.8166559070367979, "step": 2530}, {"loss": 0.8683, "grad_norm": 0.542061448097229, "learning_rate": 0.0002, "epoch": 0.8198837959974177, "step": 2540}, {"loss": 0.8038, "grad_norm": 0.4914741814136505, "learning_rate": 0.0002, "epoch": 0.8231116849580374, "step": 2550}, {"loss": 0.7899, "grad_norm": 0.41703441739082336, "learning_rate": 0.0002, "epoch": 0.8263395739186572, "step": 2560}, {"loss": 0.824, "grad_norm": 0.5489841103553772, "learning_rate": 0.0002, "epoch": 0.829567462879277, "step": 2570}, {"loss": 0.8157, "grad_norm": 0.5359883308410645, "learning_rate": 0.0002, "epoch": 0.8327953518398967, "step": 2580}, {"loss": 0.8122, "grad_norm": 0.5541019439697266, "learning_rate": 0.0002, "epoch": 0.8360232408005165, "step": 2590}, {"loss": 0.797, "grad_norm": 0.4746638834476471, "learning_rate": 0.0002, "epoch": 0.8392511297611362, "step": 2600}, {"loss": 0.8116, "grad_norm": 0.5243194103240967, "learning_rate": 0.0002, "epoch": 0.842479018721756, "step": 2610}, {"loss": 0.8173, "grad_norm": 0.46824976801872253, "learning_rate": 0.0002, "epoch": 0.8457069076823758, "step": 2620}, {"loss": 0.7525, "grad_norm": 0.49487847089767456, "learning_rate": 0.0002, "epoch": 0.8489347966429954, "step": 2630}, {"loss": 0.8296, "grad_norm": 0.42180097103118896, "learning_rate": 0.0002, "epoch": 0.8521626856036152, "step": 2640}, {"loss": 0.8304, "grad_norm": 0.5516560077667236, "learning_rate": 0.0002, "epoch": 0.855390574564235, "step": 2650}, {"loss": 0.7882, "grad_norm": 0.4392191767692566, "learning_rate": 0.0002, "epoch": 0.8586184635248547, "step": 2660}, {"loss": 0.848, "grad_norm": 0.5387210845947266, "learning_rate": 0.0002, "epoch": 0.8618463524854745, "step": 2670}, {"loss": 0.8094, "grad_norm": 0.6232406497001648, "learning_rate": 0.0002, "epoch": 0.8650742414460942, "step": 2680}, {"loss": 0.768, "grad_norm": 0.53749018907547, "learning_rate": 0.0002, "epoch": 0.868302130406714, "step": 2690}, {"loss": 0.8299, "grad_norm": 0.47480374574661255, "learning_rate": 0.0002, "epoch": 0.8715300193673338, "step": 2700}, {"loss": 0.8055, "grad_norm": 0.44618046283721924, "learning_rate": 0.0002, "epoch": 0.8747579083279535, "step": 2710}, {"loss": 0.8015, "grad_norm": 0.4173581302165985, "learning_rate": 0.0002, "epoch": 0.8779857972885733, "step": 2720}, {"loss": 0.7713, "grad_norm": 0.524081289768219, "learning_rate": 0.0002, "epoch": 0.881213686249193, "step": 2730}, {"loss": 0.8738, "grad_norm": 0.5608431100845337, "learning_rate": 0.0002, "epoch": 0.8844415752098128, "step": 2740}, {"loss": 0.8513, "grad_norm": 0.5212284922599792, "learning_rate": 0.0002, "epoch": 0.8876694641704326, "step": 2750}, {"loss": 0.8139, "grad_norm": 0.5601475834846497, "learning_rate": 0.0002, "epoch": 0.8908973531310523, "step": 2760}, {"loss": 0.7947, "grad_norm": 0.4499223828315735, "learning_rate": 0.0002, "epoch": 0.8941252420916721, "step": 2770}, {"loss": 0.8559, "grad_norm": 0.46945226192474365, "learning_rate": 0.0002, "epoch": 0.8973531310522918, "step": 2780}, {"loss": 0.801, "grad_norm": 0.4837495684623718, "learning_rate": 0.0002, "epoch": 0.9005810200129115, "step": 2790}, {"loss": 0.7887, "grad_norm": 0.5059258937835693, "learning_rate": 0.0002, "epoch": 0.9038089089735313, "step": 2800}, {"loss": 0.8571, "grad_norm": 0.4857945144176483, "learning_rate": 0.0002, "epoch": 0.907036797934151, "step": 2810}, {"loss": 0.8301, "grad_norm": 0.5001962780952454, "learning_rate": 0.0002, "epoch": 0.9102646868947708, "step": 2820}, {"loss": 0.8236, "grad_norm": 0.5468648672103882, "learning_rate": 0.0002, "epoch": 0.9134925758553906, "step": 2830}, {"loss": 0.8071, "grad_norm": 0.5533056259155273, "learning_rate": 0.0002, "epoch": 0.9167204648160103, "step": 2840}, {"loss": 0.7895, "grad_norm": 0.5909785628318787, "learning_rate": 0.0002, "epoch": 0.9199483537766301, "step": 2850}, {"loss": 0.796, "grad_norm": 0.47428104281425476, "learning_rate": 0.0002, "epoch": 0.9231762427372499, "step": 2860}, {"loss": 0.7845, "grad_norm": 0.548814058303833, "learning_rate": 0.0002, "epoch": 0.9264041316978696, "step": 2870}, {"loss": 0.7871, "grad_norm": 0.5576745271682739, "learning_rate": 0.0002, "epoch": 0.9296320206584894, "step": 2880}, {"loss": 0.8399, "grad_norm": 0.47094792127609253, "learning_rate": 0.0002, "epoch": 0.9328599096191091, "step": 2890}, {"loss": 0.805, "grad_norm": 0.5408539772033691, "learning_rate": 0.0002, "epoch": 0.9360877985797289, "step": 2900}, {"loss": 0.785, "grad_norm": 0.5922889113426208, "learning_rate": 0.0002, "epoch": 0.9393156875403487, "step": 2910}, {"loss": 0.8043, "grad_norm": 0.45462584495544434, "learning_rate": 0.0002, "epoch": 0.9425435765009683, "step": 2920}, {"loss": 0.8344, "grad_norm": 0.6864947080612183, "learning_rate": 0.0002, "epoch": 0.9457714654615881, "step": 2930}, {"loss": 0.8166, "grad_norm": 0.4706299304962158, "learning_rate": 0.0002, "epoch": 0.9489993544222078, "step": 2940}, {"loss": 0.8422, "grad_norm": 0.5583269596099854, "learning_rate": 0.0002, "epoch": 0.9522272433828276, "step": 2950}, {"loss": 0.836, "grad_norm": 0.51015704870224, "learning_rate": 0.0002, "epoch": 0.9554551323434474, "step": 2960}, {"loss": 0.8371, "grad_norm": 0.5325582027435303, "learning_rate": 0.0002, "epoch": 0.9586830213040671, "step": 2970}, {"loss": 0.7593, "grad_norm": 0.49008598923683167, "learning_rate": 0.0002, "epoch": 0.9619109102646869, "step": 2980}, {"loss": 0.8093, "grad_norm": 0.4422132074832916, "learning_rate": 0.0002, "epoch": 0.9651387992253067, "step": 2990}, {"loss": 0.7966, "grad_norm": 0.5053589344024658, "learning_rate": 0.0002, "epoch": 0.9683666881859264, "step": 3000}, {"loss": 0.8081, "grad_norm": 0.46754521131515503, "learning_rate": 0.0002, "epoch": 0.9715945771465462, "step": 3010}, {"loss": 0.8377, "grad_norm": 0.5613434910774231, "learning_rate": 0.0002, "epoch": 0.9748224661071659, "step": 3020}, {"loss": 0.7856, "grad_norm": 0.5052843689918518, "learning_rate": 0.0002, "epoch": 0.9780503550677857, "step": 3030}, {"loss": 0.8412, "grad_norm": 0.4270972013473511, "learning_rate": 0.0002, "epoch": 0.9812782440284055, "step": 3040}, {"loss": 0.8353, "grad_norm": 0.4974991977214813, "learning_rate": 0.0002, "epoch": 0.9845061329890252, "step": 3050}, {"loss": 0.8415, "grad_norm": 0.4432311952114105, "learning_rate": 0.0002, "epoch": 0.9877340219496449, "step": 3060}, {"loss": 0.7764, "grad_norm": 0.466457724571228, "learning_rate": 0.0002, "epoch": 0.9909619109102646, "step": 3070}, {"loss": 0.8067, "grad_norm": 0.6438009142875671, "learning_rate": 0.0002, "epoch": 0.9941897998708844, "step": 3080}, {"loss": 0.8425, "grad_norm": 0.5593604445457458, "learning_rate": 0.0002, "epoch": 0.9974176888315042, "step": 3090}, {"eval_loss": 1.0958120822906494, "eval_runtime": 148.3273, "eval_samples_per_second": 4.942, "eval_steps_per_second": 0.62, "epoch": 1.0, "step": 3098}, {"loss": 0.8275, "grad_norm": 0.5701445937156677, "learning_rate": 0.0002, "epoch": 1.000645577792124, "step": 3100}, {"loss": 0.7756, "grad_norm": 0.6089657545089722, "learning_rate": 0.0002, "epoch": 1.0038734667527438, "step": 3110}, {"loss": 0.7492, "grad_norm": 0.5619552135467529, "learning_rate": 0.0002, "epoch": 1.0071013557133635, "step": 3120}, {"loss": 0.7544, "grad_norm": 0.5550283789634705, "learning_rate": 0.0002, "epoch": 1.010329244673983, "step": 3130}, {"loss": 0.8006, "grad_norm": 0.6221792101860046, "learning_rate": 0.0002, "epoch": 1.013557133634603, "step": 3140}, {"loss": 0.7603, "grad_norm": 0.5450758934020996, "learning_rate": 0.0002, "epoch": 1.0167850225952226, "step": 3150}, {"loss": 0.7021, "grad_norm": 0.4359588027000427, "learning_rate": 0.0002, "epoch": 1.0200129115558425, "step": 3160}, {"loss": 0.7468, "grad_norm": 0.5932239890098572, "learning_rate": 0.0002, "epoch": 1.0232408005164622, "step": 3170}, {"loss": 0.7649, "grad_norm": 0.45478707551956177, "learning_rate": 0.0002, "epoch": 1.026468689477082, "step": 3180}, {"loss": 0.7355, "grad_norm": 0.677615761756897, "learning_rate": 0.0002, "epoch": 1.0296965784377017, "step": 3190}, {"loss": 0.6928, "grad_norm": 0.6231790781021118, "learning_rate": 0.0002, "epoch": 1.0329244673983216, "step": 3200}, {"loss": 0.7471, "grad_norm": 0.5074195861816406, "learning_rate": 0.0002, "epoch": 1.0361523563589412, "step": 3210}, {"loss": 0.6864, "grad_norm": 0.4844142198562622, "learning_rate": 0.0002, "epoch": 1.039380245319561, "step": 3220}, {"loss": 0.7655, "grad_norm": 0.5372750759124756, "learning_rate": 0.0002, "epoch": 1.0426081342801807, "step": 3230}, {"loss": 0.7384, "grad_norm": 0.46296265721321106, "learning_rate": 0.0002, "epoch": 1.0458360232408006, "step": 3240}, {"loss": 0.7894, "grad_norm": 0.5417148470878601, "learning_rate": 0.0002, "epoch": 1.0490639122014203, "step": 3250}, {"loss": 0.7637, "grad_norm": 0.5695074200630188, "learning_rate": 0.0002, "epoch": 1.0522918011620401, "step": 3260}, {"loss": 0.7456, "grad_norm": 0.5050092935562134, "learning_rate": 0.0002, "epoch": 1.0555196901226598, "step": 3270}, {"loss": 0.6805, "grad_norm": 0.5320752263069153, "learning_rate": 0.0002, "epoch": 1.0587475790832794, "step": 3280}, {"loss": 0.7419, "grad_norm": 0.5832052230834961, "learning_rate": 0.0002, "epoch": 1.0619754680438993, "step": 3290}, {"loss": 0.7656, "grad_norm": 0.5228804349899292, "learning_rate": 0.0002, "epoch": 1.065203357004519, "step": 3300}, {"loss": 0.6834, "grad_norm": 0.5819445252418518, "learning_rate": 0.0002, "epoch": 1.0684312459651388, "step": 3310}, {"loss": 0.7093, "grad_norm": 0.4201328754425049, "learning_rate": 0.0002, "epoch": 1.0716591349257585, "step": 3320}, {"loss": 0.7494, "grad_norm": 0.5424145460128784, "learning_rate": 0.0002, "epoch": 1.0748870238863784, "step": 3330}, {"loss": 0.7828, "grad_norm": 0.6169946789741516, "learning_rate": 0.0002, "epoch": 1.078114912846998, "step": 3340}, {"loss": 0.7505, "grad_norm": 0.607676088809967, "learning_rate": 0.0002, "epoch": 1.0813428018076179, "step": 3350}, {"loss": 0.7315, "grad_norm": 0.5191982388496399, "learning_rate": 0.0002, "epoch": 1.0845706907682375, "step": 3360}, {"loss": 0.7699, "grad_norm": 0.5728003978729248, "learning_rate": 0.0002, "epoch": 1.0877985797288574, "step": 3370}, {"loss": 0.7381, "grad_norm": 0.5402643084526062, "learning_rate": 0.0002, "epoch": 1.091026468689477, "step": 3380}, {"loss": 0.7208, "grad_norm": 0.5377541780471802, "learning_rate": 0.0002, "epoch": 1.094254357650097, "step": 3390}, {"loss": 0.7672, "grad_norm": 0.4751385748386383, "learning_rate": 0.0002, "epoch": 1.0974822466107166, "step": 3400}, {"loss": 0.7326, "grad_norm": 0.559158444404602, "learning_rate": 0.0002, "epoch": 1.1007101355713362, "step": 3410}, {"loss": 0.7366, "grad_norm": 0.4917701482772827, "learning_rate": 0.0002, "epoch": 1.103938024531956, "step": 3420}, {"loss": 0.7593, "grad_norm": 0.5507875084877014, "learning_rate": 0.0002, "epoch": 1.1071659134925758, "step": 3430}, {"loss": 0.7424, "grad_norm": 0.45458680391311646, "learning_rate": 0.0002, "epoch": 1.1103938024531956, "step": 3440}, {"loss": 0.7234, "grad_norm": 0.5721744894981384, "learning_rate": 0.0002, "epoch": 1.1136216914138153, "step": 3450}, {"loss": 0.7219, "grad_norm": 0.5776081681251526, "learning_rate": 0.0002, "epoch": 1.1168495803744352, "step": 3460}, {"loss": 0.7644, "grad_norm": 0.5261953473091125, "learning_rate": 0.0002, "epoch": 1.1200774693350548, "step": 3470}, {"loss": 0.6586, "grad_norm": 0.47759532928466797, "learning_rate": 0.0002, "epoch": 1.1233053582956747, "step": 3480}, {"loss": 0.7641, "grad_norm": 0.5697659850120544, "learning_rate": 0.0002, "epoch": 1.1265332472562943, "step": 3490}, {"loss": 0.7017, "grad_norm": 0.5643419623374939, "learning_rate": 0.0002, "epoch": 1.1297611362169142, "step": 3500}, {"loss": 0.7235, "grad_norm": 0.6502931118011475, "learning_rate": 0.0002, "epoch": 1.1329890251775339, "step": 3510}, {"loss": 0.7662, "grad_norm": 0.5236507654190063, "learning_rate": 0.0002, "epoch": 1.1362169141381537, "step": 3520}, {"loss": 0.7571, "grad_norm": 0.6521499156951904, "learning_rate": 0.0002, "epoch": 1.1394448030987734, "step": 3530}, {"loss": 0.7304, "grad_norm": 0.5893217325210571, "learning_rate": 0.0002, "epoch": 1.142672692059393, "step": 3540}, {"loss": 0.7508, "grad_norm": 0.5300073027610779, "learning_rate": 0.0002, "epoch": 1.145900581020013, "step": 3550}, {"loss": 0.6937, "grad_norm": 0.6794660091400146, "learning_rate": 0.0002, "epoch": 1.1491284699806328, "step": 3560}, {"loss": 0.7614, "grad_norm": 0.5420064926147461, "learning_rate": 0.0002, "epoch": 1.1523563589412524, "step": 3570}, {"loss": 0.7648, "grad_norm": 0.5096590518951416, "learning_rate": 0.0002, "epoch": 1.155584247901872, "step": 3580}, {"loss": 0.7436, "grad_norm": 0.5726043581962585, "learning_rate": 0.0002, "epoch": 1.158812136862492, "step": 3590}, {"loss": 0.7728, "grad_norm": 0.7388110160827637, "learning_rate": 0.0002, "epoch": 1.1620400258231116, "step": 3600}, {"loss": 0.7421, "grad_norm": 0.5597969889640808, "learning_rate": 0.0002, "epoch": 1.1652679147837315, "step": 3610}, {"loss": 0.7132, "grad_norm": 0.5067800283432007, "learning_rate": 0.0002, "epoch": 1.1684958037443511, "step": 3620}, {"loss": 0.7893, "grad_norm": 0.6625118255615234, "learning_rate": 0.0002, "epoch": 1.171723692704971, "step": 3630}, {"loss": 0.7611, "grad_norm": 0.5830849409103394, "learning_rate": 0.0002, "epoch": 1.1749515816655907, "step": 3640}, {"loss": 0.7973, "grad_norm": 0.6140692830085754, "learning_rate": 0.0002, "epoch": 1.1781794706262105, "step": 3650}, {"loss": 0.7617, "grad_norm": 0.714523434638977, "learning_rate": 0.0002, "epoch": 1.1814073595868302, "step": 3660}, {"loss": 0.7092, "grad_norm": 0.5196696519851685, "learning_rate": 0.0002, "epoch": 1.18463524854745, "step": 3670}, {"loss": 0.7821, "grad_norm": 0.6677889823913574, "learning_rate": 0.0002, "epoch": 1.1878631375080697, "step": 3680}, {"loss": 0.7813, "grad_norm": 0.47095245122909546, "learning_rate": 0.0002, "epoch": 1.1910910264686896, "step": 3690}, {"loss": 0.7702, "grad_norm": 0.5197778940200806, "learning_rate": 0.0002, "epoch": 1.1943189154293092, "step": 3700}, {"loss": 0.7349, "grad_norm": 0.5156530141830444, "learning_rate": 0.0002, "epoch": 1.1975468043899289, "step": 3710}, {"loss": 0.7738, "grad_norm": 0.6968549489974976, "learning_rate": 0.0002, "epoch": 1.2007746933505488, "step": 3720}, {"loss": 0.7599, "grad_norm": 0.48983848094940186, "learning_rate": 0.0002, "epoch": 1.2040025823111684, "step": 3730}, {"loss": 0.7163, "grad_norm": 0.6709973216056824, "learning_rate": 0.0002, "epoch": 1.2072304712717883, "step": 3740}, {"loss": 0.7632, "grad_norm": 0.48681750893592834, "learning_rate": 0.0002, "epoch": 1.210458360232408, "step": 3750}, {"loss": 0.7039, "grad_norm": 0.49475061893463135, "learning_rate": 0.0002, "epoch": 1.2136862491930278, "step": 3760}, {"loss": 0.7372, "grad_norm": 0.6163983345031738, "learning_rate": 0.0002, "epoch": 1.2169141381536475, "step": 3770}, {"loss": 0.757, "grad_norm": 0.5481411218643188, "learning_rate": 0.0002, "epoch": 1.2201420271142673, "step": 3780}, {"loss": 0.7601, "grad_norm": 0.620639979839325, "learning_rate": 0.0002, "epoch": 1.223369916074887, "step": 3790}, {"loss": 0.7738, "grad_norm": 0.7017222046852112, "learning_rate": 0.0002, "epoch": 1.2265978050355069, "step": 3800}, {"loss": 0.7468, "grad_norm": 0.5872400403022766, "learning_rate": 0.0002, "epoch": 1.2298256939961265, "step": 3810}, {"loss": 0.7854, "grad_norm": 0.45765596628189087, "learning_rate": 0.0002, "epoch": 1.2330535829567464, "step": 3820}, {"loss": 0.7865, "grad_norm": 0.5676377415657043, "learning_rate": 0.0002, "epoch": 1.236281471917366, "step": 3830}, {"loss": 0.7696, "grad_norm": 0.4793425500392914, "learning_rate": 0.0002, "epoch": 1.2395093608779857, "step": 3840}, {"loss": 0.7065, "grad_norm": 0.5060022473335266, "learning_rate": 0.0002, "epoch": 1.2427372498386056, "step": 3850}, {"loss": 0.7333, "grad_norm": 0.6140682697296143, "learning_rate": 0.0002, "epoch": 1.2459651387992252, "step": 3860}, {"loss": 0.7496, "grad_norm": 0.5030326843261719, "learning_rate": 0.0002, "epoch": 1.249193027759845, "step": 3870}, {"loss": 0.7226, "grad_norm": 0.6609430909156799, "learning_rate": 0.0002, "epoch": 1.2524209167204647, "step": 3880}, {"loss": 0.7212, "grad_norm": 0.5459545850753784, "learning_rate": 0.0002, "epoch": 1.2556488056810846, "step": 3890}, {"loss": 0.7145, "grad_norm": 0.5328870415687561, "learning_rate": 0.0002, "epoch": 1.2588766946417043, "step": 3900}, {"loss": 0.7572, "grad_norm": 0.5840652585029602, "learning_rate": 0.0002, "epoch": 1.2621045836023241, "step": 3910}, {"loss": 0.7624, "grad_norm": 0.5587584376335144, "learning_rate": 0.0002, "epoch": 1.2653324725629438, "step": 3920}, {"loss": 0.7846, "grad_norm": 0.5886949896812439, "learning_rate": 0.0002, "epoch": 1.2685603615235637, "step": 3930}, {"loss": 0.7251, "grad_norm": 0.5128693580627441, "learning_rate": 0.0002, "epoch": 1.2717882504841833, "step": 3940}, {"loss": 0.7032, "grad_norm": 0.6207669377326965, "learning_rate": 0.0002, "epoch": 1.2750161394448032, "step": 3950}, {"loss": 0.7506, "grad_norm": 0.5789574384689331, "learning_rate": 0.0002, "epoch": 1.2782440284054228, "step": 3960}, {"loss": 0.7574, "grad_norm": 0.503162145614624, "learning_rate": 0.0002, "epoch": 1.2814719173660425, "step": 3970}, {"loss": 0.7489, "grad_norm": 0.6670064926147461, "learning_rate": 0.0002, "epoch": 1.2846998063266624, "step": 3980}, {"loss": 0.7198, "grad_norm": 0.5676213502883911, "learning_rate": 0.0002, "epoch": 1.2879276952872822, "step": 3990}, {"loss": 0.7892, "grad_norm": 0.5383169054985046, "learning_rate": 0.0002, "epoch": 1.2911555842479019, "step": 4000}, {"loss": 0.7432, "grad_norm": 0.714743971824646, "learning_rate": 0.0002, "epoch": 1.2943834732085215, "step": 4010}, {"loss": 0.7594, "grad_norm": 0.5740262269973755, "learning_rate": 0.0002, "epoch": 1.2976113621691414, "step": 4020}, {"loss": 0.7564, "grad_norm": 0.6143045425415039, "learning_rate": 0.0002, "epoch": 1.300839251129761, "step": 4030}, {"loss": 0.7181, "grad_norm": 0.501025378704071, "learning_rate": 0.0002, "epoch": 1.304067140090381, "step": 4040}, {"loss": 0.7099, "grad_norm": 0.5784100294113159, "learning_rate": 0.0002, "epoch": 1.3072950290510006, "step": 4050}, {"loss": 0.7403, "grad_norm": 0.6182606220245361, "learning_rate": 0.0002, "epoch": 1.3105229180116205, "step": 4060}, {"loss": 0.7249, "grad_norm": 0.5072231292724609, "learning_rate": 0.0002, "epoch": 1.3137508069722401, "step": 4070}, {"loss": 0.7451, "grad_norm": 0.6841012835502625, "learning_rate": 0.0002, "epoch": 1.31697869593286, "step": 4080}, {"loss": 0.7395, "grad_norm": 0.697257936000824, "learning_rate": 0.0002, "epoch": 1.3202065848934796, "step": 4090}, {"loss": 0.7401, "grad_norm": 0.5113214254379272, "learning_rate": 0.0002, "epoch": 1.3234344738540993, "step": 4100}, {"loss": 0.7336, "grad_norm": 0.6270561814308167, "learning_rate": 0.0002, "epoch": 1.3266623628147192, "step": 4110}, {"loss": 0.7535, "grad_norm": 0.5525947213172913, "learning_rate": 0.0002, "epoch": 1.329890251775339, "step": 4120}, {"loss": 0.6999, "grad_norm": 0.546071469783783, "learning_rate": 0.0002, "epoch": 1.3331181407359587, "step": 4130}, {"loss": 0.7884, "grad_norm": 0.6516721248626709, "learning_rate": 0.0002, "epoch": 1.3363460296965783, "step": 4140}, {"loss": 0.755, "grad_norm": 0.6235111355781555, "learning_rate": 0.0002, "epoch": 1.3395739186571982, "step": 4150}, {"loss": 0.7467, "grad_norm": 0.538649320602417, "learning_rate": 0.0002, "epoch": 1.3428018076178179, "step": 4160}, {"loss": 0.7368, "grad_norm": 0.5367001891136169, "learning_rate": 0.0002, "epoch": 1.3460296965784377, "step": 4170}, {"loss": 0.7536, "grad_norm": 0.6134631037712097, "learning_rate": 0.0002, "epoch": 1.3492575855390574, "step": 4180}, {"loss": 0.8245, "grad_norm": 0.5827262997627258, "learning_rate": 0.0002, "epoch": 1.3524854744996773, "step": 4190}, {"loss": 0.7288, "grad_norm": 0.5706096291542053, "learning_rate": 0.0002, "epoch": 1.355713363460297, "step": 4200}, {"loss": 0.7302, "grad_norm": 0.6422057151794434, "learning_rate": 0.0002, "epoch": 1.3589412524209168, "step": 4210}, {"loss": 0.7303, "grad_norm": 0.6316141486167908, "learning_rate": 0.0002, "epoch": 1.3621691413815364, "step": 4220}, {"loss": 0.7457, "grad_norm": 0.6946983933448792, "learning_rate": 0.0002, "epoch": 1.365397030342156, "step": 4230}, {"loss": 0.7388, "grad_norm": 0.5381525754928589, "learning_rate": 0.0002, "epoch": 1.368624919302776, "step": 4240}, {"loss": 0.73, "grad_norm": 0.5484845638275146, "learning_rate": 0.0002, "epoch": 1.3718528082633958, "step": 4250}, {"loss": 0.7584, "grad_norm": 0.5961896777153015, "learning_rate": 0.0002, "epoch": 1.3750806972240155, "step": 4260}, {"loss": 0.8006, "grad_norm": 0.6041752696037292, "learning_rate": 0.0002, "epoch": 1.3783085861846351, "step": 4270}, {"loss": 0.7276, "grad_norm": 0.6283464431762695, "learning_rate": 0.0002, "epoch": 1.381536475145255, "step": 4280}, {"loss": 0.757, "grad_norm": 0.6761324405670166, "learning_rate": 0.0002, "epoch": 1.384764364105875, "step": 4290}, {"loss": 0.7381, "grad_norm": 0.504311203956604, "learning_rate": 0.0002, "epoch": 1.3879922530664945, "step": 4300}, {"loss": 0.7536, "grad_norm": 0.6100395917892456, "learning_rate": 0.0002, "epoch": 1.3912201420271142, "step": 4310}, {"loss": 0.7103, "grad_norm": 0.6245788335800171, "learning_rate": 0.0002, "epoch": 1.394448030987734, "step": 4320}, {"loss": 0.7505, "grad_norm": 0.6074621081352234, "learning_rate": 0.0002, "epoch": 1.3976759199483537, "step": 4330}, {"loss": 0.752, "grad_norm": 0.6683838963508606, "learning_rate": 0.0002, "epoch": 1.4009038089089736, "step": 4340}, {"loss": 0.7537, "grad_norm": 0.622998058795929, "learning_rate": 0.0002, "epoch": 1.4041316978695932, "step": 4350}, {"loss": 0.8148, "grad_norm": 0.6089423894882202, "learning_rate": 0.0002, "epoch": 1.4073595868302131, "step": 4360}, {"loss": 0.7715, "grad_norm": 0.6381658911705017, "learning_rate": 0.0002, "epoch": 1.4105874757908328, "step": 4370}, {"loss": 0.7871, "grad_norm": 0.5419308543205261, "learning_rate": 0.0002, "epoch": 1.4138153647514526, "step": 4380}, {"loss": 0.7386, "grad_norm": 0.6026232242584229, "learning_rate": 0.0002, "epoch": 1.4170432537120723, "step": 4390}, {"loss": 0.7529, "grad_norm": 0.4911101162433624, "learning_rate": 0.0002, "epoch": 1.420271142672692, "step": 4400}, {"loss": 0.7495, "grad_norm": 0.6302908062934875, "learning_rate": 0.0002, "epoch": 1.4234990316333118, "step": 4410}, {"loss": 0.7446, "grad_norm": 0.6692768931388855, "learning_rate": 0.0002, "epoch": 1.4267269205939317, "step": 4420}, {"loss": 0.7312, "grad_norm": 0.46294572949409485, "learning_rate": 0.0002, "epoch": 1.4299548095545513, "step": 4430}, {"loss": 0.7255, "grad_norm": 0.5452619194984436, "learning_rate": 0.0002, "epoch": 1.433182698515171, "step": 4440}, {"loss": 0.7974, "grad_norm": 0.7809233069419861, "learning_rate": 0.0002, "epoch": 1.4364105874757909, "step": 4450}, {"loss": 0.7103, "grad_norm": 0.550088107585907, "learning_rate": 0.0002, "epoch": 1.4396384764364105, "step": 4460}, {"loss": 0.7088, "grad_norm": 0.7139151096343994, "learning_rate": 0.0002, "epoch": 1.4428663653970304, "step": 4470}, {"loss": 0.7358, "grad_norm": 0.6187090873718262, "learning_rate": 0.0002, "epoch": 1.44609425435765, "step": 4480}, {"loss": 0.7608, "grad_norm": 0.5948249101638794, "learning_rate": 0.0002, "epoch": 1.44932214331827, "step": 4490}, {"loss": 0.7582, "grad_norm": 0.6510892510414124, "learning_rate": 0.0002, "epoch": 1.4525500322788896, "step": 4500}, {"loss": 0.7105, "grad_norm": 0.6552293300628662, "learning_rate": 0.0002, "epoch": 1.4557779212395094, "step": 4510}, {"loss": 0.7965, "grad_norm": 0.585574209690094, "learning_rate": 0.0002, "epoch": 1.459005810200129, "step": 4520}, {"loss": 0.761, "grad_norm": 0.4830162823200226, "learning_rate": 0.0002, "epoch": 1.4622336991607487, "step": 4530}, {"loss": 0.7424, "grad_norm": 0.5780223608016968, "learning_rate": 0.0002, "epoch": 1.4654615881213686, "step": 4540}, {"loss": 0.7518, "grad_norm": 0.5462607145309448, "learning_rate": 0.0002, "epoch": 1.4686894770819885, "step": 4550}, {"loss": 0.7342, "grad_norm": 0.5183546543121338, "learning_rate": 0.0002, "epoch": 1.4719173660426081, "step": 4560}, {"loss": 0.71, "grad_norm": 0.676917552947998, "learning_rate": 0.0002, "epoch": 1.4751452550032278, "step": 4570}, {"loss": 0.7875, "grad_norm": 0.5772345066070557, "learning_rate": 0.0002, "epoch": 1.4783731439638477, "step": 4580}, {"loss": 0.7709, "grad_norm": 0.7320035696029663, "learning_rate": 0.0002, "epoch": 1.4816010329244673, "step": 4590}, {"loss": 0.7601, "grad_norm": 0.5024042129516602, "learning_rate": 0.0002, "epoch": 1.4848289218850872, "step": 4600}, {"loss": 0.8061, "grad_norm": 0.5482868552207947, "learning_rate": 0.0002, "epoch": 1.4880568108457068, "step": 4610}, {"loss": 0.714, "grad_norm": 0.5447399616241455, "learning_rate": 0.0002, "epoch": 1.4912846998063267, "step": 4620}, {"loss": 0.7959, "grad_norm": 0.5953414440155029, "learning_rate": 0.0002, "epoch": 1.4945125887669464, "step": 4630}, {"loss": 0.7463, "grad_norm": 0.6983066201210022, "learning_rate": 0.0002, "epoch": 1.4977404777275662, "step": 4640}, {"loss": 0.7877, "grad_norm": 0.586327075958252, "learning_rate": 0.0002, "epoch": 1.500968366688186, "step": 4650}, {"loss": 0.7169, "grad_norm": 0.5839682221412659, "learning_rate": 0.0002, "epoch": 1.5041962556488055, "step": 4660}, {"loss": 0.7524, "grad_norm": 0.5959209203720093, "learning_rate": 0.0002, "epoch": 1.5074241446094254, "step": 4670}, {"loss": 0.7615, "grad_norm": 0.5073857307434082, "learning_rate": 0.0002, "epoch": 1.5106520335700453, "step": 4680}, {"loss": 0.7258, "grad_norm": 0.5183001160621643, "learning_rate": 0.0002, "epoch": 1.513879922530665, "step": 4690}, {"loss": 0.784, "grad_norm": 0.593530535697937, "learning_rate": 0.0002, "epoch": 1.5171078114912846, "step": 4700}, {"loss": 0.7722, "grad_norm": 0.675993025302887, "learning_rate": 0.0002, "epoch": 1.5203357004519045, "step": 4710}, {"loss": 0.7485, "grad_norm": 0.5823286771774292, "learning_rate": 0.0002, "epoch": 1.5235635894125243, "step": 4720}, {"loss": 0.7474, "grad_norm": 0.5825035572052002, "learning_rate": 0.0002, "epoch": 1.526791478373144, "step": 4730}, {"loss": 0.8287, "grad_norm": 0.5689691305160522, "learning_rate": 0.0002, "epoch": 1.5300193673337636, "step": 4740}, {"loss": 0.7279, "grad_norm": 0.6037150621414185, "learning_rate": 0.0002, "epoch": 1.5332472562943835, "step": 4750}, {"loss": 0.7865, "grad_norm": 0.6393677592277527, "learning_rate": 0.0002, "epoch": 1.5364751452550034, "step": 4760}, {"loss": 0.805, "grad_norm": 0.5926381945610046, "learning_rate": 0.0002, "epoch": 1.539703034215623, "step": 4770}, {"loss": 0.7425, "grad_norm": 0.9468599557876587, "learning_rate": 0.0002, "epoch": 1.5429309231762427, "step": 4780}, {"loss": 0.7565, "grad_norm": 0.7544237375259399, "learning_rate": 0.0002, "epoch": 1.5461588121368623, "step": 4790}, {"loss": 0.7398, "grad_norm": 0.5308566093444824, "learning_rate": 0.0002, "epoch": 1.5493867010974822, "step": 4800}, {"loss": 0.7756, "grad_norm": 0.6590296030044556, "learning_rate": 0.0002, "epoch": 1.552614590058102, "step": 4810}, {"loss": 0.7212, "grad_norm": 0.5630404353141785, "learning_rate": 0.0002, "epoch": 1.5558424790187217, "step": 4820}, {"loss": 0.7593, "grad_norm": 0.6800200939178467, "learning_rate": 0.0002, "epoch": 1.5590703679793414, "step": 4830}, {"loss": 0.7373, "grad_norm": 0.5463718175888062, "learning_rate": 0.0002, "epoch": 1.5622982569399613, "step": 4840}, {"loss": 0.7519, "grad_norm": 0.505135178565979, "learning_rate": 0.0002, "epoch": 1.5655261459005811, "step": 4850}, {"loss": 0.8122, "grad_norm": 0.5469676852226257, "learning_rate": 0.0002, "epoch": 1.5687540348612008, "step": 4860}, {"loss": 0.7185, "grad_norm": 0.5318337678909302, "learning_rate": 0.0002, "epoch": 1.5719819238218204, "step": 4870}, {"loss": 0.7324, "grad_norm": 0.7287914752960205, "learning_rate": 0.0002, "epoch": 1.5752098127824403, "step": 4880}, {"loss": 0.7532, "grad_norm": 0.7318989038467407, "learning_rate": 0.0002, "epoch": 1.5784377017430602, "step": 4890}, {"loss": 0.7851, "grad_norm": 0.6499921679496765, "learning_rate": 0.0002, "epoch": 1.5816655907036798, "step": 4900}, {"loss": 0.753, "grad_norm": 0.47907355427742004, "learning_rate": 0.0002, "epoch": 1.5848934796642995, "step": 4910}, {"loss": 0.7699, "grad_norm": 0.7338833808898926, "learning_rate": 0.0002, "epoch": 1.5881213686249191, "step": 4920}, {"loss": 0.7592, "grad_norm": 0.5800719261169434, "learning_rate": 0.0002, "epoch": 1.591349257585539, "step": 4930}, {"loss": 0.7211, "grad_norm": 0.5365763306617737, "learning_rate": 0.0002, "epoch": 1.594577146546159, "step": 4940}, {"loss": 0.777, "grad_norm": 0.5800772309303284, "learning_rate": 0.0002, "epoch": 1.5978050355067785, "step": 4950}, {"loss": 0.8027, "grad_norm": 0.7878010869026184, "learning_rate": 0.0002, "epoch": 1.6010329244673982, "step": 4960}, {"loss": 0.7894, "grad_norm": 0.5919058918952942, "learning_rate": 0.0002, "epoch": 1.604260813428018, "step": 4970}, {"loss": 0.7762, "grad_norm": 0.5004435181617737, "learning_rate": 0.0002, "epoch": 1.607488702388638, "step": 4980}, {"loss": 0.7447, "grad_norm": 0.6299242377281189, "learning_rate": 0.0002, "epoch": 1.6107165913492576, "step": 4990}, {"loss": 0.7149, "grad_norm": 0.6307242512702942, "learning_rate": 0.0002, "epoch": 1.6139444803098772, "step": 5000}, {"loss": 0.7693, "grad_norm": 0.7838703989982605, "learning_rate": 0.0002, "epoch": 1.6171723692704971, "step": 5010}, {"loss": 0.7364, "grad_norm": 0.6454671621322632, "learning_rate": 0.0002, "epoch": 1.620400258231117, "step": 5020}, {"loss": 0.74, "grad_norm": 0.5907095670700073, "learning_rate": 0.0002, "epoch": 1.6236281471917366, "step": 5030}, {"loss": 0.7331, "grad_norm": 0.6053501963615417, "learning_rate": 0.0002, "epoch": 1.6268560361523563, "step": 5040}, {"loss": 0.6987, "grad_norm": 0.5644670128822327, "learning_rate": 0.0002, "epoch": 1.630083925112976, "step": 5050}, {"loss": 0.7886, "grad_norm": 0.6320949792861938, "learning_rate": 0.0002, "epoch": 1.6333118140735958, "step": 5060}, {"loss": 0.7109, "grad_norm": 0.6101489067077637, "learning_rate": 0.0002, "epoch": 1.6365397030342157, "step": 5070}, {"loss": 0.6922, "grad_norm": 0.9435283541679382, "learning_rate": 0.0002, "epoch": 1.6397675919948353, "step": 5080}, {"loss": 0.729, "grad_norm": 0.6668919324874878, "learning_rate": 0.0002, "epoch": 1.642995480955455, "step": 5090}, {"loss": 0.7402, "grad_norm": 0.6160340905189514, "learning_rate": 0.0002, "epoch": 1.6462233699160749, "step": 5100}, {"loss": 0.7461, "grad_norm": 0.5999835729598999, "learning_rate": 0.0002, "epoch": 1.6494512588766947, "step": 5110}, {"loss": 0.7661, "grad_norm": 0.9378551840782166, "learning_rate": 0.0002, "epoch": 1.6526791478373144, "step": 5120}, {"loss": 0.7586, "grad_norm": 0.4795055389404297, "learning_rate": 0.0002, "epoch": 1.655907036797934, "step": 5130}, {"loss": 0.7342, "grad_norm": 0.4878861606121063, "learning_rate": 0.0002, "epoch": 1.659134925758554, "step": 5140}, {"loss": 0.7362, "grad_norm": 0.6042965054512024, "learning_rate": 0.0002, "epoch": 1.6623628147191738, "step": 5150}, {"loss": 0.7863, "grad_norm": 0.5829901695251465, "learning_rate": 0.0002, "epoch": 1.6655907036797934, "step": 5160}, {"loss": 0.7498, "grad_norm": 0.5168480277061462, "learning_rate": 0.0002, "epoch": 1.668818592640413, "step": 5170}, {"loss": 0.7333, "grad_norm": 0.6489511132240295, "learning_rate": 0.0002, "epoch": 1.672046481601033, "step": 5180}, {"loss": 0.7257, "grad_norm": 0.5955966114997864, "learning_rate": 0.0002, "epoch": 1.6752743705616526, "step": 5190}, {"loss": 0.7938, "grad_norm": 0.6228088140487671, "learning_rate": 0.0002, "epoch": 1.6785022595222725, "step": 5200}, {"loss": 0.7626, "grad_norm": 0.5726390480995178, "learning_rate": 0.0002, "epoch": 1.6817301484828922, "step": 5210}, {"loss": 0.7479, "grad_norm": 0.6116343140602112, "learning_rate": 0.0002, "epoch": 1.6849580374435118, "step": 5220}, {"loss": 0.7169, "grad_norm": 0.5483687520027161, "learning_rate": 0.0002, "epoch": 1.6881859264041317, "step": 5230}, {"loss": 0.7293, "grad_norm": 0.570941686630249, "learning_rate": 0.0002, "epoch": 1.6914138153647515, "step": 5240}, {"loss": 0.723, "grad_norm": 0.6048086285591125, "learning_rate": 0.0002, "epoch": 1.6946417043253712, "step": 5250}, {"loss": 0.7861, "grad_norm": 0.6769003868103027, "learning_rate": 0.0002, "epoch": 1.6978695932859909, "step": 5260}, {"loss": 0.7885, "grad_norm": 0.5629057884216309, "learning_rate": 0.0002, "epoch": 1.7010974822466107, "step": 5270}, {"loss": 0.7693, "grad_norm": 0.657341480255127, "learning_rate": 0.0002, "epoch": 1.7043253712072306, "step": 5280}, {"loss": 0.7357, "grad_norm": 0.6256147623062134, "learning_rate": 0.0002, "epoch": 1.7075532601678503, "step": 5290}, {"loss": 0.714, "grad_norm": 0.5498088002204895, "learning_rate": 0.0002, "epoch": 1.71078114912847, "step": 5300}, {"loss": 0.7669, "grad_norm": 0.5078358054161072, "learning_rate": 0.0002, "epoch": 1.7140090380890898, "step": 5310}, {"loss": 0.7872, "grad_norm": 0.6696692705154419, "learning_rate": 0.0002, "epoch": 1.7172369270497096, "step": 5320}, {"loss": 0.8205, "grad_norm": 0.6692847013473511, "learning_rate": 0.0002, "epoch": 1.7204648160103293, "step": 5330}, {"loss": 0.7432, "grad_norm": 0.5415751934051514, "learning_rate": 0.0002, "epoch": 1.723692704970949, "step": 5340}, {"loss": 0.7499, "grad_norm": 0.5367611050605774, "learning_rate": 0.0002, "epoch": 1.7269205939315686, "step": 5350}, {"loss": 0.7631, "grad_norm": 0.7321061491966248, "learning_rate": 0.0002, "epoch": 1.7301484828921885, "step": 5360}, {"loss": 0.7827, "grad_norm": 0.723972499370575, "learning_rate": 0.0002, "epoch": 1.7333763718528084, "step": 5370}, {"loss": 0.7077, "grad_norm": 0.7328100204467773, "learning_rate": 0.0002, "epoch": 1.736604260813428, "step": 5380}, {"loss": 0.7503, "grad_norm": 0.5785264372825623, "learning_rate": 0.0002, "epoch": 1.7398321497740477, "step": 5390}, {"loss": 0.7188, "grad_norm": 0.7812932133674622, "learning_rate": 0.0002, "epoch": 1.7430600387346675, "step": 5400}, {"loss": 0.7386, "grad_norm": 0.6493327617645264, "learning_rate": 0.0002, "epoch": 1.7462879276952874, "step": 5410}, {"loss": 0.7487, "grad_norm": 0.5825939774513245, "learning_rate": 0.0002, "epoch": 1.749515816655907, "step": 5420}, {"loss": 0.7625, "grad_norm": 0.6969610452651978, "learning_rate": 0.0002, "epoch": 1.7527437056165267, "step": 5430}, {"loss": 0.7512, "grad_norm": 0.5558062195777893, "learning_rate": 0.0002, "epoch": 1.7559715945771466, "step": 5440}, {"loss": 0.7256, "grad_norm": 0.49222221970558167, "learning_rate": 0.0002, "epoch": 1.7591994835377665, "step": 5450}, {"loss": 0.7477, "grad_norm": 0.5844656825065613, "learning_rate": 0.0002, "epoch": 1.762427372498386, "step": 5460}, {"loss": 0.7695, "grad_norm": 0.8706597685813904, "learning_rate": 0.0002, "epoch": 1.7656552614590058, "step": 5470}, {"loss": 0.7582, "grad_norm": 0.6167706251144409, "learning_rate": 0.0002, "epoch": 1.7688831504196254, "step": 5480}, {"loss": 0.7521, "grad_norm": 0.5890011787414551, "learning_rate": 0.0002, "epoch": 1.7721110393802453, "step": 5490}, {"loss": 0.8319, "grad_norm": 0.6551728248596191, "learning_rate": 0.0002, "epoch": 1.7753389283408652, "step": 5500}, {"loss": 0.7615, "grad_norm": 0.5848751068115234, "learning_rate": 0.0002, "epoch": 1.7785668173014848, "step": 5510}, {"loss": 0.7622, "grad_norm": 0.6664014458656311, "learning_rate": 0.0002, "epoch": 1.7817947062621045, "step": 5520}, {"loss": 0.7544, "grad_norm": 0.5931693911552429, "learning_rate": 0.0002, "epoch": 1.7850225952227243, "step": 5530}, {"loss": 0.7992, "grad_norm": 0.5534724593162537, "learning_rate": 0.0002, "epoch": 1.7882504841833442, "step": 5540}, {"loss": 0.7967, "grad_norm": 0.5590878129005432, "learning_rate": 0.0002, "epoch": 1.7914783731439639, "step": 5550}, {"loss": 0.7406, "grad_norm": 0.6947470903396606, "learning_rate": 0.0002, "epoch": 1.7947062621045835, "step": 5560}, {"loss": 0.7614, "grad_norm": 0.6104130148887634, "learning_rate": 0.0002, "epoch": 1.7979341510652034, "step": 5570}, {"loss": 0.8032, "grad_norm": 0.6135714054107666, "learning_rate": 0.0002, "epoch": 1.8011620400258233, "step": 5580}, {"loss": 0.7403, "grad_norm": 0.6626853346824646, "learning_rate": 0.0002, "epoch": 1.804389928986443, "step": 5590}, {"loss": 0.7746, "grad_norm": 0.6977612972259521, "learning_rate": 0.0002, "epoch": 1.8076178179470626, "step": 5600}, {"loss": 0.7899, "grad_norm": 0.6275238394737244, "learning_rate": 0.0002, "epoch": 1.8108457069076824, "step": 5610}, {"loss": 0.7392, "grad_norm": 0.5017505288124084, "learning_rate": 0.0002, "epoch": 1.814073595868302, "step": 5620}, {"loss": 0.7669, "grad_norm": 0.8314290642738342, "learning_rate": 0.0002, "epoch": 1.817301484828922, "step": 5630}, {"loss": 0.7031, "grad_norm": 0.6863582134246826, "learning_rate": 0.0002, "epoch": 1.8205293737895416, "step": 5640}, {"loss": 0.743, "grad_norm": 0.69544917345047, "learning_rate": 0.0002, "epoch": 1.8237572627501613, "step": 5650}, {"loss": 0.7277, "grad_norm": 0.515499472618103, "learning_rate": 0.0002, "epoch": 1.8269851517107811, "step": 5660}, {"loss": 0.7166, "grad_norm": 0.6100873947143555, "learning_rate": 0.0002, "epoch": 1.830213040671401, "step": 5670}, {"loss": 0.7217, "grad_norm": 0.67416912317276, "learning_rate": 0.0002, "epoch": 1.8334409296320207, "step": 5680}, {"loss": 0.7575, "grad_norm": 0.7057772278785706, "learning_rate": 0.0002, "epoch": 1.8366688185926403, "step": 5690}, {"loss": 0.7483, "grad_norm": 0.7374551892280579, "learning_rate": 0.0002, "epoch": 1.8398967075532602, "step": 5700}, {"loss": 0.81, "grad_norm": 0.6266297101974487, "learning_rate": 0.0002, "epoch": 1.84312459651388, "step": 5710}, {"loss": 0.728, "grad_norm": 0.5629227757453918, "learning_rate": 0.0002, "epoch": 1.8463524854744997, "step": 5720}, {"loss": 0.8043, "grad_norm": 0.6603655815124512, "learning_rate": 0.0002, "epoch": 1.8495803744351194, "step": 5730}, {"loss": 0.7587, "grad_norm": 0.8113715052604675, "learning_rate": 0.0002, "epoch": 1.8528082633957392, "step": 5740}, {"loss": 0.7486, "grad_norm": 0.7143914103507996, "learning_rate": 0.0002, "epoch": 1.856036152356359, "step": 5750}, {"loss": 0.7619, "grad_norm": 0.6273732781410217, "learning_rate": 0.0002, "epoch": 1.8592640413169788, "step": 5760}, {"loss": 0.7962, "grad_norm": 0.5428690910339355, "learning_rate": 0.0002, "epoch": 1.8624919302775984, "step": 5770}, {"loss": 0.7581, "grad_norm": 0.6405037641525269, "learning_rate": 0.0002, "epoch": 1.865719819238218, "step": 5780}, {"loss": 0.7569, "grad_norm": 0.700873613357544, "learning_rate": 0.0002, "epoch": 1.868947708198838, "step": 5790}, {"loss": 0.7353, "grad_norm": 0.5645238161087036, "learning_rate": 0.0002, "epoch": 1.8721755971594578, "step": 5800}, {"loss": 0.8037, "grad_norm": 0.8780353665351868, "learning_rate": 0.0002, "epoch": 1.8754034861200775, "step": 5810}, {"loss": 0.7686, "grad_norm": 0.6295409798622131, "learning_rate": 0.0002, "epoch": 1.878631375080697, "step": 5820}, {"loss": 0.8067, "grad_norm": 0.678269624710083, "learning_rate": 0.0002, "epoch": 1.881859264041317, "step": 5830}, {"loss": 0.7537, "grad_norm": 0.6464608907699585, "learning_rate": 0.0002, "epoch": 1.8850871530019369, "step": 5840}, {"loss": 0.7423, "grad_norm": 0.6201048493385315, "learning_rate": 0.0002, "epoch": 1.8883150419625565, "step": 5850}, {"loss": 0.7694, "grad_norm": 0.6046274304389954, "learning_rate": 0.0002, "epoch": 1.8915429309231762, "step": 5860}, {"loss": 0.781, "grad_norm": 0.7532408833503723, "learning_rate": 0.0002, "epoch": 1.894770819883796, "step": 5870}, {"loss": 0.6885, "grad_norm": 0.6066767573356628, "learning_rate": 0.0002, "epoch": 1.897998708844416, "step": 5880}, {"loss": 0.7631, "grad_norm": 0.6289830207824707, "learning_rate": 0.0002, "epoch": 1.9012265978050356, "step": 5890}, {"loss": 0.7501, "grad_norm": 0.5204319953918457, "learning_rate": 0.0002, "epoch": 1.9044544867656552, "step": 5900}, {"loss": 0.7335, "grad_norm": 0.6708219647407532, "learning_rate": 0.0002, "epoch": 1.9076823757262749, "step": 5910}, {"loss": 0.7455, "grad_norm": 0.4915677309036255, "learning_rate": 0.0002, "epoch": 1.9109102646868947, "step": 5920}, {"loss": 0.7464, "grad_norm": 0.652717113494873, "learning_rate": 0.0002, "epoch": 1.9141381536475146, "step": 5930}, {"loss": 0.7687, "grad_norm": 0.5446316003799438, "learning_rate": 0.0002, "epoch": 1.9173660426081343, "step": 5940}, {"loss": 0.7424, "grad_norm": 0.4958149194717407, "learning_rate": 0.0002, "epoch": 1.920593931568754, "step": 5950}, {"loss": 0.757, "grad_norm": 0.5623434782028198, "learning_rate": 0.0002, "epoch": 1.9238218205293738, "step": 5960}, {"loss": 0.7446, "grad_norm": 0.6855450868606567, "learning_rate": 0.0002, "epoch": 1.9270497094899937, "step": 5970}, {"loss": 0.827, "grad_norm": 0.5710492730140686, "learning_rate": 0.0002, "epoch": 1.9302775984506133, "step": 5980}, {"loss": 0.7245, "grad_norm": 0.5379431843757629, "learning_rate": 0.0002, "epoch": 1.933505487411233, "step": 5990}, {"loss": 0.77, "grad_norm": 0.557129442691803, "learning_rate": 0.0002, "epoch": 1.9367333763718528, "step": 6000}, {"loss": 0.6988, "grad_norm": 0.6336663961410522, "learning_rate": 0.0002, "epoch": 1.9399612653324727, "step": 6010}, {"loss": 0.7316, "grad_norm": 0.5950582027435303, "learning_rate": 0.0002, "epoch": 1.9431891542930924, "step": 6020}, {"loss": 0.7443, "grad_norm": 0.5905954837799072, "learning_rate": 0.0002, "epoch": 1.946417043253712, "step": 6030}, {"loss": 0.7127, "grad_norm": 0.6688982844352722, "learning_rate": 0.0002, "epoch": 1.9496449322143317, "step": 6040}, {"loss": 0.79, "grad_norm": 0.5440775752067566, "learning_rate": 0.0002, "epoch": 1.9528728211749515, "step": 6050}, {"loss": 0.7221, "grad_norm": 0.6207906603813171, "learning_rate": 0.0002, "epoch": 1.9561007101355714, "step": 6060}, {"loss": 0.738, "grad_norm": 0.6999374628067017, "learning_rate": 0.0002, "epoch": 1.959328599096191, "step": 6070}, {"loss": 0.7372, "grad_norm": 0.6310848593711853, "learning_rate": 0.0002, "epoch": 1.9625564880568107, "step": 6080}, {"loss": 0.7198, "grad_norm": 0.5903388261795044, "learning_rate": 0.0002, "epoch": 1.9657843770174306, "step": 6090}, {"loss": 0.7103, "grad_norm": 0.6333889961242676, "learning_rate": 0.0002, "epoch": 1.9690122659780505, "step": 6100}, {"loss": 0.7246, "grad_norm": 0.5604711174964905, "learning_rate": 0.0002, "epoch": 1.97224015493867, "step": 6110}, {"loss": 0.761, "grad_norm": 0.9234541654586792, "learning_rate": 0.0002, "epoch": 1.9754680438992898, "step": 6120}, {"loss": 0.7375, "grad_norm": 0.6149102449417114, "learning_rate": 0.0002, "epoch": 1.9786959328599096, "step": 6130}, {"loss": 0.7286, "grad_norm": 0.615446150302887, "learning_rate": 0.0002, "epoch": 1.9819238218205295, "step": 6140}, {"loss": 0.7333, "grad_norm": 0.5176635980606079, "learning_rate": 0.0002, "epoch": 1.9851517107811492, "step": 6150}, {"loss": 0.718, "grad_norm": 0.7124109864234924, "learning_rate": 0.0002, "epoch": 1.9883795997417688, "step": 6160}, {"loss": 0.7669, "grad_norm": 0.6317567825317383, "learning_rate": 0.0002, "epoch": 1.9916074887023887, "step": 6170}, {"loss": 0.8012, "grad_norm": 0.6855016350746155, "learning_rate": 0.0002, "epoch": 1.9948353776630086, "step": 6180}, {"loss": 0.7376, "grad_norm": 0.6423715353012085, "learning_rate": 0.0002, "epoch": 1.9980632666236282, "step": 6190}, {"eval_loss": 1.1096643209457397, "eval_runtime": 147.7997, "eval_samples_per_second": 4.959, "eval_steps_per_second": 0.622, "epoch": 2.0, "step": 6196}, {"loss": 0.7131, "grad_norm": 0.5322932600975037, "learning_rate": 0.0002, "epoch": 2.001291155584248, "step": 6200}, {"loss": 0.6619, "grad_norm": 0.8152306079864502, "learning_rate": 0.0002, "epoch": 2.0045190445448675, "step": 6210}, {"loss": 0.6731, "grad_norm": 0.6215983033180237, "learning_rate": 0.0002, "epoch": 2.0077469335054876, "step": 6220}, {"loss": 0.658, "grad_norm": 0.845498263835907, "learning_rate": 0.0002, "epoch": 2.0109748224661073, "step": 6230}, {"loss": 0.6954, "grad_norm": 0.733559787273407, "learning_rate": 0.0002, "epoch": 2.014202711426727, "step": 6240}, {"loss": 0.6707, "grad_norm": 0.51433926820755, "learning_rate": 0.0002, "epoch": 2.0174306003873466, "step": 6250}, {"loss": 0.6304, "grad_norm": 0.6374049782752991, "learning_rate": 0.0002, "epoch": 2.020658489347966, "step": 6260}, {"loss": 0.6831, "grad_norm": 0.7833638191223145, "learning_rate": 0.0002, "epoch": 2.0238863783085863, "step": 6270}, {"loss": 0.6672, "grad_norm": 0.8929463028907776, "learning_rate": 0.0002, "epoch": 2.027114267269206, "step": 6280}, {"loss": 0.637, "grad_norm": 0.669731855392456, "learning_rate": 0.0002, "epoch": 2.0303421562298256, "step": 6290}, {"loss": 0.646, "grad_norm": 0.5846071243286133, "learning_rate": 0.0002, "epoch": 2.0335700451904453, "step": 6300}, {"loss": 0.6647, "grad_norm": 0.7087787985801697, "learning_rate": 0.0002, "epoch": 2.0367979341510654, "step": 6310}, {"loss": 0.6433, "grad_norm": 0.6739160418510437, "learning_rate": 0.0002, "epoch": 2.040025823111685, "step": 6320}, {"loss": 0.6301, "grad_norm": 0.4860886335372925, "learning_rate": 0.0002, "epoch": 2.0432537120723047, "step": 6330}, {"loss": 0.6439, "grad_norm": 0.7201244831085205, "learning_rate": 0.0002, "epoch": 2.0464816010329243, "step": 6340}, {"loss": 0.6676, "grad_norm": 0.7409170269966125, "learning_rate": 0.0002, "epoch": 2.0497094899935444, "step": 6350}, {"loss": 0.6153, "grad_norm": 0.6843920350074768, "learning_rate": 0.0002, "epoch": 2.052937378954164, "step": 6360}, {"loss": 0.6674, "grad_norm": 0.7519999742507935, "learning_rate": 0.0002, "epoch": 2.0561652679147837, "step": 6370}, {"loss": 0.6928, "grad_norm": 0.5732819437980652, "learning_rate": 0.0002, "epoch": 2.0593931568754034, "step": 6380}, {"loss": 0.6496, "grad_norm": 0.7565118074417114, "learning_rate": 0.0002, "epoch": 2.062621045836023, "step": 6390}, {"loss": 0.6354, "grad_norm": 0.8147150278091431, "learning_rate": 0.0002, "epoch": 2.065848934796643, "step": 6400}, {"loss": 0.6593, "grad_norm": 0.6941924691200256, "learning_rate": 0.0002, "epoch": 2.0690768237572628, "step": 6410}, {"loss": 0.6698, "grad_norm": 0.6549784541130066, "learning_rate": 0.0002, "epoch": 2.0723047127178824, "step": 6420}, {"loss": 0.6927, "grad_norm": 0.7224905490875244, "learning_rate": 0.0002, "epoch": 2.075532601678502, "step": 6430}, {"loss": 0.6755, "grad_norm": 0.7754863500595093, "learning_rate": 0.0002, "epoch": 2.078760490639122, "step": 6440}, {"loss": 0.6738, "grad_norm": 0.691318154335022, "learning_rate": 0.0002, "epoch": 2.081988379599742, "step": 6450}, {"loss": 0.6233, "grad_norm": 0.6009294986724854, "learning_rate": 0.0002, "epoch": 2.0852162685603615, "step": 6460}, {"loss": 0.6691, "grad_norm": 0.6753945350646973, "learning_rate": 0.0002, "epoch": 2.088444157520981, "step": 6470}, {"loss": 0.6935, "grad_norm": 0.6899921298027039, "learning_rate": 0.0002, "epoch": 2.091672046481601, "step": 6480}, {"loss": 0.6918, "grad_norm": 0.846510648727417, "learning_rate": 0.0002, "epoch": 2.094899935442221, "step": 6490}, {"loss": 0.6084, "grad_norm": 0.6432605981826782, "learning_rate": 0.0002, "epoch": 2.0981278244028405, "step": 6500}, {"loss": 0.6867, "grad_norm": 0.8125239014625549, "learning_rate": 0.0002, "epoch": 2.10135571336346, "step": 6510}, {"loss": 0.6939, "grad_norm": 0.628302812576294, "learning_rate": 0.0002, "epoch": 2.1045836023240803, "step": 6520}, {"loss": 0.5909, "grad_norm": 0.7164334654808044, "learning_rate": 0.0002, "epoch": 2.1078114912847, "step": 6530}, {"loss": 0.6578, "grad_norm": 0.7476949095726013, "learning_rate": 0.0002, "epoch": 2.1110393802453196, "step": 6540}, {"loss": 0.6351, "grad_norm": 0.7577515840530396, "learning_rate": 0.0002, "epoch": 2.114267269205939, "step": 6550}, {"loss": 0.6669, "grad_norm": 0.5684467554092407, "learning_rate": 0.0002, "epoch": 2.117495158166559, "step": 6560}, {"loss": 0.6343, "grad_norm": 0.6121789216995239, "learning_rate": 0.0002, "epoch": 2.120723047127179, "step": 6570}, {"loss": 0.6314, "grad_norm": 0.6095348596572876, "learning_rate": 0.0002, "epoch": 2.1239509360877986, "step": 6580}, {"loss": 0.6276, "grad_norm": 0.7803651690483093, "learning_rate": 0.0002, "epoch": 2.1271788250484183, "step": 6590}, {"loss": 0.6579, "grad_norm": 0.5990583300590515, "learning_rate": 0.0002, "epoch": 2.130406714009038, "step": 6600}, {"loss": 0.6228, "grad_norm": 0.6569220423698425, "learning_rate": 0.0002, "epoch": 2.133634602969658, "step": 6610}, {"loss": 0.7049, "grad_norm": 0.5961166620254517, "learning_rate": 0.0002, "epoch": 2.1368624919302777, "step": 6620}, {"loss": 0.6359, "grad_norm": 0.5860554575920105, "learning_rate": 0.0002, "epoch": 2.1400903808908973, "step": 6630}, {"loss": 0.6651, "grad_norm": 0.5994001626968384, "learning_rate": 0.0002, "epoch": 2.143318269851517, "step": 6640}, {"loss": 0.6421, "grad_norm": 0.7723015546798706, "learning_rate": 0.0002, "epoch": 2.146546158812137, "step": 6650}, {"loss": 0.6723, "grad_norm": 0.676355242729187, "learning_rate": 0.0002, "epoch": 2.1497740477727567, "step": 6660}, {"loss": 0.6826, "grad_norm": 0.5689092874526978, "learning_rate": 0.0002, "epoch": 2.1530019367333764, "step": 6670}, {"loss": 0.6613, "grad_norm": 0.6933727264404297, "learning_rate": 0.0002, "epoch": 2.156229825693996, "step": 6680}, {"loss": 0.6957, "grad_norm": 0.8380527496337891, "learning_rate": 0.0002, "epoch": 2.159457714654616, "step": 6690}, {"loss": 0.6705, "grad_norm": 0.6876497268676758, "learning_rate": 0.0002, "epoch": 2.1626856036152358, "step": 6700}, {"loss": 0.6112, "grad_norm": 0.6418334245681763, "learning_rate": 0.0002, "epoch": 2.1659134925758554, "step": 6710}, {"loss": 0.6357, "grad_norm": 0.7169192433357239, "learning_rate": 0.0002, "epoch": 2.169141381536475, "step": 6720}, {"loss": 0.6492, "grad_norm": 0.6664170622825623, "learning_rate": 0.0002, "epoch": 2.1723692704970947, "step": 6730}, {"loss": 0.6751, "grad_norm": 0.6011993288993835, "learning_rate": 0.0002, "epoch": 2.175597159457715, "step": 6740}, {"loss": 0.696, "grad_norm": 0.5529947280883789, "learning_rate": 0.0002, "epoch": 2.1788250484183345, "step": 6750}, {"loss": 0.671, "grad_norm": 0.6879532933235168, "learning_rate": 0.0002, "epoch": 2.182052937378954, "step": 6760}, {"loss": 0.6634, "grad_norm": 0.6426113843917847, "learning_rate": 0.0002, "epoch": 2.1852808263395738, "step": 6770}, {"loss": 0.6592, "grad_norm": 0.6571047306060791, "learning_rate": 0.0002, "epoch": 2.188508715300194, "step": 6780}, {"loss": 0.6494, "grad_norm": 0.6400564908981323, "learning_rate": 0.0002, "epoch": 2.1917366042608135, "step": 6790}, {"loss": 0.6369, "grad_norm": 0.6509664058685303, "learning_rate": 0.0002, "epoch": 2.194964493221433, "step": 6800}, {"loss": 0.6771, "grad_norm": 0.6673197150230408, "learning_rate": 0.0002, "epoch": 2.198192382182053, "step": 6810}, {"loss": 0.6491, "grad_norm": 0.48205727338790894, "learning_rate": 0.0002, "epoch": 2.2014202711426725, "step": 6820}, {"loss": 0.6894, "grad_norm": 0.849525511264801, "learning_rate": 0.0002, "epoch": 2.2046481601032926, "step": 6830}, {"loss": 0.6977, "grad_norm": 0.6150892376899719, "learning_rate": 0.0002, "epoch": 2.207876049063912, "step": 6840}, {"loss": 0.6843, "grad_norm": 0.7826945781707764, "learning_rate": 0.0002, "epoch": 2.211103938024532, "step": 6850}, {"loss": 0.6338, "grad_norm": 0.5711963772773743, "learning_rate": 0.0002, "epoch": 2.2143318269851515, "step": 6860}, {"loss": 0.6585, "grad_norm": 0.6017758846282959, "learning_rate": 0.0002, "epoch": 2.2175597159457716, "step": 6870}, {"loss": 0.6657, "grad_norm": 0.785434901714325, "learning_rate": 0.0002, "epoch": 2.2207876049063913, "step": 6880}, {"loss": 0.7075, "grad_norm": 0.6251688599586487, "learning_rate": 0.0002, "epoch": 2.224015493867011, "step": 6890}, {"loss": 0.6564, "grad_norm": 0.8242034316062927, "learning_rate": 0.0002, "epoch": 2.2272433828276306, "step": 6900}, {"loss": 0.672, "grad_norm": 0.7272933125495911, "learning_rate": 0.0002, "epoch": 2.2304712717882507, "step": 6910}, {"loss": 0.6541, "grad_norm": 0.7159379720687866, "learning_rate": 0.0002, "epoch": 2.2336991607488703, "step": 6920}, {"loss": 0.6859, "grad_norm": 0.6518042087554932, "learning_rate": 0.0002, "epoch": 2.23692704970949, "step": 6930}, {"loss": 0.5987, "grad_norm": 0.7365370392799377, "learning_rate": 0.0002, "epoch": 2.2401549386701096, "step": 6940}, {"loss": 0.6511, "grad_norm": 0.5674061179161072, "learning_rate": 0.0002, "epoch": 2.2433828276307297, "step": 6950}, {"loss": 0.6748, "grad_norm": 0.669185996055603, "learning_rate": 0.0002, "epoch": 2.2466107165913494, "step": 6960}, {"loss": 0.656, "grad_norm": 0.6638304591178894, "learning_rate": 0.0002, "epoch": 2.249838605551969, "step": 6970}, {"loss": 0.636, "grad_norm": 0.757006824016571, "learning_rate": 0.0002, "epoch": 2.2530664945125887, "step": 6980}, {"loss": 0.6597, "grad_norm": 0.7574930787086487, "learning_rate": 0.0002, "epoch": 2.2562943834732083, "step": 6990}, {"loss": 0.6859, "grad_norm": 0.7819514870643616, "learning_rate": 0.0002, "epoch": 2.2595222724338284, "step": 7000}, {"loss": 0.6238, "grad_norm": 0.6987583041191101, "learning_rate": 0.0002, "epoch": 2.262750161394448, "step": 7010}, {"loss": 0.661, "grad_norm": 0.6628551483154297, "learning_rate": 0.0002, "epoch": 2.2659780503550677, "step": 7020}, {"loss": 0.6254, "grad_norm": 0.7855866551399231, "learning_rate": 0.0002, "epoch": 2.2692059393156874, "step": 7030}, {"loss": 0.6679, "grad_norm": 0.6102892756462097, "learning_rate": 0.0002, "epoch": 2.2724338282763075, "step": 7040}, {"loss": 0.694, "grad_norm": 0.7844198942184448, "learning_rate": 0.0002, "epoch": 2.275661717236927, "step": 7050}, {"loss": 0.63, "grad_norm": 0.6209492087364197, "learning_rate": 0.0002, "epoch": 2.2788896061975468, "step": 7060}, {"loss": 0.6418, "grad_norm": 0.8351290225982666, "learning_rate": 0.0002, "epoch": 2.2821174951581664, "step": 7070}, {"loss": 0.6648, "grad_norm": 0.6883546710014343, "learning_rate": 0.0002, "epoch": 2.285345384118786, "step": 7080}, {"loss": 0.7046, "grad_norm": 0.6626381874084473, "learning_rate": 0.0002, "epoch": 2.288573273079406, "step": 7090}, {"loss": 0.6535, "grad_norm": 0.7216270565986633, "learning_rate": 0.0002, "epoch": 2.291801162040026, "step": 7100}, {"loss": 0.6414, "grad_norm": 0.8246777057647705, "learning_rate": 0.0002, "epoch": 2.2950290510006455, "step": 7110}, {"loss": 0.6315, "grad_norm": 0.614326000213623, "learning_rate": 0.0002, "epoch": 2.2982569399612656, "step": 7120}, {"loss": 0.6303, "grad_norm": 0.8785578012466431, "learning_rate": 0.0002, "epoch": 2.301484828921885, "step": 7130}, {"loss": 0.6348, "grad_norm": 0.7021808624267578, "learning_rate": 0.0002, "epoch": 2.304712717882505, "step": 7140}, {"loss": 0.6738, "grad_norm": 0.6999403238296509, "learning_rate": 0.0002, "epoch": 2.3079406068431245, "step": 7150}, {"loss": 0.6547, "grad_norm": 0.8013143539428711, "learning_rate": 0.0002, "epoch": 2.311168495803744, "step": 7160}, {"loss": 0.6461, "grad_norm": 0.6592583060264587, "learning_rate": 0.0002, "epoch": 2.3143963847643643, "step": 7170}, {"loss": 0.6369, "grad_norm": 0.6260249018669128, "learning_rate": 0.0002, "epoch": 2.317624273724984, "step": 7180}, {"loss": 0.6647, "grad_norm": 0.9352797269821167, "learning_rate": 0.0002, "epoch": 2.3208521626856036, "step": 7190}, {"loss": 0.6543, "grad_norm": 0.6629612445831299, "learning_rate": 0.0002, "epoch": 2.324080051646223, "step": 7200}, {"loss": 0.6811, "grad_norm": 0.7062810063362122, "learning_rate": 0.0002, "epoch": 2.3273079406068433, "step": 7210}, {"loss": 0.67, "grad_norm": 0.7236241102218628, "learning_rate": 0.0002, "epoch": 2.330535829567463, "step": 7220}, {"loss": 0.6462, "grad_norm": 0.7528148293495178, "learning_rate": 0.0002, "epoch": 2.3337637185280826, "step": 7230}, {"loss": 0.694, "grad_norm": 0.7604748606681824, "learning_rate": 0.0002, "epoch": 2.3369916074887023, "step": 7240}, {"loss": 0.6475, "grad_norm": 0.5601189136505127, "learning_rate": 0.0002, "epoch": 2.340219496449322, "step": 7250}, {"loss": 0.6925, "grad_norm": 0.7099230885505676, "learning_rate": 0.0002, "epoch": 2.343447385409942, "step": 7260}, {"loss": 0.6333, "grad_norm": 0.6699047684669495, "learning_rate": 0.0002, "epoch": 2.3466752743705617, "step": 7270}, {"loss": 0.6434, "grad_norm": 0.7315047979354858, "learning_rate": 0.0002, "epoch": 2.3499031633311813, "step": 7280}, {"loss": 0.6927, "grad_norm": 0.632836103439331, "learning_rate": 0.0002, "epoch": 2.353131052291801, "step": 7290}, {"loss": 0.6458, "grad_norm": 0.9410115480422974, "learning_rate": 0.0002, "epoch": 2.356358941252421, "step": 7300}, {"loss": 0.6699, "grad_norm": 0.626554012298584, "learning_rate": 0.0002, "epoch": 2.3595868302130407, "step": 7310}, {"loss": 0.6495, "grad_norm": 0.7538444399833679, "learning_rate": 0.0002, "epoch": 2.3628147191736604, "step": 7320}, {"loss": 0.6321, "grad_norm": 0.6826626062393188, "learning_rate": 0.0002, "epoch": 2.36604260813428, "step": 7330}, {"loss": 0.6752, "grad_norm": 0.6739391088485718, "learning_rate": 0.0002, "epoch": 2.3692704970949, "step": 7340}, {"loss": 0.6518, "grad_norm": 0.7518446445465088, "learning_rate": 0.0002, "epoch": 2.3724983860555198, "step": 7350}, {"loss": 0.7142, "grad_norm": 0.714133083820343, "learning_rate": 0.0002, "epoch": 2.3757262750161394, "step": 7360}, {"loss": 0.6794, "grad_norm": 0.7144588232040405, "learning_rate": 0.0002, "epoch": 2.378954163976759, "step": 7370}, {"loss": 0.6922, "grad_norm": 0.6598120927810669, "learning_rate": 0.0002, "epoch": 2.382182052937379, "step": 7380}, {"loss": 0.6562, "grad_norm": 0.7079148292541504, "learning_rate": 0.0002, "epoch": 2.385409941897999, "step": 7390}, {"loss": 0.6492, "grad_norm": 0.6750902533531189, "learning_rate": 0.0002, "epoch": 2.3886378308586185, "step": 7400}, {"loss": 0.6398, "grad_norm": 0.7181967496871948, "learning_rate": 0.0002, "epoch": 2.391865719819238, "step": 7410}, {"loss": 0.6793, "grad_norm": 0.7720552086830139, "learning_rate": 0.0002, "epoch": 2.3950936087798578, "step": 7420}, {"loss": 0.6804, "grad_norm": 0.7592426538467407, "learning_rate": 0.0002, "epoch": 2.398321497740478, "step": 7430}, {"loss": 0.6667, "grad_norm": 0.7161896824836731, "learning_rate": 0.0002, "epoch": 2.4015493867010975, "step": 7440}, {"loss": 0.6891, "grad_norm": 0.8019260764122009, "learning_rate": 0.0002, "epoch": 2.404777275661717, "step": 7450}, {"loss": 0.6864, "grad_norm": 0.7093342542648315, "learning_rate": 0.0002, "epoch": 2.408005164622337, "step": 7460}, {"loss": 0.6445, "grad_norm": 0.8464207649230957, "learning_rate": 0.0002, "epoch": 2.411233053582957, "step": 7470}, {"loss": 0.6724, "grad_norm": 0.773666501045227, "learning_rate": 0.0002, "epoch": 2.4144609425435766, "step": 7480}, {"loss": 0.6774, "grad_norm": 0.8451611995697021, "learning_rate": 0.0002, "epoch": 2.4176888315041962, "step": 7490}, {"loss": 0.694, "grad_norm": 0.656795084476471, "learning_rate": 0.0002, "epoch": 2.420916720464816, "step": 7500}, {"loss": 0.6824, "grad_norm": 0.7129034996032715, "learning_rate": 0.0002, "epoch": 2.4241446094254355, "step": 7510}, {"loss": 0.711, "grad_norm": 0.8325763940811157, "learning_rate": 0.0002, "epoch": 2.4273724983860556, "step": 7520}, {"loss": 0.6238, "grad_norm": 0.7806527614593506, "learning_rate": 0.0002, "epoch": 2.4306003873466753, "step": 7530}, {"loss": 0.6972, "grad_norm": 0.6994536519050598, "learning_rate": 0.0002, "epoch": 2.433828276307295, "step": 7540}, {"loss": 0.6615, "grad_norm": 0.6898999214172363, "learning_rate": 0.0002, "epoch": 2.437056165267915, "step": 7550}, {"loss": 0.7108, "grad_norm": 0.719490647315979, "learning_rate": 0.0002, "epoch": 2.4402840542285347, "step": 7560}, {"loss": 0.668, "grad_norm": 0.6841562390327454, "learning_rate": 0.0002, "epoch": 2.4435119431891543, "step": 7570}, {"loss": 0.6504, "grad_norm": 0.7573311924934387, "learning_rate": 0.0002, "epoch": 2.446739832149774, "step": 7580}, {"loss": 0.6607, "grad_norm": 0.7295880317687988, "learning_rate": 0.0002, "epoch": 2.4499677211103936, "step": 7590}, {"loss": 0.6593, "grad_norm": 0.710136353969574, "learning_rate": 0.0002, "epoch": 2.4531956100710137, "step": 7600}, {"loss": 0.7137, "grad_norm": 0.6126235127449036, "learning_rate": 0.0002, "epoch": 2.4564234990316334, "step": 7610}, {"loss": 0.6562, "grad_norm": 0.8025609850883484, "learning_rate": 0.0002, "epoch": 2.459651387992253, "step": 7620}, {"loss": 0.6464, "grad_norm": 0.7839472889900208, "learning_rate": 0.0002, "epoch": 2.4628792769528727, "step": 7630}, {"loss": 0.6797, "grad_norm": 0.7253499031066895, "learning_rate": 0.0002, "epoch": 2.4661071659134928, "step": 7640}, {"loss": 0.7341, "grad_norm": 0.7918946743011475, "learning_rate": 0.0002, "epoch": 2.4693350548741124, "step": 7650}, {"loss": 0.6646, "grad_norm": 0.7930178046226501, "learning_rate": 0.0002, "epoch": 2.472562943834732, "step": 7660}, {"loss": 0.6294, "grad_norm": 0.6826170086860657, "learning_rate": 0.0002, "epoch": 2.4757908327953517, "step": 7670}, {"loss": 0.6697, "grad_norm": 0.6576805114746094, "learning_rate": 0.0002, "epoch": 2.4790187217559714, "step": 7680}, {"loss": 0.682, "grad_norm": 0.7012448310852051, "learning_rate": 0.0002, "epoch": 2.4822466107165915, "step": 7690}, {"loss": 0.6418, "grad_norm": 0.7774284482002258, "learning_rate": 0.0002, "epoch": 2.485474499677211, "step": 7700}, {"loss": 0.6566, "grad_norm": 0.6502766013145447, "learning_rate": 0.0002, "epoch": 2.4887023886378308, "step": 7710}, {"loss": 0.6965, "grad_norm": 0.7638739347457886, "learning_rate": 0.0002, "epoch": 2.4919302775984504, "step": 7720}, {"loss": 0.6454, "grad_norm": 0.6217384338378906, "learning_rate": 0.0002, "epoch": 2.4951581665590705, "step": 7730}, {"loss": 0.6837, "grad_norm": 0.7576302886009216, "learning_rate": 0.0002, "epoch": 2.49838605551969, "step": 7740}, {"loss": 0.6855, "grad_norm": 0.6877137422561646, "learning_rate": 0.0002, "epoch": 2.50161394448031, "step": 7750}, {"loss": 0.6604, "grad_norm": 0.6998329162597656, "learning_rate": 0.0002, "epoch": 2.5048418334409295, "step": 7760}, {"loss": 0.6666, "grad_norm": 0.7879213690757751, "learning_rate": 0.0002, "epoch": 2.508069722401549, "step": 7770}, {"loss": 0.715, "grad_norm": 0.7834980487823486, "learning_rate": 0.0002, "epoch": 2.5112976113621692, "step": 7780}, {"loss": 0.6954, "grad_norm": 0.7789630889892578, "learning_rate": 0.0002, "epoch": 2.514525500322789, "step": 7790}, {"loss": 0.6979, "grad_norm": 0.7403590083122253, "learning_rate": 0.0002, "epoch": 2.5177533892834085, "step": 7800}, {"loss": 0.6964, "grad_norm": 0.6029766201972961, "learning_rate": 0.0002, "epoch": 2.5209812782440286, "step": 7810}, {"loss": 0.6887, "grad_norm": 0.7061092257499695, "learning_rate": 0.0002, "epoch": 2.5242091672046483, "step": 7820}, {"loss": 0.6628, "grad_norm": 0.7120763659477234, "learning_rate": 0.0002, "epoch": 2.527437056165268, "step": 7830}, {"loss": 0.6876, "grad_norm": 0.6173675656318665, "learning_rate": 0.0002, "epoch": 2.5306649451258876, "step": 7840}, {"loss": 0.6635, "grad_norm": 0.9566813111305237, "learning_rate": 0.0002, "epoch": 2.5338928340865072, "step": 7850}, {"loss": 0.654, "grad_norm": 0.8497620224952698, "learning_rate": 0.0002, "epoch": 2.5371207230471273, "step": 7860}, {"loss": 0.644, "grad_norm": 0.7663498520851135, "learning_rate": 0.0002, "epoch": 2.540348612007747, "step": 7870}, {"loss": 0.6292, "grad_norm": 0.6329668760299683, "learning_rate": 0.0002, "epoch": 2.5435765009683666, "step": 7880}, {"loss": 0.686, "grad_norm": 0.8128195405006409, "learning_rate": 0.0002, "epoch": 2.5468043899289863, "step": 7890}, {"loss": 0.6619, "grad_norm": 0.6622284650802612, "learning_rate": 0.0002, "epoch": 2.5500322788896064, "step": 7900}, {"loss": 0.693, "grad_norm": 0.8460057973861694, "learning_rate": 0.0002, "epoch": 2.553260167850226, "step": 7910}, {"loss": 0.6619, "grad_norm": 0.6586956977844238, "learning_rate": 0.0002, "epoch": 2.5564880568108457, "step": 7920}, {"loss": 0.6976, "grad_norm": 0.7569382190704346, "learning_rate": 0.0002, "epoch": 2.5597159457714653, "step": 7930}, {"loss": 0.6235, "grad_norm": 0.6409714221954346, "learning_rate": 0.0002, "epoch": 2.562943834732085, "step": 7940}, {"loss": 0.6663, "grad_norm": 0.7031713128089905, "learning_rate": 0.0002, "epoch": 2.566171723692705, "step": 7950}, {"loss": 0.6344, "grad_norm": 0.7983605265617371, "learning_rate": 0.0002, "epoch": 2.5693996126533247, "step": 7960}, {"loss": 0.6834, "grad_norm": 0.7165433168411255, "learning_rate": 0.0002, "epoch": 2.5726275016139444, "step": 7970}, {"loss": 0.6517, "grad_norm": 0.6630598902702332, "learning_rate": 0.0002, "epoch": 2.5758553905745645, "step": 7980}, {"loss": 0.7164, "grad_norm": 0.5883122086524963, "learning_rate": 0.0002, "epoch": 2.579083279535184, "step": 7990}, {"loss": 0.6715, "grad_norm": 0.5928755402565002, "learning_rate": 0.0002, "epoch": 2.5823111684958038, "step": 8000}, {"loss": 0.6701, "grad_norm": 0.7843712568283081, "learning_rate": 0.0002, "epoch": 2.5855390574564234, "step": 8010}, {"loss": 0.6617, "grad_norm": 0.7206324338912964, "learning_rate": 0.0002, "epoch": 2.588766946417043, "step": 8020}, {"loss": 0.6968, "grad_norm": 0.812480092048645, "learning_rate": 0.0002, "epoch": 2.5919948353776627, "step": 8030}, {"loss": 0.6735, "grad_norm": 0.9843078255653381, "learning_rate": 0.0002, "epoch": 2.595222724338283, "step": 8040}, {"loss": 0.6877, "grad_norm": 0.7524392604827881, "learning_rate": 0.0002, "epoch": 2.5984506132989025, "step": 8050}, {"loss": 0.7188, "grad_norm": 0.6220380067825317, "learning_rate": 0.0002, "epoch": 2.601678502259522, "step": 8060}, {"loss": 0.6878, "grad_norm": 0.7461398243904114, "learning_rate": 0.0002, "epoch": 2.6049063912201422, "step": 8070}, {"loss": 0.6626, "grad_norm": 0.720974326133728, "learning_rate": 0.0002, "epoch": 2.608134280180762, "step": 8080}, {"loss": 0.6756, "grad_norm": 0.649509847164154, "learning_rate": 0.0002, "epoch": 2.6113621691413815, "step": 8090}, {"loss": 0.6394, "grad_norm": 0.6894662976264954, "learning_rate": 0.0002, "epoch": 2.614590058102001, "step": 8100}, {"loss": 0.6329, "grad_norm": 0.734433114528656, "learning_rate": 0.0002, "epoch": 2.617817947062621, "step": 8110}, {"loss": 0.6698, "grad_norm": 0.7468628883361816, "learning_rate": 0.0002, "epoch": 2.621045836023241, "step": 8120}, {"loss": 0.658, "grad_norm": 0.6508180499076843, "learning_rate": 0.0002, "epoch": 2.6242737249838606, "step": 8130}, {"loss": 0.6619, "grad_norm": 0.8735209107398987, "learning_rate": 0.0002, "epoch": 2.6275016139444802, "step": 8140}, {"loss": 0.6717, "grad_norm": 0.8162857294082642, "learning_rate": 0.0002, "epoch": 2.6307295029051003, "step": 8150}, {"loss": 0.6496, "grad_norm": 0.628872811794281, "learning_rate": 0.0002, "epoch": 2.63395739186572, "step": 8160}, {"loss": 0.6608, "grad_norm": 0.8078708052635193, "learning_rate": 0.0002, "epoch": 2.6371852808263396, "step": 8170}, {"loss": 0.6916, "grad_norm": 0.7849429845809937, "learning_rate": 0.0002, "epoch": 2.6404131697869593, "step": 8180}, {"loss": 0.6671, "grad_norm": 0.8115387558937073, "learning_rate": 0.0002, "epoch": 2.643641058747579, "step": 8190}, {"loss": 0.6761, "grad_norm": 0.7462222576141357, "learning_rate": 0.0002, "epoch": 2.6468689477081986, "step": 8200}, {"loss": 0.6923, "grad_norm": 0.753662645816803, "learning_rate": 0.0002, "epoch": 2.6500968366688187, "step": 8210}, {"loss": 0.6666, "grad_norm": 0.6100404858589172, "learning_rate": 0.0002, "epoch": 2.6533247256294383, "step": 8220}, {"loss": 0.7256, "grad_norm": 0.9084606766700745, "learning_rate": 0.0002, "epoch": 2.656552614590058, "step": 8230}, {"loss": 0.6385, "grad_norm": 0.6412538886070251, "learning_rate": 0.0002, "epoch": 2.659780503550678, "step": 8240}, {"loss": 0.7048, "grad_norm": 0.7640451192855835, "learning_rate": 0.0002, "epoch": 2.6630083925112977, "step": 8250}, {"loss": 0.6846, "grad_norm": 0.5972344875335693, "learning_rate": 0.0002, "epoch": 2.6662362814719174, "step": 8260}, {"loss": 0.682, "grad_norm": 0.6935883164405823, "learning_rate": 0.0002, "epoch": 2.669464170432537, "step": 8270}, {"loss": 0.6625, "grad_norm": 0.789399266242981, "learning_rate": 0.0002, "epoch": 2.6726920593931567, "step": 8280}, {"loss": 0.6541, "grad_norm": 0.7143490314483643, "learning_rate": 0.0002, "epoch": 2.675919948353777, "step": 8290}, {"loss": 0.6741, "grad_norm": 0.6670652627944946, "learning_rate": 0.0002, "epoch": 2.6791478373143964, "step": 8300}, {"loss": 0.6936, "grad_norm": 0.687108039855957, "learning_rate": 0.0002, "epoch": 2.682375726275016, "step": 8310}, {"loss": 0.7124, "grad_norm": 0.7914147973060608, "learning_rate": 0.0002, "epoch": 2.6856036152356357, "step": 8320}, {"loss": 0.6584, "grad_norm": 0.8398420214653015, "learning_rate": 0.0002, "epoch": 2.688831504196256, "step": 8330}, {"loss": 0.6679, "grad_norm": 0.6592720746994019, "learning_rate": 0.0002, "epoch": 2.6920593931568755, "step": 8340}, {"loss": 0.6673, "grad_norm": 0.6888470649719238, "learning_rate": 0.0002, "epoch": 2.695287282117495, "step": 8350}, {"loss": 0.6483, "grad_norm": 0.7127556800842285, "learning_rate": 0.0002, "epoch": 2.698515171078115, "step": 8360}, {"loss": 0.7013, "grad_norm": 0.6630286574363708, "learning_rate": 0.0002, "epoch": 2.7017430600387344, "step": 8370}, {"loss": 0.6842, "grad_norm": 0.8261964321136475, "learning_rate": 0.0002, "epoch": 2.7049709489993545, "step": 8380}, {"loss": 0.6613, "grad_norm": 0.717339813709259, "learning_rate": 0.0002, "epoch": 2.708198837959974, "step": 8390}, {"loss": 0.6929, "grad_norm": 0.651637613773346, "learning_rate": 0.0002, "epoch": 2.711426726920594, "step": 8400}, {"loss": 0.6796, "grad_norm": 0.7936098575592041, "learning_rate": 0.0002, "epoch": 2.714654615881214, "step": 8410}, {"loss": 0.696, "grad_norm": 0.8761560320854187, "learning_rate": 0.0002, "epoch": 2.7178825048418336, "step": 8420}, {"loss": 0.6889, "grad_norm": 0.6768006086349487, "learning_rate": 0.0002, "epoch": 2.7211103938024532, "step": 8430}, {"loss": 0.6844, "grad_norm": 0.7121055722236633, "learning_rate": 0.0002, "epoch": 2.724338282763073, "step": 8440}, {"loss": 0.6608, "grad_norm": 0.6811696887016296, "learning_rate": 0.0002, "epoch": 2.7275661717236925, "step": 8450}, {"loss": 0.7046, "grad_norm": 0.8168250918388367, "learning_rate": 0.0002, "epoch": 2.730794060684312, "step": 8460}, {"loss": 0.6809, "grad_norm": 0.660682737827301, "learning_rate": 0.0002, "epoch": 2.7340219496449323, "step": 8470}, {"loss": 0.6916, "grad_norm": 0.7369356155395508, "learning_rate": 0.0002, "epoch": 2.737249838605552, "step": 8480}, {"loss": 0.6383, "grad_norm": 0.7545099854469299, "learning_rate": 0.0002, "epoch": 2.7404777275661716, "step": 8490}, {"loss": 0.6917, "grad_norm": 0.6991257667541504, "learning_rate": 0.0002, "epoch": 2.7437056165267917, "step": 8500}, {"loss": 0.6953, "grad_norm": 0.7195324301719666, "learning_rate": 0.0002, "epoch": 2.7469335054874113, "step": 8510}, {"loss": 0.6955, "grad_norm": 0.8995378017425537, "learning_rate": 0.0002, "epoch": 2.750161394448031, "step": 8520}, {"loss": 0.684, "grad_norm": 0.6924123764038086, "learning_rate": 0.0002, "epoch": 2.7533892834086506, "step": 8530}, {"loss": 0.6675, "grad_norm": 0.6260585784912109, "learning_rate": 0.0002, "epoch": 2.7566171723692703, "step": 8540}, {"loss": 0.6613, "grad_norm": 0.7273091673851013, "learning_rate": 0.0002, "epoch": 2.7598450613298904, "step": 8550}, {"loss": 0.6853, "grad_norm": 0.720562219619751, "learning_rate": 0.0002, "epoch": 2.76307295029051, "step": 8560}, {"loss": 0.6452, "grad_norm": 0.6360004544258118, "learning_rate": 0.0002, "epoch": 2.7663008392511297, "step": 8570}, {"loss": 0.6118, "grad_norm": 0.7634525895118713, "learning_rate": 0.0002, "epoch": 2.76952872821175, "step": 8580}, {"loss": 0.686, "grad_norm": 0.6586076021194458, "learning_rate": 0.0002, "epoch": 2.7727566171723694, "step": 8590}, {"loss": 0.7072, "grad_norm": 0.6542639136314392, "learning_rate": 0.0002, "epoch": 2.775984506132989, "step": 8600}, {"loss": 0.7126, "grad_norm": 0.7650290727615356, "learning_rate": 0.0002, "epoch": 2.7792123950936087, "step": 8610}, {"loss": 0.6923, "grad_norm": 0.6551542282104492, "learning_rate": 0.0002, "epoch": 2.7824402840542284, "step": 8620}, {"loss": 0.6937, "grad_norm": 0.6915501952171326, "learning_rate": 0.0002, "epoch": 2.785668173014848, "step": 8630}, {"loss": 0.6586, "grad_norm": 0.8061493635177612, "learning_rate": 0.0002, "epoch": 2.788896061975468, "step": 8640}, {"loss": 0.6853, "grad_norm": 0.8403584957122803, "learning_rate": 0.0002, "epoch": 2.792123950936088, "step": 8650}, {"loss": 0.6616, "grad_norm": 0.6455532312393188, "learning_rate": 0.0002, "epoch": 2.7953518398967074, "step": 8660}, {"loss": 0.6819, "grad_norm": 0.8296352028846741, "learning_rate": 0.0002, "epoch": 2.7985797288573275, "step": 8670}, {"loss": 0.6678, "grad_norm": 0.7288752794265747, "learning_rate": 0.0002, "epoch": 2.801807617817947, "step": 8680}, {"loss": 0.6778, "grad_norm": 0.7628464102745056, "learning_rate": 0.0002, "epoch": 2.805035506778567, "step": 8690}, {"loss": 0.7176, "grad_norm": 0.9993878602981567, "learning_rate": 0.0002, "epoch": 2.8082633957391865, "step": 8700}, {"loss": 0.6414, "grad_norm": 0.6972465515136719, "learning_rate": 0.0002, "epoch": 2.811491284699806, "step": 8710}, {"loss": 0.6777, "grad_norm": 0.645042896270752, "learning_rate": 0.0002, "epoch": 2.8147191736604262, "step": 8720}, {"loss": 0.6587, "grad_norm": 0.6853853464126587, "learning_rate": 0.0002, "epoch": 2.817947062621046, "step": 8730}, {"loss": 0.6405, "grad_norm": 0.5935067534446716, "learning_rate": 0.0002, "epoch": 2.8211749515816655, "step": 8740}, {"loss": 0.6674, "grad_norm": 0.7336633205413818, "learning_rate": 0.0002, "epoch": 2.824402840542285, "step": 8750}, {"loss": 0.6662, "grad_norm": 0.7074962854385376, "learning_rate": 0.0002, "epoch": 2.8276307295029053, "step": 8760}, {"loss": 0.6744, "grad_norm": 0.6667559742927551, "learning_rate": 0.0002, "epoch": 2.830858618463525, "step": 8770}, {"loss": 0.7142, "grad_norm": 0.8101205229759216, "learning_rate": 0.0002, "epoch": 2.8340865074241446, "step": 8780}, {"loss": 0.6727, "grad_norm": 0.8841480016708374, "learning_rate": 0.0002, "epoch": 2.8373143963847642, "step": 8790}, {"loss": 0.6601, "grad_norm": 0.5891591310501099, "learning_rate": 0.0002, "epoch": 2.840542285345384, "step": 8800}, {"loss": 0.7114, "grad_norm": 0.667032778263092, "learning_rate": 0.0002, "epoch": 2.843770174306004, "step": 8810}, {"loss": 0.7295, "grad_norm": 0.7629773020744324, "learning_rate": 0.0002, "epoch": 2.8469980632666236, "step": 8820}, {"loss": 0.703, "grad_norm": 0.79471355676651, "learning_rate": 0.0002, "epoch": 2.8502259522272433, "step": 8830}, {"loss": 0.7278, "grad_norm": 0.7529178261756897, "learning_rate": 0.0002, "epoch": 2.8534538411878634, "step": 8840}, {"loss": 0.7163, "grad_norm": 0.7014923691749573, "learning_rate": 0.0002, "epoch": 2.856681730148483, "step": 8850}, {"loss": 0.6803, "grad_norm": 0.7996514439582825, "learning_rate": 0.0002, "epoch": 2.8599096191091027, "step": 8860}, {"loss": 0.6562, "grad_norm": 0.7044785618782043, "learning_rate": 0.0002, "epoch": 2.8631375080697223, "step": 8870}, {"loss": 0.6966, "grad_norm": 0.6792093515396118, "learning_rate": 0.0002, "epoch": 2.866365397030342, "step": 8880}, {"loss": 0.685, "grad_norm": 0.69175124168396, "learning_rate": 0.0002, "epoch": 2.8695932859909616, "step": 8890}, {"loss": 0.7225, "grad_norm": 0.7499129176139832, "learning_rate": 0.0002, "epoch": 2.8728211749515817, "step": 8900}, {"loss": 0.6922, "grad_norm": 0.7678789496421814, "learning_rate": 0.0002, "epoch": 2.8760490639122014, "step": 8910}, {"loss": 0.6803, "grad_norm": 0.7478128671646118, "learning_rate": 0.0002, "epoch": 2.879276952872821, "step": 8920}, {"loss": 0.6689, "grad_norm": 0.6767086386680603, "learning_rate": 0.0002, "epoch": 2.882504841833441, "step": 8930}, {"loss": 0.6587, "grad_norm": 0.7222196459770203, "learning_rate": 0.0002, "epoch": 2.885732730794061, "step": 8940}, {"loss": 0.6472, "grad_norm": 0.6950580477714539, "learning_rate": 0.0002, "epoch": 2.8889606197546804, "step": 8950}, {"loss": 0.7064, "grad_norm": 0.7759528160095215, "learning_rate": 0.0002, "epoch": 2.8921885087153, "step": 8960}, {"loss": 0.6349, "grad_norm": 0.6686919927597046, "learning_rate": 0.0002, "epoch": 2.8954163976759197, "step": 8970}, {"loss": 0.6801, "grad_norm": 0.9245954751968384, "learning_rate": 0.0002, "epoch": 2.89864428663654, "step": 8980}, {"loss": 0.6703, "grad_norm": 0.8734814524650574, "learning_rate": 0.0002, "epoch": 2.9018721755971595, "step": 8990}, {"loss": 0.6716, "grad_norm": 0.6056219339370728, "learning_rate": 0.0002, "epoch": 2.905100064557779, "step": 9000}, {"loss": 0.6535, "grad_norm": 0.7364102005958557, "learning_rate": 0.0002, "epoch": 2.9083279535183992, "step": 9010}, {"loss": 0.707, "grad_norm": 0.6563605070114136, "learning_rate": 0.0002, "epoch": 2.911555842479019, "step": 9020}, {"loss": 0.6564, "grad_norm": 0.659978985786438, "learning_rate": 0.0002, "epoch": 2.9147837314396385, "step": 9030}, {"loss": 0.7154, "grad_norm": 0.8176041841506958, "learning_rate": 0.0002, "epoch": 2.918011620400258, "step": 9040}, {"loss": 0.72, "grad_norm": 0.743677020072937, "learning_rate": 0.0002, "epoch": 2.921239509360878, "step": 9050}, {"loss": 0.7017, "grad_norm": 0.7418383359909058, "learning_rate": 0.0002, "epoch": 2.9244673983214975, "step": 9060}, {"loss": 0.6635, "grad_norm": 0.6916524767875671, "learning_rate": 0.0002, "epoch": 2.9276952872821176, "step": 9070}, {"loss": 0.6502, "grad_norm": 0.6559975743293762, "learning_rate": 0.0002, "epoch": 2.9309231762427372, "step": 9080}, {"loss": 0.7016, "grad_norm": 0.7431221008300781, "learning_rate": 0.0002, "epoch": 2.934151065203357, "step": 9090}, {"loss": 0.6829, "grad_norm": 0.7525941133499146, "learning_rate": 0.0002, "epoch": 2.937378954163977, "step": 9100}, {"loss": 0.7073, "grad_norm": 0.6860167384147644, "learning_rate": 0.0002, "epoch": 2.9406068431245966, "step": 9110}, {"loss": 0.6912, "grad_norm": 0.6467666029930115, "learning_rate": 0.0002, "epoch": 2.9438347320852163, "step": 9120}, {"loss": 0.7122, "grad_norm": 0.7595751285552979, "learning_rate": 0.0002, "epoch": 2.947062621045836, "step": 9130}, {"loss": 0.6951, "grad_norm": 0.6558279991149902, "learning_rate": 0.0002, "epoch": 2.9502905100064556, "step": 9140}, {"loss": 0.7081, "grad_norm": 0.6818708181381226, "learning_rate": 0.0002, "epoch": 2.9535183989670757, "step": 9150}, {"loss": 0.6921, "grad_norm": 0.8387085795402527, "learning_rate": 0.0002, "epoch": 2.9567462879276953, "step": 9160}, {"loss": 0.6914, "grad_norm": 0.7705109715461731, "learning_rate": 0.0002, "epoch": 2.959974176888315, "step": 9170}, {"loss": 0.6849, "grad_norm": 0.688106894493103, "learning_rate": 0.0002, "epoch": 2.9632020658489346, "step": 9180}, {"loss": 0.6833, "grad_norm": 0.659532368183136, "learning_rate": 0.0002, "epoch": 2.9664299548095547, "step": 9190}, {"loss": 0.6383, "grad_norm": 0.6839388608932495, "learning_rate": 0.0002, "epoch": 2.9696578437701744, "step": 9200}, {"loss": 0.6952, "grad_norm": 0.6927599310874939, "learning_rate": 0.0002, "epoch": 2.972885732730794, "step": 9210}, {"loss": 0.7338, "grad_norm": 0.6902472972869873, "learning_rate": 0.0002, "epoch": 2.9761136216914137, "step": 9220}, {"loss": 0.6671, "grad_norm": 0.620399534702301, "learning_rate": 0.0002, "epoch": 2.9793415106520333, "step": 9230}, {"loss": 0.6588, "grad_norm": 0.6812364459037781, "learning_rate": 0.0002, "epoch": 2.9825693996126534, "step": 9240}, {"loss": 0.6957, "grad_norm": 0.7681456208229065, "learning_rate": 0.0002, "epoch": 2.985797288573273, "step": 9250}, {"loss": 0.7113, "grad_norm": 0.7621907591819763, "learning_rate": 0.0002, "epoch": 2.9890251775338927, "step": 9260}, {"loss": 0.6601, "grad_norm": 0.6075740456581116, "learning_rate": 0.0002, "epoch": 2.992253066494513, "step": 9270}, {"loss": 0.6758, "grad_norm": 0.7100434899330139, "learning_rate": 0.0002, "epoch": 2.9954809554551325, "step": 9280}, {"loss": 0.73, "grad_norm": 0.7314488887786865, "learning_rate": 0.0002, "epoch": 2.998708844415752, "step": 9290}, {"eval_loss": 1.1434104442596436, "eval_runtime": 166.3732, "eval_samples_per_second": 4.406, "eval_steps_per_second": 0.553, "epoch": 3.0, "step": 9294}, {"loss": 0.6401, "grad_norm": 0.7408893704414368, "learning_rate": 0.0002, "epoch": 3.001936733376372, "step": 9300}, {"loss": 0.5182, "grad_norm": 0.9773574471473694, "learning_rate": 0.0002, "epoch": 3.0051646223369914, "step": 9310}, {"loss": 0.5432, "grad_norm": 0.7919653058052063, "learning_rate": 0.0002, "epoch": 3.0083925112976115, "step": 9320}, {"loss": 0.6156, "grad_norm": 0.9139202833175659, "learning_rate": 0.0002, "epoch": 3.011620400258231, "step": 9330}, {"loss": 0.5736, "grad_norm": 0.8296737670898438, "learning_rate": 0.0002, "epoch": 3.014848289218851, "step": 9340}, {"loss": 0.5567, "grad_norm": 0.786868155002594, "learning_rate": 0.0002, "epoch": 3.0180761781794705, "step": 9350}, {"loss": 0.578, "grad_norm": 0.5928055644035339, "learning_rate": 0.0002, "epoch": 3.0213040671400906, "step": 9360}, {"loss": 0.5376, "grad_norm": 0.8785701394081116, "learning_rate": 0.0002, "epoch": 3.0245319561007102, "step": 9370}, {"loss": 0.5664, "grad_norm": 0.7978872060775757, "learning_rate": 0.0002, "epoch": 3.02775984506133, "step": 9380}, {"loss": 0.5797, "grad_norm": 0.7160913348197937, "learning_rate": 0.0002, "epoch": 3.0309877340219495, "step": 9390}, {"loss": 0.5777, "grad_norm": 0.904465913772583, "learning_rate": 0.0002, "epoch": 3.034215622982569, "step": 9400}, {"loss": 0.5518, "grad_norm": 0.7082195281982422, "learning_rate": 0.0002, "epoch": 3.0374435119431893, "step": 9410}, {"loss": 0.5434, "grad_norm": 0.9686778783798218, "learning_rate": 0.0002, "epoch": 3.040671400903809, "step": 9420}, {"loss": 0.5692, "grad_norm": 0.8788613677024841, "learning_rate": 0.0002, "epoch": 3.0438992898644286, "step": 9430}, {"loss": 0.5599, "grad_norm": 0.8217582106590271, "learning_rate": 0.0002, "epoch": 3.0471271788250482, "step": 9440}, {"loss": 0.5405, "grad_norm": 0.7380914092063904, "learning_rate": 0.0002, "epoch": 3.0503550677856683, "step": 9450}, {"loss": 0.6258, "grad_norm": 0.7339285612106323, "learning_rate": 0.0002, "epoch": 3.053582956746288, "step": 9460}, {"loss": 0.5646, "grad_norm": 0.7175183296203613, "learning_rate": 0.0002, "epoch": 3.0568108457069076, "step": 9470}, {"loss": 0.5667, "grad_norm": 0.8275379538536072, "learning_rate": 0.0002, "epoch": 3.0600387346675273, "step": 9480}, {"loss": 0.5868, "grad_norm": 0.6544256806373596, "learning_rate": 0.0002, "epoch": 3.0632666236281474, "step": 9490}, {"loss": 0.5365, "grad_norm": 0.8193472623825073, "learning_rate": 0.0002, "epoch": 3.066494512588767, "step": 9500}, {"loss": 0.5614, "grad_norm": 0.7967836856842041, "learning_rate": 0.0002, "epoch": 3.0697224015493867, "step": 9510}, {"loss": 0.5629, "grad_norm": 0.8788684010505676, "learning_rate": 0.0002, "epoch": 3.0729502905100063, "step": 9520}, {"loss": 0.5397, "grad_norm": 0.9410629868507385, "learning_rate": 0.0002, "epoch": 3.0761781794706264, "step": 9530}, {"loss": 0.5473, "grad_norm": 0.7448706030845642, "learning_rate": 0.0002, "epoch": 3.079406068431246, "step": 9540}, {"loss": 0.5774, "grad_norm": 0.9149372577667236, "learning_rate": 0.0002, "epoch": 3.0826339573918657, "step": 9550}, {"loss": 0.5347, "grad_norm": 0.7265563607215881, "learning_rate": 0.0002, "epoch": 3.0858618463524854, "step": 9560}, {"loss": 0.5487, "grad_norm": 1.0305068492889404, "learning_rate": 0.0002, "epoch": 3.089089735313105, "step": 9570}, {"loss": 0.5884, "grad_norm": 0.7987357974052429, "learning_rate": 0.0002, "epoch": 3.092317624273725, "step": 9580}, {"loss": 0.6216, "grad_norm": 0.7733123898506165, "learning_rate": 0.0002, "epoch": 3.095545513234345, "step": 9590}, {"loss": 0.5848, "grad_norm": 1.0438069105148315, "learning_rate": 0.0002, "epoch": 3.0987734021949644, "step": 9600}, {"loss": 0.5612, "grad_norm": 0.7951784729957581, "learning_rate": 0.0002, "epoch": 3.102001291155584, "step": 9610}, {"loss": 0.6184, "grad_norm": 0.7776783108711243, "learning_rate": 0.0002, "epoch": 3.105229180116204, "step": 9620}, {"loss": 0.5626, "grad_norm": 0.7060676217079163, "learning_rate": 0.0002, "epoch": 3.108457069076824, "step": 9630}, {"loss": 0.5731, "grad_norm": 0.871569037437439, "learning_rate": 0.0002, "epoch": 3.1116849580374435, "step": 9640}, {"loss": 0.5168, "grad_norm": 0.8873385787010193, "learning_rate": 0.0002, "epoch": 3.114912846998063, "step": 9650}, {"loss": 0.5985, "grad_norm": 0.750998318195343, "learning_rate": 0.0002, "epoch": 3.118140735958683, "step": 9660}, {"loss": 0.5741, "grad_norm": 0.8678529262542725, "learning_rate": 0.0002, "epoch": 3.121368624919303, "step": 9670}, {"loss": 0.5831, "grad_norm": 0.7706599235534668, "learning_rate": 0.0002, "epoch": 3.1245965138799225, "step": 9680}, {"loss": 0.6142, "grad_norm": 0.8317574858665466, "learning_rate": 0.0002, "epoch": 3.127824402840542, "step": 9690}, {"loss": 0.5634, "grad_norm": 0.801800012588501, "learning_rate": 0.0002, "epoch": 3.131052291801162, "step": 9700}, {"loss": 0.6044, "grad_norm": 0.8574623465538025, "learning_rate": 0.0002, "epoch": 3.134280180761782, "step": 9710}, {"loss": 0.6072, "grad_norm": 0.6556540727615356, "learning_rate": 0.0002, "epoch": 3.1375080697224016, "step": 9720}, {"loss": 0.6058, "grad_norm": 0.8555161952972412, "learning_rate": 0.0002, "epoch": 3.1407359586830212, "step": 9730}, {"loss": 0.6069, "grad_norm": 0.8825467824935913, "learning_rate": 0.0002, "epoch": 3.143963847643641, "step": 9740}, {"loss": 0.5689, "grad_norm": 0.8297156691551208, "learning_rate": 0.0002, "epoch": 3.147191736604261, "step": 9750}, {"loss": 0.5738, "grad_norm": 0.7710384726524353, "learning_rate": 0.0002, "epoch": 3.1504196255648806, "step": 9760}, {"loss": 0.571, "grad_norm": 0.8778039216995239, "learning_rate": 0.0002, "epoch": 3.1536475145255003, "step": 9770}, {"loss": 0.5913, "grad_norm": 0.9014058113098145, "learning_rate": 0.0002, "epoch": 3.15687540348612, "step": 9780}, {"loss": 0.5496, "grad_norm": 0.6856890320777893, "learning_rate": 0.0002, "epoch": 3.16010329244674, "step": 9790}, {"loss": 0.558, "grad_norm": 0.6520644426345825, "learning_rate": 0.0002, "epoch": 3.1633311814073597, "step": 9800}, {"loss": 0.6024, "grad_norm": 0.7250499129295349, "learning_rate": 0.0002, "epoch": 3.1665590703679793, "step": 9810}, {"loss": 0.5823, "grad_norm": 0.8331542015075684, "learning_rate": 0.0002, "epoch": 3.169786959328599, "step": 9820}, {"loss": 0.5803, "grad_norm": 0.8531261682510376, "learning_rate": 0.0002, "epoch": 3.1730148482892186, "step": 9830}, {"loss": 0.57, "grad_norm": 0.8997558355331421, "learning_rate": 0.0002, "epoch": 3.1762427372498387, "step": 9840}, {"loss": 0.5921, "grad_norm": 0.708335280418396, "learning_rate": 0.0002, "epoch": 3.1794706262104584, "step": 9850}, {"loss": 0.5997, "grad_norm": 1.0074886083602905, "learning_rate": 0.0002, "epoch": 3.182698515171078, "step": 9860}, {"loss": 0.573, "grad_norm": 1.0804681777954102, "learning_rate": 0.0002, "epoch": 3.1859264041316977, "step": 9870}, {"loss": 0.5527, "grad_norm": 0.9510730504989624, "learning_rate": 0.0002, "epoch": 3.189154293092318, "step": 9880}, {"loss": 0.6401, "grad_norm": 0.7211061716079712, "learning_rate": 0.0002, "epoch": 3.1923821820529374, "step": 9890}, {"loss": 0.5563, "grad_norm": 0.8767086267471313, "learning_rate": 0.0002, "epoch": 3.195610071013557, "step": 9900}, {"loss": 0.5747, "grad_norm": 0.8388153314590454, "learning_rate": 0.0002, "epoch": 3.1988379599741767, "step": 9910}, {"loss": 0.5681, "grad_norm": 0.8038473725318909, "learning_rate": 0.0002, "epoch": 3.202065848934797, "step": 9920}, {"loss": 0.5594, "grad_norm": 0.8187747001647949, "learning_rate": 0.0002, "epoch": 3.2052937378954165, "step": 9930}, {"loss": 0.5813, "grad_norm": 0.7427355051040649, "learning_rate": 0.0002, "epoch": 3.208521626856036, "step": 9940}, {"loss": 0.5709, "grad_norm": 0.8017025589942932, "learning_rate": 0.0002, "epoch": 3.211749515816656, "step": 9950}, {"loss": 0.6106, "grad_norm": 0.738595187664032, "learning_rate": 0.0002, "epoch": 3.214977404777276, "step": 9960}, {"loss": 0.6006, "grad_norm": 0.7521342039108276, "learning_rate": 0.0002, "epoch": 3.2182052937378955, "step": 9970}, {"loss": 0.5706, "grad_norm": 0.840329110622406, "learning_rate": 0.0002, "epoch": 3.221433182698515, "step": 9980}, {"loss": 0.5666, "grad_norm": 0.9809671640396118, "learning_rate": 0.0002, "epoch": 3.224661071659135, "step": 9990}, {"loss": 0.6223, "grad_norm": 0.8456943035125732, "learning_rate": 0.0002, "epoch": 3.2278889606197545, "step": 10000}, {"loss": 0.5798, "grad_norm": 0.8962995409965515, "learning_rate": 0.0002, "epoch": 3.2311168495803746, "step": 10010}, {"loss": 0.5399, "grad_norm": 0.6492817401885986, "learning_rate": 0.0002, "epoch": 3.2343447385409942, "step": 10020}, {"loss": 0.5678, "grad_norm": 1.0471255779266357, "learning_rate": 0.0002, "epoch": 3.237572627501614, "step": 10030}, {"loss": 0.5452, "grad_norm": 0.7995471358299255, "learning_rate": 0.0002, "epoch": 3.2408005164622335, "step": 10040}, {"loss": 0.615, "grad_norm": 0.7231964468955994, "learning_rate": 0.0002, "epoch": 3.2440284054228536, "step": 10050}, {"loss": 0.5586, "grad_norm": 0.639630138874054, "learning_rate": 0.0002, "epoch": 3.2472562943834733, "step": 10060}, {"loss": 0.6271, "grad_norm": 0.7957055568695068, "learning_rate": 0.0002, "epoch": 3.250484183344093, "step": 10070}, {"loss": 0.5845, "grad_norm": 0.7735482454299927, "learning_rate": 0.0002, "epoch": 3.2537120723047126, "step": 10080}, {"loss": 0.5791, "grad_norm": 0.8139488101005554, "learning_rate": 0.0002, "epoch": 3.2569399612653323, "step": 10090}, {"loss": 0.6049, "grad_norm": 0.8113240003585815, "learning_rate": 0.0002, "epoch": 3.2601678502259523, "step": 10100}, {"loss": 0.5617, "grad_norm": 0.7735909819602966, "learning_rate": 0.0002, "epoch": 3.263395739186572, "step": 10110}, {"loss": 0.5964, "grad_norm": 0.7760744094848633, "learning_rate": 0.0002, "epoch": 3.2666236281471916, "step": 10120}, {"loss": 0.5786, "grad_norm": 0.8078505396842957, "learning_rate": 0.0002, "epoch": 3.2698515171078113, "step": 10130}, {"loss": 0.5904, "grad_norm": 0.983648955821991, "learning_rate": 0.0002, "epoch": 3.2730794060684314, "step": 10140}, {"loss": 0.596, "grad_norm": 0.7131832242012024, "learning_rate": 0.0002, "epoch": 3.276307295029051, "step": 10150}, {"loss": 0.5986, "grad_norm": 0.924493134021759, "learning_rate": 0.0002, "epoch": 3.2795351839896707, "step": 10160}, {"loss": 0.5733, "grad_norm": 0.9371112585067749, "learning_rate": 0.0002, "epoch": 3.2827630729502904, "step": 10170}, {"loss": 0.5891, "grad_norm": 0.8989261388778687, "learning_rate": 0.0002, "epoch": 3.2859909619109104, "step": 10180}, {"loss": 0.6143, "grad_norm": 0.8130394816398621, "learning_rate": 0.0002, "epoch": 3.28921885087153, "step": 10190}, {"loss": 0.5555, "grad_norm": 0.9899941086769104, "learning_rate": 0.0002, "epoch": 3.2924467398321497, "step": 10200}, {"loss": 0.5899, "grad_norm": 1.007038950920105, "learning_rate": 0.0002, "epoch": 3.2956746287927694, "step": 10210}, {"loss": 0.5713, "grad_norm": 0.7465066313743591, "learning_rate": 0.0002, "epoch": 3.2989025177533895, "step": 10220}, {"loss": 0.6307, "grad_norm": 0.7202590703964233, "learning_rate": 0.0002, "epoch": 3.302130406714009, "step": 10230}, {"loss": 0.5659, "grad_norm": 0.6258249282836914, "learning_rate": 0.0002, "epoch": 3.305358295674629, "step": 10240}, {"loss": 0.5869, "grad_norm": 0.8996058702468872, "learning_rate": 0.0002, "epoch": 3.3085861846352485, "step": 10250}, {"loss": 0.5825, "grad_norm": 0.9550982713699341, "learning_rate": 0.0002, "epoch": 3.311814073595868, "step": 10260}, {"loss": 0.5602, "grad_norm": 0.7010059952735901, "learning_rate": 0.0002, "epoch": 3.315041962556488, "step": 10270}, {"loss": 0.5853, "grad_norm": 0.9639869332313538, "learning_rate": 0.0002, "epoch": 3.318269851517108, "step": 10280}, {"loss": 0.5362, "grad_norm": 1.0192502737045288, "learning_rate": 0.0002, "epoch": 3.3214977404777275, "step": 10290}, {"loss": 0.5605, "grad_norm": 0.7953670024871826, "learning_rate": 0.0002, "epoch": 3.324725629438347, "step": 10300}, {"loss": 0.6386, "grad_norm": 0.7436774969100952, "learning_rate": 0.0002, "epoch": 3.3279535183989672, "step": 10310}, {"loss": 0.5823, "grad_norm": 0.7846777439117432, "learning_rate": 0.0002, "epoch": 3.331181407359587, "step": 10320}, {"loss": 0.6119, "grad_norm": 0.8963494896888733, "learning_rate": 0.0002, "epoch": 3.3344092963202066, "step": 10330}, {"loss": 0.5872, "grad_norm": 0.6876392364501953, "learning_rate": 0.0002, "epoch": 3.337637185280826, "step": 10340}, {"loss": 0.6291, "grad_norm": 0.9161638021469116, "learning_rate": 0.0002, "epoch": 3.340865074241446, "step": 10350}, {"loss": 0.5955, "grad_norm": 0.8964458107948303, "learning_rate": 0.0002, "epoch": 3.344092963202066, "step": 10360}, {"loss": 0.5965, "grad_norm": 0.9052296280860901, "learning_rate": 0.0002, "epoch": 3.3473208521626856, "step": 10370}, {"loss": 0.5958, "grad_norm": 0.9292596578598022, "learning_rate": 0.0002, "epoch": 3.3505487411233053, "step": 10380}, {"loss": 0.5487, "grad_norm": 0.9605957269668579, "learning_rate": 0.0002, "epoch": 3.3537766300839253, "step": 10390}, {"loss": 0.6214, "grad_norm": 1.0198872089385986, "learning_rate": 0.0002, "epoch": 3.357004519044545, "step": 10400}, {"loss": 0.6053, "grad_norm": 0.7043630480766296, "learning_rate": 0.0002, "epoch": 3.3602324080051647, "step": 10410}, {"loss": 0.5451, "grad_norm": 1.0533326864242554, "learning_rate": 0.0002, "epoch": 3.3634602969657843, "step": 10420}, {"loss": 0.6134, "grad_norm": 0.7552485466003418, "learning_rate": 0.0002, "epoch": 3.366688185926404, "step": 10430}, {"loss": 0.631, "grad_norm": 0.692708432674408, "learning_rate": 0.0002, "epoch": 3.369916074887024, "step": 10440}, {"loss": 0.631, "grad_norm": 0.985952615737915, "learning_rate": 0.0002, "epoch": 3.3731439638476437, "step": 10450}, {"loss": 0.5689, "grad_norm": 0.6749676465988159, "learning_rate": 0.0002, "epoch": 3.3763718528082634, "step": 10460}, {"loss": 0.5724, "grad_norm": 0.9514535665512085, "learning_rate": 0.0002, "epoch": 3.379599741768883, "step": 10470}, {"loss": 0.5982, "grad_norm": 1.2681142091751099, "learning_rate": 0.0002, "epoch": 3.382827630729503, "step": 10480}, {"loss": 0.5778, "grad_norm": 1.031968355178833, "learning_rate": 0.0002, "epoch": 3.3860555196901228, "step": 10490}, {"loss": 0.5964, "grad_norm": 0.8061563968658447, "learning_rate": 0.0002, "epoch": 3.3892834086507424, "step": 10500}, {"loss": 0.6094, "grad_norm": 1.0515062808990479, "learning_rate": 0.0002, "epoch": 3.392511297611362, "step": 10510}, {"loss": 0.542, "grad_norm": 0.9055540561676025, "learning_rate": 0.0002, "epoch": 3.3957391865719817, "step": 10520}, {"loss": 0.6148, "grad_norm": 0.9318141341209412, "learning_rate": 0.0002, "epoch": 3.398967075532602, "step": 10530}, {"loss": 0.5722, "grad_norm": 0.8266817331314087, "learning_rate": 0.0002, "epoch": 3.4021949644932215, "step": 10540}, {"loss": 0.6015, "grad_norm": 1.2322112321853638, "learning_rate": 0.0002, "epoch": 3.405422853453841, "step": 10550}, {"loss": 0.6215, "grad_norm": 0.9535136818885803, "learning_rate": 0.0002, "epoch": 3.4086507424144608, "step": 10560}, {"loss": 0.561, "grad_norm": 0.9243819117546082, "learning_rate": 0.0002, "epoch": 3.411878631375081, "step": 10570}, {"loss": 0.5844, "grad_norm": 0.9011809825897217, "learning_rate": 0.0002, "epoch": 3.4151065203357005, "step": 10580}, {"loss": 0.6175, "grad_norm": 0.9923036694526672, "learning_rate": 0.0002, "epoch": 3.41833440929632, "step": 10590}, {"loss": 0.6033, "grad_norm": 0.8903067111968994, "learning_rate": 0.0002, "epoch": 3.42156229825694, "step": 10600}, {"loss": 0.5563, "grad_norm": 0.7101534605026245, "learning_rate": 0.0002, "epoch": 3.42479018721756, "step": 10610}, {"loss": 0.598, "grad_norm": 0.8186570405960083, "learning_rate": 0.0002, "epoch": 3.4280180761781796, "step": 10620}, {"loss": 0.5897, "grad_norm": 0.9480205774307251, "learning_rate": 0.0002, "epoch": 3.431245965138799, "step": 10630}, {"loss": 0.5798, "grad_norm": 1.1370961666107178, "learning_rate": 0.0002, "epoch": 3.434473854099419, "step": 10640}, {"loss": 0.5779, "grad_norm": 1.017669677734375, "learning_rate": 0.0002, "epoch": 3.437701743060039, "step": 10650}, {"loss": 0.5999, "grad_norm": 0.7625100016593933, "learning_rate": 0.0002, "epoch": 3.4409296320206586, "step": 10660}, {"loss": 0.5705, "grad_norm": 0.9288196563720703, "learning_rate": 0.0002, "epoch": 3.4441575209812783, "step": 10670}, {"loss": 0.6255, "grad_norm": 0.8800460696220398, "learning_rate": 0.0002, "epoch": 3.447385409941898, "step": 10680}, {"loss": 0.6245, "grad_norm": 0.7499661445617676, "learning_rate": 0.0002, "epoch": 3.4506132989025176, "step": 10690}, {"loss": 0.5979, "grad_norm": 0.8254973292350769, "learning_rate": 0.0002, "epoch": 3.4538411878631377, "step": 10700}, {"loss": 0.5742, "grad_norm": 0.8735857605934143, "learning_rate": 0.0002, "epoch": 3.4570690768237573, "step": 10710}, {"loss": 0.6356, "grad_norm": 0.9601819515228271, "learning_rate": 0.0002, "epoch": 3.460296965784377, "step": 10720}, {"loss": 0.5574, "grad_norm": 0.8031058311462402, "learning_rate": 0.0002, "epoch": 3.4635248547449966, "step": 10730}, {"loss": 0.6078, "grad_norm": 0.8039247393608093, "learning_rate": 0.0002, "epoch": 3.4667527437056167, "step": 10740}, {"loss": 0.593, "grad_norm": 0.8936953544616699, "learning_rate": 0.0002, "epoch": 3.4699806326662364, "step": 10750}, {"loss": 0.5971, "grad_norm": 0.8201186060905457, "learning_rate": 0.0002, "epoch": 3.473208521626856, "step": 10760}, {"loss": 0.5875, "grad_norm": 1.0064148902893066, "learning_rate": 0.0002, "epoch": 3.4764364105874757, "step": 10770}, {"loss": 0.5639, "grad_norm": 0.8617483377456665, "learning_rate": 0.0002, "epoch": 3.4796642995480953, "step": 10780}, {"loss": 0.6022, "grad_norm": 0.8532096147537231, "learning_rate": 0.0002, "epoch": 3.4828921885087154, "step": 10790}, {"loss": 0.5765, "grad_norm": 0.8646879196166992, "learning_rate": 0.0002, "epoch": 3.486120077469335, "step": 10800}, {"loss": 0.5799, "grad_norm": 0.7962660789489746, "learning_rate": 0.0002, "epoch": 3.4893479664299547, "step": 10810}, {"loss": 0.5398, "grad_norm": 0.9560028314590454, "learning_rate": 0.0002, "epoch": 3.492575855390575, "step": 10820}, {"loss": 0.6082, "grad_norm": 0.928439736366272, "learning_rate": 0.0002, "epoch": 3.4958037443511945, "step": 10830}, {"loss": 0.6112, "grad_norm": 0.8219282627105713, "learning_rate": 0.0002, "epoch": 3.499031633311814, "step": 10840}, {"loss": 0.6369, "grad_norm": 0.7918338179588318, "learning_rate": 0.0002, "epoch": 3.5022595222724338, "step": 10850}, {"loss": 0.6164, "grad_norm": 0.961295485496521, "learning_rate": 0.0002, "epoch": 3.5054874112330534, "step": 10860}, {"loss": 0.5534, "grad_norm": 1.0731624364852905, "learning_rate": 0.0002, "epoch": 3.5087153001936735, "step": 10870}, {"loss": 0.5829, "grad_norm": 0.9551863074302673, "learning_rate": 0.0002, "epoch": 3.511943189154293, "step": 10880}, {"loss": 0.5746, "grad_norm": 0.8409819602966309, "learning_rate": 0.0002, "epoch": 3.515171078114913, "step": 10890}, {"loss": 0.5813, "grad_norm": 0.7546320557594299, "learning_rate": 0.0002, "epoch": 3.5183989670755325, "step": 10900}, {"loss": 0.6184, "grad_norm": 0.7505252361297607, "learning_rate": 0.0002, "epoch": 3.5216268560361526, "step": 10910}, {"loss": 0.5649, "grad_norm": 0.7505561113357544, "learning_rate": 0.0002, "epoch": 3.524854744996772, "step": 10920}, {"loss": 0.6277, "grad_norm": 1.086177945137024, "learning_rate": 0.0002, "epoch": 3.528082633957392, "step": 10930}, {"loss": 0.5983, "grad_norm": 0.7721118330955505, "learning_rate": 0.0002, "epoch": 3.5313105229180115, "step": 10940}, {"loss": 0.5919, "grad_norm": 0.9567878246307373, "learning_rate": 0.0002, "epoch": 3.534538411878631, "step": 10950}, {"loss": 0.6261, "grad_norm": 0.8377360105514526, "learning_rate": 0.0002, "epoch": 3.5377663008392513, "step": 10960}, {"loss": 0.633, "grad_norm": 1.0174858570098877, "learning_rate": 0.0002, "epoch": 3.540994189799871, "step": 10970}, {"loss": 0.599, "grad_norm": 0.8164418935775757, "learning_rate": 0.0002, "epoch": 3.5442220787604906, "step": 10980}, {"loss": 0.5471, "grad_norm": 0.8959241509437561, "learning_rate": 0.0002, "epoch": 3.5474499677211107, "step": 10990}, {"loss": 0.6195, "grad_norm": 1.0154379606246948, "learning_rate": 0.0002, "epoch": 3.5506778566817303, "step": 11000}, {"loss": 0.5835, "grad_norm": 0.7812292575836182, "learning_rate": 0.0002, "epoch": 3.55390574564235, "step": 11010}, {"loss": 0.6052, "grad_norm": 0.9849029779434204, "learning_rate": 0.0002, "epoch": 3.5571336346029696, "step": 11020}, {"loss": 0.5689, "grad_norm": 0.8826184272766113, "learning_rate": 0.0002, "epoch": 3.5603615235635893, "step": 11030}, {"loss": 0.601, "grad_norm": 0.9039685726165771, "learning_rate": 0.0002, "epoch": 3.563589412524209, "step": 11040}, {"loss": 0.5996, "grad_norm": 0.9585249423980713, "learning_rate": 0.0002, "epoch": 3.566817301484829, "step": 11050}, {"loss": 0.5714, "grad_norm": 0.8083069324493408, "learning_rate": 0.0002, "epoch": 3.5700451904454487, "step": 11060}, {"loss": 0.6317, "grad_norm": 0.9528678059577942, "learning_rate": 0.0002, "epoch": 3.5732730794060683, "step": 11070}, {"loss": 0.6278, "grad_norm": 0.8297588229179382, "learning_rate": 0.0002, "epoch": 3.5765009683666884, "step": 11080}, {"loss": 0.5919, "grad_norm": 0.8191716074943542, "learning_rate": 0.0002, "epoch": 3.579728857327308, "step": 11090}, {"loss": 0.5971, "grad_norm": 0.8056275844573975, "learning_rate": 0.0002, "epoch": 3.5829567462879277, "step": 11100}, {"loss": 0.6325, "grad_norm": 0.701930582523346, "learning_rate": 0.0002, "epoch": 3.5861846352485474, "step": 11110}, {"loss": 0.6088, "grad_norm": 0.7644643187522888, "learning_rate": 0.0002, "epoch": 3.589412524209167, "step": 11120}, {"loss": 0.605, "grad_norm": 0.668004035949707, "learning_rate": 0.0002, "epoch": 3.592640413169787, "step": 11130}, {"loss": 0.5735, "grad_norm": 0.8849539756774902, "learning_rate": 0.0002, "epoch": 3.5958683021304068, "step": 11140}, {"loss": 0.6412, "grad_norm": 0.8123571276664734, "learning_rate": 0.0002, "epoch": 3.5990961910910264, "step": 11150}, {"loss": 0.5626, "grad_norm": 0.7591469287872314, "learning_rate": 0.0002, "epoch": 3.602324080051646, "step": 11160}, {"loss": 0.5668, "grad_norm": 0.776466965675354, "learning_rate": 0.0002, "epoch": 3.605551969012266, "step": 11170}, {"loss": 0.6631, "grad_norm": 0.9156150221824646, "learning_rate": 0.0002, "epoch": 3.608779857972886, "step": 11180}, {"loss": 0.5867, "grad_norm": 0.7517618536949158, "learning_rate": 0.0002, "epoch": 3.6120077469335055, "step": 11190}, {"loss": 0.5939, "grad_norm": 0.931239128112793, "learning_rate": 0.0002, "epoch": 3.615235635894125, "step": 11200}, {"loss": 0.5736, "grad_norm": 0.9107872843742371, "learning_rate": 0.0002, "epoch": 3.6184635248547448, "step": 11210}, {"loss": 0.5665, "grad_norm": 0.7624770998954773, "learning_rate": 0.0002, "epoch": 3.621691413815365, "step": 11220}, {"loss": 0.6033, "grad_norm": 0.8129580616950989, "learning_rate": 0.0002, "epoch": 3.6249193027759845, "step": 11230}, {"loss": 0.6192, "grad_norm": 0.7339836955070496, "learning_rate": 0.0002, "epoch": 3.628147191736604, "step": 11240}, {"loss": 0.5976, "grad_norm": 0.8901296854019165, "learning_rate": 0.0002, "epoch": 3.6313750806972243, "step": 11250}, {"loss": 0.5977, "grad_norm": 1.1374726295471191, "learning_rate": 0.0002, "epoch": 3.634602969657844, "step": 11260}, {"loss": 0.5859, "grad_norm": 0.7438275218009949, "learning_rate": 0.0002, "epoch": 3.6378308586184636, "step": 11270}, {"loss": 0.5757, "grad_norm": 0.808646559715271, "learning_rate": 0.0002, "epoch": 3.641058747579083, "step": 11280}, {"loss": 0.6244, "grad_norm": 1.091810941696167, "learning_rate": 0.0002, "epoch": 3.644286636539703, "step": 11290}, {"loss": 0.5957, "grad_norm": 0.8439257144927979, "learning_rate": 0.0002, "epoch": 3.6475145255003225, "step": 11300}, {"loss": 0.6115, "grad_norm": 0.9720633029937744, "learning_rate": 0.0002, "epoch": 3.6507424144609426, "step": 11310}, {"loss": 0.5942, "grad_norm": 0.738571047782898, "learning_rate": 0.0002, "epoch": 3.6539703034215623, "step": 11320}, {"loss": 0.6029, "grad_norm": 0.6961580514907837, "learning_rate": 0.0002, "epoch": 3.657198192382182, "step": 11330}, {"loss": 0.6226, "grad_norm": 0.8192131519317627, "learning_rate": 0.0002, "epoch": 3.660426081342802, "step": 11340}, {"loss": 0.6155, "grad_norm": 0.8367205858230591, "learning_rate": 0.0002, "epoch": 3.6636539703034217, "step": 11350}, {"loss": 0.586, "grad_norm": 0.7735666632652283, "learning_rate": 0.0002, "epoch": 3.6668818592640413, "step": 11360}, {"loss": 0.6113, "grad_norm": 0.6507132649421692, "learning_rate": 0.0002, "epoch": 3.670109748224661, "step": 11370}, {"loss": 0.6273, "grad_norm": 0.8271192312240601, "learning_rate": 0.0002, "epoch": 3.6733376371852806, "step": 11380}, {"loss": 0.5995, "grad_norm": 0.8724204301834106, "learning_rate": 0.0002, "epoch": 3.6765655261459007, "step": 11390}, {"loss": 0.6131, "grad_norm": 0.8448445200920105, "learning_rate": 0.0002, "epoch": 3.6797934151065204, "step": 11400}, {"loss": 0.5923, "grad_norm": 0.6756882071495056, "learning_rate": 0.0002, "epoch": 3.68302130406714, "step": 11410}, {"loss": 0.6443, "grad_norm": 0.7859625816345215, "learning_rate": 0.0002, "epoch": 3.68624919302776, "step": 11420}, {"loss": 0.6567, "grad_norm": 0.8929487466812134, "learning_rate": 0.0002, "epoch": 3.6894770819883798, "step": 11430}, {"loss": 0.6474, "grad_norm": 0.8163391351699829, "learning_rate": 0.0002, "epoch": 3.6927049709489994, "step": 11440}, {"loss": 0.6467, "grad_norm": 0.8948464393615723, "learning_rate": 0.0002, "epoch": 3.695932859909619, "step": 11450}, {"loss": 0.624, "grad_norm": 0.8654782176017761, "learning_rate": 0.0002, "epoch": 3.6991607488702387, "step": 11460}, {"loss": 0.6142, "grad_norm": 0.9514864683151245, "learning_rate": 0.0002, "epoch": 3.7023886378308584, "step": 11470}, {"loss": 0.606, "grad_norm": 0.7298579812049866, "learning_rate": 0.0002, "epoch": 3.7056165267914785, "step": 11480}, {"loss": 0.5853, "grad_norm": 0.9266309142112732, "learning_rate": 0.0002, "epoch": 3.708844415752098, "step": 11490}, {"loss": 0.6122, "grad_norm": 0.8608686923980713, "learning_rate": 0.0002, "epoch": 3.7120723047127178, "step": 11500}, {"loss": 0.6348, "grad_norm": 0.921788215637207, "learning_rate": 0.0002, "epoch": 3.715300193673338, "step": 11510}, {"loss": 0.6191, "grad_norm": 0.8537021279335022, "learning_rate": 0.0002, "epoch": 3.7185280826339575, "step": 11520}, {"loss": 0.6228, "grad_norm": 1.115194320678711, "learning_rate": 0.0002, "epoch": 3.721755971594577, "step": 11530}, {"loss": 0.5828, "grad_norm": 0.7614817023277283, "learning_rate": 0.0002, "epoch": 3.724983860555197, "step": 11540}, {"loss": 0.5776, "grad_norm": 0.871999204158783, "learning_rate": 0.0002, "epoch": 3.7282117495158165, "step": 11550}, {"loss": 0.5962, "grad_norm": 0.9668049812316895, "learning_rate": 0.0002, "epoch": 3.7314396384764366, "step": 11560}, {"loss": 0.5534, "grad_norm": 1.2185815572738647, "learning_rate": 0.0002, "epoch": 3.734667527437056, "step": 11570}, {"loss": 0.5936, "grad_norm": 0.8258453011512756, "learning_rate": 0.0002, "epoch": 3.737895416397676, "step": 11580}, {"loss": 0.5853, "grad_norm": 0.8708966374397278, "learning_rate": 0.0002, "epoch": 3.7411233053582955, "step": 11590}, {"loss": 0.5847, "grad_norm": 0.7784267663955688, "learning_rate": 0.0002, "epoch": 3.7443511943189156, "step": 11600}, {"loss": 0.6404, "grad_norm": 0.7504425048828125, "learning_rate": 0.0002, "epoch": 3.7475790832795353, "step": 11610}, {"loss": 0.5922, "grad_norm": 0.9144526124000549, "learning_rate": 0.0002, "epoch": 3.750806972240155, "step": 11620}, {"loss": 0.6425, "grad_norm": 0.922581672668457, "learning_rate": 0.0002, "epoch": 3.7540348612007746, "step": 11630}, {"loss": 0.6402, "grad_norm": 0.9348630905151367, "learning_rate": 0.0002, "epoch": 3.757262750161394, "step": 11640}, {"loss": 0.5852, "grad_norm": 1.0740231275558472, "learning_rate": 0.0002, "epoch": 3.7604906391220143, "step": 11650}, {"loss": 0.599, "grad_norm": 0.884830117225647, "learning_rate": 0.0002, "epoch": 3.763718528082634, "step": 11660}, {"loss": 0.5991, "grad_norm": 1.0256348848342896, "learning_rate": 0.0002, "epoch": 3.7669464170432536, "step": 11670}, {"loss": 0.626, "grad_norm": 0.6795592904090881, "learning_rate": 0.0002, "epoch": 3.7701743060038737, "step": 11680}, {"loss": 0.6241, "grad_norm": 0.9381206631660461, "learning_rate": 0.0002, "epoch": 3.7734021949644934, "step": 11690}, {"loss": 0.6054, "grad_norm": 0.7633092403411865, "learning_rate": 0.0002, "epoch": 3.776630083925113, "step": 11700}, {"loss": 0.5937, "grad_norm": 0.7506213188171387, "learning_rate": 0.0002, "epoch": 3.7798579728857327, "step": 11710}, {"loss": 0.5933, "grad_norm": 0.8182913064956665, "learning_rate": 0.0002, "epoch": 3.7830858618463523, "step": 11720}, {"loss": 0.6043, "grad_norm": 1.019322156906128, "learning_rate": 0.0002, "epoch": 3.786313750806972, "step": 11730}, {"loss": 0.633, "grad_norm": 0.8895221948623657, "learning_rate": 0.0002, "epoch": 3.789541639767592, "step": 11740}, {"loss": 0.6553, "grad_norm": 0.948847770690918, "learning_rate": 0.0002, "epoch": 3.7927695287282117, "step": 11750}, {"loss": 0.6265, "grad_norm": 0.9068999886512756, "learning_rate": 0.0002, "epoch": 3.7959974176888314, "step": 11760}, {"loss": 0.6163, "grad_norm": 0.7920539975166321, "learning_rate": 0.0002, "epoch": 3.7992253066494515, "step": 11770}, {"loss": 0.5964, "grad_norm": 0.8441922068595886, "learning_rate": 0.0002, "epoch": 3.802453195610071, "step": 11780}, {"loss": 0.6379, "grad_norm": 0.9258501529693604, "learning_rate": 0.0002, "epoch": 3.8056810845706908, "step": 11790}, {"loss": 0.6379, "grad_norm": 0.7354241609573364, "learning_rate": 0.0002, "epoch": 3.8089089735313104, "step": 11800}, {"loss": 0.6177, "grad_norm": 0.9494872689247131, "learning_rate": 0.0002, "epoch": 3.81213686249193, "step": 11810}, {"loss": 0.5931, "grad_norm": 0.8266556859016418, "learning_rate": 0.0002, "epoch": 3.81536475145255, "step": 11820}, {"loss": 0.641, "grad_norm": 0.7951219081878662, "learning_rate": 0.0002, "epoch": 3.81859264041317, "step": 11830}, {"loss": 0.5767, "grad_norm": 0.7688382267951965, "learning_rate": 0.0002, "epoch": 3.8218205293737895, "step": 11840}, {"loss": 0.6117, "grad_norm": 1.0917940139770508, "learning_rate": 0.0002, "epoch": 3.8250484183344096, "step": 11850}, {"loss": 0.5857, "grad_norm": 0.9880442023277283, "learning_rate": 0.0002, "epoch": 3.828276307295029, "step": 11860}, {"loss": 0.6579, "grad_norm": 0.8433151245117188, "learning_rate": 0.0002, "epoch": 3.831504196255649, "step": 11870}, {"loss": 0.5876, "grad_norm": 0.8691204786300659, "learning_rate": 0.0002, "epoch": 3.8347320852162685, "step": 11880}, {"loss": 0.6308, "grad_norm": 0.7698143124580383, "learning_rate": 0.0002, "epoch": 3.837959974176888, "step": 11890}, {"loss": 0.6531, "grad_norm": 0.8874883651733398, "learning_rate": 0.0002, "epoch": 3.841187863137508, "step": 11900}, {"loss": 0.6242, "grad_norm": 1.1209359169006348, "learning_rate": 0.0002, "epoch": 3.844415752098128, "step": 11910}, {"loss": 0.6415, "grad_norm": 0.7723544239997864, "learning_rate": 0.0002, "epoch": 3.8476436410587476, "step": 11920}, {"loss": 0.6091, "grad_norm": 0.8363937139511108, "learning_rate": 0.0002, "epoch": 3.850871530019367, "step": 11930}, {"loss": 0.6498, "grad_norm": 0.9209707975387573, "learning_rate": 0.0002, "epoch": 3.8540994189799873, "step": 11940}, {"loss": 0.6471, "grad_norm": 0.9456894993782043, "learning_rate": 0.0002, "epoch": 3.857327307940607, "step": 11950}, {"loss": 0.6432, "grad_norm": 1.5748413801193237, "learning_rate": 0.0002, "epoch": 3.8605551969012266, "step": 11960}, {"loss": 0.6197, "grad_norm": 0.9083569049835205, "learning_rate": 0.0002, "epoch": 3.8637830858618463, "step": 11970}, {"loss": 0.6593, "grad_norm": 0.7672823071479797, "learning_rate": 0.0002, "epoch": 3.867010974822466, "step": 11980}, {"loss": 0.6238, "grad_norm": 0.8647152185440063, "learning_rate": 0.0002, "epoch": 3.870238863783086, "step": 11990}, {"loss": 0.5755, "grad_norm": 0.9564255475997925, "learning_rate": 0.0002, "epoch": 3.8734667527437057, "step": 12000}, {"loss": 0.6321, "grad_norm": 0.773267924785614, "learning_rate": 0.0002, "epoch": 3.8766946417043253, "step": 12010}, {"loss": 0.6057, "grad_norm": 0.8030173182487488, "learning_rate": 0.0002, "epoch": 3.879922530664945, "step": 12020}, {"loss": 0.6194, "grad_norm": 0.8002150058746338, "learning_rate": 0.0002, "epoch": 3.883150419625565, "step": 12030}, {"loss": 0.6194, "grad_norm": 0.98802250623703, "learning_rate": 0.0002, "epoch": 3.8863783085861847, "step": 12040}, {"loss": 0.6026, "grad_norm": 0.7868124842643738, "learning_rate": 0.0002, "epoch": 3.8896061975468044, "step": 12050}, {"loss": 0.6303, "grad_norm": 0.932182788848877, "learning_rate": 0.0002, "epoch": 3.892834086507424, "step": 12060}, {"loss": 0.5863, "grad_norm": 0.8576806783676147, "learning_rate": 0.0002, "epoch": 3.8960619754680437, "step": 12070}, {"loss": 0.6079, "grad_norm": 0.8985713124275208, "learning_rate": 0.0002, "epoch": 3.8992898644286638, "step": 12080}, {"loss": 0.6449, "grad_norm": 0.7876521944999695, "learning_rate": 0.0002, "epoch": 3.9025177533892834, "step": 12090}, {"loss": 0.5655, "grad_norm": 0.773936927318573, "learning_rate": 0.0002, "epoch": 3.905745642349903, "step": 12100}, {"loss": 0.5765, "grad_norm": 0.7274761199951172, "learning_rate": 0.0002, "epoch": 3.908973531310523, "step": 12110}, {"loss": 0.6182, "grad_norm": 0.8625598549842834, "learning_rate": 0.0002, "epoch": 3.912201420271143, "step": 12120}, {"loss": 0.5855, "grad_norm": 0.8702362179756165, "learning_rate": 0.0002, "epoch": 3.9154293092317625, "step": 12130}, {"loss": 0.6493, "grad_norm": 0.912579357624054, "learning_rate": 0.0002, "epoch": 3.918657198192382, "step": 12140}, {"loss": 0.6341, "grad_norm": 0.8697066903114319, "learning_rate": 0.0002, "epoch": 3.9218850871530018, "step": 12150}, {"loss": 0.6037, "grad_norm": 1.005232572555542, "learning_rate": 0.0002, "epoch": 3.9251129761136214, "step": 12160}, {"loss": 0.621, "grad_norm": 0.793902575969696, "learning_rate": 0.0002, "epoch": 3.9283408650742415, "step": 12170}, {"loss": 0.599, "grad_norm": 0.7025905847549438, "learning_rate": 0.0002, "epoch": 3.931568754034861, "step": 12180}, {"loss": 0.6421, "grad_norm": 0.97635817527771, "learning_rate": 0.0002, "epoch": 3.934796642995481, "step": 12190}, {"loss": 0.6416, "grad_norm": 0.855417013168335, "learning_rate": 0.0002, "epoch": 3.938024531956101, "step": 12200}, {"loss": 0.5979, "grad_norm": 0.8841291666030884, "learning_rate": 0.0002, "epoch": 3.9412524209167206, "step": 12210}, {"loss": 0.5666, "grad_norm": 1.1762064695358276, "learning_rate": 0.0002, "epoch": 3.94448030987734, "step": 12220}, {"loss": 0.586, "grad_norm": 0.8393193483352661, "learning_rate": 0.0002, "epoch": 3.94770819883796, "step": 12230}, {"loss": 0.5738, "grad_norm": 0.9324905276298523, "learning_rate": 0.0002, "epoch": 3.9509360877985795, "step": 12240}, {"loss": 0.5954, "grad_norm": 0.8607982993125916, "learning_rate": 0.0002, "epoch": 3.9541639767591996, "step": 12250}, {"loss": 0.6277, "grad_norm": 0.8586681485176086, "learning_rate": 0.0002, "epoch": 3.9573918657198193, "step": 12260}, {"loss": 0.5841, "grad_norm": 1.1082909107208252, "learning_rate": 0.0002, "epoch": 3.960619754680439, "step": 12270}, {"loss": 0.6231, "grad_norm": 1.065027117729187, "learning_rate": 0.0002, "epoch": 3.963847643641059, "step": 12280}, {"loss": 0.5996, "grad_norm": 0.9544363021850586, "learning_rate": 0.0002, "epoch": 3.9670755326016787, "step": 12290}, {"loss": 0.6301, "grad_norm": 0.9008927345275879, "learning_rate": 0.0002, "epoch": 3.9703034215622983, "step": 12300}, {"loss": 0.6108, "grad_norm": 0.8717467188835144, "learning_rate": 0.0002, "epoch": 3.973531310522918, "step": 12310}, {"loss": 0.6465, "grad_norm": 0.9718339443206787, "learning_rate": 0.0002, "epoch": 3.9767591994835376, "step": 12320}, {"loss": 0.603, "grad_norm": 1.0362015962600708, "learning_rate": 0.0002, "epoch": 3.9799870884441573, "step": 12330}, {"loss": 0.6229, "grad_norm": 1.0844318866729736, "learning_rate": 0.0002, "epoch": 3.9832149774047774, "step": 12340}, {"loss": 0.6777, "grad_norm": 0.7506240606307983, "learning_rate": 0.0002, "epoch": 3.986442866365397, "step": 12350}, {"loss": 0.6076, "grad_norm": 1.005982756614685, "learning_rate": 0.0002, "epoch": 3.9896707553260167, "step": 12360}, {"loss": 0.5926, "grad_norm": 0.7566431164741516, "learning_rate": 0.0002, "epoch": 3.9928986442866368, "step": 12370}, {"loss": 0.653, "grad_norm": 0.8819181323051453, "learning_rate": 0.0002, "epoch": 3.9961265332472564, "step": 12380}, {"loss": 0.6197, "grad_norm": 0.884497880935669, "learning_rate": 0.0002, "epoch": 3.999354422207876, "step": 12390}]} +{"epoch": 5.0, "step": 15490, "epoch_duration": 11818.198820114136, "total_accumulated_duration": 55850.142813920975, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.593, "grad_norm": 0.7092075347900391, "learning_rate": 0.0002, "epoch": 0.0032278889606197547, "step": 10}, {"loss": 1.0956, "grad_norm": 0.6900479793548584, "learning_rate": 0.0002, "epoch": 0.006455777921239509, "step": 20}, {"loss": 0.9807, "grad_norm": 0.6788288950920105, "learning_rate": 0.0002, "epoch": 0.009683666881859263, "step": 30}, {"loss": 0.9385, "grad_norm": 0.5590243339538574, "learning_rate": 0.0002, "epoch": 0.012911555842479019, "step": 40}, {"loss": 0.931, "grad_norm": 0.5136010646820068, "learning_rate": 0.0002, "epoch": 0.016139444803098774, "step": 50}, {"loss": 0.8896, "grad_norm": 0.45298320055007935, "learning_rate": 0.0002, "epoch": 0.019367333763718526, "step": 60}, {"loss": 0.9184, "grad_norm": 0.5917162299156189, "learning_rate": 0.0002, "epoch": 0.022595222724338282, "step": 70}, {"loss": 0.8705, "grad_norm": 0.4414856433868408, "learning_rate": 0.0002, "epoch": 0.025823111684958037, "step": 80}, {"loss": 0.8419, "grad_norm": 0.5547978281974792, "learning_rate": 0.0002, "epoch": 0.029051000645577793, "step": 90}, {"loss": 0.8987, "grad_norm": 0.5271288156509399, "learning_rate": 0.0002, "epoch": 0.03227888960619755, "step": 100}, {"loss": 0.8543, "grad_norm": 0.5506119728088379, "learning_rate": 0.0002, "epoch": 0.035506778566817304, "step": 110}, {"loss": 0.8373, "grad_norm": 0.5579327940940857, "learning_rate": 0.0002, "epoch": 0.03873466752743705, "step": 120}, {"loss": 0.8826, "grad_norm": 0.5099632740020752, "learning_rate": 0.0002, "epoch": 0.04196255648805681, "step": 130}, {"loss": 0.9239, "grad_norm": 0.40396833419799805, "learning_rate": 0.0002, "epoch": 0.045190445448676564, "step": 140}, {"loss": 0.846, "grad_norm": 0.5008092522621155, "learning_rate": 0.0002, "epoch": 0.04841833440929632, "step": 150}, {"loss": 0.8564, "grad_norm": 0.4388776421546936, "learning_rate": 0.0002, "epoch": 0.051646223369916075, "step": 160}, {"loss": 0.8829, "grad_norm": 0.44138944149017334, "learning_rate": 0.0002, "epoch": 0.05487411233053583, "step": 170}, {"loss": 0.8061, "grad_norm": 0.358484148979187, "learning_rate": 0.0002, "epoch": 0.058102001291155586, "step": 180}, {"loss": 0.8956, "grad_norm": 0.457052081823349, "learning_rate": 0.0002, "epoch": 0.06132989025177534, "step": 190}, {"loss": 0.9138, "grad_norm": 0.5537622570991516, "learning_rate": 0.0002, "epoch": 0.0645577792123951, "step": 200}, {"loss": 0.8701, "grad_norm": 0.552631676197052, "learning_rate": 0.0002, "epoch": 0.06778566817301485, "step": 210}, {"loss": 0.8854, "grad_norm": 0.4414575397968292, "learning_rate": 0.0002, "epoch": 0.07101355713363461, "step": 220}, {"loss": 0.8581, "grad_norm": 0.4996664226055145, "learning_rate": 0.0002, "epoch": 0.07424144609425436, "step": 230}, {"loss": 0.8675, "grad_norm": 0.7321897149085999, "learning_rate": 0.0002, "epoch": 0.0774693350548741, "step": 240}, {"loss": 0.8848, "grad_norm": 0.4553901255130768, "learning_rate": 0.0002, "epoch": 0.08069722401549387, "step": 250}, {"loss": 0.868, "grad_norm": 0.5039054751396179, "learning_rate": 0.0002, "epoch": 0.08392511297611362, "step": 260}, {"loss": 0.8317, "grad_norm": 0.4113094210624695, "learning_rate": 0.0002, "epoch": 0.08715300193673338, "step": 270}, {"loss": 0.8074, "grad_norm": 0.450436532497406, "learning_rate": 0.0002, "epoch": 0.09038089089735313, "step": 280}, {"loss": 0.8105, "grad_norm": 0.4548024535179138, "learning_rate": 0.0002, "epoch": 0.09360877985797289, "step": 290}, {"loss": 0.8325, "grad_norm": 0.4932962656021118, "learning_rate": 0.0002, "epoch": 0.09683666881859264, "step": 300}, {"loss": 0.8105, "grad_norm": 0.4005250334739685, "learning_rate": 0.0002, "epoch": 0.1000645577792124, "step": 310}, {"loss": 0.8083, "grad_norm": 1.8321624994277954, "learning_rate": 0.0002, "epoch": 0.10329244673983215, "step": 320}, {"loss": 0.8411, "grad_norm": 0.45815610885620117, "learning_rate": 0.0002, "epoch": 0.1065203357004519, "step": 330}, {"loss": 0.857, "grad_norm": 0.39324095845222473, "learning_rate": 0.0002, "epoch": 0.10974822466107166, "step": 340}, {"loss": 0.8258, "grad_norm": 0.546273946762085, "learning_rate": 0.0002, "epoch": 0.11297611362169141, "step": 350}, {"loss": 0.882, "grad_norm": 0.497448593378067, "learning_rate": 0.0002, "epoch": 0.11620400258231117, "step": 360}, {"loss": 0.7608, "grad_norm": 0.37508800625801086, "learning_rate": 0.0002, "epoch": 0.11943189154293092, "step": 370}, {"loss": 0.852, "grad_norm": 0.45849609375, "learning_rate": 0.0002, "epoch": 0.12265978050355068, "step": 380}, {"loss": 0.8437, "grad_norm": 0.5488408803939819, "learning_rate": 0.0002, "epoch": 0.12588766946417043, "step": 390}, {"loss": 0.8349, "grad_norm": 0.4477061331272125, "learning_rate": 0.0002, "epoch": 0.1291155584247902, "step": 400}, {"loss": 0.8306, "grad_norm": 0.39227980375289917, "learning_rate": 0.0002, "epoch": 0.13234344738540993, "step": 410}, {"loss": 0.7933, "grad_norm": 0.3922233581542969, "learning_rate": 0.0002, "epoch": 0.1355713363460297, "step": 420}, {"loss": 0.8134, "grad_norm": 0.42901909351348877, "learning_rate": 0.0002, "epoch": 0.13879922530664945, "step": 430}, {"loss": 0.8271, "grad_norm": 0.4217798709869385, "learning_rate": 0.0002, "epoch": 0.14202711426726922, "step": 440}, {"loss": 0.8594, "grad_norm": 0.43470677733421326, "learning_rate": 0.0002, "epoch": 0.14525500322788895, "step": 450}, {"loss": 0.8106, "grad_norm": 0.5324403047561646, "learning_rate": 0.0002, "epoch": 0.1484828921885087, "step": 460}, {"loss": 0.8729, "grad_norm": 0.3999756872653961, "learning_rate": 0.0002, "epoch": 0.15171078114912848, "step": 470}, {"loss": 0.7702, "grad_norm": 0.404933363199234, "learning_rate": 0.0002, "epoch": 0.1549386701097482, "step": 480}, {"loss": 0.8151, "grad_norm": 0.44122636318206787, "learning_rate": 0.0002, "epoch": 0.15816655907036797, "step": 490}, {"loss": 0.8457, "grad_norm": 0.510166347026825, "learning_rate": 0.0002, "epoch": 0.16139444803098774, "step": 500}, {"loss": 0.8692, "grad_norm": 0.4549732506275177, "learning_rate": 0.0002, "epoch": 0.1646223369916075, "step": 510}, {"loss": 0.8466, "grad_norm": 0.5148182511329651, "learning_rate": 0.0002, "epoch": 0.16785022595222723, "step": 520}, {"loss": 0.8317, "grad_norm": 0.3596806824207306, "learning_rate": 0.0002, "epoch": 0.171078114912847, "step": 530}, {"loss": 0.844, "grad_norm": 0.4388909339904785, "learning_rate": 0.0002, "epoch": 0.17430600387346676, "step": 540}, {"loss": 0.8322, "grad_norm": 0.5052742958068848, "learning_rate": 0.0002, "epoch": 0.17753389283408652, "step": 550}, {"loss": 0.791, "grad_norm": 0.48248958587646484, "learning_rate": 0.0002, "epoch": 0.18076178179470626, "step": 560}, {"loss": 0.8593, "grad_norm": 0.5360197424888611, "learning_rate": 0.0002, "epoch": 0.18398967075532602, "step": 570}, {"loss": 0.817, "grad_norm": 0.43999341130256653, "learning_rate": 0.0002, "epoch": 0.18721755971594578, "step": 580}, {"loss": 0.8311, "grad_norm": 0.3685208261013031, "learning_rate": 0.0002, "epoch": 0.19044544867656552, "step": 590}, {"loss": 0.8341, "grad_norm": 0.4601275622844696, "learning_rate": 0.0002, "epoch": 0.19367333763718528, "step": 600}, {"loss": 0.8483, "grad_norm": 0.4778369665145874, "learning_rate": 0.0002, "epoch": 0.19690122659780504, "step": 610}, {"loss": 0.8653, "grad_norm": 0.4867003560066223, "learning_rate": 0.0002, "epoch": 0.2001291155584248, "step": 620}, {"loss": 0.8554, "grad_norm": 0.4583742916584015, "learning_rate": 0.0002, "epoch": 0.20335700451904454, "step": 630}, {"loss": 0.8698, "grad_norm": 0.47958165407180786, "learning_rate": 0.0002, "epoch": 0.2065848934796643, "step": 640}, {"loss": 0.8213, "grad_norm": 0.4526064097881317, "learning_rate": 0.0002, "epoch": 0.20981278244028406, "step": 650}, {"loss": 0.8313, "grad_norm": 0.45890581607818604, "learning_rate": 0.0002, "epoch": 0.2130406714009038, "step": 660}, {"loss": 0.8143, "grad_norm": 0.42725905776023865, "learning_rate": 0.0002, "epoch": 0.21626856036152356, "step": 670}, {"loss": 0.8675, "grad_norm": 0.40380963683128357, "learning_rate": 0.0002, "epoch": 0.21949644932214332, "step": 680}, {"loss": 0.9004, "grad_norm": 0.4372998774051666, "learning_rate": 0.0002, "epoch": 0.22272433828276308, "step": 690}, {"loss": 0.8208, "grad_norm": 0.4245864450931549, "learning_rate": 0.0002, "epoch": 0.22595222724338282, "step": 700}, {"loss": 0.8564, "grad_norm": 0.4061129689216614, "learning_rate": 0.0002, "epoch": 0.22918011620400258, "step": 710}, {"loss": 0.8275, "grad_norm": 0.474454790353775, "learning_rate": 0.0002, "epoch": 0.23240800516462234, "step": 720}, {"loss": 0.8346, "grad_norm": 0.4908486008644104, "learning_rate": 0.0002, "epoch": 0.23563589412524208, "step": 730}, {"loss": 0.8755, "grad_norm": 0.4284191429615021, "learning_rate": 0.0002, "epoch": 0.23886378308586184, "step": 740}, {"loss": 0.8387, "grad_norm": 0.44730308651924133, "learning_rate": 0.0002, "epoch": 0.2420916720464816, "step": 750}, {"loss": 0.8135, "grad_norm": 0.4433246850967407, "learning_rate": 0.0002, "epoch": 0.24531956100710137, "step": 760}, {"loss": 0.8644, "grad_norm": 0.43668854236602783, "learning_rate": 0.0002, "epoch": 0.2485474499677211, "step": 770}, {"loss": 0.8025, "grad_norm": 0.34324130415916443, "learning_rate": 0.0002, "epoch": 0.25177533892834086, "step": 780}, {"loss": 0.8725, "grad_norm": 0.46476295590400696, "learning_rate": 0.0002, "epoch": 0.2550032278889606, "step": 790}, {"loss": 0.8157, "grad_norm": 0.5047039985656738, "learning_rate": 0.0002, "epoch": 0.2582311168495804, "step": 800}, {"loss": 0.8643, "grad_norm": 0.4402127265930176, "learning_rate": 0.0002, "epoch": 0.26145900581020015, "step": 810}, {"loss": 0.8025, "grad_norm": 0.4642465114593506, "learning_rate": 0.0002, "epoch": 0.26468689477081986, "step": 820}, {"loss": 0.8836, "grad_norm": 0.40093424916267395, "learning_rate": 0.0002, "epoch": 0.2679147837314396, "step": 830}, {"loss": 0.83, "grad_norm": 0.42501842975616455, "learning_rate": 0.0002, "epoch": 0.2711426726920594, "step": 840}, {"loss": 0.8573, "grad_norm": 0.43279722332954407, "learning_rate": 0.0002, "epoch": 0.27437056165267915, "step": 850}, {"loss": 0.817, "grad_norm": 0.5991243720054626, "learning_rate": 0.0002, "epoch": 0.2775984506132989, "step": 860}, {"loss": 0.7981, "grad_norm": 0.4217848777770996, "learning_rate": 0.0002, "epoch": 0.28082633957391867, "step": 870}, {"loss": 0.8135, "grad_norm": 0.3933536410331726, "learning_rate": 0.0002, "epoch": 0.28405422853453843, "step": 880}, {"loss": 0.8846, "grad_norm": 0.5868505239486694, "learning_rate": 0.0002, "epoch": 0.28728211749515814, "step": 890}, {"loss": 0.8759, "grad_norm": 0.5209547877311707, "learning_rate": 0.0002, "epoch": 0.2905100064557779, "step": 900}, {"loss": 0.815, "grad_norm": 0.49307361245155334, "learning_rate": 0.0002, "epoch": 0.29373789541639767, "step": 910}, {"loss": 0.7813, "grad_norm": 0.4288382828235626, "learning_rate": 0.0002, "epoch": 0.2969657843770174, "step": 920}, {"loss": 0.8431, "grad_norm": 0.33568474650382996, "learning_rate": 0.0002, "epoch": 0.3001936733376372, "step": 930}, {"loss": 0.8455, "grad_norm": 1.0915930271148682, "learning_rate": 0.0002, "epoch": 0.30342156229825695, "step": 940}, {"loss": 0.8535, "grad_norm": 0.5489798188209534, "learning_rate": 0.0002, "epoch": 0.3066494512588767, "step": 950}, {"loss": 0.8031, "grad_norm": 0.42971742153167725, "learning_rate": 0.0002, "epoch": 0.3098773402194964, "step": 960}, {"loss": 0.8253, "grad_norm": 0.43375834822654724, "learning_rate": 0.0002, "epoch": 0.3131052291801162, "step": 970}, {"loss": 0.7747, "grad_norm": 0.47488611936569214, "learning_rate": 0.0002, "epoch": 0.31633311814073595, "step": 980}, {"loss": 0.7906, "grad_norm": 0.46296775341033936, "learning_rate": 0.0002, "epoch": 0.3195610071013557, "step": 990}, {"loss": 0.7948, "grad_norm": 0.4548890292644501, "learning_rate": 0.0002, "epoch": 0.32278889606197547, "step": 1000}, {"loss": 0.8856, "grad_norm": 0.41834497451782227, "learning_rate": 0.0002, "epoch": 0.32601678502259523, "step": 1010}, {"loss": 0.7791, "grad_norm": 0.441092312335968, "learning_rate": 0.0002, "epoch": 0.329244673983215, "step": 1020}, {"loss": 0.8191, "grad_norm": 0.637322187423706, "learning_rate": 0.0002, "epoch": 0.33247256294383476, "step": 1030}, {"loss": 0.8685, "grad_norm": 0.4374958574771881, "learning_rate": 0.0002, "epoch": 0.33570045190445447, "step": 1040}, {"loss": 0.8423, "grad_norm": 0.3935825824737549, "learning_rate": 0.0002, "epoch": 0.33892834086507423, "step": 1050}, {"loss": 0.8287, "grad_norm": 0.43526220321655273, "learning_rate": 0.0002, "epoch": 0.342156229825694, "step": 1060}, {"loss": 0.8413, "grad_norm": 0.45327696204185486, "learning_rate": 0.0002, "epoch": 0.34538411878631375, "step": 1070}, {"loss": 0.7421, "grad_norm": 0.4126075506210327, "learning_rate": 0.0002, "epoch": 0.3486120077469335, "step": 1080}, {"loss": 0.8427, "grad_norm": 0.4714072048664093, "learning_rate": 0.0002, "epoch": 0.3518398967075533, "step": 1090}, {"loss": 0.8028, "grad_norm": 0.518127977848053, "learning_rate": 0.0002, "epoch": 0.35506778566817304, "step": 1100}, {"loss": 0.8479, "grad_norm": 0.43264099955558777, "learning_rate": 0.0002, "epoch": 0.35829567462879275, "step": 1110}, {"loss": 0.8724, "grad_norm": 0.4857400357723236, "learning_rate": 0.0002, "epoch": 0.3615235635894125, "step": 1120}, {"loss": 0.7735, "grad_norm": 0.37591469287872314, "learning_rate": 0.0002, "epoch": 0.3647514525500323, "step": 1130}, {"loss": 0.8531, "grad_norm": 0.4165478050708771, "learning_rate": 0.0002, "epoch": 0.36797934151065204, "step": 1140}, {"loss": 0.8151, "grad_norm": 0.42911383509635925, "learning_rate": 0.0002, "epoch": 0.3712072304712718, "step": 1150}, {"loss": 0.8722, "grad_norm": 0.44980287551879883, "learning_rate": 0.0002, "epoch": 0.37443511943189156, "step": 1160}, {"loss": 0.7961, "grad_norm": 0.4066573679447174, "learning_rate": 0.0002, "epoch": 0.3776630083925113, "step": 1170}, {"loss": 0.8317, "grad_norm": 0.5056195855140686, "learning_rate": 0.0002, "epoch": 0.38089089735313103, "step": 1180}, {"loss": 0.8387, "grad_norm": 0.4141536355018616, "learning_rate": 0.0002, "epoch": 0.3841187863137508, "step": 1190}, {"loss": 0.8019, "grad_norm": 0.4501924514770508, "learning_rate": 0.0002, "epoch": 0.38734667527437056, "step": 1200}, {"loss": 0.8528, "grad_norm": 0.43304240703582764, "learning_rate": 0.0002, "epoch": 0.3905745642349903, "step": 1210}, {"loss": 0.8905, "grad_norm": 0.475777804851532, "learning_rate": 0.0002, "epoch": 0.3938024531956101, "step": 1220}, {"loss": 0.8643, "grad_norm": 0.5846465826034546, "learning_rate": 0.0002, "epoch": 0.39703034215622984, "step": 1230}, {"loss": 0.8078, "grad_norm": 0.42899325489997864, "learning_rate": 0.0002, "epoch": 0.4002582311168496, "step": 1240}, {"loss": 0.8415, "grad_norm": 0.3980463147163391, "learning_rate": 0.0002, "epoch": 0.4034861200774693, "step": 1250}, {"loss": 0.8026, "grad_norm": 0.45769768953323364, "learning_rate": 0.0002, "epoch": 0.4067140090380891, "step": 1260}, {"loss": 0.8377, "grad_norm": 0.5101280212402344, "learning_rate": 0.0002, "epoch": 0.40994189799870884, "step": 1270}, {"loss": 0.7905, "grad_norm": 0.47374317049980164, "learning_rate": 0.0002, "epoch": 0.4131697869593286, "step": 1280}, {"loss": 0.8172, "grad_norm": 0.4261878728866577, "learning_rate": 0.0002, "epoch": 0.41639767591994836, "step": 1290}, {"loss": 0.9004, "grad_norm": 0.46954256296157837, "learning_rate": 0.0002, "epoch": 0.4196255648805681, "step": 1300}, {"loss": 0.7868, "grad_norm": 0.5205738544464111, "learning_rate": 0.0002, "epoch": 0.4228534538411879, "step": 1310}, {"loss": 0.8964, "grad_norm": 0.5176340937614441, "learning_rate": 0.0002, "epoch": 0.4260813428018076, "step": 1320}, {"loss": 0.8764, "grad_norm": 0.5155916810035706, "learning_rate": 0.0002, "epoch": 0.42930923176242736, "step": 1330}, {"loss": 0.8197, "grad_norm": 0.44548553228378296, "learning_rate": 0.0002, "epoch": 0.4325371207230471, "step": 1340}, {"loss": 0.7873, "grad_norm": 0.5633558630943298, "learning_rate": 0.0002, "epoch": 0.4357650096836669, "step": 1350}, {"loss": 0.7889, "grad_norm": 0.42444056272506714, "learning_rate": 0.0002, "epoch": 0.43899289864428664, "step": 1360}, {"loss": 0.8588, "grad_norm": 0.5226860642433167, "learning_rate": 0.0002, "epoch": 0.4422207876049064, "step": 1370}, {"loss": 0.8232, "grad_norm": 0.5354582071304321, "learning_rate": 0.0002, "epoch": 0.44544867656552617, "step": 1380}, {"loss": 0.816, "grad_norm": 0.472646564245224, "learning_rate": 0.0002, "epoch": 0.4486765655261459, "step": 1390}, {"loss": 0.7953, "grad_norm": 0.6312310099601746, "learning_rate": 0.0002, "epoch": 0.45190445448676564, "step": 1400}, {"loss": 0.8212, "grad_norm": 0.4298408031463623, "learning_rate": 0.0002, "epoch": 0.4551323434473854, "step": 1410}, {"loss": 0.8447, "grad_norm": 0.43427202105522156, "learning_rate": 0.0002, "epoch": 0.45836023240800516, "step": 1420}, {"loss": 0.8342, "grad_norm": 0.44097861647605896, "learning_rate": 0.0002, "epoch": 0.4615881213686249, "step": 1430}, {"loss": 0.8301, "grad_norm": 0.5142693519592285, "learning_rate": 0.0002, "epoch": 0.4648160103292447, "step": 1440}, {"loss": 0.8144, "grad_norm": 0.46416547894477844, "learning_rate": 0.0002, "epoch": 0.46804389928986445, "step": 1450}, {"loss": 0.8342, "grad_norm": 0.4858551025390625, "learning_rate": 0.0002, "epoch": 0.47127178825048416, "step": 1460}, {"loss": 0.8354, "grad_norm": 0.4709177315235138, "learning_rate": 0.0002, "epoch": 0.4744996772111039, "step": 1470}, {"loss": 0.8391, "grad_norm": 0.5500252842903137, "learning_rate": 0.0002, "epoch": 0.4777275661717237, "step": 1480}, {"loss": 0.8359, "grad_norm": 0.43364381790161133, "learning_rate": 0.0002, "epoch": 0.48095545513234345, "step": 1490}, {"loss": 0.8446, "grad_norm": 0.47712287306785583, "learning_rate": 0.0002, "epoch": 0.4841833440929632, "step": 1500}, {"loss": 0.8518, "grad_norm": 0.4518495202064514, "learning_rate": 0.0002, "epoch": 0.48741123305358297, "step": 1510}, {"loss": 0.819, "grad_norm": 0.4539008140563965, "learning_rate": 0.0002, "epoch": 0.49063912201420273, "step": 1520}, {"loss": 0.8276, "grad_norm": 0.4993067979812622, "learning_rate": 0.0002, "epoch": 0.49386701097482244, "step": 1530}, {"loss": 0.8297, "grad_norm": 0.6094803214073181, "learning_rate": 0.0002, "epoch": 0.4970948999354422, "step": 1540}, {"loss": 0.8263, "grad_norm": 0.48602527379989624, "learning_rate": 0.0002, "epoch": 0.500322788896062, "step": 1550}, {"loss": 0.8182, "grad_norm": 0.40245795249938965, "learning_rate": 0.0002, "epoch": 0.5035506778566817, "step": 1560}, {"loss": 0.7907, "grad_norm": 0.456787645816803, "learning_rate": 0.0002, "epoch": 0.5067785668173015, "step": 1570}, {"loss": 0.86, "grad_norm": 0.43936216831207275, "learning_rate": 0.0002, "epoch": 0.5100064557779213, "step": 1580}, {"loss": 0.7928, "grad_norm": 0.549018144607544, "learning_rate": 0.0002, "epoch": 0.513234344738541, "step": 1590}, {"loss": 0.8169, "grad_norm": 0.41746795177459717, "learning_rate": 0.0002, "epoch": 0.5164622336991608, "step": 1600}, {"loss": 0.7868, "grad_norm": 0.4217053949832916, "learning_rate": 0.0002, "epoch": 0.5196901226597805, "step": 1610}, {"loss": 0.8161, "grad_norm": 0.449913889169693, "learning_rate": 0.0002, "epoch": 0.5229180116204003, "step": 1620}, {"loss": 0.7938, "grad_norm": 0.5084872245788574, "learning_rate": 0.0002, "epoch": 0.5261459005810201, "step": 1630}, {"loss": 0.8295, "grad_norm": 0.46248653531074524, "learning_rate": 0.0002, "epoch": 0.5293737895416397, "step": 1640}, {"loss": 0.7993, "grad_norm": 0.4824236035346985, "learning_rate": 0.0002, "epoch": 0.5326016785022595, "step": 1650}, {"loss": 0.8711, "grad_norm": 0.6010985374450684, "learning_rate": 0.0002, "epoch": 0.5358295674628792, "step": 1660}, {"loss": 0.8266, "grad_norm": 0.4757920801639557, "learning_rate": 0.0002, "epoch": 0.539057456423499, "step": 1670}, {"loss": 0.8182, "grad_norm": 0.45161882042884827, "learning_rate": 0.0002, "epoch": 0.5422853453841188, "step": 1680}, {"loss": 0.8141, "grad_norm": 0.49314990639686584, "learning_rate": 0.0002, "epoch": 0.5455132343447385, "step": 1690}, {"loss": 0.8091, "grad_norm": 0.3918305039405823, "learning_rate": 0.0002, "epoch": 0.5487411233053583, "step": 1700}, {"loss": 0.8177, "grad_norm": 0.5966728925704956, "learning_rate": 0.0002, "epoch": 0.551969012265978, "step": 1710}, {"loss": 0.8438, "grad_norm": 0.4208986163139343, "learning_rate": 0.0002, "epoch": 0.5551969012265978, "step": 1720}, {"loss": 0.817, "grad_norm": 0.43724218010902405, "learning_rate": 0.0002, "epoch": 0.5584247901872176, "step": 1730}, {"loss": 0.7956, "grad_norm": 0.5287272930145264, "learning_rate": 0.0002, "epoch": 0.5616526791478373, "step": 1740}, {"loss": 0.8557, "grad_norm": 0.4961899518966675, "learning_rate": 0.0002, "epoch": 0.5648805681084571, "step": 1750}, {"loss": 0.8029, "grad_norm": 0.4468635320663452, "learning_rate": 0.0002, "epoch": 0.5681084570690769, "step": 1760}, {"loss": 0.7968, "grad_norm": 0.6423530578613281, "learning_rate": 0.0002, "epoch": 0.5713363460296966, "step": 1770}, {"loss": 0.8324, "grad_norm": 0.4601971507072449, "learning_rate": 0.0002, "epoch": 0.5745642349903163, "step": 1780}, {"loss": 0.8171, "grad_norm": 0.46514901518821716, "learning_rate": 0.0002, "epoch": 0.577792123950936, "step": 1790}, {"loss": 0.8186, "grad_norm": 0.4771687388420105, "learning_rate": 0.0002, "epoch": 0.5810200129115558, "step": 1800}, {"loss": 0.856, "grad_norm": 0.46514490246772766, "learning_rate": 0.0002, "epoch": 0.5842479018721756, "step": 1810}, {"loss": 0.84, "grad_norm": 0.5373936295509338, "learning_rate": 0.0002, "epoch": 0.5874757908327953, "step": 1820}, {"loss": 0.8456, "grad_norm": 0.5175791382789612, "learning_rate": 0.0002, "epoch": 0.5907036797934151, "step": 1830}, {"loss": 0.7957, "grad_norm": 0.4522802233695984, "learning_rate": 0.0002, "epoch": 0.5939315687540349, "step": 1840}, {"loss": 0.8633, "grad_norm": 0.42987772822380066, "learning_rate": 0.0002, "epoch": 0.5971594577146546, "step": 1850}, {"loss": 0.7871, "grad_norm": 0.5566838383674622, "learning_rate": 0.0002, "epoch": 0.6003873466752744, "step": 1860}, {"loss": 0.8312, "grad_norm": 0.42807698249816895, "learning_rate": 0.0002, "epoch": 0.6036152356358941, "step": 1870}, {"loss": 0.8035, "grad_norm": 0.4957767724990845, "learning_rate": 0.0002, "epoch": 0.6068431245965139, "step": 1880}, {"loss": 0.8145, "grad_norm": 0.4260980188846588, "learning_rate": 0.0002, "epoch": 0.6100710135571337, "step": 1890}, {"loss": 0.8363, "grad_norm": 0.4777357876300812, "learning_rate": 0.0002, "epoch": 0.6132989025177534, "step": 1900}, {"loss": 0.8404, "grad_norm": 0.4434216022491455, "learning_rate": 0.0002, "epoch": 0.6165267914783732, "step": 1910}, {"loss": 0.8057, "grad_norm": 0.5215433835983276, "learning_rate": 0.0002, "epoch": 0.6197546804389928, "step": 1920}, {"loss": 0.82, "grad_norm": 0.5143248438835144, "learning_rate": 0.0002, "epoch": 0.6229825693996126, "step": 1930}, {"loss": 0.8107, "grad_norm": 0.5213413238525391, "learning_rate": 0.0002, "epoch": 0.6262104583602324, "step": 1940}, {"loss": 0.7549, "grad_norm": 0.5408226251602173, "learning_rate": 0.0002, "epoch": 0.6294383473208521, "step": 1950}, {"loss": 0.8405, "grad_norm": 0.5479708909988403, "learning_rate": 0.0002, "epoch": 0.6326662362814719, "step": 1960}, {"loss": 0.8138, "grad_norm": 0.4490949809551239, "learning_rate": 0.0002, "epoch": 0.6358941252420917, "step": 1970}, {"loss": 0.854, "grad_norm": 0.48815059661865234, "learning_rate": 0.0002, "epoch": 0.6391220142027114, "step": 1980}, {"loss": 0.8568, "grad_norm": 0.46498045325279236, "learning_rate": 0.0002, "epoch": 0.6423499031633312, "step": 1990}, {"loss": 0.8263, "grad_norm": 0.5136561393737793, "learning_rate": 0.0002, "epoch": 0.6455777921239509, "step": 2000}, {"loss": 0.8503, "grad_norm": 0.5145719647407532, "learning_rate": 0.0002, "epoch": 0.6488056810845707, "step": 2010}, {"loss": 0.8456, "grad_norm": 0.5430373549461365, "learning_rate": 0.0002, "epoch": 0.6520335700451905, "step": 2020}, {"loss": 0.8115, "grad_norm": 0.46347954869270325, "learning_rate": 0.0002, "epoch": 0.6552614590058102, "step": 2030}, {"loss": 0.8769, "grad_norm": 0.5189562439918518, "learning_rate": 0.0002, "epoch": 0.65848934796643, "step": 2040}, {"loss": 0.8453, "grad_norm": 0.43843990564346313, "learning_rate": 0.0002, "epoch": 0.6617172369270498, "step": 2050}, {"loss": 0.7951, "grad_norm": 0.4654983580112457, "learning_rate": 0.0002, "epoch": 0.6649451258876695, "step": 2060}, {"loss": 0.8308, "grad_norm": 0.44835716485977173, "learning_rate": 0.0002, "epoch": 0.6681730148482892, "step": 2070}, {"loss": 0.8181, "grad_norm": 0.38811734318733215, "learning_rate": 0.0002, "epoch": 0.6714009038089089, "step": 2080}, {"loss": 0.762, "grad_norm": 0.5709853172302246, "learning_rate": 0.0002, "epoch": 0.6746287927695287, "step": 2090}, {"loss": 0.8334, "grad_norm": 0.49994757771492004, "learning_rate": 0.0002, "epoch": 0.6778566817301485, "step": 2100}, {"loss": 0.8, "grad_norm": 0.5505402684211731, "learning_rate": 0.0002, "epoch": 0.6810845706907682, "step": 2110}, {"loss": 0.8227, "grad_norm": 0.48195120692253113, "learning_rate": 0.0002, "epoch": 0.684312459651388, "step": 2120}, {"loss": 0.7879, "grad_norm": 0.4854775071144104, "learning_rate": 0.0002, "epoch": 0.6875403486120077, "step": 2130}, {"loss": 0.8231, "grad_norm": 0.6422494649887085, "learning_rate": 0.0002, "epoch": 0.6907682375726275, "step": 2140}, {"loss": 0.8353, "grad_norm": 0.3972536027431488, "learning_rate": 0.0002, "epoch": 0.6939961265332473, "step": 2150}, {"loss": 0.8068, "grad_norm": 0.4297836422920227, "learning_rate": 0.0002, "epoch": 0.697224015493867, "step": 2160}, {"loss": 0.8017, "grad_norm": 0.45486778020858765, "learning_rate": 0.0002, "epoch": 0.7004519044544868, "step": 2170}, {"loss": 0.8507, "grad_norm": 0.4706047773361206, "learning_rate": 0.0002, "epoch": 0.7036797934151066, "step": 2180}, {"loss": 0.8234, "grad_norm": 0.46426892280578613, "learning_rate": 0.0002, "epoch": 0.7069076823757263, "step": 2190}, {"loss": 0.8472, "grad_norm": 0.46333715319633484, "learning_rate": 0.0002, "epoch": 0.7101355713363461, "step": 2200}, {"loss": 0.8247, "grad_norm": 0.4632524251937866, "learning_rate": 0.0002, "epoch": 0.7133634602969657, "step": 2210}, {"loss": 0.8452, "grad_norm": 0.4610830843448639, "learning_rate": 0.0002, "epoch": 0.7165913492575855, "step": 2220}, {"loss": 0.7338, "grad_norm": 0.4905324876308441, "learning_rate": 0.0002, "epoch": 0.7198192382182053, "step": 2230}, {"loss": 0.7715, "grad_norm": 0.4936263859272003, "learning_rate": 0.0002, "epoch": 0.723047127178825, "step": 2240}, {"loss": 0.8162, "grad_norm": 0.40778425335884094, "learning_rate": 0.0002, "epoch": 0.7262750161394448, "step": 2250}, {"loss": 0.828, "grad_norm": 0.50351482629776, "learning_rate": 0.0002, "epoch": 0.7295029051000645, "step": 2260}, {"loss": 0.8475, "grad_norm": 0.4894128143787384, "learning_rate": 0.0002, "epoch": 0.7327307940606843, "step": 2270}, {"loss": 0.8087, "grad_norm": 0.5580906271934509, "learning_rate": 0.0002, "epoch": 0.7359586830213041, "step": 2280}, {"loss": 0.8157, "grad_norm": 0.4655369520187378, "learning_rate": 0.0002, "epoch": 0.7391865719819238, "step": 2290}, {"loss": 0.8395, "grad_norm": 0.4666965901851654, "learning_rate": 0.0002, "epoch": 0.7424144609425436, "step": 2300}, {"loss": 0.7605, "grad_norm": 0.46259936690330505, "learning_rate": 0.0002, "epoch": 0.7456423499031634, "step": 2310}, {"loss": 0.7849, "grad_norm": 0.520706832408905, "learning_rate": 0.0002, "epoch": 0.7488702388637831, "step": 2320}, {"loss": 0.8173, "grad_norm": 0.5142408013343811, "learning_rate": 0.0002, "epoch": 0.7520981278244029, "step": 2330}, {"loss": 0.7782, "grad_norm": 0.5355164408683777, "learning_rate": 0.0002, "epoch": 0.7553260167850226, "step": 2340}, {"loss": 0.8242, "grad_norm": 0.5517185926437378, "learning_rate": 0.0002, "epoch": 0.7585539057456423, "step": 2350}, {"loss": 0.8404, "grad_norm": 0.7162677049636841, "learning_rate": 0.0002, "epoch": 0.7617817947062621, "step": 2360}, {"loss": 0.8455, "grad_norm": 0.42402133345603943, "learning_rate": 0.0002, "epoch": 0.7650096836668818, "step": 2370}, {"loss": 0.8214, "grad_norm": 0.47180113196372986, "learning_rate": 0.0002, "epoch": 0.7682375726275016, "step": 2380}, {"loss": 0.8274, "grad_norm": 0.6262288689613342, "learning_rate": 0.0002, "epoch": 0.7714654615881213, "step": 2390}, {"loss": 0.7915, "grad_norm": 0.5177528262138367, "learning_rate": 0.0002, "epoch": 0.7746933505487411, "step": 2400}, {"loss": 0.7631, "grad_norm": 0.555721640586853, "learning_rate": 0.0002, "epoch": 0.7779212395093609, "step": 2410}, {"loss": 0.795, "grad_norm": 0.5592644810676575, "learning_rate": 0.0002, "epoch": 0.7811491284699806, "step": 2420}, {"loss": 0.8081, "grad_norm": 0.38025397062301636, "learning_rate": 0.0002, "epoch": 0.7843770174306004, "step": 2430}, {"loss": 0.7851, "grad_norm": 0.4597472548484802, "learning_rate": 0.0002, "epoch": 0.7876049063912202, "step": 2440}, {"loss": 0.8575, "grad_norm": 0.4929825961589813, "learning_rate": 0.0002, "epoch": 0.7908327953518399, "step": 2450}, {"loss": 0.7584, "grad_norm": 0.45277655124664307, "learning_rate": 0.0002, "epoch": 0.7940606843124597, "step": 2460}, {"loss": 0.8208, "grad_norm": 0.6224122643470764, "learning_rate": 0.0002, "epoch": 0.7972885732730794, "step": 2470}, {"loss": 0.8449, "grad_norm": 0.5740901827812195, "learning_rate": 0.0002, "epoch": 0.8005164622336992, "step": 2480}, {"loss": 0.7834, "grad_norm": 0.41335329413414, "learning_rate": 0.0002, "epoch": 0.8037443511943189, "step": 2490}, {"loss": 0.7768, "grad_norm": 0.4738694131374359, "learning_rate": 0.0002, "epoch": 0.8069722401549386, "step": 2500}, {"loss": 0.7927, "grad_norm": 0.5288197994232178, "learning_rate": 0.0002, "epoch": 0.8102001291155584, "step": 2510}, {"loss": 0.8334, "grad_norm": 0.5404666066169739, "learning_rate": 0.0002, "epoch": 0.8134280180761781, "step": 2520}, {"loss": 0.7998, "grad_norm": 0.4444909691810608, "learning_rate": 0.0002, "epoch": 0.8166559070367979, "step": 2530}, {"loss": 0.8683, "grad_norm": 0.542061448097229, "learning_rate": 0.0002, "epoch": 0.8198837959974177, "step": 2540}, {"loss": 0.8038, "grad_norm": 0.4914741814136505, "learning_rate": 0.0002, "epoch": 0.8231116849580374, "step": 2550}, {"loss": 0.7899, "grad_norm": 0.41703441739082336, "learning_rate": 0.0002, "epoch": 0.8263395739186572, "step": 2560}, {"loss": 0.824, "grad_norm": 0.5489841103553772, "learning_rate": 0.0002, "epoch": 0.829567462879277, "step": 2570}, {"loss": 0.8157, "grad_norm": 0.5359883308410645, "learning_rate": 0.0002, "epoch": 0.8327953518398967, "step": 2580}, {"loss": 0.8122, "grad_norm": 0.5541019439697266, "learning_rate": 0.0002, "epoch": 0.8360232408005165, "step": 2590}, {"loss": 0.797, "grad_norm": 0.4746638834476471, "learning_rate": 0.0002, "epoch": 0.8392511297611362, "step": 2600}, {"loss": 0.8116, "grad_norm": 0.5243194103240967, "learning_rate": 0.0002, "epoch": 0.842479018721756, "step": 2610}, {"loss": 0.8173, "grad_norm": 0.46824976801872253, "learning_rate": 0.0002, "epoch": 0.8457069076823758, "step": 2620}, {"loss": 0.7525, "grad_norm": 0.49487847089767456, "learning_rate": 0.0002, "epoch": 0.8489347966429954, "step": 2630}, {"loss": 0.8296, "grad_norm": 0.42180097103118896, "learning_rate": 0.0002, "epoch": 0.8521626856036152, "step": 2640}, {"loss": 0.8304, "grad_norm": 0.5516560077667236, "learning_rate": 0.0002, "epoch": 0.855390574564235, "step": 2650}, {"loss": 0.7882, "grad_norm": 0.4392191767692566, "learning_rate": 0.0002, "epoch": 0.8586184635248547, "step": 2660}, {"loss": 0.848, "grad_norm": 0.5387210845947266, "learning_rate": 0.0002, "epoch": 0.8618463524854745, "step": 2670}, {"loss": 0.8094, "grad_norm": 0.6232406497001648, "learning_rate": 0.0002, "epoch": 0.8650742414460942, "step": 2680}, {"loss": 0.768, "grad_norm": 0.53749018907547, "learning_rate": 0.0002, "epoch": 0.868302130406714, "step": 2690}, {"loss": 0.8299, "grad_norm": 0.47480374574661255, "learning_rate": 0.0002, "epoch": 0.8715300193673338, "step": 2700}, {"loss": 0.8055, "grad_norm": 0.44618046283721924, "learning_rate": 0.0002, "epoch": 0.8747579083279535, "step": 2710}, {"loss": 0.8015, "grad_norm": 0.4173581302165985, "learning_rate": 0.0002, "epoch": 0.8779857972885733, "step": 2720}, {"loss": 0.7713, "grad_norm": 0.524081289768219, "learning_rate": 0.0002, "epoch": 0.881213686249193, "step": 2730}, {"loss": 0.8738, "grad_norm": 0.5608431100845337, "learning_rate": 0.0002, "epoch": 0.8844415752098128, "step": 2740}, {"loss": 0.8513, "grad_norm": 0.5212284922599792, "learning_rate": 0.0002, "epoch": 0.8876694641704326, "step": 2750}, {"loss": 0.8139, "grad_norm": 0.5601475834846497, "learning_rate": 0.0002, "epoch": 0.8908973531310523, "step": 2760}, {"loss": 0.7947, "grad_norm": 0.4499223828315735, "learning_rate": 0.0002, "epoch": 0.8941252420916721, "step": 2770}, {"loss": 0.8559, "grad_norm": 0.46945226192474365, "learning_rate": 0.0002, "epoch": 0.8973531310522918, "step": 2780}, {"loss": 0.801, "grad_norm": 0.4837495684623718, "learning_rate": 0.0002, "epoch": 0.9005810200129115, "step": 2790}, {"loss": 0.7887, "grad_norm": 0.5059258937835693, "learning_rate": 0.0002, "epoch": 0.9038089089735313, "step": 2800}, {"loss": 0.8571, "grad_norm": 0.4857945144176483, "learning_rate": 0.0002, "epoch": 0.907036797934151, "step": 2810}, {"loss": 0.8301, "grad_norm": 0.5001962780952454, "learning_rate": 0.0002, "epoch": 0.9102646868947708, "step": 2820}, {"loss": 0.8236, "grad_norm": 0.5468648672103882, "learning_rate": 0.0002, "epoch": 0.9134925758553906, "step": 2830}, {"loss": 0.8071, "grad_norm": 0.5533056259155273, "learning_rate": 0.0002, "epoch": 0.9167204648160103, "step": 2840}, {"loss": 0.7895, "grad_norm": 0.5909785628318787, "learning_rate": 0.0002, "epoch": 0.9199483537766301, "step": 2850}, {"loss": 0.796, "grad_norm": 0.47428104281425476, "learning_rate": 0.0002, "epoch": 0.9231762427372499, "step": 2860}, {"loss": 0.7845, "grad_norm": 0.548814058303833, "learning_rate": 0.0002, "epoch": 0.9264041316978696, "step": 2870}, {"loss": 0.7871, "grad_norm": 0.5576745271682739, "learning_rate": 0.0002, "epoch": 0.9296320206584894, "step": 2880}, {"loss": 0.8399, "grad_norm": 0.47094792127609253, "learning_rate": 0.0002, "epoch": 0.9328599096191091, "step": 2890}, {"loss": 0.805, "grad_norm": 0.5408539772033691, "learning_rate": 0.0002, "epoch": 0.9360877985797289, "step": 2900}, {"loss": 0.785, "grad_norm": 0.5922889113426208, "learning_rate": 0.0002, "epoch": 0.9393156875403487, "step": 2910}, {"loss": 0.8043, "grad_norm": 0.45462584495544434, "learning_rate": 0.0002, "epoch": 0.9425435765009683, "step": 2920}, {"loss": 0.8344, "grad_norm": 0.6864947080612183, "learning_rate": 0.0002, "epoch": 0.9457714654615881, "step": 2930}, {"loss": 0.8166, "grad_norm": 0.4706299304962158, "learning_rate": 0.0002, "epoch": 0.9489993544222078, "step": 2940}, {"loss": 0.8422, "grad_norm": 0.5583269596099854, "learning_rate": 0.0002, "epoch": 0.9522272433828276, "step": 2950}, {"loss": 0.836, "grad_norm": 0.51015704870224, "learning_rate": 0.0002, "epoch": 0.9554551323434474, "step": 2960}, {"loss": 0.8371, "grad_norm": 0.5325582027435303, "learning_rate": 0.0002, "epoch": 0.9586830213040671, "step": 2970}, {"loss": 0.7593, "grad_norm": 0.49008598923683167, "learning_rate": 0.0002, "epoch": 0.9619109102646869, "step": 2980}, {"loss": 0.8093, "grad_norm": 0.4422132074832916, "learning_rate": 0.0002, "epoch": 0.9651387992253067, "step": 2990}, {"loss": 0.7966, "grad_norm": 0.5053589344024658, "learning_rate": 0.0002, "epoch": 0.9683666881859264, "step": 3000}, {"loss": 0.8081, "grad_norm": 0.46754521131515503, "learning_rate": 0.0002, "epoch": 0.9715945771465462, "step": 3010}, {"loss": 0.8377, "grad_norm": 0.5613434910774231, "learning_rate": 0.0002, "epoch": 0.9748224661071659, "step": 3020}, {"loss": 0.7856, "grad_norm": 0.5052843689918518, "learning_rate": 0.0002, "epoch": 0.9780503550677857, "step": 3030}, {"loss": 0.8412, "grad_norm": 0.4270972013473511, "learning_rate": 0.0002, "epoch": 0.9812782440284055, "step": 3040}, {"loss": 0.8353, "grad_norm": 0.4974991977214813, "learning_rate": 0.0002, "epoch": 0.9845061329890252, "step": 3050}, {"loss": 0.8415, "grad_norm": 0.4432311952114105, "learning_rate": 0.0002, "epoch": 0.9877340219496449, "step": 3060}, {"loss": 0.7764, "grad_norm": 0.466457724571228, "learning_rate": 0.0002, "epoch": 0.9909619109102646, "step": 3070}, {"loss": 0.8067, "grad_norm": 0.6438009142875671, "learning_rate": 0.0002, "epoch": 0.9941897998708844, "step": 3080}, {"loss": 0.8425, "grad_norm": 0.5593604445457458, "learning_rate": 0.0002, "epoch": 0.9974176888315042, "step": 3090}, {"eval_loss": 1.0958120822906494, "eval_runtime": 148.3273, "eval_samples_per_second": 4.942, "eval_steps_per_second": 0.62, "epoch": 1.0, "step": 3098}, {"loss": 0.8275, "grad_norm": 0.5701445937156677, "learning_rate": 0.0002, "epoch": 1.000645577792124, "step": 3100}, {"loss": 0.7756, "grad_norm": 0.6089657545089722, "learning_rate": 0.0002, "epoch": 1.0038734667527438, "step": 3110}, {"loss": 0.7492, "grad_norm": 0.5619552135467529, "learning_rate": 0.0002, "epoch": 1.0071013557133635, "step": 3120}, {"loss": 0.7544, "grad_norm": 0.5550283789634705, "learning_rate": 0.0002, "epoch": 1.010329244673983, "step": 3130}, {"loss": 0.8006, "grad_norm": 0.6221792101860046, "learning_rate": 0.0002, "epoch": 1.013557133634603, "step": 3140}, {"loss": 0.7603, "grad_norm": 0.5450758934020996, "learning_rate": 0.0002, "epoch": 1.0167850225952226, "step": 3150}, {"loss": 0.7021, "grad_norm": 0.4359588027000427, "learning_rate": 0.0002, "epoch": 1.0200129115558425, "step": 3160}, {"loss": 0.7468, "grad_norm": 0.5932239890098572, "learning_rate": 0.0002, "epoch": 1.0232408005164622, "step": 3170}, {"loss": 0.7649, "grad_norm": 0.45478707551956177, "learning_rate": 0.0002, "epoch": 1.026468689477082, "step": 3180}, {"loss": 0.7355, "grad_norm": 0.677615761756897, "learning_rate": 0.0002, "epoch": 1.0296965784377017, "step": 3190}, {"loss": 0.6928, "grad_norm": 0.6231790781021118, "learning_rate": 0.0002, "epoch": 1.0329244673983216, "step": 3200}, {"loss": 0.7471, "grad_norm": 0.5074195861816406, "learning_rate": 0.0002, "epoch": 1.0361523563589412, "step": 3210}, {"loss": 0.6864, "grad_norm": 0.4844142198562622, "learning_rate": 0.0002, "epoch": 1.039380245319561, "step": 3220}, {"loss": 0.7655, "grad_norm": 0.5372750759124756, "learning_rate": 0.0002, "epoch": 1.0426081342801807, "step": 3230}, {"loss": 0.7384, "grad_norm": 0.46296265721321106, "learning_rate": 0.0002, "epoch": 1.0458360232408006, "step": 3240}, {"loss": 0.7894, "grad_norm": 0.5417148470878601, "learning_rate": 0.0002, "epoch": 1.0490639122014203, "step": 3250}, {"loss": 0.7637, "grad_norm": 0.5695074200630188, "learning_rate": 0.0002, "epoch": 1.0522918011620401, "step": 3260}, {"loss": 0.7456, "grad_norm": 0.5050092935562134, "learning_rate": 0.0002, "epoch": 1.0555196901226598, "step": 3270}, {"loss": 0.6805, "grad_norm": 0.5320752263069153, "learning_rate": 0.0002, "epoch": 1.0587475790832794, "step": 3280}, {"loss": 0.7419, "grad_norm": 0.5832052230834961, "learning_rate": 0.0002, "epoch": 1.0619754680438993, "step": 3290}, {"loss": 0.7656, "grad_norm": 0.5228804349899292, "learning_rate": 0.0002, "epoch": 1.065203357004519, "step": 3300}, {"loss": 0.6834, "grad_norm": 0.5819445252418518, "learning_rate": 0.0002, "epoch": 1.0684312459651388, "step": 3310}, {"loss": 0.7093, "grad_norm": 0.4201328754425049, "learning_rate": 0.0002, "epoch": 1.0716591349257585, "step": 3320}, {"loss": 0.7494, "grad_norm": 0.5424145460128784, "learning_rate": 0.0002, "epoch": 1.0748870238863784, "step": 3330}, {"loss": 0.7828, "grad_norm": 0.6169946789741516, "learning_rate": 0.0002, "epoch": 1.078114912846998, "step": 3340}, {"loss": 0.7505, "grad_norm": 0.607676088809967, "learning_rate": 0.0002, "epoch": 1.0813428018076179, "step": 3350}, {"loss": 0.7315, "grad_norm": 0.5191982388496399, "learning_rate": 0.0002, "epoch": 1.0845706907682375, "step": 3360}, {"loss": 0.7699, "grad_norm": 0.5728003978729248, "learning_rate": 0.0002, "epoch": 1.0877985797288574, "step": 3370}, {"loss": 0.7381, "grad_norm": 0.5402643084526062, "learning_rate": 0.0002, "epoch": 1.091026468689477, "step": 3380}, {"loss": 0.7208, "grad_norm": 0.5377541780471802, "learning_rate": 0.0002, "epoch": 1.094254357650097, "step": 3390}, {"loss": 0.7672, "grad_norm": 0.4751385748386383, "learning_rate": 0.0002, "epoch": 1.0974822466107166, "step": 3400}, {"loss": 0.7326, "grad_norm": 0.559158444404602, "learning_rate": 0.0002, "epoch": 1.1007101355713362, "step": 3410}, {"loss": 0.7366, "grad_norm": 0.4917701482772827, "learning_rate": 0.0002, "epoch": 1.103938024531956, "step": 3420}, {"loss": 0.7593, "grad_norm": 0.5507875084877014, "learning_rate": 0.0002, "epoch": 1.1071659134925758, "step": 3430}, {"loss": 0.7424, "grad_norm": 0.45458680391311646, "learning_rate": 0.0002, "epoch": 1.1103938024531956, "step": 3440}, {"loss": 0.7234, "grad_norm": 0.5721744894981384, "learning_rate": 0.0002, "epoch": 1.1136216914138153, "step": 3450}, {"loss": 0.7219, "grad_norm": 0.5776081681251526, "learning_rate": 0.0002, "epoch": 1.1168495803744352, "step": 3460}, {"loss": 0.7644, "grad_norm": 0.5261953473091125, "learning_rate": 0.0002, "epoch": 1.1200774693350548, "step": 3470}, {"loss": 0.6586, "grad_norm": 0.47759532928466797, "learning_rate": 0.0002, "epoch": 1.1233053582956747, "step": 3480}, {"loss": 0.7641, "grad_norm": 0.5697659850120544, "learning_rate": 0.0002, "epoch": 1.1265332472562943, "step": 3490}, {"loss": 0.7017, "grad_norm": 0.5643419623374939, "learning_rate": 0.0002, "epoch": 1.1297611362169142, "step": 3500}, {"loss": 0.7235, "grad_norm": 0.6502931118011475, "learning_rate": 0.0002, "epoch": 1.1329890251775339, "step": 3510}, {"loss": 0.7662, "grad_norm": 0.5236507654190063, "learning_rate": 0.0002, "epoch": 1.1362169141381537, "step": 3520}, {"loss": 0.7571, "grad_norm": 0.6521499156951904, "learning_rate": 0.0002, "epoch": 1.1394448030987734, "step": 3530}, {"loss": 0.7304, "grad_norm": 0.5893217325210571, "learning_rate": 0.0002, "epoch": 1.142672692059393, "step": 3540}, {"loss": 0.7508, "grad_norm": 0.5300073027610779, "learning_rate": 0.0002, "epoch": 1.145900581020013, "step": 3550}, {"loss": 0.6937, "grad_norm": 0.6794660091400146, "learning_rate": 0.0002, "epoch": 1.1491284699806328, "step": 3560}, {"loss": 0.7614, "grad_norm": 0.5420064926147461, "learning_rate": 0.0002, "epoch": 1.1523563589412524, "step": 3570}, {"loss": 0.7648, "grad_norm": 0.5096590518951416, "learning_rate": 0.0002, "epoch": 1.155584247901872, "step": 3580}, {"loss": 0.7436, "grad_norm": 0.5726043581962585, "learning_rate": 0.0002, "epoch": 1.158812136862492, "step": 3590}, {"loss": 0.7728, "grad_norm": 0.7388110160827637, "learning_rate": 0.0002, "epoch": 1.1620400258231116, "step": 3600}, {"loss": 0.7421, "grad_norm": 0.5597969889640808, "learning_rate": 0.0002, "epoch": 1.1652679147837315, "step": 3610}, {"loss": 0.7132, "grad_norm": 0.5067800283432007, "learning_rate": 0.0002, "epoch": 1.1684958037443511, "step": 3620}, {"loss": 0.7893, "grad_norm": 0.6625118255615234, "learning_rate": 0.0002, "epoch": 1.171723692704971, "step": 3630}, {"loss": 0.7611, "grad_norm": 0.5830849409103394, "learning_rate": 0.0002, "epoch": 1.1749515816655907, "step": 3640}, {"loss": 0.7973, "grad_norm": 0.6140692830085754, "learning_rate": 0.0002, "epoch": 1.1781794706262105, "step": 3650}, {"loss": 0.7617, "grad_norm": 0.714523434638977, "learning_rate": 0.0002, "epoch": 1.1814073595868302, "step": 3660}, {"loss": 0.7092, "grad_norm": 0.5196696519851685, "learning_rate": 0.0002, "epoch": 1.18463524854745, "step": 3670}, {"loss": 0.7821, "grad_norm": 0.6677889823913574, "learning_rate": 0.0002, "epoch": 1.1878631375080697, "step": 3680}, {"loss": 0.7813, "grad_norm": 0.47095245122909546, "learning_rate": 0.0002, "epoch": 1.1910910264686896, "step": 3690}, {"loss": 0.7702, "grad_norm": 0.5197778940200806, "learning_rate": 0.0002, "epoch": 1.1943189154293092, "step": 3700}, {"loss": 0.7349, "grad_norm": 0.5156530141830444, "learning_rate": 0.0002, "epoch": 1.1975468043899289, "step": 3710}, {"loss": 0.7738, "grad_norm": 0.6968549489974976, "learning_rate": 0.0002, "epoch": 1.2007746933505488, "step": 3720}, {"loss": 0.7599, "grad_norm": 0.48983848094940186, "learning_rate": 0.0002, "epoch": 1.2040025823111684, "step": 3730}, {"loss": 0.7163, "grad_norm": 0.6709973216056824, "learning_rate": 0.0002, "epoch": 1.2072304712717883, "step": 3740}, {"loss": 0.7632, "grad_norm": 0.48681750893592834, "learning_rate": 0.0002, "epoch": 1.210458360232408, "step": 3750}, {"loss": 0.7039, "grad_norm": 0.49475061893463135, "learning_rate": 0.0002, "epoch": 1.2136862491930278, "step": 3760}, {"loss": 0.7372, "grad_norm": 0.6163983345031738, "learning_rate": 0.0002, "epoch": 1.2169141381536475, "step": 3770}, {"loss": 0.757, "grad_norm": 0.5481411218643188, "learning_rate": 0.0002, "epoch": 1.2201420271142673, "step": 3780}, {"loss": 0.7601, "grad_norm": 0.620639979839325, "learning_rate": 0.0002, "epoch": 1.223369916074887, "step": 3790}, {"loss": 0.7738, "grad_norm": 0.7017222046852112, "learning_rate": 0.0002, "epoch": 1.2265978050355069, "step": 3800}, {"loss": 0.7468, "grad_norm": 0.5872400403022766, "learning_rate": 0.0002, "epoch": 1.2298256939961265, "step": 3810}, {"loss": 0.7854, "grad_norm": 0.45765596628189087, "learning_rate": 0.0002, "epoch": 1.2330535829567464, "step": 3820}, {"loss": 0.7865, "grad_norm": 0.5676377415657043, "learning_rate": 0.0002, "epoch": 1.236281471917366, "step": 3830}, {"loss": 0.7696, "grad_norm": 0.4793425500392914, "learning_rate": 0.0002, "epoch": 1.2395093608779857, "step": 3840}, {"loss": 0.7065, "grad_norm": 0.5060022473335266, "learning_rate": 0.0002, "epoch": 1.2427372498386056, "step": 3850}, {"loss": 0.7333, "grad_norm": 0.6140682697296143, "learning_rate": 0.0002, "epoch": 1.2459651387992252, "step": 3860}, {"loss": 0.7496, "grad_norm": 0.5030326843261719, "learning_rate": 0.0002, "epoch": 1.249193027759845, "step": 3870}, {"loss": 0.7226, "grad_norm": 0.6609430909156799, "learning_rate": 0.0002, "epoch": 1.2524209167204647, "step": 3880}, {"loss": 0.7212, "grad_norm": 0.5459545850753784, "learning_rate": 0.0002, "epoch": 1.2556488056810846, "step": 3890}, {"loss": 0.7145, "grad_norm": 0.5328870415687561, "learning_rate": 0.0002, "epoch": 1.2588766946417043, "step": 3900}, {"loss": 0.7572, "grad_norm": 0.5840652585029602, "learning_rate": 0.0002, "epoch": 1.2621045836023241, "step": 3910}, {"loss": 0.7624, "grad_norm": 0.5587584376335144, "learning_rate": 0.0002, "epoch": 1.2653324725629438, "step": 3920}, {"loss": 0.7846, "grad_norm": 0.5886949896812439, "learning_rate": 0.0002, "epoch": 1.2685603615235637, "step": 3930}, {"loss": 0.7251, "grad_norm": 0.5128693580627441, "learning_rate": 0.0002, "epoch": 1.2717882504841833, "step": 3940}, {"loss": 0.7032, "grad_norm": 0.6207669377326965, "learning_rate": 0.0002, "epoch": 1.2750161394448032, "step": 3950}, {"loss": 0.7506, "grad_norm": 0.5789574384689331, "learning_rate": 0.0002, "epoch": 1.2782440284054228, "step": 3960}, {"loss": 0.7574, "grad_norm": 0.503162145614624, "learning_rate": 0.0002, "epoch": 1.2814719173660425, "step": 3970}, {"loss": 0.7489, "grad_norm": 0.6670064926147461, "learning_rate": 0.0002, "epoch": 1.2846998063266624, "step": 3980}, {"loss": 0.7198, "grad_norm": 0.5676213502883911, "learning_rate": 0.0002, "epoch": 1.2879276952872822, "step": 3990}, {"loss": 0.7892, "grad_norm": 0.5383169054985046, "learning_rate": 0.0002, "epoch": 1.2911555842479019, "step": 4000}, {"loss": 0.7432, "grad_norm": 0.714743971824646, "learning_rate": 0.0002, "epoch": 1.2943834732085215, "step": 4010}, {"loss": 0.7594, "grad_norm": 0.5740262269973755, "learning_rate": 0.0002, "epoch": 1.2976113621691414, "step": 4020}, {"loss": 0.7564, "grad_norm": 0.6143045425415039, "learning_rate": 0.0002, "epoch": 1.300839251129761, "step": 4030}, {"loss": 0.7181, "grad_norm": 0.501025378704071, "learning_rate": 0.0002, "epoch": 1.304067140090381, "step": 4040}, {"loss": 0.7099, "grad_norm": 0.5784100294113159, "learning_rate": 0.0002, "epoch": 1.3072950290510006, "step": 4050}, {"loss": 0.7403, "grad_norm": 0.6182606220245361, "learning_rate": 0.0002, "epoch": 1.3105229180116205, "step": 4060}, {"loss": 0.7249, "grad_norm": 0.5072231292724609, "learning_rate": 0.0002, "epoch": 1.3137508069722401, "step": 4070}, {"loss": 0.7451, "grad_norm": 0.6841012835502625, "learning_rate": 0.0002, "epoch": 1.31697869593286, "step": 4080}, {"loss": 0.7395, "grad_norm": 0.697257936000824, "learning_rate": 0.0002, "epoch": 1.3202065848934796, "step": 4090}, {"loss": 0.7401, "grad_norm": 0.5113214254379272, "learning_rate": 0.0002, "epoch": 1.3234344738540993, "step": 4100}, {"loss": 0.7336, "grad_norm": 0.6270561814308167, "learning_rate": 0.0002, "epoch": 1.3266623628147192, "step": 4110}, {"loss": 0.7535, "grad_norm": 0.5525947213172913, "learning_rate": 0.0002, "epoch": 1.329890251775339, "step": 4120}, {"loss": 0.6999, "grad_norm": 0.546071469783783, "learning_rate": 0.0002, "epoch": 1.3331181407359587, "step": 4130}, {"loss": 0.7884, "grad_norm": 0.6516721248626709, "learning_rate": 0.0002, "epoch": 1.3363460296965783, "step": 4140}, {"loss": 0.755, "grad_norm": 0.6235111355781555, "learning_rate": 0.0002, "epoch": 1.3395739186571982, "step": 4150}, {"loss": 0.7467, "grad_norm": 0.538649320602417, "learning_rate": 0.0002, "epoch": 1.3428018076178179, "step": 4160}, {"loss": 0.7368, "grad_norm": 0.5367001891136169, "learning_rate": 0.0002, "epoch": 1.3460296965784377, "step": 4170}, {"loss": 0.7536, "grad_norm": 0.6134631037712097, "learning_rate": 0.0002, "epoch": 1.3492575855390574, "step": 4180}, {"loss": 0.8245, "grad_norm": 0.5827262997627258, "learning_rate": 0.0002, "epoch": 1.3524854744996773, "step": 4190}, {"loss": 0.7288, "grad_norm": 0.5706096291542053, "learning_rate": 0.0002, "epoch": 1.355713363460297, "step": 4200}, {"loss": 0.7302, "grad_norm": 0.6422057151794434, "learning_rate": 0.0002, "epoch": 1.3589412524209168, "step": 4210}, {"loss": 0.7303, "grad_norm": 0.6316141486167908, "learning_rate": 0.0002, "epoch": 1.3621691413815364, "step": 4220}, {"loss": 0.7457, "grad_norm": 0.6946983933448792, "learning_rate": 0.0002, "epoch": 1.365397030342156, "step": 4230}, {"loss": 0.7388, "grad_norm": 0.5381525754928589, "learning_rate": 0.0002, "epoch": 1.368624919302776, "step": 4240}, {"loss": 0.73, "grad_norm": 0.5484845638275146, "learning_rate": 0.0002, "epoch": 1.3718528082633958, "step": 4250}, {"loss": 0.7584, "grad_norm": 0.5961896777153015, "learning_rate": 0.0002, "epoch": 1.3750806972240155, "step": 4260}, {"loss": 0.8006, "grad_norm": 0.6041752696037292, "learning_rate": 0.0002, "epoch": 1.3783085861846351, "step": 4270}, {"loss": 0.7276, "grad_norm": 0.6283464431762695, "learning_rate": 0.0002, "epoch": 1.381536475145255, "step": 4280}, {"loss": 0.757, "grad_norm": 0.6761324405670166, "learning_rate": 0.0002, "epoch": 1.384764364105875, "step": 4290}, {"loss": 0.7381, "grad_norm": 0.504311203956604, "learning_rate": 0.0002, "epoch": 1.3879922530664945, "step": 4300}, {"loss": 0.7536, "grad_norm": 0.6100395917892456, "learning_rate": 0.0002, "epoch": 1.3912201420271142, "step": 4310}, {"loss": 0.7103, "grad_norm": 0.6245788335800171, "learning_rate": 0.0002, "epoch": 1.394448030987734, "step": 4320}, {"loss": 0.7505, "grad_norm": 0.6074621081352234, "learning_rate": 0.0002, "epoch": 1.3976759199483537, "step": 4330}, {"loss": 0.752, "grad_norm": 0.6683838963508606, "learning_rate": 0.0002, "epoch": 1.4009038089089736, "step": 4340}, {"loss": 0.7537, "grad_norm": 0.622998058795929, "learning_rate": 0.0002, "epoch": 1.4041316978695932, "step": 4350}, {"loss": 0.8148, "grad_norm": 0.6089423894882202, "learning_rate": 0.0002, "epoch": 1.4073595868302131, "step": 4360}, {"loss": 0.7715, "grad_norm": 0.6381658911705017, "learning_rate": 0.0002, "epoch": 1.4105874757908328, "step": 4370}, {"loss": 0.7871, "grad_norm": 0.5419308543205261, "learning_rate": 0.0002, "epoch": 1.4138153647514526, "step": 4380}, {"loss": 0.7386, "grad_norm": 0.6026232242584229, "learning_rate": 0.0002, "epoch": 1.4170432537120723, "step": 4390}, {"loss": 0.7529, "grad_norm": 0.4911101162433624, "learning_rate": 0.0002, "epoch": 1.420271142672692, "step": 4400}, {"loss": 0.7495, "grad_norm": 0.6302908062934875, "learning_rate": 0.0002, "epoch": 1.4234990316333118, "step": 4410}, {"loss": 0.7446, "grad_norm": 0.6692768931388855, "learning_rate": 0.0002, "epoch": 1.4267269205939317, "step": 4420}, {"loss": 0.7312, "grad_norm": 0.46294572949409485, "learning_rate": 0.0002, "epoch": 1.4299548095545513, "step": 4430}, {"loss": 0.7255, "grad_norm": 0.5452619194984436, "learning_rate": 0.0002, "epoch": 1.433182698515171, "step": 4440}, {"loss": 0.7974, "grad_norm": 0.7809233069419861, "learning_rate": 0.0002, "epoch": 1.4364105874757909, "step": 4450}, {"loss": 0.7103, "grad_norm": 0.550088107585907, "learning_rate": 0.0002, "epoch": 1.4396384764364105, "step": 4460}, {"loss": 0.7088, "grad_norm": 0.7139151096343994, "learning_rate": 0.0002, "epoch": 1.4428663653970304, "step": 4470}, {"loss": 0.7358, "grad_norm": 0.6187090873718262, "learning_rate": 0.0002, "epoch": 1.44609425435765, "step": 4480}, {"loss": 0.7608, "grad_norm": 0.5948249101638794, "learning_rate": 0.0002, "epoch": 1.44932214331827, "step": 4490}, {"loss": 0.7582, "grad_norm": 0.6510892510414124, "learning_rate": 0.0002, "epoch": 1.4525500322788896, "step": 4500}, {"loss": 0.7105, "grad_norm": 0.6552293300628662, "learning_rate": 0.0002, "epoch": 1.4557779212395094, "step": 4510}, {"loss": 0.7965, "grad_norm": 0.585574209690094, "learning_rate": 0.0002, "epoch": 1.459005810200129, "step": 4520}, {"loss": 0.761, "grad_norm": 0.4830162823200226, "learning_rate": 0.0002, "epoch": 1.4622336991607487, "step": 4530}, {"loss": 0.7424, "grad_norm": 0.5780223608016968, "learning_rate": 0.0002, "epoch": 1.4654615881213686, "step": 4540}, {"loss": 0.7518, "grad_norm": 0.5462607145309448, "learning_rate": 0.0002, "epoch": 1.4686894770819885, "step": 4550}, {"loss": 0.7342, "grad_norm": 0.5183546543121338, "learning_rate": 0.0002, "epoch": 1.4719173660426081, "step": 4560}, {"loss": 0.71, "grad_norm": 0.676917552947998, "learning_rate": 0.0002, "epoch": 1.4751452550032278, "step": 4570}, {"loss": 0.7875, "grad_norm": 0.5772345066070557, "learning_rate": 0.0002, "epoch": 1.4783731439638477, "step": 4580}, {"loss": 0.7709, "grad_norm": 0.7320035696029663, "learning_rate": 0.0002, "epoch": 1.4816010329244673, "step": 4590}, {"loss": 0.7601, "grad_norm": 0.5024042129516602, "learning_rate": 0.0002, "epoch": 1.4848289218850872, "step": 4600}, {"loss": 0.8061, "grad_norm": 0.5482868552207947, "learning_rate": 0.0002, "epoch": 1.4880568108457068, "step": 4610}, {"loss": 0.714, "grad_norm": 0.5447399616241455, "learning_rate": 0.0002, "epoch": 1.4912846998063267, "step": 4620}, {"loss": 0.7959, "grad_norm": 0.5953414440155029, "learning_rate": 0.0002, "epoch": 1.4945125887669464, "step": 4630}, {"loss": 0.7463, "grad_norm": 0.6983066201210022, "learning_rate": 0.0002, "epoch": 1.4977404777275662, "step": 4640}, {"loss": 0.7877, "grad_norm": 0.586327075958252, "learning_rate": 0.0002, "epoch": 1.500968366688186, "step": 4650}, {"loss": 0.7169, "grad_norm": 0.5839682221412659, "learning_rate": 0.0002, "epoch": 1.5041962556488055, "step": 4660}, {"loss": 0.7524, "grad_norm": 0.5959209203720093, "learning_rate": 0.0002, "epoch": 1.5074241446094254, "step": 4670}, {"loss": 0.7615, "grad_norm": 0.5073857307434082, "learning_rate": 0.0002, "epoch": 1.5106520335700453, "step": 4680}, {"loss": 0.7258, "grad_norm": 0.5183001160621643, "learning_rate": 0.0002, "epoch": 1.513879922530665, "step": 4690}, {"loss": 0.784, "grad_norm": 0.593530535697937, "learning_rate": 0.0002, "epoch": 1.5171078114912846, "step": 4700}, {"loss": 0.7722, "grad_norm": 0.675993025302887, "learning_rate": 0.0002, "epoch": 1.5203357004519045, "step": 4710}, {"loss": 0.7485, "grad_norm": 0.5823286771774292, "learning_rate": 0.0002, "epoch": 1.5235635894125243, "step": 4720}, {"loss": 0.7474, "grad_norm": 0.5825035572052002, "learning_rate": 0.0002, "epoch": 1.526791478373144, "step": 4730}, {"loss": 0.8287, "grad_norm": 0.5689691305160522, "learning_rate": 0.0002, "epoch": 1.5300193673337636, "step": 4740}, {"loss": 0.7279, "grad_norm": 0.6037150621414185, "learning_rate": 0.0002, "epoch": 1.5332472562943835, "step": 4750}, {"loss": 0.7865, "grad_norm": 0.6393677592277527, "learning_rate": 0.0002, "epoch": 1.5364751452550034, "step": 4760}, {"loss": 0.805, "grad_norm": 0.5926381945610046, "learning_rate": 0.0002, "epoch": 1.539703034215623, "step": 4770}, {"loss": 0.7425, "grad_norm": 0.9468599557876587, "learning_rate": 0.0002, "epoch": 1.5429309231762427, "step": 4780}, {"loss": 0.7565, "grad_norm": 0.7544237375259399, "learning_rate": 0.0002, "epoch": 1.5461588121368623, "step": 4790}, {"loss": 0.7398, "grad_norm": 0.5308566093444824, "learning_rate": 0.0002, "epoch": 1.5493867010974822, "step": 4800}, {"loss": 0.7756, "grad_norm": 0.6590296030044556, "learning_rate": 0.0002, "epoch": 1.552614590058102, "step": 4810}, {"loss": 0.7212, "grad_norm": 0.5630404353141785, "learning_rate": 0.0002, "epoch": 1.5558424790187217, "step": 4820}, {"loss": 0.7593, "grad_norm": 0.6800200939178467, "learning_rate": 0.0002, "epoch": 1.5590703679793414, "step": 4830}, {"loss": 0.7373, "grad_norm": 0.5463718175888062, "learning_rate": 0.0002, "epoch": 1.5622982569399613, "step": 4840}, {"loss": 0.7519, "grad_norm": 0.505135178565979, "learning_rate": 0.0002, "epoch": 1.5655261459005811, "step": 4850}, {"loss": 0.8122, "grad_norm": 0.5469676852226257, "learning_rate": 0.0002, "epoch": 1.5687540348612008, "step": 4860}, {"loss": 0.7185, "grad_norm": 0.5318337678909302, "learning_rate": 0.0002, "epoch": 1.5719819238218204, "step": 4870}, {"loss": 0.7324, "grad_norm": 0.7287914752960205, "learning_rate": 0.0002, "epoch": 1.5752098127824403, "step": 4880}, {"loss": 0.7532, "grad_norm": 0.7318989038467407, "learning_rate": 0.0002, "epoch": 1.5784377017430602, "step": 4890}, {"loss": 0.7851, "grad_norm": 0.6499921679496765, "learning_rate": 0.0002, "epoch": 1.5816655907036798, "step": 4900}, {"loss": 0.753, "grad_norm": 0.47907355427742004, "learning_rate": 0.0002, "epoch": 1.5848934796642995, "step": 4910}, {"loss": 0.7699, "grad_norm": 0.7338833808898926, "learning_rate": 0.0002, "epoch": 1.5881213686249191, "step": 4920}, {"loss": 0.7592, "grad_norm": 0.5800719261169434, "learning_rate": 0.0002, "epoch": 1.591349257585539, "step": 4930}, {"loss": 0.7211, "grad_norm": 0.5365763306617737, "learning_rate": 0.0002, "epoch": 1.594577146546159, "step": 4940}, {"loss": 0.777, "grad_norm": 0.5800772309303284, "learning_rate": 0.0002, "epoch": 1.5978050355067785, "step": 4950}, {"loss": 0.8027, "grad_norm": 0.7878010869026184, "learning_rate": 0.0002, "epoch": 1.6010329244673982, "step": 4960}, {"loss": 0.7894, "grad_norm": 0.5919058918952942, "learning_rate": 0.0002, "epoch": 1.604260813428018, "step": 4970}, {"loss": 0.7762, "grad_norm": 0.5004435181617737, "learning_rate": 0.0002, "epoch": 1.607488702388638, "step": 4980}, {"loss": 0.7447, "grad_norm": 0.6299242377281189, "learning_rate": 0.0002, "epoch": 1.6107165913492576, "step": 4990}, {"loss": 0.7149, "grad_norm": 0.6307242512702942, "learning_rate": 0.0002, "epoch": 1.6139444803098772, "step": 5000}, {"loss": 0.7693, "grad_norm": 0.7838703989982605, "learning_rate": 0.0002, "epoch": 1.6171723692704971, "step": 5010}, {"loss": 0.7364, "grad_norm": 0.6454671621322632, "learning_rate": 0.0002, "epoch": 1.620400258231117, "step": 5020}, {"loss": 0.74, "grad_norm": 0.5907095670700073, "learning_rate": 0.0002, "epoch": 1.6236281471917366, "step": 5030}, {"loss": 0.7331, "grad_norm": 0.6053501963615417, "learning_rate": 0.0002, "epoch": 1.6268560361523563, "step": 5040}, {"loss": 0.6987, "grad_norm": 0.5644670128822327, "learning_rate": 0.0002, "epoch": 1.630083925112976, "step": 5050}, {"loss": 0.7886, "grad_norm": 0.6320949792861938, "learning_rate": 0.0002, "epoch": 1.6333118140735958, "step": 5060}, {"loss": 0.7109, "grad_norm": 0.6101489067077637, "learning_rate": 0.0002, "epoch": 1.6365397030342157, "step": 5070}, {"loss": 0.6922, "grad_norm": 0.9435283541679382, "learning_rate": 0.0002, "epoch": 1.6397675919948353, "step": 5080}, {"loss": 0.729, "grad_norm": 0.6668919324874878, "learning_rate": 0.0002, "epoch": 1.642995480955455, "step": 5090}, {"loss": 0.7402, "grad_norm": 0.6160340905189514, "learning_rate": 0.0002, "epoch": 1.6462233699160749, "step": 5100}, {"loss": 0.7461, "grad_norm": 0.5999835729598999, "learning_rate": 0.0002, "epoch": 1.6494512588766947, "step": 5110}, {"loss": 0.7661, "grad_norm": 0.9378551840782166, "learning_rate": 0.0002, "epoch": 1.6526791478373144, "step": 5120}, {"loss": 0.7586, "grad_norm": 0.4795055389404297, "learning_rate": 0.0002, "epoch": 1.655907036797934, "step": 5130}, {"loss": 0.7342, "grad_norm": 0.4878861606121063, "learning_rate": 0.0002, "epoch": 1.659134925758554, "step": 5140}, {"loss": 0.7362, "grad_norm": 0.6042965054512024, "learning_rate": 0.0002, "epoch": 1.6623628147191738, "step": 5150}, {"loss": 0.7863, "grad_norm": 0.5829901695251465, "learning_rate": 0.0002, "epoch": 1.6655907036797934, "step": 5160}, {"loss": 0.7498, "grad_norm": 0.5168480277061462, "learning_rate": 0.0002, "epoch": 1.668818592640413, "step": 5170}, {"loss": 0.7333, "grad_norm": 0.6489511132240295, "learning_rate": 0.0002, "epoch": 1.672046481601033, "step": 5180}, {"loss": 0.7257, "grad_norm": 0.5955966114997864, "learning_rate": 0.0002, "epoch": 1.6752743705616526, "step": 5190}, {"loss": 0.7938, "grad_norm": 0.6228088140487671, "learning_rate": 0.0002, "epoch": 1.6785022595222725, "step": 5200}, {"loss": 0.7626, "grad_norm": 0.5726390480995178, "learning_rate": 0.0002, "epoch": 1.6817301484828922, "step": 5210}, {"loss": 0.7479, "grad_norm": 0.6116343140602112, "learning_rate": 0.0002, "epoch": 1.6849580374435118, "step": 5220}, {"loss": 0.7169, "grad_norm": 0.5483687520027161, "learning_rate": 0.0002, "epoch": 1.6881859264041317, "step": 5230}, {"loss": 0.7293, "grad_norm": 0.570941686630249, "learning_rate": 0.0002, "epoch": 1.6914138153647515, "step": 5240}, {"loss": 0.723, "grad_norm": 0.6048086285591125, "learning_rate": 0.0002, "epoch": 1.6946417043253712, "step": 5250}, {"loss": 0.7861, "grad_norm": 0.6769003868103027, "learning_rate": 0.0002, "epoch": 1.6978695932859909, "step": 5260}, {"loss": 0.7885, "grad_norm": 0.5629057884216309, "learning_rate": 0.0002, "epoch": 1.7010974822466107, "step": 5270}, {"loss": 0.7693, "grad_norm": 0.657341480255127, "learning_rate": 0.0002, "epoch": 1.7043253712072306, "step": 5280}, {"loss": 0.7357, "grad_norm": 0.6256147623062134, "learning_rate": 0.0002, "epoch": 1.7075532601678503, "step": 5290}, {"loss": 0.714, "grad_norm": 0.5498088002204895, "learning_rate": 0.0002, "epoch": 1.71078114912847, "step": 5300}, {"loss": 0.7669, "grad_norm": 0.5078358054161072, "learning_rate": 0.0002, "epoch": 1.7140090380890898, "step": 5310}, {"loss": 0.7872, "grad_norm": 0.6696692705154419, "learning_rate": 0.0002, "epoch": 1.7172369270497096, "step": 5320}, {"loss": 0.8205, "grad_norm": 0.6692847013473511, "learning_rate": 0.0002, "epoch": 1.7204648160103293, "step": 5330}, {"loss": 0.7432, "grad_norm": 0.5415751934051514, "learning_rate": 0.0002, "epoch": 1.723692704970949, "step": 5340}, {"loss": 0.7499, "grad_norm": 0.5367611050605774, "learning_rate": 0.0002, "epoch": 1.7269205939315686, "step": 5350}, {"loss": 0.7631, "grad_norm": 0.7321061491966248, "learning_rate": 0.0002, "epoch": 1.7301484828921885, "step": 5360}, {"loss": 0.7827, "grad_norm": 0.723972499370575, "learning_rate": 0.0002, "epoch": 1.7333763718528084, "step": 5370}, {"loss": 0.7077, "grad_norm": 0.7328100204467773, "learning_rate": 0.0002, "epoch": 1.736604260813428, "step": 5380}, {"loss": 0.7503, "grad_norm": 0.5785264372825623, "learning_rate": 0.0002, "epoch": 1.7398321497740477, "step": 5390}, {"loss": 0.7188, "grad_norm": 0.7812932133674622, "learning_rate": 0.0002, "epoch": 1.7430600387346675, "step": 5400}, {"loss": 0.7386, "grad_norm": 0.6493327617645264, "learning_rate": 0.0002, "epoch": 1.7462879276952874, "step": 5410}, {"loss": 0.7487, "grad_norm": 0.5825939774513245, "learning_rate": 0.0002, "epoch": 1.749515816655907, "step": 5420}, {"loss": 0.7625, "grad_norm": 0.6969610452651978, "learning_rate": 0.0002, "epoch": 1.7527437056165267, "step": 5430}, {"loss": 0.7512, "grad_norm": 0.5558062195777893, "learning_rate": 0.0002, "epoch": 1.7559715945771466, "step": 5440}, {"loss": 0.7256, "grad_norm": 0.49222221970558167, "learning_rate": 0.0002, "epoch": 1.7591994835377665, "step": 5450}, {"loss": 0.7477, "grad_norm": 0.5844656825065613, "learning_rate": 0.0002, "epoch": 1.762427372498386, "step": 5460}, {"loss": 0.7695, "grad_norm": 0.8706597685813904, "learning_rate": 0.0002, "epoch": 1.7656552614590058, "step": 5470}, {"loss": 0.7582, "grad_norm": 0.6167706251144409, "learning_rate": 0.0002, "epoch": 1.7688831504196254, "step": 5480}, {"loss": 0.7521, "grad_norm": 0.5890011787414551, "learning_rate": 0.0002, "epoch": 1.7721110393802453, "step": 5490}, {"loss": 0.8319, "grad_norm": 0.6551728248596191, "learning_rate": 0.0002, "epoch": 1.7753389283408652, "step": 5500}, {"loss": 0.7615, "grad_norm": 0.5848751068115234, "learning_rate": 0.0002, "epoch": 1.7785668173014848, "step": 5510}, {"loss": 0.7622, "grad_norm": 0.6664014458656311, "learning_rate": 0.0002, "epoch": 1.7817947062621045, "step": 5520}, {"loss": 0.7544, "grad_norm": 0.5931693911552429, "learning_rate": 0.0002, "epoch": 1.7850225952227243, "step": 5530}, {"loss": 0.7992, "grad_norm": 0.5534724593162537, "learning_rate": 0.0002, "epoch": 1.7882504841833442, "step": 5540}, {"loss": 0.7967, "grad_norm": 0.5590878129005432, "learning_rate": 0.0002, "epoch": 1.7914783731439639, "step": 5550}, {"loss": 0.7406, "grad_norm": 0.6947470903396606, "learning_rate": 0.0002, "epoch": 1.7947062621045835, "step": 5560}, {"loss": 0.7614, "grad_norm": 0.6104130148887634, "learning_rate": 0.0002, "epoch": 1.7979341510652034, "step": 5570}, {"loss": 0.8032, "grad_norm": 0.6135714054107666, "learning_rate": 0.0002, "epoch": 1.8011620400258233, "step": 5580}, {"loss": 0.7403, "grad_norm": 0.6626853346824646, "learning_rate": 0.0002, "epoch": 1.804389928986443, "step": 5590}, {"loss": 0.7746, "grad_norm": 0.6977612972259521, "learning_rate": 0.0002, "epoch": 1.8076178179470626, "step": 5600}, {"loss": 0.7899, "grad_norm": 0.6275238394737244, "learning_rate": 0.0002, "epoch": 1.8108457069076824, "step": 5610}, {"loss": 0.7392, "grad_norm": 0.5017505288124084, "learning_rate": 0.0002, "epoch": 1.814073595868302, "step": 5620}, {"loss": 0.7669, "grad_norm": 0.8314290642738342, "learning_rate": 0.0002, "epoch": 1.817301484828922, "step": 5630}, {"loss": 0.7031, "grad_norm": 0.6863582134246826, "learning_rate": 0.0002, "epoch": 1.8205293737895416, "step": 5640}, {"loss": 0.743, "grad_norm": 0.69544917345047, "learning_rate": 0.0002, "epoch": 1.8237572627501613, "step": 5650}, {"loss": 0.7277, "grad_norm": 0.515499472618103, "learning_rate": 0.0002, "epoch": 1.8269851517107811, "step": 5660}, {"loss": 0.7166, "grad_norm": 0.6100873947143555, "learning_rate": 0.0002, "epoch": 1.830213040671401, "step": 5670}, {"loss": 0.7217, "grad_norm": 0.67416912317276, "learning_rate": 0.0002, "epoch": 1.8334409296320207, "step": 5680}, {"loss": 0.7575, "grad_norm": 0.7057772278785706, "learning_rate": 0.0002, "epoch": 1.8366688185926403, "step": 5690}, {"loss": 0.7483, "grad_norm": 0.7374551892280579, "learning_rate": 0.0002, "epoch": 1.8398967075532602, "step": 5700}, {"loss": 0.81, "grad_norm": 0.6266297101974487, "learning_rate": 0.0002, "epoch": 1.84312459651388, "step": 5710}, {"loss": 0.728, "grad_norm": 0.5629227757453918, "learning_rate": 0.0002, "epoch": 1.8463524854744997, "step": 5720}, {"loss": 0.8043, "grad_norm": 0.6603655815124512, "learning_rate": 0.0002, "epoch": 1.8495803744351194, "step": 5730}, {"loss": 0.7587, "grad_norm": 0.8113715052604675, "learning_rate": 0.0002, "epoch": 1.8528082633957392, "step": 5740}, {"loss": 0.7486, "grad_norm": 0.7143914103507996, "learning_rate": 0.0002, "epoch": 1.856036152356359, "step": 5750}, {"loss": 0.7619, "grad_norm": 0.6273732781410217, "learning_rate": 0.0002, "epoch": 1.8592640413169788, "step": 5760}, {"loss": 0.7962, "grad_norm": 0.5428690910339355, "learning_rate": 0.0002, "epoch": 1.8624919302775984, "step": 5770}, {"loss": 0.7581, "grad_norm": 0.6405037641525269, "learning_rate": 0.0002, "epoch": 1.865719819238218, "step": 5780}, {"loss": 0.7569, "grad_norm": 0.700873613357544, "learning_rate": 0.0002, "epoch": 1.868947708198838, "step": 5790}, {"loss": 0.7353, "grad_norm": 0.5645238161087036, "learning_rate": 0.0002, "epoch": 1.8721755971594578, "step": 5800}, {"loss": 0.8037, "grad_norm": 0.8780353665351868, "learning_rate": 0.0002, "epoch": 1.8754034861200775, "step": 5810}, {"loss": 0.7686, "grad_norm": 0.6295409798622131, "learning_rate": 0.0002, "epoch": 1.878631375080697, "step": 5820}, {"loss": 0.8067, "grad_norm": 0.678269624710083, "learning_rate": 0.0002, "epoch": 1.881859264041317, "step": 5830}, {"loss": 0.7537, "grad_norm": 0.6464608907699585, "learning_rate": 0.0002, "epoch": 1.8850871530019369, "step": 5840}, {"loss": 0.7423, "grad_norm": 0.6201048493385315, "learning_rate": 0.0002, "epoch": 1.8883150419625565, "step": 5850}, {"loss": 0.7694, "grad_norm": 0.6046274304389954, "learning_rate": 0.0002, "epoch": 1.8915429309231762, "step": 5860}, {"loss": 0.781, "grad_norm": 0.7532408833503723, "learning_rate": 0.0002, "epoch": 1.894770819883796, "step": 5870}, {"loss": 0.6885, "grad_norm": 0.6066767573356628, "learning_rate": 0.0002, "epoch": 1.897998708844416, "step": 5880}, {"loss": 0.7631, "grad_norm": 0.6289830207824707, "learning_rate": 0.0002, "epoch": 1.9012265978050356, "step": 5890}, {"loss": 0.7501, "grad_norm": 0.5204319953918457, "learning_rate": 0.0002, "epoch": 1.9044544867656552, "step": 5900}, {"loss": 0.7335, "grad_norm": 0.6708219647407532, "learning_rate": 0.0002, "epoch": 1.9076823757262749, "step": 5910}, {"loss": 0.7455, "grad_norm": 0.4915677309036255, "learning_rate": 0.0002, "epoch": 1.9109102646868947, "step": 5920}, {"loss": 0.7464, "grad_norm": 0.652717113494873, "learning_rate": 0.0002, "epoch": 1.9141381536475146, "step": 5930}, {"loss": 0.7687, "grad_norm": 0.5446316003799438, "learning_rate": 0.0002, "epoch": 1.9173660426081343, "step": 5940}, {"loss": 0.7424, "grad_norm": 0.4958149194717407, "learning_rate": 0.0002, "epoch": 1.920593931568754, "step": 5950}, {"loss": 0.757, "grad_norm": 0.5623434782028198, "learning_rate": 0.0002, "epoch": 1.9238218205293738, "step": 5960}, {"loss": 0.7446, "grad_norm": 0.6855450868606567, "learning_rate": 0.0002, "epoch": 1.9270497094899937, "step": 5970}, {"loss": 0.827, "grad_norm": 0.5710492730140686, "learning_rate": 0.0002, "epoch": 1.9302775984506133, "step": 5980}, {"loss": 0.7245, "grad_norm": 0.5379431843757629, "learning_rate": 0.0002, "epoch": 1.933505487411233, "step": 5990}, {"loss": 0.77, "grad_norm": 0.557129442691803, "learning_rate": 0.0002, "epoch": 1.9367333763718528, "step": 6000}, {"loss": 0.6988, "grad_norm": 0.6336663961410522, "learning_rate": 0.0002, "epoch": 1.9399612653324727, "step": 6010}, {"loss": 0.7316, "grad_norm": 0.5950582027435303, "learning_rate": 0.0002, "epoch": 1.9431891542930924, "step": 6020}, {"loss": 0.7443, "grad_norm": 0.5905954837799072, "learning_rate": 0.0002, "epoch": 1.946417043253712, "step": 6030}, {"loss": 0.7127, "grad_norm": 0.6688982844352722, "learning_rate": 0.0002, "epoch": 1.9496449322143317, "step": 6040}, {"loss": 0.79, "grad_norm": 0.5440775752067566, "learning_rate": 0.0002, "epoch": 1.9528728211749515, "step": 6050}, {"loss": 0.7221, "grad_norm": 0.6207906603813171, "learning_rate": 0.0002, "epoch": 1.9561007101355714, "step": 6060}, {"loss": 0.738, "grad_norm": 0.6999374628067017, "learning_rate": 0.0002, "epoch": 1.959328599096191, "step": 6070}, {"loss": 0.7372, "grad_norm": 0.6310848593711853, "learning_rate": 0.0002, "epoch": 1.9625564880568107, "step": 6080}, {"loss": 0.7198, "grad_norm": 0.5903388261795044, "learning_rate": 0.0002, "epoch": 1.9657843770174306, "step": 6090}, {"loss": 0.7103, "grad_norm": 0.6333889961242676, "learning_rate": 0.0002, "epoch": 1.9690122659780505, "step": 6100}, {"loss": 0.7246, "grad_norm": 0.5604711174964905, "learning_rate": 0.0002, "epoch": 1.97224015493867, "step": 6110}, {"loss": 0.761, "grad_norm": 0.9234541654586792, "learning_rate": 0.0002, "epoch": 1.9754680438992898, "step": 6120}, {"loss": 0.7375, "grad_norm": 0.6149102449417114, "learning_rate": 0.0002, "epoch": 1.9786959328599096, "step": 6130}, {"loss": 0.7286, "grad_norm": 0.615446150302887, "learning_rate": 0.0002, "epoch": 1.9819238218205295, "step": 6140}, {"loss": 0.7333, "grad_norm": 0.5176635980606079, "learning_rate": 0.0002, "epoch": 1.9851517107811492, "step": 6150}, {"loss": 0.718, "grad_norm": 0.7124109864234924, "learning_rate": 0.0002, "epoch": 1.9883795997417688, "step": 6160}, {"loss": 0.7669, "grad_norm": 0.6317567825317383, "learning_rate": 0.0002, "epoch": 1.9916074887023887, "step": 6170}, {"loss": 0.8012, "grad_norm": 0.6855016350746155, "learning_rate": 0.0002, "epoch": 1.9948353776630086, "step": 6180}, {"loss": 0.7376, "grad_norm": 0.6423715353012085, "learning_rate": 0.0002, "epoch": 1.9980632666236282, "step": 6190}, {"eval_loss": 1.1096643209457397, "eval_runtime": 147.7997, "eval_samples_per_second": 4.959, "eval_steps_per_second": 0.622, "epoch": 2.0, "step": 6196}, {"loss": 0.7131, "grad_norm": 0.5322932600975037, "learning_rate": 0.0002, "epoch": 2.001291155584248, "step": 6200}, {"loss": 0.6619, "grad_norm": 0.8152306079864502, "learning_rate": 0.0002, "epoch": 2.0045190445448675, "step": 6210}, {"loss": 0.6731, "grad_norm": 0.6215983033180237, "learning_rate": 0.0002, "epoch": 2.0077469335054876, "step": 6220}, {"loss": 0.658, "grad_norm": 0.845498263835907, "learning_rate": 0.0002, "epoch": 2.0109748224661073, "step": 6230}, {"loss": 0.6954, "grad_norm": 0.733559787273407, "learning_rate": 0.0002, "epoch": 2.014202711426727, "step": 6240}, {"loss": 0.6707, "grad_norm": 0.51433926820755, "learning_rate": 0.0002, "epoch": 2.0174306003873466, "step": 6250}, {"loss": 0.6304, "grad_norm": 0.6374049782752991, "learning_rate": 0.0002, "epoch": 2.020658489347966, "step": 6260}, {"loss": 0.6831, "grad_norm": 0.7833638191223145, "learning_rate": 0.0002, "epoch": 2.0238863783085863, "step": 6270}, {"loss": 0.6672, "grad_norm": 0.8929463028907776, "learning_rate": 0.0002, "epoch": 2.027114267269206, "step": 6280}, {"loss": 0.637, "grad_norm": 0.669731855392456, "learning_rate": 0.0002, "epoch": 2.0303421562298256, "step": 6290}, {"loss": 0.646, "grad_norm": 0.5846071243286133, "learning_rate": 0.0002, "epoch": 2.0335700451904453, "step": 6300}, {"loss": 0.6647, "grad_norm": 0.7087787985801697, "learning_rate": 0.0002, "epoch": 2.0367979341510654, "step": 6310}, {"loss": 0.6433, "grad_norm": 0.6739160418510437, "learning_rate": 0.0002, "epoch": 2.040025823111685, "step": 6320}, {"loss": 0.6301, "grad_norm": 0.4860886335372925, "learning_rate": 0.0002, "epoch": 2.0432537120723047, "step": 6330}, {"loss": 0.6439, "grad_norm": 0.7201244831085205, "learning_rate": 0.0002, "epoch": 2.0464816010329243, "step": 6340}, {"loss": 0.6676, "grad_norm": 0.7409170269966125, "learning_rate": 0.0002, "epoch": 2.0497094899935444, "step": 6350}, {"loss": 0.6153, "grad_norm": 0.6843920350074768, "learning_rate": 0.0002, "epoch": 2.052937378954164, "step": 6360}, {"loss": 0.6674, "grad_norm": 0.7519999742507935, "learning_rate": 0.0002, "epoch": 2.0561652679147837, "step": 6370}, {"loss": 0.6928, "grad_norm": 0.5732819437980652, "learning_rate": 0.0002, "epoch": 2.0593931568754034, "step": 6380}, {"loss": 0.6496, "grad_norm": 0.7565118074417114, "learning_rate": 0.0002, "epoch": 2.062621045836023, "step": 6390}, {"loss": 0.6354, "grad_norm": 0.8147150278091431, "learning_rate": 0.0002, "epoch": 2.065848934796643, "step": 6400}, {"loss": 0.6593, "grad_norm": 0.6941924691200256, "learning_rate": 0.0002, "epoch": 2.0690768237572628, "step": 6410}, {"loss": 0.6698, "grad_norm": 0.6549784541130066, "learning_rate": 0.0002, "epoch": 2.0723047127178824, "step": 6420}, {"loss": 0.6927, "grad_norm": 0.7224905490875244, "learning_rate": 0.0002, "epoch": 2.075532601678502, "step": 6430}, {"loss": 0.6755, "grad_norm": 0.7754863500595093, "learning_rate": 0.0002, "epoch": 2.078760490639122, "step": 6440}, {"loss": 0.6738, "grad_norm": 0.691318154335022, "learning_rate": 0.0002, "epoch": 2.081988379599742, "step": 6450}, {"loss": 0.6233, "grad_norm": 0.6009294986724854, "learning_rate": 0.0002, "epoch": 2.0852162685603615, "step": 6460}, {"loss": 0.6691, "grad_norm": 0.6753945350646973, "learning_rate": 0.0002, "epoch": 2.088444157520981, "step": 6470}, {"loss": 0.6935, "grad_norm": 0.6899921298027039, "learning_rate": 0.0002, "epoch": 2.091672046481601, "step": 6480}, {"loss": 0.6918, "grad_norm": 0.846510648727417, "learning_rate": 0.0002, "epoch": 2.094899935442221, "step": 6490}, {"loss": 0.6084, "grad_norm": 0.6432605981826782, "learning_rate": 0.0002, "epoch": 2.0981278244028405, "step": 6500}, {"loss": 0.6867, "grad_norm": 0.8125239014625549, "learning_rate": 0.0002, "epoch": 2.10135571336346, "step": 6510}, {"loss": 0.6939, "grad_norm": 0.628302812576294, "learning_rate": 0.0002, "epoch": 2.1045836023240803, "step": 6520}, {"loss": 0.5909, "grad_norm": 0.7164334654808044, "learning_rate": 0.0002, "epoch": 2.1078114912847, "step": 6530}, {"loss": 0.6578, "grad_norm": 0.7476949095726013, "learning_rate": 0.0002, "epoch": 2.1110393802453196, "step": 6540}, {"loss": 0.6351, "grad_norm": 0.7577515840530396, "learning_rate": 0.0002, "epoch": 2.114267269205939, "step": 6550}, {"loss": 0.6669, "grad_norm": 0.5684467554092407, "learning_rate": 0.0002, "epoch": 2.117495158166559, "step": 6560}, {"loss": 0.6343, "grad_norm": 0.6121789216995239, "learning_rate": 0.0002, "epoch": 2.120723047127179, "step": 6570}, {"loss": 0.6314, "grad_norm": 0.6095348596572876, "learning_rate": 0.0002, "epoch": 2.1239509360877986, "step": 6580}, {"loss": 0.6276, "grad_norm": 0.7803651690483093, "learning_rate": 0.0002, "epoch": 2.1271788250484183, "step": 6590}, {"loss": 0.6579, "grad_norm": 0.5990583300590515, "learning_rate": 0.0002, "epoch": 2.130406714009038, "step": 6600}, {"loss": 0.6228, "grad_norm": 0.6569220423698425, "learning_rate": 0.0002, "epoch": 2.133634602969658, "step": 6610}, {"loss": 0.7049, "grad_norm": 0.5961166620254517, "learning_rate": 0.0002, "epoch": 2.1368624919302777, "step": 6620}, {"loss": 0.6359, "grad_norm": 0.5860554575920105, "learning_rate": 0.0002, "epoch": 2.1400903808908973, "step": 6630}, {"loss": 0.6651, "grad_norm": 0.5994001626968384, "learning_rate": 0.0002, "epoch": 2.143318269851517, "step": 6640}, {"loss": 0.6421, "grad_norm": 0.7723015546798706, "learning_rate": 0.0002, "epoch": 2.146546158812137, "step": 6650}, {"loss": 0.6723, "grad_norm": 0.676355242729187, "learning_rate": 0.0002, "epoch": 2.1497740477727567, "step": 6660}, {"loss": 0.6826, "grad_norm": 0.5689092874526978, "learning_rate": 0.0002, "epoch": 2.1530019367333764, "step": 6670}, {"loss": 0.6613, "grad_norm": 0.6933727264404297, "learning_rate": 0.0002, "epoch": 2.156229825693996, "step": 6680}, {"loss": 0.6957, "grad_norm": 0.8380527496337891, "learning_rate": 0.0002, "epoch": 2.159457714654616, "step": 6690}, {"loss": 0.6705, "grad_norm": 0.6876497268676758, "learning_rate": 0.0002, "epoch": 2.1626856036152358, "step": 6700}, {"loss": 0.6112, "grad_norm": 0.6418334245681763, "learning_rate": 0.0002, "epoch": 2.1659134925758554, "step": 6710}, {"loss": 0.6357, "grad_norm": 0.7169192433357239, "learning_rate": 0.0002, "epoch": 2.169141381536475, "step": 6720}, {"loss": 0.6492, "grad_norm": 0.6664170622825623, "learning_rate": 0.0002, "epoch": 2.1723692704970947, "step": 6730}, {"loss": 0.6751, "grad_norm": 0.6011993288993835, "learning_rate": 0.0002, "epoch": 2.175597159457715, "step": 6740}, {"loss": 0.696, "grad_norm": 0.5529947280883789, "learning_rate": 0.0002, "epoch": 2.1788250484183345, "step": 6750}, {"loss": 0.671, "grad_norm": 0.6879532933235168, "learning_rate": 0.0002, "epoch": 2.182052937378954, "step": 6760}, {"loss": 0.6634, "grad_norm": 0.6426113843917847, "learning_rate": 0.0002, "epoch": 2.1852808263395738, "step": 6770}, {"loss": 0.6592, "grad_norm": 0.6571047306060791, "learning_rate": 0.0002, "epoch": 2.188508715300194, "step": 6780}, {"loss": 0.6494, "grad_norm": 0.6400564908981323, "learning_rate": 0.0002, "epoch": 2.1917366042608135, "step": 6790}, {"loss": 0.6369, "grad_norm": 0.6509664058685303, "learning_rate": 0.0002, "epoch": 2.194964493221433, "step": 6800}, {"loss": 0.6771, "grad_norm": 0.6673197150230408, "learning_rate": 0.0002, "epoch": 2.198192382182053, "step": 6810}, {"loss": 0.6491, "grad_norm": 0.48205727338790894, "learning_rate": 0.0002, "epoch": 2.2014202711426725, "step": 6820}, {"loss": 0.6894, "grad_norm": 0.849525511264801, "learning_rate": 0.0002, "epoch": 2.2046481601032926, "step": 6830}, {"loss": 0.6977, "grad_norm": 0.6150892376899719, "learning_rate": 0.0002, "epoch": 2.207876049063912, "step": 6840}, {"loss": 0.6843, "grad_norm": 0.7826945781707764, "learning_rate": 0.0002, "epoch": 2.211103938024532, "step": 6850}, {"loss": 0.6338, "grad_norm": 0.5711963772773743, "learning_rate": 0.0002, "epoch": 2.2143318269851515, "step": 6860}, {"loss": 0.6585, "grad_norm": 0.6017758846282959, "learning_rate": 0.0002, "epoch": 2.2175597159457716, "step": 6870}, {"loss": 0.6657, "grad_norm": 0.785434901714325, "learning_rate": 0.0002, "epoch": 2.2207876049063913, "step": 6880}, {"loss": 0.7075, "grad_norm": 0.6251688599586487, "learning_rate": 0.0002, "epoch": 2.224015493867011, "step": 6890}, {"loss": 0.6564, "grad_norm": 0.8242034316062927, "learning_rate": 0.0002, "epoch": 2.2272433828276306, "step": 6900}, {"loss": 0.672, "grad_norm": 0.7272933125495911, "learning_rate": 0.0002, "epoch": 2.2304712717882507, "step": 6910}, {"loss": 0.6541, "grad_norm": 0.7159379720687866, "learning_rate": 0.0002, "epoch": 2.2336991607488703, "step": 6920}, {"loss": 0.6859, "grad_norm": 0.6518042087554932, "learning_rate": 0.0002, "epoch": 2.23692704970949, "step": 6930}, {"loss": 0.5987, "grad_norm": 0.7365370392799377, "learning_rate": 0.0002, "epoch": 2.2401549386701096, "step": 6940}, {"loss": 0.6511, "grad_norm": 0.5674061179161072, "learning_rate": 0.0002, "epoch": 2.2433828276307297, "step": 6950}, {"loss": 0.6748, "grad_norm": 0.669185996055603, "learning_rate": 0.0002, "epoch": 2.2466107165913494, "step": 6960}, {"loss": 0.656, "grad_norm": 0.6638304591178894, "learning_rate": 0.0002, "epoch": 2.249838605551969, "step": 6970}, {"loss": 0.636, "grad_norm": 0.757006824016571, "learning_rate": 0.0002, "epoch": 2.2530664945125887, "step": 6980}, {"loss": 0.6597, "grad_norm": 0.7574930787086487, "learning_rate": 0.0002, "epoch": 2.2562943834732083, "step": 6990}, {"loss": 0.6859, "grad_norm": 0.7819514870643616, "learning_rate": 0.0002, "epoch": 2.2595222724338284, "step": 7000}, {"loss": 0.6238, "grad_norm": 0.6987583041191101, "learning_rate": 0.0002, "epoch": 2.262750161394448, "step": 7010}, {"loss": 0.661, "grad_norm": 0.6628551483154297, "learning_rate": 0.0002, "epoch": 2.2659780503550677, "step": 7020}, {"loss": 0.6254, "grad_norm": 0.7855866551399231, "learning_rate": 0.0002, "epoch": 2.2692059393156874, "step": 7030}, {"loss": 0.6679, "grad_norm": 0.6102892756462097, "learning_rate": 0.0002, "epoch": 2.2724338282763075, "step": 7040}, {"loss": 0.694, "grad_norm": 0.7844198942184448, "learning_rate": 0.0002, "epoch": 2.275661717236927, "step": 7050}, {"loss": 0.63, "grad_norm": 0.6209492087364197, "learning_rate": 0.0002, "epoch": 2.2788896061975468, "step": 7060}, {"loss": 0.6418, "grad_norm": 0.8351290225982666, "learning_rate": 0.0002, "epoch": 2.2821174951581664, "step": 7070}, {"loss": 0.6648, "grad_norm": 0.6883546710014343, "learning_rate": 0.0002, "epoch": 2.285345384118786, "step": 7080}, {"loss": 0.7046, "grad_norm": 0.6626381874084473, "learning_rate": 0.0002, "epoch": 2.288573273079406, "step": 7090}, {"loss": 0.6535, "grad_norm": 0.7216270565986633, "learning_rate": 0.0002, "epoch": 2.291801162040026, "step": 7100}, {"loss": 0.6414, "grad_norm": 0.8246777057647705, "learning_rate": 0.0002, "epoch": 2.2950290510006455, "step": 7110}, {"loss": 0.6315, "grad_norm": 0.614326000213623, "learning_rate": 0.0002, "epoch": 2.2982569399612656, "step": 7120}, {"loss": 0.6303, "grad_norm": 0.8785578012466431, "learning_rate": 0.0002, "epoch": 2.301484828921885, "step": 7130}, {"loss": 0.6348, "grad_norm": 0.7021808624267578, "learning_rate": 0.0002, "epoch": 2.304712717882505, "step": 7140}, {"loss": 0.6738, "grad_norm": 0.6999403238296509, "learning_rate": 0.0002, "epoch": 2.3079406068431245, "step": 7150}, {"loss": 0.6547, "grad_norm": 0.8013143539428711, "learning_rate": 0.0002, "epoch": 2.311168495803744, "step": 7160}, {"loss": 0.6461, "grad_norm": 0.6592583060264587, "learning_rate": 0.0002, "epoch": 2.3143963847643643, "step": 7170}, {"loss": 0.6369, "grad_norm": 0.6260249018669128, "learning_rate": 0.0002, "epoch": 2.317624273724984, "step": 7180}, {"loss": 0.6647, "grad_norm": 0.9352797269821167, "learning_rate": 0.0002, "epoch": 2.3208521626856036, "step": 7190}, {"loss": 0.6543, "grad_norm": 0.6629612445831299, "learning_rate": 0.0002, "epoch": 2.324080051646223, "step": 7200}, {"loss": 0.6811, "grad_norm": 0.7062810063362122, "learning_rate": 0.0002, "epoch": 2.3273079406068433, "step": 7210}, {"loss": 0.67, "grad_norm": 0.7236241102218628, "learning_rate": 0.0002, "epoch": 2.330535829567463, "step": 7220}, {"loss": 0.6462, "grad_norm": 0.7528148293495178, "learning_rate": 0.0002, "epoch": 2.3337637185280826, "step": 7230}, {"loss": 0.694, "grad_norm": 0.7604748606681824, "learning_rate": 0.0002, "epoch": 2.3369916074887023, "step": 7240}, {"loss": 0.6475, "grad_norm": 0.5601189136505127, "learning_rate": 0.0002, "epoch": 2.340219496449322, "step": 7250}, {"loss": 0.6925, "grad_norm": 0.7099230885505676, "learning_rate": 0.0002, "epoch": 2.343447385409942, "step": 7260}, {"loss": 0.6333, "grad_norm": 0.6699047684669495, "learning_rate": 0.0002, "epoch": 2.3466752743705617, "step": 7270}, {"loss": 0.6434, "grad_norm": 0.7315047979354858, "learning_rate": 0.0002, "epoch": 2.3499031633311813, "step": 7280}, {"loss": 0.6927, "grad_norm": 0.632836103439331, "learning_rate": 0.0002, "epoch": 2.353131052291801, "step": 7290}, {"loss": 0.6458, "grad_norm": 0.9410115480422974, "learning_rate": 0.0002, "epoch": 2.356358941252421, "step": 7300}, {"loss": 0.6699, "grad_norm": 0.626554012298584, "learning_rate": 0.0002, "epoch": 2.3595868302130407, "step": 7310}, {"loss": 0.6495, "grad_norm": 0.7538444399833679, "learning_rate": 0.0002, "epoch": 2.3628147191736604, "step": 7320}, {"loss": 0.6321, "grad_norm": 0.6826626062393188, "learning_rate": 0.0002, "epoch": 2.36604260813428, "step": 7330}, {"loss": 0.6752, "grad_norm": 0.6739391088485718, "learning_rate": 0.0002, "epoch": 2.3692704970949, "step": 7340}, {"loss": 0.6518, "grad_norm": 0.7518446445465088, "learning_rate": 0.0002, "epoch": 2.3724983860555198, "step": 7350}, {"loss": 0.7142, "grad_norm": 0.714133083820343, "learning_rate": 0.0002, "epoch": 2.3757262750161394, "step": 7360}, {"loss": 0.6794, "grad_norm": 0.7144588232040405, "learning_rate": 0.0002, "epoch": 2.378954163976759, "step": 7370}, {"loss": 0.6922, "grad_norm": 0.6598120927810669, "learning_rate": 0.0002, "epoch": 2.382182052937379, "step": 7380}, {"loss": 0.6562, "grad_norm": 0.7079148292541504, "learning_rate": 0.0002, "epoch": 2.385409941897999, "step": 7390}, {"loss": 0.6492, "grad_norm": 0.6750902533531189, "learning_rate": 0.0002, "epoch": 2.3886378308586185, "step": 7400}, {"loss": 0.6398, "grad_norm": 0.7181967496871948, "learning_rate": 0.0002, "epoch": 2.391865719819238, "step": 7410}, {"loss": 0.6793, "grad_norm": 0.7720552086830139, "learning_rate": 0.0002, "epoch": 2.3950936087798578, "step": 7420}, {"loss": 0.6804, "grad_norm": 0.7592426538467407, "learning_rate": 0.0002, "epoch": 2.398321497740478, "step": 7430}, {"loss": 0.6667, "grad_norm": 0.7161896824836731, "learning_rate": 0.0002, "epoch": 2.4015493867010975, "step": 7440}, {"loss": 0.6891, "grad_norm": 0.8019260764122009, "learning_rate": 0.0002, "epoch": 2.404777275661717, "step": 7450}, {"loss": 0.6864, "grad_norm": 0.7093342542648315, "learning_rate": 0.0002, "epoch": 2.408005164622337, "step": 7460}, {"loss": 0.6445, "grad_norm": 0.8464207649230957, "learning_rate": 0.0002, "epoch": 2.411233053582957, "step": 7470}, {"loss": 0.6724, "grad_norm": 0.773666501045227, "learning_rate": 0.0002, "epoch": 2.4144609425435766, "step": 7480}, {"loss": 0.6774, "grad_norm": 0.8451611995697021, "learning_rate": 0.0002, "epoch": 2.4176888315041962, "step": 7490}, {"loss": 0.694, "grad_norm": 0.656795084476471, "learning_rate": 0.0002, "epoch": 2.420916720464816, "step": 7500}, {"loss": 0.6824, "grad_norm": 0.7129034996032715, "learning_rate": 0.0002, "epoch": 2.4241446094254355, "step": 7510}, {"loss": 0.711, "grad_norm": 0.8325763940811157, "learning_rate": 0.0002, "epoch": 2.4273724983860556, "step": 7520}, {"loss": 0.6238, "grad_norm": 0.7806527614593506, "learning_rate": 0.0002, "epoch": 2.4306003873466753, "step": 7530}, {"loss": 0.6972, "grad_norm": 0.6994536519050598, "learning_rate": 0.0002, "epoch": 2.433828276307295, "step": 7540}, {"loss": 0.6615, "grad_norm": 0.6898999214172363, "learning_rate": 0.0002, "epoch": 2.437056165267915, "step": 7550}, {"loss": 0.7108, "grad_norm": 0.719490647315979, "learning_rate": 0.0002, "epoch": 2.4402840542285347, "step": 7560}, {"loss": 0.668, "grad_norm": 0.6841562390327454, "learning_rate": 0.0002, "epoch": 2.4435119431891543, "step": 7570}, {"loss": 0.6504, "grad_norm": 0.7573311924934387, "learning_rate": 0.0002, "epoch": 2.446739832149774, "step": 7580}, {"loss": 0.6607, "grad_norm": 0.7295880317687988, "learning_rate": 0.0002, "epoch": 2.4499677211103936, "step": 7590}, {"loss": 0.6593, "grad_norm": 0.710136353969574, "learning_rate": 0.0002, "epoch": 2.4531956100710137, "step": 7600}, {"loss": 0.7137, "grad_norm": 0.6126235127449036, "learning_rate": 0.0002, "epoch": 2.4564234990316334, "step": 7610}, {"loss": 0.6562, "grad_norm": 0.8025609850883484, "learning_rate": 0.0002, "epoch": 2.459651387992253, "step": 7620}, {"loss": 0.6464, "grad_norm": 0.7839472889900208, "learning_rate": 0.0002, "epoch": 2.4628792769528727, "step": 7630}, {"loss": 0.6797, "grad_norm": 0.7253499031066895, "learning_rate": 0.0002, "epoch": 2.4661071659134928, "step": 7640}, {"loss": 0.7341, "grad_norm": 0.7918946743011475, "learning_rate": 0.0002, "epoch": 2.4693350548741124, "step": 7650}, {"loss": 0.6646, "grad_norm": 0.7930178046226501, "learning_rate": 0.0002, "epoch": 2.472562943834732, "step": 7660}, {"loss": 0.6294, "grad_norm": 0.6826170086860657, "learning_rate": 0.0002, "epoch": 2.4757908327953517, "step": 7670}, {"loss": 0.6697, "grad_norm": 0.6576805114746094, "learning_rate": 0.0002, "epoch": 2.4790187217559714, "step": 7680}, {"loss": 0.682, "grad_norm": 0.7012448310852051, "learning_rate": 0.0002, "epoch": 2.4822466107165915, "step": 7690}, {"loss": 0.6418, "grad_norm": 0.7774284482002258, "learning_rate": 0.0002, "epoch": 2.485474499677211, "step": 7700}, {"loss": 0.6566, "grad_norm": 0.6502766013145447, "learning_rate": 0.0002, "epoch": 2.4887023886378308, "step": 7710}, {"loss": 0.6965, "grad_norm": 0.7638739347457886, "learning_rate": 0.0002, "epoch": 2.4919302775984504, "step": 7720}, {"loss": 0.6454, "grad_norm": 0.6217384338378906, "learning_rate": 0.0002, "epoch": 2.4951581665590705, "step": 7730}, {"loss": 0.6837, "grad_norm": 0.7576302886009216, "learning_rate": 0.0002, "epoch": 2.49838605551969, "step": 7740}, {"loss": 0.6855, "grad_norm": 0.6877137422561646, "learning_rate": 0.0002, "epoch": 2.50161394448031, "step": 7750}, {"loss": 0.6604, "grad_norm": 0.6998329162597656, "learning_rate": 0.0002, "epoch": 2.5048418334409295, "step": 7760}, {"loss": 0.6666, "grad_norm": 0.7879213690757751, "learning_rate": 0.0002, "epoch": 2.508069722401549, "step": 7770}, {"loss": 0.715, "grad_norm": 0.7834980487823486, "learning_rate": 0.0002, "epoch": 2.5112976113621692, "step": 7780}, {"loss": 0.6954, "grad_norm": 0.7789630889892578, "learning_rate": 0.0002, "epoch": 2.514525500322789, "step": 7790}, {"loss": 0.6979, "grad_norm": 0.7403590083122253, "learning_rate": 0.0002, "epoch": 2.5177533892834085, "step": 7800}, {"loss": 0.6964, "grad_norm": 0.6029766201972961, "learning_rate": 0.0002, "epoch": 2.5209812782440286, "step": 7810}, {"loss": 0.6887, "grad_norm": 0.7061092257499695, "learning_rate": 0.0002, "epoch": 2.5242091672046483, "step": 7820}, {"loss": 0.6628, "grad_norm": 0.7120763659477234, "learning_rate": 0.0002, "epoch": 2.527437056165268, "step": 7830}, {"loss": 0.6876, "grad_norm": 0.6173675656318665, "learning_rate": 0.0002, "epoch": 2.5306649451258876, "step": 7840}, {"loss": 0.6635, "grad_norm": 0.9566813111305237, "learning_rate": 0.0002, "epoch": 2.5338928340865072, "step": 7850}, {"loss": 0.654, "grad_norm": 0.8497620224952698, "learning_rate": 0.0002, "epoch": 2.5371207230471273, "step": 7860}, {"loss": 0.644, "grad_norm": 0.7663498520851135, "learning_rate": 0.0002, "epoch": 2.540348612007747, "step": 7870}, {"loss": 0.6292, "grad_norm": 0.6329668760299683, "learning_rate": 0.0002, "epoch": 2.5435765009683666, "step": 7880}, {"loss": 0.686, "grad_norm": 0.8128195405006409, "learning_rate": 0.0002, "epoch": 2.5468043899289863, "step": 7890}, {"loss": 0.6619, "grad_norm": 0.6622284650802612, "learning_rate": 0.0002, "epoch": 2.5500322788896064, "step": 7900}, {"loss": 0.693, "grad_norm": 0.8460057973861694, "learning_rate": 0.0002, "epoch": 2.553260167850226, "step": 7910}, {"loss": 0.6619, "grad_norm": 0.6586956977844238, "learning_rate": 0.0002, "epoch": 2.5564880568108457, "step": 7920}, {"loss": 0.6976, "grad_norm": 0.7569382190704346, "learning_rate": 0.0002, "epoch": 2.5597159457714653, "step": 7930}, {"loss": 0.6235, "grad_norm": 0.6409714221954346, "learning_rate": 0.0002, "epoch": 2.562943834732085, "step": 7940}, {"loss": 0.6663, "grad_norm": 0.7031713128089905, "learning_rate": 0.0002, "epoch": 2.566171723692705, "step": 7950}, {"loss": 0.6344, "grad_norm": 0.7983605265617371, "learning_rate": 0.0002, "epoch": 2.5693996126533247, "step": 7960}, {"loss": 0.6834, "grad_norm": 0.7165433168411255, "learning_rate": 0.0002, "epoch": 2.5726275016139444, "step": 7970}, {"loss": 0.6517, "grad_norm": 0.6630598902702332, "learning_rate": 0.0002, "epoch": 2.5758553905745645, "step": 7980}, {"loss": 0.7164, "grad_norm": 0.5883122086524963, "learning_rate": 0.0002, "epoch": 2.579083279535184, "step": 7990}, {"loss": 0.6715, "grad_norm": 0.5928755402565002, "learning_rate": 0.0002, "epoch": 2.5823111684958038, "step": 8000}, {"loss": 0.6701, "grad_norm": 0.7843712568283081, "learning_rate": 0.0002, "epoch": 2.5855390574564234, "step": 8010}, {"loss": 0.6617, "grad_norm": 0.7206324338912964, "learning_rate": 0.0002, "epoch": 2.588766946417043, "step": 8020}, {"loss": 0.6968, "grad_norm": 0.812480092048645, "learning_rate": 0.0002, "epoch": 2.5919948353776627, "step": 8030}, {"loss": 0.6735, "grad_norm": 0.9843078255653381, "learning_rate": 0.0002, "epoch": 2.595222724338283, "step": 8040}, {"loss": 0.6877, "grad_norm": 0.7524392604827881, "learning_rate": 0.0002, "epoch": 2.5984506132989025, "step": 8050}, {"loss": 0.7188, "grad_norm": 0.6220380067825317, "learning_rate": 0.0002, "epoch": 2.601678502259522, "step": 8060}, {"loss": 0.6878, "grad_norm": 0.7461398243904114, "learning_rate": 0.0002, "epoch": 2.6049063912201422, "step": 8070}, {"loss": 0.6626, "grad_norm": 0.720974326133728, "learning_rate": 0.0002, "epoch": 2.608134280180762, "step": 8080}, {"loss": 0.6756, "grad_norm": 0.649509847164154, "learning_rate": 0.0002, "epoch": 2.6113621691413815, "step": 8090}, {"loss": 0.6394, "grad_norm": 0.6894662976264954, "learning_rate": 0.0002, "epoch": 2.614590058102001, "step": 8100}, {"loss": 0.6329, "grad_norm": 0.734433114528656, "learning_rate": 0.0002, "epoch": 2.617817947062621, "step": 8110}, {"loss": 0.6698, "grad_norm": 0.7468628883361816, "learning_rate": 0.0002, "epoch": 2.621045836023241, "step": 8120}, {"loss": 0.658, "grad_norm": 0.6508180499076843, "learning_rate": 0.0002, "epoch": 2.6242737249838606, "step": 8130}, {"loss": 0.6619, "grad_norm": 0.8735209107398987, "learning_rate": 0.0002, "epoch": 2.6275016139444802, "step": 8140}, {"loss": 0.6717, "grad_norm": 0.8162857294082642, "learning_rate": 0.0002, "epoch": 2.6307295029051003, "step": 8150}, {"loss": 0.6496, "grad_norm": 0.628872811794281, "learning_rate": 0.0002, "epoch": 2.63395739186572, "step": 8160}, {"loss": 0.6608, "grad_norm": 0.8078708052635193, "learning_rate": 0.0002, "epoch": 2.6371852808263396, "step": 8170}, {"loss": 0.6916, "grad_norm": 0.7849429845809937, "learning_rate": 0.0002, "epoch": 2.6404131697869593, "step": 8180}, {"loss": 0.6671, "grad_norm": 0.8115387558937073, "learning_rate": 0.0002, "epoch": 2.643641058747579, "step": 8190}, {"loss": 0.6761, "grad_norm": 0.7462222576141357, "learning_rate": 0.0002, "epoch": 2.6468689477081986, "step": 8200}, {"loss": 0.6923, "grad_norm": 0.753662645816803, "learning_rate": 0.0002, "epoch": 2.6500968366688187, "step": 8210}, {"loss": 0.6666, "grad_norm": 0.6100404858589172, "learning_rate": 0.0002, "epoch": 2.6533247256294383, "step": 8220}, {"loss": 0.7256, "grad_norm": 0.9084606766700745, "learning_rate": 0.0002, "epoch": 2.656552614590058, "step": 8230}, {"loss": 0.6385, "grad_norm": 0.6412538886070251, "learning_rate": 0.0002, "epoch": 2.659780503550678, "step": 8240}, {"loss": 0.7048, "grad_norm": 0.7640451192855835, "learning_rate": 0.0002, "epoch": 2.6630083925112977, "step": 8250}, {"loss": 0.6846, "grad_norm": 0.5972344875335693, "learning_rate": 0.0002, "epoch": 2.6662362814719174, "step": 8260}, {"loss": 0.682, "grad_norm": 0.6935883164405823, "learning_rate": 0.0002, "epoch": 2.669464170432537, "step": 8270}, {"loss": 0.6625, "grad_norm": 0.789399266242981, "learning_rate": 0.0002, "epoch": 2.6726920593931567, "step": 8280}, {"loss": 0.6541, "grad_norm": 0.7143490314483643, "learning_rate": 0.0002, "epoch": 2.675919948353777, "step": 8290}, {"loss": 0.6741, "grad_norm": 0.6670652627944946, "learning_rate": 0.0002, "epoch": 2.6791478373143964, "step": 8300}, {"loss": 0.6936, "grad_norm": 0.687108039855957, "learning_rate": 0.0002, "epoch": 2.682375726275016, "step": 8310}, {"loss": 0.7124, "grad_norm": 0.7914147973060608, "learning_rate": 0.0002, "epoch": 2.6856036152356357, "step": 8320}, {"loss": 0.6584, "grad_norm": 0.8398420214653015, "learning_rate": 0.0002, "epoch": 2.688831504196256, "step": 8330}, {"loss": 0.6679, "grad_norm": 0.6592720746994019, "learning_rate": 0.0002, "epoch": 2.6920593931568755, "step": 8340}, {"loss": 0.6673, "grad_norm": 0.6888470649719238, "learning_rate": 0.0002, "epoch": 2.695287282117495, "step": 8350}, {"loss": 0.6483, "grad_norm": 0.7127556800842285, "learning_rate": 0.0002, "epoch": 2.698515171078115, "step": 8360}, {"loss": 0.7013, "grad_norm": 0.6630286574363708, "learning_rate": 0.0002, "epoch": 2.7017430600387344, "step": 8370}, {"loss": 0.6842, "grad_norm": 0.8261964321136475, "learning_rate": 0.0002, "epoch": 2.7049709489993545, "step": 8380}, {"loss": 0.6613, "grad_norm": 0.717339813709259, "learning_rate": 0.0002, "epoch": 2.708198837959974, "step": 8390}, {"loss": 0.6929, "grad_norm": 0.651637613773346, "learning_rate": 0.0002, "epoch": 2.711426726920594, "step": 8400}, {"loss": 0.6796, "grad_norm": 0.7936098575592041, "learning_rate": 0.0002, "epoch": 2.714654615881214, "step": 8410}, {"loss": 0.696, "grad_norm": 0.8761560320854187, "learning_rate": 0.0002, "epoch": 2.7178825048418336, "step": 8420}, {"loss": 0.6889, "grad_norm": 0.6768006086349487, "learning_rate": 0.0002, "epoch": 2.7211103938024532, "step": 8430}, {"loss": 0.6844, "grad_norm": 0.7121055722236633, "learning_rate": 0.0002, "epoch": 2.724338282763073, "step": 8440}, {"loss": 0.6608, "grad_norm": 0.6811696887016296, "learning_rate": 0.0002, "epoch": 2.7275661717236925, "step": 8450}, {"loss": 0.7046, "grad_norm": 0.8168250918388367, "learning_rate": 0.0002, "epoch": 2.730794060684312, "step": 8460}, {"loss": 0.6809, "grad_norm": 0.660682737827301, "learning_rate": 0.0002, "epoch": 2.7340219496449323, "step": 8470}, {"loss": 0.6916, "grad_norm": 0.7369356155395508, "learning_rate": 0.0002, "epoch": 2.737249838605552, "step": 8480}, {"loss": 0.6383, "grad_norm": 0.7545099854469299, "learning_rate": 0.0002, "epoch": 2.7404777275661716, "step": 8490}, {"loss": 0.6917, "grad_norm": 0.6991257667541504, "learning_rate": 0.0002, "epoch": 2.7437056165267917, "step": 8500}, {"loss": 0.6953, "grad_norm": 0.7195324301719666, "learning_rate": 0.0002, "epoch": 2.7469335054874113, "step": 8510}, {"loss": 0.6955, "grad_norm": 0.8995378017425537, "learning_rate": 0.0002, "epoch": 2.750161394448031, "step": 8520}, {"loss": 0.684, "grad_norm": 0.6924123764038086, "learning_rate": 0.0002, "epoch": 2.7533892834086506, "step": 8530}, {"loss": 0.6675, "grad_norm": 0.6260585784912109, "learning_rate": 0.0002, "epoch": 2.7566171723692703, "step": 8540}, {"loss": 0.6613, "grad_norm": 0.7273091673851013, "learning_rate": 0.0002, "epoch": 2.7598450613298904, "step": 8550}, {"loss": 0.6853, "grad_norm": 0.720562219619751, "learning_rate": 0.0002, "epoch": 2.76307295029051, "step": 8560}, {"loss": 0.6452, "grad_norm": 0.6360004544258118, "learning_rate": 0.0002, "epoch": 2.7663008392511297, "step": 8570}, {"loss": 0.6118, "grad_norm": 0.7634525895118713, "learning_rate": 0.0002, "epoch": 2.76952872821175, "step": 8580}, {"loss": 0.686, "grad_norm": 0.6586076021194458, "learning_rate": 0.0002, "epoch": 2.7727566171723694, "step": 8590}, {"loss": 0.7072, "grad_norm": 0.6542639136314392, "learning_rate": 0.0002, "epoch": 2.775984506132989, "step": 8600}, {"loss": 0.7126, "grad_norm": 0.7650290727615356, "learning_rate": 0.0002, "epoch": 2.7792123950936087, "step": 8610}, {"loss": 0.6923, "grad_norm": 0.6551542282104492, "learning_rate": 0.0002, "epoch": 2.7824402840542284, "step": 8620}, {"loss": 0.6937, "grad_norm": 0.6915501952171326, "learning_rate": 0.0002, "epoch": 2.785668173014848, "step": 8630}, {"loss": 0.6586, "grad_norm": 0.8061493635177612, "learning_rate": 0.0002, "epoch": 2.788896061975468, "step": 8640}, {"loss": 0.6853, "grad_norm": 0.8403584957122803, "learning_rate": 0.0002, "epoch": 2.792123950936088, "step": 8650}, {"loss": 0.6616, "grad_norm": 0.6455532312393188, "learning_rate": 0.0002, "epoch": 2.7953518398967074, "step": 8660}, {"loss": 0.6819, "grad_norm": 0.8296352028846741, "learning_rate": 0.0002, "epoch": 2.7985797288573275, "step": 8670}, {"loss": 0.6678, "grad_norm": 0.7288752794265747, "learning_rate": 0.0002, "epoch": 2.801807617817947, "step": 8680}, {"loss": 0.6778, "grad_norm": 0.7628464102745056, "learning_rate": 0.0002, "epoch": 2.805035506778567, "step": 8690}, {"loss": 0.7176, "grad_norm": 0.9993878602981567, "learning_rate": 0.0002, "epoch": 2.8082633957391865, "step": 8700}, {"loss": 0.6414, "grad_norm": 0.6972465515136719, "learning_rate": 0.0002, "epoch": 2.811491284699806, "step": 8710}, {"loss": 0.6777, "grad_norm": 0.645042896270752, "learning_rate": 0.0002, "epoch": 2.8147191736604262, "step": 8720}, {"loss": 0.6587, "grad_norm": 0.6853853464126587, "learning_rate": 0.0002, "epoch": 2.817947062621046, "step": 8730}, {"loss": 0.6405, "grad_norm": 0.5935067534446716, "learning_rate": 0.0002, "epoch": 2.8211749515816655, "step": 8740}, {"loss": 0.6674, "grad_norm": 0.7336633205413818, "learning_rate": 0.0002, "epoch": 2.824402840542285, "step": 8750}, {"loss": 0.6662, "grad_norm": 0.7074962854385376, "learning_rate": 0.0002, "epoch": 2.8276307295029053, "step": 8760}, {"loss": 0.6744, "grad_norm": 0.6667559742927551, "learning_rate": 0.0002, "epoch": 2.830858618463525, "step": 8770}, {"loss": 0.7142, "grad_norm": 0.8101205229759216, "learning_rate": 0.0002, "epoch": 2.8340865074241446, "step": 8780}, {"loss": 0.6727, "grad_norm": 0.8841480016708374, "learning_rate": 0.0002, "epoch": 2.8373143963847642, "step": 8790}, {"loss": 0.6601, "grad_norm": 0.5891591310501099, "learning_rate": 0.0002, "epoch": 2.840542285345384, "step": 8800}, {"loss": 0.7114, "grad_norm": 0.667032778263092, "learning_rate": 0.0002, "epoch": 2.843770174306004, "step": 8810}, {"loss": 0.7295, "grad_norm": 0.7629773020744324, "learning_rate": 0.0002, "epoch": 2.8469980632666236, "step": 8820}, {"loss": 0.703, "grad_norm": 0.79471355676651, "learning_rate": 0.0002, "epoch": 2.8502259522272433, "step": 8830}, {"loss": 0.7278, "grad_norm": 0.7529178261756897, "learning_rate": 0.0002, "epoch": 2.8534538411878634, "step": 8840}, {"loss": 0.7163, "grad_norm": 0.7014923691749573, "learning_rate": 0.0002, "epoch": 2.856681730148483, "step": 8850}, {"loss": 0.6803, "grad_norm": 0.7996514439582825, "learning_rate": 0.0002, "epoch": 2.8599096191091027, "step": 8860}, {"loss": 0.6562, "grad_norm": 0.7044785618782043, "learning_rate": 0.0002, "epoch": 2.8631375080697223, "step": 8870}, {"loss": 0.6966, "grad_norm": 0.6792093515396118, "learning_rate": 0.0002, "epoch": 2.866365397030342, "step": 8880}, {"loss": 0.685, "grad_norm": 0.69175124168396, "learning_rate": 0.0002, "epoch": 2.8695932859909616, "step": 8890}, {"loss": 0.7225, "grad_norm": 0.7499129176139832, "learning_rate": 0.0002, "epoch": 2.8728211749515817, "step": 8900}, {"loss": 0.6922, "grad_norm": 0.7678789496421814, "learning_rate": 0.0002, "epoch": 2.8760490639122014, "step": 8910}, {"loss": 0.6803, "grad_norm": 0.7478128671646118, "learning_rate": 0.0002, "epoch": 2.879276952872821, "step": 8920}, {"loss": 0.6689, "grad_norm": 0.6767086386680603, "learning_rate": 0.0002, "epoch": 2.882504841833441, "step": 8930}, {"loss": 0.6587, "grad_norm": 0.7222196459770203, "learning_rate": 0.0002, "epoch": 2.885732730794061, "step": 8940}, {"loss": 0.6472, "grad_norm": 0.6950580477714539, "learning_rate": 0.0002, "epoch": 2.8889606197546804, "step": 8950}, {"loss": 0.7064, "grad_norm": 0.7759528160095215, "learning_rate": 0.0002, "epoch": 2.8921885087153, "step": 8960}, {"loss": 0.6349, "grad_norm": 0.6686919927597046, "learning_rate": 0.0002, "epoch": 2.8954163976759197, "step": 8970}, {"loss": 0.6801, "grad_norm": 0.9245954751968384, "learning_rate": 0.0002, "epoch": 2.89864428663654, "step": 8980}, {"loss": 0.6703, "grad_norm": 0.8734814524650574, "learning_rate": 0.0002, "epoch": 2.9018721755971595, "step": 8990}, {"loss": 0.6716, "grad_norm": 0.6056219339370728, "learning_rate": 0.0002, "epoch": 2.905100064557779, "step": 9000}, {"loss": 0.6535, "grad_norm": 0.7364102005958557, "learning_rate": 0.0002, "epoch": 2.9083279535183992, "step": 9010}, {"loss": 0.707, "grad_norm": 0.6563605070114136, "learning_rate": 0.0002, "epoch": 2.911555842479019, "step": 9020}, {"loss": 0.6564, "grad_norm": 0.659978985786438, "learning_rate": 0.0002, "epoch": 2.9147837314396385, "step": 9030}, {"loss": 0.7154, "grad_norm": 0.8176041841506958, "learning_rate": 0.0002, "epoch": 2.918011620400258, "step": 9040}, {"loss": 0.72, "grad_norm": 0.743677020072937, "learning_rate": 0.0002, "epoch": 2.921239509360878, "step": 9050}, {"loss": 0.7017, "grad_norm": 0.7418383359909058, "learning_rate": 0.0002, "epoch": 2.9244673983214975, "step": 9060}, {"loss": 0.6635, "grad_norm": 0.6916524767875671, "learning_rate": 0.0002, "epoch": 2.9276952872821176, "step": 9070}, {"loss": 0.6502, "grad_norm": 0.6559975743293762, "learning_rate": 0.0002, "epoch": 2.9309231762427372, "step": 9080}, {"loss": 0.7016, "grad_norm": 0.7431221008300781, "learning_rate": 0.0002, "epoch": 2.934151065203357, "step": 9090}, {"loss": 0.6829, "grad_norm": 0.7525941133499146, "learning_rate": 0.0002, "epoch": 2.937378954163977, "step": 9100}, {"loss": 0.7073, "grad_norm": 0.6860167384147644, "learning_rate": 0.0002, "epoch": 2.9406068431245966, "step": 9110}, {"loss": 0.6912, "grad_norm": 0.6467666029930115, "learning_rate": 0.0002, "epoch": 2.9438347320852163, "step": 9120}, {"loss": 0.7122, "grad_norm": 0.7595751285552979, "learning_rate": 0.0002, "epoch": 2.947062621045836, "step": 9130}, {"loss": 0.6951, "grad_norm": 0.6558279991149902, "learning_rate": 0.0002, "epoch": 2.9502905100064556, "step": 9140}, {"loss": 0.7081, "grad_norm": 0.6818708181381226, "learning_rate": 0.0002, "epoch": 2.9535183989670757, "step": 9150}, {"loss": 0.6921, "grad_norm": 0.8387085795402527, "learning_rate": 0.0002, "epoch": 2.9567462879276953, "step": 9160}, {"loss": 0.6914, "grad_norm": 0.7705109715461731, "learning_rate": 0.0002, "epoch": 2.959974176888315, "step": 9170}, {"loss": 0.6849, "grad_norm": 0.688106894493103, "learning_rate": 0.0002, "epoch": 2.9632020658489346, "step": 9180}, {"loss": 0.6833, "grad_norm": 0.659532368183136, "learning_rate": 0.0002, "epoch": 2.9664299548095547, "step": 9190}, {"loss": 0.6383, "grad_norm": 0.6839388608932495, "learning_rate": 0.0002, "epoch": 2.9696578437701744, "step": 9200}, {"loss": 0.6952, "grad_norm": 0.6927599310874939, "learning_rate": 0.0002, "epoch": 2.972885732730794, "step": 9210}, {"loss": 0.7338, "grad_norm": 0.6902472972869873, "learning_rate": 0.0002, "epoch": 2.9761136216914137, "step": 9220}, {"loss": 0.6671, "grad_norm": 0.620399534702301, "learning_rate": 0.0002, "epoch": 2.9793415106520333, "step": 9230}, {"loss": 0.6588, "grad_norm": 0.6812364459037781, "learning_rate": 0.0002, "epoch": 2.9825693996126534, "step": 9240}, {"loss": 0.6957, "grad_norm": 0.7681456208229065, "learning_rate": 0.0002, "epoch": 2.985797288573273, "step": 9250}, {"loss": 0.7113, "grad_norm": 0.7621907591819763, "learning_rate": 0.0002, "epoch": 2.9890251775338927, "step": 9260}, {"loss": 0.6601, "grad_norm": 0.6075740456581116, "learning_rate": 0.0002, "epoch": 2.992253066494513, "step": 9270}, {"loss": 0.6758, "grad_norm": 0.7100434899330139, "learning_rate": 0.0002, "epoch": 2.9954809554551325, "step": 9280}, {"loss": 0.73, "grad_norm": 0.7314488887786865, "learning_rate": 0.0002, "epoch": 2.998708844415752, "step": 9290}, {"eval_loss": 1.1434104442596436, "eval_runtime": 166.3732, "eval_samples_per_second": 4.406, "eval_steps_per_second": 0.553, "epoch": 3.0, "step": 9294}, {"loss": 0.6401, "grad_norm": 0.7408893704414368, "learning_rate": 0.0002, "epoch": 3.001936733376372, "step": 9300}, {"loss": 0.5182, "grad_norm": 0.9773574471473694, "learning_rate": 0.0002, "epoch": 3.0051646223369914, "step": 9310}, {"loss": 0.5432, "grad_norm": 0.7919653058052063, "learning_rate": 0.0002, "epoch": 3.0083925112976115, "step": 9320}, {"loss": 0.6156, "grad_norm": 0.9139202833175659, "learning_rate": 0.0002, "epoch": 3.011620400258231, "step": 9330}, {"loss": 0.5736, "grad_norm": 0.8296737670898438, "learning_rate": 0.0002, "epoch": 3.014848289218851, "step": 9340}, {"loss": 0.5567, "grad_norm": 0.786868155002594, "learning_rate": 0.0002, "epoch": 3.0180761781794705, "step": 9350}, {"loss": 0.578, "grad_norm": 0.5928055644035339, "learning_rate": 0.0002, "epoch": 3.0213040671400906, "step": 9360}, {"loss": 0.5376, "grad_norm": 0.8785701394081116, "learning_rate": 0.0002, "epoch": 3.0245319561007102, "step": 9370}, {"loss": 0.5664, "grad_norm": 0.7978872060775757, "learning_rate": 0.0002, "epoch": 3.02775984506133, "step": 9380}, {"loss": 0.5797, "grad_norm": 0.7160913348197937, "learning_rate": 0.0002, "epoch": 3.0309877340219495, "step": 9390}, {"loss": 0.5777, "grad_norm": 0.904465913772583, "learning_rate": 0.0002, "epoch": 3.034215622982569, "step": 9400}, {"loss": 0.5518, "grad_norm": 0.7082195281982422, "learning_rate": 0.0002, "epoch": 3.0374435119431893, "step": 9410}, {"loss": 0.5434, "grad_norm": 0.9686778783798218, "learning_rate": 0.0002, "epoch": 3.040671400903809, "step": 9420}, {"loss": 0.5692, "grad_norm": 0.8788613677024841, "learning_rate": 0.0002, "epoch": 3.0438992898644286, "step": 9430}, {"loss": 0.5599, "grad_norm": 0.8217582106590271, "learning_rate": 0.0002, "epoch": 3.0471271788250482, "step": 9440}, {"loss": 0.5405, "grad_norm": 0.7380914092063904, "learning_rate": 0.0002, "epoch": 3.0503550677856683, "step": 9450}, {"loss": 0.6258, "grad_norm": 0.7339285612106323, "learning_rate": 0.0002, "epoch": 3.053582956746288, "step": 9460}, {"loss": 0.5646, "grad_norm": 0.7175183296203613, "learning_rate": 0.0002, "epoch": 3.0568108457069076, "step": 9470}, {"loss": 0.5667, "grad_norm": 0.8275379538536072, "learning_rate": 0.0002, "epoch": 3.0600387346675273, "step": 9480}, {"loss": 0.5868, "grad_norm": 0.6544256806373596, "learning_rate": 0.0002, "epoch": 3.0632666236281474, "step": 9490}, {"loss": 0.5365, "grad_norm": 0.8193472623825073, "learning_rate": 0.0002, "epoch": 3.066494512588767, "step": 9500}, {"loss": 0.5614, "grad_norm": 0.7967836856842041, "learning_rate": 0.0002, "epoch": 3.0697224015493867, "step": 9510}, {"loss": 0.5629, "grad_norm": 0.8788684010505676, "learning_rate": 0.0002, "epoch": 3.0729502905100063, "step": 9520}, {"loss": 0.5397, "grad_norm": 0.9410629868507385, "learning_rate": 0.0002, "epoch": 3.0761781794706264, "step": 9530}, {"loss": 0.5473, "grad_norm": 0.7448706030845642, "learning_rate": 0.0002, "epoch": 3.079406068431246, "step": 9540}, {"loss": 0.5774, "grad_norm": 0.9149372577667236, "learning_rate": 0.0002, "epoch": 3.0826339573918657, "step": 9550}, {"loss": 0.5347, "grad_norm": 0.7265563607215881, "learning_rate": 0.0002, "epoch": 3.0858618463524854, "step": 9560}, {"loss": 0.5487, "grad_norm": 1.0305068492889404, "learning_rate": 0.0002, "epoch": 3.089089735313105, "step": 9570}, {"loss": 0.5884, "grad_norm": 0.7987357974052429, "learning_rate": 0.0002, "epoch": 3.092317624273725, "step": 9580}, {"loss": 0.6216, "grad_norm": 0.7733123898506165, "learning_rate": 0.0002, "epoch": 3.095545513234345, "step": 9590}, {"loss": 0.5848, "grad_norm": 1.0438069105148315, "learning_rate": 0.0002, "epoch": 3.0987734021949644, "step": 9600}, {"loss": 0.5612, "grad_norm": 0.7951784729957581, "learning_rate": 0.0002, "epoch": 3.102001291155584, "step": 9610}, {"loss": 0.6184, "grad_norm": 0.7776783108711243, "learning_rate": 0.0002, "epoch": 3.105229180116204, "step": 9620}, {"loss": 0.5626, "grad_norm": 0.7060676217079163, "learning_rate": 0.0002, "epoch": 3.108457069076824, "step": 9630}, {"loss": 0.5731, "grad_norm": 0.871569037437439, "learning_rate": 0.0002, "epoch": 3.1116849580374435, "step": 9640}, {"loss": 0.5168, "grad_norm": 0.8873385787010193, "learning_rate": 0.0002, "epoch": 3.114912846998063, "step": 9650}, {"loss": 0.5985, "grad_norm": 0.750998318195343, "learning_rate": 0.0002, "epoch": 3.118140735958683, "step": 9660}, {"loss": 0.5741, "grad_norm": 0.8678529262542725, "learning_rate": 0.0002, "epoch": 3.121368624919303, "step": 9670}, {"loss": 0.5831, "grad_norm": 0.7706599235534668, "learning_rate": 0.0002, "epoch": 3.1245965138799225, "step": 9680}, {"loss": 0.6142, "grad_norm": 0.8317574858665466, "learning_rate": 0.0002, "epoch": 3.127824402840542, "step": 9690}, {"loss": 0.5634, "grad_norm": 0.801800012588501, "learning_rate": 0.0002, "epoch": 3.131052291801162, "step": 9700}, {"loss": 0.6044, "grad_norm": 0.8574623465538025, "learning_rate": 0.0002, "epoch": 3.134280180761782, "step": 9710}, {"loss": 0.6072, "grad_norm": 0.6556540727615356, "learning_rate": 0.0002, "epoch": 3.1375080697224016, "step": 9720}, {"loss": 0.6058, "grad_norm": 0.8555161952972412, "learning_rate": 0.0002, "epoch": 3.1407359586830212, "step": 9730}, {"loss": 0.6069, "grad_norm": 0.8825467824935913, "learning_rate": 0.0002, "epoch": 3.143963847643641, "step": 9740}, {"loss": 0.5689, "grad_norm": 0.8297156691551208, "learning_rate": 0.0002, "epoch": 3.147191736604261, "step": 9750}, {"loss": 0.5738, "grad_norm": 0.7710384726524353, "learning_rate": 0.0002, "epoch": 3.1504196255648806, "step": 9760}, {"loss": 0.571, "grad_norm": 0.8778039216995239, "learning_rate": 0.0002, "epoch": 3.1536475145255003, "step": 9770}, {"loss": 0.5913, "grad_norm": 0.9014058113098145, "learning_rate": 0.0002, "epoch": 3.15687540348612, "step": 9780}, {"loss": 0.5496, "grad_norm": 0.6856890320777893, "learning_rate": 0.0002, "epoch": 3.16010329244674, "step": 9790}, {"loss": 0.558, "grad_norm": 0.6520644426345825, "learning_rate": 0.0002, "epoch": 3.1633311814073597, "step": 9800}, {"loss": 0.6024, "grad_norm": 0.7250499129295349, "learning_rate": 0.0002, "epoch": 3.1665590703679793, "step": 9810}, {"loss": 0.5823, "grad_norm": 0.8331542015075684, "learning_rate": 0.0002, "epoch": 3.169786959328599, "step": 9820}, {"loss": 0.5803, "grad_norm": 0.8531261682510376, "learning_rate": 0.0002, "epoch": 3.1730148482892186, "step": 9830}, {"loss": 0.57, "grad_norm": 0.8997558355331421, "learning_rate": 0.0002, "epoch": 3.1762427372498387, "step": 9840}, {"loss": 0.5921, "grad_norm": 0.708335280418396, "learning_rate": 0.0002, "epoch": 3.1794706262104584, "step": 9850}, {"loss": 0.5997, "grad_norm": 1.0074886083602905, "learning_rate": 0.0002, "epoch": 3.182698515171078, "step": 9860}, {"loss": 0.573, "grad_norm": 1.0804681777954102, "learning_rate": 0.0002, "epoch": 3.1859264041316977, "step": 9870}, {"loss": 0.5527, "grad_norm": 0.9510730504989624, "learning_rate": 0.0002, "epoch": 3.189154293092318, "step": 9880}, {"loss": 0.6401, "grad_norm": 0.7211061716079712, "learning_rate": 0.0002, "epoch": 3.1923821820529374, "step": 9890}, {"loss": 0.5563, "grad_norm": 0.8767086267471313, "learning_rate": 0.0002, "epoch": 3.195610071013557, "step": 9900}, {"loss": 0.5747, "grad_norm": 0.8388153314590454, "learning_rate": 0.0002, "epoch": 3.1988379599741767, "step": 9910}, {"loss": 0.5681, "grad_norm": 0.8038473725318909, "learning_rate": 0.0002, "epoch": 3.202065848934797, "step": 9920}, {"loss": 0.5594, "grad_norm": 0.8187747001647949, "learning_rate": 0.0002, "epoch": 3.2052937378954165, "step": 9930}, {"loss": 0.5813, "grad_norm": 0.7427355051040649, "learning_rate": 0.0002, "epoch": 3.208521626856036, "step": 9940}, {"loss": 0.5709, "grad_norm": 0.8017025589942932, "learning_rate": 0.0002, "epoch": 3.211749515816656, "step": 9950}, {"loss": 0.6106, "grad_norm": 0.738595187664032, "learning_rate": 0.0002, "epoch": 3.214977404777276, "step": 9960}, {"loss": 0.6006, "grad_norm": 0.7521342039108276, "learning_rate": 0.0002, "epoch": 3.2182052937378955, "step": 9970}, {"loss": 0.5706, "grad_norm": 0.840329110622406, "learning_rate": 0.0002, "epoch": 3.221433182698515, "step": 9980}, {"loss": 0.5666, "grad_norm": 0.9809671640396118, "learning_rate": 0.0002, "epoch": 3.224661071659135, "step": 9990}, {"loss": 0.6223, "grad_norm": 0.8456943035125732, "learning_rate": 0.0002, "epoch": 3.2278889606197545, "step": 10000}, {"loss": 0.5798, "grad_norm": 0.8962995409965515, "learning_rate": 0.0002, "epoch": 3.2311168495803746, "step": 10010}, {"loss": 0.5399, "grad_norm": 0.6492817401885986, "learning_rate": 0.0002, "epoch": 3.2343447385409942, "step": 10020}, {"loss": 0.5678, "grad_norm": 1.0471255779266357, "learning_rate": 0.0002, "epoch": 3.237572627501614, "step": 10030}, {"loss": 0.5452, "grad_norm": 0.7995471358299255, "learning_rate": 0.0002, "epoch": 3.2408005164622335, "step": 10040}, {"loss": 0.615, "grad_norm": 0.7231964468955994, "learning_rate": 0.0002, "epoch": 3.2440284054228536, "step": 10050}, {"loss": 0.5586, "grad_norm": 0.639630138874054, "learning_rate": 0.0002, "epoch": 3.2472562943834733, "step": 10060}, {"loss": 0.6271, "grad_norm": 0.7957055568695068, "learning_rate": 0.0002, "epoch": 3.250484183344093, "step": 10070}, {"loss": 0.5845, "grad_norm": 0.7735482454299927, "learning_rate": 0.0002, "epoch": 3.2537120723047126, "step": 10080}, {"loss": 0.5791, "grad_norm": 0.8139488101005554, "learning_rate": 0.0002, "epoch": 3.2569399612653323, "step": 10090}, {"loss": 0.6049, "grad_norm": 0.8113240003585815, "learning_rate": 0.0002, "epoch": 3.2601678502259523, "step": 10100}, {"loss": 0.5617, "grad_norm": 0.7735909819602966, "learning_rate": 0.0002, "epoch": 3.263395739186572, "step": 10110}, {"loss": 0.5964, "grad_norm": 0.7760744094848633, "learning_rate": 0.0002, "epoch": 3.2666236281471916, "step": 10120}, {"loss": 0.5786, "grad_norm": 0.8078505396842957, "learning_rate": 0.0002, "epoch": 3.2698515171078113, "step": 10130}, {"loss": 0.5904, "grad_norm": 0.983648955821991, "learning_rate": 0.0002, "epoch": 3.2730794060684314, "step": 10140}, {"loss": 0.596, "grad_norm": 0.7131832242012024, "learning_rate": 0.0002, "epoch": 3.276307295029051, "step": 10150}, {"loss": 0.5986, "grad_norm": 0.924493134021759, "learning_rate": 0.0002, "epoch": 3.2795351839896707, "step": 10160}, {"loss": 0.5733, "grad_norm": 0.9371112585067749, "learning_rate": 0.0002, "epoch": 3.2827630729502904, "step": 10170}, {"loss": 0.5891, "grad_norm": 0.8989261388778687, "learning_rate": 0.0002, "epoch": 3.2859909619109104, "step": 10180}, {"loss": 0.6143, "grad_norm": 0.8130394816398621, "learning_rate": 0.0002, "epoch": 3.28921885087153, "step": 10190}, {"loss": 0.5555, "grad_norm": 0.9899941086769104, "learning_rate": 0.0002, "epoch": 3.2924467398321497, "step": 10200}, {"loss": 0.5899, "grad_norm": 1.007038950920105, "learning_rate": 0.0002, "epoch": 3.2956746287927694, "step": 10210}, {"loss": 0.5713, "grad_norm": 0.7465066313743591, "learning_rate": 0.0002, "epoch": 3.2989025177533895, "step": 10220}, {"loss": 0.6307, "grad_norm": 0.7202590703964233, "learning_rate": 0.0002, "epoch": 3.302130406714009, "step": 10230}, {"loss": 0.5659, "grad_norm": 0.6258249282836914, "learning_rate": 0.0002, "epoch": 3.305358295674629, "step": 10240}, {"loss": 0.5869, "grad_norm": 0.8996058702468872, "learning_rate": 0.0002, "epoch": 3.3085861846352485, "step": 10250}, {"loss": 0.5825, "grad_norm": 0.9550982713699341, "learning_rate": 0.0002, "epoch": 3.311814073595868, "step": 10260}, {"loss": 0.5602, "grad_norm": 0.7010059952735901, "learning_rate": 0.0002, "epoch": 3.315041962556488, "step": 10270}, {"loss": 0.5853, "grad_norm": 0.9639869332313538, "learning_rate": 0.0002, "epoch": 3.318269851517108, "step": 10280}, {"loss": 0.5362, "grad_norm": 1.0192502737045288, "learning_rate": 0.0002, "epoch": 3.3214977404777275, "step": 10290}, {"loss": 0.5605, "grad_norm": 0.7953670024871826, "learning_rate": 0.0002, "epoch": 3.324725629438347, "step": 10300}, {"loss": 0.6386, "grad_norm": 0.7436774969100952, "learning_rate": 0.0002, "epoch": 3.3279535183989672, "step": 10310}, {"loss": 0.5823, "grad_norm": 0.7846777439117432, "learning_rate": 0.0002, "epoch": 3.331181407359587, "step": 10320}, {"loss": 0.6119, "grad_norm": 0.8963494896888733, "learning_rate": 0.0002, "epoch": 3.3344092963202066, "step": 10330}, {"loss": 0.5872, "grad_norm": 0.6876392364501953, "learning_rate": 0.0002, "epoch": 3.337637185280826, "step": 10340}, {"loss": 0.6291, "grad_norm": 0.9161638021469116, "learning_rate": 0.0002, "epoch": 3.340865074241446, "step": 10350}, {"loss": 0.5955, "grad_norm": 0.8964458107948303, "learning_rate": 0.0002, "epoch": 3.344092963202066, "step": 10360}, {"loss": 0.5965, "grad_norm": 0.9052296280860901, "learning_rate": 0.0002, "epoch": 3.3473208521626856, "step": 10370}, {"loss": 0.5958, "grad_norm": 0.9292596578598022, "learning_rate": 0.0002, "epoch": 3.3505487411233053, "step": 10380}, {"loss": 0.5487, "grad_norm": 0.9605957269668579, "learning_rate": 0.0002, "epoch": 3.3537766300839253, "step": 10390}, {"loss": 0.6214, "grad_norm": 1.0198872089385986, "learning_rate": 0.0002, "epoch": 3.357004519044545, "step": 10400}, {"loss": 0.6053, "grad_norm": 0.7043630480766296, "learning_rate": 0.0002, "epoch": 3.3602324080051647, "step": 10410}, {"loss": 0.5451, "grad_norm": 1.0533326864242554, "learning_rate": 0.0002, "epoch": 3.3634602969657843, "step": 10420}, {"loss": 0.6134, "grad_norm": 0.7552485466003418, "learning_rate": 0.0002, "epoch": 3.366688185926404, "step": 10430}, {"loss": 0.631, "grad_norm": 0.692708432674408, "learning_rate": 0.0002, "epoch": 3.369916074887024, "step": 10440}, {"loss": 0.631, "grad_norm": 0.985952615737915, "learning_rate": 0.0002, "epoch": 3.3731439638476437, "step": 10450}, {"loss": 0.5689, "grad_norm": 0.6749676465988159, "learning_rate": 0.0002, "epoch": 3.3763718528082634, "step": 10460}, {"loss": 0.5724, "grad_norm": 0.9514535665512085, "learning_rate": 0.0002, "epoch": 3.379599741768883, "step": 10470}, {"loss": 0.5982, "grad_norm": 1.2681142091751099, "learning_rate": 0.0002, "epoch": 3.382827630729503, "step": 10480}, {"loss": 0.5778, "grad_norm": 1.031968355178833, "learning_rate": 0.0002, "epoch": 3.3860555196901228, "step": 10490}, {"loss": 0.5964, "grad_norm": 0.8061563968658447, "learning_rate": 0.0002, "epoch": 3.3892834086507424, "step": 10500}, {"loss": 0.6094, "grad_norm": 1.0515062808990479, "learning_rate": 0.0002, "epoch": 3.392511297611362, "step": 10510}, {"loss": 0.542, "grad_norm": 0.9055540561676025, "learning_rate": 0.0002, "epoch": 3.3957391865719817, "step": 10520}, {"loss": 0.6148, "grad_norm": 0.9318141341209412, "learning_rate": 0.0002, "epoch": 3.398967075532602, "step": 10530}, {"loss": 0.5722, "grad_norm": 0.8266817331314087, "learning_rate": 0.0002, "epoch": 3.4021949644932215, "step": 10540}, {"loss": 0.6015, "grad_norm": 1.2322112321853638, "learning_rate": 0.0002, "epoch": 3.405422853453841, "step": 10550}, {"loss": 0.6215, "grad_norm": 0.9535136818885803, "learning_rate": 0.0002, "epoch": 3.4086507424144608, "step": 10560}, {"loss": 0.561, "grad_norm": 0.9243819117546082, "learning_rate": 0.0002, "epoch": 3.411878631375081, "step": 10570}, {"loss": 0.5844, "grad_norm": 0.9011809825897217, "learning_rate": 0.0002, "epoch": 3.4151065203357005, "step": 10580}, {"loss": 0.6175, "grad_norm": 0.9923036694526672, "learning_rate": 0.0002, "epoch": 3.41833440929632, "step": 10590}, {"loss": 0.6033, "grad_norm": 0.8903067111968994, "learning_rate": 0.0002, "epoch": 3.42156229825694, "step": 10600}, {"loss": 0.5563, "grad_norm": 0.7101534605026245, "learning_rate": 0.0002, "epoch": 3.42479018721756, "step": 10610}, {"loss": 0.598, "grad_norm": 0.8186570405960083, "learning_rate": 0.0002, "epoch": 3.4280180761781796, "step": 10620}, {"loss": 0.5897, "grad_norm": 0.9480205774307251, "learning_rate": 0.0002, "epoch": 3.431245965138799, "step": 10630}, {"loss": 0.5798, "grad_norm": 1.1370961666107178, "learning_rate": 0.0002, "epoch": 3.434473854099419, "step": 10640}, {"loss": 0.5779, "grad_norm": 1.017669677734375, "learning_rate": 0.0002, "epoch": 3.437701743060039, "step": 10650}, {"loss": 0.5999, "grad_norm": 0.7625100016593933, "learning_rate": 0.0002, "epoch": 3.4409296320206586, "step": 10660}, {"loss": 0.5705, "grad_norm": 0.9288196563720703, "learning_rate": 0.0002, "epoch": 3.4441575209812783, "step": 10670}, {"loss": 0.6255, "grad_norm": 0.8800460696220398, "learning_rate": 0.0002, "epoch": 3.447385409941898, "step": 10680}, {"loss": 0.6245, "grad_norm": 0.7499661445617676, "learning_rate": 0.0002, "epoch": 3.4506132989025176, "step": 10690}, {"loss": 0.5979, "grad_norm": 0.8254973292350769, "learning_rate": 0.0002, "epoch": 3.4538411878631377, "step": 10700}, {"loss": 0.5742, "grad_norm": 0.8735857605934143, "learning_rate": 0.0002, "epoch": 3.4570690768237573, "step": 10710}, {"loss": 0.6356, "grad_norm": 0.9601819515228271, "learning_rate": 0.0002, "epoch": 3.460296965784377, "step": 10720}, {"loss": 0.5574, "grad_norm": 0.8031058311462402, "learning_rate": 0.0002, "epoch": 3.4635248547449966, "step": 10730}, {"loss": 0.6078, "grad_norm": 0.8039247393608093, "learning_rate": 0.0002, "epoch": 3.4667527437056167, "step": 10740}, {"loss": 0.593, "grad_norm": 0.8936953544616699, "learning_rate": 0.0002, "epoch": 3.4699806326662364, "step": 10750}, {"loss": 0.5971, "grad_norm": 0.8201186060905457, "learning_rate": 0.0002, "epoch": 3.473208521626856, "step": 10760}, {"loss": 0.5875, "grad_norm": 1.0064148902893066, "learning_rate": 0.0002, "epoch": 3.4764364105874757, "step": 10770}, {"loss": 0.5639, "grad_norm": 0.8617483377456665, "learning_rate": 0.0002, "epoch": 3.4796642995480953, "step": 10780}, {"loss": 0.6022, "grad_norm": 0.8532096147537231, "learning_rate": 0.0002, "epoch": 3.4828921885087154, "step": 10790}, {"loss": 0.5765, "grad_norm": 0.8646879196166992, "learning_rate": 0.0002, "epoch": 3.486120077469335, "step": 10800}, {"loss": 0.5799, "grad_norm": 0.7962660789489746, "learning_rate": 0.0002, "epoch": 3.4893479664299547, "step": 10810}, {"loss": 0.5398, "grad_norm": 0.9560028314590454, "learning_rate": 0.0002, "epoch": 3.492575855390575, "step": 10820}, {"loss": 0.6082, "grad_norm": 0.928439736366272, "learning_rate": 0.0002, "epoch": 3.4958037443511945, "step": 10830}, {"loss": 0.6112, "grad_norm": 0.8219282627105713, "learning_rate": 0.0002, "epoch": 3.499031633311814, "step": 10840}, {"loss": 0.6369, "grad_norm": 0.7918338179588318, "learning_rate": 0.0002, "epoch": 3.5022595222724338, "step": 10850}, {"loss": 0.6164, "grad_norm": 0.961295485496521, "learning_rate": 0.0002, "epoch": 3.5054874112330534, "step": 10860}, {"loss": 0.5534, "grad_norm": 1.0731624364852905, "learning_rate": 0.0002, "epoch": 3.5087153001936735, "step": 10870}, {"loss": 0.5829, "grad_norm": 0.9551863074302673, "learning_rate": 0.0002, "epoch": 3.511943189154293, "step": 10880}, {"loss": 0.5746, "grad_norm": 0.8409819602966309, "learning_rate": 0.0002, "epoch": 3.515171078114913, "step": 10890}, {"loss": 0.5813, "grad_norm": 0.7546320557594299, "learning_rate": 0.0002, "epoch": 3.5183989670755325, "step": 10900}, {"loss": 0.6184, "grad_norm": 0.7505252361297607, "learning_rate": 0.0002, "epoch": 3.5216268560361526, "step": 10910}, {"loss": 0.5649, "grad_norm": 0.7505561113357544, "learning_rate": 0.0002, "epoch": 3.524854744996772, "step": 10920}, {"loss": 0.6277, "grad_norm": 1.086177945137024, "learning_rate": 0.0002, "epoch": 3.528082633957392, "step": 10930}, {"loss": 0.5983, "grad_norm": 0.7721118330955505, "learning_rate": 0.0002, "epoch": 3.5313105229180115, "step": 10940}, {"loss": 0.5919, "grad_norm": 0.9567878246307373, "learning_rate": 0.0002, "epoch": 3.534538411878631, "step": 10950}, {"loss": 0.6261, "grad_norm": 0.8377360105514526, "learning_rate": 0.0002, "epoch": 3.5377663008392513, "step": 10960}, {"loss": 0.633, "grad_norm": 1.0174858570098877, "learning_rate": 0.0002, "epoch": 3.540994189799871, "step": 10970}, {"loss": 0.599, "grad_norm": 0.8164418935775757, "learning_rate": 0.0002, "epoch": 3.5442220787604906, "step": 10980}, {"loss": 0.5471, "grad_norm": 0.8959241509437561, "learning_rate": 0.0002, "epoch": 3.5474499677211107, "step": 10990}, {"loss": 0.6195, "grad_norm": 1.0154379606246948, "learning_rate": 0.0002, "epoch": 3.5506778566817303, "step": 11000}, {"loss": 0.5835, "grad_norm": 0.7812292575836182, "learning_rate": 0.0002, "epoch": 3.55390574564235, "step": 11010}, {"loss": 0.6052, "grad_norm": 0.9849029779434204, "learning_rate": 0.0002, "epoch": 3.5571336346029696, "step": 11020}, {"loss": 0.5689, "grad_norm": 0.8826184272766113, "learning_rate": 0.0002, "epoch": 3.5603615235635893, "step": 11030}, {"loss": 0.601, "grad_norm": 0.9039685726165771, "learning_rate": 0.0002, "epoch": 3.563589412524209, "step": 11040}, {"loss": 0.5996, "grad_norm": 0.9585249423980713, "learning_rate": 0.0002, "epoch": 3.566817301484829, "step": 11050}, {"loss": 0.5714, "grad_norm": 0.8083069324493408, "learning_rate": 0.0002, "epoch": 3.5700451904454487, "step": 11060}, {"loss": 0.6317, "grad_norm": 0.9528678059577942, "learning_rate": 0.0002, "epoch": 3.5732730794060683, "step": 11070}, {"loss": 0.6278, "grad_norm": 0.8297588229179382, "learning_rate": 0.0002, "epoch": 3.5765009683666884, "step": 11080}, {"loss": 0.5919, "grad_norm": 0.8191716074943542, "learning_rate": 0.0002, "epoch": 3.579728857327308, "step": 11090}, {"loss": 0.5971, "grad_norm": 0.8056275844573975, "learning_rate": 0.0002, "epoch": 3.5829567462879277, "step": 11100}, {"loss": 0.6325, "grad_norm": 0.701930582523346, "learning_rate": 0.0002, "epoch": 3.5861846352485474, "step": 11110}, {"loss": 0.6088, "grad_norm": 0.7644643187522888, "learning_rate": 0.0002, "epoch": 3.589412524209167, "step": 11120}, {"loss": 0.605, "grad_norm": 0.668004035949707, "learning_rate": 0.0002, "epoch": 3.592640413169787, "step": 11130}, {"loss": 0.5735, "grad_norm": 0.8849539756774902, "learning_rate": 0.0002, "epoch": 3.5958683021304068, "step": 11140}, {"loss": 0.6412, "grad_norm": 0.8123571276664734, "learning_rate": 0.0002, "epoch": 3.5990961910910264, "step": 11150}, {"loss": 0.5626, "grad_norm": 0.7591469287872314, "learning_rate": 0.0002, "epoch": 3.602324080051646, "step": 11160}, {"loss": 0.5668, "grad_norm": 0.776466965675354, "learning_rate": 0.0002, "epoch": 3.605551969012266, "step": 11170}, {"loss": 0.6631, "grad_norm": 0.9156150221824646, "learning_rate": 0.0002, "epoch": 3.608779857972886, "step": 11180}, {"loss": 0.5867, "grad_norm": 0.7517618536949158, "learning_rate": 0.0002, "epoch": 3.6120077469335055, "step": 11190}, {"loss": 0.5939, "grad_norm": 0.931239128112793, "learning_rate": 0.0002, "epoch": 3.615235635894125, "step": 11200}, {"loss": 0.5736, "grad_norm": 0.9107872843742371, "learning_rate": 0.0002, "epoch": 3.6184635248547448, "step": 11210}, {"loss": 0.5665, "grad_norm": 0.7624770998954773, "learning_rate": 0.0002, "epoch": 3.621691413815365, "step": 11220}, {"loss": 0.6033, "grad_norm": 0.8129580616950989, "learning_rate": 0.0002, "epoch": 3.6249193027759845, "step": 11230}, {"loss": 0.6192, "grad_norm": 0.7339836955070496, "learning_rate": 0.0002, "epoch": 3.628147191736604, "step": 11240}, {"loss": 0.5976, "grad_norm": 0.8901296854019165, "learning_rate": 0.0002, "epoch": 3.6313750806972243, "step": 11250}, {"loss": 0.5977, "grad_norm": 1.1374726295471191, "learning_rate": 0.0002, "epoch": 3.634602969657844, "step": 11260}, {"loss": 0.5859, "grad_norm": 0.7438275218009949, "learning_rate": 0.0002, "epoch": 3.6378308586184636, "step": 11270}, {"loss": 0.5757, "grad_norm": 0.808646559715271, "learning_rate": 0.0002, "epoch": 3.641058747579083, "step": 11280}, {"loss": 0.6244, "grad_norm": 1.091810941696167, "learning_rate": 0.0002, "epoch": 3.644286636539703, "step": 11290}, {"loss": 0.5957, "grad_norm": 0.8439257144927979, "learning_rate": 0.0002, "epoch": 3.6475145255003225, "step": 11300}, {"loss": 0.6115, "grad_norm": 0.9720633029937744, "learning_rate": 0.0002, "epoch": 3.6507424144609426, "step": 11310}, {"loss": 0.5942, "grad_norm": 0.738571047782898, "learning_rate": 0.0002, "epoch": 3.6539703034215623, "step": 11320}, {"loss": 0.6029, "grad_norm": 0.6961580514907837, "learning_rate": 0.0002, "epoch": 3.657198192382182, "step": 11330}, {"loss": 0.6226, "grad_norm": 0.8192131519317627, "learning_rate": 0.0002, "epoch": 3.660426081342802, "step": 11340}, {"loss": 0.6155, "grad_norm": 0.8367205858230591, "learning_rate": 0.0002, "epoch": 3.6636539703034217, "step": 11350}, {"loss": 0.586, "grad_norm": 0.7735666632652283, "learning_rate": 0.0002, "epoch": 3.6668818592640413, "step": 11360}, {"loss": 0.6113, "grad_norm": 0.6507132649421692, "learning_rate": 0.0002, "epoch": 3.670109748224661, "step": 11370}, {"loss": 0.6273, "grad_norm": 0.8271192312240601, "learning_rate": 0.0002, "epoch": 3.6733376371852806, "step": 11380}, {"loss": 0.5995, "grad_norm": 0.8724204301834106, "learning_rate": 0.0002, "epoch": 3.6765655261459007, "step": 11390}, {"loss": 0.6131, "grad_norm": 0.8448445200920105, "learning_rate": 0.0002, "epoch": 3.6797934151065204, "step": 11400}, {"loss": 0.5923, "grad_norm": 0.6756882071495056, "learning_rate": 0.0002, "epoch": 3.68302130406714, "step": 11410}, {"loss": 0.6443, "grad_norm": 0.7859625816345215, "learning_rate": 0.0002, "epoch": 3.68624919302776, "step": 11420}, {"loss": 0.6567, "grad_norm": 0.8929487466812134, "learning_rate": 0.0002, "epoch": 3.6894770819883798, "step": 11430}, {"loss": 0.6474, "grad_norm": 0.8163391351699829, "learning_rate": 0.0002, "epoch": 3.6927049709489994, "step": 11440}, {"loss": 0.6467, "grad_norm": 0.8948464393615723, "learning_rate": 0.0002, "epoch": 3.695932859909619, "step": 11450}, {"loss": 0.624, "grad_norm": 0.8654782176017761, "learning_rate": 0.0002, "epoch": 3.6991607488702387, "step": 11460}, {"loss": 0.6142, "grad_norm": 0.9514864683151245, "learning_rate": 0.0002, "epoch": 3.7023886378308584, "step": 11470}, {"loss": 0.606, "grad_norm": 0.7298579812049866, "learning_rate": 0.0002, "epoch": 3.7056165267914785, "step": 11480}, {"loss": 0.5853, "grad_norm": 0.9266309142112732, "learning_rate": 0.0002, "epoch": 3.708844415752098, "step": 11490}, {"loss": 0.6122, "grad_norm": 0.8608686923980713, "learning_rate": 0.0002, "epoch": 3.7120723047127178, "step": 11500}, {"loss": 0.6348, "grad_norm": 0.921788215637207, "learning_rate": 0.0002, "epoch": 3.715300193673338, "step": 11510}, {"loss": 0.6191, "grad_norm": 0.8537021279335022, "learning_rate": 0.0002, "epoch": 3.7185280826339575, "step": 11520}, {"loss": 0.6228, "grad_norm": 1.115194320678711, "learning_rate": 0.0002, "epoch": 3.721755971594577, "step": 11530}, {"loss": 0.5828, "grad_norm": 0.7614817023277283, "learning_rate": 0.0002, "epoch": 3.724983860555197, "step": 11540}, {"loss": 0.5776, "grad_norm": 0.871999204158783, "learning_rate": 0.0002, "epoch": 3.7282117495158165, "step": 11550}, {"loss": 0.5962, "grad_norm": 0.9668049812316895, "learning_rate": 0.0002, "epoch": 3.7314396384764366, "step": 11560}, {"loss": 0.5534, "grad_norm": 1.2185815572738647, "learning_rate": 0.0002, "epoch": 3.734667527437056, "step": 11570}, {"loss": 0.5936, "grad_norm": 0.8258453011512756, "learning_rate": 0.0002, "epoch": 3.737895416397676, "step": 11580}, {"loss": 0.5853, "grad_norm": 0.8708966374397278, "learning_rate": 0.0002, "epoch": 3.7411233053582955, "step": 11590}, {"loss": 0.5847, "grad_norm": 0.7784267663955688, "learning_rate": 0.0002, "epoch": 3.7443511943189156, "step": 11600}, {"loss": 0.6404, "grad_norm": 0.7504425048828125, "learning_rate": 0.0002, "epoch": 3.7475790832795353, "step": 11610}, {"loss": 0.5922, "grad_norm": 0.9144526124000549, "learning_rate": 0.0002, "epoch": 3.750806972240155, "step": 11620}, {"loss": 0.6425, "grad_norm": 0.922581672668457, "learning_rate": 0.0002, "epoch": 3.7540348612007746, "step": 11630}, {"loss": 0.6402, "grad_norm": 0.9348630905151367, "learning_rate": 0.0002, "epoch": 3.757262750161394, "step": 11640}, {"loss": 0.5852, "grad_norm": 1.0740231275558472, "learning_rate": 0.0002, "epoch": 3.7604906391220143, "step": 11650}, {"loss": 0.599, "grad_norm": 0.884830117225647, "learning_rate": 0.0002, "epoch": 3.763718528082634, "step": 11660}, {"loss": 0.5991, "grad_norm": 1.0256348848342896, "learning_rate": 0.0002, "epoch": 3.7669464170432536, "step": 11670}, {"loss": 0.626, "grad_norm": 0.6795592904090881, "learning_rate": 0.0002, "epoch": 3.7701743060038737, "step": 11680}, {"loss": 0.6241, "grad_norm": 0.9381206631660461, "learning_rate": 0.0002, "epoch": 3.7734021949644934, "step": 11690}, {"loss": 0.6054, "grad_norm": 0.7633092403411865, "learning_rate": 0.0002, "epoch": 3.776630083925113, "step": 11700}, {"loss": 0.5937, "grad_norm": 0.7506213188171387, "learning_rate": 0.0002, "epoch": 3.7798579728857327, "step": 11710}, {"loss": 0.5933, "grad_norm": 0.8182913064956665, "learning_rate": 0.0002, "epoch": 3.7830858618463523, "step": 11720}, {"loss": 0.6043, "grad_norm": 1.019322156906128, "learning_rate": 0.0002, "epoch": 3.786313750806972, "step": 11730}, {"loss": 0.633, "grad_norm": 0.8895221948623657, "learning_rate": 0.0002, "epoch": 3.789541639767592, "step": 11740}, {"loss": 0.6553, "grad_norm": 0.948847770690918, "learning_rate": 0.0002, "epoch": 3.7927695287282117, "step": 11750}, {"loss": 0.6265, "grad_norm": 0.9068999886512756, "learning_rate": 0.0002, "epoch": 3.7959974176888314, "step": 11760}, {"loss": 0.6163, "grad_norm": 0.7920539975166321, "learning_rate": 0.0002, "epoch": 3.7992253066494515, "step": 11770}, {"loss": 0.5964, "grad_norm": 0.8441922068595886, "learning_rate": 0.0002, "epoch": 3.802453195610071, "step": 11780}, {"loss": 0.6379, "grad_norm": 0.9258501529693604, "learning_rate": 0.0002, "epoch": 3.8056810845706908, "step": 11790}, {"loss": 0.6379, "grad_norm": 0.7354241609573364, "learning_rate": 0.0002, "epoch": 3.8089089735313104, "step": 11800}, {"loss": 0.6177, "grad_norm": 0.9494872689247131, "learning_rate": 0.0002, "epoch": 3.81213686249193, "step": 11810}, {"loss": 0.5931, "grad_norm": 0.8266556859016418, "learning_rate": 0.0002, "epoch": 3.81536475145255, "step": 11820}, {"loss": 0.641, "grad_norm": 0.7951219081878662, "learning_rate": 0.0002, "epoch": 3.81859264041317, "step": 11830}, {"loss": 0.5767, "grad_norm": 0.7688382267951965, "learning_rate": 0.0002, "epoch": 3.8218205293737895, "step": 11840}, {"loss": 0.6117, "grad_norm": 1.0917940139770508, "learning_rate": 0.0002, "epoch": 3.8250484183344096, "step": 11850}, {"loss": 0.5857, "grad_norm": 0.9880442023277283, "learning_rate": 0.0002, "epoch": 3.828276307295029, "step": 11860}, {"loss": 0.6579, "grad_norm": 0.8433151245117188, "learning_rate": 0.0002, "epoch": 3.831504196255649, "step": 11870}, {"loss": 0.5876, "grad_norm": 0.8691204786300659, "learning_rate": 0.0002, "epoch": 3.8347320852162685, "step": 11880}, {"loss": 0.6308, "grad_norm": 0.7698143124580383, "learning_rate": 0.0002, "epoch": 3.837959974176888, "step": 11890}, {"loss": 0.6531, "grad_norm": 0.8874883651733398, "learning_rate": 0.0002, "epoch": 3.841187863137508, "step": 11900}, {"loss": 0.6242, "grad_norm": 1.1209359169006348, "learning_rate": 0.0002, "epoch": 3.844415752098128, "step": 11910}, {"loss": 0.6415, "grad_norm": 0.7723544239997864, "learning_rate": 0.0002, "epoch": 3.8476436410587476, "step": 11920}, {"loss": 0.6091, "grad_norm": 0.8363937139511108, "learning_rate": 0.0002, "epoch": 3.850871530019367, "step": 11930}, {"loss": 0.6498, "grad_norm": 0.9209707975387573, "learning_rate": 0.0002, "epoch": 3.8540994189799873, "step": 11940}, {"loss": 0.6471, "grad_norm": 0.9456894993782043, "learning_rate": 0.0002, "epoch": 3.857327307940607, "step": 11950}, {"loss": 0.6432, "grad_norm": 1.5748413801193237, "learning_rate": 0.0002, "epoch": 3.8605551969012266, "step": 11960}, {"loss": 0.6197, "grad_norm": 0.9083569049835205, "learning_rate": 0.0002, "epoch": 3.8637830858618463, "step": 11970}, {"loss": 0.6593, "grad_norm": 0.7672823071479797, "learning_rate": 0.0002, "epoch": 3.867010974822466, "step": 11980}, {"loss": 0.6238, "grad_norm": 0.8647152185440063, "learning_rate": 0.0002, "epoch": 3.870238863783086, "step": 11990}, {"loss": 0.5755, "grad_norm": 0.9564255475997925, "learning_rate": 0.0002, "epoch": 3.8734667527437057, "step": 12000}, {"loss": 0.6321, "grad_norm": 0.773267924785614, "learning_rate": 0.0002, "epoch": 3.8766946417043253, "step": 12010}, {"loss": 0.6057, "grad_norm": 0.8030173182487488, "learning_rate": 0.0002, "epoch": 3.879922530664945, "step": 12020}, {"loss": 0.6194, "grad_norm": 0.8002150058746338, "learning_rate": 0.0002, "epoch": 3.883150419625565, "step": 12030}, {"loss": 0.6194, "grad_norm": 0.98802250623703, "learning_rate": 0.0002, "epoch": 3.8863783085861847, "step": 12040}, {"loss": 0.6026, "grad_norm": 0.7868124842643738, "learning_rate": 0.0002, "epoch": 3.8896061975468044, "step": 12050}, {"loss": 0.6303, "grad_norm": 0.932182788848877, "learning_rate": 0.0002, "epoch": 3.892834086507424, "step": 12060}, {"loss": 0.5863, "grad_norm": 0.8576806783676147, "learning_rate": 0.0002, "epoch": 3.8960619754680437, "step": 12070}, {"loss": 0.6079, "grad_norm": 0.8985713124275208, "learning_rate": 0.0002, "epoch": 3.8992898644286638, "step": 12080}, {"loss": 0.6449, "grad_norm": 0.7876521944999695, "learning_rate": 0.0002, "epoch": 3.9025177533892834, "step": 12090}, {"loss": 0.5655, "grad_norm": 0.773936927318573, "learning_rate": 0.0002, "epoch": 3.905745642349903, "step": 12100}, {"loss": 0.5765, "grad_norm": 0.7274761199951172, "learning_rate": 0.0002, "epoch": 3.908973531310523, "step": 12110}, {"loss": 0.6182, "grad_norm": 0.8625598549842834, "learning_rate": 0.0002, "epoch": 3.912201420271143, "step": 12120}, {"loss": 0.5855, "grad_norm": 0.8702362179756165, "learning_rate": 0.0002, "epoch": 3.9154293092317625, "step": 12130}, {"loss": 0.6493, "grad_norm": 0.912579357624054, "learning_rate": 0.0002, "epoch": 3.918657198192382, "step": 12140}, {"loss": 0.6341, "grad_norm": 0.8697066903114319, "learning_rate": 0.0002, "epoch": 3.9218850871530018, "step": 12150}, {"loss": 0.6037, "grad_norm": 1.005232572555542, "learning_rate": 0.0002, "epoch": 3.9251129761136214, "step": 12160}, {"loss": 0.621, "grad_norm": 0.793902575969696, "learning_rate": 0.0002, "epoch": 3.9283408650742415, "step": 12170}, {"loss": 0.599, "grad_norm": 0.7025905847549438, "learning_rate": 0.0002, "epoch": 3.931568754034861, "step": 12180}, {"loss": 0.6421, "grad_norm": 0.97635817527771, "learning_rate": 0.0002, "epoch": 3.934796642995481, "step": 12190}, {"loss": 0.6416, "grad_norm": 0.855417013168335, "learning_rate": 0.0002, "epoch": 3.938024531956101, "step": 12200}, {"loss": 0.5979, "grad_norm": 0.8841291666030884, "learning_rate": 0.0002, "epoch": 3.9412524209167206, "step": 12210}, {"loss": 0.5666, "grad_norm": 1.1762064695358276, "learning_rate": 0.0002, "epoch": 3.94448030987734, "step": 12220}, {"loss": 0.586, "grad_norm": 0.8393193483352661, "learning_rate": 0.0002, "epoch": 3.94770819883796, "step": 12230}, {"loss": 0.5738, "grad_norm": 0.9324905276298523, "learning_rate": 0.0002, "epoch": 3.9509360877985795, "step": 12240}, {"loss": 0.5954, "grad_norm": 0.8607982993125916, "learning_rate": 0.0002, "epoch": 3.9541639767591996, "step": 12250}, {"loss": 0.6277, "grad_norm": 0.8586681485176086, "learning_rate": 0.0002, "epoch": 3.9573918657198193, "step": 12260}, {"loss": 0.5841, "grad_norm": 1.1082909107208252, "learning_rate": 0.0002, "epoch": 3.960619754680439, "step": 12270}, {"loss": 0.6231, "grad_norm": 1.065027117729187, "learning_rate": 0.0002, "epoch": 3.963847643641059, "step": 12280}, {"loss": 0.5996, "grad_norm": 0.9544363021850586, "learning_rate": 0.0002, "epoch": 3.9670755326016787, "step": 12290}, {"loss": 0.6301, "grad_norm": 0.9008927345275879, "learning_rate": 0.0002, "epoch": 3.9703034215622983, "step": 12300}, {"loss": 0.6108, "grad_norm": 0.8717467188835144, "learning_rate": 0.0002, "epoch": 3.973531310522918, "step": 12310}, {"loss": 0.6465, "grad_norm": 0.9718339443206787, "learning_rate": 0.0002, "epoch": 3.9767591994835376, "step": 12320}, {"loss": 0.603, "grad_norm": 1.0362015962600708, "learning_rate": 0.0002, "epoch": 3.9799870884441573, "step": 12330}, {"loss": 0.6229, "grad_norm": 1.0844318866729736, "learning_rate": 0.0002, "epoch": 3.9832149774047774, "step": 12340}, {"loss": 0.6777, "grad_norm": 0.7506240606307983, "learning_rate": 0.0002, "epoch": 3.986442866365397, "step": 12350}, {"loss": 0.6076, "grad_norm": 1.005982756614685, "learning_rate": 0.0002, "epoch": 3.9896707553260167, "step": 12360}, {"loss": 0.5926, "grad_norm": 0.7566431164741516, "learning_rate": 0.0002, "epoch": 3.9928986442866368, "step": 12370}, {"loss": 0.653, "grad_norm": 0.8819181323051453, "learning_rate": 0.0002, "epoch": 3.9961265332472564, "step": 12380}, {"loss": 0.6197, "grad_norm": 0.884497880935669, "learning_rate": 0.0002, "epoch": 3.999354422207876, "step": 12390}, {"eval_loss": 1.1907150745391846, "eval_runtime": 161.5766, "eval_samples_per_second": 4.537, "eval_steps_per_second": 0.569, "epoch": 4.0, "step": 12392}, {"loss": 0.5203, "grad_norm": 1.0407241582870483, "learning_rate": 0.0002, "epoch": 4.002582311168496, "step": 12400}, {"loss": 0.4978, "grad_norm": 1.0199295282363892, "learning_rate": 0.0002, "epoch": 4.005810200129115, "step": 12410}, {"loss": 0.4985, "grad_norm": 0.8456302881240845, "learning_rate": 0.0002, "epoch": 4.009038089089735, "step": 12420}, {"loss": 0.4669, "grad_norm": 1.0621124505996704, "learning_rate": 0.0002, "epoch": 4.012265978050355, "step": 12430}, {"loss": 0.5277, "grad_norm": 0.8984712362289429, "learning_rate": 0.0002, "epoch": 4.015493867010975, "step": 12440}, {"loss": 0.5508, "grad_norm": 1.3785864114761353, "learning_rate": 0.0002, "epoch": 4.018721755971595, "step": 12450}, {"loss": 0.5244, "grad_norm": 0.7911781668663025, "learning_rate": 0.0002, "epoch": 4.0219496449322145, "step": 12460}, {"loss": 0.4746, "grad_norm": 1.0977907180786133, "learning_rate": 0.0002, "epoch": 4.025177533892834, "step": 12470}, {"loss": 0.4632, "grad_norm": 1.0664983987808228, "learning_rate": 0.0002, "epoch": 4.028405422853454, "step": 12480}, {"loss": 0.5151, "grad_norm": 1.0807124376296997, "learning_rate": 0.0002, "epoch": 4.0316333118140735, "step": 12490}, {"loss": 0.4712, "grad_norm": 1.2650192975997925, "learning_rate": 0.0002, "epoch": 4.034861200774693, "step": 12500}, {"loss": 0.5111, "grad_norm": 0.7164070010185242, "learning_rate": 0.0002, "epoch": 4.038089089735313, "step": 12510}, {"loss": 0.5015, "grad_norm": 1.0047489404678345, "learning_rate": 0.0002, "epoch": 4.041316978695932, "step": 12520}, {"loss": 0.5467, "grad_norm": 0.9303901791572571, "learning_rate": 0.0002, "epoch": 4.044544867656553, "step": 12530}, {"loss": 0.5165, "grad_norm": 1.0319702625274658, "learning_rate": 0.0002, "epoch": 4.047772756617173, "step": 12540}, {"loss": 0.4834, "grad_norm": 0.9549729228019714, "learning_rate": 0.0002, "epoch": 4.051000645577792, "step": 12550}, {"loss": 0.5235, "grad_norm": 0.7175564765930176, "learning_rate": 0.0002, "epoch": 4.054228534538412, "step": 12560}, {"loss": 0.5257, "grad_norm": 1.0622259378433228, "learning_rate": 0.0002, "epoch": 4.057456423499032, "step": 12570}, {"loss": 0.5098, "grad_norm": 1.172074556350708, "learning_rate": 0.0002, "epoch": 4.060684312459651, "step": 12580}, {"loss": 0.5112, "grad_norm": 0.9702366590499878, "learning_rate": 0.0002, "epoch": 4.063912201420271, "step": 12590}, {"loss": 0.5042, "grad_norm": 0.741511344909668, "learning_rate": 0.0002, "epoch": 4.0671400903808905, "step": 12600}, {"loss": 0.4996, "grad_norm": 0.8632621169090271, "learning_rate": 0.0002, "epoch": 4.070367979341511, "step": 12610}, {"loss": 0.4927, "grad_norm": 0.9695962071418762, "learning_rate": 0.0002, "epoch": 4.073595868302131, "step": 12620}, {"loss": 0.4618, "grad_norm": 0.9401052594184875, "learning_rate": 0.0002, "epoch": 4.07682375726275, "step": 12630}, {"loss": 0.4889, "grad_norm": 0.8068707585334778, "learning_rate": 0.0002, "epoch": 4.08005164622337, "step": 12640}, {"loss": 0.5046, "grad_norm": 0.9554762840270996, "learning_rate": 0.0002, "epoch": 4.08327953518399, "step": 12650}, {"loss": 0.5081, "grad_norm": 0.7637128233909607, "learning_rate": 0.0002, "epoch": 4.086507424144609, "step": 12660}, {"loss": 0.4997, "grad_norm": 0.6703744530677795, "learning_rate": 0.0002, "epoch": 4.089735313105229, "step": 12670}, {"loss": 0.4977, "grad_norm": 0.8623828887939453, "learning_rate": 0.0002, "epoch": 4.092963202065849, "step": 12680}, {"loss": 0.4616, "grad_norm": 0.8198223114013672, "learning_rate": 0.0002, "epoch": 4.096191091026468, "step": 12690}, {"loss": 0.5372, "grad_norm": 1.3449875116348267, "learning_rate": 0.0002, "epoch": 4.099418979987089, "step": 12700}, {"loss": 0.4782, "grad_norm": 0.8333606123924255, "learning_rate": 0.0002, "epoch": 4.1026468689477085, "step": 12710}, {"loss": 0.5135, "grad_norm": 1.1647733449935913, "learning_rate": 0.0002, "epoch": 4.105874757908328, "step": 12720}, {"loss": 0.5147, "grad_norm": 1.0560213327407837, "learning_rate": 0.0002, "epoch": 4.109102646868948, "step": 12730}, {"loss": 0.5244, "grad_norm": 0.9479449987411499, "learning_rate": 0.0002, "epoch": 4.112330535829567, "step": 12740}, {"loss": 0.4596, "grad_norm": 1.1634587049484253, "learning_rate": 0.0002, "epoch": 4.115558424790187, "step": 12750}, {"loss": 0.4966, "grad_norm": 0.813987672328949, "learning_rate": 0.0002, "epoch": 4.118786313750807, "step": 12760}, {"loss": 0.5133, "grad_norm": 0.968461275100708, "learning_rate": 0.0002, "epoch": 4.122014202711426, "step": 12770}, {"loss": 0.5113, "grad_norm": 0.9324830770492554, "learning_rate": 0.0002, "epoch": 4.125242091672046, "step": 12780}, {"loss": 0.5233, "grad_norm": 0.8313411474227905, "learning_rate": 0.0002, "epoch": 4.128469980632667, "step": 12790}, {"loss": 0.5169, "grad_norm": 1.0177634954452515, "learning_rate": 0.0002, "epoch": 4.131697869593286, "step": 12800}, {"loss": 0.4635, "grad_norm": 1.0890623331069946, "learning_rate": 0.0002, "epoch": 4.134925758553906, "step": 12810}, {"loss": 0.519, "grad_norm": 0.9131693840026855, "learning_rate": 0.0002, "epoch": 4.1381536475145255, "step": 12820}, {"loss": 0.5017, "grad_norm": 0.8400680422782898, "learning_rate": 0.0002, "epoch": 4.141381536475145, "step": 12830}, {"loss": 0.5195, "grad_norm": 0.8988795876502991, "learning_rate": 0.0002, "epoch": 4.144609425435765, "step": 12840}, {"loss": 0.5052, "grad_norm": 0.9224025011062622, "learning_rate": 0.0002, "epoch": 4.1478373143963845, "step": 12850}, {"loss": 0.5001, "grad_norm": 0.7453159689903259, "learning_rate": 0.0002, "epoch": 4.151065203357004, "step": 12860}, {"loss": 0.4874, "grad_norm": 0.9815868139266968, "learning_rate": 0.0002, "epoch": 4.154293092317625, "step": 12870}, {"loss": 0.5485, "grad_norm": 1.2542768716812134, "learning_rate": 0.0002, "epoch": 4.157520981278244, "step": 12880}, {"loss": 0.5287, "grad_norm": 1.0092132091522217, "learning_rate": 0.0002, "epoch": 4.160748870238864, "step": 12890}, {"loss": 0.5125, "grad_norm": 1.1836622953414917, "learning_rate": 0.0002, "epoch": 4.163976759199484, "step": 12900}, {"loss": 0.5089, "grad_norm": 0.7706810235977173, "learning_rate": 0.0002, "epoch": 4.167204648160103, "step": 12910}, {"loss": 0.5123, "grad_norm": 1.00058913230896, "learning_rate": 0.0002, "epoch": 4.170432537120723, "step": 12920}, {"loss": 0.5238, "grad_norm": 1.2326250076293945, "learning_rate": 0.0002, "epoch": 4.173660426081343, "step": 12930}, {"loss": 0.5405, "grad_norm": 0.8829123377799988, "learning_rate": 0.0002, "epoch": 4.176888315041962, "step": 12940}, {"loss": 0.517, "grad_norm": 0.936042845249176, "learning_rate": 0.0002, "epoch": 4.180116204002582, "step": 12950}, {"loss": 0.4991, "grad_norm": 0.9773517847061157, "learning_rate": 0.0002, "epoch": 4.183344092963202, "step": 12960}, {"loss": 0.5025, "grad_norm": 0.9786297678947449, "learning_rate": 0.0002, "epoch": 4.186571981923822, "step": 12970}, {"loss": 0.5276, "grad_norm": 0.7524558901786804, "learning_rate": 0.0002, "epoch": 4.189799870884442, "step": 12980}, {"loss": 0.5522, "grad_norm": 1.0107866525650024, "learning_rate": 0.0002, "epoch": 4.193027759845061, "step": 12990}, {"loss": 0.5304, "grad_norm": 1.0092947483062744, "learning_rate": 0.0002, "epoch": 4.196255648805681, "step": 13000}, {"loss": 0.5061, "grad_norm": 1.18181312084198, "learning_rate": 0.0002, "epoch": 4.199483537766301, "step": 13010}, {"loss": 0.512, "grad_norm": 0.8845750093460083, "learning_rate": 0.0002, "epoch": 4.20271142672692, "step": 13020}, {"loss": 0.5329, "grad_norm": 1.0789145231246948, "learning_rate": 0.0002, "epoch": 4.20593931568754, "step": 13030}, {"loss": 0.5001, "grad_norm": 0.9562082886695862, "learning_rate": 0.0002, "epoch": 4.2091672046481605, "step": 13040}, {"loss": 0.5211, "grad_norm": 0.875755786895752, "learning_rate": 0.0002, "epoch": 4.21239509360878, "step": 13050}, {"loss": 0.5162, "grad_norm": 1.0694596767425537, "learning_rate": 0.0002, "epoch": 4.2156229825694, "step": 13060}, {"loss": 0.4917, "grad_norm": 1.0053378343582153, "learning_rate": 0.0002, "epoch": 4.2188508715300195, "step": 13070}, {"loss": 0.542, "grad_norm": 1.1628689765930176, "learning_rate": 0.0002, "epoch": 4.222078760490639, "step": 13080}, {"loss": 0.4796, "grad_norm": 0.9455991983413696, "learning_rate": 0.0002, "epoch": 4.225306649451259, "step": 13090}, {"loss": 0.4802, "grad_norm": 0.9736765623092651, "learning_rate": 0.0002, "epoch": 4.228534538411878, "step": 13100}, {"loss": 0.5411, "grad_norm": 0.8653560876846313, "learning_rate": 0.0002, "epoch": 4.231762427372498, "step": 13110}, {"loss": 0.5347, "grad_norm": 0.9335988163948059, "learning_rate": 0.0002, "epoch": 4.234990316333118, "step": 13120}, {"loss": 0.5217, "grad_norm": 0.9102661609649658, "learning_rate": 0.0002, "epoch": 4.238218205293738, "step": 13130}, {"loss": 0.5531, "grad_norm": 1.0595461130142212, "learning_rate": 0.0002, "epoch": 4.241446094254358, "step": 13140}, {"loss": 0.517, "grad_norm": 0.8947662711143494, "learning_rate": 0.0002, "epoch": 4.244673983214978, "step": 13150}, {"loss": 0.5116, "grad_norm": 1.0835723876953125, "learning_rate": 0.0002, "epoch": 4.247901872175597, "step": 13160}, {"loss": 0.5212, "grad_norm": 0.8496462106704712, "learning_rate": 0.0002, "epoch": 4.251129761136217, "step": 13170}, {"loss": 0.5079, "grad_norm": 0.9395631551742554, "learning_rate": 0.0002, "epoch": 4.2543576500968365, "step": 13180}, {"loss": 0.5076, "grad_norm": 1.2939592599868774, "learning_rate": 0.0002, "epoch": 4.257585539057456, "step": 13190}, {"loss": 0.5209, "grad_norm": 0.9325923919677734, "learning_rate": 0.0002, "epoch": 4.260813428018076, "step": 13200}, {"loss": 0.4984, "grad_norm": 0.9220664501190186, "learning_rate": 0.0002, "epoch": 4.264041316978696, "step": 13210}, {"loss": 0.5553, "grad_norm": 0.9505137205123901, "learning_rate": 0.0002, "epoch": 4.267269205939316, "step": 13220}, {"loss": 0.5238, "grad_norm": 1.0713751316070557, "learning_rate": 0.0002, "epoch": 4.270497094899936, "step": 13230}, {"loss": 0.5478, "grad_norm": 0.8390375971794128, "learning_rate": 0.0002, "epoch": 4.273724983860555, "step": 13240}, {"loss": 0.5217, "grad_norm": 0.8943426012992859, "learning_rate": 0.0002, "epoch": 4.276952872821175, "step": 13250}, {"loss": 0.5486, "grad_norm": 0.9175868630409241, "learning_rate": 0.0002, "epoch": 4.280180761781795, "step": 13260}, {"loss": 0.5208, "grad_norm": 0.9969881176948547, "learning_rate": 0.0002, "epoch": 4.283408650742414, "step": 13270}, {"loss": 0.5376, "grad_norm": 1.2271877527236938, "learning_rate": 0.0002, "epoch": 4.286636539703034, "step": 13280}, {"loss": 0.4811, "grad_norm": 0.9463263154029846, "learning_rate": 0.0002, "epoch": 4.289864428663654, "step": 13290}, {"loss": 0.52, "grad_norm": 1.0306228399276733, "learning_rate": 0.0002, "epoch": 4.293092317624274, "step": 13300}, {"loss": 0.5092, "grad_norm": 0.8454763889312744, "learning_rate": 0.0002, "epoch": 4.296320206584894, "step": 13310}, {"loss": 0.5657, "grad_norm": 0.9843119978904724, "learning_rate": 0.0002, "epoch": 4.299548095545513, "step": 13320}, {"loss": 0.5407, "grad_norm": 1.0836851596832275, "learning_rate": 0.0002, "epoch": 4.302775984506133, "step": 13330}, {"loss": 0.5336, "grad_norm": 1.0719412565231323, "learning_rate": 0.0002, "epoch": 4.306003873466753, "step": 13340}, {"loss": 0.4798, "grad_norm": 0.9276487827301025, "learning_rate": 0.0002, "epoch": 4.309231762427372, "step": 13350}, {"loss": 0.5256, "grad_norm": 0.897072434425354, "learning_rate": 0.0002, "epoch": 4.312459651387992, "step": 13360}, {"loss": 0.5333, "grad_norm": 1.0493228435516357, "learning_rate": 0.0002, "epoch": 4.315687540348612, "step": 13370}, {"loss": 0.5218, "grad_norm": 0.9446353316307068, "learning_rate": 0.0002, "epoch": 4.318915429309232, "step": 13380}, {"loss": 0.4765, "grad_norm": 0.7765224575996399, "learning_rate": 0.0002, "epoch": 4.322143318269852, "step": 13390}, {"loss": 0.5907, "grad_norm": 0.9100048542022705, "learning_rate": 0.0002, "epoch": 4.3253712072304715, "step": 13400}, {"loss": 0.5393, "grad_norm": 1.0913089513778687, "learning_rate": 0.0002, "epoch": 4.328599096191091, "step": 13410}, {"loss": 0.494, "grad_norm": 0.9607733488082886, "learning_rate": 0.0002, "epoch": 4.331826985151711, "step": 13420}, {"loss": 0.5273, "grad_norm": 0.8774219155311584, "learning_rate": 0.0002, "epoch": 4.3350548741123305, "step": 13430}, {"loss": 0.5482, "grad_norm": 0.8366804122924805, "learning_rate": 0.0002, "epoch": 4.33828276307295, "step": 13440}, {"loss": 0.5487, "grad_norm": 1.034727931022644, "learning_rate": 0.0002, "epoch": 4.34151065203357, "step": 13450}, {"loss": 0.4995, "grad_norm": 0.942743182182312, "learning_rate": 0.0002, "epoch": 4.344738540994189, "step": 13460}, {"loss": 0.5222, "grad_norm": 0.7237029075622559, "learning_rate": 0.0002, "epoch": 4.347966429954809, "step": 13470}, {"loss": 0.5461, "grad_norm": 0.8216196894645691, "learning_rate": 0.0002, "epoch": 4.35119431891543, "step": 13480}, {"loss": 0.5104, "grad_norm": 1.031860113143921, "learning_rate": 0.0002, "epoch": 4.354422207876049, "step": 13490}, {"loss": 0.547, "grad_norm": 0.8880493640899658, "learning_rate": 0.0002, "epoch": 4.357650096836669, "step": 13500}, {"loss": 0.5259, "grad_norm": 0.8442490696907043, "learning_rate": 0.0002, "epoch": 4.360877985797289, "step": 13510}, {"loss": 0.5176, "grad_norm": 1.270971655845642, "learning_rate": 0.0002, "epoch": 4.364105874757908, "step": 13520}, {"loss": 0.5028, "grad_norm": 0.9657870531082153, "learning_rate": 0.0002, "epoch": 4.367333763718528, "step": 13530}, {"loss": 0.5136, "grad_norm": 0.7477133870124817, "learning_rate": 0.0002, "epoch": 4.3705616526791475, "step": 13540}, {"loss": 0.5483, "grad_norm": 1.0209243297576904, "learning_rate": 0.0002, "epoch": 4.373789541639767, "step": 13550}, {"loss": 0.4888, "grad_norm": 0.8714015483856201, "learning_rate": 0.0002, "epoch": 4.377017430600388, "step": 13560}, {"loss": 0.5428, "grad_norm": 1.0490189790725708, "learning_rate": 0.0002, "epoch": 4.380245319561007, "step": 13570}, {"loss": 0.5398, "grad_norm": 0.9454663991928101, "learning_rate": 0.0002, "epoch": 4.383473208521627, "step": 13580}, {"loss": 0.5072, "grad_norm": 1.154146432876587, "learning_rate": 0.0002, "epoch": 4.386701097482247, "step": 13590}, {"loss": 0.5096, "grad_norm": 1.155090570449829, "learning_rate": 0.0002, "epoch": 4.389928986442866, "step": 13600}, {"loss": 0.5679, "grad_norm": 0.9853842854499817, "learning_rate": 0.0002, "epoch": 4.393156875403486, "step": 13610}, {"loss": 0.4992, "grad_norm": 0.9265837669372559, "learning_rate": 0.0002, "epoch": 4.396384764364106, "step": 13620}, {"loss": 0.523, "grad_norm": 0.8367540240287781, "learning_rate": 0.0002, "epoch": 4.399612653324725, "step": 13630}, {"loss": 0.564, "grad_norm": 1.1453629732131958, "learning_rate": 0.0002, "epoch": 4.402840542285345, "step": 13640}, {"loss": 0.573, "grad_norm": 1.0856295824050903, "learning_rate": 0.0002, "epoch": 4.4060684312459655, "step": 13650}, {"loss": 0.5178, "grad_norm": 0.9284523129463196, "learning_rate": 0.0002, "epoch": 4.409296320206585, "step": 13660}, {"loss": 0.4862, "grad_norm": 0.9632299542427063, "learning_rate": 0.0002, "epoch": 4.412524209167205, "step": 13670}, {"loss": 0.5928, "grad_norm": 1.048524260520935, "learning_rate": 0.0002, "epoch": 4.415752098127824, "step": 13680}, {"loss": 0.5258, "grad_norm": 0.9787682294845581, "learning_rate": 0.0002, "epoch": 4.418979987088444, "step": 13690}, {"loss": 0.5513, "grad_norm": 1.0728684663772583, "learning_rate": 0.0002, "epoch": 4.422207876049064, "step": 13700}, {"loss": 0.5243, "grad_norm": 0.72867351770401, "learning_rate": 0.0002, "epoch": 4.425435765009683, "step": 13710}, {"loss": 0.5313, "grad_norm": 0.8932793736457825, "learning_rate": 0.0002, "epoch": 4.428663653970303, "step": 13720}, {"loss": 0.5156, "grad_norm": 1.098343849182129, "learning_rate": 0.0002, "epoch": 4.431891542930924, "step": 13730}, {"loss": 0.5342, "grad_norm": 0.9321235418319702, "learning_rate": 0.0002, "epoch": 4.435119431891543, "step": 13740}, {"loss": 0.5114, "grad_norm": 0.8868634104728699, "learning_rate": 0.0002, "epoch": 4.438347320852163, "step": 13750}, {"loss": 0.5284, "grad_norm": 1.200064778327942, "learning_rate": 0.0002, "epoch": 4.4415752098127825, "step": 13760}, {"loss": 0.5208, "grad_norm": 0.8968019485473633, "learning_rate": 0.0002, "epoch": 4.444803098773402, "step": 13770}, {"loss": 0.4979, "grad_norm": 0.9560935497283936, "learning_rate": 0.0002, "epoch": 4.448030987734022, "step": 13780}, {"loss": 0.5134, "grad_norm": 0.7985701560974121, "learning_rate": 0.0002, "epoch": 4.4512588766946415, "step": 13790}, {"loss": 0.5113, "grad_norm": 1.062540888786316, "learning_rate": 0.0002, "epoch": 4.454486765655261, "step": 13800}, {"loss": 0.525, "grad_norm": 1.0827109813690186, "learning_rate": 0.0002, "epoch": 4.457714654615881, "step": 13810}, {"loss": 0.5541, "grad_norm": 1.0853543281555176, "learning_rate": 0.0002, "epoch": 4.460942543576501, "step": 13820}, {"loss": 0.5381, "grad_norm": 1.0613641738891602, "learning_rate": 0.0002, "epoch": 4.464170432537121, "step": 13830}, {"loss": 0.5684, "grad_norm": 0.9037535190582275, "learning_rate": 0.0002, "epoch": 4.467398321497741, "step": 13840}, {"loss": 0.5112, "grad_norm": 0.9216223955154419, "learning_rate": 0.0002, "epoch": 4.47062621045836, "step": 13850}, {"loss": 0.5341, "grad_norm": 0.8952260613441467, "learning_rate": 0.0002, "epoch": 4.47385409941898, "step": 13860}, {"loss": 0.5026, "grad_norm": 0.9997953176498413, "learning_rate": 0.0002, "epoch": 4.4770819883796, "step": 13870}, {"loss": 0.5107, "grad_norm": 1.062458872795105, "learning_rate": 0.0002, "epoch": 4.480309877340219, "step": 13880}, {"loss": 0.5463, "grad_norm": 0.9185126423835754, "learning_rate": 0.0002, "epoch": 4.483537766300839, "step": 13890}, {"loss": 0.5181, "grad_norm": 1.2389954328536987, "learning_rate": 0.0002, "epoch": 4.486765655261459, "step": 13900}, {"loss": 0.5199, "grad_norm": 1.1632126569747925, "learning_rate": 0.0002, "epoch": 4.489993544222079, "step": 13910}, {"loss": 0.5128, "grad_norm": 1.0304487943649292, "learning_rate": 0.0002, "epoch": 4.493221433182699, "step": 13920}, {"loss": 0.5331, "grad_norm": 0.9144788384437561, "learning_rate": 0.0002, "epoch": 4.496449322143318, "step": 13930}, {"loss": 0.5312, "grad_norm": 1.0285682678222656, "learning_rate": 0.0002, "epoch": 4.499677211103938, "step": 13940}, {"loss": 0.554, "grad_norm": 1.1187206506729126, "learning_rate": 0.0002, "epoch": 4.502905100064558, "step": 13950}, {"loss": 0.5268, "grad_norm": 0.7917197942733765, "learning_rate": 0.0002, "epoch": 4.506132989025177, "step": 13960}, {"loss": 0.5227, "grad_norm": 0.8495619297027588, "learning_rate": 0.0002, "epoch": 4.509360877985797, "step": 13970}, {"loss": 0.4971, "grad_norm": 1.0450760126113892, "learning_rate": 0.0002, "epoch": 4.512588766946417, "step": 13980}, {"loss": 0.5402, "grad_norm": 1.0061010122299194, "learning_rate": 0.0002, "epoch": 4.515816655907037, "step": 13990}, {"loss": 0.527, "grad_norm": 1.0232428312301636, "learning_rate": 0.0002, "epoch": 4.519044544867657, "step": 14000}, {"loss": 0.5002, "grad_norm": 0.8734631538391113, "learning_rate": 0.0002, "epoch": 4.5222724338282765, "step": 14010}, {"loss": 0.5464, "grad_norm": 1.1085621118545532, "learning_rate": 0.0002, "epoch": 4.525500322788896, "step": 14020}, {"loss": 0.5167, "grad_norm": 0.9178624749183655, "learning_rate": 0.0002, "epoch": 4.528728211749516, "step": 14030}, {"loss": 0.5589, "grad_norm": 1.0687317848205566, "learning_rate": 0.0002, "epoch": 4.531956100710135, "step": 14040}, {"loss": 0.5576, "grad_norm": 0.9237300157546997, "learning_rate": 0.0002, "epoch": 4.535183989670755, "step": 14050}, {"loss": 0.5062, "grad_norm": 0.9667123556137085, "learning_rate": 0.0002, "epoch": 4.538411878631375, "step": 14060}, {"loss": 0.5645, "grad_norm": 1.1286747455596924, "learning_rate": 0.0002, "epoch": 4.541639767591995, "step": 14070}, {"loss": 0.5226, "grad_norm": 1.055392861366272, "learning_rate": 0.0002, "epoch": 4.544867656552615, "step": 14080}, {"loss": 0.5428, "grad_norm": 0.9492936134338379, "learning_rate": 0.0002, "epoch": 4.548095545513235, "step": 14090}, {"loss": 0.5559, "grad_norm": 0.9881349802017212, "learning_rate": 0.0002, "epoch": 4.551323434473854, "step": 14100}, {"loss": 0.5572, "grad_norm": 0.9389023184776306, "learning_rate": 0.0002, "epoch": 4.554551323434474, "step": 14110}, {"loss": 0.5511, "grad_norm": 0.8395606875419617, "learning_rate": 0.0002, "epoch": 4.5577792123950935, "step": 14120}, {"loss": 0.5696, "grad_norm": 0.9019067287445068, "learning_rate": 0.0002, "epoch": 4.561007101355713, "step": 14130}, {"loss": 0.5564, "grad_norm": 1.1058136224746704, "learning_rate": 0.0002, "epoch": 4.564234990316333, "step": 14140}, {"loss": 0.5323, "grad_norm": 1.0683821439743042, "learning_rate": 0.0002, "epoch": 4.5674628792769525, "step": 14150}, {"loss": 0.5527, "grad_norm": 1.3398395776748657, "learning_rate": 0.0002, "epoch": 4.570690768237572, "step": 14160}, {"loss": 0.4713, "grad_norm": 0.7829096913337708, "learning_rate": 0.0002, "epoch": 4.573918657198193, "step": 14170}, {"loss": 0.525, "grad_norm": 0.9636675119400024, "learning_rate": 0.0002, "epoch": 4.577146546158812, "step": 14180}, {"loss": 0.5458, "grad_norm": 1.0291401147842407, "learning_rate": 0.0002, "epoch": 4.580374435119432, "step": 14190}, {"loss": 0.5366, "grad_norm": 1.0894310474395752, "learning_rate": 0.0002, "epoch": 4.583602324080052, "step": 14200}, {"loss": 0.5125, "grad_norm": 1.111573576927185, "learning_rate": 0.0002, "epoch": 4.586830213040671, "step": 14210}, {"loss": 0.5444, "grad_norm": 0.9345336556434631, "learning_rate": 0.0002, "epoch": 4.590058102001291, "step": 14220}, {"loss": 0.5175, "grad_norm": 1.3338757753372192, "learning_rate": 0.0002, "epoch": 4.593285990961911, "step": 14230}, {"loss": 0.5227, "grad_norm": 1.1146448850631714, "learning_rate": 0.0002, "epoch": 4.596513879922531, "step": 14240}, {"loss": 0.543, "grad_norm": 1.1576755046844482, "learning_rate": 0.0002, "epoch": 4.599741768883151, "step": 14250}, {"loss": 0.5315, "grad_norm": 0.6851092576980591, "learning_rate": 0.0002, "epoch": 4.60296965784377, "step": 14260}, {"loss": 0.5027, "grad_norm": 0.9067938923835754, "learning_rate": 0.0002, "epoch": 4.60619754680439, "step": 14270}, {"loss": 0.5237, "grad_norm": 0.8767340183258057, "learning_rate": 0.0002, "epoch": 4.60942543576501, "step": 14280}, {"loss": 0.5294, "grad_norm": 1.024880290031433, "learning_rate": 0.0002, "epoch": 4.612653324725629, "step": 14290}, {"loss": 0.5371, "grad_norm": 0.9226394891738892, "learning_rate": 0.0002, "epoch": 4.615881213686249, "step": 14300}, {"loss": 0.5281, "grad_norm": 1.018187165260315, "learning_rate": 0.0002, "epoch": 4.619109102646869, "step": 14310}, {"loss": 0.5546, "grad_norm": 0.8851249814033508, "learning_rate": 0.0002, "epoch": 4.622336991607488, "step": 14320}, {"loss": 0.5206, "grad_norm": 0.745798647403717, "learning_rate": 0.0002, "epoch": 4.625564880568108, "step": 14330}, {"loss": 0.5531, "grad_norm": 1.2082698345184326, "learning_rate": 0.0002, "epoch": 4.6287927695287285, "step": 14340}, {"loss": 0.5449, "grad_norm": 0.901454508304596, "learning_rate": 0.0002, "epoch": 4.632020658489348, "step": 14350}, {"loss": 0.5433, "grad_norm": 0.9593124985694885, "learning_rate": 0.0002, "epoch": 4.635248547449968, "step": 14360}, {"loss": 0.4939, "grad_norm": 1.1241410970687866, "learning_rate": 0.0002, "epoch": 4.6384764364105875, "step": 14370}, {"loss": 0.5319, "grad_norm": 0.9221102595329285, "learning_rate": 0.0002, "epoch": 4.641704325371207, "step": 14380}, {"loss": 0.524, "grad_norm": 1.0035039186477661, "learning_rate": 0.0002, "epoch": 4.644932214331827, "step": 14390}, {"loss": 0.5617, "grad_norm": 1.1270662546157837, "learning_rate": 0.0002, "epoch": 4.648160103292446, "step": 14400}, {"loss": 0.5663, "grad_norm": 0.8631120324134827, "learning_rate": 0.0002, "epoch": 4.651387992253067, "step": 14410}, {"loss": 0.5705, "grad_norm": 1.0604606866836548, "learning_rate": 0.0002, "epoch": 4.654615881213687, "step": 14420}, {"loss": 0.5307, "grad_norm": 0.8002706170082092, "learning_rate": 0.0002, "epoch": 4.657843770174306, "step": 14430}, {"loss": 0.5459, "grad_norm": 1.0642075538635254, "learning_rate": 0.0002, "epoch": 4.661071659134926, "step": 14440}, {"loss": 0.5497, "grad_norm": 0.9315671324729919, "learning_rate": 0.0002, "epoch": 4.664299548095546, "step": 14450}, {"loss": 0.5542, "grad_norm": 0.8311864137649536, "learning_rate": 0.0002, "epoch": 4.667527437056165, "step": 14460}, {"loss": 0.5533, "grad_norm": 0.8900430202484131, "learning_rate": 0.0002, "epoch": 4.670755326016785, "step": 14470}, {"loss": 0.5086, "grad_norm": 1.059267282485962, "learning_rate": 0.0002, "epoch": 4.6739832149774045, "step": 14480}, {"loss": 0.5583, "grad_norm": 0.9864052534103394, "learning_rate": 0.0002, "epoch": 4.677211103938024, "step": 14490}, {"loss": 0.5737, "grad_norm": 1.210854411125183, "learning_rate": 0.0002, "epoch": 4.680438992898644, "step": 14500}, {"loss": 0.536, "grad_norm": 1.030693769454956, "learning_rate": 0.0002, "epoch": 4.683666881859264, "step": 14510}, {"loss": 0.544, "grad_norm": 0.9809406995773315, "learning_rate": 0.0002, "epoch": 4.686894770819884, "step": 14520}, {"loss": 0.5522, "grad_norm": 1.0471004247665405, "learning_rate": 0.0002, "epoch": 4.690122659780504, "step": 14530}, {"loss": 0.5613, "grad_norm": 1.1583727598190308, "learning_rate": 0.0002, "epoch": 4.693350548741123, "step": 14540}, {"loss": 0.5608, "grad_norm": 0.9664418697357178, "learning_rate": 0.0002, "epoch": 4.696578437701743, "step": 14550}, {"loss": 0.5624, "grad_norm": 0.9511209726333618, "learning_rate": 0.0002, "epoch": 4.699806326662363, "step": 14560}, {"loss": 0.5806, "grad_norm": 1.0211684703826904, "learning_rate": 0.0002, "epoch": 4.703034215622982, "step": 14570}, {"loss": 0.5536, "grad_norm": 1.097276210784912, "learning_rate": 0.0002, "epoch": 4.706262104583602, "step": 14580}, {"loss": 0.5527, "grad_norm": 0.9363943338394165, "learning_rate": 0.0002, "epoch": 4.7094899935442225, "step": 14590}, {"loss": 0.5261, "grad_norm": 1.4700615406036377, "learning_rate": 0.0002, "epoch": 4.712717882504842, "step": 14600}, {"loss": 0.5489, "grad_norm": 1.0001553297042847, "learning_rate": 0.0002, "epoch": 4.715945771465462, "step": 14610}, {"loss": 0.5236, "grad_norm": 1.0489927530288696, "learning_rate": 0.0002, "epoch": 4.719173660426081, "step": 14620}, {"loss": 0.5418, "grad_norm": 1.0483676195144653, "learning_rate": 0.0002, "epoch": 4.722401549386701, "step": 14630}, {"loss": 0.5596, "grad_norm": 1.1501940488815308, "learning_rate": 0.0002, "epoch": 4.725629438347321, "step": 14640}, {"loss": 0.5059, "grad_norm": 1.1703146696090698, "learning_rate": 0.0002, "epoch": 4.72885732730794, "step": 14650}, {"loss": 0.5356, "grad_norm": 0.8842985033988953, "learning_rate": 0.0002, "epoch": 4.73208521626856, "step": 14660}, {"loss": 0.5229, "grad_norm": 0.9147908687591553, "learning_rate": 0.0002, "epoch": 4.73531310522918, "step": 14670}, {"loss": 0.5436, "grad_norm": 1.0391576290130615, "learning_rate": 0.0002, "epoch": 4.7385409941898, "step": 14680}, {"loss": 0.5803, "grad_norm": 0.9469179511070251, "learning_rate": 0.0002, "epoch": 4.74176888315042, "step": 14690}, {"loss": 0.5201, "grad_norm": 1.0529530048370361, "learning_rate": 0.0002, "epoch": 4.7449967721110395, "step": 14700}, {"loss": 0.5401, "grad_norm": 0.9645711183547974, "learning_rate": 0.0002, "epoch": 4.748224661071659, "step": 14710}, {"loss": 0.5123, "grad_norm": 0.8163343071937561, "learning_rate": 0.0002, "epoch": 4.751452550032279, "step": 14720}, {"loss": 0.5654, "grad_norm": 1.0581341981887817, "learning_rate": 0.0002, "epoch": 4.7546804389928985, "step": 14730}, {"loss": 0.5709, "grad_norm": 1.0913853645324707, "learning_rate": 0.0002, "epoch": 4.757908327953518, "step": 14740}, {"loss": 0.5342, "grad_norm": 1.1071174144744873, "learning_rate": 0.0002, "epoch": 4.761136216914138, "step": 14750}, {"loss": 0.5353, "grad_norm": 1.0060709714889526, "learning_rate": 0.0002, "epoch": 4.764364105874758, "step": 14760}, {"loss": 0.5415, "grad_norm": 1.012024164199829, "learning_rate": 0.0002, "epoch": 4.767591994835378, "step": 14770}, {"loss": 0.5351, "grad_norm": 0.8438148498535156, "learning_rate": 0.0002, "epoch": 4.770819883795998, "step": 14780}, {"loss": 0.5424, "grad_norm": 0.8136811256408691, "learning_rate": 0.0002, "epoch": 4.774047772756617, "step": 14790}, {"loss": 0.5397, "grad_norm": 1.0765691995620728, "learning_rate": 0.0002, "epoch": 4.777275661717237, "step": 14800}, {"loss": 0.5616, "grad_norm": 1.0582574605941772, "learning_rate": 0.0002, "epoch": 4.780503550677857, "step": 14810}, {"loss": 0.5554, "grad_norm": 0.9419516921043396, "learning_rate": 0.0002, "epoch": 4.783731439638476, "step": 14820}, {"loss": 0.5499, "grad_norm": 0.9626181721687317, "learning_rate": 0.0002, "epoch": 4.786959328599096, "step": 14830}, {"loss": 0.565, "grad_norm": 1.2552800178527832, "learning_rate": 0.0002, "epoch": 4.7901872175597155, "step": 14840}, {"loss": 0.5402, "grad_norm": 0.9379919171333313, "learning_rate": 0.0002, "epoch": 4.793415106520336, "step": 14850}, {"loss": 0.5583, "grad_norm": 0.8166947364807129, "learning_rate": 0.0002, "epoch": 4.796642995480956, "step": 14860}, {"loss": 0.5139, "grad_norm": 0.9008694887161255, "learning_rate": 0.0002, "epoch": 4.799870884441575, "step": 14870}, {"loss": 0.5049, "grad_norm": 1.0256156921386719, "learning_rate": 0.0002, "epoch": 4.803098773402195, "step": 14880}, {"loss": 0.5531, "grad_norm": 0.9486594200134277, "learning_rate": 0.0002, "epoch": 4.806326662362815, "step": 14890}, {"loss": 0.5667, "grad_norm": 0.955238401889801, "learning_rate": 0.0002, "epoch": 4.809554551323434, "step": 14900}, {"loss": 0.5269, "grad_norm": 1.03775954246521, "learning_rate": 0.0002, "epoch": 4.812782440284054, "step": 14910}, {"loss": 0.5445, "grad_norm": 1.1383405923843384, "learning_rate": 0.0002, "epoch": 4.816010329244674, "step": 14920}, {"loss": 0.5347, "grad_norm": 0.9411700963973999, "learning_rate": 0.0002, "epoch": 4.819238218205294, "step": 14930}, {"loss": 0.4899, "grad_norm": 0.8188554644584656, "learning_rate": 0.0002, "epoch": 4.822466107165914, "step": 14940}, {"loss": 0.5618, "grad_norm": 1.1336265802383423, "learning_rate": 0.0002, "epoch": 4.8256939961265335, "step": 14950}, {"loss": 0.5578, "grad_norm": 1.106121301651001, "learning_rate": 0.0002, "epoch": 4.828921885087153, "step": 14960}, {"loss": 0.5306, "grad_norm": 1.0206533670425415, "learning_rate": 0.0002, "epoch": 4.832149774047773, "step": 14970}, {"loss": 0.5714, "grad_norm": 1.1123926639556885, "learning_rate": 0.0002, "epoch": 4.8353776630083924, "step": 14980}, {"loss": 0.5208, "grad_norm": 0.7879418730735779, "learning_rate": 0.0002, "epoch": 4.838605551969012, "step": 14990}, {"loss": 0.5385, "grad_norm": 1.0171709060668945, "learning_rate": 0.0002, "epoch": 4.841833440929632, "step": 15000}, {"loss": 0.6049, "grad_norm": 1.010671615600586, "learning_rate": 0.0002, "epoch": 4.845061329890251, "step": 15010}, {"loss": 0.5497, "grad_norm": 1.0778919458389282, "learning_rate": 0.0002, "epoch": 4.848289218850871, "step": 15020}, {"loss": 0.5587, "grad_norm": 1.0479968786239624, "learning_rate": 0.0002, "epoch": 4.851517107811492, "step": 15030}, {"loss": 0.5637, "grad_norm": 1.0345100164413452, "learning_rate": 0.0002, "epoch": 4.854744996772111, "step": 15040}, {"loss": 0.5809, "grad_norm": 0.9539691805839539, "learning_rate": 0.0002, "epoch": 4.857972885732731, "step": 15050}, {"loss": 0.5314, "grad_norm": 0.9914752840995789, "learning_rate": 0.0002, "epoch": 4.8612007746933505, "step": 15060}, {"loss": 0.5277, "grad_norm": 1.1935476064682007, "learning_rate": 0.0002, "epoch": 4.86442866365397, "step": 15070}, {"loss": 0.5497, "grad_norm": 1.0065057277679443, "learning_rate": 0.0002, "epoch": 4.86765655261459, "step": 15080}, {"loss": 0.5563, "grad_norm": 0.9320993423461914, "learning_rate": 0.0002, "epoch": 4.8708844415752095, "step": 15090}, {"loss": 0.5757, "grad_norm": 1.0578069686889648, "learning_rate": 0.0002, "epoch": 4.87411233053583, "step": 15100}, {"loss": 0.5472, "grad_norm": 0.9666239023208618, "learning_rate": 0.0002, "epoch": 4.87734021949645, "step": 15110}, {"loss": 0.5564, "grad_norm": 1.1322687864303589, "learning_rate": 0.0002, "epoch": 4.880568108457069, "step": 15120}, {"loss": 0.5381, "grad_norm": 0.955674409866333, "learning_rate": 0.0002, "epoch": 4.883795997417689, "step": 15130}, {"loss": 0.557, "grad_norm": 1.119413137435913, "learning_rate": 0.0002, "epoch": 4.887023886378309, "step": 15140}, {"loss": 0.5527, "grad_norm": 0.863646924495697, "learning_rate": 0.0002, "epoch": 4.890251775338928, "step": 15150}, {"loss": 0.5908, "grad_norm": 1.1823450326919556, "learning_rate": 0.0002, "epoch": 4.893479664299548, "step": 15160}, {"loss": 0.5654, "grad_norm": 0.8657588958740234, "learning_rate": 0.0002, "epoch": 4.896707553260168, "step": 15170}, {"loss": 0.5239, "grad_norm": 0.8575737476348877, "learning_rate": 0.0002, "epoch": 4.899935442220787, "step": 15180}, {"loss": 0.564, "grad_norm": 0.9611830711364746, "learning_rate": 0.0002, "epoch": 4.903163331181407, "step": 15190}, {"loss": 0.5505, "grad_norm": 1.1981453895568848, "learning_rate": 0.0002, "epoch": 4.906391220142027, "step": 15200}, {"loss": 0.5582, "grad_norm": 0.9401199221611023, "learning_rate": 0.0002, "epoch": 4.909619109102647, "step": 15210}, {"loss": 0.5631, "grad_norm": 0.8420369625091553, "learning_rate": 0.0002, "epoch": 4.912846998063267, "step": 15220}, {"loss": 0.5255, "grad_norm": 0.7877969145774841, "learning_rate": 0.0002, "epoch": 4.916074887023886, "step": 15230}, {"loss": 0.5522, "grad_norm": 0.8988324403762817, "learning_rate": 0.0002, "epoch": 4.919302775984506, "step": 15240}, {"loss": 0.5274, "grad_norm": 1.1103752851486206, "learning_rate": 0.0002, "epoch": 4.922530664945126, "step": 15250}, {"loss": 0.5249, "grad_norm": 0.8874443173408508, "learning_rate": 0.0002, "epoch": 4.925758553905745, "step": 15260}, {"loss": 0.5677, "grad_norm": 1.1001752614974976, "learning_rate": 0.0002, "epoch": 4.928986442866366, "step": 15270}, {"loss": 0.5596, "grad_norm": 0.9661307334899902, "learning_rate": 0.0002, "epoch": 4.9322143318269855, "step": 15280}, {"loss": 0.5678, "grad_norm": 1.1738812923431396, "learning_rate": 0.0002, "epoch": 4.935442220787605, "step": 15290}, {"loss": 0.5057, "grad_norm": 0.9773507714271545, "learning_rate": 0.0002, "epoch": 4.938670109748225, "step": 15300}, {"loss": 0.5029, "grad_norm": 1.0735599994659424, "learning_rate": 0.0002, "epoch": 4.9418979987088445, "step": 15310}, {"loss": 0.4996, "grad_norm": 1.0552113056182861, "learning_rate": 0.0002, "epoch": 4.945125887669464, "step": 15320}, {"loss": 0.5201, "grad_norm": 1.0900797843933105, "learning_rate": 0.0002, "epoch": 4.948353776630084, "step": 15330}, {"loss": 0.552, "grad_norm": 1.0908405780792236, "learning_rate": 0.0002, "epoch": 4.9515816655907035, "step": 15340}, {"loss": 0.6208, "grad_norm": 1.010221004486084, "learning_rate": 0.0002, "epoch": 4.954809554551323, "step": 15350}, {"loss": 0.5423, "grad_norm": 1.0321437120437622, "learning_rate": 0.0002, "epoch": 4.958037443511943, "step": 15360}, {"loss": 0.5903, "grad_norm": 0.8430278897285461, "learning_rate": 0.0002, "epoch": 4.961265332472563, "step": 15370}, {"loss": 0.538, "grad_norm": 0.8775330185890198, "learning_rate": 0.0002, "epoch": 4.964493221433183, "step": 15380}, {"loss": 0.5344, "grad_norm": 0.9796988368034363, "learning_rate": 0.0002, "epoch": 4.967721110393803, "step": 15390}, {"loss": 0.5352, "grad_norm": 0.8782257437705994, "learning_rate": 0.0002, "epoch": 4.970948999354422, "step": 15400}, {"loss": 0.5843, "grad_norm": 0.9959840774536133, "learning_rate": 0.0002, "epoch": 4.974176888315042, "step": 15410}, {"loss": 0.5783, "grad_norm": 1.0730273723602295, "learning_rate": 0.0002, "epoch": 4.9774047772756616, "step": 15420}, {"loss": 0.5277, "grad_norm": 0.8653680682182312, "learning_rate": 0.0002, "epoch": 4.980632666236281, "step": 15430}, {"loss": 0.5301, "grad_norm": 1.0769985914230347, "learning_rate": 0.0002, "epoch": 4.983860555196901, "step": 15440}, {"loss": 0.5727, "grad_norm": 1.1336040496826172, "learning_rate": 0.0002, "epoch": 4.987088444157521, "step": 15450}, {"loss": 0.5454, "grad_norm": 0.9844824075698853, "learning_rate": 0.0002, "epoch": 4.990316333118141, "step": 15460}, {"loss": 0.5316, "grad_norm": 0.8368769288063049, "learning_rate": 0.0002, "epoch": 4.993544222078761, "step": 15470}, {"loss": 0.5464, "grad_norm": 1.0238676071166992, "learning_rate": 0.0002, "epoch": 4.99677211103938, "step": 15480}, {"loss": 0.5577, "grad_norm": 1.064820408821106, "learning_rate": 0.0002, "epoch": 5.0, "step": 15490}]} +{"epoch": 6.0, "step": 18588, "epoch_duration": 11691.103891849518, "total_accumulated_duration": 67541.24670577049, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.593, "grad_norm": 0.7092075347900391, "learning_rate": 0.0002, "epoch": 0.0032278889606197547, "step": 10}, {"loss": 1.0956, "grad_norm": 0.6900479793548584, "learning_rate": 0.0002, "epoch": 0.006455777921239509, "step": 20}, {"loss": 0.9807, "grad_norm": 0.6788288950920105, "learning_rate": 0.0002, "epoch": 0.009683666881859263, "step": 30}, {"loss": 0.9385, "grad_norm": 0.5590243339538574, "learning_rate": 0.0002, "epoch": 0.012911555842479019, "step": 40}, {"loss": 0.931, "grad_norm": 0.5136010646820068, "learning_rate": 0.0002, "epoch": 0.016139444803098774, "step": 50}, {"loss": 0.8896, "grad_norm": 0.45298320055007935, "learning_rate": 0.0002, "epoch": 0.019367333763718526, "step": 60}, {"loss": 0.9184, "grad_norm": 0.5917162299156189, "learning_rate": 0.0002, "epoch": 0.022595222724338282, "step": 70}, {"loss": 0.8705, "grad_norm": 0.4414856433868408, "learning_rate": 0.0002, "epoch": 0.025823111684958037, "step": 80}, {"loss": 0.8419, "grad_norm": 0.5547978281974792, "learning_rate": 0.0002, "epoch": 0.029051000645577793, "step": 90}, {"loss": 0.8987, "grad_norm": 0.5271288156509399, "learning_rate": 0.0002, "epoch": 0.03227888960619755, "step": 100}, {"loss": 0.8543, "grad_norm": 0.5506119728088379, "learning_rate": 0.0002, "epoch": 0.035506778566817304, "step": 110}, {"loss": 0.8373, "grad_norm": 0.5579327940940857, "learning_rate": 0.0002, "epoch": 0.03873466752743705, "step": 120}, {"loss": 0.8826, "grad_norm": 0.5099632740020752, "learning_rate": 0.0002, "epoch": 0.04196255648805681, "step": 130}, {"loss": 0.9239, "grad_norm": 0.40396833419799805, "learning_rate": 0.0002, "epoch": 0.045190445448676564, "step": 140}, {"loss": 0.846, "grad_norm": 0.5008092522621155, "learning_rate": 0.0002, "epoch": 0.04841833440929632, "step": 150}, {"loss": 0.8564, "grad_norm": 0.4388776421546936, "learning_rate": 0.0002, "epoch": 0.051646223369916075, "step": 160}, {"loss": 0.8829, "grad_norm": 0.44138944149017334, "learning_rate": 0.0002, "epoch": 0.05487411233053583, "step": 170}, {"loss": 0.8061, "grad_norm": 0.358484148979187, "learning_rate": 0.0002, "epoch": 0.058102001291155586, "step": 180}, {"loss": 0.8956, "grad_norm": 0.457052081823349, "learning_rate": 0.0002, "epoch": 0.06132989025177534, "step": 190}, {"loss": 0.9138, "grad_norm": 0.5537622570991516, "learning_rate": 0.0002, "epoch": 0.0645577792123951, "step": 200}, {"loss": 0.8701, "grad_norm": 0.552631676197052, "learning_rate": 0.0002, "epoch": 0.06778566817301485, "step": 210}, {"loss": 0.8854, "grad_norm": 0.4414575397968292, "learning_rate": 0.0002, "epoch": 0.07101355713363461, "step": 220}, {"loss": 0.8581, "grad_norm": 0.4996664226055145, "learning_rate": 0.0002, "epoch": 0.07424144609425436, "step": 230}, {"loss": 0.8675, "grad_norm": 0.7321897149085999, "learning_rate": 0.0002, "epoch": 0.0774693350548741, "step": 240}, {"loss": 0.8848, "grad_norm": 0.4553901255130768, "learning_rate": 0.0002, "epoch": 0.08069722401549387, "step": 250}, {"loss": 0.868, "grad_norm": 0.5039054751396179, "learning_rate": 0.0002, "epoch": 0.08392511297611362, "step": 260}, {"loss": 0.8317, "grad_norm": 0.4113094210624695, "learning_rate": 0.0002, "epoch": 0.08715300193673338, "step": 270}, {"loss": 0.8074, "grad_norm": 0.450436532497406, "learning_rate": 0.0002, "epoch": 0.09038089089735313, "step": 280}, {"loss": 0.8105, "grad_norm": 0.4548024535179138, "learning_rate": 0.0002, "epoch": 0.09360877985797289, "step": 290}, {"loss": 0.8325, "grad_norm": 0.4932962656021118, "learning_rate": 0.0002, "epoch": 0.09683666881859264, "step": 300}, {"loss": 0.8105, "grad_norm": 0.4005250334739685, "learning_rate": 0.0002, "epoch": 0.1000645577792124, "step": 310}, {"loss": 0.8083, "grad_norm": 1.8321624994277954, "learning_rate": 0.0002, "epoch": 0.10329244673983215, "step": 320}, {"loss": 0.8411, "grad_norm": 0.45815610885620117, "learning_rate": 0.0002, "epoch": 0.1065203357004519, "step": 330}, {"loss": 0.857, "grad_norm": 0.39324095845222473, "learning_rate": 0.0002, "epoch": 0.10974822466107166, "step": 340}, {"loss": 0.8258, "grad_norm": 0.546273946762085, "learning_rate": 0.0002, "epoch": 0.11297611362169141, "step": 350}, {"loss": 0.882, "grad_norm": 0.497448593378067, "learning_rate": 0.0002, "epoch": 0.11620400258231117, "step": 360}, {"loss": 0.7608, "grad_norm": 0.37508800625801086, "learning_rate": 0.0002, "epoch": 0.11943189154293092, "step": 370}, {"loss": 0.852, "grad_norm": 0.45849609375, "learning_rate": 0.0002, "epoch": 0.12265978050355068, "step": 380}, {"loss": 0.8437, "grad_norm": 0.5488408803939819, "learning_rate": 0.0002, "epoch": 0.12588766946417043, "step": 390}, {"loss": 0.8349, "grad_norm": 0.4477061331272125, "learning_rate": 0.0002, "epoch": 0.1291155584247902, "step": 400}, {"loss": 0.8306, "grad_norm": 0.39227980375289917, "learning_rate": 0.0002, "epoch": 0.13234344738540993, "step": 410}, {"loss": 0.7933, "grad_norm": 0.3922233581542969, "learning_rate": 0.0002, "epoch": 0.1355713363460297, "step": 420}, {"loss": 0.8134, "grad_norm": 0.42901909351348877, "learning_rate": 0.0002, "epoch": 0.13879922530664945, "step": 430}, {"loss": 0.8271, "grad_norm": 0.4217798709869385, "learning_rate": 0.0002, "epoch": 0.14202711426726922, "step": 440}, {"loss": 0.8594, "grad_norm": 0.43470677733421326, "learning_rate": 0.0002, "epoch": 0.14525500322788895, "step": 450}, {"loss": 0.8106, "grad_norm": 0.5324403047561646, "learning_rate": 0.0002, "epoch": 0.1484828921885087, "step": 460}, {"loss": 0.8729, "grad_norm": 0.3999756872653961, "learning_rate": 0.0002, "epoch": 0.15171078114912848, "step": 470}, {"loss": 0.7702, "grad_norm": 0.404933363199234, "learning_rate": 0.0002, "epoch": 0.1549386701097482, "step": 480}, {"loss": 0.8151, "grad_norm": 0.44122636318206787, "learning_rate": 0.0002, "epoch": 0.15816655907036797, "step": 490}, {"loss": 0.8457, "grad_norm": 0.510166347026825, "learning_rate": 0.0002, "epoch": 0.16139444803098774, "step": 500}, {"loss": 0.8692, "grad_norm": 0.4549732506275177, "learning_rate": 0.0002, "epoch": 0.1646223369916075, "step": 510}, {"loss": 0.8466, "grad_norm": 0.5148182511329651, "learning_rate": 0.0002, "epoch": 0.16785022595222723, "step": 520}, {"loss": 0.8317, "grad_norm": 0.3596806824207306, "learning_rate": 0.0002, "epoch": 0.171078114912847, "step": 530}, {"loss": 0.844, "grad_norm": 0.4388909339904785, "learning_rate": 0.0002, "epoch": 0.17430600387346676, "step": 540}, {"loss": 0.8322, "grad_norm": 0.5052742958068848, "learning_rate": 0.0002, "epoch": 0.17753389283408652, "step": 550}, {"loss": 0.791, "grad_norm": 0.48248958587646484, "learning_rate": 0.0002, "epoch": 0.18076178179470626, "step": 560}, {"loss": 0.8593, "grad_norm": 0.5360197424888611, "learning_rate": 0.0002, "epoch": 0.18398967075532602, "step": 570}, {"loss": 0.817, "grad_norm": 0.43999341130256653, "learning_rate": 0.0002, "epoch": 0.18721755971594578, "step": 580}, {"loss": 0.8311, "grad_norm": 0.3685208261013031, "learning_rate": 0.0002, "epoch": 0.19044544867656552, "step": 590}, {"loss": 0.8341, "grad_norm": 0.4601275622844696, "learning_rate": 0.0002, "epoch": 0.19367333763718528, "step": 600}, {"loss": 0.8483, "grad_norm": 0.4778369665145874, "learning_rate": 0.0002, "epoch": 0.19690122659780504, "step": 610}, {"loss": 0.8653, "grad_norm": 0.4867003560066223, "learning_rate": 0.0002, "epoch": 0.2001291155584248, "step": 620}, {"loss": 0.8554, "grad_norm": 0.4583742916584015, "learning_rate": 0.0002, "epoch": 0.20335700451904454, "step": 630}, {"loss": 0.8698, "grad_norm": 0.47958165407180786, "learning_rate": 0.0002, "epoch": 0.2065848934796643, "step": 640}, {"loss": 0.8213, "grad_norm": 0.4526064097881317, "learning_rate": 0.0002, "epoch": 0.20981278244028406, "step": 650}, {"loss": 0.8313, "grad_norm": 0.45890581607818604, "learning_rate": 0.0002, "epoch": 0.2130406714009038, "step": 660}, {"loss": 0.8143, "grad_norm": 0.42725905776023865, "learning_rate": 0.0002, "epoch": 0.21626856036152356, "step": 670}, {"loss": 0.8675, "grad_norm": 0.40380963683128357, "learning_rate": 0.0002, "epoch": 0.21949644932214332, "step": 680}, {"loss": 0.9004, "grad_norm": 0.4372998774051666, "learning_rate": 0.0002, "epoch": 0.22272433828276308, "step": 690}, {"loss": 0.8208, "grad_norm": 0.4245864450931549, "learning_rate": 0.0002, "epoch": 0.22595222724338282, "step": 700}, {"loss": 0.8564, "grad_norm": 0.4061129689216614, "learning_rate": 0.0002, "epoch": 0.22918011620400258, "step": 710}, {"loss": 0.8275, "grad_norm": 0.474454790353775, "learning_rate": 0.0002, "epoch": 0.23240800516462234, "step": 720}, {"loss": 0.8346, "grad_norm": 0.4908486008644104, "learning_rate": 0.0002, "epoch": 0.23563589412524208, "step": 730}, {"loss": 0.8755, "grad_norm": 0.4284191429615021, "learning_rate": 0.0002, "epoch": 0.23886378308586184, "step": 740}, {"loss": 0.8387, "grad_norm": 0.44730308651924133, "learning_rate": 0.0002, "epoch": 0.2420916720464816, "step": 750}, {"loss": 0.8135, "grad_norm": 0.4433246850967407, "learning_rate": 0.0002, "epoch": 0.24531956100710137, "step": 760}, {"loss": 0.8644, "grad_norm": 0.43668854236602783, "learning_rate": 0.0002, "epoch": 0.2485474499677211, "step": 770}, {"loss": 0.8025, "grad_norm": 0.34324130415916443, "learning_rate": 0.0002, "epoch": 0.25177533892834086, "step": 780}, {"loss": 0.8725, "grad_norm": 0.46476295590400696, "learning_rate": 0.0002, "epoch": 0.2550032278889606, "step": 790}, {"loss": 0.8157, "grad_norm": 0.5047039985656738, "learning_rate": 0.0002, "epoch": 0.2582311168495804, "step": 800}, {"loss": 0.8643, "grad_norm": 0.4402127265930176, "learning_rate": 0.0002, "epoch": 0.26145900581020015, "step": 810}, {"loss": 0.8025, "grad_norm": 0.4642465114593506, "learning_rate": 0.0002, "epoch": 0.26468689477081986, "step": 820}, {"loss": 0.8836, "grad_norm": 0.40093424916267395, "learning_rate": 0.0002, "epoch": 0.2679147837314396, "step": 830}, {"loss": 0.83, "grad_norm": 0.42501842975616455, "learning_rate": 0.0002, "epoch": 0.2711426726920594, "step": 840}, {"loss": 0.8573, "grad_norm": 0.43279722332954407, "learning_rate": 0.0002, "epoch": 0.27437056165267915, "step": 850}, {"loss": 0.817, "grad_norm": 0.5991243720054626, "learning_rate": 0.0002, "epoch": 0.2775984506132989, "step": 860}, {"loss": 0.7981, "grad_norm": 0.4217848777770996, "learning_rate": 0.0002, "epoch": 0.28082633957391867, "step": 870}, {"loss": 0.8135, "grad_norm": 0.3933536410331726, "learning_rate": 0.0002, "epoch": 0.28405422853453843, "step": 880}, {"loss": 0.8846, "grad_norm": 0.5868505239486694, "learning_rate": 0.0002, "epoch": 0.28728211749515814, "step": 890}, {"loss": 0.8759, "grad_norm": 0.5209547877311707, "learning_rate": 0.0002, "epoch": 0.2905100064557779, "step": 900}, {"loss": 0.815, "grad_norm": 0.49307361245155334, "learning_rate": 0.0002, "epoch": 0.29373789541639767, "step": 910}, {"loss": 0.7813, "grad_norm": 0.4288382828235626, "learning_rate": 0.0002, "epoch": 0.2969657843770174, "step": 920}, {"loss": 0.8431, "grad_norm": 0.33568474650382996, "learning_rate": 0.0002, "epoch": 0.3001936733376372, "step": 930}, {"loss": 0.8455, "grad_norm": 1.0915930271148682, "learning_rate": 0.0002, "epoch": 0.30342156229825695, "step": 940}, {"loss": 0.8535, "grad_norm": 0.5489798188209534, "learning_rate": 0.0002, "epoch": 0.3066494512588767, "step": 950}, {"loss": 0.8031, "grad_norm": 0.42971742153167725, "learning_rate": 0.0002, "epoch": 0.3098773402194964, "step": 960}, {"loss": 0.8253, "grad_norm": 0.43375834822654724, "learning_rate": 0.0002, "epoch": 0.3131052291801162, "step": 970}, {"loss": 0.7747, "grad_norm": 0.47488611936569214, "learning_rate": 0.0002, "epoch": 0.31633311814073595, "step": 980}, {"loss": 0.7906, "grad_norm": 0.46296775341033936, "learning_rate": 0.0002, "epoch": 0.3195610071013557, "step": 990}, {"loss": 0.7948, "grad_norm": 0.4548890292644501, "learning_rate": 0.0002, "epoch": 0.32278889606197547, "step": 1000}, {"loss": 0.8856, "grad_norm": 0.41834497451782227, "learning_rate": 0.0002, "epoch": 0.32601678502259523, "step": 1010}, {"loss": 0.7791, "grad_norm": 0.441092312335968, "learning_rate": 0.0002, "epoch": 0.329244673983215, "step": 1020}, {"loss": 0.8191, "grad_norm": 0.637322187423706, "learning_rate": 0.0002, "epoch": 0.33247256294383476, "step": 1030}, {"loss": 0.8685, "grad_norm": 0.4374958574771881, "learning_rate": 0.0002, "epoch": 0.33570045190445447, "step": 1040}, {"loss": 0.8423, "grad_norm": 0.3935825824737549, "learning_rate": 0.0002, "epoch": 0.33892834086507423, "step": 1050}, {"loss": 0.8287, "grad_norm": 0.43526220321655273, "learning_rate": 0.0002, "epoch": 0.342156229825694, "step": 1060}, {"loss": 0.8413, "grad_norm": 0.45327696204185486, "learning_rate": 0.0002, "epoch": 0.34538411878631375, "step": 1070}, {"loss": 0.7421, "grad_norm": 0.4126075506210327, "learning_rate": 0.0002, "epoch": 0.3486120077469335, "step": 1080}, {"loss": 0.8427, "grad_norm": 0.4714072048664093, "learning_rate": 0.0002, "epoch": 0.3518398967075533, "step": 1090}, {"loss": 0.8028, "grad_norm": 0.518127977848053, "learning_rate": 0.0002, "epoch": 0.35506778566817304, "step": 1100}, {"loss": 0.8479, "grad_norm": 0.43264099955558777, "learning_rate": 0.0002, "epoch": 0.35829567462879275, "step": 1110}, {"loss": 0.8724, "grad_norm": 0.4857400357723236, "learning_rate": 0.0002, "epoch": 0.3615235635894125, "step": 1120}, {"loss": 0.7735, "grad_norm": 0.37591469287872314, "learning_rate": 0.0002, "epoch": 0.3647514525500323, "step": 1130}, {"loss": 0.8531, "grad_norm": 0.4165478050708771, "learning_rate": 0.0002, "epoch": 0.36797934151065204, "step": 1140}, {"loss": 0.8151, "grad_norm": 0.42911383509635925, "learning_rate": 0.0002, "epoch": 0.3712072304712718, "step": 1150}, {"loss": 0.8722, "grad_norm": 0.44980287551879883, "learning_rate": 0.0002, "epoch": 0.37443511943189156, "step": 1160}, {"loss": 0.7961, "grad_norm": 0.4066573679447174, "learning_rate": 0.0002, "epoch": 0.3776630083925113, "step": 1170}, {"loss": 0.8317, "grad_norm": 0.5056195855140686, "learning_rate": 0.0002, "epoch": 0.38089089735313103, "step": 1180}, {"loss": 0.8387, "grad_norm": 0.4141536355018616, "learning_rate": 0.0002, "epoch": 0.3841187863137508, "step": 1190}, {"loss": 0.8019, "grad_norm": 0.4501924514770508, "learning_rate": 0.0002, "epoch": 0.38734667527437056, "step": 1200}, {"loss": 0.8528, "grad_norm": 0.43304240703582764, "learning_rate": 0.0002, "epoch": 0.3905745642349903, "step": 1210}, {"loss": 0.8905, "grad_norm": 0.475777804851532, "learning_rate": 0.0002, "epoch": 0.3938024531956101, "step": 1220}, {"loss": 0.8643, "grad_norm": 0.5846465826034546, "learning_rate": 0.0002, "epoch": 0.39703034215622984, "step": 1230}, {"loss": 0.8078, "grad_norm": 0.42899325489997864, "learning_rate": 0.0002, "epoch": 0.4002582311168496, "step": 1240}, {"loss": 0.8415, "grad_norm": 0.3980463147163391, "learning_rate": 0.0002, "epoch": 0.4034861200774693, "step": 1250}, {"loss": 0.8026, "grad_norm": 0.45769768953323364, "learning_rate": 0.0002, "epoch": 0.4067140090380891, "step": 1260}, {"loss": 0.8377, "grad_norm": 0.5101280212402344, "learning_rate": 0.0002, "epoch": 0.40994189799870884, "step": 1270}, {"loss": 0.7905, "grad_norm": 0.47374317049980164, "learning_rate": 0.0002, "epoch": 0.4131697869593286, "step": 1280}, {"loss": 0.8172, "grad_norm": 0.4261878728866577, "learning_rate": 0.0002, "epoch": 0.41639767591994836, "step": 1290}, {"loss": 0.9004, "grad_norm": 0.46954256296157837, "learning_rate": 0.0002, "epoch": 0.4196255648805681, "step": 1300}, {"loss": 0.7868, "grad_norm": 0.5205738544464111, "learning_rate": 0.0002, "epoch": 0.4228534538411879, "step": 1310}, {"loss": 0.8964, "grad_norm": 0.5176340937614441, "learning_rate": 0.0002, "epoch": 0.4260813428018076, "step": 1320}, {"loss": 0.8764, "grad_norm": 0.5155916810035706, "learning_rate": 0.0002, "epoch": 0.42930923176242736, "step": 1330}, {"loss": 0.8197, "grad_norm": 0.44548553228378296, "learning_rate": 0.0002, "epoch": 0.4325371207230471, "step": 1340}, {"loss": 0.7873, "grad_norm": 0.5633558630943298, "learning_rate": 0.0002, "epoch": 0.4357650096836669, "step": 1350}, {"loss": 0.7889, "grad_norm": 0.42444056272506714, "learning_rate": 0.0002, "epoch": 0.43899289864428664, "step": 1360}, {"loss": 0.8588, "grad_norm": 0.5226860642433167, "learning_rate": 0.0002, "epoch": 0.4422207876049064, "step": 1370}, {"loss": 0.8232, "grad_norm": 0.5354582071304321, "learning_rate": 0.0002, "epoch": 0.44544867656552617, "step": 1380}, {"loss": 0.816, "grad_norm": 0.472646564245224, "learning_rate": 0.0002, "epoch": 0.4486765655261459, "step": 1390}, {"loss": 0.7953, "grad_norm": 0.6312310099601746, "learning_rate": 0.0002, "epoch": 0.45190445448676564, "step": 1400}, {"loss": 0.8212, "grad_norm": 0.4298408031463623, "learning_rate": 0.0002, "epoch": 0.4551323434473854, "step": 1410}, {"loss": 0.8447, "grad_norm": 0.43427202105522156, "learning_rate": 0.0002, "epoch": 0.45836023240800516, "step": 1420}, {"loss": 0.8342, "grad_norm": 0.44097861647605896, "learning_rate": 0.0002, "epoch": 0.4615881213686249, "step": 1430}, {"loss": 0.8301, "grad_norm": 0.5142693519592285, "learning_rate": 0.0002, "epoch": 0.4648160103292447, "step": 1440}, {"loss": 0.8144, "grad_norm": 0.46416547894477844, "learning_rate": 0.0002, "epoch": 0.46804389928986445, "step": 1450}, {"loss": 0.8342, "grad_norm": 0.4858551025390625, "learning_rate": 0.0002, "epoch": 0.47127178825048416, "step": 1460}, {"loss": 0.8354, "grad_norm": 0.4709177315235138, "learning_rate": 0.0002, "epoch": 0.4744996772111039, "step": 1470}, {"loss": 0.8391, "grad_norm": 0.5500252842903137, "learning_rate": 0.0002, "epoch": 0.4777275661717237, "step": 1480}, {"loss": 0.8359, "grad_norm": 0.43364381790161133, "learning_rate": 0.0002, "epoch": 0.48095545513234345, "step": 1490}, {"loss": 0.8446, "grad_norm": 0.47712287306785583, "learning_rate": 0.0002, "epoch": 0.4841833440929632, "step": 1500}, {"loss": 0.8518, "grad_norm": 0.4518495202064514, "learning_rate": 0.0002, "epoch": 0.48741123305358297, "step": 1510}, {"loss": 0.819, "grad_norm": 0.4539008140563965, "learning_rate": 0.0002, "epoch": 0.49063912201420273, "step": 1520}, {"loss": 0.8276, "grad_norm": 0.4993067979812622, "learning_rate": 0.0002, "epoch": 0.49386701097482244, "step": 1530}, {"loss": 0.8297, "grad_norm": 0.6094803214073181, "learning_rate": 0.0002, "epoch": 0.4970948999354422, "step": 1540}, {"loss": 0.8263, "grad_norm": 0.48602527379989624, "learning_rate": 0.0002, "epoch": 0.500322788896062, "step": 1550}, {"loss": 0.8182, "grad_norm": 0.40245795249938965, "learning_rate": 0.0002, "epoch": 0.5035506778566817, "step": 1560}, {"loss": 0.7907, "grad_norm": 0.456787645816803, "learning_rate": 0.0002, "epoch": 0.5067785668173015, "step": 1570}, {"loss": 0.86, "grad_norm": 0.43936216831207275, "learning_rate": 0.0002, "epoch": 0.5100064557779213, "step": 1580}, {"loss": 0.7928, "grad_norm": 0.549018144607544, "learning_rate": 0.0002, "epoch": 0.513234344738541, "step": 1590}, {"loss": 0.8169, "grad_norm": 0.41746795177459717, "learning_rate": 0.0002, "epoch": 0.5164622336991608, "step": 1600}, {"loss": 0.7868, "grad_norm": 0.4217053949832916, "learning_rate": 0.0002, "epoch": 0.5196901226597805, "step": 1610}, {"loss": 0.8161, "grad_norm": 0.449913889169693, "learning_rate": 0.0002, "epoch": 0.5229180116204003, "step": 1620}, {"loss": 0.7938, "grad_norm": 0.5084872245788574, "learning_rate": 0.0002, "epoch": 0.5261459005810201, "step": 1630}, {"loss": 0.8295, "grad_norm": 0.46248653531074524, "learning_rate": 0.0002, "epoch": 0.5293737895416397, "step": 1640}, {"loss": 0.7993, "grad_norm": 0.4824236035346985, "learning_rate": 0.0002, "epoch": 0.5326016785022595, "step": 1650}, {"loss": 0.8711, "grad_norm": 0.6010985374450684, "learning_rate": 0.0002, "epoch": 0.5358295674628792, "step": 1660}, {"loss": 0.8266, "grad_norm": 0.4757920801639557, "learning_rate": 0.0002, "epoch": 0.539057456423499, "step": 1670}, {"loss": 0.8182, "grad_norm": 0.45161882042884827, "learning_rate": 0.0002, "epoch": 0.5422853453841188, "step": 1680}, {"loss": 0.8141, "grad_norm": 0.49314990639686584, "learning_rate": 0.0002, "epoch": 0.5455132343447385, "step": 1690}, {"loss": 0.8091, "grad_norm": 0.3918305039405823, "learning_rate": 0.0002, "epoch": 0.5487411233053583, "step": 1700}, {"loss": 0.8177, "grad_norm": 0.5966728925704956, "learning_rate": 0.0002, "epoch": 0.551969012265978, "step": 1710}, {"loss": 0.8438, "grad_norm": 0.4208986163139343, "learning_rate": 0.0002, "epoch": 0.5551969012265978, "step": 1720}, {"loss": 0.817, "grad_norm": 0.43724218010902405, "learning_rate": 0.0002, "epoch": 0.5584247901872176, "step": 1730}, {"loss": 0.7956, "grad_norm": 0.5287272930145264, "learning_rate": 0.0002, "epoch": 0.5616526791478373, "step": 1740}, {"loss": 0.8557, "grad_norm": 0.4961899518966675, "learning_rate": 0.0002, "epoch": 0.5648805681084571, "step": 1750}, {"loss": 0.8029, "grad_norm": 0.4468635320663452, "learning_rate": 0.0002, "epoch": 0.5681084570690769, "step": 1760}, {"loss": 0.7968, "grad_norm": 0.6423530578613281, "learning_rate": 0.0002, "epoch": 0.5713363460296966, "step": 1770}, {"loss": 0.8324, "grad_norm": 0.4601971507072449, "learning_rate": 0.0002, "epoch": 0.5745642349903163, "step": 1780}, {"loss": 0.8171, "grad_norm": 0.46514901518821716, "learning_rate": 0.0002, "epoch": 0.577792123950936, "step": 1790}, {"loss": 0.8186, "grad_norm": 0.4771687388420105, "learning_rate": 0.0002, "epoch": 0.5810200129115558, "step": 1800}, {"loss": 0.856, "grad_norm": 0.46514490246772766, "learning_rate": 0.0002, "epoch": 0.5842479018721756, "step": 1810}, {"loss": 0.84, "grad_norm": 0.5373936295509338, "learning_rate": 0.0002, "epoch": 0.5874757908327953, "step": 1820}, {"loss": 0.8456, "grad_norm": 0.5175791382789612, "learning_rate": 0.0002, "epoch": 0.5907036797934151, "step": 1830}, {"loss": 0.7957, "grad_norm": 0.4522802233695984, "learning_rate": 0.0002, "epoch": 0.5939315687540349, "step": 1840}, {"loss": 0.8633, "grad_norm": 0.42987772822380066, "learning_rate": 0.0002, "epoch": 0.5971594577146546, "step": 1850}, {"loss": 0.7871, "grad_norm": 0.5566838383674622, "learning_rate": 0.0002, "epoch": 0.6003873466752744, "step": 1860}, {"loss": 0.8312, "grad_norm": 0.42807698249816895, "learning_rate": 0.0002, "epoch": 0.6036152356358941, "step": 1870}, {"loss": 0.8035, "grad_norm": 0.4957767724990845, "learning_rate": 0.0002, "epoch": 0.6068431245965139, "step": 1880}, {"loss": 0.8145, "grad_norm": 0.4260980188846588, "learning_rate": 0.0002, "epoch": 0.6100710135571337, "step": 1890}, {"loss": 0.8363, "grad_norm": 0.4777357876300812, "learning_rate": 0.0002, "epoch": 0.6132989025177534, "step": 1900}, {"loss": 0.8404, "grad_norm": 0.4434216022491455, "learning_rate": 0.0002, "epoch": 0.6165267914783732, "step": 1910}, {"loss": 0.8057, "grad_norm": 0.5215433835983276, "learning_rate": 0.0002, "epoch": 0.6197546804389928, "step": 1920}, {"loss": 0.82, "grad_norm": 0.5143248438835144, "learning_rate": 0.0002, "epoch": 0.6229825693996126, "step": 1930}, {"loss": 0.8107, "grad_norm": 0.5213413238525391, "learning_rate": 0.0002, "epoch": 0.6262104583602324, "step": 1940}, {"loss": 0.7549, "grad_norm": 0.5408226251602173, "learning_rate": 0.0002, "epoch": 0.6294383473208521, "step": 1950}, {"loss": 0.8405, "grad_norm": 0.5479708909988403, "learning_rate": 0.0002, "epoch": 0.6326662362814719, "step": 1960}, {"loss": 0.8138, "grad_norm": 0.4490949809551239, "learning_rate": 0.0002, "epoch": 0.6358941252420917, "step": 1970}, {"loss": 0.854, "grad_norm": 0.48815059661865234, "learning_rate": 0.0002, "epoch": 0.6391220142027114, "step": 1980}, {"loss": 0.8568, "grad_norm": 0.46498045325279236, "learning_rate": 0.0002, "epoch": 0.6423499031633312, "step": 1990}, {"loss": 0.8263, "grad_norm": 0.5136561393737793, "learning_rate": 0.0002, "epoch": 0.6455777921239509, "step": 2000}, {"loss": 0.8503, "grad_norm": 0.5145719647407532, "learning_rate": 0.0002, "epoch": 0.6488056810845707, "step": 2010}, {"loss": 0.8456, "grad_norm": 0.5430373549461365, "learning_rate": 0.0002, "epoch": 0.6520335700451905, "step": 2020}, {"loss": 0.8115, "grad_norm": 0.46347954869270325, "learning_rate": 0.0002, "epoch": 0.6552614590058102, "step": 2030}, {"loss": 0.8769, "grad_norm": 0.5189562439918518, "learning_rate": 0.0002, "epoch": 0.65848934796643, "step": 2040}, {"loss": 0.8453, "grad_norm": 0.43843990564346313, "learning_rate": 0.0002, "epoch": 0.6617172369270498, "step": 2050}, {"loss": 0.7951, "grad_norm": 0.4654983580112457, "learning_rate": 0.0002, "epoch": 0.6649451258876695, "step": 2060}, {"loss": 0.8308, "grad_norm": 0.44835716485977173, "learning_rate": 0.0002, "epoch": 0.6681730148482892, "step": 2070}, {"loss": 0.8181, "grad_norm": 0.38811734318733215, "learning_rate": 0.0002, "epoch": 0.6714009038089089, "step": 2080}, {"loss": 0.762, "grad_norm": 0.5709853172302246, "learning_rate": 0.0002, "epoch": 0.6746287927695287, "step": 2090}, {"loss": 0.8334, "grad_norm": 0.49994757771492004, "learning_rate": 0.0002, "epoch": 0.6778566817301485, "step": 2100}, {"loss": 0.8, "grad_norm": 0.5505402684211731, "learning_rate": 0.0002, "epoch": 0.6810845706907682, "step": 2110}, {"loss": 0.8227, "grad_norm": 0.48195120692253113, "learning_rate": 0.0002, "epoch": 0.684312459651388, "step": 2120}, {"loss": 0.7879, "grad_norm": 0.4854775071144104, "learning_rate": 0.0002, "epoch": 0.6875403486120077, "step": 2130}, {"loss": 0.8231, "grad_norm": 0.6422494649887085, "learning_rate": 0.0002, "epoch": 0.6907682375726275, "step": 2140}, {"loss": 0.8353, "grad_norm": 0.3972536027431488, "learning_rate": 0.0002, "epoch": 0.6939961265332473, "step": 2150}, {"loss": 0.8068, "grad_norm": 0.4297836422920227, "learning_rate": 0.0002, "epoch": 0.697224015493867, "step": 2160}, {"loss": 0.8017, "grad_norm": 0.45486778020858765, "learning_rate": 0.0002, "epoch": 0.7004519044544868, "step": 2170}, {"loss": 0.8507, "grad_norm": 0.4706047773361206, "learning_rate": 0.0002, "epoch": 0.7036797934151066, "step": 2180}, {"loss": 0.8234, "grad_norm": 0.46426892280578613, "learning_rate": 0.0002, "epoch": 0.7069076823757263, "step": 2190}, {"loss": 0.8472, "grad_norm": 0.46333715319633484, "learning_rate": 0.0002, "epoch": 0.7101355713363461, "step": 2200}, {"loss": 0.8247, "grad_norm": 0.4632524251937866, "learning_rate": 0.0002, "epoch": 0.7133634602969657, "step": 2210}, {"loss": 0.8452, "grad_norm": 0.4610830843448639, "learning_rate": 0.0002, "epoch": 0.7165913492575855, "step": 2220}, {"loss": 0.7338, "grad_norm": 0.4905324876308441, "learning_rate": 0.0002, "epoch": 0.7198192382182053, "step": 2230}, {"loss": 0.7715, "grad_norm": 0.4936263859272003, "learning_rate": 0.0002, "epoch": 0.723047127178825, "step": 2240}, {"loss": 0.8162, "grad_norm": 0.40778425335884094, "learning_rate": 0.0002, "epoch": 0.7262750161394448, "step": 2250}, {"loss": 0.828, "grad_norm": 0.50351482629776, "learning_rate": 0.0002, "epoch": 0.7295029051000645, "step": 2260}, {"loss": 0.8475, "grad_norm": 0.4894128143787384, "learning_rate": 0.0002, "epoch": 0.7327307940606843, "step": 2270}, {"loss": 0.8087, "grad_norm": 0.5580906271934509, "learning_rate": 0.0002, "epoch": 0.7359586830213041, "step": 2280}, {"loss": 0.8157, "grad_norm": 0.4655369520187378, "learning_rate": 0.0002, "epoch": 0.7391865719819238, "step": 2290}, {"loss": 0.8395, "grad_norm": 0.4666965901851654, "learning_rate": 0.0002, "epoch": 0.7424144609425436, "step": 2300}, {"loss": 0.7605, "grad_norm": 0.46259936690330505, "learning_rate": 0.0002, "epoch": 0.7456423499031634, "step": 2310}, {"loss": 0.7849, "grad_norm": 0.520706832408905, "learning_rate": 0.0002, "epoch": 0.7488702388637831, "step": 2320}, {"loss": 0.8173, "grad_norm": 0.5142408013343811, "learning_rate": 0.0002, "epoch": 0.7520981278244029, "step": 2330}, {"loss": 0.7782, "grad_norm": 0.5355164408683777, "learning_rate": 0.0002, "epoch": 0.7553260167850226, "step": 2340}, {"loss": 0.8242, "grad_norm": 0.5517185926437378, "learning_rate": 0.0002, "epoch": 0.7585539057456423, "step": 2350}, {"loss": 0.8404, "grad_norm": 0.7162677049636841, "learning_rate": 0.0002, "epoch": 0.7617817947062621, "step": 2360}, {"loss": 0.8455, "grad_norm": 0.42402133345603943, "learning_rate": 0.0002, "epoch": 0.7650096836668818, "step": 2370}, {"loss": 0.8214, "grad_norm": 0.47180113196372986, "learning_rate": 0.0002, "epoch": 0.7682375726275016, "step": 2380}, {"loss": 0.8274, "grad_norm": 0.6262288689613342, "learning_rate": 0.0002, "epoch": 0.7714654615881213, "step": 2390}, {"loss": 0.7915, "grad_norm": 0.5177528262138367, "learning_rate": 0.0002, "epoch": 0.7746933505487411, "step": 2400}, {"loss": 0.7631, "grad_norm": 0.555721640586853, "learning_rate": 0.0002, "epoch": 0.7779212395093609, "step": 2410}, {"loss": 0.795, "grad_norm": 0.5592644810676575, "learning_rate": 0.0002, "epoch": 0.7811491284699806, "step": 2420}, {"loss": 0.8081, "grad_norm": 0.38025397062301636, "learning_rate": 0.0002, "epoch": 0.7843770174306004, "step": 2430}, {"loss": 0.7851, "grad_norm": 0.4597472548484802, "learning_rate": 0.0002, "epoch": 0.7876049063912202, "step": 2440}, {"loss": 0.8575, "grad_norm": 0.4929825961589813, "learning_rate": 0.0002, "epoch": 0.7908327953518399, "step": 2450}, {"loss": 0.7584, "grad_norm": 0.45277655124664307, "learning_rate": 0.0002, "epoch": 0.7940606843124597, "step": 2460}, {"loss": 0.8208, "grad_norm": 0.6224122643470764, "learning_rate": 0.0002, "epoch": 0.7972885732730794, "step": 2470}, {"loss": 0.8449, "grad_norm": 0.5740901827812195, "learning_rate": 0.0002, "epoch": 0.8005164622336992, "step": 2480}, {"loss": 0.7834, "grad_norm": 0.41335329413414, "learning_rate": 0.0002, "epoch": 0.8037443511943189, "step": 2490}, {"loss": 0.7768, "grad_norm": 0.4738694131374359, "learning_rate": 0.0002, "epoch": 0.8069722401549386, "step": 2500}, {"loss": 0.7927, "grad_norm": 0.5288197994232178, "learning_rate": 0.0002, "epoch": 0.8102001291155584, "step": 2510}, {"loss": 0.8334, "grad_norm": 0.5404666066169739, "learning_rate": 0.0002, "epoch": 0.8134280180761781, "step": 2520}, {"loss": 0.7998, "grad_norm": 0.4444909691810608, "learning_rate": 0.0002, "epoch": 0.8166559070367979, "step": 2530}, {"loss": 0.8683, "grad_norm": 0.542061448097229, "learning_rate": 0.0002, "epoch": 0.8198837959974177, "step": 2540}, {"loss": 0.8038, "grad_norm": 0.4914741814136505, "learning_rate": 0.0002, "epoch": 0.8231116849580374, "step": 2550}, {"loss": 0.7899, "grad_norm": 0.41703441739082336, "learning_rate": 0.0002, "epoch": 0.8263395739186572, "step": 2560}, {"loss": 0.824, "grad_norm": 0.5489841103553772, "learning_rate": 0.0002, "epoch": 0.829567462879277, "step": 2570}, {"loss": 0.8157, "grad_norm": 0.5359883308410645, "learning_rate": 0.0002, "epoch": 0.8327953518398967, "step": 2580}, {"loss": 0.8122, "grad_norm": 0.5541019439697266, "learning_rate": 0.0002, "epoch": 0.8360232408005165, "step": 2590}, {"loss": 0.797, "grad_norm": 0.4746638834476471, "learning_rate": 0.0002, "epoch": 0.8392511297611362, "step": 2600}, {"loss": 0.8116, "grad_norm": 0.5243194103240967, "learning_rate": 0.0002, "epoch": 0.842479018721756, "step": 2610}, {"loss": 0.8173, "grad_norm": 0.46824976801872253, "learning_rate": 0.0002, "epoch": 0.8457069076823758, "step": 2620}, {"loss": 0.7525, "grad_norm": 0.49487847089767456, "learning_rate": 0.0002, "epoch": 0.8489347966429954, "step": 2630}, {"loss": 0.8296, "grad_norm": 0.42180097103118896, "learning_rate": 0.0002, "epoch": 0.8521626856036152, "step": 2640}, {"loss": 0.8304, "grad_norm": 0.5516560077667236, "learning_rate": 0.0002, "epoch": 0.855390574564235, "step": 2650}, {"loss": 0.7882, "grad_norm": 0.4392191767692566, "learning_rate": 0.0002, "epoch": 0.8586184635248547, "step": 2660}, {"loss": 0.848, "grad_norm": 0.5387210845947266, "learning_rate": 0.0002, "epoch": 0.8618463524854745, "step": 2670}, {"loss": 0.8094, "grad_norm": 0.6232406497001648, "learning_rate": 0.0002, "epoch": 0.8650742414460942, "step": 2680}, {"loss": 0.768, "grad_norm": 0.53749018907547, "learning_rate": 0.0002, "epoch": 0.868302130406714, "step": 2690}, {"loss": 0.8299, "grad_norm": 0.47480374574661255, "learning_rate": 0.0002, "epoch": 0.8715300193673338, "step": 2700}, {"loss": 0.8055, "grad_norm": 0.44618046283721924, "learning_rate": 0.0002, "epoch": 0.8747579083279535, "step": 2710}, {"loss": 0.8015, "grad_norm": 0.4173581302165985, "learning_rate": 0.0002, "epoch": 0.8779857972885733, "step": 2720}, {"loss": 0.7713, "grad_norm": 0.524081289768219, "learning_rate": 0.0002, "epoch": 0.881213686249193, "step": 2730}, {"loss": 0.8738, "grad_norm": 0.5608431100845337, "learning_rate": 0.0002, "epoch": 0.8844415752098128, "step": 2740}, {"loss": 0.8513, "grad_norm": 0.5212284922599792, "learning_rate": 0.0002, "epoch": 0.8876694641704326, "step": 2750}, {"loss": 0.8139, "grad_norm": 0.5601475834846497, "learning_rate": 0.0002, "epoch": 0.8908973531310523, "step": 2760}, {"loss": 0.7947, "grad_norm": 0.4499223828315735, "learning_rate": 0.0002, "epoch": 0.8941252420916721, "step": 2770}, {"loss": 0.8559, "grad_norm": 0.46945226192474365, "learning_rate": 0.0002, "epoch": 0.8973531310522918, "step": 2780}, {"loss": 0.801, "grad_norm": 0.4837495684623718, "learning_rate": 0.0002, "epoch": 0.9005810200129115, "step": 2790}, {"loss": 0.7887, "grad_norm": 0.5059258937835693, "learning_rate": 0.0002, "epoch": 0.9038089089735313, "step": 2800}, {"loss": 0.8571, "grad_norm": 0.4857945144176483, "learning_rate": 0.0002, "epoch": 0.907036797934151, "step": 2810}, {"loss": 0.8301, "grad_norm": 0.5001962780952454, "learning_rate": 0.0002, "epoch": 0.9102646868947708, "step": 2820}, {"loss": 0.8236, "grad_norm": 0.5468648672103882, "learning_rate": 0.0002, "epoch": 0.9134925758553906, "step": 2830}, {"loss": 0.8071, "grad_norm": 0.5533056259155273, "learning_rate": 0.0002, "epoch": 0.9167204648160103, "step": 2840}, {"loss": 0.7895, "grad_norm": 0.5909785628318787, "learning_rate": 0.0002, "epoch": 0.9199483537766301, "step": 2850}, {"loss": 0.796, "grad_norm": 0.47428104281425476, "learning_rate": 0.0002, "epoch": 0.9231762427372499, "step": 2860}, {"loss": 0.7845, "grad_norm": 0.548814058303833, "learning_rate": 0.0002, "epoch": 0.9264041316978696, "step": 2870}, {"loss": 0.7871, "grad_norm": 0.5576745271682739, "learning_rate": 0.0002, "epoch": 0.9296320206584894, "step": 2880}, {"loss": 0.8399, "grad_norm": 0.47094792127609253, "learning_rate": 0.0002, "epoch": 0.9328599096191091, "step": 2890}, {"loss": 0.805, "grad_norm": 0.5408539772033691, "learning_rate": 0.0002, "epoch": 0.9360877985797289, "step": 2900}, {"loss": 0.785, "grad_norm": 0.5922889113426208, "learning_rate": 0.0002, "epoch": 0.9393156875403487, "step": 2910}, {"loss": 0.8043, "grad_norm": 0.45462584495544434, "learning_rate": 0.0002, "epoch": 0.9425435765009683, "step": 2920}, {"loss": 0.8344, "grad_norm": 0.6864947080612183, "learning_rate": 0.0002, "epoch": 0.9457714654615881, "step": 2930}, {"loss": 0.8166, "grad_norm": 0.4706299304962158, "learning_rate": 0.0002, "epoch": 0.9489993544222078, "step": 2940}, {"loss": 0.8422, "grad_norm": 0.5583269596099854, "learning_rate": 0.0002, "epoch": 0.9522272433828276, "step": 2950}, {"loss": 0.836, "grad_norm": 0.51015704870224, "learning_rate": 0.0002, "epoch": 0.9554551323434474, "step": 2960}, {"loss": 0.8371, "grad_norm": 0.5325582027435303, "learning_rate": 0.0002, "epoch": 0.9586830213040671, "step": 2970}, {"loss": 0.7593, "grad_norm": 0.49008598923683167, "learning_rate": 0.0002, "epoch": 0.9619109102646869, "step": 2980}, {"loss": 0.8093, "grad_norm": 0.4422132074832916, "learning_rate": 0.0002, "epoch": 0.9651387992253067, "step": 2990}, {"loss": 0.7966, "grad_norm": 0.5053589344024658, "learning_rate": 0.0002, "epoch": 0.9683666881859264, "step": 3000}, {"loss": 0.8081, "grad_norm": 0.46754521131515503, "learning_rate": 0.0002, "epoch": 0.9715945771465462, "step": 3010}, {"loss": 0.8377, "grad_norm": 0.5613434910774231, "learning_rate": 0.0002, "epoch": 0.9748224661071659, "step": 3020}, {"loss": 0.7856, "grad_norm": 0.5052843689918518, "learning_rate": 0.0002, "epoch": 0.9780503550677857, "step": 3030}, {"loss": 0.8412, "grad_norm": 0.4270972013473511, "learning_rate": 0.0002, "epoch": 0.9812782440284055, "step": 3040}, {"loss": 0.8353, "grad_norm": 0.4974991977214813, "learning_rate": 0.0002, "epoch": 0.9845061329890252, "step": 3050}, {"loss": 0.8415, "grad_norm": 0.4432311952114105, "learning_rate": 0.0002, "epoch": 0.9877340219496449, "step": 3060}, {"loss": 0.7764, "grad_norm": 0.466457724571228, "learning_rate": 0.0002, "epoch": 0.9909619109102646, "step": 3070}, {"loss": 0.8067, "grad_norm": 0.6438009142875671, "learning_rate": 0.0002, "epoch": 0.9941897998708844, "step": 3080}, {"loss": 0.8425, "grad_norm": 0.5593604445457458, "learning_rate": 0.0002, "epoch": 0.9974176888315042, "step": 3090}, {"eval_loss": 1.0958120822906494, "eval_runtime": 148.3273, "eval_samples_per_second": 4.942, "eval_steps_per_second": 0.62, "epoch": 1.0, "step": 3098}, {"loss": 0.8275, "grad_norm": 0.5701445937156677, "learning_rate": 0.0002, "epoch": 1.000645577792124, "step": 3100}, {"loss": 0.7756, "grad_norm": 0.6089657545089722, "learning_rate": 0.0002, "epoch": 1.0038734667527438, "step": 3110}, {"loss": 0.7492, "grad_norm": 0.5619552135467529, "learning_rate": 0.0002, "epoch": 1.0071013557133635, "step": 3120}, {"loss": 0.7544, "grad_norm": 0.5550283789634705, "learning_rate": 0.0002, "epoch": 1.010329244673983, "step": 3130}, {"loss": 0.8006, "grad_norm": 0.6221792101860046, "learning_rate": 0.0002, "epoch": 1.013557133634603, "step": 3140}, {"loss": 0.7603, "grad_norm": 0.5450758934020996, "learning_rate": 0.0002, "epoch": 1.0167850225952226, "step": 3150}, {"loss": 0.7021, "grad_norm": 0.4359588027000427, "learning_rate": 0.0002, "epoch": 1.0200129115558425, "step": 3160}, {"loss": 0.7468, "grad_norm": 0.5932239890098572, "learning_rate": 0.0002, "epoch": 1.0232408005164622, "step": 3170}, {"loss": 0.7649, "grad_norm": 0.45478707551956177, "learning_rate": 0.0002, "epoch": 1.026468689477082, "step": 3180}, {"loss": 0.7355, "grad_norm": 0.677615761756897, "learning_rate": 0.0002, "epoch": 1.0296965784377017, "step": 3190}, {"loss": 0.6928, "grad_norm": 0.6231790781021118, "learning_rate": 0.0002, "epoch": 1.0329244673983216, "step": 3200}, {"loss": 0.7471, "grad_norm": 0.5074195861816406, "learning_rate": 0.0002, "epoch": 1.0361523563589412, "step": 3210}, {"loss": 0.6864, "grad_norm": 0.4844142198562622, "learning_rate": 0.0002, "epoch": 1.039380245319561, "step": 3220}, {"loss": 0.7655, "grad_norm": 0.5372750759124756, "learning_rate": 0.0002, "epoch": 1.0426081342801807, "step": 3230}, {"loss": 0.7384, "grad_norm": 0.46296265721321106, "learning_rate": 0.0002, "epoch": 1.0458360232408006, "step": 3240}, {"loss": 0.7894, "grad_norm": 0.5417148470878601, "learning_rate": 0.0002, "epoch": 1.0490639122014203, "step": 3250}, {"loss": 0.7637, "grad_norm": 0.5695074200630188, "learning_rate": 0.0002, "epoch": 1.0522918011620401, "step": 3260}, {"loss": 0.7456, "grad_norm": 0.5050092935562134, "learning_rate": 0.0002, "epoch": 1.0555196901226598, "step": 3270}, {"loss": 0.6805, "grad_norm": 0.5320752263069153, "learning_rate": 0.0002, "epoch": 1.0587475790832794, "step": 3280}, {"loss": 0.7419, "grad_norm": 0.5832052230834961, "learning_rate": 0.0002, "epoch": 1.0619754680438993, "step": 3290}, {"loss": 0.7656, "grad_norm": 0.5228804349899292, "learning_rate": 0.0002, "epoch": 1.065203357004519, "step": 3300}, {"loss": 0.6834, "grad_norm": 0.5819445252418518, "learning_rate": 0.0002, "epoch": 1.0684312459651388, "step": 3310}, {"loss": 0.7093, "grad_norm": 0.4201328754425049, "learning_rate": 0.0002, "epoch": 1.0716591349257585, "step": 3320}, {"loss": 0.7494, "grad_norm": 0.5424145460128784, "learning_rate": 0.0002, "epoch": 1.0748870238863784, "step": 3330}, {"loss": 0.7828, "grad_norm": 0.6169946789741516, "learning_rate": 0.0002, "epoch": 1.078114912846998, "step": 3340}, {"loss": 0.7505, "grad_norm": 0.607676088809967, "learning_rate": 0.0002, "epoch": 1.0813428018076179, "step": 3350}, {"loss": 0.7315, "grad_norm": 0.5191982388496399, "learning_rate": 0.0002, "epoch": 1.0845706907682375, "step": 3360}, {"loss": 0.7699, "grad_norm": 0.5728003978729248, "learning_rate": 0.0002, "epoch": 1.0877985797288574, "step": 3370}, {"loss": 0.7381, "grad_norm": 0.5402643084526062, "learning_rate": 0.0002, "epoch": 1.091026468689477, "step": 3380}, {"loss": 0.7208, "grad_norm": 0.5377541780471802, "learning_rate": 0.0002, "epoch": 1.094254357650097, "step": 3390}, {"loss": 0.7672, "grad_norm": 0.4751385748386383, "learning_rate": 0.0002, "epoch": 1.0974822466107166, "step": 3400}, {"loss": 0.7326, "grad_norm": 0.559158444404602, "learning_rate": 0.0002, "epoch": 1.1007101355713362, "step": 3410}, {"loss": 0.7366, "grad_norm": 0.4917701482772827, "learning_rate": 0.0002, "epoch": 1.103938024531956, "step": 3420}, {"loss": 0.7593, "grad_norm": 0.5507875084877014, "learning_rate": 0.0002, "epoch": 1.1071659134925758, "step": 3430}, {"loss": 0.7424, "grad_norm": 0.45458680391311646, "learning_rate": 0.0002, "epoch": 1.1103938024531956, "step": 3440}, {"loss": 0.7234, "grad_norm": 0.5721744894981384, "learning_rate": 0.0002, "epoch": 1.1136216914138153, "step": 3450}, {"loss": 0.7219, "grad_norm": 0.5776081681251526, "learning_rate": 0.0002, "epoch": 1.1168495803744352, "step": 3460}, {"loss": 0.7644, "grad_norm": 0.5261953473091125, "learning_rate": 0.0002, "epoch": 1.1200774693350548, "step": 3470}, {"loss": 0.6586, "grad_norm": 0.47759532928466797, "learning_rate": 0.0002, "epoch": 1.1233053582956747, "step": 3480}, {"loss": 0.7641, "grad_norm": 0.5697659850120544, "learning_rate": 0.0002, "epoch": 1.1265332472562943, "step": 3490}, {"loss": 0.7017, "grad_norm": 0.5643419623374939, "learning_rate": 0.0002, "epoch": 1.1297611362169142, "step": 3500}, {"loss": 0.7235, "grad_norm": 0.6502931118011475, "learning_rate": 0.0002, "epoch": 1.1329890251775339, "step": 3510}, {"loss": 0.7662, "grad_norm": 0.5236507654190063, "learning_rate": 0.0002, "epoch": 1.1362169141381537, "step": 3520}, {"loss": 0.7571, "grad_norm": 0.6521499156951904, "learning_rate": 0.0002, "epoch": 1.1394448030987734, "step": 3530}, {"loss": 0.7304, "grad_norm": 0.5893217325210571, "learning_rate": 0.0002, "epoch": 1.142672692059393, "step": 3540}, {"loss": 0.7508, "grad_norm": 0.5300073027610779, "learning_rate": 0.0002, "epoch": 1.145900581020013, "step": 3550}, {"loss": 0.6937, "grad_norm": 0.6794660091400146, "learning_rate": 0.0002, "epoch": 1.1491284699806328, "step": 3560}, {"loss": 0.7614, "grad_norm": 0.5420064926147461, "learning_rate": 0.0002, "epoch": 1.1523563589412524, "step": 3570}, {"loss": 0.7648, "grad_norm": 0.5096590518951416, "learning_rate": 0.0002, "epoch": 1.155584247901872, "step": 3580}, {"loss": 0.7436, "grad_norm": 0.5726043581962585, "learning_rate": 0.0002, "epoch": 1.158812136862492, "step": 3590}, {"loss": 0.7728, "grad_norm": 0.7388110160827637, "learning_rate": 0.0002, "epoch": 1.1620400258231116, "step": 3600}, {"loss": 0.7421, "grad_norm": 0.5597969889640808, "learning_rate": 0.0002, "epoch": 1.1652679147837315, "step": 3610}, {"loss": 0.7132, "grad_norm": 0.5067800283432007, "learning_rate": 0.0002, "epoch": 1.1684958037443511, "step": 3620}, {"loss": 0.7893, "grad_norm": 0.6625118255615234, "learning_rate": 0.0002, "epoch": 1.171723692704971, "step": 3630}, {"loss": 0.7611, "grad_norm": 0.5830849409103394, "learning_rate": 0.0002, "epoch": 1.1749515816655907, "step": 3640}, {"loss": 0.7973, "grad_norm": 0.6140692830085754, "learning_rate": 0.0002, "epoch": 1.1781794706262105, "step": 3650}, {"loss": 0.7617, "grad_norm": 0.714523434638977, "learning_rate": 0.0002, "epoch": 1.1814073595868302, "step": 3660}, {"loss": 0.7092, "grad_norm": 0.5196696519851685, "learning_rate": 0.0002, "epoch": 1.18463524854745, "step": 3670}, {"loss": 0.7821, "grad_norm": 0.6677889823913574, "learning_rate": 0.0002, "epoch": 1.1878631375080697, "step": 3680}, {"loss": 0.7813, "grad_norm": 0.47095245122909546, "learning_rate": 0.0002, "epoch": 1.1910910264686896, "step": 3690}, {"loss": 0.7702, "grad_norm": 0.5197778940200806, "learning_rate": 0.0002, "epoch": 1.1943189154293092, "step": 3700}, {"loss": 0.7349, "grad_norm": 0.5156530141830444, "learning_rate": 0.0002, "epoch": 1.1975468043899289, "step": 3710}, {"loss": 0.7738, "grad_norm": 0.6968549489974976, "learning_rate": 0.0002, "epoch": 1.2007746933505488, "step": 3720}, {"loss": 0.7599, "grad_norm": 0.48983848094940186, "learning_rate": 0.0002, "epoch": 1.2040025823111684, "step": 3730}, {"loss": 0.7163, "grad_norm": 0.6709973216056824, "learning_rate": 0.0002, "epoch": 1.2072304712717883, "step": 3740}, {"loss": 0.7632, "grad_norm": 0.48681750893592834, "learning_rate": 0.0002, "epoch": 1.210458360232408, "step": 3750}, {"loss": 0.7039, "grad_norm": 0.49475061893463135, "learning_rate": 0.0002, "epoch": 1.2136862491930278, "step": 3760}, {"loss": 0.7372, "grad_norm": 0.6163983345031738, "learning_rate": 0.0002, "epoch": 1.2169141381536475, "step": 3770}, {"loss": 0.757, "grad_norm": 0.5481411218643188, "learning_rate": 0.0002, "epoch": 1.2201420271142673, "step": 3780}, {"loss": 0.7601, "grad_norm": 0.620639979839325, "learning_rate": 0.0002, "epoch": 1.223369916074887, "step": 3790}, {"loss": 0.7738, "grad_norm": 0.7017222046852112, "learning_rate": 0.0002, "epoch": 1.2265978050355069, "step": 3800}, {"loss": 0.7468, "grad_norm": 0.5872400403022766, "learning_rate": 0.0002, "epoch": 1.2298256939961265, "step": 3810}, {"loss": 0.7854, "grad_norm": 0.45765596628189087, "learning_rate": 0.0002, "epoch": 1.2330535829567464, "step": 3820}, {"loss": 0.7865, "grad_norm": 0.5676377415657043, "learning_rate": 0.0002, "epoch": 1.236281471917366, "step": 3830}, {"loss": 0.7696, "grad_norm": 0.4793425500392914, "learning_rate": 0.0002, "epoch": 1.2395093608779857, "step": 3840}, {"loss": 0.7065, "grad_norm": 0.5060022473335266, "learning_rate": 0.0002, "epoch": 1.2427372498386056, "step": 3850}, {"loss": 0.7333, "grad_norm": 0.6140682697296143, "learning_rate": 0.0002, "epoch": 1.2459651387992252, "step": 3860}, {"loss": 0.7496, "grad_norm": 0.5030326843261719, "learning_rate": 0.0002, "epoch": 1.249193027759845, "step": 3870}, {"loss": 0.7226, "grad_norm": 0.6609430909156799, "learning_rate": 0.0002, "epoch": 1.2524209167204647, "step": 3880}, {"loss": 0.7212, "grad_norm": 0.5459545850753784, "learning_rate": 0.0002, "epoch": 1.2556488056810846, "step": 3890}, {"loss": 0.7145, "grad_norm": 0.5328870415687561, "learning_rate": 0.0002, "epoch": 1.2588766946417043, "step": 3900}, {"loss": 0.7572, "grad_norm": 0.5840652585029602, "learning_rate": 0.0002, "epoch": 1.2621045836023241, "step": 3910}, {"loss": 0.7624, "grad_norm": 0.5587584376335144, "learning_rate": 0.0002, "epoch": 1.2653324725629438, "step": 3920}, {"loss": 0.7846, "grad_norm": 0.5886949896812439, "learning_rate": 0.0002, "epoch": 1.2685603615235637, "step": 3930}, {"loss": 0.7251, "grad_norm": 0.5128693580627441, "learning_rate": 0.0002, "epoch": 1.2717882504841833, "step": 3940}, {"loss": 0.7032, "grad_norm": 0.6207669377326965, "learning_rate": 0.0002, "epoch": 1.2750161394448032, "step": 3950}, {"loss": 0.7506, "grad_norm": 0.5789574384689331, "learning_rate": 0.0002, "epoch": 1.2782440284054228, "step": 3960}, {"loss": 0.7574, "grad_norm": 0.503162145614624, "learning_rate": 0.0002, "epoch": 1.2814719173660425, "step": 3970}, {"loss": 0.7489, "grad_norm": 0.6670064926147461, "learning_rate": 0.0002, "epoch": 1.2846998063266624, "step": 3980}, {"loss": 0.7198, "grad_norm": 0.5676213502883911, "learning_rate": 0.0002, "epoch": 1.2879276952872822, "step": 3990}, {"loss": 0.7892, "grad_norm": 0.5383169054985046, "learning_rate": 0.0002, "epoch": 1.2911555842479019, "step": 4000}, {"loss": 0.7432, "grad_norm": 0.714743971824646, "learning_rate": 0.0002, "epoch": 1.2943834732085215, "step": 4010}, {"loss": 0.7594, "grad_norm": 0.5740262269973755, "learning_rate": 0.0002, "epoch": 1.2976113621691414, "step": 4020}, {"loss": 0.7564, "grad_norm": 0.6143045425415039, "learning_rate": 0.0002, "epoch": 1.300839251129761, "step": 4030}, {"loss": 0.7181, "grad_norm": 0.501025378704071, "learning_rate": 0.0002, "epoch": 1.304067140090381, "step": 4040}, {"loss": 0.7099, "grad_norm": 0.5784100294113159, "learning_rate": 0.0002, "epoch": 1.3072950290510006, "step": 4050}, {"loss": 0.7403, "grad_norm": 0.6182606220245361, "learning_rate": 0.0002, "epoch": 1.3105229180116205, "step": 4060}, {"loss": 0.7249, "grad_norm": 0.5072231292724609, "learning_rate": 0.0002, "epoch": 1.3137508069722401, "step": 4070}, {"loss": 0.7451, "grad_norm": 0.6841012835502625, "learning_rate": 0.0002, "epoch": 1.31697869593286, "step": 4080}, {"loss": 0.7395, "grad_norm": 0.697257936000824, "learning_rate": 0.0002, "epoch": 1.3202065848934796, "step": 4090}, {"loss": 0.7401, "grad_norm": 0.5113214254379272, "learning_rate": 0.0002, "epoch": 1.3234344738540993, "step": 4100}, {"loss": 0.7336, "grad_norm": 0.6270561814308167, "learning_rate": 0.0002, "epoch": 1.3266623628147192, "step": 4110}, {"loss": 0.7535, "grad_norm": 0.5525947213172913, "learning_rate": 0.0002, "epoch": 1.329890251775339, "step": 4120}, {"loss": 0.6999, "grad_norm": 0.546071469783783, "learning_rate": 0.0002, "epoch": 1.3331181407359587, "step": 4130}, {"loss": 0.7884, "grad_norm": 0.6516721248626709, "learning_rate": 0.0002, "epoch": 1.3363460296965783, "step": 4140}, {"loss": 0.755, "grad_norm": 0.6235111355781555, "learning_rate": 0.0002, "epoch": 1.3395739186571982, "step": 4150}, {"loss": 0.7467, "grad_norm": 0.538649320602417, "learning_rate": 0.0002, "epoch": 1.3428018076178179, "step": 4160}, {"loss": 0.7368, "grad_norm": 0.5367001891136169, "learning_rate": 0.0002, "epoch": 1.3460296965784377, "step": 4170}, {"loss": 0.7536, "grad_norm": 0.6134631037712097, "learning_rate": 0.0002, "epoch": 1.3492575855390574, "step": 4180}, {"loss": 0.8245, "grad_norm": 0.5827262997627258, "learning_rate": 0.0002, "epoch": 1.3524854744996773, "step": 4190}, {"loss": 0.7288, "grad_norm": 0.5706096291542053, "learning_rate": 0.0002, "epoch": 1.355713363460297, "step": 4200}, {"loss": 0.7302, "grad_norm": 0.6422057151794434, "learning_rate": 0.0002, "epoch": 1.3589412524209168, "step": 4210}, {"loss": 0.7303, "grad_norm": 0.6316141486167908, "learning_rate": 0.0002, "epoch": 1.3621691413815364, "step": 4220}, {"loss": 0.7457, "grad_norm": 0.6946983933448792, "learning_rate": 0.0002, "epoch": 1.365397030342156, "step": 4230}, {"loss": 0.7388, "grad_norm": 0.5381525754928589, "learning_rate": 0.0002, "epoch": 1.368624919302776, "step": 4240}, {"loss": 0.73, "grad_norm": 0.5484845638275146, "learning_rate": 0.0002, "epoch": 1.3718528082633958, "step": 4250}, {"loss": 0.7584, "grad_norm": 0.5961896777153015, "learning_rate": 0.0002, "epoch": 1.3750806972240155, "step": 4260}, {"loss": 0.8006, "grad_norm": 0.6041752696037292, "learning_rate": 0.0002, "epoch": 1.3783085861846351, "step": 4270}, {"loss": 0.7276, "grad_norm": 0.6283464431762695, "learning_rate": 0.0002, "epoch": 1.381536475145255, "step": 4280}, {"loss": 0.757, "grad_norm": 0.6761324405670166, "learning_rate": 0.0002, "epoch": 1.384764364105875, "step": 4290}, {"loss": 0.7381, "grad_norm": 0.504311203956604, "learning_rate": 0.0002, "epoch": 1.3879922530664945, "step": 4300}, {"loss": 0.7536, "grad_norm": 0.6100395917892456, "learning_rate": 0.0002, "epoch": 1.3912201420271142, "step": 4310}, {"loss": 0.7103, "grad_norm": 0.6245788335800171, "learning_rate": 0.0002, "epoch": 1.394448030987734, "step": 4320}, {"loss": 0.7505, "grad_norm": 0.6074621081352234, "learning_rate": 0.0002, "epoch": 1.3976759199483537, "step": 4330}, {"loss": 0.752, "grad_norm": 0.6683838963508606, "learning_rate": 0.0002, "epoch": 1.4009038089089736, "step": 4340}, {"loss": 0.7537, "grad_norm": 0.622998058795929, "learning_rate": 0.0002, "epoch": 1.4041316978695932, "step": 4350}, {"loss": 0.8148, "grad_norm": 0.6089423894882202, "learning_rate": 0.0002, "epoch": 1.4073595868302131, "step": 4360}, {"loss": 0.7715, "grad_norm": 0.6381658911705017, "learning_rate": 0.0002, "epoch": 1.4105874757908328, "step": 4370}, {"loss": 0.7871, "grad_norm": 0.5419308543205261, "learning_rate": 0.0002, "epoch": 1.4138153647514526, "step": 4380}, {"loss": 0.7386, "grad_norm": 0.6026232242584229, "learning_rate": 0.0002, "epoch": 1.4170432537120723, "step": 4390}, {"loss": 0.7529, "grad_norm": 0.4911101162433624, "learning_rate": 0.0002, "epoch": 1.420271142672692, "step": 4400}, {"loss": 0.7495, "grad_norm": 0.6302908062934875, "learning_rate": 0.0002, "epoch": 1.4234990316333118, "step": 4410}, {"loss": 0.7446, "grad_norm": 0.6692768931388855, "learning_rate": 0.0002, "epoch": 1.4267269205939317, "step": 4420}, {"loss": 0.7312, "grad_norm": 0.46294572949409485, "learning_rate": 0.0002, "epoch": 1.4299548095545513, "step": 4430}, {"loss": 0.7255, "grad_norm": 0.5452619194984436, "learning_rate": 0.0002, "epoch": 1.433182698515171, "step": 4440}, {"loss": 0.7974, "grad_norm": 0.7809233069419861, "learning_rate": 0.0002, "epoch": 1.4364105874757909, "step": 4450}, {"loss": 0.7103, "grad_norm": 0.550088107585907, "learning_rate": 0.0002, "epoch": 1.4396384764364105, "step": 4460}, {"loss": 0.7088, "grad_norm": 0.7139151096343994, "learning_rate": 0.0002, "epoch": 1.4428663653970304, "step": 4470}, {"loss": 0.7358, "grad_norm": 0.6187090873718262, "learning_rate": 0.0002, "epoch": 1.44609425435765, "step": 4480}, {"loss": 0.7608, "grad_norm": 0.5948249101638794, "learning_rate": 0.0002, "epoch": 1.44932214331827, "step": 4490}, {"loss": 0.7582, "grad_norm": 0.6510892510414124, "learning_rate": 0.0002, "epoch": 1.4525500322788896, "step": 4500}, {"loss": 0.7105, "grad_norm": 0.6552293300628662, "learning_rate": 0.0002, "epoch": 1.4557779212395094, "step": 4510}, {"loss": 0.7965, "grad_norm": 0.585574209690094, "learning_rate": 0.0002, "epoch": 1.459005810200129, "step": 4520}, {"loss": 0.761, "grad_norm": 0.4830162823200226, "learning_rate": 0.0002, "epoch": 1.4622336991607487, "step": 4530}, {"loss": 0.7424, "grad_norm": 0.5780223608016968, "learning_rate": 0.0002, "epoch": 1.4654615881213686, "step": 4540}, {"loss": 0.7518, "grad_norm": 0.5462607145309448, "learning_rate": 0.0002, "epoch": 1.4686894770819885, "step": 4550}, {"loss": 0.7342, "grad_norm": 0.5183546543121338, "learning_rate": 0.0002, "epoch": 1.4719173660426081, "step": 4560}, {"loss": 0.71, "grad_norm": 0.676917552947998, "learning_rate": 0.0002, "epoch": 1.4751452550032278, "step": 4570}, {"loss": 0.7875, "grad_norm": 0.5772345066070557, "learning_rate": 0.0002, "epoch": 1.4783731439638477, "step": 4580}, {"loss": 0.7709, "grad_norm": 0.7320035696029663, "learning_rate": 0.0002, "epoch": 1.4816010329244673, "step": 4590}, {"loss": 0.7601, "grad_norm": 0.5024042129516602, "learning_rate": 0.0002, "epoch": 1.4848289218850872, "step": 4600}, {"loss": 0.8061, "grad_norm": 0.5482868552207947, "learning_rate": 0.0002, "epoch": 1.4880568108457068, "step": 4610}, {"loss": 0.714, "grad_norm": 0.5447399616241455, "learning_rate": 0.0002, "epoch": 1.4912846998063267, "step": 4620}, {"loss": 0.7959, "grad_norm": 0.5953414440155029, "learning_rate": 0.0002, "epoch": 1.4945125887669464, "step": 4630}, {"loss": 0.7463, "grad_norm": 0.6983066201210022, "learning_rate": 0.0002, "epoch": 1.4977404777275662, "step": 4640}, {"loss": 0.7877, "grad_norm": 0.586327075958252, "learning_rate": 0.0002, "epoch": 1.500968366688186, "step": 4650}, {"loss": 0.7169, "grad_norm": 0.5839682221412659, "learning_rate": 0.0002, "epoch": 1.5041962556488055, "step": 4660}, {"loss": 0.7524, "grad_norm": 0.5959209203720093, "learning_rate": 0.0002, "epoch": 1.5074241446094254, "step": 4670}, {"loss": 0.7615, "grad_norm": 0.5073857307434082, "learning_rate": 0.0002, "epoch": 1.5106520335700453, "step": 4680}, {"loss": 0.7258, "grad_norm": 0.5183001160621643, "learning_rate": 0.0002, "epoch": 1.513879922530665, "step": 4690}, {"loss": 0.784, "grad_norm": 0.593530535697937, "learning_rate": 0.0002, "epoch": 1.5171078114912846, "step": 4700}, {"loss": 0.7722, "grad_norm": 0.675993025302887, "learning_rate": 0.0002, "epoch": 1.5203357004519045, "step": 4710}, {"loss": 0.7485, "grad_norm": 0.5823286771774292, "learning_rate": 0.0002, "epoch": 1.5235635894125243, "step": 4720}, {"loss": 0.7474, "grad_norm": 0.5825035572052002, "learning_rate": 0.0002, "epoch": 1.526791478373144, "step": 4730}, {"loss": 0.8287, "grad_norm": 0.5689691305160522, "learning_rate": 0.0002, "epoch": 1.5300193673337636, "step": 4740}, {"loss": 0.7279, "grad_norm": 0.6037150621414185, "learning_rate": 0.0002, "epoch": 1.5332472562943835, "step": 4750}, {"loss": 0.7865, "grad_norm": 0.6393677592277527, "learning_rate": 0.0002, "epoch": 1.5364751452550034, "step": 4760}, {"loss": 0.805, "grad_norm": 0.5926381945610046, "learning_rate": 0.0002, "epoch": 1.539703034215623, "step": 4770}, {"loss": 0.7425, "grad_norm": 0.9468599557876587, "learning_rate": 0.0002, "epoch": 1.5429309231762427, "step": 4780}, {"loss": 0.7565, "grad_norm": 0.7544237375259399, "learning_rate": 0.0002, "epoch": 1.5461588121368623, "step": 4790}, {"loss": 0.7398, "grad_norm": 0.5308566093444824, "learning_rate": 0.0002, "epoch": 1.5493867010974822, "step": 4800}, {"loss": 0.7756, "grad_norm": 0.6590296030044556, "learning_rate": 0.0002, "epoch": 1.552614590058102, "step": 4810}, {"loss": 0.7212, "grad_norm": 0.5630404353141785, "learning_rate": 0.0002, "epoch": 1.5558424790187217, "step": 4820}, {"loss": 0.7593, "grad_norm": 0.6800200939178467, "learning_rate": 0.0002, "epoch": 1.5590703679793414, "step": 4830}, {"loss": 0.7373, "grad_norm": 0.5463718175888062, "learning_rate": 0.0002, "epoch": 1.5622982569399613, "step": 4840}, {"loss": 0.7519, "grad_norm": 0.505135178565979, "learning_rate": 0.0002, "epoch": 1.5655261459005811, "step": 4850}, {"loss": 0.8122, "grad_norm": 0.5469676852226257, "learning_rate": 0.0002, "epoch": 1.5687540348612008, "step": 4860}, {"loss": 0.7185, "grad_norm": 0.5318337678909302, "learning_rate": 0.0002, "epoch": 1.5719819238218204, "step": 4870}, {"loss": 0.7324, "grad_norm": 0.7287914752960205, "learning_rate": 0.0002, "epoch": 1.5752098127824403, "step": 4880}, {"loss": 0.7532, "grad_norm": 0.7318989038467407, "learning_rate": 0.0002, "epoch": 1.5784377017430602, "step": 4890}, {"loss": 0.7851, "grad_norm": 0.6499921679496765, "learning_rate": 0.0002, "epoch": 1.5816655907036798, "step": 4900}, {"loss": 0.753, "grad_norm": 0.47907355427742004, "learning_rate": 0.0002, "epoch": 1.5848934796642995, "step": 4910}, {"loss": 0.7699, "grad_norm": 0.7338833808898926, "learning_rate": 0.0002, "epoch": 1.5881213686249191, "step": 4920}, {"loss": 0.7592, "grad_norm": 0.5800719261169434, "learning_rate": 0.0002, "epoch": 1.591349257585539, "step": 4930}, {"loss": 0.7211, "grad_norm": 0.5365763306617737, "learning_rate": 0.0002, "epoch": 1.594577146546159, "step": 4940}, {"loss": 0.777, "grad_norm": 0.5800772309303284, "learning_rate": 0.0002, "epoch": 1.5978050355067785, "step": 4950}, {"loss": 0.8027, "grad_norm": 0.7878010869026184, "learning_rate": 0.0002, "epoch": 1.6010329244673982, "step": 4960}, {"loss": 0.7894, "grad_norm": 0.5919058918952942, "learning_rate": 0.0002, "epoch": 1.604260813428018, "step": 4970}, {"loss": 0.7762, "grad_norm": 0.5004435181617737, "learning_rate": 0.0002, "epoch": 1.607488702388638, "step": 4980}, {"loss": 0.7447, "grad_norm": 0.6299242377281189, "learning_rate": 0.0002, "epoch": 1.6107165913492576, "step": 4990}, {"loss": 0.7149, "grad_norm": 0.6307242512702942, "learning_rate": 0.0002, "epoch": 1.6139444803098772, "step": 5000}, {"loss": 0.7693, "grad_norm": 0.7838703989982605, "learning_rate": 0.0002, "epoch": 1.6171723692704971, "step": 5010}, {"loss": 0.7364, "grad_norm": 0.6454671621322632, "learning_rate": 0.0002, "epoch": 1.620400258231117, "step": 5020}, {"loss": 0.74, "grad_norm": 0.5907095670700073, "learning_rate": 0.0002, "epoch": 1.6236281471917366, "step": 5030}, {"loss": 0.7331, "grad_norm": 0.6053501963615417, "learning_rate": 0.0002, "epoch": 1.6268560361523563, "step": 5040}, {"loss": 0.6987, "grad_norm": 0.5644670128822327, "learning_rate": 0.0002, "epoch": 1.630083925112976, "step": 5050}, {"loss": 0.7886, "grad_norm": 0.6320949792861938, "learning_rate": 0.0002, "epoch": 1.6333118140735958, "step": 5060}, {"loss": 0.7109, "grad_norm": 0.6101489067077637, "learning_rate": 0.0002, "epoch": 1.6365397030342157, "step": 5070}, {"loss": 0.6922, "grad_norm": 0.9435283541679382, "learning_rate": 0.0002, "epoch": 1.6397675919948353, "step": 5080}, {"loss": 0.729, "grad_norm": 0.6668919324874878, "learning_rate": 0.0002, "epoch": 1.642995480955455, "step": 5090}, {"loss": 0.7402, "grad_norm": 0.6160340905189514, "learning_rate": 0.0002, "epoch": 1.6462233699160749, "step": 5100}, {"loss": 0.7461, "grad_norm": 0.5999835729598999, "learning_rate": 0.0002, "epoch": 1.6494512588766947, "step": 5110}, {"loss": 0.7661, "grad_norm": 0.9378551840782166, "learning_rate": 0.0002, "epoch": 1.6526791478373144, "step": 5120}, {"loss": 0.7586, "grad_norm": 0.4795055389404297, "learning_rate": 0.0002, "epoch": 1.655907036797934, "step": 5130}, {"loss": 0.7342, "grad_norm": 0.4878861606121063, "learning_rate": 0.0002, "epoch": 1.659134925758554, "step": 5140}, {"loss": 0.7362, "grad_norm": 0.6042965054512024, "learning_rate": 0.0002, "epoch": 1.6623628147191738, "step": 5150}, {"loss": 0.7863, "grad_norm": 0.5829901695251465, "learning_rate": 0.0002, "epoch": 1.6655907036797934, "step": 5160}, {"loss": 0.7498, "grad_norm": 0.5168480277061462, "learning_rate": 0.0002, "epoch": 1.668818592640413, "step": 5170}, {"loss": 0.7333, "grad_norm": 0.6489511132240295, "learning_rate": 0.0002, "epoch": 1.672046481601033, "step": 5180}, {"loss": 0.7257, "grad_norm": 0.5955966114997864, "learning_rate": 0.0002, "epoch": 1.6752743705616526, "step": 5190}, {"loss": 0.7938, "grad_norm": 0.6228088140487671, "learning_rate": 0.0002, "epoch": 1.6785022595222725, "step": 5200}, {"loss": 0.7626, "grad_norm": 0.5726390480995178, "learning_rate": 0.0002, "epoch": 1.6817301484828922, "step": 5210}, {"loss": 0.7479, "grad_norm": 0.6116343140602112, "learning_rate": 0.0002, "epoch": 1.6849580374435118, "step": 5220}, {"loss": 0.7169, "grad_norm": 0.5483687520027161, "learning_rate": 0.0002, "epoch": 1.6881859264041317, "step": 5230}, {"loss": 0.7293, "grad_norm": 0.570941686630249, "learning_rate": 0.0002, "epoch": 1.6914138153647515, "step": 5240}, {"loss": 0.723, "grad_norm": 0.6048086285591125, "learning_rate": 0.0002, "epoch": 1.6946417043253712, "step": 5250}, {"loss": 0.7861, "grad_norm": 0.6769003868103027, "learning_rate": 0.0002, "epoch": 1.6978695932859909, "step": 5260}, {"loss": 0.7885, "grad_norm": 0.5629057884216309, "learning_rate": 0.0002, "epoch": 1.7010974822466107, "step": 5270}, {"loss": 0.7693, "grad_norm": 0.657341480255127, "learning_rate": 0.0002, "epoch": 1.7043253712072306, "step": 5280}, {"loss": 0.7357, "grad_norm": 0.6256147623062134, "learning_rate": 0.0002, "epoch": 1.7075532601678503, "step": 5290}, {"loss": 0.714, "grad_norm": 0.5498088002204895, "learning_rate": 0.0002, "epoch": 1.71078114912847, "step": 5300}, {"loss": 0.7669, "grad_norm": 0.5078358054161072, "learning_rate": 0.0002, "epoch": 1.7140090380890898, "step": 5310}, {"loss": 0.7872, "grad_norm": 0.6696692705154419, "learning_rate": 0.0002, "epoch": 1.7172369270497096, "step": 5320}, {"loss": 0.8205, "grad_norm": 0.6692847013473511, "learning_rate": 0.0002, "epoch": 1.7204648160103293, "step": 5330}, {"loss": 0.7432, "grad_norm": 0.5415751934051514, "learning_rate": 0.0002, "epoch": 1.723692704970949, "step": 5340}, {"loss": 0.7499, "grad_norm": 0.5367611050605774, "learning_rate": 0.0002, "epoch": 1.7269205939315686, "step": 5350}, {"loss": 0.7631, "grad_norm": 0.7321061491966248, "learning_rate": 0.0002, "epoch": 1.7301484828921885, "step": 5360}, {"loss": 0.7827, "grad_norm": 0.723972499370575, "learning_rate": 0.0002, "epoch": 1.7333763718528084, "step": 5370}, {"loss": 0.7077, "grad_norm": 0.7328100204467773, "learning_rate": 0.0002, "epoch": 1.736604260813428, "step": 5380}, {"loss": 0.7503, "grad_norm": 0.5785264372825623, "learning_rate": 0.0002, "epoch": 1.7398321497740477, "step": 5390}, {"loss": 0.7188, "grad_norm": 0.7812932133674622, "learning_rate": 0.0002, "epoch": 1.7430600387346675, "step": 5400}, {"loss": 0.7386, "grad_norm": 0.6493327617645264, "learning_rate": 0.0002, "epoch": 1.7462879276952874, "step": 5410}, {"loss": 0.7487, "grad_norm": 0.5825939774513245, "learning_rate": 0.0002, "epoch": 1.749515816655907, "step": 5420}, {"loss": 0.7625, "grad_norm": 0.6969610452651978, "learning_rate": 0.0002, "epoch": 1.7527437056165267, "step": 5430}, {"loss": 0.7512, "grad_norm": 0.5558062195777893, "learning_rate": 0.0002, "epoch": 1.7559715945771466, "step": 5440}, {"loss": 0.7256, "grad_norm": 0.49222221970558167, "learning_rate": 0.0002, "epoch": 1.7591994835377665, "step": 5450}, {"loss": 0.7477, "grad_norm": 0.5844656825065613, "learning_rate": 0.0002, "epoch": 1.762427372498386, "step": 5460}, {"loss": 0.7695, "grad_norm": 0.8706597685813904, "learning_rate": 0.0002, "epoch": 1.7656552614590058, "step": 5470}, {"loss": 0.7582, "grad_norm": 0.6167706251144409, "learning_rate": 0.0002, "epoch": 1.7688831504196254, "step": 5480}, {"loss": 0.7521, "grad_norm": 0.5890011787414551, "learning_rate": 0.0002, "epoch": 1.7721110393802453, "step": 5490}, {"loss": 0.8319, "grad_norm": 0.6551728248596191, "learning_rate": 0.0002, "epoch": 1.7753389283408652, "step": 5500}, {"loss": 0.7615, "grad_norm": 0.5848751068115234, "learning_rate": 0.0002, "epoch": 1.7785668173014848, "step": 5510}, {"loss": 0.7622, "grad_norm": 0.6664014458656311, "learning_rate": 0.0002, "epoch": 1.7817947062621045, "step": 5520}, {"loss": 0.7544, "grad_norm": 0.5931693911552429, "learning_rate": 0.0002, "epoch": 1.7850225952227243, "step": 5530}, {"loss": 0.7992, "grad_norm": 0.5534724593162537, "learning_rate": 0.0002, "epoch": 1.7882504841833442, "step": 5540}, {"loss": 0.7967, "grad_norm": 0.5590878129005432, "learning_rate": 0.0002, "epoch": 1.7914783731439639, "step": 5550}, {"loss": 0.7406, "grad_norm": 0.6947470903396606, "learning_rate": 0.0002, "epoch": 1.7947062621045835, "step": 5560}, {"loss": 0.7614, "grad_norm": 0.6104130148887634, "learning_rate": 0.0002, "epoch": 1.7979341510652034, "step": 5570}, {"loss": 0.8032, "grad_norm": 0.6135714054107666, "learning_rate": 0.0002, "epoch": 1.8011620400258233, "step": 5580}, {"loss": 0.7403, "grad_norm": 0.6626853346824646, "learning_rate": 0.0002, "epoch": 1.804389928986443, "step": 5590}, {"loss": 0.7746, "grad_norm": 0.6977612972259521, "learning_rate": 0.0002, "epoch": 1.8076178179470626, "step": 5600}, {"loss": 0.7899, "grad_norm": 0.6275238394737244, "learning_rate": 0.0002, "epoch": 1.8108457069076824, "step": 5610}, {"loss": 0.7392, "grad_norm": 0.5017505288124084, "learning_rate": 0.0002, "epoch": 1.814073595868302, "step": 5620}, {"loss": 0.7669, "grad_norm": 0.8314290642738342, "learning_rate": 0.0002, "epoch": 1.817301484828922, "step": 5630}, {"loss": 0.7031, "grad_norm": 0.6863582134246826, "learning_rate": 0.0002, "epoch": 1.8205293737895416, "step": 5640}, {"loss": 0.743, "grad_norm": 0.69544917345047, "learning_rate": 0.0002, "epoch": 1.8237572627501613, "step": 5650}, {"loss": 0.7277, "grad_norm": 0.515499472618103, "learning_rate": 0.0002, "epoch": 1.8269851517107811, "step": 5660}, {"loss": 0.7166, "grad_norm": 0.6100873947143555, "learning_rate": 0.0002, "epoch": 1.830213040671401, "step": 5670}, {"loss": 0.7217, "grad_norm": 0.67416912317276, "learning_rate": 0.0002, "epoch": 1.8334409296320207, "step": 5680}, {"loss": 0.7575, "grad_norm": 0.7057772278785706, "learning_rate": 0.0002, "epoch": 1.8366688185926403, "step": 5690}, {"loss": 0.7483, "grad_norm": 0.7374551892280579, "learning_rate": 0.0002, "epoch": 1.8398967075532602, "step": 5700}, {"loss": 0.81, "grad_norm": 0.6266297101974487, "learning_rate": 0.0002, "epoch": 1.84312459651388, "step": 5710}, {"loss": 0.728, "grad_norm": 0.5629227757453918, "learning_rate": 0.0002, "epoch": 1.8463524854744997, "step": 5720}, {"loss": 0.8043, "grad_norm": 0.6603655815124512, "learning_rate": 0.0002, "epoch": 1.8495803744351194, "step": 5730}, {"loss": 0.7587, "grad_norm": 0.8113715052604675, "learning_rate": 0.0002, "epoch": 1.8528082633957392, "step": 5740}, {"loss": 0.7486, "grad_norm": 0.7143914103507996, "learning_rate": 0.0002, "epoch": 1.856036152356359, "step": 5750}, {"loss": 0.7619, "grad_norm": 0.6273732781410217, "learning_rate": 0.0002, "epoch": 1.8592640413169788, "step": 5760}, {"loss": 0.7962, "grad_norm": 0.5428690910339355, "learning_rate": 0.0002, "epoch": 1.8624919302775984, "step": 5770}, {"loss": 0.7581, "grad_norm": 0.6405037641525269, "learning_rate": 0.0002, "epoch": 1.865719819238218, "step": 5780}, {"loss": 0.7569, "grad_norm": 0.700873613357544, "learning_rate": 0.0002, "epoch": 1.868947708198838, "step": 5790}, {"loss": 0.7353, "grad_norm": 0.5645238161087036, "learning_rate": 0.0002, "epoch": 1.8721755971594578, "step": 5800}, {"loss": 0.8037, "grad_norm": 0.8780353665351868, "learning_rate": 0.0002, "epoch": 1.8754034861200775, "step": 5810}, {"loss": 0.7686, "grad_norm": 0.6295409798622131, "learning_rate": 0.0002, "epoch": 1.878631375080697, "step": 5820}, {"loss": 0.8067, "grad_norm": 0.678269624710083, "learning_rate": 0.0002, "epoch": 1.881859264041317, "step": 5830}, {"loss": 0.7537, "grad_norm": 0.6464608907699585, "learning_rate": 0.0002, "epoch": 1.8850871530019369, "step": 5840}, {"loss": 0.7423, "grad_norm": 0.6201048493385315, "learning_rate": 0.0002, "epoch": 1.8883150419625565, "step": 5850}, {"loss": 0.7694, "grad_norm": 0.6046274304389954, "learning_rate": 0.0002, "epoch": 1.8915429309231762, "step": 5860}, {"loss": 0.781, "grad_norm": 0.7532408833503723, "learning_rate": 0.0002, "epoch": 1.894770819883796, "step": 5870}, {"loss": 0.6885, "grad_norm": 0.6066767573356628, "learning_rate": 0.0002, "epoch": 1.897998708844416, "step": 5880}, {"loss": 0.7631, "grad_norm": 0.6289830207824707, "learning_rate": 0.0002, "epoch": 1.9012265978050356, "step": 5890}, {"loss": 0.7501, "grad_norm": 0.5204319953918457, "learning_rate": 0.0002, "epoch": 1.9044544867656552, "step": 5900}, {"loss": 0.7335, "grad_norm": 0.6708219647407532, "learning_rate": 0.0002, "epoch": 1.9076823757262749, "step": 5910}, {"loss": 0.7455, "grad_norm": 0.4915677309036255, "learning_rate": 0.0002, "epoch": 1.9109102646868947, "step": 5920}, {"loss": 0.7464, "grad_norm": 0.652717113494873, "learning_rate": 0.0002, "epoch": 1.9141381536475146, "step": 5930}, {"loss": 0.7687, "grad_norm": 0.5446316003799438, "learning_rate": 0.0002, "epoch": 1.9173660426081343, "step": 5940}, {"loss": 0.7424, "grad_norm": 0.4958149194717407, "learning_rate": 0.0002, "epoch": 1.920593931568754, "step": 5950}, {"loss": 0.757, "grad_norm": 0.5623434782028198, "learning_rate": 0.0002, "epoch": 1.9238218205293738, "step": 5960}, {"loss": 0.7446, "grad_norm": 0.6855450868606567, "learning_rate": 0.0002, "epoch": 1.9270497094899937, "step": 5970}, {"loss": 0.827, "grad_norm": 0.5710492730140686, "learning_rate": 0.0002, "epoch": 1.9302775984506133, "step": 5980}, {"loss": 0.7245, "grad_norm": 0.5379431843757629, "learning_rate": 0.0002, "epoch": 1.933505487411233, "step": 5990}, {"loss": 0.77, "grad_norm": 0.557129442691803, "learning_rate": 0.0002, "epoch": 1.9367333763718528, "step": 6000}, {"loss": 0.6988, "grad_norm": 0.6336663961410522, "learning_rate": 0.0002, "epoch": 1.9399612653324727, "step": 6010}, {"loss": 0.7316, "grad_norm": 0.5950582027435303, "learning_rate": 0.0002, "epoch": 1.9431891542930924, "step": 6020}, {"loss": 0.7443, "grad_norm": 0.5905954837799072, "learning_rate": 0.0002, "epoch": 1.946417043253712, "step": 6030}, {"loss": 0.7127, "grad_norm": 0.6688982844352722, "learning_rate": 0.0002, "epoch": 1.9496449322143317, "step": 6040}, {"loss": 0.79, "grad_norm": 0.5440775752067566, "learning_rate": 0.0002, "epoch": 1.9528728211749515, "step": 6050}, {"loss": 0.7221, "grad_norm": 0.6207906603813171, "learning_rate": 0.0002, "epoch": 1.9561007101355714, "step": 6060}, {"loss": 0.738, "grad_norm": 0.6999374628067017, "learning_rate": 0.0002, "epoch": 1.959328599096191, "step": 6070}, {"loss": 0.7372, "grad_norm": 0.6310848593711853, "learning_rate": 0.0002, "epoch": 1.9625564880568107, "step": 6080}, {"loss": 0.7198, "grad_norm": 0.5903388261795044, "learning_rate": 0.0002, "epoch": 1.9657843770174306, "step": 6090}, {"loss": 0.7103, "grad_norm": 0.6333889961242676, "learning_rate": 0.0002, "epoch": 1.9690122659780505, "step": 6100}, {"loss": 0.7246, "grad_norm": 0.5604711174964905, "learning_rate": 0.0002, "epoch": 1.97224015493867, "step": 6110}, {"loss": 0.761, "grad_norm": 0.9234541654586792, "learning_rate": 0.0002, "epoch": 1.9754680438992898, "step": 6120}, {"loss": 0.7375, "grad_norm": 0.6149102449417114, "learning_rate": 0.0002, "epoch": 1.9786959328599096, "step": 6130}, {"loss": 0.7286, "grad_norm": 0.615446150302887, "learning_rate": 0.0002, "epoch": 1.9819238218205295, "step": 6140}, {"loss": 0.7333, "grad_norm": 0.5176635980606079, "learning_rate": 0.0002, "epoch": 1.9851517107811492, "step": 6150}, {"loss": 0.718, "grad_norm": 0.7124109864234924, "learning_rate": 0.0002, "epoch": 1.9883795997417688, "step": 6160}, {"loss": 0.7669, "grad_norm": 0.6317567825317383, "learning_rate": 0.0002, "epoch": 1.9916074887023887, "step": 6170}, {"loss": 0.8012, "grad_norm": 0.6855016350746155, "learning_rate": 0.0002, "epoch": 1.9948353776630086, "step": 6180}, {"loss": 0.7376, "grad_norm": 0.6423715353012085, "learning_rate": 0.0002, "epoch": 1.9980632666236282, "step": 6190}, {"eval_loss": 1.1096643209457397, "eval_runtime": 147.7997, "eval_samples_per_second": 4.959, "eval_steps_per_second": 0.622, "epoch": 2.0, "step": 6196}, {"loss": 0.7131, "grad_norm": 0.5322932600975037, "learning_rate": 0.0002, "epoch": 2.001291155584248, "step": 6200}, {"loss": 0.6619, "grad_norm": 0.8152306079864502, "learning_rate": 0.0002, "epoch": 2.0045190445448675, "step": 6210}, {"loss": 0.6731, "grad_norm": 0.6215983033180237, "learning_rate": 0.0002, "epoch": 2.0077469335054876, "step": 6220}, {"loss": 0.658, "grad_norm": 0.845498263835907, "learning_rate": 0.0002, "epoch": 2.0109748224661073, "step": 6230}, {"loss": 0.6954, "grad_norm": 0.733559787273407, "learning_rate": 0.0002, "epoch": 2.014202711426727, "step": 6240}, {"loss": 0.6707, "grad_norm": 0.51433926820755, "learning_rate": 0.0002, "epoch": 2.0174306003873466, "step": 6250}, {"loss": 0.6304, "grad_norm": 0.6374049782752991, "learning_rate": 0.0002, "epoch": 2.020658489347966, "step": 6260}, {"loss": 0.6831, "grad_norm": 0.7833638191223145, "learning_rate": 0.0002, "epoch": 2.0238863783085863, "step": 6270}, {"loss": 0.6672, "grad_norm": 0.8929463028907776, "learning_rate": 0.0002, "epoch": 2.027114267269206, "step": 6280}, {"loss": 0.637, "grad_norm": 0.669731855392456, "learning_rate": 0.0002, "epoch": 2.0303421562298256, "step": 6290}, {"loss": 0.646, "grad_norm": 0.5846071243286133, "learning_rate": 0.0002, "epoch": 2.0335700451904453, "step": 6300}, {"loss": 0.6647, "grad_norm": 0.7087787985801697, "learning_rate": 0.0002, "epoch": 2.0367979341510654, "step": 6310}, {"loss": 0.6433, "grad_norm": 0.6739160418510437, "learning_rate": 0.0002, "epoch": 2.040025823111685, "step": 6320}, {"loss": 0.6301, "grad_norm": 0.4860886335372925, "learning_rate": 0.0002, "epoch": 2.0432537120723047, "step": 6330}, {"loss": 0.6439, "grad_norm": 0.7201244831085205, "learning_rate": 0.0002, "epoch": 2.0464816010329243, "step": 6340}, {"loss": 0.6676, "grad_norm": 0.7409170269966125, "learning_rate": 0.0002, "epoch": 2.0497094899935444, "step": 6350}, {"loss": 0.6153, "grad_norm": 0.6843920350074768, "learning_rate": 0.0002, "epoch": 2.052937378954164, "step": 6360}, {"loss": 0.6674, "grad_norm": 0.7519999742507935, "learning_rate": 0.0002, "epoch": 2.0561652679147837, "step": 6370}, {"loss": 0.6928, "grad_norm": 0.5732819437980652, "learning_rate": 0.0002, "epoch": 2.0593931568754034, "step": 6380}, {"loss": 0.6496, "grad_norm": 0.7565118074417114, "learning_rate": 0.0002, "epoch": 2.062621045836023, "step": 6390}, {"loss": 0.6354, "grad_norm": 0.8147150278091431, "learning_rate": 0.0002, "epoch": 2.065848934796643, "step": 6400}, {"loss": 0.6593, "grad_norm": 0.6941924691200256, "learning_rate": 0.0002, "epoch": 2.0690768237572628, "step": 6410}, {"loss": 0.6698, "grad_norm": 0.6549784541130066, "learning_rate": 0.0002, "epoch": 2.0723047127178824, "step": 6420}, {"loss": 0.6927, "grad_norm": 0.7224905490875244, "learning_rate": 0.0002, "epoch": 2.075532601678502, "step": 6430}, {"loss": 0.6755, "grad_norm": 0.7754863500595093, "learning_rate": 0.0002, "epoch": 2.078760490639122, "step": 6440}, {"loss": 0.6738, "grad_norm": 0.691318154335022, "learning_rate": 0.0002, "epoch": 2.081988379599742, "step": 6450}, {"loss": 0.6233, "grad_norm": 0.6009294986724854, "learning_rate": 0.0002, "epoch": 2.0852162685603615, "step": 6460}, {"loss": 0.6691, "grad_norm": 0.6753945350646973, "learning_rate": 0.0002, "epoch": 2.088444157520981, "step": 6470}, {"loss": 0.6935, "grad_norm": 0.6899921298027039, "learning_rate": 0.0002, "epoch": 2.091672046481601, "step": 6480}, {"loss": 0.6918, "grad_norm": 0.846510648727417, "learning_rate": 0.0002, "epoch": 2.094899935442221, "step": 6490}, {"loss": 0.6084, "grad_norm": 0.6432605981826782, "learning_rate": 0.0002, "epoch": 2.0981278244028405, "step": 6500}, {"loss": 0.6867, "grad_norm": 0.8125239014625549, "learning_rate": 0.0002, "epoch": 2.10135571336346, "step": 6510}, {"loss": 0.6939, "grad_norm": 0.628302812576294, "learning_rate": 0.0002, "epoch": 2.1045836023240803, "step": 6520}, {"loss": 0.5909, "grad_norm": 0.7164334654808044, "learning_rate": 0.0002, "epoch": 2.1078114912847, "step": 6530}, {"loss": 0.6578, "grad_norm": 0.7476949095726013, "learning_rate": 0.0002, "epoch": 2.1110393802453196, "step": 6540}, {"loss": 0.6351, "grad_norm": 0.7577515840530396, "learning_rate": 0.0002, "epoch": 2.114267269205939, "step": 6550}, {"loss": 0.6669, "grad_norm": 0.5684467554092407, "learning_rate": 0.0002, "epoch": 2.117495158166559, "step": 6560}, {"loss": 0.6343, "grad_norm": 0.6121789216995239, "learning_rate": 0.0002, "epoch": 2.120723047127179, "step": 6570}, {"loss": 0.6314, "grad_norm": 0.6095348596572876, "learning_rate": 0.0002, "epoch": 2.1239509360877986, "step": 6580}, {"loss": 0.6276, "grad_norm": 0.7803651690483093, "learning_rate": 0.0002, "epoch": 2.1271788250484183, "step": 6590}, {"loss": 0.6579, "grad_norm": 0.5990583300590515, "learning_rate": 0.0002, "epoch": 2.130406714009038, "step": 6600}, {"loss": 0.6228, "grad_norm": 0.6569220423698425, "learning_rate": 0.0002, "epoch": 2.133634602969658, "step": 6610}, {"loss": 0.7049, "grad_norm": 0.5961166620254517, "learning_rate": 0.0002, "epoch": 2.1368624919302777, "step": 6620}, {"loss": 0.6359, "grad_norm": 0.5860554575920105, "learning_rate": 0.0002, "epoch": 2.1400903808908973, "step": 6630}, {"loss": 0.6651, "grad_norm": 0.5994001626968384, "learning_rate": 0.0002, "epoch": 2.143318269851517, "step": 6640}, {"loss": 0.6421, "grad_norm": 0.7723015546798706, "learning_rate": 0.0002, "epoch": 2.146546158812137, "step": 6650}, {"loss": 0.6723, "grad_norm": 0.676355242729187, "learning_rate": 0.0002, "epoch": 2.1497740477727567, "step": 6660}, {"loss": 0.6826, "grad_norm": 0.5689092874526978, "learning_rate": 0.0002, "epoch": 2.1530019367333764, "step": 6670}, {"loss": 0.6613, "grad_norm": 0.6933727264404297, "learning_rate": 0.0002, "epoch": 2.156229825693996, "step": 6680}, {"loss": 0.6957, "grad_norm": 0.8380527496337891, "learning_rate": 0.0002, "epoch": 2.159457714654616, "step": 6690}, {"loss": 0.6705, "grad_norm": 0.6876497268676758, "learning_rate": 0.0002, "epoch": 2.1626856036152358, "step": 6700}, {"loss": 0.6112, "grad_norm": 0.6418334245681763, "learning_rate": 0.0002, "epoch": 2.1659134925758554, "step": 6710}, {"loss": 0.6357, "grad_norm": 0.7169192433357239, "learning_rate": 0.0002, "epoch": 2.169141381536475, "step": 6720}, {"loss": 0.6492, "grad_norm": 0.6664170622825623, "learning_rate": 0.0002, "epoch": 2.1723692704970947, "step": 6730}, {"loss": 0.6751, "grad_norm": 0.6011993288993835, "learning_rate": 0.0002, "epoch": 2.175597159457715, "step": 6740}, {"loss": 0.696, "grad_norm": 0.5529947280883789, "learning_rate": 0.0002, "epoch": 2.1788250484183345, "step": 6750}, {"loss": 0.671, "grad_norm": 0.6879532933235168, "learning_rate": 0.0002, "epoch": 2.182052937378954, "step": 6760}, {"loss": 0.6634, "grad_norm": 0.6426113843917847, "learning_rate": 0.0002, "epoch": 2.1852808263395738, "step": 6770}, {"loss": 0.6592, "grad_norm": 0.6571047306060791, "learning_rate": 0.0002, "epoch": 2.188508715300194, "step": 6780}, {"loss": 0.6494, "grad_norm": 0.6400564908981323, "learning_rate": 0.0002, "epoch": 2.1917366042608135, "step": 6790}, {"loss": 0.6369, "grad_norm": 0.6509664058685303, "learning_rate": 0.0002, "epoch": 2.194964493221433, "step": 6800}, {"loss": 0.6771, "grad_norm": 0.6673197150230408, "learning_rate": 0.0002, "epoch": 2.198192382182053, "step": 6810}, {"loss": 0.6491, "grad_norm": 0.48205727338790894, "learning_rate": 0.0002, "epoch": 2.2014202711426725, "step": 6820}, {"loss": 0.6894, "grad_norm": 0.849525511264801, "learning_rate": 0.0002, "epoch": 2.2046481601032926, "step": 6830}, {"loss": 0.6977, "grad_norm": 0.6150892376899719, "learning_rate": 0.0002, "epoch": 2.207876049063912, "step": 6840}, {"loss": 0.6843, "grad_norm": 0.7826945781707764, "learning_rate": 0.0002, "epoch": 2.211103938024532, "step": 6850}, {"loss": 0.6338, "grad_norm": 0.5711963772773743, "learning_rate": 0.0002, "epoch": 2.2143318269851515, "step": 6860}, {"loss": 0.6585, "grad_norm": 0.6017758846282959, "learning_rate": 0.0002, "epoch": 2.2175597159457716, "step": 6870}, {"loss": 0.6657, "grad_norm": 0.785434901714325, "learning_rate": 0.0002, "epoch": 2.2207876049063913, "step": 6880}, {"loss": 0.7075, "grad_norm": 0.6251688599586487, "learning_rate": 0.0002, "epoch": 2.224015493867011, "step": 6890}, {"loss": 0.6564, "grad_norm": 0.8242034316062927, "learning_rate": 0.0002, "epoch": 2.2272433828276306, "step": 6900}, {"loss": 0.672, "grad_norm": 0.7272933125495911, "learning_rate": 0.0002, "epoch": 2.2304712717882507, "step": 6910}, {"loss": 0.6541, "grad_norm": 0.7159379720687866, "learning_rate": 0.0002, "epoch": 2.2336991607488703, "step": 6920}, {"loss": 0.6859, "grad_norm": 0.6518042087554932, "learning_rate": 0.0002, "epoch": 2.23692704970949, "step": 6930}, {"loss": 0.5987, "grad_norm": 0.7365370392799377, "learning_rate": 0.0002, "epoch": 2.2401549386701096, "step": 6940}, {"loss": 0.6511, "grad_norm": 0.5674061179161072, "learning_rate": 0.0002, "epoch": 2.2433828276307297, "step": 6950}, {"loss": 0.6748, "grad_norm": 0.669185996055603, "learning_rate": 0.0002, "epoch": 2.2466107165913494, "step": 6960}, {"loss": 0.656, "grad_norm": 0.6638304591178894, "learning_rate": 0.0002, "epoch": 2.249838605551969, "step": 6970}, {"loss": 0.636, "grad_norm": 0.757006824016571, "learning_rate": 0.0002, "epoch": 2.2530664945125887, "step": 6980}, {"loss": 0.6597, "grad_norm": 0.7574930787086487, "learning_rate": 0.0002, "epoch": 2.2562943834732083, "step": 6990}, {"loss": 0.6859, "grad_norm": 0.7819514870643616, "learning_rate": 0.0002, "epoch": 2.2595222724338284, "step": 7000}, {"loss": 0.6238, "grad_norm": 0.6987583041191101, "learning_rate": 0.0002, "epoch": 2.262750161394448, "step": 7010}, {"loss": 0.661, "grad_norm": 0.6628551483154297, "learning_rate": 0.0002, "epoch": 2.2659780503550677, "step": 7020}, {"loss": 0.6254, "grad_norm": 0.7855866551399231, "learning_rate": 0.0002, "epoch": 2.2692059393156874, "step": 7030}, {"loss": 0.6679, "grad_norm": 0.6102892756462097, "learning_rate": 0.0002, "epoch": 2.2724338282763075, "step": 7040}, {"loss": 0.694, "grad_norm": 0.7844198942184448, "learning_rate": 0.0002, "epoch": 2.275661717236927, "step": 7050}, {"loss": 0.63, "grad_norm": 0.6209492087364197, "learning_rate": 0.0002, "epoch": 2.2788896061975468, "step": 7060}, {"loss": 0.6418, "grad_norm": 0.8351290225982666, "learning_rate": 0.0002, "epoch": 2.2821174951581664, "step": 7070}, {"loss": 0.6648, "grad_norm": 0.6883546710014343, "learning_rate": 0.0002, "epoch": 2.285345384118786, "step": 7080}, {"loss": 0.7046, "grad_norm": 0.6626381874084473, "learning_rate": 0.0002, "epoch": 2.288573273079406, "step": 7090}, {"loss": 0.6535, "grad_norm": 0.7216270565986633, "learning_rate": 0.0002, "epoch": 2.291801162040026, "step": 7100}, {"loss": 0.6414, "grad_norm": 0.8246777057647705, "learning_rate": 0.0002, "epoch": 2.2950290510006455, "step": 7110}, {"loss": 0.6315, "grad_norm": 0.614326000213623, "learning_rate": 0.0002, "epoch": 2.2982569399612656, "step": 7120}, {"loss": 0.6303, "grad_norm": 0.8785578012466431, "learning_rate": 0.0002, "epoch": 2.301484828921885, "step": 7130}, {"loss": 0.6348, "grad_norm": 0.7021808624267578, "learning_rate": 0.0002, "epoch": 2.304712717882505, "step": 7140}, {"loss": 0.6738, "grad_norm": 0.6999403238296509, "learning_rate": 0.0002, "epoch": 2.3079406068431245, "step": 7150}, {"loss": 0.6547, "grad_norm": 0.8013143539428711, "learning_rate": 0.0002, "epoch": 2.311168495803744, "step": 7160}, {"loss": 0.6461, "grad_norm": 0.6592583060264587, "learning_rate": 0.0002, "epoch": 2.3143963847643643, "step": 7170}, {"loss": 0.6369, "grad_norm": 0.6260249018669128, "learning_rate": 0.0002, "epoch": 2.317624273724984, "step": 7180}, {"loss": 0.6647, "grad_norm": 0.9352797269821167, "learning_rate": 0.0002, "epoch": 2.3208521626856036, "step": 7190}, {"loss": 0.6543, "grad_norm": 0.6629612445831299, "learning_rate": 0.0002, "epoch": 2.324080051646223, "step": 7200}, {"loss": 0.6811, "grad_norm": 0.7062810063362122, "learning_rate": 0.0002, "epoch": 2.3273079406068433, "step": 7210}, {"loss": 0.67, "grad_norm": 0.7236241102218628, "learning_rate": 0.0002, "epoch": 2.330535829567463, "step": 7220}, {"loss": 0.6462, "grad_norm": 0.7528148293495178, "learning_rate": 0.0002, "epoch": 2.3337637185280826, "step": 7230}, {"loss": 0.694, "grad_norm": 0.7604748606681824, "learning_rate": 0.0002, "epoch": 2.3369916074887023, "step": 7240}, {"loss": 0.6475, "grad_norm": 0.5601189136505127, "learning_rate": 0.0002, "epoch": 2.340219496449322, "step": 7250}, {"loss": 0.6925, "grad_norm": 0.7099230885505676, "learning_rate": 0.0002, "epoch": 2.343447385409942, "step": 7260}, {"loss": 0.6333, "grad_norm": 0.6699047684669495, "learning_rate": 0.0002, "epoch": 2.3466752743705617, "step": 7270}, {"loss": 0.6434, "grad_norm": 0.7315047979354858, "learning_rate": 0.0002, "epoch": 2.3499031633311813, "step": 7280}, {"loss": 0.6927, "grad_norm": 0.632836103439331, "learning_rate": 0.0002, "epoch": 2.353131052291801, "step": 7290}, {"loss": 0.6458, "grad_norm": 0.9410115480422974, "learning_rate": 0.0002, "epoch": 2.356358941252421, "step": 7300}, {"loss": 0.6699, "grad_norm": 0.626554012298584, "learning_rate": 0.0002, "epoch": 2.3595868302130407, "step": 7310}, {"loss": 0.6495, "grad_norm": 0.7538444399833679, "learning_rate": 0.0002, "epoch": 2.3628147191736604, "step": 7320}, {"loss": 0.6321, "grad_norm": 0.6826626062393188, "learning_rate": 0.0002, "epoch": 2.36604260813428, "step": 7330}, {"loss": 0.6752, "grad_norm": 0.6739391088485718, "learning_rate": 0.0002, "epoch": 2.3692704970949, "step": 7340}, {"loss": 0.6518, "grad_norm": 0.7518446445465088, "learning_rate": 0.0002, "epoch": 2.3724983860555198, "step": 7350}, {"loss": 0.7142, "grad_norm": 0.714133083820343, "learning_rate": 0.0002, "epoch": 2.3757262750161394, "step": 7360}, {"loss": 0.6794, "grad_norm": 0.7144588232040405, "learning_rate": 0.0002, "epoch": 2.378954163976759, "step": 7370}, {"loss": 0.6922, "grad_norm": 0.6598120927810669, "learning_rate": 0.0002, "epoch": 2.382182052937379, "step": 7380}, {"loss": 0.6562, "grad_norm": 0.7079148292541504, "learning_rate": 0.0002, "epoch": 2.385409941897999, "step": 7390}, {"loss": 0.6492, "grad_norm": 0.6750902533531189, "learning_rate": 0.0002, "epoch": 2.3886378308586185, "step": 7400}, {"loss": 0.6398, "grad_norm": 0.7181967496871948, "learning_rate": 0.0002, "epoch": 2.391865719819238, "step": 7410}, {"loss": 0.6793, "grad_norm": 0.7720552086830139, "learning_rate": 0.0002, "epoch": 2.3950936087798578, "step": 7420}, {"loss": 0.6804, "grad_norm": 0.7592426538467407, "learning_rate": 0.0002, "epoch": 2.398321497740478, "step": 7430}, {"loss": 0.6667, "grad_norm": 0.7161896824836731, "learning_rate": 0.0002, "epoch": 2.4015493867010975, "step": 7440}, {"loss": 0.6891, "grad_norm": 0.8019260764122009, "learning_rate": 0.0002, "epoch": 2.404777275661717, "step": 7450}, {"loss": 0.6864, "grad_norm": 0.7093342542648315, "learning_rate": 0.0002, "epoch": 2.408005164622337, "step": 7460}, {"loss": 0.6445, "grad_norm": 0.8464207649230957, "learning_rate": 0.0002, "epoch": 2.411233053582957, "step": 7470}, {"loss": 0.6724, "grad_norm": 0.773666501045227, "learning_rate": 0.0002, "epoch": 2.4144609425435766, "step": 7480}, {"loss": 0.6774, "grad_norm": 0.8451611995697021, "learning_rate": 0.0002, "epoch": 2.4176888315041962, "step": 7490}, {"loss": 0.694, "grad_norm": 0.656795084476471, "learning_rate": 0.0002, "epoch": 2.420916720464816, "step": 7500}, {"loss": 0.6824, "grad_norm": 0.7129034996032715, "learning_rate": 0.0002, "epoch": 2.4241446094254355, "step": 7510}, {"loss": 0.711, "grad_norm": 0.8325763940811157, "learning_rate": 0.0002, "epoch": 2.4273724983860556, "step": 7520}, {"loss": 0.6238, "grad_norm": 0.7806527614593506, "learning_rate": 0.0002, "epoch": 2.4306003873466753, "step": 7530}, {"loss": 0.6972, "grad_norm": 0.6994536519050598, "learning_rate": 0.0002, "epoch": 2.433828276307295, "step": 7540}, {"loss": 0.6615, "grad_norm": 0.6898999214172363, "learning_rate": 0.0002, "epoch": 2.437056165267915, "step": 7550}, {"loss": 0.7108, "grad_norm": 0.719490647315979, "learning_rate": 0.0002, "epoch": 2.4402840542285347, "step": 7560}, {"loss": 0.668, "grad_norm": 0.6841562390327454, "learning_rate": 0.0002, "epoch": 2.4435119431891543, "step": 7570}, {"loss": 0.6504, "grad_norm": 0.7573311924934387, "learning_rate": 0.0002, "epoch": 2.446739832149774, "step": 7580}, {"loss": 0.6607, "grad_norm": 0.7295880317687988, "learning_rate": 0.0002, "epoch": 2.4499677211103936, "step": 7590}, {"loss": 0.6593, "grad_norm": 0.710136353969574, "learning_rate": 0.0002, "epoch": 2.4531956100710137, "step": 7600}, {"loss": 0.7137, "grad_norm": 0.6126235127449036, "learning_rate": 0.0002, "epoch": 2.4564234990316334, "step": 7610}, {"loss": 0.6562, "grad_norm": 0.8025609850883484, "learning_rate": 0.0002, "epoch": 2.459651387992253, "step": 7620}, {"loss": 0.6464, "grad_norm": 0.7839472889900208, "learning_rate": 0.0002, "epoch": 2.4628792769528727, "step": 7630}, {"loss": 0.6797, "grad_norm": 0.7253499031066895, "learning_rate": 0.0002, "epoch": 2.4661071659134928, "step": 7640}, {"loss": 0.7341, "grad_norm": 0.7918946743011475, "learning_rate": 0.0002, "epoch": 2.4693350548741124, "step": 7650}, {"loss": 0.6646, "grad_norm": 0.7930178046226501, "learning_rate": 0.0002, "epoch": 2.472562943834732, "step": 7660}, {"loss": 0.6294, "grad_norm": 0.6826170086860657, "learning_rate": 0.0002, "epoch": 2.4757908327953517, "step": 7670}, {"loss": 0.6697, "grad_norm": 0.6576805114746094, "learning_rate": 0.0002, "epoch": 2.4790187217559714, "step": 7680}, {"loss": 0.682, "grad_norm": 0.7012448310852051, "learning_rate": 0.0002, "epoch": 2.4822466107165915, "step": 7690}, {"loss": 0.6418, "grad_norm": 0.7774284482002258, "learning_rate": 0.0002, "epoch": 2.485474499677211, "step": 7700}, {"loss": 0.6566, "grad_norm": 0.6502766013145447, "learning_rate": 0.0002, "epoch": 2.4887023886378308, "step": 7710}, {"loss": 0.6965, "grad_norm": 0.7638739347457886, "learning_rate": 0.0002, "epoch": 2.4919302775984504, "step": 7720}, {"loss": 0.6454, "grad_norm": 0.6217384338378906, "learning_rate": 0.0002, "epoch": 2.4951581665590705, "step": 7730}, {"loss": 0.6837, "grad_norm": 0.7576302886009216, "learning_rate": 0.0002, "epoch": 2.49838605551969, "step": 7740}, {"loss": 0.6855, "grad_norm": 0.6877137422561646, "learning_rate": 0.0002, "epoch": 2.50161394448031, "step": 7750}, {"loss": 0.6604, "grad_norm": 0.6998329162597656, "learning_rate": 0.0002, "epoch": 2.5048418334409295, "step": 7760}, {"loss": 0.6666, "grad_norm": 0.7879213690757751, "learning_rate": 0.0002, "epoch": 2.508069722401549, "step": 7770}, {"loss": 0.715, "grad_norm": 0.7834980487823486, "learning_rate": 0.0002, "epoch": 2.5112976113621692, "step": 7780}, {"loss": 0.6954, "grad_norm": 0.7789630889892578, "learning_rate": 0.0002, "epoch": 2.514525500322789, "step": 7790}, {"loss": 0.6979, "grad_norm": 0.7403590083122253, "learning_rate": 0.0002, "epoch": 2.5177533892834085, "step": 7800}, {"loss": 0.6964, "grad_norm": 0.6029766201972961, "learning_rate": 0.0002, "epoch": 2.5209812782440286, "step": 7810}, {"loss": 0.6887, "grad_norm": 0.7061092257499695, "learning_rate": 0.0002, "epoch": 2.5242091672046483, "step": 7820}, {"loss": 0.6628, "grad_norm": 0.7120763659477234, "learning_rate": 0.0002, "epoch": 2.527437056165268, "step": 7830}, {"loss": 0.6876, "grad_norm": 0.6173675656318665, "learning_rate": 0.0002, "epoch": 2.5306649451258876, "step": 7840}, {"loss": 0.6635, "grad_norm": 0.9566813111305237, "learning_rate": 0.0002, "epoch": 2.5338928340865072, "step": 7850}, {"loss": 0.654, "grad_norm": 0.8497620224952698, "learning_rate": 0.0002, "epoch": 2.5371207230471273, "step": 7860}, {"loss": 0.644, "grad_norm": 0.7663498520851135, "learning_rate": 0.0002, "epoch": 2.540348612007747, "step": 7870}, {"loss": 0.6292, "grad_norm": 0.6329668760299683, "learning_rate": 0.0002, "epoch": 2.5435765009683666, "step": 7880}, {"loss": 0.686, "grad_norm": 0.8128195405006409, "learning_rate": 0.0002, "epoch": 2.5468043899289863, "step": 7890}, {"loss": 0.6619, "grad_norm": 0.6622284650802612, "learning_rate": 0.0002, "epoch": 2.5500322788896064, "step": 7900}, {"loss": 0.693, "grad_norm": 0.8460057973861694, "learning_rate": 0.0002, "epoch": 2.553260167850226, "step": 7910}, {"loss": 0.6619, "grad_norm": 0.6586956977844238, "learning_rate": 0.0002, "epoch": 2.5564880568108457, "step": 7920}, {"loss": 0.6976, "grad_norm": 0.7569382190704346, "learning_rate": 0.0002, "epoch": 2.5597159457714653, "step": 7930}, {"loss": 0.6235, "grad_norm": 0.6409714221954346, "learning_rate": 0.0002, "epoch": 2.562943834732085, "step": 7940}, {"loss": 0.6663, "grad_norm": 0.7031713128089905, "learning_rate": 0.0002, "epoch": 2.566171723692705, "step": 7950}, {"loss": 0.6344, "grad_norm": 0.7983605265617371, "learning_rate": 0.0002, "epoch": 2.5693996126533247, "step": 7960}, {"loss": 0.6834, "grad_norm": 0.7165433168411255, "learning_rate": 0.0002, "epoch": 2.5726275016139444, "step": 7970}, {"loss": 0.6517, "grad_norm": 0.6630598902702332, "learning_rate": 0.0002, "epoch": 2.5758553905745645, "step": 7980}, {"loss": 0.7164, "grad_norm": 0.5883122086524963, "learning_rate": 0.0002, "epoch": 2.579083279535184, "step": 7990}, {"loss": 0.6715, "grad_norm": 0.5928755402565002, "learning_rate": 0.0002, "epoch": 2.5823111684958038, "step": 8000}, {"loss": 0.6701, "grad_norm": 0.7843712568283081, "learning_rate": 0.0002, "epoch": 2.5855390574564234, "step": 8010}, {"loss": 0.6617, "grad_norm": 0.7206324338912964, "learning_rate": 0.0002, "epoch": 2.588766946417043, "step": 8020}, {"loss": 0.6968, "grad_norm": 0.812480092048645, "learning_rate": 0.0002, "epoch": 2.5919948353776627, "step": 8030}, {"loss": 0.6735, "grad_norm": 0.9843078255653381, "learning_rate": 0.0002, "epoch": 2.595222724338283, "step": 8040}, {"loss": 0.6877, "grad_norm": 0.7524392604827881, "learning_rate": 0.0002, "epoch": 2.5984506132989025, "step": 8050}, {"loss": 0.7188, "grad_norm": 0.6220380067825317, "learning_rate": 0.0002, "epoch": 2.601678502259522, "step": 8060}, {"loss": 0.6878, "grad_norm": 0.7461398243904114, "learning_rate": 0.0002, "epoch": 2.6049063912201422, "step": 8070}, {"loss": 0.6626, "grad_norm": 0.720974326133728, "learning_rate": 0.0002, "epoch": 2.608134280180762, "step": 8080}, {"loss": 0.6756, "grad_norm": 0.649509847164154, "learning_rate": 0.0002, "epoch": 2.6113621691413815, "step": 8090}, {"loss": 0.6394, "grad_norm": 0.6894662976264954, "learning_rate": 0.0002, "epoch": 2.614590058102001, "step": 8100}, {"loss": 0.6329, "grad_norm": 0.734433114528656, "learning_rate": 0.0002, "epoch": 2.617817947062621, "step": 8110}, {"loss": 0.6698, "grad_norm": 0.7468628883361816, "learning_rate": 0.0002, "epoch": 2.621045836023241, "step": 8120}, {"loss": 0.658, "grad_norm": 0.6508180499076843, "learning_rate": 0.0002, "epoch": 2.6242737249838606, "step": 8130}, {"loss": 0.6619, "grad_norm": 0.8735209107398987, "learning_rate": 0.0002, "epoch": 2.6275016139444802, "step": 8140}, {"loss": 0.6717, "grad_norm": 0.8162857294082642, "learning_rate": 0.0002, "epoch": 2.6307295029051003, "step": 8150}, {"loss": 0.6496, "grad_norm": 0.628872811794281, "learning_rate": 0.0002, "epoch": 2.63395739186572, "step": 8160}, {"loss": 0.6608, "grad_norm": 0.8078708052635193, "learning_rate": 0.0002, "epoch": 2.6371852808263396, "step": 8170}, {"loss": 0.6916, "grad_norm": 0.7849429845809937, "learning_rate": 0.0002, "epoch": 2.6404131697869593, "step": 8180}, {"loss": 0.6671, "grad_norm": 0.8115387558937073, "learning_rate": 0.0002, "epoch": 2.643641058747579, "step": 8190}, {"loss": 0.6761, "grad_norm": 0.7462222576141357, "learning_rate": 0.0002, "epoch": 2.6468689477081986, "step": 8200}, {"loss": 0.6923, "grad_norm": 0.753662645816803, "learning_rate": 0.0002, "epoch": 2.6500968366688187, "step": 8210}, {"loss": 0.6666, "grad_norm": 0.6100404858589172, "learning_rate": 0.0002, "epoch": 2.6533247256294383, "step": 8220}, {"loss": 0.7256, "grad_norm": 0.9084606766700745, "learning_rate": 0.0002, "epoch": 2.656552614590058, "step": 8230}, {"loss": 0.6385, "grad_norm": 0.6412538886070251, "learning_rate": 0.0002, "epoch": 2.659780503550678, "step": 8240}, {"loss": 0.7048, "grad_norm": 0.7640451192855835, "learning_rate": 0.0002, "epoch": 2.6630083925112977, "step": 8250}, {"loss": 0.6846, "grad_norm": 0.5972344875335693, "learning_rate": 0.0002, "epoch": 2.6662362814719174, "step": 8260}, {"loss": 0.682, "grad_norm": 0.6935883164405823, "learning_rate": 0.0002, "epoch": 2.669464170432537, "step": 8270}, {"loss": 0.6625, "grad_norm": 0.789399266242981, "learning_rate": 0.0002, "epoch": 2.6726920593931567, "step": 8280}, {"loss": 0.6541, "grad_norm": 0.7143490314483643, "learning_rate": 0.0002, "epoch": 2.675919948353777, "step": 8290}, {"loss": 0.6741, "grad_norm": 0.6670652627944946, "learning_rate": 0.0002, "epoch": 2.6791478373143964, "step": 8300}, {"loss": 0.6936, "grad_norm": 0.687108039855957, "learning_rate": 0.0002, "epoch": 2.682375726275016, "step": 8310}, {"loss": 0.7124, "grad_norm": 0.7914147973060608, "learning_rate": 0.0002, "epoch": 2.6856036152356357, "step": 8320}, {"loss": 0.6584, "grad_norm": 0.8398420214653015, "learning_rate": 0.0002, "epoch": 2.688831504196256, "step": 8330}, {"loss": 0.6679, "grad_norm": 0.6592720746994019, "learning_rate": 0.0002, "epoch": 2.6920593931568755, "step": 8340}, {"loss": 0.6673, "grad_norm": 0.6888470649719238, "learning_rate": 0.0002, "epoch": 2.695287282117495, "step": 8350}, {"loss": 0.6483, "grad_norm": 0.7127556800842285, "learning_rate": 0.0002, "epoch": 2.698515171078115, "step": 8360}, {"loss": 0.7013, "grad_norm": 0.6630286574363708, "learning_rate": 0.0002, "epoch": 2.7017430600387344, "step": 8370}, {"loss": 0.6842, "grad_norm": 0.8261964321136475, "learning_rate": 0.0002, "epoch": 2.7049709489993545, "step": 8380}, {"loss": 0.6613, "grad_norm": 0.717339813709259, "learning_rate": 0.0002, "epoch": 2.708198837959974, "step": 8390}, {"loss": 0.6929, "grad_norm": 0.651637613773346, "learning_rate": 0.0002, "epoch": 2.711426726920594, "step": 8400}, {"loss": 0.6796, "grad_norm": 0.7936098575592041, "learning_rate": 0.0002, "epoch": 2.714654615881214, "step": 8410}, {"loss": 0.696, "grad_norm": 0.8761560320854187, "learning_rate": 0.0002, "epoch": 2.7178825048418336, "step": 8420}, {"loss": 0.6889, "grad_norm": 0.6768006086349487, "learning_rate": 0.0002, "epoch": 2.7211103938024532, "step": 8430}, {"loss": 0.6844, "grad_norm": 0.7121055722236633, "learning_rate": 0.0002, "epoch": 2.724338282763073, "step": 8440}, {"loss": 0.6608, "grad_norm": 0.6811696887016296, "learning_rate": 0.0002, "epoch": 2.7275661717236925, "step": 8450}, {"loss": 0.7046, "grad_norm": 0.8168250918388367, "learning_rate": 0.0002, "epoch": 2.730794060684312, "step": 8460}, {"loss": 0.6809, "grad_norm": 0.660682737827301, "learning_rate": 0.0002, "epoch": 2.7340219496449323, "step": 8470}, {"loss": 0.6916, "grad_norm": 0.7369356155395508, "learning_rate": 0.0002, "epoch": 2.737249838605552, "step": 8480}, {"loss": 0.6383, "grad_norm": 0.7545099854469299, "learning_rate": 0.0002, "epoch": 2.7404777275661716, "step": 8490}, {"loss": 0.6917, "grad_norm": 0.6991257667541504, "learning_rate": 0.0002, "epoch": 2.7437056165267917, "step": 8500}, {"loss": 0.6953, "grad_norm": 0.7195324301719666, "learning_rate": 0.0002, "epoch": 2.7469335054874113, "step": 8510}, {"loss": 0.6955, "grad_norm": 0.8995378017425537, "learning_rate": 0.0002, "epoch": 2.750161394448031, "step": 8520}, {"loss": 0.684, "grad_norm": 0.6924123764038086, "learning_rate": 0.0002, "epoch": 2.7533892834086506, "step": 8530}, {"loss": 0.6675, "grad_norm": 0.6260585784912109, "learning_rate": 0.0002, "epoch": 2.7566171723692703, "step": 8540}, {"loss": 0.6613, "grad_norm": 0.7273091673851013, "learning_rate": 0.0002, "epoch": 2.7598450613298904, "step": 8550}, {"loss": 0.6853, "grad_norm": 0.720562219619751, "learning_rate": 0.0002, "epoch": 2.76307295029051, "step": 8560}, {"loss": 0.6452, "grad_norm": 0.6360004544258118, "learning_rate": 0.0002, "epoch": 2.7663008392511297, "step": 8570}, {"loss": 0.6118, "grad_norm": 0.7634525895118713, "learning_rate": 0.0002, "epoch": 2.76952872821175, "step": 8580}, {"loss": 0.686, "grad_norm": 0.6586076021194458, "learning_rate": 0.0002, "epoch": 2.7727566171723694, "step": 8590}, {"loss": 0.7072, "grad_norm": 0.6542639136314392, "learning_rate": 0.0002, "epoch": 2.775984506132989, "step": 8600}, {"loss": 0.7126, "grad_norm": 0.7650290727615356, "learning_rate": 0.0002, "epoch": 2.7792123950936087, "step": 8610}, {"loss": 0.6923, "grad_norm": 0.6551542282104492, "learning_rate": 0.0002, "epoch": 2.7824402840542284, "step": 8620}, {"loss": 0.6937, "grad_norm": 0.6915501952171326, "learning_rate": 0.0002, "epoch": 2.785668173014848, "step": 8630}, {"loss": 0.6586, "grad_norm": 0.8061493635177612, "learning_rate": 0.0002, "epoch": 2.788896061975468, "step": 8640}, {"loss": 0.6853, "grad_norm": 0.8403584957122803, "learning_rate": 0.0002, "epoch": 2.792123950936088, "step": 8650}, {"loss": 0.6616, "grad_norm": 0.6455532312393188, "learning_rate": 0.0002, "epoch": 2.7953518398967074, "step": 8660}, {"loss": 0.6819, "grad_norm": 0.8296352028846741, "learning_rate": 0.0002, "epoch": 2.7985797288573275, "step": 8670}, {"loss": 0.6678, "grad_norm": 0.7288752794265747, "learning_rate": 0.0002, "epoch": 2.801807617817947, "step": 8680}, {"loss": 0.6778, "grad_norm": 0.7628464102745056, "learning_rate": 0.0002, "epoch": 2.805035506778567, "step": 8690}, {"loss": 0.7176, "grad_norm": 0.9993878602981567, "learning_rate": 0.0002, "epoch": 2.8082633957391865, "step": 8700}, {"loss": 0.6414, "grad_norm": 0.6972465515136719, "learning_rate": 0.0002, "epoch": 2.811491284699806, "step": 8710}, {"loss": 0.6777, "grad_norm": 0.645042896270752, "learning_rate": 0.0002, "epoch": 2.8147191736604262, "step": 8720}, {"loss": 0.6587, "grad_norm": 0.6853853464126587, "learning_rate": 0.0002, "epoch": 2.817947062621046, "step": 8730}, {"loss": 0.6405, "grad_norm": 0.5935067534446716, "learning_rate": 0.0002, "epoch": 2.8211749515816655, "step": 8740}, {"loss": 0.6674, "grad_norm": 0.7336633205413818, "learning_rate": 0.0002, "epoch": 2.824402840542285, "step": 8750}, {"loss": 0.6662, "grad_norm": 0.7074962854385376, "learning_rate": 0.0002, "epoch": 2.8276307295029053, "step": 8760}, {"loss": 0.6744, "grad_norm": 0.6667559742927551, "learning_rate": 0.0002, "epoch": 2.830858618463525, "step": 8770}, {"loss": 0.7142, "grad_norm": 0.8101205229759216, "learning_rate": 0.0002, "epoch": 2.8340865074241446, "step": 8780}, {"loss": 0.6727, "grad_norm": 0.8841480016708374, "learning_rate": 0.0002, "epoch": 2.8373143963847642, "step": 8790}, {"loss": 0.6601, "grad_norm": 0.5891591310501099, "learning_rate": 0.0002, "epoch": 2.840542285345384, "step": 8800}, {"loss": 0.7114, "grad_norm": 0.667032778263092, "learning_rate": 0.0002, "epoch": 2.843770174306004, "step": 8810}, {"loss": 0.7295, "grad_norm": 0.7629773020744324, "learning_rate": 0.0002, "epoch": 2.8469980632666236, "step": 8820}, {"loss": 0.703, "grad_norm": 0.79471355676651, "learning_rate": 0.0002, "epoch": 2.8502259522272433, "step": 8830}, {"loss": 0.7278, "grad_norm": 0.7529178261756897, "learning_rate": 0.0002, "epoch": 2.8534538411878634, "step": 8840}, {"loss": 0.7163, "grad_norm": 0.7014923691749573, "learning_rate": 0.0002, "epoch": 2.856681730148483, "step": 8850}, {"loss": 0.6803, "grad_norm": 0.7996514439582825, "learning_rate": 0.0002, "epoch": 2.8599096191091027, "step": 8860}, {"loss": 0.6562, "grad_norm": 0.7044785618782043, "learning_rate": 0.0002, "epoch": 2.8631375080697223, "step": 8870}, {"loss": 0.6966, "grad_norm": 0.6792093515396118, "learning_rate": 0.0002, "epoch": 2.866365397030342, "step": 8880}, {"loss": 0.685, "grad_norm": 0.69175124168396, "learning_rate": 0.0002, "epoch": 2.8695932859909616, "step": 8890}, {"loss": 0.7225, "grad_norm": 0.7499129176139832, "learning_rate": 0.0002, "epoch": 2.8728211749515817, "step": 8900}, {"loss": 0.6922, "grad_norm": 0.7678789496421814, "learning_rate": 0.0002, "epoch": 2.8760490639122014, "step": 8910}, {"loss": 0.6803, "grad_norm": 0.7478128671646118, "learning_rate": 0.0002, "epoch": 2.879276952872821, "step": 8920}, {"loss": 0.6689, "grad_norm": 0.6767086386680603, "learning_rate": 0.0002, "epoch": 2.882504841833441, "step": 8930}, {"loss": 0.6587, "grad_norm": 0.7222196459770203, "learning_rate": 0.0002, "epoch": 2.885732730794061, "step": 8940}, {"loss": 0.6472, "grad_norm": 0.6950580477714539, "learning_rate": 0.0002, "epoch": 2.8889606197546804, "step": 8950}, {"loss": 0.7064, "grad_norm": 0.7759528160095215, "learning_rate": 0.0002, "epoch": 2.8921885087153, "step": 8960}, {"loss": 0.6349, "grad_norm": 0.6686919927597046, "learning_rate": 0.0002, "epoch": 2.8954163976759197, "step": 8970}, {"loss": 0.6801, "grad_norm": 0.9245954751968384, "learning_rate": 0.0002, "epoch": 2.89864428663654, "step": 8980}, {"loss": 0.6703, "grad_norm": 0.8734814524650574, "learning_rate": 0.0002, "epoch": 2.9018721755971595, "step": 8990}, {"loss": 0.6716, "grad_norm": 0.6056219339370728, "learning_rate": 0.0002, "epoch": 2.905100064557779, "step": 9000}, {"loss": 0.6535, "grad_norm": 0.7364102005958557, "learning_rate": 0.0002, "epoch": 2.9083279535183992, "step": 9010}, {"loss": 0.707, "grad_norm": 0.6563605070114136, "learning_rate": 0.0002, "epoch": 2.911555842479019, "step": 9020}, {"loss": 0.6564, "grad_norm": 0.659978985786438, "learning_rate": 0.0002, "epoch": 2.9147837314396385, "step": 9030}, {"loss": 0.7154, "grad_norm": 0.8176041841506958, "learning_rate": 0.0002, "epoch": 2.918011620400258, "step": 9040}, {"loss": 0.72, "grad_norm": 0.743677020072937, "learning_rate": 0.0002, "epoch": 2.921239509360878, "step": 9050}, {"loss": 0.7017, "grad_norm": 0.7418383359909058, "learning_rate": 0.0002, "epoch": 2.9244673983214975, "step": 9060}, {"loss": 0.6635, "grad_norm": 0.6916524767875671, "learning_rate": 0.0002, "epoch": 2.9276952872821176, "step": 9070}, {"loss": 0.6502, "grad_norm": 0.6559975743293762, "learning_rate": 0.0002, "epoch": 2.9309231762427372, "step": 9080}, {"loss": 0.7016, "grad_norm": 0.7431221008300781, "learning_rate": 0.0002, "epoch": 2.934151065203357, "step": 9090}, {"loss": 0.6829, "grad_norm": 0.7525941133499146, "learning_rate": 0.0002, "epoch": 2.937378954163977, "step": 9100}, {"loss": 0.7073, "grad_norm": 0.6860167384147644, "learning_rate": 0.0002, "epoch": 2.9406068431245966, "step": 9110}, {"loss": 0.6912, "grad_norm": 0.6467666029930115, "learning_rate": 0.0002, "epoch": 2.9438347320852163, "step": 9120}, {"loss": 0.7122, "grad_norm": 0.7595751285552979, "learning_rate": 0.0002, "epoch": 2.947062621045836, "step": 9130}, {"loss": 0.6951, "grad_norm": 0.6558279991149902, "learning_rate": 0.0002, "epoch": 2.9502905100064556, "step": 9140}, {"loss": 0.7081, "grad_norm": 0.6818708181381226, "learning_rate": 0.0002, "epoch": 2.9535183989670757, "step": 9150}, {"loss": 0.6921, "grad_norm": 0.8387085795402527, "learning_rate": 0.0002, "epoch": 2.9567462879276953, "step": 9160}, {"loss": 0.6914, "grad_norm": 0.7705109715461731, "learning_rate": 0.0002, "epoch": 2.959974176888315, "step": 9170}, {"loss": 0.6849, "grad_norm": 0.688106894493103, "learning_rate": 0.0002, "epoch": 2.9632020658489346, "step": 9180}, {"loss": 0.6833, "grad_norm": 0.659532368183136, "learning_rate": 0.0002, "epoch": 2.9664299548095547, "step": 9190}, {"loss": 0.6383, "grad_norm": 0.6839388608932495, "learning_rate": 0.0002, "epoch": 2.9696578437701744, "step": 9200}, {"loss": 0.6952, "grad_norm": 0.6927599310874939, "learning_rate": 0.0002, "epoch": 2.972885732730794, "step": 9210}, {"loss": 0.7338, "grad_norm": 0.6902472972869873, "learning_rate": 0.0002, "epoch": 2.9761136216914137, "step": 9220}, {"loss": 0.6671, "grad_norm": 0.620399534702301, "learning_rate": 0.0002, "epoch": 2.9793415106520333, "step": 9230}, {"loss": 0.6588, "grad_norm": 0.6812364459037781, "learning_rate": 0.0002, "epoch": 2.9825693996126534, "step": 9240}, {"loss": 0.6957, "grad_norm": 0.7681456208229065, "learning_rate": 0.0002, "epoch": 2.985797288573273, "step": 9250}, {"loss": 0.7113, "grad_norm": 0.7621907591819763, "learning_rate": 0.0002, "epoch": 2.9890251775338927, "step": 9260}, {"loss": 0.6601, "grad_norm": 0.6075740456581116, "learning_rate": 0.0002, "epoch": 2.992253066494513, "step": 9270}, {"loss": 0.6758, "grad_norm": 0.7100434899330139, "learning_rate": 0.0002, "epoch": 2.9954809554551325, "step": 9280}, {"loss": 0.73, "grad_norm": 0.7314488887786865, "learning_rate": 0.0002, "epoch": 2.998708844415752, "step": 9290}, {"eval_loss": 1.1434104442596436, "eval_runtime": 166.3732, "eval_samples_per_second": 4.406, "eval_steps_per_second": 0.553, "epoch": 3.0, "step": 9294}, {"loss": 0.6401, "grad_norm": 0.7408893704414368, "learning_rate": 0.0002, "epoch": 3.001936733376372, "step": 9300}, {"loss": 0.5182, "grad_norm": 0.9773574471473694, "learning_rate": 0.0002, "epoch": 3.0051646223369914, "step": 9310}, {"loss": 0.5432, "grad_norm": 0.7919653058052063, "learning_rate": 0.0002, "epoch": 3.0083925112976115, "step": 9320}, {"loss": 0.6156, "grad_norm": 0.9139202833175659, "learning_rate": 0.0002, "epoch": 3.011620400258231, "step": 9330}, {"loss": 0.5736, "grad_norm": 0.8296737670898438, "learning_rate": 0.0002, "epoch": 3.014848289218851, "step": 9340}, {"loss": 0.5567, "grad_norm": 0.786868155002594, "learning_rate": 0.0002, "epoch": 3.0180761781794705, "step": 9350}, {"loss": 0.578, "grad_norm": 0.5928055644035339, "learning_rate": 0.0002, "epoch": 3.0213040671400906, "step": 9360}, {"loss": 0.5376, "grad_norm": 0.8785701394081116, "learning_rate": 0.0002, "epoch": 3.0245319561007102, "step": 9370}, {"loss": 0.5664, "grad_norm": 0.7978872060775757, "learning_rate": 0.0002, "epoch": 3.02775984506133, "step": 9380}, {"loss": 0.5797, "grad_norm": 0.7160913348197937, "learning_rate": 0.0002, "epoch": 3.0309877340219495, "step": 9390}, {"loss": 0.5777, "grad_norm": 0.904465913772583, "learning_rate": 0.0002, "epoch": 3.034215622982569, "step": 9400}, {"loss": 0.5518, "grad_norm": 0.7082195281982422, "learning_rate": 0.0002, "epoch": 3.0374435119431893, "step": 9410}, {"loss": 0.5434, "grad_norm": 0.9686778783798218, "learning_rate": 0.0002, "epoch": 3.040671400903809, "step": 9420}, {"loss": 0.5692, "grad_norm": 0.8788613677024841, "learning_rate": 0.0002, "epoch": 3.0438992898644286, "step": 9430}, {"loss": 0.5599, "grad_norm": 0.8217582106590271, "learning_rate": 0.0002, "epoch": 3.0471271788250482, "step": 9440}, {"loss": 0.5405, "grad_norm": 0.7380914092063904, "learning_rate": 0.0002, "epoch": 3.0503550677856683, "step": 9450}, {"loss": 0.6258, "grad_norm": 0.7339285612106323, "learning_rate": 0.0002, "epoch": 3.053582956746288, "step": 9460}, {"loss": 0.5646, "grad_norm": 0.7175183296203613, "learning_rate": 0.0002, "epoch": 3.0568108457069076, "step": 9470}, {"loss": 0.5667, "grad_norm": 0.8275379538536072, "learning_rate": 0.0002, "epoch": 3.0600387346675273, "step": 9480}, {"loss": 0.5868, "grad_norm": 0.6544256806373596, "learning_rate": 0.0002, "epoch": 3.0632666236281474, "step": 9490}, {"loss": 0.5365, "grad_norm": 0.8193472623825073, "learning_rate": 0.0002, "epoch": 3.066494512588767, "step": 9500}, {"loss": 0.5614, "grad_norm": 0.7967836856842041, "learning_rate": 0.0002, "epoch": 3.0697224015493867, "step": 9510}, {"loss": 0.5629, "grad_norm": 0.8788684010505676, "learning_rate": 0.0002, "epoch": 3.0729502905100063, "step": 9520}, {"loss": 0.5397, "grad_norm": 0.9410629868507385, "learning_rate": 0.0002, "epoch": 3.0761781794706264, "step": 9530}, {"loss": 0.5473, "grad_norm": 0.7448706030845642, "learning_rate": 0.0002, "epoch": 3.079406068431246, "step": 9540}, {"loss": 0.5774, "grad_norm": 0.9149372577667236, "learning_rate": 0.0002, "epoch": 3.0826339573918657, "step": 9550}, {"loss": 0.5347, "grad_norm": 0.7265563607215881, "learning_rate": 0.0002, "epoch": 3.0858618463524854, "step": 9560}, {"loss": 0.5487, "grad_norm": 1.0305068492889404, "learning_rate": 0.0002, "epoch": 3.089089735313105, "step": 9570}, {"loss": 0.5884, "grad_norm": 0.7987357974052429, "learning_rate": 0.0002, "epoch": 3.092317624273725, "step": 9580}, {"loss": 0.6216, "grad_norm": 0.7733123898506165, "learning_rate": 0.0002, "epoch": 3.095545513234345, "step": 9590}, {"loss": 0.5848, "grad_norm": 1.0438069105148315, "learning_rate": 0.0002, "epoch": 3.0987734021949644, "step": 9600}, {"loss": 0.5612, "grad_norm": 0.7951784729957581, "learning_rate": 0.0002, "epoch": 3.102001291155584, "step": 9610}, {"loss": 0.6184, "grad_norm": 0.7776783108711243, "learning_rate": 0.0002, "epoch": 3.105229180116204, "step": 9620}, {"loss": 0.5626, "grad_norm": 0.7060676217079163, "learning_rate": 0.0002, "epoch": 3.108457069076824, "step": 9630}, {"loss": 0.5731, "grad_norm": 0.871569037437439, "learning_rate": 0.0002, "epoch": 3.1116849580374435, "step": 9640}, {"loss": 0.5168, "grad_norm": 0.8873385787010193, "learning_rate": 0.0002, "epoch": 3.114912846998063, "step": 9650}, {"loss": 0.5985, "grad_norm": 0.750998318195343, "learning_rate": 0.0002, "epoch": 3.118140735958683, "step": 9660}, {"loss": 0.5741, "grad_norm": 0.8678529262542725, "learning_rate": 0.0002, "epoch": 3.121368624919303, "step": 9670}, {"loss": 0.5831, "grad_norm": 0.7706599235534668, "learning_rate": 0.0002, "epoch": 3.1245965138799225, "step": 9680}, {"loss": 0.6142, "grad_norm": 0.8317574858665466, "learning_rate": 0.0002, "epoch": 3.127824402840542, "step": 9690}, {"loss": 0.5634, "grad_norm": 0.801800012588501, "learning_rate": 0.0002, "epoch": 3.131052291801162, "step": 9700}, {"loss": 0.6044, "grad_norm": 0.8574623465538025, "learning_rate": 0.0002, "epoch": 3.134280180761782, "step": 9710}, {"loss": 0.6072, "grad_norm": 0.6556540727615356, "learning_rate": 0.0002, "epoch": 3.1375080697224016, "step": 9720}, {"loss": 0.6058, "grad_norm": 0.8555161952972412, "learning_rate": 0.0002, "epoch": 3.1407359586830212, "step": 9730}, {"loss": 0.6069, "grad_norm": 0.8825467824935913, "learning_rate": 0.0002, "epoch": 3.143963847643641, "step": 9740}, {"loss": 0.5689, "grad_norm": 0.8297156691551208, "learning_rate": 0.0002, "epoch": 3.147191736604261, "step": 9750}, {"loss": 0.5738, "grad_norm": 0.7710384726524353, "learning_rate": 0.0002, "epoch": 3.1504196255648806, "step": 9760}, {"loss": 0.571, "grad_norm": 0.8778039216995239, "learning_rate": 0.0002, "epoch": 3.1536475145255003, "step": 9770}, {"loss": 0.5913, "grad_norm": 0.9014058113098145, "learning_rate": 0.0002, "epoch": 3.15687540348612, "step": 9780}, {"loss": 0.5496, "grad_norm": 0.6856890320777893, "learning_rate": 0.0002, "epoch": 3.16010329244674, "step": 9790}, {"loss": 0.558, "grad_norm": 0.6520644426345825, "learning_rate": 0.0002, "epoch": 3.1633311814073597, "step": 9800}, {"loss": 0.6024, "grad_norm": 0.7250499129295349, "learning_rate": 0.0002, "epoch": 3.1665590703679793, "step": 9810}, {"loss": 0.5823, "grad_norm": 0.8331542015075684, "learning_rate": 0.0002, "epoch": 3.169786959328599, "step": 9820}, {"loss": 0.5803, "grad_norm": 0.8531261682510376, "learning_rate": 0.0002, "epoch": 3.1730148482892186, "step": 9830}, {"loss": 0.57, "grad_norm": 0.8997558355331421, "learning_rate": 0.0002, "epoch": 3.1762427372498387, "step": 9840}, {"loss": 0.5921, "grad_norm": 0.708335280418396, "learning_rate": 0.0002, "epoch": 3.1794706262104584, "step": 9850}, {"loss": 0.5997, "grad_norm": 1.0074886083602905, "learning_rate": 0.0002, "epoch": 3.182698515171078, "step": 9860}, {"loss": 0.573, "grad_norm": 1.0804681777954102, "learning_rate": 0.0002, "epoch": 3.1859264041316977, "step": 9870}, {"loss": 0.5527, "grad_norm": 0.9510730504989624, "learning_rate": 0.0002, "epoch": 3.189154293092318, "step": 9880}, {"loss": 0.6401, "grad_norm": 0.7211061716079712, "learning_rate": 0.0002, "epoch": 3.1923821820529374, "step": 9890}, {"loss": 0.5563, "grad_norm": 0.8767086267471313, "learning_rate": 0.0002, "epoch": 3.195610071013557, "step": 9900}, {"loss": 0.5747, "grad_norm": 0.8388153314590454, "learning_rate": 0.0002, "epoch": 3.1988379599741767, "step": 9910}, {"loss": 0.5681, "grad_norm": 0.8038473725318909, "learning_rate": 0.0002, "epoch": 3.202065848934797, "step": 9920}, {"loss": 0.5594, "grad_norm": 0.8187747001647949, "learning_rate": 0.0002, "epoch": 3.2052937378954165, "step": 9930}, {"loss": 0.5813, "grad_norm": 0.7427355051040649, "learning_rate": 0.0002, "epoch": 3.208521626856036, "step": 9940}, {"loss": 0.5709, "grad_norm": 0.8017025589942932, "learning_rate": 0.0002, "epoch": 3.211749515816656, "step": 9950}, {"loss": 0.6106, "grad_norm": 0.738595187664032, "learning_rate": 0.0002, "epoch": 3.214977404777276, "step": 9960}, {"loss": 0.6006, "grad_norm": 0.7521342039108276, "learning_rate": 0.0002, "epoch": 3.2182052937378955, "step": 9970}, {"loss": 0.5706, "grad_norm": 0.840329110622406, "learning_rate": 0.0002, "epoch": 3.221433182698515, "step": 9980}, {"loss": 0.5666, "grad_norm": 0.9809671640396118, "learning_rate": 0.0002, "epoch": 3.224661071659135, "step": 9990}, {"loss": 0.6223, "grad_norm": 0.8456943035125732, "learning_rate": 0.0002, "epoch": 3.2278889606197545, "step": 10000}, {"loss": 0.5798, "grad_norm": 0.8962995409965515, "learning_rate": 0.0002, "epoch": 3.2311168495803746, "step": 10010}, {"loss": 0.5399, "grad_norm": 0.6492817401885986, "learning_rate": 0.0002, "epoch": 3.2343447385409942, "step": 10020}, {"loss": 0.5678, "grad_norm": 1.0471255779266357, "learning_rate": 0.0002, "epoch": 3.237572627501614, "step": 10030}, {"loss": 0.5452, "grad_norm": 0.7995471358299255, "learning_rate": 0.0002, "epoch": 3.2408005164622335, "step": 10040}, {"loss": 0.615, "grad_norm": 0.7231964468955994, "learning_rate": 0.0002, "epoch": 3.2440284054228536, "step": 10050}, {"loss": 0.5586, "grad_norm": 0.639630138874054, "learning_rate": 0.0002, "epoch": 3.2472562943834733, "step": 10060}, {"loss": 0.6271, "grad_norm": 0.7957055568695068, "learning_rate": 0.0002, "epoch": 3.250484183344093, "step": 10070}, {"loss": 0.5845, "grad_norm": 0.7735482454299927, "learning_rate": 0.0002, "epoch": 3.2537120723047126, "step": 10080}, {"loss": 0.5791, "grad_norm": 0.8139488101005554, "learning_rate": 0.0002, "epoch": 3.2569399612653323, "step": 10090}, {"loss": 0.6049, "grad_norm": 0.8113240003585815, "learning_rate": 0.0002, "epoch": 3.2601678502259523, "step": 10100}, {"loss": 0.5617, "grad_norm": 0.7735909819602966, "learning_rate": 0.0002, "epoch": 3.263395739186572, "step": 10110}, {"loss": 0.5964, "grad_norm": 0.7760744094848633, "learning_rate": 0.0002, "epoch": 3.2666236281471916, "step": 10120}, {"loss": 0.5786, "grad_norm": 0.8078505396842957, "learning_rate": 0.0002, "epoch": 3.2698515171078113, "step": 10130}, {"loss": 0.5904, "grad_norm": 0.983648955821991, "learning_rate": 0.0002, "epoch": 3.2730794060684314, "step": 10140}, {"loss": 0.596, "grad_norm": 0.7131832242012024, "learning_rate": 0.0002, "epoch": 3.276307295029051, "step": 10150}, {"loss": 0.5986, "grad_norm": 0.924493134021759, "learning_rate": 0.0002, "epoch": 3.2795351839896707, "step": 10160}, {"loss": 0.5733, "grad_norm": 0.9371112585067749, "learning_rate": 0.0002, "epoch": 3.2827630729502904, "step": 10170}, {"loss": 0.5891, "grad_norm": 0.8989261388778687, "learning_rate": 0.0002, "epoch": 3.2859909619109104, "step": 10180}, {"loss": 0.6143, "grad_norm": 0.8130394816398621, "learning_rate": 0.0002, "epoch": 3.28921885087153, "step": 10190}, {"loss": 0.5555, "grad_norm": 0.9899941086769104, "learning_rate": 0.0002, "epoch": 3.2924467398321497, "step": 10200}, {"loss": 0.5899, "grad_norm": 1.007038950920105, "learning_rate": 0.0002, "epoch": 3.2956746287927694, "step": 10210}, {"loss": 0.5713, "grad_norm": 0.7465066313743591, "learning_rate": 0.0002, "epoch": 3.2989025177533895, "step": 10220}, {"loss": 0.6307, "grad_norm": 0.7202590703964233, "learning_rate": 0.0002, "epoch": 3.302130406714009, "step": 10230}, {"loss": 0.5659, "grad_norm": 0.6258249282836914, "learning_rate": 0.0002, "epoch": 3.305358295674629, "step": 10240}, {"loss": 0.5869, "grad_norm": 0.8996058702468872, "learning_rate": 0.0002, "epoch": 3.3085861846352485, "step": 10250}, {"loss": 0.5825, "grad_norm": 0.9550982713699341, "learning_rate": 0.0002, "epoch": 3.311814073595868, "step": 10260}, {"loss": 0.5602, "grad_norm": 0.7010059952735901, "learning_rate": 0.0002, "epoch": 3.315041962556488, "step": 10270}, {"loss": 0.5853, "grad_norm": 0.9639869332313538, "learning_rate": 0.0002, "epoch": 3.318269851517108, "step": 10280}, {"loss": 0.5362, "grad_norm": 1.0192502737045288, "learning_rate": 0.0002, "epoch": 3.3214977404777275, "step": 10290}, {"loss": 0.5605, "grad_norm": 0.7953670024871826, "learning_rate": 0.0002, "epoch": 3.324725629438347, "step": 10300}, {"loss": 0.6386, "grad_norm": 0.7436774969100952, "learning_rate": 0.0002, "epoch": 3.3279535183989672, "step": 10310}, {"loss": 0.5823, "grad_norm": 0.7846777439117432, "learning_rate": 0.0002, "epoch": 3.331181407359587, "step": 10320}, {"loss": 0.6119, "grad_norm": 0.8963494896888733, "learning_rate": 0.0002, "epoch": 3.3344092963202066, "step": 10330}, {"loss": 0.5872, "grad_norm": 0.6876392364501953, "learning_rate": 0.0002, "epoch": 3.337637185280826, "step": 10340}, {"loss": 0.6291, "grad_norm": 0.9161638021469116, "learning_rate": 0.0002, "epoch": 3.340865074241446, "step": 10350}, {"loss": 0.5955, "grad_norm": 0.8964458107948303, "learning_rate": 0.0002, "epoch": 3.344092963202066, "step": 10360}, {"loss": 0.5965, "grad_norm": 0.9052296280860901, "learning_rate": 0.0002, "epoch": 3.3473208521626856, "step": 10370}, {"loss": 0.5958, "grad_norm": 0.9292596578598022, "learning_rate": 0.0002, "epoch": 3.3505487411233053, "step": 10380}, {"loss": 0.5487, "grad_norm": 0.9605957269668579, "learning_rate": 0.0002, "epoch": 3.3537766300839253, "step": 10390}, {"loss": 0.6214, "grad_norm": 1.0198872089385986, "learning_rate": 0.0002, "epoch": 3.357004519044545, "step": 10400}, {"loss": 0.6053, "grad_norm": 0.7043630480766296, "learning_rate": 0.0002, "epoch": 3.3602324080051647, "step": 10410}, {"loss": 0.5451, "grad_norm": 1.0533326864242554, "learning_rate": 0.0002, "epoch": 3.3634602969657843, "step": 10420}, {"loss": 0.6134, "grad_norm": 0.7552485466003418, "learning_rate": 0.0002, "epoch": 3.366688185926404, "step": 10430}, {"loss": 0.631, "grad_norm": 0.692708432674408, "learning_rate": 0.0002, "epoch": 3.369916074887024, "step": 10440}, {"loss": 0.631, "grad_norm": 0.985952615737915, "learning_rate": 0.0002, "epoch": 3.3731439638476437, "step": 10450}, {"loss": 0.5689, "grad_norm": 0.6749676465988159, "learning_rate": 0.0002, "epoch": 3.3763718528082634, "step": 10460}, {"loss": 0.5724, "grad_norm": 0.9514535665512085, "learning_rate": 0.0002, "epoch": 3.379599741768883, "step": 10470}, {"loss": 0.5982, "grad_norm": 1.2681142091751099, "learning_rate": 0.0002, "epoch": 3.382827630729503, "step": 10480}, {"loss": 0.5778, "grad_norm": 1.031968355178833, "learning_rate": 0.0002, "epoch": 3.3860555196901228, "step": 10490}, {"loss": 0.5964, "grad_norm": 0.8061563968658447, "learning_rate": 0.0002, "epoch": 3.3892834086507424, "step": 10500}, {"loss": 0.6094, "grad_norm": 1.0515062808990479, "learning_rate": 0.0002, "epoch": 3.392511297611362, "step": 10510}, {"loss": 0.542, "grad_norm": 0.9055540561676025, "learning_rate": 0.0002, "epoch": 3.3957391865719817, "step": 10520}, {"loss": 0.6148, "grad_norm": 0.9318141341209412, "learning_rate": 0.0002, "epoch": 3.398967075532602, "step": 10530}, {"loss": 0.5722, "grad_norm": 0.8266817331314087, "learning_rate": 0.0002, "epoch": 3.4021949644932215, "step": 10540}, {"loss": 0.6015, "grad_norm": 1.2322112321853638, "learning_rate": 0.0002, "epoch": 3.405422853453841, "step": 10550}, {"loss": 0.6215, "grad_norm": 0.9535136818885803, "learning_rate": 0.0002, "epoch": 3.4086507424144608, "step": 10560}, {"loss": 0.561, "grad_norm": 0.9243819117546082, "learning_rate": 0.0002, "epoch": 3.411878631375081, "step": 10570}, {"loss": 0.5844, "grad_norm": 0.9011809825897217, "learning_rate": 0.0002, "epoch": 3.4151065203357005, "step": 10580}, {"loss": 0.6175, "grad_norm": 0.9923036694526672, "learning_rate": 0.0002, "epoch": 3.41833440929632, "step": 10590}, {"loss": 0.6033, "grad_norm": 0.8903067111968994, "learning_rate": 0.0002, "epoch": 3.42156229825694, "step": 10600}, {"loss": 0.5563, "grad_norm": 0.7101534605026245, "learning_rate": 0.0002, "epoch": 3.42479018721756, "step": 10610}, {"loss": 0.598, "grad_norm": 0.8186570405960083, "learning_rate": 0.0002, "epoch": 3.4280180761781796, "step": 10620}, {"loss": 0.5897, "grad_norm": 0.9480205774307251, "learning_rate": 0.0002, "epoch": 3.431245965138799, "step": 10630}, {"loss": 0.5798, "grad_norm": 1.1370961666107178, "learning_rate": 0.0002, "epoch": 3.434473854099419, "step": 10640}, {"loss": 0.5779, "grad_norm": 1.017669677734375, "learning_rate": 0.0002, "epoch": 3.437701743060039, "step": 10650}, {"loss": 0.5999, "grad_norm": 0.7625100016593933, "learning_rate": 0.0002, "epoch": 3.4409296320206586, "step": 10660}, {"loss": 0.5705, "grad_norm": 0.9288196563720703, "learning_rate": 0.0002, "epoch": 3.4441575209812783, "step": 10670}, {"loss": 0.6255, "grad_norm": 0.8800460696220398, "learning_rate": 0.0002, "epoch": 3.447385409941898, "step": 10680}, {"loss": 0.6245, "grad_norm": 0.7499661445617676, "learning_rate": 0.0002, "epoch": 3.4506132989025176, "step": 10690}, {"loss": 0.5979, "grad_norm": 0.8254973292350769, "learning_rate": 0.0002, "epoch": 3.4538411878631377, "step": 10700}, {"loss": 0.5742, "grad_norm": 0.8735857605934143, "learning_rate": 0.0002, "epoch": 3.4570690768237573, "step": 10710}, {"loss": 0.6356, "grad_norm": 0.9601819515228271, "learning_rate": 0.0002, "epoch": 3.460296965784377, "step": 10720}, {"loss": 0.5574, "grad_norm": 0.8031058311462402, "learning_rate": 0.0002, "epoch": 3.4635248547449966, "step": 10730}, {"loss": 0.6078, "grad_norm": 0.8039247393608093, "learning_rate": 0.0002, "epoch": 3.4667527437056167, "step": 10740}, {"loss": 0.593, "grad_norm": 0.8936953544616699, "learning_rate": 0.0002, "epoch": 3.4699806326662364, "step": 10750}, {"loss": 0.5971, "grad_norm": 0.8201186060905457, "learning_rate": 0.0002, "epoch": 3.473208521626856, "step": 10760}, {"loss": 0.5875, "grad_norm": 1.0064148902893066, "learning_rate": 0.0002, "epoch": 3.4764364105874757, "step": 10770}, {"loss": 0.5639, "grad_norm": 0.8617483377456665, "learning_rate": 0.0002, "epoch": 3.4796642995480953, "step": 10780}, {"loss": 0.6022, "grad_norm": 0.8532096147537231, "learning_rate": 0.0002, "epoch": 3.4828921885087154, "step": 10790}, {"loss": 0.5765, "grad_norm": 0.8646879196166992, "learning_rate": 0.0002, "epoch": 3.486120077469335, "step": 10800}, {"loss": 0.5799, "grad_norm": 0.7962660789489746, "learning_rate": 0.0002, "epoch": 3.4893479664299547, "step": 10810}, {"loss": 0.5398, "grad_norm": 0.9560028314590454, "learning_rate": 0.0002, "epoch": 3.492575855390575, "step": 10820}, {"loss": 0.6082, "grad_norm": 0.928439736366272, "learning_rate": 0.0002, "epoch": 3.4958037443511945, "step": 10830}, {"loss": 0.6112, "grad_norm": 0.8219282627105713, "learning_rate": 0.0002, "epoch": 3.499031633311814, "step": 10840}, {"loss": 0.6369, "grad_norm": 0.7918338179588318, "learning_rate": 0.0002, "epoch": 3.5022595222724338, "step": 10850}, {"loss": 0.6164, "grad_norm": 0.961295485496521, "learning_rate": 0.0002, "epoch": 3.5054874112330534, "step": 10860}, {"loss": 0.5534, "grad_norm": 1.0731624364852905, "learning_rate": 0.0002, "epoch": 3.5087153001936735, "step": 10870}, {"loss": 0.5829, "grad_norm": 0.9551863074302673, "learning_rate": 0.0002, "epoch": 3.511943189154293, "step": 10880}, {"loss": 0.5746, "grad_norm": 0.8409819602966309, "learning_rate": 0.0002, "epoch": 3.515171078114913, "step": 10890}, {"loss": 0.5813, "grad_norm": 0.7546320557594299, "learning_rate": 0.0002, "epoch": 3.5183989670755325, "step": 10900}, {"loss": 0.6184, "grad_norm": 0.7505252361297607, "learning_rate": 0.0002, "epoch": 3.5216268560361526, "step": 10910}, {"loss": 0.5649, "grad_norm": 0.7505561113357544, "learning_rate": 0.0002, "epoch": 3.524854744996772, "step": 10920}, {"loss": 0.6277, "grad_norm": 1.086177945137024, "learning_rate": 0.0002, "epoch": 3.528082633957392, "step": 10930}, {"loss": 0.5983, "grad_norm": 0.7721118330955505, "learning_rate": 0.0002, "epoch": 3.5313105229180115, "step": 10940}, {"loss": 0.5919, "grad_norm": 0.9567878246307373, "learning_rate": 0.0002, "epoch": 3.534538411878631, "step": 10950}, {"loss": 0.6261, "grad_norm": 0.8377360105514526, "learning_rate": 0.0002, "epoch": 3.5377663008392513, "step": 10960}, {"loss": 0.633, "grad_norm": 1.0174858570098877, "learning_rate": 0.0002, "epoch": 3.540994189799871, "step": 10970}, {"loss": 0.599, "grad_norm": 0.8164418935775757, "learning_rate": 0.0002, "epoch": 3.5442220787604906, "step": 10980}, {"loss": 0.5471, "grad_norm": 0.8959241509437561, "learning_rate": 0.0002, "epoch": 3.5474499677211107, "step": 10990}, {"loss": 0.6195, "grad_norm": 1.0154379606246948, "learning_rate": 0.0002, "epoch": 3.5506778566817303, "step": 11000}, {"loss": 0.5835, "grad_norm": 0.7812292575836182, "learning_rate": 0.0002, "epoch": 3.55390574564235, "step": 11010}, {"loss": 0.6052, "grad_norm": 0.9849029779434204, "learning_rate": 0.0002, "epoch": 3.5571336346029696, "step": 11020}, {"loss": 0.5689, "grad_norm": 0.8826184272766113, "learning_rate": 0.0002, "epoch": 3.5603615235635893, "step": 11030}, {"loss": 0.601, "grad_norm": 0.9039685726165771, "learning_rate": 0.0002, "epoch": 3.563589412524209, "step": 11040}, {"loss": 0.5996, "grad_norm": 0.9585249423980713, "learning_rate": 0.0002, "epoch": 3.566817301484829, "step": 11050}, {"loss": 0.5714, "grad_norm": 0.8083069324493408, "learning_rate": 0.0002, "epoch": 3.5700451904454487, "step": 11060}, {"loss": 0.6317, "grad_norm": 0.9528678059577942, "learning_rate": 0.0002, "epoch": 3.5732730794060683, "step": 11070}, {"loss": 0.6278, "grad_norm": 0.8297588229179382, "learning_rate": 0.0002, "epoch": 3.5765009683666884, "step": 11080}, {"loss": 0.5919, "grad_norm": 0.8191716074943542, "learning_rate": 0.0002, "epoch": 3.579728857327308, "step": 11090}, {"loss": 0.5971, "grad_norm": 0.8056275844573975, "learning_rate": 0.0002, "epoch": 3.5829567462879277, "step": 11100}, {"loss": 0.6325, "grad_norm": 0.701930582523346, "learning_rate": 0.0002, "epoch": 3.5861846352485474, "step": 11110}, {"loss": 0.6088, "grad_norm": 0.7644643187522888, "learning_rate": 0.0002, "epoch": 3.589412524209167, "step": 11120}, {"loss": 0.605, "grad_norm": 0.668004035949707, "learning_rate": 0.0002, "epoch": 3.592640413169787, "step": 11130}, {"loss": 0.5735, "grad_norm": 0.8849539756774902, "learning_rate": 0.0002, "epoch": 3.5958683021304068, "step": 11140}, {"loss": 0.6412, "grad_norm": 0.8123571276664734, "learning_rate": 0.0002, "epoch": 3.5990961910910264, "step": 11150}, {"loss": 0.5626, "grad_norm": 0.7591469287872314, "learning_rate": 0.0002, "epoch": 3.602324080051646, "step": 11160}, {"loss": 0.5668, "grad_norm": 0.776466965675354, "learning_rate": 0.0002, "epoch": 3.605551969012266, "step": 11170}, {"loss": 0.6631, "grad_norm": 0.9156150221824646, "learning_rate": 0.0002, "epoch": 3.608779857972886, "step": 11180}, {"loss": 0.5867, "grad_norm": 0.7517618536949158, "learning_rate": 0.0002, "epoch": 3.6120077469335055, "step": 11190}, {"loss": 0.5939, "grad_norm": 0.931239128112793, "learning_rate": 0.0002, "epoch": 3.615235635894125, "step": 11200}, {"loss": 0.5736, "grad_norm": 0.9107872843742371, "learning_rate": 0.0002, "epoch": 3.6184635248547448, "step": 11210}, {"loss": 0.5665, "grad_norm": 0.7624770998954773, "learning_rate": 0.0002, "epoch": 3.621691413815365, "step": 11220}, {"loss": 0.6033, "grad_norm": 0.8129580616950989, "learning_rate": 0.0002, "epoch": 3.6249193027759845, "step": 11230}, {"loss": 0.6192, "grad_norm": 0.7339836955070496, "learning_rate": 0.0002, "epoch": 3.628147191736604, "step": 11240}, {"loss": 0.5976, "grad_norm": 0.8901296854019165, "learning_rate": 0.0002, "epoch": 3.6313750806972243, "step": 11250}, {"loss": 0.5977, "grad_norm": 1.1374726295471191, "learning_rate": 0.0002, "epoch": 3.634602969657844, "step": 11260}, {"loss": 0.5859, "grad_norm": 0.7438275218009949, "learning_rate": 0.0002, "epoch": 3.6378308586184636, "step": 11270}, {"loss": 0.5757, "grad_norm": 0.808646559715271, "learning_rate": 0.0002, "epoch": 3.641058747579083, "step": 11280}, {"loss": 0.6244, "grad_norm": 1.091810941696167, "learning_rate": 0.0002, "epoch": 3.644286636539703, "step": 11290}, {"loss": 0.5957, "grad_norm": 0.8439257144927979, "learning_rate": 0.0002, "epoch": 3.6475145255003225, "step": 11300}, {"loss": 0.6115, "grad_norm": 0.9720633029937744, "learning_rate": 0.0002, "epoch": 3.6507424144609426, "step": 11310}, {"loss": 0.5942, "grad_norm": 0.738571047782898, "learning_rate": 0.0002, "epoch": 3.6539703034215623, "step": 11320}, {"loss": 0.6029, "grad_norm": 0.6961580514907837, "learning_rate": 0.0002, "epoch": 3.657198192382182, "step": 11330}, {"loss": 0.6226, "grad_norm": 0.8192131519317627, "learning_rate": 0.0002, "epoch": 3.660426081342802, "step": 11340}, {"loss": 0.6155, "grad_norm": 0.8367205858230591, "learning_rate": 0.0002, "epoch": 3.6636539703034217, "step": 11350}, {"loss": 0.586, "grad_norm": 0.7735666632652283, "learning_rate": 0.0002, "epoch": 3.6668818592640413, "step": 11360}, {"loss": 0.6113, "grad_norm": 0.6507132649421692, "learning_rate": 0.0002, "epoch": 3.670109748224661, "step": 11370}, {"loss": 0.6273, "grad_norm": 0.8271192312240601, "learning_rate": 0.0002, "epoch": 3.6733376371852806, "step": 11380}, {"loss": 0.5995, "grad_norm": 0.8724204301834106, "learning_rate": 0.0002, "epoch": 3.6765655261459007, "step": 11390}, {"loss": 0.6131, "grad_norm": 0.8448445200920105, "learning_rate": 0.0002, "epoch": 3.6797934151065204, "step": 11400}, {"loss": 0.5923, "grad_norm": 0.6756882071495056, "learning_rate": 0.0002, "epoch": 3.68302130406714, "step": 11410}, {"loss": 0.6443, "grad_norm": 0.7859625816345215, "learning_rate": 0.0002, "epoch": 3.68624919302776, "step": 11420}, {"loss": 0.6567, "grad_norm": 0.8929487466812134, "learning_rate": 0.0002, "epoch": 3.6894770819883798, "step": 11430}, {"loss": 0.6474, "grad_norm": 0.8163391351699829, "learning_rate": 0.0002, "epoch": 3.6927049709489994, "step": 11440}, {"loss": 0.6467, "grad_norm": 0.8948464393615723, "learning_rate": 0.0002, "epoch": 3.695932859909619, "step": 11450}, {"loss": 0.624, "grad_norm": 0.8654782176017761, "learning_rate": 0.0002, "epoch": 3.6991607488702387, "step": 11460}, {"loss": 0.6142, "grad_norm": 0.9514864683151245, "learning_rate": 0.0002, "epoch": 3.7023886378308584, "step": 11470}, {"loss": 0.606, "grad_norm": 0.7298579812049866, "learning_rate": 0.0002, "epoch": 3.7056165267914785, "step": 11480}, {"loss": 0.5853, "grad_norm": 0.9266309142112732, "learning_rate": 0.0002, "epoch": 3.708844415752098, "step": 11490}, {"loss": 0.6122, "grad_norm": 0.8608686923980713, "learning_rate": 0.0002, "epoch": 3.7120723047127178, "step": 11500}, {"loss": 0.6348, "grad_norm": 0.921788215637207, "learning_rate": 0.0002, "epoch": 3.715300193673338, "step": 11510}, {"loss": 0.6191, "grad_norm": 0.8537021279335022, "learning_rate": 0.0002, "epoch": 3.7185280826339575, "step": 11520}, {"loss": 0.6228, "grad_norm": 1.115194320678711, "learning_rate": 0.0002, "epoch": 3.721755971594577, "step": 11530}, {"loss": 0.5828, "grad_norm": 0.7614817023277283, "learning_rate": 0.0002, "epoch": 3.724983860555197, "step": 11540}, {"loss": 0.5776, "grad_norm": 0.871999204158783, "learning_rate": 0.0002, "epoch": 3.7282117495158165, "step": 11550}, {"loss": 0.5962, "grad_norm": 0.9668049812316895, "learning_rate": 0.0002, "epoch": 3.7314396384764366, "step": 11560}, {"loss": 0.5534, "grad_norm": 1.2185815572738647, "learning_rate": 0.0002, "epoch": 3.734667527437056, "step": 11570}, {"loss": 0.5936, "grad_norm": 0.8258453011512756, "learning_rate": 0.0002, "epoch": 3.737895416397676, "step": 11580}, {"loss": 0.5853, "grad_norm": 0.8708966374397278, "learning_rate": 0.0002, "epoch": 3.7411233053582955, "step": 11590}, {"loss": 0.5847, "grad_norm": 0.7784267663955688, "learning_rate": 0.0002, "epoch": 3.7443511943189156, "step": 11600}, {"loss": 0.6404, "grad_norm": 0.7504425048828125, "learning_rate": 0.0002, "epoch": 3.7475790832795353, "step": 11610}, {"loss": 0.5922, "grad_norm": 0.9144526124000549, "learning_rate": 0.0002, "epoch": 3.750806972240155, "step": 11620}, {"loss": 0.6425, "grad_norm": 0.922581672668457, "learning_rate": 0.0002, "epoch": 3.7540348612007746, "step": 11630}, {"loss": 0.6402, "grad_norm": 0.9348630905151367, "learning_rate": 0.0002, "epoch": 3.757262750161394, "step": 11640}, {"loss": 0.5852, "grad_norm": 1.0740231275558472, "learning_rate": 0.0002, "epoch": 3.7604906391220143, "step": 11650}, {"loss": 0.599, "grad_norm": 0.884830117225647, "learning_rate": 0.0002, "epoch": 3.763718528082634, "step": 11660}, {"loss": 0.5991, "grad_norm": 1.0256348848342896, "learning_rate": 0.0002, "epoch": 3.7669464170432536, "step": 11670}, {"loss": 0.626, "grad_norm": 0.6795592904090881, "learning_rate": 0.0002, "epoch": 3.7701743060038737, "step": 11680}, {"loss": 0.6241, "grad_norm": 0.9381206631660461, "learning_rate": 0.0002, "epoch": 3.7734021949644934, "step": 11690}, {"loss": 0.6054, "grad_norm": 0.7633092403411865, "learning_rate": 0.0002, "epoch": 3.776630083925113, "step": 11700}, {"loss": 0.5937, "grad_norm": 0.7506213188171387, "learning_rate": 0.0002, "epoch": 3.7798579728857327, "step": 11710}, {"loss": 0.5933, "grad_norm": 0.8182913064956665, "learning_rate": 0.0002, "epoch": 3.7830858618463523, "step": 11720}, {"loss": 0.6043, "grad_norm": 1.019322156906128, "learning_rate": 0.0002, "epoch": 3.786313750806972, "step": 11730}, {"loss": 0.633, "grad_norm": 0.8895221948623657, "learning_rate": 0.0002, "epoch": 3.789541639767592, "step": 11740}, {"loss": 0.6553, "grad_norm": 0.948847770690918, "learning_rate": 0.0002, "epoch": 3.7927695287282117, "step": 11750}, {"loss": 0.6265, "grad_norm": 0.9068999886512756, "learning_rate": 0.0002, "epoch": 3.7959974176888314, "step": 11760}, {"loss": 0.6163, "grad_norm": 0.7920539975166321, "learning_rate": 0.0002, "epoch": 3.7992253066494515, "step": 11770}, {"loss": 0.5964, "grad_norm": 0.8441922068595886, "learning_rate": 0.0002, "epoch": 3.802453195610071, "step": 11780}, {"loss": 0.6379, "grad_norm": 0.9258501529693604, "learning_rate": 0.0002, "epoch": 3.8056810845706908, "step": 11790}, {"loss": 0.6379, "grad_norm": 0.7354241609573364, "learning_rate": 0.0002, "epoch": 3.8089089735313104, "step": 11800}, {"loss": 0.6177, "grad_norm": 0.9494872689247131, "learning_rate": 0.0002, "epoch": 3.81213686249193, "step": 11810}, {"loss": 0.5931, "grad_norm": 0.8266556859016418, "learning_rate": 0.0002, "epoch": 3.81536475145255, "step": 11820}, {"loss": 0.641, "grad_norm": 0.7951219081878662, "learning_rate": 0.0002, "epoch": 3.81859264041317, "step": 11830}, {"loss": 0.5767, "grad_norm": 0.7688382267951965, "learning_rate": 0.0002, "epoch": 3.8218205293737895, "step": 11840}, {"loss": 0.6117, "grad_norm": 1.0917940139770508, "learning_rate": 0.0002, "epoch": 3.8250484183344096, "step": 11850}, {"loss": 0.5857, "grad_norm": 0.9880442023277283, "learning_rate": 0.0002, "epoch": 3.828276307295029, "step": 11860}, {"loss": 0.6579, "grad_norm": 0.8433151245117188, "learning_rate": 0.0002, "epoch": 3.831504196255649, "step": 11870}, {"loss": 0.5876, "grad_norm": 0.8691204786300659, "learning_rate": 0.0002, "epoch": 3.8347320852162685, "step": 11880}, {"loss": 0.6308, "grad_norm": 0.7698143124580383, "learning_rate": 0.0002, "epoch": 3.837959974176888, "step": 11890}, {"loss": 0.6531, "grad_norm": 0.8874883651733398, "learning_rate": 0.0002, "epoch": 3.841187863137508, "step": 11900}, {"loss": 0.6242, "grad_norm": 1.1209359169006348, "learning_rate": 0.0002, "epoch": 3.844415752098128, "step": 11910}, {"loss": 0.6415, "grad_norm": 0.7723544239997864, "learning_rate": 0.0002, "epoch": 3.8476436410587476, "step": 11920}, {"loss": 0.6091, "grad_norm": 0.8363937139511108, "learning_rate": 0.0002, "epoch": 3.850871530019367, "step": 11930}, {"loss": 0.6498, "grad_norm": 0.9209707975387573, "learning_rate": 0.0002, "epoch": 3.8540994189799873, "step": 11940}, {"loss": 0.6471, "grad_norm": 0.9456894993782043, "learning_rate": 0.0002, "epoch": 3.857327307940607, "step": 11950}, {"loss": 0.6432, "grad_norm": 1.5748413801193237, "learning_rate": 0.0002, "epoch": 3.8605551969012266, "step": 11960}, {"loss": 0.6197, "grad_norm": 0.9083569049835205, "learning_rate": 0.0002, "epoch": 3.8637830858618463, "step": 11970}, {"loss": 0.6593, "grad_norm": 0.7672823071479797, "learning_rate": 0.0002, "epoch": 3.867010974822466, "step": 11980}, {"loss": 0.6238, "grad_norm": 0.8647152185440063, "learning_rate": 0.0002, "epoch": 3.870238863783086, "step": 11990}, {"loss": 0.5755, "grad_norm": 0.9564255475997925, "learning_rate": 0.0002, "epoch": 3.8734667527437057, "step": 12000}, {"loss": 0.6321, "grad_norm": 0.773267924785614, "learning_rate": 0.0002, "epoch": 3.8766946417043253, "step": 12010}, {"loss": 0.6057, "grad_norm": 0.8030173182487488, "learning_rate": 0.0002, "epoch": 3.879922530664945, "step": 12020}, {"loss": 0.6194, "grad_norm": 0.8002150058746338, "learning_rate": 0.0002, "epoch": 3.883150419625565, "step": 12030}, {"loss": 0.6194, "grad_norm": 0.98802250623703, "learning_rate": 0.0002, "epoch": 3.8863783085861847, "step": 12040}, {"loss": 0.6026, "grad_norm": 0.7868124842643738, "learning_rate": 0.0002, "epoch": 3.8896061975468044, "step": 12050}, {"loss": 0.6303, "grad_norm": 0.932182788848877, "learning_rate": 0.0002, "epoch": 3.892834086507424, "step": 12060}, {"loss": 0.5863, "grad_norm": 0.8576806783676147, "learning_rate": 0.0002, "epoch": 3.8960619754680437, "step": 12070}, {"loss": 0.6079, "grad_norm": 0.8985713124275208, "learning_rate": 0.0002, "epoch": 3.8992898644286638, "step": 12080}, {"loss": 0.6449, "grad_norm": 0.7876521944999695, "learning_rate": 0.0002, "epoch": 3.9025177533892834, "step": 12090}, {"loss": 0.5655, "grad_norm": 0.773936927318573, "learning_rate": 0.0002, "epoch": 3.905745642349903, "step": 12100}, {"loss": 0.5765, "grad_norm": 0.7274761199951172, "learning_rate": 0.0002, "epoch": 3.908973531310523, "step": 12110}, {"loss": 0.6182, "grad_norm": 0.8625598549842834, "learning_rate": 0.0002, "epoch": 3.912201420271143, "step": 12120}, {"loss": 0.5855, "grad_norm": 0.8702362179756165, "learning_rate": 0.0002, "epoch": 3.9154293092317625, "step": 12130}, {"loss": 0.6493, "grad_norm": 0.912579357624054, "learning_rate": 0.0002, "epoch": 3.918657198192382, "step": 12140}, {"loss": 0.6341, "grad_norm": 0.8697066903114319, "learning_rate": 0.0002, "epoch": 3.9218850871530018, "step": 12150}, {"loss": 0.6037, "grad_norm": 1.005232572555542, "learning_rate": 0.0002, "epoch": 3.9251129761136214, "step": 12160}, {"loss": 0.621, "grad_norm": 0.793902575969696, "learning_rate": 0.0002, "epoch": 3.9283408650742415, "step": 12170}, {"loss": 0.599, "grad_norm": 0.7025905847549438, "learning_rate": 0.0002, "epoch": 3.931568754034861, "step": 12180}, {"loss": 0.6421, "grad_norm": 0.97635817527771, "learning_rate": 0.0002, "epoch": 3.934796642995481, "step": 12190}, {"loss": 0.6416, "grad_norm": 0.855417013168335, "learning_rate": 0.0002, "epoch": 3.938024531956101, "step": 12200}, {"loss": 0.5979, "grad_norm": 0.8841291666030884, "learning_rate": 0.0002, "epoch": 3.9412524209167206, "step": 12210}, {"loss": 0.5666, "grad_norm": 1.1762064695358276, "learning_rate": 0.0002, "epoch": 3.94448030987734, "step": 12220}, {"loss": 0.586, "grad_norm": 0.8393193483352661, "learning_rate": 0.0002, "epoch": 3.94770819883796, "step": 12230}, {"loss": 0.5738, "grad_norm": 0.9324905276298523, "learning_rate": 0.0002, "epoch": 3.9509360877985795, "step": 12240}, {"loss": 0.5954, "grad_norm": 0.8607982993125916, "learning_rate": 0.0002, "epoch": 3.9541639767591996, "step": 12250}, {"loss": 0.6277, "grad_norm": 0.8586681485176086, "learning_rate": 0.0002, "epoch": 3.9573918657198193, "step": 12260}, {"loss": 0.5841, "grad_norm": 1.1082909107208252, "learning_rate": 0.0002, "epoch": 3.960619754680439, "step": 12270}, {"loss": 0.6231, "grad_norm": 1.065027117729187, "learning_rate": 0.0002, "epoch": 3.963847643641059, "step": 12280}, {"loss": 0.5996, "grad_norm": 0.9544363021850586, "learning_rate": 0.0002, "epoch": 3.9670755326016787, "step": 12290}, {"loss": 0.6301, "grad_norm": 0.9008927345275879, "learning_rate": 0.0002, "epoch": 3.9703034215622983, "step": 12300}, {"loss": 0.6108, "grad_norm": 0.8717467188835144, "learning_rate": 0.0002, "epoch": 3.973531310522918, "step": 12310}, {"loss": 0.6465, "grad_norm": 0.9718339443206787, "learning_rate": 0.0002, "epoch": 3.9767591994835376, "step": 12320}, {"loss": 0.603, "grad_norm": 1.0362015962600708, "learning_rate": 0.0002, "epoch": 3.9799870884441573, "step": 12330}, {"loss": 0.6229, "grad_norm": 1.0844318866729736, "learning_rate": 0.0002, "epoch": 3.9832149774047774, "step": 12340}, {"loss": 0.6777, "grad_norm": 0.7506240606307983, "learning_rate": 0.0002, "epoch": 3.986442866365397, "step": 12350}, {"loss": 0.6076, "grad_norm": 1.005982756614685, "learning_rate": 0.0002, "epoch": 3.9896707553260167, "step": 12360}, {"loss": 0.5926, "grad_norm": 0.7566431164741516, "learning_rate": 0.0002, "epoch": 3.9928986442866368, "step": 12370}, {"loss": 0.653, "grad_norm": 0.8819181323051453, "learning_rate": 0.0002, "epoch": 3.9961265332472564, "step": 12380}, {"loss": 0.6197, "grad_norm": 0.884497880935669, "learning_rate": 0.0002, "epoch": 3.999354422207876, "step": 12390}, {"eval_loss": 1.1907150745391846, "eval_runtime": 161.5766, "eval_samples_per_second": 4.537, "eval_steps_per_second": 0.569, "epoch": 4.0, "step": 12392}, {"loss": 0.5203, "grad_norm": 1.0407241582870483, "learning_rate": 0.0002, "epoch": 4.002582311168496, "step": 12400}, {"loss": 0.4978, "grad_norm": 1.0199295282363892, "learning_rate": 0.0002, "epoch": 4.005810200129115, "step": 12410}, {"loss": 0.4985, "grad_norm": 0.8456302881240845, "learning_rate": 0.0002, "epoch": 4.009038089089735, "step": 12420}, {"loss": 0.4669, "grad_norm": 1.0621124505996704, "learning_rate": 0.0002, "epoch": 4.012265978050355, "step": 12430}, {"loss": 0.5277, "grad_norm": 0.8984712362289429, "learning_rate": 0.0002, "epoch": 4.015493867010975, "step": 12440}, {"loss": 0.5508, "grad_norm": 1.3785864114761353, "learning_rate": 0.0002, "epoch": 4.018721755971595, "step": 12450}, {"loss": 0.5244, "grad_norm": 0.7911781668663025, "learning_rate": 0.0002, "epoch": 4.0219496449322145, "step": 12460}, {"loss": 0.4746, "grad_norm": 1.0977907180786133, "learning_rate": 0.0002, "epoch": 4.025177533892834, "step": 12470}, {"loss": 0.4632, "grad_norm": 1.0664983987808228, "learning_rate": 0.0002, "epoch": 4.028405422853454, "step": 12480}, {"loss": 0.5151, "grad_norm": 1.0807124376296997, "learning_rate": 0.0002, "epoch": 4.0316333118140735, "step": 12490}, {"loss": 0.4712, "grad_norm": 1.2650192975997925, "learning_rate": 0.0002, "epoch": 4.034861200774693, "step": 12500}, {"loss": 0.5111, "grad_norm": 0.7164070010185242, "learning_rate": 0.0002, "epoch": 4.038089089735313, "step": 12510}, {"loss": 0.5015, "grad_norm": 1.0047489404678345, "learning_rate": 0.0002, "epoch": 4.041316978695932, "step": 12520}, {"loss": 0.5467, "grad_norm": 0.9303901791572571, "learning_rate": 0.0002, "epoch": 4.044544867656553, "step": 12530}, {"loss": 0.5165, "grad_norm": 1.0319702625274658, "learning_rate": 0.0002, "epoch": 4.047772756617173, "step": 12540}, {"loss": 0.4834, "grad_norm": 0.9549729228019714, "learning_rate": 0.0002, "epoch": 4.051000645577792, "step": 12550}, {"loss": 0.5235, "grad_norm": 0.7175564765930176, "learning_rate": 0.0002, "epoch": 4.054228534538412, "step": 12560}, {"loss": 0.5257, "grad_norm": 1.0622259378433228, "learning_rate": 0.0002, "epoch": 4.057456423499032, "step": 12570}, {"loss": 0.5098, "grad_norm": 1.172074556350708, "learning_rate": 0.0002, "epoch": 4.060684312459651, "step": 12580}, {"loss": 0.5112, "grad_norm": 0.9702366590499878, "learning_rate": 0.0002, "epoch": 4.063912201420271, "step": 12590}, {"loss": 0.5042, "grad_norm": 0.741511344909668, "learning_rate": 0.0002, "epoch": 4.0671400903808905, "step": 12600}, {"loss": 0.4996, "grad_norm": 0.8632621169090271, "learning_rate": 0.0002, "epoch": 4.070367979341511, "step": 12610}, {"loss": 0.4927, "grad_norm": 0.9695962071418762, "learning_rate": 0.0002, "epoch": 4.073595868302131, "step": 12620}, {"loss": 0.4618, "grad_norm": 0.9401052594184875, "learning_rate": 0.0002, "epoch": 4.07682375726275, "step": 12630}, {"loss": 0.4889, "grad_norm": 0.8068707585334778, "learning_rate": 0.0002, "epoch": 4.08005164622337, "step": 12640}, {"loss": 0.5046, "grad_norm": 0.9554762840270996, "learning_rate": 0.0002, "epoch": 4.08327953518399, "step": 12650}, {"loss": 0.5081, "grad_norm": 0.7637128233909607, "learning_rate": 0.0002, "epoch": 4.086507424144609, "step": 12660}, {"loss": 0.4997, "grad_norm": 0.6703744530677795, "learning_rate": 0.0002, "epoch": 4.089735313105229, "step": 12670}, {"loss": 0.4977, "grad_norm": 0.8623828887939453, "learning_rate": 0.0002, "epoch": 4.092963202065849, "step": 12680}, {"loss": 0.4616, "grad_norm": 0.8198223114013672, "learning_rate": 0.0002, "epoch": 4.096191091026468, "step": 12690}, {"loss": 0.5372, "grad_norm": 1.3449875116348267, "learning_rate": 0.0002, "epoch": 4.099418979987089, "step": 12700}, {"loss": 0.4782, "grad_norm": 0.8333606123924255, "learning_rate": 0.0002, "epoch": 4.1026468689477085, "step": 12710}, {"loss": 0.5135, "grad_norm": 1.1647733449935913, "learning_rate": 0.0002, "epoch": 4.105874757908328, "step": 12720}, {"loss": 0.5147, "grad_norm": 1.0560213327407837, "learning_rate": 0.0002, "epoch": 4.109102646868948, "step": 12730}, {"loss": 0.5244, "grad_norm": 0.9479449987411499, "learning_rate": 0.0002, "epoch": 4.112330535829567, "step": 12740}, {"loss": 0.4596, "grad_norm": 1.1634587049484253, "learning_rate": 0.0002, "epoch": 4.115558424790187, "step": 12750}, {"loss": 0.4966, "grad_norm": 0.813987672328949, "learning_rate": 0.0002, "epoch": 4.118786313750807, "step": 12760}, {"loss": 0.5133, "grad_norm": 0.968461275100708, "learning_rate": 0.0002, "epoch": 4.122014202711426, "step": 12770}, {"loss": 0.5113, "grad_norm": 0.9324830770492554, "learning_rate": 0.0002, "epoch": 4.125242091672046, "step": 12780}, {"loss": 0.5233, "grad_norm": 0.8313411474227905, "learning_rate": 0.0002, "epoch": 4.128469980632667, "step": 12790}, {"loss": 0.5169, "grad_norm": 1.0177634954452515, "learning_rate": 0.0002, "epoch": 4.131697869593286, "step": 12800}, {"loss": 0.4635, "grad_norm": 1.0890623331069946, "learning_rate": 0.0002, "epoch": 4.134925758553906, "step": 12810}, {"loss": 0.519, "grad_norm": 0.9131693840026855, "learning_rate": 0.0002, "epoch": 4.1381536475145255, "step": 12820}, {"loss": 0.5017, "grad_norm": 0.8400680422782898, "learning_rate": 0.0002, "epoch": 4.141381536475145, "step": 12830}, {"loss": 0.5195, "grad_norm": 0.8988795876502991, "learning_rate": 0.0002, "epoch": 4.144609425435765, "step": 12840}, {"loss": 0.5052, "grad_norm": 0.9224025011062622, "learning_rate": 0.0002, "epoch": 4.1478373143963845, "step": 12850}, {"loss": 0.5001, "grad_norm": 0.7453159689903259, "learning_rate": 0.0002, "epoch": 4.151065203357004, "step": 12860}, {"loss": 0.4874, "grad_norm": 0.9815868139266968, "learning_rate": 0.0002, "epoch": 4.154293092317625, "step": 12870}, {"loss": 0.5485, "grad_norm": 1.2542768716812134, "learning_rate": 0.0002, "epoch": 4.157520981278244, "step": 12880}, {"loss": 0.5287, "grad_norm": 1.0092132091522217, "learning_rate": 0.0002, "epoch": 4.160748870238864, "step": 12890}, {"loss": 0.5125, "grad_norm": 1.1836622953414917, "learning_rate": 0.0002, "epoch": 4.163976759199484, "step": 12900}, {"loss": 0.5089, "grad_norm": 0.7706810235977173, "learning_rate": 0.0002, "epoch": 4.167204648160103, "step": 12910}, {"loss": 0.5123, "grad_norm": 1.00058913230896, "learning_rate": 0.0002, "epoch": 4.170432537120723, "step": 12920}, {"loss": 0.5238, "grad_norm": 1.2326250076293945, "learning_rate": 0.0002, "epoch": 4.173660426081343, "step": 12930}, {"loss": 0.5405, "grad_norm": 0.8829123377799988, "learning_rate": 0.0002, "epoch": 4.176888315041962, "step": 12940}, {"loss": 0.517, "grad_norm": 0.936042845249176, "learning_rate": 0.0002, "epoch": 4.180116204002582, "step": 12950}, {"loss": 0.4991, "grad_norm": 0.9773517847061157, "learning_rate": 0.0002, "epoch": 4.183344092963202, "step": 12960}, {"loss": 0.5025, "grad_norm": 0.9786297678947449, "learning_rate": 0.0002, "epoch": 4.186571981923822, "step": 12970}, {"loss": 0.5276, "grad_norm": 0.7524558901786804, "learning_rate": 0.0002, "epoch": 4.189799870884442, "step": 12980}, {"loss": 0.5522, "grad_norm": 1.0107866525650024, "learning_rate": 0.0002, "epoch": 4.193027759845061, "step": 12990}, {"loss": 0.5304, "grad_norm": 1.0092947483062744, "learning_rate": 0.0002, "epoch": 4.196255648805681, "step": 13000}, {"loss": 0.5061, "grad_norm": 1.18181312084198, "learning_rate": 0.0002, "epoch": 4.199483537766301, "step": 13010}, {"loss": 0.512, "grad_norm": 0.8845750093460083, "learning_rate": 0.0002, "epoch": 4.20271142672692, "step": 13020}, {"loss": 0.5329, "grad_norm": 1.0789145231246948, "learning_rate": 0.0002, "epoch": 4.20593931568754, "step": 13030}, {"loss": 0.5001, "grad_norm": 0.9562082886695862, "learning_rate": 0.0002, "epoch": 4.2091672046481605, "step": 13040}, {"loss": 0.5211, "grad_norm": 0.875755786895752, "learning_rate": 0.0002, "epoch": 4.21239509360878, "step": 13050}, {"loss": 0.5162, "grad_norm": 1.0694596767425537, "learning_rate": 0.0002, "epoch": 4.2156229825694, "step": 13060}, {"loss": 0.4917, "grad_norm": 1.0053378343582153, "learning_rate": 0.0002, "epoch": 4.2188508715300195, "step": 13070}, {"loss": 0.542, "grad_norm": 1.1628689765930176, "learning_rate": 0.0002, "epoch": 4.222078760490639, "step": 13080}, {"loss": 0.4796, "grad_norm": 0.9455991983413696, "learning_rate": 0.0002, "epoch": 4.225306649451259, "step": 13090}, {"loss": 0.4802, "grad_norm": 0.9736765623092651, "learning_rate": 0.0002, "epoch": 4.228534538411878, "step": 13100}, {"loss": 0.5411, "grad_norm": 0.8653560876846313, "learning_rate": 0.0002, "epoch": 4.231762427372498, "step": 13110}, {"loss": 0.5347, "grad_norm": 0.9335988163948059, "learning_rate": 0.0002, "epoch": 4.234990316333118, "step": 13120}, {"loss": 0.5217, "grad_norm": 0.9102661609649658, "learning_rate": 0.0002, "epoch": 4.238218205293738, "step": 13130}, {"loss": 0.5531, "grad_norm": 1.0595461130142212, "learning_rate": 0.0002, "epoch": 4.241446094254358, "step": 13140}, {"loss": 0.517, "grad_norm": 0.8947662711143494, "learning_rate": 0.0002, "epoch": 4.244673983214978, "step": 13150}, {"loss": 0.5116, "grad_norm": 1.0835723876953125, "learning_rate": 0.0002, "epoch": 4.247901872175597, "step": 13160}, {"loss": 0.5212, "grad_norm": 0.8496462106704712, "learning_rate": 0.0002, "epoch": 4.251129761136217, "step": 13170}, {"loss": 0.5079, "grad_norm": 0.9395631551742554, "learning_rate": 0.0002, "epoch": 4.2543576500968365, "step": 13180}, {"loss": 0.5076, "grad_norm": 1.2939592599868774, "learning_rate": 0.0002, "epoch": 4.257585539057456, "step": 13190}, {"loss": 0.5209, "grad_norm": 0.9325923919677734, "learning_rate": 0.0002, "epoch": 4.260813428018076, "step": 13200}, {"loss": 0.4984, "grad_norm": 0.9220664501190186, "learning_rate": 0.0002, "epoch": 4.264041316978696, "step": 13210}, {"loss": 0.5553, "grad_norm": 0.9505137205123901, "learning_rate": 0.0002, "epoch": 4.267269205939316, "step": 13220}, {"loss": 0.5238, "grad_norm": 1.0713751316070557, "learning_rate": 0.0002, "epoch": 4.270497094899936, "step": 13230}, {"loss": 0.5478, "grad_norm": 0.8390375971794128, "learning_rate": 0.0002, "epoch": 4.273724983860555, "step": 13240}, {"loss": 0.5217, "grad_norm": 0.8943426012992859, "learning_rate": 0.0002, "epoch": 4.276952872821175, "step": 13250}, {"loss": 0.5486, "grad_norm": 0.9175868630409241, "learning_rate": 0.0002, "epoch": 4.280180761781795, "step": 13260}, {"loss": 0.5208, "grad_norm": 0.9969881176948547, "learning_rate": 0.0002, "epoch": 4.283408650742414, "step": 13270}, {"loss": 0.5376, "grad_norm": 1.2271877527236938, "learning_rate": 0.0002, "epoch": 4.286636539703034, "step": 13280}, {"loss": 0.4811, "grad_norm": 0.9463263154029846, "learning_rate": 0.0002, "epoch": 4.289864428663654, "step": 13290}, {"loss": 0.52, "grad_norm": 1.0306228399276733, "learning_rate": 0.0002, "epoch": 4.293092317624274, "step": 13300}, {"loss": 0.5092, "grad_norm": 0.8454763889312744, "learning_rate": 0.0002, "epoch": 4.296320206584894, "step": 13310}, {"loss": 0.5657, "grad_norm": 0.9843119978904724, "learning_rate": 0.0002, "epoch": 4.299548095545513, "step": 13320}, {"loss": 0.5407, "grad_norm": 1.0836851596832275, "learning_rate": 0.0002, "epoch": 4.302775984506133, "step": 13330}, {"loss": 0.5336, "grad_norm": 1.0719412565231323, "learning_rate": 0.0002, "epoch": 4.306003873466753, "step": 13340}, {"loss": 0.4798, "grad_norm": 0.9276487827301025, "learning_rate": 0.0002, "epoch": 4.309231762427372, "step": 13350}, {"loss": 0.5256, "grad_norm": 0.897072434425354, "learning_rate": 0.0002, "epoch": 4.312459651387992, "step": 13360}, {"loss": 0.5333, "grad_norm": 1.0493228435516357, "learning_rate": 0.0002, "epoch": 4.315687540348612, "step": 13370}, {"loss": 0.5218, "grad_norm": 0.9446353316307068, "learning_rate": 0.0002, "epoch": 4.318915429309232, "step": 13380}, {"loss": 0.4765, "grad_norm": 0.7765224575996399, "learning_rate": 0.0002, "epoch": 4.322143318269852, "step": 13390}, {"loss": 0.5907, "grad_norm": 0.9100048542022705, "learning_rate": 0.0002, "epoch": 4.3253712072304715, "step": 13400}, {"loss": 0.5393, "grad_norm": 1.0913089513778687, "learning_rate": 0.0002, "epoch": 4.328599096191091, "step": 13410}, {"loss": 0.494, "grad_norm": 0.9607733488082886, "learning_rate": 0.0002, "epoch": 4.331826985151711, "step": 13420}, {"loss": 0.5273, "grad_norm": 0.8774219155311584, "learning_rate": 0.0002, "epoch": 4.3350548741123305, "step": 13430}, {"loss": 0.5482, "grad_norm": 0.8366804122924805, "learning_rate": 0.0002, "epoch": 4.33828276307295, "step": 13440}, {"loss": 0.5487, "grad_norm": 1.034727931022644, "learning_rate": 0.0002, "epoch": 4.34151065203357, "step": 13450}, {"loss": 0.4995, "grad_norm": 0.942743182182312, "learning_rate": 0.0002, "epoch": 4.344738540994189, "step": 13460}, {"loss": 0.5222, "grad_norm": 0.7237029075622559, "learning_rate": 0.0002, "epoch": 4.347966429954809, "step": 13470}, {"loss": 0.5461, "grad_norm": 0.8216196894645691, "learning_rate": 0.0002, "epoch": 4.35119431891543, "step": 13480}, {"loss": 0.5104, "grad_norm": 1.031860113143921, "learning_rate": 0.0002, "epoch": 4.354422207876049, "step": 13490}, {"loss": 0.547, "grad_norm": 0.8880493640899658, "learning_rate": 0.0002, "epoch": 4.357650096836669, "step": 13500}, {"loss": 0.5259, "grad_norm": 0.8442490696907043, "learning_rate": 0.0002, "epoch": 4.360877985797289, "step": 13510}, {"loss": 0.5176, "grad_norm": 1.270971655845642, "learning_rate": 0.0002, "epoch": 4.364105874757908, "step": 13520}, {"loss": 0.5028, "grad_norm": 0.9657870531082153, "learning_rate": 0.0002, "epoch": 4.367333763718528, "step": 13530}, {"loss": 0.5136, "grad_norm": 0.7477133870124817, "learning_rate": 0.0002, "epoch": 4.3705616526791475, "step": 13540}, {"loss": 0.5483, "grad_norm": 1.0209243297576904, "learning_rate": 0.0002, "epoch": 4.373789541639767, "step": 13550}, {"loss": 0.4888, "grad_norm": 0.8714015483856201, "learning_rate": 0.0002, "epoch": 4.377017430600388, "step": 13560}, {"loss": 0.5428, "grad_norm": 1.0490189790725708, "learning_rate": 0.0002, "epoch": 4.380245319561007, "step": 13570}, {"loss": 0.5398, "grad_norm": 0.9454663991928101, "learning_rate": 0.0002, "epoch": 4.383473208521627, "step": 13580}, {"loss": 0.5072, "grad_norm": 1.154146432876587, "learning_rate": 0.0002, "epoch": 4.386701097482247, "step": 13590}, {"loss": 0.5096, "grad_norm": 1.155090570449829, "learning_rate": 0.0002, "epoch": 4.389928986442866, "step": 13600}, {"loss": 0.5679, "grad_norm": 0.9853842854499817, "learning_rate": 0.0002, "epoch": 4.393156875403486, "step": 13610}, {"loss": 0.4992, "grad_norm": 0.9265837669372559, "learning_rate": 0.0002, "epoch": 4.396384764364106, "step": 13620}, {"loss": 0.523, "grad_norm": 0.8367540240287781, "learning_rate": 0.0002, "epoch": 4.399612653324725, "step": 13630}, {"loss": 0.564, "grad_norm": 1.1453629732131958, "learning_rate": 0.0002, "epoch": 4.402840542285345, "step": 13640}, {"loss": 0.573, "grad_norm": 1.0856295824050903, "learning_rate": 0.0002, "epoch": 4.4060684312459655, "step": 13650}, {"loss": 0.5178, "grad_norm": 0.9284523129463196, "learning_rate": 0.0002, "epoch": 4.409296320206585, "step": 13660}, {"loss": 0.4862, "grad_norm": 0.9632299542427063, "learning_rate": 0.0002, "epoch": 4.412524209167205, "step": 13670}, {"loss": 0.5928, "grad_norm": 1.048524260520935, "learning_rate": 0.0002, "epoch": 4.415752098127824, "step": 13680}, {"loss": 0.5258, "grad_norm": 0.9787682294845581, "learning_rate": 0.0002, "epoch": 4.418979987088444, "step": 13690}, {"loss": 0.5513, "grad_norm": 1.0728684663772583, "learning_rate": 0.0002, "epoch": 4.422207876049064, "step": 13700}, {"loss": 0.5243, "grad_norm": 0.72867351770401, "learning_rate": 0.0002, "epoch": 4.425435765009683, "step": 13710}, {"loss": 0.5313, "grad_norm": 0.8932793736457825, "learning_rate": 0.0002, "epoch": 4.428663653970303, "step": 13720}, {"loss": 0.5156, "grad_norm": 1.098343849182129, "learning_rate": 0.0002, "epoch": 4.431891542930924, "step": 13730}, {"loss": 0.5342, "grad_norm": 0.9321235418319702, "learning_rate": 0.0002, "epoch": 4.435119431891543, "step": 13740}, {"loss": 0.5114, "grad_norm": 0.8868634104728699, "learning_rate": 0.0002, "epoch": 4.438347320852163, "step": 13750}, {"loss": 0.5284, "grad_norm": 1.200064778327942, "learning_rate": 0.0002, "epoch": 4.4415752098127825, "step": 13760}, {"loss": 0.5208, "grad_norm": 0.8968019485473633, "learning_rate": 0.0002, "epoch": 4.444803098773402, "step": 13770}, {"loss": 0.4979, "grad_norm": 0.9560935497283936, "learning_rate": 0.0002, "epoch": 4.448030987734022, "step": 13780}, {"loss": 0.5134, "grad_norm": 0.7985701560974121, "learning_rate": 0.0002, "epoch": 4.4512588766946415, "step": 13790}, {"loss": 0.5113, "grad_norm": 1.062540888786316, "learning_rate": 0.0002, "epoch": 4.454486765655261, "step": 13800}, {"loss": 0.525, "grad_norm": 1.0827109813690186, "learning_rate": 0.0002, "epoch": 4.457714654615881, "step": 13810}, {"loss": 0.5541, "grad_norm": 1.0853543281555176, "learning_rate": 0.0002, "epoch": 4.460942543576501, "step": 13820}, {"loss": 0.5381, "grad_norm": 1.0613641738891602, "learning_rate": 0.0002, "epoch": 4.464170432537121, "step": 13830}, {"loss": 0.5684, "grad_norm": 0.9037535190582275, "learning_rate": 0.0002, "epoch": 4.467398321497741, "step": 13840}, {"loss": 0.5112, "grad_norm": 0.9216223955154419, "learning_rate": 0.0002, "epoch": 4.47062621045836, "step": 13850}, {"loss": 0.5341, "grad_norm": 0.8952260613441467, "learning_rate": 0.0002, "epoch": 4.47385409941898, "step": 13860}, {"loss": 0.5026, "grad_norm": 0.9997953176498413, "learning_rate": 0.0002, "epoch": 4.4770819883796, "step": 13870}, {"loss": 0.5107, "grad_norm": 1.062458872795105, "learning_rate": 0.0002, "epoch": 4.480309877340219, "step": 13880}, {"loss": 0.5463, "grad_norm": 0.9185126423835754, "learning_rate": 0.0002, "epoch": 4.483537766300839, "step": 13890}, {"loss": 0.5181, "grad_norm": 1.2389954328536987, "learning_rate": 0.0002, "epoch": 4.486765655261459, "step": 13900}, {"loss": 0.5199, "grad_norm": 1.1632126569747925, "learning_rate": 0.0002, "epoch": 4.489993544222079, "step": 13910}, {"loss": 0.5128, "grad_norm": 1.0304487943649292, "learning_rate": 0.0002, "epoch": 4.493221433182699, "step": 13920}, {"loss": 0.5331, "grad_norm": 0.9144788384437561, "learning_rate": 0.0002, "epoch": 4.496449322143318, "step": 13930}, {"loss": 0.5312, "grad_norm": 1.0285682678222656, "learning_rate": 0.0002, "epoch": 4.499677211103938, "step": 13940}, {"loss": 0.554, "grad_norm": 1.1187206506729126, "learning_rate": 0.0002, "epoch": 4.502905100064558, "step": 13950}, {"loss": 0.5268, "grad_norm": 0.7917197942733765, "learning_rate": 0.0002, "epoch": 4.506132989025177, "step": 13960}, {"loss": 0.5227, "grad_norm": 0.8495619297027588, "learning_rate": 0.0002, "epoch": 4.509360877985797, "step": 13970}, {"loss": 0.4971, "grad_norm": 1.0450760126113892, "learning_rate": 0.0002, "epoch": 4.512588766946417, "step": 13980}, {"loss": 0.5402, "grad_norm": 1.0061010122299194, "learning_rate": 0.0002, "epoch": 4.515816655907037, "step": 13990}, {"loss": 0.527, "grad_norm": 1.0232428312301636, "learning_rate": 0.0002, "epoch": 4.519044544867657, "step": 14000}, {"loss": 0.5002, "grad_norm": 0.8734631538391113, "learning_rate": 0.0002, "epoch": 4.5222724338282765, "step": 14010}, {"loss": 0.5464, "grad_norm": 1.1085621118545532, "learning_rate": 0.0002, "epoch": 4.525500322788896, "step": 14020}, {"loss": 0.5167, "grad_norm": 0.9178624749183655, "learning_rate": 0.0002, "epoch": 4.528728211749516, "step": 14030}, {"loss": 0.5589, "grad_norm": 1.0687317848205566, "learning_rate": 0.0002, "epoch": 4.531956100710135, "step": 14040}, {"loss": 0.5576, "grad_norm": 0.9237300157546997, "learning_rate": 0.0002, "epoch": 4.535183989670755, "step": 14050}, {"loss": 0.5062, "grad_norm": 0.9667123556137085, "learning_rate": 0.0002, "epoch": 4.538411878631375, "step": 14060}, {"loss": 0.5645, "grad_norm": 1.1286747455596924, "learning_rate": 0.0002, "epoch": 4.541639767591995, "step": 14070}, {"loss": 0.5226, "grad_norm": 1.055392861366272, "learning_rate": 0.0002, "epoch": 4.544867656552615, "step": 14080}, {"loss": 0.5428, "grad_norm": 0.9492936134338379, "learning_rate": 0.0002, "epoch": 4.548095545513235, "step": 14090}, {"loss": 0.5559, "grad_norm": 0.9881349802017212, "learning_rate": 0.0002, "epoch": 4.551323434473854, "step": 14100}, {"loss": 0.5572, "grad_norm": 0.9389023184776306, "learning_rate": 0.0002, "epoch": 4.554551323434474, "step": 14110}, {"loss": 0.5511, "grad_norm": 0.8395606875419617, "learning_rate": 0.0002, "epoch": 4.5577792123950935, "step": 14120}, {"loss": 0.5696, "grad_norm": 0.9019067287445068, "learning_rate": 0.0002, "epoch": 4.561007101355713, "step": 14130}, {"loss": 0.5564, "grad_norm": 1.1058136224746704, "learning_rate": 0.0002, "epoch": 4.564234990316333, "step": 14140}, {"loss": 0.5323, "grad_norm": 1.0683821439743042, "learning_rate": 0.0002, "epoch": 4.5674628792769525, "step": 14150}, {"loss": 0.5527, "grad_norm": 1.3398395776748657, "learning_rate": 0.0002, "epoch": 4.570690768237572, "step": 14160}, {"loss": 0.4713, "grad_norm": 0.7829096913337708, "learning_rate": 0.0002, "epoch": 4.573918657198193, "step": 14170}, {"loss": 0.525, "grad_norm": 0.9636675119400024, "learning_rate": 0.0002, "epoch": 4.577146546158812, "step": 14180}, {"loss": 0.5458, "grad_norm": 1.0291401147842407, "learning_rate": 0.0002, "epoch": 4.580374435119432, "step": 14190}, {"loss": 0.5366, "grad_norm": 1.0894310474395752, "learning_rate": 0.0002, "epoch": 4.583602324080052, "step": 14200}, {"loss": 0.5125, "grad_norm": 1.111573576927185, "learning_rate": 0.0002, "epoch": 4.586830213040671, "step": 14210}, {"loss": 0.5444, "grad_norm": 0.9345336556434631, "learning_rate": 0.0002, "epoch": 4.590058102001291, "step": 14220}, {"loss": 0.5175, "grad_norm": 1.3338757753372192, "learning_rate": 0.0002, "epoch": 4.593285990961911, "step": 14230}, {"loss": 0.5227, "grad_norm": 1.1146448850631714, "learning_rate": 0.0002, "epoch": 4.596513879922531, "step": 14240}, {"loss": 0.543, "grad_norm": 1.1576755046844482, "learning_rate": 0.0002, "epoch": 4.599741768883151, "step": 14250}, {"loss": 0.5315, "grad_norm": 0.6851092576980591, "learning_rate": 0.0002, "epoch": 4.60296965784377, "step": 14260}, {"loss": 0.5027, "grad_norm": 0.9067938923835754, "learning_rate": 0.0002, "epoch": 4.60619754680439, "step": 14270}, {"loss": 0.5237, "grad_norm": 0.8767340183258057, "learning_rate": 0.0002, "epoch": 4.60942543576501, "step": 14280}, {"loss": 0.5294, "grad_norm": 1.024880290031433, "learning_rate": 0.0002, "epoch": 4.612653324725629, "step": 14290}, {"loss": 0.5371, "grad_norm": 0.9226394891738892, "learning_rate": 0.0002, "epoch": 4.615881213686249, "step": 14300}, {"loss": 0.5281, "grad_norm": 1.018187165260315, "learning_rate": 0.0002, "epoch": 4.619109102646869, "step": 14310}, {"loss": 0.5546, "grad_norm": 0.8851249814033508, "learning_rate": 0.0002, "epoch": 4.622336991607488, "step": 14320}, {"loss": 0.5206, "grad_norm": 0.745798647403717, "learning_rate": 0.0002, "epoch": 4.625564880568108, "step": 14330}, {"loss": 0.5531, "grad_norm": 1.2082698345184326, "learning_rate": 0.0002, "epoch": 4.6287927695287285, "step": 14340}, {"loss": 0.5449, "grad_norm": 0.901454508304596, "learning_rate": 0.0002, "epoch": 4.632020658489348, "step": 14350}, {"loss": 0.5433, "grad_norm": 0.9593124985694885, "learning_rate": 0.0002, "epoch": 4.635248547449968, "step": 14360}, {"loss": 0.4939, "grad_norm": 1.1241410970687866, "learning_rate": 0.0002, "epoch": 4.6384764364105875, "step": 14370}, {"loss": 0.5319, "grad_norm": 0.9221102595329285, "learning_rate": 0.0002, "epoch": 4.641704325371207, "step": 14380}, {"loss": 0.524, "grad_norm": 1.0035039186477661, "learning_rate": 0.0002, "epoch": 4.644932214331827, "step": 14390}, {"loss": 0.5617, "grad_norm": 1.1270662546157837, "learning_rate": 0.0002, "epoch": 4.648160103292446, "step": 14400}, {"loss": 0.5663, "grad_norm": 0.8631120324134827, "learning_rate": 0.0002, "epoch": 4.651387992253067, "step": 14410}, {"loss": 0.5705, "grad_norm": 1.0604606866836548, "learning_rate": 0.0002, "epoch": 4.654615881213687, "step": 14420}, {"loss": 0.5307, "grad_norm": 0.8002706170082092, "learning_rate": 0.0002, "epoch": 4.657843770174306, "step": 14430}, {"loss": 0.5459, "grad_norm": 1.0642075538635254, "learning_rate": 0.0002, "epoch": 4.661071659134926, "step": 14440}, {"loss": 0.5497, "grad_norm": 0.9315671324729919, "learning_rate": 0.0002, "epoch": 4.664299548095546, "step": 14450}, {"loss": 0.5542, "grad_norm": 0.8311864137649536, "learning_rate": 0.0002, "epoch": 4.667527437056165, "step": 14460}, {"loss": 0.5533, "grad_norm": 0.8900430202484131, "learning_rate": 0.0002, "epoch": 4.670755326016785, "step": 14470}, {"loss": 0.5086, "grad_norm": 1.059267282485962, "learning_rate": 0.0002, "epoch": 4.6739832149774045, "step": 14480}, {"loss": 0.5583, "grad_norm": 0.9864052534103394, "learning_rate": 0.0002, "epoch": 4.677211103938024, "step": 14490}, {"loss": 0.5737, "grad_norm": 1.210854411125183, "learning_rate": 0.0002, "epoch": 4.680438992898644, "step": 14500}, {"loss": 0.536, "grad_norm": 1.030693769454956, "learning_rate": 0.0002, "epoch": 4.683666881859264, "step": 14510}, {"loss": 0.544, "grad_norm": 0.9809406995773315, "learning_rate": 0.0002, "epoch": 4.686894770819884, "step": 14520}, {"loss": 0.5522, "grad_norm": 1.0471004247665405, "learning_rate": 0.0002, "epoch": 4.690122659780504, "step": 14530}, {"loss": 0.5613, "grad_norm": 1.1583727598190308, "learning_rate": 0.0002, "epoch": 4.693350548741123, "step": 14540}, {"loss": 0.5608, "grad_norm": 0.9664418697357178, "learning_rate": 0.0002, "epoch": 4.696578437701743, "step": 14550}, {"loss": 0.5624, "grad_norm": 0.9511209726333618, "learning_rate": 0.0002, "epoch": 4.699806326662363, "step": 14560}, {"loss": 0.5806, "grad_norm": 1.0211684703826904, "learning_rate": 0.0002, "epoch": 4.703034215622982, "step": 14570}, {"loss": 0.5536, "grad_norm": 1.097276210784912, "learning_rate": 0.0002, "epoch": 4.706262104583602, "step": 14580}, {"loss": 0.5527, "grad_norm": 0.9363943338394165, "learning_rate": 0.0002, "epoch": 4.7094899935442225, "step": 14590}, {"loss": 0.5261, "grad_norm": 1.4700615406036377, "learning_rate": 0.0002, "epoch": 4.712717882504842, "step": 14600}, {"loss": 0.5489, "grad_norm": 1.0001553297042847, "learning_rate": 0.0002, "epoch": 4.715945771465462, "step": 14610}, {"loss": 0.5236, "grad_norm": 1.0489927530288696, "learning_rate": 0.0002, "epoch": 4.719173660426081, "step": 14620}, {"loss": 0.5418, "grad_norm": 1.0483676195144653, "learning_rate": 0.0002, "epoch": 4.722401549386701, "step": 14630}, {"loss": 0.5596, "grad_norm": 1.1501940488815308, "learning_rate": 0.0002, "epoch": 4.725629438347321, "step": 14640}, {"loss": 0.5059, "grad_norm": 1.1703146696090698, "learning_rate": 0.0002, "epoch": 4.72885732730794, "step": 14650}, {"loss": 0.5356, "grad_norm": 0.8842985033988953, "learning_rate": 0.0002, "epoch": 4.73208521626856, "step": 14660}, {"loss": 0.5229, "grad_norm": 0.9147908687591553, "learning_rate": 0.0002, "epoch": 4.73531310522918, "step": 14670}, {"loss": 0.5436, "grad_norm": 1.0391576290130615, "learning_rate": 0.0002, "epoch": 4.7385409941898, "step": 14680}, {"loss": 0.5803, "grad_norm": 0.9469179511070251, "learning_rate": 0.0002, "epoch": 4.74176888315042, "step": 14690}, {"loss": 0.5201, "grad_norm": 1.0529530048370361, "learning_rate": 0.0002, "epoch": 4.7449967721110395, "step": 14700}, {"loss": 0.5401, "grad_norm": 0.9645711183547974, "learning_rate": 0.0002, "epoch": 4.748224661071659, "step": 14710}, {"loss": 0.5123, "grad_norm": 0.8163343071937561, "learning_rate": 0.0002, "epoch": 4.751452550032279, "step": 14720}, {"loss": 0.5654, "grad_norm": 1.0581341981887817, "learning_rate": 0.0002, "epoch": 4.7546804389928985, "step": 14730}, {"loss": 0.5709, "grad_norm": 1.0913853645324707, "learning_rate": 0.0002, "epoch": 4.757908327953518, "step": 14740}, {"loss": 0.5342, "grad_norm": 1.1071174144744873, "learning_rate": 0.0002, "epoch": 4.761136216914138, "step": 14750}, {"loss": 0.5353, "grad_norm": 1.0060709714889526, "learning_rate": 0.0002, "epoch": 4.764364105874758, "step": 14760}, {"loss": 0.5415, "grad_norm": 1.012024164199829, "learning_rate": 0.0002, "epoch": 4.767591994835378, "step": 14770}, {"loss": 0.5351, "grad_norm": 0.8438148498535156, "learning_rate": 0.0002, "epoch": 4.770819883795998, "step": 14780}, {"loss": 0.5424, "grad_norm": 0.8136811256408691, "learning_rate": 0.0002, "epoch": 4.774047772756617, "step": 14790}, {"loss": 0.5397, "grad_norm": 1.0765691995620728, "learning_rate": 0.0002, "epoch": 4.777275661717237, "step": 14800}, {"loss": 0.5616, "grad_norm": 1.0582574605941772, "learning_rate": 0.0002, "epoch": 4.780503550677857, "step": 14810}, {"loss": 0.5554, "grad_norm": 0.9419516921043396, "learning_rate": 0.0002, "epoch": 4.783731439638476, "step": 14820}, {"loss": 0.5499, "grad_norm": 0.9626181721687317, "learning_rate": 0.0002, "epoch": 4.786959328599096, "step": 14830}, {"loss": 0.565, "grad_norm": 1.2552800178527832, "learning_rate": 0.0002, "epoch": 4.7901872175597155, "step": 14840}, {"loss": 0.5402, "grad_norm": 0.9379919171333313, "learning_rate": 0.0002, "epoch": 4.793415106520336, "step": 14850}, {"loss": 0.5583, "grad_norm": 0.8166947364807129, "learning_rate": 0.0002, "epoch": 4.796642995480956, "step": 14860}, {"loss": 0.5139, "grad_norm": 0.9008694887161255, "learning_rate": 0.0002, "epoch": 4.799870884441575, "step": 14870}, {"loss": 0.5049, "grad_norm": 1.0256156921386719, "learning_rate": 0.0002, "epoch": 4.803098773402195, "step": 14880}, {"loss": 0.5531, "grad_norm": 0.9486594200134277, "learning_rate": 0.0002, "epoch": 4.806326662362815, "step": 14890}, {"loss": 0.5667, "grad_norm": 0.955238401889801, "learning_rate": 0.0002, "epoch": 4.809554551323434, "step": 14900}, {"loss": 0.5269, "grad_norm": 1.03775954246521, "learning_rate": 0.0002, "epoch": 4.812782440284054, "step": 14910}, {"loss": 0.5445, "grad_norm": 1.1383405923843384, "learning_rate": 0.0002, "epoch": 4.816010329244674, "step": 14920}, {"loss": 0.5347, "grad_norm": 0.9411700963973999, "learning_rate": 0.0002, "epoch": 4.819238218205294, "step": 14930}, {"loss": 0.4899, "grad_norm": 0.8188554644584656, "learning_rate": 0.0002, "epoch": 4.822466107165914, "step": 14940}, {"loss": 0.5618, "grad_norm": 1.1336265802383423, "learning_rate": 0.0002, "epoch": 4.8256939961265335, "step": 14950}, {"loss": 0.5578, "grad_norm": 1.106121301651001, "learning_rate": 0.0002, "epoch": 4.828921885087153, "step": 14960}, {"loss": 0.5306, "grad_norm": 1.0206533670425415, "learning_rate": 0.0002, "epoch": 4.832149774047773, "step": 14970}, {"loss": 0.5714, "grad_norm": 1.1123926639556885, "learning_rate": 0.0002, "epoch": 4.8353776630083924, "step": 14980}, {"loss": 0.5208, "grad_norm": 0.7879418730735779, "learning_rate": 0.0002, "epoch": 4.838605551969012, "step": 14990}, {"loss": 0.5385, "grad_norm": 1.0171709060668945, "learning_rate": 0.0002, "epoch": 4.841833440929632, "step": 15000}, {"loss": 0.6049, "grad_norm": 1.010671615600586, "learning_rate": 0.0002, "epoch": 4.845061329890251, "step": 15010}, {"loss": 0.5497, "grad_norm": 1.0778919458389282, "learning_rate": 0.0002, "epoch": 4.848289218850871, "step": 15020}, {"loss": 0.5587, "grad_norm": 1.0479968786239624, "learning_rate": 0.0002, "epoch": 4.851517107811492, "step": 15030}, {"loss": 0.5637, "grad_norm": 1.0345100164413452, "learning_rate": 0.0002, "epoch": 4.854744996772111, "step": 15040}, {"loss": 0.5809, "grad_norm": 0.9539691805839539, "learning_rate": 0.0002, "epoch": 4.857972885732731, "step": 15050}, {"loss": 0.5314, "grad_norm": 0.9914752840995789, "learning_rate": 0.0002, "epoch": 4.8612007746933505, "step": 15060}, {"loss": 0.5277, "grad_norm": 1.1935476064682007, "learning_rate": 0.0002, "epoch": 4.86442866365397, "step": 15070}, {"loss": 0.5497, "grad_norm": 1.0065057277679443, "learning_rate": 0.0002, "epoch": 4.86765655261459, "step": 15080}, {"loss": 0.5563, "grad_norm": 0.9320993423461914, "learning_rate": 0.0002, "epoch": 4.8708844415752095, "step": 15090}, {"loss": 0.5757, "grad_norm": 1.0578069686889648, "learning_rate": 0.0002, "epoch": 4.87411233053583, "step": 15100}, {"loss": 0.5472, "grad_norm": 0.9666239023208618, "learning_rate": 0.0002, "epoch": 4.87734021949645, "step": 15110}, {"loss": 0.5564, "grad_norm": 1.1322687864303589, "learning_rate": 0.0002, "epoch": 4.880568108457069, "step": 15120}, {"loss": 0.5381, "grad_norm": 0.955674409866333, "learning_rate": 0.0002, "epoch": 4.883795997417689, "step": 15130}, {"loss": 0.557, "grad_norm": 1.119413137435913, "learning_rate": 0.0002, "epoch": 4.887023886378309, "step": 15140}, {"loss": 0.5527, "grad_norm": 0.863646924495697, "learning_rate": 0.0002, "epoch": 4.890251775338928, "step": 15150}, {"loss": 0.5908, "grad_norm": 1.1823450326919556, "learning_rate": 0.0002, "epoch": 4.893479664299548, "step": 15160}, {"loss": 0.5654, "grad_norm": 0.8657588958740234, "learning_rate": 0.0002, "epoch": 4.896707553260168, "step": 15170}, {"loss": 0.5239, "grad_norm": 0.8575737476348877, "learning_rate": 0.0002, "epoch": 4.899935442220787, "step": 15180}, {"loss": 0.564, "grad_norm": 0.9611830711364746, "learning_rate": 0.0002, "epoch": 4.903163331181407, "step": 15190}, {"loss": 0.5505, "grad_norm": 1.1981453895568848, "learning_rate": 0.0002, "epoch": 4.906391220142027, "step": 15200}, {"loss": 0.5582, "grad_norm": 0.9401199221611023, "learning_rate": 0.0002, "epoch": 4.909619109102647, "step": 15210}, {"loss": 0.5631, "grad_norm": 0.8420369625091553, "learning_rate": 0.0002, "epoch": 4.912846998063267, "step": 15220}, {"loss": 0.5255, "grad_norm": 0.7877969145774841, "learning_rate": 0.0002, "epoch": 4.916074887023886, "step": 15230}, {"loss": 0.5522, "grad_norm": 0.8988324403762817, "learning_rate": 0.0002, "epoch": 4.919302775984506, "step": 15240}, {"loss": 0.5274, "grad_norm": 1.1103752851486206, "learning_rate": 0.0002, "epoch": 4.922530664945126, "step": 15250}, {"loss": 0.5249, "grad_norm": 0.8874443173408508, "learning_rate": 0.0002, "epoch": 4.925758553905745, "step": 15260}, {"loss": 0.5677, "grad_norm": 1.1001752614974976, "learning_rate": 0.0002, "epoch": 4.928986442866366, "step": 15270}, {"loss": 0.5596, "grad_norm": 0.9661307334899902, "learning_rate": 0.0002, "epoch": 4.9322143318269855, "step": 15280}, {"loss": 0.5678, "grad_norm": 1.1738812923431396, "learning_rate": 0.0002, "epoch": 4.935442220787605, "step": 15290}, {"loss": 0.5057, "grad_norm": 0.9773507714271545, "learning_rate": 0.0002, "epoch": 4.938670109748225, "step": 15300}, {"loss": 0.5029, "grad_norm": 1.0735599994659424, "learning_rate": 0.0002, "epoch": 4.9418979987088445, "step": 15310}, {"loss": 0.4996, "grad_norm": 1.0552113056182861, "learning_rate": 0.0002, "epoch": 4.945125887669464, "step": 15320}, {"loss": 0.5201, "grad_norm": 1.0900797843933105, "learning_rate": 0.0002, "epoch": 4.948353776630084, "step": 15330}, {"loss": 0.552, "grad_norm": 1.0908405780792236, "learning_rate": 0.0002, "epoch": 4.9515816655907035, "step": 15340}, {"loss": 0.6208, "grad_norm": 1.010221004486084, "learning_rate": 0.0002, "epoch": 4.954809554551323, "step": 15350}, {"loss": 0.5423, "grad_norm": 1.0321437120437622, "learning_rate": 0.0002, "epoch": 4.958037443511943, "step": 15360}, {"loss": 0.5903, "grad_norm": 0.8430278897285461, "learning_rate": 0.0002, "epoch": 4.961265332472563, "step": 15370}, {"loss": 0.538, "grad_norm": 0.8775330185890198, "learning_rate": 0.0002, "epoch": 4.964493221433183, "step": 15380}, {"loss": 0.5344, "grad_norm": 0.9796988368034363, "learning_rate": 0.0002, "epoch": 4.967721110393803, "step": 15390}, {"loss": 0.5352, "grad_norm": 0.8782257437705994, "learning_rate": 0.0002, "epoch": 4.970948999354422, "step": 15400}, {"loss": 0.5843, "grad_norm": 0.9959840774536133, "learning_rate": 0.0002, "epoch": 4.974176888315042, "step": 15410}, {"loss": 0.5783, "grad_norm": 1.0730273723602295, "learning_rate": 0.0002, "epoch": 4.9774047772756616, "step": 15420}, {"loss": 0.5277, "grad_norm": 0.8653680682182312, "learning_rate": 0.0002, "epoch": 4.980632666236281, "step": 15430}, {"loss": 0.5301, "grad_norm": 1.0769985914230347, "learning_rate": 0.0002, "epoch": 4.983860555196901, "step": 15440}, {"loss": 0.5727, "grad_norm": 1.1336040496826172, "learning_rate": 0.0002, "epoch": 4.987088444157521, "step": 15450}, {"loss": 0.5454, "grad_norm": 0.9844824075698853, "learning_rate": 0.0002, "epoch": 4.990316333118141, "step": 15460}, {"loss": 0.5316, "grad_norm": 0.8368769288063049, "learning_rate": 0.0002, "epoch": 4.993544222078761, "step": 15470}, {"loss": 0.5464, "grad_norm": 1.0238676071166992, "learning_rate": 0.0002, "epoch": 4.99677211103938, "step": 15480}, {"loss": 0.5577, "grad_norm": 1.064820408821106, "learning_rate": 0.0002, "epoch": 5.0, "step": 15490}, {"eval_loss": 1.241918921470642, "eval_runtime": 158.4099, "eval_samples_per_second": 4.627, "eval_steps_per_second": 0.581, "epoch": 5.0, "step": 15490}, {"loss": 0.4554, "grad_norm": 1.1366689205169678, "learning_rate": 0.0002, "epoch": 5.00322788896062, "step": 15500}, {"loss": 0.4288, "grad_norm": 1.2548010349273682, "learning_rate": 0.0002, "epoch": 5.006455777921239, "step": 15510}, {"loss": 0.4276, "grad_norm": 1.3875139951705933, "learning_rate": 0.0002, "epoch": 5.009683666881859, "step": 15520}, {"loss": 0.4198, "grad_norm": 0.9834036231040955, "learning_rate": 0.0002, "epoch": 5.012911555842479, "step": 15530}, {"loss": 0.4531, "grad_norm": 1.0737303495407104, "learning_rate": 0.0002, "epoch": 5.016139444803099, "step": 15540}, {"loss": 0.4073, "grad_norm": 0.9877859950065613, "learning_rate": 0.0002, "epoch": 5.019367333763719, "step": 15550}, {"loss": 0.4459, "grad_norm": 1.143268346786499, "learning_rate": 0.0002, "epoch": 5.0225952227243384, "step": 15560}, {"loss": 0.4477, "grad_norm": 1.1206166744232178, "learning_rate": 0.0002, "epoch": 5.025823111684958, "step": 15570}, {"loss": 0.4593, "grad_norm": 0.9977272748947144, "learning_rate": 0.0002, "epoch": 5.029051000645578, "step": 15580}, {"loss": 0.436, "grad_norm": 1.3193285465240479, "learning_rate": 0.0002, "epoch": 5.032278889606197, "step": 15590}, {"loss": 0.4426, "grad_norm": 1.0761713981628418, "learning_rate": 0.0002, "epoch": 5.035506778566817, "step": 15600}, {"loss": 0.4701, "grad_norm": 1.1250759363174438, "learning_rate": 0.0002, "epoch": 5.038734667527437, "step": 15610}, {"loss": 0.3995, "grad_norm": 1.0414305925369263, "learning_rate": 0.0002, "epoch": 5.041962556488057, "step": 15620}, {"loss": 0.4244, "grad_norm": 1.0906853675842285, "learning_rate": 0.0002, "epoch": 5.045190445448677, "step": 15630}, {"loss": 0.441, "grad_norm": 0.9360867142677307, "learning_rate": 0.0002, "epoch": 5.0484183344092965, "step": 15640}, {"loss": 0.4146, "grad_norm": 0.9078057408332825, "learning_rate": 0.0002, "epoch": 5.051646223369916, "step": 15650}, {"loss": 0.4285, "grad_norm": 1.0054848194122314, "learning_rate": 0.0002, "epoch": 5.054874112330536, "step": 15660}, {"loss": 0.417, "grad_norm": 0.9538215398788452, "learning_rate": 0.0002, "epoch": 5.0581020012911555, "step": 15670}, {"loss": 0.4629, "grad_norm": 1.6312693357467651, "learning_rate": 0.0002, "epoch": 5.061329890251775, "step": 15680}, {"loss": 0.3996, "grad_norm": 1.2100921869277954, "learning_rate": 0.0002, "epoch": 5.064557779212395, "step": 15690}, {"loss": 0.4489, "grad_norm": 1.2776238918304443, "learning_rate": 0.0002, "epoch": 5.0677856681730145, "step": 15700}, {"loss": 0.4728, "grad_norm": 1.0110050439834595, "learning_rate": 0.0002, "epoch": 5.071013557133635, "step": 15710}, {"loss": 0.4916, "grad_norm": 1.0896575450897217, "learning_rate": 0.0002, "epoch": 5.074241446094255, "step": 15720}, {"loss": 0.4462, "grad_norm": 0.9989936947822571, "learning_rate": 0.0002, "epoch": 5.077469335054874, "step": 15730}, {"loss": 0.457, "grad_norm": 1.0412228107452393, "learning_rate": 0.0002, "epoch": 5.080697224015494, "step": 15740}, {"loss": 0.4525, "grad_norm": 1.0964457988739014, "learning_rate": 0.0002, "epoch": 5.083925112976114, "step": 15750}, {"loss": 0.4539, "grad_norm": 1.1700960397720337, "learning_rate": 0.0002, "epoch": 5.087153001936733, "step": 15760}, {"loss": 0.4517, "grad_norm": 0.9515631794929504, "learning_rate": 0.0002, "epoch": 5.090380890897353, "step": 15770}, {"loss": 0.4352, "grad_norm": 1.0895006656646729, "learning_rate": 0.0002, "epoch": 5.093608779857973, "step": 15780}, {"loss": 0.4765, "grad_norm": 1.041312575340271, "learning_rate": 0.0002, "epoch": 5.096836668818592, "step": 15790}, {"loss": 0.4532, "grad_norm": 0.9518465399742126, "learning_rate": 0.0002, "epoch": 5.100064557779213, "step": 15800}, {"loss": 0.4187, "grad_norm": 0.8317030668258667, "learning_rate": 0.0002, "epoch": 5.103292446739832, "step": 15810}, {"loss": 0.4523, "grad_norm": 1.0933761596679688, "learning_rate": 0.0002, "epoch": 5.106520335700452, "step": 15820}, {"loss": 0.4689, "grad_norm": 1.0069324970245361, "learning_rate": 0.0002, "epoch": 5.109748224661072, "step": 15830}, {"loss": 0.4773, "grad_norm": 1.1166068315505981, "learning_rate": 0.0002, "epoch": 5.112976113621691, "step": 15840}, {"loss": 0.4635, "grad_norm": 1.069992184638977, "learning_rate": 0.0002, "epoch": 5.116204002582311, "step": 15850}, {"loss": 0.445, "grad_norm": 1.3728036880493164, "learning_rate": 0.0002, "epoch": 5.119431891542931, "step": 15860}, {"loss": 0.4563, "grad_norm": 1.0625780820846558, "learning_rate": 0.0002, "epoch": 5.12265978050355, "step": 15870}, {"loss": 0.426, "grad_norm": 1.090174913406372, "learning_rate": 0.0002, "epoch": 5.125887669464171, "step": 15880}, {"loss": 0.457, "grad_norm": 0.8729526996612549, "learning_rate": 0.0002, "epoch": 5.1291155584247905, "step": 15890}, {"loss": 0.4686, "grad_norm": 0.9561540484428406, "learning_rate": 0.0002, "epoch": 5.13234344738541, "step": 15900}, {"loss": 0.4266, "grad_norm": 1.012120246887207, "learning_rate": 0.0002, "epoch": 5.13557133634603, "step": 15910}, {"loss": 0.4484, "grad_norm": 1.1027921438217163, "learning_rate": 0.0002, "epoch": 5.1387992253066495, "step": 15920}, {"loss": 0.4389, "grad_norm": 1.0878126621246338, "learning_rate": 0.0002, "epoch": 5.142027114267269, "step": 15930}, {"loss": 0.4716, "grad_norm": 0.9619103670120239, "learning_rate": 0.0002, "epoch": 5.145255003227889, "step": 15940}, {"loss": 0.4071, "grad_norm": 1.1684138774871826, "learning_rate": 0.0002, "epoch": 5.148482892188508, "step": 15950}, {"loss": 0.4292, "grad_norm": 1.3379510641098022, "learning_rate": 0.0002, "epoch": 5.151710781149128, "step": 15960}, {"loss": 0.4413, "grad_norm": 1.0427496433258057, "learning_rate": 0.0002, "epoch": 5.154938670109749, "step": 15970}, {"loss": 0.4665, "grad_norm": 0.9917148351669312, "learning_rate": 0.0002, "epoch": 5.158166559070368, "step": 15980}, {"loss": 0.4527, "grad_norm": 1.0899780988693237, "learning_rate": 0.0002, "epoch": 5.161394448030988, "step": 15990}, {"loss": 0.4764, "grad_norm": 0.9251647591590881, "learning_rate": 0.0002, "epoch": 5.1646223369916076, "step": 16000}, {"loss": 0.5043, "grad_norm": 1.1669172048568726, "learning_rate": 0.0002, "epoch": 5.167850225952227, "step": 16010}, {"loss": 0.4726, "grad_norm": 1.2285256385803223, "learning_rate": 0.0002, "epoch": 5.171078114912847, "step": 16020}, {"loss": 0.4312, "grad_norm": 1.0504484176635742, "learning_rate": 0.0002, "epoch": 5.1743060038734665, "step": 16030}, {"loss": 0.4507, "grad_norm": 1.2829089164733887, "learning_rate": 0.0002, "epoch": 5.177533892834086, "step": 16040}, {"loss": 0.4547, "grad_norm": 0.9332743287086487, "learning_rate": 0.0002, "epoch": 5.180761781794706, "step": 16050}, {"loss": 0.4211, "grad_norm": 1.0054426193237305, "learning_rate": 0.0002, "epoch": 5.183989670755326, "step": 16060}, {"loss": 0.4415, "grad_norm": 1.0049669742584229, "learning_rate": 0.0002, "epoch": 5.187217559715946, "step": 16070}, {"loss": 0.4462, "grad_norm": 1.0171366930007935, "learning_rate": 0.0002, "epoch": 5.190445448676566, "step": 16080}, {"loss": 0.4725, "grad_norm": 1.234966516494751, "learning_rate": 0.0002, "epoch": 5.193673337637185, "step": 16090}, {"loss": 0.4579, "grad_norm": 0.9127960205078125, "learning_rate": 0.0002, "epoch": 5.196901226597805, "step": 16100}, {"loss": 0.4647, "grad_norm": 1.153924822807312, "learning_rate": 0.0002, "epoch": 5.200129115558425, "step": 16110}, {"loss": 0.4826, "grad_norm": 1.26716947555542, "learning_rate": 0.0002, "epoch": 5.203357004519044, "step": 16120}, {"loss": 0.446, "grad_norm": 1.2438743114471436, "learning_rate": 0.0002, "epoch": 5.206584893479664, "step": 16130}, {"loss": 0.4768, "grad_norm": 1.0888392925262451, "learning_rate": 0.0002, "epoch": 5.2098127824402845, "step": 16140}, {"loss": 0.4508, "grad_norm": 1.1741917133331299, "learning_rate": 0.0002, "epoch": 5.213040671400904, "step": 16150}, {"loss": 0.4271, "grad_norm": 0.9508614540100098, "learning_rate": 0.0002, "epoch": 5.216268560361524, "step": 16160}, {"loss": 0.4577, "grad_norm": 0.9714716672897339, "learning_rate": 0.0002, "epoch": 5.219496449322143, "step": 16170}, {"loss": 0.4636, "grad_norm": 1.2681622505187988, "learning_rate": 0.0002, "epoch": 5.222724338282763, "step": 16180}, {"loss": 0.4723, "grad_norm": 1.045871376991272, "learning_rate": 0.0002, "epoch": 5.225952227243383, "step": 16190}, {"loss": 0.4467, "grad_norm": 1.0272563695907593, "learning_rate": 0.0002, "epoch": 5.229180116204002, "step": 16200}, {"loss": 0.4353, "grad_norm": 1.092901349067688, "learning_rate": 0.0002, "epoch": 5.232408005164622, "step": 16210}, {"loss": 0.4588, "grad_norm": 0.9332799315452576, "learning_rate": 0.0002, "epoch": 5.235635894125242, "step": 16220}, {"loss": 0.4594, "grad_norm": 1.1728498935699463, "learning_rate": 0.0002, "epoch": 5.238863783085862, "step": 16230}, {"loss": 0.4652, "grad_norm": 0.9932476878166199, "learning_rate": 0.0002, "epoch": 5.242091672046482, "step": 16240}, {"loss": 0.4469, "grad_norm": 0.735236406326294, "learning_rate": 0.0002, "epoch": 5.2453195610071015, "step": 16250}, {"loss": 0.4386, "grad_norm": 1.0289303064346313, "learning_rate": 0.0002, "epoch": 5.248547449967721, "step": 16260}, {"loss": 0.4303, "grad_norm": 0.9488231539726257, "learning_rate": 0.0002, "epoch": 5.251775338928341, "step": 16270}, {"loss": 0.4495, "grad_norm": 0.8320055603981018, "learning_rate": 0.0002, "epoch": 5.2550032278889605, "step": 16280}, {"loss": 0.4224, "grad_norm": 1.2013251781463623, "learning_rate": 0.0002, "epoch": 5.25823111684958, "step": 16290}, {"loss": 0.4666, "grad_norm": 1.0649845600128174, "learning_rate": 0.0002, "epoch": 5.2614590058102, "step": 16300}, {"loss": 0.4325, "grad_norm": 1.1674472093582153, "learning_rate": 0.0002, "epoch": 5.26468689477082, "step": 16310}, {"loss": 0.4482, "grad_norm": 1.3934763669967651, "learning_rate": 0.0002, "epoch": 5.26791478373144, "step": 16320}, {"loss": 0.4494, "grad_norm": 0.8427977561950684, "learning_rate": 0.0002, "epoch": 5.27114267269206, "step": 16330}, {"loss": 0.4234, "grad_norm": 1.0497093200683594, "learning_rate": 0.0002, "epoch": 5.274370561652679, "step": 16340}, {"loss": 0.4337, "grad_norm": 0.8562338352203369, "learning_rate": 0.0002, "epoch": 5.277598450613299, "step": 16350}, {"loss": 0.4664, "grad_norm": 1.043920874595642, "learning_rate": 0.0002, "epoch": 5.280826339573919, "step": 16360}, {"loss": 0.4463, "grad_norm": 1.0039188861846924, "learning_rate": 0.0002, "epoch": 5.284054228534538, "step": 16370}, {"loss": 0.4149, "grad_norm": 0.9414041638374329, "learning_rate": 0.0002, "epoch": 5.287282117495158, "step": 16380}, {"loss": 0.5119, "grad_norm": 1.3346221446990967, "learning_rate": 0.0002, "epoch": 5.2905100064557775, "step": 16390}, {"loss": 0.4479, "grad_norm": 1.0173962116241455, "learning_rate": 0.0002, "epoch": 5.293737895416398, "step": 16400}, {"loss": 0.4538, "grad_norm": 0.7756500244140625, "learning_rate": 0.0002, "epoch": 5.296965784377018, "step": 16410}, {"loss": 0.4306, "grad_norm": 1.1185362339019775, "learning_rate": 0.0002, "epoch": 5.300193673337637, "step": 16420}, {"loss": 0.5033, "grad_norm": 1.0904899835586548, "learning_rate": 0.0002, "epoch": 5.303421562298257, "step": 16430}, {"loss": 0.4887, "grad_norm": 1.0803170204162598, "learning_rate": 0.0002, "epoch": 5.306649451258877, "step": 16440}, {"loss": 0.4473, "grad_norm": 1.1492092609405518, "learning_rate": 0.0002, "epoch": 5.309877340219496, "step": 16450}, {"loss": 0.4696, "grad_norm": 1.1212135553359985, "learning_rate": 0.0002, "epoch": 5.313105229180116, "step": 16460}, {"loss": 0.4438, "grad_norm": 0.8274528980255127, "learning_rate": 0.0002, "epoch": 5.316333118140736, "step": 16470}, {"loss": 0.468, "grad_norm": 1.118891716003418, "learning_rate": 0.0002, "epoch": 5.319561007101356, "step": 16480}, {"loss": 0.4403, "grad_norm": 1.185945749282837, "learning_rate": 0.0002, "epoch": 5.322788896061976, "step": 16490}, {"loss": 0.4946, "grad_norm": 1.0275214910507202, "learning_rate": 0.0002, "epoch": 5.3260167850225955, "step": 16500}, {"loss": 0.4612, "grad_norm": 0.9346362352371216, "learning_rate": 0.0002, "epoch": 5.329244673983215, "step": 16510}, {"loss": 0.4722, "grad_norm": 0.9600600600242615, "learning_rate": 0.0002, "epoch": 5.332472562943835, "step": 16520}, {"loss": 0.4536, "grad_norm": 1.1238188743591309, "learning_rate": 0.0002, "epoch": 5.335700451904454, "step": 16530}, {"loss": 0.5025, "grad_norm": 0.8660476207733154, "learning_rate": 0.0002, "epoch": 5.338928340865074, "step": 16540}, {"loss": 0.4732, "grad_norm": 0.9869821071624756, "learning_rate": 0.0002, "epoch": 5.342156229825694, "step": 16550}, {"loss": 0.4967, "grad_norm": 1.1719090938568115, "learning_rate": 0.0002, "epoch": 5.345384118786313, "step": 16560}, {"loss": 0.4563, "grad_norm": 1.0122894048690796, "learning_rate": 0.0002, "epoch": 5.348612007746934, "step": 16570}, {"loss": 0.5066, "grad_norm": 1.2431079149246216, "learning_rate": 0.0002, "epoch": 5.351839896707554, "step": 16580}, {"loss": 0.4708, "grad_norm": 1.4178080558776855, "learning_rate": 0.0002, "epoch": 5.355067785668173, "step": 16590}, {"loss": 0.4686, "grad_norm": 1.1895726919174194, "learning_rate": 0.0002, "epoch": 5.358295674628793, "step": 16600}, {"loss": 0.475, "grad_norm": 1.154392123222351, "learning_rate": 0.0002, "epoch": 5.3615235635894125, "step": 16610}, {"loss": 0.4511, "grad_norm": 0.9207229018211365, "learning_rate": 0.0002, "epoch": 5.364751452550032, "step": 16620}, {"loss": 0.4606, "grad_norm": 1.0247414112091064, "learning_rate": 0.0002, "epoch": 5.367979341510652, "step": 16630}, {"loss": 0.4886, "grad_norm": 1.0402202606201172, "learning_rate": 0.0002, "epoch": 5.3712072304712715, "step": 16640}, {"loss": 0.4903, "grad_norm": 1.1902891397476196, "learning_rate": 0.0002, "epoch": 5.374435119431892, "step": 16650}, {"loss": 0.4583, "grad_norm": 0.9572759866714478, "learning_rate": 0.0002, "epoch": 5.377663008392512, "step": 16660}, {"loss": 0.4636, "grad_norm": 0.9968860149383545, "learning_rate": 0.0002, "epoch": 5.380890897353131, "step": 16670}, {"loss": 0.477, "grad_norm": 1.2468547821044922, "learning_rate": 0.0002, "epoch": 5.384118786313751, "step": 16680}, {"loss": 0.5223, "grad_norm": 1.154661774635315, "learning_rate": 0.0002, "epoch": 5.387346675274371, "step": 16690}, {"loss": 0.4637, "grad_norm": 0.8837044835090637, "learning_rate": 0.0002, "epoch": 5.39057456423499, "step": 16700}, {"loss": 0.4744, "grad_norm": 1.0317907333374023, "learning_rate": 0.0002, "epoch": 5.39380245319561, "step": 16710}, {"loss": 0.4831, "grad_norm": 0.9811587929725647, "learning_rate": 0.0002, "epoch": 5.39703034215623, "step": 16720}, {"loss": 0.4739, "grad_norm": 0.9487450122833252, "learning_rate": 0.0002, "epoch": 5.400258231116849, "step": 16730}, {"loss": 0.4574, "grad_norm": 1.0540274381637573, "learning_rate": 0.0002, "epoch": 5.403486120077469, "step": 16740}, {"loss": 0.4709, "grad_norm": 1.028363585472107, "learning_rate": 0.0002, "epoch": 5.406714009038089, "step": 16750}, {"loss": 0.468, "grad_norm": 1.0200704336166382, "learning_rate": 0.0002, "epoch": 5.409941897998709, "step": 16760}, {"loss": 0.4383, "grad_norm": 1.0330981016159058, "learning_rate": 0.0002, "epoch": 5.413169786959329, "step": 16770}, {"loss": 0.4645, "grad_norm": 1.320875644683838, "learning_rate": 0.0002, "epoch": 5.416397675919948, "step": 16780}, {"loss": 0.4601, "grad_norm": 0.9838143587112427, "learning_rate": 0.0002, "epoch": 5.419625564880568, "step": 16790}, {"loss": 0.4835, "grad_norm": 1.1006578207015991, "learning_rate": 0.0002, "epoch": 5.422853453841188, "step": 16800}, {"loss": 0.4871, "grad_norm": 1.099174976348877, "learning_rate": 0.0002, "epoch": 5.426081342801807, "step": 16810}, {"loss": 0.4773, "grad_norm": 1.0632189512252808, "learning_rate": 0.0002, "epoch": 5.429309231762427, "step": 16820}, {"loss": 0.4732, "grad_norm": 0.9673194885253906, "learning_rate": 0.0002, "epoch": 5.4325371207230475, "step": 16830}, {"loss": 0.4731, "grad_norm": 0.853013813495636, "learning_rate": 0.0002, "epoch": 5.435765009683667, "step": 16840}, {"loss": 0.4856, "grad_norm": 1.0261728763580322, "learning_rate": 0.0002, "epoch": 5.438992898644287, "step": 16850}, {"loss": 0.4729, "grad_norm": 1.1642370223999023, "learning_rate": 0.0002, "epoch": 5.4422207876049065, "step": 16860}, {"loss": 0.4751, "grad_norm": 0.8715673685073853, "learning_rate": 0.0002, "epoch": 5.445448676565526, "step": 16870}, {"loss": 0.4566, "grad_norm": 0.905746579170227, "learning_rate": 0.0002, "epoch": 5.448676565526146, "step": 16880}, {"loss": 0.4536, "grad_norm": 1.1051915884017944, "learning_rate": 0.0002, "epoch": 5.451904454486765, "step": 16890}, {"loss": 0.4944, "grad_norm": 1.0781478881835938, "learning_rate": 0.0002, "epoch": 5.455132343447385, "step": 16900}, {"loss": 0.4655, "grad_norm": 1.1168911457061768, "learning_rate": 0.0002, "epoch": 5.458360232408005, "step": 16910}, {"loss": 0.4624, "grad_norm": 1.1150046586990356, "learning_rate": 0.0002, "epoch": 5.461588121368625, "step": 16920}, {"loss": 0.4849, "grad_norm": 0.9862499833106995, "learning_rate": 0.0002, "epoch": 5.464816010329245, "step": 16930}, {"loss": 0.47, "grad_norm": 1.5416640043258667, "learning_rate": 0.0002, "epoch": 5.468043899289865, "step": 16940}, {"loss": 0.4508, "grad_norm": 0.8960899710655212, "learning_rate": 0.0002, "epoch": 5.471271788250484, "step": 16950}, {"loss": 0.5002, "grad_norm": 0.9796477556228638, "learning_rate": 0.0002, "epoch": 5.474499677211104, "step": 16960}, {"loss": 0.4939, "grad_norm": 0.9526587128639221, "learning_rate": 0.0002, "epoch": 5.4777275661717235, "step": 16970}, {"loss": 0.4807, "grad_norm": 1.2373039722442627, "learning_rate": 0.0002, "epoch": 5.480955455132343, "step": 16980}, {"loss": 0.4642, "grad_norm": 1.1860566139221191, "learning_rate": 0.0002, "epoch": 5.484183344092963, "step": 16990}, {"loss": 0.4929, "grad_norm": 1.477345585823059, "learning_rate": 0.0002, "epoch": 5.487411233053583, "step": 17000}, {"loss": 0.4566, "grad_norm": 1.1029295921325684, "learning_rate": 0.0002, "epoch": 5.490639122014203, "step": 17010}, {"loss": 0.487, "grad_norm": 1.1416981220245361, "learning_rate": 0.0002, "epoch": 5.493867010974823, "step": 17020}, {"loss": 0.475, "grad_norm": 1.1647989749908447, "learning_rate": 0.0002, "epoch": 5.497094899935442, "step": 17030}, {"loss": 0.4644, "grad_norm": 1.1297032833099365, "learning_rate": 0.0002, "epoch": 5.500322788896062, "step": 17040}, {"loss": 0.4885, "grad_norm": 0.9764689207077026, "learning_rate": 0.0002, "epoch": 5.503550677856682, "step": 17050}, {"loss": 0.4789, "grad_norm": 1.038161039352417, "learning_rate": 0.0002, "epoch": 5.506778566817301, "step": 17060}, {"loss": 0.4467, "grad_norm": 1.1417886018753052, "learning_rate": 0.0002, "epoch": 5.510006455777921, "step": 17070}, {"loss": 0.4782, "grad_norm": 0.9300898313522339, "learning_rate": 0.0002, "epoch": 5.513234344738541, "step": 17080}, {"loss": 0.4805, "grad_norm": 1.0295016765594482, "learning_rate": 0.0002, "epoch": 5.516462233699161, "step": 17090}, {"loss": 0.4663, "grad_norm": 1.1273008584976196, "learning_rate": 0.0002, "epoch": 5.519690122659781, "step": 17100}, {"loss": 0.4897, "grad_norm": 0.9542737007141113, "learning_rate": 0.0002, "epoch": 5.5229180116204, "step": 17110}, {"loss": 0.51, "grad_norm": 1.34589421749115, "learning_rate": 0.0002, "epoch": 5.52614590058102, "step": 17120}, {"loss": 0.467, "grad_norm": 0.9889675378799438, "learning_rate": 0.0002, "epoch": 5.52937378954164, "step": 17130}, {"loss": 0.4752, "grad_norm": 1.25719153881073, "learning_rate": 0.0002, "epoch": 5.532601678502259, "step": 17140}, {"loss": 0.4609, "grad_norm": 1.2511073350906372, "learning_rate": 0.0002, "epoch": 5.535829567462879, "step": 17150}, {"loss": 0.4992, "grad_norm": 1.1993521451950073, "learning_rate": 0.0002, "epoch": 5.539057456423499, "step": 17160}, {"loss": 0.4986, "grad_norm": 1.1394526958465576, "learning_rate": 0.0002, "epoch": 5.542285345384119, "step": 17170}, {"loss": 0.5284, "grad_norm": 1.0435349941253662, "learning_rate": 0.0002, "epoch": 5.545513234344739, "step": 17180}, {"loss": 0.4934, "grad_norm": 1.120940089225769, "learning_rate": 0.0002, "epoch": 5.5487411233053585, "step": 17190}, {"loss": 0.4704, "grad_norm": 1.0906445980072021, "learning_rate": 0.0002, "epoch": 5.551969012265978, "step": 17200}, {"loss": 0.4896, "grad_norm": 0.8883966207504272, "learning_rate": 0.0002, "epoch": 5.555196901226598, "step": 17210}, {"loss": 0.4696, "grad_norm": 1.3078752756118774, "learning_rate": 0.0002, "epoch": 5.5584247901872175, "step": 17220}, {"loss": 0.4805, "grad_norm": 1.0224416255950928, "learning_rate": 0.0002, "epoch": 5.561652679147837, "step": 17230}, {"loss": 0.47, "grad_norm": 1.242518663406372, "learning_rate": 0.0002, "epoch": 5.564880568108457, "step": 17240}, {"loss": 0.4708, "grad_norm": 1.2328250408172607, "learning_rate": 0.0002, "epoch": 5.568108457069076, "step": 17250}, {"loss": 0.4685, "grad_norm": 1.2186611890792847, "learning_rate": 0.0002, "epoch": 5.571336346029697, "step": 17260}, {"loss": 0.4688, "grad_norm": 1.0947459936141968, "learning_rate": 0.0002, "epoch": 5.574564234990317, "step": 17270}, {"loss": 0.506, "grad_norm": 1.075279951095581, "learning_rate": 0.0002, "epoch": 5.577792123950936, "step": 17280}, {"loss": 0.478, "grad_norm": 1.0316804647445679, "learning_rate": 0.0002, "epoch": 5.581020012911556, "step": 17290}, {"loss": 0.478, "grad_norm": 1.1077373027801514, "learning_rate": 0.0002, "epoch": 5.584247901872176, "step": 17300}, {"loss": 0.4857, "grad_norm": 1.219228744506836, "learning_rate": 0.0002, "epoch": 5.587475790832795, "step": 17310}, {"loss": 0.4465, "grad_norm": 1.026361346244812, "learning_rate": 0.0002, "epoch": 5.590703679793415, "step": 17320}, {"loss": 0.4831, "grad_norm": 1.1621283292770386, "learning_rate": 0.0002, "epoch": 5.5939315687540345, "step": 17330}, {"loss": 0.4706, "grad_norm": 1.0177470445632935, "learning_rate": 0.0002, "epoch": 5.597159457714655, "step": 17340}, {"loss": 0.4961, "grad_norm": 1.0625319480895996, "learning_rate": 0.0002, "epoch": 5.600387346675275, "step": 17350}, {"loss": 0.484, "grad_norm": 1.148815393447876, "learning_rate": 0.0002, "epoch": 5.603615235635894, "step": 17360}, {"loss": 0.4804, "grad_norm": 1.0571802854537964, "learning_rate": 0.0002, "epoch": 5.606843124596514, "step": 17370}, {"loss": 0.5202, "grad_norm": 1.2069389820098877, "learning_rate": 0.0002, "epoch": 5.610071013557134, "step": 17380}, {"loss": 0.5029, "grad_norm": 1.407530426979065, "learning_rate": 0.0002, "epoch": 5.613298902517753, "step": 17390}, {"loss": 0.4688, "grad_norm": 1.247060775756836, "learning_rate": 0.0002, "epoch": 5.616526791478373, "step": 17400}, {"loss": 0.4359, "grad_norm": 1.431684136390686, "learning_rate": 0.0002, "epoch": 5.619754680438993, "step": 17410}, {"loss": 0.5244, "grad_norm": 1.0520552396774292, "learning_rate": 0.0002, "epoch": 5.622982569399612, "step": 17420}, {"loss": 0.4993, "grad_norm": 1.0593537092208862, "learning_rate": 0.0002, "epoch": 5.626210458360232, "step": 17430}, {"loss": 0.4911, "grad_norm": 1.4414515495300293, "learning_rate": 0.0002, "epoch": 5.6294383473208525, "step": 17440}, {"loss": 0.4761, "grad_norm": 1.0902460813522339, "learning_rate": 0.0002, "epoch": 5.632666236281472, "step": 17450}, {"loss": 0.4737, "grad_norm": 0.890944242477417, "learning_rate": 0.0002, "epoch": 5.635894125242092, "step": 17460}, {"loss": 0.4706, "grad_norm": 1.035675287246704, "learning_rate": 0.0002, "epoch": 5.639122014202711, "step": 17470}, {"loss": 0.484, "grad_norm": 0.9792264103889465, "learning_rate": 0.0002, "epoch": 5.642349903163331, "step": 17480}, {"loss": 0.4753, "grad_norm": 1.1888220310211182, "learning_rate": 0.0002, "epoch": 5.645577792123951, "step": 17490}, {"loss": 0.5047, "grad_norm": 1.0169143676757812, "learning_rate": 0.0002, "epoch": 5.64880568108457, "step": 17500}, {"loss": 0.4919, "grad_norm": 0.9812449216842651, "learning_rate": 0.0002, "epoch": 5.652033570045191, "step": 17510}, {"loss": 0.4879, "grad_norm": 1.0509105920791626, "learning_rate": 0.0002, "epoch": 5.655261459005811, "step": 17520}, {"loss": 0.4695, "grad_norm": 0.9047426581382751, "learning_rate": 0.0002, "epoch": 5.65848934796643, "step": 17530}, {"loss": 0.4712, "grad_norm": 1.2393709421157837, "learning_rate": 0.0002, "epoch": 5.66171723692705, "step": 17540}, {"loss": 0.5012, "grad_norm": 1.1098991632461548, "learning_rate": 0.0002, "epoch": 5.6649451258876695, "step": 17550}, {"loss": 0.4499, "grad_norm": 0.8181570768356323, "learning_rate": 0.0002, "epoch": 5.668173014848289, "step": 17560}, {"loss": 0.4973, "grad_norm": 0.9676381945610046, "learning_rate": 0.0002, "epoch": 5.671400903808909, "step": 17570}, {"loss": 0.5058, "grad_norm": 1.1225934028625488, "learning_rate": 0.0002, "epoch": 5.6746287927695285, "step": 17580}, {"loss": 0.5165, "grad_norm": 1.6259925365447998, "learning_rate": 0.0002, "epoch": 5.677856681730148, "step": 17590}, {"loss": 0.4613, "grad_norm": 0.7751404643058777, "learning_rate": 0.0002, "epoch": 5.681084570690768, "step": 17600}, {"loss": 0.4895, "grad_norm": 0.8478589057922363, "learning_rate": 0.0002, "epoch": 5.684312459651388, "step": 17610}, {"loss": 0.4492, "grad_norm": 1.2887113094329834, "learning_rate": 0.0002, "epoch": 5.687540348612008, "step": 17620}, {"loss": 0.4792, "grad_norm": 1.1452652215957642, "learning_rate": 0.0002, "epoch": 5.690768237572628, "step": 17630}, {"loss": 0.4889, "grad_norm": 1.0370417833328247, "learning_rate": 0.0002, "epoch": 5.693996126533247, "step": 17640}, {"loss": 0.535, "grad_norm": 1.1358870267868042, "learning_rate": 0.0002, "epoch": 5.697224015493867, "step": 17650}, {"loss": 0.4753, "grad_norm": 1.2772479057312012, "learning_rate": 0.0002, "epoch": 5.700451904454487, "step": 17660}, {"loss": 0.4492, "grad_norm": 1.182812213897705, "learning_rate": 0.0002, "epoch": 5.703679793415106, "step": 17670}, {"loss": 0.5025, "grad_norm": 1.099074125289917, "learning_rate": 0.0002, "epoch": 5.706907682375727, "step": 17680}, {"loss": 0.4945, "grad_norm": 0.938634991645813, "learning_rate": 0.0002, "epoch": 5.710135571336346, "step": 17690}, {"loss": 0.491, "grad_norm": 0.9385238885879517, "learning_rate": 0.0002, "epoch": 5.713363460296966, "step": 17700}, {"loss": 0.4849, "grad_norm": 1.1486014127731323, "learning_rate": 0.0002, "epoch": 5.716591349257586, "step": 17710}, {"loss": 0.5043, "grad_norm": 0.9433078169822693, "learning_rate": 0.0002, "epoch": 5.719819238218205, "step": 17720}, {"loss": 0.4543, "grad_norm": 1.02472722530365, "learning_rate": 0.0002, "epoch": 5.723047127178825, "step": 17730}, {"loss": 0.4631, "grad_norm": 0.9360876679420471, "learning_rate": 0.0002, "epoch": 5.726275016139445, "step": 17740}, {"loss": 0.4947, "grad_norm": 1.0481483936309814, "learning_rate": 0.0002, "epoch": 5.729502905100064, "step": 17750}, {"loss": 0.4763, "grad_norm": 1.0032516717910767, "learning_rate": 0.0002, "epoch": 5.732730794060684, "step": 17760}, {"loss": 0.4819, "grad_norm": 0.8908069729804993, "learning_rate": 0.0002, "epoch": 5.735958683021304, "step": 17770}, {"loss": 0.5188, "grad_norm": 1.0679123401641846, "learning_rate": 0.0002, "epoch": 5.739186571981924, "step": 17780}, {"loss": 0.4818, "grad_norm": 1.0448014736175537, "learning_rate": 0.0002, "epoch": 5.742414460942544, "step": 17790}, {"loss": 0.4869, "grad_norm": 1.0433847904205322, "learning_rate": 0.0002, "epoch": 5.7456423499031635, "step": 17800}, {"loss": 0.5243, "grad_norm": 1.000291109085083, "learning_rate": 0.0002, "epoch": 5.748870238863783, "step": 17810}, {"loss": 0.4891, "grad_norm": 1.1238429546356201, "learning_rate": 0.0002, "epoch": 5.752098127824403, "step": 17820}, {"loss": 0.4905, "grad_norm": 1.09062659740448, "learning_rate": 0.0002, "epoch": 5.755326016785022, "step": 17830}, {"loss": 0.4883, "grad_norm": 0.8538689613342285, "learning_rate": 0.0002, "epoch": 5.758553905745642, "step": 17840}, {"loss": 0.4989, "grad_norm": 1.3872947692871094, "learning_rate": 0.0002, "epoch": 5.761781794706262, "step": 17850}, {"loss": 0.4707, "grad_norm": 1.0578876733779907, "learning_rate": 0.0002, "epoch": 5.765009683666882, "step": 17860}, {"loss": 0.5281, "grad_norm": 1.1761705875396729, "learning_rate": 0.0002, "epoch": 5.768237572627502, "step": 17870}, {"loss": 0.4802, "grad_norm": 1.1223368644714355, "learning_rate": 0.0002, "epoch": 5.771465461588122, "step": 17880}, {"loss": 0.505, "grad_norm": 1.2484360933303833, "learning_rate": 0.0002, "epoch": 5.774693350548741, "step": 17890}, {"loss": 0.4786, "grad_norm": 1.2461199760437012, "learning_rate": 0.0002, "epoch": 5.777921239509361, "step": 17900}, {"loss": 0.4933, "grad_norm": 1.1718299388885498, "learning_rate": 0.0002, "epoch": 5.7811491284699805, "step": 17910}, {"loss": 0.471, "grad_norm": 0.9896837472915649, "learning_rate": 0.0002, "epoch": 5.7843770174306, "step": 17920}, {"loss": 0.4808, "grad_norm": 1.3759760856628418, "learning_rate": 0.0002, "epoch": 5.78760490639122, "step": 17930}, {"loss": 0.4847, "grad_norm": 1.0596622228622437, "learning_rate": 0.0002, "epoch": 5.7908327953518395, "step": 17940}, {"loss": 0.5153, "grad_norm": 0.9292021989822388, "learning_rate": 0.0002, "epoch": 5.79406068431246, "step": 17950}, {"loss": 0.4783, "grad_norm": 0.8786653876304626, "learning_rate": 0.0002, "epoch": 5.79728857327308, "step": 17960}, {"loss": 0.4598, "grad_norm": 1.2087152004241943, "learning_rate": 0.0002, "epoch": 5.800516462233699, "step": 17970}, {"loss": 0.4953, "grad_norm": 1.1643104553222656, "learning_rate": 0.0002, "epoch": 5.803744351194319, "step": 17980}, {"loss": 0.5111, "grad_norm": 0.971613347530365, "learning_rate": 0.0002, "epoch": 5.806972240154939, "step": 17990}, {"loss": 0.5094, "grad_norm": 1.306227684020996, "learning_rate": 0.0002, "epoch": 5.810200129115558, "step": 18000}, {"loss": 0.5392, "grad_norm": 1.3665502071380615, "learning_rate": 0.0002, "epoch": 5.813428018076178, "step": 18010}, {"loss": 0.4887, "grad_norm": 1.2227312326431274, "learning_rate": 0.0002, "epoch": 5.816655907036798, "step": 18020}, {"loss": 0.5203, "grad_norm": 1.180694818496704, "learning_rate": 0.0002, "epoch": 5.819883795997418, "step": 18030}, {"loss": 0.4962, "grad_norm": 1.1045362949371338, "learning_rate": 0.0002, "epoch": 5.823111684958038, "step": 18040}, {"loss": 0.4969, "grad_norm": 1.3828954696655273, "learning_rate": 0.0002, "epoch": 5.826339573918657, "step": 18050}, {"loss": 0.5493, "grad_norm": 1.305102825164795, "learning_rate": 0.0002, "epoch": 5.829567462879277, "step": 18060}, {"loss": 0.4844, "grad_norm": 1.2708743810653687, "learning_rate": 0.0002, "epoch": 5.832795351839897, "step": 18070}, {"loss": 0.4834, "grad_norm": 1.0344188213348389, "learning_rate": 0.0002, "epoch": 5.836023240800516, "step": 18080}, {"loss": 0.5088, "grad_norm": 1.1321724653244019, "learning_rate": 0.0002, "epoch": 5.839251129761136, "step": 18090}, {"loss": 0.4888, "grad_norm": 1.2162611484527588, "learning_rate": 0.0002, "epoch": 5.842479018721756, "step": 18100}, {"loss": 0.5014, "grad_norm": 1.427612543106079, "learning_rate": 0.0002, "epoch": 5.845706907682375, "step": 18110}, {"loss": 0.5339, "grad_norm": 1.4391452074050903, "learning_rate": 0.0002, "epoch": 5.848934796642995, "step": 18120}, {"loss": 0.528, "grad_norm": 1.1548216342926025, "learning_rate": 0.0002, "epoch": 5.8521626856036155, "step": 18130}, {"loss": 0.4779, "grad_norm": 1.2336437702178955, "learning_rate": 0.0002, "epoch": 5.855390574564235, "step": 18140}, {"loss": 0.4844, "grad_norm": 1.254661202430725, "learning_rate": 0.0002, "epoch": 5.858618463524855, "step": 18150}, {"loss": 0.5201, "grad_norm": 0.8326491117477417, "learning_rate": 0.0002, "epoch": 5.8618463524854745, "step": 18160}, {"loss": 0.5076, "grad_norm": 1.0907988548278809, "learning_rate": 0.0002, "epoch": 5.865074241446094, "step": 18170}, {"loss": 0.48, "grad_norm": 0.9896568655967712, "learning_rate": 0.0002, "epoch": 5.868302130406714, "step": 18180}, {"loss": 0.4628, "grad_norm": 0.9440065026283264, "learning_rate": 0.0002, "epoch": 5.871530019367333, "step": 18190}, {"loss": 0.5265, "grad_norm": 1.09321129322052, "learning_rate": 0.0002, "epoch": 5.874757908327954, "step": 18200}, {"loss": 0.4737, "grad_norm": 1.2588142156600952, "learning_rate": 0.0002, "epoch": 5.877985797288574, "step": 18210}, {"loss": 0.475, "grad_norm": 1.1731587648391724, "learning_rate": 0.0002, "epoch": 5.881213686249193, "step": 18220}, {"loss": 0.504, "grad_norm": 0.9904444217681885, "learning_rate": 0.0002, "epoch": 5.884441575209813, "step": 18230}, {"loss": 0.4842, "grad_norm": 0.8985799551010132, "learning_rate": 0.0002, "epoch": 5.887669464170433, "step": 18240}, {"loss": 0.4878, "grad_norm": 1.0182441473007202, "learning_rate": 0.0002, "epoch": 5.890897353131052, "step": 18250}, {"loss": 0.5224, "grad_norm": 1.1574701070785522, "learning_rate": 0.0002, "epoch": 5.894125242091672, "step": 18260}, {"loss": 0.5, "grad_norm": 1.1776602268218994, "learning_rate": 0.0002, "epoch": 5.8973531310522915, "step": 18270}, {"loss": 0.5245, "grad_norm": 1.4951308965682983, "learning_rate": 0.0002, "epoch": 5.900581020012911, "step": 18280}, {"loss": 0.5454, "grad_norm": 1.1440261602401733, "learning_rate": 0.0002, "epoch": 5.903808908973531, "step": 18290}, {"loss": 0.4868, "grad_norm": 0.9925196170806885, "learning_rate": 0.0002, "epoch": 5.907036797934151, "step": 18300}, {"loss": 0.5142, "grad_norm": 1.098615288734436, "learning_rate": 0.0002, "epoch": 5.910264686894771, "step": 18310}, {"loss": 0.5184, "grad_norm": 1.0030080080032349, "learning_rate": 0.0002, "epoch": 5.913492575855391, "step": 18320}, {"loss": 0.474, "grad_norm": 0.9890318512916565, "learning_rate": 0.0002, "epoch": 5.91672046481601, "step": 18330}, {"loss": 0.5125, "grad_norm": 1.2209392786026, "learning_rate": 0.0002, "epoch": 5.91994835377663, "step": 18340}, {"loss": 0.4634, "grad_norm": 1.108933925628662, "learning_rate": 0.0002, "epoch": 5.92317624273725, "step": 18350}, {"loss": 0.4813, "grad_norm": 1.086024522781372, "learning_rate": 0.0002, "epoch": 5.926404131697869, "step": 18360}, {"loss": 0.4952, "grad_norm": 1.0061167478561401, "learning_rate": 0.0002, "epoch": 5.92963202065849, "step": 18370}, {"loss": 0.4848, "grad_norm": 0.9445858597755432, "learning_rate": 0.0002, "epoch": 5.9328599096191095, "step": 18380}, {"loss": 0.5014, "grad_norm": 0.9556859135627747, "learning_rate": 0.0002, "epoch": 5.936087798579729, "step": 18390}, {"loss": 0.4966, "grad_norm": 1.154168963432312, "learning_rate": 0.0002, "epoch": 5.939315687540349, "step": 18400}, {"loss": 0.4836, "grad_norm": 1.0495831966400146, "learning_rate": 0.0002, "epoch": 5.942543576500968, "step": 18410}, {"loss": 0.5021, "grad_norm": 1.0717304944992065, "learning_rate": 0.0002, "epoch": 5.945771465461588, "step": 18420}, {"loss": 0.4794, "grad_norm": 1.06618332862854, "learning_rate": 0.0002, "epoch": 5.948999354422208, "step": 18430}, {"loss": 0.5011, "grad_norm": 0.9567165374755859, "learning_rate": 0.0002, "epoch": 5.952227243382827, "step": 18440}, {"loss": 0.485, "grad_norm": 1.0306249856948853, "learning_rate": 0.0002, "epoch": 5.955455132343447, "step": 18450}, {"loss": 0.4948, "grad_norm": 1.1879968643188477, "learning_rate": 0.0002, "epoch": 5.958683021304067, "step": 18460}, {"loss": 0.5185, "grad_norm": 1.3177233934402466, "learning_rate": 0.0002, "epoch": 5.961910910264687, "step": 18470}, {"loss": 0.4966, "grad_norm": 1.0945817232131958, "learning_rate": 0.0002, "epoch": 5.965138799225307, "step": 18480}, {"loss": 0.5196, "grad_norm": 1.029414415359497, "learning_rate": 0.0002, "epoch": 5.9683666881859265, "step": 18490}, {"loss": 0.5154, "grad_norm": 1.2266209125518799, "learning_rate": 0.0002, "epoch": 5.971594577146546, "step": 18500}, {"loss": 0.4914, "grad_norm": 1.2167150974273682, "learning_rate": 0.0002, "epoch": 5.974822466107166, "step": 18510}, {"loss": 0.466, "grad_norm": 0.9941056966781616, "learning_rate": 0.0002, "epoch": 5.9780503550677855, "step": 18520}, {"loss": 0.5037, "grad_norm": 1.4244859218597412, "learning_rate": 0.0002, "epoch": 5.981278244028405, "step": 18530}, {"loss": 0.4902, "grad_norm": 0.8976260423660278, "learning_rate": 0.0002, "epoch": 5.984506132989026, "step": 18540}, {"loss": 0.5039, "grad_norm": 1.0162699222564697, "learning_rate": 0.0002, "epoch": 5.987734021949645, "step": 18550}, {"loss": 0.5138, "grad_norm": 1.196677803993225, "learning_rate": 0.0002, "epoch": 5.990961910910265, "step": 18560}, {"loss": 0.4626, "grad_norm": 1.163403868675232, "learning_rate": 0.0002, "epoch": 5.994189799870885, "step": 18570}, {"loss": 0.5105, "grad_norm": 1.010205626487732, "learning_rate": 0.0002, "epoch": 5.997417688831504, "step": 18580}]} +{"epoch": 7.0, "step": 21686, "epoch_duration": 11709.138325929642, "total_accumulated_duration": 79250.38503170013, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.593, "grad_norm": 0.7092075347900391, "learning_rate": 0.0002, "epoch": 0.0032278889606197547, "step": 10}, {"loss": 1.0956, "grad_norm": 0.6900479793548584, "learning_rate": 0.0002, "epoch": 0.006455777921239509, "step": 20}, {"loss": 0.9807, "grad_norm": 0.6788288950920105, "learning_rate": 0.0002, "epoch": 0.009683666881859263, "step": 30}, {"loss": 0.9385, "grad_norm": 0.5590243339538574, "learning_rate": 0.0002, "epoch": 0.012911555842479019, "step": 40}, {"loss": 0.931, "grad_norm": 0.5136010646820068, "learning_rate": 0.0002, "epoch": 0.016139444803098774, "step": 50}, {"loss": 0.8896, "grad_norm": 0.45298320055007935, "learning_rate": 0.0002, "epoch": 0.019367333763718526, "step": 60}, {"loss": 0.9184, "grad_norm": 0.5917162299156189, "learning_rate": 0.0002, "epoch": 0.022595222724338282, "step": 70}, {"loss": 0.8705, "grad_norm": 0.4414856433868408, "learning_rate": 0.0002, "epoch": 0.025823111684958037, "step": 80}, {"loss": 0.8419, "grad_norm": 0.5547978281974792, "learning_rate": 0.0002, "epoch": 0.029051000645577793, "step": 90}, {"loss": 0.8987, "grad_norm": 0.5271288156509399, "learning_rate": 0.0002, "epoch": 0.03227888960619755, "step": 100}, {"loss": 0.8543, "grad_norm": 0.5506119728088379, "learning_rate": 0.0002, "epoch": 0.035506778566817304, "step": 110}, {"loss": 0.8373, "grad_norm": 0.5579327940940857, "learning_rate": 0.0002, "epoch": 0.03873466752743705, "step": 120}, {"loss": 0.8826, "grad_norm": 0.5099632740020752, "learning_rate": 0.0002, "epoch": 0.04196255648805681, "step": 130}, {"loss": 0.9239, "grad_norm": 0.40396833419799805, "learning_rate": 0.0002, "epoch": 0.045190445448676564, "step": 140}, {"loss": 0.846, "grad_norm": 0.5008092522621155, "learning_rate": 0.0002, "epoch": 0.04841833440929632, "step": 150}, {"loss": 0.8564, "grad_norm": 0.4388776421546936, "learning_rate": 0.0002, "epoch": 0.051646223369916075, "step": 160}, {"loss": 0.8829, "grad_norm": 0.44138944149017334, "learning_rate": 0.0002, "epoch": 0.05487411233053583, "step": 170}, {"loss": 0.8061, "grad_norm": 0.358484148979187, "learning_rate": 0.0002, "epoch": 0.058102001291155586, "step": 180}, {"loss": 0.8956, "grad_norm": 0.457052081823349, "learning_rate": 0.0002, "epoch": 0.06132989025177534, "step": 190}, {"loss": 0.9138, "grad_norm": 0.5537622570991516, "learning_rate": 0.0002, "epoch": 0.0645577792123951, "step": 200}, {"loss": 0.8701, "grad_norm": 0.552631676197052, "learning_rate": 0.0002, "epoch": 0.06778566817301485, "step": 210}, {"loss": 0.8854, "grad_norm": 0.4414575397968292, "learning_rate": 0.0002, "epoch": 0.07101355713363461, "step": 220}, {"loss": 0.8581, "grad_norm": 0.4996664226055145, "learning_rate": 0.0002, "epoch": 0.07424144609425436, "step": 230}, {"loss": 0.8675, "grad_norm": 0.7321897149085999, "learning_rate": 0.0002, "epoch": 0.0774693350548741, "step": 240}, {"loss": 0.8848, "grad_norm": 0.4553901255130768, "learning_rate": 0.0002, "epoch": 0.08069722401549387, "step": 250}, {"loss": 0.868, "grad_norm": 0.5039054751396179, "learning_rate": 0.0002, "epoch": 0.08392511297611362, "step": 260}, {"loss": 0.8317, "grad_norm": 0.4113094210624695, "learning_rate": 0.0002, "epoch": 0.08715300193673338, "step": 270}, {"loss": 0.8074, "grad_norm": 0.450436532497406, "learning_rate": 0.0002, "epoch": 0.09038089089735313, "step": 280}, {"loss": 0.8105, "grad_norm": 0.4548024535179138, "learning_rate": 0.0002, "epoch": 0.09360877985797289, "step": 290}, {"loss": 0.8325, "grad_norm": 0.4932962656021118, "learning_rate": 0.0002, "epoch": 0.09683666881859264, "step": 300}, {"loss": 0.8105, "grad_norm": 0.4005250334739685, "learning_rate": 0.0002, "epoch": 0.1000645577792124, "step": 310}, {"loss": 0.8083, "grad_norm": 1.8321624994277954, "learning_rate": 0.0002, "epoch": 0.10329244673983215, "step": 320}, {"loss": 0.8411, "grad_norm": 0.45815610885620117, "learning_rate": 0.0002, "epoch": 0.1065203357004519, "step": 330}, {"loss": 0.857, "grad_norm": 0.39324095845222473, "learning_rate": 0.0002, "epoch": 0.10974822466107166, "step": 340}, {"loss": 0.8258, "grad_norm": 0.546273946762085, "learning_rate": 0.0002, "epoch": 0.11297611362169141, "step": 350}, {"loss": 0.882, "grad_norm": 0.497448593378067, "learning_rate": 0.0002, "epoch": 0.11620400258231117, "step": 360}, {"loss": 0.7608, "grad_norm": 0.37508800625801086, "learning_rate": 0.0002, "epoch": 0.11943189154293092, "step": 370}, {"loss": 0.852, "grad_norm": 0.45849609375, "learning_rate": 0.0002, "epoch": 0.12265978050355068, "step": 380}, {"loss": 0.8437, "grad_norm": 0.5488408803939819, "learning_rate": 0.0002, "epoch": 0.12588766946417043, "step": 390}, {"loss": 0.8349, "grad_norm": 0.4477061331272125, "learning_rate": 0.0002, "epoch": 0.1291155584247902, "step": 400}, {"loss": 0.8306, "grad_norm": 0.39227980375289917, "learning_rate": 0.0002, "epoch": 0.13234344738540993, "step": 410}, {"loss": 0.7933, "grad_norm": 0.3922233581542969, "learning_rate": 0.0002, "epoch": 0.1355713363460297, "step": 420}, {"loss": 0.8134, "grad_norm": 0.42901909351348877, "learning_rate": 0.0002, "epoch": 0.13879922530664945, "step": 430}, {"loss": 0.8271, "grad_norm": 0.4217798709869385, "learning_rate": 0.0002, "epoch": 0.14202711426726922, "step": 440}, {"loss": 0.8594, "grad_norm": 0.43470677733421326, "learning_rate": 0.0002, "epoch": 0.14525500322788895, "step": 450}, {"loss": 0.8106, "grad_norm": 0.5324403047561646, "learning_rate": 0.0002, "epoch": 0.1484828921885087, "step": 460}, {"loss": 0.8729, "grad_norm": 0.3999756872653961, "learning_rate": 0.0002, "epoch": 0.15171078114912848, "step": 470}, {"loss": 0.7702, "grad_norm": 0.404933363199234, "learning_rate": 0.0002, "epoch": 0.1549386701097482, "step": 480}, {"loss": 0.8151, "grad_norm": 0.44122636318206787, "learning_rate": 0.0002, "epoch": 0.15816655907036797, "step": 490}, {"loss": 0.8457, "grad_norm": 0.510166347026825, "learning_rate": 0.0002, "epoch": 0.16139444803098774, "step": 500}, {"loss": 0.8692, "grad_norm": 0.4549732506275177, "learning_rate": 0.0002, "epoch": 0.1646223369916075, "step": 510}, {"loss": 0.8466, "grad_norm": 0.5148182511329651, "learning_rate": 0.0002, "epoch": 0.16785022595222723, "step": 520}, {"loss": 0.8317, "grad_norm": 0.3596806824207306, "learning_rate": 0.0002, "epoch": 0.171078114912847, "step": 530}, {"loss": 0.844, "grad_norm": 0.4388909339904785, "learning_rate": 0.0002, "epoch": 0.17430600387346676, "step": 540}, {"loss": 0.8322, "grad_norm": 0.5052742958068848, "learning_rate": 0.0002, "epoch": 0.17753389283408652, "step": 550}, {"loss": 0.791, "grad_norm": 0.48248958587646484, "learning_rate": 0.0002, "epoch": 0.18076178179470626, "step": 560}, {"loss": 0.8593, "grad_norm": 0.5360197424888611, "learning_rate": 0.0002, "epoch": 0.18398967075532602, "step": 570}, {"loss": 0.817, "grad_norm": 0.43999341130256653, "learning_rate": 0.0002, "epoch": 0.18721755971594578, "step": 580}, {"loss": 0.8311, "grad_norm": 0.3685208261013031, "learning_rate": 0.0002, "epoch": 0.19044544867656552, "step": 590}, {"loss": 0.8341, "grad_norm": 0.4601275622844696, "learning_rate": 0.0002, "epoch": 0.19367333763718528, "step": 600}, {"loss": 0.8483, "grad_norm": 0.4778369665145874, "learning_rate": 0.0002, "epoch": 0.19690122659780504, "step": 610}, {"loss": 0.8653, "grad_norm": 0.4867003560066223, "learning_rate": 0.0002, "epoch": 0.2001291155584248, "step": 620}, {"loss": 0.8554, "grad_norm": 0.4583742916584015, "learning_rate": 0.0002, "epoch": 0.20335700451904454, "step": 630}, {"loss": 0.8698, "grad_norm": 0.47958165407180786, "learning_rate": 0.0002, "epoch": 0.2065848934796643, "step": 640}, {"loss": 0.8213, "grad_norm": 0.4526064097881317, "learning_rate": 0.0002, "epoch": 0.20981278244028406, "step": 650}, {"loss": 0.8313, "grad_norm": 0.45890581607818604, "learning_rate": 0.0002, "epoch": 0.2130406714009038, "step": 660}, {"loss": 0.8143, "grad_norm": 0.42725905776023865, "learning_rate": 0.0002, "epoch": 0.21626856036152356, "step": 670}, {"loss": 0.8675, "grad_norm": 0.40380963683128357, "learning_rate": 0.0002, "epoch": 0.21949644932214332, "step": 680}, {"loss": 0.9004, "grad_norm": 0.4372998774051666, "learning_rate": 0.0002, "epoch": 0.22272433828276308, "step": 690}, {"loss": 0.8208, "grad_norm": 0.4245864450931549, "learning_rate": 0.0002, "epoch": 0.22595222724338282, "step": 700}, {"loss": 0.8564, "grad_norm": 0.4061129689216614, "learning_rate": 0.0002, "epoch": 0.22918011620400258, "step": 710}, {"loss": 0.8275, "grad_norm": 0.474454790353775, "learning_rate": 0.0002, "epoch": 0.23240800516462234, "step": 720}, {"loss": 0.8346, "grad_norm": 0.4908486008644104, "learning_rate": 0.0002, "epoch": 0.23563589412524208, "step": 730}, {"loss": 0.8755, "grad_norm": 0.4284191429615021, "learning_rate": 0.0002, "epoch": 0.23886378308586184, "step": 740}, {"loss": 0.8387, "grad_norm": 0.44730308651924133, "learning_rate": 0.0002, "epoch": 0.2420916720464816, "step": 750}, {"loss": 0.8135, "grad_norm": 0.4433246850967407, "learning_rate": 0.0002, "epoch": 0.24531956100710137, "step": 760}, {"loss": 0.8644, "grad_norm": 0.43668854236602783, "learning_rate": 0.0002, "epoch": 0.2485474499677211, "step": 770}, {"loss": 0.8025, "grad_norm": 0.34324130415916443, "learning_rate": 0.0002, "epoch": 0.25177533892834086, "step": 780}, {"loss": 0.8725, "grad_norm": 0.46476295590400696, "learning_rate": 0.0002, "epoch": 0.2550032278889606, "step": 790}, {"loss": 0.8157, "grad_norm": 0.5047039985656738, "learning_rate": 0.0002, "epoch": 0.2582311168495804, "step": 800}, {"loss": 0.8643, "grad_norm": 0.4402127265930176, "learning_rate": 0.0002, "epoch": 0.26145900581020015, "step": 810}, {"loss": 0.8025, "grad_norm": 0.4642465114593506, "learning_rate": 0.0002, "epoch": 0.26468689477081986, "step": 820}, {"loss": 0.8836, "grad_norm": 0.40093424916267395, "learning_rate": 0.0002, "epoch": 0.2679147837314396, "step": 830}, {"loss": 0.83, "grad_norm": 0.42501842975616455, "learning_rate": 0.0002, "epoch": 0.2711426726920594, "step": 840}, {"loss": 0.8573, "grad_norm": 0.43279722332954407, "learning_rate": 0.0002, "epoch": 0.27437056165267915, "step": 850}, {"loss": 0.817, "grad_norm": 0.5991243720054626, "learning_rate": 0.0002, "epoch": 0.2775984506132989, "step": 860}, {"loss": 0.7981, "grad_norm": 0.4217848777770996, "learning_rate": 0.0002, "epoch": 0.28082633957391867, "step": 870}, {"loss": 0.8135, "grad_norm": 0.3933536410331726, "learning_rate": 0.0002, "epoch": 0.28405422853453843, "step": 880}, {"loss": 0.8846, "grad_norm": 0.5868505239486694, "learning_rate": 0.0002, "epoch": 0.28728211749515814, "step": 890}, {"loss": 0.8759, "grad_norm": 0.5209547877311707, "learning_rate": 0.0002, "epoch": 0.2905100064557779, "step": 900}, {"loss": 0.815, "grad_norm": 0.49307361245155334, "learning_rate": 0.0002, "epoch": 0.29373789541639767, "step": 910}, {"loss": 0.7813, "grad_norm": 0.4288382828235626, "learning_rate": 0.0002, "epoch": 0.2969657843770174, "step": 920}, {"loss": 0.8431, "grad_norm": 0.33568474650382996, "learning_rate": 0.0002, "epoch": 0.3001936733376372, "step": 930}, {"loss": 0.8455, "grad_norm": 1.0915930271148682, "learning_rate": 0.0002, "epoch": 0.30342156229825695, "step": 940}, {"loss": 0.8535, "grad_norm": 0.5489798188209534, "learning_rate": 0.0002, "epoch": 0.3066494512588767, "step": 950}, {"loss": 0.8031, "grad_norm": 0.42971742153167725, "learning_rate": 0.0002, "epoch": 0.3098773402194964, "step": 960}, {"loss": 0.8253, "grad_norm": 0.43375834822654724, "learning_rate": 0.0002, "epoch": 0.3131052291801162, "step": 970}, {"loss": 0.7747, "grad_norm": 0.47488611936569214, "learning_rate": 0.0002, "epoch": 0.31633311814073595, "step": 980}, {"loss": 0.7906, "grad_norm": 0.46296775341033936, "learning_rate": 0.0002, "epoch": 0.3195610071013557, "step": 990}, {"loss": 0.7948, "grad_norm": 0.4548890292644501, "learning_rate": 0.0002, "epoch": 0.32278889606197547, "step": 1000}, {"loss": 0.8856, "grad_norm": 0.41834497451782227, "learning_rate": 0.0002, "epoch": 0.32601678502259523, "step": 1010}, {"loss": 0.7791, "grad_norm": 0.441092312335968, "learning_rate": 0.0002, "epoch": 0.329244673983215, "step": 1020}, {"loss": 0.8191, "grad_norm": 0.637322187423706, "learning_rate": 0.0002, "epoch": 0.33247256294383476, "step": 1030}, {"loss": 0.8685, "grad_norm": 0.4374958574771881, "learning_rate": 0.0002, "epoch": 0.33570045190445447, "step": 1040}, {"loss": 0.8423, "grad_norm": 0.3935825824737549, "learning_rate": 0.0002, "epoch": 0.33892834086507423, "step": 1050}, {"loss": 0.8287, "grad_norm": 0.43526220321655273, "learning_rate": 0.0002, "epoch": 0.342156229825694, "step": 1060}, {"loss": 0.8413, "grad_norm": 0.45327696204185486, "learning_rate": 0.0002, "epoch": 0.34538411878631375, "step": 1070}, {"loss": 0.7421, "grad_norm": 0.4126075506210327, "learning_rate": 0.0002, "epoch": 0.3486120077469335, "step": 1080}, {"loss": 0.8427, "grad_norm": 0.4714072048664093, "learning_rate": 0.0002, "epoch": 0.3518398967075533, "step": 1090}, {"loss": 0.8028, "grad_norm": 0.518127977848053, "learning_rate": 0.0002, "epoch": 0.35506778566817304, "step": 1100}, {"loss": 0.8479, "grad_norm": 0.43264099955558777, "learning_rate": 0.0002, "epoch": 0.35829567462879275, "step": 1110}, {"loss": 0.8724, "grad_norm": 0.4857400357723236, "learning_rate": 0.0002, "epoch": 0.3615235635894125, "step": 1120}, {"loss": 0.7735, "grad_norm": 0.37591469287872314, "learning_rate": 0.0002, "epoch": 0.3647514525500323, "step": 1130}, {"loss": 0.8531, "grad_norm": 0.4165478050708771, "learning_rate": 0.0002, "epoch": 0.36797934151065204, "step": 1140}, {"loss": 0.8151, "grad_norm": 0.42911383509635925, "learning_rate": 0.0002, "epoch": 0.3712072304712718, "step": 1150}, {"loss": 0.8722, "grad_norm": 0.44980287551879883, "learning_rate": 0.0002, "epoch": 0.37443511943189156, "step": 1160}, {"loss": 0.7961, "grad_norm": 0.4066573679447174, "learning_rate": 0.0002, "epoch": 0.3776630083925113, "step": 1170}, {"loss": 0.8317, "grad_norm": 0.5056195855140686, "learning_rate": 0.0002, "epoch": 0.38089089735313103, "step": 1180}, {"loss": 0.8387, "grad_norm": 0.4141536355018616, "learning_rate": 0.0002, "epoch": 0.3841187863137508, "step": 1190}, {"loss": 0.8019, "grad_norm": 0.4501924514770508, "learning_rate": 0.0002, "epoch": 0.38734667527437056, "step": 1200}, {"loss": 0.8528, "grad_norm": 0.43304240703582764, "learning_rate": 0.0002, "epoch": 0.3905745642349903, "step": 1210}, {"loss": 0.8905, "grad_norm": 0.475777804851532, "learning_rate": 0.0002, "epoch": 0.3938024531956101, "step": 1220}, {"loss": 0.8643, "grad_norm": 0.5846465826034546, "learning_rate": 0.0002, "epoch": 0.39703034215622984, "step": 1230}, {"loss": 0.8078, "grad_norm": 0.42899325489997864, "learning_rate": 0.0002, "epoch": 0.4002582311168496, "step": 1240}, {"loss": 0.8415, "grad_norm": 0.3980463147163391, "learning_rate": 0.0002, "epoch": 0.4034861200774693, "step": 1250}, {"loss": 0.8026, "grad_norm": 0.45769768953323364, "learning_rate": 0.0002, "epoch": 0.4067140090380891, "step": 1260}, {"loss": 0.8377, "grad_norm": 0.5101280212402344, "learning_rate": 0.0002, "epoch": 0.40994189799870884, "step": 1270}, {"loss": 0.7905, "grad_norm": 0.47374317049980164, "learning_rate": 0.0002, "epoch": 0.4131697869593286, "step": 1280}, {"loss": 0.8172, "grad_norm": 0.4261878728866577, "learning_rate": 0.0002, "epoch": 0.41639767591994836, "step": 1290}, {"loss": 0.9004, "grad_norm": 0.46954256296157837, "learning_rate": 0.0002, "epoch": 0.4196255648805681, "step": 1300}, {"loss": 0.7868, "grad_norm": 0.5205738544464111, "learning_rate": 0.0002, "epoch": 0.4228534538411879, "step": 1310}, {"loss": 0.8964, "grad_norm": 0.5176340937614441, "learning_rate": 0.0002, "epoch": 0.4260813428018076, "step": 1320}, {"loss": 0.8764, "grad_norm": 0.5155916810035706, "learning_rate": 0.0002, "epoch": 0.42930923176242736, "step": 1330}, {"loss": 0.8197, "grad_norm": 0.44548553228378296, "learning_rate": 0.0002, "epoch": 0.4325371207230471, "step": 1340}, {"loss": 0.7873, "grad_norm": 0.5633558630943298, "learning_rate": 0.0002, "epoch": 0.4357650096836669, "step": 1350}, {"loss": 0.7889, "grad_norm": 0.42444056272506714, "learning_rate": 0.0002, "epoch": 0.43899289864428664, "step": 1360}, {"loss": 0.8588, "grad_norm": 0.5226860642433167, "learning_rate": 0.0002, "epoch": 0.4422207876049064, "step": 1370}, {"loss": 0.8232, "grad_norm": 0.5354582071304321, "learning_rate": 0.0002, "epoch": 0.44544867656552617, "step": 1380}, {"loss": 0.816, "grad_norm": 0.472646564245224, "learning_rate": 0.0002, "epoch": 0.4486765655261459, "step": 1390}, {"loss": 0.7953, "grad_norm": 0.6312310099601746, "learning_rate": 0.0002, "epoch": 0.45190445448676564, "step": 1400}, {"loss": 0.8212, "grad_norm": 0.4298408031463623, "learning_rate": 0.0002, "epoch": 0.4551323434473854, "step": 1410}, {"loss": 0.8447, "grad_norm": 0.43427202105522156, "learning_rate": 0.0002, "epoch": 0.45836023240800516, "step": 1420}, {"loss": 0.8342, "grad_norm": 0.44097861647605896, "learning_rate": 0.0002, "epoch": 0.4615881213686249, "step": 1430}, {"loss": 0.8301, "grad_norm": 0.5142693519592285, "learning_rate": 0.0002, "epoch": 0.4648160103292447, "step": 1440}, {"loss": 0.8144, "grad_norm": 0.46416547894477844, "learning_rate": 0.0002, "epoch": 0.46804389928986445, "step": 1450}, {"loss": 0.8342, "grad_norm": 0.4858551025390625, "learning_rate": 0.0002, "epoch": 0.47127178825048416, "step": 1460}, {"loss": 0.8354, "grad_norm": 0.4709177315235138, "learning_rate": 0.0002, "epoch": 0.4744996772111039, "step": 1470}, {"loss": 0.8391, "grad_norm": 0.5500252842903137, "learning_rate": 0.0002, "epoch": 0.4777275661717237, "step": 1480}, {"loss": 0.8359, "grad_norm": 0.43364381790161133, "learning_rate": 0.0002, "epoch": 0.48095545513234345, "step": 1490}, {"loss": 0.8446, "grad_norm": 0.47712287306785583, "learning_rate": 0.0002, "epoch": 0.4841833440929632, "step": 1500}, {"loss": 0.8518, "grad_norm": 0.4518495202064514, "learning_rate": 0.0002, "epoch": 0.48741123305358297, "step": 1510}, {"loss": 0.819, "grad_norm": 0.4539008140563965, "learning_rate": 0.0002, "epoch": 0.49063912201420273, "step": 1520}, {"loss": 0.8276, "grad_norm": 0.4993067979812622, "learning_rate": 0.0002, "epoch": 0.49386701097482244, "step": 1530}, {"loss": 0.8297, "grad_norm": 0.6094803214073181, "learning_rate": 0.0002, "epoch": 0.4970948999354422, "step": 1540}, {"loss": 0.8263, "grad_norm": 0.48602527379989624, "learning_rate": 0.0002, "epoch": 0.500322788896062, "step": 1550}, {"loss": 0.8182, "grad_norm": 0.40245795249938965, "learning_rate": 0.0002, "epoch": 0.5035506778566817, "step": 1560}, {"loss": 0.7907, "grad_norm": 0.456787645816803, "learning_rate": 0.0002, "epoch": 0.5067785668173015, "step": 1570}, {"loss": 0.86, "grad_norm": 0.43936216831207275, "learning_rate": 0.0002, "epoch": 0.5100064557779213, "step": 1580}, {"loss": 0.7928, "grad_norm": 0.549018144607544, "learning_rate": 0.0002, "epoch": 0.513234344738541, "step": 1590}, {"loss": 0.8169, "grad_norm": 0.41746795177459717, "learning_rate": 0.0002, "epoch": 0.5164622336991608, "step": 1600}, {"loss": 0.7868, "grad_norm": 0.4217053949832916, "learning_rate": 0.0002, "epoch": 0.5196901226597805, "step": 1610}, {"loss": 0.8161, "grad_norm": 0.449913889169693, "learning_rate": 0.0002, "epoch": 0.5229180116204003, "step": 1620}, {"loss": 0.7938, "grad_norm": 0.5084872245788574, "learning_rate": 0.0002, "epoch": 0.5261459005810201, "step": 1630}, {"loss": 0.8295, "grad_norm": 0.46248653531074524, "learning_rate": 0.0002, "epoch": 0.5293737895416397, "step": 1640}, {"loss": 0.7993, "grad_norm": 0.4824236035346985, "learning_rate": 0.0002, "epoch": 0.5326016785022595, "step": 1650}, {"loss": 0.8711, "grad_norm": 0.6010985374450684, "learning_rate": 0.0002, "epoch": 0.5358295674628792, "step": 1660}, {"loss": 0.8266, "grad_norm": 0.4757920801639557, "learning_rate": 0.0002, "epoch": 0.539057456423499, "step": 1670}, {"loss": 0.8182, "grad_norm": 0.45161882042884827, "learning_rate": 0.0002, "epoch": 0.5422853453841188, "step": 1680}, {"loss": 0.8141, "grad_norm": 0.49314990639686584, "learning_rate": 0.0002, "epoch": 0.5455132343447385, "step": 1690}, {"loss": 0.8091, "grad_norm": 0.3918305039405823, "learning_rate": 0.0002, "epoch": 0.5487411233053583, "step": 1700}, {"loss": 0.8177, "grad_norm": 0.5966728925704956, "learning_rate": 0.0002, "epoch": 0.551969012265978, "step": 1710}, {"loss": 0.8438, "grad_norm": 0.4208986163139343, "learning_rate": 0.0002, "epoch": 0.5551969012265978, "step": 1720}, {"loss": 0.817, "grad_norm": 0.43724218010902405, "learning_rate": 0.0002, "epoch": 0.5584247901872176, "step": 1730}, {"loss": 0.7956, "grad_norm": 0.5287272930145264, "learning_rate": 0.0002, "epoch": 0.5616526791478373, "step": 1740}, {"loss": 0.8557, "grad_norm": 0.4961899518966675, "learning_rate": 0.0002, "epoch": 0.5648805681084571, "step": 1750}, {"loss": 0.8029, "grad_norm": 0.4468635320663452, "learning_rate": 0.0002, "epoch": 0.5681084570690769, "step": 1760}, {"loss": 0.7968, "grad_norm": 0.6423530578613281, "learning_rate": 0.0002, "epoch": 0.5713363460296966, "step": 1770}, {"loss": 0.8324, "grad_norm": 0.4601971507072449, "learning_rate": 0.0002, "epoch": 0.5745642349903163, "step": 1780}, {"loss": 0.8171, "grad_norm": 0.46514901518821716, "learning_rate": 0.0002, "epoch": 0.577792123950936, "step": 1790}, {"loss": 0.8186, "grad_norm": 0.4771687388420105, "learning_rate": 0.0002, "epoch": 0.5810200129115558, "step": 1800}, {"loss": 0.856, "grad_norm": 0.46514490246772766, "learning_rate": 0.0002, "epoch": 0.5842479018721756, "step": 1810}, {"loss": 0.84, "grad_norm": 0.5373936295509338, "learning_rate": 0.0002, "epoch": 0.5874757908327953, "step": 1820}, {"loss": 0.8456, "grad_norm": 0.5175791382789612, "learning_rate": 0.0002, "epoch": 0.5907036797934151, "step": 1830}, {"loss": 0.7957, "grad_norm": 0.4522802233695984, "learning_rate": 0.0002, "epoch": 0.5939315687540349, "step": 1840}, {"loss": 0.8633, "grad_norm": 0.42987772822380066, "learning_rate": 0.0002, "epoch": 0.5971594577146546, "step": 1850}, {"loss": 0.7871, "grad_norm": 0.5566838383674622, "learning_rate": 0.0002, "epoch": 0.6003873466752744, "step": 1860}, {"loss": 0.8312, "grad_norm": 0.42807698249816895, "learning_rate": 0.0002, "epoch": 0.6036152356358941, "step": 1870}, {"loss": 0.8035, "grad_norm": 0.4957767724990845, "learning_rate": 0.0002, "epoch": 0.6068431245965139, "step": 1880}, {"loss": 0.8145, "grad_norm": 0.4260980188846588, "learning_rate": 0.0002, "epoch": 0.6100710135571337, "step": 1890}, {"loss": 0.8363, "grad_norm": 0.4777357876300812, "learning_rate": 0.0002, "epoch": 0.6132989025177534, "step": 1900}, {"loss": 0.8404, "grad_norm": 0.4434216022491455, "learning_rate": 0.0002, "epoch": 0.6165267914783732, "step": 1910}, {"loss": 0.8057, "grad_norm": 0.5215433835983276, "learning_rate": 0.0002, "epoch": 0.6197546804389928, "step": 1920}, {"loss": 0.82, "grad_norm": 0.5143248438835144, "learning_rate": 0.0002, "epoch": 0.6229825693996126, "step": 1930}, {"loss": 0.8107, "grad_norm": 0.5213413238525391, "learning_rate": 0.0002, "epoch": 0.6262104583602324, "step": 1940}, {"loss": 0.7549, "grad_norm": 0.5408226251602173, "learning_rate": 0.0002, "epoch": 0.6294383473208521, "step": 1950}, {"loss": 0.8405, "grad_norm": 0.5479708909988403, "learning_rate": 0.0002, "epoch": 0.6326662362814719, "step": 1960}, {"loss": 0.8138, "grad_norm": 0.4490949809551239, "learning_rate": 0.0002, "epoch": 0.6358941252420917, "step": 1970}, {"loss": 0.854, "grad_norm": 0.48815059661865234, "learning_rate": 0.0002, "epoch": 0.6391220142027114, "step": 1980}, {"loss": 0.8568, "grad_norm": 0.46498045325279236, "learning_rate": 0.0002, "epoch": 0.6423499031633312, "step": 1990}, {"loss": 0.8263, "grad_norm": 0.5136561393737793, "learning_rate": 0.0002, "epoch": 0.6455777921239509, "step": 2000}, {"loss": 0.8503, "grad_norm": 0.5145719647407532, "learning_rate": 0.0002, "epoch": 0.6488056810845707, "step": 2010}, {"loss": 0.8456, "grad_norm": 0.5430373549461365, "learning_rate": 0.0002, "epoch": 0.6520335700451905, "step": 2020}, {"loss": 0.8115, "grad_norm": 0.46347954869270325, "learning_rate": 0.0002, "epoch": 0.6552614590058102, "step": 2030}, {"loss": 0.8769, "grad_norm": 0.5189562439918518, "learning_rate": 0.0002, "epoch": 0.65848934796643, "step": 2040}, {"loss": 0.8453, "grad_norm": 0.43843990564346313, "learning_rate": 0.0002, "epoch": 0.6617172369270498, "step": 2050}, {"loss": 0.7951, "grad_norm": 0.4654983580112457, "learning_rate": 0.0002, "epoch": 0.6649451258876695, "step": 2060}, {"loss": 0.8308, "grad_norm": 0.44835716485977173, "learning_rate": 0.0002, "epoch": 0.6681730148482892, "step": 2070}, {"loss": 0.8181, "grad_norm": 0.38811734318733215, "learning_rate": 0.0002, "epoch": 0.6714009038089089, "step": 2080}, {"loss": 0.762, "grad_norm": 0.5709853172302246, "learning_rate": 0.0002, "epoch": 0.6746287927695287, "step": 2090}, {"loss": 0.8334, "grad_norm": 0.49994757771492004, "learning_rate": 0.0002, "epoch": 0.6778566817301485, "step": 2100}, {"loss": 0.8, "grad_norm": 0.5505402684211731, "learning_rate": 0.0002, "epoch": 0.6810845706907682, "step": 2110}, {"loss": 0.8227, "grad_norm": 0.48195120692253113, "learning_rate": 0.0002, "epoch": 0.684312459651388, "step": 2120}, {"loss": 0.7879, "grad_norm": 0.4854775071144104, "learning_rate": 0.0002, "epoch": 0.6875403486120077, "step": 2130}, {"loss": 0.8231, "grad_norm": 0.6422494649887085, "learning_rate": 0.0002, "epoch": 0.6907682375726275, "step": 2140}, {"loss": 0.8353, "grad_norm": 0.3972536027431488, "learning_rate": 0.0002, "epoch": 0.6939961265332473, "step": 2150}, {"loss": 0.8068, "grad_norm": 0.4297836422920227, "learning_rate": 0.0002, "epoch": 0.697224015493867, "step": 2160}, {"loss": 0.8017, "grad_norm": 0.45486778020858765, "learning_rate": 0.0002, "epoch": 0.7004519044544868, "step": 2170}, {"loss": 0.8507, "grad_norm": 0.4706047773361206, "learning_rate": 0.0002, "epoch": 0.7036797934151066, "step": 2180}, {"loss": 0.8234, "grad_norm": 0.46426892280578613, "learning_rate": 0.0002, "epoch": 0.7069076823757263, "step": 2190}, {"loss": 0.8472, "grad_norm": 0.46333715319633484, "learning_rate": 0.0002, "epoch": 0.7101355713363461, "step": 2200}, {"loss": 0.8247, "grad_norm": 0.4632524251937866, "learning_rate": 0.0002, "epoch": 0.7133634602969657, "step": 2210}, {"loss": 0.8452, "grad_norm": 0.4610830843448639, "learning_rate": 0.0002, "epoch": 0.7165913492575855, "step": 2220}, {"loss": 0.7338, "grad_norm": 0.4905324876308441, "learning_rate": 0.0002, "epoch": 0.7198192382182053, "step": 2230}, {"loss": 0.7715, "grad_norm": 0.4936263859272003, "learning_rate": 0.0002, "epoch": 0.723047127178825, "step": 2240}, {"loss": 0.8162, "grad_norm": 0.40778425335884094, "learning_rate": 0.0002, "epoch": 0.7262750161394448, "step": 2250}, {"loss": 0.828, "grad_norm": 0.50351482629776, "learning_rate": 0.0002, "epoch": 0.7295029051000645, "step": 2260}, {"loss": 0.8475, "grad_norm": 0.4894128143787384, "learning_rate": 0.0002, "epoch": 0.7327307940606843, "step": 2270}, {"loss": 0.8087, "grad_norm": 0.5580906271934509, "learning_rate": 0.0002, "epoch": 0.7359586830213041, "step": 2280}, {"loss": 0.8157, "grad_norm": 0.4655369520187378, "learning_rate": 0.0002, "epoch": 0.7391865719819238, "step": 2290}, {"loss": 0.8395, "grad_norm": 0.4666965901851654, "learning_rate": 0.0002, "epoch": 0.7424144609425436, "step": 2300}, {"loss": 0.7605, "grad_norm": 0.46259936690330505, "learning_rate": 0.0002, "epoch": 0.7456423499031634, "step": 2310}, {"loss": 0.7849, "grad_norm": 0.520706832408905, "learning_rate": 0.0002, "epoch": 0.7488702388637831, "step": 2320}, {"loss": 0.8173, "grad_norm": 0.5142408013343811, "learning_rate": 0.0002, "epoch": 0.7520981278244029, "step": 2330}, {"loss": 0.7782, "grad_norm": 0.5355164408683777, "learning_rate": 0.0002, "epoch": 0.7553260167850226, "step": 2340}, {"loss": 0.8242, "grad_norm": 0.5517185926437378, "learning_rate": 0.0002, "epoch": 0.7585539057456423, "step": 2350}, {"loss": 0.8404, "grad_norm": 0.7162677049636841, "learning_rate": 0.0002, "epoch": 0.7617817947062621, "step": 2360}, {"loss": 0.8455, "grad_norm": 0.42402133345603943, "learning_rate": 0.0002, "epoch": 0.7650096836668818, "step": 2370}, {"loss": 0.8214, "grad_norm": 0.47180113196372986, "learning_rate": 0.0002, "epoch": 0.7682375726275016, "step": 2380}, {"loss": 0.8274, "grad_norm": 0.6262288689613342, "learning_rate": 0.0002, "epoch": 0.7714654615881213, "step": 2390}, {"loss": 0.7915, "grad_norm": 0.5177528262138367, "learning_rate": 0.0002, "epoch": 0.7746933505487411, "step": 2400}, {"loss": 0.7631, "grad_norm": 0.555721640586853, "learning_rate": 0.0002, "epoch": 0.7779212395093609, "step": 2410}, {"loss": 0.795, "grad_norm": 0.5592644810676575, "learning_rate": 0.0002, "epoch": 0.7811491284699806, "step": 2420}, {"loss": 0.8081, "grad_norm": 0.38025397062301636, "learning_rate": 0.0002, "epoch": 0.7843770174306004, "step": 2430}, {"loss": 0.7851, "grad_norm": 0.4597472548484802, "learning_rate": 0.0002, "epoch": 0.7876049063912202, "step": 2440}, {"loss": 0.8575, "grad_norm": 0.4929825961589813, "learning_rate": 0.0002, "epoch": 0.7908327953518399, "step": 2450}, {"loss": 0.7584, "grad_norm": 0.45277655124664307, "learning_rate": 0.0002, "epoch": 0.7940606843124597, "step": 2460}, {"loss": 0.8208, "grad_norm": 0.6224122643470764, "learning_rate": 0.0002, "epoch": 0.7972885732730794, "step": 2470}, {"loss": 0.8449, "grad_norm": 0.5740901827812195, "learning_rate": 0.0002, "epoch": 0.8005164622336992, "step": 2480}, {"loss": 0.7834, "grad_norm": 0.41335329413414, "learning_rate": 0.0002, "epoch": 0.8037443511943189, "step": 2490}, {"loss": 0.7768, "grad_norm": 0.4738694131374359, "learning_rate": 0.0002, "epoch": 0.8069722401549386, "step": 2500}, {"loss": 0.7927, "grad_norm": 0.5288197994232178, "learning_rate": 0.0002, "epoch": 0.8102001291155584, "step": 2510}, {"loss": 0.8334, "grad_norm": 0.5404666066169739, "learning_rate": 0.0002, "epoch": 0.8134280180761781, "step": 2520}, {"loss": 0.7998, "grad_norm": 0.4444909691810608, "learning_rate": 0.0002, "epoch": 0.8166559070367979, "step": 2530}, {"loss": 0.8683, "grad_norm": 0.542061448097229, "learning_rate": 0.0002, "epoch": 0.8198837959974177, "step": 2540}, {"loss": 0.8038, "grad_norm": 0.4914741814136505, "learning_rate": 0.0002, "epoch": 0.8231116849580374, "step": 2550}, {"loss": 0.7899, "grad_norm": 0.41703441739082336, "learning_rate": 0.0002, "epoch": 0.8263395739186572, "step": 2560}, {"loss": 0.824, "grad_norm": 0.5489841103553772, "learning_rate": 0.0002, "epoch": 0.829567462879277, "step": 2570}, {"loss": 0.8157, "grad_norm": 0.5359883308410645, "learning_rate": 0.0002, "epoch": 0.8327953518398967, "step": 2580}, {"loss": 0.8122, "grad_norm": 0.5541019439697266, "learning_rate": 0.0002, "epoch": 0.8360232408005165, "step": 2590}, {"loss": 0.797, "grad_norm": 0.4746638834476471, "learning_rate": 0.0002, "epoch": 0.8392511297611362, "step": 2600}, {"loss": 0.8116, "grad_norm": 0.5243194103240967, "learning_rate": 0.0002, "epoch": 0.842479018721756, "step": 2610}, {"loss": 0.8173, "grad_norm": 0.46824976801872253, "learning_rate": 0.0002, "epoch": 0.8457069076823758, "step": 2620}, {"loss": 0.7525, "grad_norm": 0.49487847089767456, "learning_rate": 0.0002, "epoch": 0.8489347966429954, "step": 2630}, {"loss": 0.8296, "grad_norm": 0.42180097103118896, "learning_rate": 0.0002, "epoch": 0.8521626856036152, "step": 2640}, {"loss": 0.8304, "grad_norm": 0.5516560077667236, "learning_rate": 0.0002, "epoch": 0.855390574564235, "step": 2650}, {"loss": 0.7882, "grad_norm": 0.4392191767692566, "learning_rate": 0.0002, "epoch": 0.8586184635248547, "step": 2660}, {"loss": 0.848, "grad_norm": 0.5387210845947266, "learning_rate": 0.0002, "epoch": 0.8618463524854745, "step": 2670}, {"loss": 0.8094, "grad_norm": 0.6232406497001648, "learning_rate": 0.0002, "epoch": 0.8650742414460942, "step": 2680}, {"loss": 0.768, "grad_norm": 0.53749018907547, "learning_rate": 0.0002, "epoch": 0.868302130406714, "step": 2690}, {"loss": 0.8299, "grad_norm": 0.47480374574661255, "learning_rate": 0.0002, "epoch": 0.8715300193673338, "step": 2700}, {"loss": 0.8055, "grad_norm": 0.44618046283721924, "learning_rate": 0.0002, "epoch": 0.8747579083279535, "step": 2710}, {"loss": 0.8015, "grad_norm": 0.4173581302165985, "learning_rate": 0.0002, "epoch": 0.8779857972885733, "step": 2720}, {"loss": 0.7713, "grad_norm": 0.524081289768219, "learning_rate": 0.0002, "epoch": 0.881213686249193, "step": 2730}, {"loss": 0.8738, "grad_norm": 0.5608431100845337, "learning_rate": 0.0002, "epoch": 0.8844415752098128, "step": 2740}, {"loss": 0.8513, "grad_norm": 0.5212284922599792, "learning_rate": 0.0002, "epoch": 0.8876694641704326, "step": 2750}, {"loss": 0.8139, "grad_norm": 0.5601475834846497, "learning_rate": 0.0002, "epoch": 0.8908973531310523, "step": 2760}, {"loss": 0.7947, "grad_norm": 0.4499223828315735, "learning_rate": 0.0002, "epoch": 0.8941252420916721, "step": 2770}, {"loss": 0.8559, "grad_norm": 0.46945226192474365, "learning_rate": 0.0002, "epoch": 0.8973531310522918, "step": 2780}, {"loss": 0.801, "grad_norm": 0.4837495684623718, "learning_rate": 0.0002, "epoch": 0.9005810200129115, "step": 2790}, {"loss": 0.7887, "grad_norm": 0.5059258937835693, "learning_rate": 0.0002, "epoch": 0.9038089089735313, "step": 2800}, {"loss": 0.8571, "grad_norm": 0.4857945144176483, "learning_rate": 0.0002, "epoch": 0.907036797934151, "step": 2810}, {"loss": 0.8301, "grad_norm": 0.5001962780952454, "learning_rate": 0.0002, "epoch": 0.9102646868947708, "step": 2820}, {"loss": 0.8236, "grad_norm": 0.5468648672103882, "learning_rate": 0.0002, "epoch": 0.9134925758553906, "step": 2830}, {"loss": 0.8071, "grad_norm": 0.5533056259155273, "learning_rate": 0.0002, "epoch": 0.9167204648160103, "step": 2840}, {"loss": 0.7895, "grad_norm": 0.5909785628318787, "learning_rate": 0.0002, "epoch": 0.9199483537766301, "step": 2850}, {"loss": 0.796, "grad_norm": 0.47428104281425476, "learning_rate": 0.0002, "epoch": 0.9231762427372499, "step": 2860}, {"loss": 0.7845, "grad_norm": 0.548814058303833, "learning_rate": 0.0002, "epoch": 0.9264041316978696, "step": 2870}, {"loss": 0.7871, "grad_norm": 0.5576745271682739, "learning_rate": 0.0002, "epoch": 0.9296320206584894, "step": 2880}, {"loss": 0.8399, "grad_norm": 0.47094792127609253, "learning_rate": 0.0002, "epoch": 0.9328599096191091, "step": 2890}, {"loss": 0.805, "grad_norm": 0.5408539772033691, "learning_rate": 0.0002, "epoch": 0.9360877985797289, "step": 2900}, {"loss": 0.785, "grad_norm": 0.5922889113426208, "learning_rate": 0.0002, "epoch": 0.9393156875403487, "step": 2910}, {"loss": 0.8043, "grad_norm": 0.45462584495544434, "learning_rate": 0.0002, "epoch": 0.9425435765009683, "step": 2920}, {"loss": 0.8344, "grad_norm": 0.6864947080612183, "learning_rate": 0.0002, "epoch": 0.9457714654615881, "step": 2930}, {"loss": 0.8166, "grad_norm": 0.4706299304962158, "learning_rate": 0.0002, "epoch": 0.9489993544222078, "step": 2940}, {"loss": 0.8422, "grad_norm": 0.5583269596099854, "learning_rate": 0.0002, "epoch": 0.9522272433828276, "step": 2950}, {"loss": 0.836, "grad_norm": 0.51015704870224, "learning_rate": 0.0002, "epoch": 0.9554551323434474, "step": 2960}, {"loss": 0.8371, "grad_norm": 0.5325582027435303, "learning_rate": 0.0002, "epoch": 0.9586830213040671, "step": 2970}, {"loss": 0.7593, "grad_norm": 0.49008598923683167, "learning_rate": 0.0002, "epoch": 0.9619109102646869, "step": 2980}, {"loss": 0.8093, "grad_norm": 0.4422132074832916, "learning_rate": 0.0002, "epoch": 0.9651387992253067, "step": 2990}, {"loss": 0.7966, "grad_norm": 0.5053589344024658, "learning_rate": 0.0002, "epoch": 0.9683666881859264, "step": 3000}, {"loss": 0.8081, "grad_norm": 0.46754521131515503, "learning_rate": 0.0002, "epoch": 0.9715945771465462, "step": 3010}, {"loss": 0.8377, "grad_norm": 0.5613434910774231, "learning_rate": 0.0002, "epoch": 0.9748224661071659, "step": 3020}, {"loss": 0.7856, "grad_norm": 0.5052843689918518, "learning_rate": 0.0002, "epoch": 0.9780503550677857, "step": 3030}, {"loss": 0.8412, "grad_norm": 0.4270972013473511, "learning_rate": 0.0002, "epoch": 0.9812782440284055, "step": 3040}, {"loss": 0.8353, "grad_norm": 0.4974991977214813, "learning_rate": 0.0002, "epoch": 0.9845061329890252, "step": 3050}, {"loss": 0.8415, "grad_norm": 0.4432311952114105, "learning_rate": 0.0002, "epoch": 0.9877340219496449, "step": 3060}, {"loss": 0.7764, "grad_norm": 0.466457724571228, "learning_rate": 0.0002, "epoch": 0.9909619109102646, "step": 3070}, {"loss": 0.8067, "grad_norm": 0.6438009142875671, "learning_rate": 0.0002, "epoch": 0.9941897998708844, "step": 3080}, {"loss": 0.8425, "grad_norm": 0.5593604445457458, "learning_rate": 0.0002, "epoch": 0.9974176888315042, "step": 3090}, {"eval_loss": 1.0958120822906494, "eval_runtime": 148.3273, "eval_samples_per_second": 4.942, "eval_steps_per_second": 0.62, "epoch": 1.0, "step": 3098}, {"loss": 0.8275, "grad_norm": 0.5701445937156677, "learning_rate": 0.0002, "epoch": 1.000645577792124, "step": 3100}, {"loss": 0.7756, "grad_norm": 0.6089657545089722, "learning_rate": 0.0002, "epoch": 1.0038734667527438, "step": 3110}, {"loss": 0.7492, "grad_norm": 0.5619552135467529, "learning_rate": 0.0002, "epoch": 1.0071013557133635, "step": 3120}, {"loss": 0.7544, "grad_norm": 0.5550283789634705, "learning_rate": 0.0002, "epoch": 1.010329244673983, "step": 3130}, {"loss": 0.8006, "grad_norm": 0.6221792101860046, "learning_rate": 0.0002, "epoch": 1.013557133634603, "step": 3140}, {"loss": 0.7603, "grad_norm": 0.5450758934020996, "learning_rate": 0.0002, "epoch": 1.0167850225952226, "step": 3150}, {"loss": 0.7021, "grad_norm": 0.4359588027000427, "learning_rate": 0.0002, "epoch": 1.0200129115558425, "step": 3160}, {"loss": 0.7468, "grad_norm": 0.5932239890098572, "learning_rate": 0.0002, "epoch": 1.0232408005164622, "step": 3170}, {"loss": 0.7649, "grad_norm": 0.45478707551956177, "learning_rate": 0.0002, "epoch": 1.026468689477082, "step": 3180}, {"loss": 0.7355, "grad_norm": 0.677615761756897, "learning_rate": 0.0002, "epoch": 1.0296965784377017, "step": 3190}, {"loss": 0.6928, "grad_norm": 0.6231790781021118, "learning_rate": 0.0002, "epoch": 1.0329244673983216, "step": 3200}, {"loss": 0.7471, "grad_norm": 0.5074195861816406, "learning_rate": 0.0002, "epoch": 1.0361523563589412, "step": 3210}, {"loss": 0.6864, "grad_norm": 0.4844142198562622, "learning_rate": 0.0002, "epoch": 1.039380245319561, "step": 3220}, {"loss": 0.7655, "grad_norm": 0.5372750759124756, "learning_rate": 0.0002, "epoch": 1.0426081342801807, "step": 3230}, {"loss": 0.7384, "grad_norm": 0.46296265721321106, "learning_rate": 0.0002, "epoch": 1.0458360232408006, "step": 3240}, {"loss": 0.7894, "grad_norm": 0.5417148470878601, "learning_rate": 0.0002, "epoch": 1.0490639122014203, "step": 3250}, {"loss": 0.7637, "grad_norm": 0.5695074200630188, "learning_rate": 0.0002, "epoch": 1.0522918011620401, "step": 3260}, {"loss": 0.7456, "grad_norm": 0.5050092935562134, "learning_rate": 0.0002, "epoch": 1.0555196901226598, "step": 3270}, {"loss": 0.6805, "grad_norm": 0.5320752263069153, "learning_rate": 0.0002, "epoch": 1.0587475790832794, "step": 3280}, {"loss": 0.7419, "grad_norm": 0.5832052230834961, "learning_rate": 0.0002, "epoch": 1.0619754680438993, "step": 3290}, {"loss": 0.7656, "grad_norm": 0.5228804349899292, "learning_rate": 0.0002, "epoch": 1.065203357004519, "step": 3300}, {"loss": 0.6834, "grad_norm": 0.5819445252418518, "learning_rate": 0.0002, "epoch": 1.0684312459651388, "step": 3310}, {"loss": 0.7093, "grad_norm": 0.4201328754425049, "learning_rate": 0.0002, "epoch": 1.0716591349257585, "step": 3320}, {"loss": 0.7494, "grad_norm": 0.5424145460128784, "learning_rate": 0.0002, "epoch": 1.0748870238863784, "step": 3330}, {"loss": 0.7828, "grad_norm": 0.6169946789741516, "learning_rate": 0.0002, "epoch": 1.078114912846998, "step": 3340}, {"loss": 0.7505, "grad_norm": 0.607676088809967, "learning_rate": 0.0002, "epoch": 1.0813428018076179, "step": 3350}, {"loss": 0.7315, "grad_norm": 0.5191982388496399, "learning_rate": 0.0002, "epoch": 1.0845706907682375, "step": 3360}, {"loss": 0.7699, "grad_norm": 0.5728003978729248, "learning_rate": 0.0002, "epoch": 1.0877985797288574, "step": 3370}, {"loss": 0.7381, "grad_norm": 0.5402643084526062, "learning_rate": 0.0002, "epoch": 1.091026468689477, "step": 3380}, {"loss": 0.7208, "grad_norm": 0.5377541780471802, "learning_rate": 0.0002, "epoch": 1.094254357650097, "step": 3390}, {"loss": 0.7672, "grad_norm": 0.4751385748386383, "learning_rate": 0.0002, "epoch": 1.0974822466107166, "step": 3400}, {"loss": 0.7326, "grad_norm": 0.559158444404602, "learning_rate": 0.0002, "epoch": 1.1007101355713362, "step": 3410}, {"loss": 0.7366, "grad_norm": 0.4917701482772827, "learning_rate": 0.0002, "epoch": 1.103938024531956, "step": 3420}, {"loss": 0.7593, "grad_norm": 0.5507875084877014, "learning_rate": 0.0002, "epoch": 1.1071659134925758, "step": 3430}, {"loss": 0.7424, "grad_norm": 0.45458680391311646, "learning_rate": 0.0002, "epoch": 1.1103938024531956, "step": 3440}, {"loss": 0.7234, "grad_norm": 0.5721744894981384, "learning_rate": 0.0002, "epoch": 1.1136216914138153, "step": 3450}, {"loss": 0.7219, "grad_norm": 0.5776081681251526, "learning_rate": 0.0002, "epoch": 1.1168495803744352, "step": 3460}, {"loss": 0.7644, "grad_norm": 0.5261953473091125, "learning_rate": 0.0002, "epoch": 1.1200774693350548, "step": 3470}, {"loss": 0.6586, "grad_norm": 0.47759532928466797, "learning_rate": 0.0002, "epoch": 1.1233053582956747, "step": 3480}, {"loss": 0.7641, "grad_norm": 0.5697659850120544, "learning_rate": 0.0002, "epoch": 1.1265332472562943, "step": 3490}, {"loss": 0.7017, "grad_norm": 0.5643419623374939, "learning_rate": 0.0002, "epoch": 1.1297611362169142, "step": 3500}, {"loss": 0.7235, "grad_norm": 0.6502931118011475, "learning_rate": 0.0002, "epoch": 1.1329890251775339, "step": 3510}, {"loss": 0.7662, "grad_norm": 0.5236507654190063, "learning_rate": 0.0002, "epoch": 1.1362169141381537, "step": 3520}, {"loss": 0.7571, "grad_norm": 0.6521499156951904, "learning_rate": 0.0002, "epoch": 1.1394448030987734, "step": 3530}, {"loss": 0.7304, "grad_norm": 0.5893217325210571, "learning_rate": 0.0002, "epoch": 1.142672692059393, "step": 3540}, {"loss": 0.7508, "grad_norm": 0.5300073027610779, "learning_rate": 0.0002, "epoch": 1.145900581020013, "step": 3550}, {"loss": 0.6937, "grad_norm": 0.6794660091400146, "learning_rate": 0.0002, "epoch": 1.1491284699806328, "step": 3560}, {"loss": 0.7614, "grad_norm": 0.5420064926147461, "learning_rate": 0.0002, "epoch": 1.1523563589412524, "step": 3570}, {"loss": 0.7648, "grad_norm": 0.5096590518951416, "learning_rate": 0.0002, "epoch": 1.155584247901872, "step": 3580}, {"loss": 0.7436, "grad_norm": 0.5726043581962585, "learning_rate": 0.0002, "epoch": 1.158812136862492, "step": 3590}, {"loss": 0.7728, "grad_norm": 0.7388110160827637, "learning_rate": 0.0002, "epoch": 1.1620400258231116, "step": 3600}, {"loss": 0.7421, "grad_norm": 0.5597969889640808, "learning_rate": 0.0002, "epoch": 1.1652679147837315, "step": 3610}, {"loss": 0.7132, "grad_norm": 0.5067800283432007, "learning_rate": 0.0002, "epoch": 1.1684958037443511, "step": 3620}, {"loss": 0.7893, "grad_norm": 0.6625118255615234, "learning_rate": 0.0002, "epoch": 1.171723692704971, "step": 3630}, {"loss": 0.7611, "grad_norm": 0.5830849409103394, "learning_rate": 0.0002, "epoch": 1.1749515816655907, "step": 3640}, {"loss": 0.7973, "grad_norm": 0.6140692830085754, "learning_rate": 0.0002, "epoch": 1.1781794706262105, "step": 3650}, {"loss": 0.7617, "grad_norm": 0.714523434638977, "learning_rate": 0.0002, "epoch": 1.1814073595868302, "step": 3660}, {"loss": 0.7092, "grad_norm": 0.5196696519851685, "learning_rate": 0.0002, "epoch": 1.18463524854745, "step": 3670}, {"loss": 0.7821, "grad_norm": 0.6677889823913574, "learning_rate": 0.0002, "epoch": 1.1878631375080697, "step": 3680}, {"loss": 0.7813, "grad_norm": 0.47095245122909546, "learning_rate": 0.0002, "epoch": 1.1910910264686896, "step": 3690}, {"loss": 0.7702, "grad_norm": 0.5197778940200806, "learning_rate": 0.0002, "epoch": 1.1943189154293092, "step": 3700}, {"loss": 0.7349, "grad_norm": 0.5156530141830444, "learning_rate": 0.0002, "epoch": 1.1975468043899289, "step": 3710}, {"loss": 0.7738, "grad_norm": 0.6968549489974976, "learning_rate": 0.0002, "epoch": 1.2007746933505488, "step": 3720}, {"loss": 0.7599, "grad_norm": 0.48983848094940186, "learning_rate": 0.0002, "epoch": 1.2040025823111684, "step": 3730}, {"loss": 0.7163, "grad_norm": 0.6709973216056824, "learning_rate": 0.0002, "epoch": 1.2072304712717883, "step": 3740}, {"loss": 0.7632, "grad_norm": 0.48681750893592834, "learning_rate": 0.0002, "epoch": 1.210458360232408, "step": 3750}, {"loss": 0.7039, "grad_norm": 0.49475061893463135, "learning_rate": 0.0002, "epoch": 1.2136862491930278, "step": 3760}, {"loss": 0.7372, "grad_norm": 0.6163983345031738, "learning_rate": 0.0002, "epoch": 1.2169141381536475, "step": 3770}, {"loss": 0.757, "grad_norm": 0.5481411218643188, "learning_rate": 0.0002, "epoch": 1.2201420271142673, "step": 3780}, {"loss": 0.7601, "grad_norm": 0.620639979839325, "learning_rate": 0.0002, "epoch": 1.223369916074887, "step": 3790}, {"loss": 0.7738, "grad_norm": 0.7017222046852112, "learning_rate": 0.0002, "epoch": 1.2265978050355069, "step": 3800}, {"loss": 0.7468, "grad_norm": 0.5872400403022766, "learning_rate": 0.0002, "epoch": 1.2298256939961265, "step": 3810}, {"loss": 0.7854, "grad_norm": 0.45765596628189087, "learning_rate": 0.0002, "epoch": 1.2330535829567464, "step": 3820}, {"loss": 0.7865, "grad_norm": 0.5676377415657043, "learning_rate": 0.0002, "epoch": 1.236281471917366, "step": 3830}, {"loss": 0.7696, "grad_norm": 0.4793425500392914, "learning_rate": 0.0002, "epoch": 1.2395093608779857, "step": 3840}, {"loss": 0.7065, "grad_norm": 0.5060022473335266, "learning_rate": 0.0002, "epoch": 1.2427372498386056, "step": 3850}, {"loss": 0.7333, "grad_norm": 0.6140682697296143, "learning_rate": 0.0002, "epoch": 1.2459651387992252, "step": 3860}, {"loss": 0.7496, "grad_norm": 0.5030326843261719, "learning_rate": 0.0002, "epoch": 1.249193027759845, "step": 3870}, {"loss": 0.7226, "grad_norm": 0.6609430909156799, "learning_rate": 0.0002, "epoch": 1.2524209167204647, "step": 3880}, {"loss": 0.7212, "grad_norm": 0.5459545850753784, "learning_rate": 0.0002, "epoch": 1.2556488056810846, "step": 3890}, {"loss": 0.7145, "grad_norm": 0.5328870415687561, "learning_rate": 0.0002, "epoch": 1.2588766946417043, "step": 3900}, {"loss": 0.7572, "grad_norm": 0.5840652585029602, "learning_rate": 0.0002, "epoch": 1.2621045836023241, "step": 3910}, {"loss": 0.7624, "grad_norm": 0.5587584376335144, "learning_rate": 0.0002, "epoch": 1.2653324725629438, "step": 3920}, {"loss": 0.7846, "grad_norm": 0.5886949896812439, "learning_rate": 0.0002, "epoch": 1.2685603615235637, "step": 3930}, {"loss": 0.7251, "grad_norm": 0.5128693580627441, "learning_rate": 0.0002, "epoch": 1.2717882504841833, "step": 3940}, {"loss": 0.7032, "grad_norm": 0.6207669377326965, "learning_rate": 0.0002, "epoch": 1.2750161394448032, "step": 3950}, {"loss": 0.7506, "grad_norm": 0.5789574384689331, "learning_rate": 0.0002, "epoch": 1.2782440284054228, "step": 3960}, {"loss": 0.7574, "grad_norm": 0.503162145614624, "learning_rate": 0.0002, "epoch": 1.2814719173660425, "step": 3970}, {"loss": 0.7489, "grad_norm": 0.6670064926147461, "learning_rate": 0.0002, "epoch": 1.2846998063266624, "step": 3980}, {"loss": 0.7198, "grad_norm": 0.5676213502883911, "learning_rate": 0.0002, "epoch": 1.2879276952872822, "step": 3990}, {"loss": 0.7892, "grad_norm": 0.5383169054985046, "learning_rate": 0.0002, "epoch": 1.2911555842479019, "step": 4000}, {"loss": 0.7432, "grad_norm": 0.714743971824646, "learning_rate": 0.0002, "epoch": 1.2943834732085215, "step": 4010}, {"loss": 0.7594, "grad_norm": 0.5740262269973755, "learning_rate": 0.0002, "epoch": 1.2976113621691414, "step": 4020}, {"loss": 0.7564, "grad_norm": 0.6143045425415039, "learning_rate": 0.0002, "epoch": 1.300839251129761, "step": 4030}, {"loss": 0.7181, "grad_norm": 0.501025378704071, "learning_rate": 0.0002, "epoch": 1.304067140090381, "step": 4040}, {"loss": 0.7099, "grad_norm": 0.5784100294113159, "learning_rate": 0.0002, "epoch": 1.3072950290510006, "step": 4050}, {"loss": 0.7403, "grad_norm": 0.6182606220245361, "learning_rate": 0.0002, "epoch": 1.3105229180116205, "step": 4060}, {"loss": 0.7249, "grad_norm": 0.5072231292724609, "learning_rate": 0.0002, "epoch": 1.3137508069722401, "step": 4070}, {"loss": 0.7451, "grad_norm": 0.6841012835502625, "learning_rate": 0.0002, "epoch": 1.31697869593286, "step": 4080}, {"loss": 0.7395, "grad_norm": 0.697257936000824, "learning_rate": 0.0002, "epoch": 1.3202065848934796, "step": 4090}, {"loss": 0.7401, "grad_norm": 0.5113214254379272, "learning_rate": 0.0002, "epoch": 1.3234344738540993, "step": 4100}, {"loss": 0.7336, "grad_norm": 0.6270561814308167, "learning_rate": 0.0002, "epoch": 1.3266623628147192, "step": 4110}, {"loss": 0.7535, "grad_norm": 0.5525947213172913, "learning_rate": 0.0002, "epoch": 1.329890251775339, "step": 4120}, {"loss": 0.6999, "grad_norm": 0.546071469783783, "learning_rate": 0.0002, "epoch": 1.3331181407359587, "step": 4130}, {"loss": 0.7884, "grad_norm": 0.6516721248626709, "learning_rate": 0.0002, "epoch": 1.3363460296965783, "step": 4140}, {"loss": 0.755, "grad_norm": 0.6235111355781555, "learning_rate": 0.0002, "epoch": 1.3395739186571982, "step": 4150}, {"loss": 0.7467, "grad_norm": 0.538649320602417, "learning_rate": 0.0002, "epoch": 1.3428018076178179, "step": 4160}, {"loss": 0.7368, "grad_norm": 0.5367001891136169, "learning_rate": 0.0002, "epoch": 1.3460296965784377, "step": 4170}, {"loss": 0.7536, "grad_norm": 0.6134631037712097, "learning_rate": 0.0002, "epoch": 1.3492575855390574, "step": 4180}, {"loss": 0.8245, "grad_norm": 0.5827262997627258, "learning_rate": 0.0002, "epoch": 1.3524854744996773, "step": 4190}, {"loss": 0.7288, "grad_norm": 0.5706096291542053, "learning_rate": 0.0002, "epoch": 1.355713363460297, "step": 4200}, {"loss": 0.7302, "grad_norm": 0.6422057151794434, "learning_rate": 0.0002, "epoch": 1.3589412524209168, "step": 4210}, {"loss": 0.7303, "grad_norm": 0.6316141486167908, "learning_rate": 0.0002, "epoch": 1.3621691413815364, "step": 4220}, {"loss": 0.7457, "grad_norm": 0.6946983933448792, "learning_rate": 0.0002, "epoch": 1.365397030342156, "step": 4230}, {"loss": 0.7388, "grad_norm": 0.5381525754928589, "learning_rate": 0.0002, "epoch": 1.368624919302776, "step": 4240}, {"loss": 0.73, "grad_norm": 0.5484845638275146, "learning_rate": 0.0002, "epoch": 1.3718528082633958, "step": 4250}, {"loss": 0.7584, "grad_norm": 0.5961896777153015, "learning_rate": 0.0002, "epoch": 1.3750806972240155, "step": 4260}, {"loss": 0.8006, "grad_norm": 0.6041752696037292, "learning_rate": 0.0002, "epoch": 1.3783085861846351, "step": 4270}, {"loss": 0.7276, "grad_norm": 0.6283464431762695, "learning_rate": 0.0002, "epoch": 1.381536475145255, "step": 4280}, {"loss": 0.757, "grad_norm": 0.6761324405670166, "learning_rate": 0.0002, "epoch": 1.384764364105875, "step": 4290}, {"loss": 0.7381, "grad_norm": 0.504311203956604, "learning_rate": 0.0002, "epoch": 1.3879922530664945, "step": 4300}, {"loss": 0.7536, "grad_norm": 0.6100395917892456, "learning_rate": 0.0002, "epoch": 1.3912201420271142, "step": 4310}, {"loss": 0.7103, "grad_norm": 0.6245788335800171, "learning_rate": 0.0002, "epoch": 1.394448030987734, "step": 4320}, {"loss": 0.7505, "grad_norm": 0.6074621081352234, "learning_rate": 0.0002, "epoch": 1.3976759199483537, "step": 4330}, {"loss": 0.752, "grad_norm": 0.6683838963508606, "learning_rate": 0.0002, "epoch": 1.4009038089089736, "step": 4340}, {"loss": 0.7537, "grad_norm": 0.622998058795929, "learning_rate": 0.0002, "epoch": 1.4041316978695932, "step": 4350}, {"loss": 0.8148, "grad_norm": 0.6089423894882202, "learning_rate": 0.0002, "epoch": 1.4073595868302131, "step": 4360}, {"loss": 0.7715, "grad_norm": 0.6381658911705017, "learning_rate": 0.0002, "epoch": 1.4105874757908328, "step": 4370}, {"loss": 0.7871, "grad_norm": 0.5419308543205261, "learning_rate": 0.0002, "epoch": 1.4138153647514526, "step": 4380}, {"loss": 0.7386, "grad_norm": 0.6026232242584229, "learning_rate": 0.0002, "epoch": 1.4170432537120723, "step": 4390}, {"loss": 0.7529, "grad_norm": 0.4911101162433624, "learning_rate": 0.0002, "epoch": 1.420271142672692, "step": 4400}, {"loss": 0.7495, "grad_norm": 0.6302908062934875, "learning_rate": 0.0002, "epoch": 1.4234990316333118, "step": 4410}, {"loss": 0.7446, "grad_norm": 0.6692768931388855, "learning_rate": 0.0002, "epoch": 1.4267269205939317, "step": 4420}, {"loss": 0.7312, "grad_norm": 0.46294572949409485, "learning_rate": 0.0002, "epoch": 1.4299548095545513, "step": 4430}, {"loss": 0.7255, "grad_norm": 0.5452619194984436, "learning_rate": 0.0002, "epoch": 1.433182698515171, "step": 4440}, {"loss": 0.7974, "grad_norm": 0.7809233069419861, "learning_rate": 0.0002, "epoch": 1.4364105874757909, "step": 4450}, {"loss": 0.7103, "grad_norm": 0.550088107585907, "learning_rate": 0.0002, "epoch": 1.4396384764364105, "step": 4460}, {"loss": 0.7088, "grad_norm": 0.7139151096343994, "learning_rate": 0.0002, "epoch": 1.4428663653970304, "step": 4470}, {"loss": 0.7358, "grad_norm": 0.6187090873718262, "learning_rate": 0.0002, "epoch": 1.44609425435765, "step": 4480}, {"loss": 0.7608, "grad_norm": 0.5948249101638794, "learning_rate": 0.0002, "epoch": 1.44932214331827, "step": 4490}, {"loss": 0.7582, "grad_norm": 0.6510892510414124, "learning_rate": 0.0002, "epoch": 1.4525500322788896, "step": 4500}, {"loss": 0.7105, "grad_norm": 0.6552293300628662, "learning_rate": 0.0002, "epoch": 1.4557779212395094, "step": 4510}, {"loss": 0.7965, "grad_norm": 0.585574209690094, "learning_rate": 0.0002, "epoch": 1.459005810200129, "step": 4520}, {"loss": 0.761, "grad_norm": 0.4830162823200226, "learning_rate": 0.0002, "epoch": 1.4622336991607487, "step": 4530}, {"loss": 0.7424, "grad_norm": 0.5780223608016968, "learning_rate": 0.0002, "epoch": 1.4654615881213686, "step": 4540}, {"loss": 0.7518, "grad_norm": 0.5462607145309448, "learning_rate": 0.0002, "epoch": 1.4686894770819885, "step": 4550}, {"loss": 0.7342, "grad_norm": 0.5183546543121338, "learning_rate": 0.0002, "epoch": 1.4719173660426081, "step": 4560}, {"loss": 0.71, "grad_norm": 0.676917552947998, "learning_rate": 0.0002, "epoch": 1.4751452550032278, "step": 4570}, {"loss": 0.7875, "grad_norm": 0.5772345066070557, "learning_rate": 0.0002, "epoch": 1.4783731439638477, "step": 4580}, {"loss": 0.7709, "grad_norm": 0.7320035696029663, "learning_rate": 0.0002, "epoch": 1.4816010329244673, "step": 4590}, {"loss": 0.7601, "grad_norm": 0.5024042129516602, "learning_rate": 0.0002, "epoch": 1.4848289218850872, "step": 4600}, {"loss": 0.8061, "grad_norm": 0.5482868552207947, "learning_rate": 0.0002, "epoch": 1.4880568108457068, "step": 4610}, {"loss": 0.714, "grad_norm": 0.5447399616241455, "learning_rate": 0.0002, "epoch": 1.4912846998063267, "step": 4620}, {"loss": 0.7959, "grad_norm": 0.5953414440155029, "learning_rate": 0.0002, "epoch": 1.4945125887669464, "step": 4630}, {"loss": 0.7463, "grad_norm": 0.6983066201210022, "learning_rate": 0.0002, "epoch": 1.4977404777275662, "step": 4640}, {"loss": 0.7877, "grad_norm": 0.586327075958252, "learning_rate": 0.0002, "epoch": 1.500968366688186, "step": 4650}, {"loss": 0.7169, "grad_norm": 0.5839682221412659, "learning_rate": 0.0002, "epoch": 1.5041962556488055, "step": 4660}, {"loss": 0.7524, "grad_norm": 0.5959209203720093, "learning_rate": 0.0002, "epoch": 1.5074241446094254, "step": 4670}, {"loss": 0.7615, "grad_norm": 0.5073857307434082, "learning_rate": 0.0002, "epoch": 1.5106520335700453, "step": 4680}, {"loss": 0.7258, "grad_norm": 0.5183001160621643, "learning_rate": 0.0002, "epoch": 1.513879922530665, "step": 4690}, {"loss": 0.784, "grad_norm": 0.593530535697937, "learning_rate": 0.0002, "epoch": 1.5171078114912846, "step": 4700}, {"loss": 0.7722, "grad_norm": 0.675993025302887, "learning_rate": 0.0002, "epoch": 1.5203357004519045, "step": 4710}, {"loss": 0.7485, "grad_norm": 0.5823286771774292, "learning_rate": 0.0002, "epoch": 1.5235635894125243, "step": 4720}, {"loss": 0.7474, "grad_norm": 0.5825035572052002, "learning_rate": 0.0002, "epoch": 1.526791478373144, "step": 4730}, {"loss": 0.8287, "grad_norm": 0.5689691305160522, "learning_rate": 0.0002, "epoch": 1.5300193673337636, "step": 4740}, {"loss": 0.7279, "grad_norm": 0.6037150621414185, "learning_rate": 0.0002, "epoch": 1.5332472562943835, "step": 4750}, {"loss": 0.7865, "grad_norm": 0.6393677592277527, "learning_rate": 0.0002, "epoch": 1.5364751452550034, "step": 4760}, {"loss": 0.805, "grad_norm": 0.5926381945610046, "learning_rate": 0.0002, "epoch": 1.539703034215623, "step": 4770}, {"loss": 0.7425, "grad_norm": 0.9468599557876587, "learning_rate": 0.0002, "epoch": 1.5429309231762427, "step": 4780}, {"loss": 0.7565, "grad_norm": 0.7544237375259399, "learning_rate": 0.0002, "epoch": 1.5461588121368623, "step": 4790}, {"loss": 0.7398, "grad_norm": 0.5308566093444824, "learning_rate": 0.0002, "epoch": 1.5493867010974822, "step": 4800}, {"loss": 0.7756, "grad_norm": 0.6590296030044556, "learning_rate": 0.0002, "epoch": 1.552614590058102, "step": 4810}, {"loss": 0.7212, "grad_norm": 0.5630404353141785, "learning_rate": 0.0002, "epoch": 1.5558424790187217, "step": 4820}, {"loss": 0.7593, "grad_norm": 0.6800200939178467, "learning_rate": 0.0002, "epoch": 1.5590703679793414, "step": 4830}, {"loss": 0.7373, "grad_norm": 0.5463718175888062, "learning_rate": 0.0002, "epoch": 1.5622982569399613, "step": 4840}, {"loss": 0.7519, "grad_norm": 0.505135178565979, "learning_rate": 0.0002, "epoch": 1.5655261459005811, "step": 4850}, {"loss": 0.8122, "grad_norm": 0.5469676852226257, "learning_rate": 0.0002, "epoch": 1.5687540348612008, "step": 4860}, {"loss": 0.7185, "grad_norm": 0.5318337678909302, "learning_rate": 0.0002, "epoch": 1.5719819238218204, "step": 4870}, {"loss": 0.7324, "grad_norm": 0.7287914752960205, "learning_rate": 0.0002, "epoch": 1.5752098127824403, "step": 4880}, {"loss": 0.7532, "grad_norm": 0.7318989038467407, "learning_rate": 0.0002, "epoch": 1.5784377017430602, "step": 4890}, {"loss": 0.7851, "grad_norm": 0.6499921679496765, "learning_rate": 0.0002, "epoch": 1.5816655907036798, "step": 4900}, {"loss": 0.753, "grad_norm": 0.47907355427742004, "learning_rate": 0.0002, "epoch": 1.5848934796642995, "step": 4910}, {"loss": 0.7699, "grad_norm": 0.7338833808898926, "learning_rate": 0.0002, "epoch": 1.5881213686249191, "step": 4920}, {"loss": 0.7592, "grad_norm": 0.5800719261169434, "learning_rate": 0.0002, "epoch": 1.591349257585539, "step": 4930}, {"loss": 0.7211, "grad_norm": 0.5365763306617737, "learning_rate": 0.0002, "epoch": 1.594577146546159, "step": 4940}, {"loss": 0.777, "grad_norm": 0.5800772309303284, "learning_rate": 0.0002, "epoch": 1.5978050355067785, "step": 4950}, {"loss": 0.8027, "grad_norm": 0.7878010869026184, "learning_rate": 0.0002, "epoch": 1.6010329244673982, "step": 4960}, {"loss": 0.7894, "grad_norm": 0.5919058918952942, "learning_rate": 0.0002, "epoch": 1.604260813428018, "step": 4970}, {"loss": 0.7762, "grad_norm": 0.5004435181617737, "learning_rate": 0.0002, "epoch": 1.607488702388638, "step": 4980}, {"loss": 0.7447, "grad_norm": 0.6299242377281189, "learning_rate": 0.0002, "epoch": 1.6107165913492576, "step": 4990}, {"loss": 0.7149, "grad_norm": 0.6307242512702942, "learning_rate": 0.0002, "epoch": 1.6139444803098772, "step": 5000}, {"loss": 0.7693, "grad_norm": 0.7838703989982605, "learning_rate": 0.0002, "epoch": 1.6171723692704971, "step": 5010}, {"loss": 0.7364, "grad_norm": 0.6454671621322632, "learning_rate": 0.0002, "epoch": 1.620400258231117, "step": 5020}, {"loss": 0.74, "grad_norm": 0.5907095670700073, "learning_rate": 0.0002, "epoch": 1.6236281471917366, "step": 5030}, {"loss": 0.7331, "grad_norm": 0.6053501963615417, "learning_rate": 0.0002, "epoch": 1.6268560361523563, "step": 5040}, {"loss": 0.6987, "grad_norm": 0.5644670128822327, "learning_rate": 0.0002, "epoch": 1.630083925112976, "step": 5050}, {"loss": 0.7886, "grad_norm": 0.6320949792861938, "learning_rate": 0.0002, "epoch": 1.6333118140735958, "step": 5060}, {"loss": 0.7109, "grad_norm": 0.6101489067077637, "learning_rate": 0.0002, "epoch": 1.6365397030342157, "step": 5070}, {"loss": 0.6922, "grad_norm": 0.9435283541679382, "learning_rate": 0.0002, "epoch": 1.6397675919948353, "step": 5080}, {"loss": 0.729, "grad_norm": 0.6668919324874878, "learning_rate": 0.0002, "epoch": 1.642995480955455, "step": 5090}, {"loss": 0.7402, "grad_norm": 0.6160340905189514, "learning_rate": 0.0002, "epoch": 1.6462233699160749, "step": 5100}, {"loss": 0.7461, "grad_norm": 0.5999835729598999, "learning_rate": 0.0002, "epoch": 1.6494512588766947, "step": 5110}, {"loss": 0.7661, "grad_norm": 0.9378551840782166, "learning_rate": 0.0002, "epoch": 1.6526791478373144, "step": 5120}, {"loss": 0.7586, "grad_norm": 0.4795055389404297, "learning_rate": 0.0002, "epoch": 1.655907036797934, "step": 5130}, {"loss": 0.7342, "grad_norm": 0.4878861606121063, "learning_rate": 0.0002, "epoch": 1.659134925758554, "step": 5140}, {"loss": 0.7362, "grad_norm": 0.6042965054512024, "learning_rate": 0.0002, "epoch": 1.6623628147191738, "step": 5150}, {"loss": 0.7863, "grad_norm": 0.5829901695251465, "learning_rate": 0.0002, "epoch": 1.6655907036797934, "step": 5160}, {"loss": 0.7498, "grad_norm": 0.5168480277061462, "learning_rate": 0.0002, "epoch": 1.668818592640413, "step": 5170}, {"loss": 0.7333, "grad_norm": 0.6489511132240295, "learning_rate": 0.0002, "epoch": 1.672046481601033, "step": 5180}, {"loss": 0.7257, "grad_norm": 0.5955966114997864, "learning_rate": 0.0002, "epoch": 1.6752743705616526, "step": 5190}, {"loss": 0.7938, "grad_norm": 0.6228088140487671, "learning_rate": 0.0002, "epoch": 1.6785022595222725, "step": 5200}, {"loss": 0.7626, "grad_norm": 0.5726390480995178, "learning_rate": 0.0002, "epoch": 1.6817301484828922, "step": 5210}, {"loss": 0.7479, "grad_norm": 0.6116343140602112, "learning_rate": 0.0002, "epoch": 1.6849580374435118, "step": 5220}, {"loss": 0.7169, "grad_norm": 0.5483687520027161, "learning_rate": 0.0002, "epoch": 1.6881859264041317, "step": 5230}, {"loss": 0.7293, "grad_norm": 0.570941686630249, "learning_rate": 0.0002, "epoch": 1.6914138153647515, "step": 5240}, {"loss": 0.723, "grad_norm": 0.6048086285591125, "learning_rate": 0.0002, "epoch": 1.6946417043253712, "step": 5250}, {"loss": 0.7861, "grad_norm": 0.6769003868103027, "learning_rate": 0.0002, "epoch": 1.6978695932859909, "step": 5260}, {"loss": 0.7885, "grad_norm": 0.5629057884216309, "learning_rate": 0.0002, "epoch": 1.7010974822466107, "step": 5270}, {"loss": 0.7693, "grad_norm": 0.657341480255127, "learning_rate": 0.0002, "epoch": 1.7043253712072306, "step": 5280}, {"loss": 0.7357, "grad_norm": 0.6256147623062134, "learning_rate": 0.0002, "epoch": 1.7075532601678503, "step": 5290}, {"loss": 0.714, "grad_norm": 0.5498088002204895, "learning_rate": 0.0002, "epoch": 1.71078114912847, "step": 5300}, {"loss": 0.7669, "grad_norm": 0.5078358054161072, "learning_rate": 0.0002, "epoch": 1.7140090380890898, "step": 5310}, {"loss": 0.7872, "grad_norm": 0.6696692705154419, "learning_rate": 0.0002, "epoch": 1.7172369270497096, "step": 5320}, {"loss": 0.8205, "grad_norm": 0.6692847013473511, "learning_rate": 0.0002, "epoch": 1.7204648160103293, "step": 5330}, {"loss": 0.7432, "grad_norm": 0.5415751934051514, "learning_rate": 0.0002, "epoch": 1.723692704970949, "step": 5340}, {"loss": 0.7499, "grad_norm": 0.5367611050605774, "learning_rate": 0.0002, "epoch": 1.7269205939315686, "step": 5350}, {"loss": 0.7631, "grad_norm": 0.7321061491966248, "learning_rate": 0.0002, "epoch": 1.7301484828921885, "step": 5360}, {"loss": 0.7827, "grad_norm": 0.723972499370575, "learning_rate": 0.0002, "epoch": 1.7333763718528084, "step": 5370}, {"loss": 0.7077, "grad_norm": 0.7328100204467773, "learning_rate": 0.0002, "epoch": 1.736604260813428, "step": 5380}, {"loss": 0.7503, "grad_norm": 0.5785264372825623, "learning_rate": 0.0002, "epoch": 1.7398321497740477, "step": 5390}, {"loss": 0.7188, "grad_norm": 0.7812932133674622, "learning_rate": 0.0002, "epoch": 1.7430600387346675, "step": 5400}, {"loss": 0.7386, "grad_norm": 0.6493327617645264, "learning_rate": 0.0002, "epoch": 1.7462879276952874, "step": 5410}, {"loss": 0.7487, "grad_norm": 0.5825939774513245, "learning_rate": 0.0002, "epoch": 1.749515816655907, "step": 5420}, {"loss": 0.7625, "grad_norm": 0.6969610452651978, "learning_rate": 0.0002, "epoch": 1.7527437056165267, "step": 5430}, {"loss": 0.7512, "grad_norm": 0.5558062195777893, "learning_rate": 0.0002, "epoch": 1.7559715945771466, "step": 5440}, {"loss": 0.7256, "grad_norm": 0.49222221970558167, "learning_rate": 0.0002, "epoch": 1.7591994835377665, "step": 5450}, {"loss": 0.7477, "grad_norm": 0.5844656825065613, "learning_rate": 0.0002, "epoch": 1.762427372498386, "step": 5460}, {"loss": 0.7695, "grad_norm": 0.8706597685813904, "learning_rate": 0.0002, "epoch": 1.7656552614590058, "step": 5470}, {"loss": 0.7582, "grad_norm": 0.6167706251144409, "learning_rate": 0.0002, "epoch": 1.7688831504196254, "step": 5480}, {"loss": 0.7521, "grad_norm": 0.5890011787414551, "learning_rate": 0.0002, "epoch": 1.7721110393802453, "step": 5490}, {"loss": 0.8319, "grad_norm": 0.6551728248596191, "learning_rate": 0.0002, "epoch": 1.7753389283408652, "step": 5500}, {"loss": 0.7615, "grad_norm": 0.5848751068115234, "learning_rate": 0.0002, "epoch": 1.7785668173014848, "step": 5510}, {"loss": 0.7622, "grad_norm": 0.6664014458656311, "learning_rate": 0.0002, "epoch": 1.7817947062621045, "step": 5520}, {"loss": 0.7544, "grad_norm": 0.5931693911552429, "learning_rate": 0.0002, "epoch": 1.7850225952227243, "step": 5530}, {"loss": 0.7992, "grad_norm": 0.5534724593162537, "learning_rate": 0.0002, "epoch": 1.7882504841833442, "step": 5540}, {"loss": 0.7967, "grad_norm": 0.5590878129005432, "learning_rate": 0.0002, "epoch": 1.7914783731439639, "step": 5550}, {"loss": 0.7406, "grad_norm": 0.6947470903396606, "learning_rate": 0.0002, "epoch": 1.7947062621045835, "step": 5560}, {"loss": 0.7614, "grad_norm": 0.6104130148887634, "learning_rate": 0.0002, "epoch": 1.7979341510652034, "step": 5570}, {"loss": 0.8032, "grad_norm": 0.6135714054107666, "learning_rate": 0.0002, "epoch": 1.8011620400258233, "step": 5580}, {"loss": 0.7403, "grad_norm": 0.6626853346824646, "learning_rate": 0.0002, "epoch": 1.804389928986443, "step": 5590}, {"loss": 0.7746, "grad_norm": 0.6977612972259521, "learning_rate": 0.0002, "epoch": 1.8076178179470626, "step": 5600}, {"loss": 0.7899, "grad_norm": 0.6275238394737244, "learning_rate": 0.0002, "epoch": 1.8108457069076824, "step": 5610}, {"loss": 0.7392, "grad_norm": 0.5017505288124084, "learning_rate": 0.0002, "epoch": 1.814073595868302, "step": 5620}, {"loss": 0.7669, "grad_norm": 0.8314290642738342, "learning_rate": 0.0002, "epoch": 1.817301484828922, "step": 5630}, {"loss": 0.7031, "grad_norm": 0.6863582134246826, "learning_rate": 0.0002, "epoch": 1.8205293737895416, "step": 5640}, {"loss": 0.743, "grad_norm": 0.69544917345047, "learning_rate": 0.0002, "epoch": 1.8237572627501613, "step": 5650}, {"loss": 0.7277, "grad_norm": 0.515499472618103, "learning_rate": 0.0002, "epoch": 1.8269851517107811, "step": 5660}, {"loss": 0.7166, "grad_norm": 0.6100873947143555, "learning_rate": 0.0002, "epoch": 1.830213040671401, "step": 5670}, {"loss": 0.7217, "grad_norm": 0.67416912317276, "learning_rate": 0.0002, "epoch": 1.8334409296320207, "step": 5680}, {"loss": 0.7575, "grad_norm": 0.7057772278785706, "learning_rate": 0.0002, "epoch": 1.8366688185926403, "step": 5690}, {"loss": 0.7483, "grad_norm": 0.7374551892280579, "learning_rate": 0.0002, "epoch": 1.8398967075532602, "step": 5700}, {"loss": 0.81, "grad_norm": 0.6266297101974487, "learning_rate": 0.0002, "epoch": 1.84312459651388, "step": 5710}, {"loss": 0.728, "grad_norm": 0.5629227757453918, "learning_rate": 0.0002, "epoch": 1.8463524854744997, "step": 5720}, {"loss": 0.8043, "grad_norm": 0.6603655815124512, "learning_rate": 0.0002, "epoch": 1.8495803744351194, "step": 5730}, {"loss": 0.7587, "grad_norm": 0.8113715052604675, "learning_rate": 0.0002, "epoch": 1.8528082633957392, "step": 5740}, {"loss": 0.7486, "grad_norm": 0.7143914103507996, "learning_rate": 0.0002, "epoch": 1.856036152356359, "step": 5750}, {"loss": 0.7619, "grad_norm": 0.6273732781410217, "learning_rate": 0.0002, "epoch": 1.8592640413169788, "step": 5760}, {"loss": 0.7962, "grad_norm": 0.5428690910339355, "learning_rate": 0.0002, "epoch": 1.8624919302775984, "step": 5770}, {"loss": 0.7581, "grad_norm": 0.6405037641525269, "learning_rate": 0.0002, "epoch": 1.865719819238218, "step": 5780}, {"loss": 0.7569, "grad_norm": 0.700873613357544, "learning_rate": 0.0002, "epoch": 1.868947708198838, "step": 5790}, {"loss": 0.7353, "grad_norm": 0.5645238161087036, "learning_rate": 0.0002, "epoch": 1.8721755971594578, "step": 5800}, {"loss": 0.8037, "grad_norm": 0.8780353665351868, "learning_rate": 0.0002, "epoch": 1.8754034861200775, "step": 5810}, {"loss": 0.7686, "grad_norm": 0.6295409798622131, "learning_rate": 0.0002, "epoch": 1.878631375080697, "step": 5820}, {"loss": 0.8067, "grad_norm": 0.678269624710083, "learning_rate": 0.0002, "epoch": 1.881859264041317, "step": 5830}, {"loss": 0.7537, "grad_norm": 0.6464608907699585, "learning_rate": 0.0002, "epoch": 1.8850871530019369, "step": 5840}, {"loss": 0.7423, "grad_norm": 0.6201048493385315, "learning_rate": 0.0002, "epoch": 1.8883150419625565, "step": 5850}, {"loss": 0.7694, "grad_norm": 0.6046274304389954, "learning_rate": 0.0002, "epoch": 1.8915429309231762, "step": 5860}, {"loss": 0.781, "grad_norm": 0.7532408833503723, "learning_rate": 0.0002, "epoch": 1.894770819883796, "step": 5870}, {"loss": 0.6885, "grad_norm": 0.6066767573356628, "learning_rate": 0.0002, "epoch": 1.897998708844416, "step": 5880}, {"loss": 0.7631, "grad_norm": 0.6289830207824707, "learning_rate": 0.0002, "epoch": 1.9012265978050356, "step": 5890}, {"loss": 0.7501, "grad_norm": 0.5204319953918457, "learning_rate": 0.0002, "epoch": 1.9044544867656552, "step": 5900}, {"loss": 0.7335, "grad_norm": 0.6708219647407532, "learning_rate": 0.0002, "epoch": 1.9076823757262749, "step": 5910}, {"loss": 0.7455, "grad_norm": 0.4915677309036255, "learning_rate": 0.0002, "epoch": 1.9109102646868947, "step": 5920}, {"loss": 0.7464, "grad_norm": 0.652717113494873, "learning_rate": 0.0002, "epoch": 1.9141381536475146, "step": 5930}, {"loss": 0.7687, "grad_norm": 0.5446316003799438, "learning_rate": 0.0002, "epoch": 1.9173660426081343, "step": 5940}, {"loss": 0.7424, "grad_norm": 0.4958149194717407, "learning_rate": 0.0002, "epoch": 1.920593931568754, "step": 5950}, {"loss": 0.757, "grad_norm": 0.5623434782028198, "learning_rate": 0.0002, "epoch": 1.9238218205293738, "step": 5960}, {"loss": 0.7446, "grad_norm": 0.6855450868606567, "learning_rate": 0.0002, "epoch": 1.9270497094899937, "step": 5970}, {"loss": 0.827, "grad_norm": 0.5710492730140686, "learning_rate": 0.0002, "epoch": 1.9302775984506133, "step": 5980}, {"loss": 0.7245, "grad_norm": 0.5379431843757629, "learning_rate": 0.0002, "epoch": 1.933505487411233, "step": 5990}, {"loss": 0.77, "grad_norm": 0.557129442691803, "learning_rate": 0.0002, "epoch": 1.9367333763718528, "step": 6000}, {"loss": 0.6988, "grad_norm": 0.6336663961410522, "learning_rate": 0.0002, "epoch": 1.9399612653324727, "step": 6010}, {"loss": 0.7316, "grad_norm": 0.5950582027435303, "learning_rate": 0.0002, "epoch": 1.9431891542930924, "step": 6020}, {"loss": 0.7443, "grad_norm": 0.5905954837799072, "learning_rate": 0.0002, "epoch": 1.946417043253712, "step": 6030}, {"loss": 0.7127, "grad_norm": 0.6688982844352722, "learning_rate": 0.0002, "epoch": 1.9496449322143317, "step": 6040}, {"loss": 0.79, "grad_norm": 0.5440775752067566, "learning_rate": 0.0002, "epoch": 1.9528728211749515, "step": 6050}, {"loss": 0.7221, "grad_norm": 0.6207906603813171, "learning_rate": 0.0002, "epoch": 1.9561007101355714, "step": 6060}, {"loss": 0.738, "grad_norm": 0.6999374628067017, "learning_rate": 0.0002, "epoch": 1.959328599096191, "step": 6070}, {"loss": 0.7372, "grad_norm": 0.6310848593711853, "learning_rate": 0.0002, "epoch": 1.9625564880568107, "step": 6080}, {"loss": 0.7198, "grad_norm": 0.5903388261795044, "learning_rate": 0.0002, "epoch": 1.9657843770174306, "step": 6090}, {"loss": 0.7103, "grad_norm": 0.6333889961242676, "learning_rate": 0.0002, "epoch": 1.9690122659780505, "step": 6100}, {"loss": 0.7246, "grad_norm": 0.5604711174964905, "learning_rate": 0.0002, "epoch": 1.97224015493867, "step": 6110}, {"loss": 0.761, "grad_norm": 0.9234541654586792, "learning_rate": 0.0002, "epoch": 1.9754680438992898, "step": 6120}, {"loss": 0.7375, "grad_norm": 0.6149102449417114, "learning_rate": 0.0002, "epoch": 1.9786959328599096, "step": 6130}, {"loss": 0.7286, "grad_norm": 0.615446150302887, "learning_rate": 0.0002, "epoch": 1.9819238218205295, "step": 6140}, {"loss": 0.7333, "grad_norm": 0.5176635980606079, "learning_rate": 0.0002, "epoch": 1.9851517107811492, "step": 6150}, {"loss": 0.718, "grad_norm": 0.7124109864234924, "learning_rate": 0.0002, "epoch": 1.9883795997417688, "step": 6160}, {"loss": 0.7669, "grad_norm": 0.6317567825317383, "learning_rate": 0.0002, "epoch": 1.9916074887023887, "step": 6170}, {"loss": 0.8012, "grad_norm": 0.6855016350746155, "learning_rate": 0.0002, "epoch": 1.9948353776630086, "step": 6180}, {"loss": 0.7376, "grad_norm": 0.6423715353012085, "learning_rate": 0.0002, "epoch": 1.9980632666236282, "step": 6190}, {"eval_loss": 1.1096643209457397, "eval_runtime": 147.7997, "eval_samples_per_second": 4.959, "eval_steps_per_second": 0.622, "epoch": 2.0, "step": 6196}, {"loss": 0.7131, "grad_norm": 0.5322932600975037, "learning_rate": 0.0002, "epoch": 2.001291155584248, "step": 6200}, {"loss": 0.6619, "grad_norm": 0.8152306079864502, "learning_rate": 0.0002, "epoch": 2.0045190445448675, "step": 6210}, {"loss": 0.6731, "grad_norm": 0.6215983033180237, "learning_rate": 0.0002, "epoch": 2.0077469335054876, "step": 6220}, {"loss": 0.658, "grad_norm": 0.845498263835907, "learning_rate": 0.0002, "epoch": 2.0109748224661073, "step": 6230}, {"loss": 0.6954, "grad_norm": 0.733559787273407, "learning_rate": 0.0002, "epoch": 2.014202711426727, "step": 6240}, {"loss": 0.6707, "grad_norm": 0.51433926820755, "learning_rate": 0.0002, "epoch": 2.0174306003873466, "step": 6250}, {"loss": 0.6304, "grad_norm": 0.6374049782752991, "learning_rate": 0.0002, "epoch": 2.020658489347966, "step": 6260}, {"loss": 0.6831, "grad_norm": 0.7833638191223145, "learning_rate": 0.0002, "epoch": 2.0238863783085863, "step": 6270}, {"loss": 0.6672, "grad_norm": 0.8929463028907776, "learning_rate": 0.0002, "epoch": 2.027114267269206, "step": 6280}, {"loss": 0.637, "grad_norm": 0.669731855392456, "learning_rate": 0.0002, "epoch": 2.0303421562298256, "step": 6290}, {"loss": 0.646, "grad_norm": 0.5846071243286133, "learning_rate": 0.0002, "epoch": 2.0335700451904453, "step": 6300}, {"loss": 0.6647, "grad_norm": 0.7087787985801697, "learning_rate": 0.0002, "epoch": 2.0367979341510654, "step": 6310}, {"loss": 0.6433, "grad_norm": 0.6739160418510437, "learning_rate": 0.0002, "epoch": 2.040025823111685, "step": 6320}, {"loss": 0.6301, "grad_norm": 0.4860886335372925, "learning_rate": 0.0002, "epoch": 2.0432537120723047, "step": 6330}, {"loss": 0.6439, "grad_norm": 0.7201244831085205, "learning_rate": 0.0002, "epoch": 2.0464816010329243, "step": 6340}, {"loss": 0.6676, "grad_norm": 0.7409170269966125, "learning_rate": 0.0002, "epoch": 2.0497094899935444, "step": 6350}, {"loss": 0.6153, "grad_norm": 0.6843920350074768, "learning_rate": 0.0002, "epoch": 2.052937378954164, "step": 6360}, {"loss": 0.6674, "grad_norm": 0.7519999742507935, "learning_rate": 0.0002, "epoch": 2.0561652679147837, "step": 6370}, {"loss": 0.6928, "grad_norm": 0.5732819437980652, "learning_rate": 0.0002, "epoch": 2.0593931568754034, "step": 6380}, {"loss": 0.6496, "grad_norm": 0.7565118074417114, "learning_rate": 0.0002, "epoch": 2.062621045836023, "step": 6390}, {"loss": 0.6354, "grad_norm": 0.8147150278091431, "learning_rate": 0.0002, "epoch": 2.065848934796643, "step": 6400}, {"loss": 0.6593, "grad_norm": 0.6941924691200256, "learning_rate": 0.0002, "epoch": 2.0690768237572628, "step": 6410}, {"loss": 0.6698, "grad_norm": 0.6549784541130066, "learning_rate": 0.0002, "epoch": 2.0723047127178824, "step": 6420}, {"loss": 0.6927, "grad_norm": 0.7224905490875244, "learning_rate": 0.0002, "epoch": 2.075532601678502, "step": 6430}, {"loss": 0.6755, "grad_norm": 0.7754863500595093, "learning_rate": 0.0002, "epoch": 2.078760490639122, "step": 6440}, {"loss": 0.6738, "grad_norm": 0.691318154335022, "learning_rate": 0.0002, "epoch": 2.081988379599742, "step": 6450}, {"loss": 0.6233, "grad_norm": 0.6009294986724854, "learning_rate": 0.0002, "epoch": 2.0852162685603615, "step": 6460}, {"loss": 0.6691, "grad_norm": 0.6753945350646973, "learning_rate": 0.0002, "epoch": 2.088444157520981, "step": 6470}, {"loss": 0.6935, "grad_norm": 0.6899921298027039, "learning_rate": 0.0002, "epoch": 2.091672046481601, "step": 6480}, {"loss": 0.6918, "grad_norm": 0.846510648727417, "learning_rate": 0.0002, "epoch": 2.094899935442221, "step": 6490}, {"loss": 0.6084, "grad_norm": 0.6432605981826782, "learning_rate": 0.0002, "epoch": 2.0981278244028405, "step": 6500}, {"loss": 0.6867, "grad_norm": 0.8125239014625549, "learning_rate": 0.0002, "epoch": 2.10135571336346, "step": 6510}, {"loss": 0.6939, "grad_norm": 0.628302812576294, "learning_rate": 0.0002, "epoch": 2.1045836023240803, "step": 6520}, {"loss": 0.5909, "grad_norm": 0.7164334654808044, "learning_rate": 0.0002, "epoch": 2.1078114912847, "step": 6530}, {"loss": 0.6578, "grad_norm": 0.7476949095726013, "learning_rate": 0.0002, "epoch": 2.1110393802453196, "step": 6540}, {"loss": 0.6351, "grad_norm": 0.7577515840530396, "learning_rate": 0.0002, "epoch": 2.114267269205939, "step": 6550}, {"loss": 0.6669, "grad_norm": 0.5684467554092407, "learning_rate": 0.0002, "epoch": 2.117495158166559, "step": 6560}, {"loss": 0.6343, "grad_norm": 0.6121789216995239, "learning_rate": 0.0002, "epoch": 2.120723047127179, "step": 6570}, {"loss": 0.6314, "grad_norm": 0.6095348596572876, "learning_rate": 0.0002, "epoch": 2.1239509360877986, "step": 6580}, {"loss": 0.6276, "grad_norm": 0.7803651690483093, "learning_rate": 0.0002, "epoch": 2.1271788250484183, "step": 6590}, {"loss": 0.6579, "grad_norm": 0.5990583300590515, "learning_rate": 0.0002, "epoch": 2.130406714009038, "step": 6600}, {"loss": 0.6228, "grad_norm": 0.6569220423698425, "learning_rate": 0.0002, "epoch": 2.133634602969658, "step": 6610}, {"loss": 0.7049, "grad_norm": 0.5961166620254517, "learning_rate": 0.0002, "epoch": 2.1368624919302777, "step": 6620}, {"loss": 0.6359, "grad_norm": 0.5860554575920105, "learning_rate": 0.0002, "epoch": 2.1400903808908973, "step": 6630}, {"loss": 0.6651, "grad_norm": 0.5994001626968384, "learning_rate": 0.0002, "epoch": 2.143318269851517, "step": 6640}, {"loss": 0.6421, "grad_norm": 0.7723015546798706, "learning_rate": 0.0002, "epoch": 2.146546158812137, "step": 6650}, {"loss": 0.6723, "grad_norm": 0.676355242729187, "learning_rate": 0.0002, "epoch": 2.1497740477727567, "step": 6660}, {"loss": 0.6826, "grad_norm": 0.5689092874526978, "learning_rate": 0.0002, "epoch": 2.1530019367333764, "step": 6670}, {"loss": 0.6613, "grad_norm": 0.6933727264404297, "learning_rate": 0.0002, "epoch": 2.156229825693996, "step": 6680}, {"loss": 0.6957, "grad_norm": 0.8380527496337891, "learning_rate": 0.0002, "epoch": 2.159457714654616, "step": 6690}, {"loss": 0.6705, "grad_norm": 0.6876497268676758, "learning_rate": 0.0002, "epoch": 2.1626856036152358, "step": 6700}, {"loss": 0.6112, "grad_norm": 0.6418334245681763, "learning_rate": 0.0002, "epoch": 2.1659134925758554, "step": 6710}, {"loss": 0.6357, "grad_norm": 0.7169192433357239, "learning_rate": 0.0002, "epoch": 2.169141381536475, "step": 6720}, {"loss": 0.6492, "grad_norm": 0.6664170622825623, "learning_rate": 0.0002, "epoch": 2.1723692704970947, "step": 6730}, {"loss": 0.6751, "grad_norm": 0.6011993288993835, "learning_rate": 0.0002, "epoch": 2.175597159457715, "step": 6740}, {"loss": 0.696, "grad_norm": 0.5529947280883789, "learning_rate": 0.0002, "epoch": 2.1788250484183345, "step": 6750}, {"loss": 0.671, "grad_norm": 0.6879532933235168, "learning_rate": 0.0002, "epoch": 2.182052937378954, "step": 6760}, {"loss": 0.6634, "grad_norm": 0.6426113843917847, "learning_rate": 0.0002, "epoch": 2.1852808263395738, "step": 6770}, {"loss": 0.6592, "grad_norm": 0.6571047306060791, "learning_rate": 0.0002, "epoch": 2.188508715300194, "step": 6780}, {"loss": 0.6494, "grad_norm": 0.6400564908981323, "learning_rate": 0.0002, "epoch": 2.1917366042608135, "step": 6790}, {"loss": 0.6369, "grad_norm": 0.6509664058685303, "learning_rate": 0.0002, "epoch": 2.194964493221433, "step": 6800}, {"loss": 0.6771, "grad_norm": 0.6673197150230408, "learning_rate": 0.0002, "epoch": 2.198192382182053, "step": 6810}, {"loss": 0.6491, "grad_norm": 0.48205727338790894, "learning_rate": 0.0002, "epoch": 2.2014202711426725, "step": 6820}, {"loss": 0.6894, "grad_norm": 0.849525511264801, "learning_rate": 0.0002, "epoch": 2.2046481601032926, "step": 6830}, {"loss": 0.6977, "grad_norm": 0.6150892376899719, "learning_rate": 0.0002, "epoch": 2.207876049063912, "step": 6840}, {"loss": 0.6843, "grad_norm": 0.7826945781707764, "learning_rate": 0.0002, "epoch": 2.211103938024532, "step": 6850}, {"loss": 0.6338, "grad_norm": 0.5711963772773743, "learning_rate": 0.0002, "epoch": 2.2143318269851515, "step": 6860}, {"loss": 0.6585, "grad_norm": 0.6017758846282959, "learning_rate": 0.0002, "epoch": 2.2175597159457716, "step": 6870}, {"loss": 0.6657, "grad_norm": 0.785434901714325, "learning_rate": 0.0002, "epoch": 2.2207876049063913, "step": 6880}, {"loss": 0.7075, "grad_norm": 0.6251688599586487, "learning_rate": 0.0002, "epoch": 2.224015493867011, "step": 6890}, {"loss": 0.6564, "grad_norm": 0.8242034316062927, "learning_rate": 0.0002, "epoch": 2.2272433828276306, "step": 6900}, {"loss": 0.672, "grad_norm": 0.7272933125495911, "learning_rate": 0.0002, "epoch": 2.2304712717882507, "step": 6910}, {"loss": 0.6541, "grad_norm": 0.7159379720687866, "learning_rate": 0.0002, "epoch": 2.2336991607488703, "step": 6920}, {"loss": 0.6859, "grad_norm": 0.6518042087554932, "learning_rate": 0.0002, "epoch": 2.23692704970949, "step": 6930}, {"loss": 0.5987, "grad_norm": 0.7365370392799377, "learning_rate": 0.0002, "epoch": 2.2401549386701096, "step": 6940}, {"loss": 0.6511, "grad_norm": 0.5674061179161072, "learning_rate": 0.0002, "epoch": 2.2433828276307297, "step": 6950}, {"loss": 0.6748, "grad_norm": 0.669185996055603, "learning_rate": 0.0002, "epoch": 2.2466107165913494, "step": 6960}, {"loss": 0.656, "grad_norm": 0.6638304591178894, "learning_rate": 0.0002, "epoch": 2.249838605551969, "step": 6970}, {"loss": 0.636, "grad_norm": 0.757006824016571, "learning_rate": 0.0002, "epoch": 2.2530664945125887, "step": 6980}, {"loss": 0.6597, "grad_norm": 0.7574930787086487, "learning_rate": 0.0002, "epoch": 2.2562943834732083, "step": 6990}, {"loss": 0.6859, "grad_norm": 0.7819514870643616, "learning_rate": 0.0002, "epoch": 2.2595222724338284, "step": 7000}, {"loss": 0.6238, "grad_norm": 0.6987583041191101, "learning_rate": 0.0002, "epoch": 2.262750161394448, "step": 7010}, {"loss": 0.661, "grad_norm": 0.6628551483154297, "learning_rate": 0.0002, "epoch": 2.2659780503550677, "step": 7020}, {"loss": 0.6254, "grad_norm": 0.7855866551399231, "learning_rate": 0.0002, "epoch": 2.2692059393156874, "step": 7030}, {"loss": 0.6679, "grad_norm": 0.6102892756462097, "learning_rate": 0.0002, "epoch": 2.2724338282763075, "step": 7040}, {"loss": 0.694, "grad_norm": 0.7844198942184448, "learning_rate": 0.0002, "epoch": 2.275661717236927, "step": 7050}, {"loss": 0.63, "grad_norm": 0.6209492087364197, "learning_rate": 0.0002, "epoch": 2.2788896061975468, "step": 7060}, {"loss": 0.6418, "grad_norm": 0.8351290225982666, "learning_rate": 0.0002, "epoch": 2.2821174951581664, "step": 7070}, {"loss": 0.6648, "grad_norm": 0.6883546710014343, "learning_rate": 0.0002, "epoch": 2.285345384118786, "step": 7080}, {"loss": 0.7046, "grad_norm": 0.6626381874084473, "learning_rate": 0.0002, "epoch": 2.288573273079406, "step": 7090}, {"loss": 0.6535, "grad_norm": 0.7216270565986633, "learning_rate": 0.0002, "epoch": 2.291801162040026, "step": 7100}, {"loss": 0.6414, "grad_norm": 0.8246777057647705, "learning_rate": 0.0002, "epoch": 2.2950290510006455, "step": 7110}, {"loss": 0.6315, "grad_norm": 0.614326000213623, "learning_rate": 0.0002, "epoch": 2.2982569399612656, "step": 7120}, {"loss": 0.6303, "grad_norm": 0.8785578012466431, "learning_rate": 0.0002, "epoch": 2.301484828921885, "step": 7130}, {"loss": 0.6348, "grad_norm": 0.7021808624267578, "learning_rate": 0.0002, "epoch": 2.304712717882505, "step": 7140}, {"loss": 0.6738, "grad_norm": 0.6999403238296509, "learning_rate": 0.0002, "epoch": 2.3079406068431245, "step": 7150}, {"loss": 0.6547, "grad_norm": 0.8013143539428711, "learning_rate": 0.0002, "epoch": 2.311168495803744, "step": 7160}, {"loss": 0.6461, "grad_norm": 0.6592583060264587, "learning_rate": 0.0002, "epoch": 2.3143963847643643, "step": 7170}, {"loss": 0.6369, "grad_norm": 0.6260249018669128, "learning_rate": 0.0002, "epoch": 2.317624273724984, "step": 7180}, {"loss": 0.6647, "grad_norm": 0.9352797269821167, "learning_rate": 0.0002, "epoch": 2.3208521626856036, "step": 7190}, {"loss": 0.6543, "grad_norm": 0.6629612445831299, "learning_rate": 0.0002, "epoch": 2.324080051646223, "step": 7200}, {"loss": 0.6811, "grad_norm": 0.7062810063362122, "learning_rate": 0.0002, "epoch": 2.3273079406068433, "step": 7210}, {"loss": 0.67, "grad_norm": 0.7236241102218628, "learning_rate": 0.0002, "epoch": 2.330535829567463, "step": 7220}, {"loss": 0.6462, "grad_norm": 0.7528148293495178, "learning_rate": 0.0002, "epoch": 2.3337637185280826, "step": 7230}, {"loss": 0.694, "grad_norm": 0.7604748606681824, "learning_rate": 0.0002, "epoch": 2.3369916074887023, "step": 7240}, {"loss": 0.6475, "grad_norm": 0.5601189136505127, "learning_rate": 0.0002, "epoch": 2.340219496449322, "step": 7250}, {"loss": 0.6925, "grad_norm": 0.7099230885505676, "learning_rate": 0.0002, "epoch": 2.343447385409942, "step": 7260}, {"loss": 0.6333, "grad_norm": 0.6699047684669495, "learning_rate": 0.0002, "epoch": 2.3466752743705617, "step": 7270}, {"loss": 0.6434, "grad_norm": 0.7315047979354858, "learning_rate": 0.0002, "epoch": 2.3499031633311813, "step": 7280}, {"loss": 0.6927, "grad_norm": 0.632836103439331, "learning_rate": 0.0002, "epoch": 2.353131052291801, "step": 7290}, {"loss": 0.6458, "grad_norm": 0.9410115480422974, "learning_rate": 0.0002, "epoch": 2.356358941252421, "step": 7300}, {"loss": 0.6699, "grad_norm": 0.626554012298584, "learning_rate": 0.0002, "epoch": 2.3595868302130407, "step": 7310}, {"loss": 0.6495, "grad_norm": 0.7538444399833679, "learning_rate": 0.0002, "epoch": 2.3628147191736604, "step": 7320}, {"loss": 0.6321, "grad_norm": 0.6826626062393188, "learning_rate": 0.0002, "epoch": 2.36604260813428, "step": 7330}, {"loss": 0.6752, "grad_norm": 0.6739391088485718, "learning_rate": 0.0002, "epoch": 2.3692704970949, "step": 7340}, {"loss": 0.6518, "grad_norm": 0.7518446445465088, "learning_rate": 0.0002, "epoch": 2.3724983860555198, "step": 7350}, {"loss": 0.7142, "grad_norm": 0.714133083820343, "learning_rate": 0.0002, "epoch": 2.3757262750161394, "step": 7360}, {"loss": 0.6794, "grad_norm": 0.7144588232040405, "learning_rate": 0.0002, "epoch": 2.378954163976759, "step": 7370}, {"loss": 0.6922, "grad_norm": 0.6598120927810669, "learning_rate": 0.0002, "epoch": 2.382182052937379, "step": 7380}, {"loss": 0.6562, "grad_norm": 0.7079148292541504, "learning_rate": 0.0002, "epoch": 2.385409941897999, "step": 7390}, {"loss": 0.6492, "grad_norm": 0.6750902533531189, "learning_rate": 0.0002, "epoch": 2.3886378308586185, "step": 7400}, {"loss": 0.6398, "grad_norm": 0.7181967496871948, "learning_rate": 0.0002, "epoch": 2.391865719819238, "step": 7410}, {"loss": 0.6793, "grad_norm": 0.7720552086830139, "learning_rate": 0.0002, "epoch": 2.3950936087798578, "step": 7420}, {"loss": 0.6804, "grad_norm": 0.7592426538467407, "learning_rate": 0.0002, "epoch": 2.398321497740478, "step": 7430}, {"loss": 0.6667, "grad_norm": 0.7161896824836731, "learning_rate": 0.0002, "epoch": 2.4015493867010975, "step": 7440}, {"loss": 0.6891, "grad_norm": 0.8019260764122009, "learning_rate": 0.0002, "epoch": 2.404777275661717, "step": 7450}, {"loss": 0.6864, "grad_norm": 0.7093342542648315, "learning_rate": 0.0002, "epoch": 2.408005164622337, "step": 7460}, {"loss": 0.6445, "grad_norm": 0.8464207649230957, "learning_rate": 0.0002, "epoch": 2.411233053582957, "step": 7470}, {"loss": 0.6724, "grad_norm": 0.773666501045227, "learning_rate": 0.0002, "epoch": 2.4144609425435766, "step": 7480}, {"loss": 0.6774, "grad_norm": 0.8451611995697021, "learning_rate": 0.0002, "epoch": 2.4176888315041962, "step": 7490}, {"loss": 0.694, "grad_norm": 0.656795084476471, "learning_rate": 0.0002, "epoch": 2.420916720464816, "step": 7500}, {"loss": 0.6824, "grad_norm": 0.7129034996032715, "learning_rate": 0.0002, "epoch": 2.4241446094254355, "step": 7510}, {"loss": 0.711, "grad_norm": 0.8325763940811157, "learning_rate": 0.0002, "epoch": 2.4273724983860556, "step": 7520}, {"loss": 0.6238, "grad_norm": 0.7806527614593506, "learning_rate": 0.0002, "epoch": 2.4306003873466753, "step": 7530}, {"loss": 0.6972, "grad_norm": 0.6994536519050598, "learning_rate": 0.0002, "epoch": 2.433828276307295, "step": 7540}, {"loss": 0.6615, "grad_norm": 0.6898999214172363, "learning_rate": 0.0002, "epoch": 2.437056165267915, "step": 7550}, {"loss": 0.7108, "grad_norm": 0.719490647315979, "learning_rate": 0.0002, "epoch": 2.4402840542285347, "step": 7560}, {"loss": 0.668, "grad_norm": 0.6841562390327454, "learning_rate": 0.0002, "epoch": 2.4435119431891543, "step": 7570}, {"loss": 0.6504, "grad_norm": 0.7573311924934387, "learning_rate": 0.0002, "epoch": 2.446739832149774, "step": 7580}, {"loss": 0.6607, "grad_norm": 0.7295880317687988, "learning_rate": 0.0002, "epoch": 2.4499677211103936, "step": 7590}, {"loss": 0.6593, "grad_norm": 0.710136353969574, "learning_rate": 0.0002, "epoch": 2.4531956100710137, "step": 7600}, {"loss": 0.7137, "grad_norm": 0.6126235127449036, "learning_rate": 0.0002, "epoch": 2.4564234990316334, "step": 7610}, {"loss": 0.6562, "grad_norm": 0.8025609850883484, "learning_rate": 0.0002, "epoch": 2.459651387992253, "step": 7620}, {"loss": 0.6464, "grad_norm": 0.7839472889900208, "learning_rate": 0.0002, "epoch": 2.4628792769528727, "step": 7630}, {"loss": 0.6797, "grad_norm": 0.7253499031066895, "learning_rate": 0.0002, "epoch": 2.4661071659134928, "step": 7640}, {"loss": 0.7341, "grad_norm": 0.7918946743011475, "learning_rate": 0.0002, "epoch": 2.4693350548741124, "step": 7650}, {"loss": 0.6646, "grad_norm": 0.7930178046226501, "learning_rate": 0.0002, "epoch": 2.472562943834732, "step": 7660}, {"loss": 0.6294, "grad_norm": 0.6826170086860657, "learning_rate": 0.0002, "epoch": 2.4757908327953517, "step": 7670}, {"loss": 0.6697, "grad_norm": 0.6576805114746094, "learning_rate": 0.0002, "epoch": 2.4790187217559714, "step": 7680}, {"loss": 0.682, "grad_norm": 0.7012448310852051, "learning_rate": 0.0002, "epoch": 2.4822466107165915, "step": 7690}, {"loss": 0.6418, "grad_norm": 0.7774284482002258, "learning_rate": 0.0002, "epoch": 2.485474499677211, "step": 7700}, {"loss": 0.6566, "grad_norm": 0.6502766013145447, "learning_rate": 0.0002, "epoch": 2.4887023886378308, "step": 7710}, {"loss": 0.6965, "grad_norm": 0.7638739347457886, "learning_rate": 0.0002, "epoch": 2.4919302775984504, "step": 7720}, {"loss": 0.6454, "grad_norm": 0.6217384338378906, "learning_rate": 0.0002, "epoch": 2.4951581665590705, "step": 7730}, {"loss": 0.6837, "grad_norm": 0.7576302886009216, "learning_rate": 0.0002, "epoch": 2.49838605551969, "step": 7740}, {"loss": 0.6855, "grad_norm": 0.6877137422561646, "learning_rate": 0.0002, "epoch": 2.50161394448031, "step": 7750}, {"loss": 0.6604, "grad_norm": 0.6998329162597656, "learning_rate": 0.0002, "epoch": 2.5048418334409295, "step": 7760}, {"loss": 0.6666, "grad_norm": 0.7879213690757751, "learning_rate": 0.0002, "epoch": 2.508069722401549, "step": 7770}, {"loss": 0.715, "grad_norm": 0.7834980487823486, "learning_rate": 0.0002, "epoch": 2.5112976113621692, "step": 7780}, {"loss": 0.6954, "grad_norm": 0.7789630889892578, "learning_rate": 0.0002, "epoch": 2.514525500322789, "step": 7790}, {"loss": 0.6979, "grad_norm": 0.7403590083122253, "learning_rate": 0.0002, "epoch": 2.5177533892834085, "step": 7800}, {"loss": 0.6964, "grad_norm": 0.6029766201972961, "learning_rate": 0.0002, "epoch": 2.5209812782440286, "step": 7810}, {"loss": 0.6887, "grad_norm": 0.7061092257499695, "learning_rate": 0.0002, "epoch": 2.5242091672046483, "step": 7820}, {"loss": 0.6628, "grad_norm": 0.7120763659477234, "learning_rate": 0.0002, "epoch": 2.527437056165268, "step": 7830}, {"loss": 0.6876, "grad_norm": 0.6173675656318665, "learning_rate": 0.0002, "epoch": 2.5306649451258876, "step": 7840}, {"loss": 0.6635, "grad_norm": 0.9566813111305237, "learning_rate": 0.0002, "epoch": 2.5338928340865072, "step": 7850}, {"loss": 0.654, "grad_norm": 0.8497620224952698, "learning_rate": 0.0002, "epoch": 2.5371207230471273, "step": 7860}, {"loss": 0.644, "grad_norm": 0.7663498520851135, "learning_rate": 0.0002, "epoch": 2.540348612007747, "step": 7870}, {"loss": 0.6292, "grad_norm": 0.6329668760299683, "learning_rate": 0.0002, "epoch": 2.5435765009683666, "step": 7880}, {"loss": 0.686, "grad_norm": 0.8128195405006409, "learning_rate": 0.0002, "epoch": 2.5468043899289863, "step": 7890}, {"loss": 0.6619, "grad_norm": 0.6622284650802612, "learning_rate": 0.0002, "epoch": 2.5500322788896064, "step": 7900}, {"loss": 0.693, "grad_norm": 0.8460057973861694, "learning_rate": 0.0002, "epoch": 2.553260167850226, "step": 7910}, {"loss": 0.6619, "grad_norm": 0.6586956977844238, "learning_rate": 0.0002, "epoch": 2.5564880568108457, "step": 7920}, {"loss": 0.6976, "grad_norm": 0.7569382190704346, "learning_rate": 0.0002, "epoch": 2.5597159457714653, "step": 7930}, {"loss": 0.6235, "grad_norm": 0.6409714221954346, "learning_rate": 0.0002, "epoch": 2.562943834732085, "step": 7940}, {"loss": 0.6663, "grad_norm": 0.7031713128089905, "learning_rate": 0.0002, "epoch": 2.566171723692705, "step": 7950}, {"loss": 0.6344, "grad_norm": 0.7983605265617371, "learning_rate": 0.0002, "epoch": 2.5693996126533247, "step": 7960}, {"loss": 0.6834, "grad_norm": 0.7165433168411255, "learning_rate": 0.0002, "epoch": 2.5726275016139444, "step": 7970}, {"loss": 0.6517, "grad_norm": 0.6630598902702332, "learning_rate": 0.0002, "epoch": 2.5758553905745645, "step": 7980}, {"loss": 0.7164, "grad_norm": 0.5883122086524963, "learning_rate": 0.0002, "epoch": 2.579083279535184, "step": 7990}, {"loss": 0.6715, "grad_norm": 0.5928755402565002, "learning_rate": 0.0002, "epoch": 2.5823111684958038, "step": 8000}, {"loss": 0.6701, "grad_norm": 0.7843712568283081, "learning_rate": 0.0002, "epoch": 2.5855390574564234, "step": 8010}, {"loss": 0.6617, "grad_norm": 0.7206324338912964, "learning_rate": 0.0002, "epoch": 2.588766946417043, "step": 8020}, {"loss": 0.6968, "grad_norm": 0.812480092048645, "learning_rate": 0.0002, "epoch": 2.5919948353776627, "step": 8030}, {"loss": 0.6735, "grad_norm": 0.9843078255653381, "learning_rate": 0.0002, "epoch": 2.595222724338283, "step": 8040}, {"loss": 0.6877, "grad_norm": 0.7524392604827881, "learning_rate": 0.0002, "epoch": 2.5984506132989025, "step": 8050}, {"loss": 0.7188, "grad_norm": 0.6220380067825317, "learning_rate": 0.0002, "epoch": 2.601678502259522, "step": 8060}, {"loss": 0.6878, "grad_norm": 0.7461398243904114, "learning_rate": 0.0002, "epoch": 2.6049063912201422, "step": 8070}, {"loss": 0.6626, "grad_norm": 0.720974326133728, "learning_rate": 0.0002, "epoch": 2.608134280180762, "step": 8080}, {"loss": 0.6756, "grad_norm": 0.649509847164154, "learning_rate": 0.0002, "epoch": 2.6113621691413815, "step": 8090}, {"loss": 0.6394, "grad_norm": 0.6894662976264954, "learning_rate": 0.0002, "epoch": 2.614590058102001, "step": 8100}, {"loss": 0.6329, "grad_norm": 0.734433114528656, "learning_rate": 0.0002, "epoch": 2.617817947062621, "step": 8110}, {"loss": 0.6698, "grad_norm": 0.7468628883361816, "learning_rate": 0.0002, "epoch": 2.621045836023241, "step": 8120}, {"loss": 0.658, "grad_norm": 0.6508180499076843, "learning_rate": 0.0002, "epoch": 2.6242737249838606, "step": 8130}, {"loss": 0.6619, "grad_norm": 0.8735209107398987, "learning_rate": 0.0002, "epoch": 2.6275016139444802, "step": 8140}, {"loss": 0.6717, "grad_norm": 0.8162857294082642, "learning_rate": 0.0002, "epoch": 2.6307295029051003, "step": 8150}, {"loss": 0.6496, "grad_norm": 0.628872811794281, "learning_rate": 0.0002, "epoch": 2.63395739186572, "step": 8160}, {"loss": 0.6608, "grad_norm": 0.8078708052635193, "learning_rate": 0.0002, "epoch": 2.6371852808263396, "step": 8170}, {"loss": 0.6916, "grad_norm": 0.7849429845809937, "learning_rate": 0.0002, "epoch": 2.6404131697869593, "step": 8180}, {"loss": 0.6671, "grad_norm": 0.8115387558937073, "learning_rate": 0.0002, "epoch": 2.643641058747579, "step": 8190}, {"loss": 0.6761, "grad_norm": 0.7462222576141357, "learning_rate": 0.0002, "epoch": 2.6468689477081986, "step": 8200}, {"loss": 0.6923, "grad_norm": 0.753662645816803, "learning_rate": 0.0002, "epoch": 2.6500968366688187, "step": 8210}, {"loss": 0.6666, "grad_norm": 0.6100404858589172, "learning_rate": 0.0002, "epoch": 2.6533247256294383, "step": 8220}, {"loss": 0.7256, "grad_norm": 0.9084606766700745, "learning_rate": 0.0002, "epoch": 2.656552614590058, "step": 8230}, {"loss": 0.6385, "grad_norm": 0.6412538886070251, "learning_rate": 0.0002, "epoch": 2.659780503550678, "step": 8240}, {"loss": 0.7048, "grad_norm": 0.7640451192855835, "learning_rate": 0.0002, "epoch": 2.6630083925112977, "step": 8250}, {"loss": 0.6846, "grad_norm": 0.5972344875335693, "learning_rate": 0.0002, "epoch": 2.6662362814719174, "step": 8260}, {"loss": 0.682, "grad_norm": 0.6935883164405823, "learning_rate": 0.0002, "epoch": 2.669464170432537, "step": 8270}, {"loss": 0.6625, "grad_norm": 0.789399266242981, "learning_rate": 0.0002, "epoch": 2.6726920593931567, "step": 8280}, {"loss": 0.6541, "grad_norm": 0.7143490314483643, "learning_rate": 0.0002, "epoch": 2.675919948353777, "step": 8290}, {"loss": 0.6741, "grad_norm": 0.6670652627944946, "learning_rate": 0.0002, "epoch": 2.6791478373143964, "step": 8300}, {"loss": 0.6936, "grad_norm": 0.687108039855957, "learning_rate": 0.0002, "epoch": 2.682375726275016, "step": 8310}, {"loss": 0.7124, "grad_norm": 0.7914147973060608, "learning_rate": 0.0002, "epoch": 2.6856036152356357, "step": 8320}, {"loss": 0.6584, "grad_norm": 0.8398420214653015, "learning_rate": 0.0002, "epoch": 2.688831504196256, "step": 8330}, {"loss": 0.6679, "grad_norm": 0.6592720746994019, "learning_rate": 0.0002, "epoch": 2.6920593931568755, "step": 8340}, {"loss": 0.6673, "grad_norm": 0.6888470649719238, "learning_rate": 0.0002, "epoch": 2.695287282117495, "step": 8350}, {"loss": 0.6483, "grad_norm": 0.7127556800842285, "learning_rate": 0.0002, "epoch": 2.698515171078115, "step": 8360}, {"loss": 0.7013, "grad_norm": 0.6630286574363708, "learning_rate": 0.0002, "epoch": 2.7017430600387344, "step": 8370}, {"loss": 0.6842, "grad_norm": 0.8261964321136475, "learning_rate": 0.0002, "epoch": 2.7049709489993545, "step": 8380}, {"loss": 0.6613, "grad_norm": 0.717339813709259, "learning_rate": 0.0002, "epoch": 2.708198837959974, "step": 8390}, {"loss": 0.6929, "grad_norm": 0.651637613773346, "learning_rate": 0.0002, "epoch": 2.711426726920594, "step": 8400}, {"loss": 0.6796, "grad_norm": 0.7936098575592041, "learning_rate": 0.0002, "epoch": 2.714654615881214, "step": 8410}, {"loss": 0.696, "grad_norm": 0.8761560320854187, "learning_rate": 0.0002, "epoch": 2.7178825048418336, "step": 8420}, {"loss": 0.6889, "grad_norm": 0.6768006086349487, "learning_rate": 0.0002, "epoch": 2.7211103938024532, "step": 8430}, {"loss": 0.6844, "grad_norm": 0.7121055722236633, "learning_rate": 0.0002, "epoch": 2.724338282763073, "step": 8440}, {"loss": 0.6608, "grad_norm": 0.6811696887016296, "learning_rate": 0.0002, "epoch": 2.7275661717236925, "step": 8450}, {"loss": 0.7046, "grad_norm": 0.8168250918388367, "learning_rate": 0.0002, "epoch": 2.730794060684312, "step": 8460}, {"loss": 0.6809, "grad_norm": 0.660682737827301, "learning_rate": 0.0002, "epoch": 2.7340219496449323, "step": 8470}, {"loss": 0.6916, "grad_norm": 0.7369356155395508, "learning_rate": 0.0002, "epoch": 2.737249838605552, "step": 8480}, {"loss": 0.6383, "grad_norm": 0.7545099854469299, "learning_rate": 0.0002, "epoch": 2.7404777275661716, "step": 8490}, {"loss": 0.6917, "grad_norm": 0.6991257667541504, "learning_rate": 0.0002, "epoch": 2.7437056165267917, "step": 8500}, {"loss": 0.6953, "grad_norm": 0.7195324301719666, "learning_rate": 0.0002, "epoch": 2.7469335054874113, "step": 8510}, {"loss": 0.6955, "grad_norm": 0.8995378017425537, "learning_rate": 0.0002, "epoch": 2.750161394448031, "step": 8520}, {"loss": 0.684, "grad_norm": 0.6924123764038086, "learning_rate": 0.0002, "epoch": 2.7533892834086506, "step": 8530}, {"loss": 0.6675, "grad_norm": 0.6260585784912109, "learning_rate": 0.0002, "epoch": 2.7566171723692703, "step": 8540}, {"loss": 0.6613, "grad_norm": 0.7273091673851013, "learning_rate": 0.0002, "epoch": 2.7598450613298904, "step": 8550}, {"loss": 0.6853, "grad_norm": 0.720562219619751, "learning_rate": 0.0002, "epoch": 2.76307295029051, "step": 8560}, {"loss": 0.6452, "grad_norm": 0.6360004544258118, "learning_rate": 0.0002, "epoch": 2.7663008392511297, "step": 8570}, {"loss": 0.6118, "grad_norm": 0.7634525895118713, "learning_rate": 0.0002, "epoch": 2.76952872821175, "step": 8580}, {"loss": 0.686, "grad_norm": 0.6586076021194458, "learning_rate": 0.0002, "epoch": 2.7727566171723694, "step": 8590}, {"loss": 0.7072, "grad_norm": 0.6542639136314392, "learning_rate": 0.0002, "epoch": 2.775984506132989, "step": 8600}, {"loss": 0.7126, "grad_norm": 0.7650290727615356, "learning_rate": 0.0002, "epoch": 2.7792123950936087, "step": 8610}, {"loss": 0.6923, "grad_norm": 0.6551542282104492, "learning_rate": 0.0002, "epoch": 2.7824402840542284, "step": 8620}, {"loss": 0.6937, "grad_norm": 0.6915501952171326, "learning_rate": 0.0002, "epoch": 2.785668173014848, "step": 8630}, {"loss": 0.6586, "grad_norm": 0.8061493635177612, "learning_rate": 0.0002, "epoch": 2.788896061975468, "step": 8640}, {"loss": 0.6853, "grad_norm": 0.8403584957122803, "learning_rate": 0.0002, "epoch": 2.792123950936088, "step": 8650}, {"loss": 0.6616, "grad_norm": 0.6455532312393188, "learning_rate": 0.0002, "epoch": 2.7953518398967074, "step": 8660}, {"loss": 0.6819, "grad_norm": 0.8296352028846741, "learning_rate": 0.0002, "epoch": 2.7985797288573275, "step": 8670}, {"loss": 0.6678, "grad_norm": 0.7288752794265747, "learning_rate": 0.0002, "epoch": 2.801807617817947, "step": 8680}, {"loss": 0.6778, "grad_norm": 0.7628464102745056, "learning_rate": 0.0002, "epoch": 2.805035506778567, "step": 8690}, {"loss": 0.7176, "grad_norm": 0.9993878602981567, "learning_rate": 0.0002, "epoch": 2.8082633957391865, "step": 8700}, {"loss": 0.6414, "grad_norm": 0.6972465515136719, "learning_rate": 0.0002, "epoch": 2.811491284699806, "step": 8710}, {"loss": 0.6777, "grad_norm": 0.645042896270752, "learning_rate": 0.0002, "epoch": 2.8147191736604262, "step": 8720}, {"loss": 0.6587, "grad_norm": 0.6853853464126587, "learning_rate": 0.0002, "epoch": 2.817947062621046, "step": 8730}, {"loss": 0.6405, "grad_norm": 0.5935067534446716, "learning_rate": 0.0002, "epoch": 2.8211749515816655, "step": 8740}, {"loss": 0.6674, "grad_norm": 0.7336633205413818, "learning_rate": 0.0002, "epoch": 2.824402840542285, "step": 8750}, {"loss": 0.6662, "grad_norm": 0.7074962854385376, "learning_rate": 0.0002, "epoch": 2.8276307295029053, "step": 8760}, {"loss": 0.6744, "grad_norm": 0.6667559742927551, "learning_rate": 0.0002, "epoch": 2.830858618463525, "step": 8770}, {"loss": 0.7142, "grad_norm": 0.8101205229759216, "learning_rate": 0.0002, "epoch": 2.8340865074241446, "step": 8780}, {"loss": 0.6727, "grad_norm": 0.8841480016708374, "learning_rate": 0.0002, "epoch": 2.8373143963847642, "step": 8790}, {"loss": 0.6601, "grad_norm": 0.5891591310501099, "learning_rate": 0.0002, "epoch": 2.840542285345384, "step": 8800}, {"loss": 0.7114, "grad_norm": 0.667032778263092, "learning_rate": 0.0002, "epoch": 2.843770174306004, "step": 8810}, {"loss": 0.7295, "grad_norm": 0.7629773020744324, "learning_rate": 0.0002, "epoch": 2.8469980632666236, "step": 8820}, {"loss": 0.703, "grad_norm": 0.79471355676651, "learning_rate": 0.0002, "epoch": 2.8502259522272433, "step": 8830}, {"loss": 0.7278, "grad_norm": 0.7529178261756897, "learning_rate": 0.0002, "epoch": 2.8534538411878634, "step": 8840}, {"loss": 0.7163, "grad_norm": 0.7014923691749573, "learning_rate": 0.0002, "epoch": 2.856681730148483, "step": 8850}, {"loss": 0.6803, "grad_norm": 0.7996514439582825, "learning_rate": 0.0002, "epoch": 2.8599096191091027, "step": 8860}, {"loss": 0.6562, "grad_norm": 0.7044785618782043, "learning_rate": 0.0002, "epoch": 2.8631375080697223, "step": 8870}, {"loss": 0.6966, "grad_norm": 0.6792093515396118, "learning_rate": 0.0002, "epoch": 2.866365397030342, "step": 8880}, {"loss": 0.685, "grad_norm": 0.69175124168396, "learning_rate": 0.0002, "epoch": 2.8695932859909616, "step": 8890}, {"loss": 0.7225, "grad_norm": 0.7499129176139832, "learning_rate": 0.0002, "epoch": 2.8728211749515817, "step": 8900}, {"loss": 0.6922, "grad_norm": 0.7678789496421814, "learning_rate": 0.0002, "epoch": 2.8760490639122014, "step": 8910}, {"loss": 0.6803, "grad_norm": 0.7478128671646118, "learning_rate": 0.0002, "epoch": 2.879276952872821, "step": 8920}, {"loss": 0.6689, "grad_norm": 0.6767086386680603, "learning_rate": 0.0002, "epoch": 2.882504841833441, "step": 8930}, {"loss": 0.6587, "grad_norm": 0.7222196459770203, "learning_rate": 0.0002, "epoch": 2.885732730794061, "step": 8940}, {"loss": 0.6472, "grad_norm": 0.6950580477714539, "learning_rate": 0.0002, "epoch": 2.8889606197546804, "step": 8950}, {"loss": 0.7064, "grad_norm": 0.7759528160095215, "learning_rate": 0.0002, "epoch": 2.8921885087153, "step": 8960}, {"loss": 0.6349, "grad_norm": 0.6686919927597046, "learning_rate": 0.0002, "epoch": 2.8954163976759197, "step": 8970}, {"loss": 0.6801, "grad_norm": 0.9245954751968384, "learning_rate": 0.0002, "epoch": 2.89864428663654, "step": 8980}, {"loss": 0.6703, "grad_norm": 0.8734814524650574, "learning_rate": 0.0002, "epoch": 2.9018721755971595, "step": 8990}, {"loss": 0.6716, "grad_norm": 0.6056219339370728, "learning_rate": 0.0002, "epoch": 2.905100064557779, "step": 9000}, {"loss": 0.6535, "grad_norm": 0.7364102005958557, "learning_rate": 0.0002, "epoch": 2.9083279535183992, "step": 9010}, {"loss": 0.707, "grad_norm": 0.6563605070114136, "learning_rate": 0.0002, "epoch": 2.911555842479019, "step": 9020}, {"loss": 0.6564, "grad_norm": 0.659978985786438, "learning_rate": 0.0002, "epoch": 2.9147837314396385, "step": 9030}, {"loss": 0.7154, "grad_norm": 0.8176041841506958, "learning_rate": 0.0002, "epoch": 2.918011620400258, "step": 9040}, {"loss": 0.72, "grad_norm": 0.743677020072937, "learning_rate": 0.0002, "epoch": 2.921239509360878, "step": 9050}, {"loss": 0.7017, "grad_norm": 0.7418383359909058, "learning_rate": 0.0002, "epoch": 2.9244673983214975, "step": 9060}, {"loss": 0.6635, "grad_norm": 0.6916524767875671, "learning_rate": 0.0002, "epoch": 2.9276952872821176, "step": 9070}, {"loss": 0.6502, "grad_norm": 0.6559975743293762, "learning_rate": 0.0002, "epoch": 2.9309231762427372, "step": 9080}, {"loss": 0.7016, "grad_norm": 0.7431221008300781, "learning_rate": 0.0002, "epoch": 2.934151065203357, "step": 9090}, {"loss": 0.6829, "grad_norm": 0.7525941133499146, "learning_rate": 0.0002, "epoch": 2.937378954163977, "step": 9100}, {"loss": 0.7073, "grad_norm": 0.6860167384147644, "learning_rate": 0.0002, "epoch": 2.9406068431245966, "step": 9110}, {"loss": 0.6912, "grad_norm": 0.6467666029930115, "learning_rate": 0.0002, "epoch": 2.9438347320852163, "step": 9120}, {"loss": 0.7122, "grad_norm": 0.7595751285552979, "learning_rate": 0.0002, "epoch": 2.947062621045836, "step": 9130}, {"loss": 0.6951, "grad_norm": 0.6558279991149902, "learning_rate": 0.0002, "epoch": 2.9502905100064556, "step": 9140}, {"loss": 0.7081, "grad_norm": 0.6818708181381226, "learning_rate": 0.0002, "epoch": 2.9535183989670757, "step": 9150}, {"loss": 0.6921, "grad_norm": 0.8387085795402527, "learning_rate": 0.0002, "epoch": 2.9567462879276953, "step": 9160}, {"loss": 0.6914, "grad_norm": 0.7705109715461731, "learning_rate": 0.0002, "epoch": 2.959974176888315, "step": 9170}, {"loss": 0.6849, "grad_norm": 0.688106894493103, "learning_rate": 0.0002, "epoch": 2.9632020658489346, "step": 9180}, {"loss": 0.6833, "grad_norm": 0.659532368183136, "learning_rate": 0.0002, "epoch": 2.9664299548095547, "step": 9190}, {"loss": 0.6383, "grad_norm": 0.6839388608932495, "learning_rate": 0.0002, "epoch": 2.9696578437701744, "step": 9200}, {"loss": 0.6952, "grad_norm": 0.6927599310874939, "learning_rate": 0.0002, "epoch": 2.972885732730794, "step": 9210}, {"loss": 0.7338, "grad_norm": 0.6902472972869873, "learning_rate": 0.0002, "epoch": 2.9761136216914137, "step": 9220}, {"loss": 0.6671, "grad_norm": 0.620399534702301, "learning_rate": 0.0002, "epoch": 2.9793415106520333, "step": 9230}, {"loss": 0.6588, "grad_norm": 0.6812364459037781, "learning_rate": 0.0002, "epoch": 2.9825693996126534, "step": 9240}, {"loss": 0.6957, "grad_norm": 0.7681456208229065, "learning_rate": 0.0002, "epoch": 2.985797288573273, "step": 9250}, {"loss": 0.7113, "grad_norm": 0.7621907591819763, "learning_rate": 0.0002, "epoch": 2.9890251775338927, "step": 9260}, {"loss": 0.6601, "grad_norm": 0.6075740456581116, "learning_rate": 0.0002, "epoch": 2.992253066494513, "step": 9270}, {"loss": 0.6758, "grad_norm": 0.7100434899330139, "learning_rate": 0.0002, "epoch": 2.9954809554551325, "step": 9280}, {"loss": 0.73, "grad_norm": 0.7314488887786865, "learning_rate": 0.0002, "epoch": 2.998708844415752, "step": 9290}, {"eval_loss": 1.1434104442596436, "eval_runtime": 166.3732, "eval_samples_per_second": 4.406, "eval_steps_per_second": 0.553, "epoch": 3.0, "step": 9294}, {"loss": 0.6401, "grad_norm": 0.7408893704414368, "learning_rate": 0.0002, "epoch": 3.001936733376372, "step": 9300}, {"loss": 0.5182, "grad_norm": 0.9773574471473694, "learning_rate": 0.0002, "epoch": 3.0051646223369914, "step": 9310}, {"loss": 0.5432, "grad_norm": 0.7919653058052063, "learning_rate": 0.0002, "epoch": 3.0083925112976115, "step": 9320}, {"loss": 0.6156, "grad_norm": 0.9139202833175659, "learning_rate": 0.0002, "epoch": 3.011620400258231, "step": 9330}, {"loss": 0.5736, "grad_norm": 0.8296737670898438, "learning_rate": 0.0002, "epoch": 3.014848289218851, "step": 9340}, {"loss": 0.5567, "grad_norm": 0.786868155002594, "learning_rate": 0.0002, "epoch": 3.0180761781794705, "step": 9350}, {"loss": 0.578, "grad_norm": 0.5928055644035339, "learning_rate": 0.0002, "epoch": 3.0213040671400906, "step": 9360}, {"loss": 0.5376, "grad_norm": 0.8785701394081116, "learning_rate": 0.0002, "epoch": 3.0245319561007102, "step": 9370}, {"loss": 0.5664, "grad_norm": 0.7978872060775757, "learning_rate": 0.0002, "epoch": 3.02775984506133, "step": 9380}, {"loss": 0.5797, "grad_norm": 0.7160913348197937, "learning_rate": 0.0002, "epoch": 3.0309877340219495, "step": 9390}, {"loss": 0.5777, "grad_norm": 0.904465913772583, "learning_rate": 0.0002, "epoch": 3.034215622982569, "step": 9400}, {"loss": 0.5518, "grad_norm": 0.7082195281982422, "learning_rate": 0.0002, "epoch": 3.0374435119431893, "step": 9410}, {"loss": 0.5434, "grad_norm": 0.9686778783798218, "learning_rate": 0.0002, "epoch": 3.040671400903809, "step": 9420}, {"loss": 0.5692, "grad_norm": 0.8788613677024841, "learning_rate": 0.0002, "epoch": 3.0438992898644286, "step": 9430}, {"loss": 0.5599, "grad_norm": 0.8217582106590271, "learning_rate": 0.0002, "epoch": 3.0471271788250482, "step": 9440}, {"loss": 0.5405, "grad_norm": 0.7380914092063904, "learning_rate": 0.0002, "epoch": 3.0503550677856683, "step": 9450}, {"loss": 0.6258, "grad_norm": 0.7339285612106323, "learning_rate": 0.0002, "epoch": 3.053582956746288, "step": 9460}, {"loss": 0.5646, "grad_norm": 0.7175183296203613, "learning_rate": 0.0002, "epoch": 3.0568108457069076, "step": 9470}, {"loss": 0.5667, "grad_norm": 0.8275379538536072, "learning_rate": 0.0002, "epoch": 3.0600387346675273, "step": 9480}, {"loss": 0.5868, "grad_norm": 0.6544256806373596, "learning_rate": 0.0002, "epoch": 3.0632666236281474, "step": 9490}, {"loss": 0.5365, "grad_norm": 0.8193472623825073, "learning_rate": 0.0002, "epoch": 3.066494512588767, "step": 9500}, {"loss": 0.5614, "grad_norm": 0.7967836856842041, "learning_rate": 0.0002, "epoch": 3.0697224015493867, "step": 9510}, {"loss": 0.5629, "grad_norm": 0.8788684010505676, "learning_rate": 0.0002, "epoch": 3.0729502905100063, "step": 9520}, {"loss": 0.5397, "grad_norm": 0.9410629868507385, "learning_rate": 0.0002, "epoch": 3.0761781794706264, "step": 9530}, {"loss": 0.5473, "grad_norm": 0.7448706030845642, "learning_rate": 0.0002, "epoch": 3.079406068431246, "step": 9540}, {"loss": 0.5774, "grad_norm": 0.9149372577667236, "learning_rate": 0.0002, "epoch": 3.0826339573918657, "step": 9550}, {"loss": 0.5347, "grad_norm": 0.7265563607215881, "learning_rate": 0.0002, "epoch": 3.0858618463524854, "step": 9560}, {"loss": 0.5487, "grad_norm": 1.0305068492889404, "learning_rate": 0.0002, "epoch": 3.089089735313105, "step": 9570}, {"loss": 0.5884, "grad_norm": 0.7987357974052429, "learning_rate": 0.0002, "epoch": 3.092317624273725, "step": 9580}, {"loss": 0.6216, "grad_norm": 0.7733123898506165, "learning_rate": 0.0002, "epoch": 3.095545513234345, "step": 9590}, {"loss": 0.5848, "grad_norm": 1.0438069105148315, "learning_rate": 0.0002, "epoch": 3.0987734021949644, "step": 9600}, {"loss": 0.5612, "grad_norm": 0.7951784729957581, "learning_rate": 0.0002, "epoch": 3.102001291155584, "step": 9610}, {"loss": 0.6184, "grad_norm": 0.7776783108711243, "learning_rate": 0.0002, "epoch": 3.105229180116204, "step": 9620}, {"loss": 0.5626, "grad_norm": 0.7060676217079163, "learning_rate": 0.0002, "epoch": 3.108457069076824, "step": 9630}, {"loss": 0.5731, "grad_norm": 0.871569037437439, "learning_rate": 0.0002, "epoch": 3.1116849580374435, "step": 9640}, {"loss": 0.5168, "grad_norm": 0.8873385787010193, "learning_rate": 0.0002, "epoch": 3.114912846998063, "step": 9650}, {"loss": 0.5985, "grad_norm": 0.750998318195343, "learning_rate": 0.0002, "epoch": 3.118140735958683, "step": 9660}, {"loss": 0.5741, "grad_norm": 0.8678529262542725, "learning_rate": 0.0002, "epoch": 3.121368624919303, "step": 9670}, {"loss": 0.5831, "grad_norm": 0.7706599235534668, "learning_rate": 0.0002, "epoch": 3.1245965138799225, "step": 9680}, {"loss": 0.6142, "grad_norm": 0.8317574858665466, "learning_rate": 0.0002, "epoch": 3.127824402840542, "step": 9690}, {"loss": 0.5634, "grad_norm": 0.801800012588501, "learning_rate": 0.0002, "epoch": 3.131052291801162, "step": 9700}, {"loss": 0.6044, "grad_norm": 0.8574623465538025, "learning_rate": 0.0002, "epoch": 3.134280180761782, "step": 9710}, {"loss": 0.6072, "grad_norm": 0.6556540727615356, "learning_rate": 0.0002, "epoch": 3.1375080697224016, "step": 9720}, {"loss": 0.6058, "grad_norm": 0.8555161952972412, "learning_rate": 0.0002, "epoch": 3.1407359586830212, "step": 9730}, {"loss": 0.6069, "grad_norm": 0.8825467824935913, "learning_rate": 0.0002, "epoch": 3.143963847643641, "step": 9740}, {"loss": 0.5689, "grad_norm": 0.8297156691551208, "learning_rate": 0.0002, "epoch": 3.147191736604261, "step": 9750}, {"loss": 0.5738, "grad_norm": 0.7710384726524353, "learning_rate": 0.0002, "epoch": 3.1504196255648806, "step": 9760}, {"loss": 0.571, "grad_norm": 0.8778039216995239, "learning_rate": 0.0002, "epoch": 3.1536475145255003, "step": 9770}, {"loss": 0.5913, "grad_norm": 0.9014058113098145, "learning_rate": 0.0002, "epoch": 3.15687540348612, "step": 9780}, {"loss": 0.5496, "grad_norm": 0.6856890320777893, "learning_rate": 0.0002, "epoch": 3.16010329244674, "step": 9790}, {"loss": 0.558, "grad_norm": 0.6520644426345825, "learning_rate": 0.0002, "epoch": 3.1633311814073597, "step": 9800}, {"loss": 0.6024, "grad_norm": 0.7250499129295349, "learning_rate": 0.0002, "epoch": 3.1665590703679793, "step": 9810}, {"loss": 0.5823, "grad_norm": 0.8331542015075684, "learning_rate": 0.0002, "epoch": 3.169786959328599, "step": 9820}, {"loss": 0.5803, "grad_norm": 0.8531261682510376, "learning_rate": 0.0002, "epoch": 3.1730148482892186, "step": 9830}, {"loss": 0.57, "grad_norm": 0.8997558355331421, "learning_rate": 0.0002, "epoch": 3.1762427372498387, "step": 9840}, {"loss": 0.5921, "grad_norm": 0.708335280418396, "learning_rate": 0.0002, "epoch": 3.1794706262104584, "step": 9850}, {"loss": 0.5997, "grad_norm": 1.0074886083602905, "learning_rate": 0.0002, "epoch": 3.182698515171078, "step": 9860}, {"loss": 0.573, "grad_norm": 1.0804681777954102, "learning_rate": 0.0002, "epoch": 3.1859264041316977, "step": 9870}, {"loss": 0.5527, "grad_norm": 0.9510730504989624, "learning_rate": 0.0002, "epoch": 3.189154293092318, "step": 9880}, {"loss": 0.6401, "grad_norm": 0.7211061716079712, "learning_rate": 0.0002, "epoch": 3.1923821820529374, "step": 9890}, {"loss": 0.5563, "grad_norm": 0.8767086267471313, "learning_rate": 0.0002, "epoch": 3.195610071013557, "step": 9900}, {"loss": 0.5747, "grad_norm": 0.8388153314590454, "learning_rate": 0.0002, "epoch": 3.1988379599741767, "step": 9910}, {"loss": 0.5681, "grad_norm": 0.8038473725318909, "learning_rate": 0.0002, "epoch": 3.202065848934797, "step": 9920}, {"loss": 0.5594, "grad_norm": 0.8187747001647949, "learning_rate": 0.0002, "epoch": 3.2052937378954165, "step": 9930}, {"loss": 0.5813, "grad_norm": 0.7427355051040649, "learning_rate": 0.0002, "epoch": 3.208521626856036, "step": 9940}, {"loss": 0.5709, "grad_norm": 0.8017025589942932, "learning_rate": 0.0002, "epoch": 3.211749515816656, "step": 9950}, {"loss": 0.6106, "grad_norm": 0.738595187664032, "learning_rate": 0.0002, "epoch": 3.214977404777276, "step": 9960}, {"loss": 0.6006, "grad_norm": 0.7521342039108276, "learning_rate": 0.0002, "epoch": 3.2182052937378955, "step": 9970}, {"loss": 0.5706, "grad_norm": 0.840329110622406, "learning_rate": 0.0002, "epoch": 3.221433182698515, "step": 9980}, {"loss": 0.5666, "grad_norm": 0.9809671640396118, "learning_rate": 0.0002, "epoch": 3.224661071659135, "step": 9990}, {"loss": 0.6223, "grad_norm": 0.8456943035125732, "learning_rate": 0.0002, "epoch": 3.2278889606197545, "step": 10000}, {"loss": 0.5798, "grad_norm": 0.8962995409965515, "learning_rate": 0.0002, "epoch": 3.2311168495803746, "step": 10010}, {"loss": 0.5399, "grad_norm": 0.6492817401885986, "learning_rate": 0.0002, "epoch": 3.2343447385409942, "step": 10020}, {"loss": 0.5678, "grad_norm": 1.0471255779266357, "learning_rate": 0.0002, "epoch": 3.237572627501614, "step": 10030}, {"loss": 0.5452, "grad_norm": 0.7995471358299255, "learning_rate": 0.0002, "epoch": 3.2408005164622335, "step": 10040}, {"loss": 0.615, "grad_norm": 0.7231964468955994, "learning_rate": 0.0002, "epoch": 3.2440284054228536, "step": 10050}, {"loss": 0.5586, "grad_norm": 0.639630138874054, "learning_rate": 0.0002, "epoch": 3.2472562943834733, "step": 10060}, {"loss": 0.6271, "grad_norm": 0.7957055568695068, "learning_rate": 0.0002, "epoch": 3.250484183344093, "step": 10070}, {"loss": 0.5845, "grad_norm": 0.7735482454299927, "learning_rate": 0.0002, "epoch": 3.2537120723047126, "step": 10080}, {"loss": 0.5791, "grad_norm": 0.8139488101005554, "learning_rate": 0.0002, "epoch": 3.2569399612653323, "step": 10090}, {"loss": 0.6049, "grad_norm": 0.8113240003585815, "learning_rate": 0.0002, "epoch": 3.2601678502259523, "step": 10100}, {"loss": 0.5617, "grad_norm": 0.7735909819602966, "learning_rate": 0.0002, "epoch": 3.263395739186572, "step": 10110}, {"loss": 0.5964, "grad_norm": 0.7760744094848633, "learning_rate": 0.0002, "epoch": 3.2666236281471916, "step": 10120}, {"loss": 0.5786, "grad_norm": 0.8078505396842957, "learning_rate": 0.0002, "epoch": 3.2698515171078113, "step": 10130}, {"loss": 0.5904, "grad_norm": 0.983648955821991, "learning_rate": 0.0002, "epoch": 3.2730794060684314, "step": 10140}, {"loss": 0.596, "grad_norm": 0.7131832242012024, "learning_rate": 0.0002, "epoch": 3.276307295029051, "step": 10150}, {"loss": 0.5986, "grad_norm": 0.924493134021759, "learning_rate": 0.0002, "epoch": 3.2795351839896707, "step": 10160}, {"loss": 0.5733, "grad_norm": 0.9371112585067749, "learning_rate": 0.0002, "epoch": 3.2827630729502904, "step": 10170}, {"loss": 0.5891, "grad_norm": 0.8989261388778687, "learning_rate": 0.0002, "epoch": 3.2859909619109104, "step": 10180}, {"loss": 0.6143, "grad_norm": 0.8130394816398621, "learning_rate": 0.0002, "epoch": 3.28921885087153, "step": 10190}, {"loss": 0.5555, "grad_norm": 0.9899941086769104, "learning_rate": 0.0002, "epoch": 3.2924467398321497, "step": 10200}, {"loss": 0.5899, "grad_norm": 1.007038950920105, "learning_rate": 0.0002, "epoch": 3.2956746287927694, "step": 10210}, {"loss": 0.5713, "grad_norm": 0.7465066313743591, "learning_rate": 0.0002, "epoch": 3.2989025177533895, "step": 10220}, {"loss": 0.6307, "grad_norm": 0.7202590703964233, "learning_rate": 0.0002, "epoch": 3.302130406714009, "step": 10230}, {"loss": 0.5659, "grad_norm": 0.6258249282836914, "learning_rate": 0.0002, "epoch": 3.305358295674629, "step": 10240}, {"loss": 0.5869, "grad_norm": 0.8996058702468872, "learning_rate": 0.0002, "epoch": 3.3085861846352485, "step": 10250}, {"loss": 0.5825, "grad_norm": 0.9550982713699341, "learning_rate": 0.0002, "epoch": 3.311814073595868, "step": 10260}, {"loss": 0.5602, "grad_norm": 0.7010059952735901, "learning_rate": 0.0002, "epoch": 3.315041962556488, "step": 10270}, {"loss": 0.5853, "grad_norm": 0.9639869332313538, "learning_rate": 0.0002, "epoch": 3.318269851517108, "step": 10280}, {"loss": 0.5362, "grad_norm": 1.0192502737045288, "learning_rate": 0.0002, "epoch": 3.3214977404777275, "step": 10290}, {"loss": 0.5605, "grad_norm": 0.7953670024871826, "learning_rate": 0.0002, "epoch": 3.324725629438347, "step": 10300}, {"loss": 0.6386, "grad_norm": 0.7436774969100952, "learning_rate": 0.0002, "epoch": 3.3279535183989672, "step": 10310}, {"loss": 0.5823, "grad_norm": 0.7846777439117432, "learning_rate": 0.0002, "epoch": 3.331181407359587, "step": 10320}, {"loss": 0.6119, "grad_norm": 0.8963494896888733, "learning_rate": 0.0002, "epoch": 3.3344092963202066, "step": 10330}, {"loss": 0.5872, "grad_norm": 0.6876392364501953, "learning_rate": 0.0002, "epoch": 3.337637185280826, "step": 10340}, {"loss": 0.6291, "grad_norm": 0.9161638021469116, "learning_rate": 0.0002, "epoch": 3.340865074241446, "step": 10350}, {"loss": 0.5955, "grad_norm": 0.8964458107948303, "learning_rate": 0.0002, "epoch": 3.344092963202066, "step": 10360}, {"loss": 0.5965, "grad_norm": 0.9052296280860901, "learning_rate": 0.0002, "epoch": 3.3473208521626856, "step": 10370}, {"loss": 0.5958, "grad_norm": 0.9292596578598022, "learning_rate": 0.0002, "epoch": 3.3505487411233053, "step": 10380}, {"loss": 0.5487, "grad_norm": 0.9605957269668579, "learning_rate": 0.0002, "epoch": 3.3537766300839253, "step": 10390}, {"loss": 0.6214, "grad_norm": 1.0198872089385986, "learning_rate": 0.0002, "epoch": 3.357004519044545, "step": 10400}, {"loss": 0.6053, "grad_norm": 0.7043630480766296, "learning_rate": 0.0002, "epoch": 3.3602324080051647, "step": 10410}, {"loss": 0.5451, "grad_norm": 1.0533326864242554, "learning_rate": 0.0002, "epoch": 3.3634602969657843, "step": 10420}, {"loss": 0.6134, "grad_norm": 0.7552485466003418, "learning_rate": 0.0002, "epoch": 3.366688185926404, "step": 10430}, {"loss": 0.631, "grad_norm": 0.692708432674408, "learning_rate": 0.0002, "epoch": 3.369916074887024, "step": 10440}, {"loss": 0.631, "grad_norm": 0.985952615737915, "learning_rate": 0.0002, "epoch": 3.3731439638476437, "step": 10450}, {"loss": 0.5689, "grad_norm": 0.6749676465988159, "learning_rate": 0.0002, "epoch": 3.3763718528082634, "step": 10460}, {"loss": 0.5724, "grad_norm": 0.9514535665512085, "learning_rate": 0.0002, "epoch": 3.379599741768883, "step": 10470}, {"loss": 0.5982, "grad_norm": 1.2681142091751099, "learning_rate": 0.0002, "epoch": 3.382827630729503, "step": 10480}, {"loss": 0.5778, "grad_norm": 1.031968355178833, "learning_rate": 0.0002, "epoch": 3.3860555196901228, "step": 10490}, {"loss": 0.5964, "grad_norm": 0.8061563968658447, "learning_rate": 0.0002, "epoch": 3.3892834086507424, "step": 10500}, {"loss": 0.6094, "grad_norm": 1.0515062808990479, "learning_rate": 0.0002, "epoch": 3.392511297611362, "step": 10510}, {"loss": 0.542, "grad_norm": 0.9055540561676025, "learning_rate": 0.0002, "epoch": 3.3957391865719817, "step": 10520}, {"loss": 0.6148, "grad_norm": 0.9318141341209412, "learning_rate": 0.0002, "epoch": 3.398967075532602, "step": 10530}, {"loss": 0.5722, "grad_norm": 0.8266817331314087, "learning_rate": 0.0002, "epoch": 3.4021949644932215, "step": 10540}, {"loss": 0.6015, "grad_norm": 1.2322112321853638, "learning_rate": 0.0002, "epoch": 3.405422853453841, "step": 10550}, {"loss": 0.6215, "grad_norm": 0.9535136818885803, "learning_rate": 0.0002, "epoch": 3.4086507424144608, "step": 10560}, {"loss": 0.561, "grad_norm": 0.9243819117546082, "learning_rate": 0.0002, "epoch": 3.411878631375081, "step": 10570}, {"loss": 0.5844, "grad_norm": 0.9011809825897217, "learning_rate": 0.0002, "epoch": 3.4151065203357005, "step": 10580}, {"loss": 0.6175, "grad_norm": 0.9923036694526672, "learning_rate": 0.0002, "epoch": 3.41833440929632, "step": 10590}, {"loss": 0.6033, "grad_norm": 0.8903067111968994, "learning_rate": 0.0002, "epoch": 3.42156229825694, "step": 10600}, {"loss": 0.5563, "grad_norm": 0.7101534605026245, "learning_rate": 0.0002, "epoch": 3.42479018721756, "step": 10610}, {"loss": 0.598, "grad_norm": 0.8186570405960083, "learning_rate": 0.0002, "epoch": 3.4280180761781796, "step": 10620}, {"loss": 0.5897, "grad_norm": 0.9480205774307251, "learning_rate": 0.0002, "epoch": 3.431245965138799, "step": 10630}, {"loss": 0.5798, "grad_norm": 1.1370961666107178, "learning_rate": 0.0002, "epoch": 3.434473854099419, "step": 10640}, {"loss": 0.5779, "grad_norm": 1.017669677734375, "learning_rate": 0.0002, "epoch": 3.437701743060039, "step": 10650}, {"loss": 0.5999, "grad_norm": 0.7625100016593933, "learning_rate": 0.0002, "epoch": 3.4409296320206586, "step": 10660}, {"loss": 0.5705, "grad_norm": 0.9288196563720703, "learning_rate": 0.0002, "epoch": 3.4441575209812783, "step": 10670}, {"loss": 0.6255, "grad_norm": 0.8800460696220398, "learning_rate": 0.0002, "epoch": 3.447385409941898, "step": 10680}, {"loss": 0.6245, "grad_norm": 0.7499661445617676, "learning_rate": 0.0002, "epoch": 3.4506132989025176, "step": 10690}, {"loss": 0.5979, "grad_norm": 0.8254973292350769, "learning_rate": 0.0002, "epoch": 3.4538411878631377, "step": 10700}, {"loss": 0.5742, "grad_norm": 0.8735857605934143, "learning_rate": 0.0002, "epoch": 3.4570690768237573, "step": 10710}, {"loss": 0.6356, "grad_norm": 0.9601819515228271, "learning_rate": 0.0002, "epoch": 3.460296965784377, "step": 10720}, {"loss": 0.5574, "grad_norm": 0.8031058311462402, "learning_rate": 0.0002, "epoch": 3.4635248547449966, "step": 10730}, {"loss": 0.6078, "grad_norm": 0.8039247393608093, "learning_rate": 0.0002, "epoch": 3.4667527437056167, "step": 10740}, {"loss": 0.593, "grad_norm": 0.8936953544616699, "learning_rate": 0.0002, "epoch": 3.4699806326662364, "step": 10750}, {"loss": 0.5971, "grad_norm": 0.8201186060905457, "learning_rate": 0.0002, "epoch": 3.473208521626856, "step": 10760}, {"loss": 0.5875, "grad_norm": 1.0064148902893066, "learning_rate": 0.0002, "epoch": 3.4764364105874757, "step": 10770}, {"loss": 0.5639, "grad_norm": 0.8617483377456665, "learning_rate": 0.0002, "epoch": 3.4796642995480953, "step": 10780}, {"loss": 0.6022, "grad_norm": 0.8532096147537231, "learning_rate": 0.0002, "epoch": 3.4828921885087154, "step": 10790}, {"loss": 0.5765, "grad_norm": 0.8646879196166992, "learning_rate": 0.0002, "epoch": 3.486120077469335, "step": 10800}, {"loss": 0.5799, "grad_norm": 0.7962660789489746, "learning_rate": 0.0002, "epoch": 3.4893479664299547, "step": 10810}, {"loss": 0.5398, "grad_norm": 0.9560028314590454, "learning_rate": 0.0002, "epoch": 3.492575855390575, "step": 10820}, {"loss": 0.6082, "grad_norm": 0.928439736366272, "learning_rate": 0.0002, "epoch": 3.4958037443511945, "step": 10830}, {"loss": 0.6112, "grad_norm": 0.8219282627105713, "learning_rate": 0.0002, "epoch": 3.499031633311814, "step": 10840}, {"loss": 0.6369, "grad_norm": 0.7918338179588318, "learning_rate": 0.0002, "epoch": 3.5022595222724338, "step": 10850}, {"loss": 0.6164, "grad_norm": 0.961295485496521, "learning_rate": 0.0002, "epoch": 3.5054874112330534, "step": 10860}, {"loss": 0.5534, "grad_norm": 1.0731624364852905, "learning_rate": 0.0002, "epoch": 3.5087153001936735, "step": 10870}, {"loss": 0.5829, "grad_norm": 0.9551863074302673, "learning_rate": 0.0002, "epoch": 3.511943189154293, "step": 10880}, {"loss": 0.5746, "grad_norm": 0.8409819602966309, "learning_rate": 0.0002, "epoch": 3.515171078114913, "step": 10890}, {"loss": 0.5813, "grad_norm": 0.7546320557594299, "learning_rate": 0.0002, "epoch": 3.5183989670755325, "step": 10900}, {"loss": 0.6184, "grad_norm": 0.7505252361297607, "learning_rate": 0.0002, "epoch": 3.5216268560361526, "step": 10910}, {"loss": 0.5649, "grad_norm": 0.7505561113357544, "learning_rate": 0.0002, "epoch": 3.524854744996772, "step": 10920}, {"loss": 0.6277, "grad_norm": 1.086177945137024, "learning_rate": 0.0002, "epoch": 3.528082633957392, "step": 10930}, {"loss": 0.5983, "grad_norm": 0.7721118330955505, "learning_rate": 0.0002, "epoch": 3.5313105229180115, "step": 10940}, {"loss": 0.5919, "grad_norm": 0.9567878246307373, "learning_rate": 0.0002, "epoch": 3.534538411878631, "step": 10950}, {"loss": 0.6261, "grad_norm": 0.8377360105514526, "learning_rate": 0.0002, "epoch": 3.5377663008392513, "step": 10960}, {"loss": 0.633, "grad_norm": 1.0174858570098877, "learning_rate": 0.0002, "epoch": 3.540994189799871, "step": 10970}, {"loss": 0.599, "grad_norm": 0.8164418935775757, "learning_rate": 0.0002, "epoch": 3.5442220787604906, "step": 10980}, {"loss": 0.5471, "grad_norm": 0.8959241509437561, "learning_rate": 0.0002, "epoch": 3.5474499677211107, "step": 10990}, {"loss": 0.6195, "grad_norm": 1.0154379606246948, "learning_rate": 0.0002, "epoch": 3.5506778566817303, "step": 11000}, {"loss": 0.5835, "grad_norm": 0.7812292575836182, "learning_rate": 0.0002, "epoch": 3.55390574564235, "step": 11010}, {"loss": 0.6052, "grad_norm": 0.9849029779434204, "learning_rate": 0.0002, "epoch": 3.5571336346029696, "step": 11020}, {"loss": 0.5689, "grad_norm": 0.8826184272766113, "learning_rate": 0.0002, "epoch": 3.5603615235635893, "step": 11030}, {"loss": 0.601, "grad_norm": 0.9039685726165771, "learning_rate": 0.0002, "epoch": 3.563589412524209, "step": 11040}, {"loss": 0.5996, "grad_norm": 0.9585249423980713, "learning_rate": 0.0002, "epoch": 3.566817301484829, "step": 11050}, {"loss": 0.5714, "grad_norm": 0.8083069324493408, "learning_rate": 0.0002, "epoch": 3.5700451904454487, "step": 11060}, {"loss": 0.6317, "grad_norm": 0.9528678059577942, "learning_rate": 0.0002, "epoch": 3.5732730794060683, "step": 11070}, {"loss": 0.6278, "grad_norm": 0.8297588229179382, "learning_rate": 0.0002, "epoch": 3.5765009683666884, "step": 11080}, {"loss": 0.5919, "grad_norm": 0.8191716074943542, "learning_rate": 0.0002, "epoch": 3.579728857327308, "step": 11090}, {"loss": 0.5971, "grad_norm": 0.8056275844573975, "learning_rate": 0.0002, "epoch": 3.5829567462879277, "step": 11100}, {"loss": 0.6325, "grad_norm": 0.701930582523346, "learning_rate": 0.0002, "epoch": 3.5861846352485474, "step": 11110}, {"loss": 0.6088, "grad_norm": 0.7644643187522888, "learning_rate": 0.0002, "epoch": 3.589412524209167, "step": 11120}, {"loss": 0.605, "grad_norm": 0.668004035949707, "learning_rate": 0.0002, "epoch": 3.592640413169787, "step": 11130}, {"loss": 0.5735, "grad_norm": 0.8849539756774902, "learning_rate": 0.0002, "epoch": 3.5958683021304068, "step": 11140}, {"loss": 0.6412, "grad_norm": 0.8123571276664734, "learning_rate": 0.0002, "epoch": 3.5990961910910264, "step": 11150}, {"loss": 0.5626, "grad_norm": 0.7591469287872314, "learning_rate": 0.0002, "epoch": 3.602324080051646, "step": 11160}, {"loss": 0.5668, "grad_norm": 0.776466965675354, "learning_rate": 0.0002, "epoch": 3.605551969012266, "step": 11170}, {"loss": 0.6631, "grad_norm": 0.9156150221824646, "learning_rate": 0.0002, "epoch": 3.608779857972886, "step": 11180}, {"loss": 0.5867, "grad_norm": 0.7517618536949158, "learning_rate": 0.0002, "epoch": 3.6120077469335055, "step": 11190}, {"loss": 0.5939, "grad_norm": 0.931239128112793, "learning_rate": 0.0002, "epoch": 3.615235635894125, "step": 11200}, {"loss": 0.5736, "grad_norm": 0.9107872843742371, "learning_rate": 0.0002, "epoch": 3.6184635248547448, "step": 11210}, {"loss": 0.5665, "grad_norm": 0.7624770998954773, "learning_rate": 0.0002, "epoch": 3.621691413815365, "step": 11220}, {"loss": 0.6033, "grad_norm": 0.8129580616950989, "learning_rate": 0.0002, "epoch": 3.6249193027759845, "step": 11230}, {"loss": 0.6192, "grad_norm": 0.7339836955070496, "learning_rate": 0.0002, "epoch": 3.628147191736604, "step": 11240}, {"loss": 0.5976, "grad_norm": 0.8901296854019165, "learning_rate": 0.0002, "epoch": 3.6313750806972243, "step": 11250}, {"loss": 0.5977, "grad_norm": 1.1374726295471191, "learning_rate": 0.0002, "epoch": 3.634602969657844, "step": 11260}, {"loss": 0.5859, "grad_norm": 0.7438275218009949, "learning_rate": 0.0002, "epoch": 3.6378308586184636, "step": 11270}, {"loss": 0.5757, "grad_norm": 0.808646559715271, "learning_rate": 0.0002, "epoch": 3.641058747579083, "step": 11280}, {"loss": 0.6244, "grad_norm": 1.091810941696167, "learning_rate": 0.0002, "epoch": 3.644286636539703, "step": 11290}, {"loss": 0.5957, "grad_norm": 0.8439257144927979, "learning_rate": 0.0002, "epoch": 3.6475145255003225, "step": 11300}, {"loss": 0.6115, "grad_norm": 0.9720633029937744, "learning_rate": 0.0002, "epoch": 3.6507424144609426, "step": 11310}, {"loss": 0.5942, "grad_norm": 0.738571047782898, "learning_rate": 0.0002, "epoch": 3.6539703034215623, "step": 11320}, {"loss": 0.6029, "grad_norm": 0.6961580514907837, "learning_rate": 0.0002, "epoch": 3.657198192382182, "step": 11330}, {"loss": 0.6226, "grad_norm": 0.8192131519317627, "learning_rate": 0.0002, "epoch": 3.660426081342802, "step": 11340}, {"loss": 0.6155, "grad_norm": 0.8367205858230591, "learning_rate": 0.0002, "epoch": 3.6636539703034217, "step": 11350}, {"loss": 0.586, "grad_norm": 0.7735666632652283, "learning_rate": 0.0002, "epoch": 3.6668818592640413, "step": 11360}, {"loss": 0.6113, "grad_norm": 0.6507132649421692, "learning_rate": 0.0002, "epoch": 3.670109748224661, "step": 11370}, {"loss": 0.6273, "grad_norm": 0.8271192312240601, "learning_rate": 0.0002, "epoch": 3.6733376371852806, "step": 11380}, {"loss": 0.5995, "grad_norm": 0.8724204301834106, "learning_rate": 0.0002, "epoch": 3.6765655261459007, "step": 11390}, {"loss": 0.6131, "grad_norm": 0.8448445200920105, "learning_rate": 0.0002, "epoch": 3.6797934151065204, "step": 11400}, {"loss": 0.5923, "grad_norm": 0.6756882071495056, "learning_rate": 0.0002, "epoch": 3.68302130406714, "step": 11410}, {"loss": 0.6443, "grad_norm": 0.7859625816345215, "learning_rate": 0.0002, "epoch": 3.68624919302776, "step": 11420}, {"loss": 0.6567, "grad_norm": 0.8929487466812134, "learning_rate": 0.0002, "epoch": 3.6894770819883798, "step": 11430}, {"loss": 0.6474, "grad_norm": 0.8163391351699829, "learning_rate": 0.0002, "epoch": 3.6927049709489994, "step": 11440}, {"loss": 0.6467, "grad_norm": 0.8948464393615723, "learning_rate": 0.0002, "epoch": 3.695932859909619, "step": 11450}, {"loss": 0.624, "grad_norm": 0.8654782176017761, "learning_rate": 0.0002, "epoch": 3.6991607488702387, "step": 11460}, {"loss": 0.6142, "grad_norm": 0.9514864683151245, "learning_rate": 0.0002, "epoch": 3.7023886378308584, "step": 11470}, {"loss": 0.606, "grad_norm": 0.7298579812049866, "learning_rate": 0.0002, "epoch": 3.7056165267914785, "step": 11480}, {"loss": 0.5853, "grad_norm": 0.9266309142112732, "learning_rate": 0.0002, "epoch": 3.708844415752098, "step": 11490}, {"loss": 0.6122, "grad_norm": 0.8608686923980713, "learning_rate": 0.0002, "epoch": 3.7120723047127178, "step": 11500}, {"loss": 0.6348, "grad_norm": 0.921788215637207, "learning_rate": 0.0002, "epoch": 3.715300193673338, "step": 11510}, {"loss": 0.6191, "grad_norm": 0.8537021279335022, "learning_rate": 0.0002, "epoch": 3.7185280826339575, "step": 11520}, {"loss": 0.6228, "grad_norm": 1.115194320678711, "learning_rate": 0.0002, "epoch": 3.721755971594577, "step": 11530}, {"loss": 0.5828, "grad_norm": 0.7614817023277283, "learning_rate": 0.0002, "epoch": 3.724983860555197, "step": 11540}, {"loss": 0.5776, "grad_norm": 0.871999204158783, "learning_rate": 0.0002, "epoch": 3.7282117495158165, "step": 11550}, {"loss": 0.5962, "grad_norm": 0.9668049812316895, "learning_rate": 0.0002, "epoch": 3.7314396384764366, "step": 11560}, {"loss": 0.5534, "grad_norm": 1.2185815572738647, "learning_rate": 0.0002, "epoch": 3.734667527437056, "step": 11570}, {"loss": 0.5936, "grad_norm": 0.8258453011512756, "learning_rate": 0.0002, "epoch": 3.737895416397676, "step": 11580}, {"loss": 0.5853, "grad_norm": 0.8708966374397278, "learning_rate": 0.0002, "epoch": 3.7411233053582955, "step": 11590}, {"loss": 0.5847, "grad_norm": 0.7784267663955688, "learning_rate": 0.0002, "epoch": 3.7443511943189156, "step": 11600}, {"loss": 0.6404, "grad_norm": 0.7504425048828125, "learning_rate": 0.0002, "epoch": 3.7475790832795353, "step": 11610}, {"loss": 0.5922, "grad_norm": 0.9144526124000549, "learning_rate": 0.0002, "epoch": 3.750806972240155, "step": 11620}, {"loss": 0.6425, "grad_norm": 0.922581672668457, "learning_rate": 0.0002, "epoch": 3.7540348612007746, "step": 11630}, {"loss": 0.6402, "grad_norm": 0.9348630905151367, "learning_rate": 0.0002, "epoch": 3.757262750161394, "step": 11640}, {"loss": 0.5852, "grad_norm": 1.0740231275558472, "learning_rate": 0.0002, "epoch": 3.7604906391220143, "step": 11650}, {"loss": 0.599, "grad_norm": 0.884830117225647, "learning_rate": 0.0002, "epoch": 3.763718528082634, "step": 11660}, {"loss": 0.5991, "grad_norm": 1.0256348848342896, "learning_rate": 0.0002, "epoch": 3.7669464170432536, "step": 11670}, {"loss": 0.626, "grad_norm": 0.6795592904090881, "learning_rate": 0.0002, "epoch": 3.7701743060038737, "step": 11680}, {"loss": 0.6241, "grad_norm": 0.9381206631660461, "learning_rate": 0.0002, "epoch": 3.7734021949644934, "step": 11690}, {"loss": 0.6054, "grad_norm": 0.7633092403411865, "learning_rate": 0.0002, "epoch": 3.776630083925113, "step": 11700}, {"loss": 0.5937, "grad_norm": 0.7506213188171387, "learning_rate": 0.0002, "epoch": 3.7798579728857327, "step": 11710}, {"loss": 0.5933, "grad_norm": 0.8182913064956665, "learning_rate": 0.0002, "epoch": 3.7830858618463523, "step": 11720}, {"loss": 0.6043, "grad_norm": 1.019322156906128, "learning_rate": 0.0002, "epoch": 3.786313750806972, "step": 11730}, {"loss": 0.633, "grad_norm": 0.8895221948623657, "learning_rate": 0.0002, "epoch": 3.789541639767592, "step": 11740}, {"loss": 0.6553, "grad_norm": 0.948847770690918, "learning_rate": 0.0002, "epoch": 3.7927695287282117, "step": 11750}, {"loss": 0.6265, "grad_norm": 0.9068999886512756, "learning_rate": 0.0002, "epoch": 3.7959974176888314, "step": 11760}, {"loss": 0.6163, "grad_norm": 0.7920539975166321, "learning_rate": 0.0002, "epoch": 3.7992253066494515, "step": 11770}, {"loss": 0.5964, "grad_norm": 0.8441922068595886, "learning_rate": 0.0002, "epoch": 3.802453195610071, "step": 11780}, {"loss": 0.6379, "grad_norm": 0.9258501529693604, "learning_rate": 0.0002, "epoch": 3.8056810845706908, "step": 11790}, {"loss": 0.6379, "grad_norm": 0.7354241609573364, "learning_rate": 0.0002, "epoch": 3.8089089735313104, "step": 11800}, {"loss": 0.6177, "grad_norm": 0.9494872689247131, "learning_rate": 0.0002, "epoch": 3.81213686249193, "step": 11810}, {"loss": 0.5931, "grad_norm": 0.8266556859016418, "learning_rate": 0.0002, "epoch": 3.81536475145255, "step": 11820}, {"loss": 0.641, "grad_norm": 0.7951219081878662, "learning_rate": 0.0002, "epoch": 3.81859264041317, "step": 11830}, {"loss": 0.5767, "grad_norm": 0.7688382267951965, "learning_rate": 0.0002, "epoch": 3.8218205293737895, "step": 11840}, {"loss": 0.6117, "grad_norm": 1.0917940139770508, "learning_rate": 0.0002, "epoch": 3.8250484183344096, "step": 11850}, {"loss": 0.5857, "grad_norm": 0.9880442023277283, "learning_rate": 0.0002, "epoch": 3.828276307295029, "step": 11860}, {"loss": 0.6579, "grad_norm": 0.8433151245117188, "learning_rate": 0.0002, "epoch": 3.831504196255649, "step": 11870}, {"loss": 0.5876, "grad_norm": 0.8691204786300659, "learning_rate": 0.0002, "epoch": 3.8347320852162685, "step": 11880}, {"loss": 0.6308, "grad_norm": 0.7698143124580383, "learning_rate": 0.0002, "epoch": 3.837959974176888, "step": 11890}, {"loss": 0.6531, "grad_norm": 0.8874883651733398, "learning_rate": 0.0002, "epoch": 3.841187863137508, "step": 11900}, {"loss": 0.6242, "grad_norm": 1.1209359169006348, "learning_rate": 0.0002, "epoch": 3.844415752098128, "step": 11910}, {"loss": 0.6415, "grad_norm": 0.7723544239997864, "learning_rate": 0.0002, "epoch": 3.8476436410587476, "step": 11920}, {"loss": 0.6091, "grad_norm": 0.8363937139511108, "learning_rate": 0.0002, "epoch": 3.850871530019367, "step": 11930}, {"loss": 0.6498, "grad_norm": 0.9209707975387573, "learning_rate": 0.0002, "epoch": 3.8540994189799873, "step": 11940}, {"loss": 0.6471, "grad_norm": 0.9456894993782043, "learning_rate": 0.0002, "epoch": 3.857327307940607, "step": 11950}, {"loss": 0.6432, "grad_norm": 1.5748413801193237, "learning_rate": 0.0002, "epoch": 3.8605551969012266, "step": 11960}, {"loss": 0.6197, "grad_norm": 0.9083569049835205, "learning_rate": 0.0002, "epoch": 3.8637830858618463, "step": 11970}, {"loss": 0.6593, "grad_norm": 0.7672823071479797, "learning_rate": 0.0002, "epoch": 3.867010974822466, "step": 11980}, {"loss": 0.6238, "grad_norm": 0.8647152185440063, "learning_rate": 0.0002, "epoch": 3.870238863783086, "step": 11990}, {"loss": 0.5755, "grad_norm": 0.9564255475997925, "learning_rate": 0.0002, "epoch": 3.8734667527437057, "step": 12000}, {"loss": 0.6321, "grad_norm": 0.773267924785614, "learning_rate": 0.0002, "epoch": 3.8766946417043253, "step": 12010}, {"loss": 0.6057, "grad_norm": 0.8030173182487488, "learning_rate": 0.0002, "epoch": 3.879922530664945, "step": 12020}, {"loss": 0.6194, "grad_norm": 0.8002150058746338, "learning_rate": 0.0002, "epoch": 3.883150419625565, "step": 12030}, {"loss": 0.6194, "grad_norm": 0.98802250623703, "learning_rate": 0.0002, "epoch": 3.8863783085861847, "step": 12040}, {"loss": 0.6026, "grad_norm": 0.7868124842643738, "learning_rate": 0.0002, "epoch": 3.8896061975468044, "step": 12050}, {"loss": 0.6303, "grad_norm": 0.932182788848877, "learning_rate": 0.0002, "epoch": 3.892834086507424, "step": 12060}, {"loss": 0.5863, "grad_norm": 0.8576806783676147, "learning_rate": 0.0002, "epoch": 3.8960619754680437, "step": 12070}, {"loss": 0.6079, "grad_norm": 0.8985713124275208, "learning_rate": 0.0002, "epoch": 3.8992898644286638, "step": 12080}, {"loss": 0.6449, "grad_norm": 0.7876521944999695, "learning_rate": 0.0002, "epoch": 3.9025177533892834, "step": 12090}, {"loss": 0.5655, "grad_norm": 0.773936927318573, "learning_rate": 0.0002, "epoch": 3.905745642349903, "step": 12100}, {"loss": 0.5765, "grad_norm": 0.7274761199951172, "learning_rate": 0.0002, "epoch": 3.908973531310523, "step": 12110}, {"loss": 0.6182, "grad_norm": 0.8625598549842834, "learning_rate": 0.0002, "epoch": 3.912201420271143, "step": 12120}, {"loss": 0.5855, "grad_norm": 0.8702362179756165, "learning_rate": 0.0002, "epoch": 3.9154293092317625, "step": 12130}, {"loss": 0.6493, "grad_norm": 0.912579357624054, "learning_rate": 0.0002, "epoch": 3.918657198192382, "step": 12140}, {"loss": 0.6341, "grad_norm": 0.8697066903114319, "learning_rate": 0.0002, "epoch": 3.9218850871530018, "step": 12150}, {"loss": 0.6037, "grad_norm": 1.005232572555542, "learning_rate": 0.0002, "epoch": 3.9251129761136214, "step": 12160}, {"loss": 0.621, "grad_norm": 0.793902575969696, "learning_rate": 0.0002, "epoch": 3.9283408650742415, "step": 12170}, {"loss": 0.599, "grad_norm": 0.7025905847549438, "learning_rate": 0.0002, "epoch": 3.931568754034861, "step": 12180}, {"loss": 0.6421, "grad_norm": 0.97635817527771, "learning_rate": 0.0002, "epoch": 3.934796642995481, "step": 12190}, {"loss": 0.6416, "grad_norm": 0.855417013168335, "learning_rate": 0.0002, "epoch": 3.938024531956101, "step": 12200}, {"loss": 0.5979, "grad_norm": 0.8841291666030884, "learning_rate": 0.0002, "epoch": 3.9412524209167206, "step": 12210}, {"loss": 0.5666, "grad_norm": 1.1762064695358276, "learning_rate": 0.0002, "epoch": 3.94448030987734, "step": 12220}, {"loss": 0.586, "grad_norm": 0.8393193483352661, "learning_rate": 0.0002, "epoch": 3.94770819883796, "step": 12230}, {"loss": 0.5738, "grad_norm": 0.9324905276298523, "learning_rate": 0.0002, "epoch": 3.9509360877985795, "step": 12240}, {"loss": 0.5954, "grad_norm": 0.8607982993125916, "learning_rate": 0.0002, "epoch": 3.9541639767591996, "step": 12250}, {"loss": 0.6277, "grad_norm": 0.8586681485176086, "learning_rate": 0.0002, "epoch": 3.9573918657198193, "step": 12260}, {"loss": 0.5841, "grad_norm": 1.1082909107208252, "learning_rate": 0.0002, "epoch": 3.960619754680439, "step": 12270}, {"loss": 0.6231, "grad_norm": 1.065027117729187, "learning_rate": 0.0002, "epoch": 3.963847643641059, "step": 12280}, {"loss": 0.5996, "grad_norm": 0.9544363021850586, "learning_rate": 0.0002, "epoch": 3.9670755326016787, "step": 12290}, {"loss": 0.6301, "grad_norm": 0.9008927345275879, "learning_rate": 0.0002, "epoch": 3.9703034215622983, "step": 12300}, {"loss": 0.6108, "grad_norm": 0.8717467188835144, "learning_rate": 0.0002, "epoch": 3.973531310522918, "step": 12310}, {"loss": 0.6465, "grad_norm": 0.9718339443206787, "learning_rate": 0.0002, "epoch": 3.9767591994835376, "step": 12320}, {"loss": 0.603, "grad_norm": 1.0362015962600708, "learning_rate": 0.0002, "epoch": 3.9799870884441573, "step": 12330}, {"loss": 0.6229, "grad_norm": 1.0844318866729736, "learning_rate": 0.0002, "epoch": 3.9832149774047774, "step": 12340}, {"loss": 0.6777, "grad_norm": 0.7506240606307983, "learning_rate": 0.0002, "epoch": 3.986442866365397, "step": 12350}, {"loss": 0.6076, "grad_norm": 1.005982756614685, "learning_rate": 0.0002, "epoch": 3.9896707553260167, "step": 12360}, {"loss": 0.5926, "grad_norm": 0.7566431164741516, "learning_rate": 0.0002, "epoch": 3.9928986442866368, "step": 12370}, {"loss": 0.653, "grad_norm": 0.8819181323051453, "learning_rate": 0.0002, "epoch": 3.9961265332472564, "step": 12380}, {"loss": 0.6197, "grad_norm": 0.884497880935669, "learning_rate": 0.0002, "epoch": 3.999354422207876, "step": 12390}, {"eval_loss": 1.1907150745391846, "eval_runtime": 161.5766, "eval_samples_per_second": 4.537, "eval_steps_per_second": 0.569, "epoch": 4.0, "step": 12392}, {"loss": 0.5203, "grad_norm": 1.0407241582870483, "learning_rate": 0.0002, "epoch": 4.002582311168496, "step": 12400}, {"loss": 0.4978, "grad_norm": 1.0199295282363892, "learning_rate": 0.0002, "epoch": 4.005810200129115, "step": 12410}, {"loss": 0.4985, "grad_norm": 0.8456302881240845, "learning_rate": 0.0002, "epoch": 4.009038089089735, "step": 12420}, {"loss": 0.4669, "grad_norm": 1.0621124505996704, "learning_rate": 0.0002, "epoch": 4.012265978050355, "step": 12430}, {"loss": 0.5277, "grad_norm": 0.8984712362289429, "learning_rate": 0.0002, "epoch": 4.015493867010975, "step": 12440}, {"loss": 0.5508, "grad_norm": 1.3785864114761353, "learning_rate": 0.0002, "epoch": 4.018721755971595, "step": 12450}, {"loss": 0.5244, "grad_norm": 0.7911781668663025, "learning_rate": 0.0002, "epoch": 4.0219496449322145, "step": 12460}, {"loss": 0.4746, "grad_norm": 1.0977907180786133, "learning_rate": 0.0002, "epoch": 4.025177533892834, "step": 12470}, {"loss": 0.4632, "grad_norm": 1.0664983987808228, "learning_rate": 0.0002, "epoch": 4.028405422853454, "step": 12480}, {"loss": 0.5151, "grad_norm": 1.0807124376296997, "learning_rate": 0.0002, "epoch": 4.0316333118140735, "step": 12490}, {"loss": 0.4712, "grad_norm": 1.2650192975997925, "learning_rate": 0.0002, "epoch": 4.034861200774693, "step": 12500}, {"loss": 0.5111, "grad_norm": 0.7164070010185242, "learning_rate": 0.0002, "epoch": 4.038089089735313, "step": 12510}, {"loss": 0.5015, "grad_norm": 1.0047489404678345, "learning_rate": 0.0002, "epoch": 4.041316978695932, "step": 12520}, {"loss": 0.5467, "grad_norm": 0.9303901791572571, "learning_rate": 0.0002, "epoch": 4.044544867656553, "step": 12530}, {"loss": 0.5165, "grad_norm": 1.0319702625274658, "learning_rate": 0.0002, "epoch": 4.047772756617173, "step": 12540}, {"loss": 0.4834, "grad_norm": 0.9549729228019714, "learning_rate": 0.0002, "epoch": 4.051000645577792, "step": 12550}, {"loss": 0.5235, "grad_norm": 0.7175564765930176, "learning_rate": 0.0002, "epoch": 4.054228534538412, "step": 12560}, {"loss": 0.5257, "grad_norm": 1.0622259378433228, "learning_rate": 0.0002, "epoch": 4.057456423499032, "step": 12570}, {"loss": 0.5098, "grad_norm": 1.172074556350708, "learning_rate": 0.0002, "epoch": 4.060684312459651, "step": 12580}, {"loss": 0.5112, "grad_norm": 0.9702366590499878, "learning_rate": 0.0002, "epoch": 4.063912201420271, "step": 12590}, {"loss": 0.5042, "grad_norm": 0.741511344909668, "learning_rate": 0.0002, "epoch": 4.0671400903808905, "step": 12600}, {"loss": 0.4996, "grad_norm": 0.8632621169090271, "learning_rate": 0.0002, "epoch": 4.070367979341511, "step": 12610}, {"loss": 0.4927, "grad_norm": 0.9695962071418762, "learning_rate": 0.0002, "epoch": 4.073595868302131, "step": 12620}, {"loss": 0.4618, "grad_norm": 0.9401052594184875, "learning_rate": 0.0002, "epoch": 4.07682375726275, "step": 12630}, {"loss": 0.4889, "grad_norm": 0.8068707585334778, "learning_rate": 0.0002, "epoch": 4.08005164622337, "step": 12640}, {"loss": 0.5046, "grad_norm": 0.9554762840270996, "learning_rate": 0.0002, "epoch": 4.08327953518399, "step": 12650}, {"loss": 0.5081, "grad_norm": 0.7637128233909607, "learning_rate": 0.0002, "epoch": 4.086507424144609, "step": 12660}, {"loss": 0.4997, "grad_norm": 0.6703744530677795, "learning_rate": 0.0002, "epoch": 4.089735313105229, "step": 12670}, {"loss": 0.4977, "grad_norm": 0.8623828887939453, "learning_rate": 0.0002, "epoch": 4.092963202065849, "step": 12680}, {"loss": 0.4616, "grad_norm": 0.8198223114013672, "learning_rate": 0.0002, "epoch": 4.096191091026468, "step": 12690}, {"loss": 0.5372, "grad_norm": 1.3449875116348267, "learning_rate": 0.0002, "epoch": 4.099418979987089, "step": 12700}, {"loss": 0.4782, "grad_norm": 0.8333606123924255, "learning_rate": 0.0002, "epoch": 4.1026468689477085, "step": 12710}, {"loss": 0.5135, "grad_norm": 1.1647733449935913, "learning_rate": 0.0002, "epoch": 4.105874757908328, "step": 12720}, {"loss": 0.5147, "grad_norm": 1.0560213327407837, "learning_rate": 0.0002, "epoch": 4.109102646868948, "step": 12730}, {"loss": 0.5244, "grad_norm": 0.9479449987411499, "learning_rate": 0.0002, "epoch": 4.112330535829567, "step": 12740}, {"loss": 0.4596, "grad_norm": 1.1634587049484253, "learning_rate": 0.0002, "epoch": 4.115558424790187, "step": 12750}, {"loss": 0.4966, "grad_norm": 0.813987672328949, "learning_rate": 0.0002, "epoch": 4.118786313750807, "step": 12760}, {"loss": 0.5133, "grad_norm": 0.968461275100708, "learning_rate": 0.0002, "epoch": 4.122014202711426, "step": 12770}, {"loss": 0.5113, "grad_norm": 0.9324830770492554, "learning_rate": 0.0002, "epoch": 4.125242091672046, "step": 12780}, {"loss": 0.5233, "grad_norm": 0.8313411474227905, "learning_rate": 0.0002, "epoch": 4.128469980632667, "step": 12790}, {"loss": 0.5169, "grad_norm": 1.0177634954452515, "learning_rate": 0.0002, "epoch": 4.131697869593286, "step": 12800}, {"loss": 0.4635, "grad_norm": 1.0890623331069946, "learning_rate": 0.0002, "epoch": 4.134925758553906, "step": 12810}, {"loss": 0.519, "grad_norm": 0.9131693840026855, "learning_rate": 0.0002, "epoch": 4.1381536475145255, "step": 12820}, {"loss": 0.5017, "grad_norm": 0.8400680422782898, "learning_rate": 0.0002, "epoch": 4.141381536475145, "step": 12830}, {"loss": 0.5195, "grad_norm": 0.8988795876502991, "learning_rate": 0.0002, "epoch": 4.144609425435765, "step": 12840}, {"loss": 0.5052, "grad_norm": 0.9224025011062622, "learning_rate": 0.0002, "epoch": 4.1478373143963845, "step": 12850}, {"loss": 0.5001, "grad_norm": 0.7453159689903259, "learning_rate": 0.0002, "epoch": 4.151065203357004, "step": 12860}, {"loss": 0.4874, "grad_norm": 0.9815868139266968, "learning_rate": 0.0002, "epoch": 4.154293092317625, "step": 12870}, {"loss": 0.5485, "grad_norm": 1.2542768716812134, "learning_rate": 0.0002, "epoch": 4.157520981278244, "step": 12880}, {"loss": 0.5287, "grad_norm": 1.0092132091522217, "learning_rate": 0.0002, "epoch": 4.160748870238864, "step": 12890}, {"loss": 0.5125, "grad_norm": 1.1836622953414917, "learning_rate": 0.0002, "epoch": 4.163976759199484, "step": 12900}, {"loss": 0.5089, "grad_norm": 0.7706810235977173, "learning_rate": 0.0002, "epoch": 4.167204648160103, "step": 12910}, {"loss": 0.5123, "grad_norm": 1.00058913230896, "learning_rate": 0.0002, "epoch": 4.170432537120723, "step": 12920}, {"loss": 0.5238, "grad_norm": 1.2326250076293945, "learning_rate": 0.0002, "epoch": 4.173660426081343, "step": 12930}, {"loss": 0.5405, "grad_norm": 0.8829123377799988, "learning_rate": 0.0002, "epoch": 4.176888315041962, "step": 12940}, {"loss": 0.517, "grad_norm": 0.936042845249176, "learning_rate": 0.0002, "epoch": 4.180116204002582, "step": 12950}, {"loss": 0.4991, "grad_norm": 0.9773517847061157, "learning_rate": 0.0002, "epoch": 4.183344092963202, "step": 12960}, {"loss": 0.5025, "grad_norm": 0.9786297678947449, "learning_rate": 0.0002, "epoch": 4.186571981923822, "step": 12970}, {"loss": 0.5276, "grad_norm": 0.7524558901786804, "learning_rate": 0.0002, "epoch": 4.189799870884442, "step": 12980}, {"loss": 0.5522, "grad_norm": 1.0107866525650024, "learning_rate": 0.0002, "epoch": 4.193027759845061, "step": 12990}, {"loss": 0.5304, "grad_norm": 1.0092947483062744, "learning_rate": 0.0002, "epoch": 4.196255648805681, "step": 13000}, {"loss": 0.5061, "grad_norm": 1.18181312084198, "learning_rate": 0.0002, "epoch": 4.199483537766301, "step": 13010}, {"loss": 0.512, "grad_norm": 0.8845750093460083, "learning_rate": 0.0002, "epoch": 4.20271142672692, "step": 13020}, {"loss": 0.5329, "grad_norm": 1.0789145231246948, "learning_rate": 0.0002, "epoch": 4.20593931568754, "step": 13030}, {"loss": 0.5001, "grad_norm": 0.9562082886695862, "learning_rate": 0.0002, "epoch": 4.2091672046481605, "step": 13040}, {"loss": 0.5211, "grad_norm": 0.875755786895752, "learning_rate": 0.0002, "epoch": 4.21239509360878, "step": 13050}, {"loss": 0.5162, "grad_norm": 1.0694596767425537, "learning_rate": 0.0002, "epoch": 4.2156229825694, "step": 13060}, {"loss": 0.4917, "grad_norm": 1.0053378343582153, "learning_rate": 0.0002, "epoch": 4.2188508715300195, "step": 13070}, {"loss": 0.542, "grad_norm": 1.1628689765930176, "learning_rate": 0.0002, "epoch": 4.222078760490639, "step": 13080}, {"loss": 0.4796, "grad_norm": 0.9455991983413696, "learning_rate": 0.0002, "epoch": 4.225306649451259, "step": 13090}, {"loss": 0.4802, "grad_norm": 0.9736765623092651, "learning_rate": 0.0002, "epoch": 4.228534538411878, "step": 13100}, {"loss": 0.5411, "grad_norm": 0.8653560876846313, "learning_rate": 0.0002, "epoch": 4.231762427372498, "step": 13110}, {"loss": 0.5347, "grad_norm": 0.9335988163948059, "learning_rate": 0.0002, "epoch": 4.234990316333118, "step": 13120}, {"loss": 0.5217, "grad_norm": 0.9102661609649658, "learning_rate": 0.0002, "epoch": 4.238218205293738, "step": 13130}, {"loss": 0.5531, "grad_norm": 1.0595461130142212, "learning_rate": 0.0002, "epoch": 4.241446094254358, "step": 13140}, {"loss": 0.517, "grad_norm": 0.8947662711143494, "learning_rate": 0.0002, "epoch": 4.244673983214978, "step": 13150}, {"loss": 0.5116, "grad_norm": 1.0835723876953125, "learning_rate": 0.0002, "epoch": 4.247901872175597, "step": 13160}, {"loss": 0.5212, "grad_norm": 0.8496462106704712, "learning_rate": 0.0002, "epoch": 4.251129761136217, "step": 13170}, {"loss": 0.5079, "grad_norm": 0.9395631551742554, "learning_rate": 0.0002, "epoch": 4.2543576500968365, "step": 13180}, {"loss": 0.5076, "grad_norm": 1.2939592599868774, "learning_rate": 0.0002, "epoch": 4.257585539057456, "step": 13190}, {"loss": 0.5209, "grad_norm": 0.9325923919677734, "learning_rate": 0.0002, "epoch": 4.260813428018076, "step": 13200}, {"loss": 0.4984, "grad_norm": 0.9220664501190186, "learning_rate": 0.0002, "epoch": 4.264041316978696, "step": 13210}, {"loss": 0.5553, "grad_norm": 0.9505137205123901, "learning_rate": 0.0002, "epoch": 4.267269205939316, "step": 13220}, {"loss": 0.5238, "grad_norm": 1.0713751316070557, "learning_rate": 0.0002, "epoch": 4.270497094899936, "step": 13230}, {"loss": 0.5478, "grad_norm": 0.8390375971794128, "learning_rate": 0.0002, "epoch": 4.273724983860555, "step": 13240}, {"loss": 0.5217, "grad_norm": 0.8943426012992859, "learning_rate": 0.0002, "epoch": 4.276952872821175, "step": 13250}, {"loss": 0.5486, "grad_norm": 0.9175868630409241, "learning_rate": 0.0002, "epoch": 4.280180761781795, "step": 13260}, {"loss": 0.5208, "grad_norm": 0.9969881176948547, "learning_rate": 0.0002, "epoch": 4.283408650742414, "step": 13270}, {"loss": 0.5376, "grad_norm": 1.2271877527236938, "learning_rate": 0.0002, "epoch": 4.286636539703034, "step": 13280}, {"loss": 0.4811, "grad_norm": 0.9463263154029846, "learning_rate": 0.0002, "epoch": 4.289864428663654, "step": 13290}, {"loss": 0.52, "grad_norm": 1.0306228399276733, "learning_rate": 0.0002, "epoch": 4.293092317624274, "step": 13300}, {"loss": 0.5092, "grad_norm": 0.8454763889312744, "learning_rate": 0.0002, "epoch": 4.296320206584894, "step": 13310}, {"loss": 0.5657, "grad_norm": 0.9843119978904724, "learning_rate": 0.0002, "epoch": 4.299548095545513, "step": 13320}, {"loss": 0.5407, "grad_norm": 1.0836851596832275, "learning_rate": 0.0002, "epoch": 4.302775984506133, "step": 13330}, {"loss": 0.5336, "grad_norm": 1.0719412565231323, "learning_rate": 0.0002, "epoch": 4.306003873466753, "step": 13340}, {"loss": 0.4798, "grad_norm": 0.9276487827301025, "learning_rate": 0.0002, "epoch": 4.309231762427372, "step": 13350}, {"loss": 0.5256, "grad_norm": 0.897072434425354, "learning_rate": 0.0002, "epoch": 4.312459651387992, "step": 13360}, {"loss": 0.5333, "grad_norm": 1.0493228435516357, "learning_rate": 0.0002, "epoch": 4.315687540348612, "step": 13370}, {"loss": 0.5218, "grad_norm": 0.9446353316307068, "learning_rate": 0.0002, "epoch": 4.318915429309232, "step": 13380}, {"loss": 0.4765, "grad_norm": 0.7765224575996399, "learning_rate": 0.0002, "epoch": 4.322143318269852, "step": 13390}, {"loss": 0.5907, "grad_norm": 0.9100048542022705, "learning_rate": 0.0002, "epoch": 4.3253712072304715, "step": 13400}, {"loss": 0.5393, "grad_norm": 1.0913089513778687, "learning_rate": 0.0002, "epoch": 4.328599096191091, "step": 13410}, {"loss": 0.494, "grad_norm": 0.9607733488082886, "learning_rate": 0.0002, "epoch": 4.331826985151711, "step": 13420}, {"loss": 0.5273, "grad_norm": 0.8774219155311584, "learning_rate": 0.0002, "epoch": 4.3350548741123305, "step": 13430}, {"loss": 0.5482, "grad_norm": 0.8366804122924805, "learning_rate": 0.0002, "epoch": 4.33828276307295, "step": 13440}, {"loss": 0.5487, "grad_norm": 1.034727931022644, "learning_rate": 0.0002, "epoch": 4.34151065203357, "step": 13450}, {"loss": 0.4995, "grad_norm": 0.942743182182312, "learning_rate": 0.0002, "epoch": 4.344738540994189, "step": 13460}, {"loss": 0.5222, "grad_norm": 0.7237029075622559, "learning_rate": 0.0002, "epoch": 4.347966429954809, "step": 13470}, {"loss": 0.5461, "grad_norm": 0.8216196894645691, "learning_rate": 0.0002, "epoch": 4.35119431891543, "step": 13480}, {"loss": 0.5104, "grad_norm": 1.031860113143921, "learning_rate": 0.0002, "epoch": 4.354422207876049, "step": 13490}, {"loss": 0.547, "grad_norm": 0.8880493640899658, "learning_rate": 0.0002, "epoch": 4.357650096836669, "step": 13500}, {"loss": 0.5259, "grad_norm": 0.8442490696907043, "learning_rate": 0.0002, "epoch": 4.360877985797289, "step": 13510}, {"loss": 0.5176, "grad_norm": 1.270971655845642, "learning_rate": 0.0002, "epoch": 4.364105874757908, "step": 13520}, {"loss": 0.5028, "grad_norm": 0.9657870531082153, "learning_rate": 0.0002, "epoch": 4.367333763718528, "step": 13530}, {"loss": 0.5136, "grad_norm": 0.7477133870124817, "learning_rate": 0.0002, "epoch": 4.3705616526791475, "step": 13540}, {"loss": 0.5483, "grad_norm": 1.0209243297576904, "learning_rate": 0.0002, "epoch": 4.373789541639767, "step": 13550}, {"loss": 0.4888, "grad_norm": 0.8714015483856201, "learning_rate": 0.0002, "epoch": 4.377017430600388, "step": 13560}, {"loss": 0.5428, "grad_norm": 1.0490189790725708, "learning_rate": 0.0002, "epoch": 4.380245319561007, "step": 13570}, {"loss": 0.5398, "grad_norm": 0.9454663991928101, "learning_rate": 0.0002, "epoch": 4.383473208521627, "step": 13580}, {"loss": 0.5072, "grad_norm": 1.154146432876587, "learning_rate": 0.0002, "epoch": 4.386701097482247, "step": 13590}, {"loss": 0.5096, "grad_norm": 1.155090570449829, "learning_rate": 0.0002, "epoch": 4.389928986442866, "step": 13600}, {"loss": 0.5679, "grad_norm": 0.9853842854499817, "learning_rate": 0.0002, "epoch": 4.393156875403486, "step": 13610}, {"loss": 0.4992, "grad_norm": 0.9265837669372559, "learning_rate": 0.0002, "epoch": 4.396384764364106, "step": 13620}, {"loss": 0.523, "grad_norm": 0.8367540240287781, "learning_rate": 0.0002, "epoch": 4.399612653324725, "step": 13630}, {"loss": 0.564, "grad_norm": 1.1453629732131958, "learning_rate": 0.0002, "epoch": 4.402840542285345, "step": 13640}, {"loss": 0.573, "grad_norm": 1.0856295824050903, "learning_rate": 0.0002, "epoch": 4.4060684312459655, "step": 13650}, {"loss": 0.5178, "grad_norm": 0.9284523129463196, "learning_rate": 0.0002, "epoch": 4.409296320206585, "step": 13660}, {"loss": 0.4862, "grad_norm": 0.9632299542427063, "learning_rate": 0.0002, "epoch": 4.412524209167205, "step": 13670}, {"loss": 0.5928, "grad_norm": 1.048524260520935, "learning_rate": 0.0002, "epoch": 4.415752098127824, "step": 13680}, {"loss": 0.5258, "grad_norm": 0.9787682294845581, "learning_rate": 0.0002, "epoch": 4.418979987088444, "step": 13690}, {"loss": 0.5513, "grad_norm": 1.0728684663772583, "learning_rate": 0.0002, "epoch": 4.422207876049064, "step": 13700}, {"loss": 0.5243, "grad_norm": 0.72867351770401, "learning_rate": 0.0002, "epoch": 4.425435765009683, "step": 13710}, {"loss": 0.5313, "grad_norm": 0.8932793736457825, "learning_rate": 0.0002, "epoch": 4.428663653970303, "step": 13720}, {"loss": 0.5156, "grad_norm": 1.098343849182129, "learning_rate": 0.0002, "epoch": 4.431891542930924, "step": 13730}, {"loss": 0.5342, "grad_norm": 0.9321235418319702, "learning_rate": 0.0002, "epoch": 4.435119431891543, "step": 13740}, {"loss": 0.5114, "grad_norm": 0.8868634104728699, "learning_rate": 0.0002, "epoch": 4.438347320852163, "step": 13750}, {"loss": 0.5284, "grad_norm": 1.200064778327942, "learning_rate": 0.0002, "epoch": 4.4415752098127825, "step": 13760}, {"loss": 0.5208, "grad_norm": 0.8968019485473633, "learning_rate": 0.0002, "epoch": 4.444803098773402, "step": 13770}, {"loss": 0.4979, "grad_norm": 0.9560935497283936, "learning_rate": 0.0002, "epoch": 4.448030987734022, "step": 13780}, {"loss": 0.5134, "grad_norm": 0.7985701560974121, "learning_rate": 0.0002, "epoch": 4.4512588766946415, "step": 13790}, {"loss": 0.5113, "grad_norm": 1.062540888786316, "learning_rate": 0.0002, "epoch": 4.454486765655261, "step": 13800}, {"loss": 0.525, "grad_norm": 1.0827109813690186, "learning_rate": 0.0002, "epoch": 4.457714654615881, "step": 13810}, {"loss": 0.5541, "grad_norm": 1.0853543281555176, "learning_rate": 0.0002, "epoch": 4.460942543576501, "step": 13820}, {"loss": 0.5381, "grad_norm": 1.0613641738891602, "learning_rate": 0.0002, "epoch": 4.464170432537121, "step": 13830}, {"loss": 0.5684, "grad_norm": 0.9037535190582275, "learning_rate": 0.0002, "epoch": 4.467398321497741, "step": 13840}, {"loss": 0.5112, "grad_norm": 0.9216223955154419, "learning_rate": 0.0002, "epoch": 4.47062621045836, "step": 13850}, {"loss": 0.5341, "grad_norm": 0.8952260613441467, "learning_rate": 0.0002, "epoch": 4.47385409941898, "step": 13860}, {"loss": 0.5026, "grad_norm": 0.9997953176498413, "learning_rate": 0.0002, "epoch": 4.4770819883796, "step": 13870}, {"loss": 0.5107, "grad_norm": 1.062458872795105, "learning_rate": 0.0002, "epoch": 4.480309877340219, "step": 13880}, {"loss": 0.5463, "grad_norm": 0.9185126423835754, "learning_rate": 0.0002, "epoch": 4.483537766300839, "step": 13890}, {"loss": 0.5181, "grad_norm": 1.2389954328536987, "learning_rate": 0.0002, "epoch": 4.486765655261459, "step": 13900}, {"loss": 0.5199, "grad_norm": 1.1632126569747925, "learning_rate": 0.0002, "epoch": 4.489993544222079, "step": 13910}, {"loss": 0.5128, "grad_norm": 1.0304487943649292, "learning_rate": 0.0002, "epoch": 4.493221433182699, "step": 13920}, {"loss": 0.5331, "grad_norm": 0.9144788384437561, "learning_rate": 0.0002, "epoch": 4.496449322143318, "step": 13930}, {"loss": 0.5312, "grad_norm": 1.0285682678222656, "learning_rate": 0.0002, "epoch": 4.499677211103938, "step": 13940}, {"loss": 0.554, "grad_norm": 1.1187206506729126, "learning_rate": 0.0002, "epoch": 4.502905100064558, "step": 13950}, {"loss": 0.5268, "grad_norm": 0.7917197942733765, "learning_rate": 0.0002, "epoch": 4.506132989025177, "step": 13960}, {"loss": 0.5227, "grad_norm": 0.8495619297027588, "learning_rate": 0.0002, "epoch": 4.509360877985797, "step": 13970}, {"loss": 0.4971, "grad_norm": 1.0450760126113892, "learning_rate": 0.0002, "epoch": 4.512588766946417, "step": 13980}, {"loss": 0.5402, "grad_norm": 1.0061010122299194, "learning_rate": 0.0002, "epoch": 4.515816655907037, "step": 13990}, {"loss": 0.527, "grad_norm": 1.0232428312301636, "learning_rate": 0.0002, "epoch": 4.519044544867657, "step": 14000}, {"loss": 0.5002, "grad_norm": 0.8734631538391113, "learning_rate": 0.0002, "epoch": 4.5222724338282765, "step": 14010}, {"loss": 0.5464, "grad_norm": 1.1085621118545532, "learning_rate": 0.0002, "epoch": 4.525500322788896, "step": 14020}, {"loss": 0.5167, "grad_norm": 0.9178624749183655, "learning_rate": 0.0002, "epoch": 4.528728211749516, "step": 14030}, {"loss": 0.5589, "grad_norm": 1.0687317848205566, "learning_rate": 0.0002, "epoch": 4.531956100710135, "step": 14040}, {"loss": 0.5576, "grad_norm": 0.9237300157546997, "learning_rate": 0.0002, "epoch": 4.535183989670755, "step": 14050}, {"loss": 0.5062, "grad_norm": 0.9667123556137085, "learning_rate": 0.0002, "epoch": 4.538411878631375, "step": 14060}, {"loss": 0.5645, "grad_norm": 1.1286747455596924, "learning_rate": 0.0002, "epoch": 4.541639767591995, "step": 14070}, {"loss": 0.5226, "grad_norm": 1.055392861366272, "learning_rate": 0.0002, "epoch": 4.544867656552615, "step": 14080}, {"loss": 0.5428, "grad_norm": 0.9492936134338379, "learning_rate": 0.0002, "epoch": 4.548095545513235, "step": 14090}, {"loss": 0.5559, "grad_norm": 0.9881349802017212, "learning_rate": 0.0002, "epoch": 4.551323434473854, "step": 14100}, {"loss": 0.5572, "grad_norm": 0.9389023184776306, "learning_rate": 0.0002, "epoch": 4.554551323434474, "step": 14110}, {"loss": 0.5511, "grad_norm": 0.8395606875419617, "learning_rate": 0.0002, "epoch": 4.5577792123950935, "step": 14120}, {"loss": 0.5696, "grad_norm": 0.9019067287445068, "learning_rate": 0.0002, "epoch": 4.561007101355713, "step": 14130}, {"loss": 0.5564, "grad_norm": 1.1058136224746704, "learning_rate": 0.0002, "epoch": 4.564234990316333, "step": 14140}, {"loss": 0.5323, "grad_norm": 1.0683821439743042, "learning_rate": 0.0002, "epoch": 4.5674628792769525, "step": 14150}, {"loss": 0.5527, "grad_norm": 1.3398395776748657, "learning_rate": 0.0002, "epoch": 4.570690768237572, "step": 14160}, {"loss": 0.4713, "grad_norm": 0.7829096913337708, "learning_rate": 0.0002, "epoch": 4.573918657198193, "step": 14170}, {"loss": 0.525, "grad_norm": 0.9636675119400024, "learning_rate": 0.0002, "epoch": 4.577146546158812, "step": 14180}, {"loss": 0.5458, "grad_norm": 1.0291401147842407, "learning_rate": 0.0002, "epoch": 4.580374435119432, "step": 14190}, {"loss": 0.5366, "grad_norm": 1.0894310474395752, "learning_rate": 0.0002, "epoch": 4.583602324080052, "step": 14200}, {"loss": 0.5125, "grad_norm": 1.111573576927185, "learning_rate": 0.0002, "epoch": 4.586830213040671, "step": 14210}, {"loss": 0.5444, "grad_norm": 0.9345336556434631, "learning_rate": 0.0002, "epoch": 4.590058102001291, "step": 14220}, {"loss": 0.5175, "grad_norm": 1.3338757753372192, "learning_rate": 0.0002, "epoch": 4.593285990961911, "step": 14230}, {"loss": 0.5227, "grad_norm": 1.1146448850631714, "learning_rate": 0.0002, "epoch": 4.596513879922531, "step": 14240}, {"loss": 0.543, "grad_norm": 1.1576755046844482, "learning_rate": 0.0002, "epoch": 4.599741768883151, "step": 14250}, {"loss": 0.5315, "grad_norm": 0.6851092576980591, "learning_rate": 0.0002, "epoch": 4.60296965784377, "step": 14260}, {"loss": 0.5027, "grad_norm": 0.9067938923835754, "learning_rate": 0.0002, "epoch": 4.60619754680439, "step": 14270}, {"loss": 0.5237, "grad_norm": 0.8767340183258057, "learning_rate": 0.0002, "epoch": 4.60942543576501, "step": 14280}, {"loss": 0.5294, "grad_norm": 1.024880290031433, "learning_rate": 0.0002, "epoch": 4.612653324725629, "step": 14290}, {"loss": 0.5371, "grad_norm": 0.9226394891738892, "learning_rate": 0.0002, "epoch": 4.615881213686249, "step": 14300}, {"loss": 0.5281, "grad_norm": 1.018187165260315, "learning_rate": 0.0002, "epoch": 4.619109102646869, "step": 14310}, {"loss": 0.5546, "grad_norm": 0.8851249814033508, "learning_rate": 0.0002, "epoch": 4.622336991607488, "step": 14320}, {"loss": 0.5206, "grad_norm": 0.745798647403717, "learning_rate": 0.0002, "epoch": 4.625564880568108, "step": 14330}, {"loss": 0.5531, "grad_norm": 1.2082698345184326, "learning_rate": 0.0002, "epoch": 4.6287927695287285, "step": 14340}, {"loss": 0.5449, "grad_norm": 0.901454508304596, "learning_rate": 0.0002, "epoch": 4.632020658489348, "step": 14350}, {"loss": 0.5433, "grad_norm": 0.9593124985694885, "learning_rate": 0.0002, "epoch": 4.635248547449968, "step": 14360}, {"loss": 0.4939, "grad_norm": 1.1241410970687866, "learning_rate": 0.0002, "epoch": 4.6384764364105875, "step": 14370}, {"loss": 0.5319, "grad_norm": 0.9221102595329285, "learning_rate": 0.0002, "epoch": 4.641704325371207, "step": 14380}, {"loss": 0.524, "grad_norm": 1.0035039186477661, "learning_rate": 0.0002, "epoch": 4.644932214331827, "step": 14390}, {"loss": 0.5617, "grad_norm": 1.1270662546157837, "learning_rate": 0.0002, "epoch": 4.648160103292446, "step": 14400}, {"loss": 0.5663, "grad_norm": 0.8631120324134827, "learning_rate": 0.0002, "epoch": 4.651387992253067, "step": 14410}, {"loss": 0.5705, "grad_norm": 1.0604606866836548, "learning_rate": 0.0002, "epoch": 4.654615881213687, "step": 14420}, {"loss": 0.5307, "grad_norm": 0.8002706170082092, "learning_rate": 0.0002, "epoch": 4.657843770174306, "step": 14430}, {"loss": 0.5459, "grad_norm": 1.0642075538635254, "learning_rate": 0.0002, "epoch": 4.661071659134926, "step": 14440}, {"loss": 0.5497, "grad_norm": 0.9315671324729919, "learning_rate": 0.0002, "epoch": 4.664299548095546, "step": 14450}, {"loss": 0.5542, "grad_norm": 0.8311864137649536, "learning_rate": 0.0002, "epoch": 4.667527437056165, "step": 14460}, {"loss": 0.5533, "grad_norm": 0.8900430202484131, "learning_rate": 0.0002, "epoch": 4.670755326016785, "step": 14470}, {"loss": 0.5086, "grad_norm": 1.059267282485962, "learning_rate": 0.0002, "epoch": 4.6739832149774045, "step": 14480}, {"loss": 0.5583, "grad_norm": 0.9864052534103394, "learning_rate": 0.0002, "epoch": 4.677211103938024, "step": 14490}, {"loss": 0.5737, "grad_norm": 1.210854411125183, "learning_rate": 0.0002, "epoch": 4.680438992898644, "step": 14500}, {"loss": 0.536, "grad_norm": 1.030693769454956, "learning_rate": 0.0002, "epoch": 4.683666881859264, "step": 14510}, {"loss": 0.544, "grad_norm": 0.9809406995773315, "learning_rate": 0.0002, "epoch": 4.686894770819884, "step": 14520}, {"loss": 0.5522, "grad_norm": 1.0471004247665405, "learning_rate": 0.0002, "epoch": 4.690122659780504, "step": 14530}, {"loss": 0.5613, "grad_norm": 1.1583727598190308, "learning_rate": 0.0002, "epoch": 4.693350548741123, "step": 14540}, {"loss": 0.5608, "grad_norm": 0.9664418697357178, "learning_rate": 0.0002, "epoch": 4.696578437701743, "step": 14550}, {"loss": 0.5624, "grad_norm": 0.9511209726333618, "learning_rate": 0.0002, "epoch": 4.699806326662363, "step": 14560}, {"loss": 0.5806, "grad_norm": 1.0211684703826904, "learning_rate": 0.0002, "epoch": 4.703034215622982, "step": 14570}, {"loss": 0.5536, "grad_norm": 1.097276210784912, "learning_rate": 0.0002, "epoch": 4.706262104583602, "step": 14580}, {"loss": 0.5527, "grad_norm": 0.9363943338394165, "learning_rate": 0.0002, "epoch": 4.7094899935442225, "step": 14590}, {"loss": 0.5261, "grad_norm": 1.4700615406036377, "learning_rate": 0.0002, "epoch": 4.712717882504842, "step": 14600}, {"loss": 0.5489, "grad_norm": 1.0001553297042847, "learning_rate": 0.0002, "epoch": 4.715945771465462, "step": 14610}, {"loss": 0.5236, "grad_norm": 1.0489927530288696, "learning_rate": 0.0002, "epoch": 4.719173660426081, "step": 14620}, {"loss": 0.5418, "grad_norm": 1.0483676195144653, "learning_rate": 0.0002, "epoch": 4.722401549386701, "step": 14630}, {"loss": 0.5596, "grad_norm": 1.1501940488815308, "learning_rate": 0.0002, "epoch": 4.725629438347321, "step": 14640}, {"loss": 0.5059, "grad_norm": 1.1703146696090698, "learning_rate": 0.0002, "epoch": 4.72885732730794, "step": 14650}, {"loss": 0.5356, "grad_norm": 0.8842985033988953, "learning_rate": 0.0002, "epoch": 4.73208521626856, "step": 14660}, {"loss": 0.5229, "grad_norm": 0.9147908687591553, "learning_rate": 0.0002, "epoch": 4.73531310522918, "step": 14670}, {"loss": 0.5436, "grad_norm": 1.0391576290130615, "learning_rate": 0.0002, "epoch": 4.7385409941898, "step": 14680}, {"loss": 0.5803, "grad_norm": 0.9469179511070251, "learning_rate": 0.0002, "epoch": 4.74176888315042, "step": 14690}, {"loss": 0.5201, "grad_norm": 1.0529530048370361, "learning_rate": 0.0002, "epoch": 4.7449967721110395, "step": 14700}, {"loss": 0.5401, "grad_norm": 0.9645711183547974, "learning_rate": 0.0002, "epoch": 4.748224661071659, "step": 14710}, {"loss": 0.5123, "grad_norm": 0.8163343071937561, "learning_rate": 0.0002, "epoch": 4.751452550032279, "step": 14720}, {"loss": 0.5654, "grad_norm": 1.0581341981887817, "learning_rate": 0.0002, "epoch": 4.7546804389928985, "step": 14730}, {"loss": 0.5709, "grad_norm": 1.0913853645324707, "learning_rate": 0.0002, "epoch": 4.757908327953518, "step": 14740}, {"loss": 0.5342, "grad_norm": 1.1071174144744873, "learning_rate": 0.0002, "epoch": 4.761136216914138, "step": 14750}, {"loss": 0.5353, "grad_norm": 1.0060709714889526, "learning_rate": 0.0002, "epoch": 4.764364105874758, "step": 14760}, {"loss": 0.5415, "grad_norm": 1.012024164199829, "learning_rate": 0.0002, "epoch": 4.767591994835378, "step": 14770}, {"loss": 0.5351, "grad_norm": 0.8438148498535156, "learning_rate": 0.0002, "epoch": 4.770819883795998, "step": 14780}, {"loss": 0.5424, "grad_norm": 0.8136811256408691, "learning_rate": 0.0002, "epoch": 4.774047772756617, "step": 14790}, {"loss": 0.5397, "grad_norm": 1.0765691995620728, "learning_rate": 0.0002, "epoch": 4.777275661717237, "step": 14800}, {"loss": 0.5616, "grad_norm": 1.0582574605941772, "learning_rate": 0.0002, "epoch": 4.780503550677857, "step": 14810}, {"loss": 0.5554, "grad_norm": 0.9419516921043396, "learning_rate": 0.0002, "epoch": 4.783731439638476, "step": 14820}, {"loss": 0.5499, "grad_norm": 0.9626181721687317, "learning_rate": 0.0002, "epoch": 4.786959328599096, "step": 14830}, {"loss": 0.565, "grad_norm": 1.2552800178527832, "learning_rate": 0.0002, "epoch": 4.7901872175597155, "step": 14840}, {"loss": 0.5402, "grad_norm": 0.9379919171333313, "learning_rate": 0.0002, "epoch": 4.793415106520336, "step": 14850}, {"loss": 0.5583, "grad_norm": 0.8166947364807129, "learning_rate": 0.0002, "epoch": 4.796642995480956, "step": 14860}, {"loss": 0.5139, "grad_norm": 0.9008694887161255, "learning_rate": 0.0002, "epoch": 4.799870884441575, "step": 14870}, {"loss": 0.5049, "grad_norm": 1.0256156921386719, "learning_rate": 0.0002, "epoch": 4.803098773402195, "step": 14880}, {"loss": 0.5531, "grad_norm": 0.9486594200134277, "learning_rate": 0.0002, "epoch": 4.806326662362815, "step": 14890}, {"loss": 0.5667, "grad_norm": 0.955238401889801, "learning_rate": 0.0002, "epoch": 4.809554551323434, "step": 14900}, {"loss": 0.5269, "grad_norm": 1.03775954246521, "learning_rate": 0.0002, "epoch": 4.812782440284054, "step": 14910}, {"loss": 0.5445, "grad_norm": 1.1383405923843384, "learning_rate": 0.0002, "epoch": 4.816010329244674, "step": 14920}, {"loss": 0.5347, "grad_norm": 0.9411700963973999, "learning_rate": 0.0002, "epoch": 4.819238218205294, "step": 14930}, {"loss": 0.4899, "grad_norm": 0.8188554644584656, "learning_rate": 0.0002, "epoch": 4.822466107165914, "step": 14940}, {"loss": 0.5618, "grad_norm": 1.1336265802383423, "learning_rate": 0.0002, "epoch": 4.8256939961265335, "step": 14950}, {"loss": 0.5578, "grad_norm": 1.106121301651001, "learning_rate": 0.0002, "epoch": 4.828921885087153, "step": 14960}, {"loss": 0.5306, "grad_norm": 1.0206533670425415, "learning_rate": 0.0002, "epoch": 4.832149774047773, "step": 14970}, {"loss": 0.5714, "grad_norm": 1.1123926639556885, "learning_rate": 0.0002, "epoch": 4.8353776630083924, "step": 14980}, {"loss": 0.5208, "grad_norm": 0.7879418730735779, "learning_rate": 0.0002, "epoch": 4.838605551969012, "step": 14990}, {"loss": 0.5385, "grad_norm": 1.0171709060668945, "learning_rate": 0.0002, "epoch": 4.841833440929632, "step": 15000}, {"loss": 0.6049, "grad_norm": 1.010671615600586, "learning_rate": 0.0002, "epoch": 4.845061329890251, "step": 15010}, {"loss": 0.5497, "grad_norm": 1.0778919458389282, "learning_rate": 0.0002, "epoch": 4.848289218850871, "step": 15020}, {"loss": 0.5587, "grad_norm": 1.0479968786239624, "learning_rate": 0.0002, "epoch": 4.851517107811492, "step": 15030}, {"loss": 0.5637, "grad_norm": 1.0345100164413452, "learning_rate": 0.0002, "epoch": 4.854744996772111, "step": 15040}, {"loss": 0.5809, "grad_norm": 0.9539691805839539, "learning_rate": 0.0002, "epoch": 4.857972885732731, "step": 15050}, {"loss": 0.5314, "grad_norm": 0.9914752840995789, "learning_rate": 0.0002, "epoch": 4.8612007746933505, "step": 15060}, {"loss": 0.5277, "grad_norm": 1.1935476064682007, "learning_rate": 0.0002, "epoch": 4.86442866365397, "step": 15070}, {"loss": 0.5497, "grad_norm": 1.0065057277679443, "learning_rate": 0.0002, "epoch": 4.86765655261459, "step": 15080}, {"loss": 0.5563, "grad_norm": 0.9320993423461914, "learning_rate": 0.0002, "epoch": 4.8708844415752095, "step": 15090}, {"loss": 0.5757, "grad_norm": 1.0578069686889648, "learning_rate": 0.0002, "epoch": 4.87411233053583, "step": 15100}, {"loss": 0.5472, "grad_norm": 0.9666239023208618, "learning_rate": 0.0002, "epoch": 4.87734021949645, "step": 15110}, {"loss": 0.5564, "grad_norm": 1.1322687864303589, "learning_rate": 0.0002, "epoch": 4.880568108457069, "step": 15120}, {"loss": 0.5381, "grad_norm": 0.955674409866333, "learning_rate": 0.0002, "epoch": 4.883795997417689, "step": 15130}, {"loss": 0.557, "grad_norm": 1.119413137435913, "learning_rate": 0.0002, "epoch": 4.887023886378309, "step": 15140}, {"loss": 0.5527, "grad_norm": 0.863646924495697, "learning_rate": 0.0002, "epoch": 4.890251775338928, "step": 15150}, {"loss": 0.5908, "grad_norm": 1.1823450326919556, "learning_rate": 0.0002, "epoch": 4.893479664299548, "step": 15160}, {"loss": 0.5654, "grad_norm": 0.8657588958740234, "learning_rate": 0.0002, "epoch": 4.896707553260168, "step": 15170}, {"loss": 0.5239, "grad_norm": 0.8575737476348877, "learning_rate": 0.0002, "epoch": 4.899935442220787, "step": 15180}, {"loss": 0.564, "grad_norm": 0.9611830711364746, "learning_rate": 0.0002, "epoch": 4.903163331181407, "step": 15190}, {"loss": 0.5505, "grad_norm": 1.1981453895568848, "learning_rate": 0.0002, "epoch": 4.906391220142027, "step": 15200}, {"loss": 0.5582, "grad_norm": 0.9401199221611023, "learning_rate": 0.0002, "epoch": 4.909619109102647, "step": 15210}, {"loss": 0.5631, "grad_norm": 0.8420369625091553, "learning_rate": 0.0002, "epoch": 4.912846998063267, "step": 15220}, {"loss": 0.5255, "grad_norm": 0.7877969145774841, "learning_rate": 0.0002, "epoch": 4.916074887023886, "step": 15230}, {"loss": 0.5522, "grad_norm": 0.8988324403762817, "learning_rate": 0.0002, "epoch": 4.919302775984506, "step": 15240}, {"loss": 0.5274, "grad_norm": 1.1103752851486206, "learning_rate": 0.0002, "epoch": 4.922530664945126, "step": 15250}, {"loss": 0.5249, "grad_norm": 0.8874443173408508, "learning_rate": 0.0002, "epoch": 4.925758553905745, "step": 15260}, {"loss": 0.5677, "grad_norm": 1.1001752614974976, "learning_rate": 0.0002, "epoch": 4.928986442866366, "step": 15270}, {"loss": 0.5596, "grad_norm": 0.9661307334899902, "learning_rate": 0.0002, "epoch": 4.9322143318269855, "step": 15280}, {"loss": 0.5678, "grad_norm": 1.1738812923431396, "learning_rate": 0.0002, "epoch": 4.935442220787605, "step": 15290}, {"loss": 0.5057, "grad_norm": 0.9773507714271545, "learning_rate": 0.0002, "epoch": 4.938670109748225, "step": 15300}, {"loss": 0.5029, "grad_norm": 1.0735599994659424, "learning_rate": 0.0002, "epoch": 4.9418979987088445, "step": 15310}, {"loss": 0.4996, "grad_norm": 1.0552113056182861, "learning_rate": 0.0002, "epoch": 4.945125887669464, "step": 15320}, {"loss": 0.5201, "grad_norm": 1.0900797843933105, "learning_rate": 0.0002, "epoch": 4.948353776630084, "step": 15330}, {"loss": 0.552, "grad_norm": 1.0908405780792236, "learning_rate": 0.0002, "epoch": 4.9515816655907035, "step": 15340}, {"loss": 0.6208, "grad_norm": 1.010221004486084, "learning_rate": 0.0002, "epoch": 4.954809554551323, "step": 15350}, {"loss": 0.5423, "grad_norm": 1.0321437120437622, "learning_rate": 0.0002, "epoch": 4.958037443511943, "step": 15360}, {"loss": 0.5903, "grad_norm": 0.8430278897285461, "learning_rate": 0.0002, "epoch": 4.961265332472563, "step": 15370}, {"loss": 0.538, "grad_norm": 0.8775330185890198, "learning_rate": 0.0002, "epoch": 4.964493221433183, "step": 15380}, {"loss": 0.5344, "grad_norm": 0.9796988368034363, "learning_rate": 0.0002, "epoch": 4.967721110393803, "step": 15390}, {"loss": 0.5352, "grad_norm": 0.8782257437705994, "learning_rate": 0.0002, "epoch": 4.970948999354422, "step": 15400}, {"loss": 0.5843, "grad_norm": 0.9959840774536133, "learning_rate": 0.0002, "epoch": 4.974176888315042, "step": 15410}, {"loss": 0.5783, "grad_norm": 1.0730273723602295, "learning_rate": 0.0002, "epoch": 4.9774047772756616, "step": 15420}, {"loss": 0.5277, "grad_norm": 0.8653680682182312, "learning_rate": 0.0002, "epoch": 4.980632666236281, "step": 15430}, {"loss": 0.5301, "grad_norm": 1.0769985914230347, "learning_rate": 0.0002, "epoch": 4.983860555196901, "step": 15440}, {"loss": 0.5727, "grad_norm": 1.1336040496826172, "learning_rate": 0.0002, "epoch": 4.987088444157521, "step": 15450}, {"loss": 0.5454, "grad_norm": 0.9844824075698853, "learning_rate": 0.0002, "epoch": 4.990316333118141, "step": 15460}, {"loss": 0.5316, "grad_norm": 0.8368769288063049, "learning_rate": 0.0002, "epoch": 4.993544222078761, "step": 15470}, {"loss": 0.5464, "grad_norm": 1.0238676071166992, "learning_rate": 0.0002, "epoch": 4.99677211103938, "step": 15480}, {"loss": 0.5577, "grad_norm": 1.064820408821106, "learning_rate": 0.0002, "epoch": 5.0, "step": 15490}, {"eval_loss": 1.241918921470642, "eval_runtime": 158.4099, "eval_samples_per_second": 4.627, "eval_steps_per_second": 0.581, "epoch": 5.0, "step": 15490}, {"loss": 0.4554, "grad_norm": 1.1366689205169678, "learning_rate": 0.0002, "epoch": 5.00322788896062, "step": 15500}, {"loss": 0.4288, "grad_norm": 1.2548010349273682, "learning_rate": 0.0002, "epoch": 5.006455777921239, "step": 15510}, {"loss": 0.4276, "grad_norm": 1.3875139951705933, "learning_rate": 0.0002, "epoch": 5.009683666881859, "step": 15520}, {"loss": 0.4198, "grad_norm": 0.9834036231040955, "learning_rate": 0.0002, "epoch": 5.012911555842479, "step": 15530}, {"loss": 0.4531, "grad_norm": 1.0737303495407104, "learning_rate": 0.0002, "epoch": 5.016139444803099, "step": 15540}, {"loss": 0.4073, "grad_norm": 0.9877859950065613, "learning_rate": 0.0002, "epoch": 5.019367333763719, "step": 15550}, {"loss": 0.4459, "grad_norm": 1.143268346786499, "learning_rate": 0.0002, "epoch": 5.0225952227243384, "step": 15560}, {"loss": 0.4477, "grad_norm": 1.1206166744232178, "learning_rate": 0.0002, "epoch": 5.025823111684958, "step": 15570}, {"loss": 0.4593, "grad_norm": 0.9977272748947144, "learning_rate": 0.0002, "epoch": 5.029051000645578, "step": 15580}, {"loss": 0.436, "grad_norm": 1.3193285465240479, "learning_rate": 0.0002, "epoch": 5.032278889606197, "step": 15590}, {"loss": 0.4426, "grad_norm": 1.0761713981628418, "learning_rate": 0.0002, "epoch": 5.035506778566817, "step": 15600}, {"loss": 0.4701, "grad_norm": 1.1250759363174438, "learning_rate": 0.0002, "epoch": 5.038734667527437, "step": 15610}, {"loss": 0.3995, "grad_norm": 1.0414305925369263, "learning_rate": 0.0002, "epoch": 5.041962556488057, "step": 15620}, {"loss": 0.4244, "grad_norm": 1.0906853675842285, "learning_rate": 0.0002, "epoch": 5.045190445448677, "step": 15630}, {"loss": 0.441, "grad_norm": 0.9360867142677307, "learning_rate": 0.0002, "epoch": 5.0484183344092965, "step": 15640}, {"loss": 0.4146, "grad_norm": 0.9078057408332825, "learning_rate": 0.0002, "epoch": 5.051646223369916, "step": 15650}, {"loss": 0.4285, "grad_norm": 1.0054848194122314, "learning_rate": 0.0002, "epoch": 5.054874112330536, "step": 15660}, {"loss": 0.417, "grad_norm": 0.9538215398788452, "learning_rate": 0.0002, "epoch": 5.0581020012911555, "step": 15670}, {"loss": 0.4629, "grad_norm": 1.6312693357467651, "learning_rate": 0.0002, "epoch": 5.061329890251775, "step": 15680}, {"loss": 0.3996, "grad_norm": 1.2100921869277954, "learning_rate": 0.0002, "epoch": 5.064557779212395, "step": 15690}, {"loss": 0.4489, "grad_norm": 1.2776238918304443, "learning_rate": 0.0002, "epoch": 5.0677856681730145, "step": 15700}, {"loss": 0.4728, "grad_norm": 1.0110050439834595, "learning_rate": 0.0002, "epoch": 5.071013557133635, "step": 15710}, {"loss": 0.4916, "grad_norm": 1.0896575450897217, "learning_rate": 0.0002, "epoch": 5.074241446094255, "step": 15720}, {"loss": 0.4462, "grad_norm": 0.9989936947822571, "learning_rate": 0.0002, "epoch": 5.077469335054874, "step": 15730}, {"loss": 0.457, "grad_norm": 1.0412228107452393, "learning_rate": 0.0002, "epoch": 5.080697224015494, "step": 15740}, {"loss": 0.4525, "grad_norm": 1.0964457988739014, "learning_rate": 0.0002, "epoch": 5.083925112976114, "step": 15750}, {"loss": 0.4539, "grad_norm": 1.1700960397720337, "learning_rate": 0.0002, "epoch": 5.087153001936733, "step": 15760}, {"loss": 0.4517, "grad_norm": 0.9515631794929504, "learning_rate": 0.0002, "epoch": 5.090380890897353, "step": 15770}, {"loss": 0.4352, "grad_norm": 1.0895006656646729, "learning_rate": 0.0002, "epoch": 5.093608779857973, "step": 15780}, {"loss": 0.4765, "grad_norm": 1.041312575340271, "learning_rate": 0.0002, "epoch": 5.096836668818592, "step": 15790}, {"loss": 0.4532, "grad_norm": 0.9518465399742126, "learning_rate": 0.0002, "epoch": 5.100064557779213, "step": 15800}, {"loss": 0.4187, "grad_norm": 0.8317030668258667, "learning_rate": 0.0002, "epoch": 5.103292446739832, "step": 15810}, {"loss": 0.4523, "grad_norm": 1.0933761596679688, "learning_rate": 0.0002, "epoch": 5.106520335700452, "step": 15820}, {"loss": 0.4689, "grad_norm": 1.0069324970245361, "learning_rate": 0.0002, "epoch": 5.109748224661072, "step": 15830}, {"loss": 0.4773, "grad_norm": 1.1166068315505981, "learning_rate": 0.0002, "epoch": 5.112976113621691, "step": 15840}, {"loss": 0.4635, "grad_norm": 1.069992184638977, "learning_rate": 0.0002, "epoch": 5.116204002582311, "step": 15850}, {"loss": 0.445, "grad_norm": 1.3728036880493164, "learning_rate": 0.0002, "epoch": 5.119431891542931, "step": 15860}, {"loss": 0.4563, "grad_norm": 1.0625780820846558, "learning_rate": 0.0002, "epoch": 5.12265978050355, "step": 15870}, {"loss": 0.426, "grad_norm": 1.090174913406372, "learning_rate": 0.0002, "epoch": 5.125887669464171, "step": 15880}, {"loss": 0.457, "grad_norm": 0.8729526996612549, "learning_rate": 0.0002, "epoch": 5.1291155584247905, "step": 15890}, {"loss": 0.4686, "grad_norm": 0.9561540484428406, "learning_rate": 0.0002, "epoch": 5.13234344738541, "step": 15900}, {"loss": 0.4266, "grad_norm": 1.012120246887207, "learning_rate": 0.0002, "epoch": 5.13557133634603, "step": 15910}, {"loss": 0.4484, "grad_norm": 1.1027921438217163, "learning_rate": 0.0002, "epoch": 5.1387992253066495, "step": 15920}, {"loss": 0.4389, "grad_norm": 1.0878126621246338, "learning_rate": 0.0002, "epoch": 5.142027114267269, "step": 15930}, {"loss": 0.4716, "grad_norm": 0.9619103670120239, "learning_rate": 0.0002, "epoch": 5.145255003227889, "step": 15940}, {"loss": 0.4071, "grad_norm": 1.1684138774871826, "learning_rate": 0.0002, "epoch": 5.148482892188508, "step": 15950}, {"loss": 0.4292, "grad_norm": 1.3379510641098022, "learning_rate": 0.0002, "epoch": 5.151710781149128, "step": 15960}, {"loss": 0.4413, "grad_norm": 1.0427496433258057, "learning_rate": 0.0002, "epoch": 5.154938670109749, "step": 15970}, {"loss": 0.4665, "grad_norm": 0.9917148351669312, "learning_rate": 0.0002, "epoch": 5.158166559070368, "step": 15980}, {"loss": 0.4527, "grad_norm": 1.0899780988693237, "learning_rate": 0.0002, "epoch": 5.161394448030988, "step": 15990}, {"loss": 0.4764, "grad_norm": 0.9251647591590881, "learning_rate": 0.0002, "epoch": 5.1646223369916076, "step": 16000}, {"loss": 0.5043, "grad_norm": 1.1669172048568726, "learning_rate": 0.0002, "epoch": 5.167850225952227, "step": 16010}, {"loss": 0.4726, "grad_norm": 1.2285256385803223, "learning_rate": 0.0002, "epoch": 5.171078114912847, "step": 16020}, {"loss": 0.4312, "grad_norm": 1.0504484176635742, "learning_rate": 0.0002, "epoch": 5.1743060038734665, "step": 16030}, {"loss": 0.4507, "grad_norm": 1.2829089164733887, "learning_rate": 0.0002, "epoch": 5.177533892834086, "step": 16040}, {"loss": 0.4547, "grad_norm": 0.9332743287086487, "learning_rate": 0.0002, "epoch": 5.180761781794706, "step": 16050}, {"loss": 0.4211, "grad_norm": 1.0054426193237305, "learning_rate": 0.0002, "epoch": 5.183989670755326, "step": 16060}, {"loss": 0.4415, "grad_norm": 1.0049669742584229, "learning_rate": 0.0002, "epoch": 5.187217559715946, "step": 16070}, {"loss": 0.4462, "grad_norm": 1.0171366930007935, "learning_rate": 0.0002, "epoch": 5.190445448676566, "step": 16080}, {"loss": 0.4725, "grad_norm": 1.234966516494751, "learning_rate": 0.0002, "epoch": 5.193673337637185, "step": 16090}, {"loss": 0.4579, "grad_norm": 0.9127960205078125, "learning_rate": 0.0002, "epoch": 5.196901226597805, "step": 16100}, {"loss": 0.4647, "grad_norm": 1.153924822807312, "learning_rate": 0.0002, "epoch": 5.200129115558425, "step": 16110}, {"loss": 0.4826, "grad_norm": 1.26716947555542, "learning_rate": 0.0002, "epoch": 5.203357004519044, "step": 16120}, {"loss": 0.446, "grad_norm": 1.2438743114471436, "learning_rate": 0.0002, "epoch": 5.206584893479664, "step": 16130}, {"loss": 0.4768, "grad_norm": 1.0888392925262451, "learning_rate": 0.0002, "epoch": 5.2098127824402845, "step": 16140}, {"loss": 0.4508, "grad_norm": 1.1741917133331299, "learning_rate": 0.0002, "epoch": 5.213040671400904, "step": 16150}, {"loss": 0.4271, "grad_norm": 0.9508614540100098, "learning_rate": 0.0002, "epoch": 5.216268560361524, "step": 16160}, {"loss": 0.4577, "grad_norm": 0.9714716672897339, "learning_rate": 0.0002, "epoch": 5.219496449322143, "step": 16170}, {"loss": 0.4636, "grad_norm": 1.2681622505187988, "learning_rate": 0.0002, "epoch": 5.222724338282763, "step": 16180}, {"loss": 0.4723, "grad_norm": 1.045871376991272, "learning_rate": 0.0002, "epoch": 5.225952227243383, "step": 16190}, {"loss": 0.4467, "grad_norm": 1.0272563695907593, "learning_rate": 0.0002, "epoch": 5.229180116204002, "step": 16200}, {"loss": 0.4353, "grad_norm": 1.092901349067688, "learning_rate": 0.0002, "epoch": 5.232408005164622, "step": 16210}, {"loss": 0.4588, "grad_norm": 0.9332799315452576, "learning_rate": 0.0002, "epoch": 5.235635894125242, "step": 16220}, {"loss": 0.4594, "grad_norm": 1.1728498935699463, "learning_rate": 0.0002, "epoch": 5.238863783085862, "step": 16230}, {"loss": 0.4652, "grad_norm": 0.9932476878166199, "learning_rate": 0.0002, "epoch": 5.242091672046482, "step": 16240}, {"loss": 0.4469, "grad_norm": 0.735236406326294, "learning_rate": 0.0002, "epoch": 5.2453195610071015, "step": 16250}, {"loss": 0.4386, "grad_norm": 1.0289303064346313, "learning_rate": 0.0002, "epoch": 5.248547449967721, "step": 16260}, {"loss": 0.4303, "grad_norm": 0.9488231539726257, "learning_rate": 0.0002, "epoch": 5.251775338928341, "step": 16270}, {"loss": 0.4495, "grad_norm": 0.8320055603981018, "learning_rate": 0.0002, "epoch": 5.2550032278889605, "step": 16280}, {"loss": 0.4224, "grad_norm": 1.2013251781463623, "learning_rate": 0.0002, "epoch": 5.25823111684958, "step": 16290}, {"loss": 0.4666, "grad_norm": 1.0649845600128174, "learning_rate": 0.0002, "epoch": 5.2614590058102, "step": 16300}, {"loss": 0.4325, "grad_norm": 1.1674472093582153, "learning_rate": 0.0002, "epoch": 5.26468689477082, "step": 16310}, {"loss": 0.4482, "grad_norm": 1.3934763669967651, "learning_rate": 0.0002, "epoch": 5.26791478373144, "step": 16320}, {"loss": 0.4494, "grad_norm": 0.8427977561950684, "learning_rate": 0.0002, "epoch": 5.27114267269206, "step": 16330}, {"loss": 0.4234, "grad_norm": 1.0497093200683594, "learning_rate": 0.0002, "epoch": 5.274370561652679, "step": 16340}, {"loss": 0.4337, "grad_norm": 0.8562338352203369, "learning_rate": 0.0002, "epoch": 5.277598450613299, "step": 16350}, {"loss": 0.4664, "grad_norm": 1.043920874595642, "learning_rate": 0.0002, "epoch": 5.280826339573919, "step": 16360}, {"loss": 0.4463, "grad_norm": 1.0039188861846924, "learning_rate": 0.0002, "epoch": 5.284054228534538, "step": 16370}, {"loss": 0.4149, "grad_norm": 0.9414041638374329, "learning_rate": 0.0002, "epoch": 5.287282117495158, "step": 16380}, {"loss": 0.5119, "grad_norm": 1.3346221446990967, "learning_rate": 0.0002, "epoch": 5.2905100064557775, "step": 16390}, {"loss": 0.4479, "grad_norm": 1.0173962116241455, "learning_rate": 0.0002, "epoch": 5.293737895416398, "step": 16400}, {"loss": 0.4538, "grad_norm": 0.7756500244140625, "learning_rate": 0.0002, "epoch": 5.296965784377018, "step": 16410}, {"loss": 0.4306, "grad_norm": 1.1185362339019775, "learning_rate": 0.0002, "epoch": 5.300193673337637, "step": 16420}, {"loss": 0.5033, "grad_norm": 1.0904899835586548, "learning_rate": 0.0002, "epoch": 5.303421562298257, "step": 16430}, {"loss": 0.4887, "grad_norm": 1.0803170204162598, "learning_rate": 0.0002, "epoch": 5.306649451258877, "step": 16440}, {"loss": 0.4473, "grad_norm": 1.1492092609405518, "learning_rate": 0.0002, "epoch": 5.309877340219496, "step": 16450}, {"loss": 0.4696, "grad_norm": 1.1212135553359985, "learning_rate": 0.0002, "epoch": 5.313105229180116, "step": 16460}, {"loss": 0.4438, "grad_norm": 0.8274528980255127, "learning_rate": 0.0002, "epoch": 5.316333118140736, "step": 16470}, {"loss": 0.468, "grad_norm": 1.118891716003418, "learning_rate": 0.0002, "epoch": 5.319561007101356, "step": 16480}, {"loss": 0.4403, "grad_norm": 1.185945749282837, "learning_rate": 0.0002, "epoch": 5.322788896061976, "step": 16490}, {"loss": 0.4946, "grad_norm": 1.0275214910507202, "learning_rate": 0.0002, "epoch": 5.3260167850225955, "step": 16500}, {"loss": 0.4612, "grad_norm": 0.9346362352371216, "learning_rate": 0.0002, "epoch": 5.329244673983215, "step": 16510}, {"loss": 0.4722, "grad_norm": 0.9600600600242615, "learning_rate": 0.0002, "epoch": 5.332472562943835, "step": 16520}, {"loss": 0.4536, "grad_norm": 1.1238188743591309, "learning_rate": 0.0002, "epoch": 5.335700451904454, "step": 16530}, {"loss": 0.5025, "grad_norm": 0.8660476207733154, "learning_rate": 0.0002, "epoch": 5.338928340865074, "step": 16540}, {"loss": 0.4732, "grad_norm": 0.9869821071624756, "learning_rate": 0.0002, "epoch": 5.342156229825694, "step": 16550}, {"loss": 0.4967, "grad_norm": 1.1719090938568115, "learning_rate": 0.0002, "epoch": 5.345384118786313, "step": 16560}, {"loss": 0.4563, "grad_norm": 1.0122894048690796, "learning_rate": 0.0002, "epoch": 5.348612007746934, "step": 16570}, {"loss": 0.5066, "grad_norm": 1.2431079149246216, "learning_rate": 0.0002, "epoch": 5.351839896707554, "step": 16580}, {"loss": 0.4708, "grad_norm": 1.4178080558776855, "learning_rate": 0.0002, "epoch": 5.355067785668173, "step": 16590}, {"loss": 0.4686, "grad_norm": 1.1895726919174194, "learning_rate": 0.0002, "epoch": 5.358295674628793, "step": 16600}, {"loss": 0.475, "grad_norm": 1.154392123222351, "learning_rate": 0.0002, "epoch": 5.3615235635894125, "step": 16610}, {"loss": 0.4511, "grad_norm": 0.9207229018211365, "learning_rate": 0.0002, "epoch": 5.364751452550032, "step": 16620}, {"loss": 0.4606, "grad_norm": 1.0247414112091064, "learning_rate": 0.0002, "epoch": 5.367979341510652, "step": 16630}, {"loss": 0.4886, "grad_norm": 1.0402202606201172, "learning_rate": 0.0002, "epoch": 5.3712072304712715, "step": 16640}, {"loss": 0.4903, "grad_norm": 1.1902891397476196, "learning_rate": 0.0002, "epoch": 5.374435119431892, "step": 16650}, {"loss": 0.4583, "grad_norm": 0.9572759866714478, "learning_rate": 0.0002, "epoch": 5.377663008392512, "step": 16660}, {"loss": 0.4636, "grad_norm": 0.9968860149383545, "learning_rate": 0.0002, "epoch": 5.380890897353131, "step": 16670}, {"loss": 0.477, "grad_norm": 1.2468547821044922, "learning_rate": 0.0002, "epoch": 5.384118786313751, "step": 16680}, {"loss": 0.5223, "grad_norm": 1.154661774635315, "learning_rate": 0.0002, "epoch": 5.387346675274371, "step": 16690}, {"loss": 0.4637, "grad_norm": 0.8837044835090637, "learning_rate": 0.0002, "epoch": 5.39057456423499, "step": 16700}, {"loss": 0.4744, "grad_norm": 1.0317907333374023, "learning_rate": 0.0002, "epoch": 5.39380245319561, "step": 16710}, {"loss": 0.4831, "grad_norm": 0.9811587929725647, "learning_rate": 0.0002, "epoch": 5.39703034215623, "step": 16720}, {"loss": 0.4739, "grad_norm": 0.9487450122833252, "learning_rate": 0.0002, "epoch": 5.400258231116849, "step": 16730}, {"loss": 0.4574, "grad_norm": 1.0540274381637573, "learning_rate": 0.0002, "epoch": 5.403486120077469, "step": 16740}, {"loss": 0.4709, "grad_norm": 1.028363585472107, "learning_rate": 0.0002, "epoch": 5.406714009038089, "step": 16750}, {"loss": 0.468, "grad_norm": 1.0200704336166382, "learning_rate": 0.0002, "epoch": 5.409941897998709, "step": 16760}, {"loss": 0.4383, "grad_norm": 1.0330981016159058, "learning_rate": 0.0002, "epoch": 5.413169786959329, "step": 16770}, {"loss": 0.4645, "grad_norm": 1.320875644683838, "learning_rate": 0.0002, "epoch": 5.416397675919948, "step": 16780}, {"loss": 0.4601, "grad_norm": 0.9838143587112427, "learning_rate": 0.0002, "epoch": 5.419625564880568, "step": 16790}, {"loss": 0.4835, "grad_norm": 1.1006578207015991, "learning_rate": 0.0002, "epoch": 5.422853453841188, "step": 16800}, {"loss": 0.4871, "grad_norm": 1.099174976348877, "learning_rate": 0.0002, "epoch": 5.426081342801807, "step": 16810}, {"loss": 0.4773, "grad_norm": 1.0632189512252808, "learning_rate": 0.0002, "epoch": 5.429309231762427, "step": 16820}, {"loss": 0.4732, "grad_norm": 0.9673194885253906, "learning_rate": 0.0002, "epoch": 5.4325371207230475, "step": 16830}, {"loss": 0.4731, "grad_norm": 0.853013813495636, "learning_rate": 0.0002, "epoch": 5.435765009683667, "step": 16840}, {"loss": 0.4856, "grad_norm": 1.0261728763580322, "learning_rate": 0.0002, "epoch": 5.438992898644287, "step": 16850}, {"loss": 0.4729, "grad_norm": 1.1642370223999023, "learning_rate": 0.0002, "epoch": 5.4422207876049065, "step": 16860}, {"loss": 0.4751, "grad_norm": 0.8715673685073853, "learning_rate": 0.0002, "epoch": 5.445448676565526, "step": 16870}, {"loss": 0.4566, "grad_norm": 0.905746579170227, "learning_rate": 0.0002, "epoch": 5.448676565526146, "step": 16880}, {"loss": 0.4536, "grad_norm": 1.1051915884017944, "learning_rate": 0.0002, "epoch": 5.451904454486765, "step": 16890}, {"loss": 0.4944, "grad_norm": 1.0781478881835938, "learning_rate": 0.0002, "epoch": 5.455132343447385, "step": 16900}, {"loss": 0.4655, "grad_norm": 1.1168911457061768, "learning_rate": 0.0002, "epoch": 5.458360232408005, "step": 16910}, {"loss": 0.4624, "grad_norm": 1.1150046586990356, "learning_rate": 0.0002, "epoch": 5.461588121368625, "step": 16920}, {"loss": 0.4849, "grad_norm": 0.9862499833106995, "learning_rate": 0.0002, "epoch": 5.464816010329245, "step": 16930}, {"loss": 0.47, "grad_norm": 1.5416640043258667, "learning_rate": 0.0002, "epoch": 5.468043899289865, "step": 16940}, {"loss": 0.4508, "grad_norm": 0.8960899710655212, "learning_rate": 0.0002, "epoch": 5.471271788250484, "step": 16950}, {"loss": 0.5002, "grad_norm": 0.9796477556228638, "learning_rate": 0.0002, "epoch": 5.474499677211104, "step": 16960}, {"loss": 0.4939, "grad_norm": 0.9526587128639221, "learning_rate": 0.0002, "epoch": 5.4777275661717235, "step": 16970}, {"loss": 0.4807, "grad_norm": 1.2373039722442627, "learning_rate": 0.0002, "epoch": 5.480955455132343, "step": 16980}, {"loss": 0.4642, "grad_norm": 1.1860566139221191, "learning_rate": 0.0002, "epoch": 5.484183344092963, "step": 16990}, {"loss": 0.4929, "grad_norm": 1.477345585823059, "learning_rate": 0.0002, "epoch": 5.487411233053583, "step": 17000}, {"loss": 0.4566, "grad_norm": 1.1029295921325684, "learning_rate": 0.0002, "epoch": 5.490639122014203, "step": 17010}, {"loss": 0.487, "grad_norm": 1.1416981220245361, "learning_rate": 0.0002, "epoch": 5.493867010974823, "step": 17020}, {"loss": 0.475, "grad_norm": 1.1647989749908447, "learning_rate": 0.0002, "epoch": 5.497094899935442, "step": 17030}, {"loss": 0.4644, "grad_norm": 1.1297032833099365, "learning_rate": 0.0002, "epoch": 5.500322788896062, "step": 17040}, {"loss": 0.4885, "grad_norm": 0.9764689207077026, "learning_rate": 0.0002, "epoch": 5.503550677856682, "step": 17050}, {"loss": 0.4789, "grad_norm": 1.038161039352417, "learning_rate": 0.0002, "epoch": 5.506778566817301, "step": 17060}, {"loss": 0.4467, "grad_norm": 1.1417886018753052, "learning_rate": 0.0002, "epoch": 5.510006455777921, "step": 17070}, {"loss": 0.4782, "grad_norm": 0.9300898313522339, "learning_rate": 0.0002, "epoch": 5.513234344738541, "step": 17080}, {"loss": 0.4805, "grad_norm": 1.0295016765594482, "learning_rate": 0.0002, "epoch": 5.516462233699161, "step": 17090}, {"loss": 0.4663, "grad_norm": 1.1273008584976196, "learning_rate": 0.0002, "epoch": 5.519690122659781, "step": 17100}, {"loss": 0.4897, "grad_norm": 0.9542737007141113, "learning_rate": 0.0002, "epoch": 5.5229180116204, "step": 17110}, {"loss": 0.51, "grad_norm": 1.34589421749115, "learning_rate": 0.0002, "epoch": 5.52614590058102, "step": 17120}, {"loss": 0.467, "grad_norm": 0.9889675378799438, "learning_rate": 0.0002, "epoch": 5.52937378954164, "step": 17130}, {"loss": 0.4752, "grad_norm": 1.25719153881073, "learning_rate": 0.0002, "epoch": 5.532601678502259, "step": 17140}, {"loss": 0.4609, "grad_norm": 1.2511073350906372, "learning_rate": 0.0002, "epoch": 5.535829567462879, "step": 17150}, {"loss": 0.4992, "grad_norm": 1.1993521451950073, "learning_rate": 0.0002, "epoch": 5.539057456423499, "step": 17160}, {"loss": 0.4986, "grad_norm": 1.1394526958465576, "learning_rate": 0.0002, "epoch": 5.542285345384119, "step": 17170}, {"loss": 0.5284, "grad_norm": 1.0435349941253662, "learning_rate": 0.0002, "epoch": 5.545513234344739, "step": 17180}, {"loss": 0.4934, "grad_norm": 1.120940089225769, "learning_rate": 0.0002, "epoch": 5.5487411233053585, "step": 17190}, {"loss": 0.4704, "grad_norm": 1.0906445980072021, "learning_rate": 0.0002, "epoch": 5.551969012265978, "step": 17200}, {"loss": 0.4896, "grad_norm": 0.8883966207504272, "learning_rate": 0.0002, "epoch": 5.555196901226598, "step": 17210}, {"loss": 0.4696, "grad_norm": 1.3078752756118774, "learning_rate": 0.0002, "epoch": 5.5584247901872175, "step": 17220}, {"loss": 0.4805, "grad_norm": 1.0224416255950928, "learning_rate": 0.0002, "epoch": 5.561652679147837, "step": 17230}, {"loss": 0.47, "grad_norm": 1.242518663406372, "learning_rate": 0.0002, "epoch": 5.564880568108457, "step": 17240}, {"loss": 0.4708, "grad_norm": 1.2328250408172607, "learning_rate": 0.0002, "epoch": 5.568108457069076, "step": 17250}, {"loss": 0.4685, "grad_norm": 1.2186611890792847, "learning_rate": 0.0002, "epoch": 5.571336346029697, "step": 17260}, {"loss": 0.4688, "grad_norm": 1.0947459936141968, "learning_rate": 0.0002, "epoch": 5.574564234990317, "step": 17270}, {"loss": 0.506, "grad_norm": 1.075279951095581, "learning_rate": 0.0002, "epoch": 5.577792123950936, "step": 17280}, {"loss": 0.478, "grad_norm": 1.0316804647445679, "learning_rate": 0.0002, "epoch": 5.581020012911556, "step": 17290}, {"loss": 0.478, "grad_norm": 1.1077373027801514, "learning_rate": 0.0002, "epoch": 5.584247901872176, "step": 17300}, {"loss": 0.4857, "grad_norm": 1.219228744506836, "learning_rate": 0.0002, "epoch": 5.587475790832795, "step": 17310}, {"loss": 0.4465, "grad_norm": 1.026361346244812, "learning_rate": 0.0002, "epoch": 5.590703679793415, "step": 17320}, {"loss": 0.4831, "grad_norm": 1.1621283292770386, "learning_rate": 0.0002, "epoch": 5.5939315687540345, "step": 17330}, {"loss": 0.4706, "grad_norm": 1.0177470445632935, "learning_rate": 0.0002, "epoch": 5.597159457714655, "step": 17340}, {"loss": 0.4961, "grad_norm": 1.0625319480895996, "learning_rate": 0.0002, "epoch": 5.600387346675275, "step": 17350}, {"loss": 0.484, "grad_norm": 1.148815393447876, "learning_rate": 0.0002, "epoch": 5.603615235635894, "step": 17360}, {"loss": 0.4804, "grad_norm": 1.0571802854537964, "learning_rate": 0.0002, "epoch": 5.606843124596514, "step": 17370}, {"loss": 0.5202, "grad_norm": 1.2069389820098877, "learning_rate": 0.0002, "epoch": 5.610071013557134, "step": 17380}, {"loss": 0.5029, "grad_norm": 1.407530426979065, "learning_rate": 0.0002, "epoch": 5.613298902517753, "step": 17390}, {"loss": 0.4688, "grad_norm": 1.247060775756836, "learning_rate": 0.0002, "epoch": 5.616526791478373, "step": 17400}, {"loss": 0.4359, "grad_norm": 1.431684136390686, "learning_rate": 0.0002, "epoch": 5.619754680438993, "step": 17410}, {"loss": 0.5244, "grad_norm": 1.0520552396774292, "learning_rate": 0.0002, "epoch": 5.622982569399612, "step": 17420}, {"loss": 0.4993, "grad_norm": 1.0593537092208862, "learning_rate": 0.0002, "epoch": 5.626210458360232, "step": 17430}, {"loss": 0.4911, "grad_norm": 1.4414515495300293, "learning_rate": 0.0002, "epoch": 5.6294383473208525, "step": 17440}, {"loss": 0.4761, "grad_norm": 1.0902460813522339, "learning_rate": 0.0002, "epoch": 5.632666236281472, "step": 17450}, {"loss": 0.4737, "grad_norm": 0.890944242477417, "learning_rate": 0.0002, "epoch": 5.635894125242092, "step": 17460}, {"loss": 0.4706, "grad_norm": 1.035675287246704, "learning_rate": 0.0002, "epoch": 5.639122014202711, "step": 17470}, {"loss": 0.484, "grad_norm": 0.9792264103889465, "learning_rate": 0.0002, "epoch": 5.642349903163331, "step": 17480}, {"loss": 0.4753, "grad_norm": 1.1888220310211182, "learning_rate": 0.0002, "epoch": 5.645577792123951, "step": 17490}, {"loss": 0.5047, "grad_norm": 1.0169143676757812, "learning_rate": 0.0002, "epoch": 5.64880568108457, "step": 17500}, {"loss": 0.4919, "grad_norm": 0.9812449216842651, "learning_rate": 0.0002, "epoch": 5.652033570045191, "step": 17510}, {"loss": 0.4879, "grad_norm": 1.0509105920791626, "learning_rate": 0.0002, "epoch": 5.655261459005811, "step": 17520}, {"loss": 0.4695, "grad_norm": 0.9047426581382751, "learning_rate": 0.0002, "epoch": 5.65848934796643, "step": 17530}, {"loss": 0.4712, "grad_norm": 1.2393709421157837, "learning_rate": 0.0002, "epoch": 5.66171723692705, "step": 17540}, {"loss": 0.5012, "grad_norm": 1.1098991632461548, "learning_rate": 0.0002, "epoch": 5.6649451258876695, "step": 17550}, {"loss": 0.4499, "grad_norm": 0.8181570768356323, "learning_rate": 0.0002, "epoch": 5.668173014848289, "step": 17560}, {"loss": 0.4973, "grad_norm": 0.9676381945610046, "learning_rate": 0.0002, "epoch": 5.671400903808909, "step": 17570}, {"loss": 0.5058, "grad_norm": 1.1225934028625488, "learning_rate": 0.0002, "epoch": 5.6746287927695285, "step": 17580}, {"loss": 0.5165, "grad_norm": 1.6259925365447998, "learning_rate": 0.0002, "epoch": 5.677856681730148, "step": 17590}, {"loss": 0.4613, "grad_norm": 0.7751404643058777, "learning_rate": 0.0002, "epoch": 5.681084570690768, "step": 17600}, {"loss": 0.4895, "grad_norm": 0.8478589057922363, "learning_rate": 0.0002, "epoch": 5.684312459651388, "step": 17610}, {"loss": 0.4492, "grad_norm": 1.2887113094329834, "learning_rate": 0.0002, "epoch": 5.687540348612008, "step": 17620}, {"loss": 0.4792, "grad_norm": 1.1452652215957642, "learning_rate": 0.0002, "epoch": 5.690768237572628, "step": 17630}, {"loss": 0.4889, "grad_norm": 1.0370417833328247, "learning_rate": 0.0002, "epoch": 5.693996126533247, "step": 17640}, {"loss": 0.535, "grad_norm": 1.1358870267868042, "learning_rate": 0.0002, "epoch": 5.697224015493867, "step": 17650}, {"loss": 0.4753, "grad_norm": 1.2772479057312012, "learning_rate": 0.0002, "epoch": 5.700451904454487, "step": 17660}, {"loss": 0.4492, "grad_norm": 1.182812213897705, "learning_rate": 0.0002, "epoch": 5.703679793415106, "step": 17670}, {"loss": 0.5025, "grad_norm": 1.099074125289917, "learning_rate": 0.0002, "epoch": 5.706907682375727, "step": 17680}, {"loss": 0.4945, "grad_norm": 0.938634991645813, "learning_rate": 0.0002, "epoch": 5.710135571336346, "step": 17690}, {"loss": 0.491, "grad_norm": 0.9385238885879517, "learning_rate": 0.0002, "epoch": 5.713363460296966, "step": 17700}, {"loss": 0.4849, "grad_norm": 1.1486014127731323, "learning_rate": 0.0002, "epoch": 5.716591349257586, "step": 17710}, {"loss": 0.5043, "grad_norm": 0.9433078169822693, "learning_rate": 0.0002, "epoch": 5.719819238218205, "step": 17720}, {"loss": 0.4543, "grad_norm": 1.02472722530365, "learning_rate": 0.0002, "epoch": 5.723047127178825, "step": 17730}, {"loss": 0.4631, "grad_norm": 0.9360876679420471, "learning_rate": 0.0002, "epoch": 5.726275016139445, "step": 17740}, {"loss": 0.4947, "grad_norm": 1.0481483936309814, "learning_rate": 0.0002, "epoch": 5.729502905100064, "step": 17750}, {"loss": 0.4763, "grad_norm": 1.0032516717910767, "learning_rate": 0.0002, "epoch": 5.732730794060684, "step": 17760}, {"loss": 0.4819, "grad_norm": 0.8908069729804993, "learning_rate": 0.0002, "epoch": 5.735958683021304, "step": 17770}, {"loss": 0.5188, "grad_norm": 1.0679123401641846, "learning_rate": 0.0002, "epoch": 5.739186571981924, "step": 17780}, {"loss": 0.4818, "grad_norm": 1.0448014736175537, "learning_rate": 0.0002, "epoch": 5.742414460942544, "step": 17790}, {"loss": 0.4869, "grad_norm": 1.0433847904205322, "learning_rate": 0.0002, "epoch": 5.7456423499031635, "step": 17800}, {"loss": 0.5243, "grad_norm": 1.000291109085083, "learning_rate": 0.0002, "epoch": 5.748870238863783, "step": 17810}, {"loss": 0.4891, "grad_norm": 1.1238429546356201, "learning_rate": 0.0002, "epoch": 5.752098127824403, "step": 17820}, {"loss": 0.4905, "grad_norm": 1.09062659740448, "learning_rate": 0.0002, "epoch": 5.755326016785022, "step": 17830}, {"loss": 0.4883, "grad_norm": 0.8538689613342285, "learning_rate": 0.0002, "epoch": 5.758553905745642, "step": 17840}, {"loss": 0.4989, "grad_norm": 1.3872947692871094, "learning_rate": 0.0002, "epoch": 5.761781794706262, "step": 17850}, {"loss": 0.4707, "grad_norm": 1.0578876733779907, "learning_rate": 0.0002, "epoch": 5.765009683666882, "step": 17860}, {"loss": 0.5281, "grad_norm": 1.1761705875396729, "learning_rate": 0.0002, "epoch": 5.768237572627502, "step": 17870}, {"loss": 0.4802, "grad_norm": 1.1223368644714355, "learning_rate": 0.0002, "epoch": 5.771465461588122, "step": 17880}, {"loss": 0.505, "grad_norm": 1.2484360933303833, "learning_rate": 0.0002, "epoch": 5.774693350548741, "step": 17890}, {"loss": 0.4786, "grad_norm": 1.2461199760437012, "learning_rate": 0.0002, "epoch": 5.777921239509361, "step": 17900}, {"loss": 0.4933, "grad_norm": 1.1718299388885498, "learning_rate": 0.0002, "epoch": 5.7811491284699805, "step": 17910}, {"loss": 0.471, "grad_norm": 0.9896837472915649, "learning_rate": 0.0002, "epoch": 5.7843770174306, "step": 17920}, {"loss": 0.4808, "grad_norm": 1.3759760856628418, "learning_rate": 0.0002, "epoch": 5.78760490639122, "step": 17930}, {"loss": 0.4847, "grad_norm": 1.0596622228622437, "learning_rate": 0.0002, "epoch": 5.7908327953518395, "step": 17940}, {"loss": 0.5153, "grad_norm": 0.9292021989822388, "learning_rate": 0.0002, "epoch": 5.79406068431246, "step": 17950}, {"loss": 0.4783, "grad_norm": 0.8786653876304626, "learning_rate": 0.0002, "epoch": 5.79728857327308, "step": 17960}, {"loss": 0.4598, "grad_norm": 1.2087152004241943, "learning_rate": 0.0002, "epoch": 5.800516462233699, "step": 17970}, {"loss": 0.4953, "grad_norm": 1.1643104553222656, "learning_rate": 0.0002, "epoch": 5.803744351194319, "step": 17980}, {"loss": 0.5111, "grad_norm": 0.971613347530365, "learning_rate": 0.0002, "epoch": 5.806972240154939, "step": 17990}, {"loss": 0.5094, "grad_norm": 1.306227684020996, "learning_rate": 0.0002, "epoch": 5.810200129115558, "step": 18000}, {"loss": 0.5392, "grad_norm": 1.3665502071380615, "learning_rate": 0.0002, "epoch": 5.813428018076178, "step": 18010}, {"loss": 0.4887, "grad_norm": 1.2227312326431274, "learning_rate": 0.0002, "epoch": 5.816655907036798, "step": 18020}, {"loss": 0.5203, "grad_norm": 1.180694818496704, "learning_rate": 0.0002, "epoch": 5.819883795997418, "step": 18030}, {"loss": 0.4962, "grad_norm": 1.1045362949371338, "learning_rate": 0.0002, "epoch": 5.823111684958038, "step": 18040}, {"loss": 0.4969, "grad_norm": 1.3828954696655273, "learning_rate": 0.0002, "epoch": 5.826339573918657, "step": 18050}, {"loss": 0.5493, "grad_norm": 1.305102825164795, "learning_rate": 0.0002, "epoch": 5.829567462879277, "step": 18060}, {"loss": 0.4844, "grad_norm": 1.2708743810653687, "learning_rate": 0.0002, "epoch": 5.832795351839897, "step": 18070}, {"loss": 0.4834, "grad_norm": 1.0344188213348389, "learning_rate": 0.0002, "epoch": 5.836023240800516, "step": 18080}, {"loss": 0.5088, "grad_norm": 1.1321724653244019, "learning_rate": 0.0002, "epoch": 5.839251129761136, "step": 18090}, {"loss": 0.4888, "grad_norm": 1.2162611484527588, "learning_rate": 0.0002, "epoch": 5.842479018721756, "step": 18100}, {"loss": 0.5014, "grad_norm": 1.427612543106079, "learning_rate": 0.0002, "epoch": 5.845706907682375, "step": 18110}, {"loss": 0.5339, "grad_norm": 1.4391452074050903, "learning_rate": 0.0002, "epoch": 5.848934796642995, "step": 18120}, {"loss": 0.528, "grad_norm": 1.1548216342926025, "learning_rate": 0.0002, "epoch": 5.8521626856036155, "step": 18130}, {"loss": 0.4779, "grad_norm": 1.2336437702178955, "learning_rate": 0.0002, "epoch": 5.855390574564235, "step": 18140}, {"loss": 0.4844, "grad_norm": 1.254661202430725, "learning_rate": 0.0002, "epoch": 5.858618463524855, "step": 18150}, {"loss": 0.5201, "grad_norm": 0.8326491117477417, "learning_rate": 0.0002, "epoch": 5.8618463524854745, "step": 18160}, {"loss": 0.5076, "grad_norm": 1.0907988548278809, "learning_rate": 0.0002, "epoch": 5.865074241446094, "step": 18170}, {"loss": 0.48, "grad_norm": 0.9896568655967712, "learning_rate": 0.0002, "epoch": 5.868302130406714, "step": 18180}, {"loss": 0.4628, "grad_norm": 0.9440065026283264, "learning_rate": 0.0002, "epoch": 5.871530019367333, "step": 18190}, {"loss": 0.5265, "grad_norm": 1.09321129322052, "learning_rate": 0.0002, "epoch": 5.874757908327954, "step": 18200}, {"loss": 0.4737, "grad_norm": 1.2588142156600952, "learning_rate": 0.0002, "epoch": 5.877985797288574, "step": 18210}, {"loss": 0.475, "grad_norm": 1.1731587648391724, "learning_rate": 0.0002, "epoch": 5.881213686249193, "step": 18220}, {"loss": 0.504, "grad_norm": 0.9904444217681885, "learning_rate": 0.0002, "epoch": 5.884441575209813, "step": 18230}, {"loss": 0.4842, "grad_norm": 0.8985799551010132, "learning_rate": 0.0002, "epoch": 5.887669464170433, "step": 18240}, {"loss": 0.4878, "grad_norm": 1.0182441473007202, "learning_rate": 0.0002, "epoch": 5.890897353131052, "step": 18250}, {"loss": 0.5224, "grad_norm": 1.1574701070785522, "learning_rate": 0.0002, "epoch": 5.894125242091672, "step": 18260}, {"loss": 0.5, "grad_norm": 1.1776602268218994, "learning_rate": 0.0002, "epoch": 5.8973531310522915, "step": 18270}, {"loss": 0.5245, "grad_norm": 1.4951308965682983, "learning_rate": 0.0002, "epoch": 5.900581020012911, "step": 18280}, {"loss": 0.5454, "grad_norm": 1.1440261602401733, "learning_rate": 0.0002, "epoch": 5.903808908973531, "step": 18290}, {"loss": 0.4868, "grad_norm": 0.9925196170806885, "learning_rate": 0.0002, "epoch": 5.907036797934151, "step": 18300}, {"loss": 0.5142, "grad_norm": 1.098615288734436, "learning_rate": 0.0002, "epoch": 5.910264686894771, "step": 18310}, {"loss": 0.5184, "grad_norm": 1.0030080080032349, "learning_rate": 0.0002, "epoch": 5.913492575855391, "step": 18320}, {"loss": 0.474, "grad_norm": 0.9890318512916565, "learning_rate": 0.0002, "epoch": 5.91672046481601, "step": 18330}, {"loss": 0.5125, "grad_norm": 1.2209392786026, "learning_rate": 0.0002, "epoch": 5.91994835377663, "step": 18340}, {"loss": 0.4634, "grad_norm": 1.108933925628662, "learning_rate": 0.0002, "epoch": 5.92317624273725, "step": 18350}, {"loss": 0.4813, "grad_norm": 1.086024522781372, "learning_rate": 0.0002, "epoch": 5.926404131697869, "step": 18360}, {"loss": 0.4952, "grad_norm": 1.0061167478561401, "learning_rate": 0.0002, "epoch": 5.92963202065849, "step": 18370}, {"loss": 0.4848, "grad_norm": 0.9445858597755432, "learning_rate": 0.0002, "epoch": 5.9328599096191095, "step": 18380}, {"loss": 0.5014, "grad_norm": 0.9556859135627747, "learning_rate": 0.0002, "epoch": 5.936087798579729, "step": 18390}, {"loss": 0.4966, "grad_norm": 1.154168963432312, "learning_rate": 0.0002, "epoch": 5.939315687540349, "step": 18400}, {"loss": 0.4836, "grad_norm": 1.0495831966400146, "learning_rate": 0.0002, "epoch": 5.942543576500968, "step": 18410}, {"loss": 0.5021, "grad_norm": 1.0717304944992065, "learning_rate": 0.0002, "epoch": 5.945771465461588, "step": 18420}, {"loss": 0.4794, "grad_norm": 1.06618332862854, "learning_rate": 0.0002, "epoch": 5.948999354422208, "step": 18430}, {"loss": 0.5011, "grad_norm": 0.9567165374755859, "learning_rate": 0.0002, "epoch": 5.952227243382827, "step": 18440}, {"loss": 0.485, "grad_norm": 1.0306249856948853, "learning_rate": 0.0002, "epoch": 5.955455132343447, "step": 18450}, {"loss": 0.4948, "grad_norm": 1.1879968643188477, "learning_rate": 0.0002, "epoch": 5.958683021304067, "step": 18460}, {"loss": 0.5185, "grad_norm": 1.3177233934402466, "learning_rate": 0.0002, "epoch": 5.961910910264687, "step": 18470}, {"loss": 0.4966, "grad_norm": 1.0945817232131958, "learning_rate": 0.0002, "epoch": 5.965138799225307, "step": 18480}, {"loss": 0.5196, "grad_norm": 1.029414415359497, "learning_rate": 0.0002, "epoch": 5.9683666881859265, "step": 18490}, {"loss": 0.5154, "grad_norm": 1.2266209125518799, "learning_rate": 0.0002, "epoch": 5.971594577146546, "step": 18500}, {"loss": 0.4914, "grad_norm": 1.2167150974273682, "learning_rate": 0.0002, "epoch": 5.974822466107166, "step": 18510}, {"loss": 0.466, "grad_norm": 0.9941056966781616, "learning_rate": 0.0002, "epoch": 5.9780503550677855, "step": 18520}, {"loss": 0.5037, "grad_norm": 1.4244859218597412, "learning_rate": 0.0002, "epoch": 5.981278244028405, "step": 18530}, {"loss": 0.4902, "grad_norm": 0.8976260423660278, "learning_rate": 0.0002, "epoch": 5.984506132989026, "step": 18540}, {"loss": 0.5039, "grad_norm": 1.0162699222564697, "learning_rate": 0.0002, "epoch": 5.987734021949645, "step": 18550}, {"loss": 0.5138, "grad_norm": 1.196677803993225, "learning_rate": 0.0002, "epoch": 5.990961910910265, "step": 18560}, {"loss": 0.4626, "grad_norm": 1.163403868675232, "learning_rate": 0.0002, "epoch": 5.994189799870885, "step": 18570}, {"loss": 0.5105, "grad_norm": 1.010205626487732, "learning_rate": 0.0002, "epoch": 5.997417688831504, "step": 18580}, {"eval_loss": 1.2861483097076416, "eval_runtime": 163.2683, "eval_samples_per_second": 4.49, "eval_steps_per_second": 0.563, "epoch": 6.0, "step": 18588}, {"loss": 0.4557, "grad_norm": 0.7334756255149841, "learning_rate": 0.0002, "epoch": 6.000645577792124, "step": 18590}, {"loss": 0.4201, "grad_norm": 1.093945026397705, "learning_rate": 0.0002, "epoch": 6.003873466752744, "step": 18600}, {"loss": 0.4235, "grad_norm": 1.2327148914337158, "learning_rate": 0.0002, "epoch": 6.007101355713363, "step": 18610}, {"loss": 0.377, "grad_norm": 1.3238836526870728, "learning_rate": 0.0002, "epoch": 6.010329244673983, "step": 18620}, {"loss": 0.3883, "grad_norm": 1.2364031076431274, "learning_rate": 0.0002, "epoch": 6.0135571336346025, "step": 18630}, {"loss": 0.3958, "grad_norm": 0.902474045753479, "learning_rate": 0.0002, "epoch": 6.016785022595223, "step": 18640}, {"loss": 0.4077, "grad_norm": 1.273280382156372, "learning_rate": 0.0002, "epoch": 6.020012911555843, "step": 18650}, {"loss": 0.4224, "grad_norm": 1.2470760345458984, "learning_rate": 0.0002, "epoch": 6.023240800516462, "step": 18660}, {"loss": 0.3752, "grad_norm": 1.2360138893127441, "learning_rate": 0.0002, "epoch": 6.026468689477082, "step": 18670}, {"loss": 0.3653, "grad_norm": 1.467140793800354, "learning_rate": 0.0002, "epoch": 6.029696578437702, "step": 18680}, {"loss": 0.3883, "grad_norm": 1.123871088027954, "learning_rate": 0.0002, "epoch": 6.032924467398321, "step": 18690}, {"loss": 0.3812, "grad_norm": 0.9732550978660583, "learning_rate": 0.0002, "epoch": 6.036152356358941, "step": 18700}, {"loss": 0.4163, "grad_norm": 1.170860767364502, "learning_rate": 0.0002, "epoch": 6.039380245319561, "step": 18710}, {"loss": 0.3836, "grad_norm": 1.2599345445632935, "learning_rate": 0.0002, "epoch": 6.042608134280181, "step": 18720}, {"loss": 0.3881, "grad_norm": 1.0808286666870117, "learning_rate": 0.0002, "epoch": 6.045836023240801, "step": 18730}, {"loss": 0.386, "grad_norm": 0.9799565076828003, "learning_rate": 0.0002, "epoch": 6.0490639122014205, "step": 18740}, {"loss": 0.3833, "grad_norm": 0.8425611853599548, "learning_rate": 0.0002, "epoch": 6.05229180116204, "step": 18750}, {"loss": 0.3765, "grad_norm": 0.9762344360351562, "learning_rate": 0.0002, "epoch": 6.05551969012266, "step": 18760}, {"loss": 0.3878, "grad_norm": 1.1290913820266724, "learning_rate": 0.0002, "epoch": 6.058747579083279, "step": 18770}, {"loss": 0.4061, "grad_norm": 1.2240493297576904, "learning_rate": 0.0002, "epoch": 6.061975468043899, "step": 18780}, {"loss": 0.3894, "grad_norm": 1.3422439098358154, "learning_rate": 0.0002, "epoch": 6.065203357004519, "step": 18790}, {"loss": 0.3885, "grad_norm": 1.0391879081726074, "learning_rate": 0.0002, "epoch": 6.068431245965138, "step": 18800}, {"loss": 0.409, "grad_norm": 1.0910760164260864, "learning_rate": 0.0002, "epoch": 6.071659134925759, "step": 18810}, {"loss": 0.3905, "grad_norm": 1.280098557472229, "learning_rate": 0.0002, "epoch": 6.074887023886379, "step": 18820}, {"loss": 0.3892, "grad_norm": 1.2102673053741455, "learning_rate": 0.0002, "epoch": 6.078114912846998, "step": 18830}, {"loss": 0.3757, "grad_norm": 1.3735624551773071, "learning_rate": 0.0002, "epoch": 6.081342801807618, "step": 18840}, {"loss": 0.4057, "grad_norm": 1.039419412612915, "learning_rate": 0.0002, "epoch": 6.0845706907682375, "step": 18850}, {"loss": 0.4093, "grad_norm": 1.175872802734375, "learning_rate": 0.0002, "epoch": 6.087798579728857, "step": 18860}, {"loss": 0.3933, "grad_norm": 1.4287301301956177, "learning_rate": 0.0002, "epoch": 6.091026468689477, "step": 18870}, {"loss": 0.4029, "grad_norm": 1.110627293586731, "learning_rate": 0.0002, "epoch": 6.0942543576500965, "step": 18880}, {"loss": 0.4195, "grad_norm": 1.1495535373687744, "learning_rate": 0.0002, "epoch": 6.097482246610717, "step": 18890}, {"loss": 0.4022, "grad_norm": 0.9764134287834167, "learning_rate": 0.0002, "epoch": 6.100710135571337, "step": 18900}, {"loss": 0.4097, "grad_norm": 1.0792596340179443, "learning_rate": 0.0002, "epoch": 6.103938024531956, "step": 18910}, {"loss": 0.402, "grad_norm": 1.2520235776901245, "learning_rate": 0.0002, "epoch": 6.107165913492576, "step": 18920}, {"loss": 0.4091, "grad_norm": 0.857008695602417, "learning_rate": 0.0002, "epoch": 6.110393802453196, "step": 18930}, {"loss": 0.4046, "grad_norm": 1.745723009109497, "learning_rate": 0.0002, "epoch": 6.113621691413815, "step": 18940}, {"loss": 0.4245, "grad_norm": 1.099941611289978, "learning_rate": 0.0002, "epoch": 6.116849580374435, "step": 18950}, {"loss": 0.3708, "grad_norm": 1.1402947902679443, "learning_rate": 0.0002, "epoch": 6.120077469335055, "step": 18960}, {"loss": 0.4022, "grad_norm": 1.0565131902694702, "learning_rate": 0.0002, "epoch": 6.123305358295674, "step": 18970}, {"loss": 0.3973, "grad_norm": 1.1511917114257812, "learning_rate": 0.0002, "epoch": 6.126533247256295, "step": 18980}, {"loss": 0.395, "grad_norm": 0.9029410481452942, "learning_rate": 0.0002, "epoch": 6.129761136216914, "step": 18990}, {"loss": 0.393, "grad_norm": 1.03252375125885, "learning_rate": 0.0002, "epoch": 6.132989025177534, "step": 19000}, {"loss": 0.3923, "grad_norm": 1.2058522701263428, "learning_rate": 0.0002, "epoch": 6.136216914138154, "step": 19010}, {"loss": 0.3963, "grad_norm": 1.2274953126907349, "learning_rate": 0.0002, "epoch": 6.139444803098773, "step": 19020}, {"loss": 0.3999, "grad_norm": 1.3196226358413696, "learning_rate": 0.0002, "epoch": 6.142672692059393, "step": 19030}, {"loss": 0.4176, "grad_norm": 0.8030686378479004, "learning_rate": 0.0002, "epoch": 6.145900581020013, "step": 19040}, {"loss": 0.3886, "grad_norm": 1.1762639284133911, "learning_rate": 0.0002, "epoch": 6.149128469980632, "step": 19050}, {"loss": 0.429, "grad_norm": 1.0247628688812256, "learning_rate": 0.0002, "epoch": 6.152356358941253, "step": 19060}, {"loss": 0.3876, "grad_norm": 0.99031662940979, "learning_rate": 0.0002, "epoch": 6.1555842479018725, "step": 19070}, {"loss": 0.3818, "grad_norm": 1.334445834159851, "learning_rate": 0.0002, "epoch": 6.158812136862492, "step": 19080}, {"loss": 0.4038, "grad_norm": 1.1160423755645752, "learning_rate": 0.0002, "epoch": 6.162040025823112, "step": 19090}, {"loss": 0.4081, "grad_norm": 1.2579560279846191, "learning_rate": 0.0002, "epoch": 6.1652679147837315, "step": 19100}, {"loss": 0.4092, "grad_norm": 0.9372721910476685, "learning_rate": 0.0002, "epoch": 6.168495803744351, "step": 19110}, {"loss": 0.3905, "grad_norm": 0.7995722889900208, "learning_rate": 0.0002, "epoch": 6.171723692704971, "step": 19120}, {"loss": 0.3896, "grad_norm": 1.0074360370635986, "learning_rate": 0.0002, "epoch": 6.17495158166559, "step": 19130}, {"loss": 0.4328, "grad_norm": 0.9821600914001465, "learning_rate": 0.0002, "epoch": 6.17817947062621, "step": 19140}, {"loss": 0.3845, "grad_norm": 1.1252691745758057, "learning_rate": 0.0002, "epoch": 6.181407359586831, "step": 19150}, {"loss": 0.3918, "grad_norm": 1.316981554031372, "learning_rate": 0.0002, "epoch": 6.18463524854745, "step": 19160}, {"loss": 0.3893, "grad_norm": 1.0131299495697021, "learning_rate": 0.0002, "epoch": 6.18786313750807, "step": 19170}, {"loss": 0.4111, "grad_norm": 1.3530288934707642, "learning_rate": 0.0002, "epoch": 6.19109102646869, "step": 19180}, {"loss": 0.416, "grad_norm": 1.148247480392456, "learning_rate": 0.0002, "epoch": 6.194318915429309, "step": 19190}, {"loss": 0.4191, "grad_norm": 1.5510036945343018, "learning_rate": 0.0002, "epoch": 6.197546804389929, "step": 19200}, {"loss": 0.423, "grad_norm": 1.3048018217086792, "learning_rate": 0.0002, "epoch": 6.2007746933505485, "step": 19210}, {"loss": 0.397, "grad_norm": 1.186187982559204, "learning_rate": 0.0002, "epoch": 6.204002582311168, "step": 19220}, {"loss": 0.4164, "grad_norm": 1.5199471712112427, "learning_rate": 0.0002, "epoch": 6.207230471271788, "step": 19230}, {"loss": 0.4322, "grad_norm": 1.1311423778533936, "learning_rate": 0.0002, "epoch": 6.210458360232408, "step": 19240}, {"loss": 0.4086, "grad_norm": 1.2345898151397705, "learning_rate": 0.0002, "epoch": 6.213686249193028, "step": 19250}, {"loss": 0.4122, "grad_norm": 1.0261863470077515, "learning_rate": 0.0002, "epoch": 6.216914138153648, "step": 19260}, {"loss": 0.4315, "grad_norm": 0.8985416293144226, "learning_rate": 0.0002, "epoch": 6.220142027114267, "step": 19270}, {"loss": 0.4052, "grad_norm": 1.3136980533599854, "learning_rate": 0.0002, "epoch": 6.223369916074887, "step": 19280}, {"loss": 0.4232, "grad_norm": 1.1949185132980347, "learning_rate": 0.0002, "epoch": 6.226597805035507, "step": 19290}, {"loss": 0.4255, "grad_norm": 0.9668909907341003, "learning_rate": 0.0002, "epoch": 6.229825693996126, "step": 19300}, {"loss": 0.3917, "grad_norm": 0.8858964443206787, "learning_rate": 0.0002, "epoch": 6.233053582956746, "step": 19310}, {"loss": 0.4087, "grad_norm": 1.4254822731018066, "learning_rate": 0.0002, "epoch": 6.236281471917366, "step": 19320}, {"loss": 0.426, "grad_norm": 1.0455392599105835, "learning_rate": 0.0002, "epoch": 6.239509360877986, "step": 19330}, {"loss": 0.3894, "grad_norm": 1.1690824031829834, "learning_rate": 0.0002, "epoch": 6.242737249838606, "step": 19340}, {"loss": 0.3777, "grad_norm": 1.0347497463226318, "learning_rate": 0.0002, "epoch": 6.245965138799225, "step": 19350}, {"loss": 0.3972, "grad_norm": 1.0790464878082275, "learning_rate": 0.0002, "epoch": 6.249193027759845, "step": 19360}, {"loss": 0.4393, "grad_norm": 1.1294453144073486, "learning_rate": 0.0002, "epoch": 6.252420916720465, "step": 19370}, {"loss": 0.4055, "grad_norm": 1.5094330310821533, "learning_rate": 0.0002, "epoch": 6.255648805681084, "step": 19380}, {"loss": 0.4228, "grad_norm": 1.1122944355010986, "learning_rate": 0.0002, "epoch": 6.258876694641704, "step": 19390}, {"loss": 0.4341, "grad_norm": 1.3123422861099243, "learning_rate": 0.0002, "epoch": 6.262104583602324, "step": 19400}, {"loss": 0.4206, "grad_norm": 1.0585907697677612, "learning_rate": 0.0002, "epoch": 6.265332472562944, "step": 19410}, {"loss": 0.4001, "grad_norm": 0.8711239099502563, "learning_rate": 0.0002, "epoch": 6.268560361523564, "step": 19420}, {"loss": 0.4201, "grad_norm": 1.2772116661071777, "learning_rate": 0.0002, "epoch": 6.2717882504841835, "step": 19430}, {"loss": 0.4298, "grad_norm": 1.0035508871078491, "learning_rate": 0.0002, "epoch": 6.275016139444803, "step": 19440}, {"loss": 0.4234, "grad_norm": 0.7933974862098694, "learning_rate": 0.0002, "epoch": 6.278244028405423, "step": 19450}, {"loss": 0.4144, "grad_norm": 1.2455826997756958, "learning_rate": 0.0002, "epoch": 6.2814719173660425, "step": 19460}, {"loss": 0.4171, "grad_norm": 1.2735545635223389, "learning_rate": 0.0002, "epoch": 6.284699806326662, "step": 19470}, {"loss": 0.3956, "grad_norm": 0.9773174524307251, "learning_rate": 0.0002, "epoch": 6.287927695287282, "step": 19480}, {"loss": 0.4264, "grad_norm": 1.2341974973678589, "learning_rate": 0.0002, "epoch": 6.2911555842479014, "step": 19490}, {"loss": 0.4068, "grad_norm": 1.286138653755188, "learning_rate": 0.0002, "epoch": 6.294383473208522, "step": 19500}, {"loss": 0.439, "grad_norm": 1.052889108657837, "learning_rate": 0.0002, "epoch": 6.297611362169142, "step": 19510}, {"loss": 0.4199, "grad_norm": 1.1955385208129883, "learning_rate": 0.0002, "epoch": 6.300839251129761, "step": 19520}, {"loss": 0.4242, "grad_norm": 1.2792452573776245, "learning_rate": 0.0002, "epoch": 6.304067140090381, "step": 19530}, {"loss": 0.3989, "grad_norm": 0.9077931046485901, "learning_rate": 0.0002, "epoch": 6.307295029051001, "step": 19540}, {"loss": 0.388, "grad_norm": 1.2492976188659668, "learning_rate": 0.0002, "epoch": 6.31052291801162, "step": 19550}, {"loss": 0.3828, "grad_norm": 1.1097182035446167, "learning_rate": 0.0002, "epoch": 6.31375080697224, "step": 19560}, {"loss": 0.4482, "grad_norm": 1.271609902381897, "learning_rate": 0.0002, "epoch": 6.3169786959328595, "step": 19570}, {"loss": 0.3851, "grad_norm": 1.4262897968292236, "learning_rate": 0.0002, "epoch": 6.32020658489348, "step": 19580}, {"loss": 0.4133, "grad_norm": 1.057338833808899, "learning_rate": 0.0002, "epoch": 6.3234344738541, "step": 19590}, {"loss": 0.4366, "grad_norm": 1.323028326034546, "learning_rate": 0.0002, "epoch": 6.326662362814719, "step": 19600}, {"loss": 0.4186, "grad_norm": 1.0991673469543457, "learning_rate": 0.0002, "epoch": 6.329890251775339, "step": 19610}, {"loss": 0.4132, "grad_norm": 1.1600234508514404, "learning_rate": 0.0002, "epoch": 6.333118140735959, "step": 19620}, {"loss": 0.4689, "grad_norm": 1.2986212968826294, "learning_rate": 0.0002, "epoch": 6.336346029696578, "step": 19630}, {"loss": 0.3914, "grad_norm": 1.2117934226989746, "learning_rate": 0.0002, "epoch": 6.339573918657198, "step": 19640}, {"loss": 0.3939, "grad_norm": 0.9747948050498962, "learning_rate": 0.0002, "epoch": 6.342801807617818, "step": 19650}, {"loss": 0.4517, "grad_norm": 1.2380492687225342, "learning_rate": 0.0002, "epoch": 6.346029696578437, "step": 19660}, {"loss": 0.4344, "grad_norm": 1.2475087642669678, "learning_rate": 0.0002, "epoch": 6.349257585539058, "step": 19670}, {"loss": 0.4253, "grad_norm": 1.022084355354309, "learning_rate": 0.0002, "epoch": 6.3524854744996775, "step": 19680}, {"loss": 0.4227, "grad_norm": 1.2422059774398804, "learning_rate": 0.0002, "epoch": 6.355713363460297, "step": 19690}, {"loss": 0.4205, "grad_norm": 1.5015275478363037, "learning_rate": 0.0002, "epoch": 6.358941252420917, "step": 19700}, {"loss": 0.414, "grad_norm": 1.068727970123291, "learning_rate": 0.0002, "epoch": 6.362169141381536, "step": 19710}, {"loss": 0.4054, "grad_norm": 1.3718897104263306, "learning_rate": 0.0002, "epoch": 6.365397030342156, "step": 19720}, {"loss": 0.4399, "grad_norm": 1.3437764644622803, "learning_rate": 0.0002, "epoch": 6.368624919302776, "step": 19730}, {"loss": 0.4187, "grad_norm": 0.9128499031066895, "learning_rate": 0.0002, "epoch": 6.371852808263395, "step": 19740}, {"loss": 0.4346, "grad_norm": 1.0678889751434326, "learning_rate": 0.0002, "epoch": 6.375080697224016, "step": 19750}, {"loss": 0.4103, "grad_norm": 1.0432878732681274, "learning_rate": 0.0002, "epoch": 6.378308586184636, "step": 19760}, {"loss": 0.4304, "grad_norm": 1.4033927917480469, "learning_rate": 0.0002, "epoch": 6.381536475145255, "step": 19770}, {"loss": 0.4225, "grad_norm": 1.2773922681808472, "learning_rate": 0.0002, "epoch": 6.384764364105875, "step": 19780}, {"loss": 0.4246, "grad_norm": 1.257847547531128, "learning_rate": 0.0002, "epoch": 6.3879922530664945, "step": 19790}, {"loss": 0.4261, "grad_norm": 0.8424118757247925, "learning_rate": 0.0002, "epoch": 6.391220142027114, "step": 19800}, {"loss": 0.4145, "grad_norm": 1.3387986421585083, "learning_rate": 0.0002, "epoch": 6.394448030987734, "step": 19810}, {"loss": 0.4268, "grad_norm": 1.1277328729629517, "learning_rate": 0.0002, "epoch": 6.3976759199483535, "step": 19820}, {"loss": 0.4213, "grad_norm": 1.264283537864685, "learning_rate": 0.0002, "epoch": 6.400903808908973, "step": 19830}, {"loss": 0.4506, "grad_norm": 1.1770991086959839, "learning_rate": 0.0002, "epoch": 6.404131697869594, "step": 19840}, {"loss": 0.4385, "grad_norm": 0.9695967435836792, "learning_rate": 0.0002, "epoch": 6.407359586830213, "step": 19850}, {"loss": 0.4258, "grad_norm": 1.3394994735717773, "learning_rate": 0.0002, "epoch": 6.410587475790833, "step": 19860}, {"loss": 0.4017, "grad_norm": 1.0515536069869995, "learning_rate": 0.0002, "epoch": 6.413815364751453, "step": 19870}, {"loss": 0.4555, "grad_norm": 1.3238868713378906, "learning_rate": 0.0002, "epoch": 6.417043253712072, "step": 19880}, {"loss": 0.4385, "grad_norm": 1.0801814794540405, "learning_rate": 0.0002, "epoch": 6.420271142672692, "step": 19890}, {"loss": 0.4135, "grad_norm": 1.1391135454177856, "learning_rate": 0.0002, "epoch": 6.423499031633312, "step": 19900}, {"loss": 0.4376, "grad_norm": 1.13046133518219, "learning_rate": 0.0002, "epoch": 6.426726920593931, "step": 19910}, {"loss": 0.4251, "grad_norm": 1.1657520532608032, "learning_rate": 0.0002, "epoch": 6.429954809554552, "step": 19920}, {"loss": 0.3951, "grad_norm": 1.3315341472625732, "learning_rate": 0.0002, "epoch": 6.433182698515171, "step": 19930}, {"loss": 0.4254, "grad_norm": 1.1806831359863281, "learning_rate": 0.0002, "epoch": 6.436410587475791, "step": 19940}, {"loss": 0.3988, "grad_norm": 1.1581867933273315, "learning_rate": 0.0002, "epoch": 6.439638476436411, "step": 19950}, {"loss": 0.4194, "grad_norm": 1.2601206302642822, "learning_rate": 0.0002, "epoch": 6.44286636539703, "step": 19960}, {"loss": 0.4505, "grad_norm": 1.1163229942321777, "learning_rate": 0.0002, "epoch": 6.44609425435765, "step": 19970}, {"loss": 0.4295, "grad_norm": 0.9959462285041809, "learning_rate": 0.0002, "epoch": 6.44932214331827, "step": 19980}, {"loss": 0.421, "grad_norm": 1.1213586330413818, "learning_rate": 0.0002, "epoch": 6.452550032278889, "step": 19990}, {"loss": 0.4354, "grad_norm": 1.1345361471176147, "learning_rate": 0.0002, "epoch": 6.455777921239509, "step": 20000}, {"loss": 0.429, "grad_norm": 1.245871901512146, "learning_rate": 0.0002, "epoch": 6.459005810200129, "step": 20010}, {"loss": 0.4395, "grad_norm": 1.0894919633865356, "learning_rate": 0.0002, "epoch": 6.462233699160749, "step": 20020}, {"loss": 0.4365, "grad_norm": 1.030206322669983, "learning_rate": 0.0002, "epoch": 6.465461588121369, "step": 20030}, {"loss": 0.4225, "grad_norm": 1.262133002281189, "learning_rate": 0.0002, "epoch": 6.4686894770819885, "step": 20040}, {"loss": 0.4301, "grad_norm": 1.167641043663025, "learning_rate": 0.0002, "epoch": 6.471917366042608, "step": 20050}, {"loss": 0.4438, "grad_norm": 1.1125705242156982, "learning_rate": 0.0002, "epoch": 6.475145255003228, "step": 20060}, {"loss": 0.4205, "grad_norm": 1.3777440786361694, "learning_rate": 0.0002, "epoch": 6.4783731439638474, "step": 20070}, {"loss": 0.424, "grad_norm": 1.1771081686019897, "learning_rate": 0.0002, "epoch": 6.481601032924467, "step": 20080}, {"loss": 0.4187, "grad_norm": 1.0414351224899292, "learning_rate": 0.0002, "epoch": 6.484828921885087, "step": 20090}, {"loss": 0.4419, "grad_norm": 1.2103244066238403, "learning_rate": 0.0002, "epoch": 6.488056810845707, "step": 20100}, {"loss": 0.4502, "grad_norm": 1.4153836965560913, "learning_rate": 0.0002, "epoch": 6.491284699806327, "step": 20110}, {"loss": 0.4524, "grad_norm": 1.2718676328659058, "learning_rate": 0.0002, "epoch": 6.494512588766947, "step": 20120}, {"loss": 0.4546, "grad_norm": 1.1040351390838623, "learning_rate": 0.0002, "epoch": 6.497740477727566, "step": 20130}, {"loss": 0.4105, "grad_norm": 0.9804210662841797, "learning_rate": 0.0002, "epoch": 6.500968366688186, "step": 20140}, {"loss": 0.4165, "grad_norm": 1.028836965560913, "learning_rate": 0.0002, "epoch": 6.5041962556488055, "step": 20150}, {"loss": 0.4106, "grad_norm": 1.1773076057434082, "learning_rate": 0.0002, "epoch": 6.507424144609425, "step": 20160}, {"loss": 0.4364, "grad_norm": 0.8597512245178223, "learning_rate": 0.0002, "epoch": 6.510652033570045, "step": 20170}, {"loss": 0.4346, "grad_norm": 1.4290635585784912, "learning_rate": 0.0002, "epoch": 6.5138799225306645, "step": 20180}, {"loss": 0.4057, "grad_norm": 0.9842908382415771, "learning_rate": 0.0002, "epoch": 6.517107811491285, "step": 20190}, {"loss": 0.4562, "grad_norm": 1.0254372358322144, "learning_rate": 0.0002, "epoch": 6.520335700451905, "step": 20200}, {"loss": 0.433, "grad_norm": 1.1869125366210938, "learning_rate": 0.0002, "epoch": 6.523563589412524, "step": 20210}, {"loss": 0.4247, "grad_norm": 1.0994106531143188, "learning_rate": 0.0002, "epoch": 6.526791478373144, "step": 20220}, {"loss": 0.416, "grad_norm": 1.03111732006073, "learning_rate": 0.0002, "epoch": 6.530019367333764, "step": 20230}, {"loss": 0.4202, "grad_norm": 1.5421077013015747, "learning_rate": 0.0002, "epoch": 6.533247256294383, "step": 20240}, {"loss": 0.4309, "grad_norm": 1.4383527040481567, "learning_rate": 0.0002, "epoch": 6.536475145255003, "step": 20250}, {"loss": 0.4086, "grad_norm": 1.0252864360809326, "learning_rate": 0.0002, "epoch": 6.539703034215623, "step": 20260}, {"loss": 0.4391, "grad_norm": 1.2504689693450928, "learning_rate": 0.0002, "epoch": 6.542930923176243, "step": 20270}, {"loss": 0.4294, "grad_norm": 1.2130976915359497, "learning_rate": 0.0002, "epoch": 6.546158812136863, "step": 20280}, {"loss": 0.4432, "grad_norm": 1.1186957359313965, "learning_rate": 0.0002, "epoch": 6.549386701097482, "step": 20290}, {"loss": 0.4225, "grad_norm": 1.0373939275741577, "learning_rate": 0.0002, "epoch": 6.552614590058102, "step": 20300}, {"loss": 0.3874, "grad_norm": 0.9950923323631287, "learning_rate": 0.0002, "epoch": 6.555842479018722, "step": 20310}, {"loss": 0.4257, "grad_norm": 1.1479439735412598, "learning_rate": 0.0002, "epoch": 6.559070367979341, "step": 20320}, {"loss": 0.4418, "grad_norm": 1.2426027059555054, "learning_rate": 0.0002, "epoch": 6.562298256939961, "step": 20330}, {"loss": 0.4274, "grad_norm": 1.3021808862686157, "learning_rate": 0.0002, "epoch": 6.565526145900581, "step": 20340}, {"loss": 0.4423, "grad_norm": 1.203259825706482, "learning_rate": 0.0002, "epoch": 6.5687540348612, "step": 20350}, {"loss": 0.4568, "grad_norm": 2.1131186485290527, "learning_rate": 0.0002, "epoch": 6.571981923821821, "step": 20360}, {"loss": 0.4272, "grad_norm": 1.1588627099990845, "learning_rate": 0.0002, "epoch": 6.5752098127824405, "step": 20370}, {"loss": 0.4727, "grad_norm": 1.0151054859161377, "learning_rate": 0.0002, "epoch": 6.57843770174306, "step": 20380}, {"loss": 0.4592, "grad_norm": 1.323155403137207, "learning_rate": 0.0002, "epoch": 6.58166559070368, "step": 20390}, {"loss": 0.4075, "grad_norm": 1.0907572507858276, "learning_rate": 0.0002, "epoch": 6.5848934796642995, "step": 20400}, {"loss": 0.4127, "grad_norm": 1.2375017404556274, "learning_rate": 0.0002, "epoch": 6.588121368624919, "step": 20410}, {"loss": 0.4483, "grad_norm": 1.0491245985031128, "learning_rate": 0.0002, "epoch": 6.591349257585539, "step": 20420}, {"loss": 0.4476, "grad_norm": 1.50575852394104, "learning_rate": 0.0002, "epoch": 6.5945771465461585, "step": 20430}, {"loss": 0.4235, "grad_norm": 0.9893020987510681, "learning_rate": 0.0002, "epoch": 6.597805035506779, "step": 20440}, {"loss": 0.4384, "grad_norm": 1.258591651916504, "learning_rate": 0.0002, "epoch": 6.601032924467399, "step": 20450}, {"loss": 0.4458, "grad_norm": 1.3949081897735596, "learning_rate": 0.0002, "epoch": 6.604260813428018, "step": 20460}, {"loss": 0.3885, "grad_norm": 1.152513861656189, "learning_rate": 0.0002, "epoch": 6.607488702388638, "step": 20470}, {"loss": 0.4257, "grad_norm": 1.218362808227539, "learning_rate": 0.0002, "epoch": 6.610716591349258, "step": 20480}, {"loss": 0.4448, "grad_norm": 1.3538687229156494, "learning_rate": 0.0002, "epoch": 6.613944480309877, "step": 20490}, {"loss": 0.4348, "grad_norm": 1.2896782159805298, "learning_rate": 0.0002, "epoch": 6.617172369270497, "step": 20500}, {"loss": 0.4287, "grad_norm": 1.0762150287628174, "learning_rate": 0.0002, "epoch": 6.6204002582311166, "step": 20510}, {"loss": 0.4529, "grad_norm": 1.1561447381973267, "learning_rate": 0.0002, "epoch": 6.623628147191736, "step": 20520}, {"loss": 0.4017, "grad_norm": 1.0553218126296997, "learning_rate": 0.0002, "epoch": 6.626856036152357, "step": 20530}, {"loss": 0.4321, "grad_norm": 1.1378765106201172, "learning_rate": 0.0002, "epoch": 6.630083925112976, "step": 20540}, {"loss": 0.4351, "grad_norm": 1.2299952507019043, "learning_rate": 0.0002, "epoch": 6.633311814073596, "step": 20550}, {"loss": 0.4406, "grad_norm": 1.4158518314361572, "learning_rate": 0.0002, "epoch": 6.636539703034216, "step": 20560}, {"loss": 0.4334, "grad_norm": 1.058830738067627, "learning_rate": 0.0002, "epoch": 6.639767591994835, "step": 20570}, {"loss": 0.4248, "grad_norm": 1.1069598197937012, "learning_rate": 0.0002, "epoch": 6.642995480955455, "step": 20580}, {"loss": 0.4651, "grad_norm": 1.3859037160873413, "learning_rate": 0.0002, "epoch": 6.646223369916075, "step": 20590}, {"loss": 0.4324, "grad_norm": 1.300588607788086, "learning_rate": 0.0002, "epoch": 6.649451258876694, "step": 20600}, {"loss": 0.4581, "grad_norm": 1.3861193656921387, "learning_rate": 0.0002, "epoch": 6.652679147837315, "step": 20610}, {"loss": 0.4198, "grad_norm": 1.2356518507003784, "learning_rate": 0.0002, "epoch": 6.6559070367979345, "step": 20620}, {"loss": 0.4578, "grad_norm": 1.1698070764541626, "learning_rate": 0.0002, "epoch": 6.659134925758554, "step": 20630}, {"loss": 0.4513, "grad_norm": 1.270707607269287, "learning_rate": 0.0002, "epoch": 6.662362814719174, "step": 20640}, {"loss": 0.4552, "grad_norm": 0.984618067741394, "learning_rate": 0.0002, "epoch": 6.6655907036797934, "step": 20650}, {"loss": 0.4648, "grad_norm": 1.2335834503173828, "learning_rate": 0.0002, "epoch": 6.668818592640413, "step": 20660}, {"loss": 0.4541, "grad_norm": 0.9497392773628235, "learning_rate": 0.0002, "epoch": 6.672046481601033, "step": 20670}, {"loss": 0.4176, "grad_norm": 1.011144757270813, "learning_rate": 0.0002, "epoch": 6.675274370561652, "step": 20680}, {"loss": 0.4424, "grad_norm": 1.1605948209762573, "learning_rate": 0.0002, "epoch": 6.678502259522272, "step": 20690}, {"loss": 0.4613, "grad_norm": 1.2136812210083008, "learning_rate": 0.0002, "epoch": 6.681730148482892, "step": 20700}, {"loss": 0.4287, "grad_norm": 1.0823525190353394, "learning_rate": 0.0002, "epoch": 6.684958037443512, "step": 20710}, {"loss": 0.4307, "grad_norm": 1.1929140090942383, "learning_rate": 0.0002, "epoch": 6.688185926404132, "step": 20720}, {"loss": 0.4453, "grad_norm": 1.2468219995498657, "learning_rate": 0.0002, "epoch": 6.6914138153647515, "step": 20730}, {"loss": 0.4262, "grad_norm": 1.2653573751449585, "learning_rate": 0.0002, "epoch": 6.694641704325371, "step": 20740}, {"loss": 0.4716, "grad_norm": 1.2253094911575317, "learning_rate": 0.0002, "epoch": 6.697869593285991, "step": 20750}, {"loss": 0.4462, "grad_norm": 1.103179931640625, "learning_rate": 0.0002, "epoch": 6.7010974822466105, "step": 20760}, {"loss": 0.4179, "grad_norm": 0.9180657863616943, "learning_rate": 0.0002, "epoch": 6.70432537120723, "step": 20770}, {"loss": 0.4712, "grad_norm": 1.1830929517745972, "learning_rate": 0.0002, "epoch": 6.707553260167851, "step": 20780}, {"loss": 0.4304, "grad_norm": 1.1052136421203613, "learning_rate": 0.0002, "epoch": 6.71078114912847, "step": 20790}, {"loss": 0.436, "grad_norm": 1.1268569231033325, "learning_rate": 0.0002, "epoch": 6.71400903808909, "step": 20800}, {"loss": 0.4109, "grad_norm": 1.0753320455551147, "learning_rate": 0.0002, "epoch": 6.71723692704971, "step": 20810}, {"loss": 0.4471, "grad_norm": 1.1100133657455444, "learning_rate": 0.0002, "epoch": 6.720464816010329, "step": 20820}, {"loss": 0.447, "grad_norm": 0.7498472929000854, "learning_rate": 0.0002, "epoch": 6.723692704970949, "step": 20830}, {"loss": 0.4182, "grad_norm": 1.1006664037704468, "learning_rate": 0.0002, "epoch": 6.726920593931569, "step": 20840}, {"loss": 0.4348, "grad_norm": 1.4599690437316895, "learning_rate": 0.0002, "epoch": 6.730148482892188, "step": 20850}, {"loss": 0.4596, "grad_norm": 1.324700951576233, "learning_rate": 0.0002, "epoch": 6.733376371852808, "step": 20860}, {"loss": 0.4373, "grad_norm": 1.1128668785095215, "learning_rate": 0.0002, "epoch": 6.736604260813428, "step": 20870}, {"loss": 0.4267, "grad_norm": 1.0438026189804077, "learning_rate": 0.0002, "epoch": 6.739832149774048, "step": 20880}, {"loss": 0.4366, "grad_norm": 1.1934672594070435, "learning_rate": 0.0002, "epoch": 6.743060038734668, "step": 20890}, {"loss": 0.4264, "grad_norm": 1.2108192443847656, "learning_rate": 0.0002, "epoch": 6.746287927695287, "step": 20900}, {"loss": 0.4327, "grad_norm": 1.1514620780944824, "learning_rate": 0.0002, "epoch": 6.749515816655907, "step": 20910}, {"loss": 0.4774, "grad_norm": 1.1723405122756958, "learning_rate": 0.0002, "epoch": 6.752743705616527, "step": 20920}, {"loss": 0.4458, "grad_norm": 1.1136211156845093, "learning_rate": 0.0002, "epoch": 6.755971594577146, "step": 20930}, {"loss": 0.4363, "grad_norm": 1.297601342201233, "learning_rate": 0.0002, "epoch": 6.759199483537766, "step": 20940}, {"loss": 0.4389, "grad_norm": 1.139397144317627, "learning_rate": 0.0002, "epoch": 6.7624273724983865, "step": 20950}, {"loss": 0.4344, "grad_norm": 1.2873362302780151, "learning_rate": 0.0002, "epoch": 6.765655261459006, "step": 20960}, {"loss": 0.4204, "grad_norm": 1.1499544382095337, "learning_rate": 0.0002, "epoch": 6.768883150419626, "step": 20970}, {"loss": 0.4279, "grad_norm": 1.3687032461166382, "learning_rate": 0.0002, "epoch": 6.7721110393802455, "step": 20980}, {"loss": 0.4621, "grad_norm": 1.2877939939498901, "learning_rate": 0.0002, "epoch": 6.775338928340865, "step": 20990}, {"loss": 0.4629, "grad_norm": 1.232993483543396, "learning_rate": 0.0002, "epoch": 6.778566817301485, "step": 21000}, {"loss": 0.4697, "grad_norm": 1.1765092611312866, "learning_rate": 0.0002, "epoch": 6.7817947062621045, "step": 21010}, {"loss": 0.431, "grad_norm": 1.4695899486541748, "learning_rate": 0.0002, "epoch": 6.785022595222724, "step": 21020}, {"loss": 0.4348, "grad_norm": 1.2325087785720825, "learning_rate": 0.0002, "epoch": 6.788250484183344, "step": 21030}, {"loss": 0.4595, "grad_norm": 1.3475068807601929, "learning_rate": 0.0002, "epoch": 6.791478373143963, "step": 21040}, {"loss": 0.4555, "grad_norm": 1.5654256343841553, "learning_rate": 0.0002, "epoch": 6.794706262104584, "step": 21050}, {"loss": 0.4672, "grad_norm": 1.4210680723190308, "learning_rate": 0.0002, "epoch": 6.797934151065204, "step": 21060}, {"loss": 0.4491, "grad_norm": 1.167878270149231, "learning_rate": 0.0002, "epoch": 6.801162040025823, "step": 21070}, {"loss": 0.4524, "grad_norm": 1.1643486022949219, "learning_rate": 0.0002, "epoch": 6.804389928986443, "step": 21080}, {"loss": 0.4467, "grad_norm": 1.1976310014724731, "learning_rate": 0.0002, "epoch": 6.8076178179470626, "step": 21090}, {"loss": 0.4449, "grad_norm": 1.1392749547958374, "learning_rate": 0.0002, "epoch": 6.810845706907682, "step": 21100}, {"loss": 0.4567, "grad_norm": 1.2456704378128052, "learning_rate": 0.0002, "epoch": 6.814073595868302, "step": 21110}, {"loss": 0.4271, "grad_norm": 1.0030150413513184, "learning_rate": 0.0002, "epoch": 6.8173014848289215, "step": 21120}, {"loss": 0.4258, "grad_norm": 1.4715943336486816, "learning_rate": 0.0002, "epoch": 6.820529373789542, "step": 21130}, {"loss": 0.4615, "grad_norm": 1.1307374238967896, "learning_rate": 0.0002, "epoch": 6.823757262750162, "step": 21140}, {"loss": 0.4643, "grad_norm": 1.37498140335083, "learning_rate": 0.0002, "epoch": 6.826985151710781, "step": 21150}, {"loss": 0.4447, "grad_norm": 1.2791364192962646, "learning_rate": 0.0002, "epoch": 6.830213040671401, "step": 21160}, {"loss": 0.4778, "grad_norm": 1.0518016815185547, "learning_rate": 0.0002, "epoch": 6.833440929632021, "step": 21170}, {"loss": 0.448, "grad_norm": 1.1237729787826538, "learning_rate": 0.0002, "epoch": 6.83666881859264, "step": 21180}, {"loss": 0.4299, "grad_norm": 1.0360032320022583, "learning_rate": 0.0002, "epoch": 6.83989670755326, "step": 21190}, {"loss": 0.4336, "grad_norm": 0.8733281493186951, "learning_rate": 0.0002, "epoch": 6.84312459651388, "step": 21200}, {"loss": 0.4495, "grad_norm": 1.3178322315216064, "learning_rate": 0.0002, "epoch": 6.846352485474499, "step": 21210}, {"loss": 0.4548, "grad_norm": 1.0884978771209717, "learning_rate": 0.0002, "epoch": 6.84958037443512, "step": 21220}, {"loss": 0.4543, "grad_norm": 1.213229775428772, "learning_rate": 0.0002, "epoch": 6.8528082633957395, "step": 21230}, {"loss": 0.4628, "grad_norm": 1.0828464031219482, "learning_rate": 0.0002, "epoch": 6.856036152356359, "step": 21240}, {"loss": 0.4353, "grad_norm": 1.2298113107681274, "learning_rate": 0.0002, "epoch": 6.859264041316979, "step": 21250}, {"loss": 0.4088, "grad_norm": 1.4773930311203003, "learning_rate": 0.0002, "epoch": 6.862491930277598, "step": 21260}, {"loss": 0.4529, "grad_norm": 0.992661714553833, "learning_rate": 0.0002, "epoch": 6.865719819238218, "step": 21270}, {"loss": 0.474, "grad_norm": 1.25167715549469, "learning_rate": 0.0002, "epoch": 6.868947708198838, "step": 21280}, {"loss": 0.4466, "grad_norm": 1.1554399728775024, "learning_rate": 0.0002, "epoch": 6.872175597159457, "step": 21290}, {"loss": 0.4375, "grad_norm": 1.2587701082229614, "learning_rate": 0.0002, "epoch": 6.875403486120078, "step": 21300}, {"loss": 0.4507, "grad_norm": 1.392392635345459, "learning_rate": 0.0002, "epoch": 6.8786313750806976, "step": 21310}, {"loss": 0.4432, "grad_norm": 1.2159595489501953, "learning_rate": 0.0002, "epoch": 6.881859264041317, "step": 21320}, {"loss": 0.4255, "grad_norm": 1.3811182975769043, "learning_rate": 0.0002, "epoch": 6.885087153001937, "step": 21330}, {"loss": 0.4437, "grad_norm": 1.2652684450149536, "learning_rate": 0.0002, "epoch": 6.8883150419625565, "step": 21340}, {"loss": 0.4797, "grad_norm": 1.1906380653381348, "learning_rate": 0.0002, "epoch": 6.891542930923176, "step": 21350}, {"loss": 0.423, "grad_norm": 1.0525990724563599, "learning_rate": 0.0002, "epoch": 6.894770819883796, "step": 21360}, {"loss": 0.4414, "grad_norm": 0.910491406917572, "learning_rate": 0.0002, "epoch": 6.8979987088444155, "step": 21370}, {"loss": 0.4882, "grad_norm": 1.366865634918213, "learning_rate": 0.0002, "epoch": 6.901226597805035, "step": 21380}, {"loss": 0.4648, "grad_norm": 1.1270265579223633, "learning_rate": 0.0002, "epoch": 6.904454486765655, "step": 21390}, {"loss": 0.4529, "grad_norm": 1.1745691299438477, "learning_rate": 0.0002, "epoch": 6.907682375726275, "step": 21400}, {"loss": 0.4504, "grad_norm": 1.1036182641983032, "learning_rate": 0.0002, "epoch": 6.910910264686895, "step": 21410}, {"loss": 0.4612, "grad_norm": 1.0906540155410767, "learning_rate": 0.0002, "epoch": 6.914138153647515, "step": 21420}, {"loss": 0.4408, "grad_norm": 1.1176798343658447, "learning_rate": 0.0002, "epoch": 6.917366042608134, "step": 21430}, {"loss": 0.477, "grad_norm": 1.525869607925415, "learning_rate": 0.0002, "epoch": 6.920593931568754, "step": 21440}, {"loss": 0.4473, "grad_norm": 1.2466827630996704, "learning_rate": 0.0002, "epoch": 6.923821820529374, "step": 21450}, {"loss": 0.4256, "grad_norm": 1.0200796127319336, "learning_rate": 0.0002, "epoch": 6.927049709489993, "step": 21460}, {"loss": 0.4601, "grad_norm": 1.2133489847183228, "learning_rate": 0.0002, "epoch": 6.930277598450614, "step": 21470}, {"loss": 0.44, "grad_norm": 1.2100290060043335, "learning_rate": 0.0002, "epoch": 6.933505487411233, "step": 21480}, {"loss": 0.468, "grad_norm": 1.1833131313323975, "learning_rate": 0.0002, "epoch": 6.936733376371853, "step": 21490}, {"loss": 0.4529, "grad_norm": 1.2262470722198486, "learning_rate": 0.0002, "epoch": 6.939961265332473, "step": 21500}, {"loss": 0.4612, "grad_norm": 1.0496156215667725, "learning_rate": 0.0002, "epoch": 6.943189154293092, "step": 21510}, {"loss": 0.4417, "grad_norm": 1.050690770149231, "learning_rate": 0.0002, "epoch": 6.946417043253712, "step": 21520}, {"loss": 0.4813, "grad_norm": 1.2035698890686035, "learning_rate": 0.0002, "epoch": 6.949644932214332, "step": 21530}, {"loss": 0.4349, "grad_norm": 1.408007025718689, "learning_rate": 0.0002, "epoch": 6.952872821174951, "step": 21540}, {"loss": 0.4391, "grad_norm": 1.2247556447982788, "learning_rate": 0.0002, "epoch": 6.956100710135571, "step": 21550}, {"loss": 0.4526, "grad_norm": 1.1727497577667236, "learning_rate": 0.0002, "epoch": 6.959328599096191, "step": 21560}, {"loss": 0.4566, "grad_norm": 1.2948925495147705, "learning_rate": 0.0002, "epoch": 6.962556488056811, "step": 21570}, {"loss": 0.4672, "grad_norm": 1.3374950885772705, "learning_rate": 0.0002, "epoch": 6.965784377017431, "step": 21580}, {"loss": 0.4515, "grad_norm": 1.164650559425354, "learning_rate": 0.0002, "epoch": 6.9690122659780505, "step": 21590}, {"loss": 0.4704, "grad_norm": 1.2682108879089355, "learning_rate": 0.0002, "epoch": 6.97224015493867, "step": 21600}, {"loss": 0.4557, "grad_norm": 1.195971131324768, "learning_rate": 0.0002, "epoch": 6.97546804389929, "step": 21610}, {"loss": 0.4194, "grad_norm": 1.1988017559051514, "learning_rate": 0.0002, "epoch": 6.978695932859909, "step": 21620}, {"loss": 0.4524, "grad_norm": 1.0981930494308472, "learning_rate": 0.0002, "epoch": 6.981923821820529, "step": 21630}, {"loss": 0.4808, "grad_norm": 1.307260274887085, "learning_rate": 0.0002, "epoch": 6.98515171078115, "step": 21640}, {"loss": 0.4936, "grad_norm": 1.2798160314559937, "learning_rate": 0.0002, "epoch": 6.988379599741769, "step": 21650}, {"loss": 0.4615, "grad_norm": 1.0053848028182983, "learning_rate": 0.0002, "epoch": 6.991607488702389, "step": 21660}, {"loss": 0.4496, "grad_norm": 1.2257840633392334, "learning_rate": 0.0002, "epoch": 6.994835377663009, "step": 21670}, {"loss": 0.4449, "grad_norm": 1.3769378662109375, "learning_rate": 0.0002, "epoch": 6.998063266623628, "step": 21680}]} +{"epoch": 8.0, "step": 24784, "epoch_duration": 11700.881046772003, "total_accumulated_duration": 90951.26607847214, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13688.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-25595-sd-4/checkpoint-3098", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.593, "grad_norm": 0.7092075347900391, "learning_rate": 0.0002, "epoch": 0.0032278889606197547, "step": 10}, {"loss": 1.0956, "grad_norm": 0.6900479793548584, "learning_rate": 0.0002, "epoch": 0.006455777921239509, "step": 20}, {"loss": 0.9807, "grad_norm": 0.6788288950920105, "learning_rate": 0.0002, "epoch": 0.009683666881859263, "step": 30}, {"loss": 0.9385, "grad_norm": 0.5590243339538574, "learning_rate": 0.0002, "epoch": 0.012911555842479019, "step": 40}, {"loss": 0.931, "grad_norm": 0.5136010646820068, "learning_rate": 0.0002, "epoch": 0.016139444803098774, "step": 50}, {"loss": 0.8896, "grad_norm": 0.45298320055007935, "learning_rate": 0.0002, "epoch": 0.019367333763718526, "step": 60}, {"loss": 0.9184, "grad_norm": 0.5917162299156189, "learning_rate": 0.0002, "epoch": 0.022595222724338282, "step": 70}, {"loss": 0.8705, "grad_norm": 0.4414856433868408, "learning_rate": 0.0002, "epoch": 0.025823111684958037, "step": 80}, {"loss": 0.8419, "grad_norm": 0.5547978281974792, "learning_rate": 0.0002, "epoch": 0.029051000645577793, "step": 90}, {"loss": 0.8987, "grad_norm": 0.5271288156509399, "learning_rate": 0.0002, "epoch": 0.03227888960619755, "step": 100}, {"loss": 0.8543, "grad_norm": 0.5506119728088379, "learning_rate": 0.0002, "epoch": 0.035506778566817304, "step": 110}, {"loss": 0.8373, "grad_norm": 0.5579327940940857, "learning_rate": 0.0002, "epoch": 0.03873466752743705, "step": 120}, {"loss": 0.8826, "grad_norm": 0.5099632740020752, "learning_rate": 0.0002, "epoch": 0.04196255648805681, "step": 130}, {"loss": 0.9239, "grad_norm": 0.40396833419799805, "learning_rate": 0.0002, "epoch": 0.045190445448676564, "step": 140}, {"loss": 0.846, "grad_norm": 0.5008092522621155, "learning_rate": 0.0002, "epoch": 0.04841833440929632, "step": 150}, {"loss": 0.8564, "grad_norm": 0.4388776421546936, "learning_rate": 0.0002, "epoch": 0.051646223369916075, "step": 160}, {"loss": 0.8829, "grad_norm": 0.44138944149017334, "learning_rate": 0.0002, "epoch": 0.05487411233053583, "step": 170}, {"loss": 0.8061, "grad_norm": 0.358484148979187, "learning_rate": 0.0002, "epoch": 0.058102001291155586, "step": 180}, {"loss": 0.8956, "grad_norm": 0.457052081823349, "learning_rate": 0.0002, "epoch": 0.06132989025177534, "step": 190}, {"loss": 0.9138, "grad_norm": 0.5537622570991516, "learning_rate": 0.0002, "epoch": 0.0645577792123951, "step": 200}, {"loss": 0.8701, "grad_norm": 0.552631676197052, "learning_rate": 0.0002, "epoch": 0.06778566817301485, "step": 210}, {"loss": 0.8854, "grad_norm": 0.4414575397968292, "learning_rate": 0.0002, "epoch": 0.07101355713363461, "step": 220}, {"loss": 0.8581, "grad_norm": 0.4996664226055145, "learning_rate": 0.0002, "epoch": 0.07424144609425436, "step": 230}, {"loss": 0.8675, "grad_norm": 0.7321897149085999, "learning_rate": 0.0002, "epoch": 0.0774693350548741, "step": 240}, {"loss": 0.8848, "grad_norm": 0.4553901255130768, "learning_rate": 0.0002, "epoch": 0.08069722401549387, "step": 250}, {"loss": 0.868, "grad_norm": 0.5039054751396179, "learning_rate": 0.0002, "epoch": 0.08392511297611362, "step": 260}, {"loss": 0.8317, "grad_norm": 0.4113094210624695, "learning_rate": 0.0002, "epoch": 0.08715300193673338, "step": 270}, {"loss": 0.8074, "grad_norm": 0.450436532497406, "learning_rate": 0.0002, "epoch": 0.09038089089735313, "step": 280}, {"loss": 0.8105, "grad_norm": 0.4548024535179138, "learning_rate": 0.0002, "epoch": 0.09360877985797289, "step": 290}, {"loss": 0.8325, "grad_norm": 0.4932962656021118, "learning_rate": 0.0002, "epoch": 0.09683666881859264, "step": 300}, {"loss": 0.8105, "grad_norm": 0.4005250334739685, "learning_rate": 0.0002, "epoch": 0.1000645577792124, "step": 310}, {"loss": 0.8083, "grad_norm": 1.8321624994277954, "learning_rate": 0.0002, "epoch": 0.10329244673983215, "step": 320}, {"loss": 0.8411, "grad_norm": 0.45815610885620117, "learning_rate": 0.0002, "epoch": 0.1065203357004519, "step": 330}, {"loss": 0.857, "grad_norm": 0.39324095845222473, "learning_rate": 0.0002, "epoch": 0.10974822466107166, "step": 340}, {"loss": 0.8258, "grad_norm": 0.546273946762085, "learning_rate": 0.0002, "epoch": 0.11297611362169141, "step": 350}, {"loss": 0.882, "grad_norm": 0.497448593378067, "learning_rate": 0.0002, "epoch": 0.11620400258231117, "step": 360}, {"loss": 0.7608, "grad_norm": 0.37508800625801086, "learning_rate": 0.0002, "epoch": 0.11943189154293092, "step": 370}, {"loss": 0.852, "grad_norm": 0.45849609375, "learning_rate": 0.0002, "epoch": 0.12265978050355068, "step": 380}, {"loss": 0.8437, "grad_norm": 0.5488408803939819, "learning_rate": 0.0002, "epoch": 0.12588766946417043, "step": 390}, {"loss": 0.8349, "grad_norm": 0.4477061331272125, "learning_rate": 0.0002, "epoch": 0.1291155584247902, "step": 400}, {"loss": 0.8306, "grad_norm": 0.39227980375289917, "learning_rate": 0.0002, "epoch": 0.13234344738540993, "step": 410}, {"loss": 0.7933, "grad_norm": 0.3922233581542969, "learning_rate": 0.0002, "epoch": 0.1355713363460297, "step": 420}, {"loss": 0.8134, "grad_norm": 0.42901909351348877, "learning_rate": 0.0002, "epoch": 0.13879922530664945, "step": 430}, {"loss": 0.8271, "grad_norm": 0.4217798709869385, "learning_rate": 0.0002, "epoch": 0.14202711426726922, "step": 440}, {"loss": 0.8594, "grad_norm": 0.43470677733421326, "learning_rate": 0.0002, "epoch": 0.14525500322788895, "step": 450}, {"loss": 0.8106, "grad_norm": 0.5324403047561646, "learning_rate": 0.0002, "epoch": 0.1484828921885087, "step": 460}, {"loss": 0.8729, "grad_norm": 0.3999756872653961, "learning_rate": 0.0002, "epoch": 0.15171078114912848, "step": 470}, {"loss": 0.7702, "grad_norm": 0.404933363199234, "learning_rate": 0.0002, "epoch": 0.1549386701097482, "step": 480}, {"loss": 0.8151, "grad_norm": 0.44122636318206787, "learning_rate": 0.0002, "epoch": 0.15816655907036797, "step": 490}, {"loss": 0.8457, "grad_norm": 0.510166347026825, "learning_rate": 0.0002, "epoch": 0.16139444803098774, "step": 500}, {"loss": 0.8692, "grad_norm": 0.4549732506275177, "learning_rate": 0.0002, "epoch": 0.1646223369916075, "step": 510}, {"loss": 0.8466, "grad_norm": 0.5148182511329651, "learning_rate": 0.0002, "epoch": 0.16785022595222723, "step": 520}, {"loss": 0.8317, "grad_norm": 0.3596806824207306, "learning_rate": 0.0002, "epoch": 0.171078114912847, "step": 530}, {"loss": 0.844, "grad_norm": 0.4388909339904785, "learning_rate": 0.0002, "epoch": 0.17430600387346676, "step": 540}, {"loss": 0.8322, "grad_norm": 0.5052742958068848, "learning_rate": 0.0002, "epoch": 0.17753389283408652, "step": 550}, {"loss": 0.791, "grad_norm": 0.48248958587646484, "learning_rate": 0.0002, "epoch": 0.18076178179470626, "step": 560}, {"loss": 0.8593, "grad_norm": 0.5360197424888611, "learning_rate": 0.0002, "epoch": 0.18398967075532602, "step": 570}, {"loss": 0.817, "grad_norm": 0.43999341130256653, "learning_rate": 0.0002, "epoch": 0.18721755971594578, "step": 580}, {"loss": 0.8311, "grad_norm": 0.3685208261013031, "learning_rate": 0.0002, "epoch": 0.19044544867656552, "step": 590}, {"loss": 0.8341, "grad_norm": 0.4601275622844696, "learning_rate": 0.0002, "epoch": 0.19367333763718528, "step": 600}, {"loss": 0.8483, "grad_norm": 0.4778369665145874, "learning_rate": 0.0002, "epoch": 0.19690122659780504, "step": 610}, {"loss": 0.8653, "grad_norm": 0.4867003560066223, "learning_rate": 0.0002, "epoch": 0.2001291155584248, "step": 620}, {"loss": 0.8554, "grad_norm": 0.4583742916584015, "learning_rate": 0.0002, "epoch": 0.20335700451904454, "step": 630}, {"loss": 0.8698, "grad_norm": 0.47958165407180786, "learning_rate": 0.0002, "epoch": 0.2065848934796643, "step": 640}, {"loss": 0.8213, "grad_norm": 0.4526064097881317, "learning_rate": 0.0002, "epoch": 0.20981278244028406, "step": 650}, {"loss": 0.8313, "grad_norm": 0.45890581607818604, "learning_rate": 0.0002, "epoch": 0.2130406714009038, "step": 660}, {"loss": 0.8143, "grad_norm": 0.42725905776023865, "learning_rate": 0.0002, "epoch": 0.21626856036152356, "step": 670}, {"loss": 0.8675, "grad_norm": 0.40380963683128357, "learning_rate": 0.0002, "epoch": 0.21949644932214332, "step": 680}, {"loss": 0.9004, "grad_norm": 0.4372998774051666, "learning_rate": 0.0002, "epoch": 0.22272433828276308, "step": 690}, {"loss": 0.8208, "grad_norm": 0.4245864450931549, "learning_rate": 0.0002, "epoch": 0.22595222724338282, "step": 700}, {"loss": 0.8564, "grad_norm": 0.4061129689216614, "learning_rate": 0.0002, "epoch": 0.22918011620400258, "step": 710}, {"loss": 0.8275, "grad_norm": 0.474454790353775, "learning_rate": 0.0002, "epoch": 0.23240800516462234, "step": 720}, {"loss": 0.8346, "grad_norm": 0.4908486008644104, "learning_rate": 0.0002, "epoch": 0.23563589412524208, "step": 730}, {"loss": 0.8755, "grad_norm": 0.4284191429615021, "learning_rate": 0.0002, "epoch": 0.23886378308586184, "step": 740}, {"loss": 0.8387, "grad_norm": 0.44730308651924133, "learning_rate": 0.0002, "epoch": 0.2420916720464816, "step": 750}, {"loss": 0.8135, "grad_norm": 0.4433246850967407, "learning_rate": 0.0002, "epoch": 0.24531956100710137, "step": 760}, {"loss": 0.8644, "grad_norm": 0.43668854236602783, "learning_rate": 0.0002, "epoch": 0.2485474499677211, "step": 770}, {"loss": 0.8025, "grad_norm": 0.34324130415916443, "learning_rate": 0.0002, "epoch": 0.25177533892834086, "step": 780}, {"loss": 0.8725, "grad_norm": 0.46476295590400696, "learning_rate": 0.0002, "epoch": 0.2550032278889606, "step": 790}, {"loss": 0.8157, "grad_norm": 0.5047039985656738, "learning_rate": 0.0002, "epoch": 0.2582311168495804, "step": 800}, {"loss": 0.8643, "grad_norm": 0.4402127265930176, "learning_rate": 0.0002, "epoch": 0.26145900581020015, "step": 810}, {"loss": 0.8025, "grad_norm": 0.4642465114593506, "learning_rate": 0.0002, "epoch": 0.26468689477081986, "step": 820}, {"loss": 0.8836, "grad_norm": 0.40093424916267395, "learning_rate": 0.0002, "epoch": 0.2679147837314396, "step": 830}, {"loss": 0.83, "grad_norm": 0.42501842975616455, "learning_rate": 0.0002, "epoch": 0.2711426726920594, "step": 840}, {"loss": 0.8573, "grad_norm": 0.43279722332954407, "learning_rate": 0.0002, "epoch": 0.27437056165267915, "step": 850}, {"loss": 0.817, "grad_norm": 0.5991243720054626, "learning_rate": 0.0002, "epoch": 0.2775984506132989, "step": 860}, {"loss": 0.7981, "grad_norm": 0.4217848777770996, "learning_rate": 0.0002, "epoch": 0.28082633957391867, "step": 870}, {"loss": 0.8135, "grad_norm": 0.3933536410331726, "learning_rate": 0.0002, "epoch": 0.28405422853453843, "step": 880}, {"loss": 0.8846, "grad_norm": 0.5868505239486694, "learning_rate": 0.0002, "epoch": 0.28728211749515814, "step": 890}, {"loss": 0.8759, "grad_norm": 0.5209547877311707, "learning_rate": 0.0002, "epoch": 0.2905100064557779, "step": 900}, {"loss": 0.815, "grad_norm": 0.49307361245155334, "learning_rate": 0.0002, "epoch": 0.29373789541639767, "step": 910}, {"loss": 0.7813, "grad_norm": 0.4288382828235626, "learning_rate": 0.0002, "epoch": 0.2969657843770174, "step": 920}, {"loss": 0.8431, "grad_norm": 0.33568474650382996, "learning_rate": 0.0002, "epoch": 0.3001936733376372, "step": 930}, {"loss": 0.8455, "grad_norm": 1.0915930271148682, "learning_rate": 0.0002, "epoch": 0.30342156229825695, "step": 940}, {"loss": 0.8535, "grad_norm": 0.5489798188209534, "learning_rate": 0.0002, "epoch": 0.3066494512588767, "step": 950}, {"loss": 0.8031, "grad_norm": 0.42971742153167725, "learning_rate": 0.0002, "epoch": 0.3098773402194964, "step": 960}, {"loss": 0.8253, "grad_norm": 0.43375834822654724, "learning_rate": 0.0002, "epoch": 0.3131052291801162, "step": 970}, {"loss": 0.7747, "grad_norm": 0.47488611936569214, "learning_rate": 0.0002, "epoch": 0.31633311814073595, "step": 980}, {"loss": 0.7906, "grad_norm": 0.46296775341033936, "learning_rate": 0.0002, "epoch": 0.3195610071013557, "step": 990}, {"loss": 0.7948, "grad_norm": 0.4548890292644501, "learning_rate": 0.0002, "epoch": 0.32278889606197547, "step": 1000}, {"loss": 0.8856, "grad_norm": 0.41834497451782227, "learning_rate": 0.0002, "epoch": 0.32601678502259523, "step": 1010}, {"loss": 0.7791, "grad_norm": 0.441092312335968, "learning_rate": 0.0002, "epoch": 0.329244673983215, "step": 1020}, {"loss": 0.8191, "grad_norm": 0.637322187423706, "learning_rate": 0.0002, "epoch": 0.33247256294383476, "step": 1030}, {"loss": 0.8685, "grad_norm": 0.4374958574771881, "learning_rate": 0.0002, "epoch": 0.33570045190445447, "step": 1040}, {"loss": 0.8423, "grad_norm": 0.3935825824737549, "learning_rate": 0.0002, "epoch": 0.33892834086507423, "step": 1050}, {"loss": 0.8287, "grad_norm": 0.43526220321655273, "learning_rate": 0.0002, "epoch": 0.342156229825694, "step": 1060}, {"loss": 0.8413, "grad_norm": 0.45327696204185486, "learning_rate": 0.0002, "epoch": 0.34538411878631375, "step": 1070}, {"loss": 0.7421, "grad_norm": 0.4126075506210327, "learning_rate": 0.0002, "epoch": 0.3486120077469335, "step": 1080}, {"loss": 0.8427, "grad_norm": 0.4714072048664093, "learning_rate": 0.0002, "epoch": 0.3518398967075533, "step": 1090}, {"loss": 0.8028, "grad_norm": 0.518127977848053, "learning_rate": 0.0002, "epoch": 0.35506778566817304, "step": 1100}, {"loss": 0.8479, "grad_norm": 0.43264099955558777, "learning_rate": 0.0002, "epoch": 0.35829567462879275, "step": 1110}, {"loss": 0.8724, "grad_norm": 0.4857400357723236, "learning_rate": 0.0002, "epoch": 0.3615235635894125, "step": 1120}, {"loss": 0.7735, "grad_norm": 0.37591469287872314, "learning_rate": 0.0002, "epoch": 0.3647514525500323, "step": 1130}, {"loss": 0.8531, "grad_norm": 0.4165478050708771, "learning_rate": 0.0002, "epoch": 0.36797934151065204, "step": 1140}, {"loss": 0.8151, "grad_norm": 0.42911383509635925, "learning_rate": 0.0002, "epoch": 0.3712072304712718, "step": 1150}, {"loss": 0.8722, "grad_norm": 0.44980287551879883, "learning_rate": 0.0002, "epoch": 0.37443511943189156, "step": 1160}, {"loss": 0.7961, "grad_norm": 0.4066573679447174, "learning_rate": 0.0002, "epoch": 0.3776630083925113, "step": 1170}, {"loss": 0.8317, "grad_norm": 0.5056195855140686, "learning_rate": 0.0002, "epoch": 0.38089089735313103, "step": 1180}, {"loss": 0.8387, "grad_norm": 0.4141536355018616, "learning_rate": 0.0002, "epoch": 0.3841187863137508, "step": 1190}, {"loss": 0.8019, "grad_norm": 0.4501924514770508, "learning_rate": 0.0002, "epoch": 0.38734667527437056, "step": 1200}, {"loss": 0.8528, "grad_norm": 0.43304240703582764, "learning_rate": 0.0002, "epoch": 0.3905745642349903, "step": 1210}, {"loss": 0.8905, "grad_norm": 0.475777804851532, "learning_rate": 0.0002, "epoch": 0.3938024531956101, "step": 1220}, {"loss": 0.8643, "grad_norm": 0.5846465826034546, "learning_rate": 0.0002, "epoch": 0.39703034215622984, "step": 1230}, {"loss": 0.8078, "grad_norm": 0.42899325489997864, "learning_rate": 0.0002, "epoch": 0.4002582311168496, "step": 1240}, {"loss": 0.8415, "grad_norm": 0.3980463147163391, "learning_rate": 0.0002, "epoch": 0.4034861200774693, "step": 1250}, {"loss": 0.8026, "grad_norm": 0.45769768953323364, "learning_rate": 0.0002, "epoch": 0.4067140090380891, "step": 1260}, {"loss": 0.8377, "grad_norm": 0.5101280212402344, "learning_rate": 0.0002, "epoch": 0.40994189799870884, "step": 1270}, {"loss": 0.7905, "grad_norm": 0.47374317049980164, "learning_rate": 0.0002, "epoch": 0.4131697869593286, "step": 1280}, {"loss": 0.8172, "grad_norm": 0.4261878728866577, "learning_rate": 0.0002, "epoch": 0.41639767591994836, "step": 1290}, {"loss": 0.9004, "grad_norm": 0.46954256296157837, "learning_rate": 0.0002, "epoch": 0.4196255648805681, "step": 1300}, {"loss": 0.7868, "grad_norm": 0.5205738544464111, "learning_rate": 0.0002, "epoch": 0.4228534538411879, "step": 1310}, {"loss": 0.8964, "grad_norm": 0.5176340937614441, "learning_rate": 0.0002, "epoch": 0.4260813428018076, "step": 1320}, {"loss": 0.8764, "grad_norm": 0.5155916810035706, "learning_rate": 0.0002, "epoch": 0.42930923176242736, "step": 1330}, {"loss": 0.8197, "grad_norm": 0.44548553228378296, "learning_rate": 0.0002, "epoch": 0.4325371207230471, "step": 1340}, {"loss": 0.7873, "grad_norm": 0.5633558630943298, "learning_rate": 0.0002, "epoch": 0.4357650096836669, "step": 1350}, {"loss": 0.7889, "grad_norm": 0.42444056272506714, "learning_rate": 0.0002, "epoch": 0.43899289864428664, "step": 1360}, {"loss": 0.8588, "grad_norm": 0.5226860642433167, "learning_rate": 0.0002, "epoch": 0.4422207876049064, "step": 1370}, {"loss": 0.8232, "grad_norm": 0.5354582071304321, "learning_rate": 0.0002, "epoch": 0.44544867656552617, "step": 1380}, {"loss": 0.816, "grad_norm": 0.472646564245224, "learning_rate": 0.0002, "epoch": 0.4486765655261459, "step": 1390}, {"loss": 0.7953, "grad_norm": 0.6312310099601746, "learning_rate": 0.0002, "epoch": 0.45190445448676564, "step": 1400}, {"loss": 0.8212, "grad_norm": 0.4298408031463623, "learning_rate": 0.0002, "epoch": 0.4551323434473854, "step": 1410}, {"loss": 0.8447, "grad_norm": 0.43427202105522156, "learning_rate": 0.0002, "epoch": 0.45836023240800516, "step": 1420}, {"loss": 0.8342, "grad_norm": 0.44097861647605896, "learning_rate": 0.0002, "epoch": 0.4615881213686249, "step": 1430}, {"loss": 0.8301, "grad_norm": 0.5142693519592285, "learning_rate": 0.0002, "epoch": 0.4648160103292447, "step": 1440}, {"loss": 0.8144, "grad_norm": 0.46416547894477844, "learning_rate": 0.0002, "epoch": 0.46804389928986445, "step": 1450}, {"loss": 0.8342, "grad_norm": 0.4858551025390625, "learning_rate": 0.0002, "epoch": 0.47127178825048416, "step": 1460}, {"loss": 0.8354, "grad_norm": 0.4709177315235138, "learning_rate": 0.0002, "epoch": 0.4744996772111039, "step": 1470}, {"loss": 0.8391, "grad_norm": 0.5500252842903137, "learning_rate": 0.0002, "epoch": 0.4777275661717237, "step": 1480}, {"loss": 0.8359, "grad_norm": 0.43364381790161133, "learning_rate": 0.0002, "epoch": 0.48095545513234345, "step": 1490}, {"loss": 0.8446, "grad_norm": 0.47712287306785583, "learning_rate": 0.0002, "epoch": 0.4841833440929632, "step": 1500}, {"loss": 0.8518, "grad_norm": 0.4518495202064514, "learning_rate": 0.0002, "epoch": 0.48741123305358297, "step": 1510}, {"loss": 0.819, "grad_norm": 0.4539008140563965, "learning_rate": 0.0002, "epoch": 0.49063912201420273, "step": 1520}, {"loss": 0.8276, "grad_norm": 0.4993067979812622, "learning_rate": 0.0002, "epoch": 0.49386701097482244, "step": 1530}, {"loss": 0.8297, "grad_norm": 0.6094803214073181, "learning_rate": 0.0002, "epoch": 0.4970948999354422, "step": 1540}, {"loss": 0.8263, "grad_norm": 0.48602527379989624, "learning_rate": 0.0002, "epoch": 0.500322788896062, "step": 1550}, {"loss": 0.8182, "grad_norm": 0.40245795249938965, "learning_rate": 0.0002, "epoch": 0.5035506778566817, "step": 1560}, {"loss": 0.7907, "grad_norm": 0.456787645816803, "learning_rate": 0.0002, "epoch": 0.5067785668173015, "step": 1570}, {"loss": 0.86, "grad_norm": 0.43936216831207275, "learning_rate": 0.0002, "epoch": 0.5100064557779213, "step": 1580}, {"loss": 0.7928, "grad_norm": 0.549018144607544, "learning_rate": 0.0002, "epoch": 0.513234344738541, "step": 1590}, {"loss": 0.8169, "grad_norm": 0.41746795177459717, "learning_rate": 0.0002, "epoch": 0.5164622336991608, "step": 1600}, {"loss": 0.7868, "grad_norm": 0.4217053949832916, "learning_rate": 0.0002, "epoch": 0.5196901226597805, "step": 1610}, {"loss": 0.8161, "grad_norm": 0.449913889169693, "learning_rate": 0.0002, "epoch": 0.5229180116204003, "step": 1620}, {"loss": 0.7938, "grad_norm": 0.5084872245788574, "learning_rate": 0.0002, "epoch": 0.5261459005810201, "step": 1630}, {"loss": 0.8295, "grad_norm": 0.46248653531074524, "learning_rate": 0.0002, "epoch": 0.5293737895416397, "step": 1640}, {"loss": 0.7993, "grad_norm": 0.4824236035346985, "learning_rate": 0.0002, "epoch": 0.5326016785022595, "step": 1650}, {"loss": 0.8711, "grad_norm": 0.6010985374450684, "learning_rate": 0.0002, "epoch": 0.5358295674628792, "step": 1660}, {"loss": 0.8266, "grad_norm": 0.4757920801639557, "learning_rate": 0.0002, "epoch": 0.539057456423499, "step": 1670}, {"loss": 0.8182, "grad_norm": 0.45161882042884827, "learning_rate": 0.0002, "epoch": 0.5422853453841188, "step": 1680}, {"loss": 0.8141, "grad_norm": 0.49314990639686584, "learning_rate": 0.0002, "epoch": 0.5455132343447385, "step": 1690}, {"loss": 0.8091, "grad_norm": 0.3918305039405823, "learning_rate": 0.0002, "epoch": 0.5487411233053583, "step": 1700}, {"loss": 0.8177, "grad_norm": 0.5966728925704956, "learning_rate": 0.0002, "epoch": 0.551969012265978, "step": 1710}, {"loss": 0.8438, "grad_norm": 0.4208986163139343, "learning_rate": 0.0002, "epoch": 0.5551969012265978, "step": 1720}, {"loss": 0.817, "grad_norm": 0.43724218010902405, "learning_rate": 0.0002, "epoch": 0.5584247901872176, "step": 1730}, {"loss": 0.7956, "grad_norm": 0.5287272930145264, "learning_rate": 0.0002, "epoch": 0.5616526791478373, "step": 1740}, {"loss": 0.8557, "grad_norm": 0.4961899518966675, "learning_rate": 0.0002, "epoch": 0.5648805681084571, "step": 1750}, {"loss": 0.8029, "grad_norm": 0.4468635320663452, "learning_rate": 0.0002, "epoch": 0.5681084570690769, "step": 1760}, {"loss": 0.7968, "grad_norm": 0.6423530578613281, "learning_rate": 0.0002, "epoch": 0.5713363460296966, "step": 1770}, {"loss": 0.8324, "grad_norm": 0.4601971507072449, "learning_rate": 0.0002, "epoch": 0.5745642349903163, "step": 1780}, {"loss": 0.8171, "grad_norm": 0.46514901518821716, "learning_rate": 0.0002, "epoch": 0.577792123950936, "step": 1790}, {"loss": 0.8186, "grad_norm": 0.4771687388420105, "learning_rate": 0.0002, "epoch": 0.5810200129115558, "step": 1800}, {"loss": 0.856, "grad_norm": 0.46514490246772766, "learning_rate": 0.0002, "epoch": 0.5842479018721756, "step": 1810}, {"loss": 0.84, "grad_norm": 0.5373936295509338, "learning_rate": 0.0002, "epoch": 0.5874757908327953, "step": 1820}, {"loss": 0.8456, "grad_norm": 0.5175791382789612, "learning_rate": 0.0002, "epoch": 0.5907036797934151, "step": 1830}, {"loss": 0.7957, "grad_norm": 0.4522802233695984, "learning_rate": 0.0002, "epoch": 0.5939315687540349, "step": 1840}, {"loss": 0.8633, "grad_norm": 0.42987772822380066, "learning_rate": 0.0002, "epoch": 0.5971594577146546, "step": 1850}, {"loss": 0.7871, "grad_norm": 0.5566838383674622, "learning_rate": 0.0002, "epoch": 0.6003873466752744, "step": 1860}, {"loss": 0.8312, "grad_norm": 0.42807698249816895, "learning_rate": 0.0002, "epoch": 0.6036152356358941, "step": 1870}, {"loss": 0.8035, "grad_norm": 0.4957767724990845, "learning_rate": 0.0002, "epoch": 0.6068431245965139, "step": 1880}, {"loss": 0.8145, "grad_norm": 0.4260980188846588, "learning_rate": 0.0002, "epoch": 0.6100710135571337, "step": 1890}, {"loss": 0.8363, "grad_norm": 0.4777357876300812, "learning_rate": 0.0002, "epoch": 0.6132989025177534, "step": 1900}, {"loss": 0.8404, "grad_norm": 0.4434216022491455, "learning_rate": 0.0002, "epoch": 0.6165267914783732, "step": 1910}, {"loss": 0.8057, "grad_norm": 0.5215433835983276, "learning_rate": 0.0002, "epoch": 0.6197546804389928, "step": 1920}, {"loss": 0.82, "grad_norm": 0.5143248438835144, "learning_rate": 0.0002, "epoch": 0.6229825693996126, "step": 1930}, {"loss": 0.8107, "grad_norm": 0.5213413238525391, "learning_rate": 0.0002, "epoch": 0.6262104583602324, "step": 1940}, {"loss": 0.7549, "grad_norm": 0.5408226251602173, "learning_rate": 0.0002, "epoch": 0.6294383473208521, "step": 1950}, {"loss": 0.8405, "grad_norm": 0.5479708909988403, "learning_rate": 0.0002, "epoch": 0.6326662362814719, "step": 1960}, {"loss": 0.8138, "grad_norm": 0.4490949809551239, "learning_rate": 0.0002, "epoch": 0.6358941252420917, "step": 1970}, {"loss": 0.854, "grad_norm": 0.48815059661865234, "learning_rate": 0.0002, "epoch": 0.6391220142027114, "step": 1980}, {"loss": 0.8568, "grad_norm": 0.46498045325279236, "learning_rate": 0.0002, "epoch": 0.6423499031633312, "step": 1990}, {"loss": 0.8263, "grad_norm": 0.5136561393737793, "learning_rate": 0.0002, "epoch": 0.6455777921239509, "step": 2000}, {"loss": 0.8503, "grad_norm": 0.5145719647407532, "learning_rate": 0.0002, "epoch": 0.6488056810845707, "step": 2010}, {"loss": 0.8456, "grad_norm": 0.5430373549461365, "learning_rate": 0.0002, "epoch": 0.6520335700451905, "step": 2020}, {"loss": 0.8115, "grad_norm": 0.46347954869270325, "learning_rate": 0.0002, "epoch": 0.6552614590058102, "step": 2030}, {"loss": 0.8769, "grad_norm": 0.5189562439918518, "learning_rate": 0.0002, "epoch": 0.65848934796643, "step": 2040}, {"loss": 0.8453, "grad_norm": 0.43843990564346313, "learning_rate": 0.0002, "epoch": 0.6617172369270498, "step": 2050}, {"loss": 0.7951, "grad_norm": 0.4654983580112457, "learning_rate": 0.0002, "epoch": 0.6649451258876695, "step": 2060}, {"loss": 0.8308, "grad_norm": 0.44835716485977173, "learning_rate": 0.0002, "epoch": 0.6681730148482892, "step": 2070}, {"loss": 0.8181, "grad_norm": 0.38811734318733215, "learning_rate": 0.0002, "epoch": 0.6714009038089089, "step": 2080}, {"loss": 0.762, "grad_norm": 0.5709853172302246, "learning_rate": 0.0002, "epoch": 0.6746287927695287, "step": 2090}, {"loss": 0.8334, "grad_norm": 0.49994757771492004, "learning_rate": 0.0002, "epoch": 0.6778566817301485, "step": 2100}, {"loss": 0.8, "grad_norm": 0.5505402684211731, "learning_rate": 0.0002, "epoch": 0.6810845706907682, "step": 2110}, {"loss": 0.8227, "grad_norm": 0.48195120692253113, "learning_rate": 0.0002, "epoch": 0.684312459651388, "step": 2120}, {"loss": 0.7879, "grad_norm": 0.4854775071144104, "learning_rate": 0.0002, "epoch": 0.6875403486120077, "step": 2130}, {"loss": 0.8231, "grad_norm": 0.6422494649887085, "learning_rate": 0.0002, "epoch": 0.6907682375726275, "step": 2140}, {"loss": 0.8353, "grad_norm": 0.3972536027431488, "learning_rate": 0.0002, "epoch": 0.6939961265332473, "step": 2150}, {"loss": 0.8068, "grad_norm": 0.4297836422920227, "learning_rate": 0.0002, "epoch": 0.697224015493867, "step": 2160}, {"loss": 0.8017, "grad_norm": 0.45486778020858765, "learning_rate": 0.0002, "epoch": 0.7004519044544868, "step": 2170}, {"loss": 0.8507, "grad_norm": 0.4706047773361206, "learning_rate": 0.0002, "epoch": 0.7036797934151066, "step": 2180}, {"loss": 0.8234, "grad_norm": 0.46426892280578613, "learning_rate": 0.0002, "epoch": 0.7069076823757263, "step": 2190}, {"loss": 0.8472, "grad_norm": 0.46333715319633484, "learning_rate": 0.0002, "epoch": 0.7101355713363461, "step": 2200}, {"loss": 0.8247, "grad_norm": 0.4632524251937866, "learning_rate": 0.0002, "epoch": 0.7133634602969657, "step": 2210}, {"loss": 0.8452, "grad_norm": 0.4610830843448639, "learning_rate": 0.0002, "epoch": 0.7165913492575855, "step": 2220}, {"loss": 0.7338, "grad_norm": 0.4905324876308441, "learning_rate": 0.0002, "epoch": 0.7198192382182053, "step": 2230}, {"loss": 0.7715, "grad_norm": 0.4936263859272003, "learning_rate": 0.0002, "epoch": 0.723047127178825, "step": 2240}, {"loss": 0.8162, "grad_norm": 0.40778425335884094, "learning_rate": 0.0002, "epoch": 0.7262750161394448, "step": 2250}, {"loss": 0.828, "grad_norm": 0.50351482629776, "learning_rate": 0.0002, "epoch": 0.7295029051000645, "step": 2260}, {"loss": 0.8475, "grad_norm": 0.4894128143787384, "learning_rate": 0.0002, "epoch": 0.7327307940606843, "step": 2270}, {"loss": 0.8087, "grad_norm": 0.5580906271934509, "learning_rate": 0.0002, "epoch": 0.7359586830213041, "step": 2280}, {"loss": 0.8157, "grad_norm": 0.4655369520187378, "learning_rate": 0.0002, "epoch": 0.7391865719819238, "step": 2290}, {"loss": 0.8395, "grad_norm": 0.4666965901851654, "learning_rate": 0.0002, "epoch": 0.7424144609425436, "step": 2300}, {"loss": 0.7605, "grad_norm": 0.46259936690330505, "learning_rate": 0.0002, "epoch": 0.7456423499031634, "step": 2310}, {"loss": 0.7849, "grad_norm": 0.520706832408905, "learning_rate": 0.0002, "epoch": 0.7488702388637831, "step": 2320}, {"loss": 0.8173, "grad_norm": 0.5142408013343811, "learning_rate": 0.0002, "epoch": 0.7520981278244029, "step": 2330}, {"loss": 0.7782, "grad_norm": 0.5355164408683777, "learning_rate": 0.0002, "epoch": 0.7553260167850226, "step": 2340}, {"loss": 0.8242, "grad_norm": 0.5517185926437378, "learning_rate": 0.0002, "epoch": 0.7585539057456423, "step": 2350}, {"loss": 0.8404, "grad_norm": 0.7162677049636841, "learning_rate": 0.0002, "epoch": 0.7617817947062621, "step": 2360}, {"loss": 0.8455, "grad_norm": 0.42402133345603943, "learning_rate": 0.0002, "epoch": 0.7650096836668818, "step": 2370}, {"loss": 0.8214, "grad_norm": 0.47180113196372986, "learning_rate": 0.0002, "epoch": 0.7682375726275016, "step": 2380}, {"loss": 0.8274, "grad_norm": 0.6262288689613342, "learning_rate": 0.0002, "epoch": 0.7714654615881213, "step": 2390}, {"loss": 0.7915, "grad_norm": 0.5177528262138367, "learning_rate": 0.0002, "epoch": 0.7746933505487411, "step": 2400}, {"loss": 0.7631, "grad_norm": 0.555721640586853, "learning_rate": 0.0002, "epoch": 0.7779212395093609, "step": 2410}, {"loss": 0.795, "grad_norm": 0.5592644810676575, "learning_rate": 0.0002, "epoch": 0.7811491284699806, "step": 2420}, {"loss": 0.8081, "grad_norm": 0.38025397062301636, "learning_rate": 0.0002, "epoch": 0.7843770174306004, "step": 2430}, {"loss": 0.7851, "grad_norm": 0.4597472548484802, "learning_rate": 0.0002, "epoch": 0.7876049063912202, "step": 2440}, {"loss": 0.8575, "grad_norm": 0.4929825961589813, "learning_rate": 0.0002, "epoch": 0.7908327953518399, "step": 2450}, {"loss": 0.7584, "grad_norm": 0.45277655124664307, "learning_rate": 0.0002, "epoch": 0.7940606843124597, "step": 2460}, {"loss": 0.8208, "grad_norm": 0.6224122643470764, "learning_rate": 0.0002, "epoch": 0.7972885732730794, "step": 2470}, {"loss": 0.8449, "grad_norm": 0.5740901827812195, "learning_rate": 0.0002, "epoch": 0.8005164622336992, "step": 2480}, {"loss": 0.7834, "grad_norm": 0.41335329413414, "learning_rate": 0.0002, "epoch": 0.8037443511943189, "step": 2490}, {"loss": 0.7768, "grad_norm": 0.4738694131374359, "learning_rate": 0.0002, "epoch": 0.8069722401549386, "step": 2500}, {"loss": 0.7927, "grad_norm": 0.5288197994232178, "learning_rate": 0.0002, "epoch": 0.8102001291155584, "step": 2510}, {"loss": 0.8334, "grad_norm": 0.5404666066169739, "learning_rate": 0.0002, "epoch": 0.8134280180761781, "step": 2520}, {"loss": 0.7998, "grad_norm": 0.4444909691810608, "learning_rate": 0.0002, "epoch": 0.8166559070367979, "step": 2530}, {"loss": 0.8683, "grad_norm": 0.542061448097229, "learning_rate": 0.0002, "epoch": 0.8198837959974177, "step": 2540}, {"loss": 0.8038, "grad_norm": 0.4914741814136505, "learning_rate": 0.0002, "epoch": 0.8231116849580374, "step": 2550}, {"loss": 0.7899, "grad_norm": 0.41703441739082336, "learning_rate": 0.0002, "epoch": 0.8263395739186572, "step": 2560}, {"loss": 0.824, "grad_norm": 0.5489841103553772, "learning_rate": 0.0002, "epoch": 0.829567462879277, "step": 2570}, {"loss": 0.8157, "grad_norm": 0.5359883308410645, "learning_rate": 0.0002, "epoch": 0.8327953518398967, "step": 2580}, {"loss": 0.8122, "grad_norm": 0.5541019439697266, "learning_rate": 0.0002, "epoch": 0.8360232408005165, "step": 2590}, {"loss": 0.797, "grad_norm": 0.4746638834476471, "learning_rate": 0.0002, "epoch": 0.8392511297611362, "step": 2600}, {"loss": 0.8116, "grad_norm": 0.5243194103240967, "learning_rate": 0.0002, "epoch": 0.842479018721756, "step": 2610}, {"loss": 0.8173, "grad_norm": 0.46824976801872253, "learning_rate": 0.0002, "epoch": 0.8457069076823758, "step": 2620}, {"loss": 0.7525, "grad_norm": 0.49487847089767456, "learning_rate": 0.0002, "epoch": 0.8489347966429954, "step": 2630}, {"loss": 0.8296, "grad_norm": 0.42180097103118896, "learning_rate": 0.0002, "epoch": 0.8521626856036152, "step": 2640}, {"loss": 0.8304, "grad_norm": 0.5516560077667236, "learning_rate": 0.0002, "epoch": 0.855390574564235, "step": 2650}, {"loss": 0.7882, "grad_norm": 0.4392191767692566, "learning_rate": 0.0002, "epoch": 0.8586184635248547, "step": 2660}, {"loss": 0.848, "grad_norm": 0.5387210845947266, "learning_rate": 0.0002, "epoch": 0.8618463524854745, "step": 2670}, {"loss": 0.8094, "grad_norm": 0.6232406497001648, "learning_rate": 0.0002, "epoch": 0.8650742414460942, "step": 2680}, {"loss": 0.768, "grad_norm": 0.53749018907547, "learning_rate": 0.0002, "epoch": 0.868302130406714, "step": 2690}, {"loss": 0.8299, "grad_norm": 0.47480374574661255, "learning_rate": 0.0002, "epoch": 0.8715300193673338, "step": 2700}, {"loss": 0.8055, "grad_norm": 0.44618046283721924, "learning_rate": 0.0002, "epoch": 0.8747579083279535, "step": 2710}, {"loss": 0.8015, "grad_norm": 0.4173581302165985, "learning_rate": 0.0002, "epoch": 0.8779857972885733, "step": 2720}, {"loss": 0.7713, "grad_norm": 0.524081289768219, "learning_rate": 0.0002, "epoch": 0.881213686249193, "step": 2730}, {"loss": 0.8738, "grad_norm": 0.5608431100845337, "learning_rate": 0.0002, "epoch": 0.8844415752098128, "step": 2740}, {"loss": 0.8513, "grad_norm": 0.5212284922599792, "learning_rate": 0.0002, "epoch": 0.8876694641704326, "step": 2750}, {"loss": 0.8139, "grad_norm": 0.5601475834846497, "learning_rate": 0.0002, "epoch": 0.8908973531310523, "step": 2760}, {"loss": 0.7947, "grad_norm": 0.4499223828315735, "learning_rate": 0.0002, "epoch": 0.8941252420916721, "step": 2770}, {"loss": 0.8559, "grad_norm": 0.46945226192474365, "learning_rate": 0.0002, "epoch": 0.8973531310522918, "step": 2780}, {"loss": 0.801, "grad_norm": 0.4837495684623718, "learning_rate": 0.0002, "epoch": 0.9005810200129115, "step": 2790}, {"loss": 0.7887, "grad_norm": 0.5059258937835693, "learning_rate": 0.0002, "epoch": 0.9038089089735313, "step": 2800}, {"loss": 0.8571, "grad_norm": 0.4857945144176483, "learning_rate": 0.0002, "epoch": 0.907036797934151, "step": 2810}, {"loss": 0.8301, "grad_norm": 0.5001962780952454, "learning_rate": 0.0002, "epoch": 0.9102646868947708, "step": 2820}, {"loss": 0.8236, "grad_norm": 0.5468648672103882, "learning_rate": 0.0002, "epoch": 0.9134925758553906, "step": 2830}, {"loss": 0.8071, "grad_norm": 0.5533056259155273, "learning_rate": 0.0002, "epoch": 0.9167204648160103, "step": 2840}, {"loss": 0.7895, "grad_norm": 0.5909785628318787, "learning_rate": 0.0002, "epoch": 0.9199483537766301, "step": 2850}, {"loss": 0.796, "grad_norm": 0.47428104281425476, "learning_rate": 0.0002, "epoch": 0.9231762427372499, "step": 2860}, {"loss": 0.7845, "grad_norm": 0.548814058303833, "learning_rate": 0.0002, "epoch": 0.9264041316978696, "step": 2870}, {"loss": 0.7871, "grad_norm": 0.5576745271682739, "learning_rate": 0.0002, "epoch": 0.9296320206584894, "step": 2880}, {"loss": 0.8399, "grad_norm": 0.47094792127609253, "learning_rate": 0.0002, "epoch": 0.9328599096191091, "step": 2890}, {"loss": 0.805, "grad_norm": 0.5408539772033691, "learning_rate": 0.0002, "epoch": 0.9360877985797289, "step": 2900}, {"loss": 0.785, "grad_norm": 0.5922889113426208, "learning_rate": 0.0002, "epoch": 0.9393156875403487, "step": 2910}, {"loss": 0.8043, "grad_norm": 0.45462584495544434, "learning_rate": 0.0002, "epoch": 0.9425435765009683, "step": 2920}, {"loss": 0.8344, "grad_norm": 0.6864947080612183, "learning_rate": 0.0002, "epoch": 0.9457714654615881, "step": 2930}, {"loss": 0.8166, "grad_norm": 0.4706299304962158, "learning_rate": 0.0002, "epoch": 0.9489993544222078, "step": 2940}, {"loss": 0.8422, "grad_norm": 0.5583269596099854, "learning_rate": 0.0002, "epoch": 0.9522272433828276, "step": 2950}, {"loss": 0.836, "grad_norm": 0.51015704870224, "learning_rate": 0.0002, "epoch": 0.9554551323434474, "step": 2960}, {"loss": 0.8371, "grad_norm": 0.5325582027435303, "learning_rate": 0.0002, "epoch": 0.9586830213040671, "step": 2970}, {"loss": 0.7593, "grad_norm": 0.49008598923683167, "learning_rate": 0.0002, "epoch": 0.9619109102646869, "step": 2980}, {"loss": 0.8093, "grad_norm": 0.4422132074832916, "learning_rate": 0.0002, "epoch": 0.9651387992253067, "step": 2990}, {"loss": 0.7966, "grad_norm": 0.5053589344024658, "learning_rate": 0.0002, "epoch": 0.9683666881859264, "step": 3000}, {"loss": 0.8081, "grad_norm": 0.46754521131515503, "learning_rate": 0.0002, "epoch": 0.9715945771465462, "step": 3010}, {"loss": 0.8377, "grad_norm": 0.5613434910774231, "learning_rate": 0.0002, "epoch": 0.9748224661071659, "step": 3020}, {"loss": 0.7856, "grad_norm": 0.5052843689918518, "learning_rate": 0.0002, "epoch": 0.9780503550677857, "step": 3030}, {"loss": 0.8412, "grad_norm": 0.4270972013473511, "learning_rate": 0.0002, "epoch": 0.9812782440284055, "step": 3040}, {"loss": 0.8353, "grad_norm": 0.4974991977214813, "learning_rate": 0.0002, "epoch": 0.9845061329890252, "step": 3050}, {"loss": 0.8415, "grad_norm": 0.4432311952114105, "learning_rate": 0.0002, "epoch": 0.9877340219496449, "step": 3060}, {"loss": 0.7764, "grad_norm": 0.466457724571228, "learning_rate": 0.0002, "epoch": 0.9909619109102646, "step": 3070}, {"loss": 0.8067, "grad_norm": 0.6438009142875671, "learning_rate": 0.0002, "epoch": 0.9941897998708844, "step": 3080}, {"loss": 0.8425, "grad_norm": 0.5593604445457458, "learning_rate": 0.0002, "epoch": 0.9974176888315042, "step": 3090}, {"eval_loss": 1.0958120822906494, "eval_runtime": 148.3273, "eval_samples_per_second": 4.942, "eval_steps_per_second": 0.62, "epoch": 1.0, "step": 3098}, {"loss": 0.8275, "grad_norm": 0.5701445937156677, "learning_rate": 0.0002, "epoch": 1.000645577792124, "step": 3100}, {"loss": 0.7756, "grad_norm": 0.6089657545089722, "learning_rate": 0.0002, "epoch": 1.0038734667527438, "step": 3110}, {"loss": 0.7492, "grad_norm": 0.5619552135467529, "learning_rate": 0.0002, "epoch": 1.0071013557133635, "step": 3120}, {"loss": 0.7544, "grad_norm": 0.5550283789634705, "learning_rate": 0.0002, "epoch": 1.010329244673983, "step": 3130}, {"loss": 0.8006, "grad_norm": 0.6221792101860046, "learning_rate": 0.0002, "epoch": 1.013557133634603, "step": 3140}, {"loss": 0.7603, "grad_norm": 0.5450758934020996, "learning_rate": 0.0002, "epoch": 1.0167850225952226, "step": 3150}, {"loss": 0.7021, "grad_norm": 0.4359588027000427, "learning_rate": 0.0002, "epoch": 1.0200129115558425, "step": 3160}, {"loss": 0.7468, "grad_norm": 0.5932239890098572, "learning_rate": 0.0002, "epoch": 1.0232408005164622, "step": 3170}, {"loss": 0.7649, "grad_norm": 0.45478707551956177, "learning_rate": 0.0002, "epoch": 1.026468689477082, "step": 3180}, {"loss": 0.7355, "grad_norm": 0.677615761756897, "learning_rate": 0.0002, "epoch": 1.0296965784377017, "step": 3190}, {"loss": 0.6928, "grad_norm": 0.6231790781021118, "learning_rate": 0.0002, "epoch": 1.0329244673983216, "step": 3200}, {"loss": 0.7471, "grad_norm": 0.5074195861816406, "learning_rate": 0.0002, "epoch": 1.0361523563589412, "step": 3210}, {"loss": 0.6864, "grad_norm": 0.4844142198562622, "learning_rate": 0.0002, "epoch": 1.039380245319561, "step": 3220}, {"loss": 0.7655, "grad_norm": 0.5372750759124756, "learning_rate": 0.0002, "epoch": 1.0426081342801807, "step": 3230}, {"loss": 0.7384, "grad_norm": 0.46296265721321106, "learning_rate": 0.0002, "epoch": 1.0458360232408006, "step": 3240}, {"loss": 0.7894, "grad_norm": 0.5417148470878601, "learning_rate": 0.0002, "epoch": 1.0490639122014203, "step": 3250}, {"loss": 0.7637, "grad_norm": 0.5695074200630188, "learning_rate": 0.0002, "epoch": 1.0522918011620401, "step": 3260}, {"loss": 0.7456, "grad_norm": 0.5050092935562134, "learning_rate": 0.0002, "epoch": 1.0555196901226598, "step": 3270}, {"loss": 0.6805, "grad_norm": 0.5320752263069153, "learning_rate": 0.0002, "epoch": 1.0587475790832794, "step": 3280}, {"loss": 0.7419, "grad_norm": 0.5832052230834961, "learning_rate": 0.0002, "epoch": 1.0619754680438993, "step": 3290}, {"loss": 0.7656, "grad_norm": 0.5228804349899292, "learning_rate": 0.0002, "epoch": 1.065203357004519, "step": 3300}, {"loss": 0.6834, "grad_norm": 0.5819445252418518, "learning_rate": 0.0002, "epoch": 1.0684312459651388, "step": 3310}, {"loss": 0.7093, "grad_norm": 0.4201328754425049, "learning_rate": 0.0002, "epoch": 1.0716591349257585, "step": 3320}, {"loss": 0.7494, "grad_norm": 0.5424145460128784, "learning_rate": 0.0002, "epoch": 1.0748870238863784, "step": 3330}, {"loss": 0.7828, "grad_norm": 0.6169946789741516, "learning_rate": 0.0002, "epoch": 1.078114912846998, "step": 3340}, {"loss": 0.7505, "grad_norm": 0.607676088809967, "learning_rate": 0.0002, "epoch": 1.0813428018076179, "step": 3350}, {"loss": 0.7315, "grad_norm": 0.5191982388496399, "learning_rate": 0.0002, "epoch": 1.0845706907682375, "step": 3360}, {"loss": 0.7699, "grad_norm": 0.5728003978729248, "learning_rate": 0.0002, "epoch": 1.0877985797288574, "step": 3370}, {"loss": 0.7381, "grad_norm": 0.5402643084526062, "learning_rate": 0.0002, "epoch": 1.091026468689477, "step": 3380}, {"loss": 0.7208, "grad_norm": 0.5377541780471802, "learning_rate": 0.0002, "epoch": 1.094254357650097, "step": 3390}, {"loss": 0.7672, "grad_norm": 0.4751385748386383, "learning_rate": 0.0002, "epoch": 1.0974822466107166, "step": 3400}, {"loss": 0.7326, "grad_norm": 0.559158444404602, "learning_rate": 0.0002, "epoch": 1.1007101355713362, "step": 3410}, {"loss": 0.7366, "grad_norm": 0.4917701482772827, "learning_rate": 0.0002, "epoch": 1.103938024531956, "step": 3420}, {"loss": 0.7593, "grad_norm": 0.5507875084877014, "learning_rate": 0.0002, "epoch": 1.1071659134925758, "step": 3430}, {"loss": 0.7424, "grad_norm": 0.45458680391311646, "learning_rate": 0.0002, "epoch": 1.1103938024531956, "step": 3440}, {"loss": 0.7234, "grad_norm": 0.5721744894981384, "learning_rate": 0.0002, "epoch": 1.1136216914138153, "step": 3450}, {"loss": 0.7219, "grad_norm": 0.5776081681251526, "learning_rate": 0.0002, "epoch": 1.1168495803744352, "step": 3460}, {"loss": 0.7644, "grad_norm": 0.5261953473091125, "learning_rate": 0.0002, "epoch": 1.1200774693350548, "step": 3470}, {"loss": 0.6586, "grad_norm": 0.47759532928466797, "learning_rate": 0.0002, "epoch": 1.1233053582956747, "step": 3480}, {"loss": 0.7641, "grad_norm": 0.5697659850120544, "learning_rate": 0.0002, "epoch": 1.1265332472562943, "step": 3490}, {"loss": 0.7017, "grad_norm": 0.5643419623374939, "learning_rate": 0.0002, "epoch": 1.1297611362169142, "step": 3500}, {"loss": 0.7235, "grad_norm": 0.6502931118011475, "learning_rate": 0.0002, "epoch": 1.1329890251775339, "step": 3510}, {"loss": 0.7662, "grad_norm": 0.5236507654190063, "learning_rate": 0.0002, "epoch": 1.1362169141381537, "step": 3520}, {"loss": 0.7571, "grad_norm": 0.6521499156951904, "learning_rate": 0.0002, "epoch": 1.1394448030987734, "step": 3530}, {"loss": 0.7304, "grad_norm": 0.5893217325210571, "learning_rate": 0.0002, "epoch": 1.142672692059393, "step": 3540}, {"loss": 0.7508, "grad_norm": 0.5300073027610779, "learning_rate": 0.0002, "epoch": 1.145900581020013, "step": 3550}, {"loss": 0.6937, "grad_norm": 0.6794660091400146, "learning_rate": 0.0002, "epoch": 1.1491284699806328, "step": 3560}, {"loss": 0.7614, "grad_norm": 0.5420064926147461, "learning_rate": 0.0002, "epoch": 1.1523563589412524, "step": 3570}, {"loss": 0.7648, "grad_norm": 0.5096590518951416, "learning_rate": 0.0002, "epoch": 1.155584247901872, "step": 3580}, {"loss": 0.7436, "grad_norm": 0.5726043581962585, "learning_rate": 0.0002, "epoch": 1.158812136862492, "step": 3590}, {"loss": 0.7728, "grad_norm": 0.7388110160827637, "learning_rate": 0.0002, "epoch": 1.1620400258231116, "step": 3600}, {"loss": 0.7421, "grad_norm": 0.5597969889640808, "learning_rate": 0.0002, "epoch": 1.1652679147837315, "step": 3610}, {"loss": 0.7132, "grad_norm": 0.5067800283432007, "learning_rate": 0.0002, "epoch": 1.1684958037443511, "step": 3620}, {"loss": 0.7893, "grad_norm": 0.6625118255615234, "learning_rate": 0.0002, "epoch": 1.171723692704971, "step": 3630}, {"loss": 0.7611, "grad_norm": 0.5830849409103394, "learning_rate": 0.0002, "epoch": 1.1749515816655907, "step": 3640}, {"loss": 0.7973, "grad_norm": 0.6140692830085754, "learning_rate": 0.0002, "epoch": 1.1781794706262105, "step": 3650}, {"loss": 0.7617, "grad_norm": 0.714523434638977, "learning_rate": 0.0002, "epoch": 1.1814073595868302, "step": 3660}, {"loss": 0.7092, "grad_norm": 0.5196696519851685, "learning_rate": 0.0002, "epoch": 1.18463524854745, "step": 3670}, {"loss": 0.7821, "grad_norm": 0.6677889823913574, "learning_rate": 0.0002, "epoch": 1.1878631375080697, "step": 3680}, {"loss": 0.7813, "grad_norm": 0.47095245122909546, "learning_rate": 0.0002, "epoch": 1.1910910264686896, "step": 3690}, {"loss": 0.7702, "grad_norm": 0.5197778940200806, "learning_rate": 0.0002, "epoch": 1.1943189154293092, "step": 3700}, {"loss": 0.7349, "grad_norm": 0.5156530141830444, "learning_rate": 0.0002, "epoch": 1.1975468043899289, "step": 3710}, {"loss": 0.7738, "grad_norm": 0.6968549489974976, "learning_rate": 0.0002, "epoch": 1.2007746933505488, "step": 3720}, {"loss": 0.7599, "grad_norm": 0.48983848094940186, "learning_rate": 0.0002, "epoch": 1.2040025823111684, "step": 3730}, {"loss": 0.7163, "grad_norm": 0.6709973216056824, "learning_rate": 0.0002, "epoch": 1.2072304712717883, "step": 3740}, {"loss": 0.7632, "grad_norm": 0.48681750893592834, "learning_rate": 0.0002, "epoch": 1.210458360232408, "step": 3750}, {"loss": 0.7039, "grad_norm": 0.49475061893463135, "learning_rate": 0.0002, "epoch": 1.2136862491930278, "step": 3760}, {"loss": 0.7372, "grad_norm": 0.6163983345031738, "learning_rate": 0.0002, "epoch": 1.2169141381536475, "step": 3770}, {"loss": 0.757, "grad_norm": 0.5481411218643188, "learning_rate": 0.0002, "epoch": 1.2201420271142673, "step": 3780}, {"loss": 0.7601, "grad_norm": 0.620639979839325, "learning_rate": 0.0002, "epoch": 1.223369916074887, "step": 3790}, {"loss": 0.7738, "grad_norm": 0.7017222046852112, "learning_rate": 0.0002, "epoch": 1.2265978050355069, "step": 3800}, {"loss": 0.7468, "grad_norm": 0.5872400403022766, "learning_rate": 0.0002, "epoch": 1.2298256939961265, "step": 3810}, {"loss": 0.7854, "grad_norm": 0.45765596628189087, "learning_rate": 0.0002, "epoch": 1.2330535829567464, "step": 3820}, {"loss": 0.7865, "grad_norm": 0.5676377415657043, "learning_rate": 0.0002, "epoch": 1.236281471917366, "step": 3830}, {"loss": 0.7696, "grad_norm": 0.4793425500392914, "learning_rate": 0.0002, "epoch": 1.2395093608779857, "step": 3840}, {"loss": 0.7065, "grad_norm": 0.5060022473335266, "learning_rate": 0.0002, "epoch": 1.2427372498386056, "step": 3850}, {"loss": 0.7333, "grad_norm": 0.6140682697296143, "learning_rate": 0.0002, "epoch": 1.2459651387992252, "step": 3860}, {"loss": 0.7496, "grad_norm": 0.5030326843261719, "learning_rate": 0.0002, "epoch": 1.249193027759845, "step": 3870}, {"loss": 0.7226, "grad_norm": 0.6609430909156799, "learning_rate": 0.0002, "epoch": 1.2524209167204647, "step": 3880}, {"loss": 0.7212, "grad_norm": 0.5459545850753784, "learning_rate": 0.0002, "epoch": 1.2556488056810846, "step": 3890}, {"loss": 0.7145, "grad_norm": 0.5328870415687561, "learning_rate": 0.0002, "epoch": 1.2588766946417043, "step": 3900}, {"loss": 0.7572, "grad_norm": 0.5840652585029602, "learning_rate": 0.0002, "epoch": 1.2621045836023241, "step": 3910}, {"loss": 0.7624, "grad_norm": 0.5587584376335144, "learning_rate": 0.0002, "epoch": 1.2653324725629438, "step": 3920}, {"loss": 0.7846, "grad_norm": 0.5886949896812439, "learning_rate": 0.0002, "epoch": 1.2685603615235637, "step": 3930}, {"loss": 0.7251, "grad_norm": 0.5128693580627441, "learning_rate": 0.0002, "epoch": 1.2717882504841833, "step": 3940}, {"loss": 0.7032, "grad_norm": 0.6207669377326965, "learning_rate": 0.0002, "epoch": 1.2750161394448032, "step": 3950}, {"loss": 0.7506, "grad_norm": 0.5789574384689331, "learning_rate": 0.0002, "epoch": 1.2782440284054228, "step": 3960}, {"loss": 0.7574, "grad_norm": 0.503162145614624, "learning_rate": 0.0002, "epoch": 1.2814719173660425, "step": 3970}, {"loss": 0.7489, "grad_norm": 0.6670064926147461, "learning_rate": 0.0002, "epoch": 1.2846998063266624, "step": 3980}, {"loss": 0.7198, "grad_norm": 0.5676213502883911, "learning_rate": 0.0002, "epoch": 1.2879276952872822, "step": 3990}, {"loss": 0.7892, "grad_norm": 0.5383169054985046, "learning_rate": 0.0002, "epoch": 1.2911555842479019, "step": 4000}, {"loss": 0.7432, "grad_norm": 0.714743971824646, "learning_rate": 0.0002, "epoch": 1.2943834732085215, "step": 4010}, {"loss": 0.7594, "grad_norm": 0.5740262269973755, "learning_rate": 0.0002, "epoch": 1.2976113621691414, "step": 4020}, {"loss": 0.7564, "grad_norm": 0.6143045425415039, "learning_rate": 0.0002, "epoch": 1.300839251129761, "step": 4030}, {"loss": 0.7181, "grad_norm": 0.501025378704071, "learning_rate": 0.0002, "epoch": 1.304067140090381, "step": 4040}, {"loss": 0.7099, "grad_norm": 0.5784100294113159, "learning_rate": 0.0002, "epoch": 1.3072950290510006, "step": 4050}, {"loss": 0.7403, "grad_norm": 0.6182606220245361, "learning_rate": 0.0002, "epoch": 1.3105229180116205, "step": 4060}, {"loss": 0.7249, "grad_norm": 0.5072231292724609, "learning_rate": 0.0002, "epoch": 1.3137508069722401, "step": 4070}, {"loss": 0.7451, "grad_norm": 0.6841012835502625, "learning_rate": 0.0002, "epoch": 1.31697869593286, "step": 4080}, {"loss": 0.7395, "grad_norm": 0.697257936000824, "learning_rate": 0.0002, "epoch": 1.3202065848934796, "step": 4090}, {"loss": 0.7401, "grad_norm": 0.5113214254379272, "learning_rate": 0.0002, "epoch": 1.3234344738540993, "step": 4100}, {"loss": 0.7336, "grad_norm": 0.6270561814308167, "learning_rate": 0.0002, "epoch": 1.3266623628147192, "step": 4110}, {"loss": 0.7535, "grad_norm": 0.5525947213172913, "learning_rate": 0.0002, "epoch": 1.329890251775339, "step": 4120}, {"loss": 0.6999, "grad_norm": 0.546071469783783, "learning_rate": 0.0002, "epoch": 1.3331181407359587, "step": 4130}, {"loss": 0.7884, "grad_norm": 0.6516721248626709, "learning_rate": 0.0002, "epoch": 1.3363460296965783, "step": 4140}, {"loss": 0.755, "grad_norm": 0.6235111355781555, "learning_rate": 0.0002, "epoch": 1.3395739186571982, "step": 4150}, {"loss": 0.7467, "grad_norm": 0.538649320602417, "learning_rate": 0.0002, "epoch": 1.3428018076178179, "step": 4160}, {"loss": 0.7368, "grad_norm": 0.5367001891136169, "learning_rate": 0.0002, "epoch": 1.3460296965784377, "step": 4170}, {"loss": 0.7536, "grad_norm": 0.6134631037712097, "learning_rate": 0.0002, "epoch": 1.3492575855390574, "step": 4180}, {"loss": 0.8245, "grad_norm": 0.5827262997627258, "learning_rate": 0.0002, "epoch": 1.3524854744996773, "step": 4190}, {"loss": 0.7288, "grad_norm": 0.5706096291542053, "learning_rate": 0.0002, "epoch": 1.355713363460297, "step": 4200}, {"loss": 0.7302, "grad_norm": 0.6422057151794434, "learning_rate": 0.0002, "epoch": 1.3589412524209168, "step": 4210}, {"loss": 0.7303, "grad_norm": 0.6316141486167908, "learning_rate": 0.0002, "epoch": 1.3621691413815364, "step": 4220}, {"loss": 0.7457, "grad_norm": 0.6946983933448792, "learning_rate": 0.0002, "epoch": 1.365397030342156, "step": 4230}, {"loss": 0.7388, "grad_norm": 0.5381525754928589, "learning_rate": 0.0002, "epoch": 1.368624919302776, "step": 4240}, {"loss": 0.73, "grad_norm": 0.5484845638275146, "learning_rate": 0.0002, "epoch": 1.3718528082633958, "step": 4250}, {"loss": 0.7584, "grad_norm": 0.5961896777153015, "learning_rate": 0.0002, "epoch": 1.3750806972240155, "step": 4260}, {"loss": 0.8006, "grad_norm": 0.6041752696037292, "learning_rate": 0.0002, "epoch": 1.3783085861846351, "step": 4270}, {"loss": 0.7276, "grad_norm": 0.6283464431762695, "learning_rate": 0.0002, "epoch": 1.381536475145255, "step": 4280}, {"loss": 0.757, "grad_norm": 0.6761324405670166, "learning_rate": 0.0002, "epoch": 1.384764364105875, "step": 4290}, {"loss": 0.7381, "grad_norm": 0.504311203956604, "learning_rate": 0.0002, "epoch": 1.3879922530664945, "step": 4300}, {"loss": 0.7536, "grad_norm": 0.6100395917892456, "learning_rate": 0.0002, "epoch": 1.3912201420271142, "step": 4310}, {"loss": 0.7103, "grad_norm": 0.6245788335800171, "learning_rate": 0.0002, "epoch": 1.394448030987734, "step": 4320}, {"loss": 0.7505, "grad_norm": 0.6074621081352234, "learning_rate": 0.0002, "epoch": 1.3976759199483537, "step": 4330}, {"loss": 0.752, "grad_norm": 0.6683838963508606, "learning_rate": 0.0002, "epoch": 1.4009038089089736, "step": 4340}, {"loss": 0.7537, "grad_norm": 0.622998058795929, "learning_rate": 0.0002, "epoch": 1.4041316978695932, "step": 4350}, {"loss": 0.8148, "grad_norm": 0.6089423894882202, "learning_rate": 0.0002, "epoch": 1.4073595868302131, "step": 4360}, {"loss": 0.7715, "grad_norm": 0.6381658911705017, "learning_rate": 0.0002, "epoch": 1.4105874757908328, "step": 4370}, {"loss": 0.7871, "grad_norm": 0.5419308543205261, "learning_rate": 0.0002, "epoch": 1.4138153647514526, "step": 4380}, {"loss": 0.7386, "grad_norm": 0.6026232242584229, "learning_rate": 0.0002, "epoch": 1.4170432537120723, "step": 4390}, {"loss": 0.7529, "grad_norm": 0.4911101162433624, "learning_rate": 0.0002, "epoch": 1.420271142672692, "step": 4400}, {"loss": 0.7495, "grad_norm": 0.6302908062934875, "learning_rate": 0.0002, "epoch": 1.4234990316333118, "step": 4410}, {"loss": 0.7446, "grad_norm": 0.6692768931388855, "learning_rate": 0.0002, "epoch": 1.4267269205939317, "step": 4420}, {"loss": 0.7312, "grad_norm": 0.46294572949409485, "learning_rate": 0.0002, "epoch": 1.4299548095545513, "step": 4430}, {"loss": 0.7255, "grad_norm": 0.5452619194984436, "learning_rate": 0.0002, "epoch": 1.433182698515171, "step": 4440}, {"loss": 0.7974, "grad_norm": 0.7809233069419861, "learning_rate": 0.0002, "epoch": 1.4364105874757909, "step": 4450}, {"loss": 0.7103, "grad_norm": 0.550088107585907, "learning_rate": 0.0002, "epoch": 1.4396384764364105, "step": 4460}, {"loss": 0.7088, "grad_norm": 0.7139151096343994, "learning_rate": 0.0002, "epoch": 1.4428663653970304, "step": 4470}, {"loss": 0.7358, "grad_norm": 0.6187090873718262, "learning_rate": 0.0002, "epoch": 1.44609425435765, "step": 4480}, {"loss": 0.7608, "grad_norm": 0.5948249101638794, "learning_rate": 0.0002, "epoch": 1.44932214331827, "step": 4490}, {"loss": 0.7582, "grad_norm": 0.6510892510414124, "learning_rate": 0.0002, "epoch": 1.4525500322788896, "step": 4500}, {"loss": 0.7105, "grad_norm": 0.6552293300628662, "learning_rate": 0.0002, "epoch": 1.4557779212395094, "step": 4510}, {"loss": 0.7965, "grad_norm": 0.585574209690094, "learning_rate": 0.0002, "epoch": 1.459005810200129, "step": 4520}, {"loss": 0.761, "grad_norm": 0.4830162823200226, "learning_rate": 0.0002, "epoch": 1.4622336991607487, "step": 4530}, {"loss": 0.7424, "grad_norm": 0.5780223608016968, "learning_rate": 0.0002, "epoch": 1.4654615881213686, "step": 4540}, {"loss": 0.7518, "grad_norm": 0.5462607145309448, "learning_rate": 0.0002, "epoch": 1.4686894770819885, "step": 4550}, {"loss": 0.7342, "grad_norm": 0.5183546543121338, "learning_rate": 0.0002, "epoch": 1.4719173660426081, "step": 4560}, {"loss": 0.71, "grad_norm": 0.676917552947998, "learning_rate": 0.0002, "epoch": 1.4751452550032278, "step": 4570}, {"loss": 0.7875, "grad_norm": 0.5772345066070557, "learning_rate": 0.0002, "epoch": 1.4783731439638477, "step": 4580}, {"loss": 0.7709, "grad_norm": 0.7320035696029663, "learning_rate": 0.0002, "epoch": 1.4816010329244673, "step": 4590}, {"loss": 0.7601, "grad_norm": 0.5024042129516602, "learning_rate": 0.0002, "epoch": 1.4848289218850872, "step": 4600}, {"loss": 0.8061, "grad_norm": 0.5482868552207947, "learning_rate": 0.0002, "epoch": 1.4880568108457068, "step": 4610}, {"loss": 0.714, "grad_norm": 0.5447399616241455, "learning_rate": 0.0002, "epoch": 1.4912846998063267, "step": 4620}, {"loss": 0.7959, "grad_norm": 0.5953414440155029, "learning_rate": 0.0002, "epoch": 1.4945125887669464, "step": 4630}, {"loss": 0.7463, "grad_norm": 0.6983066201210022, "learning_rate": 0.0002, "epoch": 1.4977404777275662, "step": 4640}, {"loss": 0.7877, "grad_norm": 0.586327075958252, "learning_rate": 0.0002, "epoch": 1.500968366688186, "step": 4650}, {"loss": 0.7169, "grad_norm": 0.5839682221412659, "learning_rate": 0.0002, "epoch": 1.5041962556488055, "step": 4660}, {"loss": 0.7524, "grad_norm": 0.5959209203720093, "learning_rate": 0.0002, "epoch": 1.5074241446094254, "step": 4670}, {"loss": 0.7615, "grad_norm": 0.5073857307434082, "learning_rate": 0.0002, "epoch": 1.5106520335700453, "step": 4680}, {"loss": 0.7258, "grad_norm": 0.5183001160621643, "learning_rate": 0.0002, "epoch": 1.513879922530665, "step": 4690}, {"loss": 0.784, "grad_norm": 0.593530535697937, "learning_rate": 0.0002, "epoch": 1.5171078114912846, "step": 4700}, {"loss": 0.7722, "grad_norm": 0.675993025302887, "learning_rate": 0.0002, "epoch": 1.5203357004519045, "step": 4710}, {"loss": 0.7485, "grad_norm": 0.5823286771774292, "learning_rate": 0.0002, "epoch": 1.5235635894125243, "step": 4720}, {"loss": 0.7474, "grad_norm": 0.5825035572052002, "learning_rate": 0.0002, "epoch": 1.526791478373144, "step": 4730}, {"loss": 0.8287, "grad_norm": 0.5689691305160522, "learning_rate": 0.0002, "epoch": 1.5300193673337636, "step": 4740}, {"loss": 0.7279, "grad_norm": 0.6037150621414185, "learning_rate": 0.0002, "epoch": 1.5332472562943835, "step": 4750}, {"loss": 0.7865, "grad_norm": 0.6393677592277527, "learning_rate": 0.0002, "epoch": 1.5364751452550034, "step": 4760}, {"loss": 0.805, "grad_norm": 0.5926381945610046, "learning_rate": 0.0002, "epoch": 1.539703034215623, "step": 4770}, {"loss": 0.7425, "grad_norm": 0.9468599557876587, "learning_rate": 0.0002, "epoch": 1.5429309231762427, "step": 4780}, {"loss": 0.7565, "grad_norm": 0.7544237375259399, "learning_rate": 0.0002, "epoch": 1.5461588121368623, "step": 4790}, {"loss": 0.7398, "grad_norm": 0.5308566093444824, "learning_rate": 0.0002, "epoch": 1.5493867010974822, "step": 4800}, {"loss": 0.7756, "grad_norm": 0.6590296030044556, "learning_rate": 0.0002, "epoch": 1.552614590058102, "step": 4810}, {"loss": 0.7212, "grad_norm": 0.5630404353141785, "learning_rate": 0.0002, "epoch": 1.5558424790187217, "step": 4820}, {"loss": 0.7593, "grad_norm": 0.6800200939178467, "learning_rate": 0.0002, "epoch": 1.5590703679793414, "step": 4830}, {"loss": 0.7373, "grad_norm": 0.5463718175888062, "learning_rate": 0.0002, "epoch": 1.5622982569399613, "step": 4840}, {"loss": 0.7519, "grad_norm": 0.505135178565979, "learning_rate": 0.0002, "epoch": 1.5655261459005811, "step": 4850}, {"loss": 0.8122, "grad_norm": 0.5469676852226257, "learning_rate": 0.0002, "epoch": 1.5687540348612008, "step": 4860}, {"loss": 0.7185, "grad_norm": 0.5318337678909302, "learning_rate": 0.0002, "epoch": 1.5719819238218204, "step": 4870}, {"loss": 0.7324, "grad_norm": 0.7287914752960205, "learning_rate": 0.0002, "epoch": 1.5752098127824403, "step": 4880}, {"loss": 0.7532, "grad_norm": 0.7318989038467407, "learning_rate": 0.0002, "epoch": 1.5784377017430602, "step": 4890}, {"loss": 0.7851, "grad_norm": 0.6499921679496765, "learning_rate": 0.0002, "epoch": 1.5816655907036798, "step": 4900}, {"loss": 0.753, "grad_norm": 0.47907355427742004, "learning_rate": 0.0002, "epoch": 1.5848934796642995, "step": 4910}, {"loss": 0.7699, "grad_norm": 0.7338833808898926, "learning_rate": 0.0002, "epoch": 1.5881213686249191, "step": 4920}, {"loss": 0.7592, "grad_norm": 0.5800719261169434, "learning_rate": 0.0002, "epoch": 1.591349257585539, "step": 4930}, {"loss": 0.7211, "grad_norm": 0.5365763306617737, "learning_rate": 0.0002, "epoch": 1.594577146546159, "step": 4940}, {"loss": 0.777, "grad_norm": 0.5800772309303284, "learning_rate": 0.0002, "epoch": 1.5978050355067785, "step": 4950}, {"loss": 0.8027, "grad_norm": 0.7878010869026184, "learning_rate": 0.0002, "epoch": 1.6010329244673982, "step": 4960}, {"loss": 0.7894, "grad_norm": 0.5919058918952942, "learning_rate": 0.0002, "epoch": 1.604260813428018, "step": 4970}, {"loss": 0.7762, "grad_norm": 0.5004435181617737, "learning_rate": 0.0002, "epoch": 1.607488702388638, "step": 4980}, {"loss": 0.7447, "grad_norm": 0.6299242377281189, "learning_rate": 0.0002, "epoch": 1.6107165913492576, "step": 4990}, {"loss": 0.7149, "grad_norm": 0.6307242512702942, "learning_rate": 0.0002, "epoch": 1.6139444803098772, "step": 5000}, {"loss": 0.7693, "grad_norm": 0.7838703989982605, "learning_rate": 0.0002, "epoch": 1.6171723692704971, "step": 5010}, {"loss": 0.7364, "grad_norm": 0.6454671621322632, "learning_rate": 0.0002, "epoch": 1.620400258231117, "step": 5020}, {"loss": 0.74, "grad_norm": 0.5907095670700073, "learning_rate": 0.0002, "epoch": 1.6236281471917366, "step": 5030}, {"loss": 0.7331, "grad_norm": 0.6053501963615417, "learning_rate": 0.0002, "epoch": 1.6268560361523563, "step": 5040}, {"loss": 0.6987, "grad_norm": 0.5644670128822327, "learning_rate": 0.0002, "epoch": 1.630083925112976, "step": 5050}, {"loss": 0.7886, "grad_norm": 0.6320949792861938, "learning_rate": 0.0002, "epoch": 1.6333118140735958, "step": 5060}, {"loss": 0.7109, "grad_norm": 0.6101489067077637, "learning_rate": 0.0002, "epoch": 1.6365397030342157, "step": 5070}, {"loss": 0.6922, "grad_norm": 0.9435283541679382, "learning_rate": 0.0002, "epoch": 1.6397675919948353, "step": 5080}, {"loss": 0.729, "grad_norm": 0.6668919324874878, "learning_rate": 0.0002, "epoch": 1.642995480955455, "step": 5090}, {"loss": 0.7402, "grad_norm": 0.6160340905189514, "learning_rate": 0.0002, "epoch": 1.6462233699160749, "step": 5100}, {"loss": 0.7461, "grad_norm": 0.5999835729598999, "learning_rate": 0.0002, "epoch": 1.6494512588766947, "step": 5110}, {"loss": 0.7661, "grad_norm": 0.9378551840782166, "learning_rate": 0.0002, "epoch": 1.6526791478373144, "step": 5120}, {"loss": 0.7586, "grad_norm": 0.4795055389404297, "learning_rate": 0.0002, "epoch": 1.655907036797934, "step": 5130}, {"loss": 0.7342, "grad_norm": 0.4878861606121063, "learning_rate": 0.0002, "epoch": 1.659134925758554, "step": 5140}, {"loss": 0.7362, "grad_norm": 0.6042965054512024, "learning_rate": 0.0002, "epoch": 1.6623628147191738, "step": 5150}, {"loss": 0.7863, "grad_norm": 0.5829901695251465, "learning_rate": 0.0002, "epoch": 1.6655907036797934, "step": 5160}, {"loss": 0.7498, "grad_norm": 0.5168480277061462, "learning_rate": 0.0002, "epoch": 1.668818592640413, "step": 5170}, {"loss": 0.7333, "grad_norm": 0.6489511132240295, "learning_rate": 0.0002, "epoch": 1.672046481601033, "step": 5180}, {"loss": 0.7257, "grad_norm": 0.5955966114997864, "learning_rate": 0.0002, "epoch": 1.6752743705616526, "step": 5190}, {"loss": 0.7938, "grad_norm": 0.6228088140487671, "learning_rate": 0.0002, "epoch": 1.6785022595222725, "step": 5200}, {"loss": 0.7626, "grad_norm": 0.5726390480995178, "learning_rate": 0.0002, "epoch": 1.6817301484828922, "step": 5210}, {"loss": 0.7479, "grad_norm": 0.6116343140602112, "learning_rate": 0.0002, "epoch": 1.6849580374435118, "step": 5220}, {"loss": 0.7169, "grad_norm": 0.5483687520027161, "learning_rate": 0.0002, "epoch": 1.6881859264041317, "step": 5230}, {"loss": 0.7293, "grad_norm": 0.570941686630249, "learning_rate": 0.0002, "epoch": 1.6914138153647515, "step": 5240}, {"loss": 0.723, "grad_norm": 0.6048086285591125, "learning_rate": 0.0002, "epoch": 1.6946417043253712, "step": 5250}, {"loss": 0.7861, "grad_norm": 0.6769003868103027, "learning_rate": 0.0002, "epoch": 1.6978695932859909, "step": 5260}, {"loss": 0.7885, "grad_norm": 0.5629057884216309, "learning_rate": 0.0002, "epoch": 1.7010974822466107, "step": 5270}, {"loss": 0.7693, "grad_norm": 0.657341480255127, "learning_rate": 0.0002, "epoch": 1.7043253712072306, "step": 5280}, {"loss": 0.7357, "grad_norm": 0.6256147623062134, "learning_rate": 0.0002, "epoch": 1.7075532601678503, "step": 5290}, {"loss": 0.714, "grad_norm": 0.5498088002204895, "learning_rate": 0.0002, "epoch": 1.71078114912847, "step": 5300}, {"loss": 0.7669, "grad_norm": 0.5078358054161072, "learning_rate": 0.0002, "epoch": 1.7140090380890898, "step": 5310}, {"loss": 0.7872, "grad_norm": 0.6696692705154419, "learning_rate": 0.0002, "epoch": 1.7172369270497096, "step": 5320}, {"loss": 0.8205, "grad_norm": 0.6692847013473511, "learning_rate": 0.0002, "epoch": 1.7204648160103293, "step": 5330}, {"loss": 0.7432, "grad_norm": 0.5415751934051514, "learning_rate": 0.0002, "epoch": 1.723692704970949, "step": 5340}, {"loss": 0.7499, "grad_norm": 0.5367611050605774, "learning_rate": 0.0002, "epoch": 1.7269205939315686, "step": 5350}, {"loss": 0.7631, "grad_norm": 0.7321061491966248, "learning_rate": 0.0002, "epoch": 1.7301484828921885, "step": 5360}, {"loss": 0.7827, "grad_norm": 0.723972499370575, "learning_rate": 0.0002, "epoch": 1.7333763718528084, "step": 5370}, {"loss": 0.7077, "grad_norm": 0.7328100204467773, "learning_rate": 0.0002, "epoch": 1.736604260813428, "step": 5380}, {"loss": 0.7503, "grad_norm": 0.5785264372825623, "learning_rate": 0.0002, "epoch": 1.7398321497740477, "step": 5390}, {"loss": 0.7188, "grad_norm": 0.7812932133674622, "learning_rate": 0.0002, "epoch": 1.7430600387346675, "step": 5400}, {"loss": 0.7386, "grad_norm": 0.6493327617645264, "learning_rate": 0.0002, "epoch": 1.7462879276952874, "step": 5410}, {"loss": 0.7487, "grad_norm": 0.5825939774513245, "learning_rate": 0.0002, "epoch": 1.749515816655907, "step": 5420}, {"loss": 0.7625, "grad_norm": 0.6969610452651978, "learning_rate": 0.0002, "epoch": 1.7527437056165267, "step": 5430}, {"loss": 0.7512, "grad_norm": 0.5558062195777893, "learning_rate": 0.0002, "epoch": 1.7559715945771466, "step": 5440}, {"loss": 0.7256, "grad_norm": 0.49222221970558167, "learning_rate": 0.0002, "epoch": 1.7591994835377665, "step": 5450}, {"loss": 0.7477, "grad_norm": 0.5844656825065613, "learning_rate": 0.0002, "epoch": 1.762427372498386, "step": 5460}, {"loss": 0.7695, "grad_norm": 0.8706597685813904, "learning_rate": 0.0002, "epoch": 1.7656552614590058, "step": 5470}, {"loss": 0.7582, "grad_norm": 0.6167706251144409, "learning_rate": 0.0002, "epoch": 1.7688831504196254, "step": 5480}, {"loss": 0.7521, "grad_norm": 0.5890011787414551, "learning_rate": 0.0002, "epoch": 1.7721110393802453, "step": 5490}, {"loss": 0.8319, "grad_norm": 0.6551728248596191, "learning_rate": 0.0002, "epoch": 1.7753389283408652, "step": 5500}, {"loss": 0.7615, "grad_norm": 0.5848751068115234, "learning_rate": 0.0002, "epoch": 1.7785668173014848, "step": 5510}, {"loss": 0.7622, "grad_norm": 0.6664014458656311, "learning_rate": 0.0002, "epoch": 1.7817947062621045, "step": 5520}, {"loss": 0.7544, "grad_norm": 0.5931693911552429, "learning_rate": 0.0002, "epoch": 1.7850225952227243, "step": 5530}, {"loss": 0.7992, "grad_norm": 0.5534724593162537, "learning_rate": 0.0002, "epoch": 1.7882504841833442, "step": 5540}, {"loss": 0.7967, "grad_norm": 0.5590878129005432, "learning_rate": 0.0002, "epoch": 1.7914783731439639, "step": 5550}, {"loss": 0.7406, "grad_norm": 0.6947470903396606, "learning_rate": 0.0002, "epoch": 1.7947062621045835, "step": 5560}, {"loss": 0.7614, "grad_norm": 0.6104130148887634, "learning_rate": 0.0002, "epoch": 1.7979341510652034, "step": 5570}, {"loss": 0.8032, "grad_norm": 0.6135714054107666, "learning_rate": 0.0002, "epoch": 1.8011620400258233, "step": 5580}, {"loss": 0.7403, "grad_norm": 0.6626853346824646, "learning_rate": 0.0002, "epoch": 1.804389928986443, "step": 5590}, {"loss": 0.7746, "grad_norm": 0.6977612972259521, "learning_rate": 0.0002, "epoch": 1.8076178179470626, "step": 5600}, {"loss": 0.7899, "grad_norm": 0.6275238394737244, "learning_rate": 0.0002, "epoch": 1.8108457069076824, "step": 5610}, {"loss": 0.7392, "grad_norm": 0.5017505288124084, "learning_rate": 0.0002, "epoch": 1.814073595868302, "step": 5620}, {"loss": 0.7669, "grad_norm": 0.8314290642738342, "learning_rate": 0.0002, "epoch": 1.817301484828922, "step": 5630}, {"loss": 0.7031, "grad_norm": 0.6863582134246826, "learning_rate": 0.0002, "epoch": 1.8205293737895416, "step": 5640}, {"loss": 0.743, "grad_norm": 0.69544917345047, "learning_rate": 0.0002, "epoch": 1.8237572627501613, "step": 5650}, {"loss": 0.7277, "grad_norm": 0.515499472618103, "learning_rate": 0.0002, "epoch": 1.8269851517107811, "step": 5660}, {"loss": 0.7166, "grad_norm": 0.6100873947143555, "learning_rate": 0.0002, "epoch": 1.830213040671401, "step": 5670}, {"loss": 0.7217, "grad_norm": 0.67416912317276, "learning_rate": 0.0002, "epoch": 1.8334409296320207, "step": 5680}, {"loss": 0.7575, "grad_norm": 0.7057772278785706, "learning_rate": 0.0002, "epoch": 1.8366688185926403, "step": 5690}, {"loss": 0.7483, "grad_norm": 0.7374551892280579, "learning_rate": 0.0002, "epoch": 1.8398967075532602, "step": 5700}, {"loss": 0.81, "grad_norm": 0.6266297101974487, "learning_rate": 0.0002, "epoch": 1.84312459651388, "step": 5710}, {"loss": 0.728, "grad_norm": 0.5629227757453918, "learning_rate": 0.0002, "epoch": 1.8463524854744997, "step": 5720}, {"loss": 0.8043, "grad_norm": 0.6603655815124512, "learning_rate": 0.0002, "epoch": 1.8495803744351194, "step": 5730}, {"loss": 0.7587, "grad_norm": 0.8113715052604675, "learning_rate": 0.0002, "epoch": 1.8528082633957392, "step": 5740}, {"loss": 0.7486, "grad_norm": 0.7143914103507996, "learning_rate": 0.0002, "epoch": 1.856036152356359, "step": 5750}, {"loss": 0.7619, "grad_norm": 0.6273732781410217, "learning_rate": 0.0002, "epoch": 1.8592640413169788, "step": 5760}, {"loss": 0.7962, "grad_norm": 0.5428690910339355, "learning_rate": 0.0002, "epoch": 1.8624919302775984, "step": 5770}, {"loss": 0.7581, "grad_norm": 0.6405037641525269, "learning_rate": 0.0002, "epoch": 1.865719819238218, "step": 5780}, {"loss": 0.7569, "grad_norm": 0.700873613357544, "learning_rate": 0.0002, "epoch": 1.868947708198838, "step": 5790}, {"loss": 0.7353, "grad_norm": 0.5645238161087036, "learning_rate": 0.0002, "epoch": 1.8721755971594578, "step": 5800}, {"loss": 0.8037, "grad_norm": 0.8780353665351868, "learning_rate": 0.0002, "epoch": 1.8754034861200775, "step": 5810}, {"loss": 0.7686, "grad_norm": 0.6295409798622131, "learning_rate": 0.0002, "epoch": 1.878631375080697, "step": 5820}, {"loss": 0.8067, "grad_norm": 0.678269624710083, "learning_rate": 0.0002, "epoch": 1.881859264041317, "step": 5830}, {"loss": 0.7537, "grad_norm": 0.6464608907699585, "learning_rate": 0.0002, "epoch": 1.8850871530019369, "step": 5840}, {"loss": 0.7423, "grad_norm": 0.6201048493385315, "learning_rate": 0.0002, "epoch": 1.8883150419625565, "step": 5850}, {"loss": 0.7694, "grad_norm": 0.6046274304389954, "learning_rate": 0.0002, "epoch": 1.8915429309231762, "step": 5860}, {"loss": 0.781, "grad_norm": 0.7532408833503723, "learning_rate": 0.0002, "epoch": 1.894770819883796, "step": 5870}, {"loss": 0.6885, "grad_norm": 0.6066767573356628, "learning_rate": 0.0002, "epoch": 1.897998708844416, "step": 5880}, {"loss": 0.7631, "grad_norm": 0.6289830207824707, "learning_rate": 0.0002, "epoch": 1.9012265978050356, "step": 5890}, {"loss": 0.7501, "grad_norm": 0.5204319953918457, "learning_rate": 0.0002, "epoch": 1.9044544867656552, "step": 5900}, {"loss": 0.7335, "grad_norm": 0.6708219647407532, "learning_rate": 0.0002, "epoch": 1.9076823757262749, "step": 5910}, {"loss": 0.7455, "grad_norm": 0.4915677309036255, "learning_rate": 0.0002, "epoch": 1.9109102646868947, "step": 5920}, {"loss": 0.7464, "grad_norm": 0.652717113494873, "learning_rate": 0.0002, "epoch": 1.9141381536475146, "step": 5930}, {"loss": 0.7687, "grad_norm": 0.5446316003799438, "learning_rate": 0.0002, "epoch": 1.9173660426081343, "step": 5940}, {"loss": 0.7424, "grad_norm": 0.4958149194717407, "learning_rate": 0.0002, "epoch": 1.920593931568754, "step": 5950}, {"loss": 0.757, "grad_norm": 0.5623434782028198, "learning_rate": 0.0002, "epoch": 1.9238218205293738, "step": 5960}, {"loss": 0.7446, "grad_norm": 0.6855450868606567, "learning_rate": 0.0002, "epoch": 1.9270497094899937, "step": 5970}, {"loss": 0.827, "grad_norm": 0.5710492730140686, "learning_rate": 0.0002, "epoch": 1.9302775984506133, "step": 5980}, {"loss": 0.7245, "grad_norm": 0.5379431843757629, "learning_rate": 0.0002, "epoch": 1.933505487411233, "step": 5990}, {"loss": 0.77, "grad_norm": 0.557129442691803, "learning_rate": 0.0002, "epoch": 1.9367333763718528, "step": 6000}, {"loss": 0.6988, "grad_norm": 0.6336663961410522, "learning_rate": 0.0002, "epoch": 1.9399612653324727, "step": 6010}, {"loss": 0.7316, "grad_norm": 0.5950582027435303, "learning_rate": 0.0002, "epoch": 1.9431891542930924, "step": 6020}, {"loss": 0.7443, "grad_norm": 0.5905954837799072, "learning_rate": 0.0002, "epoch": 1.946417043253712, "step": 6030}, {"loss": 0.7127, "grad_norm": 0.6688982844352722, "learning_rate": 0.0002, "epoch": 1.9496449322143317, "step": 6040}, {"loss": 0.79, "grad_norm": 0.5440775752067566, "learning_rate": 0.0002, "epoch": 1.9528728211749515, "step": 6050}, {"loss": 0.7221, "grad_norm": 0.6207906603813171, "learning_rate": 0.0002, "epoch": 1.9561007101355714, "step": 6060}, {"loss": 0.738, "grad_norm": 0.6999374628067017, "learning_rate": 0.0002, "epoch": 1.959328599096191, "step": 6070}, {"loss": 0.7372, "grad_norm": 0.6310848593711853, "learning_rate": 0.0002, "epoch": 1.9625564880568107, "step": 6080}, {"loss": 0.7198, "grad_norm": 0.5903388261795044, "learning_rate": 0.0002, "epoch": 1.9657843770174306, "step": 6090}, {"loss": 0.7103, "grad_norm": 0.6333889961242676, "learning_rate": 0.0002, "epoch": 1.9690122659780505, "step": 6100}, {"loss": 0.7246, "grad_norm": 0.5604711174964905, "learning_rate": 0.0002, "epoch": 1.97224015493867, "step": 6110}, {"loss": 0.761, "grad_norm": 0.9234541654586792, "learning_rate": 0.0002, "epoch": 1.9754680438992898, "step": 6120}, {"loss": 0.7375, "grad_norm": 0.6149102449417114, "learning_rate": 0.0002, "epoch": 1.9786959328599096, "step": 6130}, {"loss": 0.7286, "grad_norm": 0.615446150302887, "learning_rate": 0.0002, "epoch": 1.9819238218205295, "step": 6140}, {"loss": 0.7333, "grad_norm": 0.5176635980606079, "learning_rate": 0.0002, "epoch": 1.9851517107811492, "step": 6150}, {"loss": 0.718, "grad_norm": 0.7124109864234924, "learning_rate": 0.0002, "epoch": 1.9883795997417688, "step": 6160}, {"loss": 0.7669, "grad_norm": 0.6317567825317383, "learning_rate": 0.0002, "epoch": 1.9916074887023887, "step": 6170}, {"loss": 0.8012, "grad_norm": 0.6855016350746155, "learning_rate": 0.0002, "epoch": 1.9948353776630086, "step": 6180}, {"loss": 0.7376, "grad_norm": 0.6423715353012085, "learning_rate": 0.0002, "epoch": 1.9980632666236282, "step": 6190}, {"eval_loss": 1.1096643209457397, "eval_runtime": 147.7997, "eval_samples_per_second": 4.959, "eval_steps_per_second": 0.622, "epoch": 2.0, "step": 6196}, {"loss": 0.7131, "grad_norm": 0.5322932600975037, "learning_rate": 0.0002, "epoch": 2.001291155584248, "step": 6200}, {"loss": 0.6619, "grad_norm": 0.8152306079864502, "learning_rate": 0.0002, "epoch": 2.0045190445448675, "step": 6210}, {"loss": 0.6731, "grad_norm": 0.6215983033180237, "learning_rate": 0.0002, "epoch": 2.0077469335054876, "step": 6220}, {"loss": 0.658, "grad_norm": 0.845498263835907, "learning_rate": 0.0002, "epoch": 2.0109748224661073, "step": 6230}, {"loss": 0.6954, "grad_norm": 0.733559787273407, "learning_rate": 0.0002, "epoch": 2.014202711426727, "step": 6240}, {"loss": 0.6707, "grad_norm": 0.51433926820755, "learning_rate": 0.0002, "epoch": 2.0174306003873466, "step": 6250}, {"loss": 0.6304, "grad_norm": 0.6374049782752991, "learning_rate": 0.0002, "epoch": 2.020658489347966, "step": 6260}, {"loss": 0.6831, "grad_norm": 0.7833638191223145, "learning_rate": 0.0002, "epoch": 2.0238863783085863, "step": 6270}, {"loss": 0.6672, "grad_norm": 0.8929463028907776, "learning_rate": 0.0002, "epoch": 2.027114267269206, "step": 6280}, {"loss": 0.637, "grad_norm": 0.669731855392456, "learning_rate": 0.0002, "epoch": 2.0303421562298256, "step": 6290}, {"loss": 0.646, "grad_norm": 0.5846071243286133, "learning_rate": 0.0002, "epoch": 2.0335700451904453, "step": 6300}, {"loss": 0.6647, "grad_norm": 0.7087787985801697, "learning_rate": 0.0002, "epoch": 2.0367979341510654, "step": 6310}, {"loss": 0.6433, "grad_norm": 0.6739160418510437, "learning_rate": 0.0002, "epoch": 2.040025823111685, "step": 6320}, {"loss": 0.6301, "grad_norm": 0.4860886335372925, "learning_rate": 0.0002, "epoch": 2.0432537120723047, "step": 6330}, {"loss": 0.6439, "grad_norm": 0.7201244831085205, "learning_rate": 0.0002, "epoch": 2.0464816010329243, "step": 6340}, {"loss": 0.6676, "grad_norm": 0.7409170269966125, "learning_rate": 0.0002, "epoch": 2.0497094899935444, "step": 6350}, {"loss": 0.6153, "grad_norm": 0.6843920350074768, "learning_rate": 0.0002, "epoch": 2.052937378954164, "step": 6360}, {"loss": 0.6674, "grad_norm": 0.7519999742507935, "learning_rate": 0.0002, "epoch": 2.0561652679147837, "step": 6370}, {"loss": 0.6928, "grad_norm": 0.5732819437980652, "learning_rate": 0.0002, "epoch": 2.0593931568754034, "step": 6380}, {"loss": 0.6496, "grad_norm": 0.7565118074417114, "learning_rate": 0.0002, "epoch": 2.062621045836023, "step": 6390}, {"loss": 0.6354, "grad_norm": 0.8147150278091431, "learning_rate": 0.0002, "epoch": 2.065848934796643, "step": 6400}, {"loss": 0.6593, "grad_norm": 0.6941924691200256, "learning_rate": 0.0002, "epoch": 2.0690768237572628, "step": 6410}, {"loss": 0.6698, "grad_norm": 0.6549784541130066, "learning_rate": 0.0002, "epoch": 2.0723047127178824, "step": 6420}, {"loss": 0.6927, "grad_norm": 0.7224905490875244, "learning_rate": 0.0002, "epoch": 2.075532601678502, "step": 6430}, {"loss": 0.6755, "grad_norm": 0.7754863500595093, "learning_rate": 0.0002, "epoch": 2.078760490639122, "step": 6440}, {"loss": 0.6738, "grad_norm": 0.691318154335022, "learning_rate": 0.0002, "epoch": 2.081988379599742, "step": 6450}, {"loss": 0.6233, "grad_norm": 0.6009294986724854, "learning_rate": 0.0002, "epoch": 2.0852162685603615, "step": 6460}, {"loss": 0.6691, "grad_norm": 0.6753945350646973, "learning_rate": 0.0002, "epoch": 2.088444157520981, "step": 6470}, {"loss": 0.6935, "grad_norm": 0.6899921298027039, "learning_rate": 0.0002, "epoch": 2.091672046481601, "step": 6480}, {"loss": 0.6918, "grad_norm": 0.846510648727417, "learning_rate": 0.0002, "epoch": 2.094899935442221, "step": 6490}, {"loss": 0.6084, "grad_norm": 0.6432605981826782, "learning_rate": 0.0002, "epoch": 2.0981278244028405, "step": 6500}, {"loss": 0.6867, "grad_norm": 0.8125239014625549, "learning_rate": 0.0002, "epoch": 2.10135571336346, "step": 6510}, {"loss": 0.6939, "grad_norm": 0.628302812576294, "learning_rate": 0.0002, "epoch": 2.1045836023240803, "step": 6520}, {"loss": 0.5909, "grad_norm": 0.7164334654808044, "learning_rate": 0.0002, "epoch": 2.1078114912847, "step": 6530}, {"loss": 0.6578, "grad_norm": 0.7476949095726013, "learning_rate": 0.0002, "epoch": 2.1110393802453196, "step": 6540}, {"loss": 0.6351, "grad_norm": 0.7577515840530396, "learning_rate": 0.0002, "epoch": 2.114267269205939, "step": 6550}, {"loss": 0.6669, "grad_norm": 0.5684467554092407, "learning_rate": 0.0002, "epoch": 2.117495158166559, "step": 6560}, {"loss": 0.6343, "grad_norm": 0.6121789216995239, "learning_rate": 0.0002, "epoch": 2.120723047127179, "step": 6570}, {"loss": 0.6314, "grad_norm": 0.6095348596572876, "learning_rate": 0.0002, "epoch": 2.1239509360877986, "step": 6580}, {"loss": 0.6276, "grad_norm": 0.7803651690483093, "learning_rate": 0.0002, "epoch": 2.1271788250484183, "step": 6590}, {"loss": 0.6579, "grad_norm": 0.5990583300590515, "learning_rate": 0.0002, "epoch": 2.130406714009038, "step": 6600}, {"loss": 0.6228, "grad_norm": 0.6569220423698425, "learning_rate": 0.0002, "epoch": 2.133634602969658, "step": 6610}, {"loss": 0.7049, "grad_norm": 0.5961166620254517, "learning_rate": 0.0002, "epoch": 2.1368624919302777, "step": 6620}, {"loss": 0.6359, "grad_norm": 0.5860554575920105, "learning_rate": 0.0002, "epoch": 2.1400903808908973, "step": 6630}, {"loss": 0.6651, "grad_norm": 0.5994001626968384, "learning_rate": 0.0002, "epoch": 2.143318269851517, "step": 6640}, {"loss": 0.6421, "grad_norm": 0.7723015546798706, "learning_rate": 0.0002, "epoch": 2.146546158812137, "step": 6650}, {"loss": 0.6723, "grad_norm": 0.676355242729187, "learning_rate": 0.0002, "epoch": 2.1497740477727567, "step": 6660}, {"loss": 0.6826, "grad_norm": 0.5689092874526978, "learning_rate": 0.0002, "epoch": 2.1530019367333764, "step": 6670}, {"loss": 0.6613, "grad_norm": 0.6933727264404297, "learning_rate": 0.0002, "epoch": 2.156229825693996, "step": 6680}, {"loss": 0.6957, "grad_norm": 0.8380527496337891, "learning_rate": 0.0002, "epoch": 2.159457714654616, "step": 6690}, {"loss": 0.6705, "grad_norm": 0.6876497268676758, "learning_rate": 0.0002, "epoch": 2.1626856036152358, "step": 6700}, {"loss": 0.6112, "grad_norm": 0.6418334245681763, "learning_rate": 0.0002, "epoch": 2.1659134925758554, "step": 6710}, {"loss": 0.6357, "grad_norm": 0.7169192433357239, "learning_rate": 0.0002, "epoch": 2.169141381536475, "step": 6720}, {"loss": 0.6492, "grad_norm": 0.6664170622825623, "learning_rate": 0.0002, "epoch": 2.1723692704970947, "step": 6730}, {"loss": 0.6751, "grad_norm": 0.6011993288993835, "learning_rate": 0.0002, "epoch": 2.175597159457715, "step": 6740}, {"loss": 0.696, "grad_norm": 0.5529947280883789, "learning_rate": 0.0002, "epoch": 2.1788250484183345, "step": 6750}, {"loss": 0.671, "grad_norm": 0.6879532933235168, "learning_rate": 0.0002, "epoch": 2.182052937378954, "step": 6760}, {"loss": 0.6634, "grad_norm": 0.6426113843917847, "learning_rate": 0.0002, "epoch": 2.1852808263395738, "step": 6770}, {"loss": 0.6592, "grad_norm": 0.6571047306060791, "learning_rate": 0.0002, "epoch": 2.188508715300194, "step": 6780}, {"loss": 0.6494, "grad_norm": 0.6400564908981323, "learning_rate": 0.0002, "epoch": 2.1917366042608135, "step": 6790}, {"loss": 0.6369, "grad_norm": 0.6509664058685303, "learning_rate": 0.0002, "epoch": 2.194964493221433, "step": 6800}, {"loss": 0.6771, "grad_norm": 0.6673197150230408, "learning_rate": 0.0002, "epoch": 2.198192382182053, "step": 6810}, {"loss": 0.6491, "grad_norm": 0.48205727338790894, "learning_rate": 0.0002, "epoch": 2.2014202711426725, "step": 6820}, {"loss": 0.6894, "grad_norm": 0.849525511264801, "learning_rate": 0.0002, "epoch": 2.2046481601032926, "step": 6830}, {"loss": 0.6977, "grad_norm": 0.6150892376899719, "learning_rate": 0.0002, "epoch": 2.207876049063912, "step": 6840}, {"loss": 0.6843, "grad_norm": 0.7826945781707764, "learning_rate": 0.0002, "epoch": 2.211103938024532, "step": 6850}, {"loss": 0.6338, "grad_norm": 0.5711963772773743, "learning_rate": 0.0002, "epoch": 2.2143318269851515, "step": 6860}, {"loss": 0.6585, "grad_norm": 0.6017758846282959, "learning_rate": 0.0002, "epoch": 2.2175597159457716, "step": 6870}, {"loss": 0.6657, "grad_norm": 0.785434901714325, "learning_rate": 0.0002, "epoch": 2.2207876049063913, "step": 6880}, {"loss": 0.7075, "grad_norm": 0.6251688599586487, "learning_rate": 0.0002, "epoch": 2.224015493867011, "step": 6890}, {"loss": 0.6564, "grad_norm": 0.8242034316062927, "learning_rate": 0.0002, "epoch": 2.2272433828276306, "step": 6900}, {"loss": 0.672, "grad_norm": 0.7272933125495911, "learning_rate": 0.0002, "epoch": 2.2304712717882507, "step": 6910}, {"loss": 0.6541, "grad_norm": 0.7159379720687866, "learning_rate": 0.0002, "epoch": 2.2336991607488703, "step": 6920}, {"loss": 0.6859, "grad_norm": 0.6518042087554932, "learning_rate": 0.0002, "epoch": 2.23692704970949, "step": 6930}, {"loss": 0.5987, "grad_norm": 0.7365370392799377, "learning_rate": 0.0002, "epoch": 2.2401549386701096, "step": 6940}, {"loss": 0.6511, "grad_norm": 0.5674061179161072, "learning_rate": 0.0002, "epoch": 2.2433828276307297, "step": 6950}, {"loss": 0.6748, "grad_norm": 0.669185996055603, "learning_rate": 0.0002, "epoch": 2.2466107165913494, "step": 6960}, {"loss": 0.656, "grad_norm": 0.6638304591178894, "learning_rate": 0.0002, "epoch": 2.249838605551969, "step": 6970}, {"loss": 0.636, "grad_norm": 0.757006824016571, "learning_rate": 0.0002, "epoch": 2.2530664945125887, "step": 6980}, {"loss": 0.6597, "grad_norm": 0.7574930787086487, "learning_rate": 0.0002, "epoch": 2.2562943834732083, "step": 6990}, {"loss": 0.6859, "grad_norm": 0.7819514870643616, "learning_rate": 0.0002, "epoch": 2.2595222724338284, "step": 7000}, {"loss": 0.6238, "grad_norm": 0.6987583041191101, "learning_rate": 0.0002, "epoch": 2.262750161394448, "step": 7010}, {"loss": 0.661, "grad_norm": 0.6628551483154297, "learning_rate": 0.0002, "epoch": 2.2659780503550677, "step": 7020}, {"loss": 0.6254, "grad_norm": 0.7855866551399231, "learning_rate": 0.0002, "epoch": 2.2692059393156874, "step": 7030}, {"loss": 0.6679, "grad_norm": 0.6102892756462097, "learning_rate": 0.0002, "epoch": 2.2724338282763075, "step": 7040}, {"loss": 0.694, "grad_norm": 0.7844198942184448, "learning_rate": 0.0002, "epoch": 2.275661717236927, "step": 7050}, {"loss": 0.63, "grad_norm": 0.6209492087364197, "learning_rate": 0.0002, "epoch": 2.2788896061975468, "step": 7060}, {"loss": 0.6418, "grad_norm": 0.8351290225982666, "learning_rate": 0.0002, "epoch": 2.2821174951581664, "step": 7070}, {"loss": 0.6648, "grad_norm": 0.6883546710014343, "learning_rate": 0.0002, "epoch": 2.285345384118786, "step": 7080}, {"loss": 0.7046, "grad_norm": 0.6626381874084473, "learning_rate": 0.0002, "epoch": 2.288573273079406, "step": 7090}, {"loss": 0.6535, "grad_norm": 0.7216270565986633, "learning_rate": 0.0002, "epoch": 2.291801162040026, "step": 7100}, {"loss": 0.6414, "grad_norm": 0.8246777057647705, "learning_rate": 0.0002, "epoch": 2.2950290510006455, "step": 7110}, {"loss": 0.6315, "grad_norm": 0.614326000213623, "learning_rate": 0.0002, "epoch": 2.2982569399612656, "step": 7120}, {"loss": 0.6303, "grad_norm": 0.8785578012466431, "learning_rate": 0.0002, "epoch": 2.301484828921885, "step": 7130}, {"loss": 0.6348, "grad_norm": 0.7021808624267578, "learning_rate": 0.0002, "epoch": 2.304712717882505, "step": 7140}, {"loss": 0.6738, "grad_norm": 0.6999403238296509, "learning_rate": 0.0002, "epoch": 2.3079406068431245, "step": 7150}, {"loss": 0.6547, "grad_norm": 0.8013143539428711, "learning_rate": 0.0002, "epoch": 2.311168495803744, "step": 7160}, {"loss": 0.6461, "grad_norm": 0.6592583060264587, "learning_rate": 0.0002, "epoch": 2.3143963847643643, "step": 7170}, {"loss": 0.6369, "grad_norm": 0.6260249018669128, "learning_rate": 0.0002, "epoch": 2.317624273724984, "step": 7180}, {"loss": 0.6647, "grad_norm": 0.9352797269821167, "learning_rate": 0.0002, "epoch": 2.3208521626856036, "step": 7190}, {"loss": 0.6543, "grad_norm": 0.6629612445831299, "learning_rate": 0.0002, "epoch": 2.324080051646223, "step": 7200}, {"loss": 0.6811, "grad_norm": 0.7062810063362122, "learning_rate": 0.0002, "epoch": 2.3273079406068433, "step": 7210}, {"loss": 0.67, "grad_norm": 0.7236241102218628, "learning_rate": 0.0002, "epoch": 2.330535829567463, "step": 7220}, {"loss": 0.6462, "grad_norm": 0.7528148293495178, "learning_rate": 0.0002, "epoch": 2.3337637185280826, "step": 7230}, {"loss": 0.694, "grad_norm": 0.7604748606681824, "learning_rate": 0.0002, "epoch": 2.3369916074887023, "step": 7240}, {"loss": 0.6475, "grad_norm": 0.5601189136505127, "learning_rate": 0.0002, "epoch": 2.340219496449322, "step": 7250}, {"loss": 0.6925, "grad_norm": 0.7099230885505676, "learning_rate": 0.0002, "epoch": 2.343447385409942, "step": 7260}, {"loss": 0.6333, "grad_norm": 0.6699047684669495, "learning_rate": 0.0002, "epoch": 2.3466752743705617, "step": 7270}, {"loss": 0.6434, "grad_norm": 0.7315047979354858, "learning_rate": 0.0002, "epoch": 2.3499031633311813, "step": 7280}, {"loss": 0.6927, "grad_norm": 0.632836103439331, "learning_rate": 0.0002, "epoch": 2.353131052291801, "step": 7290}, {"loss": 0.6458, "grad_norm": 0.9410115480422974, "learning_rate": 0.0002, "epoch": 2.356358941252421, "step": 7300}, {"loss": 0.6699, "grad_norm": 0.626554012298584, "learning_rate": 0.0002, "epoch": 2.3595868302130407, "step": 7310}, {"loss": 0.6495, "grad_norm": 0.7538444399833679, "learning_rate": 0.0002, "epoch": 2.3628147191736604, "step": 7320}, {"loss": 0.6321, "grad_norm": 0.6826626062393188, "learning_rate": 0.0002, "epoch": 2.36604260813428, "step": 7330}, {"loss": 0.6752, "grad_norm": 0.6739391088485718, "learning_rate": 0.0002, "epoch": 2.3692704970949, "step": 7340}, {"loss": 0.6518, "grad_norm": 0.7518446445465088, "learning_rate": 0.0002, "epoch": 2.3724983860555198, "step": 7350}, {"loss": 0.7142, "grad_norm": 0.714133083820343, "learning_rate": 0.0002, "epoch": 2.3757262750161394, "step": 7360}, {"loss": 0.6794, "grad_norm": 0.7144588232040405, "learning_rate": 0.0002, "epoch": 2.378954163976759, "step": 7370}, {"loss": 0.6922, "grad_norm": 0.6598120927810669, "learning_rate": 0.0002, "epoch": 2.382182052937379, "step": 7380}, {"loss": 0.6562, "grad_norm": 0.7079148292541504, "learning_rate": 0.0002, "epoch": 2.385409941897999, "step": 7390}, {"loss": 0.6492, "grad_norm": 0.6750902533531189, "learning_rate": 0.0002, "epoch": 2.3886378308586185, "step": 7400}, {"loss": 0.6398, "grad_norm": 0.7181967496871948, "learning_rate": 0.0002, "epoch": 2.391865719819238, "step": 7410}, {"loss": 0.6793, "grad_norm": 0.7720552086830139, "learning_rate": 0.0002, "epoch": 2.3950936087798578, "step": 7420}, {"loss": 0.6804, "grad_norm": 0.7592426538467407, "learning_rate": 0.0002, "epoch": 2.398321497740478, "step": 7430}, {"loss": 0.6667, "grad_norm": 0.7161896824836731, "learning_rate": 0.0002, "epoch": 2.4015493867010975, "step": 7440}, {"loss": 0.6891, "grad_norm": 0.8019260764122009, "learning_rate": 0.0002, "epoch": 2.404777275661717, "step": 7450}, {"loss": 0.6864, "grad_norm": 0.7093342542648315, "learning_rate": 0.0002, "epoch": 2.408005164622337, "step": 7460}, {"loss": 0.6445, "grad_norm": 0.8464207649230957, "learning_rate": 0.0002, "epoch": 2.411233053582957, "step": 7470}, {"loss": 0.6724, "grad_norm": 0.773666501045227, "learning_rate": 0.0002, "epoch": 2.4144609425435766, "step": 7480}, {"loss": 0.6774, "grad_norm": 0.8451611995697021, "learning_rate": 0.0002, "epoch": 2.4176888315041962, "step": 7490}, {"loss": 0.694, "grad_norm": 0.656795084476471, "learning_rate": 0.0002, "epoch": 2.420916720464816, "step": 7500}, {"loss": 0.6824, "grad_norm": 0.7129034996032715, "learning_rate": 0.0002, "epoch": 2.4241446094254355, "step": 7510}, {"loss": 0.711, "grad_norm": 0.8325763940811157, "learning_rate": 0.0002, "epoch": 2.4273724983860556, "step": 7520}, {"loss": 0.6238, "grad_norm": 0.7806527614593506, "learning_rate": 0.0002, "epoch": 2.4306003873466753, "step": 7530}, {"loss": 0.6972, "grad_norm": 0.6994536519050598, "learning_rate": 0.0002, "epoch": 2.433828276307295, "step": 7540}, {"loss": 0.6615, "grad_norm": 0.6898999214172363, "learning_rate": 0.0002, "epoch": 2.437056165267915, "step": 7550}, {"loss": 0.7108, "grad_norm": 0.719490647315979, "learning_rate": 0.0002, "epoch": 2.4402840542285347, "step": 7560}, {"loss": 0.668, "grad_norm": 0.6841562390327454, "learning_rate": 0.0002, "epoch": 2.4435119431891543, "step": 7570}, {"loss": 0.6504, "grad_norm": 0.7573311924934387, "learning_rate": 0.0002, "epoch": 2.446739832149774, "step": 7580}, {"loss": 0.6607, "grad_norm": 0.7295880317687988, "learning_rate": 0.0002, "epoch": 2.4499677211103936, "step": 7590}, {"loss": 0.6593, "grad_norm": 0.710136353969574, "learning_rate": 0.0002, "epoch": 2.4531956100710137, "step": 7600}, {"loss": 0.7137, "grad_norm": 0.6126235127449036, "learning_rate": 0.0002, "epoch": 2.4564234990316334, "step": 7610}, {"loss": 0.6562, "grad_norm": 0.8025609850883484, "learning_rate": 0.0002, "epoch": 2.459651387992253, "step": 7620}, {"loss": 0.6464, "grad_norm": 0.7839472889900208, "learning_rate": 0.0002, "epoch": 2.4628792769528727, "step": 7630}, {"loss": 0.6797, "grad_norm": 0.7253499031066895, "learning_rate": 0.0002, "epoch": 2.4661071659134928, "step": 7640}, {"loss": 0.7341, "grad_norm": 0.7918946743011475, "learning_rate": 0.0002, "epoch": 2.4693350548741124, "step": 7650}, {"loss": 0.6646, "grad_norm": 0.7930178046226501, "learning_rate": 0.0002, "epoch": 2.472562943834732, "step": 7660}, {"loss": 0.6294, "grad_norm": 0.6826170086860657, "learning_rate": 0.0002, "epoch": 2.4757908327953517, "step": 7670}, {"loss": 0.6697, "grad_norm": 0.6576805114746094, "learning_rate": 0.0002, "epoch": 2.4790187217559714, "step": 7680}, {"loss": 0.682, "grad_norm": 0.7012448310852051, "learning_rate": 0.0002, "epoch": 2.4822466107165915, "step": 7690}, {"loss": 0.6418, "grad_norm": 0.7774284482002258, "learning_rate": 0.0002, "epoch": 2.485474499677211, "step": 7700}, {"loss": 0.6566, "grad_norm": 0.6502766013145447, "learning_rate": 0.0002, "epoch": 2.4887023886378308, "step": 7710}, {"loss": 0.6965, "grad_norm": 0.7638739347457886, "learning_rate": 0.0002, "epoch": 2.4919302775984504, "step": 7720}, {"loss": 0.6454, "grad_norm": 0.6217384338378906, "learning_rate": 0.0002, "epoch": 2.4951581665590705, "step": 7730}, {"loss": 0.6837, "grad_norm": 0.7576302886009216, "learning_rate": 0.0002, "epoch": 2.49838605551969, "step": 7740}, {"loss": 0.6855, "grad_norm": 0.6877137422561646, "learning_rate": 0.0002, "epoch": 2.50161394448031, "step": 7750}, {"loss": 0.6604, "grad_norm": 0.6998329162597656, "learning_rate": 0.0002, "epoch": 2.5048418334409295, "step": 7760}, {"loss": 0.6666, "grad_norm": 0.7879213690757751, "learning_rate": 0.0002, "epoch": 2.508069722401549, "step": 7770}, {"loss": 0.715, "grad_norm": 0.7834980487823486, "learning_rate": 0.0002, "epoch": 2.5112976113621692, "step": 7780}, {"loss": 0.6954, "grad_norm": 0.7789630889892578, "learning_rate": 0.0002, "epoch": 2.514525500322789, "step": 7790}, {"loss": 0.6979, "grad_norm": 0.7403590083122253, "learning_rate": 0.0002, "epoch": 2.5177533892834085, "step": 7800}, {"loss": 0.6964, "grad_norm": 0.6029766201972961, "learning_rate": 0.0002, "epoch": 2.5209812782440286, "step": 7810}, {"loss": 0.6887, "grad_norm": 0.7061092257499695, "learning_rate": 0.0002, "epoch": 2.5242091672046483, "step": 7820}, {"loss": 0.6628, "grad_norm": 0.7120763659477234, "learning_rate": 0.0002, "epoch": 2.527437056165268, "step": 7830}, {"loss": 0.6876, "grad_norm": 0.6173675656318665, "learning_rate": 0.0002, "epoch": 2.5306649451258876, "step": 7840}, {"loss": 0.6635, "grad_norm": 0.9566813111305237, "learning_rate": 0.0002, "epoch": 2.5338928340865072, "step": 7850}, {"loss": 0.654, "grad_norm": 0.8497620224952698, "learning_rate": 0.0002, "epoch": 2.5371207230471273, "step": 7860}, {"loss": 0.644, "grad_norm": 0.7663498520851135, "learning_rate": 0.0002, "epoch": 2.540348612007747, "step": 7870}, {"loss": 0.6292, "grad_norm": 0.6329668760299683, "learning_rate": 0.0002, "epoch": 2.5435765009683666, "step": 7880}, {"loss": 0.686, "grad_norm": 0.8128195405006409, "learning_rate": 0.0002, "epoch": 2.5468043899289863, "step": 7890}, {"loss": 0.6619, "grad_norm": 0.6622284650802612, "learning_rate": 0.0002, "epoch": 2.5500322788896064, "step": 7900}, {"loss": 0.693, "grad_norm": 0.8460057973861694, "learning_rate": 0.0002, "epoch": 2.553260167850226, "step": 7910}, {"loss": 0.6619, "grad_norm": 0.6586956977844238, "learning_rate": 0.0002, "epoch": 2.5564880568108457, "step": 7920}, {"loss": 0.6976, "grad_norm": 0.7569382190704346, "learning_rate": 0.0002, "epoch": 2.5597159457714653, "step": 7930}, {"loss": 0.6235, "grad_norm": 0.6409714221954346, "learning_rate": 0.0002, "epoch": 2.562943834732085, "step": 7940}, {"loss": 0.6663, "grad_norm": 0.7031713128089905, "learning_rate": 0.0002, "epoch": 2.566171723692705, "step": 7950}, {"loss": 0.6344, "grad_norm": 0.7983605265617371, "learning_rate": 0.0002, "epoch": 2.5693996126533247, "step": 7960}, {"loss": 0.6834, "grad_norm": 0.7165433168411255, "learning_rate": 0.0002, "epoch": 2.5726275016139444, "step": 7970}, {"loss": 0.6517, "grad_norm": 0.6630598902702332, "learning_rate": 0.0002, "epoch": 2.5758553905745645, "step": 7980}, {"loss": 0.7164, "grad_norm": 0.5883122086524963, "learning_rate": 0.0002, "epoch": 2.579083279535184, "step": 7990}, {"loss": 0.6715, "grad_norm": 0.5928755402565002, "learning_rate": 0.0002, "epoch": 2.5823111684958038, "step": 8000}, {"loss": 0.6701, "grad_norm": 0.7843712568283081, "learning_rate": 0.0002, "epoch": 2.5855390574564234, "step": 8010}, {"loss": 0.6617, "grad_norm": 0.7206324338912964, "learning_rate": 0.0002, "epoch": 2.588766946417043, "step": 8020}, {"loss": 0.6968, "grad_norm": 0.812480092048645, "learning_rate": 0.0002, "epoch": 2.5919948353776627, "step": 8030}, {"loss": 0.6735, "grad_norm": 0.9843078255653381, "learning_rate": 0.0002, "epoch": 2.595222724338283, "step": 8040}, {"loss": 0.6877, "grad_norm": 0.7524392604827881, "learning_rate": 0.0002, "epoch": 2.5984506132989025, "step": 8050}, {"loss": 0.7188, "grad_norm": 0.6220380067825317, "learning_rate": 0.0002, "epoch": 2.601678502259522, "step": 8060}, {"loss": 0.6878, "grad_norm": 0.7461398243904114, "learning_rate": 0.0002, "epoch": 2.6049063912201422, "step": 8070}, {"loss": 0.6626, "grad_norm": 0.720974326133728, "learning_rate": 0.0002, "epoch": 2.608134280180762, "step": 8080}, {"loss": 0.6756, "grad_norm": 0.649509847164154, "learning_rate": 0.0002, "epoch": 2.6113621691413815, "step": 8090}, {"loss": 0.6394, "grad_norm": 0.6894662976264954, "learning_rate": 0.0002, "epoch": 2.614590058102001, "step": 8100}, {"loss": 0.6329, "grad_norm": 0.734433114528656, "learning_rate": 0.0002, "epoch": 2.617817947062621, "step": 8110}, {"loss": 0.6698, "grad_norm": 0.7468628883361816, "learning_rate": 0.0002, "epoch": 2.621045836023241, "step": 8120}, {"loss": 0.658, "grad_norm": 0.6508180499076843, "learning_rate": 0.0002, "epoch": 2.6242737249838606, "step": 8130}, {"loss": 0.6619, "grad_norm": 0.8735209107398987, "learning_rate": 0.0002, "epoch": 2.6275016139444802, "step": 8140}, {"loss": 0.6717, "grad_norm": 0.8162857294082642, "learning_rate": 0.0002, "epoch": 2.6307295029051003, "step": 8150}, {"loss": 0.6496, "grad_norm": 0.628872811794281, "learning_rate": 0.0002, "epoch": 2.63395739186572, "step": 8160}, {"loss": 0.6608, "grad_norm": 0.8078708052635193, "learning_rate": 0.0002, "epoch": 2.6371852808263396, "step": 8170}, {"loss": 0.6916, "grad_norm": 0.7849429845809937, "learning_rate": 0.0002, "epoch": 2.6404131697869593, "step": 8180}, {"loss": 0.6671, "grad_norm": 0.8115387558937073, "learning_rate": 0.0002, "epoch": 2.643641058747579, "step": 8190}, {"loss": 0.6761, "grad_norm": 0.7462222576141357, "learning_rate": 0.0002, "epoch": 2.6468689477081986, "step": 8200}, {"loss": 0.6923, "grad_norm": 0.753662645816803, "learning_rate": 0.0002, "epoch": 2.6500968366688187, "step": 8210}, {"loss": 0.6666, "grad_norm": 0.6100404858589172, "learning_rate": 0.0002, "epoch": 2.6533247256294383, "step": 8220}, {"loss": 0.7256, "grad_norm": 0.9084606766700745, "learning_rate": 0.0002, "epoch": 2.656552614590058, "step": 8230}, {"loss": 0.6385, "grad_norm": 0.6412538886070251, "learning_rate": 0.0002, "epoch": 2.659780503550678, "step": 8240}, {"loss": 0.7048, "grad_norm": 0.7640451192855835, "learning_rate": 0.0002, "epoch": 2.6630083925112977, "step": 8250}, {"loss": 0.6846, "grad_norm": 0.5972344875335693, "learning_rate": 0.0002, "epoch": 2.6662362814719174, "step": 8260}, {"loss": 0.682, "grad_norm": 0.6935883164405823, "learning_rate": 0.0002, "epoch": 2.669464170432537, "step": 8270}, {"loss": 0.6625, "grad_norm": 0.789399266242981, "learning_rate": 0.0002, "epoch": 2.6726920593931567, "step": 8280}, {"loss": 0.6541, "grad_norm": 0.7143490314483643, "learning_rate": 0.0002, "epoch": 2.675919948353777, "step": 8290}, {"loss": 0.6741, "grad_norm": 0.6670652627944946, "learning_rate": 0.0002, "epoch": 2.6791478373143964, "step": 8300}, {"loss": 0.6936, "grad_norm": 0.687108039855957, "learning_rate": 0.0002, "epoch": 2.682375726275016, "step": 8310}, {"loss": 0.7124, "grad_norm": 0.7914147973060608, "learning_rate": 0.0002, "epoch": 2.6856036152356357, "step": 8320}, {"loss": 0.6584, "grad_norm": 0.8398420214653015, "learning_rate": 0.0002, "epoch": 2.688831504196256, "step": 8330}, {"loss": 0.6679, "grad_norm": 0.6592720746994019, "learning_rate": 0.0002, "epoch": 2.6920593931568755, "step": 8340}, {"loss": 0.6673, "grad_norm": 0.6888470649719238, "learning_rate": 0.0002, "epoch": 2.695287282117495, "step": 8350}, {"loss": 0.6483, "grad_norm": 0.7127556800842285, "learning_rate": 0.0002, "epoch": 2.698515171078115, "step": 8360}, {"loss": 0.7013, "grad_norm": 0.6630286574363708, "learning_rate": 0.0002, "epoch": 2.7017430600387344, "step": 8370}, {"loss": 0.6842, "grad_norm": 0.8261964321136475, "learning_rate": 0.0002, "epoch": 2.7049709489993545, "step": 8380}, {"loss": 0.6613, "grad_norm": 0.717339813709259, "learning_rate": 0.0002, "epoch": 2.708198837959974, "step": 8390}, {"loss": 0.6929, "grad_norm": 0.651637613773346, "learning_rate": 0.0002, "epoch": 2.711426726920594, "step": 8400}, {"loss": 0.6796, "grad_norm": 0.7936098575592041, "learning_rate": 0.0002, "epoch": 2.714654615881214, "step": 8410}, {"loss": 0.696, "grad_norm": 0.8761560320854187, "learning_rate": 0.0002, "epoch": 2.7178825048418336, "step": 8420}, {"loss": 0.6889, "grad_norm": 0.6768006086349487, "learning_rate": 0.0002, "epoch": 2.7211103938024532, "step": 8430}, {"loss": 0.6844, "grad_norm": 0.7121055722236633, "learning_rate": 0.0002, "epoch": 2.724338282763073, "step": 8440}, {"loss": 0.6608, "grad_norm": 0.6811696887016296, "learning_rate": 0.0002, "epoch": 2.7275661717236925, "step": 8450}, {"loss": 0.7046, "grad_norm": 0.8168250918388367, "learning_rate": 0.0002, "epoch": 2.730794060684312, "step": 8460}, {"loss": 0.6809, "grad_norm": 0.660682737827301, "learning_rate": 0.0002, "epoch": 2.7340219496449323, "step": 8470}, {"loss": 0.6916, "grad_norm": 0.7369356155395508, "learning_rate": 0.0002, "epoch": 2.737249838605552, "step": 8480}, {"loss": 0.6383, "grad_norm": 0.7545099854469299, "learning_rate": 0.0002, "epoch": 2.7404777275661716, "step": 8490}, {"loss": 0.6917, "grad_norm": 0.6991257667541504, "learning_rate": 0.0002, "epoch": 2.7437056165267917, "step": 8500}, {"loss": 0.6953, "grad_norm": 0.7195324301719666, "learning_rate": 0.0002, "epoch": 2.7469335054874113, "step": 8510}, {"loss": 0.6955, "grad_norm": 0.8995378017425537, "learning_rate": 0.0002, "epoch": 2.750161394448031, "step": 8520}, {"loss": 0.684, "grad_norm": 0.6924123764038086, "learning_rate": 0.0002, "epoch": 2.7533892834086506, "step": 8530}, {"loss": 0.6675, "grad_norm": 0.6260585784912109, "learning_rate": 0.0002, "epoch": 2.7566171723692703, "step": 8540}, {"loss": 0.6613, "grad_norm": 0.7273091673851013, "learning_rate": 0.0002, "epoch": 2.7598450613298904, "step": 8550}, {"loss": 0.6853, "grad_norm": 0.720562219619751, "learning_rate": 0.0002, "epoch": 2.76307295029051, "step": 8560}, {"loss": 0.6452, "grad_norm": 0.6360004544258118, "learning_rate": 0.0002, "epoch": 2.7663008392511297, "step": 8570}, {"loss": 0.6118, "grad_norm": 0.7634525895118713, "learning_rate": 0.0002, "epoch": 2.76952872821175, "step": 8580}, {"loss": 0.686, "grad_norm": 0.6586076021194458, "learning_rate": 0.0002, "epoch": 2.7727566171723694, "step": 8590}, {"loss": 0.7072, "grad_norm": 0.6542639136314392, "learning_rate": 0.0002, "epoch": 2.775984506132989, "step": 8600}, {"loss": 0.7126, "grad_norm": 0.7650290727615356, "learning_rate": 0.0002, "epoch": 2.7792123950936087, "step": 8610}, {"loss": 0.6923, "grad_norm": 0.6551542282104492, "learning_rate": 0.0002, "epoch": 2.7824402840542284, "step": 8620}, {"loss": 0.6937, "grad_norm": 0.6915501952171326, "learning_rate": 0.0002, "epoch": 2.785668173014848, "step": 8630}, {"loss": 0.6586, "grad_norm": 0.8061493635177612, "learning_rate": 0.0002, "epoch": 2.788896061975468, "step": 8640}, {"loss": 0.6853, "grad_norm": 0.8403584957122803, "learning_rate": 0.0002, "epoch": 2.792123950936088, "step": 8650}, {"loss": 0.6616, "grad_norm": 0.6455532312393188, "learning_rate": 0.0002, "epoch": 2.7953518398967074, "step": 8660}, {"loss": 0.6819, "grad_norm": 0.8296352028846741, "learning_rate": 0.0002, "epoch": 2.7985797288573275, "step": 8670}, {"loss": 0.6678, "grad_norm": 0.7288752794265747, "learning_rate": 0.0002, "epoch": 2.801807617817947, "step": 8680}, {"loss": 0.6778, "grad_norm": 0.7628464102745056, "learning_rate": 0.0002, "epoch": 2.805035506778567, "step": 8690}, {"loss": 0.7176, "grad_norm": 0.9993878602981567, "learning_rate": 0.0002, "epoch": 2.8082633957391865, "step": 8700}, {"loss": 0.6414, "grad_norm": 0.6972465515136719, "learning_rate": 0.0002, "epoch": 2.811491284699806, "step": 8710}, {"loss": 0.6777, "grad_norm": 0.645042896270752, "learning_rate": 0.0002, "epoch": 2.8147191736604262, "step": 8720}, {"loss": 0.6587, "grad_norm": 0.6853853464126587, "learning_rate": 0.0002, "epoch": 2.817947062621046, "step": 8730}, {"loss": 0.6405, "grad_norm": 0.5935067534446716, "learning_rate": 0.0002, "epoch": 2.8211749515816655, "step": 8740}, {"loss": 0.6674, "grad_norm": 0.7336633205413818, "learning_rate": 0.0002, "epoch": 2.824402840542285, "step": 8750}, {"loss": 0.6662, "grad_norm": 0.7074962854385376, "learning_rate": 0.0002, "epoch": 2.8276307295029053, "step": 8760}, {"loss": 0.6744, "grad_norm": 0.6667559742927551, "learning_rate": 0.0002, "epoch": 2.830858618463525, "step": 8770}, {"loss": 0.7142, "grad_norm": 0.8101205229759216, "learning_rate": 0.0002, "epoch": 2.8340865074241446, "step": 8780}, {"loss": 0.6727, "grad_norm": 0.8841480016708374, "learning_rate": 0.0002, "epoch": 2.8373143963847642, "step": 8790}, {"loss": 0.6601, "grad_norm": 0.5891591310501099, "learning_rate": 0.0002, "epoch": 2.840542285345384, "step": 8800}, {"loss": 0.7114, "grad_norm": 0.667032778263092, "learning_rate": 0.0002, "epoch": 2.843770174306004, "step": 8810}, {"loss": 0.7295, "grad_norm": 0.7629773020744324, "learning_rate": 0.0002, "epoch": 2.8469980632666236, "step": 8820}, {"loss": 0.703, "grad_norm": 0.79471355676651, "learning_rate": 0.0002, "epoch": 2.8502259522272433, "step": 8830}, {"loss": 0.7278, "grad_norm": 0.7529178261756897, "learning_rate": 0.0002, "epoch": 2.8534538411878634, "step": 8840}, {"loss": 0.7163, "grad_norm": 0.7014923691749573, "learning_rate": 0.0002, "epoch": 2.856681730148483, "step": 8850}, {"loss": 0.6803, "grad_norm": 0.7996514439582825, "learning_rate": 0.0002, "epoch": 2.8599096191091027, "step": 8860}, {"loss": 0.6562, "grad_norm": 0.7044785618782043, "learning_rate": 0.0002, "epoch": 2.8631375080697223, "step": 8870}, {"loss": 0.6966, "grad_norm": 0.6792093515396118, "learning_rate": 0.0002, "epoch": 2.866365397030342, "step": 8880}, {"loss": 0.685, "grad_norm": 0.69175124168396, "learning_rate": 0.0002, "epoch": 2.8695932859909616, "step": 8890}, {"loss": 0.7225, "grad_norm": 0.7499129176139832, "learning_rate": 0.0002, "epoch": 2.8728211749515817, "step": 8900}, {"loss": 0.6922, "grad_norm": 0.7678789496421814, "learning_rate": 0.0002, "epoch": 2.8760490639122014, "step": 8910}, {"loss": 0.6803, "grad_norm": 0.7478128671646118, "learning_rate": 0.0002, "epoch": 2.879276952872821, "step": 8920}, {"loss": 0.6689, "grad_norm": 0.6767086386680603, "learning_rate": 0.0002, "epoch": 2.882504841833441, "step": 8930}, {"loss": 0.6587, "grad_norm": 0.7222196459770203, "learning_rate": 0.0002, "epoch": 2.885732730794061, "step": 8940}, {"loss": 0.6472, "grad_norm": 0.6950580477714539, "learning_rate": 0.0002, "epoch": 2.8889606197546804, "step": 8950}, {"loss": 0.7064, "grad_norm": 0.7759528160095215, "learning_rate": 0.0002, "epoch": 2.8921885087153, "step": 8960}, {"loss": 0.6349, "grad_norm": 0.6686919927597046, "learning_rate": 0.0002, "epoch": 2.8954163976759197, "step": 8970}, {"loss": 0.6801, "grad_norm": 0.9245954751968384, "learning_rate": 0.0002, "epoch": 2.89864428663654, "step": 8980}, {"loss": 0.6703, "grad_norm": 0.8734814524650574, "learning_rate": 0.0002, "epoch": 2.9018721755971595, "step": 8990}, {"loss": 0.6716, "grad_norm": 0.6056219339370728, "learning_rate": 0.0002, "epoch": 2.905100064557779, "step": 9000}, {"loss": 0.6535, "grad_norm": 0.7364102005958557, "learning_rate": 0.0002, "epoch": 2.9083279535183992, "step": 9010}, {"loss": 0.707, "grad_norm": 0.6563605070114136, "learning_rate": 0.0002, "epoch": 2.911555842479019, "step": 9020}, {"loss": 0.6564, "grad_norm": 0.659978985786438, "learning_rate": 0.0002, "epoch": 2.9147837314396385, "step": 9030}, {"loss": 0.7154, "grad_norm": 0.8176041841506958, "learning_rate": 0.0002, "epoch": 2.918011620400258, "step": 9040}, {"loss": 0.72, "grad_norm": 0.743677020072937, "learning_rate": 0.0002, "epoch": 2.921239509360878, "step": 9050}, {"loss": 0.7017, "grad_norm": 0.7418383359909058, "learning_rate": 0.0002, "epoch": 2.9244673983214975, "step": 9060}, {"loss": 0.6635, "grad_norm": 0.6916524767875671, "learning_rate": 0.0002, "epoch": 2.9276952872821176, "step": 9070}, {"loss": 0.6502, "grad_norm": 0.6559975743293762, "learning_rate": 0.0002, "epoch": 2.9309231762427372, "step": 9080}, {"loss": 0.7016, "grad_norm": 0.7431221008300781, "learning_rate": 0.0002, "epoch": 2.934151065203357, "step": 9090}, {"loss": 0.6829, "grad_norm": 0.7525941133499146, "learning_rate": 0.0002, "epoch": 2.937378954163977, "step": 9100}, {"loss": 0.7073, "grad_norm": 0.6860167384147644, "learning_rate": 0.0002, "epoch": 2.9406068431245966, "step": 9110}, {"loss": 0.6912, "grad_norm": 0.6467666029930115, "learning_rate": 0.0002, "epoch": 2.9438347320852163, "step": 9120}, {"loss": 0.7122, "grad_norm": 0.7595751285552979, "learning_rate": 0.0002, "epoch": 2.947062621045836, "step": 9130}, {"loss": 0.6951, "grad_norm": 0.6558279991149902, "learning_rate": 0.0002, "epoch": 2.9502905100064556, "step": 9140}, {"loss": 0.7081, "grad_norm": 0.6818708181381226, "learning_rate": 0.0002, "epoch": 2.9535183989670757, "step": 9150}, {"loss": 0.6921, "grad_norm": 0.8387085795402527, "learning_rate": 0.0002, "epoch": 2.9567462879276953, "step": 9160}, {"loss": 0.6914, "grad_norm": 0.7705109715461731, "learning_rate": 0.0002, "epoch": 2.959974176888315, "step": 9170}, {"loss": 0.6849, "grad_norm": 0.688106894493103, "learning_rate": 0.0002, "epoch": 2.9632020658489346, "step": 9180}, {"loss": 0.6833, "grad_norm": 0.659532368183136, "learning_rate": 0.0002, "epoch": 2.9664299548095547, "step": 9190}, {"loss": 0.6383, "grad_norm": 0.6839388608932495, "learning_rate": 0.0002, "epoch": 2.9696578437701744, "step": 9200}, {"loss": 0.6952, "grad_norm": 0.6927599310874939, "learning_rate": 0.0002, "epoch": 2.972885732730794, "step": 9210}, {"loss": 0.7338, "grad_norm": 0.6902472972869873, "learning_rate": 0.0002, "epoch": 2.9761136216914137, "step": 9220}, {"loss": 0.6671, "grad_norm": 0.620399534702301, "learning_rate": 0.0002, "epoch": 2.9793415106520333, "step": 9230}, {"loss": 0.6588, "grad_norm": 0.6812364459037781, "learning_rate": 0.0002, "epoch": 2.9825693996126534, "step": 9240}, {"loss": 0.6957, "grad_norm": 0.7681456208229065, "learning_rate": 0.0002, "epoch": 2.985797288573273, "step": 9250}, {"loss": 0.7113, "grad_norm": 0.7621907591819763, "learning_rate": 0.0002, "epoch": 2.9890251775338927, "step": 9260}, {"loss": 0.6601, "grad_norm": 0.6075740456581116, "learning_rate": 0.0002, "epoch": 2.992253066494513, "step": 9270}, {"loss": 0.6758, "grad_norm": 0.7100434899330139, "learning_rate": 0.0002, "epoch": 2.9954809554551325, "step": 9280}, {"loss": 0.73, "grad_norm": 0.7314488887786865, "learning_rate": 0.0002, "epoch": 2.998708844415752, "step": 9290}, {"eval_loss": 1.1434104442596436, "eval_runtime": 166.3732, "eval_samples_per_second": 4.406, "eval_steps_per_second": 0.553, "epoch": 3.0, "step": 9294}, {"loss": 0.6401, "grad_norm": 0.7408893704414368, "learning_rate": 0.0002, "epoch": 3.001936733376372, "step": 9300}, {"loss": 0.5182, "grad_norm": 0.9773574471473694, "learning_rate": 0.0002, "epoch": 3.0051646223369914, "step": 9310}, {"loss": 0.5432, "grad_norm": 0.7919653058052063, "learning_rate": 0.0002, "epoch": 3.0083925112976115, "step": 9320}, {"loss": 0.6156, "grad_norm": 0.9139202833175659, "learning_rate": 0.0002, "epoch": 3.011620400258231, "step": 9330}, {"loss": 0.5736, "grad_norm": 0.8296737670898438, "learning_rate": 0.0002, "epoch": 3.014848289218851, "step": 9340}, {"loss": 0.5567, "grad_norm": 0.786868155002594, "learning_rate": 0.0002, "epoch": 3.0180761781794705, "step": 9350}, {"loss": 0.578, "grad_norm": 0.5928055644035339, "learning_rate": 0.0002, "epoch": 3.0213040671400906, "step": 9360}, {"loss": 0.5376, "grad_norm": 0.8785701394081116, "learning_rate": 0.0002, "epoch": 3.0245319561007102, "step": 9370}, {"loss": 0.5664, "grad_norm": 0.7978872060775757, "learning_rate": 0.0002, "epoch": 3.02775984506133, "step": 9380}, {"loss": 0.5797, "grad_norm": 0.7160913348197937, "learning_rate": 0.0002, "epoch": 3.0309877340219495, "step": 9390}, {"loss": 0.5777, "grad_norm": 0.904465913772583, "learning_rate": 0.0002, "epoch": 3.034215622982569, "step": 9400}, {"loss": 0.5518, "grad_norm": 0.7082195281982422, "learning_rate": 0.0002, "epoch": 3.0374435119431893, "step": 9410}, {"loss": 0.5434, "grad_norm": 0.9686778783798218, "learning_rate": 0.0002, "epoch": 3.040671400903809, "step": 9420}, {"loss": 0.5692, "grad_norm": 0.8788613677024841, "learning_rate": 0.0002, "epoch": 3.0438992898644286, "step": 9430}, {"loss": 0.5599, "grad_norm": 0.8217582106590271, "learning_rate": 0.0002, "epoch": 3.0471271788250482, "step": 9440}, {"loss": 0.5405, "grad_norm": 0.7380914092063904, "learning_rate": 0.0002, "epoch": 3.0503550677856683, "step": 9450}, {"loss": 0.6258, "grad_norm": 0.7339285612106323, "learning_rate": 0.0002, "epoch": 3.053582956746288, "step": 9460}, {"loss": 0.5646, "grad_norm": 0.7175183296203613, "learning_rate": 0.0002, "epoch": 3.0568108457069076, "step": 9470}, {"loss": 0.5667, "grad_norm": 0.8275379538536072, "learning_rate": 0.0002, "epoch": 3.0600387346675273, "step": 9480}, {"loss": 0.5868, "grad_norm": 0.6544256806373596, "learning_rate": 0.0002, "epoch": 3.0632666236281474, "step": 9490}, {"loss": 0.5365, "grad_norm": 0.8193472623825073, "learning_rate": 0.0002, "epoch": 3.066494512588767, "step": 9500}, {"loss": 0.5614, "grad_norm": 0.7967836856842041, "learning_rate": 0.0002, "epoch": 3.0697224015493867, "step": 9510}, {"loss": 0.5629, "grad_norm": 0.8788684010505676, "learning_rate": 0.0002, "epoch": 3.0729502905100063, "step": 9520}, {"loss": 0.5397, "grad_norm": 0.9410629868507385, "learning_rate": 0.0002, "epoch": 3.0761781794706264, "step": 9530}, {"loss": 0.5473, "grad_norm": 0.7448706030845642, "learning_rate": 0.0002, "epoch": 3.079406068431246, "step": 9540}, {"loss": 0.5774, "grad_norm": 0.9149372577667236, "learning_rate": 0.0002, "epoch": 3.0826339573918657, "step": 9550}, {"loss": 0.5347, "grad_norm": 0.7265563607215881, "learning_rate": 0.0002, "epoch": 3.0858618463524854, "step": 9560}, {"loss": 0.5487, "grad_norm": 1.0305068492889404, "learning_rate": 0.0002, "epoch": 3.089089735313105, "step": 9570}, {"loss": 0.5884, "grad_norm": 0.7987357974052429, "learning_rate": 0.0002, "epoch": 3.092317624273725, "step": 9580}, {"loss": 0.6216, "grad_norm": 0.7733123898506165, "learning_rate": 0.0002, "epoch": 3.095545513234345, "step": 9590}, {"loss": 0.5848, "grad_norm": 1.0438069105148315, "learning_rate": 0.0002, "epoch": 3.0987734021949644, "step": 9600}, {"loss": 0.5612, "grad_norm": 0.7951784729957581, "learning_rate": 0.0002, "epoch": 3.102001291155584, "step": 9610}, {"loss": 0.6184, "grad_norm": 0.7776783108711243, "learning_rate": 0.0002, "epoch": 3.105229180116204, "step": 9620}, {"loss": 0.5626, "grad_norm": 0.7060676217079163, "learning_rate": 0.0002, "epoch": 3.108457069076824, "step": 9630}, {"loss": 0.5731, "grad_norm": 0.871569037437439, "learning_rate": 0.0002, "epoch": 3.1116849580374435, "step": 9640}, {"loss": 0.5168, "grad_norm": 0.8873385787010193, "learning_rate": 0.0002, "epoch": 3.114912846998063, "step": 9650}, {"loss": 0.5985, "grad_norm": 0.750998318195343, "learning_rate": 0.0002, "epoch": 3.118140735958683, "step": 9660}, {"loss": 0.5741, "grad_norm": 0.8678529262542725, "learning_rate": 0.0002, "epoch": 3.121368624919303, "step": 9670}, {"loss": 0.5831, "grad_norm": 0.7706599235534668, "learning_rate": 0.0002, "epoch": 3.1245965138799225, "step": 9680}, {"loss": 0.6142, "grad_norm": 0.8317574858665466, "learning_rate": 0.0002, "epoch": 3.127824402840542, "step": 9690}, {"loss": 0.5634, "grad_norm": 0.801800012588501, "learning_rate": 0.0002, "epoch": 3.131052291801162, "step": 9700}, {"loss": 0.6044, "grad_norm": 0.8574623465538025, "learning_rate": 0.0002, "epoch": 3.134280180761782, "step": 9710}, {"loss": 0.6072, "grad_norm": 0.6556540727615356, "learning_rate": 0.0002, "epoch": 3.1375080697224016, "step": 9720}, {"loss": 0.6058, "grad_norm": 0.8555161952972412, "learning_rate": 0.0002, "epoch": 3.1407359586830212, "step": 9730}, {"loss": 0.6069, "grad_norm": 0.8825467824935913, "learning_rate": 0.0002, "epoch": 3.143963847643641, "step": 9740}, {"loss": 0.5689, "grad_norm": 0.8297156691551208, "learning_rate": 0.0002, "epoch": 3.147191736604261, "step": 9750}, {"loss": 0.5738, "grad_norm": 0.7710384726524353, "learning_rate": 0.0002, "epoch": 3.1504196255648806, "step": 9760}, {"loss": 0.571, "grad_norm": 0.8778039216995239, "learning_rate": 0.0002, "epoch": 3.1536475145255003, "step": 9770}, {"loss": 0.5913, "grad_norm": 0.9014058113098145, "learning_rate": 0.0002, "epoch": 3.15687540348612, "step": 9780}, {"loss": 0.5496, "grad_norm": 0.6856890320777893, "learning_rate": 0.0002, "epoch": 3.16010329244674, "step": 9790}, {"loss": 0.558, "grad_norm": 0.6520644426345825, "learning_rate": 0.0002, "epoch": 3.1633311814073597, "step": 9800}, {"loss": 0.6024, "grad_norm": 0.7250499129295349, "learning_rate": 0.0002, "epoch": 3.1665590703679793, "step": 9810}, {"loss": 0.5823, "grad_norm": 0.8331542015075684, "learning_rate": 0.0002, "epoch": 3.169786959328599, "step": 9820}, {"loss": 0.5803, "grad_norm": 0.8531261682510376, "learning_rate": 0.0002, "epoch": 3.1730148482892186, "step": 9830}, {"loss": 0.57, "grad_norm": 0.8997558355331421, "learning_rate": 0.0002, "epoch": 3.1762427372498387, "step": 9840}, {"loss": 0.5921, "grad_norm": 0.708335280418396, "learning_rate": 0.0002, "epoch": 3.1794706262104584, "step": 9850}, {"loss": 0.5997, "grad_norm": 1.0074886083602905, "learning_rate": 0.0002, "epoch": 3.182698515171078, "step": 9860}, {"loss": 0.573, "grad_norm": 1.0804681777954102, "learning_rate": 0.0002, "epoch": 3.1859264041316977, "step": 9870}, {"loss": 0.5527, "grad_norm": 0.9510730504989624, "learning_rate": 0.0002, "epoch": 3.189154293092318, "step": 9880}, {"loss": 0.6401, "grad_norm": 0.7211061716079712, "learning_rate": 0.0002, "epoch": 3.1923821820529374, "step": 9890}, {"loss": 0.5563, "grad_norm": 0.8767086267471313, "learning_rate": 0.0002, "epoch": 3.195610071013557, "step": 9900}, {"loss": 0.5747, "grad_norm": 0.8388153314590454, "learning_rate": 0.0002, "epoch": 3.1988379599741767, "step": 9910}, {"loss": 0.5681, "grad_norm": 0.8038473725318909, "learning_rate": 0.0002, "epoch": 3.202065848934797, "step": 9920}, {"loss": 0.5594, "grad_norm": 0.8187747001647949, "learning_rate": 0.0002, "epoch": 3.2052937378954165, "step": 9930}, {"loss": 0.5813, "grad_norm": 0.7427355051040649, "learning_rate": 0.0002, "epoch": 3.208521626856036, "step": 9940}, {"loss": 0.5709, "grad_norm": 0.8017025589942932, "learning_rate": 0.0002, "epoch": 3.211749515816656, "step": 9950}, {"loss": 0.6106, "grad_norm": 0.738595187664032, "learning_rate": 0.0002, "epoch": 3.214977404777276, "step": 9960}, {"loss": 0.6006, "grad_norm": 0.7521342039108276, "learning_rate": 0.0002, "epoch": 3.2182052937378955, "step": 9970}, {"loss": 0.5706, "grad_norm": 0.840329110622406, "learning_rate": 0.0002, "epoch": 3.221433182698515, "step": 9980}, {"loss": 0.5666, "grad_norm": 0.9809671640396118, "learning_rate": 0.0002, "epoch": 3.224661071659135, "step": 9990}, {"loss": 0.6223, "grad_norm": 0.8456943035125732, "learning_rate": 0.0002, "epoch": 3.2278889606197545, "step": 10000}, {"loss": 0.5798, "grad_norm": 0.8962995409965515, "learning_rate": 0.0002, "epoch": 3.2311168495803746, "step": 10010}, {"loss": 0.5399, "grad_norm": 0.6492817401885986, "learning_rate": 0.0002, "epoch": 3.2343447385409942, "step": 10020}, {"loss": 0.5678, "grad_norm": 1.0471255779266357, "learning_rate": 0.0002, "epoch": 3.237572627501614, "step": 10030}, {"loss": 0.5452, "grad_norm": 0.7995471358299255, "learning_rate": 0.0002, "epoch": 3.2408005164622335, "step": 10040}, {"loss": 0.615, "grad_norm": 0.7231964468955994, "learning_rate": 0.0002, "epoch": 3.2440284054228536, "step": 10050}, {"loss": 0.5586, "grad_norm": 0.639630138874054, "learning_rate": 0.0002, "epoch": 3.2472562943834733, "step": 10060}, {"loss": 0.6271, "grad_norm": 0.7957055568695068, "learning_rate": 0.0002, "epoch": 3.250484183344093, "step": 10070}, {"loss": 0.5845, "grad_norm": 0.7735482454299927, "learning_rate": 0.0002, "epoch": 3.2537120723047126, "step": 10080}, {"loss": 0.5791, "grad_norm": 0.8139488101005554, "learning_rate": 0.0002, "epoch": 3.2569399612653323, "step": 10090}, {"loss": 0.6049, "grad_norm": 0.8113240003585815, "learning_rate": 0.0002, "epoch": 3.2601678502259523, "step": 10100}, {"loss": 0.5617, "grad_norm": 0.7735909819602966, "learning_rate": 0.0002, "epoch": 3.263395739186572, "step": 10110}, {"loss": 0.5964, "grad_norm": 0.7760744094848633, "learning_rate": 0.0002, "epoch": 3.2666236281471916, "step": 10120}, {"loss": 0.5786, "grad_norm": 0.8078505396842957, "learning_rate": 0.0002, "epoch": 3.2698515171078113, "step": 10130}, {"loss": 0.5904, "grad_norm": 0.983648955821991, "learning_rate": 0.0002, "epoch": 3.2730794060684314, "step": 10140}, {"loss": 0.596, "grad_norm": 0.7131832242012024, "learning_rate": 0.0002, "epoch": 3.276307295029051, "step": 10150}, {"loss": 0.5986, "grad_norm": 0.924493134021759, "learning_rate": 0.0002, "epoch": 3.2795351839896707, "step": 10160}, {"loss": 0.5733, "grad_norm": 0.9371112585067749, "learning_rate": 0.0002, "epoch": 3.2827630729502904, "step": 10170}, {"loss": 0.5891, "grad_norm": 0.8989261388778687, "learning_rate": 0.0002, "epoch": 3.2859909619109104, "step": 10180}, {"loss": 0.6143, "grad_norm": 0.8130394816398621, "learning_rate": 0.0002, "epoch": 3.28921885087153, "step": 10190}, {"loss": 0.5555, "grad_norm": 0.9899941086769104, "learning_rate": 0.0002, "epoch": 3.2924467398321497, "step": 10200}, {"loss": 0.5899, "grad_norm": 1.007038950920105, "learning_rate": 0.0002, "epoch": 3.2956746287927694, "step": 10210}, {"loss": 0.5713, "grad_norm": 0.7465066313743591, "learning_rate": 0.0002, "epoch": 3.2989025177533895, "step": 10220}, {"loss": 0.6307, "grad_norm": 0.7202590703964233, "learning_rate": 0.0002, "epoch": 3.302130406714009, "step": 10230}, {"loss": 0.5659, "grad_norm": 0.6258249282836914, "learning_rate": 0.0002, "epoch": 3.305358295674629, "step": 10240}, {"loss": 0.5869, "grad_norm": 0.8996058702468872, "learning_rate": 0.0002, "epoch": 3.3085861846352485, "step": 10250}, {"loss": 0.5825, "grad_norm": 0.9550982713699341, "learning_rate": 0.0002, "epoch": 3.311814073595868, "step": 10260}, {"loss": 0.5602, "grad_norm": 0.7010059952735901, "learning_rate": 0.0002, "epoch": 3.315041962556488, "step": 10270}, {"loss": 0.5853, "grad_norm": 0.9639869332313538, "learning_rate": 0.0002, "epoch": 3.318269851517108, "step": 10280}, {"loss": 0.5362, "grad_norm": 1.0192502737045288, "learning_rate": 0.0002, "epoch": 3.3214977404777275, "step": 10290}, {"loss": 0.5605, "grad_norm": 0.7953670024871826, "learning_rate": 0.0002, "epoch": 3.324725629438347, "step": 10300}, {"loss": 0.6386, "grad_norm": 0.7436774969100952, "learning_rate": 0.0002, "epoch": 3.3279535183989672, "step": 10310}, {"loss": 0.5823, "grad_norm": 0.7846777439117432, "learning_rate": 0.0002, "epoch": 3.331181407359587, "step": 10320}, {"loss": 0.6119, "grad_norm": 0.8963494896888733, "learning_rate": 0.0002, "epoch": 3.3344092963202066, "step": 10330}, {"loss": 0.5872, "grad_norm": 0.6876392364501953, "learning_rate": 0.0002, "epoch": 3.337637185280826, "step": 10340}, {"loss": 0.6291, "grad_norm": 0.9161638021469116, "learning_rate": 0.0002, "epoch": 3.340865074241446, "step": 10350}, {"loss": 0.5955, "grad_norm": 0.8964458107948303, "learning_rate": 0.0002, "epoch": 3.344092963202066, "step": 10360}, {"loss": 0.5965, "grad_norm": 0.9052296280860901, "learning_rate": 0.0002, "epoch": 3.3473208521626856, "step": 10370}, {"loss": 0.5958, "grad_norm": 0.9292596578598022, "learning_rate": 0.0002, "epoch": 3.3505487411233053, "step": 10380}, {"loss": 0.5487, "grad_norm": 0.9605957269668579, "learning_rate": 0.0002, "epoch": 3.3537766300839253, "step": 10390}, {"loss": 0.6214, "grad_norm": 1.0198872089385986, "learning_rate": 0.0002, "epoch": 3.357004519044545, "step": 10400}, {"loss": 0.6053, "grad_norm": 0.7043630480766296, "learning_rate": 0.0002, "epoch": 3.3602324080051647, "step": 10410}, {"loss": 0.5451, "grad_norm": 1.0533326864242554, "learning_rate": 0.0002, "epoch": 3.3634602969657843, "step": 10420}, {"loss": 0.6134, "grad_norm": 0.7552485466003418, "learning_rate": 0.0002, "epoch": 3.366688185926404, "step": 10430}, {"loss": 0.631, "grad_norm": 0.692708432674408, "learning_rate": 0.0002, "epoch": 3.369916074887024, "step": 10440}, {"loss": 0.631, "grad_norm": 0.985952615737915, "learning_rate": 0.0002, "epoch": 3.3731439638476437, "step": 10450}, {"loss": 0.5689, "grad_norm": 0.6749676465988159, "learning_rate": 0.0002, "epoch": 3.3763718528082634, "step": 10460}, {"loss": 0.5724, "grad_norm": 0.9514535665512085, "learning_rate": 0.0002, "epoch": 3.379599741768883, "step": 10470}, {"loss": 0.5982, "grad_norm": 1.2681142091751099, "learning_rate": 0.0002, "epoch": 3.382827630729503, "step": 10480}, {"loss": 0.5778, "grad_norm": 1.031968355178833, "learning_rate": 0.0002, "epoch": 3.3860555196901228, "step": 10490}, {"loss": 0.5964, "grad_norm": 0.8061563968658447, "learning_rate": 0.0002, "epoch": 3.3892834086507424, "step": 10500}, {"loss": 0.6094, "grad_norm": 1.0515062808990479, "learning_rate": 0.0002, "epoch": 3.392511297611362, "step": 10510}, {"loss": 0.542, "grad_norm": 0.9055540561676025, "learning_rate": 0.0002, "epoch": 3.3957391865719817, "step": 10520}, {"loss": 0.6148, "grad_norm": 0.9318141341209412, "learning_rate": 0.0002, "epoch": 3.398967075532602, "step": 10530}, {"loss": 0.5722, "grad_norm": 0.8266817331314087, "learning_rate": 0.0002, "epoch": 3.4021949644932215, "step": 10540}, {"loss": 0.6015, "grad_norm": 1.2322112321853638, "learning_rate": 0.0002, "epoch": 3.405422853453841, "step": 10550}, {"loss": 0.6215, "grad_norm": 0.9535136818885803, "learning_rate": 0.0002, "epoch": 3.4086507424144608, "step": 10560}, {"loss": 0.561, "grad_norm": 0.9243819117546082, "learning_rate": 0.0002, "epoch": 3.411878631375081, "step": 10570}, {"loss": 0.5844, "grad_norm": 0.9011809825897217, "learning_rate": 0.0002, "epoch": 3.4151065203357005, "step": 10580}, {"loss": 0.6175, "grad_norm": 0.9923036694526672, "learning_rate": 0.0002, "epoch": 3.41833440929632, "step": 10590}, {"loss": 0.6033, "grad_norm": 0.8903067111968994, "learning_rate": 0.0002, "epoch": 3.42156229825694, "step": 10600}, {"loss": 0.5563, "grad_norm": 0.7101534605026245, "learning_rate": 0.0002, "epoch": 3.42479018721756, "step": 10610}, {"loss": 0.598, "grad_norm": 0.8186570405960083, "learning_rate": 0.0002, "epoch": 3.4280180761781796, "step": 10620}, {"loss": 0.5897, "grad_norm": 0.9480205774307251, "learning_rate": 0.0002, "epoch": 3.431245965138799, "step": 10630}, {"loss": 0.5798, "grad_norm": 1.1370961666107178, "learning_rate": 0.0002, "epoch": 3.434473854099419, "step": 10640}, {"loss": 0.5779, "grad_norm": 1.017669677734375, "learning_rate": 0.0002, "epoch": 3.437701743060039, "step": 10650}, {"loss": 0.5999, "grad_norm": 0.7625100016593933, "learning_rate": 0.0002, "epoch": 3.4409296320206586, "step": 10660}, {"loss": 0.5705, "grad_norm": 0.9288196563720703, "learning_rate": 0.0002, "epoch": 3.4441575209812783, "step": 10670}, {"loss": 0.6255, "grad_norm": 0.8800460696220398, "learning_rate": 0.0002, "epoch": 3.447385409941898, "step": 10680}, {"loss": 0.6245, "grad_norm": 0.7499661445617676, "learning_rate": 0.0002, "epoch": 3.4506132989025176, "step": 10690}, {"loss": 0.5979, "grad_norm": 0.8254973292350769, "learning_rate": 0.0002, "epoch": 3.4538411878631377, "step": 10700}, {"loss": 0.5742, "grad_norm": 0.8735857605934143, "learning_rate": 0.0002, "epoch": 3.4570690768237573, "step": 10710}, {"loss": 0.6356, "grad_norm": 0.9601819515228271, "learning_rate": 0.0002, "epoch": 3.460296965784377, "step": 10720}, {"loss": 0.5574, "grad_norm": 0.8031058311462402, "learning_rate": 0.0002, "epoch": 3.4635248547449966, "step": 10730}, {"loss": 0.6078, "grad_norm": 0.8039247393608093, "learning_rate": 0.0002, "epoch": 3.4667527437056167, "step": 10740}, {"loss": 0.593, "grad_norm": 0.8936953544616699, "learning_rate": 0.0002, "epoch": 3.4699806326662364, "step": 10750}, {"loss": 0.5971, "grad_norm": 0.8201186060905457, "learning_rate": 0.0002, "epoch": 3.473208521626856, "step": 10760}, {"loss": 0.5875, "grad_norm": 1.0064148902893066, "learning_rate": 0.0002, "epoch": 3.4764364105874757, "step": 10770}, {"loss": 0.5639, "grad_norm": 0.8617483377456665, "learning_rate": 0.0002, "epoch": 3.4796642995480953, "step": 10780}, {"loss": 0.6022, "grad_norm": 0.8532096147537231, "learning_rate": 0.0002, "epoch": 3.4828921885087154, "step": 10790}, {"loss": 0.5765, "grad_norm": 0.8646879196166992, "learning_rate": 0.0002, "epoch": 3.486120077469335, "step": 10800}, {"loss": 0.5799, "grad_norm": 0.7962660789489746, "learning_rate": 0.0002, "epoch": 3.4893479664299547, "step": 10810}, {"loss": 0.5398, "grad_norm": 0.9560028314590454, "learning_rate": 0.0002, "epoch": 3.492575855390575, "step": 10820}, {"loss": 0.6082, "grad_norm": 0.928439736366272, "learning_rate": 0.0002, "epoch": 3.4958037443511945, "step": 10830}, {"loss": 0.6112, "grad_norm": 0.8219282627105713, "learning_rate": 0.0002, "epoch": 3.499031633311814, "step": 10840}, {"loss": 0.6369, "grad_norm": 0.7918338179588318, "learning_rate": 0.0002, "epoch": 3.5022595222724338, "step": 10850}, {"loss": 0.6164, "grad_norm": 0.961295485496521, "learning_rate": 0.0002, "epoch": 3.5054874112330534, "step": 10860}, {"loss": 0.5534, "grad_norm": 1.0731624364852905, "learning_rate": 0.0002, "epoch": 3.5087153001936735, "step": 10870}, {"loss": 0.5829, "grad_norm": 0.9551863074302673, "learning_rate": 0.0002, "epoch": 3.511943189154293, "step": 10880}, {"loss": 0.5746, "grad_norm": 0.8409819602966309, "learning_rate": 0.0002, "epoch": 3.515171078114913, "step": 10890}, {"loss": 0.5813, "grad_norm": 0.7546320557594299, "learning_rate": 0.0002, "epoch": 3.5183989670755325, "step": 10900}, {"loss": 0.6184, "grad_norm": 0.7505252361297607, "learning_rate": 0.0002, "epoch": 3.5216268560361526, "step": 10910}, {"loss": 0.5649, "grad_norm": 0.7505561113357544, "learning_rate": 0.0002, "epoch": 3.524854744996772, "step": 10920}, {"loss": 0.6277, "grad_norm": 1.086177945137024, "learning_rate": 0.0002, "epoch": 3.528082633957392, "step": 10930}, {"loss": 0.5983, "grad_norm": 0.7721118330955505, "learning_rate": 0.0002, "epoch": 3.5313105229180115, "step": 10940}, {"loss": 0.5919, "grad_norm": 0.9567878246307373, "learning_rate": 0.0002, "epoch": 3.534538411878631, "step": 10950}, {"loss": 0.6261, "grad_norm": 0.8377360105514526, "learning_rate": 0.0002, "epoch": 3.5377663008392513, "step": 10960}, {"loss": 0.633, "grad_norm": 1.0174858570098877, "learning_rate": 0.0002, "epoch": 3.540994189799871, "step": 10970}, {"loss": 0.599, "grad_norm": 0.8164418935775757, "learning_rate": 0.0002, "epoch": 3.5442220787604906, "step": 10980}, {"loss": 0.5471, "grad_norm": 0.8959241509437561, "learning_rate": 0.0002, "epoch": 3.5474499677211107, "step": 10990}, {"loss": 0.6195, "grad_norm": 1.0154379606246948, "learning_rate": 0.0002, "epoch": 3.5506778566817303, "step": 11000}, {"loss": 0.5835, "grad_norm": 0.7812292575836182, "learning_rate": 0.0002, "epoch": 3.55390574564235, "step": 11010}, {"loss": 0.6052, "grad_norm": 0.9849029779434204, "learning_rate": 0.0002, "epoch": 3.5571336346029696, "step": 11020}, {"loss": 0.5689, "grad_norm": 0.8826184272766113, "learning_rate": 0.0002, "epoch": 3.5603615235635893, "step": 11030}, {"loss": 0.601, "grad_norm": 0.9039685726165771, "learning_rate": 0.0002, "epoch": 3.563589412524209, "step": 11040}, {"loss": 0.5996, "grad_norm": 0.9585249423980713, "learning_rate": 0.0002, "epoch": 3.566817301484829, "step": 11050}, {"loss": 0.5714, "grad_norm": 0.8083069324493408, "learning_rate": 0.0002, "epoch": 3.5700451904454487, "step": 11060}, {"loss": 0.6317, "grad_norm": 0.9528678059577942, "learning_rate": 0.0002, "epoch": 3.5732730794060683, "step": 11070}, {"loss": 0.6278, "grad_norm": 0.8297588229179382, "learning_rate": 0.0002, "epoch": 3.5765009683666884, "step": 11080}, {"loss": 0.5919, "grad_norm": 0.8191716074943542, "learning_rate": 0.0002, "epoch": 3.579728857327308, "step": 11090}, {"loss": 0.5971, "grad_norm": 0.8056275844573975, "learning_rate": 0.0002, "epoch": 3.5829567462879277, "step": 11100}, {"loss": 0.6325, "grad_norm": 0.701930582523346, "learning_rate": 0.0002, "epoch": 3.5861846352485474, "step": 11110}, {"loss": 0.6088, "grad_norm": 0.7644643187522888, "learning_rate": 0.0002, "epoch": 3.589412524209167, "step": 11120}, {"loss": 0.605, "grad_norm": 0.668004035949707, "learning_rate": 0.0002, "epoch": 3.592640413169787, "step": 11130}, {"loss": 0.5735, "grad_norm": 0.8849539756774902, "learning_rate": 0.0002, "epoch": 3.5958683021304068, "step": 11140}, {"loss": 0.6412, "grad_norm": 0.8123571276664734, "learning_rate": 0.0002, "epoch": 3.5990961910910264, "step": 11150}, {"loss": 0.5626, "grad_norm": 0.7591469287872314, "learning_rate": 0.0002, "epoch": 3.602324080051646, "step": 11160}, {"loss": 0.5668, "grad_norm": 0.776466965675354, "learning_rate": 0.0002, "epoch": 3.605551969012266, "step": 11170}, {"loss": 0.6631, "grad_norm": 0.9156150221824646, "learning_rate": 0.0002, "epoch": 3.608779857972886, "step": 11180}, {"loss": 0.5867, "grad_norm": 0.7517618536949158, "learning_rate": 0.0002, "epoch": 3.6120077469335055, "step": 11190}, {"loss": 0.5939, "grad_norm": 0.931239128112793, "learning_rate": 0.0002, "epoch": 3.615235635894125, "step": 11200}, {"loss": 0.5736, "grad_norm": 0.9107872843742371, "learning_rate": 0.0002, "epoch": 3.6184635248547448, "step": 11210}, {"loss": 0.5665, "grad_norm": 0.7624770998954773, "learning_rate": 0.0002, "epoch": 3.621691413815365, "step": 11220}, {"loss": 0.6033, "grad_norm": 0.8129580616950989, "learning_rate": 0.0002, "epoch": 3.6249193027759845, "step": 11230}, {"loss": 0.6192, "grad_norm": 0.7339836955070496, "learning_rate": 0.0002, "epoch": 3.628147191736604, "step": 11240}, {"loss": 0.5976, "grad_norm": 0.8901296854019165, "learning_rate": 0.0002, "epoch": 3.6313750806972243, "step": 11250}, {"loss": 0.5977, "grad_norm": 1.1374726295471191, "learning_rate": 0.0002, "epoch": 3.634602969657844, "step": 11260}, {"loss": 0.5859, "grad_norm": 0.7438275218009949, "learning_rate": 0.0002, "epoch": 3.6378308586184636, "step": 11270}, {"loss": 0.5757, "grad_norm": 0.808646559715271, "learning_rate": 0.0002, "epoch": 3.641058747579083, "step": 11280}, {"loss": 0.6244, "grad_norm": 1.091810941696167, "learning_rate": 0.0002, "epoch": 3.644286636539703, "step": 11290}, {"loss": 0.5957, "grad_norm": 0.8439257144927979, "learning_rate": 0.0002, "epoch": 3.6475145255003225, "step": 11300}, {"loss": 0.6115, "grad_norm": 0.9720633029937744, "learning_rate": 0.0002, "epoch": 3.6507424144609426, "step": 11310}, {"loss": 0.5942, "grad_norm": 0.738571047782898, "learning_rate": 0.0002, "epoch": 3.6539703034215623, "step": 11320}, {"loss": 0.6029, "grad_norm": 0.6961580514907837, "learning_rate": 0.0002, "epoch": 3.657198192382182, "step": 11330}, {"loss": 0.6226, "grad_norm": 0.8192131519317627, "learning_rate": 0.0002, "epoch": 3.660426081342802, "step": 11340}, {"loss": 0.6155, "grad_norm": 0.8367205858230591, "learning_rate": 0.0002, "epoch": 3.6636539703034217, "step": 11350}, {"loss": 0.586, "grad_norm": 0.7735666632652283, "learning_rate": 0.0002, "epoch": 3.6668818592640413, "step": 11360}, {"loss": 0.6113, "grad_norm": 0.6507132649421692, "learning_rate": 0.0002, "epoch": 3.670109748224661, "step": 11370}, {"loss": 0.6273, "grad_norm": 0.8271192312240601, "learning_rate": 0.0002, "epoch": 3.6733376371852806, "step": 11380}, {"loss": 0.5995, "grad_norm": 0.8724204301834106, "learning_rate": 0.0002, "epoch": 3.6765655261459007, "step": 11390}, {"loss": 0.6131, "grad_norm": 0.8448445200920105, "learning_rate": 0.0002, "epoch": 3.6797934151065204, "step": 11400}, {"loss": 0.5923, "grad_norm": 0.6756882071495056, "learning_rate": 0.0002, "epoch": 3.68302130406714, "step": 11410}, {"loss": 0.6443, "grad_norm": 0.7859625816345215, "learning_rate": 0.0002, "epoch": 3.68624919302776, "step": 11420}, {"loss": 0.6567, "grad_norm": 0.8929487466812134, "learning_rate": 0.0002, "epoch": 3.6894770819883798, "step": 11430}, {"loss": 0.6474, "grad_norm": 0.8163391351699829, "learning_rate": 0.0002, "epoch": 3.6927049709489994, "step": 11440}, {"loss": 0.6467, "grad_norm": 0.8948464393615723, "learning_rate": 0.0002, "epoch": 3.695932859909619, "step": 11450}, {"loss": 0.624, "grad_norm": 0.8654782176017761, "learning_rate": 0.0002, "epoch": 3.6991607488702387, "step": 11460}, {"loss": 0.6142, "grad_norm": 0.9514864683151245, "learning_rate": 0.0002, "epoch": 3.7023886378308584, "step": 11470}, {"loss": 0.606, "grad_norm": 0.7298579812049866, "learning_rate": 0.0002, "epoch": 3.7056165267914785, "step": 11480}, {"loss": 0.5853, "grad_norm": 0.9266309142112732, "learning_rate": 0.0002, "epoch": 3.708844415752098, "step": 11490}, {"loss": 0.6122, "grad_norm": 0.8608686923980713, "learning_rate": 0.0002, "epoch": 3.7120723047127178, "step": 11500}, {"loss": 0.6348, "grad_norm": 0.921788215637207, "learning_rate": 0.0002, "epoch": 3.715300193673338, "step": 11510}, {"loss": 0.6191, "grad_norm": 0.8537021279335022, "learning_rate": 0.0002, "epoch": 3.7185280826339575, "step": 11520}, {"loss": 0.6228, "grad_norm": 1.115194320678711, "learning_rate": 0.0002, "epoch": 3.721755971594577, "step": 11530}, {"loss": 0.5828, "grad_norm": 0.7614817023277283, "learning_rate": 0.0002, "epoch": 3.724983860555197, "step": 11540}, {"loss": 0.5776, "grad_norm": 0.871999204158783, "learning_rate": 0.0002, "epoch": 3.7282117495158165, "step": 11550}, {"loss": 0.5962, "grad_norm": 0.9668049812316895, "learning_rate": 0.0002, "epoch": 3.7314396384764366, "step": 11560}, {"loss": 0.5534, "grad_norm": 1.2185815572738647, "learning_rate": 0.0002, "epoch": 3.734667527437056, "step": 11570}, {"loss": 0.5936, "grad_norm": 0.8258453011512756, "learning_rate": 0.0002, "epoch": 3.737895416397676, "step": 11580}, {"loss": 0.5853, "grad_norm": 0.8708966374397278, "learning_rate": 0.0002, "epoch": 3.7411233053582955, "step": 11590}, {"loss": 0.5847, "grad_norm": 0.7784267663955688, "learning_rate": 0.0002, "epoch": 3.7443511943189156, "step": 11600}, {"loss": 0.6404, "grad_norm": 0.7504425048828125, "learning_rate": 0.0002, "epoch": 3.7475790832795353, "step": 11610}, {"loss": 0.5922, "grad_norm": 0.9144526124000549, "learning_rate": 0.0002, "epoch": 3.750806972240155, "step": 11620}, {"loss": 0.6425, "grad_norm": 0.922581672668457, "learning_rate": 0.0002, "epoch": 3.7540348612007746, "step": 11630}, {"loss": 0.6402, "grad_norm": 0.9348630905151367, "learning_rate": 0.0002, "epoch": 3.757262750161394, "step": 11640}, {"loss": 0.5852, "grad_norm": 1.0740231275558472, "learning_rate": 0.0002, "epoch": 3.7604906391220143, "step": 11650}, {"loss": 0.599, "grad_norm": 0.884830117225647, "learning_rate": 0.0002, "epoch": 3.763718528082634, "step": 11660}, {"loss": 0.5991, "grad_norm": 1.0256348848342896, "learning_rate": 0.0002, "epoch": 3.7669464170432536, "step": 11670}, {"loss": 0.626, "grad_norm": 0.6795592904090881, "learning_rate": 0.0002, "epoch": 3.7701743060038737, "step": 11680}, {"loss": 0.6241, "grad_norm": 0.9381206631660461, "learning_rate": 0.0002, "epoch": 3.7734021949644934, "step": 11690}, {"loss": 0.6054, "grad_norm": 0.7633092403411865, "learning_rate": 0.0002, "epoch": 3.776630083925113, "step": 11700}, {"loss": 0.5937, "grad_norm": 0.7506213188171387, "learning_rate": 0.0002, "epoch": 3.7798579728857327, "step": 11710}, {"loss": 0.5933, "grad_norm": 0.8182913064956665, "learning_rate": 0.0002, "epoch": 3.7830858618463523, "step": 11720}, {"loss": 0.6043, "grad_norm": 1.019322156906128, "learning_rate": 0.0002, "epoch": 3.786313750806972, "step": 11730}, {"loss": 0.633, "grad_norm": 0.8895221948623657, "learning_rate": 0.0002, "epoch": 3.789541639767592, "step": 11740}, {"loss": 0.6553, "grad_norm": 0.948847770690918, "learning_rate": 0.0002, "epoch": 3.7927695287282117, "step": 11750}, {"loss": 0.6265, "grad_norm": 0.9068999886512756, "learning_rate": 0.0002, "epoch": 3.7959974176888314, "step": 11760}, {"loss": 0.6163, "grad_norm": 0.7920539975166321, "learning_rate": 0.0002, "epoch": 3.7992253066494515, "step": 11770}, {"loss": 0.5964, "grad_norm": 0.8441922068595886, "learning_rate": 0.0002, "epoch": 3.802453195610071, "step": 11780}, {"loss": 0.6379, "grad_norm": 0.9258501529693604, "learning_rate": 0.0002, "epoch": 3.8056810845706908, "step": 11790}, {"loss": 0.6379, "grad_norm": 0.7354241609573364, "learning_rate": 0.0002, "epoch": 3.8089089735313104, "step": 11800}, {"loss": 0.6177, "grad_norm": 0.9494872689247131, "learning_rate": 0.0002, "epoch": 3.81213686249193, "step": 11810}, {"loss": 0.5931, "grad_norm": 0.8266556859016418, "learning_rate": 0.0002, "epoch": 3.81536475145255, "step": 11820}, {"loss": 0.641, "grad_norm": 0.7951219081878662, "learning_rate": 0.0002, "epoch": 3.81859264041317, "step": 11830}, {"loss": 0.5767, "grad_norm": 0.7688382267951965, "learning_rate": 0.0002, "epoch": 3.8218205293737895, "step": 11840}, {"loss": 0.6117, "grad_norm": 1.0917940139770508, "learning_rate": 0.0002, "epoch": 3.8250484183344096, "step": 11850}, {"loss": 0.5857, "grad_norm": 0.9880442023277283, "learning_rate": 0.0002, "epoch": 3.828276307295029, "step": 11860}, {"loss": 0.6579, "grad_norm": 0.8433151245117188, "learning_rate": 0.0002, "epoch": 3.831504196255649, "step": 11870}, {"loss": 0.5876, "grad_norm": 0.8691204786300659, "learning_rate": 0.0002, "epoch": 3.8347320852162685, "step": 11880}, {"loss": 0.6308, "grad_norm": 0.7698143124580383, "learning_rate": 0.0002, "epoch": 3.837959974176888, "step": 11890}, {"loss": 0.6531, "grad_norm": 0.8874883651733398, "learning_rate": 0.0002, "epoch": 3.841187863137508, "step": 11900}, {"loss": 0.6242, "grad_norm": 1.1209359169006348, "learning_rate": 0.0002, "epoch": 3.844415752098128, "step": 11910}, {"loss": 0.6415, "grad_norm": 0.7723544239997864, "learning_rate": 0.0002, "epoch": 3.8476436410587476, "step": 11920}, {"loss": 0.6091, "grad_norm": 0.8363937139511108, "learning_rate": 0.0002, "epoch": 3.850871530019367, "step": 11930}, {"loss": 0.6498, "grad_norm": 0.9209707975387573, "learning_rate": 0.0002, "epoch": 3.8540994189799873, "step": 11940}, {"loss": 0.6471, "grad_norm": 0.9456894993782043, "learning_rate": 0.0002, "epoch": 3.857327307940607, "step": 11950}, {"loss": 0.6432, "grad_norm": 1.5748413801193237, "learning_rate": 0.0002, "epoch": 3.8605551969012266, "step": 11960}, {"loss": 0.6197, "grad_norm": 0.9083569049835205, "learning_rate": 0.0002, "epoch": 3.8637830858618463, "step": 11970}, {"loss": 0.6593, "grad_norm": 0.7672823071479797, "learning_rate": 0.0002, "epoch": 3.867010974822466, "step": 11980}, {"loss": 0.6238, "grad_norm": 0.8647152185440063, "learning_rate": 0.0002, "epoch": 3.870238863783086, "step": 11990}, {"loss": 0.5755, "grad_norm": 0.9564255475997925, "learning_rate": 0.0002, "epoch": 3.8734667527437057, "step": 12000}, {"loss": 0.6321, "grad_norm": 0.773267924785614, "learning_rate": 0.0002, "epoch": 3.8766946417043253, "step": 12010}, {"loss": 0.6057, "grad_norm": 0.8030173182487488, "learning_rate": 0.0002, "epoch": 3.879922530664945, "step": 12020}, {"loss": 0.6194, "grad_norm": 0.8002150058746338, "learning_rate": 0.0002, "epoch": 3.883150419625565, "step": 12030}, {"loss": 0.6194, "grad_norm": 0.98802250623703, "learning_rate": 0.0002, "epoch": 3.8863783085861847, "step": 12040}, {"loss": 0.6026, "grad_norm": 0.7868124842643738, "learning_rate": 0.0002, "epoch": 3.8896061975468044, "step": 12050}, {"loss": 0.6303, "grad_norm": 0.932182788848877, "learning_rate": 0.0002, "epoch": 3.892834086507424, "step": 12060}, {"loss": 0.5863, "grad_norm": 0.8576806783676147, "learning_rate": 0.0002, "epoch": 3.8960619754680437, "step": 12070}, {"loss": 0.6079, "grad_norm": 0.8985713124275208, "learning_rate": 0.0002, "epoch": 3.8992898644286638, "step": 12080}, {"loss": 0.6449, "grad_norm": 0.7876521944999695, "learning_rate": 0.0002, "epoch": 3.9025177533892834, "step": 12090}, {"loss": 0.5655, "grad_norm": 0.773936927318573, "learning_rate": 0.0002, "epoch": 3.905745642349903, "step": 12100}, {"loss": 0.5765, "grad_norm": 0.7274761199951172, "learning_rate": 0.0002, "epoch": 3.908973531310523, "step": 12110}, {"loss": 0.6182, "grad_norm": 0.8625598549842834, "learning_rate": 0.0002, "epoch": 3.912201420271143, "step": 12120}, {"loss": 0.5855, "grad_norm": 0.8702362179756165, "learning_rate": 0.0002, "epoch": 3.9154293092317625, "step": 12130}, {"loss": 0.6493, "grad_norm": 0.912579357624054, "learning_rate": 0.0002, "epoch": 3.918657198192382, "step": 12140}, {"loss": 0.6341, "grad_norm": 0.8697066903114319, "learning_rate": 0.0002, "epoch": 3.9218850871530018, "step": 12150}, {"loss": 0.6037, "grad_norm": 1.005232572555542, "learning_rate": 0.0002, "epoch": 3.9251129761136214, "step": 12160}, {"loss": 0.621, "grad_norm": 0.793902575969696, "learning_rate": 0.0002, "epoch": 3.9283408650742415, "step": 12170}, {"loss": 0.599, "grad_norm": 0.7025905847549438, "learning_rate": 0.0002, "epoch": 3.931568754034861, "step": 12180}, {"loss": 0.6421, "grad_norm": 0.97635817527771, "learning_rate": 0.0002, "epoch": 3.934796642995481, "step": 12190}, {"loss": 0.6416, "grad_norm": 0.855417013168335, "learning_rate": 0.0002, "epoch": 3.938024531956101, "step": 12200}, {"loss": 0.5979, "grad_norm": 0.8841291666030884, "learning_rate": 0.0002, "epoch": 3.9412524209167206, "step": 12210}, {"loss": 0.5666, "grad_norm": 1.1762064695358276, "learning_rate": 0.0002, "epoch": 3.94448030987734, "step": 12220}, {"loss": 0.586, "grad_norm": 0.8393193483352661, "learning_rate": 0.0002, "epoch": 3.94770819883796, "step": 12230}, {"loss": 0.5738, "grad_norm": 0.9324905276298523, "learning_rate": 0.0002, "epoch": 3.9509360877985795, "step": 12240}, {"loss": 0.5954, "grad_norm": 0.8607982993125916, "learning_rate": 0.0002, "epoch": 3.9541639767591996, "step": 12250}, {"loss": 0.6277, "grad_norm": 0.8586681485176086, "learning_rate": 0.0002, "epoch": 3.9573918657198193, "step": 12260}, {"loss": 0.5841, "grad_norm": 1.1082909107208252, "learning_rate": 0.0002, "epoch": 3.960619754680439, "step": 12270}, {"loss": 0.6231, "grad_norm": 1.065027117729187, "learning_rate": 0.0002, "epoch": 3.963847643641059, "step": 12280}, {"loss": 0.5996, "grad_norm": 0.9544363021850586, "learning_rate": 0.0002, "epoch": 3.9670755326016787, "step": 12290}, {"loss": 0.6301, "grad_norm": 0.9008927345275879, "learning_rate": 0.0002, "epoch": 3.9703034215622983, "step": 12300}, {"loss": 0.6108, "grad_norm": 0.8717467188835144, "learning_rate": 0.0002, "epoch": 3.973531310522918, "step": 12310}, {"loss": 0.6465, "grad_norm": 0.9718339443206787, "learning_rate": 0.0002, "epoch": 3.9767591994835376, "step": 12320}, {"loss": 0.603, "grad_norm": 1.0362015962600708, "learning_rate": 0.0002, "epoch": 3.9799870884441573, "step": 12330}, {"loss": 0.6229, "grad_norm": 1.0844318866729736, "learning_rate": 0.0002, "epoch": 3.9832149774047774, "step": 12340}, {"loss": 0.6777, "grad_norm": 0.7506240606307983, "learning_rate": 0.0002, "epoch": 3.986442866365397, "step": 12350}, {"loss": 0.6076, "grad_norm": 1.005982756614685, "learning_rate": 0.0002, "epoch": 3.9896707553260167, "step": 12360}, {"loss": 0.5926, "grad_norm": 0.7566431164741516, "learning_rate": 0.0002, "epoch": 3.9928986442866368, "step": 12370}, {"loss": 0.653, "grad_norm": 0.8819181323051453, "learning_rate": 0.0002, "epoch": 3.9961265332472564, "step": 12380}, {"loss": 0.6197, "grad_norm": 0.884497880935669, "learning_rate": 0.0002, "epoch": 3.999354422207876, "step": 12390}, {"eval_loss": 1.1907150745391846, "eval_runtime": 161.5766, "eval_samples_per_second": 4.537, "eval_steps_per_second": 0.569, "epoch": 4.0, "step": 12392}, {"loss": 0.5203, "grad_norm": 1.0407241582870483, "learning_rate": 0.0002, "epoch": 4.002582311168496, "step": 12400}, {"loss": 0.4978, "grad_norm": 1.0199295282363892, "learning_rate": 0.0002, "epoch": 4.005810200129115, "step": 12410}, {"loss": 0.4985, "grad_norm": 0.8456302881240845, "learning_rate": 0.0002, "epoch": 4.009038089089735, "step": 12420}, {"loss": 0.4669, "grad_norm": 1.0621124505996704, "learning_rate": 0.0002, "epoch": 4.012265978050355, "step": 12430}, {"loss": 0.5277, "grad_norm": 0.8984712362289429, "learning_rate": 0.0002, "epoch": 4.015493867010975, "step": 12440}, {"loss": 0.5508, "grad_norm": 1.3785864114761353, "learning_rate": 0.0002, "epoch": 4.018721755971595, "step": 12450}, {"loss": 0.5244, "grad_norm": 0.7911781668663025, "learning_rate": 0.0002, "epoch": 4.0219496449322145, "step": 12460}, {"loss": 0.4746, "grad_norm": 1.0977907180786133, "learning_rate": 0.0002, "epoch": 4.025177533892834, "step": 12470}, {"loss": 0.4632, "grad_norm": 1.0664983987808228, "learning_rate": 0.0002, "epoch": 4.028405422853454, "step": 12480}, {"loss": 0.5151, "grad_norm": 1.0807124376296997, "learning_rate": 0.0002, "epoch": 4.0316333118140735, "step": 12490}, {"loss": 0.4712, "grad_norm": 1.2650192975997925, "learning_rate": 0.0002, "epoch": 4.034861200774693, "step": 12500}, {"loss": 0.5111, "grad_norm": 0.7164070010185242, "learning_rate": 0.0002, "epoch": 4.038089089735313, "step": 12510}, {"loss": 0.5015, "grad_norm": 1.0047489404678345, "learning_rate": 0.0002, "epoch": 4.041316978695932, "step": 12520}, {"loss": 0.5467, "grad_norm": 0.9303901791572571, "learning_rate": 0.0002, "epoch": 4.044544867656553, "step": 12530}, {"loss": 0.5165, "grad_norm": 1.0319702625274658, "learning_rate": 0.0002, "epoch": 4.047772756617173, "step": 12540}, {"loss": 0.4834, "grad_norm": 0.9549729228019714, "learning_rate": 0.0002, "epoch": 4.051000645577792, "step": 12550}, {"loss": 0.5235, "grad_norm": 0.7175564765930176, "learning_rate": 0.0002, "epoch": 4.054228534538412, "step": 12560}, {"loss": 0.5257, "grad_norm": 1.0622259378433228, "learning_rate": 0.0002, "epoch": 4.057456423499032, "step": 12570}, {"loss": 0.5098, "grad_norm": 1.172074556350708, "learning_rate": 0.0002, "epoch": 4.060684312459651, "step": 12580}, {"loss": 0.5112, "grad_norm": 0.9702366590499878, "learning_rate": 0.0002, "epoch": 4.063912201420271, "step": 12590}, {"loss": 0.5042, "grad_norm": 0.741511344909668, "learning_rate": 0.0002, "epoch": 4.0671400903808905, "step": 12600}, {"loss": 0.4996, "grad_norm": 0.8632621169090271, "learning_rate": 0.0002, "epoch": 4.070367979341511, "step": 12610}, {"loss": 0.4927, "grad_norm": 0.9695962071418762, "learning_rate": 0.0002, "epoch": 4.073595868302131, "step": 12620}, {"loss": 0.4618, "grad_norm": 0.9401052594184875, "learning_rate": 0.0002, "epoch": 4.07682375726275, "step": 12630}, {"loss": 0.4889, "grad_norm": 0.8068707585334778, "learning_rate": 0.0002, "epoch": 4.08005164622337, "step": 12640}, {"loss": 0.5046, "grad_norm": 0.9554762840270996, "learning_rate": 0.0002, "epoch": 4.08327953518399, "step": 12650}, {"loss": 0.5081, "grad_norm": 0.7637128233909607, "learning_rate": 0.0002, "epoch": 4.086507424144609, "step": 12660}, {"loss": 0.4997, "grad_norm": 0.6703744530677795, "learning_rate": 0.0002, "epoch": 4.089735313105229, "step": 12670}, {"loss": 0.4977, "grad_norm": 0.8623828887939453, "learning_rate": 0.0002, "epoch": 4.092963202065849, "step": 12680}, {"loss": 0.4616, "grad_norm": 0.8198223114013672, "learning_rate": 0.0002, "epoch": 4.096191091026468, "step": 12690}, {"loss": 0.5372, "grad_norm": 1.3449875116348267, "learning_rate": 0.0002, "epoch": 4.099418979987089, "step": 12700}, {"loss": 0.4782, "grad_norm": 0.8333606123924255, "learning_rate": 0.0002, "epoch": 4.1026468689477085, "step": 12710}, {"loss": 0.5135, "grad_norm": 1.1647733449935913, "learning_rate": 0.0002, "epoch": 4.105874757908328, "step": 12720}, {"loss": 0.5147, "grad_norm": 1.0560213327407837, "learning_rate": 0.0002, "epoch": 4.109102646868948, "step": 12730}, {"loss": 0.5244, "grad_norm": 0.9479449987411499, "learning_rate": 0.0002, "epoch": 4.112330535829567, "step": 12740}, {"loss": 0.4596, "grad_norm": 1.1634587049484253, "learning_rate": 0.0002, "epoch": 4.115558424790187, "step": 12750}, {"loss": 0.4966, "grad_norm": 0.813987672328949, "learning_rate": 0.0002, "epoch": 4.118786313750807, "step": 12760}, {"loss": 0.5133, "grad_norm": 0.968461275100708, "learning_rate": 0.0002, "epoch": 4.122014202711426, "step": 12770}, {"loss": 0.5113, "grad_norm": 0.9324830770492554, "learning_rate": 0.0002, "epoch": 4.125242091672046, "step": 12780}, {"loss": 0.5233, "grad_norm": 0.8313411474227905, "learning_rate": 0.0002, "epoch": 4.128469980632667, "step": 12790}, {"loss": 0.5169, "grad_norm": 1.0177634954452515, "learning_rate": 0.0002, "epoch": 4.131697869593286, "step": 12800}, {"loss": 0.4635, "grad_norm": 1.0890623331069946, "learning_rate": 0.0002, "epoch": 4.134925758553906, "step": 12810}, {"loss": 0.519, "grad_norm": 0.9131693840026855, "learning_rate": 0.0002, "epoch": 4.1381536475145255, "step": 12820}, {"loss": 0.5017, "grad_norm": 0.8400680422782898, "learning_rate": 0.0002, "epoch": 4.141381536475145, "step": 12830}, {"loss": 0.5195, "grad_norm": 0.8988795876502991, "learning_rate": 0.0002, "epoch": 4.144609425435765, "step": 12840}, {"loss": 0.5052, "grad_norm": 0.9224025011062622, "learning_rate": 0.0002, "epoch": 4.1478373143963845, "step": 12850}, {"loss": 0.5001, "grad_norm": 0.7453159689903259, "learning_rate": 0.0002, "epoch": 4.151065203357004, "step": 12860}, {"loss": 0.4874, "grad_norm": 0.9815868139266968, "learning_rate": 0.0002, "epoch": 4.154293092317625, "step": 12870}, {"loss": 0.5485, "grad_norm": 1.2542768716812134, "learning_rate": 0.0002, "epoch": 4.157520981278244, "step": 12880}, {"loss": 0.5287, "grad_norm": 1.0092132091522217, "learning_rate": 0.0002, "epoch": 4.160748870238864, "step": 12890}, {"loss": 0.5125, "grad_norm": 1.1836622953414917, "learning_rate": 0.0002, "epoch": 4.163976759199484, "step": 12900}, {"loss": 0.5089, "grad_norm": 0.7706810235977173, "learning_rate": 0.0002, "epoch": 4.167204648160103, "step": 12910}, {"loss": 0.5123, "grad_norm": 1.00058913230896, "learning_rate": 0.0002, "epoch": 4.170432537120723, "step": 12920}, {"loss": 0.5238, "grad_norm": 1.2326250076293945, "learning_rate": 0.0002, "epoch": 4.173660426081343, "step": 12930}, {"loss": 0.5405, "grad_norm": 0.8829123377799988, "learning_rate": 0.0002, "epoch": 4.176888315041962, "step": 12940}, {"loss": 0.517, "grad_norm": 0.936042845249176, "learning_rate": 0.0002, "epoch": 4.180116204002582, "step": 12950}, {"loss": 0.4991, "grad_norm": 0.9773517847061157, "learning_rate": 0.0002, "epoch": 4.183344092963202, "step": 12960}, {"loss": 0.5025, "grad_norm": 0.9786297678947449, "learning_rate": 0.0002, "epoch": 4.186571981923822, "step": 12970}, {"loss": 0.5276, "grad_norm": 0.7524558901786804, "learning_rate": 0.0002, "epoch": 4.189799870884442, "step": 12980}, {"loss": 0.5522, "grad_norm": 1.0107866525650024, "learning_rate": 0.0002, "epoch": 4.193027759845061, "step": 12990}, {"loss": 0.5304, "grad_norm": 1.0092947483062744, "learning_rate": 0.0002, "epoch": 4.196255648805681, "step": 13000}, {"loss": 0.5061, "grad_norm": 1.18181312084198, "learning_rate": 0.0002, "epoch": 4.199483537766301, "step": 13010}, {"loss": 0.512, "grad_norm": 0.8845750093460083, "learning_rate": 0.0002, "epoch": 4.20271142672692, "step": 13020}, {"loss": 0.5329, "grad_norm": 1.0789145231246948, "learning_rate": 0.0002, "epoch": 4.20593931568754, "step": 13030}, {"loss": 0.5001, "grad_norm": 0.9562082886695862, "learning_rate": 0.0002, "epoch": 4.2091672046481605, "step": 13040}, {"loss": 0.5211, "grad_norm": 0.875755786895752, "learning_rate": 0.0002, "epoch": 4.21239509360878, "step": 13050}, {"loss": 0.5162, "grad_norm": 1.0694596767425537, "learning_rate": 0.0002, "epoch": 4.2156229825694, "step": 13060}, {"loss": 0.4917, "grad_norm": 1.0053378343582153, "learning_rate": 0.0002, "epoch": 4.2188508715300195, "step": 13070}, {"loss": 0.542, "grad_norm": 1.1628689765930176, "learning_rate": 0.0002, "epoch": 4.222078760490639, "step": 13080}, {"loss": 0.4796, "grad_norm": 0.9455991983413696, "learning_rate": 0.0002, "epoch": 4.225306649451259, "step": 13090}, {"loss": 0.4802, "grad_norm": 0.9736765623092651, "learning_rate": 0.0002, "epoch": 4.228534538411878, "step": 13100}, {"loss": 0.5411, "grad_norm": 0.8653560876846313, "learning_rate": 0.0002, "epoch": 4.231762427372498, "step": 13110}, {"loss": 0.5347, "grad_norm": 0.9335988163948059, "learning_rate": 0.0002, "epoch": 4.234990316333118, "step": 13120}, {"loss": 0.5217, "grad_norm": 0.9102661609649658, "learning_rate": 0.0002, "epoch": 4.238218205293738, "step": 13130}, {"loss": 0.5531, "grad_norm": 1.0595461130142212, "learning_rate": 0.0002, "epoch": 4.241446094254358, "step": 13140}, {"loss": 0.517, "grad_norm": 0.8947662711143494, "learning_rate": 0.0002, "epoch": 4.244673983214978, "step": 13150}, {"loss": 0.5116, "grad_norm": 1.0835723876953125, "learning_rate": 0.0002, "epoch": 4.247901872175597, "step": 13160}, {"loss": 0.5212, "grad_norm": 0.8496462106704712, "learning_rate": 0.0002, "epoch": 4.251129761136217, "step": 13170}, {"loss": 0.5079, "grad_norm": 0.9395631551742554, "learning_rate": 0.0002, "epoch": 4.2543576500968365, "step": 13180}, {"loss": 0.5076, "grad_norm": 1.2939592599868774, "learning_rate": 0.0002, "epoch": 4.257585539057456, "step": 13190}, {"loss": 0.5209, "grad_norm": 0.9325923919677734, "learning_rate": 0.0002, "epoch": 4.260813428018076, "step": 13200}, {"loss": 0.4984, "grad_norm": 0.9220664501190186, "learning_rate": 0.0002, "epoch": 4.264041316978696, "step": 13210}, {"loss": 0.5553, "grad_norm": 0.9505137205123901, "learning_rate": 0.0002, "epoch": 4.267269205939316, "step": 13220}, {"loss": 0.5238, "grad_norm": 1.0713751316070557, "learning_rate": 0.0002, "epoch": 4.270497094899936, "step": 13230}, {"loss": 0.5478, "grad_norm": 0.8390375971794128, "learning_rate": 0.0002, "epoch": 4.273724983860555, "step": 13240}, {"loss": 0.5217, "grad_norm": 0.8943426012992859, "learning_rate": 0.0002, "epoch": 4.276952872821175, "step": 13250}, {"loss": 0.5486, "grad_norm": 0.9175868630409241, "learning_rate": 0.0002, "epoch": 4.280180761781795, "step": 13260}, {"loss": 0.5208, "grad_norm": 0.9969881176948547, "learning_rate": 0.0002, "epoch": 4.283408650742414, "step": 13270}, {"loss": 0.5376, "grad_norm": 1.2271877527236938, "learning_rate": 0.0002, "epoch": 4.286636539703034, "step": 13280}, {"loss": 0.4811, "grad_norm": 0.9463263154029846, "learning_rate": 0.0002, "epoch": 4.289864428663654, "step": 13290}, {"loss": 0.52, "grad_norm": 1.0306228399276733, "learning_rate": 0.0002, "epoch": 4.293092317624274, "step": 13300}, {"loss": 0.5092, "grad_norm": 0.8454763889312744, "learning_rate": 0.0002, "epoch": 4.296320206584894, "step": 13310}, {"loss": 0.5657, "grad_norm": 0.9843119978904724, "learning_rate": 0.0002, "epoch": 4.299548095545513, "step": 13320}, {"loss": 0.5407, "grad_norm": 1.0836851596832275, "learning_rate": 0.0002, "epoch": 4.302775984506133, "step": 13330}, {"loss": 0.5336, "grad_norm": 1.0719412565231323, "learning_rate": 0.0002, "epoch": 4.306003873466753, "step": 13340}, {"loss": 0.4798, "grad_norm": 0.9276487827301025, "learning_rate": 0.0002, "epoch": 4.309231762427372, "step": 13350}, {"loss": 0.5256, "grad_norm": 0.897072434425354, "learning_rate": 0.0002, "epoch": 4.312459651387992, "step": 13360}, {"loss": 0.5333, "grad_norm": 1.0493228435516357, "learning_rate": 0.0002, "epoch": 4.315687540348612, "step": 13370}, {"loss": 0.5218, "grad_norm": 0.9446353316307068, "learning_rate": 0.0002, "epoch": 4.318915429309232, "step": 13380}, {"loss": 0.4765, "grad_norm": 0.7765224575996399, "learning_rate": 0.0002, "epoch": 4.322143318269852, "step": 13390}, {"loss": 0.5907, "grad_norm": 0.9100048542022705, "learning_rate": 0.0002, "epoch": 4.3253712072304715, "step": 13400}, {"loss": 0.5393, "grad_norm": 1.0913089513778687, "learning_rate": 0.0002, "epoch": 4.328599096191091, "step": 13410}, {"loss": 0.494, "grad_norm": 0.9607733488082886, "learning_rate": 0.0002, "epoch": 4.331826985151711, "step": 13420}, {"loss": 0.5273, "grad_norm": 0.8774219155311584, "learning_rate": 0.0002, "epoch": 4.3350548741123305, "step": 13430}, {"loss": 0.5482, "grad_norm": 0.8366804122924805, "learning_rate": 0.0002, "epoch": 4.33828276307295, "step": 13440}, {"loss": 0.5487, "grad_norm": 1.034727931022644, "learning_rate": 0.0002, "epoch": 4.34151065203357, "step": 13450}, {"loss": 0.4995, "grad_norm": 0.942743182182312, "learning_rate": 0.0002, "epoch": 4.344738540994189, "step": 13460}, {"loss": 0.5222, "grad_norm": 0.7237029075622559, "learning_rate": 0.0002, "epoch": 4.347966429954809, "step": 13470}, {"loss": 0.5461, "grad_norm": 0.8216196894645691, "learning_rate": 0.0002, "epoch": 4.35119431891543, "step": 13480}, {"loss": 0.5104, "grad_norm": 1.031860113143921, "learning_rate": 0.0002, "epoch": 4.354422207876049, "step": 13490}, {"loss": 0.547, "grad_norm": 0.8880493640899658, "learning_rate": 0.0002, "epoch": 4.357650096836669, "step": 13500}, {"loss": 0.5259, "grad_norm": 0.8442490696907043, "learning_rate": 0.0002, "epoch": 4.360877985797289, "step": 13510}, {"loss": 0.5176, "grad_norm": 1.270971655845642, "learning_rate": 0.0002, "epoch": 4.364105874757908, "step": 13520}, {"loss": 0.5028, "grad_norm": 0.9657870531082153, "learning_rate": 0.0002, "epoch": 4.367333763718528, "step": 13530}, {"loss": 0.5136, "grad_norm": 0.7477133870124817, "learning_rate": 0.0002, "epoch": 4.3705616526791475, "step": 13540}, {"loss": 0.5483, "grad_norm": 1.0209243297576904, "learning_rate": 0.0002, "epoch": 4.373789541639767, "step": 13550}, {"loss": 0.4888, "grad_norm": 0.8714015483856201, "learning_rate": 0.0002, "epoch": 4.377017430600388, "step": 13560}, {"loss": 0.5428, "grad_norm": 1.0490189790725708, "learning_rate": 0.0002, "epoch": 4.380245319561007, "step": 13570}, {"loss": 0.5398, "grad_norm": 0.9454663991928101, "learning_rate": 0.0002, "epoch": 4.383473208521627, "step": 13580}, {"loss": 0.5072, "grad_norm": 1.154146432876587, "learning_rate": 0.0002, "epoch": 4.386701097482247, "step": 13590}, {"loss": 0.5096, "grad_norm": 1.155090570449829, "learning_rate": 0.0002, "epoch": 4.389928986442866, "step": 13600}, {"loss": 0.5679, "grad_norm": 0.9853842854499817, "learning_rate": 0.0002, "epoch": 4.393156875403486, "step": 13610}, {"loss": 0.4992, "grad_norm": 0.9265837669372559, "learning_rate": 0.0002, "epoch": 4.396384764364106, "step": 13620}, {"loss": 0.523, "grad_norm": 0.8367540240287781, "learning_rate": 0.0002, "epoch": 4.399612653324725, "step": 13630}, {"loss": 0.564, "grad_norm": 1.1453629732131958, "learning_rate": 0.0002, "epoch": 4.402840542285345, "step": 13640}, {"loss": 0.573, "grad_norm": 1.0856295824050903, "learning_rate": 0.0002, "epoch": 4.4060684312459655, "step": 13650}, {"loss": 0.5178, "grad_norm": 0.9284523129463196, "learning_rate": 0.0002, "epoch": 4.409296320206585, "step": 13660}, {"loss": 0.4862, "grad_norm": 0.9632299542427063, "learning_rate": 0.0002, "epoch": 4.412524209167205, "step": 13670}, {"loss": 0.5928, "grad_norm": 1.048524260520935, "learning_rate": 0.0002, "epoch": 4.415752098127824, "step": 13680}, {"loss": 0.5258, "grad_norm": 0.9787682294845581, "learning_rate": 0.0002, "epoch": 4.418979987088444, "step": 13690}, {"loss": 0.5513, "grad_norm": 1.0728684663772583, "learning_rate": 0.0002, "epoch": 4.422207876049064, "step": 13700}, {"loss": 0.5243, "grad_norm": 0.72867351770401, "learning_rate": 0.0002, "epoch": 4.425435765009683, "step": 13710}, {"loss": 0.5313, "grad_norm": 0.8932793736457825, "learning_rate": 0.0002, "epoch": 4.428663653970303, "step": 13720}, {"loss": 0.5156, "grad_norm": 1.098343849182129, "learning_rate": 0.0002, "epoch": 4.431891542930924, "step": 13730}, {"loss": 0.5342, "grad_norm": 0.9321235418319702, "learning_rate": 0.0002, "epoch": 4.435119431891543, "step": 13740}, {"loss": 0.5114, "grad_norm": 0.8868634104728699, "learning_rate": 0.0002, "epoch": 4.438347320852163, "step": 13750}, {"loss": 0.5284, "grad_norm": 1.200064778327942, "learning_rate": 0.0002, "epoch": 4.4415752098127825, "step": 13760}, {"loss": 0.5208, "grad_norm": 0.8968019485473633, "learning_rate": 0.0002, "epoch": 4.444803098773402, "step": 13770}, {"loss": 0.4979, "grad_norm": 0.9560935497283936, "learning_rate": 0.0002, "epoch": 4.448030987734022, "step": 13780}, {"loss": 0.5134, "grad_norm": 0.7985701560974121, "learning_rate": 0.0002, "epoch": 4.4512588766946415, "step": 13790}, {"loss": 0.5113, "grad_norm": 1.062540888786316, "learning_rate": 0.0002, "epoch": 4.454486765655261, "step": 13800}, {"loss": 0.525, "grad_norm": 1.0827109813690186, "learning_rate": 0.0002, "epoch": 4.457714654615881, "step": 13810}, {"loss": 0.5541, "grad_norm": 1.0853543281555176, "learning_rate": 0.0002, "epoch": 4.460942543576501, "step": 13820}, {"loss": 0.5381, "grad_norm": 1.0613641738891602, "learning_rate": 0.0002, "epoch": 4.464170432537121, "step": 13830}, {"loss": 0.5684, "grad_norm": 0.9037535190582275, "learning_rate": 0.0002, "epoch": 4.467398321497741, "step": 13840}, {"loss": 0.5112, "grad_norm": 0.9216223955154419, "learning_rate": 0.0002, "epoch": 4.47062621045836, "step": 13850}, {"loss": 0.5341, "grad_norm": 0.8952260613441467, "learning_rate": 0.0002, "epoch": 4.47385409941898, "step": 13860}, {"loss": 0.5026, "grad_norm": 0.9997953176498413, "learning_rate": 0.0002, "epoch": 4.4770819883796, "step": 13870}, {"loss": 0.5107, "grad_norm": 1.062458872795105, "learning_rate": 0.0002, "epoch": 4.480309877340219, "step": 13880}, {"loss": 0.5463, "grad_norm": 0.9185126423835754, "learning_rate": 0.0002, "epoch": 4.483537766300839, "step": 13890}, {"loss": 0.5181, "grad_norm": 1.2389954328536987, "learning_rate": 0.0002, "epoch": 4.486765655261459, "step": 13900}, {"loss": 0.5199, "grad_norm": 1.1632126569747925, "learning_rate": 0.0002, "epoch": 4.489993544222079, "step": 13910}, {"loss": 0.5128, "grad_norm": 1.0304487943649292, "learning_rate": 0.0002, "epoch": 4.493221433182699, "step": 13920}, {"loss": 0.5331, "grad_norm": 0.9144788384437561, "learning_rate": 0.0002, "epoch": 4.496449322143318, "step": 13930}, {"loss": 0.5312, "grad_norm": 1.0285682678222656, "learning_rate": 0.0002, "epoch": 4.499677211103938, "step": 13940}, {"loss": 0.554, "grad_norm": 1.1187206506729126, "learning_rate": 0.0002, "epoch": 4.502905100064558, "step": 13950}, {"loss": 0.5268, "grad_norm": 0.7917197942733765, "learning_rate": 0.0002, "epoch": 4.506132989025177, "step": 13960}, {"loss": 0.5227, "grad_norm": 0.8495619297027588, "learning_rate": 0.0002, "epoch": 4.509360877985797, "step": 13970}, {"loss": 0.4971, "grad_norm": 1.0450760126113892, "learning_rate": 0.0002, "epoch": 4.512588766946417, "step": 13980}, {"loss": 0.5402, "grad_norm": 1.0061010122299194, "learning_rate": 0.0002, "epoch": 4.515816655907037, "step": 13990}, {"loss": 0.527, "grad_norm": 1.0232428312301636, "learning_rate": 0.0002, "epoch": 4.519044544867657, "step": 14000}, {"loss": 0.5002, "grad_norm": 0.8734631538391113, "learning_rate": 0.0002, "epoch": 4.5222724338282765, "step": 14010}, {"loss": 0.5464, "grad_norm": 1.1085621118545532, "learning_rate": 0.0002, "epoch": 4.525500322788896, "step": 14020}, {"loss": 0.5167, "grad_norm": 0.9178624749183655, "learning_rate": 0.0002, "epoch": 4.528728211749516, "step": 14030}, {"loss": 0.5589, "grad_norm": 1.0687317848205566, "learning_rate": 0.0002, "epoch": 4.531956100710135, "step": 14040}, {"loss": 0.5576, "grad_norm": 0.9237300157546997, "learning_rate": 0.0002, "epoch": 4.535183989670755, "step": 14050}, {"loss": 0.5062, "grad_norm": 0.9667123556137085, "learning_rate": 0.0002, "epoch": 4.538411878631375, "step": 14060}, {"loss": 0.5645, "grad_norm": 1.1286747455596924, "learning_rate": 0.0002, "epoch": 4.541639767591995, "step": 14070}, {"loss": 0.5226, "grad_norm": 1.055392861366272, "learning_rate": 0.0002, "epoch": 4.544867656552615, "step": 14080}, {"loss": 0.5428, "grad_norm": 0.9492936134338379, "learning_rate": 0.0002, "epoch": 4.548095545513235, "step": 14090}, {"loss": 0.5559, "grad_norm": 0.9881349802017212, "learning_rate": 0.0002, "epoch": 4.551323434473854, "step": 14100}, {"loss": 0.5572, "grad_norm": 0.9389023184776306, "learning_rate": 0.0002, "epoch": 4.554551323434474, "step": 14110}, {"loss": 0.5511, "grad_norm": 0.8395606875419617, "learning_rate": 0.0002, "epoch": 4.5577792123950935, "step": 14120}, {"loss": 0.5696, "grad_norm": 0.9019067287445068, "learning_rate": 0.0002, "epoch": 4.561007101355713, "step": 14130}, {"loss": 0.5564, "grad_norm": 1.1058136224746704, "learning_rate": 0.0002, "epoch": 4.564234990316333, "step": 14140}, {"loss": 0.5323, "grad_norm": 1.0683821439743042, "learning_rate": 0.0002, "epoch": 4.5674628792769525, "step": 14150}, {"loss": 0.5527, "grad_norm": 1.3398395776748657, "learning_rate": 0.0002, "epoch": 4.570690768237572, "step": 14160}, {"loss": 0.4713, "grad_norm": 0.7829096913337708, "learning_rate": 0.0002, "epoch": 4.573918657198193, "step": 14170}, {"loss": 0.525, "grad_norm": 0.9636675119400024, "learning_rate": 0.0002, "epoch": 4.577146546158812, "step": 14180}, {"loss": 0.5458, "grad_norm": 1.0291401147842407, "learning_rate": 0.0002, "epoch": 4.580374435119432, "step": 14190}, {"loss": 0.5366, "grad_norm": 1.0894310474395752, "learning_rate": 0.0002, "epoch": 4.583602324080052, "step": 14200}, {"loss": 0.5125, "grad_norm": 1.111573576927185, "learning_rate": 0.0002, "epoch": 4.586830213040671, "step": 14210}, {"loss": 0.5444, "grad_norm": 0.9345336556434631, "learning_rate": 0.0002, "epoch": 4.590058102001291, "step": 14220}, {"loss": 0.5175, "grad_norm": 1.3338757753372192, "learning_rate": 0.0002, "epoch": 4.593285990961911, "step": 14230}, {"loss": 0.5227, "grad_norm": 1.1146448850631714, "learning_rate": 0.0002, "epoch": 4.596513879922531, "step": 14240}, {"loss": 0.543, "grad_norm": 1.1576755046844482, "learning_rate": 0.0002, "epoch": 4.599741768883151, "step": 14250}, {"loss": 0.5315, "grad_norm": 0.6851092576980591, "learning_rate": 0.0002, "epoch": 4.60296965784377, "step": 14260}, {"loss": 0.5027, "grad_norm": 0.9067938923835754, "learning_rate": 0.0002, "epoch": 4.60619754680439, "step": 14270}, {"loss": 0.5237, "grad_norm": 0.8767340183258057, "learning_rate": 0.0002, "epoch": 4.60942543576501, "step": 14280}, {"loss": 0.5294, "grad_norm": 1.024880290031433, "learning_rate": 0.0002, "epoch": 4.612653324725629, "step": 14290}, {"loss": 0.5371, "grad_norm": 0.9226394891738892, "learning_rate": 0.0002, "epoch": 4.615881213686249, "step": 14300}, {"loss": 0.5281, "grad_norm": 1.018187165260315, "learning_rate": 0.0002, "epoch": 4.619109102646869, "step": 14310}, {"loss": 0.5546, "grad_norm": 0.8851249814033508, "learning_rate": 0.0002, "epoch": 4.622336991607488, "step": 14320}, {"loss": 0.5206, "grad_norm": 0.745798647403717, "learning_rate": 0.0002, "epoch": 4.625564880568108, "step": 14330}, {"loss": 0.5531, "grad_norm": 1.2082698345184326, "learning_rate": 0.0002, "epoch": 4.6287927695287285, "step": 14340}, {"loss": 0.5449, "grad_norm": 0.901454508304596, "learning_rate": 0.0002, "epoch": 4.632020658489348, "step": 14350}, {"loss": 0.5433, "grad_norm": 0.9593124985694885, "learning_rate": 0.0002, "epoch": 4.635248547449968, "step": 14360}, {"loss": 0.4939, "grad_norm": 1.1241410970687866, "learning_rate": 0.0002, "epoch": 4.6384764364105875, "step": 14370}, {"loss": 0.5319, "grad_norm": 0.9221102595329285, "learning_rate": 0.0002, "epoch": 4.641704325371207, "step": 14380}, {"loss": 0.524, "grad_norm": 1.0035039186477661, "learning_rate": 0.0002, "epoch": 4.644932214331827, "step": 14390}, {"loss": 0.5617, "grad_norm": 1.1270662546157837, "learning_rate": 0.0002, "epoch": 4.648160103292446, "step": 14400}, {"loss": 0.5663, "grad_norm": 0.8631120324134827, "learning_rate": 0.0002, "epoch": 4.651387992253067, "step": 14410}, {"loss": 0.5705, "grad_norm": 1.0604606866836548, "learning_rate": 0.0002, "epoch": 4.654615881213687, "step": 14420}, {"loss": 0.5307, "grad_norm": 0.8002706170082092, "learning_rate": 0.0002, "epoch": 4.657843770174306, "step": 14430}, {"loss": 0.5459, "grad_norm": 1.0642075538635254, "learning_rate": 0.0002, "epoch": 4.661071659134926, "step": 14440}, {"loss": 0.5497, "grad_norm": 0.9315671324729919, "learning_rate": 0.0002, "epoch": 4.664299548095546, "step": 14450}, {"loss": 0.5542, "grad_norm": 0.8311864137649536, "learning_rate": 0.0002, "epoch": 4.667527437056165, "step": 14460}, {"loss": 0.5533, "grad_norm": 0.8900430202484131, "learning_rate": 0.0002, "epoch": 4.670755326016785, "step": 14470}, {"loss": 0.5086, "grad_norm": 1.059267282485962, "learning_rate": 0.0002, "epoch": 4.6739832149774045, "step": 14480}, {"loss": 0.5583, "grad_norm": 0.9864052534103394, "learning_rate": 0.0002, "epoch": 4.677211103938024, "step": 14490}, {"loss": 0.5737, "grad_norm": 1.210854411125183, "learning_rate": 0.0002, "epoch": 4.680438992898644, "step": 14500}, {"loss": 0.536, "grad_norm": 1.030693769454956, "learning_rate": 0.0002, "epoch": 4.683666881859264, "step": 14510}, {"loss": 0.544, "grad_norm": 0.9809406995773315, "learning_rate": 0.0002, "epoch": 4.686894770819884, "step": 14520}, {"loss": 0.5522, "grad_norm": 1.0471004247665405, "learning_rate": 0.0002, "epoch": 4.690122659780504, "step": 14530}, {"loss": 0.5613, "grad_norm": 1.1583727598190308, "learning_rate": 0.0002, "epoch": 4.693350548741123, "step": 14540}, {"loss": 0.5608, "grad_norm": 0.9664418697357178, "learning_rate": 0.0002, "epoch": 4.696578437701743, "step": 14550}, {"loss": 0.5624, "grad_norm": 0.9511209726333618, "learning_rate": 0.0002, "epoch": 4.699806326662363, "step": 14560}, {"loss": 0.5806, "grad_norm": 1.0211684703826904, "learning_rate": 0.0002, "epoch": 4.703034215622982, "step": 14570}, {"loss": 0.5536, "grad_norm": 1.097276210784912, "learning_rate": 0.0002, "epoch": 4.706262104583602, "step": 14580}, {"loss": 0.5527, "grad_norm": 0.9363943338394165, "learning_rate": 0.0002, "epoch": 4.7094899935442225, "step": 14590}, {"loss": 0.5261, "grad_norm": 1.4700615406036377, "learning_rate": 0.0002, "epoch": 4.712717882504842, "step": 14600}, {"loss": 0.5489, "grad_norm": 1.0001553297042847, "learning_rate": 0.0002, "epoch": 4.715945771465462, "step": 14610}, {"loss": 0.5236, "grad_norm": 1.0489927530288696, "learning_rate": 0.0002, "epoch": 4.719173660426081, "step": 14620}, {"loss": 0.5418, "grad_norm": 1.0483676195144653, "learning_rate": 0.0002, "epoch": 4.722401549386701, "step": 14630}, {"loss": 0.5596, "grad_norm": 1.1501940488815308, "learning_rate": 0.0002, "epoch": 4.725629438347321, "step": 14640}, {"loss": 0.5059, "grad_norm": 1.1703146696090698, "learning_rate": 0.0002, "epoch": 4.72885732730794, "step": 14650}, {"loss": 0.5356, "grad_norm": 0.8842985033988953, "learning_rate": 0.0002, "epoch": 4.73208521626856, "step": 14660}, {"loss": 0.5229, "grad_norm": 0.9147908687591553, "learning_rate": 0.0002, "epoch": 4.73531310522918, "step": 14670}, {"loss": 0.5436, "grad_norm": 1.0391576290130615, "learning_rate": 0.0002, "epoch": 4.7385409941898, "step": 14680}, {"loss": 0.5803, "grad_norm": 0.9469179511070251, "learning_rate": 0.0002, "epoch": 4.74176888315042, "step": 14690}, {"loss": 0.5201, "grad_norm": 1.0529530048370361, "learning_rate": 0.0002, "epoch": 4.7449967721110395, "step": 14700}, {"loss": 0.5401, "grad_norm": 0.9645711183547974, "learning_rate": 0.0002, "epoch": 4.748224661071659, "step": 14710}, {"loss": 0.5123, "grad_norm": 0.8163343071937561, "learning_rate": 0.0002, "epoch": 4.751452550032279, "step": 14720}, {"loss": 0.5654, "grad_norm": 1.0581341981887817, "learning_rate": 0.0002, "epoch": 4.7546804389928985, "step": 14730}, {"loss": 0.5709, "grad_norm": 1.0913853645324707, "learning_rate": 0.0002, "epoch": 4.757908327953518, "step": 14740}, {"loss": 0.5342, "grad_norm": 1.1071174144744873, "learning_rate": 0.0002, "epoch": 4.761136216914138, "step": 14750}, {"loss": 0.5353, "grad_norm": 1.0060709714889526, "learning_rate": 0.0002, "epoch": 4.764364105874758, "step": 14760}, {"loss": 0.5415, "grad_norm": 1.012024164199829, "learning_rate": 0.0002, "epoch": 4.767591994835378, "step": 14770}, {"loss": 0.5351, "grad_norm": 0.8438148498535156, "learning_rate": 0.0002, "epoch": 4.770819883795998, "step": 14780}, {"loss": 0.5424, "grad_norm": 0.8136811256408691, "learning_rate": 0.0002, "epoch": 4.774047772756617, "step": 14790}, {"loss": 0.5397, "grad_norm": 1.0765691995620728, "learning_rate": 0.0002, "epoch": 4.777275661717237, "step": 14800}, {"loss": 0.5616, "grad_norm": 1.0582574605941772, "learning_rate": 0.0002, "epoch": 4.780503550677857, "step": 14810}, {"loss": 0.5554, "grad_norm": 0.9419516921043396, "learning_rate": 0.0002, "epoch": 4.783731439638476, "step": 14820}, {"loss": 0.5499, "grad_norm": 0.9626181721687317, "learning_rate": 0.0002, "epoch": 4.786959328599096, "step": 14830}, {"loss": 0.565, "grad_norm": 1.2552800178527832, "learning_rate": 0.0002, "epoch": 4.7901872175597155, "step": 14840}, {"loss": 0.5402, "grad_norm": 0.9379919171333313, "learning_rate": 0.0002, "epoch": 4.793415106520336, "step": 14850}, {"loss": 0.5583, "grad_norm": 0.8166947364807129, "learning_rate": 0.0002, "epoch": 4.796642995480956, "step": 14860}, {"loss": 0.5139, "grad_norm": 0.9008694887161255, "learning_rate": 0.0002, "epoch": 4.799870884441575, "step": 14870}, {"loss": 0.5049, "grad_norm": 1.0256156921386719, "learning_rate": 0.0002, "epoch": 4.803098773402195, "step": 14880}, {"loss": 0.5531, "grad_norm": 0.9486594200134277, "learning_rate": 0.0002, "epoch": 4.806326662362815, "step": 14890}, {"loss": 0.5667, "grad_norm": 0.955238401889801, "learning_rate": 0.0002, "epoch": 4.809554551323434, "step": 14900}, {"loss": 0.5269, "grad_norm": 1.03775954246521, "learning_rate": 0.0002, "epoch": 4.812782440284054, "step": 14910}, {"loss": 0.5445, "grad_norm": 1.1383405923843384, "learning_rate": 0.0002, "epoch": 4.816010329244674, "step": 14920}, {"loss": 0.5347, "grad_norm": 0.9411700963973999, "learning_rate": 0.0002, "epoch": 4.819238218205294, "step": 14930}, {"loss": 0.4899, "grad_norm": 0.8188554644584656, "learning_rate": 0.0002, "epoch": 4.822466107165914, "step": 14940}, {"loss": 0.5618, "grad_norm": 1.1336265802383423, "learning_rate": 0.0002, "epoch": 4.8256939961265335, "step": 14950}, {"loss": 0.5578, "grad_norm": 1.106121301651001, "learning_rate": 0.0002, "epoch": 4.828921885087153, "step": 14960}, {"loss": 0.5306, "grad_norm": 1.0206533670425415, "learning_rate": 0.0002, "epoch": 4.832149774047773, "step": 14970}, {"loss": 0.5714, "grad_norm": 1.1123926639556885, "learning_rate": 0.0002, "epoch": 4.8353776630083924, "step": 14980}, {"loss": 0.5208, "grad_norm": 0.7879418730735779, "learning_rate": 0.0002, "epoch": 4.838605551969012, "step": 14990}, {"loss": 0.5385, "grad_norm": 1.0171709060668945, "learning_rate": 0.0002, "epoch": 4.841833440929632, "step": 15000}, {"loss": 0.6049, "grad_norm": 1.010671615600586, "learning_rate": 0.0002, "epoch": 4.845061329890251, "step": 15010}, {"loss": 0.5497, "grad_norm": 1.0778919458389282, "learning_rate": 0.0002, "epoch": 4.848289218850871, "step": 15020}, {"loss": 0.5587, "grad_norm": 1.0479968786239624, "learning_rate": 0.0002, "epoch": 4.851517107811492, "step": 15030}, {"loss": 0.5637, "grad_norm": 1.0345100164413452, "learning_rate": 0.0002, "epoch": 4.854744996772111, "step": 15040}, {"loss": 0.5809, "grad_norm": 0.9539691805839539, "learning_rate": 0.0002, "epoch": 4.857972885732731, "step": 15050}, {"loss": 0.5314, "grad_norm": 0.9914752840995789, "learning_rate": 0.0002, "epoch": 4.8612007746933505, "step": 15060}, {"loss": 0.5277, "grad_norm": 1.1935476064682007, "learning_rate": 0.0002, "epoch": 4.86442866365397, "step": 15070}, {"loss": 0.5497, "grad_norm": 1.0065057277679443, "learning_rate": 0.0002, "epoch": 4.86765655261459, "step": 15080}, {"loss": 0.5563, "grad_norm": 0.9320993423461914, "learning_rate": 0.0002, "epoch": 4.8708844415752095, "step": 15090}, {"loss": 0.5757, "grad_norm": 1.0578069686889648, "learning_rate": 0.0002, "epoch": 4.87411233053583, "step": 15100}, {"loss": 0.5472, "grad_norm": 0.9666239023208618, "learning_rate": 0.0002, "epoch": 4.87734021949645, "step": 15110}, {"loss": 0.5564, "grad_norm": 1.1322687864303589, "learning_rate": 0.0002, "epoch": 4.880568108457069, "step": 15120}, {"loss": 0.5381, "grad_norm": 0.955674409866333, "learning_rate": 0.0002, "epoch": 4.883795997417689, "step": 15130}, {"loss": 0.557, "grad_norm": 1.119413137435913, "learning_rate": 0.0002, "epoch": 4.887023886378309, "step": 15140}, {"loss": 0.5527, "grad_norm": 0.863646924495697, "learning_rate": 0.0002, "epoch": 4.890251775338928, "step": 15150}, {"loss": 0.5908, "grad_norm": 1.1823450326919556, "learning_rate": 0.0002, "epoch": 4.893479664299548, "step": 15160}, {"loss": 0.5654, "grad_norm": 0.8657588958740234, "learning_rate": 0.0002, "epoch": 4.896707553260168, "step": 15170}, {"loss": 0.5239, "grad_norm": 0.8575737476348877, "learning_rate": 0.0002, "epoch": 4.899935442220787, "step": 15180}, {"loss": 0.564, "grad_norm": 0.9611830711364746, "learning_rate": 0.0002, "epoch": 4.903163331181407, "step": 15190}, {"loss": 0.5505, "grad_norm": 1.1981453895568848, "learning_rate": 0.0002, "epoch": 4.906391220142027, "step": 15200}, {"loss": 0.5582, "grad_norm": 0.9401199221611023, "learning_rate": 0.0002, "epoch": 4.909619109102647, "step": 15210}, {"loss": 0.5631, "grad_norm": 0.8420369625091553, "learning_rate": 0.0002, "epoch": 4.912846998063267, "step": 15220}, {"loss": 0.5255, "grad_norm": 0.7877969145774841, "learning_rate": 0.0002, "epoch": 4.916074887023886, "step": 15230}, {"loss": 0.5522, "grad_norm": 0.8988324403762817, "learning_rate": 0.0002, "epoch": 4.919302775984506, "step": 15240}, {"loss": 0.5274, "grad_norm": 1.1103752851486206, "learning_rate": 0.0002, "epoch": 4.922530664945126, "step": 15250}, {"loss": 0.5249, "grad_norm": 0.8874443173408508, "learning_rate": 0.0002, "epoch": 4.925758553905745, "step": 15260}, {"loss": 0.5677, "grad_norm": 1.1001752614974976, "learning_rate": 0.0002, "epoch": 4.928986442866366, "step": 15270}, {"loss": 0.5596, "grad_norm": 0.9661307334899902, "learning_rate": 0.0002, "epoch": 4.9322143318269855, "step": 15280}, {"loss": 0.5678, "grad_norm": 1.1738812923431396, "learning_rate": 0.0002, "epoch": 4.935442220787605, "step": 15290}, {"loss": 0.5057, "grad_norm": 0.9773507714271545, "learning_rate": 0.0002, "epoch": 4.938670109748225, "step": 15300}, {"loss": 0.5029, "grad_norm": 1.0735599994659424, "learning_rate": 0.0002, "epoch": 4.9418979987088445, "step": 15310}, {"loss": 0.4996, "grad_norm": 1.0552113056182861, "learning_rate": 0.0002, "epoch": 4.945125887669464, "step": 15320}, {"loss": 0.5201, "grad_norm": 1.0900797843933105, "learning_rate": 0.0002, "epoch": 4.948353776630084, "step": 15330}, {"loss": 0.552, "grad_norm": 1.0908405780792236, "learning_rate": 0.0002, "epoch": 4.9515816655907035, "step": 15340}, {"loss": 0.6208, "grad_norm": 1.010221004486084, "learning_rate": 0.0002, "epoch": 4.954809554551323, "step": 15350}, {"loss": 0.5423, "grad_norm": 1.0321437120437622, "learning_rate": 0.0002, "epoch": 4.958037443511943, "step": 15360}, {"loss": 0.5903, "grad_norm": 0.8430278897285461, "learning_rate": 0.0002, "epoch": 4.961265332472563, "step": 15370}, {"loss": 0.538, "grad_norm": 0.8775330185890198, "learning_rate": 0.0002, "epoch": 4.964493221433183, "step": 15380}, {"loss": 0.5344, "grad_norm": 0.9796988368034363, "learning_rate": 0.0002, "epoch": 4.967721110393803, "step": 15390}, {"loss": 0.5352, "grad_norm": 0.8782257437705994, "learning_rate": 0.0002, "epoch": 4.970948999354422, "step": 15400}, {"loss": 0.5843, "grad_norm": 0.9959840774536133, "learning_rate": 0.0002, "epoch": 4.974176888315042, "step": 15410}, {"loss": 0.5783, "grad_norm": 1.0730273723602295, "learning_rate": 0.0002, "epoch": 4.9774047772756616, "step": 15420}, {"loss": 0.5277, "grad_norm": 0.8653680682182312, "learning_rate": 0.0002, "epoch": 4.980632666236281, "step": 15430}, {"loss": 0.5301, "grad_norm": 1.0769985914230347, "learning_rate": 0.0002, "epoch": 4.983860555196901, "step": 15440}, {"loss": 0.5727, "grad_norm": 1.1336040496826172, "learning_rate": 0.0002, "epoch": 4.987088444157521, "step": 15450}, {"loss": 0.5454, "grad_norm": 0.9844824075698853, "learning_rate": 0.0002, "epoch": 4.990316333118141, "step": 15460}, {"loss": 0.5316, "grad_norm": 0.8368769288063049, "learning_rate": 0.0002, "epoch": 4.993544222078761, "step": 15470}, {"loss": 0.5464, "grad_norm": 1.0238676071166992, "learning_rate": 0.0002, "epoch": 4.99677211103938, "step": 15480}, {"loss": 0.5577, "grad_norm": 1.064820408821106, "learning_rate": 0.0002, "epoch": 5.0, "step": 15490}, {"eval_loss": 1.241918921470642, "eval_runtime": 158.4099, "eval_samples_per_second": 4.627, "eval_steps_per_second": 0.581, "epoch": 5.0, "step": 15490}, {"loss": 0.4554, "grad_norm": 1.1366689205169678, "learning_rate": 0.0002, "epoch": 5.00322788896062, "step": 15500}, {"loss": 0.4288, "grad_norm": 1.2548010349273682, "learning_rate": 0.0002, "epoch": 5.006455777921239, "step": 15510}, {"loss": 0.4276, "grad_norm": 1.3875139951705933, "learning_rate": 0.0002, "epoch": 5.009683666881859, "step": 15520}, {"loss": 0.4198, "grad_norm": 0.9834036231040955, "learning_rate": 0.0002, "epoch": 5.012911555842479, "step": 15530}, {"loss": 0.4531, "grad_norm": 1.0737303495407104, "learning_rate": 0.0002, "epoch": 5.016139444803099, "step": 15540}, {"loss": 0.4073, "grad_norm": 0.9877859950065613, "learning_rate": 0.0002, "epoch": 5.019367333763719, "step": 15550}, {"loss": 0.4459, "grad_norm": 1.143268346786499, "learning_rate": 0.0002, "epoch": 5.0225952227243384, "step": 15560}, {"loss": 0.4477, "grad_norm": 1.1206166744232178, "learning_rate": 0.0002, "epoch": 5.025823111684958, "step": 15570}, {"loss": 0.4593, "grad_norm": 0.9977272748947144, "learning_rate": 0.0002, "epoch": 5.029051000645578, "step": 15580}, {"loss": 0.436, "grad_norm": 1.3193285465240479, "learning_rate": 0.0002, "epoch": 5.032278889606197, "step": 15590}, {"loss": 0.4426, "grad_norm": 1.0761713981628418, "learning_rate": 0.0002, "epoch": 5.035506778566817, "step": 15600}, {"loss": 0.4701, "grad_norm": 1.1250759363174438, "learning_rate": 0.0002, "epoch": 5.038734667527437, "step": 15610}, {"loss": 0.3995, "grad_norm": 1.0414305925369263, "learning_rate": 0.0002, "epoch": 5.041962556488057, "step": 15620}, {"loss": 0.4244, "grad_norm": 1.0906853675842285, "learning_rate": 0.0002, "epoch": 5.045190445448677, "step": 15630}, {"loss": 0.441, "grad_norm": 0.9360867142677307, "learning_rate": 0.0002, "epoch": 5.0484183344092965, "step": 15640}, {"loss": 0.4146, "grad_norm": 0.9078057408332825, "learning_rate": 0.0002, "epoch": 5.051646223369916, "step": 15650}, {"loss": 0.4285, "grad_norm": 1.0054848194122314, "learning_rate": 0.0002, "epoch": 5.054874112330536, "step": 15660}, {"loss": 0.417, "grad_norm": 0.9538215398788452, "learning_rate": 0.0002, "epoch": 5.0581020012911555, "step": 15670}, {"loss": 0.4629, "grad_norm": 1.6312693357467651, "learning_rate": 0.0002, "epoch": 5.061329890251775, "step": 15680}, {"loss": 0.3996, "grad_norm": 1.2100921869277954, "learning_rate": 0.0002, "epoch": 5.064557779212395, "step": 15690}, {"loss": 0.4489, "grad_norm": 1.2776238918304443, "learning_rate": 0.0002, "epoch": 5.0677856681730145, "step": 15700}, {"loss": 0.4728, "grad_norm": 1.0110050439834595, "learning_rate": 0.0002, "epoch": 5.071013557133635, "step": 15710}, {"loss": 0.4916, "grad_norm": 1.0896575450897217, "learning_rate": 0.0002, "epoch": 5.074241446094255, "step": 15720}, {"loss": 0.4462, "grad_norm": 0.9989936947822571, "learning_rate": 0.0002, "epoch": 5.077469335054874, "step": 15730}, {"loss": 0.457, "grad_norm": 1.0412228107452393, "learning_rate": 0.0002, "epoch": 5.080697224015494, "step": 15740}, {"loss": 0.4525, "grad_norm": 1.0964457988739014, "learning_rate": 0.0002, "epoch": 5.083925112976114, "step": 15750}, {"loss": 0.4539, "grad_norm": 1.1700960397720337, "learning_rate": 0.0002, "epoch": 5.087153001936733, "step": 15760}, {"loss": 0.4517, "grad_norm": 0.9515631794929504, "learning_rate": 0.0002, "epoch": 5.090380890897353, "step": 15770}, {"loss": 0.4352, "grad_norm": 1.0895006656646729, "learning_rate": 0.0002, "epoch": 5.093608779857973, "step": 15780}, {"loss": 0.4765, "grad_norm": 1.041312575340271, "learning_rate": 0.0002, "epoch": 5.096836668818592, "step": 15790}, {"loss": 0.4532, "grad_norm": 0.9518465399742126, "learning_rate": 0.0002, "epoch": 5.100064557779213, "step": 15800}, {"loss": 0.4187, "grad_norm": 0.8317030668258667, "learning_rate": 0.0002, "epoch": 5.103292446739832, "step": 15810}, {"loss": 0.4523, "grad_norm": 1.0933761596679688, "learning_rate": 0.0002, "epoch": 5.106520335700452, "step": 15820}, {"loss": 0.4689, "grad_norm": 1.0069324970245361, "learning_rate": 0.0002, "epoch": 5.109748224661072, "step": 15830}, {"loss": 0.4773, "grad_norm": 1.1166068315505981, "learning_rate": 0.0002, "epoch": 5.112976113621691, "step": 15840}, {"loss": 0.4635, "grad_norm": 1.069992184638977, "learning_rate": 0.0002, "epoch": 5.116204002582311, "step": 15850}, {"loss": 0.445, "grad_norm": 1.3728036880493164, "learning_rate": 0.0002, "epoch": 5.119431891542931, "step": 15860}, {"loss": 0.4563, "grad_norm": 1.0625780820846558, "learning_rate": 0.0002, "epoch": 5.12265978050355, "step": 15870}, {"loss": 0.426, "grad_norm": 1.090174913406372, "learning_rate": 0.0002, "epoch": 5.125887669464171, "step": 15880}, {"loss": 0.457, "grad_norm": 0.8729526996612549, "learning_rate": 0.0002, "epoch": 5.1291155584247905, "step": 15890}, {"loss": 0.4686, "grad_norm": 0.9561540484428406, "learning_rate": 0.0002, "epoch": 5.13234344738541, "step": 15900}, {"loss": 0.4266, "grad_norm": 1.012120246887207, "learning_rate": 0.0002, "epoch": 5.13557133634603, "step": 15910}, {"loss": 0.4484, "grad_norm": 1.1027921438217163, "learning_rate": 0.0002, "epoch": 5.1387992253066495, "step": 15920}, {"loss": 0.4389, "grad_norm": 1.0878126621246338, "learning_rate": 0.0002, "epoch": 5.142027114267269, "step": 15930}, {"loss": 0.4716, "grad_norm": 0.9619103670120239, "learning_rate": 0.0002, "epoch": 5.145255003227889, "step": 15940}, {"loss": 0.4071, "grad_norm": 1.1684138774871826, "learning_rate": 0.0002, "epoch": 5.148482892188508, "step": 15950}, {"loss": 0.4292, "grad_norm": 1.3379510641098022, "learning_rate": 0.0002, "epoch": 5.151710781149128, "step": 15960}, {"loss": 0.4413, "grad_norm": 1.0427496433258057, "learning_rate": 0.0002, "epoch": 5.154938670109749, "step": 15970}, {"loss": 0.4665, "grad_norm": 0.9917148351669312, "learning_rate": 0.0002, "epoch": 5.158166559070368, "step": 15980}, {"loss": 0.4527, "grad_norm": 1.0899780988693237, "learning_rate": 0.0002, "epoch": 5.161394448030988, "step": 15990}, {"loss": 0.4764, "grad_norm": 0.9251647591590881, "learning_rate": 0.0002, "epoch": 5.1646223369916076, "step": 16000}, {"loss": 0.5043, "grad_norm": 1.1669172048568726, "learning_rate": 0.0002, "epoch": 5.167850225952227, "step": 16010}, {"loss": 0.4726, "grad_norm": 1.2285256385803223, "learning_rate": 0.0002, "epoch": 5.171078114912847, "step": 16020}, {"loss": 0.4312, "grad_norm": 1.0504484176635742, "learning_rate": 0.0002, "epoch": 5.1743060038734665, "step": 16030}, {"loss": 0.4507, "grad_norm": 1.2829089164733887, "learning_rate": 0.0002, "epoch": 5.177533892834086, "step": 16040}, {"loss": 0.4547, "grad_norm": 0.9332743287086487, "learning_rate": 0.0002, "epoch": 5.180761781794706, "step": 16050}, {"loss": 0.4211, "grad_norm": 1.0054426193237305, "learning_rate": 0.0002, "epoch": 5.183989670755326, "step": 16060}, {"loss": 0.4415, "grad_norm": 1.0049669742584229, "learning_rate": 0.0002, "epoch": 5.187217559715946, "step": 16070}, {"loss": 0.4462, "grad_norm": 1.0171366930007935, "learning_rate": 0.0002, "epoch": 5.190445448676566, "step": 16080}, {"loss": 0.4725, "grad_norm": 1.234966516494751, "learning_rate": 0.0002, "epoch": 5.193673337637185, "step": 16090}, {"loss": 0.4579, "grad_norm": 0.9127960205078125, "learning_rate": 0.0002, "epoch": 5.196901226597805, "step": 16100}, {"loss": 0.4647, "grad_norm": 1.153924822807312, "learning_rate": 0.0002, "epoch": 5.200129115558425, "step": 16110}, {"loss": 0.4826, "grad_norm": 1.26716947555542, "learning_rate": 0.0002, "epoch": 5.203357004519044, "step": 16120}, {"loss": 0.446, "grad_norm": 1.2438743114471436, "learning_rate": 0.0002, "epoch": 5.206584893479664, "step": 16130}, {"loss": 0.4768, "grad_norm": 1.0888392925262451, "learning_rate": 0.0002, "epoch": 5.2098127824402845, "step": 16140}, {"loss": 0.4508, "grad_norm": 1.1741917133331299, "learning_rate": 0.0002, "epoch": 5.213040671400904, "step": 16150}, {"loss": 0.4271, "grad_norm": 0.9508614540100098, "learning_rate": 0.0002, "epoch": 5.216268560361524, "step": 16160}, {"loss": 0.4577, "grad_norm": 0.9714716672897339, "learning_rate": 0.0002, "epoch": 5.219496449322143, "step": 16170}, {"loss": 0.4636, "grad_norm": 1.2681622505187988, "learning_rate": 0.0002, "epoch": 5.222724338282763, "step": 16180}, {"loss": 0.4723, "grad_norm": 1.045871376991272, "learning_rate": 0.0002, "epoch": 5.225952227243383, "step": 16190}, {"loss": 0.4467, "grad_norm": 1.0272563695907593, "learning_rate": 0.0002, "epoch": 5.229180116204002, "step": 16200}, {"loss": 0.4353, "grad_norm": 1.092901349067688, "learning_rate": 0.0002, "epoch": 5.232408005164622, "step": 16210}, {"loss": 0.4588, "grad_norm": 0.9332799315452576, "learning_rate": 0.0002, "epoch": 5.235635894125242, "step": 16220}, {"loss": 0.4594, "grad_norm": 1.1728498935699463, "learning_rate": 0.0002, "epoch": 5.238863783085862, "step": 16230}, {"loss": 0.4652, "grad_norm": 0.9932476878166199, "learning_rate": 0.0002, "epoch": 5.242091672046482, "step": 16240}, {"loss": 0.4469, "grad_norm": 0.735236406326294, "learning_rate": 0.0002, "epoch": 5.2453195610071015, "step": 16250}, {"loss": 0.4386, "grad_norm": 1.0289303064346313, "learning_rate": 0.0002, "epoch": 5.248547449967721, "step": 16260}, {"loss": 0.4303, "grad_norm": 0.9488231539726257, "learning_rate": 0.0002, "epoch": 5.251775338928341, "step": 16270}, {"loss": 0.4495, "grad_norm": 0.8320055603981018, "learning_rate": 0.0002, "epoch": 5.2550032278889605, "step": 16280}, {"loss": 0.4224, "grad_norm": 1.2013251781463623, "learning_rate": 0.0002, "epoch": 5.25823111684958, "step": 16290}, {"loss": 0.4666, "grad_norm": 1.0649845600128174, "learning_rate": 0.0002, "epoch": 5.2614590058102, "step": 16300}, {"loss": 0.4325, "grad_norm": 1.1674472093582153, "learning_rate": 0.0002, "epoch": 5.26468689477082, "step": 16310}, {"loss": 0.4482, "grad_norm": 1.3934763669967651, "learning_rate": 0.0002, "epoch": 5.26791478373144, "step": 16320}, {"loss": 0.4494, "grad_norm": 0.8427977561950684, "learning_rate": 0.0002, "epoch": 5.27114267269206, "step": 16330}, {"loss": 0.4234, "grad_norm": 1.0497093200683594, "learning_rate": 0.0002, "epoch": 5.274370561652679, "step": 16340}, {"loss": 0.4337, "grad_norm": 0.8562338352203369, "learning_rate": 0.0002, "epoch": 5.277598450613299, "step": 16350}, {"loss": 0.4664, "grad_norm": 1.043920874595642, "learning_rate": 0.0002, "epoch": 5.280826339573919, "step": 16360}, {"loss": 0.4463, "grad_norm": 1.0039188861846924, "learning_rate": 0.0002, "epoch": 5.284054228534538, "step": 16370}, {"loss": 0.4149, "grad_norm": 0.9414041638374329, "learning_rate": 0.0002, "epoch": 5.287282117495158, "step": 16380}, {"loss": 0.5119, "grad_norm": 1.3346221446990967, "learning_rate": 0.0002, "epoch": 5.2905100064557775, "step": 16390}, {"loss": 0.4479, "grad_norm": 1.0173962116241455, "learning_rate": 0.0002, "epoch": 5.293737895416398, "step": 16400}, {"loss": 0.4538, "grad_norm": 0.7756500244140625, "learning_rate": 0.0002, "epoch": 5.296965784377018, "step": 16410}, {"loss": 0.4306, "grad_norm": 1.1185362339019775, "learning_rate": 0.0002, "epoch": 5.300193673337637, "step": 16420}, {"loss": 0.5033, "grad_norm": 1.0904899835586548, "learning_rate": 0.0002, "epoch": 5.303421562298257, "step": 16430}, {"loss": 0.4887, "grad_norm": 1.0803170204162598, "learning_rate": 0.0002, "epoch": 5.306649451258877, "step": 16440}, {"loss": 0.4473, "grad_norm": 1.1492092609405518, "learning_rate": 0.0002, "epoch": 5.309877340219496, "step": 16450}, {"loss": 0.4696, "grad_norm": 1.1212135553359985, "learning_rate": 0.0002, "epoch": 5.313105229180116, "step": 16460}, {"loss": 0.4438, "grad_norm": 0.8274528980255127, "learning_rate": 0.0002, "epoch": 5.316333118140736, "step": 16470}, {"loss": 0.468, "grad_norm": 1.118891716003418, "learning_rate": 0.0002, "epoch": 5.319561007101356, "step": 16480}, {"loss": 0.4403, "grad_norm": 1.185945749282837, "learning_rate": 0.0002, "epoch": 5.322788896061976, "step": 16490}, {"loss": 0.4946, "grad_norm": 1.0275214910507202, "learning_rate": 0.0002, "epoch": 5.3260167850225955, "step": 16500}, {"loss": 0.4612, "grad_norm": 0.9346362352371216, "learning_rate": 0.0002, "epoch": 5.329244673983215, "step": 16510}, {"loss": 0.4722, "grad_norm": 0.9600600600242615, "learning_rate": 0.0002, "epoch": 5.332472562943835, "step": 16520}, {"loss": 0.4536, "grad_norm": 1.1238188743591309, "learning_rate": 0.0002, "epoch": 5.335700451904454, "step": 16530}, {"loss": 0.5025, "grad_norm": 0.8660476207733154, "learning_rate": 0.0002, "epoch": 5.338928340865074, "step": 16540}, {"loss": 0.4732, "grad_norm": 0.9869821071624756, "learning_rate": 0.0002, "epoch": 5.342156229825694, "step": 16550}, {"loss": 0.4967, "grad_norm": 1.1719090938568115, "learning_rate": 0.0002, "epoch": 5.345384118786313, "step": 16560}, {"loss": 0.4563, "grad_norm": 1.0122894048690796, "learning_rate": 0.0002, "epoch": 5.348612007746934, "step": 16570}, {"loss": 0.5066, "grad_norm": 1.2431079149246216, "learning_rate": 0.0002, "epoch": 5.351839896707554, "step": 16580}, {"loss": 0.4708, "grad_norm": 1.4178080558776855, "learning_rate": 0.0002, "epoch": 5.355067785668173, "step": 16590}, {"loss": 0.4686, "grad_norm": 1.1895726919174194, "learning_rate": 0.0002, "epoch": 5.358295674628793, "step": 16600}, {"loss": 0.475, "grad_norm": 1.154392123222351, "learning_rate": 0.0002, "epoch": 5.3615235635894125, "step": 16610}, {"loss": 0.4511, "grad_norm": 0.9207229018211365, "learning_rate": 0.0002, "epoch": 5.364751452550032, "step": 16620}, {"loss": 0.4606, "grad_norm": 1.0247414112091064, "learning_rate": 0.0002, "epoch": 5.367979341510652, "step": 16630}, {"loss": 0.4886, "grad_norm": 1.0402202606201172, "learning_rate": 0.0002, "epoch": 5.3712072304712715, "step": 16640}, {"loss": 0.4903, "grad_norm": 1.1902891397476196, "learning_rate": 0.0002, "epoch": 5.374435119431892, "step": 16650}, {"loss": 0.4583, "grad_norm": 0.9572759866714478, "learning_rate": 0.0002, "epoch": 5.377663008392512, "step": 16660}, {"loss": 0.4636, "grad_norm": 0.9968860149383545, "learning_rate": 0.0002, "epoch": 5.380890897353131, "step": 16670}, {"loss": 0.477, "grad_norm": 1.2468547821044922, "learning_rate": 0.0002, "epoch": 5.384118786313751, "step": 16680}, {"loss": 0.5223, "grad_norm": 1.154661774635315, "learning_rate": 0.0002, "epoch": 5.387346675274371, "step": 16690}, {"loss": 0.4637, "grad_norm": 0.8837044835090637, "learning_rate": 0.0002, "epoch": 5.39057456423499, "step": 16700}, {"loss": 0.4744, "grad_norm": 1.0317907333374023, "learning_rate": 0.0002, "epoch": 5.39380245319561, "step": 16710}, {"loss": 0.4831, "grad_norm": 0.9811587929725647, "learning_rate": 0.0002, "epoch": 5.39703034215623, "step": 16720}, {"loss": 0.4739, "grad_norm": 0.9487450122833252, "learning_rate": 0.0002, "epoch": 5.400258231116849, "step": 16730}, {"loss": 0.4574, "grad_norm": 1.0540274381637573, "learning_rate": 0.0002, "epoch": 5.403486120077469, "step": 16740}, {"loss": 0.4709, "grad_norm": 1.028363585472107, "learning_rate": 0.0002, "epoch": 5.406714009038089, "step": 16750}, {"loss": 0.468, "grad_norm": 1.0200704336166382, "learning_rate": 0.0002, "epoch": 5.409941897998709, "step": 16760}, {"loss": 0.4383, "grad_norm": 1.0330981016159058, "learning_rate": 0.0002, "epoch": 5.413169786959329, "step": 16770}, {"loss": 0.4645, "grad_norm": 1.320875644683838, "learning_rate": 0.0002, "epoch": 5.416397675919948, "step": 16780}, {"loss": 0.4601, "grad_norm": 0.9838143587112427, "learning_rate": 0.0002, "epoch": 5.419625564880568, "step": 16790}, {"loss": 0.4835, "grad_norm": 1.1006578207015991, "learning_rate": 0.0002, "epoch": 5.422853453841188, "step": 16800}, {"loss": 0.4871, "grad_norm": 1.099174976348877, "learning_rate": 0.0002, "epoch": 5.426081342801807, "step": 16810}, {"loss": 0.4773, "grad_norm": 1.0632189512252808, "learning_rate": 0.0002, "epoch": 5.429309231762427, "step": 16820}, {"loss": 0.4732, "grad_norm": 0.9673194885253906, "learning_rate": 0.0002, "epoch": 5.4325371207230475, "step": 16830}, {"loss": 0.4731, "grad_norm": 0.853013813495636, "learning_rate": 0.0002, "epoch": 5.435765009683667, "step": 16840}, {"loss": 0.4856, "grad_norm": 1.0261728763580322, "learning_rate": 0.0002, "epoch": 5.438992898644287, "step": 16850}, {"loss": 0.4729, "grad_norm": 1.1642370223999023, "learning_rate": 0.0002, "epoch": 5.4422207876049065, "step": 16860}, {"loss": 0.4751, "grad_norm": 0.8715673685073853, "learning_rate": 0.0002, "epoch": 5.445448676565526, "step": 16870}, {"loss": 0.4566, "grad_norm": 0.905746579170227, "learning_rate": 0.0002, "epoch": 5.448676565526146, "step": 16880}, {"loss": 0.4536, "grad_norm": 1.1051915884017944, "learning_rate": 0.0002, "epoch": 5.451904454486765, "step": 16890}, {"loss": 0.4944, "grad_norm": 1.0781478881835938, "learning_rate": 0.0002, "epoch": 5.455132343447385, "step": 16900}, {"loss": 0.4655, "grad_norm": 1.1168911457061768, "learning_rate": 0.0002, "epoch": 5.458360232408005, "step": 16910}, {"loss": 0.4624, "grad_norm": 1.1150046586990356, "learning_rate": 0.0002, "epoch": 5.461588121368625, "step": 16920}, {"loss": 0.4849, "grad_norm": 0.9862499833106995, "learning_rate": 0.0002, "epoch": 5.464816010329245, "step": 16930}, {"loss": 0.47, "grad_norm": 1.5416640043258667, "learning_rate": 0.0002, "epoch": 5.468043899289865, "step": 16940}, {"loss": 0.4508, "grad_norm": 0.8960899710655212, "learning_rate": 0.0002, "epoch": 5.471271788250484, "step": 16950}, {"loss": 0.5002, "grad_norm": 0.9796477556228638, "learning_rate": 0.0002, "epoch": 5.474499677211104, "step": 16960}, {"loss": 0.4939, "grad_norm": 0.9526587128639221, "learning_rate": 0.0002, "epoch": 5.4777275661717235, "step": 16970}, {"loss": 0.4807, "grad_norm": 1.2373039722442627, "learning_rate": 0.0002, "epoch": 5.480955455132343, "step": 16980}, {"loss": 0.4642, "grad_norm": 1.1860566139221191, "learning_rate": 0.0002, "epoch": 5.484183344092963, "step": 16990}, {"loss": 0.4929, "grad_norm": 1.477345585823059, "learning_rate": 0.0002, "epoch": 5.487411233053583, "step": 17000}, {"loss": 0.4566, "grad_norm": 1.1029295921325684, "learning_rate": 0.0002, "epoch": 5.490639122014203, "step": 17010}, {"loss": 0.487, "grad_norm": 1.1416981220245361, "learning_rate": 0.0002, "epoch": 5.493867010974823, "step": 17020}, {"loss": 0.475, "grad_norm": 1.1647989749908447, "learning_rate": 0.0002, "epoch": 5.497094899935442, "step": 17030}, {"loss": 0.4644, "grad_norm": 1.1297032833099365, "learning_rate": 0.0002, "epoch": 5.500322788896062, "step": 17040}, {"loss": 0.4885, "grad_norm": 0.9764689207077026, "learning_rate": 0.0002, "epoch": 5.503550677856682, "step": 17050}, {"loss": 0.4789, "grad_norm": 1.038161039352417, "learning_rate": 0.0002, "epoch": 5.506778566817301, "step": 17060}, {"loss": 0.4467, "grad_norm": 1.1417886018753052, "learning_rate": 0.0002, "epoch": 5.510006455777921, "step": 17070}, {"loss": 0.4782, "grad_norm": 0.9300898313522339, "learning_rate": 0.0002, "epoch": 5.513234344738541, "step": 17080}, {"loss": 0.4805, "grad_norm": 1.0295016765594482, "learning_rate": 0.0002, "epoch": 5.516462233699161, "step": 17090}, {"loss": 0.4663, "grad_norm": 1.1273008584976196, "learning_rate": 0.0002, "epoch": 5.519690122659781, "step": 17100}, {"loss": 0.4897, "grad_norm": 0.9542737007141113, "learning_rate": 0.0002, "epoch": 5.5229180116204, "step": 17110}, {"loss": 0.51, "grad_norm": 1.34589421749115, "learning_rate": 0.0002, "epoch": 5.52614590058102, "step": 17120}, {"loss": 0.467, "grad_norm": 0.9889675378799438, "learning_rate": 0.0002, "epoch": 5.52937378954164, "step": 17130}, {"loss": 0.4752, "grad_norm": 1.25719153881073, "learning_rate": 0.0002, "epoch": 5.532601678502259, "step": 17140}, {"loss": 0.4609, "grad_norm": 1.2511073350906372, "learning_rate": 0.0002, "epoch": 5.535829567462879, "step": 17150}, {"loss": 0.4992, "grad_norm": 1.1993521451950073, "learning_rate": 0.0002, "epoch": 5.539057456423499, "step": 17160}, {"loss": 0.4986, "grad_norm": 1.1394526958465576, "learning_rate": 0.0002, "epoch": 5.542285345384119, "step": 17170}, {"loss": 0.5284, "grad_norm": 1.0435349941253662, "learning_rate": 0.0002, "epoch": 5.545513234344739, "step": 17180}, {"loss": 0.4934, "grad_norm": 1.120940089225769, "learning_rate": 0.0002, "epoch": 5.5487411233053585, "step": 17190}, {"loss": 0.4704, "grad_norm": 1.0906445980072021, "learning_rate": 0.0002, "epoch": 5.551969012265978, "step": 17200}, {"loss": 0.4896, "grad_norm": 0.8883966207504272, "learning_rate": 0.0002, "epoch": 5.555196901226598, "step": 17210}, {"loss": 0.4696, "grad_norm": 1.3078752756118774, "learning_rate": 0.0002, "epoch": 5.5584247901872175, "step": 17220}, {"loss": 0.4805, "grad_norm": 1.0224416255950928, "learning_rate": 0.0002, "epoch": 5.561652679147837, "step": 17230}, {"loss": 0.47, "grad_norm": 1.242518663406372, "learning_rate": 0.0002, "epoch": 5.564880568108457, "step": 17240}, {"loss": 0.4708, "grad_norm": 1.2328250408172607, "learning_rate": 0.0002, "epoch": 5.568108457069076, "step": 17250}, {"loss": 0.4685, "grad_norm": 1.2186611890792847, "learning_rate": 0.0002, "epoch": 5.571336346029697, "step": 17260}, {"loss": 0.4688, "grad_norm": 1.0947459936141968, "learning_rate": 0.0002, "epoch": 5.574564234990317, "step": 17270}, {"loss": 0.506, "grad_norm": 1.075279951095581, "learning_rate": 0.0002, "epoch": 5.577792123950936, "step": 17280}, {"loss": 0.478, "grad_norm": 1.0316804647445679, "learning_rate": 0.0002, "epoch": 5.581020012911556, "step": 17290}, {"loss": 0.478, "grad_norm": 1.1077373027801514, "learning_rate": 0.0002, "epoch": 5.584247901872176, "step": 17300}, {"loss": 0.4857, "grad_norm": 1.219228744506836, "learning_rate": 0.0002, "epoch": 5.587475790832795, "step": 17310}, {"loss": 0.4465, "grad_norm": 1.026361346244812, "learning_rate": 0.0002, "epoch": 5.590703679793415, "step": 17320}, {"loss": 0.4831, "grad_norm": 1.1621283292770386, "learning_rate": 0.0002, "epoch": 5.5939315687540345, "step": 17330}, {"loss": 0.4706, "grad_norm": 1.0177470445632935, "learning_rate": 0.0002, "epoch": 5.597159457714655, "step": 17340}, {"loss": 0.4961, "grad_norm": 1.0625319480895996, "learning_rate": 0.0002, "epoch": 5.600387346675275, "step": 17350}, {"loss": 0.484, "grad_norm": 1.148815393447876, "learning_rate": 0.0002, "epoch": 5.603615235635894, "step": 17360}, {"loss": 0.4804, "grad_norm": 1.0571802854537964, "learning_rate": 0.0002, "epoch": 5.606843124596514, "step": 17370}, {"loss": 0.5202, "grad_norm": 1.2069389820098877, "learning_rate": 0.0002, "epoch": 5.610071013557134, "step": 17380}, {"loss": 0.5029, "grad_norm": 1.407530426979065, "learning_rate": 0.0002, "epoch": 5.613298902517753, "step": 17390}, {"loss": 0.4688, "grad_norm": 1.247060775756836, "learning_rate": 0.0002, "epoch": 5.616526791478373, "step": 17400}, {"loss": 0.4359, "grad_norm": 1.431684136390686, "learning_rate": 0.0002, "epoch": 5.619754680438993, "step": 17410}, {"loss": 0.5244, "grad_norm": 1.0520552396774292, "learning_rate": 0.0002, "epoch": 5.622982569399612, "step": 17420}, {"loss": 0.4993, "grad_norm": 1.0593537092208862, "learning_rate": 0.0002, "epoch": 5.626210458360232, "step": 17430}, {"loss": 0.4911, "grad_norm": 1.4414515495300293, "learning_rate": 0.0002, "epoch": 5.6294383473208525, "step": 17440}, {"loss": 0.4761, "grad_norm": 1.0902460813522339, "learning_rate": 0.0002, "epoch": 5.632666236281472, "step": 17450}, {"loss": 0.4737, "grad_norm": 0.890944242477417, "learning_rate": 0.0002, "epoch": 5.635894125242092, "step": 17460}, {"loss": 0.4706, "grad_norm": 1.035675287246704, "learning_rate": 0.0002, "epoch": 5.639122014202711, "step": 17470}, {"loss": 0.484, "grad_norm": 0.9792264103889465, "learning_rate": 0.0002, "epoch": 5.642349903163331, "step": 17480}, {"loss": 0.4753, "grad_norm": 1.1888220310211182, "learning_rate": 0.0002, "epoch": 5.645577792123951, "step": 17490}, {"loss": 0.5047, "grad_norm": 1.0169143676757812, "learning_rate": 0.0002, "epoch": 5.64880568108457, "step": 17500}, {"loss": 0.4919, "grad_norm": 0.9812449216842651, "learning_rate": 0.0002, "epoch": 5.652033570045191, "step": 17510}, {"loss": 0.4879, "grad_norm": 1.0509105920791626, "learning_rate": 0.0002, "epoch": 5.655261459005811, "step": 17520}, {"loss": 0.4695, "grad_norm": 0.9047426581382751, "learning_rate": 0.0002, "epoch": 5.65848934796643, "step": 17530}, {"loss": 0.4712, "grad_norm": 1.2393709421157837, "learning_rate": 0.0002, "epoch": 5.66171723692705, "step": 17540}, {"loss": 0.5012, "grad_norm": 1.1098991632461548, "learning_rate": 0.0002, "epoch": 5.6649451258876695, "step": 17550}, {"loss": 0.4499, "grad_norm": 0.8181570768356323, "learning_rate": 0.0002, "epoch": 5.668173014848289, "step": 17560}, {"loss": 0.4973, "grad_norm": 0.9676381945610046, "learning_rate": 0.0002, "epoch": 5.671400903808909, "step": 17570}, {"loss": 0.5058, "grad_norm": 1.1225934028625488, "learning_rate": 0.0002, "epoch": 5.6746287927695285, "step": 17580}, {"loss": 0.5165, "grad_norm": 1.6259925365447998, "learning_rate": 0.0002, "epoch": 5.677856681730148, "step": 17590}, {"loss": 0.4613, "grad_norm": 0.7751404643058777, "learning_rate": 0.0002, "epoch": 5.681084570690768, "step": 17600}, {"loss": 0.4895, "grad_norm": 0.8478589057922363, "learning_rate": 0.0002, "epoch": 5.684312459651388, "step": 17610}, {"loss": 0.4492, "grad_norm": 1.2887113094329834, "learning_rate": 0.0002, "epoch": 5.687540348612008, "step": 17620}, {"loss": 0.4792, "grad_norm": 1.1452652215957642, "learning_rate": 0.0002, "epoch": 5.690768237572628, "step": 17630}, {"loss": 0.4889, "grad_norm": 1.0370417833328247, "learning_rate": 0.0002, "epoch": 5.693996126533247, "step": 17640}, {"loss": 0.535, "grad_norm": 1.1358870267868042, "learning_rate": 0.0002, "epoch": 5.697224015493867, "step": 17650}, {"loss": 0.4753, "grad_norm": 1.2772479057312012, "learning_rate": 0.0002, "epoch": 5.700451904454487, "step": 17660}, {"loss": 0.4492, "grad_norm": 1.182812213897705, "learning_rate": 0.0002, "epoch": 5.703679793415106, "step": 17670}, {"loss": 0.5025, "grad_norm": 1.099074125289917, "learning_rate": 0.0002, "epoch": 5.706907682375727, "step": 17680}, {"loss": 0.4945, "grad_norm": 0.938634991645813, "learning_rate": 0.0002, "epoch": 5.710135571336346, "step": 17690}, {"loss": 0.491, "grad_norm": 0.9385238885879517, "learning_rate": 0.0002, "epoch": 5.713363460296966, "step": 17700}, {"loss": 0.4849, "grad_norm": 1.1486014127731323, "learning_rate": 0.0002, "epoch": 5.716591349257586, "step": 17710}, {"loss": 0.5043, "grad_norm": 0.9433078169822693, "learning_rate": 0.0002, "epoch": 5.719819238218205, "step": 17720}, {"loss": 0.4543, "grad_norm": 1.02472722530365, "learning_rate": 0.0002, "epoch": 5.723047127178825, "step": 17730}, {"loss": 0.4631, "grad_norm": 0.9360876679420471, "learning_rate": 0.0002, "epoch": 5.726275016139445, "step": 17740}, {"loss": 0.4947, "grad_norm": 1.0481483936309814, "learning_rate": 0.0002, "epoch": 5.729502905100064, "step": 17750}, {"loss": 0.4763, "grad_norm": 1.0032516717910767, "learning_rate": 0.0002, "epoch": 5.732730794060684, "step": 17760}, {"loss": 0.4819, "grad_norm": 0.8908069729804993, "learning_rate": 0.0002, "epoch": 5.735958683021304, "step": 17770}, {"loss": 0.5188, "grad_norm": 1.0679123401641846, "learning_rate": 0.0002, "epoch": 5.739186571981924, "step": 17780}, {"loss": 0.4818, "grad_norm": 1.0448014736175537, "learning_rate": 0.0002, "epoch": 5.742414460942544, "step": 17790}, {"loss": 0.4869, "grad_norm": 1.0433847904205322, "learning_rate": 0.0002, "epoch": 5.7456423499031635, "step": 17800}, {"loss": 0.5243, "grad_norm": 1.000291109085083, "learning_rate": 0.0002, "epoch": 5.748870238863783, "step": 17810}, {"loss": 0.4891, "grad_norm": 1.1238429546356201, "learning_rate": 0.0002, "epoch": 5.752098127824403, "step": 17820}, {"loss": 0.4905, "grad_norm": 1.09062659740448, "learning_rate": 0.0002, "epoch": 5.755326016785022, "step": 17830}, {"loss": 0.4883, "grad_norm": 0.8538689613342285, "learning_rate": 0.0002, "epoch": 5.758553905745642, "step": 17840}, {"loss": 0.4989, "grad_norm": 1.3872947692871094, "learning_rate": 0.0002, "epoch": 5.761781794706262, "step": 17850}, {"loss": 0.4707, "grad_norm": 1.0578876733779907, "learning_rate": 0.0002, "epoch": 5.765009683666882, "step": 17860}, {"loss": 0.5281, "grad_norm": 1.1761705875396729, "learning_rate": 0.0002, "epoch": 5.768237572627502, "step": 17870}, {"loss": 0.4802, "grad_norm": 1.1223368644714355, "learning_rate": 0.0002, "epoch": 5.771465461588122, "step": 17880}, {"loss": 0.505, "grad_norm": 1.2484360933303833, "learning_rate": 0.0002, "epoch": 5.774693350548741, "step": 17890}, {"loss": 0.4786, "grad_norm": 1.2461199760437012, "learning_rate": 0.0002, "epoch": 5.777921239509361, "step": 17900}, {"loss": 0.4933, "grad_norm": 1.1718299388885498, "learning_rate": 0.0002, "epoch": 5.7811491284699805, "step": 17910}, {"loss": 0.471, "grad_norm": 0.9896837472915649, "learning_rate": 0.0002, "epoch": 5.7843770174306, "step": 17920}, {"loss": 0.4808, "grad_norm": 1.3759760856628418, "learning_rate": 0.0002, "epoch": 5.78760490639122, "step": 17930}, {"loss": 0.4847, "grad_norm": 1.0596622228622437, "learning_rate": 0.0002, "epoch": 5.7908327953518395, "step": 17940}, {"loss": 0.5153, "grad_norm": 0.9292021989822388, "learning_rate": 0.0002, "epoch": 5.79406068431246, "step": 17950}, {"loss": 0.4783, "grad_norm": 0.8786653876304626, "learning_rate": 0.0002, "epoch": 5.79728857327308, "step": 17960}, {"loss": 0.4598, "grad_norm": 1.2087152004241943, "learning_rate": 0.0002, "epoch": 5.800516462233699, "step": 17970}, {"loss": 0.4953, "grad_norm": 1.1643104553222656, "learning_rate": 0.0002, "epoch": 5.803744351194319, "step": 17980}, {"loss": 0.5111, "grad_norm": 0.971613347530365, "learning_rate": 0.0002, "epoch": 5.806972240154939, "step": 17990}, {"loss": 0.5094, "grad_norm": 1.306227684020996, "learning_rate": 0.0002, "epoch": 5.810200129115558, "step": 18000}, {"loss": 0.5392, "grad_norm": 1.3665502071380615, "learning_rate": 0.0002, "epoch": 5.813428018076178, "step": 18010}, {"loss": 0.4887, "grad_norm": 1.2227312326431274, "learning_rate": 0.0002, "epoch": 5.816655907036798, "step": 18020}, {"loss": 0.5203, "grad_norm": 1.180694818496704, "learning_rate": 0.0002, "epoch": 5.819883795997418, "step": 18030}, {"loss": 0.4962, "grad_norm": 1.1045362949371338, "learning_rate": 0.0002, "epoch": 5.823111684958038, "step": 18040}, {"loss": 0.4969, "grad_norm": 1.3828954696655273, "learning_rate": 0.0002, "epoch": 5.826339573918657, "step": 18050}, {"loss": 0.5493, "grad_norm": 1.305102825164795, "learning_rate": 0.0002, "epoch": 5.829567462879277, "step": 18060}, {"loss": 0.4844, "grad_norm": 1.2708743810653687, "learning_rate": 0.0002, "epoch": 5.832795351839897, "step": 18070}, {"loss": 0.4834, "grad_norm": 1.0344188213348389, "learning_rate": 0.0002, "epoch": 5.836023240800516, "step": 18080}, {"loss": 0.5088, "grad_norm": 1.1321724653244019, "learning_rate": 0.0002, "epoch": 5.839251129761136, "step": 18090}, {"loss": 0.4888, "grad_norm": 1.2162611484527588, "learning_rate": 0.0002, "epoch": 5.842479018721756, "step": 18100}, {"loss": 0.5014, "grad_norm": 1.427612543106079, "learning_rate": 0.0002, "epoch": 5.845706907682375, "step": 18110}, {"loss": 0.5339, "grad_norm": 1.4391452074050903, "learning_rate": 0.0002, "epoch": 5.848934796642995, "step": 18120}, {"loss": 0.528, "grad_norm": 1.1548216342926025, "learning_rate": 0.0002, "epoch": 5.8521626856036155, "step": 18130}, {"loss": 0.4779, "grad_norm": 1.2336437702178955, "learning_rate": 0.0002, "epoch": 5.855390574564235, "step": 18140}, {"loss": 0.4844, "grad_norm": 1.254661202430725, "learning_rate": 0.0002, "epoch": 5.858618463524855, "step": 18150}, {"loss": 0.5201, "grad_norm": 0.8326491117477417, "learning_rate": 0.0002, "epoch": 5.8618463524854745, "step": 18160}, {"loss": 0.5076, "grad_norm": 1.0907988548278809, "learning_rate": 0.0002, "epoch": 5.865074241446094, "step": 18170}, {"loss": 0.48, "grad_norm": 0.9896568655967712, "learning_rate": 0.0002, "epoch": 5.868302130406714, "step": 18180}, {"loss": 0.4628, "grad_norm": 0.9440065026283264, "learning_rate": 0.0002, "epoch": 5.871530019367333, "step": 18190}, {"loss": 0.5265, "grad_norm": 1.09321129322052, "learning_rate": 0.0002, "epoch": 5.874757908327954, "step": 18200}, {"loss": 0.4737, "grad_norm": 1.2588142156600952, "learning_rate": 0.0002, "epoch": 5.877985797288574, "step": 18210}, {"loss": 0.475, "grad_norm": 1.1731587648391724, "learning_rate": 0.0002, "epoch": 5.881213686249193, "step": 18220}, {"loss": 0.504, "grad_norm": 0.9904444217681885, "learning_rate": 0.0002, "epoch": 5.884441575209813, "step": 18230}, {"loss": 0.4842, "grad_norm": 0.8985799551010132, "learning_rate": 0.0002, "epoch": 5.887669464170433, "step": 18240}, {"loss": 0.4878, "grad_norm": 1.0182441473007202, "learning_rate": 0.0002, "epoch": 5.890897353131052, "step": 18250}, {"loss": 0.5224, "grad_norm": 1.1574701070785522, "learning_rate": 0.0002, "epoch": 5.894125242091672, "step": 18260}, {"loss": 0.5, "grad_norm": 1.1776602268218994, "learning_rate": 0.0002, "epoch": 5.8973531310522915, "step": 18270}, {"loss": 0.5245, "grad_norm": 1.4951308965682983, "learning_rate": 0.0002, "epoch": 5.900581020012911, "step": 18280}, {"loss": 0.5454, "grad_norm": 1.1440261602401733, "learning_rate": 0.0002, "epoch": 5.903808908973531, "step": 18290}, {"loss": 0.4868, "grad_norm": 0.9925196170806885, "learning_rate": 0.0002, "epoch": 5.907036797934151, "step": 18300}, {"loss": 0.5142, "grad_norm": 1.098615288734436, "learning_rate": 0.0002, "epoch": 5.910264686894771, "step": 18310}, {"loss": 0.5184, "grad_norm": 1.0030080080032349, "learning_rate": 0.0002, "epoch": 5.913492575855391, "step": 18320}, {"loss": 0.474, "grad_norm": 0.9890318512916565, "learning_rate": 0.0002, "epoch": 5.91672046481601, "step": 18330}, {"loss": 0.5125, "grad_norm": 1.2209392786026, "learning_rate": 0.0002, "epoch": 5.91994835377663, "step": 18340}, {"loss": 0.4634, "grad_norm": 1.108933925628662, "learning_rate": 0.0002, "epoch": 5.92317624273725, "step": 18350}, {"loss": 0.4813, "grad_norm": 1.086024522781372, "learning_rate": 0.0002, "epoch": 5.926404131697869, "step": 18360}, {"loss": 0.4952, "grad_norm": 1.0061167478561401, "learning_rate": 0.0002, "epoch": 5.92963202065849, "step": 18370}, {"loss": 0.4848, "grad_norm": 0.9445858597755432, "learning_rate": 0.0002, "epoch": 5.9328599096191095, "step": 18380}, {"loss": 0.5014, "grad_norm": 0.9556859135627747, "learning_rate": 0.0002, "epoch": 5.936087798579729, "step": 18390}, {"loss": 0.4966, "grad_norm": 1.154168963432312, "learning_rate": 0.0002, "epoch": 5.939315687540349, "step": 18400}, {"loss": 0.4836, "grad_norm": 1.0495831966400146, "learning_rate": 0.0002, "epoch": 5.942543576500968, "step": 18410}, {"loss": 0.5021, "grad_norm": 1.0717304944992065, "learning_rate": 0.0002, "epoch": 5.945771465461588, "step": 18420}, {"loss": 0.4794, "grad_norm": 1.06618332862854, "learning_rate": 0.0002, "epoch": 5.948999354422208, "step": 18430}, {"loss": 0.5011, "grad_norm": 0.9567165374755859, "learning_rate": 0.0002, "epoch": 5.952227243382827, "step": 18440}, {"loss": 0.485, "grad_norm": 1.0306249856948853, "learning_rate": 0.0002, "epoch": 5.955455132343447, "step": 18450}, {"loss": 0.4948, "grad_norm": 1.1879968643188477, "learning_rate": 0.0002, "epoch": 5.958683021304067, "step": 18460}, {"loss": 0.5185, "grad_norm": 1.3177233934402466, "learning_rate": 0.0002, "epoch": 5.961910910264687, "step": 18470}, {"loss": 0.4966, "grad_norm": 1.0945817232131958, "learning_rate": 0.0002, "epoch": 5.965138799225307, "step": 18480}, {"loss": 0.5196, "grad_norm": 1.029414415359497, "learning_rate": 0.0002, "epoch": 5.9683666881859265, "step": 18490}, {"loss": 0.5154, "grad_norm": 1.2266209125518799, "learning_rate": 0.0002, "epoch": 5.971594577146546, "step": 18500}, {"loss": 0.4914, "grad_norm": 1.2167150974273682, "learning_rate": 0.0002, "epoch": 5.974822466107166, "step": 18510}, {"loss": 0.466, "grad_norm": 0.9941056966781616, "learning_rate": 0.0002, "epoch": 5.9780503550677855, "step": 18520}, {"loss": 0.5037, "grad_norm": 1.4244859218597412, "learning_rate": 0.0002, "epoch": 5.981278244028405, "step": 18530}, {"loss": 0.4902, "grad_norm": 0.8976260423660278, "learning_rate": 0.0002, "epoch": 5.984506132989026, "step": 18540}, {"loss": 0.5039, "grad_norm": 1.0162699222564697, "learning_rate": 0.0002, "epoch": 5.987734021949645, "step": 18550}, {"loss": 0.5138, "grad_norm": 1.196677803993225, "learning_rate": 0.0002, "epoch": 5.990961910910265, "step": 18560}, {"loss": 0.4626, "grad_norm": 1.163403868675232, "learning_rate": 0.0002, "epoch": 5.994189799870885, "step": 18570}, {"loss": 0.5105, "grad_norm": 1.010205626487732, "learning_rate": 0.0002, "epoch": 5.997417688831504, "step": 18580}, {"eval_loss": 1.2861483097076416, "eval_runtime": 163.2683, "eval_samples_per_second": 4.49, "eval_steps_per_second": 0.563, "epoch": 6.0, "step": 18588}, {"loss": 0.4557, "grad_norm": 0.7334756255149841, "learning_rate": 0.0002, "epoch": 6.000645577792124, "step": 18590}, {"loss": 0.4201, "grad_norm": 1.093945026397705, "learning_rate": 0.0002, "epoch": 6.003873466752744, "step": 18600}, {"loss": 0.4235, "grad_norm": 1.2327148914337158, "learning_rate": 0.0002, "epoch": 6.007101355713363, "step": 18610}, {"loss": 0.377, "grad_norm": 1.3238836526870728, "learning_rate": 0.0002, "epoch": 6.010329244673983, "step": 18620}, {"loss": 0.3883, "grad_norm": 1.2364031076431274, "learning_rate": 0.0002, "epoch": 6.0135571336346025, "step": 18630}, {"loss": 0.3958, "grad_norm": 0.902474045753479, "learning_rate": 0.0002, "epoch": 6.016785022595223, "step": 18640}, {"loss": 0.4077, "grad_norm": 1.273280382156372, "learning_rate": 0.0002, "epoch": 6.020012911555843, "step": 18650}, {"loss": 0.4224, "grad_norm": 1.2470760345458984, "learning_rate": 0.0002, "epoch": 6.023240800516462, "step": 18660}, {"loss": 0.3752, "grad_norm": 1.2360138893127441, "learning_rate": 0.0002, "epoch": 6.026468689477082, "step": 18670}, {"loss": 0.3653, "grad_norm": 1.467140793800354, "learning_rate": 0.0002, "epoch": 6.029696578437702, "step": 18680}, {"loss": 0.3883, "grad_norm": 1.123871088027954, "learning_rate": 0.0002, "epoch": 6.032924467398321, "step": 18690}, {"loss": 0.3812, "grad_norm": 0.9732550978660583, "learning_rate": 0.0002, "epoch": 6.036152356358941, "step": 18700}, {"loss": 0.4163, "grad_norm": 1.170860767364502, "learning_rate": 0.0002, "epoch": 6.039380245319561, "step": 18710}, {"loss": 0.3836, "grad_norm": 1.2599345445632935, "learning_rate": 0.0002, "epoch": 6.042608134280181, "step": 18720}, {"loss": 0.3881, "grad_norm": 1.0808286666870117, "learning_rate": 0.0002, "epoch": 6.045836023240801, "step": 18730}, {"loss": 0.386, "grad_norm": 0.9799565076828003, "learning_rate": 0.0002, "epoch": 6.0490639122014205, "step": 18740}, {"loss": 0.3833, "grad_norm": 0.8425611853599548, "learning_rate": 0.0002, "epoch": 6.05229180116204, "step": 18750}, {"loss": 0.3765, "grad_norm": 0.9762344360351562, "learning_rate": 0.0002, "epoch": 6.05551969012266, "step": 18760}, {"loss": 0.3878, "grad_norm": 1.1290913820266724, "learning_rate": 0.0002, "epoch": 6.058747579083279, "step": 18770}, {"loss": 0.4061, "grad_norm": 1.2240493297576904, "learning_rate": 0.0002, "epoch": 6.061975468043899, "step": 18780}, {"loss": 0.3894, "grad_norm": 1.3422439098358154, "learning_rate": 0.0002, "epoch": 6.065203357004519, "step": 18790}, {"loss": 0.3885, "grad_norm": 1.0391879081726074, "learning_rate": 0.0002, "epoch": 6.068431245965138, "step": 18800}, {"loss": 0.409, "grad_norm": 1.0910760164260864, "learning_rate": 0.0002, "epoch": 6.071659134925759, "step": 18810}, {"loss": 0.3905, "grad_norm": 1.280098557472229, "learning_rate": 0.0002, "epoch": 6.074887023886379, "step": 18820}, {"loss": 0.3892, "grad_norm": 1.2102673053741455, "learning_rate": 0.0002, "epoch": 6.078114912846998, "step": 18830}, {"loss": 0.3757, "grad_norm": 1.3735624551773071, "learning_rate": 0.0002, "epoch": 6.081342801807618, "step": 18840}, {"loss": 0.4057, "grad_norm": 1.039419412612915, "learning_rate": 0.0002, "epoch": 6.0845706907682375, "step": 18850}, {"loss": 0.4093, "grad_norm": 1.175872802734375, "learning_rate": 0.0002, "epoch": 6.087798579728857, "step": 18860}, {"loss": 0.3933, "grad_norm": 1.4287301301956177, "learning_rate": 0.0002, "epoch": 6.091026468689477, "step": 18870}, {"loss": 0.4029, "grad_norm": 1.110627293586731, "learning_rate": 0.0002, "epoch": 6.0942543576500965, "step": 18880}, {"loss": 0.4195, "grad_norm": 1.1495535373687744, "learning_rate": 0.0002, "epoch": 6.097482246610717, "step": 18890}, {"loss": 0.4022, "grad_norm": 0.9764134287834167, "learning_rate": 0.0002, "epoch": 6.100710135571337, "step": 18900}, {"loss": 0.4097, "grad_norm": 1.0792596340179443, "learning_rate": 0.0002, "epoch": 6.103938024531956, "step": 18910}, {"loss": 0.402, "grad_norm": 1.2520235776901245, "learning_rate": 0.0002, "epoch": 6.107165913492576, "step": 18920}, {"loss": 0.4091, "grad_norm": 0.857008695602417, "learning_rate": 0.0002, "epoch": 6.110393802453196, "step": 18930}, {"loss": 0.4046, "grad_norm": 1.745723009109497, "learning_rate": 0.0002, "epoch": 6.113621691413815, "step": 18940}, {"loss": 0.4245, "grad_norm": 1.099941611289978, "learning_rate": 0.0002, "epoch": 6.116849580374435, "step": 18950}, {"loss": 0.3708, "grad_norm": 1.1402947902679443, "learning_rate": 0.0002, "epoch": 6.120077469335055, "step": 18960}, {"loss": 0.4022, "grad_norm": 1.0565131902694702, "learning_rate": 0.0002, "epoch": 6.123305358295674, "step": 18970}, {"loss": 0.3973, "grad_norm": 1.1511917114257812, "learning_rate": 0.0002, "epoch": 6.126533247256295, "step": 18980}, {"loss": 0.395, "grad_norm": 0.9029410481452942, "learning_rate": 0.0002, "epoch": 6.129761136216914, "step": 18990}, {"loss": 0.393, "grad_norm": 1.03252375125885, "learning_rate": 0.0002, "epoch": 6.132989025177534, "step": 19000}, {"loss": 0.3923, "grad_norm": 1.2058522701263428, "learning_rate": 0.0002, "epoch": 6.136216914138154, "step": 19010}, {"loss": 0.3963, "grad_norm": 1.2274953126907349, "learning_rate": 0.0002, "epoch": 6.139444803098773, "step": 19020}, {"loss": 0.3999, "grad_norm": 1.3196226358413696, "learning_rate": 0.0002, "epoch": 6.142672692059393, "step": 19030}, {"loss": 0.4176, "grad_norm": 0.8030686378479004, "learning_rate": 0.0002, "epoch": 6.145900581020013, "step": 19040}, {"loss": 0.3886, "grad_norm": 1.1762639284133911, "learning_rate": 0.0002, "epoch": 6.149128469980632, "step": 19050}, {"loss": 0.429, "grad_norm": 1.0247628688812256, "learning_rate": 0.0002, "epoch": 6.152356358941253, "step": 19060}, {"loss": 0.3876, "grad_norm": 0.99031662940979, "learning_rate": 0.0002, "epoch": 6.1555842479018725, "step": 19070}, {"loss": 0.3818, "grad_norm": 1.334445834159851, "learning_rate": 0.0002, "epoch": 6.158812136862492, "step": 19080}, {"loss": 0.4038, "grad_norm": 1.1160423755645752, "learning_rate": 0.0002, "epoch": 6.162040025823112, "step": 19090}, {"loss": 0.4081, "grad_norm": 1.2579560279846191, "learning_rate": 0.0002, "epoch": 6.1652679147837315, "step": 19100}, {"loss": 0.4092, "grad_norm": 0.9372721910476685, "learning_rate": 0.0002, "epoch": 6.168495803744351, "step": 19110}, {"loss": 0.3905, "grad_norm": 0.7995722889900208, "learning_rate": 0.0002, "epoch": 6.171723692704971, "step": 19120}, {"loss": 0.3896, "grad_norm": 1.0074360370635986, "learning_rate": 0.0002, "epoch": 6.17495158166559, "step": 19130}, {"loss": 0.4328, "grad_norm": 0.9821600914001465, "learning_rate": 0.0002, "epoch": 6.17817947062621, "step": 19140}, {"loss": 0.3845, "grad_norm": 1.1252691745758057, "learning_rate": 0.0002, "epoch": 6.181407359586831, "step": 19150}, {"loss": 0.3918, "grad_norm": 1.316981554031372, "learning_rate": 0.0002, "epoch": 6.18463524854745, "step": 19160}, {"loss": 0.3893, "grad_norm": 1.0131299495697021, "learning_rate": 0.0002, "epoch": 6.18786313750807, "step": 19170}, {"loss": 0.4111, "grad_norm": 1.3530288934707642, "learning_rate": 0.0002, "epoch": 6.19109102646869, "step": 19180}, {"loss": 0.416, "grad_norm": 1.148247480392456, "learning_rate": 0.0002, "epoch": 6.194318915429309, "step": 19190}, {"loss": 0.4191, "grad_norm": 1.5510036945343018, "learning_rate": 0.0002, "epoch": 6.197546804389929, "step": 19200}, {"loss": 0.423, "grad_norm": 1.3048018217086792, "learning_rate": 0.0002, "epoch": 6.2007746933505485, "step": 19210}, {"loss": 0.397, "grad_norm": 1.186187982559204, "learning_rate": 0.0002, "epoch": 6.204002582311168, "step": 19220}, {"loss": 0.4164, "grad_norm": 1.5199471712112427, "learning_rate": 0.0002, "epoch": 6.207230471271788, "step": 19230}, {"loss": 0.4322, "grad_norm": 1.1311423778533936, "learning_rate": 0.0002, "epoch": 6.210458360232408, "step": 19240}, {"loss": 0.4086, "grad_norm": 1.2345898151397705, "learning_rate": 0.0002, "epoch": 6.213686249193028, "step": 19250}, {"loss": 0.4122, "grad_norm": 1.0261863470077515, "learning_rate": 0.0002, "epoch": 6.216914138153648, "step": 19260}, {"loss": 0.4315, "grad_norm": 0.8985416293144226, "learning_rate": 0.0002, "epoch": 6.220142027114267, "step": 19270}, {"loss": 0.4052, "grad_norm": 1.3136980533599854, "learning_rate": 0.0002, "epoch": 6.223369916074887, "step": 19280}, {"loss": 0.4232, "grad_norm": 1.1949185132980347, "learning_rate": 0.0002, "epoch": 6.226597805035507, "step": 19290}, {"loss": 0.4255, "grad_norm": 0.9668909907341003, "learning_rate": 0.0002, "epoch": 6.229825693996126, "step": 19300}, {"loss": 0.3917, "grad_norm": 0.8858964443206787, "learning_rate": 0.0002, "epoch": 6.233053582956746, "step": 19310}, {"loss": 0.4087, "grad_norm": 1.4254822731018066, "learning_rate": 0.0002, "epoch": 6.236281471917366, "step": 19320}, {"loss": 0.426, "grad_norm": 1.0455392599105835, "learning_rate": 0.0002, "epoch": 6.239509360877986, "step": 19330}, {"loss": 0.3894, "grad_norm": 1.1690824031829834, "learning_rate": 0.0002, "epoch": 6.242737249838606, "step": 19340}, {"loss": 0.3777, "grad_norm": 1.0347497463226318, "learning_rate": 0.0002, "epoch": 6.245965138799225, "step": 19350}, {"loss": 0.3972, "grad_norm": 1.0790464878082275, "learning_rate": 0.0002, "epoch": 6.249193027759845, "step": 19360}, {"loss": 0.4393, "grad_norm": 1.1294453144073486, "learning_rate": 0.0002, "epoch": 6.252420916720465, "step": 19370}, {"loss": 0.4055, "grad_norm": 1.5094330310821533, "learning_rate": 0.0002, "epoch": 6.255648805681084, "step": 19380}, {"loss": 0.4228, "grad_norm": 1.1122944355010986, "learning_rate": 0.0002, "epoch": 6.258876694641704, "step": 19390}, {"loss": 0.4341, "grad_norm": 1.3123422861099243, "learning_rate": 0.0002, "epoch": 6.262104583602324, "step": 19400}, {"loss": 0.4206, "grad_norm": 1.0585907697677612, "learning_rate": 0.0002, "epoch": 6.265332472562944, "step": 19410}, {"loss": 0.4001, "grad_norm": 0.8711239099502563, "learning_rate": 0.0002, "epoch": 6.268560361523564, "step": 19420}, {"loss": 0.4201, "grad_norm": 1.2772116661071777, "learning_rate": 0.0002, "epoch": 6.2717882504841835, "step": 19430}, {"loss": 0.4298, "grad_norm": 1.0035508871078491, "learning_rate": 0.0002, "epoch": 6.275016139444803, "step": 19440}, {"loss": 0.4234, "grad_norm": 0.7933974862098694, "learning_rate": 0.0002, "epoch": 6.278244028405423, "step": 19450}, {"loss": 0.4144, "grad_norm": 1.2455826997756958, "learning_rate": 0.0002, "epoch": 6.2814719173660425, "step": 19460}, {"loss": 0.4171, "grad_norm": 1.2735545635223389, "learning_rate": 0.0002, "epoch": 6.284699806326662, "step": 19470}, {"loss": 0.3956, "grad_norm": 0.9773174524307251, "learning_rate": 0.0002, "epoch": 6.287927695287282, "step": 19480}, {"loss": 0.4264, "grad_norm": 1.2341974973678589, "learning_rate": 0.0002, "epoch": 6.2911555842479014, "step": 19490}, {"loss": 0.4068, "grad_norm": 1.286138653755188, "learning_rate": 0.0002, "epoch": 6.294383473208522, "step": 19500}, {"loss": 0.439, "grad_norm": 1.052889108657837, "learning_rate": 0.0002, "epoch": 6.297611362169142, "step": 19510}, {"loss": 0.4199, "grad_norm": 1.1955385208129883, "learning_rate": 0.0002, "epoch": 6.300839251129761, "step": 19520}, {"loss": 0.4242, "grad_norm": 1.2792452573776245, "learning_rate": 0.0002, "epoch": 6.304067140090381, "step": 19530}, {"loss": 0.3989, "grad_norm": 0.9077931046485901, "learning_rate": 0.0002, "epoch": 6.307295029051001, "step": 19540}, {"loss": 0.388, "grad_norm": 1.2492976188659668, "learning_rate": 0.0002, "epoch": 6.31052291801162, "step": 19550}, {"loss": 0.3828, "grad_norm": 1.1097182035446167, "learning_rate": 0.0002, "epoch": 6.31375080697224, "step": 19560}, {"loss": 0.4482, "grad_norm": 1.271609902381897, "learning_rate": 0.0002, "epoch": 6.3169786959328595, "step": 19570}, {"loss": 0.3851, "grad_norm": 1.4262897968292236, "learning_rate": 0.0002, "epoch": 6.32020658489348, "step": 19580}, {"loss": 0.4133, "grad_norm": 1.057338833808899, "learning_rate": 0.0002, "epoch": 6.3234344738541, "step": 19590}, {"loss": 0.4366, "grad_norm": 1.323028326034546, "learning_rate": 0.0002, "epoch": 6.326662362814719, "step": 19600}, {"loss": 0.4186, "grad_norm": 1.0991673469543457, "learning_rate": 0.0002, "epoch": 6.329890251775339, "step": 19610}, {"loss": 0.4132, "grad_norm": 1.1600234508514404, "learning_rate": 0.0002, "epoch": 6.333118140735959, "step": 19620}, {"loss": 0.4689, "grad_norm": 1.2986212968826294, "learning_rate": 0.0002, "epoch": 6.336346029696578, "step": 19630}, {"loss": 0.3914, "grad_norm": 1.2117934226989746, "learning_rate": 0.0002, "epoch": 6.339573918657198, "step": 19640}, {"loss": 0.3939, "grad_norm": 0.9747948050498962, "learning_rate": 0.0002, "epoch": 6.342801807617818, "step": 19650}, {"loss": 0.4517, "grad_norm": 1.2380492687225342, "learning_rate": 0.0002, "epoch": 6.346029696578437, "step": 19660}, {"loss": 0.4344, "grad_norm": 1.2475087642669678, "learning_rate": 0.0002, "epoch": 6.349257585539058, "step": 19670}, {"loss": 0.4253, "grad_norm": 1.022084355354309, "learning_rate": 0.0002, "epoch": 6.3524854744996775, "step": 19680}, {"loss": 0.4227, "grad_norm": 1.2422059774398804, "learning_rate": 0.0002, "epoch": 6.355713363460297, "step": 19690}, {"loss": 0.4205, "grad_norm": 1.5015275478363037, "learning_rate": 0.0002, "epoch": 6.358941252420917, "step": 19700}, {"loss": 0.414, "grad_norm": 1.068727970123291, "learning_rate": 0.0002, "epoch": 6.362169141381536, "step": 19710}, {"loss": 0.4054, "grad_norm": 1.3718897104263306, "learning_rate": 0.0002, "epoch": 6.365397030342156, "step": 19720}, {"loss": 0.4399, "grad_norm": 1.3437764644622803, "learning_rate": 0.0002, "epoch": 6.368624919302776, "step": 19730}, {"loss": 0.4187, "grad_norm": 0.9128499031066895, "learning_rate": 0.0002, "epoch": 6.371852808263395, "step": 19740}, {"loss": 0.4346, "grad_norm": 1.0678889751434326, "learning_rate": 0.0002, "epoch": 6.375080697224016, "step": 19750}, {"loss": 0.4103, "grad_norm": 1.0432878732681274, "learning_rate": 0.0002, "epoch": 6.378308586184636, "step": 19760}, {"loss": 0.4304, "grad_norm": 1.4033927917480469, "learning_rate": 0.0002, "epoch": 6.381536475145255, "step": 19770}, {"loss": 0.4225, "grad_norm": 1.2773922681808472, "learning_rate": 0.0002, "epoch": 6.384764364105875, "step": 19780}, {"loss": 0.4246, "grad_norm": 1.257847547531128, "learning_rate": 0.0002, "epoch": 6.3879922530664945, "step": 19790}, {"loss": 0.4261, "grad_norm": 0.8424118757247925, "learning_rate": 0.0002, "epoch": 6.391220142027114, "step": 19800}, {"loss": 0.4145, "grad_norm": 1.3387986421585083, "learning_rate": 0.0002, "epoch": 6.394448030987734, "step": 19810}, {"loss": 0.4268, "grad_norm": 1.1277328729629517, "learning_rate": 0.0002, "epoch": 6.3976759199483535, "step": 19820}, {"loss": 0.4213, "grad_norm": 1.264283537864685, "learning_rate": 0.0002, "epoch": 6.400903808908973, "step": 19830}, {"loss": 0.4506, "grad_norm": 1.1770991086959839, "learning_rate": 0.0002, "epoch": 6.404131697869594, "step": 19840}, {"loss": 0.4385, "grad_norm": 0.9695967435836792, "learning_rate": 0.0002, "epoch": 6.407359586830213, "step": 19850}, {"loss": 0.4258, "grad_norm": 1.3394994735717773, "learning_rate": 0.0002, "epoch": 6.410587475790833, "step": 19860}, {"loss": 0.4017, "grad_norm": 1.0515536069869995, "learning_rate": 0.0002, "epoch": 6.413815364751453, "step": 19870}, {"loss": 0.4555, "grad_norm": 1.3238868713378906, "learning_rate": 0.0002, "epoch": 6.417043253712072, "step": 19880}, {"loss": 0.4385, "grad_norm": 1.0801814794540405, "learning_rate": 0.0002, "epoch": 6.420271142672692, "step": 19890}, {"loss": 0.4135, "grad_norm": 1.1391135454177856, "learning_rate": 0.0002, "epoch": 6.423499031633312, "step": 19900}, {"loss": 0.4376, "grad_norm": 1.13046133518219, "learning_rate": 0.0002, "epoch": 6.426726920593931, "step": 19910}, {"loss": 0.4251, "grad_norm": 1.1657520532608032, "learning_rate": 0.0002, "epoch": 6.429954809554552, "step": 19920}, {"loss": 0.3951, "grad_norm": 1.3315341472625732, "learning_rate": 0.0002, "epoch": 6.433182698515171, "step": 19930}, {"loss": 0.4254, "grad_norm": 1.1806831359863281, "learning_rate": 0.0002, "epoch": 6.436410587475791, "step": 19940}, {"loss": 0.3988, "grad_norm": 1.1581867933273315, "learning_rate": 0.0002, "epoch": 6.439638476436411, "step": 19950}, {"loss": 0.4194, "grad_norm": 1.2601206302642822, "learning_rate": 0.0002, "epoch": 6.44286636539703, "step": 19960}, {"loss": 0.4505, "grad_norm": 1.1163229942321777, "learning_rate": 0.0002, "epoch": 6.44609425435765, "step": 19970}, {"loss": 0.4295, "grad_norm": 0.9959462285041809, "learning_rate": 0.0002, "epoch": 6.44932214331827, "step": 19980}, {"loss": 0.421, "grad_norm": 1.1213586330413818, "learning_rate": 0.0002, "epoch": 6.452550032278889, "step": 19990}, {"loss": 0.4354, "grad_norm": 1.1345361471176147, "learning_rate": 0.0002, "epoch": 6.455777921239509, "step": 20000}, {"loss": 0.429, "grad_norm": 1.245871901512146, "learning_rate": 0.0002, "epoch": 6.459005810200129, "step": 20010}, {"loss": 0.4395, "grad_norm": 1.0894919633865356, "learning_rate": 0.0002, "epoch": 6.462233699160749, "step": 20020}, {"loss": 0.4365, "grad_norm": 1.030206322669983, "learning_rate": 0.0002, "epoch": 6.465461588121369, "step": 20030}, {"loss": 0.4225, "grad_norm": 1.262133002281189, "learning_rate": 0.0002, "epoch": 6.4686894770819885, "step": 20040}, {"loss": 0.4301, "grad_norm": 1.167641043663025, "learning_rate": 0.0002, "epoch": 6.471917366042608, "step": 20050}, {"loss": 0.4438, "grad_norm": 1.1125705242156982, "learning_rate": 0.0002, "epoch": 6.475145255003228, "step": 20060}, {"loss": 0.4205, "grad_norm": 1.3777440786361694, "learning_rate": 0.0002, "epoch": 6.4783731439638474, "step": 20070}, {"loss": 0.424, "grad_norm": 1.1771081686019897, "learning_rate": 0.0002, "epoch": 6.481601032924467, "step": 20080}, {"loss": 0.4187, "grad_norm": 1.0414351224899292, "learning_rate": 0.0002, "epoch": 6.484828921885087, "step": 20090}, {"loss": 0.4419, "grad_norm": 1.2103244066238403, "learning_rate": 0.0002, "epoch": 6.488056810845707, "step": 20100}, {"loss": 0.4502, "grad_norm": 1.4153836965560913, "learning_rate": 0.0002, "epoch": 6.491284699806327, "step": 20110}, {"loss": 0.4524, "grad_norm": 1.2718676328659058, "learning_rate": 0.0002, "epoch": 6.494512588766947, "step": 20120}, {"loss": 0.4546, "grad_norm": 1.1040351390838623, "learning_rate": 0.0002, "epoch": 6.497740477727566, "step": 20130}, {"loss": 0.4105, "grad_norm": 0.9804210662841797, "learning_rate": 0.0002, "epoch": 6.500968366688186, "step": 20140}, {"loss": 0.4165, "grad_norm": 1.028836965560913, "learning_rate": 0.0002, "epoch": 6.5041962556488055, "step": 20150}, {"loss": 0.4106, "grad_norm": 1.1773076057434082, "learning_rate": 0.0002, "epoch": 6.507424144609425, "step": 20160}, {"loss": 0.4364, "grad_norm": 0.8597512245178223, "learning_rate": 0.0002, "epoch": 6.510652033570045, "step": 20170}, {"loss": 0.4346, "grad_norm": 1.4290635585784912, "learning_rate": 0.0002, "epoch": 6.5138799225306645, "step": 20180}, {"loss": 0.4057, "grad_norm": 0.9842908382415771, "learning_rate": 0.0002, "epoch": 6.517107811491285, "step": 20190}, {"loss": 0.4562, "grad_norm": 1.0254372358322144, "learning_rate": 0.0002, "epoch": 6.520335700451905, "step": 20200}, {"loss": 0.433, "grad_norm": 1.1869125366210938, "learning_rate": 0.0002, "epoch": 6.523563589412524, "step": 20210}, {"loss": 0.4247, "grad_norm": 1.0994106531143188, "learning_rate": 0.0002, "epoch": 6.526791478373144, "step": 20220}, {"loss": 0.416, "grad_norm": 1.03111732006073, "learning_rate": 0.0002, "epoch": 6.530019367333764, "step": 20230}, {"loss": 0.4202, "grad_norm": 1.5421077013015747, "learning_rate": 0.0002, "epoch": 6.533247256294383, "step": 20240}, {"loss": 0.4309, "grad_norm": 1.4383527040481567, "learning_rate": 0.0002, "epoch": 6.536475145255003, "step": 20250}, {"loss": 0.4086, "grad_norm": 1.0252864360809326, "learning_rate": 0.0002, "epoch": 6.539703034215623, "step": 20260}, {"loss": 0.4391, "grad_norm": 1.2504689693450928, "learning_rate": 0.0002, "epoch": 6.542930923176243, "step": 20270}, {"loss": 0.4294, "grad_norm": 1.2130976915359497, "learning_rate": 0.0002, "epoch": 6.546158812136863, "step": 20280}, {"loss": 0.4432, "grad_norm": 1.1186957359313965, "learning_rate": 0.0002, "epoch": 6.549386701097482, "step": 20290}, {"loss": 0.4225, "grad_norm": 1.0373939275741577, "learning_rate": 0.0002, "epoch": 6.552614590058102, "step": 20300}, {"loss": 0.3874, "grad_norm": 0.9950923323631287, "learning_rate": 0.0002, "epoch": 6.555842479018722, "step": 20310}, {"loss": 0.4257, "grad_norm": 1.1479439735412598, "learning_rate": 0.0002, "epoch": 6.559070367979341, "step": 20320}, {"loss": 0.4418, "grad_norm": 1.2426027059555054, "learning_rate": 0.0002, "epoch": 6.562298256939961, "step": 20330}, {"loss": 0.4274, "grad_norm": 1.3021808862686157, "learning_rate": 0.0002, "epoch": 6.565526145900581, "step": 20340}, {"loss": 0.4423, "grad_norm": 1.203259825706482, "learning_rate": 0.0002, "epoch": 6.5687540348612, "step": 20350}, {"loss": 0.4568, "grad_norm": 2.1131186485290527, "learning_rate": 0.0002, "epoch": 6.571981923821821, "step": 20360}, {"loss": 0.4272, "grad_norm": 1.1588627099990845, "learning_rate": 0.0002, "epoch": 6.5752098127824405, "step": 20370}, {"loss": 0.4727, "grad_norm": 1.0151054859161377, "learning_rate": 0.0002, "epoch": 6.57843770174306, "step": 20380}, {"loss": 0.4592, "grad_norm": 1.323155403137207, "learning_rate": 0.0002, "epoch": 6.58166559070368, "step": 20390}, {"loss": 0.4075, "grad_norm": 1.0907572507858276, "learning_rate": 0.0002, "epoch": 6.5848934796642995, "step": 20400}, {"loss": 0.4127, "grad_norm": 1.2375017404556274, "learning_rate": 0.0002, "epoch": 6.588121368624919, "step": 20410}, {"loss": 0.4483, "grad_norm": 1.0491245985031128, "learning_rate": 0.0002, "epoch": 6.591349257585539, "step": 20420}, {"loss": 0.4476, "grad_norm": 1.50575852394104, "learning_rate": 0.0002, "epoch": 6.5945771465461585, "step": 20430}, {"loss": 0.4235, "grad_norm": 0.9893020987510681, "learning_rate": 0.0002, "epoch": 6.597805035506779, "step": 20440}, {"loss": 0.4384, "grad_norm": 1.258591651916504, "learning_rate": 0.0002, "epoch": 6.601032924467399, "step": 20450}, {"loss": 0.4458, "grad_norm": 1.3949081897735596, "learning_rate": 0.0002, "epoch": 6.604260813428018, "step": 20460}, {"loss": 0.3885, "grad_norm": 1.152513861656189, "learning_rate": 0.0002, "epoch": 6.607488702388638, "step": 20470}, {"loss": 0.4257, "grad_norm": 1.218362808227539, "learning_rate": 0.0002, "epoch": 6.610716591349258, "step": 20480}, {"loss": 0.4448, "grad_norm": 1.3538687229156494, "learning_rate": 0.0002, "epoch": 6.613944480309877, "step": 20490}, {"loss": 0.4348, "grad_norm": 1.2896782159805298, "learning_rate": 0.0002, "epoch": 6.617172369270497, "step": 20500}, {"loss": 0.4287, "grad_norm": 1.0762150287628174, "learning_rate": 0.0002, "epoch": 6.6204002582311166, "step": 20510}, {"loss": 0.4529, "grad_norm": 1.1561447381973267, "learning_rate": 0.0002, "epoch": 6.623628147191736, "step": 20520}, {"loss": 0.4017, "grad_norm": 1.0553218126296997, "learning_rate": 0.0002, "epoch": 6.626856036152357, "step": 20530}, {"loss": 0.4321, "grad_norm": 1.1378765106201172, "learning_rate": 0.0002, "epoch": 6.630083925112976, "step": 20540}, {"loss": 0.4351, "grad_norm": 1.2299952507019043, "learning_rate": 0.0002, "epoch": 6.633311814073596, "step": 20550}, {"loss": 0.4406, "grad_norm": 1.4158518314361572, "learning_rate": 0.0002, "epoch": 6.636539703034216, "step": 20560}, {"loss": 0.4334, "grad_norm": 1.058830738067627, "learning_rate": 0.0002, "epoch": 6.639767591994835, "step": 20570}, {"loss": 0.4248, "grad_norm": 1.1069598197937012, "learning_rate": 0.0002, "epoch": 6.642995480955455, "step": 20580}, {"loss": 0.4651, "grad_norm": 1.3859037160873413, "learning_rate": 0.0002, "epoch": 6.646223369916075, "step": 20590}, {"loss": 0.4324, "grad_norm": 1.300588607788086, "learning_rate": 0.0002, "epoch": 6.649451258876694, "step": 20600}, {"loss": 0.4581, "grad_norm": 1.3861193656921387, "learning_rate": 0.0002, "epoch": 6.652679147837315, "step": 20610}, {"loss": 0.4198, "grad_norm": 1.2356518507003784, "learning_rate": 0.0002, "epoch": 6.6559070367979345, "step": 20620}, {"loss": 0.4578, "grad_norm": 1.1698070764541626, "learning_rate": 0.0002, "epoch": 6.659134925758554, "step": 20630}, {"loss": 0.4513, "grad_norm": 1.270707607269287, "learning_rate": 0.0002, "epoch": 6.662362814719174, "step": 20640}, {"loss": 0.4552, "grad_norm": 0.984618067741394, "learning_rate": 0.0002, "epoch": 6.6655907036797934, "step": 20650}, {"loss": 0.4648, "grad_norm": 1.2335834503173828, "learning_rate": 0.0002, "epoch": 6.668818592640413, "step": 20660}, {"loss": 0.4541, "grad_norm": 0.9497392773628235, "learning_rate": 0.0002, "epoch": 6.672046481601033, "step": 20670}, {"loss": 0.4176, "grad_norm": 1.011144757270813, "learning_rate": 0.0002, "epoch": 6.675274370561652, "step": 20680}, {"loss": 0.4424, "grad_norm": 1.1605948209762573, "learning_rate": 0.0002, "epoch": 6.678502259522272, "step": 20690}, {"loss": 0.4613, "grad_norm": 1.2136812210083008, "learning_rate": 0.0002, "epoch": 6.681730148482892, "step": 20700}, {"loss": 0.4287, "grad_norm": 1.0823525190353394, "learning_rate": 0.0002, "epoch": 6.684958037443512, "step": 20710}, {"loss": 0.4307, "grad_norm": 1.1929140090942383, "learning_rate": 0.0002, "epoch": 6.688185926404132, "step": 20720}, {"loss": 0.4453, "grad_norm": 1.2468219995498657, "learning_rate": 0.0002, "epoch": 6.6914138153647515, "step": 20730}, {"loss": 0.4262, "grad_norm": 1.2653573751449585, "learning_rate": 0.0002, "epoch": 6.694641704325371, "step": 20740}, {"loss": 0.4716, "grad_norm": 1.2253094911575317, "learning_rate": 0.0002, "epoch": 6.697869593285991, "step": 20750}, {"loss": 0.4462, "grad_norm": 1.103179931640625, "learning_rate": 0.0002, "epoch": 6.7010974822466105, "step": 20760}, {"loss": 0.4179, "grad_norm": 0.9180657863616943, "learning_rate": 0.0002, "epoch": 6.70432537120723, "step": 20770}, {"loss": 0.4712, "grad_norm": 1.1830929517745972, "learning_rate": 0.0002, "epoch": 6.707553260167851, "step": 20780}, {"loss": 0.4304, "grad_norm": 1.1052136421203613, "learning_rate": 0.0002, "epoch": 6.71078114912847, "step": 20790}, {"loss": 0.436, "grad_norm": 1.1268569231033325, "learning_rate": 0.0002, "epoch": 6.71400903808909, "step": 20800}, {"loss": 0.4109, "grad_norm": 1.0753320455551147, "learning_rate": 0.0002, "epoch": 6.71723692704971, "step": 20810}, {"loss": 0.4471, "grad_norm": 1.1100133657455444, "learning_rate": 0.0002, "epoch": 6.720464816010329, "step": 20820}, {"loss": 0.447, "grad_norm": 0.7498472929000854, "learning_rate": 0.0002, "epoch": 6.723692704970949, "step": 20830}, {"loss": 0.4182, "grad_norm": 1.1006664037704468, "learning_rate": 0.0002, "epoch": 6.726920593931569, "step": 20840}, {"loss": 0.4348, "grad_norm": 1.4599690437316895, "learning_rate": 0.0002, "epoch": 6.730148482892188, "step": 20850}, {"loss": 0.4596, "grad_norm": 1.324700951576233, "learning_rate": 0.0002, "epoch": 6.733376371852808, "step": 20860}, {"loss": 0.4373, "grad_norm": 1.1128668785095215, "learning_rate": 0.0002, "epoch": 6.736604260813428, "step": 20870}, {"loss": 0.4267, "grad_norm": 1.0438026189804077, "learning_rate": 0.0002, "epoch": 6.739832149774048, "step": 20880}, {"loss": 0.4366, "grad_norm": 1.1934672594070435, "learning_rate": 0.0002, "epoch": 6.743060038734668, "step": 20890}, {"loss": 0.4264, "grad_norm": 1.2108192443847656, "learning_rate": 0.0002, "epoch": 6.746287927695287, "step": 20900}, {"loss": 0.4327, "grad_norm": 1.1514620780944824, "learning_rate": 0.0002, "epoch": 6.749515816655907, "step": 20910}, {"loss": 0.4774, "grad_norm": 1.1723405122756958, "learning_rate": 0.0002, "epoch": 6.752743705616527, "step": 20920}, {"loss": 0.4458, "grad_norm": 1.1136211156845093, "learning_rate": 0.0002, "epoch": 6.755971594577146, "step": 20930}, {"loss": 0.4363, "grad_norm": 1.297601342201233, "learning_rate": 0.0002, "epoch": 6.759199483537766, "step": 20940}, {"loss": 0.4389, "grad_norm": 1.139397144317627, "learning_rate": 0.0002, "epoch": 6.7624273724983865, "step": 20950}, {"loss": 0.4344, "grad_norm": 1.2873362302780151, "learning_rate": 0.0002, "epoch": 6.765655261459006, "step": 20960}, {"loss": 0.4204, "grad_norm": 1.1499544382095337, "learning_rate": 0.0002, "epoch": 6.768883150419626, "step": 20970}, {"loss": 0.4279, "grad_norm": 1.3687032461166382, "learning_rate": 0.0002, "epoch": 6.7721110393802455, "step": 20980}, {"loss": 0.4621, "grad_norm": 1.2877939939498901, "learning_rate": 0.0002, "epoch": 6.775338928340865, "step": 20990}, {"loss": 0.4629, "grad_norm": 1.232993483543396, "learning_rate": 0.0002, "epoch": 6.778566817301485, "step": 21000}, {"loss": 0.4697, "grad_norm": 1.1765092611312866, "learning_rate": 0.0002, "epoch": 6.7817947062621045, "step": 21010}, {"loss": 0.431, "grad_norm": 1.4695899486541748, "learning_rate": 0.0002, "epoch": 6.785022595222724, "step": 21020}, {"loss": 0.4348, "grad_norm": 1.2325087785720825, "learning_rate": 0.0002, "epoch": 6.788250484183344, "step": 21030}, {"loss": 0.4595, "grad_norm": 1.3475068807601929, "learning_rate": 0.0002, "epoch": 6.791478373143963, "step": 21040}, {"loss": 0.4555, "grad_norm": 1.5654256343841553, "learning_rate": 0.0002, "epoch": 6.794706262104584, "step": 21050}, {"loss": 0.4672, "grad_norm": 1.4210680723190308, "learning_rate": 0.0002, "epoch": 6.797934151065204, "step": 21060}, {"loss": 0.4491, "grad_norm": 1.167878270149231, "learning_rate": 0.0002, "epoch": 6.801162040025823, "step": 21070}, {"loss": 0.4524, "grad_norm": 1.1643486022949219, "learning_rate": 0.0002, "epoch": 6.804389928986443, "step": 21080}, {"loss": 0.4467, "grad_norm": 1.1976310014724731, "learning_rate": 0.0002, "epoch": 6.8076178179470626, "step": 21090}, {"loss": 0.4449, "grad_norm": 1.1392749547958374, "learning_rate": 0.0002, "epoch": 6.810845706907682, "step": 21100}, {"loss": 0.4567, "grad_norm": 1.2456704378128052, "learning_rate": 0.0002, "epoch": 6.814073595868302, "step": 21110}, {"loss": 0.4271, "grad_norm": 1.0030150413513184, "learning_rate": 0.0002, "epoch": 6.8173014848289215, "step": 21120}, {"loss": 0.4258, "grad_norm": 1.4715943336486816, "learning_rate": 0.0002, "epoch": 6.820529373789542, "step": 21130}, {"loss": 0.4615, "grad_norm": 1.1307374238967896, "learning_rate": 0.0002, "epoch": 6.823757262750162, "step": 21140}, {"loss": 0.4643, "grad_norm": 1.37498140335083, "learning_rate": 0.0002, "epoch": 6.826985151710781, "step": 21150}, {"loss": 0.4447, "grad_norm": 1.2791364192962646, "learning_rate": 0.0002, "epoch": 6.830213040671401, "step": 21160}, {"loss": 0.4778, "grad_norm": 1.0518016815185547, "learning_rate": 0.0002, "epoch": 6.833440929632021, "step": 21170}, {"loss": 0.448, "grad_norm": 1.1237729787826538, "learning_rate": 0.0002, "epoch": 6.83666881859264, "step": 21180}, {"loss": 0.4299, "grad_norm": 1.0360032320022583, "learning_rate": 0.0002, "epoch": 6.83989670755326, "step": 21190}, {"loss": 0.4336, "grad_norm": 0.8733281493186951, "learning_rate": 0.0002, "epoch": 6.84312459651388, "step": 21200}, {"loss": 0.4495, "grad_norm": 1.3178322315216064, "learning_rate": 0.0002, "epoch": 6.846352485474499, "step": 21210}, {"loss": 0.4548, "grad_norm": 1.0884978771209717, "learning_rate": 0.0002, "epoch": 6.84958037443512, "step": 21220}, {"loss": 0.4543, "grad_norm": 1.213229775428772, "learning_rate": 0.0002, "epoch": 6.8528082633957395, "step": 21230}, {"loss": 0.4628, "grad_norm": 1.0828464031219482, "learning_rate": 0.0002, "epoch": 6.856036152356359, "step": 21240}, {"loss": 0.4353, "grad_norm": 1.2298113107681274, "learning_rate": 0.0002, "epoch": 6.859264041316979, "step": 21250}, {"loss": 0.4088, "grad_norm": 1.4773930311203003, "learning_rate": 0.0002, "epoch": 6.862491930277598, "step": 21260}, {"loss": 0.4529, "grad_norm": 0.992661714553833, "learning_rate": 0.0002, "epoch": 6.865719819238218, "step": 21270}, {"loss": 0.474, "grad_norm": 1.25167715549469, "learning_rate": 0.0002, "epoch": 6.868947708198838, "step": 21280}, {"loss": 0.4466, "grad_norm": 1.1554399728775024, "learning_rate": 0.0002, "epoch": 6.872175597159457, "step": 21290}, {"loss": 0.4375, "grad_norm": 1.2587701082229614, "learning_rate": 0.0002, "epoch": 6.875403486120078, "step": 21300}, {"loss": 0.4507, "grad_norm": 1.392392635345459, "learning_rate": 0.0002, "epoch": 6.8786313750806976, "step": 21310}, {"loss": 0.4432, "grad_norm": 1.2159595489501953, "learning_rate": 0.0002, "epoch": 6.881859264041317, "step": 21320}, {"loss": 0.4255, "grad_norm": 1.3811182975769043, "learning_rate": 0.0002, "epoch": 6.885087153001937, "step": 21330}, {"loss": 0.4437, "grad_norm": 1.2652684450149536, "learning_rate": 0.0002, "epoch": 6.8883150419625565, "step": 21340}, {"loss": 0.4797, "grad_norm": 1.1906380653381348, "learning_rate": 0.0002, "epoch": 6.891542930923176, "step": 21350}, {"loss": 0.423, "grad_norm": 1.0525990724563599, "learning_rate": 0.0002, "epoch": 6.894770819883796, "step": 21360}, {"loss": 0.4414, "grad_norm": 0.910491406917572, "learning_rate": 0.0002, "epoch": 6.8979987088444155, "step": 21370}, {"loss": 0.4882, "grad_norm": 1.366865634918213, "learning_rate": 0.0002, "epoch": 6.901226597805035, "step": 21380}, {"loss": 0.4648, "grad_norm": 1.1270265579223633, "learning_rate": 0.0002, "epoch": 6.904454486765655, "step": 21390}, {"loss": 0.4529, "grad_norm": 1.1745691299438477, "learning_rate": 0.0002, "epoch": 6.907682375726275, "step": 21400}, {"loss": 0.4504, "grad_norm": 1.1036182641983032, "learning_rate": 0.0002, "epoch": 6.910910264686895, "step": 21410}, {"loss": 0.4612, "grad_norm": 1.0906540155410767, "learning_rate": 0.0002, "epoch": 6.914138153647515, "step": 21420}, {"loss": 0.4408, "grad_norm": 1.1176798343658447, "learning_rate": 0.0002, "epoch": 6.917366042608134, "step": 21430}, {"loss": 0.477, "grad_norm": 1.525869607925415, "learning_rate": 0.0002, "epoch": 6.920593931568754, "step": 21440}, {"loss": 0.4473, "grad_norm": 1.2466827630996704, "learning_rate": 0.0002, "epoch": 6.923821820529374, "step": 21450}, {"loss": 0.4256, "grad_norm": 1.0200796127319336, "learning_rate": 0.0002, "epoch": 6.927049709489993, "step": 21460}, {"loss": 0.4601, "grad_norm": 1.2133489847183228, "learning_rate": 0.0002, "epoch": 6.930277598450614, "step": 21470}, {"loss": 0.44, "grad_norm": 1.2100290060043335, "learning_rate": 0.0002, "epoch": 6.933505487411233, "step": 21480}, {"loss": 0.468, "grad_norm": 1.1833131313323975, "learning_rate": 0.0002, "epoch": 6.936733376371853, "step": 21490}, {"loss": 0.4529, "grad_norm": 1.2262470722198486, "learning_rate": 0.0002, "epoch": 6.939961265332473, "step": 21500}, {"loss": 0.4612, "grad_norm": 1.0496156215667725, "learning_rate": 0.0002, "epoch": 6.943189154293092, "step": 21510}, {"loss": 0.4417, "grad_norm": 1.050690770149231, "learning_rate": 0.0002, "epoch": 6.946417043253712, "step": 21520}, {"loss": 0.4813, "grad_norm": 1.2035698890686035, "learning_rate": 0.0002, "epoch": 6.949644932214332, "step": 21530}, {"loss": 0.4349, "grad_norm": 1.408007025718689, "learning_rate": 0.0002, "epoch": 6.952872821174951, "step": 21540}, {"loss": 0.4391, "grad_norm": 1.2247556447982788, "learning_rate": 0.0002, "epoch": 6.956100710135571, "step": 21550}, {"loss": 0.4526, "grad_norm": 1.1727497577667236, "learning_rate": 0.0002, "epoch": 6.959328599096191, "step": 21560}, {"loss": 0.4566, "grad_norm": 1.2948925495147705, "learning_rate": 0.0002, "epoch": 6.962556488056811, "step": 21570}, {"loss": 0.4672, "grad_norm": 1.3374950885772705, "learning_rate": 0.0002, "epoch": 6.965784377017431, "step": 21580}, {"loss": 0.4515, "grad_norm": 1.164650559425354, "learning_rate": 0.0002, "epoch": 6.9690122659780505, "step": 21590}, {"loss": 0.4704, "grad_norm": 1.2682108879089355, "learning_rate": 0.0002, "epoch": 6.97224015493867, "step": 21600}, {"loss": 0.4557, "grad_norm": 1.195971131324768, "learning_rate": 0.0002, "epoch": 6.97546804389929, "step": 21610}, {"loss": 0.4194, "grad_norm": 1.1988017559051514, "learning_rate": 0.0002, "epoch": 6.978695932859909, "step": 21620}, {"loss": 0.4524, "grad_norm": 1.0981930494308472, "learning_rate": 0.0002, "epoch": 6.981923821820529, "step": 21630}, {"loss": 0.4808, "grad_norm": 1.307260274887085, "learning_rate": 0.0002, "epoch": 6.98515171078115, "step": 21640}, {"loss": 0.4936, "grad_norm": 1.2798160314559937, "learning_rate": 0.0002, "epoch": 6.988379599741769, "step": 21650}, {"loss": 0.4615, "grad_norm": 1.0053848028182983, "learning_rate": 0.0002, "epoch": 6.991607488702389, "step": 21660}, {"loss": 0.4496, "grad_norm": 1.2257840633392334, "learning_rate": 0.0002, "epoch": 6.994835377663009, "step": 21670}, {"loss": 0.4449, "grad_norm": 1.3769378662109375, "learning_rate": 0.0002, "epoch": 6.998063266623628, "step": 21680}, {"eval_loss": 1.3414524793624878, "eval_runtime": 162.0091, "eval_samples_per_second": 4.524, "eval_steps_per_second": 0.568, "epoch": 7.0, "step": 21686}, {"loss": 0.4148, "grad_norm": 0.834328830242157, "learning_rate": 0.0002, "epoch": 7.001291155584248, "step": 21690}, {"loss": 0.3444, "grad_norm": 1.0984957218170166, "learning_rate": 0.0002, "epoch": 7.0045190445448675, "step": 21700}, {"loss": 0.3456, "grad_norm": 1.0821330547332764, "learning_rate": 0.0002, "epoch": 7.007746933505487, "step": 21710}, {"loss": 0.3698, "grad_norm": 1.1686056852340698, "learning_rate": 0.0002, "epoch": 7.010974822466107, "step": 21720}, {"loss": 0.3425, "grad_norm": 1.0800853967666626, "learning_rate": 0.0002, "epoch": 7.014202711426727, "step": 21730}, {"loss": 0.3518, "grad_norm": 1.0158464908599854, "learning_rate": 0.0002, "epoch": 7.017430600387347, "step": 21740}, {"loss": 0.3388, "grad_norm": 1.1526305675506592, "learning_rate": 0.0002, "epoch": 7.020658489347967, "step": 21750}, {"loss": 0.3549, "grad_norm": 0.9431301951408386, "learning_rate": 0.0002, "epoch": 7.023886378308586, "step": 21760}, {"loss": 0.3756, "grad_norm": 1.2625824213027954, "learning_rate": 0.0002, "epoch": 7.027114267269206, "step": 21770}, {"loss": 0.3513, "grad_norm": 1.2469223737716675, "learning_rate": 0.0002, "epoch": 7.030342156229826, "step": 21780}, {"loss": 0.3756, "grad_norm": 1.0981431007385254, "learning_rate": 0.0002, "epoch": 7.033570045190445, "step": 21790}, {"loss": 0.3543, "grad_norm": 1.147852897644043, "learning_rate": 0.0002, "epoch": 7.036797934151065, "step": 21800}, {"loss": 0.3706, "grad_norm": 1.368754506111145, "learning_rate": 0.0002, "epoch": 7.040025823111685, "step": 21810}, {"loss": 0.3446, "grad_norm": 0.7324210405349731, "learning_rate": 0.0002, "epoch": 7.043253712072305, "step": 21820}, {"loss": 0.3493, "grad_norm": 1.264591932296753, "learning_rate": 0.0002, "epoch": 7.046481601032925, "step": 21830}, {"loss": 0.3368, "grad_norm": 1.080914855003357, "learning_rate": 0.0002, "epoch": 7.049709489993544, "step": 21840}, {"loss": 0.3676, "grad_norm": 0.8814678192138672, "learning_rate": 0.0002, "epoch": 7.052937378954164, "step": 21850}, {"loss": 0.3537, "grad_norm": 1.0538815259933472, "learning_rate": 0.0002, "epoch": 7.056165267914784, "step": 21860}, {"loss": 0.3436, "grad_norm": 1.0479655265808105, "learning_rate": 0.0002, "epoch": 7.059393156875403, "step": 21870}, {"loss": 0.3482, "grad_norm": 1.260636329650879, "learning_rate": 0.0002, "epoch": 7.062621045836023, "step": 21880}, {"loss": 0.3442, "grad_norm": 1.0623047351837158, "learning_rate": 0.0002, "epoch": 7.065848934796643, "step": 21890}, {"loss": 0.3841, "grad_norm": 1.083094835281372, "learning_rate": 0.0002, "epoch": 7.069076823757262, "step": 21900}, {"loss": 0.3517, "grad_norm": 1.1972185373306274, "learning_rate": 0.0002, "epoch": 7.072304712717883, "step": 21910}, {"loss": 0.3642, "grad_norm": 1.217283844947815, "learning_rate": 0.0002, "epoch": 7.0755326016785025, "step": 21920}, {"loss": 0.3709, "grad_norm": 1.7448943853378296, "learning_rate": 0.0002, "epoch": 7.078760490639122, "step": 21930}, {"loss": 0.3705, "grad_norm": 0.7799133062362671, "learning_rate": 0.0002, "epoch": 7.081988379599742, "step": 21940}, {"loss": 0.3658, "grad_norm": 1.0691521167755127, "learning_rate": 0.0002, "epoch": 7.0852162685603615, "step": 21950}, {"loss": 0.3879, "grad_norm": 1.4790667295455933, "learning_rate": 0.0002, "epoch": 7.088444157520981, "step": 21960}, {"loss": 0.3432, "grad_norm": 1.0977898836135864, "learning_rate": 0.0002, "epoch": 7.091672046481601, "step": 21970}, {"loss": 0.3636, "grad_norm": 2.204333543777466, "learning_rate": 0.0002, "epoch": 7.09489993544222, "step": 21980}, {"loss": 0.3561, "grad_norm": 1.1866867542266846, "learning_rate": 0.0002, "epoch": 7.098127824402841, "step": 21990}, {"loss": 0.3678, "grad_norm": 1.2251238822937012, "learning_rate": 0.0002, "epoch": 7.101355713363461, "step": 22000}, {"loss": 0.3819, "grad_norm": 1.1271567344665527, "learning_rate": 0.0002, "epoch": 7.10458360232408, "step": 22010}, {"loss": 0.3434, "grad_norm": 0.8748073577880859, "learning_rate": 0.0002, "epoch": 7.1078114912847, "step": 22020}, {"loss": 0.3628, "grad_norm": 1.1254602670669556, "learning_rate": 0.0002, "epoch": 7.11103938024532, "step": 22030}, {"loss": 0.3604, "grad_norm": 1.2542496919631958, "learning_rate": 0.0002, "epoch": 7.114267269205939, "step": 22040}, {"loss": 0.3761, "grad_norm": 1.059043526649475, "learning_rate": 0.0002, "epoch": 7.117495158166559, "step": 22050}, {"loss": 0.3717, "grad_norm": 1.054980993270874, "learning_rate": 0.0002, "epoch": 7.1207230471271785, "step": 22060}, {"loss": 0.3849, "grad_norm": 1.5040231943130493, "learning_rate": 0.0002, "epoch": 7.123950936087798, "step": 22070}, {"loss": 0.387, "grad_norm": 1.089801549911499, "learning_rate": 0.0002, "epoch": 7.127178825048419, "step": 22080}, {"loss": 0.3474, "grad_norm": 0.8638873100280762, "learning_rate": 0.0002, "epoch": 7.130406714009038, "step": 22090}, {"loss": 0.3738, "grad_norm": 1.0746978521347046, "learning_rate": 0.0002, "epoch": 7.133634602969658, "step": 22100}, {"loss": 0.3605, "grad_norm": 0.875741720199585, "learning_rate": 0.0002, "epoch": 7.136862491930278, "step": 22110}, {"loss": 0.3465, "grad_norm": 1.0179301500320435, "learning_rate": 0.0002, "epoch": 7.140090380890897, "step": 22120}, {"loss": 0.3779, "grad_norm": 1.18764066696167, "learning_rate": 0.0002, "epoch": 7.143318269851517, "step": 22130}, {"loss": 0.392, "grad_norm": 1.1021716594696045, "learning_rate": 0.0002, "epoch": 7.146546158812137, "step": 22140}, {"loss": 0.3763, "grad_norm": 1.3633701801300049, "learning_rate": 0.0002, "epoch": 7.149774047772756, "step": 22150}, {"loss": 0.3595, "grad_norm": 1.124321460723877, "learning_rate": 0.0002, "epoch": 7.153001936733377, "step": 22160}, {"loss": 0.3668, "grad_norm": 1.1838600635528564, "learning_rate": 0.0002, "epoch": 7.1562298256939965, "step": 22170}, {"loss": 0.367, "grad_norm": 1.1565297842025757, "learning_rate": 0.0002, "epoch": 7.159457714654616, "step": 22180}, {"loss": 0.3728, "grad_norm": 1.1444414854049683, "learning_rate": 0.0002, "epoch": 7.162685603615236, "step": 22190}, {"loss": 0.3679, "grad_norm": 1.4376155138015747, "learning_rate": 0.0002, "epoch": 7.165913492575855, "step": 22200}, {"loss": 0.4011, "grad_norm": 1.154999852180481, "learning_rate": 0.0002, "epoch": 7.169141381536475, "step": 22210}, {"loss": 0.3719, "grad_norm": 1.2167491912841797, "learning_rate": 0.0002, "epoch": 7.172369270497095, "step": 22220}, {"loss": 0.398, "grad_norm": 1.1870360374450684, "learning_rate": 0.0002, "epoch": 7.175597159457714, "step": 22230}, {"loss": 0.3751, "grad_norm": 1.269687294960022, "learning_rate": 0.0002, "epoch": 7.178825048418334, "step": 22240}, {"loss": 0.3468, "grad_norm": 1.2174620628356934, "learning_rate": 0.0002, "epoch": 7.182052937378955, "step": 22250}, {"loss": 0.3833, "grad_norm": 0.8996151685714722, "learning_rate": 0.0002, "epoch": 7.185280826339574, "step": 22260}, {"loss": 0.3802, "grad_norm": 1.1364930868148804, "learning_rate": 0.0002, "epoch": 7.188508715300194, "step": 22270}, {"loss": 0.3655, "grad_norm": 1.2437993288040161, "learning_rate": 0.0002, "epoch": 7.1917366042608135, "step": 22280}, {"loss": 0.3633, "grad_norm": 1.3526612520217896, "learning_rate": 0.0002, "epoch": 7.194964493221433, "step": 22290}, {"loss": 0.3644, "grad_norm": 0.9819979071617126, "learning_rate": 0.0002, "epoch": 7.198192382182053, "step": 22300}, {"loss": 0.3532, "grad_norm": 1.3902596235275269, "learning_rate": 0.0002, "epoch": 7.2014202711426725, "step": 22310}, {"loss": 0.3896, "grad_norm": 1.2565160989761353, "learning_rate": 0.0002, "epoch": 7.204648160103292, "step": 22320}, {"loss": 0.3535, "grad_norm": 1.2485729455947876, "learning_rate": 0.0002, "epoch": 7.207876049063912, "step": 22330}, {"loss": 0.3615, "grad_norm": 1.1691182851791382, "learning_rate": 0.0002, "epoch": 7.211103938024532, "step": 22340}, {"loss": 0.3543, "grad_norm": 1.0192445516586304, "learning_rate": 0.0002, "epoch": 7.214331826985152, "step": 22350}, {"loss": 0.3774, "grad_norm": 1.2632675170898438, "learning_rate": 0.0002, "epoch": 7.217559715945772, "step": 22360}, {"loss": 0.3668, "grad_norm": 1.4515255689620972, "learning_rate": 0.0002, "epoch": 7.220787604906391, "step": 22370}, {"loss": 0.3605, "grad_norm": 1.3013306856155396, "learning_rate": 0.0002, "epoch": 7.224015493867011, "step": 22380}, {"loss": 0.4009, "grad_norm": 0.9696382284164429, "learning_rate": 0.0002, "epoch": 7.227243382827631, "step": 22390}, {"loss": 0.3669, "grad_norm": 1.2517571449279785, "learning_rate": 0.0002, "epoch": 7.23047127178825, "step": 22400}, {"loss": 0.385, "grad_norm": 1.275736689567566, "learning_rate": 0.0002, "epoch": 7.23369916074887, "step": 22410}, {"loss": 0.3859, "grad_norm": 1.2981343269348145, "learning_rate": 0.0002, "epoch": 7.23692704970949, "step": 22420}, {"loss": 0.3617, "grad_norm": 1.1113612651824951, "learning_rate": 0.0002, "epoch": 7.24015493867011, "step": 22430}, {"loss": 0.3681, "grad_norm": 1.1843012571334839, "learning_rate": 0.0002, "epoch": 7.24338282763073, "step": 22440}, {"loss": 0.4053, "grad_norm": 1.2983063459396362, "learning_rate": 0.0002, "epoch": 7.246610716591349, "step": 22450}, {"loss": 0.3691, "grad_norm": 1.1059116125106812, "learning_rate": 0.0002, "epoch": 7.249838605551969, "step": 22460}, {"loss": 0.4125, "grad_norm": 1.4968358278274536, "learning_rate": 0.0002, "epoch": 7.253066494512589, "step": 22470}, {"loss": 0.3664, "grad_norm": 1.2906030416488647, "learning_rate": 0.0002, "epoch": 7.256294383473208, "step": 22480}, {"loss": 0.3896, "grad_norm": 1.1108627319335938, "learning_rate": 0.0002, "epoch": 7.259522272433828, "step": 22490}, {"loss": 0.3779, "grad_norm": 0.9844270348548889, "learning_rate": 0.0002, "epoch": 7.262750161394448, "step": 22500}, {"loss": 0.3779, "grad_norm": 1.0623210668563843, "learning_rate": 0.0002, "epoch": 7.265978050355068, "step": 22510}, {"loss": 0.3862, "grad_norm": 1.2726962566375732, "learning_rate": 0.0002, "epoch": 7.269205939315688, "step": 22520}, {"loss": 0.3889, "grad_norm": 1.1712630987167358, "learning_rate": 0.0002, "epoch": 7.2724338282763075, "step": 22530}, {"loss": 0.3744, "grad_norm": 1.0604515075683594, "learning_rate": 0.0002, "epoch": 7.275661717236927, "step": 22540}, {"loss": 0.3878, "grad_norm": 1.1781001091003418, "learning_rate": 0.0002, "epoch": 7.278889606197547, "step": 22550}, {"loss": 0.3806, "grad_norm": 1.2568641901016235, "learning_rate": 0.0002, "epoch": 7.282117495158166, "step": 22560}, {"loss": 0.4032, "grad_norm": 1.2375072240829468, "learning_rate": 0.0002, "epoch": 7.285345384118786, "step": 22570}, {"loss": 0.41, "grad_norm": 1.2701354026794434, "learning_rate": 0.0002, "epoch": 7.288573273079406, "step": 22580}, {"loss": 0.3716, "grad_norm": 1.2957371473312378, "learning_rate": 0.0002, "epoch": 7.291801162040025, "step": 22590}, {"loss": 0.3564, "grad_norm": 1.1555131673812866, "learning_rate": 0.0002, "epoch": 7.295029051000646, "step": 22600}, {"loss": 0.3887, "grad_norm": 1.1809004545211792, "learning_rate": 0.0002, "epoch": 7.298256939961266, "step": 22610}, {"loss": 0.3521, "grad_norm": 1.156985878944397, "learning_rate": 0.0002, "epoch": 7.301484828921885, "step": 22620}, {"loss": 0.3648, "grad_norm": 1.3241633176803589, "learning_rate": 0.0002, "epoch": 7.304712717882505, "step": 22630}, {"loss": 0.4075, "grad_norm": 1.3285194635391235, "learning_rate": 0.0002, "epoch": 7.3079406068431245, "step": 22640}, {"loss": 0.3802, "grad_norm": 1.0388010740280151, "learning_rate": 0.0002, "epoch": 7.311168495803744, "step": 22650}, {"loss": 0.3895, "grad_norm": 1.1035511493682861, "learning_rate": 0.0002, "epoch": 7.314396384764364, "step": 22660}, {"loss": 0.3607, "grad_norm": 1.1168203353881836, "learning_rate": 0.0002, "epoch": 7.3176242737249835, "step": 22670}, {"loss": 0.3785, "grad_norm": 1.0566749572753906, "learning_rate": 0.0002, "epoch": 7.320852162685604, "step": 22680}, {"loss": 0.3833, "grad_norm": 1.0538207292556763, "learning_rate": 0.0002, "epoch": 7.324080051646224, "step": 22690}, {"loss": 0.3691, "grad_norm": 1.0754560232162476, "learning_rate": 0.0002, "epoch": 7.327307940606843, "step": 22700}, {"loss": 0.3503, "grad_norm": 1.036759614944458, "learning_rate": 0.0002, "epoch": 7.330535829567463, "step": 22710}, {"loss": 0.3821, "grad_norm": 1.1662222146987915, "learning_rate": 0.0002, "epoch": 7.333763718528083, "step": 22720}, {"loss": 0.376, "grad_norm": 1.1255900859832764, "learning_rate": 0.0002, "epoch": 7.336991607488702, "step": 22730}, {"loss": 0.4036, "grad_norm": 1.4802581071853638, "learning_rate": 0.0002, "epoch": 7.340219496449322, "step": 22740}, {"loss": 0.3889, "grad_norm": 1.1963917016983032, "learning_rate": 0.0002, "epoch": 7.343447385409942, "step": 22750}, {"loss": 0.3732, "grad_norm": 1.0769098997116089, "learning_rate": 0.0002, "epoch": 7.346675274370561, "step": 22760}, {"loss": 0.3914, "grad_norm": 1.5818109512329102, "learning_rate": 0.0002, "epoch": 7.349903163331182, "step": 22770}, {"loss": 0.3577, "grad_norm": 1.5089726448059082, "learning_rate": 0.0002, "epoch": 7.353131052291801, "step": 22780}, {"loss": 0.3788, "grad_norm": 1.0024120807647705, "learning_rate": 0.0002, "epoch": 7.356358941252421, "step": 22790}, {"loss": 0.3867, "grad_norm": 1.2956844568252563, "learning_rate": 0.0002, "epoch": 7.359586830213041, "step": 22800}, {"loss": 0.3612, "grad_norm": 1.0113978385925293, "learning_rate": 0.0002, "epoch": 7.36281471917366, "step": 22810}, {"loss": 0.3548, "grad_norm": 1.4180196523666382, "learning_rate": 0.0002, "epoch": 7.36604260813428, "step": 22820}, {"loss": 0.3817, "grad_norm": 0.9611803293228149, "learning_rate": 0.0002, "epoch": 7.3692704970949, "step": 22830}, {"loss": 0.3755, "grad_norm": 1.2668812274932861, "learning_rate": 0.0002, "epoch": 7.372498386055519, "step": 22840}, {"loss": 0.4001, "grad_norm": 1.2809178829193115, "learning_rate": 0.0002, "epoch": 7.37572627501614, "step": 22850}, {"loss": 0.3859, "grad_norm": 1.4618953466415405, "learning_rate": 0.0002, "epoch": 7.3789541639767595, "step": 22860}, {"loss": 0.3796, "grad_norm": 1.0964281558990479, "learning_rate": 0.0002, "epoch": 7.382182052937379, "step": 22870}, {"loss": 0.369, "grad_norm": 1.2329200506210327, "learning_rate": 0.0002, "epoch": 7.385409941897999, "step": 22880}, {"loss": 0.3762, "grad_norm": 1.0750329494476318, "learning_rate": 0.0002, "epoch": 7.3886378308586185, "step": 22890}, {"loss": 0.3762, "grad_norm": 0.9547448754310608, "learning_rate": 0.0002, "epoch": 7.391865719819238, "step": 22900}, {"loss": 0.3741, "grad_norm": 1.146202802658081, "learning_rate": 0.0002, "epoch": 7.395093608779858, "step": 22910}, {"loss": 0.4, "grad_norm": 1.1540607213974, "learning_rate": 0.0002, "epoch": 7.398321497740477, "step": 22920}, {"loss": 0.3714, "grad_norm": 1.1683391332626343, "learning_rate": 0.0002, "epoch": 7.401549386701097, "step": 22930}, {"loss": 0.3786, "grad_norm": 1.2653683423995972, "learning_rate": 0.0002, "epoch": 7.404777275661718, "step": 22940}, {"loss": 0.3835, "grad_norm": 1.1355576515197754, "learning_rate": 0.0002, "epoch": 7.408005164622337, "step": 22950}, {"loss": 0.3958, "grad_norm": 1.2306767702102661, "learning_rate": 0.0002, "epoch": 7.411233053582957, "step": 22960}, {"loss": 0.3752, "grad_norm": 1.2526071071624756, "learning_rate": 0.0002, "epoch": 7.414460942543577, "step": 22970}, {"loss": 0.3931, "grad_norm": 1.3868485689163208, "learning_rate": 0.0002, "epoch": 7.417688831504196, "step": 22980}, {"loss": 0.3899, "grad_norm": 1.257453203201294, "learning_rate": 0.0002, "epoch": 7.420916720464816, "step": 22990}, {"loss": 0.3758, "grad_norm": 1.1610639095306396, "learning_rate": 0.0002, "epoch": 7.4241446094254355, "step": 23000}, {"loss": 0.3639, "grad_norm": 1.3744033575057983, "learning_rate": 0.0002, "epoch": 7.427372498386055, "step": 23010}, {"loss": 0.3885, "grad_norm": 1.0811532735824585, "learning_rate": 0.0002, "epoch": 7.430600387346676, "step": 23020}, {"loss": 0.3914, "grad_norm": 1.170789122581482, "learning_rate": 0.0002, "epoch": 7.433828276307295, "step": 23030}, {"loss": 0.4192, "grad_norm": 1.2688828706741333, "learning_rate": 0.0002, "epoch": 7.437056165267915, "step": 23040}, {"loss": 0.3859, "grad_norm": 1.1140133142471313, "learning_rate": 0.0002, "epoch": 7.440284054228535, "step": 23050}, {"loss": 0.3856, "grad_norm": 1.525015950202942, "learning_rate": 0.0002, "epoch": 7.443511943189154, "step": 23060}, {"loss": 0.3775, "grad_norm": 1.120497226715088, "learning_rate": 0.0002, "epoch": 7.446739832149774, "step": 23070}, {"loss": 0.3917, "grad_norm": 1.298614740371704, "learning_rate": 0.0002, "epoch": 7.449967721110394, "step": 23080}, {"loss": 0.3662, "grad_norm": 1.096987247467041, "learning_rate": 0.0002, "epoch": 7.453195610071013, "step": 23090}, {"loss": 0.3898, "grad_norm": 1.2544305324554443, "learning_rate": 0.0002, "epoch": 7.456423499031633, "step": 23100}, {"loss": 0.4021, "grad_norm": 1.4809341430664062, "learning_rate": 0.0002, "epoch": 7.4596513879922535, "step": 23110}, {"loss": 0.3775, "grad_norm": 0.9224157333374023, "learning_rate": 0.0002, "epoch": 7.462879276952873, "step": 23120}, {"loss": 0.3644, "grad_norm": 1.4894850254058838, "learning_rate": 0.0002, "epoch": 7.466107165913493, "step": 23130}, {"loss": 0.3804, "grad_norm": 1.1947047710418701, "learning_rate": 0.0002, "epoch": 7.469335054874112, "step": 23140}, {"loss": 0.3843, "grad_norm": 1.5348929166793823, "learning_rate": 0.0002, "epoch": 7.472562943834732, "step": 23150}, {"loss": 0.3941, "grad_norm": 1.0486136674880981, "learning_rate": 0.0002, "epoch": 7.475790832795352, "step": 23160}, {"loss": 0.3935, "grad_norm": 1.6460468769073486, "learning_rate": 0.0002, "epoch": 7.479018721755971, "step": 23170}, {"loss": 0.3755, "grad_norm": 0.9416976571083069, "learning_rate": 0.0002, "epoch": 7.482246610716591, "step": 23180}, {"loss": 0.4044, "grad_norm": 1.3972517251968384, "learning_rate": 0.0002, "epoch": 7.485474499677212, "step": 23190}, {"loss": 0.3869, "grad_norm": 1.3033207654953003, "learning_rate": 0.0002, "epoch": 7.488702388637831, "step": 23200}, {"loss": 0.3896, "grad_norm": 1.1479045152664185, "learning_rate": 0.0002, "epoch": 7.491930277598451, "step": 23210}, {"loss": 0.3746, "grad_norm": 1.108995795249939, "learning_rate": 0.0002, "epoch": 7.4951581665590705, "step": 23220}, {"loss": 0.3802, "grad_norm": 1.2081542015075684, "learning_rate": 0.0002, "epoch": 7.49838605551969, "step": 23230}, {"loss": 0.3782, "grad_norm": 1.227265477180481, "learning_rate": 0.0002, "epoch": 7.50161394448031, "step": 23240}, {"loss": 0.3999, "grad_norm": 1.3606903553009033, "learning_rate": 0.0002, "epoch": 7.5048418334409295, "step": 23250}, {"loss": 0.3845, "grad_norm": 1.4457145929336548, "learning_rate": 0.0002, "epoch": 7.508069722401549, "step": 23260}, {"loss": 0.3809, "grad_norm": 1.071205496788025, "learning_rate": 0.0002, "epoch": 7.511297611362169, "step": 23270}, {"loss": 0.3707, "grad_norm": 1.0113176107406616, "learning_rate": 0.0002, "epoch": 7.514525500322788, "step": 23280}, {"loss": 0.3815, "grad_norm": 1.2792452573776245, "learning_rate": 0.0002, "epoch": 7.517753389283409, "step": 23290}, {"loss": 0.3945, "grad_norm": 1.16257643699646, "learning_rate": 0.0002, "epoch": 7.520981278244029, "step": 23300}, {"loss": 0.4063, "grad_norm": 1.4449529647827148, "learning_rate": 0.0002, "epoch": 7.524209167204648, "step": 23310}, {"loss": 0.3693, "grad_norm": 1.0467441082000732, "learning_rate": 0.0002, "epoch": 7.527437056165268, "step": 23320}, {"loss": 0.3925, "grad_norm": 1.2062382698059082, "learning_rate": 0.0002, "epoch": 7.530664945125888, "step": 23330}, {"loss": 0.404, "grad_norm": 1.3828591108322144, "learning_rate": 0.0002, "epoch": 7.533892834086507, "step": 23340}, {"loss": 0.3694, "grad_norm": 1.1746373176574707, "learning_rate": 0.0002, "epoch": 7.537120723047127, "step": 23350}, {"loss": 0.3803, "grad_norm": 1.1252634525299072, "learning_rate": 0.0002, "epoch": 7.540348612007747, "step": 23360}, {"loss": 0.3979, "grad_norm": 1.1146548986434937, "learning_rate": 0.0002, "epoch": 7.543576500968367, "step": 23370}, {"loss": 0.4093, "grad_norm": 1.2049988508224487, "learning_rate": 0.0002, "epoch": 7.546804389928987, "step": 23380}, {"loss": 0.419, "grad_norm": 1.211979866027832, "learning_rate": 0.0002, "epoch": 7.550032278889606, "step": 23390}, {"loss": 0.3793, "grad_norm": 1.1158992052078247, "learning_rate": 0.0002, "epoch": 7.553260167850226, "step": 23400}, {"loss": 0.3748, "grad_norm": 1.0987670421600342, "learning_rate": 0.0002, "epoch": 7.556488056810846, "step": 23410}, {"loss": 0.3835, "grad_norm": 1.2179386615753174, "learning_rate": 0.0002, "epoch": 7.559715945771465, "step": 23420}, {"loss": 0.3934, "grad_norm": 1.2416619062423706, "learning_rate": 0.0002, "epoch": 7.562943834732085, "step": 23430}, {"loss": 0.3951, "grad_norm": 0.7858901023864746, "learning_rate": 0.0002, "epoch": 7.566171723692705, "step": 23440}, {"loss": 0.3938, "grad_norm": 1.4219504594802856, "learning_rate": 0.0002, "epoch": 7.569399612653324, "step": 23450}, {"loss": 0.3811, "grad_norm": 0.9971513152122498, "learning_rate": 0.0002, "epoch": 7.572627501613945, "step": 23460}, {"loss": 0.3846, "grad_norm": 1.2463445663452148, "learning_rate": 0.0002, "epoch": 7.5758553905745645, "step": 23470}, {"loss": 0.391, "grad_norm": 0.9103072881698608, "learning_rate": 0.0002, "epoch": 7.579083279535184, "step": 23480}, {"loss": 0.4219, "grad_norm": 1.296644687652588, "learning_rate": 0.0002, "epoch": 7.582311168495804, "step": 23490}, {"loss": 0.4191, "grad_norm": 1.2630009651184082, "learning_rate": 0.0002, "epoch": 7.585539057456423, "step": 23500}, {"loss": 0.3822, "grad_norm": 1.1580113172531128, "learning_rate": 0.0002, "epoch": 7.588766946417043, "step": 23510}, {"loss": 0.4366, "grad_norm": 1.3033956289291382, "learning_rate": 0.0002, "epoch": 7.591994835377663, "step": 23520}, {"loss": 0.3951, "grad_norm": 1.1394670009613037, "learning_rate": 0.0002, "epoch": 7.595222724338282, "step": 23530}, {"loss": 0.379, "grad_norm": 1.1448818445205688, "learning_rate": 0.0002, "epoch": 7.598450613298903, "step": 23540}, {"loss": 0.3967, "grad_norm": 1.3899340629577637, "learning_rate": 0.0002, "epoch": 7.601678502259523, "step": 23550}, {"loss": 0.3844, "grad_norm": 1.2759299278259277, "learning_rate": 0.0002, "epoch": 7.604906391220142, "step": 23560}, {"loss": 0.4017, "grad_norm": 1.0882219076156616, "learning_rate": 0.0002, "epoch": 7.608134280180762, "step": 23570}, {"loss": 0.3926, "grad_norm": 1.189413070678711, "learning_rate": 0.0002, "epoch": 7.6113621691413815, "step": 23580}, {"loss": 0.41, "grad_norm": 1.1257762908935547, "learning_rate": 0.0002, "epoch": 7.614590058102001, "step": 23590}, {"loss": 0.4264, "grad_norm": 1.2915645837783813, "learning_rate": 0.0002, "epoch": 7.617817947062621, "step": 23600}, {"loss": 0.401, "grad_norm": 1.3340779542922974, "learning_rate": 0.0002, "epoch": 7.6210458360232405, "step": 23610}, {"loss": 0.4148, "grad_norm": 1.3149892091751099, "learning_rate": 0.0002, "epoch": 7.62427372498386, "step": 23620}, {"loss": 0.3946, "grad_norm": 1.4316612482070923, "learning_rate": 0.0002, "epoch": 7.627501613944481, "step": 23630}, {"loss": 0.3893, "grad_norm": 1.024850606918335, "learning_rate": 0.0002, "epoch": 7.6307295029051, "step": 23640}, {"loss": 0.4275, "grad_norm": 1.193853735923767, "learning_rate": 0.0002, "epoch": 7.63395739186572, "step": 23650}, {"loss": 0.4064, "grad_norm": 1.1436676979064941, "learning_rate": 0.0002, "epoch": 7.63718528082634, "step": 23660}, {"loss": 0.4051, "grad_norm": 1.231313705444336, "learning_rate": 0.0002, "epoch": 7.640413169786959, "step": 23670}, {"loss": 0.4088, "grad_norm": 1.370025634765625, "learning_rate": 0.0002, "epoch": 7.643641058747579, "step": 23680}, {"loss": 0.3881, "grad_norm": 1.4087916612625122, "learning_rate": 0.0002, "epoch": 7.646868947708199, "step": 23690}, {"loss": 0.3767, "grad_norm": 1.143715500831604, "learning_rate": 0.0002, "epoch": 7.650096836668818, "step": 23700}, {"loss": 0.3976, "grad_norm": 1.0907450914382935, "learning_rate": 0.0002, "epoch": 7.653324725629439, "step": 23710}, {"loss": 0.423, "grad_norm": 1.1993663311004639, "learning_rate": 0.0002, "epoch": 7.656552614590058, "step": 23720}, {"loss": 0.3833, "grad_norm": 1.5836968421936035, "learning_rate": 0.0002, "epoch": 7.659780503550678, "step": 23730}, {"loss": 0.4029, "grad_norm": 1.1070377826690674, "learning_rate": 0.0002, "epoch": 7.663008392511298, "step": 23740}, {"loss": 0.3889, "grad_norm": 1.0333292484283447, "learning_rate": 0.0002, "epoch": 7.666236281471917, "step": 23750}, {"loss": 0.3862, "grad_norm": 1.293520450592041, "learning_rate": 0.0002, "epoch": 7.669464170432537, "step": 23760}, {"loss": 0.393, "grad_norm": 1.164291262626648, "learning_rate": 0.0002, "epoch": 7.672692059393157, "step": 23770}, {"loss": 0.4133, "grad_norm": 1.1913787126541138, "learning_rate": 0.0002, "epoch": 7.675919948353776, "step": 23780}, {"loss": 0.3839, "grad_norm": 0.9081819653511047, "learning_rate": 0.0002, "epoch": 7.679147837314396, "step": 23790}, {"loss": 0.449, "grad_norm": 1.2931487560272217, "learning_rate": 0.0002, "epoch": 7.6823757262750165, "step": 23800}, {"loss": 0.3958, "grad_norm": 1.2466086149215698, "learning_rate": 0.0002, "epoch": 7.685603615235636, "step": 23810}, {"loss": 0.4183, "grad_norm": 1.2980233430862427, "learning_rate": 0.0002, "epoch": 7.688831504196256, "step": 23820}, {"loss": 0.4035, "grad_norm": 1.357170581817627, "learning_rate": 0.0002, "epoch": 7.6920593931568755, "step": 23830}, {"loss": 0.385, "grad_norm": 1.0869120359420776, "learning_rate": 0.0002, "epoch": 7.695287282117495, "step": 23840}, {"loss": 0.4135, "grad_norm": 0.9358172416687012, "learning_rate": 0.0002, "epoch": 7.698515171078115, "step": 23850}, {"loss": 0.403, "grad_norm": 1.4435080289840698, "learning_rate": 0.0002, "epoch": 7.701743060038734, "step": 23860}, {"loss": 0.3964, "grad_norm": 1.0344315767288208, "learning_rate": 0.0002, "epoch": 7.704970948999354, "step": 23870}, {"loss": 0.4093, "grad_norm": 1.2128890752792358, "learning_rate": 0.0002, "epoch": 7.708198837959975, "step": 23880}, {"loss": 0.3924, "grad_norm": 1.239585280418396, "learning_rate": 0.0002, "epoch": 7.711426726920594, "step": 23890}, {"loss": 0.3966, "grad_norm": 1.1732957363128662, "learning_rate": 0.0002, "epoch": 7.714654615881214, "step": 23900}, {"loss": 0.3917, "grad_norm": 1.2434546947479248, "learning_rate": 0.0002, "epoch": 7.717882504841834, "step": 23910}, {"loss": 0.3876, "grad_norm": 1.2031792402267456, "learning_rate": 0.0002, "epoch": 7.721110393802453, "step": 23920}, {"loss": 0.3948, "grad_norm": 1.1401077508926392, "learning_rate": 0.0002, "epoch": 7.724338282763073, "step": 23930}, {"loss": 0.4178, "grad_norm": 1.3985689878463745, "learning_rate": 0.0002, "epoch": 7.7275661717236925, "step": 23940}, {"loss": 0.3933, "grad_norm": 1.3179208040237427, "learning_rate": 0.0002, "epoch": 7.730794060684312, "step": 23950}, {"loss": 0.4043, "grad_norm": 1.071332335472107, "learning_rate": 0.0002, "epoch": 7.734021949644932, "step": 23960}, {"loss": 0.4217, "grad_norm": 1.169771671295166, "learning_rate": 0.0002, "epoch": 7.7372498386055515, "step": 23970}, {"loss": 0.4149, "grad_norm": 1.2893975973129272, "learning_rate": 0.0002, "epoch": 7.740477727566172, "step": 23980}, {"loss": 0.4136, "grad_norm": 1.424354076385498, "learning_rate": 0.0002, "epoch": 7.743705616526792, "step": 23990}, {"loss": 0.403, "grad_norm": 1.3814094066619873, "learning_rate": 0.0002, "epoch": 7.746933505487411, "step": 24000}, {"loss": 0.3875, "grad_norm": 1.04098641872406, "learning_rate": 0.0002, "epoch": 7.750161394448031, "step": 24010}, {"loss": 0.3875, "grad_norm": 1.2493431568145752, "learning_rate": 0.0002, "epoch": 7.753389283408651, "step": 24020}, {"loss": 0.3948, "grad_norm": 1.20700204372406, "learning_rate": 0.0002, "epoch": 7.75661717236927, "step": 24030}, {"loss": 0.3946, "grad_norm": 1.0956356525421143, "learning_rate": 0.0002, "epoch": 7.75984506132989, "step": 24040}, {"loss": 0.4026, "grad_norm": 1.0404914617538452, "learning_rate": 0.0002, "epoch": 7.7630729502905105, "step": 24050}, {"loss": 0.4263, "grad_norm": 1.1474649906158447, "learning_rate": 0.0002, "epoch": 7.76630083925113, "step": 24060}, {"loss": 0.408, "grad_norm": 1.5770092010498047, "learning_rate": 0.0002, "epoch": 7.76952872821175, "step": 24070}, {"loss": 0.406, "grad_norm": 1.1962103843688965, "learning_rate": 0.0002, "epoch": 7.772756617172369, "step": 24080}, {"loss": 0.4168, "grad_norm": 1.2712551355361938, "learning_rate": 0.0002, "epoch": 7.775984506132989, "step": 24090}, {"loss": 0.4081, "grad_norm": 1.0740753412246704, "learning_rate": 0.0002, "epoch": 7.779212395093609, "step": 24100}, {"loss": 0.3736, "grad_norm": 1.2754921913146973, "learning_rate": 0.0002, "epoch": 7.782440284054228, "step": 24110}, {"loss": 0.4068, "grad_norm": 1.2397977113723755, "learning_rate": 0.0002, "epoch": 7.785668173014848, "step": 24120}, {"loss": 0.4099, "grad_norm": 1.6444467306137085, "learning_rate": 0.0002, "epoch": 7.788896061975468, "step": 24130}, {"loss": 0.4206, "grad_norm": 1.1543670892715454, "learning_rate": 0.0002, "epoch": 7.792123950936087, "step": 24140}, {"loss": 0.4201, "grad_norm": 1.284700870513916, "learning_rate": 0.0002, "epoch": 7.795351839896708, "step": 24150}, {"loss": 0.4014, "grad_norm": 1.3647849559783936, "learning_rate": 0.0002, "epoch": 7.7985797288573275, "step": 24160}, {"loss": 0.3868, "grad_norm": 1.3251831531524658, "learning_rate": 0.0002, "epoch": 7.801807617817947, "step": 24170}, {"loss": 0.3926, "grad_norm": 0.9937632083892822, "learning_rate": 0.0002, "epoch": 7.805035506778567, "step": 24180}, {"loss": 0.4154, "grad_norm": 1.2740001678466797, "learning_rate": 0.0002, "epoch": 7.8082633957391865, "step": 24190}, {"loss": 0.4144, "grad_norm": 1.2092649936676025, "learning_rate": 0.0002, "epoch": 7.811491284699806, "step": 24200}, {"loss": 0.4359, "grad_norm": 1.363057255744934, "learning_rate": 0.0002, "epoch": 7.814719173660426, "step": 24210}, {"loss": 0.4006, "grad_norm": 1.2452268600463867, "learning_rate": 0.0002, "epoch": 7.817947062621046, "step": 24220}, {"loss": 0.4297, "grad_norm": 1.2593066692352295, "learning_rate": 0.0002, "epoch": 7.821174951581666, "step": 24230}, {"loss": 0.4023, "grad_norm": 1.3587749004364014, "learning_rate": 0.0002, "epoch": 7.824402840542286, "step": 24240}, {"loss": 0.4152, "grad_norm": 1.2257705926895142, "learning_rate": 0.0002, "epoch": 7.827630729502905, "step": 24250}, {"loss": 0.3872, "grad_norm": 1.257444977760315, "learning_rate": 0.0002, "epoch": 7.830858618463525, "step": 24260}, {"loss": 0.3883, "grad_norm": 1.3570739030838013, "learning_rate": 0.0002, "epoch": 7.834086507424145, "step": 24270}, {"loss": 0.418, "grad_norm": 1.2873027324676514, "learning_rate": 0.0002, "epoch": 7.837314396384764, "step": 24280}, {"loss": 0.3813, "grad_norm": 1.078808069229126, "learning_rate": 0.0002, "epoch": 7.840542285345384, "step": 24290}, {"loss": 0.4167, "grad_norm": 1.409043788909912, "learning_rate": 0.0002, "epoch": 7.8437701743060035, "step": 24300}, {"loss": 0.4394, "grad_norm": 1.113909363746643, "learning_rate": 0.0002, "epoch": 7.846998063266623, "step": 24310}, {"loss": 0.4063, "grad_norm": 1.432429313659668, "learning_rate": 0.0002, "epoch": 7.850225952227244, "step": 24320}, {"loss": 0.4368, "grad_norm": 1.1753697395324707, "learning_rate": 0.0002, "epoch": 7.853453841187863, "step": 24330}, {"loss": 0.4368, "grad_norm": 1.4771350622177124, "learning_rate": 0.0002, "epoch": 7.856681730148483, "step": 24340}, {"loss": 0.432, "grad_norm": 1.0278029441833496, "learning_rate": 0.0002, "epoch": 7.859909619109103, "step": 24350}, {"loss": 0.408, "grad_norm": 1.064161777496338, "learning_rate": 0.0002, "epoch": 7.863137508069722, "step": 24360}, {"loss": 0.4023, "grad_norm": 1.4824532270431519, "learning_rate": 0.0002, "epoch": 7.866365397030342, "step": 24370}, {"loss": 0.4283, "grad_norm": 1.3403675556182861, "learning_rate": 0.0002, "epoch": 7.869593285990962, "step": 24380}, {"loss": 0.418, "grad_norm": 1.3019866943359375, "learning_rate": 0.0002, "epoch": 7.872821174951581, "step": 24390}, {"loss": 0.4295, "grad_norm": 1.3158677816390991, "learning_rate": 0.0002, "epoch": 7.876049063912202, "step": 24400}, {"loss": 0.4371, "grad_norm": 1.3224833011627197, "learning_rate": 0.0002, "epoch": 7.8792769528728215, "step": 24410}, {"loss": 0.4193, "grad_norm": 1.158711314201355, "learning_rate": 0.0002, "epoch": 7.882504841833441, "step": 24420}, {"loss": 0.3888, "grad_norm": 1.5012301206588745, "learning_rate": 0.0002, "epoch": 7.885732730794061, "step": 24430}, {"loss": 0.3872, "grad_norm": 1.0743858814239502, "learning_rate": 0.0002, "epoch": 7.88896061975468, "step": 24440}, {"loss": 0.3838, "grad_norm": 1.1748833656311035, "learning_rate": 0.0002, "epoch": 7.8921885087153, "step": 24450}, {"loss": 0.4151, "grad_norm": 1.2368545532226562, "learning_rate": 0.0002, "epoch": 7.89541639767592, "step": 24460}, {"loss": 0.4292, "grad_norm": 1.339815378189087, "learning_rate": 0.0002, "epoch": 7.898644286636539, "step": 24470}, {"loss": 0.3871, "grad_norm": 1.106711983680725, "learning_rate": 0.0002, "epoch": 7.901872175597159, "step": 24480}, {"loss": 0.4038, "grad_norm": 1.082188367843628, "learning_rate": 0.0002, "epoch": 7.90510006455778, "step": 24490}, {"loss": 0.4296, "grad_norm": 1.2585617303848267, "learning_rate": 0.0002, "epoch": 7.908327953518399, "step": 24500}, {"loss": 0.4063, "grad_norm": 1.2435230016708374, "learning_rate": 0.0002, "epoch": 7.911555842479019, "step": 24510}, {"loss": 0.4008, "grad_norm": 1.6732012033462524, "learning_rate": 0.0002, "epoch": 7.9147837314396385, "step": 24520}, {"loss": 0.392, "grad_norm": 1.1985243558883667, "learning_rate": 0.0002, "epoch": 7.918011620400258, "step": 24530}, {"loss": 0.3927, "grad_norm": 1.255313515663147, "learning_rate": 0.0002, "epoch": 7.921239509360878, "step": 24540}, {"loss": 0.4229, "grad_norm": 1.2786425352096558, "learning_rate": 0.0002, "epoch": 7.9244673983214975, "step": 24550}, {"loss": 0.4087, "grad_norm": 1.1514666080474854, "learning_rate": 0.0002, "epoch": 7.927695287282117, "step": 24560}, {"loss": 0.4315, "grad_norm": 1.3536173105239868, "learning_rate": 0.0002, "epoch": 7.930923176242738, "step": 24570}, {"loss": 0.4172, "grad_norm": 1.3156218528747559, "learning_rate": 0.0002, "epoch": 7.934151065203357, "step": 24580}, {"loss": 0.4088, "grad_norm": 1.465572476387024, "learning_rate": 0.0002, "epoch": 7.937378954163977, "step": 24590}, {"loss": 0.4161, "grad_norm": 1.0745478868484497, "learning_rate": 0.0002, "epoch": 7.940606843124597, "step": 24600}, {"loss": 0.4084, "grad_norm": 1.2898974418640137, "learning_rate": 0.0002, "epoch": 7.943834732085216, "step": 24610}, {"loss": 0.4066, "grad_norm": 0.9425821900367737, "learning_rate": 0.0002, "epoch": 7.947062621045836, "step": 24620}, {"loss": 0.4281, "grad_norm": 1.238996148109436, "learning_rate": 0.0002, "epoch": 7.950290510006456, "step": 24630}, {"loss": 0.4093, "grad_norm": 1.5326380729675293, "learning_rate": 0.0002, "epoch": 7.953518398967075, "step": 24640}, {"loss": 0.3992, "grad_norm": 0.8708599209785461, "learning_rate": 0.0002, "epoch": 7.956746287927695, "step": 24650}, {"loss": 0.4215, "grad_norm": 1.45661461353302, "learning_rate": 0.0002, "epoch": 7.9599741768883145, "step": 24660}, {"loss": 0.404, "grad_norm": 1.204917073249817, "learning_rate": 0.0002, "epoch": 7.963202065848935, "step": 24670}, {"loss": 0.4095, "grad_norm": 1.2509328126907349, "learning_rate": 0.0002, "epoch": 7.966429954809555, "step": 24680}, {"loss": 0.4102, "grad_norm": 1.3137809038162231, "learning_rate": 0.0002, "epoch": 7.969657843770174, "step": 24690}, {"loss": 0.416, "grad_norm": 1.0418064594268799, "learning_rate": 0.0002, "epoch": 7.972885732730794, "step": 24700}, {"loss": 0.423, "grad_norm": 1.4729000329971313, "learning_rate": 0.0002, "epoch": 7.976113621691414, "step": 24710}, {"loss": 0.4104, "grad_norm": 1.1795575618743896, "learning_rate": 0.0002, "epoch": 7.979341510652033, "step": 24720}, {"loss": 0.39, "grad_norm": 1.7517948150634766, "learning_rate": 0.0002, "epoch": 7.982569399612653, "step": 24730}, {"loss": 0.4214, "grad_norm": 1.0974000692367554, "learning_rate": 0.0002, "epoch": 7.9857972885732735, "step": 24740}, {"loss": 0.4426, "grad_norm": 1.1564710140228271, "learning_rate": 0.0002, "epoch": 7.989025177533893, "step": 24750}, {"loss": 0.4022, "grad_norm": 1.1639856100082397, "learning_rate": 0.0002, "epoch": 7.992253066494513, "step": 24760}, {"loss": 0.4392, "grad_norm": 1.2776424884796143, "learning_rate": 0.0002, "epoch": 7.9954809554551325, "step": 24770}, {"loss": 0.4118, "grad_norm": 1.084326148033142, "learning_rate": 0.0002, "epoch": 7.998708844415752, "step": 24780}]}