diff --git a/.gitattributes b/.gitattributes index 9e61fa067cc885befa2a18a893ef8fae961d89f4..b4eec751814281bcf34c3171f4196f745470dc74 100644 --- a/.gitattributes +++ b/.gitattributes @@ -4721,3 +4721,12 @@ Qwen2-7B-Instruct_int4_flare-en-fpb_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_ Qwen2-7B-Instruct_int4_flare-en-fpb_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-3100-sd-10000/checkpoint-762/tokenizer.json filter=lfs diff=lfs merge=lfs -text Qwen2-7B-Instruct_int4_flare-en-fpb_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-3100-sd-10000/checkpoint-952/tokenizer.json filter=lfs diff=lfs merge=lfs -text Qwen2-7B-Instruct_int4_flare-en-fpb_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-3100-sd-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..894306c21591f0f9fc1ad6edef9d7f95864e6e88 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6e6524f0f0d768c7962609d66799cf291b435521c1246c8b6c63ea99e47dce1 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7f7fb8974bbe9e3ded294ba9f6549ebe451d4e28 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f0dffcebfe38a83d54dd012bc62f82ee7d2b7b9157304b819748919869de6f4 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..422b84716e6e5590c9161a0d19a972ea120a6407 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6508de495fd367d7e871fa8e4fc75836e030f326d435064d21860c3a88186250 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..873378f7f032804548ed4fb5cb640548d2d63d40 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3200755ef5e97483bb817f7cf48836d884fb26788ef30ca0156da782ef1dc412 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5e3d28beeab0e8c386cfeb3f8e978d2b211dbdf3 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23ff4515f17a8f43b8ac627a744199be2a7a561be94af3e0dbcf8fc48ffce02f +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bc5c9e545f9a414ef2d71332375f729465530003 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/trainer_state.json @@ -0,0 +1,8708 @@ +{ + "best_metric": 1.0871200561523438, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 12371, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016166841807452913, + "grad_norm": 0.9894065856933594, + "learning_rate": 0.0002, + "loss": 1.6636, + "step": 10 + }, + { + "epoch": 0.0032333683614905826, + "grad_norm": 1.7810699939727783, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 20 + }, + { + "epoch": 0.004850052542235874, + "grad_norm": 0.5969577431678772, + "learning_rate": 0.0002, + "loss": 0.9767, + "step": 30 + }, + { + "epoch": 0.006466736722981165, + "grad_norm": 0.6354120969772339, + "learning_rate": 0.0002, + "loss": 0.9772, + "step": 40 + }, + { + "epoch": 0.008083420903726457, + "grad_norm": 0.5604607462882996, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 50 + }, + { + "epoch": 0.009700105084471748, + "grad_norm": 0.4676193594932556, + "learning_rate": 0.0002, + "loss": 0.8841, + "step": 60 + }, + { + "epoch": 0.01131678926521704, + "grad_norm": 0.6099211573600769, + "learning_rate": 0.0002, + "loss": 0.9022, + "step": 70 + }, + { + "epoch": 0.01293347344596233, + "grad_norm": 0.48639994859695435, + "learning_rate": 0.0002, + "loss": 0.9133, + "step": 80 + }, + { + "epoch": 0.014550157626707623, + "grad_norm": 0.4904264509677887, + "learning_rate": 0.0002, + "loss": 0.8704, + "step": 90 + }, + { + "epoch": 0.016166841807452915, + "grad_norm": 2.8334362506866455, + "learning_rate": 0.0002, + "loss": 0.8855, + "step": 100 + }, + { + "epoch": 0.017783525988198205, + "grad_norm": 0.43221670389175415, + "learning_rate": 0.0002, + "loss": 0.8958, + "step": 110 + }, + { + "epoch": 0.019400210168943496, + "grad_norm": 0.42244166135787964, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 120 + }, + { + "epoch": 0.02101689434968879, + "grad_norm": 0.45363298058509827, + "learning_rate": 0.0002, + "loss": 0.8467, + "step": 130 + }, + { + "epoch": 0.02263357853043408, + "grad_norm": 0.44816508889198303, + "learning_rate": 0.0002, + "loss": 0.8641, + "step": 140 + }, + { + "epoch": 0.02425026271117937, + "grad_norm": 0.43308213353157043, + "learning_rate": 0.0002, + "loss": 0.8496, + "step": 150 + }, + { + "epoch": 0.02586694689192466, + "grad_norm": 0.4084763526916504, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 160 + }, + { + "epoch": 0.027483631072669955, + "grad_norm": 0.5363703966140747, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 170 + }, + { + "epoch": 0.029100315253415245, + "grad_norm": 0.4619699716567993, + "learning_rate": 0.0002, + "loss": 0.8558, + "step": 180 + }, + { + "epoch": 0.030716999434160536, + "grad_norm": 0.49069908261299133, + "learning_rate": 0.0002, + "loss": 0.8878, + "step": 190 + }, + { + "epoch": 0.03233368361490583, + "grad_norm": 0.4645835757255554, + "learning_rate": 0.0002, + "loss": 0.8867, + "step": 200 + }, + { + "epoch": 0.03395036779565112, + "grad_norm": 1.2411243915557861, + "learning_rate": 0.0002, + "loss": 0.8842, + "step": 210 + }, + { + "epoch": 0.03556705197639641, + "grad_norm": 0.5211851596832275, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 220 + }, + { + "epoch": 0.037183736157141704, + "grad_norm": 0.5253691673278809, + "learning_rate": 0.0002, + "loss": 0.8194, + "step": 230 + }, + { + "epoch": 0.03880042033788699, + "grad_norm": 0.4567478895187378, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 240 + }, + { + "epoch": 0.040417104518632285, + "grad_norm": 0.5472128391265869, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 250 + }, + { + "epoch": 0.04203378869937758, + "grad_norm": 0.42978546023368835, + "learning_rate": 0.0002, + "loss": 0.8201, + "step": 260 + }, + { + "epoch": 0.043650472880122866, + "grad_norm": 0.601734459400177, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 270 + }, + { + "epoch": 0.04526715706086816, + "grad_norm": 0.4286513328552246, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 280 + }, + { + "epoch": 0.046883841241613454, + "grad_norm": 0.5230861902236938, + "learning_rate": 0.0002, + "loss": 0.8758, + "step": 290 + }, + { + "epoch": 0.04850052542235874, + "grad_norm": 0.6504611968994141, + "learning_rate": 0.0002, + "loss": 0.8636, + "step": 300 + }, + { + "epoch": 0.050117209603104035, + "grad_norm": 0.43485215306282043, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 310 + }, + { + "epoch": 0.05173389378384932, + "grad_norm": 0.4717007875442505, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 320 + }, + { + "epoch": 0.053350577964594616, + "grad_norm": 0.4059787690639496, + "learning_rate": 0.0002, + "loss": 0.8469, + "step": 330 + }, + { + "epoch": 0.05496726214533991, + "grad_norm": 0.4366913437843323, + "learning_rate": 0.0002, + "loss": 0.8866, + "step": 340 + }, + { + "epoch": 0.0565839463260852, + "grad_norm": 0.4233848452568054, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 350 + }, + { + "epoch": 0.05820063050683049, + "grad_norm": 0.4209108352661133, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 360 + }, + { + "epoch": 0.059817314687575784, + "grad_norm": 0.41637396812438965, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 370 + }, + { + "epoch": 0.06143399886832107, + "grad_norm": 0.46235376596450806, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 380 + }, + { + "epoch": 0.06305068304906636, + "grad_norm": 0.4013484716415405, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 390 + }, + { + "epoch": 0.06466736722981166, + "grad_norm": 0.47443896532058716, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 400 + }, + { + "epoch": 0.06628405141055695, + "grad_norm": 0.3942156434059143, + "learning_rate": 0.0002, + "loss": 0.8666, + "step": 410 + }, + { + "epoch": 0.06790073559130223, + "grad_norm": 0.4965320825576782, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 420 + }, + { + "epoch": 0.06951741977204753, + "grad_norm": 0.4304835796356201, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 430 + }, + { + "epoch": 0.07113410395279282, + "grad_norm": 0.511726975440979, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 440 + }, + { + "epoch": 0.07275078813353811, + "grad_norm": 0.4040689170360565, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 450 + }, + { + "epoch": 0.07436747231428341, + "grad_norm": 0.5402171015739441, + "learning_rate": 0.0002, + "loss": 0.8788, + "step": 460 + }, + { + "epoch": 0.0759841564950287, + "grad_norm": 0.4174517095088959, + "learning_rate": 0.0002, + "loss": 0.8737, + "step": 470 + }, + { + "epoch": 0.07760084067577398, + "grad_norm": 0.4306182265281677, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 480 + }, + { + "epoch": 0.07921752485651928, + "grad_norm": 0.535210132598877, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 490 + }, + { + "epoch": 0.08083420903726457, + "grad_norm": 0.5339109897613525, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 500 + }, + { + "epoch": 0.08245089321800986, + "grad_norm": 0.45754891633987427, + "learning_rate": 0.0002, + "loss": 0.8985, + "step": 510 + }, + { + "epoch": 0.08406757739875516, + "grad_norm": 0.43820783495903015, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 520 + }, + { + "epoch": 0.08568426157950045, + "grad_norm": 0.4434749186038971, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 530 + }, + { + "epoch": 0.08730094576024573, + "grad_norm": 0.43111467361450195, + "learning_rate": 0.0002, + "loss": 0.7857, + "step": 540 + }, + { + "epoch": 0.08891762994099103, + "grad_norm": 0.4378940165042877, + "learning_rate": 0.0002, + "loss": 0.8418, + "step": 550 + }, + { + "epoch": 0.09053431412173632, + "grad_norm": 0.4772215187549591, + "learning_rate": 0.0002, + "loss": 0.8361, + "step": 560 + }, + { + "epoch": 0.09215099830248161, + "grad_norm": 0.6837629079818726, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 570 + }, + { + "epoch": 0.09376768248322691, + "grad_norm": 0.42241212725639343, + "learning_rate": 0.0002, + "loss": 0.8607, + "step": 580 + }, + { + "epoch": 0.0953843666639722, + "grad_norm": 0.5165936350822449, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 590 + }, + { + "epoch": 0.09700105084471748, + "grad_norm": 0.48737478256225586, + "learning_rate": 0.0002, + "loss": 0.8664, + "step": 600 + }, + { + "epoch": 0.09861773502546278, + "grad_norm": 0.47419852018356323, + "learning_rate": 0.0002, + "loss": 0.8806, + "step": 610 + }, + { + "epoch": 0.10023441920620807, + "grad_norm": 0.4975486099720001, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 620 + }, + { + "epoch": 0.10185110338695336, + "grad_norm": 0.49123844504356384, + "learning_rate": 0.0002, + "loss": 0.8548, + "step": 630 + }, + { + "epoch": 0.10346778756769864, + "grad_norm": 0.6288952827453613, + "learning_rate": 0.0002, + "loss": 0.8911, + "step": 640 + }, + { + "epoch": 0.10508447174844394, + "grad_norm": 0.4277345836162567, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 650 + }, + { + "epoch": 0.10670115592918923, + "grad_norm": 0.4021061956882477, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 660 + }, + { + "epoch": 0.10831784010993452, + "grad_norm": 0.3492237329483032, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 670 + }, + { + "epoch": 0.10993452429067982, + "grad_norm": 0.4341012239456177, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 680 + }, + { + "epoch": 0.1115512084714251, + "grad_norm": 0.7296304106712341, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 690 + }, + { + "epoch": 0.1131678926521704, + "grad_norm": 0.397494912147522, + "learning_rate": 0.0002, + "loss": 0.8433, + "step": 700 + }, + { + "epoch": 0.1147845768329157, + "grad_norm": 0.396431028842926, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 710 + }, + { + "epoch": 0.11640126101366098, + "grad_norm": 0.48842838406562805, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 720 + }, + { + "epoch": 0.11801794519440627, + "grad_norm": 0.46322616934776306, + "learning_rate": 0.0002, + "loss": 0.8238, + "step": 730 + }, + { + "epoch": 0.11963462937515157, + "grad_norm": 0.47990912199020386, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 740 + }, + { + "epoch": 0.12125131355589686, + "grad_norm": 0.4997142255306244, + "learning_rate": 0.0002, + "loss": 0.82, + "step": 750 + }, + { + "epoch": 0.12286799773664214, + "grad_norm": 0.4040526747703552, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 760 + }, + { + "epoch": 0.12448468191738744, + "grad_norm": 0.453095942735672, + "learning_rate": 0.0002, + "loss": 0.863, + "step": 770 + }, + { + "epoch": 0.12610136609813272, + "grad_norm": 0.4636971950531006, + "learning_rate": 0.0002, + "loss": 0.8792, + "step": 780 + }, + { + "epoch": 0.12771805027887803, + "grad_norm": 0.4279276132583618, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 790 + }, + { + "epoch": 0.12933473445962332, + "grad_norm": 0.46212655305862427, + "learning_rate": 0.0002, + "loss": 0.8711, + "step": 800 + }, + { + "epoch": 0.1309514186403686, + "grad_norm": 0.43127650022506714, + "learning_rate": 0.0002, + "loss": 0.8368, + "step": 810 + }, + { + "epoch": 0.1325681028211139, + "grad_norm": 0.4201301336288452, + "learning_rate": 0.0002, + "loss": 0.8476, + "step": 820 + }, + { + "epoch": 0.13418478700185918, + "grad_norm": 0.42583167552948, + "learning_rate": 0.0002, + "loss": 0.8078, + "step": 830 + }, + { + "epoch": 0.13580147118260447, + "grad_norm": 0.4535622000694275, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 840 + }, + { + "epoch": 0.13741815536334978, + "grad_norm": 0.4116036891937256, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 850 + }, + { + "epoch": 0.13903483954409507, + "grad_norm": 0.45997580885887146, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 860 + }, + { + "epoch": 0.14065152372484035, + "grad_norm": 0.4487837255001068, + "learning_rate": 0.0002, + "loss": 0.8917, + "step": 870 + }, + { + "epoch": 0.14226820790558564, + "grad_norm": 0.43650057911872864, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 880 + }, + { + "epoch": 0.14388489208633093, + "grad_norm": 0.5335358381271362, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 890 + }, + { + "epoch": 0.14550157626707622, + "grad_norm": 0.5989000201225281, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 900 + }, + { + "epoch": 0.14711826044782153, + "grad_norm": 0.517179012298584, + "learning_rate": 0.0002, + "loss": 0.8385, + "step": 910 + }, + { + "epoch": 0.14873494462856682, + "grad_norm": 0.44435232877731323, + "learning_rate": 0.0002, + "loss": 0.8255, + "step": 920 + }, + { + "epoch": 0.1503516288093121, + "grad_norm": 0.42635923624038696, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 930 + }, + { + "epoch": 0.1519683129900574, + "grad_norm": 0.49603334069252014, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 940 + }, + { + "epoch": 0.15358499717080268, + "grad_norm": 0.40639808773994446, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 950 + }, + { + "epoch": 0.15520168135154797, + "grad_norm": 0.4850759208202362, + "learning_rate": 0.0002, + "loss": 0.8529, + "step": 960 + }, + { + "epoch": 0.15681836553229328, + "grad_norm": 0.4427442252635956, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 970 + }, + { + "epoch": 0.15843504971303857, + "grad_norm": 0.3760930001735687, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 980 + }, + { + "epoch": 0.16005173389378385, + "grad_norm": 0.4794144332408905, + "learning_rate": 0.0002, + "loss": 0.8644, + "step": 990 + }, + { + "epoch": 0.16166841807452914, + "grad_norm": 0.45828768610954285, + "learning_rate": 0.0002, + "loss": 0.8002, + "step": 1000 + }, + { + "epoch": 0.16328510225527443, + "grad_norm": 0.6313053369522095, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 1010 + }, + { + "epoch": 0.16490178643601971, + "grad_norm": 0.45041006803512573, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1020 + }, + { + "epoch": 0.166518470616765, + "grad_norm": 0.441403865814209, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1030 + }, + { + "epoch": 0.16813515479751032, + "grad_norm": 0.8171296119689941, + "learning_rate": 0.0002, + "loss": 0.8475, + "step": 1040 + }, + { + "epoch": 0.1697518389782556, + "grad_norm": 0.7137420773506165, + "learning_rate": 0.0002, + "loss": 0.845, + "step": 1050 + }, + { + "epoch": 0.1713685231590009, + "grad_norm": 0.5236809849739075, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 1060 + }, + { + "epoch": 0.17298520733974618, + "grad_norm": 0.5021864175796509, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 1070 + }, + { + "epoch": 0.17460189152049146, + "grad_norm": 0.47347521781921387, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 1080 + }, + { + "epoch": 0.17621857570123675, + "grad_norm": 0.4631653428077698, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 1090 + }, + { + "epoch": 0.17783525988198207, + "grad_norm": 0.49169182777404785, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 1100 + }, + { + "epoch": 0.17945194406272735, + "grad_norm": 0.5019739270210266, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 1110 + }, + { + "epoch": 0.18106862824347264, + "grad_norm": 0.5100422501564026, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 1120 + }, + { + "epoch": 0.18268531242421793, + "grad_norm": 0.3888324499130249, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 1130 + }, + { + "epoch": 0.18430199660496321, + "grad_norm": 0.39765217900276184, + "learning_rate": 0.0002, + "loss": 0.8533, + "step": 1140 + }, + { + "epoch": 0.1859186807857085, + "grad_norm": 0.47190186381340027, + "learning_rate": 0.0002, + "loss": 0.8541, + "step": 1150 + }, + { + "epoch": 0.18753536496645382, + "grad_norm": 0.4464188814163208, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1160 + }, + { + "epoch": 0.1891520491471991, + "grad_norm": 0.5153930187225342, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 1170 + }, + { + "epoch": 0.1907687333279444, + "grad_norm": 0.4779708683490753, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 1180 + }, + { + "epoch": 0.19238541750868968, + "grad_norm": 0.4834315776824951, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 1190 + }, + { + "epoch": 0.19400210168943496, + "grad_norm": 0.402357816696167, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 1200 + }, + { + "epoch": 0.19561878587018025, + "grad_norm": 0.45899084210395813, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 1210 + }, + { + "epoch": 0.19723547005092557, + "grad_norm": 0.5106529593467712, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 1220 + }, + { + "epoch": 0.19885215423167085, + "grad_norm": 0.45261722803115845, + "learning_rate": 0.0002, + "loss": 0.7816, + "step": 1230 + }, + { + "epoch": 0.20046883841241614, + "grad_norm": 0.4647127091884613, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 1240 + }, + { + "epoch": 0.20208552259316143, + "grad_norm": 0.4849368929862976, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 1250 + }, + { + "epoch": 0.2037022067739067, + "grad_norm": 0.4518061578273773, + "learning_rate": 0.0002, + "loss": 0.8514, + "step": 1260 + }, + { + "epoch": 0.205318890954652, + "grad_norm": 0.49535325169563293, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 1270 + }, + { + "epoch": 0.2069355751353973, + "grad_norm": 0.4835205376148224, + "learning_rate": 0.0002, + "loss": 0.8348, + "step": 1280 + }, + { + "epoch": 0.2085522593161426, + "grad_norm": 0.45308539271354675, + "learning_rate": 0.0002, + "loss": 0.8428, + "step": 1290 + }, + { + "epoch": 0.2101689434968879, + "grad_norm": 0.5369905233383179, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 1300 + }, + { + "epoch": 0.21178562767763318, + "grad_norm": 0.5031622052192688, + "learning_rate": 0.0002, + "loss": 0.8676, + "step": 1310 + }, + { + "epoch": 0.21340231185837846, + "grad_norm": 0.48010334372520447, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 1320 + }, + { + "epoch": 0.21501899603912375, + "grad_norm": 0.4905701279640198, + "learning_rate": 0.0002, + "loss": 0.806, + "step": 1330 + }, + { + "epoch": 0.21663568021986904, + "grad_norm": 0.43531742691993713, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 1340 + }, + { + "epoch": 0.21825236440061435, + "grad_norm": 0.44330692291259766, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 1350 + }, + { + "epoch": 0.21986904858135964, + "grad_norm": 0.5384416580200195, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 1360 + }, + { + "epoch": 0.22148573276210493, + "grad_norm": 0.4181833863258362, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 1370 + }, + { + "epoch": 0.2231024169428502, + "grad_norm": 0.523833692073822, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 1380 + }, + { + "epoch": 0.2247191011235955, + "grad_norm": 0.5528736710548401, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 1390 + }, + { + "epoch": 0.2263357853043408, + "grad_norm": 0.43515023589134216, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 1400 + }, + { + "epoch": 0.2279524694850861, + "grad_norm": 0.48809877038002014, + "learning_rate": 0.0002, + "loss": 0.8403, + "step": 1410 + }, + { + "epoch": 0.2295691536658314, + "grad_norm": 0.43591251969337463, + "learning_rate": 0.0002, + "loss": 0.8165, + "step": 1420 + }, + { + "epoch": 0.23118583784657668, + "grad_norm": 0.44625312089920044, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 1430 + }, + { + "epoch": 0.23280252202732196, + "grad_norm": 0.4390665292739868, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 1440 + }, + { + "epoch": 0.23441920620806725, + "grad_norm": 0.48496049642562866, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 1450 + }, + { + "epoch": 0.23603589038881254, + "grad_norm": 0.45919957756996155, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 1460 + }, + { + "epoch": 0.23765257456955785, + "grad_norm": 0.5471845865249634, + "learning_rate": 0.0002, + "loss": 0.8659, + "step": 1470 + }, + { + "epoch": 0.23926925875030314, + "grad_norm": 0.47269317507743835, + "learning_rate": 0.0002, + "loss": 0.8164, + "step": 1480 + }, + { + "epoch": 0.24088594293104842, + "grad_norm": 0.4930245578289032, + "learning_rate": 0.0002, + "loss": 0.854, + "step": 1490 + }, + { + "epoch": 0.2425026271117937, + "grad_norm": 0.5605630278587341, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 1500 + }, + { + "epoch": 0.244119311292539, + "grad_norm": 0.4435870945453644, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 1510 + }, + { + "epoch": 0.24573599547328429, + "grad_norm": 0.4941999912261963, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 1520 + }, + { + "epoch": 0.24735267965402957, + "grad_norm": 0.5100624561309814, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 1530 + }, + { + "epoch": 0.2489693638347749, + "grad_norm": 0.4638267457485199, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 1540 + }, + { + "epoch": 0.25058604801552015, + "grad_norm": 0.5071570873260498, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 1550 + }, + { + "epoch": 0.25220273219626543, + "grad_norm": 0.4291319251060486, + "learning_rate": 0.0002, + "loss": 0.7724, + "step": 1560 + }, + { + "epoch": 0.2538194163770108, + "grad_norm": 0.5388049483299255, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 1570 + }, + { + "epoch": 0.25543610055775606, + "grad_norm": 0.5083683729171753, + "learning_rate": 0.0002, + "loss": 0.8176, + "step": 1580 + }, + { + "epoch": 0.25705278473850135, + "grad_norm": 0.4824463725090027, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 1590 + }, + { + "epoch": 0.25866946891924664, + "grad_norm": 0.41177722811698914, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 1600 + }, + { + "epoch": 0.2602861530999919, + "grad_norm": 0.5656219124794006, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 1610 + }, + { + "epoch": 0.2619028372807372, + "grad_norm": 0.41063204407691956, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 1620 + }, + { + "epoch": 0.2635195214614825, + "grad_norm": 0.4897061288356781, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 1630 + }, + { + "epoch": 0.2651362056422278, + "grad_norm": 0.4454376697540283, + "learning_rate": 0.0002, + "loss": 0.8198, + "step": 1640 + }, + { + "epoch": 0.26675288982297307, + "grad_norm": 0.4355238378047943, + "learning_rate": 0.0002, + "loss": 0.8684, + "step": 1650 + }, + { + "epoch": 0.26836957400371836, + "grad_norm": 0.458310067653656, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 1660 + }, + { + "epoch": 0.26998625818446365, + "grad_norm": 0.4752083718776703, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 1670 + }, + { + "epoch": 0.27160294236520893, + "grad_norm": 0.4666106402873993, + "learning_rate": 0.0002, + "loss": 0.8267, + "step": 1680 + }, + { + "epoch": 0.2732196265459543, + "grad_norm": 0.4213818609714508, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 1690 + }, + { + "epoch": 0.27483631072669956, + "grad_norm": 0.5768913626670837, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 1700 + }, + { + "epoch": 0.27645299490744485, + "grad_norm": 0.4209914803504944, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 1710 + }, + { + "epoch": 0.27806967908819014, + "grad_norm": 0.501909613609314, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 1720 + }, + { + "epoch": 0.2796863632689354, + "grad_norm": 0.5266261100769043, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 1730 + }, + { + "epoch": 0.2813030474496807, + "grad_norm": 0.43806859850883484, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 1740 + }, + { + "epoch": 0.282919731630426, + "grad_norm": 0.46048814058303833, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 1750 + }, + { + "epoch": 0.2845364158111713, + "grad_norm": 0.44972819089889526, + "learning_rate": 0.0002, + "loss": 0.8596, + "step": 1760 + }, + { + "epoch": 0.28615309999191657, + "grad_norm": 0.5114831328392029, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 1770 + }, + { + "epoch": 0.28776978417266186, + "grad_norm": 0.47931742668151855, + "learning_rate": 0.0002, + "loss": 0.8361, + "step": 1780 + }, + { + "epoch": 0.28938646835340714, + "grad_norm": 0.5092599987983704, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 1790 + }, + { + "epoch": 0.29100315253415243, + "grad_norm": 0.37581443786621094, + "learning_rate": 0.0002, + "loss": 0.8506, + "step": 1800 + }, + { + "epoch": 0.2926198367148977, + "grad_norm": 0.47097381949424744, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 1810 + }, + { + "epoch": 0.29423652089564306, + "grad_norm": 0.48300236463546753, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 1820 + }, + { + "epoch": 0.29585320507638835, + "grad_norm": 0.5600419640541077, + "learning_rate": 0.0002, + "loss": 0.8391, + "step": 1830 + }, + { + "epoch": 0.29746988925713364, + "grad_norm": 0.48555272817611694, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 1840 + }, + { + "epoch": 0.2990865734378789, + "grad_norm": 0.3752668499946594, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 1850 + }, + { + "epoch": 0.3007032576186242, + "grad_norm": 0.5328747034072876, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 1860 + }, + { + "epoch": 0.3023199417993695, + "grad_norm": 0.48716455698013306, + "learning_rate": 0.0002, + "loss": 0.8426, + "step": 1870 + }, + { + "epoch": 0.3039366259801148, + "grad_norm": 0.5011493563652039, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 1880 + }, + { + "epoch": 0.30555331016086007, + "grad_norm": 0.46461427211761475, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 1890 + }, + { + "epoch": 0.30716999434160536, + "grad_norm": 0.36630210280418396, + "learning_rate": 0.0002, + "loss": 0.8478, + "step": 1900 + }, + { + "epoch": 0.30878667852235064, + "grad_norm": 0.4217296242713928, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 1910 + }, + { + "epoch": 0.31040336270309593, + "grad_norm": 0.4394875466823578, + "learning_rate": 0.0002, + "loss": 0.8128, + "step": 1920 + }, + { + "epoch": 0.3120200468838412, + "grad_norm": 0.6587965488433838, + "learning_rate": 0.0002, + "loss": 0.8471, + "step": 1930 + }, + { + "epoch": 0.31363673106458656, + "grad_norm": 0.5469298958778381, + "learning_rate": 0.0002, + "loss": 0.8565, + "step": 1940 + }, + { + "epoch": 0.31525341524533185, + "grad_norm": 0.4371595084667206, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 1950 + }, + { + "epoch": 0.31687009942607713, + "grad_norm": 0.4809541404247284, + "learning_rate": 0.0002, + "loss": 0.887, + "step": 1960 + }, + { + "epoch": 0.3184867836068224, + "grad_norm": 0.6061086654663086, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 1970 + }, + { + "epoch": 0.3201034677875677, + "grad_norm": 0.5342657566070557, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 1980 + }, + { + "epoch": 0.321720151968313, + "grad_norm": 0.5057743787765503, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 1990 + }, + { + "epoch": 0.3233368361490583, + "grad_norm": 0.528626024723053, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 2000 + }, + { + "epoch": 0.32495352032980357, + "grad_norm": 0.46742770075798035, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2010 + }, + { + "epoch": 0.32657020451054886, + "grad_norm": 0.515101432800293, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 2020 + }, + { + "epoch": 0.32818688869129414, + "grad_norm": 0.41941216588020325, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 2030 + }, + { + "epoch": 0.32980357287203943, + "grad_norm": 0.49902522563934326, + "learning_rate": 0.0002, + "loss": 0.7637, + "step": 2040 + }, + { + "epoch": 0.3314202570527847, + "grad_norm": 0.4120897650718689, + "learning_rate": 0.0002, + "loss": 0.7822, + "step": 2050 + }, + { + "epoch": 0.33303694123353, + "grad_norm": 0.45352041721343994, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 2060 + }, + { + "epoch": 0.33465362541427535, + "grad_norm": 0.523199737071991, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 2070 + }, + { + "epoch": 0.33627030959502063, + "grad_norm": 0.4390358626842499, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 2080 + }, + { + "epoch": 0.3378869937757659, + "grad_norm": 0.6752901077270508, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 2090 + }, + { + "epoch": 0.3395036779565112, + "grad_norm": 0.547821044921875, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 2100 + }, + { + "epoch": 0.3411203621372565, + "grad_norm": 0.5161308646202087, + "learning_rate": 0.0002, + "loss": 0.8561, + "step": 2110 + }, + { + "epoch": 0.3427370463180018, + "grad_norm": 0.4565401077270508, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 2120 + }, + { + "epoch": 0.34435373049874707, + "grad_norm": 0.4666115939617157, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 2130 + }, + { + "epoch": 0.34597041467949236, + "grad_norm": 0.4090428352355957, + "learning_rate": 0.0002, + "loss": 0.8189, + "step": 2140 + }, + { + "epoch": 0.34758709886023764, + "grad_norm": 0.510845422744751, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2150 + }, + { + "epoch": 0.34920378304098293, + "grad_norm": 0.42861923575401306, + "learning_rate": 0.0002, + "loss": 0.8398, + "step": 2160 + }, + { + "epoch": 0.3508204672217282, + "grad_norm": 0.4476332664489746, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 2170 + }, + { + "epoch": 0.3524371514024735, + "grad_norm": 0.6065791249275208, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2180 + }, + { + "epoch": 0.35405383558321885, + "grad_norm": 0.42335066199302673, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 2190 + }, + { + "epoch": 0.35567051976396413, + "grad_norm": 0.5094629526138306, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 2200 + }, + { + "epoch": 0.3572872039447094, + "grad_norm": 0.5476373434066772, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 2210 + }, + { + "epoch": 0.3589038881254547, + "grad_norm": 0.3911719024181366, + "learning_rate": 0.0002, + "loss": 0.814, + "step": 2220 + }, + { + "epoch": 0.3605205723062, + "grad_norm": 0.6599636077880859, + "learning_rate": 0.0002, + "loss": 0.8599, + "step": 2230 + }, + { + "epoch": 0.3621372564869453, + "grad_norm": 0.40381914377212524, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 2240 + }, + { + "epoch": 0.36375394066769057, + "grad_norm": 0.4433908462524414, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 2250 + }, + { + "epoch": 0.36537062484843585, + "grad_norm": 0.578326940536499, + "learning_rate": 0.0002, + "loss": 0.8503, + "step": 2260 + }, + { + "epoch": 0.36698730902918114, + "grad_norm": 0.5734784007072449, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2270 + }, + { + "epoch": 0.36860399320992643, + "grad_norm": 0.45555487275123596, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 2280 + }, + { + "epoch": 0.3702206773906717, + "grad_norm": 0.5666276216506958, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 2290 + }, + { + "epoch": 0.371837361571417, + "grad_norm": 0.5461117625236511, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 2300 + }, + { + "epoch": 0.3734540457521623, + "grad_norm": 0.6318911910057068, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 2310 + }, + { + "epoch": 0.37507072993290763, + "grad_norm": 0.493263304233551, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 2320 + }, + { + "epoch": 0.3766874141136529, + "grad_norm": 0.5888760089874268, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 2330 + }, + { + "epoch": 0.3783040982943982, + "grad_norm": 0.48671841621398926, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 2340 + }, + { + "epoch": 0.3799207824751435, + "grad_norm": 0.4385145306587219, + "learning_rate": 0.0002, + "loss": 0.8367, + "step": 2350 + }, + { + "epoch": 0.3815374666558888, + "grad_norm": 0.5523318648338318, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 2360 + }, + { + "epoch": 0.38315415083663407, + "grad_norm": 0.7308220267295837, + "learning_rate": 0.0002, + "loss": 0.8351, + "step": 2370 + }, + { + "epoch": 0.38477083501737935, + "grad_norm": 0.554214358329773, + "learning_rate": 0.0002, + "loss": 0.859, + "step": 2380 + }, + { + "epoch": 0.38638751919812464, + "grad_norm": 0.5425800085067749, + "learning_rate": 0.0002, + "loss": 0.8146, + "step": 2390 + }, + { + "epoch": 0.3880042033788699, + "grad_norm": 0.48811158537864685, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 2400 + }, + { + "epoch": 0.3896208875596152, + "grad_norm": 0.49212366342544556, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 2410 + }, + { + "epoch": 0.3912375717403605, + "grad_norm": 0.5222218632698059, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 2420 + }, + { + "epoch": 0.3928542559211058, + "grad_norm": 0.4699819087982178, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 2430 + }, + { + "epoch": 0.39447094010185113, + "grad_norm": 0.46153587102890015, + "learning_rate": 0.0002, + "loss": 0.7919, + "step": 2440 + }, + { + "epoch": 0.3960876242825964, + "grad_norm": 0.4150611162185669, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 2450 + }, + { + "epoch": 0.3977043084633417, + "grad_norm": 0.5799614787101746, + "learning_rate": 0.0002, + "loss": 0.8589, + "step": 2460 + }, + { + "epoch": 0.399320992644087, + "grad_norm": 0.56536865234375, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 2470 + }, + { + "epoch": 0.4009376768248323, + "grad_norm": 0.5451247096061707, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 2480 + }, + { + "epoch": 0.40255436100557757, + "grad_norm": 0.5914521217346191, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 2490 + }, + { + "epoch": 0.40417104518632285, + "grad_norm": 0.4428117275238037, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 2500 + }, + { + "epoch": 0.40578772936706814, + "grad_norm": 0.48580947518348694, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 2510 + }, + { + "epoch": 0.4074044135478134, + "grad_norm": 0.436734676361084, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 2520 + }, + { + "epoch": 0.4090210977285587, + "grad_norm": 0.5752223134040833, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 2530 + }, + { + "epoch": 0.410637781909304, + "grad_norm": 0.4271308183670044, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2540 + }, + { + "epoch": 0.4122544660900493, + "grad_norm": 0.46294718980789185, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 2550 + }, + { + "epoch": 0.4138711502707946, + "grad_norm": 0.49407583475112915, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 2560 + }, + { + "epoch": 0.4154878344515399, + "grad_norm": 0.4729035496711731, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 2570 + }, + { + "epoch": 0.4171045186322852, + "grad_norm": 0.4129747152328491, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 2580 + }, + { + "epoch": 0.4187212028130305, + "grad_norm": 0.5684236288070679, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 2590 + }, + { + "epoch": 0.4203378869937758, + "grad_norm": 0.4862157106399536, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 2600 + }, + { + "epoch": 0.42195457117452106, + "grad_norm": 0.46567976474761963, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 2610 + }, + { + "epoch": 0.42357125535526635, + "grad_norm": 0.5710650682449341, + "learning_rate": 0.0002, + "loss": 0.7755, + "step": 2620 + }, + { + "epoch": 0.42518793953601164, + "grad_norm": 0.5660041570663452, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 2630 + }, + { + "epoch": 0.4268046237167569, + "grad_norm": 0.47944375872612, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 2640 + }, + { + "epoch": 0.4284213078975022, + "grad_norm": 0.537223756313324, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 2650 + }, + { + "epoch": 0.4300379920782475, + "grad_norm": 0.41669997572898865, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 2660 + }, + { + "epoch": 0.4316546762589928, + "grad_norm": 0.44727686047554016, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 2670 + }, + { + "epoch": 0.4332713604397381, + "grad_norm": 0.5600888729095459, + "learning_rate": 0.0002, + "loss": 0.8241, + "step": 2680 + }, + { + "epoch": 0.4348880446204834, + "grad_norm": 0.39820605516433716, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 2690 + }, + { + "epoch": 0.4365047288012287, + "grad_norm": 0.5637655854225159, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 2700 + }, + { + "epoch": 0.438121412981974, + "grad_norm": 0.6363666653633118, + "learning_rate": 0.0002, + "loss": 0.855, + "step": 2710 + }, + { + "epoch": 0.4397380971627193, + "grad_norm": 0.5656129121780396, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 2720 + }, + { + "epoch": 0.44135478134346456, + "grad_norm": 0.5600156188011169, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2730 + }, + { + "epoch": 0.44297146552420985, + "grad_norm": 0.5506579875946045, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 2740 + }, + { + "epoch": 0.44458814970495514, + "grad_norm": 0.49878305196762085, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 2750 + }, + { + "epoch": 0.4462048338857004, + "grad_norm": 0.4569213092327118, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 2760 + }, + { + "epoch": 0.4478215180664457, + "grad_norm": 0.6056680083274841, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 2770 + }, + { + "epoch": 0.449438202247191, + "grad_norm": 0.44474557042121887, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 2780 + }, + { + "epoch": 0.4510548864279363, + "grad_norm": 0.46055394411087036, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 2790 + }, + { + "epoch": 0.4526715706086816, + "grad_norm": 0.4904133379459381, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 2800 + }, + { + "epoch": 0.45428825478942686, + "grad_norm": 0.5647031664848328, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 2810 + }, + { + "epoch": 0.4559049389701722, + "grad_norm": 0.5759473443031311, + "learning_rate": 0.0002, + "loss": 0.8622, + "step": 2820 + }, + { + "epoch": 0.4575216231509175, + "grad_norm": 0.5161895751953125, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 2830 + }, + { + "epoch": 0.4591383073316628, + "grad_norm": 0.4248254597187042, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 2840 + }, + { + "epoch": 0.46075499151240806, + "grad_norm": 0.45395001769065857, + "learning_rate": 0.0002, + "loss": 0.7838, + "step": 2850 + }, + { + "epoch": 0.46237167569315335, + "grad_norm": 0.5358697772026062, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2860 + }, + { + "epoch": 0.46398835987389864, + "grad_norm": 0.5379165410995483, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 2870 + }, + { + "epoch": 0.4656050440546439, + "grad_norm": 0.4601989686489105, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 2880 + }, + { + "epoch": 0.4672217282353892, + "grad_norm": 0.671115517616272, + "learning_rate": 0.0002, + "loss": 0.8523, + "step": 2890 + }, + { + "epoch": 0.4688384124161345, + "grad_norm": 0.4425133168697357, + "learning_rate": 0.0002, + "loss": 0.8262, + "step": 2900 + }, + { + "epoch": 0.4704550965968798, + "grad_norm": 0.5446155071258545, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2910 + }, + { + "epoch": 0.47207178077762507, + "grad_norm": 0.603306233882904, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 2920 + }, + { + "epoch": 0.47368846495837036, + "grad_norm": 0.5377997159957886, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 2930 + }, + { + "epoch": 0.4753051491391157, + "grad_norm": 0.4931027591228485, + "learning_rate": 0.0002, + "loss": 0.8075, + "step": 2940 + }, + { + "epoch": 0.476921833319861, + "grad_norm": 0.4711960256099701, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 2950 + }, + { + "epoch": 0.4785385175006063, + "grad_norm": 0.5020492672920227, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 2960 + }, + { + "epoch": 0.48015520168135156, + "grad_norm": 0.5428946614265442, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 2970 + }, + { + "epoch": 0.48177188586209685, + "grad_norm": 0.5294089317321777, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 2980 + }, + { + "epoch": 0.48338857004284214, + "grad_norm": 0.648289144039154, + "learning_rate": 0.0002, + "loss": 0.8553, + "step": 2990 + }, + { + "epoch": 0.4850052542235874, + "grad_norm": 0.47916680574417114, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 3000 + }, + { + "epoch": 0.4866219384043327, + "grad_norm": 0.43849772214889526, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 3010 + }, + { + "epoch": 0.488238622585078, + "grad_norm": 0.47007861733436584, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 3020 + }, + { + "epoch": 0.4898553067658233, + "grad_norm": 0.6314331293106079, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 3030 + }, + { + "epoch": 0.49147199094656857, + "grad_norm": 0.49211493134498596, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 3040 + }, + { + "epoch": 0.49308867512731386, + "grad_norm": 0.4537973403930664, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 3050 + }, + { + "epoch": 0.49470535930805914, + "grad_norm": 0.47326919436454773, + "learning_rate": 0.0002, + "loss": 0.8095, + "step": 3060 + }, + { + "epoch": 0.4963220434888045, + "grad_norm": 0.525874137878418, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 3070 + }, + { + "epoch": 0.4979387276695498, + "grad_norm": 0.6361091732978821, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 3080 + }, + { + "epoch": 0.49955541185029506, + "grad_norm": 0.5850642919540405, + "learning_rate": 0.0002, + "loss": 0.821, + "step": 3090 + }, + { + "epoch": 0.5011720960310403, + "grad_norm": 0.47299543023109436, + "learning_rate": 0.0002, + "loss": 0.8279, + "step": 3100 + }, + { + "epoch": 0.5027887802117856, + "grad_norm": 0.473099946975708, + "learning_rate": 0.0002, + "loss": 0.8681, + "step": 3110 + }, + { + "epoch": 0.5044054643925309, + "grad_norm": 0.48186397552490234, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 3120 + }, + { + "epoch": 0.5060221485732762, + "grad_norm": 0.5015401840209961, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 3130 + }, + { + "epoch": 0.5076388327540216, + "grad_norm": 0.5617750287055969, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 3140 + }, + { + "epoch": 0.5092555169347668, + "grad_norm": 0.5169327259063721, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 3150 + }, + { + "epoch": 0.5108722011155121, + "grad_norm": 0.545657753944397, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 3160 + }, + { + "epoch": 0.5124888852962574, + "grad_norm": 0.512864351272583, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 3170 + }, + { + "epoch": 0.5141055694770027, + "grad_norm": 0.4113546311855316, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 3180 + }, + { + "epoch": 0.5157222536577479, + "grad_norm": 0.44532445073127747, + "learning_rate": 0.0002, + "loss": 0.8206, + "step": 3190 + }, + { + "epoch": 0.5173389378384933, + "grad_norm": 0.5623497366905212, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 3200 + }, + { + "epoch": 0.5189556220192385, + "grad_norm": 0.5084741115570068, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 3210 + }, + { + "epoch": 0.5205723061999838, + "grad_norm": 0.5305403470993042, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 3220 + }, + { + "epoch": 0.5221889903807291, + "grad_norm": 0.4708254337310791, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 3230 + }, + { + "epoch": 0.5238056745614744, + "grad_norm": 0.43827131390571594, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 3240 + }, + { + "epoch": 0.5254223587422197, + "grad_norm": 0.5630002617835999, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 3250 + }, + { + "epoch": 0.527039042922965, + "grad_norm": 0.5010961890220642, + "learning_rate": 0.0002, + "loss": 0.7522, + "step": 3260 + }, + { + "epoch": 0.5286557271037103, + "grad_norm": 0.6303122043609619, + "learning_rate": 0.0002, + "loss": 0.8374, + "step": 3270 + }, + { + "epoch": 0.5302724112844556, + "grad_norm": 0.5107331275939941, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 3280 + }, + { + "epoch": 0.5318890954652009, + "grad_norm": 0.5700443387031555, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 3290 + }, + { + "epoch": 0.5335057796459461, + "grad_norm": 0.46296367049217224, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 3300 + }, + { + "epoch": 0.5351224638266915, + "grad_norm": 0.531568706035614, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 3310 + }, + { + "epoch": 0.5367391480074367, + "grad_norm": 0.4686741530895233, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 3320 + }, + { + "epoch": 0.5383558321881821, + "grad_norm": 0.5404331088066101, + "learning_rate": 0.0002, + "loss": 0.8104, + "step": 3330 + }, + { + "epoch": 0.5399725163689273, + "grad_norm": 0.6368790864944458, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 3340 + }, + { + "epoch": 0.5415892005496726, + "grad_norm": 0.42300888895988464, + "learning_rate": 0.0002, + "loss": 0.8514, + "step": 3350 + }, + { + "epoch": 0.5432058847304179, + "grad_norm": 0.5362542867660522, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 3360 + }, + { + "epoch": 0.5448225689111632, + "grad_norm": 0.497128963470459, + "learning_rate": 0.0002, + "loss": 0.858, + "step": 3370 + }, + { + "epoch": 0.5464392530919085, + "grad_norm": 0.5006386041641235, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 3380 + }, + { + "epoch": 0.5480559372726538, + "grad_norm": 0.44136837124824524, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 3390 + }, + { + "epoch": 0.5496726214533991, + "grad_norm": 0.5897833108901978, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 3400 + }, + { + "epoch": 0.5512893056341444, + "grad_norm": 0.641075611114502, + "learning_rate": 0.0002, + "loss": 0.8895, + "step": 3410 + }, + { + "epoch": 0.5529059898148897, + "grad_norm": 0.7251322269439697, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 3420 + }, + { + "epoch": 0.5545226739956349, + "grad_norm": 0.47411349415779114, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 3430 + }, + { + "epoch": 0.5561393581763803, + "grad_norm": 0.4994310438632965, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 3440 + }, + { + "epoch": 0.5577560423571255, + "grad_norm": 0.5814438462257385, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 3450 + }, + { + "epoch": 0.5593727265378708, + "grad_norm": 0.6278898119926453, + "learning_rate": 0.0002, + "loss": 0.8805, + "step": 3460 + }, + { + "epoch": 0.5609894107186161, + "grad_norm": 0.46208274364471436, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 3470 + }, + { + "epoch": 0.5626060948993614, + "grad_norm": 0.5718930959701538, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 3480 + }, + { + "epoch": 0.5642227790801067, + "grad_norm": 0.48178744316101074, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 3490 + }, + { + "epoch": 0.565839463260852, + "grad_norm": 0.47336965799331665, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3500 + }, + { + "epoch": 0.5674561474415973, + "grad_norm": 0.43442684412002563, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 3510 + }, + { + "epoch": 0.5690728316223426, + "grad_norm": 0.6463358998298645, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 3520 + }, + { + "epoch": 0.5706895158030879, + "grad_norm": 0.5286486744880676, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 3530 + }, + { + "epoch": 0.5723061999838331, + "grad_norm": 0.5405499935150146, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 3540 + }, + { + "epoch": 0.5739228841645785, + "grad_norm": 0.6654391884803772, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 3550 + }, + { + "epoch": 0.5755395683453237, + "grad_norm": 0.5081980228424072, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 3560 + }, + { + "epoch": 0.5771562525260691, + "grad_norm": 0.48978179693222046, + "learning_rate": 0.0002, + "loss": 0.7753, + "step": 3570 + }, + { + "epoch": 0.5787729367068143, + "grad_norm": 0.5840612053871155, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3580 + }, + { + "epoch": 0.5803896208875596, + "grad_norm": 0.5235261917114258, + "learning_rate": 0.0002, + "loss": 0.8937, + "step": 3590 + }, + { + "epoch": 0.5820063050683049, + "grad_norm": 0.5672075748443604, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 3600 + }, + { + "epoch": 0.5836229892490502, + "grad_norm": 0.5613429546356201, + "learning_rate": 0.0002, + "loss": 0.8347, + "step": 3610 + }, + { + "epoch": 0.5852396734297954, + "grad_norm": 0.4032273590564728, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 3620 + }, + { + "epoch": 0.5868563576105408, + "grad_norm": 0.49559324979782104, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 3630 + }, + { + "epoch": 0.5884730417912861, + "grad_norm": 0.6895697712898254, + "learning_rate": 0.0002, + "loss": 0.8332, + "step": 3640 + }, + { + "epoch": 0.5900897259720314, + "grad_norm": 0.4750136435031891, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 3650 + }, + { + "epoch": 0.5917064101527767, + "grad_norm": 0.5176819562911987, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 3660 + }, + { + "epoch": 0.5933230943335219, + "grad_norm": 0.5817760229110718, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3670 + }, + { + "epoch": 0.5949397785142673, + "grad_norm": 0.6064626574516296, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 3680 + }, + { + "epoch": 0.5965564626950125, + "grad_norm": 0.6728700995445251, + "learning_rate": 0.0002, + "loss": 0.8422, + "step": 3690 + }, + { + "epoch": 0.5981731468757578, + "grad_norm": 0.609305202960968, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 3700 + }, + { + "epoch": 0.5997898310565031, + "grad_norm": 0.4615488350391388, + "learning_rate": 0.0002, + "loss": 0.8048, + "step": 3710 + }, + { + "epoch": 0.6014065152372484, + "grad_norm": 2.0531179904937744, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 3720 + }, + { + "epoch": 0.6030231994179936, + "grad_norm": 0.5091132521629333, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 3730 + }, + { + "epoch": 0.604639883598739, + "grad_norm": 0.5951124429702759, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 3740 + }, + { + "epoch": 0.6062565677794842, + "grad_norm": 0.5870208144187927, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 3750 + }, + { + "epoch": 0.6078732519602296, + "grad_norm": 0.6254619359970093, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3760 + }, + { + "epoch": 0.6094899361409749, + "grad_norm": 0.5577626824378967, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 3770 + }, + { + "epoch": 0.6111066203217201, + "grad_norm": 0.5004405379295349, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 3780 + }, + { + "epoch": 0.6127233045024655, + "grad_norm": 0.5527383685112, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 3790 + }, + { + "epoch": 0.6143399886832107, + "grad_norm": 0.49116113781929016, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 3800 + }, + { + "epoch": 0.6159566728639561, + "grad_norm": 0.5299299359321594, + "learning_rate": 0.0002, + "loss": 0.8352, + "step": 3810 + }, + { + "epoch": 0.6175733570447013, + "grad_norm": 0.464897483587265, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 3820 + }, + { + "epoch": 0.6191900412254466, + "grad_norm": 0.6505740880966187, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 3830 + }, + { + "epoch": 0.6208067254061919, + "grad_norm": 0.5512559413909912, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 3840 + }, + { + "epoch": 0.6224234095869372, + "grad_norm": 0.49427518248558044, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 3850 + }, + { + "epoch": 0.6240400937676824, + "grad_norm": 0.3839147090911865, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 3860 + }, + { + "epoch": 0.6256567779484278, + "grad_norm": 0.5760218501091003, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3870 + }, + { + "epoch": 0.6272734621291731, + "grad_norm": 0.7226507067680359, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 3880 + }, + { + "epoch": 0.6288901463099184, + "grad_norm": 0.676781415939331, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3890 + }, + { + "epoch": 0.6305068304906637, + "grad_norm": 0.4284018278121948, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 3900 + }, + { + "epoch": 0.6321235146714089, + "grad_norm": 0.5060628056526184, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 3910 + }, + { + "epoch": 0.6337401988521543, + "grad_norm": 0.5524522066116333, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 3920 + }, + { + "epoch": 0.6353568830328995, + "grad_norm": 0.6099881529808044, + "learning_rate": 0.0002, + "loss": 0.8276, + "step": 3930 + }, + { + "epoch": 0.6369735672136448, + "grad_norm": 0.43155938386917114, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 3940 + }, + { + "epoch": 0.6385902513943901, + "grad_norm": 0.6427084803581238, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 3950 + }, + { + "epoch": 0.6402069355751354, + "grad_norm": 0.541220486164093, + "learning_rate": 0.0002, + "loss": 0.8368, + "step": 3960 + }, + { + "epoch": 0.6418236197558806, + "grad_norm": 0.5414294600486755, + "learning_rate": 0.0002, + "loss": 0.8539, + "step": 3970 + }, + { + "epoch": 0.643440303936626, + "grad_norm": 0.46344003081321716, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 3980 + }, + { + "epoch": 0.6450569881173712, + "grad_norm": 0.45209285616874695, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 3990 + }, + { + "epoch": 0.6466736722981166, + "grad_norm": 0.5417284369468689, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 4000 + }, + { + "epoch": 0.6482903564788619, + "grad_norm": 0.7995685935020447, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 4010 + }, + { + "epoch": 0.6499070406596071, + "grad_norm": 0.6384002566337585, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 4020 + }, + { + "epoch": 0.6515237248403525, + "grad_norm": 0.4472815692424774, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 4030 + }, + { + "epoch": 0.6531404090210977, + "grad_norm": 0.6834294199943542, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 4040 + }, + { + "epoch": 0.654757093201843, + "grad_norm": 0.4612339735031128, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 4050 + }, + { + "epoch": 0.6563737773825883, + "grad_norm": 0.9266576170921326, + "learning_rate": 0.0002, + "loss": 0.8141, + "step": 4060 + }, + { + "epoch": 0.6579904615633336, + "grad_norm": 0.4470861852169037, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 4070 + }, + { + "epoch": 0.6596071457440789, + "grad_norm": 0.45544925332069397, + "learning_rate": 0.0002, + "loss": 0.8293, + "step": 4080 + }, + { + "epoch": 0.6612238299248242, + "grad_norm": 0.6144481301307678, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 4090 + }, + { + "epoch": 0.6628405141055694, + "grad_norm": 0.5936288237571716, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 4100 + }, + { + "epoch": 0.6644571982863148, + "grad_norm": 0.4822963774204254, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 4110 + }, + { + "epoch": 0.66607388246706, + "grad_norm": 0.48432496190071106, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 4120 + }, + { + "epoch": 0.6676905666478054, + "grad_norm": 0.4901607930660248, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 4130 + }, + { + "epoch": 0.6693072508285507, + "grad_norm": 0.5018393397331238, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 4140 + }, + { + "epoch": 0.6709239350092959, + "grad_norm": 0.6946378946304321, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 4150 + }, + { + "epoch": 0.6725406191900413, + "grad_norm": 0.5997390747070312, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 4160 + }, + { + "epoch": 0.6741573033707865, + "grad_norm": 0.6738849878311157, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 4170 + }, + { + "epoch": 0.6757739875515318, + "grad_norm": 0.6110581159591675, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 4180 + }, + { + "epoch": 0.6773906717322771, + "grad_norm": 0.5703322291374207, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 4190 + }, + { + "epoch": 0.6790073559130224, + "grad_norm": 0.4686066210269928, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 4200 + }, + { + "epoch": 0.6806240400937676, + "grad_norm": 0.6394643783569336, + "learning_rate": 0.0002, + "loss": 0.8441, + "step": 4210 + }, + { + "epoch": 0.682240724274513, + "grad_norm": 0.5454841256141663, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 4220 + }, + { + "epoch": 0.6838574084552582, + "grad_norm": 0.4859732985496521, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 4230 + }, + { + "epoch": 0.6854740926360036, + "grad_norm": 0.5544065833091736, + "learning_rate": 0.0002, + "loss": 0.8161, + "step": 4240 + }, + { + "epoch": 0.6870907768167488, + "grad_norm": 0.4902505576610565, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 4250 + }, + { + "epoch": 0.6887074609974941, + "grad_norm": 0.4768051505088806, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 4260 + }, + { + "epoch": 0.6903241451782395, + "grad_norm": 0.49982190132141113, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 4270 + }, + { + "epoch": 0.6919408293589847, + "grad_norm": 0.6351838111877441, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 4280 + }, + { + "epoch": 0.69355751353973, + "grad_norm": 0.5647561550140381, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 4290 + }, + { + "epoch": 0.6951741977204753, + "grad_norm": 0.5340486764907837, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 4300 + }, + { + "epoch": 0.6967908819012206, + "grad_norm": 0.5649092793464661, + "learning_rate": 0.0002, + "loss": 0.8526, + "step": 4310 + }, + { + "epoch": 0.6984075660819659, + "grad_norm": 0.6183916926383972, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 4320 + }, + { + "epoch": 0.7000242502627112, + "grad_norm": 0.6154509782791138, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 4330 + }, + { + "epoch": 0.7016409344434564, + "grad_norm": 0.5156264305114746, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 4340 + }, + { + "epoch": 0.7032576186242018, + "grad_norm": 0.562171459197998, + "learning_rate": 0.0002, + "loss": 0.8512, + "step": 4350 + }, + { + "epoch": 0.704874302804947, + "grad_norm": 0.4949502646923065, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 4360 + }, + { + "epoch": 0.7064909869856923, + "grad_norm": 0.5171684622764587, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 4370 + }, + { + "epoch": 0.7081076711664377, + "grad_norm": 0.6198443174362183, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 4380 + }, + { + "epoch": 0.7097243553471829, + "grad_norm": 0.5802276134490967, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 4390 + }, + { + "epoch": 0.7113410395279283, + "grad_norm": 0.41096967458724976, + "learning_rate": 0.0002, + "loss": 0.8797, + "step": 4400 + }, + { + "epoch": 0.7129577237086735, + "grad_norm": 0.4397392272949219, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 4410 + }, + { + "epoch": 0.7145744078894188, + "grad_norm": 0.45228442549705505, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 4420 + }, + { + "epoch": 0.7161910920701641, + "grad_norm": 0.4839673936367035, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 4430 + }, + { + "epoch": 0.7178077762509094, + "grad_norm": 0.6140755414962769, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4440 + }, + { + "epoch": 0.7194244604316546, + "grad_norm": 0.6841378808021545, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 4450 + }, + { + "epoch": 0.7210411446124, + "grad_norm": 0.6664239168167114, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 4460 + }, + { + "epoch": 0.7226578287931452, + "grad_norm": 0.47552719712257385, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 4470 + }, + { + "epoch": 0.7242745129738906, + "grad_norm": 0.6649776101112366, + "learning_rate": 0.0002, + "loss": 0.8982, + "step": 4480 + }, + { + "epoch": 0.7258911971546358, + "grad_norm": 0.5159541964530945, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 4490 + }, + { + "epoch": 0.7275078813353811, + "grad_norm": 0.6693112850189209, + "learning_rate": 0.0002, + "loss": 0.7786, + "step": 4500 + }, + { + "epoch": 0.7291245655161265, + "grad_norm": 0.48870977759361267, + "learning_rate": 0.0002, + "loss": 0.8655, + "step": 4510 + }, + { + "epoch": 0.7307412496968717, + "grad_norm": 0.4857887923717499, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 4520 + }, + { + "epoch": 0.732357933877617, + "grad_norm": 0.5515662431716919, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 4530 + }, + { + "epoch": 0.7339746180583623, + "grad_norm": 0.6292222738265991, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 4540 + }, + { + "epoch": 0.7355913022391076, + "grad_norm": 0.48265689611434937, + "learning_rate": 0.0002, + "loss": 0.7749, + "step": 4550 + }, + { + "epoch": 0.7372079864198529, + "grad_norm": 0.8044266104698181, + "learning_rate": 0.0002, + "loss": 0.8499, + "step": 4560 + }, + { + "epoch": 0.7388246706005982, + "grad_norm": 0.6111769676208496, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 4570 + }, + { + "epoch": 0.7404413547813434, + "grad_norm": 0.5229553580284119, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 4580 + }, + { + "epoch": 0.7420580389620888, + "grad_norm": 0.6054152250289917, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 4590 + }, + { + "epoch": 0.743674723142834, + "grad_norm": 0.5574966669082642, + "learning_rate": 0.0002, + "loss": 0.8169, + "step": 4600 + }, + { + "epoch": 0.7452914073235793, + "grad_norm": 0.5395817160606384, + "learning_rate": 0.0002, + "loss": 0.8439, + "step": 4610 + }, + { + "epoch": 0.7469080915043246, + "grad_norm": 0.7116472721099854, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 4620 + }, + { + "epoch": 0.7485247756850699, + "grad_norm": 0.5618700981140137, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 4630 + }, + { + "epoch": 0.7501414598658153, + "grad_norm": 0.5802770853042603, + "learning_rate": 0.0002, + "loss": 0.7744, + "step": 4640 + }, + { + "epoch": 0.7517581440465605, + "grad_norm": 0.5690428018569946, + "learning_rate": 0.0002, + "loss": 0.7924, + "step": 4650 + }, + { + "epoch": 0.7533748282273058, + "grad_norm": 0.4813360273838043, + "learning_rate": 0.0002, + "loss": 0.8017, + "step": 4660 + }, + { + "epoch": 0.7549915124080511, + "grad_norm": 0.5434042811393738, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 4670 + }, + { + "epoch": 0.7566081965887964, + "grad_norm": 0.5502099990844727, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 4680 + }, + { + "epoch": 0.7582248807695416, + "grad_norm": 0.6020621061325073, + "learning_rate": 0.0002, + "loss": 0.8598, + "step": 4690 + }, + { + "epoch": 0.759841564950287, + "grad_norm": 0.4922301471233368, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 4700 + }, + { + "epoch": 0.7614582491310322, + "grad_norm": 0.6492828726768494, + "learning_rate": 0.0002, + "loss": 0.788, + "step": 4710 + }, + { + "epoch": 0.7630749333117776, + "grad_norm": 0.4865580201148987, + "learning_rate": 0.0002, + "loss": 0.8313, + "step": 4720 + }, + { + "epoch": 0.7646916174925228, + "grad_norm": 0.5971422791481018, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 4730 + }, + { + "epoch": 0.7663083016732681, + "grad_norm": 0.6832674145698547, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 4740 + }, + { + "epoch": 0.7679249858540134, + "grad_norm": 0.500908613204956, + "learning_rate": 0.0002, + "loss": 0.8156, + "step": 4750 + }, + { + "epoch": 0.7695416700347587, + "grad_norm": 0.6112465858459473, + "learning_rate": 0.0002, + "loss": 0.8383, + "step": 4760 + }, + { + "epoch": 0.771158354215504, + "grad_norm": 0.5753506422042847, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 4770 + }, + { + "epoch": 0.7727750383962493, + "grad_norm": 0.6529405117034912, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 4780 + }, + { + "epoch": 0.7743917225769946, + "grad_norm": 0.5916843414306641, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 4790 + }, + { + "epoch": 0.7760084067577399, + "grad_norm": 0.4821224510669708, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 4800 + }, + { + "epoch": 0.7776250909384852, + "grad_norm": 0.5532580018043518, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 4810 + }, + { + "epoch": 0.7792417751192304, + "grad_norm": 0.4604877233505249, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 4820 + }, + { + "epoch": 0.7808584592999758, + "grad_norm": 0.5009613037109375, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 4830 + }, + { + "epoch": 0.782475143480721, + "grad_norm": 0.6448560357093811, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4840 + }, + { + "epoch": 0.7840918276614663, + "grad_norm": 0.44327953457832336, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 4850 + }, + { + "epoch": 0.7857085118422116, + "grad_norm": 0.5355411171913147, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 4860 + }, + { + "epoch": 0.7873251960229569, + "grad_norm": 0.5635677576065063, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 4870 + }, + { + "epoch": 0.7889418802037023, + "grad_norm": 0.5417491793632507, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 4880 + }, + { + "epoch": 0.7905585643844475, + "grad_norm": 0.4567430913448334, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 4890 + }, + { + "epoch": 0.7921752485651928, + "grad_norm": 0.44651296734809875, + "learning_rate": 0.0002, + "loss": 0.8454, + "step": 4900 + }, + { + "epoch": 0.7937919327459381, + "grad_norm": 0.5741217136383057, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 4910 + }, + { + "epoch": 0.7954086169266834, + "grad_norm": 0.6605045199394226, + "learning_rate": 0.0002, + "loss": 0.8093, + "step": 4920 + }, + { + "epoch": 0.7970253011074286, + "grad_norm": 0.5126531720161438, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 4930 + }, + { + "epoch": 0.798641985288174, + "grad_norm": 0.513648271560669, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 4940 + }, + { + "epoch": 0.8002586694689192, + "grad_norm": 0.5350404381752014, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 4950 + }, + { + "epoch": 0.8018753536496646, + "grad_norm": 0.5731674432754517, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 4960 + }, + { + "epoch": 0.8034920378304098, + "grad_norm": 0.5974258184432983, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 4970 + }, + { + "epoch": 0.8051087220111551, + "grad_norm": 0.8774799704551697, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 4980 + }, + { + "epoch": 0.8067254061919004, + "grad_norm": 0.5994430184364319, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 4990 + }, + { + "epoch": 0.8083420903726457, + "grad_norm": 0.4894903004169464, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 5000 + }, + { + "epoch": 0.809958774553391, + "grad_norm": 0.5218459367752075, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5010 + }, + { + "epoch": 0.8115754587341363, + "grad_norm": 0.5232468843460083, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 5020 + }, + { + "epoch": 0.8131921429148816, + "grad_norm": 0.44358372688293457, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 5030 + }, + { + "epoch": 0.8148088270956269, + "grad_norm": 0.6202037334442139, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 5040 + }, + { + "epoch": 0.8164255112763722, + "grad_norm": 0.7721474170684814, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 5050 + }, + { + "epoch": 0.8180421954571174, + "grad_norm": 0.5568501353263855, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 5060 + }, + { + "epoch": 0.8196588796378628, + "grad_norm": 0.49148809909820557, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 5070 + }, + { + "epoch": 0.821275563818608, + "grad_norm": 0.4956012964248657, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 5080 + }, + { + "epoch": 0.8228922479993533, + "grad_norm": 0.6078833937644958, + "learning_rate": 0.0002, + "loss": 0.8201, + "step": 5090 + }, + { + "epoch": 0.8245089321800986, + "grad_norm": 0.46906954050064087, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 5100 + }, + { + "epoch": 0.8261256163608439, + "grad_norm": 0.50812166929245, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 5110 + }, + { + "epoch": 0.8277423005415891, + "grad_norm": 0.5319661498069763, + "learning_rate": 0.0002, + "loss": 0.8243, + "step": 5120 + }, + { + "epoch": 0.8293589847223345, + "grad_norm": 0.4949689209461212, + "learning_rate": 0.0002, + "loss": 0.7798, + "step": 5130 + }, + { + "epoch": 0.8309756689030798, + "grad_norm": 0.5151591300964355, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 5140 + }, + { + "epoch": 0.8325923530838251, + "grad_norm": 0.5530214309692383, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 5150 + }, + { + "epoch": 0.8342090372645704, + "grad_norm": 0.6297410130500793, + "learning_rate": 0.0002, + "loss": 0.8251, + "step": 5160 + }, + { + "epoch": 0.8358257214453156, + "grad_norm": 0.5466840267181396, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 5170 + }, + { + "epoch": 0.837442405626061, + "grad_norm": 0.652913510799408, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 5180 + }, + { + "epoch": 0.8390590898068062, + "grad_norm": 0.5811293125152588, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 5190 + }, + { + "epoch": 0.8406757739875516, + "grad_norm": 0.5109550952911377, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 5200 + }, + { + "epoch": 0.8422924581682968, + "grad_norm": 0.4551706612110138, + "learning_rate": 0.0002, + "loss": 0.8077, + "step": 5210 + }, + { + "epoch": 0.8439091423490421, + "grad_norm": 0.5813754200935364, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5220 + }, + { + "epoch": 0.8455258265297874, + "grad_norm": 0.5856947898864746, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 5230 + }, + { + "epoch": 0.8471425107105327, + "grad_norm": 0.5482739210128784, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 5240 + }, + { + "epoch": 0.8487591948912779, + "grad_norm": 0.49023720622062683, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 5250 + }, + { + "epoch": 0.8503758790720233, + "grad_norm": 0.49472475051879883, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5260 + }, + { + "epoch": 0.8519925632527686, + "grad_norm": 0.5490226745605469, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 5270 + }, + { + "epoch": 0.8536092474335139, + "grad_norm": 0.5340665578842163, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 5280 + }, + { + "epoch": 0.8552259316142592, + "grad_norm": 0.5962483882904053, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 5290 + }, + { + "epoch": 0.8568426157950044, + "grad_norm": 0.586358368396759, + "learning_rate": 0.0002, + "loss": 0.8765, + "step": 5300 + }, + { + "epoch": 0.8584592999757498, + "grad_norm": 0.49120277166366577, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 5310 + }, + { + "epoch": 0.860075984156495, + "grad_norm": 0.5887332558631897, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 5320 + }, + { + "epoch": 0.8616926683372403, + "grad_norm": 0.42496153712272644, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 5330 + }, + { + "epoch": 0.8633093525179856, + "grad_norm": 0.5489874482154846, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 5340 + }, + { + "epoch": 0.8649260366987309, + "grad_norm": 0.5850813984870911, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 5350 + }, + { + "epoch": 0.8665427208794761, + "grad_norm": 0.517487108707428, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 5360 + }, + { + "epoch": 0.8681594050602215, + "grad_norm": 0.5339142680168152, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 5370 + }, + { + "epoch": 0.8697760892409668, + "grad_norm": 0.6236387491226196, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 5380 + }, + { + "epoch": 0.8713927734217121, + "grad_norm": 0.5752192735671997, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 5390 + }, + { + "epoch": 0.8730094576024574, + "grad_norm": 0.6724614500999451, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 5400 + }, + { + "epoch": 0.8746261417832026, + "grad_norm": 0.5280613303184509, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 5410 + }, + { + "epoch": 0.876242825963948, + "grad_norm": 0.44033288955688477, + "learning_rate": 0.0002, + "loss": 0.8231, + "step": 5420 + }, + { + "epoch": 0.8778595101446932, + "grad_norm": 0.5199708342552185, + "learning_rate": 0.0002, + "loss": 0.8839, + "step": 5430 + }, + { + "epoch": 0.8794761943254386, + "grad_norm": 0.46778348088264465, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 5440 + }, + { + "epoch": 0.8810928785061838, + "grad_norm": 0.4657754898071289, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 5450 + }, + { + "epoch": 0.8827095626869291, + "grad_norm": 0.5472902655601501, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 5460 + }, + { + "epoch": 0.8843262468676744, + "grad_norm": 0.4876766800880432, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 5470 + }, + { + "epoch": 0.8859429310484197, + "grad_norm": 0.5057248473167419, + "learning_rate": 0.0002, + "loss": 0.7906, + "step": 5480 + }, + { + "epoch": 0.8875596152291649, + "grad_norm": 0.4637320637702942, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 5490 + }, + { + "epoch": 0.8891762994099103, + "grad_norm": 0.471955806016922, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 5500 + }, + { + "epoch": 0.8907929835906556, + "grad_norm": 0.5209813714027405, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 5510 + }, + { + "epoch": 0.8924096677714008, + "grad_norm": 0.6213834285736084, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 5520 + }, + { + "epoch": 0.8940263519521462, + "grad_norm": 0.5215408205986023, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 5530 + }, + { + "epoch": 0.8956430361328914, + "grad_norm": 0.580478310585022, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 5540 + }, + { + "epoch": 0.8972597203136368, + "grad_norm": 0.49102169275283813, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 5550 + }, + { + "epoch": 0.898876404494382, + "grad_norm": 0.6043479442596436, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 5560 + }, + { + "epoch": 0.9004930886751273, + "grad_norm": 0.5636463165283203, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 5570 + }, + { + "epoch": 0.9021097728558726, + "grad_norm": 0.5620124340057373, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 5580 + }, + { + "epoch": 0.9037264570366179, + "grad_norm": 0.5206354856491089, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 5590 + }, + { + "epoch": 0.9053431412173631, + "grad_norm": 0.5798229575157166, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 5600 + }, + { + "epoch": 0.9069598253981085, + "grad_norm": 0.6428212523460388, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 5610 + }, + { + "epoch": 0.9085765095788537, + "grad_norm": 0.48064687848091125, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 5620 + }, + { + "epoch": 0.9101931937595991, + "grad_norm": 0.6347860097885132, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 5630 + }, + { + "epoch": 0.9118098779403444, + "grad_norm": 0.5353913307189941, + "learning_rate": 0.0002, + "loss": 0.851, + "step": 5640 + }, + { + "epoch": 0.9134265621210896, + "grad_norm": 0.5323944091796875, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 5650 + }, + { + "epoch": 0.915043246301835, + "grad_norm": 0.5261843204498291, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 5660 + }, + { + "epoch": 0.9166599304825802, + "grad_norm": 0.5451326966285706, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 5670 + }, + { + "epoch": 0.9182766146633256, + "grad_norm": 0.5183324217796326, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 5680 + }, + { + "epoch": 0.9198932988440708, + "grad_norm": 0.47229018807411194, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 5690 + }, + { + "epoch": 0.9215099830248161, + "grad_norm": 0.49180513620376587, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 5700 + }, + { + "epoch": 0.9231266672055614, + "grad_norm": 0.5419785380363464, + "learning_rate": 0.0002, + "loss": 0.8735, + "step": 5710 + }, + { + "epoch": 0.9247433513863067, + "grad_norm": 0.5408698916435242, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 5720 + }, + { + "epoch": 0.9263600355670519, + "grad_norm": 0.5286232829093933, + "learning_rate": 0.0002, + "loss": 0.7701, + "step": 5730 + }, + { + "epoch": 0.9279767197477973, + "grad_norm": 0.7539758086204529, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 5740 + }, + { + "epoch": 0.9295934039285425, + "grad_norm": 0.5166944861412048, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 5750 + }, + { + "epoch": 0.9312100881092878, + "grad_norm": 0.6601425409317017, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 5760 + }, + { + "epoch": 0.9328267722900332, + "grad_norm": 0.5029960870742798, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 5770 + }, + { + "epoch": 0.9344434564707784, + "grad_norm": 0.4926645755767822, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 5780 + }, + { + "epoch": 0.9360601406515238, + "grad_norm": 0.5739615559577942, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 5790 + }, + { + "epoch": 0.937676824832269, + "grad_norm": 0.5058279037475586, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 5800 + }, + { + "epoch": 0.9392935090130143, + "grad_norm": 0.5260962247848511, + "learning_rate": 0.0002, + "loss": 0.8537, + "step": 5810 + }, + { + "epoch": 0.9409101931937596, + "grad_norm": 0.5768588185310364, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 5820 + }, + { + "epoch": 0.9425268773745049, + "grad_norm": 0.5170126557350159, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 5830 + }, + { + "epoch": 0.9441435615552501, + "grad_norm": 0.5745864510536194, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 5840 + }, + { + "epoch": 0.9457602457359955, + "grad_norm": 0.5551357865333557, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 5850 + }, + { + "epoch": 0.9473769299167407, + "grad_norm": 0.5776078701019287, + "learning_rate": 0.0002, + "loss": 0.8529, + "step": 5860 + }, + { + "epoch": 0.9489936140974861, + "grad_norm": 0.5340062379837036, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 5870 + }, + { + "epoch": 0.9506102982782314, + "grad_norm": 0.6447290182113647, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 5880 + }, + { + "epoch": 0.9522269824589766, + "grad_norm": 0.5123815536499023, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 5890 + }, + { + "epoch": 0.953843666639722, + "grad_norm": 0.48547613620758057, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 5900 + }, + { + "epoch": 0.9554603508204672, + "grad_norm": 0.5791414976119995, + "learning_rate": 0.0002, + "loss": 0.7896, + "step": 5910 + }, + { + "epoch": 0.9570770350012126, + "grad_norm": 0.6195011734962463, + "learning_rate": 0.0002, + "loss": 0.8408, + "step": 5920 + }, + { + "epoch": 0.9586937191819578, + "grad_norm": 0.6323803067207336, + "learning_rate": 0.0002, + "loss": 0.7805, + "step": 5930 + }, + { + "epoch": 0.9603104033627031, + "grad_norm": 0.45552879571914673, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 5940 + }, + { + "epoch": 0.9619270875434484, + "grad_norm": 0.5796473622322083, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 5950 + }, + { + "epoch": 0.9635437717241937, + "grad_norm": 0.647261381149292, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 5960 + }, + { + "epoch": 0.9651604559049389, + "grad_norm": 0.5487682819366455, + "learning_rate": 0.0002, + "loss": 0.8086, + "step": 5970 + }, + { + "epoch": 0.9667771400856843, + "grad_norm": 0.5743663907051086, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 5980 + }, + { + "epoch": 0.9683938242664295, + "grad_norm": 0.5470591187477112, + "learning_rate": 0.0002, + "loss": 0.8153, + "step": 5990 + }, + { + "epoch": 0.9700105084471748, + "grad_norm": 0.5901660323143005, + "learning_rate": 0.0002, + "loss": 0.8119, + "step": 6000 + }, + { + "epoch": 0.9716271926279202, + "grad_norm": 0.6544759273529053, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 6010 + }, + { + "epoch": 0.9732438768086654, + "grad_norm": 0.6288470029830933, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 6020 + }, + { + "epoch": 0.9748605609894108, + "grad_norm": 0.673153817653656, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 6030 + }, + { + "epoch": 0.976477245170156, + "grad_norm": 0.42854753136634827, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 6040 + }, + { + "epoch": 0.9780939293509013, + "grad_norm": 0.5227066278457642, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 6050 + }, + { + "epoch": 0.9797106135316466, + "grad_norm": 0.5372416973114014, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 6060 + }, + { + "epoch": 0.9813272977123919, + "grad_norm": 0.6026402115821838, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 6070 + }, + { + "epoch": 0.9829439818931371, + "grad_norm": 0.49547791481018066, + "learning_rate": 0.0002, + "loss": 0.8347, + "step": 6080 + }, + { + "epoch": 0.9845606660738825, + "grad_norm": 0.4641951322555542, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 6090 + }, + { + "epoch": 0.9861773502546277, + "grad_norm": 0.5818535089492798, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 6100 + }, + { + "epoch": 0.9877940344353731, + "grad_norm": 0.63955157995224, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 6110 + }, + { + "epoch": 0.9894107186161183, + "grad_norm": 0.5649438500404358, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 6120 + }, + { + "epoch": 0.9910274027968636, + "grad_norm": 0.5290433168411255, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 6130 + }, + { + "epoch": 0.992644086977609, + "grad_norm": 0.6399374008178711, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 6140 + }, + { + "epoch": 0.9942607711583542, + "grad_norm": 0.6736576557159424, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 6150 + }, + { + "epoch": 0.9958774553390995, + "grad_norm": 0.515420138835907, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 6160 + }, + { + "epoch": 0.9974941395198448, + "grad_norm": 0.562677800655365, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 6170 + }, + { + "epoch": 0.9991108237005901, + "grad_norm": 0.7113858461380005, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 6180 + }, + { + "epoch": 0.9999191657909627, + "eval_loss": 1.0871200561523438, + "eval_runtime": 122.2071, + "eval_samples_per_second": 5.998, + "eval_steps_per_second": 0.753, + "step": 6185 + }, + { + "epoch": 1.0007275078813354, + "grad_norm": 0.7111801505088806, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 6190 + }, + { + "epoch": 1.0023441920620806, + "grad_norm": 0.5402125716209412, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 6200 + }, + { + "epoch": 1.003960876242826, + "grad_norm": 0.6098830103874207, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 6210 + }, + { + "epoch": 1.0055775604235713, + "grad_norm": 0.5829983353614807, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 6220 + }, + { + "epoch": 1.0071942446043165, + "grad_norm": 0.5614621043205261, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 6230 + }, + { + "epoch": 1.0088109287850617, + "grad_norm": 0.5954238772392273, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 6240 + }, + { + "epoch": 1.0104276129658072, + "grad_norm": 0.6480574607849121, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 6250 + }, + { + "epoch": 1.0120442971465524, + "grad_norm": 0.6051128506660461, + "learning_rate": 0.0002, + "loss": 0.7514, + "step": 6260 + }, + { + "epoch": 1.0136609813272976, + "grad_norm": 0.6318870782852173, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 6270 + }, + { + "epoch": 1.015277665508043, + "grad_norm": 0.5048980116844177, + "learning_rate": 0.0002, + "loss": 0.7178, + "step": 6280 + }, + { + "epoch": 1.0168943496887883, + "grad_norm": 0.6346936225891113, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 6290 + }, + { + "epoch": 1.0185110338695336, + "grad_norm": 0.5711665749549866, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 6300 + }, + { + "epoch": 1.0201277180502788, + "grad_norm": 0.5175361037254333, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 6310 + }, + { + "epoch": 1.0217444022310243, + "grad_norm": 0.5360831618309021, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 6320 + }, + { + "epoch": 1.0233610864117695, + "grad_norm": 0.614675760269165, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 6330 + }, + { + "epoch": 1.0249777705925147, + "grad_norm": 0.5626118183135986, + "learning_rate": 0.0002, + "loss": 0.7748, + "step": 6340 + }, + { + "epoch": 1.02659445477326, + "grad_norm": 0.574897289276123, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 6350 + }, + { + "epoch": 1.0282111389540054, + "grad_norm": 0.7185447812080383, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 6360 + }, + { + "epoch": 1.0298278231347506, + "grad_norm": 0.6705799698829651, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 6370 + }, + { + "epoch": 1.0314445073154959, + "grad_norm": 0.6740428805351257, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 6380 + }, + { + "epoch": 1.0330611914962413, + "grad_norm": 0.663902759552002, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 6390 + }, + { + "epoch": 1.0346778756769865, + "grad_norm": 0.5029543042182922, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 6400 + }, + { + "epoch": 1.0362945598577318, + "grad_norm": 0.7813863158226013, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 6410 + }, + { + "epoch": 1.037911244038477, + "grad_norm": 0.5396282076835632, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 6420 + }, + { + "epoch": 1.0395279282192225, + "grad_norm": 0.5253293514251709, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 6430 + }, + { + "epoch": 1.0411446123999677, + "grad_norm": 0.7236770987510681, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 6440 + }, + { + "epoch": 1.042761296580713, + "grad_norm": 0.5670917630195618, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 6450 + }, + { + "epoch": 1.0443779807614582, + "grad_norm": 0.6031978726387024, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 6460 + }, + { + "epoch": 1.0459946649422036, + "grad_norm": 0.5309213399887085, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 6470 + }, + { + "epoch": 1.0476113491229488, + "grad_norm": 0.7114651799201965, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 6480 + }, + { + "epoch": 1.049228033303694, + "grad_norm": 0.5591610670089722, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 6490 + }, + { + "epoch": 1.0508447174844395, + "grad_norm": 0.5185961127281189, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 6500 + }, + { + "epoch": 1.0524614016651848, + "grad_norm": 0.6510552167892456, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 6510 + }, + { + "epoch": 1.05407808584593, + "grad_norm": 0.6557928919792175, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 6520 + }, + { + "epoch": 1.0556947700266752, + "grad_norm": 0.6973192691802979, + "learning_rate": 0.0002, + "loss": 0.8056, + "step": 6530 + }, + { + "epoch": 1.0573114542074207, + "grad_norm": 0.6226583123207092, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 6540 + }, + { + "epoch": 1.058928138388166, + "grad_norm": 0.5633195638656616, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 6550 + }, + { + "epoch": 1.0605448225689111, + "grad_norm": 0.7466658353805542, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 6560 + }, + { + "epoch": 1.0621615067496564, + "grad_norm": 0.6462772488594055, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 6570 + }, + { + "epoch": 1.0637781909304018, + "grad_norm": 0.5266856551170349, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 6580 + }, + { + "epoch": 1.065394875111147, + "grad_norm": 0.534392774105072, + "learning_rate": 0.0002, + "loss": 0.7157, + "step": 6590 + }, + { + "epoch": 1.0670115592918923, + "grad_norm": 0.7514177560806274, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 6600 + }, + { + "epoch": 1.0686282434726375, + "grad_norm": 0.7593035697937012, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 6610 + }, + { + "epoch": 1.070244927653383, + "grad_norm": 0.5277858972549438, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 6620 + }, + { + "epoch": 1.0718616118341282, + "grad_norm": 0.5573670268058777, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 6630 + }, + { + "epoch": 1.0734782960148734, + "grad_norm": 0.6802396774291992, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 6640 + }, + { + "epoch": 1.0750949801956189, + "grad_norm": 0.7367215752601624, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 6650 + }, + { + "epoch": 1.0767116643763641, + "grad_norm": 0.5961891412734985, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 6660 + }, + { + "epoch": 1.0783283485571094, + "grad_norm": 0.5736313462257385, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 6670 + }, + { + "epoch": 1.0799450327378546, + "grad_norm": 0.619219183921814, + "learning_rate": 0.0002, + "loss": 0.7178, + "step": 6680 + }, + { + "epoch": 1.0815617169186, + "grad_norm": 0.6214390993118286, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 6690 + }, + { + "epoch": 1.0831784010993453, + "grad_norm": 0.564536988735199, + "learning_rate": 0.0002, + "loss": 0.7554, + "step": 6700 + }, + { + "epoch": 1.0847950852800905, + "grad_norm": 0.5838140249252319, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 6710 + }, + { + "epoch": 1.0864117694608357, + "grad_norm": 0.7000553607940674, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 6720 + }, + { + "epoch": 1.0880284536415812, + "grad_norm": 0.7078263759613037, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 6730 + }, + { + "epoch": 1.0896451378223264, + "grad_norm": 0.8353848457336426, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 6740 + }, + { + "epoch": 1.0912618220030716, + "grad_norm": 0.5615518689155579, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 6750 + }, + { + "epoch": 1.0928785061838169, + "grad_norm": 0.5475581288337708, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 6760 + }, + { + "epoch": 1.0944951903645623, + "grad_norm": 0.5835978388786316, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 6770 + }, + { + "epoch": 1.0961118745453076, + "grad_norm": 0.5516105890274048, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 6780 + }, + { + "epoch": 1.0977285587260528, + "grad_norm": 0.5875251889228821, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 6790 + }, + { + "epoch": 1.0993452429067982, + "grad_norm": 0.7376947999000549, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 6800 + }, + { + "epoch": 1.1009619270875435, + "grad_norm": 0.5656165480613708, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 6810 + }, + { + "epoch": 1.1025786112682887, + "grad_norm": 0.6365954279899597, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 6820 + }, + { + "epoch": 1.104195295449034, + "grad_norm": 0.5033080577850342, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 6830 + }, + { + "epoch": 1.1058119796297794, + "grad_norm": 0.617396891117096, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 6840 + }, + { + "epoch": 1.1074286638105246, + "grad_norm": 0.6395374536514282, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 6850 + }, + { + "epoch": 1.1090453479912699, + "grad_norm": 0.6775295734405518, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 6860 + }, + { + "epoch": 1.1106620321720153, + "grad_norm": 0.6655223965644836, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 6870 + }, + { + "epoch": 1.1122787163527605, + "grad_norm": 0.676655113697052, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 6880 + }, + { + "epoch": 1.1138954005335058, + "grad_norm": 0.6062718629837036, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 6890 + }, + { + "epoch": 1.115512084714251, + "grad_norm": 0.590943455696106, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 6900 + }, + { + "epoch": 1.1171287688949965, + "grad_norm": 0.6315317153930664, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 6910 + }, + { + "epoch": 1.1187454530757417, + "grad_norm": 0.47979024052619934, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 6920 + }, + { + "epoch": 1.120362137256487, + "grad_norm": 0.647298276424408, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 6930 + }, + { + "epoch": 1.1219788214372322, + "grad_norm": 0.7336484789848328, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 6940 + }, + { + "epoch": 1.1235955056179776, + "grad_norm": 0.5071424245834351, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 6950 + }, + { + "epoch": 1.1252121897987228, + "grad_norm": 0.6527144312858582, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 6960 + }, + { + "epoch": 1.126828873979468, + "grad_norm": 0.6935935020446777, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 6970 + }, + { + "epoch": 1.1284455581602133, + "grad_norm": 0.8026931881904602, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 6980 + }, + { + "epoch": 1.1300622423409588, + "grad_norm": 0.5210393667221069, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 6990 + }, + { + "epoch": 1.131678926521704, + "grad_norm": 0.60475093126297, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 7000 + }, + { + "epoch": 1.1332956107024492, + "grad_norm": 0.6417073607444763, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 7010 + }, + { + "epoch": 1.1349122948831947, + "grad_norm": 0.6732175946235657, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 7020 + }, + { + "epoch": 1.13652897906394, + "grad_norm": 0.6719491481781006, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 7030 + }, + { + "epoch": 1.1381456632446851, + "grad_norm": 0.5708295106887817, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 7040 + }, + { + "epoch": 1.1397623474254304, + "grad_norm": 0.7141719460487366, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 7050 + }, + { + "epoch": 1.1413790316061758, + "grad_norm": 0.6187017560005188, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 7060 + }, + { + "epoch": 1.142995715786921, + "grad_norm": 0.50581294298172, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 7070 + }, + { + "epoch": 1.1446123999676663, + "grad_norm": 0.5620143413543701, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 7080 + }, + { + "epoch": 1.1462290841484115, + "grad_norm": 0.6231929659843445, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 7090 + }, + { + "epoch": 1.147845768329157, + "grad_norm": 0.5775774121284485, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 7100 + }, + { + "epoch": 1.1494624525099022, + "grad_norm": 0.6492809653282166, + "learning_rate": 0.0002, + "loss": 0.7728, + "step": 7110 + }, + { + "epoch": 1.1510791366906474, + "grad_norm": 0.6434972286224365, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 7120 + }, + { + "epoch": 1.1526958208713927, + "grad_norm": 0.6191812753677368, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 7130 + }, + { + "epoch": 1.1543125050521381, + "grad_norm": 0.6690331697463989, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 7140 + }, + { + "epoch": 1.1559291892328833, + "grad_norm": 0.5977938175201416, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 7150 + }, + { + "epoch": 1.1575458734136286, + "grad_norm": 0.6195854544639587, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 7160 + }, + { + "epoch": 1.159162557594374, + "grad_norm": 0.5752048492431641, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 7170 + }, + { + "epoch": 1.1607792417751193, + "grad_norm": 0.589081883430481, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 7180 + }, + { + "epoch": 1.1623959259558645, + "grad_norm": 0.756996750831604, + "learning_rate": 0.0002, + "loss": 0.704, + "step": 7190 + }, + { + "epoch": 1.1640126101366097, + "grad_norm": 0.7614967226982117, + "learning_rate": 0.0002, + "loss": 0.7404, + "step": 7200 + }, + { + "epoch": 1.1656292943173552, + "grad_norm": 0.6120437979698181, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 7210 + }, + { + "epoch": 1.1672459784981004, + "grad_norm": 0.6210004687309265, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 7220 + }, + { + "epoch": 1.1688626626788456, + "grad_norm": 0.6044116020202637, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 7230 + }, + { + "epoch": 1.170479346859591, + "grad_norm": 0.5418457388877869, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 7240 + }, + { + "epoch": 1.1720960310403363, + "grad_norm": 0.6413537263870239, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 7250 + }, + { + "epoch": 1.1737127152210816, + "grad_norm": 0.5777867436408997, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 7260 + }, + { + "epoch": 1.1753293994018268, + "grad_norm": 0.7092402577400208, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 7270 + }, + { + "epoch": 1.176946083582572, + "grad_norm": 0.6351709365844727, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 7280 + }, + { + "epoch": 1.1785627677633175, + "grad_norm": 0.6172189712524414, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 7290 + }, + { + "epoch": 1.1801794519440627, + "grad_norm": 0.6801714897155762, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 7300 + }, + { + "epoch": 1.181796136124808, + "grad_norm": 0.6044712066650391, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 7310 + }, + { + "epoch": 1.1834128203055534, + "grad_norm": 0.7413212060928345, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 7320 + }, + { + "epoch": 1.1850295044862986, + "grad_norm": 0.5303856134414673, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 7330 + }, + { + "epoch": 1.1866461886670439, + "grad_norm": 0.5647098422050476, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 7340 + }, + { + "epoch": 1.188262872847789, + "grad_norm": 0.7374135255813599, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 7350 + }, + { + "epoch": 1.1898795570285345, + "grad_norm": 0.5710089206695557, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 7360 + }, + { + "epoch": 1.1914962412092798, + "grad_norm": 0.6073619723320007, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7370 + }, + { + "epoch": 1.193112925390025, + "grad_norm": 0.5899916887283325, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 7380 + }, + { + "epoch": 1.1947296095707705, + "grad_norm": 0.7762434482574463, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 7390 + }, + { + "epoch": 1.1963462937515157, + "grad_norm": 0.679949939250946, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 7400 + }, + { + "epoch": 1.197962977932261, + "grad_norm": 0.6106849312782288, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 7410 + }, + { + "epoch": 1.1995796621130062, + "grad_norm": 0.682461678981781, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 7420 + }, + { + "epoch": 1.2011963462937516, + "grad_norm": 0.6087017059326172, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 7430 + }, + { + "epoch": 1.2028130304744968, + "grad_norm": 0.63739013671875, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 7440 + }, + { + "epoch": 1.204429714655242, + "grad_norm": 0.6154777407646179, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 7450 + }, + { + "epoch": 1.2060463988359873, + "grad_norm": 0.7491534948348999, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 7460 + }, + { + "epoch": 1.2076630830167328, + "grad_norm": 0.6664797067642212, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 7470 + }, + { + "epoch": 1.209279767197478, + "grad_norm": 0.6660266518592834, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 7480 + }, + { + "epoch": 1.2108964513782232, + "grad_norm": 0.6972551345825195, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 7490 + }, + { + "epoch": 1.2125131355589684, + "grad_norm": 0.6157945990562439, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 7500 + }, + { + "epoch": 1.214129819739714, + "grad_norm": 0.5199310183525085, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 7510 + }, + { + "epoch": 1.2157465039204591, + "grad_norm": 0.577610433101654, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 7520 + }, + { + "epoch": 1.2173631881012044, + "grad_norm": 0.53652423620224, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 7530 + }, + { + "epoch": 1.2189798722819498, + "grad_norm": 0.6479050517082214, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 7540 + }, + { + "epoch": 1.220596556462695, + "grad_norm": 0.618748128414154, + "learning_rate": 0.0002, + "loss": 0.7534, + "step": 7550 + }, + { + "epoch": 1.2222132406434403, + "grad_norm": 0.6311424374580383, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 7560 + }, + { + "epoch": 1.2238299248241855, + "grad_norm": 0.6595825552940369, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 7570 + }, + { + "epoch": 1.225446609004931, + "grad_norm": 0.5198960900306702, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 7580 + }, + { + "epoch": 1.2270632931856762, + "grad_norm": 0.578650712966919, + "learning_rate": 0.0002, + "loss": 0.674, + "step": 7590 + }, + { + "epoch": 1.2286799773664214, + "grad_norm": 0.6080220937728882, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 7600 + }, + { + "epoch": 1.2302966615471669, + "grad_norm": 0.7050248384475708, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 7610 + }, + { + "epoch": 1.2319133457279121, + "grad_norm": 0.6652196049690247, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 7620 + }, + { + "epoch": 1.2335300299086573, + "grad_norm": 0.7322776317596436, + "learning_rate": 0.0002, + "loss": 0.7085, + "step": 7630 + }, + { + "epoch": 1.2351467140894026, + "grad_norm": 0.4998728036880493, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 7640 + }, + { + "epoch": 1.2367633982701478, + "grad_norm": 0.6428788900375366, + "learning_rate": 0.0002, + "loss": 0.7214, + "step": 7650 + }, + { + "epoch": 1.2383800824508933, + "grad_norm": 0.585242509841919, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 7660 + }, + { + "epoch": 1.2399967666316385, + "grad_norm": 0.5211917757987976, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 7670 + }, + { + "epoch": 1.2416134508123837, + "grad_norm": 0.6490384340286255, + "learning_rate": 0.0002, + "loss": 0.746, + "step": 7680 + }, + { + "epoch": 1.2432301349931292, + "grad_norm": 0.6249763369560242, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 7690 + }, + { + "epoch": 1.2448468191738744, + "grad_norm": 0.71870356798172, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 7700 + }, + { + "epoch": 1.2464635033546196, + "grad_norm": 0.6761967539787292, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 7710 + }, + { + "epoch": 1.2480801875353649, + "grad_norm": 0.6500617265701294, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 7720 + }, + { + "epoch": 1.2496968717161103, + "grad_norm": 0.8069869875907898, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 7730 + }, + { + "epoch": 1.2513135558968556, + "grad_norm": 0.6044608950614929, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 7740 + }, + { + "epoch": 1.2529302400776008, + "grad_norm": 0.6573283076286316, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 7750 + }, + { + "epoch": 1.2545469242583462, + "grad_norm": 0.625430166721344, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 7760 + }, + { + "epoch": 1.2561636084390915, + "grad_norm": 0.5442022681236267, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 7770 + }, + { + "epoch": 1.2577802926198367, + "grad_norm": 0.6818386912345886, + "learning_rate": 0.0002, + "loss": 0.7172, + "step": 7780 + }, + { + "epoch": 1.259396976800582, + "grad_norm": 0.6381874084472656, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 7790 + }, + { + "epoch": 1.2610136609813272, + "grad_norm": 0.6269212961196899, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 7800 + }, + { + "epoch": 1.2626303451620726, + "grad_norm": 0.600121259689331, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 7810 + }, + { + "epoch": 1.2642470293428179, + "grad_norm": 0.6337703466415405, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 7820 + }, + { + "epoch": 1.2658637135235633, + "grad_norm": 0.7234963774681091, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 7830 + }, + { + "epoch": 1.2674803977043085, + "grad_norm": 0.800184965133667, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 7840 + }, + { + "epoch": 1.2690970818850538, + "grad_norm": 0.7539464831352234, + "learning_rate": 0.0002, + "loss": 0.7426, + "step": 7850 + }, + { + "epoch": 1.270713766065799, + "grad_norm": 0.5493760704994202, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 7860 + }, + { + "epoch": 1.2723304502465442, + "grad_norm": 0.7477145791053772, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 7870 + }, + { + "epoch": 1.2739471344272897, + "grad_norm": 0.6366362571716309, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 7880 + }, + { + "epoch": 1.275563818608035, + "grad_norm": 0.7419533729553223, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 7890 + }, + { + "epoch": 1.2771805027887801, + "grad_norm": 0.6141223311424255, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 7900 + }, + { + "epoch": 1.2787971869695256, + "grad_norm": 0.7522598505020142, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 7910 + }, + { + "epoch": 1.2804138711502708, + "grad_norm": 0.6935804486274719, + "learning_rate": 0.0002, + "loss": 0.7097, + "step": 7920 + }, + { + "epoch": 1.282030555331016, + "grad_norm": 0.7239290475845337, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 7930 + }, + { + "epoch": 1.2836472395117613, + "grad_norm": 0.8800187110900879, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 7940 + }, + { + "epoch": 1.2852639236925067, + "grad_norm": 0.540458083152771, + "learning_rate": 0.0002, + "loss": 0.6991, + "step": 7950 + }, + { + "epoch": 1.286880607873252, + "grad_norm": 0.6492934226989746, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 7960 + }, + { + "epoch": 1.2884972920539972, + "grad_norm": 0.6543959379196167, + "learning_rate": 0.0002, + "loss": 0.7742, + "step": 7970 + }, + { + "epoch": 1.2901139762347427, + "grad_norm": 0.5804705619812012, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 7980 + }, + { + "epoch": 1.291730660415488, + "grad_norm": 0.7074727416038513, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 7990 + }, + { + "epoch": 1.2933473445962331, + "grad_norm": 0.5347974300384521, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 8000 + }, + { + "epoch": 1.2949640287769784, + "grad_norm": 0.6457298398017883, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 8010 + }, + { + "epoch": 1.2965807129577236, + "grad_norm": 0.6407219171524048, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 8020 + }, + { + "epoch": 1.298197397138469, + "grad_norm": 0.828439474105835, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 8030 + }, + { + "epoch": 1.2998140813192143, + "grad_norm": 0.4840380549430847, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 8040 + }, + { + "epoch": 1.3014307654999595, + "grad_norm": 0.5921024680137634, + "learning_rate": 0.0002, + "loss": 0.7283, + "step": 8050 + }, + { + "epoch": 1.303047449680705, + "grad_norm": 0.6170315146446228, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 8060 + }, + { + "epoch": 1.3046641338614502, + "grad_norm": 0.5374847054481506, + "learning_rate": 0.0002, + "loss": 0.7534, + "step": 8070 + }, + { + "epoch": 1.3062808180421954, + "grad_norm": 0.545758068561554, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 8080 + }, + { + "epoch": 1.3078975022229407, + "grad_norm": 0.55641770362854, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 8090 + }, + { + "epoch": 1.309514186403686, + "grad_norm": 0.6724897027015686, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8100 + }, + { + "epoch": 1.3111308705844313, + "grad_norm": 0.6923972368240356, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 8110 + }, + { + "epoch": 1.3127475547651766, + "grad_norm": 0.5136841535568237, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 8120 + }, + { + "epoch": 1.314364238945922, + "grad_norm": 0.6766283512115479, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 8130 + }, + { + "epoch": 1.3159809231266673, + "grad_norm": 0.6283926367759705, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 8140 + }, + { + "epoch": 1.3175976073074125, + "grad_norm": 0.644216001033783, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 8150 + }, + { + "epoch": 1.3192142914881577, + "grad_norm": 0.7827503085136414, + "learning_rate": 0.0002, + "loss": 0.7125, + "step": 8160 + }, + { + "epoch": 1.320830975668903, + "grad_norm": 0.6651390790939331, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 8170 + }, + { + "epoch": 1.3224476598496484, + "grad_norm": 0.5547412633895874, + "learning_rate": 0.0002, + "loss": 0.7778, + "step": 8180 + }, + { + "epoch": 1.3240643440303936, + "grad_norm": 0.6765179634094238, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 8190 + }, + { + "epoch": 1.325681028211139, + "grad_norm": 0.6822077035903931, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 8200 + }, + { + "epoch": 1.3272977123918843, + "grad_norm": 0.5941002368927002, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 8210 + }, + { + "epoch": 1.3289143965726296, + "grad_norm": 0.4850037097930908, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 8220 + }, + { + "epoch": 1.3305310807533748, + "grad_norm": 0.6162990927696228, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 8230 + }, + { + "epoch": 1.33214776493412, + "grad_norm": 0.6665613651275635, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 8240 + }, + { + "epoch": 1.3337644491148655, + "grad_norm": 0.618192732334137, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 8250 + }, + { + "epoch": 1.3353811332956107, + "grad_norm": 0.710418701171875, + "learning_rate": 0.0002, + "loss": 0.7532, + "step": 8260 + }, + { + "epoch": 1.336997817476356, + "grad_norm": 0.5109876990318298, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 8270 + }, + { + "epoch": 1.3386145016571014, + "grad_norm": 0.6791711449623108, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 8280 + }, + { + "epoch": 1.3402311858378466, + "grad_norm": 0.6836432814598083, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8290 + }, + { + "epoch": 1.3418478700185918, + "grad_norm": 0.5579386353492737, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8300 + }, + { + "epoch": 1.343464554199337, + "grad_norm": 0.6713546514511108, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 8310 + }, + { + "epoch": 1.3450812383800825, + "grad_norm": 0.5353720188140869, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 8320 + }, + { + "epoch": 1.3466979225608278, + "grad_norm": 0.5813682675361633, + "learning_rate": 0.0002, + "loss": 0.718, + "step": 8330 + }, + { + "epoch": 1.348314606741573, + "grad_norm": 0.8158791661262512, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 8340 + }, + { + "epoch": 1.3499312909223184, + "grad_norm": 0.6193785071372986, + "learning_rate": 0.0002, + "loss": 0.6992, + "step": 8350 + }, + { + "epoch": 1.3515479751030637, + "grad_norm": 0.6353939771652222, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 8360 + }, + { + "epoch": 1.353164659283809, + "grad_norm": 0.6925048232078552, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 8370 + }, + { + "epoch": 1.3547813434645541, + "grad_norm": 0.988264799118042, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 8380 + }, + { + "epoch": 1.3563980276452994, + "grad_norm": 0.6476002931594849, + "learning_rate": 0.0002, + "loss": 0.7744, + "step": 8390 + }, + { + "epoch": 1.3580147118260448, + "grad_norm": 0.7120398879051208, + "learning_rate": 0.0002, + "loss": 0.776, + "step": 8400 + }, + { + "epoch": 1.35963139600679, + "grad_norm": 0.9048416614532471, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 8410 + }, + { + "epoch": 1.3612480801875353, + "grad_norm": 0.7000672817230225, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 8420 + }, + { + "epoch": 1.3628647643682807, + "grad_norm": 0.6015632152557373, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 8430 + }, + { + "epoch": 1.364481448549026, + "grad_norm": 0.612516462802887, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 8440 + }, + { + "epoch": 1.3660981327297712, + "grad_norm": 0.5969301462173462, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 8450 + }, + { + "epoch": 1.3677148169105164, + "grad_norm": 0.6730654239654541, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 8460 + }, + { + "epoch": 1.369331501091262, + "grad_norm": 0.6386392116546631, + "learning_rate": 0.0002, + "loss": 0.7465, + "step": 8470 + }, + { + "epoch": 1.3709481852720071, + "grad_norm": 0.739544153213501, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 8480 + }, + { + "epoch": 1.3725648694527524, + "grad_norm": 0.6462782621383667, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 8490 + }, + { + "epoch": 1.3741815536334978, + "grad_norm": 0.7346843481063843, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 8500 + }, + { + "epoch": 1.375798237814243, + "grad_norm": 0.6884821057319641, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 8510 + }, + { + "epoch": 1.3774149219949883, + "grad_norm": 0.6999333500862122, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 8520 + }, + { + "epoch": 1.3790316061757335, + "grad_norm": 0.5378713011741638, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 8530 + }, + { + "epoch": 1.3806482903564787, + "grad_norm": 0.5417906641960144, + "learning_rate": 0.0002, + "loss": 0.6797, + "step": 8540 + }, + { + "epoch": 1.3822649745372242, + "grad_norm": 0.6602526307106018, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 8550 + }, + { + "epoch": 1.3838816587179694, + "grad_norm": 0.7073674201965332, + "learning_rate": 0.0002, + "loss": 0.7356, + "step": 8560 + }, + { + "epoch": 1.3854983428987149, + "grad_norm": 0.5841707587242126, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 8570 + }, + { + "epoch": 1.38711502707946, + "grad_norm": 0.7031095027923584, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 8580 + }, + { + "epoch": 1.3887317112602053, + "grad_norm": 0.5198570489883423, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 8590 + }, + { + "epoch": 1.3903483954409506, + "grad_norm": 0.7261320352554321, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 8600 + }, + { + "epoch": 1.3919650796216958, + "grad_norm": 0.5616350173950195, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 8610 + }, + { + "epoch": 1.3935817638024413, + "grad_norm": 0.5185914635658264, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 8620 + }, + { + "epoch": 1.3951984479831865, + "grad_norm": 0.5814694762229919, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 8630 + }, + { + "epoch": 1.3968151321639317, + "grad_norm": 0.6977371573448181, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 8640 + }, + { + "epoch": 1.3984318163446772, + "grad_norm": 0.6855689883232117, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 8650 + }, + { + "epoch": 1.4000485005254224, + "grad_norm": 0.5414357781410217, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 8660 + }, + { + "epoch": 1.4016651847061676, + "grad_norm": 0.6970012784004211, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 8670 + }, + { + "epoch": 1.4032818688869129, + "grad_norm": 0.526079535484314, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 8680 + }, + { + "epoch": 1.404898553067658, + "grad_norm": 0.758712887763977, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 8690 + }, + { + "epoch": 1.4065152372484035, + "grad_norm": 0.7118762731552124, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 8700 + }, + { + "epoch": 1.4081319214291488, + "grad_norm": 0.5696909427642822, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 8710 + }, + { + "epoch": 1.4097486056098942, + "grad_norm": 0.7995436787605286, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 8720 + }, + { + "epoch": 1.4113652897906395, + "grad_norm": 0.7237521409988403, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 8730 + }, + { + "epoch": 1.4129819739713847, + "grad_norm": 0.744628369808197, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 8740 + }, + { + "epoch": 1.41459865815213, + "grad_norm": 0.6082926988601685, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 8750 + }, + { + "epoch": 1.4162153423328752, + "grad_norm": 0.5185243487358093, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 8760 + }, + { + "epoch": 1.4178320265136206, + "grad_norm": 0.5183082222938538, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 8770 + }, + { + "epoch": 1.4194487106943658, + "grad_norm": 0.7326041460037231, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 8780 + }, + { + "epoch": 1.421065394875111, + "grad_norm": 0.7174660563468933, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 8790 + }, + { + "epoch": 1.4226820790558565, + "grad_norm": 0.8080165982246399, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 8800 + }, + { + "epoch": 1.4242987632366018, + "grad_norm": 0.5061507821083069, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 8810 + }, + { + "epoch": 1.425915447417347, + "grad_norm": 0.801602840423584, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 8820 + }, + { + "epoch": 1.4275321315980922, + "grad_norm": 0.6150273084640503, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 8830 + }, + { + "epoch": 1.4291488157788377, + "grad_norm": 0.8786525726318359, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 8840 + }, + { + "epoch": 1.430765499959583, + "grad_norm": 0.6371538639068604, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 8850 + }, + { + "epoch": 1.4323821841403281, + "grad_norm": 0.6409295797348022, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 8860 + }, + { + "epoch": 1.4339988683210736, + "grad_norm": 0.6452359557151794, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 8870 + }, + { + "epoch": 1.4356155525018188, + "grad_norm": 0.5842334628105164, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 8880 + }, + { + "epoch": 1.437232236682564, + "grad_norm": 0.696761965751648, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 8890 + }, + { + "epoch": 1.4388489208633093, + "grad_norm": 0.6384600400924683, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 8900 + }, + { + "epoch": 1.4404656050440545, + "grad_norm": 0.5981136560440063, + "learning_rate": 0.0002, + "loss": 0.7049, + "step": 8910 + }, + { + "epoch": 1.4420822892248, + "grad_norm": 0.6355637907981873, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 8920 + }, + { + "epoch": 1.4436989734055452, + "grad_norm": 0.6374830603599548, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 8930 + }, + { + "epoch": 1.4453156575862904, + "grad_norm": 0.559013307094574, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 8940 + }, + { + "epoch": 1.446932341767036, + "grad_norm": 0.7289170026779175, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 8950 + }, + { + "epoch": 1.4485490259477811, + "grad_norm": 0.8649206757545471, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 8960 + }, + { + "epoch": 1.4501657101285264, + "grad_norm": 0.7664689421653748, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 8970 + }, + { + "epoch": 1.4517823943092716, + "grad_norm": 0.7109952569007874, + "learning_rate": 0.0002, + "loss": 0.7431, + "step": 8980 + }, + { + "epoch": 1.453399078490017, + "grad_norm": 0.6312844753265381, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 8990 + }, + { + "epoch": 1.4550157626707623, + "grad_norm": 0.6616617441177368, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 9000 + }, + { + "epoch": 1.4566324468515075, + "grad_norm": 0.7384068965911865, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 9010 + }, + { + "epoch": 1.458249131032253, + "grad_norm": 0.6549670100212097, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 9020 + }, + { + "epoch": 1.4598658152129982, + "grad_norm": 0.6254119277000427, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 9030 + }, + { + "epoch": 1.4614824993937434, + "grad_norm": 0.6806328892707825, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 9040 + }, + { + "epoch": 1.4630991835744886, + "grad_norm": 0.6803115010261536, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 9050 + }, + { + "epoch": 1.4647158677552339, + "grad_norm": 0.48529282212257385, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 9060 + }, + { + "epoch": 1.4663325519359793, + "grad_norm": 0.5995030999183655, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 9070 + }, + { + "epoch": 1.4679492361167246, + "grad_norm": 0.6005427837371826, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 9080 + }, + { + "epoch": 1.46956592029747, + "grad_norm": 0.718564510345459, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 9090 + }, + { + "epoch": 1.4711826044782153, + "grad_norm": 0.7003577351570129, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 9100 + }, + { + "epoch": 1.4727992886589605, + "grad_norm": 0.5888323783874512, + "learning_rate": 0.0002, + "loss": 0.8069, + "step": 9110 + }, + { + "epoch": 1.4744159728397057, + "grad_norm": 0.6417609453201294, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 9120 + }, + { + "epoch": 1.476032657020451, + "grad_norm": 0.572294294834137, + "learning_rate": 0.0002, + "loss": 0.7441, + "step": 9130 + }, + { + "epoch": 1.4776493412011964, + "grad_norm": 0.8200714588165283, + "learning_rate": 0.0002, + "loss": 0.8053, + "step": 9140 + }, + { + "epoch": 1.4792660253819416, + "grad_norm": 0.6343288421630859, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 9150 + }, + { + "epoch": 1.4808827095626869, + "grad_norm": 0.7017961144447327, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 9160 + }, + { + "epoch": 1.4824993937434323, + "grad_norm": 0.6202912926673889, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 9170 + }, + { + "epoch": 1.4841160779241775, + "grad_norm": 0.6677869558334351, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 9180 + }, + { + "epoch": 1.4857327621049228, + "grad_norm": 0.6052267551422119, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 9190 + }, + { + "epoch": 1.487349446285668, + "grad_norm": 0.6638872027397156, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 9200 + }, + { + "epoch": 1.4889661304664135, + "grad_norm": 0.6245523691177368, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 9210 + }, + { + "epoch": 1.4905828146471587, + "grad_norm": 0.5761767625808716, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 9220 + }, + { + "epoch": 1.492199498827904, + "grad_norm": 0.8175981640815735, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 9230 + }, + { + "epoch": 1.4938161830086494, + "grad_norm": 0.9144009947776794, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 9240 + }, + { + "epoch": 1.4954328671893946, + "grad_norm": 0.5742552876472473, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 9250 + }, + { + "epoch": 1.4970495513701398, + "grad_norm": 0.534534215927124, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 9260 + }, + { + "epoch": 1.498666235550885, + "grad_norm": 0.7836225032806396, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 9270 + }, + { + "epoch": 1.5002829197316303, + "grad_norm": 0.5292993187904358, + "learning_rate": 0.0002, + "loss": 0.7453, + "step": 9280 + }, + { + "epoch": 1.5018996039123758, + "grad_norm": 0.8044071793556213, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 9290 + }, + { + "epoch": 1.503516288093121, + "grad_norm": 0.6185805201530457, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 9300 + }, + { + "epoch": 1.5051329722738664, + "grad_norm": 0.6093607544898987, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 9310 + }, + { + "epoch": 1.5067496564546117, + "grad_norm": 0.5891730189323425, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 9320 + }, + { + "epoch": 1.508366340635357, + "grad_norm": 0.6331129670143127, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 9330 + }, + { + "epoch": 1.5099830248161021, + "grad_norm": 0.7690958380699158, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 9340 + }, + { + "epoch": 1.5115997089968474, + "grad_norm": 0.6548877358436584, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 9350 + }, + { + "epoch": 1.5132163931775926, + "grad_norm": 0.6545143127441406, + "learning_rate": 0.0002, + "loss": 0.7408, + "step": 9360 + }, + { + "epoch": 1.514833077358338, + "grad_norm": 0.553247332572937, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 9370 + }, + { + "epoch": 1.5164497615390833, + "grad_norm": 0.8145074844360352, + "learning_rate": 0.0002, + "loss": 0.7265, + "step": 9380 + }, + { + "epoch": 1.5180664457198287, + "grad_norm": 0.7636994123458862, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 9390 + }, + { + "epoch": 1.519683129900574, + "grad_norm": 0.6838982701301575, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 9400 + }, + { + "epoch": 1.5212998140813192, + "grad_norm": 0.8599441647529602, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 9410 + }, + { + "epoch": 1.5229164982620644, + "grad_norm": 0.7020329833030701, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 9420 + }, + { + "epoch": 1.5245331824428097, + "grad_norm": 0.6964772343635559, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 9430 + }, + { + "epoch": 1.5261498666235551, + "grad_norm": 0.6916600465774536, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 9440 + }, + { + "epoch": 1.5277665508043003, + "grad_norm": 0.7282621264457703, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 9450 + }, + { + "epoch": 1.5293832349850458, + "grad_norm": 0.5363983511924744, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 9460 + }, + { + "epoch": 1.530999919165791, + "grad_norm": 0.6184861063957214, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 9470 + }, + { + "epoch": 1.5326166033465363, + "grad_norm": 0.5991285443305969, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 9480 + }, + { + "epoch": 1.5342332875272815, + "grad_norm": 0.8176587820053101, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 9490 + }, + { + "epoch": 1.5358499717080267, + "grad_norm": 0.6473721861839294, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 9500 + }, + { + "epoch": 1.5374666558887722, + "grad_norm": 0.7319952845573425, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 9510 + }, + { + "epoch": 1.5390833400695174, + "grad_norm": 0.702900230884552, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 9520 + }, + { + "epoch": 1.5407000242502629, + "grad_norm": 0.7971600294113159, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 9530 + }, + { + "epoch": 1.542316708431008, + "grad_norm": 0.6527525186538696, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 9540 + }, + { + "epoch": 1.5439333926117533, + "grad_norm": 0.5791676044464111, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 9550 + }, + { + "epoch": 1.5455500767924986, + "grad_norm": 0.5619390606880188, + "learning_rate": 0.0002, + "loss": 0.7585, + "step": 9560 + }, + { + "epoch": 1.5471667609732438, + "grad_norm": 0.5701689124107361, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 9570 + }, + { + "epoch": 1.548783445153989, + "grad_norm": 0.47549352049827576, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 9580 + }, + { + "epoch": 1.5504001293347345, + "grad_norm": 0.8730611205101013, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 9590 + }, + { + "epoch": 1.5520168135154797, + "grad_norm": 0.6842091083526611, + "learning_rate": 0.0002, + "loss": 0.798, + "step": 9600 + }, + { + "epoch": 1.5536334976962252, + "grad_norm": 0.6675129532814026, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 9610 + }, + { + "epoch": 1.5552501818769704, + "grad_norm": 0.8173956274986267, + "learning_rate": 0.0002, + "loss": 0.7954, + "step": 9620 + }, + { + "epoch": 1.5568668660577156, + "grad_norm": 0.724947452545166, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 9630 + }, + { + "epoch": 1.5584835502384609, + "grad_norm": 0.6154758930206299, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 9640 + }, + { + "epoch": 1.560100234419206, + "grad_norm": 0.6072008013725281, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 9650 + }, + { + "epoch": 1.5617169185999515, + "grad_norm": 0.659010648727417, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 9660 + }, + { + "epoch": 1.5633336027806968, + "grad_norm": 0.65857994556427, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 9670 + }, + { + "epoch": 1.5649502869614422, + "grad_norm": 0.5914267301559448, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 9680 + }, + { + "epoch": 1.5665669711421875, + "grad_norm": 0.6248020529747009, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 9690 + }, + { + "epoch": 1.5681836553229327, + "grad_norm": 0.7147795557975769, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 9700 + }, + { + "epoch": 1.569800339503678, + "grad_norm": 0.7076232433319092, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 9710 + }, + { + "epoch": 1.5714170236844232, + "grad_norm": 0.6217400431632996, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 9720 + }, + { + "epoch": 1.5730337078651684, + "grad_norm": 0.6709911227226257, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 9730 + }, + { + "epoch": 1.5746503920459138, + "grad_norm": 0.749171257019043, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 9740 + }, + { + "epoch": 1.576267076226659, + "grad_norm": 0.6241145730018616, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 9750 + }, + { + "epoch": 1.5778837604074045, + "grad_norm": 0.4960934817790985, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 9760 + }, + { + "epoch": 1.5795004445881498, + "grad_norm": 0.6593309640884399, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 9770 + }, + { + "epoch": 1.581117128768895, + "grad_norm": 0.5814042091369629, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 9780 + }, + { + "epoch": 1.5827338129496402, + "grad_norm": 0.5936070680618286, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 9790 + }, + { + "epoch": 1.5843504971303854, + "grad_norm": 0.6454403400421143, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 9800 + }, + { + "epoch": 1.585967181311131, + "grad_norm": 0.7612107992172241, + "learning_rate": 0.0002, + "loss": 0.7677, + "step": 9810 + }, + { + "epoch": 1.5875838654918761, + "grad_norm": 0.6494482755661011, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 9820 + }, + { + "epoch": 1.5892005496726216, + "grad_norm": 0.7825694680213928, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 9830 + }, + { + "epoch": 1.5908172338533668, + "grad_norm": 0.6757757663726807, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 9840 + }, + { + "epoch": 1.592433918034112, + "grad_norm": 0.7105609178543091, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 9850 + }, + { + "epoch": 1.5940506022148573, + "grad_norm": 0.7596991062164307, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 9860 + }, + { + "epoch": 1.5956672863956025, + "grad_norm": 0.5681525468826294, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 9870 + }, + { + "epoch": 1.5972839705763477, + "grad_norm": 0.6090980768203735, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 9880 + }, + { + "epoch": 1.5989006547570932, + "grad_norm": 0.6271613240242004, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 9890 + }, + { + "epoch": 1.6005173389378387, + "grad_norm": 0.7656369805335999, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 9900 + }, + { + "epoch": 1.6021340231185839, + "grad_norm": 0.7504446506500244, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 9910 + }, + { + "epoch": 1.6037507072993291, + "grad_norm": 0.659656286239624, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 9920 + }, + { + "epoch": 1.6053673914800743, + "grad_norm": 0.6006826162338257, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 9930 + }, + { + "epoch": 1.6069840756608196, + "grad_norm": 0.7872757911682129, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 9940 + }, + { + "epoch": 1.6086007598415648, + "grad_norm": 0.5545852780342102, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 9950 + }, + { + "epoch": 1.6102174440223103, + "grad_norm": 0.7429468631744385, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 9960 + }, + { + "epoch": 1.6118341282030555, + "grad_norm": 0.6873556971549988, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 9970 + }, + { + "epoch": 1.613450812383801, + "grad_norm": 0.5874287486076355, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 9980 + }, + { + "epoch": 1.6150674965645462, + "grad_norm": 0.6039386987686157, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 9990 + }, + { + "epoch": 1.6166841807452914, + "grad_norm": 0.6233575940132141, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 10000 + }, + { + "epoch": 1.6183008649260366, + "grad_norm": 0.7676448225975037, + "learning_rate": 0.0002, + "loss": 0.7602, + "step": 10010 + }, + { + "epoch": 1.6199175491067819, + "grad_norm": 0.6565698385238647, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 10020 + }, + { + "epoch": 1.6215342332875273, + "grad_norm": 0.6787590384483337, + "learning_rate": 0.0002, + "loss": 0.7104, + "step": 10030 + }, + { + "epoch": 1.6231509174682726, + "grad_norm": 0.6137678027153015, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 10040 + }, + { + "epoch": 1.624767601649018, + "grad_norm": 0.5236800312995911, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 10050 + }, + { + "epoch": 1.6263842858297632, + "grad_norm": 0.7626367807388306, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 10060 + }, + { + "epoch": 1.6280009700105085, + "grad_norm": 0.5657260417938232, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 10070 + }, + { + "epoch": 1.6296176541912537, + "grad_norm": 0.4913991391658783, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 10080 + }, + { + "epoch": 1.631234338371999, + "grad_norm": 0.7715556621551514, + "learning_rate": 0.0002, + "loss": 0.7596, + "step": 10090 + }, + { + "epoch": 1.6328510225527442, + "grad_norm": 0.6509000062942505, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 10100 + }, + { + "epoch": 1.6344677067334896, + "grad_norm": 0.6215850114822388, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 10110 + }, + { + "epoch": 1.6360843909142349, + "grad_norm": 0.6956844329833984, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 10120 + }, + { + "epoch": 1.6377010750949803, + "grad_norm": 0.6111597418785095, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 10130 + }, + { + "epoch": 1.6393177592757255, + "grad_norm": 0.6518288850784302, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 10140 + }, + { + "epoch": 1.6409344434564708, + "grad_norm": 0.6914522051811218, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 10150 + }, + { + "epoch": 1.642551127637216, + "grad_norm": 0.63785719871521, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 10160 + }, + { + "epoch": 1.6441678118179612, + "grad_norm": 0.6379287838935852, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 10170 + }, + { + "epoch": 1.6457844959987067, + "grad_norm": 0.6793403029441833, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 10180 + }, + { + "epoch": 1.647401180179452, + "grad_norm": 0.6099132895469666, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 10190 + }, + { + "epoch": 1.6490178643601974, + "grad_norm": 0.5869854092597961, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 10200 + }, + { + "epoch": 1.6506345485409426, + "grad_norm": 0.7716999053955078, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 10210 + }, + { + "epoch": 1.6522512327216878, + "grad_norm": 0.6854110360145569, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 10220 + }, + { + "epoch": 1.653867916902433, + "grad_norm": 0.6957170367240906, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 10230 + }, + { + "epoch": 1.6554846010831783, + "grad_norm": 0.6932903528213501, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 10240 + }, + { + "epoch": 1.6571012852639235, + "grad_norm": 0.7713165283203125, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 10250 + }, + { + "epoch": 1.658717969444669, + "grad_norm": 0.7455793619155884, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 10260 + }, + { + "epoch": 1.6603346536254144, + "grad_norm": 0.5464168190956116, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 10270 + }, + { + "epoch": 1.6619513378061597, + "grad_norm": 0.6782926321029663, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 10280 + }, + { + "epoch": 1.663568021986905, + "grad_norm": 0.7962649464607239, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 10290 + }, + { + "epoch": 1.6651847061676501, + "grad_norm": 0.6814526319503784, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 10300 + }, + { + "epoch": 1.6668013903483954, + "grad_norm": 0.656895101070404, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 10310 + }, + { + "epoch": 1.6684180745291406, + "grad_norm": 0.6085672378540039, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 10320 + }, + { + "epoch": 1.670034758709886, + "grad_norm": 0.585508406162262, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 10330 + }, + { + "epoch": 1.6716514428906313, + "grad_norm": 0.6930184364318848, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 10340 + }, + { + "epoch": 1.6732681270713767, + "grad_norm": 0.575663149356842, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 10350 + }, + { + "epoch": 1.674884811252122, + "grad_norm": 0.582502543926239, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 10360 + }, + { + "epoch": 1.6765014954328672, + "grad_norm": 0.5668916702270508, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 10370 + }, + { + "epoch": 1.6781181796136124, + "grad_norm": 0.6070065498352051, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 10380 + }, + { + "epoch": 1.6797348637943577, + "grad_norm": 0.6141316294670105, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 10390 + }, + { + "epoch": 1.6813515479751031, + "grad_norm": 0.8359124064445496, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 10400 + }, + { + "epoch": 1.6829682321558483, + "grad_norm": 0.5378185510635376, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 10410 + }, + { + "epoch": 1.6845849163365938, + "grad_norm": 0.6959536075592041, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 10420 + }, + { + "epoch": 1.686201600517339, + "grad_norm": 0.6514357328414917, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 10430 + }, + { + "epoch": 1.6878182846980843, + "grad_norm": 0.7706646919250488, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 10440 + }, + { + "epoch": 1.6894349688788295, + "grad_norm": 0.6183337569236755, + "learning_rate": 0.0002, + "loss": 0.7673, + "step": 10450 + }, + { + "epoch": 1.6910516530595747, + "grad_norm": 0.6123278141021729, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 10460 + }, + { + "epoch": 1.69266833724032, + "grad_norm": 0.6894851326942444, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 10470 + }, + { + "epoch": 1.6942850214210654, + "grad_norm": 0.7497312426567078, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 10480 + }, + { + "epoch": 1.6959017056018106, + "grad_norm": 0.5968214273452759, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 10490 + }, + { + "epoch": 1.697518389782556, + "grad_norm": 0.6747927069664001, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 10500 + }, + { + "epoch": 1.6991350739633013, + "grad_norm": 0.5708310008049011, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 10510 + }, + { + "epoch": 1.7007517581440466, + "grad_norm": 0.606526792049408, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 10520 + }, + { + "epoch": 1.7023684423247918, + "grad_norm": 0.662011981010437, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 10530 + }, + { + "epoch": 1.703985126505537, + "grad_norm": 0.7583045363426208, + "learning_rate": 0.0002, + "loss": 0.7999, + "step": 10540 + }, + { + "epoch": 1.7056018106862825, + "grad_norm": 0.721632182598114, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 10550 + }, + { + "epoch": 1.7072184948670277, + "grad_norm": 0.6107715368270874, + "learning_rate": 0.0002, + "loss": 0.7407, + "step": 10560 + }, + { + "epoch": 1.7088351790477732, + "grad_norm": 0.6652471423149109, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 10570 + }, + { + "epoch": 1.7104518632285184, + "grad_norm": 0.6308087110519409, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 10580 + }, + { + "epoch": 1.7120685474092636, + "grad_norm": 0.5464386940002441, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 10590 + }, + { + "epoch": 1.7136852315900089, + "grad_norm": 0.6558911204338074, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 10600 + }, + { + "epoch": 1.715301915770754, + "grad_norm": 0.5665024518966675, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 10610 + }, + { + "epoch": 1.7169185999514993, + "grad_norm": 0.7888094186782837, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 10620 + }, + { + "epoch": 1.7185352841322448, + "grad_norm": 0.7084909081459045, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 10630 + }, + { + "epoch": 1.7201519683129902, + "grad_norm": 0.7982324361801147, + "learning_rate": 0.0002, + "loss": 0.7557, + "step": 10640 + }, + { + "epoch": 1.7217686524937355, + "grad_norm": 0.6418732404708862, + "learning_rate": 0.0002, + "loss": 0.7345, + "step": 10650 + }, + { + "epoch": 1.7233853366744807, + "grad_norm": 0.7636681795120239, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 10660 + }, + { + "epoch": 1.725002020855226, + "grad_norm": 0.5646875500679016, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 10670 + }, + { + "epoch": 1.7266187050359711, + "grad_norm": 0.5231260657310486, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10680 + }, + { + "epoch": 1.7282353892167164, + "grad_norm": 0.7635011672973633, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 10690 + }, + { + "epoch": 1.7298520733974618, + "grad_norm": 0.7518259286880493, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 10700 + }, + { + "epoch": 1.731468757578207, + "grad_norm": 0.7295602560043335, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 10710 + }, + { + "epoch": 1.7330854417589525, + "grad_norm": 0.6984632015228271, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 10720 + }, + { + "epoch": 1.7347021259396977, + "grad_norm": 0.6198219060897827, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 10730 + }, + { + "epoch": 1.736318810120443, + "grad_norm": 0.6957576274871826, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10740 + }, + { + "epoch": 1.7379354943011882, + "grad_norm": 0.6430263519287109, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 10750 + }, + { + "epoch": 1.7395521784819334, + "grad_norm": 0.6134995222091675, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 10760 + }, + { + "epoch": 1.741168862662679, + "grad_norm": 0.7209452986717224, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 10770 + }, + { + "epoch": 1.7427855468434241, + "grad_norm": 0.6735447645187378, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 10780 + }, + { + "epoch": 1.7444022310241696, + "grad_norm": 0.5605693459510803, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 10790 + }, + { + "epoch": 1.7460189152049148, + "grad_norm": 0.6882363557815552, + "learning_rate": 0.0002, + "loss": 0.7759, + "step": 10800 + }, + { + "epoch": 1.74763559938566, + "grad_norm": 0.6386259198188782, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 10810 + }, + { + "epoch": 1.7492522835664053, + "grad_norm": 0.6529015302658081, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 10820 + }, + { + "epoch": 1.7508689677471505, + "grad_norm": 0.5664082765579224, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 10830 + }, + { + "epoch": 1.7524856519278957, + "grad_norm": 0.7532684206962585, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 10840 + }, + { + "epoch": 1.7541023361086412, + "grad_norm": 0.77171391248703, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 10850 + }, + { + "epoch": 1.7557190202893864, + "grad_norm": 0.7255431413650513, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 10860 + }, + { + "epoch": 1.7573357044701319, + "grad_norm": 0.763083279132843, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 10870 + }, + { + "epoch": 1.758952388650877, + "grad_norm": 0.6042402982711792, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 10880 + }, + { + "epoch": 1.7605690728316223, + "grad_norm": 0.7642518281936646, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 10890 + }, + { + "epoch": 1.7621857570123676, + "grad_norm": 0.6347904801368713, + "learning_rate": 0.0002, + "loss": 0.746, + "step": 10900 + }, + { + "epoch": 1.7638024411931128, + "grad_norm": 0.5371627807617188, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 10910 + }, + { + "epoch": 1.7654191253738583, + "grad_norm": 0.6840225458145142, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 10920 + }, + { + "epoch": 1.7670358095546035, + "grad_norm": 0.5288469195365906, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 10930 + }, + { + "epoch": 1.768652493735349, + "grad_norm": 0.69020676612854, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 10940 + }, + { + "epoch": 1.7702691779160942, + "grad_norm": 0.5943242311477661, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 10950 + }, + { + "epoch": 1.7718858620968394, + "grad_norm": 0.5616418123245239, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 10960 + }, + { + "epoch": 1.7735025462775846, + "grad_norm": 0.7209470868110657, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 10970 + }, + { + "epoch": 1.7751192304583299, + "grad_norm": 0.6657957434654236, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 10980 + }, + { + "epoch": 1.776735914639075, + "grad_norm": 0.6469064950942993, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 10990 + }, + { + "epoch": 1.7783525988198206, + "grad_norm": 0.6615678071975708, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 11000 + }, + { + "epoch": 1.779969283000566, + "grad_norm": 0.6722439527511597, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 11010 + }, + { + "epoch": 1.7815859671813112, + "grad_norm": 0.634136974811554, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 11020 + }, + { + "epoch": 1.7832026513620565, + "grad_norm": 0.6024377346038818, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 11030 + }, + { + "epoch": 1.7848193355428017, + "grad_norm": 0.6909403800964355, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 11040 + }, + { + "epoch": 1.786436019723547, + "grad_norm": 0.7148767709732056, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 11050 + }, + { + "epoch": 1.7880527039042922, + "grad_norm": 0.7442979216575623, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 11060 + }, + { + "epoch": 1.7896693880850376, + "grad_norm": 0.6830431818962097, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 11070 + }, + { + "epoch": 1.7912860722657828, + "grad_norm": 0.9172667264938354, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 11080 + }, + { + "epoch": 1.7929027564465283, + "grad_norm": 0.6799490451812744, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 11090 + }, + { + "epoch": 1.7945194406272735, + "grad_norm": 0.7617024779319763, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 11100 + }, + { + "epoch": 1.7961361248080188, + "grad_norm": 0.7701810002326965, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 11110 + }, + { + "epoch": 1.797752808988764, + "grad_norm": 0.7454385757446289, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 11120 + }, + { + "epoch": 1.7993694931695092, + "grad_norm": 0.6121436953544617, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 11130 + }, + { + "epoch": 1.8009861773502547, + "grad_norm": 0.6237571835517883, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 11140 + }, + { + "epoch": 1.802602861531, + "grad_norm": 0.6818515658378601, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 11150 + }, + { + "epoch": 1.8042195457117454, + "grad_norm": 0.7768308520317078, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 11160 + }, + { + "epoch": 1.8058362298924906, + "grad_norm": 0.6875537633895874, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 11170 + }, + { + "epoch": 1.8074529140732358, + "grad_norm": 0.7950584888458252, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 11180 + }, + { + "epoch": 1.809069598253981, + "grad_norm": 0.8210248351097107, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 11190 + }, + { + "epoch": 1.8106862824347263, + "grad_norm": 0.6674110889434814, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 11200 + }, + { + "epoch": 1.8123029666154715, + "grad_norm": 0.6261674761772156, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 11210 + }, + { + "epoch": 1.813919650796217, + "grad_norm": 0.6484741568565369, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 11220 + }, + { + "epoch": 1.8155363349769622, + "grad_norm": 0.6231244206428528, + "learning_rate": 0.0002, + "loss": 0.7718, + "step": 11230 + }, + { + "epoch": 1.8171530191577077, + "grad_norm": 0.7243146896362305, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 11240 + }, + { + "epoch": 1.818769703338453, + "grad_norm": 0.6776193380355835, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 11250 + }, + { + "epoch": 1.8203863875191981, + "grad_norm": 0.5973618030548096, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 11260 + }, + { + "epoch": 1.8220030716999434, + "grad_norm": 0.6451361179351807, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 11270 + }, + { + "epoch": 1.8236197558806886, + "grad_norm": 0.5963068008422852, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 11280 + }, + { + "epoch": 1.825236440061434, + "grad_norm": 0.536902129650116, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 11290 + }, + { + "epoch": 1.8268531242421793, + "grad_norm": 0.6993787288665771, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 11300 + }, + { + "epoch": 1.8284698084229247, + "grad_norm": 0.6135255098342896, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 11310 + }, + { + "epoch": 1.83008649260367, + "grad_norm": 0.6057423949241638, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 11320 + }, + { + "epoch": 1.8317031767844152, + "grad_norm": 0.6598812341690063, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 11330 + }, + { + "epoch": 1.8333198609651604, + "grad_norm": 0.6075948476791382, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 11340 + }, + { + "epoch": 1.8349365451459057, + "grad_norm": 0.7065447568893433, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 11350 + }, + { + "epoch": 1.8365532293266509, + "grad_norm": 0.680526614189148, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 11360 + }, + { + "epoch": 1.8381699135073963, + "grad_norm": 0.6356695294380188, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 11370 + }, + { + "epoch": 1.8397865976881416, + "grad_norm": 0.6399052143096924, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 11380 + }, + { + "epoch": 1.841403281868887, + "grad_norm": 0.6125704050064087, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 11390 + }, + { + "epoch": 1.8430199660496323, + "grad_norm": 0.7124643325805664, + "learning_rate": 0.0002, + "loss": 0.755, + "step": 11400 + }, + { + "epoch": 1.8446366502303775, + "grad_norm": 0.6099604964256287, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 11410 + }, + { + "epoch": 1.8462533344111227, + "grad_norm": 0.7338208556175232, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 11420 + }, + { + "epoch": 1.847870018591868, + "grad_norm": 0.7534668445587158, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 11430 + }, + { + "epoch": 1.8494867027726134, + "grad_norm": 0.6135470271110535, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 11440 + }, + { + "epoch": 1.8511033869533586, + "grad_norm": 0.6229309439659119, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 11450 + }, + { + "epoch": 1.852720071134104, + "grad_norm": 0.706423282623291, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 11460 + }, + { + "epoch": 1.8543367553148493, + "grad_norm": 0.5460049510002136, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 11470 + }, + { + "epoch": 1.8559534394955945, + "grad_norm": 0.6616711020469666, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 11480 + }, + { + "epoch": 1.8575701236763398, + "grad_norm": 0.6372783184051514, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 11490 + }, + { + "epoch": 1.859186807857085, + "grad_norm": 0.7162668108940125, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 11500 + }, + { + "epoch": 1.8608034920378305, + "grad_norm": 0.6605209708213806, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 11510 + }, + { + "epoch": 1.8624201762185757, + "grad_norm": 0.6933956742286682, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 11520 + }, + { + "epoch": 1.8640368603993211, + "grad_norm": 0.6582090854644775, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 11530 + }, + { + "epoch": 1.8656535445800664, + "grad_norm": 0.6416500806808472, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 11540 + }, + { + "epoch": 1.8672702287608116, + "grad_norm": 0.5434312224388123, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 11550 + }, + { + "epoch": 1.8688869129415568, + "grad_norm": 0.6827567219734192, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 11560 + }, + { + "epoch": 1.870503597122302, + "grad_norm": 0.7354370951652527, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 11570 + }, + { + "epoch": 1.8721202813030473, + "grad_norm": 0.590372622013092, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 11580 + }, + { + "epoch": 1.8737369654837928, + "grad_norm": 0.853183925151825, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 11590 + }, + { + "epoch": 1.875353649664538, + "grad_norm": 0.822678804397583, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 11600 + }, + { + "epoch": 1.8769703338452834, + "grad_norm": 0.6591550707817078, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 11610 + }, + { + "epoch": 1.8785870180260287, + "grad_norm": 0.7475301623344421, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 11620 + }, + { + "epoch": 1.880203702206774, + "grad_norm": 0.6390765309333801, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 11630 + }, + { + "epoch": 1.8818203863875191, + "grad_norm": 0.6589758992195129, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 11640 + }, + { + "epoch": 1.8834370705682644, + "grad_norm": 0.6765508651733398, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 11650 + }, + { + "epoch": 1.8850537547490098, + "grad_norm": 0.6527857780456543, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 11660 + }, + { + "epoch": 1.886670438929755, + "grad_norm": 0.6642923951148987, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 11670 + }, + { + "epoch": 1.8882871231105005, + "grad_norm": 0.6945584416389465, + "learning_rate": 0.0002, + "loss": 0.7701, + "step": 11680 + }, + { + "epoch": 1.8899038072912457, + "grad_norm": 0.694018542766571, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 11690 + }, + { + "epoch": 1.891520491471991, + "grad_norm": 0.7237417101860046, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 11700 + }, + { + "epoch": 1.8931371756527362, + "grad_norm": 0.7401309609413147, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 11710 + }, + { + "epoch": 1.8947538598334814, + "grad_norm": 0.6537784337997437, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 11720 + }, + { + "epoch": 1.8963705440142267, + "grad_norm": 0.7398539185523987, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 11730 + }, + { + "epoch": 1.8979872281949721, + "grad_norm": 0.6696075797080994, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 11740 + }, + { + "epoch": 1.8996039123757174, + "grad_norm": 0.6014142036437988, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 11750 + }, + { + "epoch": 1.9012205965564628, + "grad_norm": 0.7023524641990662, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 11760 + }, + { + "epoch": 1.902837280737208, + "grad_norm": 0.739973783493042, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 11770 + }, + { + "epoch": 1.9044539649179533, + "grad_norm": 0.5576770901679993, + "learning_rate": 0.0002, + "loss": 0.7848, + "step": 11780 + }, + { + "epoch": 1.9060706490986985, + "grad_norm": 0.6907393932342529, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 11790 + }, + { + "epoch": 1.9076873332794437, + "grad_norm": 0.6934581995010376, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 11800 + }, + { + "epoch": 1.9093040174601892, + "grad_norm": 0.591774582862854, + "learning_rate": 0.0002, + "loss": 0.7199, + "step": 11810 + }, + { + "epoch": 1.9109207016409344, + "grad_norm": 0.6249791383743286, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 11820 + }, + { + "epoch": 1.9125373858216799, + "grad_norm": 0.6755744218826294, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 11830 + }, + { + "epoch": 1.914154070002425, + "grad_norm": 0.7286285161972046, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 11840 + }, + { + "epoch": 1.9157707541831703, + "grad_norm": 0.7867850065231323, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 11850 + }, + { + "epoch": 1.9173874383639156, + "grad_norm": 0.6283972859382629, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 11860 + }, + { + "epoch": 1.9190041225446608, + "grad_norm": 0.605823814868927, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 11870 + }, + { + "epoch": 1.920620806725406, + "grad_norm": 0.5927976965904236, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 11880 + }, + { + "epoch": 1.9222374909061515, + "grad_norm": 0.5974002480506897, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 11890 + }, + { + "epoch": 1.923854175086897, + "grad_norm": 0.7091866135597229, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 11900 + }, + { + "epoch": 1.9254708592676422, + "grad_norm": 0.72496497631073, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 11910 + }, + { + "epoch": 1.9270875434483874, + "grad_norm": 0.6131896376609802, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 11920 + }, + { + "epoch": 1.9287042276291326, + "grad_norm": 0.6556436419487, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 11930 + }, + { + "epoch": 1.9303209118098779, + "grad_norm": 0.622932493686676, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 11940 + }, + { + "epoch": 1.931937595990623, + "grad_norm": 0.6618631482124329, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 11950 + }, + { + "epoch": 1.9335542801713685, + "grad_norm": 0.630966305732727, + "learning_rate": 0.0002, + "loss": 0.8332, + "step": 11960 + }, + { + "epoch": 1.9351709643521138, + "grad_norm": 0.6336734890937805, + "learning_rate": 0.0002, + "loss": 0.6854, + "step": 11970 + }, + { + "epoch": 1.9367876485328592, + "grad_norm": 0.655403196811676, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 11980 + }, + { + "epoch": 1.9384043327136045, + "grad_norm": 0.5640574097633362, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 11990 + }, + { + "epoch": 1.9400210168943497, + "grad_norm": 0.6322951316833496, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 12000 + }, + { + "epoch": 1.941637701075095, + "grad_norm": 0.615703821182251, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 12010 + }, + { + "epoch": 1.9432543852558402, + "grad_norm": 0.6487536430358887, + "learning_rate": 0.0002, + "loss": 0.786, + "step": 12020 + }, + { + "epoch": 1.9448710694365856, + "grad_norm": 0.9209630489349365, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 12030 + }, + { + "epoch": 1.9464877536173308, + "grad_norm": 0.67485511302948, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 12040 + }, + { + "epoch": 1.9481044377980763, + "grad_norm": 0.6831230521202087, + "learning_rate": 0.0002, + "loss": 0.7551, + "step": 12050 + }, + { + "epoch": 1.9497211219788215, + "grad_norm": 0.6578302383422852, + "learning_rate": 0.0002, + "loss": 0.7546, + "step": 12060 + }, + { + "epoch": 1.9513378061595668, + "grad_norm": 0.9975938200950623, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 12070 + }, + { + "epoch": 1.952954490340312, + "grad_norm": 0.6637365221977234, + "learning_rate": 0.0002, + "loss": 0.7952, + "step": 12080 + }, + { + "epoch": 1.9545711745210572, + "grad_norm": 0.605707049369812, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 12090 + }, + { + "epoch": 1.9561878587018025, + "grad_norm": 0.6584440469741821, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 12100 + }, + { + "epoch": 1.957804542882548, + "grad_norm": 0.6070835590362549, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 12110 + }, + { + "epoch": 1.9594212270632931, + "grad_norm": 0.7862601280212402, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 12120 + }, + { + "epoch": 1.9610379112440386, + "grad_norm": 0.8175255060195923, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 12130 + }, + { + "epoch": 1.9626545954247838, + "grad_norm": 0.5648472905158997, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 12140 + }, + { + "epoch": 1.964271279605529, + "grad_norm": 0.6591973304748535, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 12150 + }, + { + "epoch": 1.9658879637862743, + "grad_norm": 0.5960676074028015, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 12160 + }, + { + "epoch": 1.9675046479670195, + "grad_norm": 0.7272544503211975, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 12170 + }, + { + "epoch": 1.969121332147765, + "grad_norm": 0.7176699042320251, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 12180 + }, + { + "epoch": 1.9707380163285102, + "grad_norm": 0.6927123665809631, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 12190 + }, + { + "epoch": 1.9723547005092557, + "grad_norm": 0.5536034107208252, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 12200 + }, + { + "epoch": 1.9739713846900009, + "grad_norm": 0.8348390460014343, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 12210 + }, + { + "epoch": 1.9755880688707461, + "grad_norm": 0.6591181755065918, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 12220 + }, + { + "epoch": 1.9772047530514913, + "grad_norm": 1.0624109506607056, + "learning_rate": 0.0002, + "loss": 0.763, + "step": 12230 + }, + { + "epoch": 1.9788214372322366, + "grad_norm": 0.9265586137771606, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 12240 + }, + { + "epoch": 1.9804381214129818, + "grad_norm": 0.5998196005821228, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 12250 + }, + { + "epoch": 1.9820548055937273, + "grad_norm": 0.6960851550102234, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 12260 + }, + { + "epoch": 1.9836714897744727, + "grad_norm": 0.7674502730369568, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 12270 + }, + { + "epoch": 1.985288173955218, + "grad_norm": 0.6407275795936584, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 12280 + }, + { + "epoch": 1.9869048581359632, + "grad_norm": 0.6673079133033752, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 12290 + }, + { + "epoch": 1.9885215423167084, + "grad_norm": 0.6989844441413879, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 12300 + }, + { + "epoch": 1.9901382264974536, + "grad_norm": 0.7564442157745361, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 12310 + }, + { + "epoch": 1.9917549106781989, + "grad_norm": 0.6385478973388672, + "learning_rate": 0.0002, + "loss": 0.7719, + "step": 12320 + }, + { + "epoch": 1.9933715948589443, + "grad_norm": 0.7193717956542969, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 12330 + }, + { + "epoch": 1.9949882790396896, + "grad_norm": 0.7987112402915955, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 12340 + }, + { + "epoch": 1.996604963220435, + "grad_norm": 0.7260826826095581, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 12350 + }, + { + "epoch": 1.9982216474011802, + "grad_norm": 0.7968255281448364, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 12360 + }, + { + "epoch": 1.9998383315819255, + "grad_norm": 0.6893062591552734, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 12370 + }, + { + "epoch": 2.0, + "eval_loss": 1.1044032573699951, + "eval_runtime": 122.1508, + "eval_samples_per_second": 6.001, + "eval_steps_per_second": 0.753, + "step": 12371 + } + ], + "logging_steps": 10, + "max_steps": 49480, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.7250218020909875e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..155b12fa9acbc6e71dba75c92bfa79e152397ebf --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-12371/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28694d5564a2b5c7d6881d4ba2af103356aa22489d2c22768ebbe47283c0f4a1 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3ed21d20216a5ce4568a175156a5ddbb9cb33669 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d9f75e2ee0bafc7ec9c12b204485f764703d363e45fbf4d172349d03b0c4653 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b993ee39e96ec08b19bf65bc33ec96629b5cc736 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cad21d963ec0206ac34fc4ec45037ceb484173249d20e8c8a3ef5d7f9f971ef8 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..45b03f0a02348d0c09b9c35820ade27d2825d1be --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a94f50547bb5416c60eade4d015f2d1426e23d331062664b231faf5aeeb61aee +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..98cb302180982cc3bd57a359a66df9af4df06ef1 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e862db497a76925f2f33bbe167cf8fc7ad8a5bd750c2ca881ec45670205b20d +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cad81d8e99afadcc095799183f259473cbc9a987 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/trainer_state.json @@ -0,0 +1,13042 @@ +{ + "best_metric": 1.0871200561523438, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185", + "epoch": 2.9999191657909625, + "eval_steps": 10, + "global_step": 18556, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016166841807452913, + "grad_norm": 0.9894065856933594, + "learning_rate": 0.0002, + "loss": 1.6636, + "step": 10 + }, + { + "epoch": 0.0032333683614905826, + "grad_norm": 1.7810699939727783, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 20 + }, + { + "epoch": 0.004850052542235874, + "grad_norm": 0.5969577431678772, + "learning_rate": 0.0002, + "loss": 0.9767, + "step": 30 + }, + { + "epoch": 0.006466736722981165, + "grad_norm": 0.6354120969772339, + "learning_rate": 0.0002, + "loss": 0.9772, + "step": 40 + }, + { + "epoch": 0.008083420903726457, + "grad_norm": 0.5604607462882996, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 50 + }, + { + "epoch": 0.009700105084471748, + "grad_norm": 0.4676193594932556, + "learning_rate": 0.0002, + "loss": 0.8841, + "step": 60 + }, + { + "epoch": 0.01131678926521704, + "grad_norm": 0.6099211573600769, + "learning_rate": 0.0002, + "loss": 0.9022, + "step": 70 + }, + { + "epoch": 0.01293347344596233, + "grad_norm": 0.48639994859695435, + "learning_rate": 0.0002, + "loss": 0.9133, + "step": 80 + }, + { + "epoch": 0.014550157626707623, + "grad_norm": 0.4904264509677887, + "learning_rate": 0.0002, + "loss": 0.8704, + "step": 90 + }, + { + "epoch": 0.016166841807452915, + "grad_norm": 2.8334362506866455, + "learning_rate": 0.0002, + "loss": 0.8855, + "step": 100 + }, + { + "epoch": 0.017783525988198205, + "grad_norm": 0.43221670389175415, + "learning_rate": 0.0002, + "loss": 0.8958, + "step": 110 + }, + { + "epoch": 0.019400210168943496, + "grad_norm": 0.42244166135787964, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 120 + }, + { + "epoch": 0.02101689434968879, + "grad_norm": 0.45363298058509827, + "learning_rate": 0.0002, + "loss": 0.8467, + "step": 130 + }, + { + "epoch": 0.02263357853043408, + "grad_norm": 0.44816508889198303, + "learning_rate": 0.0002, + "loss": 0.8641, + "step": 140 + }, + { + "epoch": 0.02425026271117937, + "grad_norm": 0.43308213353157043, + "learning_rate": 0.0002, + "loss": 0.8496, + "step": 150 + }, + { + "epoch": 0.02586694689192466, + "grad_norm": 0.4084763526916504, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 160 + }, + { + "epoch": 0.027483631072669955, + "grad_norm": 0.5363703966140747, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 170 + }, + { + "epoch": 0.029100315253415245, + "grad_norm": 0.4619699716567993, + "learning_rate": 0.0002, + "loss": 0.8558, + "step": 180 + }, + { + "epoch": 0.030716999434160536, + "grad_norm": 0.49069908261299133, + "learning_rate": 0.0002, + "loss": 0.8878, + "step": 190 + }, + { + "epoch": 0.03233368361490583, + "grad_norm": 0.4645835757255554, + "learning_rate": 0.0002, + "loss": 0.8867, + "step": 200 + }, + { + "epoch": 0.03395036779565112, + "grad_norm": 1.2411243915557861, + "learning_rate": 0.0002, + "loss": 0.8842, + "step": 210 + }, + { + "epoch": 0.03556705197639641, + "grad_norm": 0.5211851596832275, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 220 + }, + { + "epoch": 0.037183736157141704, + "grad_norm": 0.5253691673278809, + "learning_rate": 0.0002, + "loss": 0.8194, + "step": 230 + }, + { + "epoch": 0.03880042033788699, + "grad_norm": 0.4567478895187378, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 240 + }, + { + "epoch": 0.040417104518632285, + "grad_norm": 0.5472128391265869, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 250 + }, + { + "epoch": 0.04203378869937758, + "grad_norm": 0.42978546023368835, + "learning_rate": 0.0002, + "loss": 0.8201, + "step": 260 + }, + { + "epoch": 0.043650472880122866, + "grad_norm": 0.601734459400177, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 270 + }, + { + "epoch": 0.04526715706086816, + "grad_norm": 0.4286513328552246, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 280 + }, + { + "epoch": 0.046883841241613454, + "grad_norm": 0.5230861902236938, + "learning_rate": 0.0002, + "loss": 0.8758, + "step": 290 + }, + { + "epoch": 0.04850052542235874, + "grad_norm": 0.6504611968994141, + "learning_rate": 0.0002, + "loss": 0.8636, + "step": 300 + }, + { + "epoch": 0.050117209603104035, + "grad_norm": 0.43485215306282043, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 310 + }, + { + "epoch": 0.05173389378384932, + "grad_norm": 0.4717007875442505, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 320 + }, + { + "epoch": 0.053350577964594616, + "grad_norm": 0.4059787690639496, + "learning_rate": 0.0002, + "loss": 0.8469, + "step": 330 + }, + { + "epoch": 0.05496726214533991, + "grad_norm": 0.4366913437843323, + "learning_rate": 0.0002, + "loss": 0.8866, + "step": 340 + }, + { + "epoch": 0.0565839463260852, + "grad_norm": 0.4233848452568054, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 350 + }, + { + "epoch": 0.05820063050683049, + "grad_norm": 0.4209108352661133, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 360 + }, + { + "epoch": 0.059817314687575784, + "grad_norm": 0.41637396812438965, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 370 + }, + { + "epoch": 0.06143399886832107, + "grad_norm": 0.46235376596450806, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 380 + }, + { + "epoch": 0.06305068304906636, + "grad_norm": 0.4013484716415405, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 390 + }, + { + "epoch": 0.06466736722981166, + "grad_norm": 0.47443896532058716, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 400 + }, + { + "epoch": 0.06628405141055695, + "grad_norm": 0.3942156434059143, + "learning_rate": 0.0002, + "loss": 0.8666, + "step": 410 + }, + { + "epoch": 0.06790073559130223, + "grad_norm": 0.4965320825576782, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 420 + }, + { + "epoch": 0.06951741977204753, + "grad_norm": 0.4304835796356201, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 430 + }, + { + "epoch": 0.07113410395279282, + "grad_norm": 0.511726975440979, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 440 + }, + { + "epoch": 0.07275078813353811, + "grad_norm": 0.4040689170360565, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 450 + }, + { + "epoch": 0.07436747231428341, + "grad_norm": 0.5402171015739441, + "learning_rate": 0.0002, + "loss": 0.8788, + "step": 460 + }, + { + "epoch": 0.0759841564950287, + "grad_norm": 0.4174517095088959, + "learning_rate": 0.0002, + "loss": 0.8737, + "step": 470 + }, + { + "epoch": 0.07760084067577398, + "grad_norm": 0.4306182265281677, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 480 + }, + { + "epoch": 0.07921752485651928, + "grad_norm": 0.535210132598877, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 490 + }, + { + "epoch": 0.08083420903726457, + "grad_norm": 0.5339109897613525, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 500 + }, + { + "epoch": 0.08245089321800986, + "grad_norm": 0.45754891633987427, + "learning_rate": 0.0002, + "loss": 0.8985, + "step": 510 + }, + { + "epoch": 0.08406757739875516, + "grad_norm": 0.43820783495903015, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 520 + }, + { + "epoch": 0.08568426157950045, + "grad_norm": 0.4434749186038971, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 530 + }, + { + "epoch": 0.08730094576024573, + "grad_norm": 0.43111467361450195, + "learning_rate": 0.0002, + "loss": 0.7857, + "step": 540 + }, + { + "epoch": 0.08891762994099103, + "grad_norm": 0.4378940165042877, + "learning_rate": 0.0002, + "loss": 0.8418, + "step": 550 + }, + { + "epoch": 0.09053431412173632, + "grad_norm": 0.4772215187549591, + "learning_rate": 0.0002, + "loss": 0.8361, + "step": 560 + }, + { + "epoch": 0.09215099830248161, + "grad_norm": 0.6837629079818726, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 570 + }, + { + "epoch": 0.09376768248322691, + "grad_norm": 0.42241212725639343, + "learning_rate": 0.0002, + "loss": 0.8607, + "step": 580 + }, + { + "epoch": 0.0953843666639722, + "grad_norm": 0.5165936350822449, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 590 + }, + { + "epoch": 0.09700105084471748, + "grad_norm": 0.48737478256225586, + "learning_rate": 0.0002, + "loss": 0.8664, + "step": 600 + }, + { + "epoch": 0.09861773502546278, + "grad_norm": 0.47419852018356323, + "learning_rate": 0.0002, + "loss": 0.8806, + "step": 610 + }, + { + "epoch": 0.10023441920620807, + "grad_norm": 0.4975486099720001, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 620 + }, + { + "epoch": 0.10185110338695336, + "grad_norm": 0.49123844504356384, + "learning_rate": 0.0002, + "loss": 0.8548, + "step": 630 + }, + { + "epoch": 0.10346778756769864, + "grad_norm": 0.6288952827453613, + "learning_rate": 0.0002, + "loss": 0.8911, + "step": 640 + }, + { + "epoch": 0.10508447174844394, + "grad_norm": 0.4277345836162567, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 650 + }, + { + "epoch": 0.10670115592918923, + "grad_norm": 0.4021061956882477, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 660 + }, + { + "epoch": 0.10831784010993452, + "grad_norm": 0.3492237329483032, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 670 + }, + { + "epoch": 0.10993452429067982, + "grad_norm": 0.4341012239456177, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 680 + }, + { + "epoch": 0.1115512084714251, + "grad_norm": 0.7296304106712341, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 690 + }, + { + "epoch": 0.1131678926521704, + "grad_norm": 0.397494912147522, + "learning_rate": 0.0002, + "loss": 0.8433, + "step": 700 + }, + { + "epoch": 0.1147845768329157, + "grad_norm": 0.396431028842926, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 710 + }, + { + "epoch": 0.11640126101366098, + "grad_norm": 0.48842838406562805, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 720 + }, + { + "epoch": 0.11801794519440627, + "grad_norm": 0.46322616934776306, + "learning_rate": 0.0002, + "loss": 0.8238, + "step": 730 + }, + { + "epoch": 0.11963462937515157, + "grad_norm": 0.47990912199020386, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 740 + }, + { + "epoch": 0.12125131355589686, + "grad_norm": 0.4997142255306244, + "learning_rate": 0.0002, + "loss": 0.82, + "step": 750 + }, + { + "epoch": 0.12286799773664214, + "grad_norm": 0.4040526747703552, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 760 + }, + { + "epoch": 0.12448468191738744, + "grad_norm": 0.453095942735672, + "learning_rate": 0.0002, + "loss": 0.863, + "step": 770 + }, + { + "epoch": 0.12610136609813272, + "grad_norm": 0.4636971950531006, + "learning_rate": 0.0002, + "loss": 0.8792, + "step": 780 + }, + { + "epoch": 0.12771805027887803, + "grad_norm": 0.4279276132583618, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 790 + }, + { + "epoch": 0.12933473445962332, + "grad_norm": 0.46212655305862427, + "learning_rate": 0.0002, + "loss": 0.8711, + "step": 800 + }, + { + "epoch": 0.1309514186403686, + "grad_norm": 0.43127650022506714, + "learning_rate": 0.0002, + "loss": 0.8368, + "step": 810 + }, + { + "epoch": 0.1325681028211139, + "grad_norm": 0.4201301336288452, + "learning_rate": 0.0002, + "loss": 0.8476, + "step": 820 + }, + { + "epoch": 0.13418478700185918, + "grad_norm": 0.42583167552948, + "learning_rate": 0.0002, + "loss": 0.8078, + "step": 830 + }, + { + "epoch": 0.13580147118260447, + "grad_norm": 0.4535622000694275, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 840 + }, + { + "epoch": 0.13741815536334978, + "grad_norm": 0.4116036891937256, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 850 + }, + { + "epoch": 0.13903483954409507, + "grad_norm": 0.45997580885887146, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 860 + }, + { + "epoch": 0.14065152372484035, + "grad_norm": 0.4487837255001068, + "learning_rate": 0.0002, + "loss": 0.8917, + "step": 870 + }, + { + "epoch": 0.14226820790558564, + "grad_norm": 0.43650057911872864, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 880 + }, + { + "epoch": 0.14388489208633093, + "grad_norm": 0.5335358381271362, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 890 + }, + { + "epoch": 0.14550157626707622, + "grad_norm": 0.5989000201225281, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 900 + }, + { + "epoch": 0.14711826044782153, + "grad_norm": 0.517179012298584, + "learning_rate": 0.0002, + "loss": 0.8385, + "step": 910 + }, + { + "epoch": 0.14873494462856682, + "grad_norm": 0.44435232877731323, + "learning_rate": 0.0002, + "loss": 0.8255, + "step": 920 + }, + { + "epoch": 0.1503516288093121, + "grad_norm": 0.42635923624038696, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 930 + }, + { + "epoch": 0.1519683129900574, + "grad_norm": 0.49603334069252014, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 940 + }, + { + "epoch": 0.15358499717080268, + "grad_norm": 0.40639808773994446, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 950 + }, + { + "epoch": 0.15520168135154797, + "grad_norm": 0.4850759208202362, + "learning_rate": 0.0002, + "loss": 0.8529, + "step": 960 + }, + { + "epoch": 0.15681836553229328, + "grad_norm": 0.4427442252635956, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 970 + }, + { + "epoch": 0.15843504971303857, + "grad_norm": 0.3760930001735687, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 980 + }, + { + "epoch": 0.16005173389378385, + "grad_norm": 0.4794144332408905, + "learning_rate": 0.0002, + "loss": 0.8644, + "step": 990 + }, + { + "epoch": 0.16166841807452914, + "grad_norm": 0.45828768610954285, + "learning_rate": 0.0002, + "loss": 0.8002, + "step": 1000 + }, + { + "epoch": 0.16328510225527443, + "grad_norm": 0.6313053369522095, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 1010 + }, + { + "epoch": 0.16490178643601971, + "grad_norm": 0.45041006803512573, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1020 + }, + { + "epoch": 0.166518470616765, + "grad_norm": 0.441403865814209, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1030 + }, + { + "epoch": 0.16813515479751032, + "grad_norm": 0.8171296119689941, + "learning_rate": 0.0002, + "loss": 0.8475, + "step": 1040 + }, + { + "epoch": 0.1697518389782556, + "grad_norm": 0.7137420773506165, + "learning_rate": 0.0002, + "loss": 0.845, + "step": 1050 + }, + { + "epoch": 0.1713685231590009, + "grad_norm": 0.5236809849739075, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 1060 + }, + { + "epoch": 0.17298520733974618, + "grad_norm": 0.5021864175796509, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 1070 + }, + { + "epoch": 0.17460189152049146, + "grad_norm": 0.47347521781921387, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 1080 + }, + { + "epoch": 0.17621857570123675, + "grad_norm": 0.4631653428077698, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 1090 + }, + { + "epoch": 0.17783525988198207, + "grad_norm": 0.49169182777404785, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 1100 + }, + { + "epoch": 0.17945194406272735, + "grad_norm": 0.5019739270210266, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 1110 + }, + { + "epoch": 0.18106862824347264, + "grad_norm": 0.5100422501564026, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 1120 + }, + { + "epoch": 0.18268531242421793, + "grad_norm": 0.3888324499130249, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 1130 + }, + { + "epoch": 0.18430199660496321, + "grad_norm": 0.39765217900276184, + "learning_rate": 0.0002, + "loss": 0.8533, + "step": 1140 + }, + { + "epoch": 0.1859186807857085, + "grad_norm": 0.47190186381340027, + "learning_rate": 0.0002, + "loss": 0.8541, + "step": 1150 + }, + { + "epoch": 0.18753536496645382, + "grad_norm": 0.4464188814163208, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1160 + }, + { + "epoch": 0.1891520491471991, + "grad_norm": 0.5153930187225342, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 1170 + }, + { + "epoch": 0.1907687333279444, + "grad_norm": 0.4779708683490753, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 1180 + }, + { + "epoch": 0.19238541750868968, + "grad_norm": 0.4834315776824951, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 1190 + }, + { + "epoch": 0.19400210168943496, + "grad_norm": 0.402357816696167, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 1200 + }, + { + "epoch": 0.19561878587018025, + "grad_norm": 0.45899084210395813, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 1210 + }, + { + "epoch": 0.19723547005092557, + "grad_norm": 0.5106529593467712, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 1220 + }, + { + "epoch": 0.19885215423167085, + "grad_norm": 0.45261722803115845, + "learning_rate": 0.0002, + "loss": 0.7816, + "step": 1230 + }, + { + "epoch": 0.20046883841241614, + "grad_norm": 0.4647127091884613, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 1240 + }, + { + "epoch": 0.20208552259316143, + "grad_norm": 0.4849368929862976, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 1250 + }, + { + "epoch": 0.2037022067739067, + "grad_norm": 0.4518061578273773, + "learning_rate": 0.0002, + "loss": 0.8514, + "step": 1260 + }, + { + "epoch": 0.205318890954652, + "grad_norm": 0.49535325169563293, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 1270 + }, + { + "epoch": 0.2069355751353973, + "grad_norm": 0.4835205376148224, + "learning_rate": 0.0002, + "loss": 0.8348, + "step": 1280 + }, + { + "epoch": 0.2085522593161426, + "grad_norm": 0.45308539271354675, + "learning_rate": 0.0002, + "loss": 0.8428, + "step": 1290 + }, + { + "epoch": 0.2101689434968879, + "grad_norm": 0.5369905233383179, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 1300 + }, + { + "epoch": 0.21178562767763318, + "grad_norm": 0.5031622052192688, + "learning_rate": 0.0002, + "loss": 0.8676, + "step": 1310 + }, + { + "epoch": 0.21340231185837846, + "grad_norm": 0.48010334372520447, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 1320 + }, + { + "epoch": 0.21501899603912375, + "grad_norm": 0.4905701279640198, + "learning_rate": 0.0002, + "loss": 0.806, + "step": 1330 + }, + { + "epoch": 0.21663568021986904, + "grad_norm": 0.43531742691993713, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 1340 + }, + { + "epoch": 0.21825236440061435, + "grad_norm": 0.44330692291259766, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 1350 + }, + { + "epoch": 0.21986904858135964, + "grad_norm": 0.5384416580200195, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 1360 + }, + { + "epoch": 0.22148573276210493, + "grad_norm": 0.4181833863258362, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 1370 + }, + { + "epoch": 0.2231024169428502, + "grad_norm": 0.523833692073822, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 1380 + }, + { + "epoch": 0.2247191011235955, + "grad_norm": 0.5528736710548401, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 1390 + }, + { + "epoch": 0.2263357853043408, + "grad_norm": 0.43515023589134216, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 1400 + }, + { + "epoch": 0.2279524694850861, + "grad_norm": 0.48809877038002014, + "learning_rate": 0.0002, + "loss": 0.8403, + "step": 1410 + }, + { + "epoch": 0.2295691536658314, + "grad_norm": 0.43591251969337463, + "learning_rate": 0.0002, + "loss": 0.8165, + "step": 1420 + }, + { + "epoch": 0.23118583784657668, + "grad_norm": 0.44625312089920044, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 1430 + }, + { + "epoch": 0.23280252202732196, + "grad_norm": 0.4390665292739868, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 1440 + }, + { + "epoch": 0.23441920620806725, + "grad_norm": 0.48496049642562866, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 1450 + }, + { + "epoch": 0.23603589038881254, + "grad_norm": 0.45919957756996155, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 1460 + }, + { + "epoch": 0.23765257456955785, + "grad_norm": 0.5471845865249634, + "learning_rate": 0.0002, + "loss": 0.8659, + "step": 1470 + }, + { + "epoch": 0.23926925875030314, + "grad_norm": 0.47269317507743835, + "learning_rate": 0.0002, + "loss": 0.8164, + "step": 1480 + }, + { + "epoch": 0.24088594293104842, + "grad_norm": 0.4930245578289032, + "learning_rate": 0.0002, + "loss": 0.854, + "step": 1490 + }, + { + "epoch": 0.2425026271117937, + "grad_norm": 0.5605630278587341, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 1500 + }, + { + "epoch": 0.244119311292539, + "grad_norm": 0.4435870945453644, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 1510 + }, + { + "epoch": 0.24573599547328429, + "grad_norm": 0.4941999912261963, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 1520 + }, + { + "epoch": 0.24735267965402957, + "grad_norm": 0.5100624561309814, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 1530 + }, + { + "epoch": 0.2489693638347749, + "grad_norm": 0.4638267457485199, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 1540 + }, + { + "epoch": 0.25058604801552015, + "grad_norm": 0.5071570873260498, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 1550 + }, + { + "epoch": 0.25220273219626543, + "grad_norm": 0.4291319251060486, + "learning_rate": 0.0002, + "loss": 0.7724, + "step": 1560 + }, + { + "epoch": 0.2538194163770108, + "grad_norm": 0.5388049483299255, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 1570 + }, + { + "epoch": 0.25543610055775606, + "grad_norm": 0.5083683729171753, + "learning_rate": 0.0002, + "loss": 0.8176, + "step": 1580 + }, + { + "epoch": 0.25705278473850135, + "grad_norm": 0.4824463725090027, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 1590 + }, + { + "epoch": 0.25866946891924664, + "grad_norm": 0.41177722811698914, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 1600 + }, + { + "epoch": 0.2602861530999919, + "grad_norm": 0.5656219124794006, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 1610 + }, + { + "epoch": 0.2619028372807372, + "grad_norm": 0.41063204407691956, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 1620 + }, + { + "epoch": 0.2635195214614825, + "grad_norm": 0.4897061288356781, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 1630 + }, + { + "epoch": 0.2651362056422278, + "grad_norm": 0.4454376697540283, + "learning_rate": 0.0002, + "loss": 0.8198, + "step": 1640 + }, + { + "epoch": 0.26675288982297307, + "grad_norm": 0.4355238378047943, + "learning_rate": 0.0002, + "loss": 0.8684, + "step": 1650 + }, + { + "epoch": 0.26836957400371836, + "grad_norm": 0.458310067653656, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 1660 + }, + { + "epoch": 0.26998625818446365, + "grad_norm": 0.4752083718776703, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 1670 + }, + { + "epoch": 0.27160294236520893, + "grad_norm": 0.4666106402873993, + "learning_rate": 0.0002, + "loss": 0.8267, + "step": 1680 + }, + { + "epoch": 0.2732196265459543, + "grad_norm": 0.4213818609714508, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 1690 + }, + { + "epoch": 0.27483631072669956, + "grad_norm": 0.5768913626670837, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 1700 + }, + { + "epoch": 0.27645299490744485, + "grad_norm": 0.4209914803504944, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 1710 + }, + { + "epoch": 0.27806967908819014, + "grad_norm": 0.501909613609314, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 1720 + }, + { + "epoch": 0.2796863632689354, + "grad_norm": 0.5266261100769043, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 1730 + }, + { + "epoch": 0.2813030474496807, + "grad_norm": 0.43806859850883484, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 1740 + }, + { + "epoch": 0.282919731630426, + "grad_norm": 0.46048814058303833, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 1750 + }, + { + "epoch": 0.2845364158111713, + "grad_norm": 0.44972819089889526, + "learning_rate": 0.0002, + "loss": 0.8596, + "step": 1760 + }, + { + "epoch": 0.28615309999191657, + "grad_norm": 0.5114831328392029, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 1770 + }, + { + "epoch": 0.28776978417266186, + "grad_norm": 0.47931742668151855, + "learning_rate": 0.0002, + "loss": 0.8361, + "step": 1780 + }, + { + "epoch": 0.28938646835340714, + "grad_norm": 0.5092599987983704, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 1790 + }, + { + "epoch": 0.29100315253415243, + "grad_norm": 0.37581443786621094, + "learning_rate": 0.0002, + "loss": 0.8506, + "step": 1800 + }, + { + "epoch": 0.2926198367148977, + "grad_norm": 0.47097381949424744, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 1810 + }, + { + "epoch": 0.29423652089564306, + "grad_norm": 0.48300236463546753, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 1820 + }, + { + "epoch": 0.29585320507638835, + "grad_norm": 0.5600419640541077, + "learning_rate": 0.0002, + "loss": 0.8391, + "step": 1830 + }, + { + "epoch": 0.29746988925713364, + "grad_norm": 0.48555272817611694, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 1840 + }, + { + "epoch": 0.2990865734378789, + "grad_norm": 0.3752668499946594, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 1850 + }, + { + "epoch": 0.3007032576186242, + "grad_norm": 0.5328747034072876, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 1860 + }, + { + "epoch": 0.3023199417993695, + "grad_norm": 0.48716455698013306, + "learning_rate": 0.0002, + "loss": 0.8426, + "step": 1870 + }, + { + "epoch": 0.3039366259801148, + "grad_norm": 0.5011493563652039, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 1880 + }, + { + "epoch": 0.30555331016086007, + "grad_norm": 0.46461427211761475, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 1890 + }, + { + "epoch": 0.30716999434160536, + "grad_norm": 0.36630210280418396, + "learning_rate": 0.0002, + "loss": 0.8478, + "step": 1900 + }, + { + "epoch": 0.30878667852235064, + "grad_norm": 0.4217296242713928, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 1910 + }, + { + "epoch": 0.31040336270309593, + "grad_norm": 0.4394875466823578, + "learning_rate": 0.0002, + "loss": 0.8128, + "step": 1920 + }, + { + "epoch": 0.3120200468838412, + "grad_norm": 0.6587965488433838, + "learning_rate": 0.0002, + "loss": 0.8471, + "step": 1930 + }, + { + "epoch": 0.31363673106458656, + "grad_norm": 0.5469298958778381, + "learning_rate": 0.0002, + "loss": 0.8565, + "step": 1940 + }, + { + "epoch": 0.31525341524533185, + "grad_norm": 0.4371595084667206, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 1950 + }, + { + "epoch": 0.31687009942607713, + "grad_norm": 0.4809541404247284, + "learning_rate": 0.0002, + "loss": 0.887, + "step": 1960 + }, + { + "epoch": 0.3184867836068224, + "grad_norm": 0.6061086654663086, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 1970 + }, + { + "epoch": 0.3201034677875677, + "grad_norm": 0.5342657566070557, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 1980 + }, + { + "epoch": 0.321720151968313, + "grad_norm": 0.5057743787765503, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 1990 + }, + { + "epoch": 0.3233368361490583, + "grad_norm": 0.528626024723053, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 2000 + }, + { + "epoch": 0.32495352032980357, + "grad_norm": 0.46742770075798035, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2010 + }, + { + "epoch": 0.32657020451054886, + "grad_norm": 0.515101432800293, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 2020 + }, + { + "epoch": 0.32818688869129414, + "grad_norm": 0.41941216588020325, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 2030 + }, + { + "epoch": 0.32980357287203943, + "grad_norm": 0.49902522563934326, + "learning_rate": 0.0002, + "loss": 0.7637, + "step": 2040 + }, + { + "epoch": 0.3314202570527847, + "grad_norm": 0.4120897650718689, + "learning_rate": 0.0002, + "loss": 0.7822, + "step": 2050 + }, + { + "epoch": 0.33303694123353, + "grad_norm": 0.45352041721343994, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 2060 + }, + { + "epoch": 0.33465362541427535, + "grad_norm": 0.523199737071991, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 2070 + }, + { + "epoch": 0.33627030959502063, + "grad_norm": 0.4390358626842499, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 2080 + }, + { + "epoch": 0.3378869937757659, + "grad_norm": 0.6752901077270508, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 2090 + }, + { + "epoch": 0.3395036779565112, + "grad_norm": 0.547821044921875, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 2100 + }, + { + "epoch": 0.3411203621372565, + "grad_norm": 0.5161308646202087, + "learning_rate": 0.0002, + "loss": 0.8561, + "step": 2110 + }, + { + "epoch": 0.3427370463180018, + "grad_norm": 0.4565401077270508, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 2120 + }, + { + "epoch": 0.34435373049874707, + "grad_norm": 0.4666115939617157, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 2130 + }, + { + "epoch": 0.34597041467949236, + "grad_norm": 0.4090428352355957, + "learning_rate": 0.0002, + "loss": 0.8189, + "step": 2140 + }, + { + "epoch": 0.34758709886023764, + "grad_norm": 0.510845422744751, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2150 + }, + { + "epoch": 0.34920378304098293, + "grad_norm": 0.42861923575401306, + "learning_rate": 0.0002, + "loss": 0.8398, + "step": 2160 + }, + { + "epoch": 0.3508204672217282, + "grad_norm": 0.4476332664489746, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 2170 + }, + { + "epoch": 0.3524371514024735, + "grad_norm": 0.6065791249275208, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2180 + }, + { + "epoch": 0.35405383558321885, + "grad_norm": 0.42335066199302673, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 2190 + }, + { + "epoch": 0.35567051976396413, + "grad_norm": 0.5094629526138306, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 2200 + }, + { + "epoch": 0.3572872039447094, + "grad_norm": 0.5476373434066772, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 2210 + }, + { + "epoch": 0.3589038881254547, + "grad_norm": 0.3911719024181366, + "learning_rate": 0.0002, + "loss": 0.814, + "step": 2220 + }, + { + "epoch": 0.3605205723062, + "grad_norm": 0.6599636077880859, + "learning_rate": 0.0002, + "loss": 0.8599, + "step": 2230 + }, + { + "epoch": 0.3621372564869453, + "grad_norm": 0.40381914377212524, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 2240 + }, + { + "epoch": 0.36375394066769057, + "grad_norm": 0.4433908462524414, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 2250 + }, + { + "epoch": 0.36537062484843585, + "grad_norm": 0.578326940536499, + "learning_rate": 0.0002, + "loss": 0.8503, + "step": 2260 + }, + { + "epoch": 0.36698730902918114, + "grad_norm": 0.5734784007072449, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2270 + }, + { + "epoch": 0.36860399320992643, + "grad_norm": 0.45555487275123596, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 2280 + }, + { + "epoch": 0.3702206773906717, + "grad_norm": 0.5666276216506958, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 2290 + }, + { + "epoch": 0.371837361571417, + "grad_norm": 0.5461117625236511, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 2300 + }, + { + "epoch": 0.3734540457521623, + "grad_norm": 0.6318911910057068, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 2310 + }, + { + "epoch": 0.37507072993290763, + "grad_norm": 0.493263304233551, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 2320 + }, + { + "epoch": 0.3766874141136529, + "grad_norm": 0.5888760089874268, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 2330 + }, + { + "epoch": 0.3783040982943982, + "grad_norm": 0.48671841621398926, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 2340 + }, + { + "epoch": 0.3799207824751435, + "grad_norm": 0.4385145306587219, + "learning_rate": 0.0002, + "loss": 0.8367, + "step": 2350 + }, + { + "epoch": 0.3815374666558888, + "grad_norm": 0.5523318648338318, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 2360 + }, + { + "epoch": 0.38315415083663407, + "grad_norm": 0.7308220267295837, + "learning_rate": 0.0002, + "loss": 0.8351, + "step": 2370 + }, + { + "epoch": 0.38477083501737935, + "grad_norm": 0.554214358329773, + "learning_rate": 0.0002, + "loss": 0.859, + "step": 2380 + }, + { + "epoch": 0.38638751919812464, + "grad_norm": 0.5425800085067749, + "learning_rate": 0.0002, + "loss": 0.8146, + "step": 2390 + }, + { + "epoch": 0.3880042033788699, + "grad_norm": 0.48811158537864685, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 2400 + }, + { + "epoch": 0.3896208875596152, + "grad_norm": 0.49212366342544556, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 2410 + }, + { + "epoch": 0.3912375717403605, + "grad_norm": 0.5222218632698059, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 2420 + }, + { + "epoch": 0.3928542559211058, + "grad_norm": 0.4699819087982178, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 2430 + }, + { + "epoch": 0.39447094010185113, + "grad_norm": 0.46153587102890015, + "learning_rate": 0.0002, + "loss": 0.7919, + "step": 2440 + }, + { + "epoch": 0.3960876242825964, + "grad_norm": 0.4150611162185669, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 2450 + }, + { + "epoch": 0.3977043084633417, + "grad_norm": 0.5799614787101746, + "learning_rate": 0.0002, + "loss": 0.8589, + "step": 2460 + }, + { + "epoch": 0.399320992644087, + "grad_norm": 0.56536865234375, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 2470 + }, + { + "epoch": 0.4009376768248323, + "grad_norm": 0.5451247096061707, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 2480 + }, + { + "epoch": 0.40255436100557757, + "grad_norm": 0.5914521217346191, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 2490 + }, + { + "epoch": 0.40417104518632285, + "grad_norm": 0.4428117275238037, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 2500 + }, + { + "epoch": 0.40578772936706814, + "grad_norm": 0.48580947518348694, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 2510 + }, + { + "epoch": 0.4074044135478134, + "grad_norm": 0.436734676361084, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 2520 + }, + { + "epoch": 0.4090210977285587, + "grad_norm": 0.5752223134040833, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 2530 + }, + { + "epoch": 0.410637781909304, + "grad_norm": 0.4271308183670044, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2540 + }, + { + "epoch": 0.4122544660900493, + "grad_norm": 0.46294718980789185, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 2550 + }, + { + "epoch": 0.4138711502707946, + "grad_norm": 0.49407583475112915, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 2560 + }, + { + "epoch": 0.4154878344515399, + "grad_norm": 0.4729035496711731, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 2570 + }, + { + "epoch": 0.4171045186322852, + "grad_norm": 0.4129747152328491, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 2580 + }, + { + "epoch": 0.4187212028130305, + "grad_norm": 0.5684236288070679, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 2590 + }, + { + "epoch": 0.4203378869937758, + "grad_norm": 0.4862157106399536, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 2600 + }, + { + "epoch": 0.42195457117452106, + "grad_norm": 0.46567976474761963, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 2610 + }, + { + "epoch": 0.42357125535526635, + "grad_norm": 0.5710650682449341, + "learning_rate": 0.0002, + "loss": 0.7755, + "step": 2620 + }, + { + "epoch": 0.42518793953601164, + "grad_norm": 0.5660041570663452, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 2630 + }, + { + "epoch": 0.4268046237167569, + "grad_norm": 0.47944375872612, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 2640 + }, + { + "epoch": 0.4284213078975022, + "grad_norm": 0.537223756313324, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 2650 + }, + { + "epoch": 0.4300379920782475, + "grad_norm": 0.41669997572898865, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 2660 + }, + { + "epoch": 0.4316546762589928, + "grad_norm": 0.44727686047554016, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 2670 + }, + { + "epoch": 0.4332713604397381, + "grad_norm": 0.5600888729095459, + "learning_rate": 0.0002, + "loss": 0.8241, + "step": 2680 + }, + { + "epoch": 0.4348880446204834, + "grad_norm": 0.39820605516433716, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 2690 + }, + { + "epoch": 0.4365047288012287, + "grad_norm": 0.5637655854225159, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 2700 + }, + { + "epoch": 0.438121412981974, + "grad_norm": 0.6363666653633118, + "learning_rate": 0.0002, + "loss": 0.855, + "step": 2710 + }, + { + "epoch": 0.4397380971627193, + "grad_norm": 0.5656129121780396, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 2720 + }, + { + "epoch": 0.44135478134346456, + "grad_norm": 0.5600156188011169, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2730 + }, + { + "epoch": 0.44297146552420985, + "grad_norm": 0.5506579875946045, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 2740 + }, + { + "epoch": 0.44458814970495514, + "grad_norm": 0.49878305196762085, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 2750 + }, + { + "epoch": 0.4462048338857004, + "grad_norm": 0.4569213092327118, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 2760 + }, + { + "epoch": 0.4478215180664457, + "grad_norm": 0.6056680083274841, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 2770 + }, + { + "epoch": 0.449438202247191, + "grad_norm": 0.44474557042121887, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 2780 + }, + { + "epoch": 0.4510548864279363, + "grad_norm": 0.46055394411087036, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 2790 + }, + { + "epoch": 0.4526715706086816, + "grad_norm": 0.4904133379459381, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 2800 + }, + { + "epoch": 0.45428825478942686, + "grad_norm": 0.5647031664848328, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 2810 + }, + { + "epoch": 0.4559049389701722, + "grad_norm": 0.5759473443031311, + "learning_rate": 0.0002, + "loss": 0.8622, + "step": 2820 + }, + { + "epoch": 0.4575216231509175, + "grad_norm": 0.5161895751953125, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 2830 + }, + { + "epoch": 0.4591383073316628, + "grad_norm": 0.4248254597187042, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 2840 + }, + { + "epoch": 0.46075499151240806, + "grad_norm": 0.45395001769065857, + "learning_rate": 0.0002, + "loss": 0.7838, + "step": 2850 + }, + { + "epoch": 0.46237167569315335, + "grad_norm": 0.5358697772026062, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2860 + }, + { + "epoch": 0.46398835987389864, + "grad_norm": 0.5379165410995483, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 2870 + }, + { + "epoch": 0.4656050440546439, + "grad_norm": 0.4601989686489105, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 2880 + }, + { + "epoch": 0.4672217282353892, + "grad_norm": 0.671115517616272, + "learning_rate": 0.0002, + "loss": 0.8523, + "step": 2890 + }, + { + "epoch": 0.4688384124161345, + "grad_norm": 0.4425133168697357, + "learning_rate": 0.0002, + "loss": 0.8262, + "step": 2900 + }, + { + "epoch": 0.4704550965968798, + "grad_norm": 0.5446155071258545, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2910 + }, + { + "epoch": 0.47207178077762507, + "grad_norm": 0.603306233882904, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 2920 + }, + { + "epoch": 0.47368846495837036, + "grad_norm": 0.5377997159957886, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 2930 + }, + { + "epoch": 0.4753051491391157, + "grad_norm": 0.4931027591228485, + "learning_rate": 0.0002, + "loss": 0.8075, + "step": 2940 + }, + { + "epoch": 0.476921833319861, + "grad_norm": 0.4711960256099701, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 2950 + }, + { + "epoch": 0.4785385175006063, + "grad_norm": 0.5020492672920227, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 2960 + }, + { + "epoch": 0.48015520168135156, + "grad_norm": 0.5428946614265442, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 2970 + }, + { + "epoch": 0.48177188586209685, + "grad_norm": 0.5294089317321777, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 2980 + }, + { + "epoch": 0.48338857004284214, + "grad_norm": 0.648289144039154, + "learning_rate": 0.0002, + "loss": 0.8553, + "step": 2990 + }, + { + "epoch": 0.4850052542235874, + "grad_norm": 0.47916680574417114, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 3000 + }, + { + "epoch": 0.4866219384043327, + "grad_norm": 0.43849772214889526, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 3010 + }, + { + "epoch": 0.488238622585078, + "grad_norm": 0.47007861733436584, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 3020 + }, + { + "epoch": 0.4898553067658233, + "grad_norm": 0.6314331293106079, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 3030 + }, + { + "epoch": 0.49147199094656857, + "grad_norm": 0.49211493134498596, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 3040 + }, + { + "epoch": 0.49308867512731386, + "grad_norm": 0.4537973403930664, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 3050 + }, + { + "epoch": 0.49470535930805914, + "grad_norm": 0.47326919436454773, + "learning_rate": 0.0002, + "loss": 0.8095, + "step": 3060 + }, + { + "epoch": 0.4963220434888045, + "grad_norm": 0.525874137878418, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 3070 + }, + { + "epoch": 0.4979387276695498, + "grad_norm": 0.6361091732978821, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 3080 + }, + { + "epoch": 0.49955541185029506, + "grad_norm": 0.5850642919540405, + "learning_rate": 0.0002, + "loss": 0.821, + "step": 3090 + }, + { + "epoch": 0.5011720960310403, + "grad_norm": 0.47299543023109436, + "learning_rate": 0.0002, + "loss": 0.8279, + "step": 3100 + }, + { + "epoch": 0.5027887802117856, + "grad_norm": 0.473099946975708, + "learning_rate": 0.0002, + "loss": 0.8681, + "step": 3110 + }, + { + "epoch": 0.5044054643925309, + "grad_norm": 0.48186397552490234, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 3120 + }, + { + "epoch": 0.5060221485732762, + "grad_norm": 0.5015401840209961, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 3130 + }, + { + "epoch": 0.5076388327540216, + "grad_norm": 0.5617750287055969, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 3140 + }, + { + "epoch": 0.5092555169347668, + "grad_norm": 0.5169327259063721, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 3150 + }, + { + "epoch": 0.5108722011155121, + "grad_norm": 0.545657753944397, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 3160 + }, + { + "epoch": 0.5124888852962574, + "grad_norm": 0.512864351272583, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 3170 + }, + { + "epoch": 0.5141055694770027, + "grad_norm": 0.4113546311855316, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 3180 + }, + { + "epoch": 0.5157222536577479, + "grad_norm": 0.44532445073127747, + "learning_rate": 0.0002, + "loss": 0.8206, + "step": 3190 + }, + { + "epoch": 0.5173389378384933, + "grad_norm": 0.5623497366905212, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 3200 + }, + { + "epoch": 0.5189556220192385, + "grad_norm": 0.5084741115570068, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 3210 + }, + { + "epoch": 0.5205723061999838, + "grad_norm": 0.5305403470993042, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 3220 + }, + { + "epoch": 0.5221889903807291, + "grad_norm": 0.4708254337310791, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 3230 + }, + { + "epoch": 0.5238056745614744, + "grad_norm": 0.43827131390571594, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 3240 + }, + { + "epoch": 0.5254223587422197, + "grad_norm": 0.5630002617835999, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 3250 + }, + { + "epoch": 0.527039042922965, + "grad_norm": 0.5010961890220642, + "learning_rate": 0.0002, + "loss": 0.7522, + "step": 3260 + }, + { + "epoch": 0.5286557271037103, + "grad_norm": 0.6303122043609619, + "learning_rate": 0.0002, + "loss": 0.8374, + "step": 3270 + }, + { + "epoch": 0.5302724112844556, + "grad_norm": 0.5107331275939941, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 3280 + }, + { + "epoch": 0.5318890954652009, + "grad_norm": 0.5700443387031555, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 3290 + }, + { + "epoch": 0.5335057796459461, + "grad_norm": 0.46296367049217224, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 3300 + }, + { + "epoch": 0.5351224638266915, + "grad_norm": 0.531568706035614, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 3310 + }, + { + "epoch": 0.5367391480074367, + "grad_norm": 0.4686741530895233, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 3320 + }, + { + "epoch": 0.5383558321881821, + "grad_norm": 0.5404331088066101, + "learning_rate": 0.0002, + "loss": 0.8104, + "step": 3330 + }, + { + "epoch": 0.5399725163689273, + "grad_norm": 0.6368790864944458, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 3340 + }, + { + "epoch": 0.5415892005496726, + "grad_norm": 0.42300888895988464, + "learning_rate": 0.0002, + "loss": 0.8514, + "step": 3350 + }, + { + "epoch": 0.5432058847304179, + "grad_norm": 0.5362542867660522, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 3360 + }, + { + "epoch": 0.5448225689111632, + "grad_norm": 0.497128963470459, + "learning_rate": 0.0002, + "loss": 0.858, + "step": 3370 + }, + { + "epoch": 0.5464392530919085, + "grad_norm": 0.5006386041641235, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 3380 + }, + { + "epoch": 0.5480559372726538, + "grad_norm": 0.44136837124824524, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 3390 + }, + { + "epoch": 0.5496726214533991, + "grad_norm": 0.5897833108901978, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 3400 + }, + { + "epoch": 0.5512893056341444, + "grad_norm": 0.641075611114502, + "learning_rate": 0.0002, + "loss": 0.8895, + "step": 3410 + }, + { + "epoch": 0.5529059898148897, + "grad_norm": 0.7251322269439697, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 3420 + }, + { + "epoch": 0.5545226739956349, + "grad_norm": 0.47411349415779114, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 3430 + }, + { + "epoch": 0.5561393581763803, + "grad_norm": 0.4994310438632965, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 3440 + }, + { + "epoch": 0.5577560423571255, + "grad_norm": 0.5814438462257385, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 3450 + }, + { + "epoch": 0.5593727265378708, + "grad_norm": 0.6278898119926453, + "learning_rate": 0.0002, + "loss": 0.8805, + "step": 3460 + }, + { + "epoch": 0.5609894107186161, + "grad_norm": 0.46208274364471436, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 3470 + }, + { + "epoch": 0.5626060948993614, + "grad_norm": 0.5718930959701538, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 3480 + }, + { + "epoch": 0.5642227790801067, + "grad_norm": 0.48178744316101074, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 3490 + }, + { + "epoch": 0.565839463260852, + "grad_norm": 0.47336965799331665, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3500 + }, + { + "epoch": 0.5674561474415973, + "grad_norm": 0.43442684412002563, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 3510 + }, + { + "epoch": 0.5690728316223426, + "grad_norm": 0.6463358998298645, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 3520 + }, + { + "epoch": 0.5706895158030879, + "grad_norm": 0.5286486744880676, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 3530 + }, + { + "epoch": 0.5723061999838331, + "grad_norm": 0.5405499935150146, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 3540 + }, + { + "epoch": 0.5739228841645785, + "grad_norm": 0.6654391884803772, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 3550 + }, + { + "epoch": 0.5755395683453237, + "grad_norm": 0.5081980228424072, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 3560 + }, + { + "epoch": 0.5771562525260691, + "grad_norm": 0.48978179693222046, + "learning_rate": 0.0002, + "loss": 0.7753, + "step": 3570 + }, + { + "epoch": 0.5787729367068143, + "grad_norm": 0.5840612053871155, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3580 + }, + { + "epoch": 0.5803896208875596, + "grad_norm": 0.5235261917114258, + "learning_rate": 0.0002, + "loss": 0.8937, + "step": 3590 + }, + { + "epoch": 0.5820063050683049, + "grad_norm": 0.5672075748443604, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 3600 + }, + { + "epoch": 0.5836229892490502, + "grad_norm": 0.5613429546356201, + "learning_rate": 0.0002, + "loss": 0.8347, + "step": 3610 + }, + { + "epoch": 0.5852396734297954, + "grad_norm": 0.4032273590564728, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 3620 + }, + { + "epoch": 0.5868563576105408, + "grad_norm": 0.49559324979782104, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 3630 + }, + { + "epoch": 0.5884730417912861, + "grad_norm": 0.6895697712898254, + "learning_rate": 0.0002, + "loss": 0.8332, + "step": 3640 + }, + { + "epoch": 0.5900897259720314, + "grad_norm": 0.4750136435031891, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 3650 + }, + { + "epoch": 0.5917064101527767, + "grad_norm": 0.5176819562911987, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 3660 + }, + { + "epoch": 0.5933230943335219, + "grad_norm": 0.5817760229110718, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3670 + }, + { + "epoch": 0.5949397785142673, + "grad_norm": 0.6064626574516296, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 3680 + }, + { + "epoch": 0.5965564626950125, + "grad_norm": 0.6728700995445251, + "learning_rate": 0.0002, + "loss": 0.8422, + "step": 3690 + }, + { + "epoch": 0.5981731468757578, + "grad_norm": 0.609305202960968, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 3700 + }, + { + "epoch": 0.5997898310565031, + "grad_norm": 0.4615488350391388, + "learning_rate": 0.0002, + "loss": 0.8048, + "step": 3710 + }, + { + "epoch": 0.6014065152372484, + "grad_norm": 2.0531179904937744, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 3720 + }, + { + "epoch": 0.6030231994179936, + "grad_norm": 0.5091132521629333, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 3730 + }, + { + "epoch": 0.604639883598739, + "grad_norm": 0.5951124429702759, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 3740 + }, + { + "epoch": 0.6062565677794842, + "grad_norm": 0.5870208144187927, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 3750 + }, + { + "epoch": 0.6078732519602296, + "grad_norm": 0.6254619359970093, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3760 + }, + { + "epoch": 0.6094899361409749, + "grad_norm": 0.5577626824378967, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 3770 + }, + { + "epoch": 0.6111066203217201, + "grad_norm": 0.5004405379295349, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 3780 + }, + { + "epoch": 0.6127233045024655, + "grad_norm": 0.5527383685112, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 3790 + }, + { + "epoch": 0.6143399886832107, + "grad_norm": 0.49116113781929016, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 3800 + }, + { + "epoch": 0.6159566728639561, + "grad_norm": 0.5299299359321594, + "learning_rate": 0.0002, + "loss": 0.8352, + "step": 3810 + }, + { + "epoch": 0.6175733570447013, + "grad_norm": 0.464897483587265, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 3820 + }, + { + "epoch": 0.6191900412254466, + "grad_norm": 0.6505740880966187, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 3830 + }, + { + "epoch": 0.6208067254061919, + "grad_norm": 0.5512559413909912, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 3840 + }, + { + "epoch": 0.6224234095869372, + "grad_norm": 0.49427518248558044, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 3850 + }, + { + "epoch": 0.6240400937676824, + "grad_norm": 0.3839147090911865, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 3860 + }, + { + "epoch": 0.6256567779484278, + "grad_norm": 0.5760218501091003, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3870 + }, + { + "epoch": 0.6272734621291731, + "grad_norm": 0.7226507067680359, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 3880 + }, + { + "epoch": 0.6288901463099184, + "grad_norm": 0.676781415939331, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3890 + }, + { + "epoch": 0.6305068304906637, + "grad_norm": 0.4284018278121948, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 3900 + }, + { + "epoch": 0.6321235146714089, + "grad_norm": 0.5060628056526184, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 3910 + }, + { + "epoch": 0.6337401988521543, + "grad_norm": 0.5524522066116333, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 3920 + }, + { + "epoch": 0.6353568830328995, + "grad_norm": 0.6099881529808044, + "learning_rate": 0.0002, + "loss": 0.8276, + "step": 3930 + }, + { + "epoch": 0.6369735672136448, + "grad_norm": 0.43155938386917114, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 3940 + }, + { + "epoch": 0.6385902513943901, + "grad_norm": 0.6427084803581238, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 3950 + }, + { + "epoch": 0.6402069355751354, + "grad_norm": 0.541220486164093, + "learning_rate": 0.0002, + "loss": 0.8368, + "step": 3960 + }, + { + "epoch": 0.6418236197558806, + "grad_norm": 0.5414294600486755, + "learning_rate": 0.0002, + "loss": 0.8539, + "step": 3970 + }, + { + "epoch": 0.643440303936626, + "grad_norm": 0.46344003081321716, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 3980 + }, + { + "epoch": 0.6450569881173712, + "grad_norm": 0.45209285616874695, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 3990 + }, + { + "epoch": 0.6466736722981166, + "grad_norm": 0.5417284369468689, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 4000 + }, + { + "epoch": 0.6482903564788619, + "grad_norm": 0.7995685935020447, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 4010 + }, + { + "epoch": 0.6499070406596071, + "grad_norm": 0.6384002566337585, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 4020 + }, + { + "epoch": 0.6515237248403525, + "grad_norm": 0.4472815692424774, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 4030 + }, + { + "epoch": 0.6531404090210977, + "grad_norm": 0.6834294199943542, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 4040 + }, + { + "epoch": 0.654757093201843, + "grad_norm": 0.4612339735031128, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 4050 + }, + { + "epoch": 0.6563737773825883, + "grad_norm": 0.9266576170921326, + "learning_rate": 0.0002, + "loss": 0.8141, + "step": 4060 + }, + { + "epoch": 0.6579904615633336, + "grad_norm": 0.4470861852169037, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 4070 + }, + { + "epoch": 0.6596071457440789, + "grad_norm": 0.45544925332069397, + "learning_rate": 0.0002, + "loss": 0.8293, + "step": 4080 + }, + { + "epoch": 0.6612238299248242, + "grad_norm": 0.6144481301307678, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 4090 + }, + { + "epoch": 0.6628405141055694, + "grad_norm": 0.5936288237571716, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 4100 + }, + { + "epoch": 0.6644571982863148, + "grad_norm": 0.4822963774204254, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 4110 + }, + { + "epoch": 0.66607388246706, + "grad_norm": 0.48432496190071106, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 4120 + }, + { + "epoch": 0.6676905666478054, + "grad_norm": 0.4901607930660248, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 4130 + }, + { + "epoch": 0.6693072508285507, + "grad_norm": 0.5018393397331238, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 4140 + }, + { + "epoch": 0.6709239350092959, + "grad_norm": 0.6946378946304321, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 4150 + }, + { + "epoch": 0.6725406191900413, + "grad_norm": 0.5997390747070312, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 4160 + }, + { + "epoch": 0.6741573033707865, + "grad_norm": 0.6738849878311157, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 4170 + }, + { + "epoch": 0.6757739875515318, + "grad_norm": 0.6110581159591675, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 4180 + }, + { + "epoch": 0.6773906717322771, + "grad_norm": 0.5703322291374207, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 4190 + }, + { + "epoch": 0.6790073559130224, + "grad_norm": 0.4686066210269928, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 4200 + }, + { + "epoch": 0.6806240400937676, + "grad_norm": 0.6394643783569336, + "learning_rate": 0.0002, + "loss": 0.8441, + "step": 4210 + }, + { + "epoch": 0.682240724274513, + "grad_norm": 0.5454841256141663, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 4220 + }, + { + "epoch": 0.6838574084552582, + "grad_norm": 0.4859732985496521, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 4230 + }, + { + "epoch": 0.6854740926360036, + "grad_norm": 0.5544065833091736, + "learning_rate": 0.0002, + "loss": 0.8161, + "step": 4240 + }, + { + "epoch": 0.6870907768167488, + "grad_norm": 0.4902505576610565, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 4250 + }, + { + "epoch": 0.6887074609974941, + "grad_norm": 0.4768051505088806, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 4260 + }, + { + "epoch": 0.6903241451782395, + "grad_norm": 0.49982190132141113, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 4270 + }, + { + "epoch": 0.6919408293589847, + "grad_norm": 0.6351838111877441, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 4280 + }, + { + "epoch": 0.69355751353973, + "grad_norm": 0.5647561550140381, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 4290 + }, + { + "epoch": 0.6951741977204753, + "grad_norm": 0.5340486764907837, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 4300 + }, + { + "epoch": 0.6967908819012206, + "grad_norm": 0.5649092793464661, + "learning_rate": 0.0002, + "loss": 0.8526, + "step": 4310 + }, + { + "epoch": 0.6984075660819659, + "grad_norm": 0.6183916926383972, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 4320 + }, + { + "epoch": 0.7000242502627112, + "grad_norm": 0.6154509782791138, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 4330 + }, + { + "epoch": 0.7016409344434564, + "grad_norm": 0.5156264305114746, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 4340 + }, + { + "epoch": 0.7032576186242018, + "grad_norm": 0.562171459197998, + "learning_rate": 0.0002, + "loss": 0.8512, + "step": 4350 + }, + { + "epoch": 0.704874302804947, + "grad_norm": 0.4949502646923065, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 4360 + }, + { + "epoch": 0.7064909869856923, + "grad_norm": 0.5171684622764587, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 4370 + }, + { + "epoch": 0.7081076711664377, + "grad_norm": 0.6198443174362183, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 4380 + }, + { + "epoch": 0.7097243553471829, + "grad_norm": 0.5802276134490967, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 4390 + }, + { + "epoch": 0.7113410395279283, + "grad_norm": 0.41096967458724976, + "learning_rate": 0.0002, + "loss": 0.8797, + "step": 4400 + }, + { + "epoch": 0.7129577237086735, + "grad_norm": 0.4397392272949219, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 4410 + }, + { + "epoch": 0.7145744078894188, + "grad_norm": 0.45228442549705505, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 4420 + }, + { + "epoch": 0.7161910920701641, + "grad_norm": 0.4839673936367035, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 4430 + }, + { + "epoch": 0.7178077762509094, + "grad_norm": 0.6140755414962769, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4440 + }, + { + "epoch": 0.7194244604316546, + "grad_norm": 0.6841378808021545, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 4450 + }, + { + "epoch": 0.7210411446124, + "grad_norm": 0.6664239168167114, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 4460 + }, + { + "epoch": 0.7226578287931452, + "grad_norm": 0.47552719712257385, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 4470 + }, + { + "epoch": 0.7242745129738906, + "grad_norm": 0.6649776101112366, + "learning_rate": 0.0002, + "loss": 0.8982, + "step": 4480 + }, + { + "epoch": 0.7258911971546358, + "grad_norm": 0.5159541964530945, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 4490 + }, + { + "epoch": 0.7275078813353811, + "grad_norm": 0.6693112850189209, + "learning_rate": 0.0002, + "loss": 0.7786, + "step": 4500 + }, + { + "epoch": 0.7291245655161265, + "grad_norm": 0.48870977759361267, + "learning_rate": 0.0002, + "loss": 0.8655, + "step": 4510 + }, + { + "epoch": 0.7307412496968717, + "grad_norm": 0.4857887923717499, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 4520 + }, + { + "epoch": 0.732357933877617, + "grad_norm": 0.5515662431716919, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 4530 + }, + { + "epoch": 0.7339746180583623, + "grad_norm": 0.6292222738265991, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 4540 + }, + { + "epoch": 0.7355913022391076, + "grad_norm": 0.48265689611434937, + "learning_rate": 0.0002, + "loss": 0.7749, + "step": 4550 + }, + { + "epoch": 0.7372079864198529, + "grad_norm": 0.8044266104698181, + "learning_rate": 0.0002, + "loss": 0.8499, + "step": 4560 + }, + { + "epoch": 0.7388246706005982, + "grad_norm": 0.6111769676208496, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 4570 + }, + { + "epoch": 0.7404413547813434, + "grad_norm": 0.5229553580284119, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 4580 + }, + { + "epoch": 0.7420580389620888, + "grad_norm": 0.6054152250289917, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 4590 + }, + { + "epoch": 0.743674723142834, + "grad_norm": 0.5574966669082642, + "learning_rate": 0.0002, + "loss": 0.8169, + "step": 4600 + }, + { + "epoch": 0.7452914073235793, + "grad_norm": 0.5395817160606384, + "learning_rate": 0.0002, + "loss": 0.8439, + "step": 4610 + }, + { + "epoch": 0.7469080915043246, + "grad_norm": 0.7116472721099854, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 4620 + }, + { + "epoch": 0.7485247756850699, + "grad_norm": 0.5618700981140137, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 4630 + }, + { + "epoch": 0.7501414598658153, + "grad_norm": 0.5802770853042603, + "learning_rate": 0.0002, + "loss": 0.7744, + "step": 4640 + }, + { + "epoch": 0.7517581440465605, + "grad_norm": 0.5690428018569946, + "learning_rate": 0.0002, + "loss": 0.7924, + "step": 4650 + }, + { + "epoch": 0.7533748282273058, + "grad_norm": 0.4813360273838043, + "learning_rate": 0.0002, + "loss": 0.8017, + "step": 4660 + }, + { + "epoch": 0.7549915124080511, + "grad_norm": 0.5434042811393738, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 4670 + }, + { + "epoch": 0.7566081965887964, + "grad_norm": 0.5502099990844727, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 4680 + }, + { + "epoch": 0.7582248807695416, + "grad_norm": 0.6020621061325073, + "learning_rate": 0.0002, + "loss": 0.8598, + "step": 4690 + }, + { + "epoch": 0.759841564950287, + "grad_norm": 0.4922301471233368, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 4700 + }, + { + "epoch": 0.7614582491310322, + "grad_norm": 0.6492828726768494, + "learning_rate": 0.0002, + "loss": 0.788, + "step": 4710 + }, + { + "epoch": 0.7630749333117776, + "grad_norm": 0.4865580201148987, + "learning_rate": 0.0002, + "loss": 0.8313, + "step": 4720 + }, + { + "epoch": 0.7646916174925228, + "grad_norm": 0.5971422791481018, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 4730 + }, + { + "epoch": 0.7663083016732681, + "grad_norm": 0.6832674145698547, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 4740 + }, + { + "epoch": 0.7679249858540134, + "grad_norm": 0.500908613204956, + "learning_rate": 0.0002, + "loss": 0.8156, + "step": 4750 + }, + { + "epoch": 0.7695416700347587, + "grad_norm": 0.6112465858459473, + "learning_rate": 0.0002, + "loss": 0.8383, + "step": 4760 + }, + { + "epoch": 0.771158354215504, + "grad_norm": 0.5753506422042847, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 4770 + }, + { + "epoch": 0.7727750383962493, + "grad_norm": 0.6529405117034912, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 4780 + }, + { + "epoch": 0.7743917225769946, + "grad_norm": 0.5916843414306641, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 4790 + }, + { + "epoch": 0.7760084067577399, + "grad_norm": 0.4821224510669708, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 4800 + }, + { + "epoch": 0.7776250909384852, + "grad_norm": 0.5532580018043518, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 4810 + }, + { + "epoch": 0.7792417751192304, + "grad_norm": 0.4604877233505249, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 4820 + }, + { + "epoch": 0.7808584592999758, + "grad_norm": 0.5009613037109375, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 4830 + }, + { + "epoch": 0.782475143480721, + "grad_norm": 0.6448560357093811, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4840 + }, + { + "epoch": 0.7840918276614663, + "grad_norm": 0.44327953457832336, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 4850 + }, + { + "epoch": 0.7857085118422116, + "grad_norm": 0.5355411171913147, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 4860 + }, + { + "epoch": 0.7873251960229569, + "grad_norm": 0.5635677576065063, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 4870 + }, + { + "epoch": 0.7889418802037023, + "grad_norm": 0.5417491793632507, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 4880 + }, + { + "epoch": 0.7905585643844475, + "grad_norm": 0.4567430913448334, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 4890 + }, + { + "epoch": 0.7921752485651928, + "grad_norm": 0.44651296734809875, + "learning_rate": 0.0002, + "loss": 0.8454, + "step": 4900 + }, + { + "epoch": 0.7937919327459381, + "grad_norm": 0.5741217136383057, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 4910 + }, + { + "epoch": 0.7954086169266834, + "grad_norm": 0.6605045199394226, + "learning_rate": 0.0002, + "loss": 0.8093, + "step": 4920 + }, + { + "epoch": 0.7970253011074286, + "grad_norm": 0.5126531720161438, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 4930 + }, + { + "epoch": 0.798641985288174, + "grad_norm": 0.513648271560669, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 4940 + }, + { + "epoch": 0.8002586694689192, + "grad_norm": 0.5350404381752014, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 4950 + }, + { + "epoch": 0.8018753536496646, + "grad_norm": 0.5731674432754517, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 4960 + }, + { + "epoch": 0.8034920378304098, + "grad_norm": 0.5974258184432983, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 4970 + }, + { + "epoch": 0.8051087220111551, + "grad_norm": 0.8774799704551697, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 4980 + }, + { + "epoch": 0.8067254061919004, + "grad_norm": 0.5994430184364319, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 4990 + }, + { + "epoch": 0.8083420903726457, + "grad_norm": 0.4894903004169464, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 5000 + }, + { + "epoch": 0.809958774553391, + "grad_norm": 0.5218459367752075, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5010 + }, + { + "epoch": 0.8115754587341363, + "grad_norm": 0.5232468843460083, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 5020 + }, + { + "epoch": 0.8131921429148816, + "grad_norm": 0.44358372688293457, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 5030 + }, + { + "epoch": 0.8148088270956269, + "grad_norm": 0.6202037334442139, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 5040 + }, + { + "epoch": 0.8164255112763722, + "grad_norm": 0.7721474170684814, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 5050 + }, + { + "epoch": 0.8180421954571174, + "grad_norm": 0.5568501353263855, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 5060 + }, + { + "epoch": 0.8196588796378628, + "grad_norm": 0.49148809909820557, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 5070 + }, + { + "epoch": 0.821275563818608, + "grad_norm": 0.4956012964248657, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 5080 + }, + { + "epoch": 0.8228922479993533, + "grad_norm": 0.6078833937644958, + "learning_rate": 0.0002, + "loss": 0.8201, + "step": 5090 + }, + { + "epoch": 0.8245089321800986, + "grad_norm": 0.46906954050064087, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 5100 + }, + { + "epoch": 0.8261256163608439, + "grad_norm": 0.50812166929245, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 5110 + }, + { + "epoch": 0.8277423005415891, + "grad_norm": 0.5319661498069763, + "learning_rate": 0.0002, + "loss": 0.8243, + "step": 5120 + }, + { + "epoch": 0.8293589847223345, + "grad_norm": 0.4949689209461212, + "learning_rate": 0.0002, + "loss": 0.7798, + "step": 5130 + }, + { + "epoch": 0.8309756689030798, + "grad_norm": 0.5151591300964355, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 5140 + }, + { + "epoch": 0.8325923530838251, + "grad_norm": 0.5530214309692383, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 5150 + }, + { + "epoch": 0.8342090372645704, + "grad_norm": 0.6297410130500793, + "learning_rate": 0.0002, + "loss": 0.8251, + "step": 5160 + }, + { + "epoch": 0.8358257214453156, + "grad_norm": 0.5466840267181396, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 5170 + }, + { + "epoch": 0.837442405626061, + "grad_norm": 0.652913510799408, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 5180 + }, + { + "epoch": 0.8390590898068062, + "grad_norm": 0.5811293125152588, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 5190 + }, + { + "epoch": 0.8406757739875516, + "grad_norm": 0.5109550952911377, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 5200 + }, + { + "epoch": 0.8422924581682968, + "grad_norm": 0.4551706612110138, + "learning_rate": 0.0002, + "loss": 0.8077, + "step": 5210 + }, + { + "epoch": 0.8439091423490421, + "grad_norm": 0.5813754200935364, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5220 + }, + { + "epoch": 0.8455258265297874, + "grad_norm": 0.5856947898864746, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 5230 + }, + { + "epoch": 0.8471425107105327, + "grad_norm": 0.5482739210128784, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 5240 + }, + { + "epoch": 0.8487591948912779, + "grad_norm": 0.49023720622062683, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 5250 + }, + { + "epoch": 0.8503758790720233, + "grad_norm": 0.49472475051879883, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5260 + }, + { + "epoch": 0.8519925632527686, + "grad_norm": 0.5490226745605469, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 5270 + }, + { + "epoch": 0.8536092474335139, + "grad_norm": 0.5340665578842163, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 5280 + }, + { + "epoch": 0.8552259316142592, + "grad_norm": 0.5962483882904053, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 5290 + }, + { + "epoch": 0.8568426157950044, + "grad_norm": 0.586358368396759, + "learning_rate": 0.0002, + "loss": 0.8765, + "step": 5300 + }, + { + "epoch": 0.8584592999757498, + "grad_norm": 0.49120277166366577, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 5310 + }, + { + "epoch": 0.860075984156495, + "grad_norm": 0.5887332558631897, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 5320 + }, + { + "epoch": 0.8616926683372403, + "grad_norm": 0.42496153712272644, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 5330 + }, + { + "epoch": 0.8633093525179856, + "grad_norm": 0.5489874482154846, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 5340 + }, + { + "epoch": 0.8649260366987309, + "grad_norm": 0.5850813984870911, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 5350 + }, + { + "epoch": 0.8665427208794761, + "grad_norm": 0.517487108707428, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 5360 + }, + { + "epoch": 0.8681594050602215, + "grad_norm": 0.5339142680168152, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 5370 + }, + { + "epoch": 0.8697760892409668, + "grad_norm": 0.6236387491226196, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 5380 + }, + { + "epoch": 0.8713927734217121, + "grad_norm": 0.5752192735671997, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 5390 + }, + { + "epoch": 0.8730094576024574, + "grad_norm": 0.6724614500999451, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 5400 + }, + { + "epoch": 0.8746261417832026, + "grad_norm": 0.5280613303184509, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 5410 + }, + { + "epoch": 0.876242825963948, + "grad_norm": 0.44033288955688477, + "learning_rate": 0.0002, + "loss": 0.8231, + "step": 5420 + }, + { + "epoch": 0.8778595101446932, + "grad_norm": 0.5199708342552185, + "learning_rate": 0.0002, + "loss": 0.8839, + "step": 5430 + }, + { + "epoch": 0.8794761943254386, + "grad_norm": 0.46778348088264465, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 5440 + }, + { + "epoch": 0.8810928785061838, + "grad_norm": 0.4657754898071289, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 5450 + }, + { + "epoch": 0.8827095626869291, + "grad_norm": 0.5472902655601501, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 5460 + }, + { + "epoch": 0.8843262468676744, + "grad_norm": 0.4876766800880432, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 5470 + }, + { + "epoch": 0.8859429310484197, + "grad_norm": 0.5057248473167419, + "learning_rate": 0.0002, + "loss": 0.7906, + "step": 5480 + }, + { + "epoch": 0.8875596152291649, + "grad_norm": 0.4637320637702942, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 5490 + }, + { + "epoch": 0.8891762994099103, + "grad_norm": 0.471955806016922, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 5500 + }, + { + "epoch": 0.8907929835906556, + "grad_norm": 0.5209813714027405, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 5510 + }, + { + "epoch": 0.8924096677714008, + "grad_norm": 0.6213834285736084, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 5520 + }, + { + "epoch": 0.8940263519521462, + "grad_norm": 0.5215408205986023, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 5530 + }, + { + "epoch": 0.8956430361328914, + "grad_norm": 0.580478310585022, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 5540 + }, + { + "epoch": 0.8972597203136368, + "grad_norm": 0.49102169275283813, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 5550 + }, + { + "epoch": 0.898876404494382, + "grad_norm": 0.6043479442596436, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 5560 + }, + { + "epoch": 0.9004930886751273, + "grad_norm": 0.5636463165283203, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 5570 + }, + { + "epoch": 0.9021097728558726, + "grad_norm": 0.5620124340057373, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 5580 + }, + { + "epoch": 0.9037264570366179, + "grad_norm": 0.5206354856491089, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 5590 + }, + { + "epoch": 0.9053431412173631, + "grad_norm": 0.5798229575157166, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 5600 + }, + { + "epoch": 0.9069598253981085, + "grad_norm": 0.6428212523460388, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 5610 + }, + { + "epoch": 0.9085765095788537, + "grad_norm": 0.48064687848091125, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 5620 + }, + { + "epoch": 0.9101931937595991, + "grad_norm": 0.6347860097885132, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 5630 + }, + { + "epoch": 0.9118098779403444, + "grad_norm": 0.5353913307189941, + "learning_rate": 0.0002, + "loss": 0.851, + "step": 5640 + }, + { + "epoch": 0.9134265621210896, + "grad_norm": 0.5323944091796875, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 5650 + }, + { + "epoch": 0.915043246301835, + "grad_norm": 0.5261843204498291, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 5660 + }, + { + "epoch": 0.9166599304825802, + "grad_norm": 0.5451326966285706, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 5670 + }, + { + "epoch": 0.9182766146633256, + "grad_norm": 0.5183324217796326, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 5680 + }, + { + "epoch": 0.9198932988440708, + "grad_norm": 0.47229018807411194, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 5690 + }, + { + "epoch": 0.9215099830248161, + "grad_norm": 0.49180513620376587, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 5700 + }, + { + "epoch": 0.9231266672055614, + "grad_norm": 0.5419785380363464, + "learning_rate": 0.0002, + "loss": 0.8735, + "step": 5710 + }, + { + "epoch": 0.9247433513863067, + "grad_norm": 0.5408698916435242, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 5720 + }, + { + "epoch": 0.9263600355670519, + "grad_norm": 0.5286232829093933, + "learning_rate": 0.0002, + "loss": 0.7701, + "step": 5730 + }, + { + "epoch": 0.9279767197477973, + "grad_norm": 0.7539758086204529, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 5740 + }, + { + "epoch": 0.9295934039285425, + "grad_norm": 0.5166944861412048, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 5750 + }, + { + "epoch": 0.9312100881092878, + "grad_norm": 0.6601425409317017, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 5760 + }, + { + "epoch": 0.9328267722900332, + "grad_norm": 0.5029960870742798, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 5770 + }, + { + "epoch": 0.9344434564707784, + "grad_norm": 0.4926645755767822, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 5780 + }, + { + "epoch": 0.9360601406515238, + "grad_norm": 0.5739615559577942, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 5790 + }, + { + "epoch": 0.937676824832269, + "grad_norm": 0.5058279037475586, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 5800 + }, + { + "epoch": 0.9392935090130143, + "grad_norm": 0.5260962247848511, + "learning_rate": 0.0002, + "loss": 0.8537, + "step": 5810 + }, + { + "epoch": 0.9409101931937596, + "grad_norm": 0.5768588185310364, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 5820 + }, + { + "epoch": 0.9425268773745049, + "grad_norm": 0.5170126557350159, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 5830 + }, + { + "epoch": 0.9441435615552501, + "grad_norm": 0.5745864510536194, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 5840 + }, + { + "epoch": 0.9457602457359955, + "grad_norm": 0.5551357865333557, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 5850 + }, + { + "epoch": 0.9473769299167407, + "grad_norm": 0.5776078701019287, + "learning_rate": 0.0002, + "loss": 0.8529, + "step": 5860 + }, + { + "epoch": 0.9489936140974861, + "grad_norm": 0.5340062379837036, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 5870 + }, + { + "epoch": 0.9506102982782314, + "grad_norm": 0.6447290182113647, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 5880 + }, + { + "epoch": 0.9522269824589766, + "grad_norm": 0.5123815536499023, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 5890 + }, + { + "epoch": 0.953843666639722, + "grad_norm": 0.48547613620758057, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 5900 + }, + { + "epoch": 0.9554603508204672, + "grad_norm": 0.5791414976119995, + "learning_rate": 0.0002, + "loss": 0.7896, + "step": 5910 + }, + { + "epoch": 0.9570770350012126, + "grad_norm": 0.6195011734962463, + "learning_rate": 0.0002, + "loss": 0.8408, + "step": 5920 + }, + { + "epoch": 0.9586937191819578, + "grad_norm": 0.6323803067207336, + "learning_rate": 0.0002, + "loss": 0.7805, + "step": 5930 + }, + { + "epoch": 0.9603104033627031, + "grad_norm": 0.45552879571914673, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 5940 + }, + { + "epoch": 0.9619270875434484, + "grad_norm": 0.5796473622322083, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 5950 + }, + { + "epoch": 0.9635437717241937, + "grad_norm": 0.647261381149292, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 5960 + }, + { + "epoch": 0.9651604559049389, + "grad_norm": 0.5487682819366455, + "learning_rate": 0.0002, + "loss": 0.8086, + "step": 5970 + }, + { + "epoch": 0.9667771400856843, + "grad_norm": 0.5743663907051086, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 5980 + }, + { + "epoch": 0.9683938242664295, + "grad_norm": 0.5470591187477112, + "learning_rate": 0.0002, + "loss": 0.8153, + "step": 5990 + }, + { + "epoch": 0.9700105084471748, + "grad_norm": 0.5901660323143005, + "learning_rate": 0.0002, + "loss": 0.8119, + "step": 6000 + }, + { + "epoch": 0.9716271926279202, + "grad_norm": 0.6544759273529053, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 6010 + }, + { + "epoch": 0.9732438768086654, + "grad_norm": 0.6288470029830933, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 6020 + }, + { + "epoch": 0.9748605609894108, + "grad_norm": 0.673153817653656, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 6030 + }, + { + "epoch": 0.976477245170156, + "grad_norm": 0.42854753136634827, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 6040 + }, + { + "epoch": 0.9780939293509013, + "grad_norm": 0.5227066278457642, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 6050 + }, + { + "epoch": 0.9797106135316466, + "grad_norm": 0.5372416973114014, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 6060 + }, + { + "epoch": 0.9813272977123919, + "grad_norm": 0.6026402115821838, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 6070 + }, + { + "epoch": 0.9829439818931371, + "grad_norm": 0.49547791481018066, + "learning_rate": 0.0002, + "loss": 0.8347, + "step": 6080 + }, + { + "epoch": 0.9845606660738825, + "grad_norm": 0.4641951322555542, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 6090 + }, + { + "epoch": 0.9861773502546277, + "grad_norm": 0.5818535089492798, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 6100 + }, + { + "epoch": 0.9877940344353731, + "grad_norm": 0.63955157995224, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 6110 + }, + { + "epoch": 0.9894107186161183, + "grad_norm": 0.5649438500404358, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 6120 + }, + { + "epoch": 0.9910274027968636, + "grad_norm": 0.5290433168411255, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 6130 + }, + { + "epoch": 0.992644086977609, + "grad_norm": 0.6399374008178711, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 6140 + }, + { + "epoch": 0.9942607711583542, + "grad_norm": 0.6736576557159424, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 6150 + }, + { + "epoch": 0.9958774553390995, + "grad_norm": 0.515420138835907, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 6160 + }, + { + "epoch": 0.9974941395198448, + "grad_norm": 0.562677800655365, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 6170 + }, + { + "epoch": 0.9991108237005901, + "grad_norm": 0.7113858461380005, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 6180 + }, + { + "epoch": 0.9999191657909627, + "eval_loss": 1.0871200561523438, + "eval_runtime": 122.2071, + "eval_samples_per_second": 5.998, + "eval_steps_per_second": 0.753, + "step": 6185 + }, + { + "epoch": 1.0007275078813354, + "grad_norm": 0.7111801505088806, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 6190 + }, + { + "epoch": 1.0023441920620806, + "grad_norm": 0.5402125716209412, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 6200 + }, + { + "epoch": 1.003960876242826, + "grad_norm": 0.6098830103874207, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 6210 + }, + { + "epoch": 1.0055775604235713, + "grad_norm": 0.5829983353614807, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 6220 + }, + { + "epoch": 1.0071942446043165, + "grad_norm": 0.5614621043205261, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 6230 + }, + { + "epoch": 1.0088109287850617, + "grad_norm": 0.5954238772392273, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 6240 + }, + { + "epoch": 1.0104276129658072, + "grad_norm": 0.6480574607849121, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 6250 + }, + { + "epoch": 1.0120442971465524, + "grad_norm": 0.6051128506660461, + "learning_rate": 0.0002, + "loss": 0.7514, + "step": 6260 + }, + { + "epoch": 1.0136609813272976, + "grad_norm": 0.6318870782852173, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 6270 + }, + { + "epoch": 1.015277665508043, + "grad_norm": 0.5048980116844177, + "learning_rate": 0.0002, + "loss": 0.7178, + "step": 6280 + }, + { + "epoch": 1.0168943496887883, + "grad_norm": 0.6346936225891113, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 6290 + }, + { + "epoch": 1.0185110338695336, + "grad_norm": 0.5711665749549866, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 6300 + }, + { + "epoch": 1.0201277180502788, + "grad_norm": 0.5175361037254333, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 6310 + }, + { + "epoch": 1.0217444022310243, + "grad_norm": 0.5360831618309021, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 6320 + }, + { + "epoch": 1.0233610864117695, + "grad_norm": 0.614675760269165, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 6330 + }, + { + "epoch": 1.0249777705925147, + "grad_norm": 0.5626118183135986, + "learning_rate": 0.0002, + "loss": 0.7748, + "step": 6340 + }, + { + "epoch": 1.02659445477326, + "grad_norm": 0.574897289276123, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 6350 + }, + { + "epoch": 1.0282111389540054, + "grad_norm": 0.7185447812080383, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 6360 + }, + { + "epoch": 1.0298278231347506, + "grad_norm": 0.6705799698829651, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 6370 + }, + { + "epoch": 1.0314445073154959, + "grad_norm": 0.6740428805351257, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 6380 + }, + { + "epoch": 1.0330611914962413, + "grad_norm": 0.663902759552002, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 6390 + }, + { + "epoch": 1.0346778756769865, + "grad_norm": 0.5029543042182922, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 6400 + }, + { + "epoch": 1.0362945598577318, + "grad_norm": 0.7813863158226013, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 6410 + }, + { + "epoch": 1.037911244038477, + "grad_norm": 0.5396282076835632, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 6420 + }, + { + "epoch": 1.0395279282192225, + "grad_norm": 0.5253293514251709, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 6430 + }, + { + "epoch": 1.0411446123999677, + "grad_norm": 0.7236770987510681, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 6440 + }, + { + "epoch": 1.042761296580713, + "grad_norm": 0.5670917630195618, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 6450 + }, + { + "epoch": 1.0443779807614582, + "grad_norm": 0.6031978726387024, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 6460 + }, + { + "epoch": 1.0459946649422036, + "grad_norm": 0.5309213399887085, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 6470 + }, + { + "epoch": 1.0476113491229488, + "grad_norm": 0.7114651799201965, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 6480 + }, + { + "epoch": 1.049228033303694, + "grad_norm": 0.5591610670089722, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 6490 + }, + { + "epoch": 1.0508447174844395, + "grad_norm": 0.5185961127281189, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 6500 + }, + { + "epoch": 1.0524614016651848, + "grad_norm": 0.6510552167892456, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 6510 + }, + { + "epoch": 1.05407808584593, + "grad_norm": 0.6557928919792175, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 6520 + }, + { + "epoch": 1.0556947700266752, + "grad_norm": 0.6973192691802979, + "learning_rate": 0.0002, + "loss": 0.8056, + "step": 6530 + }, + { + "epoch": 1.0573114542074207, + "grad_norm": 0.6226583123207092, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 6540 + }, + { + "epoch": 1.058928138388166, + "grad_norm": 0.5633195638656616, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 6550 + }, + { + "epoch": 1.0605448225689111, + "grad_norm": 0.7466658353805542, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 6560 + }, + { + "epoch": 1.0621615067496564, + "grad_norm": 0.6462772488594055, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 6570 + }, + { + "epoch": 1.0637781909304018, + "grad_norm": 0.5266856551170349, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 6580 + }, + { + "epoch": 1.065394875111147, + "grad_norm": 0.534392774105072, + "learning_rate": 0.0002, + "loss": 0.7157, + "step": 6590 + }, + { + "epoch": 1.0670115592918923, + "grad_norm": 0.7514177560806274, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 6600 + }, + { + "epoch": 1.0686282434726375, + "grad_norm": 0.7593035697937012, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 6610 + }, + { + "epoch": 1.070244927653383, + "grad_norm": 0.5277858972549438, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 6620 + }, + { + "epoch": 1.0718616118341282, + "grad_norm": 0.5573670268058777, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 6630 + }, + { + "epoch": 1.0734782960148734, + "grad_norm": 0.6802396774291992, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 6640 + }, + { + "epoch": 1.0750949801956189, + "grad_norm": 0.7367215752601624, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 6650 + }, + { + "epoch": 1.0767116643763641, + "grad_norm": 0.5961891412734985, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 6660 + }, + { + "epoch": 1.0783283485571094, + "grad_norm": 0.5736313462257385, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 6670 + }, + { + "epoch": 1.0799450327378546, + "grad_norm": 0.619219183921814, + "learning_rate": 0.0002, + "loss": 0.7178, + "step": 6680 + }, + { + "epoch": 1.0815617169186, + "grad_norm": 0.6214390993118286, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 6690 + }, + { + "epoch": 1.0831784010993453, + "grad_norm": 0.564536988735199, + "learning_rate": 0.0002, + "loss": 0.7554, + "step": 6700 + }, + { + "epoch": 1.0847950852800905, + "grad_norm": 0.5838140249252319, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 6710 + }, + { + "epoch": 1.0864117694608357, + "grad_norm": 0.7000553607940674, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 6720 + }, + { + "epoch": 1.0880284536415812, + "grad_norm": 0.7078263759613037, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 6730 + }, + { + "epoch": 1.0896451378223264, + "grad_norm": 0.8353848457336426, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 6740 + }, + { + "epoch": 1.0912618220030716, + "grad_norm": 0.5615518689155579, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 6750 + }, + { + "epoch": 1.0928785061838169, + "grad_norm": 0.5475581288337708, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 6760 + }, + { + "epoch": 1.0944951903645623, + "grad_norm": 0.5835978388786316, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 6770 + }, + { + "epoch": 1.0961118745453076, + "grad_norm": 0.5516105890274048, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 6780 + }, + { + "epoch": 1.0977285587260528, + "grad_norm": 0.5875251889228821, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 6790 + }, + { + "epoch": 1.0993452429067982, + "grad_norm": 0.7376947999000549, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 6800 + }, + { + "epoch": 1.1009619270875435, + "grad_norm": 0.5656165480613708, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 6810 + }, + { + "epoch": 1.1025786112682887, + "grad_norm": 0.6365954279899597, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 6820 + }, + { + "epoch": 1.104195295449034, + "grad_norm": 0.5033080577850342, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 6830 + }, + { + "epoch": 1.1058119796297794, + "grad_norm": 0.617396891117096, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 6840 + }, + { + "epoch": 1.1074286638105246, + "grad_norm": 0.6395374536514282, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 6850 + }, + { + "epoch": 1.1090453479912699, + "grad_norm": 0.6775295734405518, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 6860 + }, + { + "epoch": 1.1106620321720153, + "grad_norm": 0.6655223965644836, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 6870 + }, + { + "epoch": 1.1122787163527605, + "grad_norm": 0.676655113697052, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 6880 + }, + { + "epoch": 1.1138954005335058, + "grad_norm": 0.6062718629837036, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 6890 + }, + { + "epoch": 1.115512084714251, + "grad_norm": 0.590943455696106, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 6900 + }, + { + "epoch": 1.1171287688949965, + "grad_norm": 0.6315317153930664, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 6910 + }, + { + "epoch": 1.1187454530757417, + "grad_norm": 0.47979024052619934, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 6920 + }, + { + "epoch": 1.120362137256487, + "grad_norm": 0.647298276424408, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 6930 + }, + { + "epoch": 1.1219788214372322, + "grad_norm": 0.7336484789848328, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 6940 + }, + { + "epoch": 1.1235955056179776, + "grad_norm": 0.5071424245834351, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 6950 + }, + { + "epoch": 1.1252121897987228, + "grad_norm": 0.6527144312858582, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 6960 + }, + { + "epoch": 1.126828873979468, + "grad_norm": 0.6935935020446777, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 6970 + }, + { + "epoch": 1.1284455581602133, + "grad_norm": 0.8026931881904602, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 6980 + }, + { + "epoch": 1.1300622423409588, + "grad_norm": 0.5210393667221069, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 6990 + }, + { + "epoch": 1.131678926521704, + "grad_norm": 0.60475093126297, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 7000 + }, + { + "epoch": 1.1332956107024492, + "grad_norm": 0.6417073607444763, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 7010 + }, + { + "epoch": 1.1349122948831947, + "grad_norm": 0.6732175946235657, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 7020 + }, + { + "epoch": 1.13652897906394, + "grad_norm": 0.6719491481781006, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 7030 + }, + { + "epoch": 1.1381456632446851, + "grad_norm": 0.5708295106887817, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 7040 + }, + { + "epoch": 1.1397623474254304, + "grad_norm": 0.7141719460487366, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 7050 + }, + { + "epoch": 1.1413790316061758, + "grad_norm": 0.6187017560005188, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 7060 + }, + { + "epoch": 1.142995715786921, + "grad_norm": 0.50581294298172, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 7070 + }, + { + "epoch": 1.1446123999676663, + "grad_norm": 0.5620143413543701, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 7080 + }, + { + "epoch": 1.1462290841484115, + "grad_norm": 0.6231929659843445, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 7090 + }, + { + "epoch": 1.147845768329157, + "grad_norm": 0.5775774121284485, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 7100 + }, + { + "epoch": 1.1494624525099022, + "grad_norm": 0.6492809653282166, + "learning_rate": 0.0002, + "loss": 0.7728, + "step": 7110 + }, + { + "epoch": 1.1510791366906474, + "grad_norm": 0.6434972286224365, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 7120 + }, + { + "epoch": 1.1526958208713927, + "grad_norm": 0.6191812753677368, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 7130 + }, + { + "epoch": 1.1543125050521381, + "grad_norm": 0.6690331697463989, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 7140 + }, + { + "epoch": 1.1559291892328833, + "grad_norm": 0.5977938175201416, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 7150 + }, + { + "epoch": 1.1575458734136286, + "grad_norm": 0.6195854544639587, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 7160 + }, + { + "epoch": 1.159162557594374, + "grad_norm": 0.5752048492431641, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 7170 + }, + { + "epoch": 1.1607792417751193, + "grad_norm": 0.589081883430481, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 7180 + }, + { + "epoch": 1.1623959259558645, + "grad_norm": 0.756996750831604, + "learning_rate": 0.0002, + "loss": 0.704, + "step": 7190 + }, + { + "epoch": 1.1640126101366097, + "grad_norm": 0.7614967226982117, + "learning_rate": 0.0002, + "loss": 0.7404, + "step": 7200 + }, + { + "epoch": 1.1656292943173552, + "grad_norm": 0.6120437979698181, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 7210 + }, + { + "epoch": 1.1672459784981004, + "grad_norm": 0.6210004687309265, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 7220 + }, + { + "epoch": 1.1688626626788456, + "grad_norm": 0.6044116020202637, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 7230 + }, + { + "epoch": 1.170479346859591, + "grad_norm": 0.5418457388877869, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 7240 + }, + { + "epoch": 1.1720960310403363, + "grad_norm": 0.6413537263870239, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 7250 + }, + { + "epoch": 1.1737127152210816, + "grad_norm": 0.5777867436408997, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 7260 + }, + { + "epoch": 1.1753293994018268, + "grad_norm": 0.7092402577400208, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 7270 + }, + { + "epoch": 1.176946083582572, + "grad_norm": 0.6351709365844727, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 7280 + }, + { + "epoch": 1.1785627677633175, + "grad_norm": 0.6172189712524414, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 7290 + }, + { + "epoch": 1.1801794519440627, + "grad_norm": 0.6801714897155762, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 7300 + }, + { + "epoch": 1.181796136124808, + "grad_norm": 0.6044712066650391, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 7310 + }, + { + "epoch": 1.1834128203055534, + "grad_norm": 0.7413212060928345, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 7320 + }, + { + "epoch": 1.1850295044862986, + "grad_norm": 0.5303856134414673, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 7330 + }, + { + "epoch": 1.1866461886670439, + "grad_norm": 0.5647098422050476, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 7340 + }, + { + "epoch": 1.188262872847789, + "grad_norm": 0.7374135255813599, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 7350 + }, + { + "epoch": 1.1898795570285345, + "grad_norm": 0.5710089206695557, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 7360 + }, + { + "epoch": 1.1914962412092798, + "grad_norm": 0.6073619723320007, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7370 + }, + { + "epoch": 1.193112925390025, + "grad_norm": 0.5899916887283325, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 7380 + }, + { + "epoch": 1.1947296095707705, + "grad_norm": 0.7762434482574463, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 7390 + }, + { + "epoch": 1.1963462937515157, + "grad_norm": 0.679949939250946, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 7400 + }, + { + "epoch": 1.197962977932261, + "grad_norm": 0.6106849312782288, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 7410 + }, + { + "epoch": 1.1995796621130062, + "grad_norm": 0.682461678981781, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 7420 + }, + { + "epoch": 1.2011963462937516, + "grad_norm": 0.6087017059326172, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 7430 + }, + { + "epoch": 1.2028130304744968, + "grad_norm": 0.63739013671875, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 7440 + }, + { + "epoch": 1.204429714655242, + "grad_norm": 0.6154777407646179, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 7450 + }, + { + "epoch": 1.2060463988359873, + "grad_norm": 0.7491534948348999, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 7460 + }, + { + "epoch": 1.2076630830167328, + "grad_norm": 0.6664797067642212, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 7470 + }, + { + "epoch": 1.209279767197478, + "grad_norm": 0.6660266518592834, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 7480 + }, + { + "epoch": 1.2108964513782232, + "grad_norm": 0.6972551345825195, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 7490 + }, + { + "epoch": 1.2125131355589684, + "grad_norm": 0.6157945990562439, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 7500 + }, + { + "epoch": 1.214129819739714, + "grad_norm": 0.5199310183525085, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 7510 + }, + { + "epoch": 1.2157465039204591, + "grad_norm": 0.577610433101654, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 7520 + }, + { + "epoch": 1.2173631881012044, + "grad_norm": 0.53652423620224, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 7530 + }, + { + "epoch": 1.2189798722819498, + "grad_norm": 0.6479050517082214, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 7540 + }, + { + "epoch": 1.220596556462695, + "grad_norm": 0.618748128414154, + "learning_rate": 0.0002, + "loss": 0.7534, + "step": 7550 + }, + { + "epoch": 1.2222132406434403, + "grad_norm": 0.6311424374580383, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 7560 + }, + { + "epoch": 1.2238299248241855, + "grad_norm": 0.6595825552940369, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 7570 + }, + { + "epoch": 1.225446609004931, + "grad_norm": 0.5198960900306702, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 7580 + }, + { + "epoch": 1.2270632931856762, + "grad_norm": 0.578650712966919, + "learning_rate": 0.0002, + "loss": 0.674, + "step": 7590 + }, + { + "epoch": 1.2286799773664214, + "grad_norm": 0.6080220937728882, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 7600 + }, + { + "epoch": 1.2302966615471669, + "grad_norm": 0.7050248384475708, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 7610 + }, + { + "epoch": 1.2319133457279121, + "grad_norm": 0.6652196049690247, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 7620 + }, + { + "epoch": 1.2335300299086573, + "grad_norm": 0.7322776317596436, + "learning_rate": 0.0002, + "loss": 0.7085, + "step": 7630 + }, + { + "epoch": 1.2351467140894026, + "grad_norm": 0.4998728036880493, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 7640 + }, + { + "epoch": 1.2367633982701478, + "grad_norm": 0.6428788900375366, + "learning_rate": 0.0002, + "loss": 0.7214, + "step": 7650 + }, + { + "epoch": 1.2383800824508933, + "grad_norm": 0.585242509841919, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 7660 + }, + { + "epoch": 1.2399967666316385, + "grad_norm": 0.5211917757987976, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 7670 + }, + { + "epoch": 1.2416134508123837, + "grad_norm": 0.6490384340286255, + "learning_rate": 0.0002, + "loss": 0.746, + "step": 7680 + }, + { + "epoch": 1.2432301349931292, + "grad_norm": 0.6249763369560242, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 7690 + }, + { + "epoch": 1.2448468191738744, + "grad_norm": 0.71870356798172, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 7700 + }, + { + "epoch": 1.2464635033546196, + "grad_norm": 0.6761967539787292, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 7710 + }, + { + "epoch": 1.2480801875353649, + "grad_norm": 0.6500617265701294, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 7720 + }, + { + "epoch": 1.2496968717161103, + "grad_norm": 0.8069869875907898, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 7730 + }, + { + "epoch": 1.2513135558968556, + "grad_norm": 0.6044608950614929, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 7740 + }, + { + "epoch": 1.2529302400776008, + "grad_norm": 0.6573283076286316, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 7750 + }, + { + "epoch": 1.2545469242583462, + "grad_norm": 0.625430166721344, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 7760 + }, + { + "epoch": 1.2561636084390915, + "grad_norm": 0.5442022681236267, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 7770 + }, + { + "epoch": 1.2577802926198367, + "grad_norm": 0.6818386912345886, + "learning_rate": 0.0002, + "loss": 0.7172, + "step": 7780 + }, + { + "epoch": 1.259396976800582, + "grad_norm": 0.6381874084472656, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 7790 + }, + { + "epoch": 1.2610136609813272, + "grad_norm": 0.6269212961196899, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 7800 + }, + { + "epoch": 1.2626303451620726, + "grad_norm": 0.600121259689331, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 7810 + }, + { + "epoch": 1.2642470293428179, + "grad_norm": 0.6337703466415405, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 7820 + }, + { + "epoch": 1.2658637135235633, + "grad_norm": 0.7234963774681091, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 7830 + }, + { + "epoch": 1.2674803977043085, + "grad_norm": 0.800184965133667, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 7840 + }, + { + "epoch": 1.2690970818850538, + "grad_norm": 0.7539464831352234, + "learning_rate": 0.0002, + "loss": 0.7426, + "step": 7850 + }, + { + "epoch": 1.270713766065799, + "grad_norm": 0.5493760704994202, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 7860 + }, + { + "epoch": 1.2723304502465442, + "grad_norm": 0.7477145791053772, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 7870 + }, + { + "epoch": 1.2739471344272897, + "grad_norm": 0.6366362571716309, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 7880 + }, + { + "epoch": 1.275563818608035, + "grad_norm": 0.7419533729553223, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 7890 + }, + { + "epoch": 1.2771805027887801, + "grad_norm": 0.6141223311424255, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 7900 + }, + { + "epoch": 1.2787971869695256, + "grad_norm": 0.7522598505020142, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 7910 + }, + { + "epoch": 1.2804138711502708, + "grad_norm": 0.6935804486274719, + "learning_rate": 0.0002, + "loss": 0.7097, + "step": 7920 + }, + { + "epoch": 1.282030555331016, + "grad_norm": 0.7239290475845337, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 7930 + }, + { + "epoch": 1.2836472395117613, + "grad_norm": 0.8800187110900879, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 7940 + }, + { + "epoch": 1.2852639236925067, + "grad_norm": 0.540458083152771, + "learning_rate": 0.0002, + "loss": 0.6991, + "step": 7950 + }, + { + "epoch": 1.286880607873252, + "grad_norm": 0.6492934226989746, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 7960 + }, + { + "epoch": 1.2884972920539972, + "grad_norm": 0.6543959379196167, + "learning_rate": 0.0002, + "loss": 0.7742, + "step": 7970 + }, + { + "epoch": 1.2901139762347427, + "grad_norm": 0.5804705619812012, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 7980 + }, + { + "epoch": 1.291730660415488, + "grad_norm": 0.7074727416038513, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 7990 + }, + { + "epoch": 1.2933473445962331, + "grad_norm": 0.5347974300384521, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 8000 + }, + { + "epoch": 1.2949640287769784, + "grad_norm": 0.6457298398017883, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 8010 + }, + { + "epoch": 1.2965807129577236, + "grad_norm": 0.6407219171524048, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 8020 + }, + { + "epoch": 1.298197397138469, + "grad_norm": 0.828439474105835, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 8030 + }, + { + "epoch": 1.2998140813192143, + "grad_norm": 0.4840380549430847, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 8040 + }, + { + "epoch": 1.3014307654999595, + "grad_norm": 0.5921024680137634, + "learning_rate": 0.0002, + "loss": 0.7283, + "step": 8050 + }, + { + "epoch": 1.303047449680705, + "grad_norm": 0.6170315146446228, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 8060 + }, + { + "epoch": 1.3046641338614502, + "grad_norm": 0.5374847054481506, + "learning_rate": 0.0002, + "loss": 0.7534, + "step": 8070 + }, + { + "epoch": 1.3062808180421954, + "grad_norm": 0.545758068561554, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 8080 + }, + { + "epoch": 1.3078975022229407, + "grad_norm": 0.55641770362854, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 8090 + }, + { + "epoch": 1.309514186403686, + "grad_norm": 0.6724897027015686, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8100 + }, + { + "epoch": 1.3111308705844313, + "grad_norm": 0.6923972368240356, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 8110 + }, + { + "epoch": 1.3127475547651766, + "grad_norm": 0.5136841535568237, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 8120 + }, + { + "epoch": 1.314364238945922, + "grad_norm": 0.6766283512115479, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 8130 + }, + { + "epoch": 1.3159809231266673, + "grad_norm": 0.6283926367759705, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 8140 + }, + { + "epoch": 1.3175976073074125, + "grad_norm": 0.644216001033783, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 8150 + }, + { + "epoch": 1.3192142914881577, + "grad_norm": 0.7827503085136414, + "learning_rate": 0.0002, + "loss": 0.7125, + "step": 8160 + }, + { + "epoch": 1.320830975668903, + "grad_norm": 0.6651390790939331, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 8170 + }, + { + "epoch": 1.3224476598496484, + "grad_norm": 0.5547412633895874, + "learning_rate": 0.0002, + "loss": 0.7778, + "step": 8180 + }, + { + "epoch": 1.3240643440303936, + "grad_norm": 0.6765179634094238, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 8190 + }, + { + "epoch": 1.325681028211139, + "grad_norm": 0.6822077035903931, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 8200 + }, + { + "epoch": 1.3272977123918843, + "grad_norm": 0.5941002368927002, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 8210 + }, + { + "epoch": 1.3289143965726296, + "grad_norm": 0.4850037097930908, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 8220 + }, + { + "epoch": 1.3305310807533748, + "grad_norm": 0.6162990927696228, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 8230 + }, + { + "epoch": 1.33214776493412, + "grad_norm": 0.6665613651275635, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 8240 + }, + { + "epoch": 1.3337644491148655, + "grad_norm": 0.618192732334137, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 8250 + }, + { + "epoch": 1.3353811332956107, + "grad_norm": 0.710418701171875, + "learning_rate": 0.0002, + "loss": 0.7532, + "step": 8260 + }, + { + "epoch": 1.336997817476356, + "grad_norm": 0.5109876990318298, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 8270 + }, + { + "epoch": 1.3386145016571014, + "grad_norm": 0.6791711449623108, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 8280 + }, + { + "epoch": 1.3402311858378466, + "grad_norm": 0.6836432814598083, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8290 + }, + { + "epoch": 1.3418478700185918, + "grad_norm": 0.5579386353492737, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8300 + }, + { + "epoch": 1.343464554199337, + "grad_norm": 0.6713546514511108, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 8310 + }, + { + "epoch": 1.3450812383800825, + "grad_norm": 0.5353720188140869, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 8320 + }, + { + "epoch": 1.3466979225608278, + "grad_norm": 0.5813682675361633, + "learning_rate": 0.0002, + "loss": 0.718, + "step": 8330 + }, + { + "epoch": 1.348314606741573, + "grad_norm": 0.8158791661262512, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 8340 + }, + { + "epoch": 1.3499312909223184, + "grad_norm": 0.6193785071372986, + "learning_rate": 0.0002, + "loss": 0.6992, + "step": 8350 + }, + { + "epoch": 1.3515479751030637, + "grad_norm": 0.6353939771652222, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 8360 + }, + { + "epoch": 1.353164659283809, + "grad_norm": 0.6925048232078552, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 8370 + }, + { + "epoch": 1.3547813434645541, + "grad_norm": 0.988264799118042, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 8380 + }, + { + "epoch": 1.3563980276452994, + "grad_norm": 0.6476002931594849, + "learning_rate": 0.0002, + "loss": 0.7744, + "step": 8390 + }, + { + "epoch": 1.3580147118260448, + "grad_norm": 0.7120398879051208, + "learning_rate": 0.0002, + "loss": 0.776, + "step": 8400 + }, + { + "epoch": 1.35963139600679, + "grad_norm": 0.9048416614532471, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 8410 + }, + { + "epoch": 1.3612480801875353, + "grad_norm": 0.7000672817230225, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 8420 + }, + { + "epoch": 1.3628647643682807, + "grad_norm": 0.6015632152557373, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 8430 + }, + { + "epoch": 1.364481448549026, + "grad_norm": 0.612516462802887, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 8440 + }, + { + "epoch": 1.3660981327297712, + "grad_norm": 0.5969301462173462, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 8450 + }, + { + "epoch": 1.3677148169105164, + "grad_norm": 0.6730654239654541, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 8460 + }, + { + "epoch": 1.369331501091262, + "grad_norm": 0.6386392116546631, + "learning_rate": 0.0002, + "loss": 0.7465, + "step": 8470 + }, + { + "epoch": 1.3709481852720071, + "grad_norm": 0.739544153213501, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 8480 + }, + { + "epoch": 1.3725648694527524, + "grad_norm": 0.6462782621383667, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 8490 + }, + { + "epoch": 1.3741815536334978, + "grad_norm": 0.7346843481063843, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 8500 + }, + { + "epoch": 1.375798237814243, + "grad_norm": 0.6884821057319641, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 8510 + }, + { + "epoch": 1.3774149219949883, + "grad_norm": 0.6999333500862122, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 8520 + }, + { + "epoch": 1.3790316061757335, + "grad_norm": 0.5378713011741638, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 8530 + }, + { + "epoch": 1.3806482903564787, + "grad_norm": 0.5417906641960144, + "learning_rate": 0.0002, + "loss": 0.6797, + "step": 8540 + }, + { + "epoch": 1.3822649745372242, + "grad_norm": 0.6602526307106018, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 8550 + }, + { + "epoch": 1.3838816587179694, + "grad_norm": 0.7073674201965332, + "learning_rate": 0.0002, + "loss": 0.7356, + "step": 8560 + }, + { + "epoch": 1.3854983428987149, + "grad_norm": 0.5841707587242126, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 8570 + }, + { + "epoch": 1.38711502707946, + "grad_norm": 0.7031095027923584, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 8580 + }, + { + "epoch": 1.3887317112602053, + "grad_norm": 0.5198570489883423, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 8590 + }, + { + "epoch": 1.3903483954409506, + "grad_norm": 0.7261320352554321, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 8600 + }, + { + "epoch": 1.3919650796216958, + "grad_norm": 0.5616350173950195, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 8610 + }, + { + "epoch": 1.3935817638024413, + "grad_norm": 0.5185914635658264, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 8620 + }, + { + "epoch": 1.3951984479831865, + "grad_norm": 0.5814694762229919, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 8630 + }, + { + "epoch": 1.3968151321639317, + "grad_norm": 0.6977371573448181, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 8640 + }, + { + "epoch": 1.3984318163446772, + "grad_norm": 0.6855689883232117, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 8650 + }, + { + "epoch": 1.4000485005254224, + "grad_norm": 0.5414357781410217, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 8660 + }, + { + "epoch": 1.4016651847061676, + "grad_norm": 0.6970012784004211, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 8670 + }, + { + "epoch": 1.4032818688869129, + "grad_norm": 0.526079535484314, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 8680 + }, + { + "epoch": 1.404898553067658, + "grad_norm": 0.758712887763977, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 8690 + }, + { + "epoch": 1.4065152372484035, + "grad_norm": 0.7118762731552124, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 8700 + }, + { + "epoch": 1.4081319214291488, + "grad_norm": 0.5696909427642822, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 8710 + }, + { + "epoch": 1.4097486056098942, + "grad_norm": 0.7995436787605286, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 8720 + }, + { + "epoch": 1.4113652897906395, + "grad_norm": 0.7237521409988403, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 8730 + }, + { + "epoch": 1.4129819739713847, + "grad_norm": 0.744628369808197, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 8740 + }, + { + "epoch": 1.41459865815213, + "grad_norm": 0.6082926988601685, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 8750 + }, + { + "epoch": 1.4162153423328752, + "grad_norm": 0.5185243487358093, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 8760 + }, + { + "epoch": 1.4178320265136206, + "grad_norm": 0.5183082222938538, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 8770 + }, + { + "epoch": 1.4194487106943658, + "grad_norm": 0.7326041460037231, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 8780 + }, + { + "epoch": 1.421065394875111, + "grad_norm": 0.7174660563468933, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 8790 + }, + { + "epoch": 1.4226820790558565, + "grad_norm": 0.8080165982246399, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 8800 + }, + { + "epoch": 1.4242987632366018, + "grad_norm": 0.5061507821083069, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 8810 + }, + { + "epoch": 1.425915447417347, + "grad_norm": 0.801602840423584, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 8820 + }, + { + "epoch": 1.4275321315980922, + "grad_norm": 0.6150273084640503, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 8830 + }, + { + "epoch": 1.4291488157788377, + "grad_norm": 0.8786525726318359, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 8840 + }, + { + "epoch": 1.430765499959583, + "grad_norm": 0.6371538639068604, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 8850 + }, + { + "epoch": 1.4323821841403281, + "grad_norm": 0.6409295797348022, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 8860 + }, + { + "epoch": 1.4339988683210736, + "grad_norm": 0.6452359557151794, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 8870 + }, + { + "epoch": 1.4356155525018188, + "grad_norm": 0.5842334628105164, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 8880 + }, + { + "epoch": 1.437232236682564, + "grad_norm": 0.696761965751648, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 8890 + }, + { + "epoch": 1.4388489208633093, + "grad_norm": 0.6384600400924683, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 8900 + }, + { + "epoch": 1.4404656050440545, + "grad_norm": 0.5981136560440063, + "learning_rate": 0.0002, + "loss": 0.7049, + "step": 8910 + }, + { + "epoch": 1.4420822892248, + "grad_norm": 0.6355637907981873, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 8920 + }, + { + "epoch": 1.4436989734055452, + "grad_norm": 0.6374830603599548, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 8930 + }, + { + "epoch": 1.4453156575862904, + "grad_norm": 0.559013307094574, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 8940 + }, + { + "epoch": 1.446932341767036, + "grad_norm": 0.7289170026779175, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 8950 + }, + { + "epoch": 1.4485490259477811, + "grad_norm": 0.8649206757545471, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 8960 + }, + { + "epoch": 1.4501657101285264, + "grad_norm": 0.7664689421653748, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 8970 + }, + { + "epoch": 1.4517823943092716, + "grad_norm": 0.7109952569007874, + "learning_rate": 0.0002, + "loss": 0.7431, + "step": 8980 + }, + { + "epoch": 1.453399078490017, + "grad_norm": 0.6312844753265381, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 8990 + }, + { + "epoch": 1.4550157626707623, + "grad_norm": 0.6616617441177368, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 9000 + }, + { + "epoch": 1.4566324468515075, + "grad_norm": 0.7384068965911865, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 9010 + }, + { + "epoch": 1.458249131032253, + "grad_norm": 0.6549670100212097, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 9020 + }, + { + "epoch": 1.4598658152129982, + "grad_norm": 0.6254119277000427, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 9030 + }, + { + "epoch": 1.4614824993937434, + "grad_norm": 0.6806328892707825, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 9040 + }, + { + "epoch": 1.4630991835744886, + "grad_norm": 0.6803115010261536, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 9050 + }, + { + "epoch": 1.4647158677552339, + "grad_norm": 0.48529282212257385, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 9060 + }, + { + "epoch": 1.4663325519359793, + "grad_norm": 0.5995030999183655, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 9070 + }, + { + "epoch": 1.4679492361167246, + "grad_norm": 0.6005427837371826, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 9080 + }, + { + "epoch": 1.46956592029747, + "grad_norm": 0.718564510345459, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 9090 + }, + { + "epoch": 1.4711826044782153, + "grad_norm": 0.7003577351570129, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 9100 + }, + { + "epoch": 1.4727992886589605, + "grad_norm": 0.5888323783874512, + "learning_rate": 0.0002, + "loss": 0.8069, + "step": 9110 + }, + { + "epoch": 1.4744159728397057, + "grad_norm": 0.6417609453201294, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 9120 + }, + { + "epoch": 1.476032657020451, + "grad_norm": 0.572294294834137, + "learning_rate": 0.0002, + "loss": 0.7441, + "step": 9130 + }, + { + "epoch": 1.4776493412011964, + "grad_norm": 0.8200714588165283, + "learning_rate": 0.0002, + "loss": 0.8053, + "step": 9140 + }, + { + "epoch": 1.4792660253819416, + "grad_norm": 0.6343288421630859, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 9150 + }, + { + "epoch": 1.4808827095626869, + "grad_norm": 0.7017961144447327, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 9160 + }, + { + "epoch": 1.4824993937434323, + "grad_norm": 0.6202912926673889, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 9170 + }, + { + "epoch": 1.4841160779241775, + "grad_norm": 0.6677869558334351, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 9180 + }, + { + "epoch": 1.4857327621049228, + "grad_norm": 0.6052267551422119, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 9190 + }, + { + "epoch": 1.487349446285668, + "grad_norm": 0.6638872027397156, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 9200 + }, + { + "epoch": 1.4889661304664135, + "grad_norm": 0.6245523691177368, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 9210 + }, + { + "epoch": 1.4905828146471587, + "grad_norm": 0.5761767625808716, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 9220 + }, + { + "epoch": 1.492199498827904, + "grad_norm": 0.8175981640815735, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 9230 + }, + { + "epoch": 1.4938161830086494, + "grad_norm": 0.9144009947776794, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 9240 + }, + { + "epoch": 1.4954328671893946, + "grad_norm": 0.5742552876472473, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 9250 + }, + { + "epoch": 1.4970495513701398, + "grad_norm": 0.534534215927124, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 9260 + }, + { + "epoch": 1.498666235550885, + "grad_norm": 0.7836225032806396, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 9270 + }, + { + "epoch": 1.5002829197316303, + "grad_norm": 0.5292993187904358, + "learning_rate": 0.0002, + "loss": 0.7453, + "step": 9280 + }, + { + "epoch": 1.5018996039123758, + "grad_norm": 0.8044071793556213, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 9290 + }, + { + "epoch": 1.503516288093121, + "grad_norm": 0.6185805201530457, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 9300 + }, + { + "epoch": 1.5051329722738664, + "grad_norm": 0.6093607544898987, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 9310 + }, + { + "epoch": 1.5067496564546117, + "grad_norm": 0.5891730189323425, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 9320 + }, + { + "epoch": 1.508366340635357, + "grad_norm": 0.6331129670143127, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 9330 + }, + { + "epoch": 1.5099830248161021, + "grad_norm": 0.7690958380699158, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 9340 + }, + { + "epoch": 1.5115997089968474, + "grad_norm": 0.6548877358436584, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 9350 + }, + { + "epoch": 1.5132163931775926, + "grad_norm": 0.6545143127441406, + "learning_rate": 0.0002, + "loss": 0.7408, + "step": 9360 + }, + { + "epoch": 1.514833077358338, + "grad_norm": 0.553247332572937, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 9370 + }, + { + "epoch": 1.5164497615390833, + "grad_norm": 0.8145074844360352, + "learning_rate": 0.0002, + "loss": 0.7265, + "step": 9380 + }, + { + "epoch": 1.5180664457198287, + "grad_norm": 0.7636994123458862, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 9390 + }, + { + "epoch": 1.519683129900574, + "grad_norm": 0.6838982701301575, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 9400 + }, + { + "epoch": 1.5212998140813192, + "grad_norm": 0.8599441647529602, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 9410 + }, + { + "epoch": 1.5229164982620644, + "grad_norm": 0.7020329833030701, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 9420 + }, + { + "epoch": 1.5245331824428097, + "grad_norm": 0.6964772343635559, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 9430 + }, + { + "epoch": 1.5261498666235551, + "grad_norm": 0.6916600465774536, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 9440 + }, + { + "epoch": 1.5277665508043003, + "grad_norm": 0.7282621264457703, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 9450 + }, + { + "epoch": 1.5293832349850458, + "grad_norm": 0.5363983511924744, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 9460 + }, + { + "epoch": 1.530999919165791, + "grad_norm": 0.6184861063957214, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 9470 + }, + { + "epoch": 1.5326166033465363, + "grad_norm": 0.5991285443305969, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 9480 + }, + { + "epoch": 1.5342332875272815, + "grad_norm": 0.8176587820053101, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 9490 + }, + { + "epoch": 1.5358499717080267, + "grad_norm": 0.6473721861839294, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 9500 + }, + { + "epoch": 1.5374666558887722, + "grad_norm": 0.7319952845573425, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 9510 + }, + { + "epoch": 1.5390833400695174, + "grad_norm": 0.702900230884552, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 9520 + }, + { + "epoch": 1.5407000242502629, + "grad_norm": 0.7971600294113159, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 9530 + }, + { + "epoch": 1.542316708431008, + "grad_norm": 0.6527525186538696, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 9540 + }, + { + "epoch": 1.5439333926117533, + "grad_norm": 0.5791676044464111, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 9550 + }, + { + "epoch": 1.5455500767924986, + "grad_norm": 0.5619390606880188, + "learning_rate": 0.0002, + "loss": 0.7585, + "step": 9560 + }, + { + "epoch": 1.5471667609732438, + "grad_norm": 0.5701689124107361, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 9570 + }, + { + "epoch": 1.548783445153989, + "grad_norm": 0.47549352049827576, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 9580 + }, + { + "epoch": 1.5504001293347345, + "grad_norm": 0.8730611205101013, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 9590 + }, + { + "epoch": 1.5520168135154797, + "grad_norm": 0.6842091083526611, + "learning_rate": 0.0002, + "loss": 0.798, + "step": 9600 + }, + { + "epoch": 1.5536334976962252, + "grad_norm": 0.6675129532814026, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 9610 + }, + { + "epoch": 1.5552501818769704, + "grad_norm": 0.8173956274986267, + "learning_rate": 0.0002, + "loss": 0.7954, + "step": 9620 + }, + { + "epoch": 1.5568668660577156, + "grad_norm": 0.724947452545166, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 9630 + }, + { + "epoch": 1.5584835502384609, + "grad_norm": 0.6154758930206299, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 9640 + }, + { + "epoch": 1.560100234419206, + "grad_norm": 0.6072008013725281, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 9650 + }, + { + "epoch": 1.5617169185999515, + "grad_norm": 0.659010648727417, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 9660 + }, + { + "epoch": 1.5633336027806968, + "grad_norm": 0.65857994556427, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 9670 + }, + { + "epoch": 1.5649502869614422, + "grad_norm": 0.5914267301559448, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 9680 + }, + { + "epoch": 1.5665669711421875, + "grad_norm": 0.6248020529747009, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 9690 + }, + { + "epoch": 1.5681836553229327, + "grad_norm": 0.7147795557975769, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 9700 + }, + { + "epoch": 1.569800339503678, + "grad_norm": 0.7076232433319092, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 9710 + }, + { + "epoch": 1.5714170236844232, + "grad_norm": 0.6217400431632996, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 9720 + }, + { + "epoch": 1.5730337078651684, + "grad_norm": 0.6709911227226257, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 9730 + }, + { + "epoch": 1.5746503920459138, + "grad_norm": 0.749171257019043, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 9740 + }, + { + "epoch": 1.576267076226659, + "grad_norm": 0.6241145730018616, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 9750 + }, + { + "epoch": 1.5778837604074045, + "grad_norm": 0.4960934817790985, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 9760 + }, + { + "epoch": 1.5795004445881498, + "grad_norm": 0.6593309640884399, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 9770 + }, + { + "epoch": 1.581117128768895, + "grad_norm": 0.5814042091369629, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 9780 + }, + { + "epoch": 1.5827338129496402, + "grad_norm": 0.5936070680618286, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 9790 + }, + { + "epoch": 1.5843504971303854, + "grad_norm": 0.6454403400421143, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 9800 + }, + { + "epoch": 1.585967181311131, + "grad_norm": 0.7612107992172241, + "learning_rate": 0.0002, + "loss": 0.7677, + "step": 9810 + }, + { + "epoch": 1.5875838654918761, + "grad_norm": 0.6494482755661011, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 9820 + }, + { + "epoch": 1.5892005496726216, + "grad_norm": 0.7825694680213928, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 9830 + }, + { + "epoch": 1.5908172338533668, + "grad_norm": 0.6757757663726807, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 9840 + }, + { + "epoch": 1.592433918034112, + "grad_norm": 0.7105609178543091, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 9850 + }, + { + "epoch": 1.5940506022148573, + "grad_norm": 0.7596991062164307, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 9860 + }, + { + "epoch": 1.5956672863956025, + "grad_norm": 0.5681525468826294, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 9870 + }, + { + "epoch": 1.5972839705763477, + "grad_norm": 0.6090980768203735, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 9880 + }, + { + "epoch": 1.5989006547570932, + "grad_norm": 0.6271613240242004, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 9890 + }, + { + "epoch": 1.6005173389378387, + "grad_norm": 0.7656369805335999, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 9900 + }, + { + "epoch": 1.6021340231185839, + "grad_norm": 0.7504446506500244, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 9910 + }, + { + "epoch": 1.6037507072993291, + "grad_norm": 0.659656286239624, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 9920 + }, + { + "epoch": 1.6053673914800743, + "grad_norm": 0.6006826162338257, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 9930 + }, + { + "epoch": 1.6069840756608196, + "grad_norm": 0.7872757911682129, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 9940 + }, + { + "epoch": 1.6086007598415648, + "grad_norm": 0.5545852780342102, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 9950 + }, + { + "epoch": 1.6102174440223103, + "grad_norm": 0.7429468631744385, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 9960 + }, + { + "epoch": 1.6118341282030555, + "grad_norm": 0.6873556971549988, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 9970 + }, + { + "epoch": 1.613450812383801, + "grad_norm": 0.5874287486076355, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 9980 + }, + { + "epoch": 1.6150674965645462, + "grad_norm": 0.6039386987686157, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 9990 + }, + { + "epoch": 1.6166841807452914, + "grad_norm": 0.6233575940132141, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 10000 + }, + { + "epoch": 1.6183008649260366, + "grad_norm": 0.7676448225975037, + "learning_rate": 0.0002, + "loss": 0.7602, + "step": 10010 + }, + { + "epoch": 1.6199175491067819, + "grad_norm": 0.6565698385238647, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 10020 + }, + { + "epoch": 1.6215342332875273, + "grad_norm": 0.6787590384483337, + "learning_rate": 0.0002, + "loss": 0.7104, + "step": 10030 + }, + { + "epoch": 1.6231509174682726, + "grad_norm": 0.6137678027153015, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 10040 + }, + { + "epoch": 1.624767601649018, + "grad_norm": 0.5236800312995911, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 10050 + }, + { + "epoch": 1.6263842858297632, + "grad_norm": 0.7626367807388306, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 10060 + }, + { + "epoch": 1.6280009700105085, + "grad_norm": 0.5657260417938232, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 10070 + }, + { + "epoch": 1.6296176541912537, + "grad_norm": 0.4913991391658783, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 10080 + }, + { + "epoch": 1.631234338371999, + "grad_norm": 0.7715556621551514, + "learning_rate": 0.0002, + "loss": 0.7596, + "step": 10090 + }, + { + "epoch": 1.6328510225527442, + "grad_norm": 0.6509000062942505, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 10100 + }, + { + "epoch": 1.6344677067334896, + "grad_norm": 0.6215850114822388, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 10110 + }, + { + "epoch": 1.6360843909142349, + "grad_norm": 0.6956844329833984, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 10120 + }, + { + "epoch": 1.6377010750949803, + "grad_norm": 0.6111597418785095, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 10130 + }, + { + "epoch": 1.6393177592757255, + "grad_norm": 0.6518288850784302, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 10140 + }, + { + "epoch": 1.6409344434564708, + "grad_norm": 0.6914522051811218, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 10150 + }, + { + "epoch": 1.642551127637216, + "grad_norm": 0.63785719871521, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 10160 + }, + { + "epoch": 1.6441678118179612, + "grad_norm": 0.6379287838935852, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 10170 + }, + { + "epoch": 1.6457844959987067, + "grad_norm": 0.6793403029441833, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 10180 + }, + { + "epoch": 1.647401180179452, + "grad_norm": 0.6099132895469666, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 10190 + }, + { + "epoch": 1.6490178643601974, + "grad_norm": 0.5869854092597961, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 10200 + }, + { + "epoch": 1.6506345485409426, + "grad_norm": 0.7716999053955078, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 10210 + }, + { + "epoch": 1.6522512327216878, + "grad_norm": 0.6854110360145569, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 10220 + }, + { + "epoch": 1.653867916902433, + "grad_norm": 0.6957170367240906, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 10230 + }, + { + "epoch": 1.6554846010831783, + "grad_norm": 0.6932903528213501, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 10240 + }, + { + "epoch": 1.6571012852639235, + "grad_norm": 0.7713165283203125, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 10250 + }, + { + "epoch": 1.658717969444669, + "grad_norm": 0.7455793619155884, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 10260 + }, + { + "epoch": 1.6603346536254144, + "grad_norm": 0.5464168190956116, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 10270 + }, + { + "epoch": 1.6619513378061597, + "grad_norm": 0.6782926321029663, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 10280 + }, + { + "epoch": 1.663568021986905, + "grad_norm": 0.7962649464607239, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 10290 + }, + { + "epoch": 1.6651847061676501, + "grad_norm": 0.6814526319503784, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 10300 + }, + { + "epoch": 1.6668013903483954, + "grad_norm": 0.656895101070404, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 10310 + }, + { + "epoch": 1.6684180745291406, + "grad_norm": 0.6085672378540039, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 10320 + }, + { + "epoch": 1.670034758709886, + "grad_norm": 0.585508406162262, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 10330 + }, + { + "epoch": 1.6716514428906313, + "grad_norm": 0.6930184364318848, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 10340 + }, + { + "epoch": 1.6732681270713767, + "grad_norm": 0.575663149356842, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 10350 + }, + { + "epoch": 1.674884811252122, + "grad_norm": 0.582502543926239, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 10360 + }, + { + "epoch": 1.6765014954328672, + "grad_norm": 0.5668916702270508, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 10370 + }, + { + "epoch": 1.6781181796136124, + "grad_norm": 0.6070065498352051, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 10380 + }, + { + "epoch": 1.6797348637943577, + "grad_norm": 0.6141316294670105, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 10390 + }, + { + "epoch": 1.6813515479751031, + "grad_norm": 0.8359124064445496, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 10400 + }, + { + "epoch": 1.6829682321558483, + "grad_norm": 0.5378185510635376, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 10410 + }, + { + "epoch": 1.6845849163365938, + "grad_norm": 0.6959536075592041, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 10420 + }, + { + "epoch": 1.686201600517339, + "grad_norm": 0.6514357328414917, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 10430 + }, + { + "epoch": 1.6878182846980843, + "grad_norm": 0.7706646919250488, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 10440 + }, + { + "epoch": 1.6894349688788295, + "grad_norm": 0.6183337569236755, + "learning_rate": 0.0002, + "loss": 0.7673, + "step": 10450 + }, + { + "epoch": 1.6910516530595747, + "grad_norm": 0.6123278141021729, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 10460 + }, + { + "epoch": 1.69266833724032, + "grad_norm": 0.6894851326942444, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 10470 + }, + { + "epoch": 1.6942850214210654, + "grad_norm": 0.7497312426567078, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 10480 + }, + { + "epoch": 1.6959017056018106, + "grad_norm": 0.5968214273452759, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 10490 + }, + { + "epoch": 1.697518389782556, + "grad_norm": 0.6747927069664001, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 10500 + }, + { + "epoch": 1.6991350739633013, + "grad_norm": 0.5708310008049011, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 10510 + }, + { + "epoch": 1.7007517581440466, + "grad_norm": 0.606526792049408, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 10520 + }, + { + "epoch": 1.7023684423247918, + "grad_norm": 0.662011981010437, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 10530 + }, + { + "epoch": 1.703985126505537, + "grad_norm": 0.7583045363426208, + "learning_rate": 0.0002, + "loss": 0.7999, + "step": 10540 + }, + { + "epoch": 1.7056018106862825, + "grad_norm": 0.721632182598114, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 10550 + }, + { + "epoch": 1.7072184948670277, + "grad_norm": 0.6107715368270874, + "learning_rate": 0.0002, + "loss": 0.7407, + "step": 10560 + }, + { + "epoch": 1.7088351790477732, + "grad_norm": 0.6652471423149109, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 10570 + }, + { + "epoch": 1.7104518632285184, + "grad_norm": 0.6308087110519409, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 10580 + }, + { + "epoch": 1.7120685474092636, + "grad_norm": 0.5464386940002441, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 10590 + }, + { + "epoch": 1.7136852315900089, + "grad_norm": 0.6558911204338074, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 10600 + }, + { + "epoch": 1.715301915770754, + "grad_norm": 0.5665024518966675, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 10610 + }, + { + "epoch": 1.7169185999514993, + "grad_norm": 0.7888094186782837, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 10620 + }, + { + "epoch": 1.7185352841322448, + "grad_norm": 0.7084909081459045, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 10630 + }, + { + "epoch": 1.7201519683129902, + "grad_norm": 0.7982324361801147, + "learning_rate": 0.0002, + "loss": 0.7557, + "step": 10640 + }, + { + "epoch": 1.7217686524937355, + "grad_norm": 0.6418732404708862, + "learning_rate": 0.0002, + "loss": 0.7345, + "step": 10650 + }, + { + "epoch": 1.7233853366744807, + "grad_norm": 0.7636681795120239, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 10660 + }, + { + "epoch": 1.725002020855226, + "grad_norm": 0.5646875500679016, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 10670 + }, + { + "epoch": 1.7266187050359711, + "grad_norm": 0.5231260657310486, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10680 + }, + { + "epoch": 1.7282353892167164, + "grad_norm": 0.7635011672973633, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 10690 + }, + { + "epoch": 1.7298520733974618, + "grad_norm": 0.7518259286880493, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 10700 + }, + { + "epoch": 1.731468757578207, + "grad_norm": 0.7295602560043335, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 10710 + }, + { + "epoch": 1.7330854417589525, + "grad_norm": 0.6984632015228271, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 10720 + }, + { + "epoch": 1.7347021259396977, + "grad_norm": 0.6198219060897827, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 10730 + }, + { + "epoch": 1.736318810120443, + "grad_norm": 0.6957576274871826, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10740 + }, + { + "epoch": 1.7379354943011882, + "grad_norm": 0.6430263519287109, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 10750 + }, + { + "epoch": 1.7395521784819334, + "grad_norm": 0.6134995222091675, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 10760 + }, + { + "epoch": 1.741168862662679, + "grad_norm": 0.7209452986717224, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 10770 + }, + { + "epoch": 1.7427855468434241, + "grad_norm": 0.6735447645187378, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 10780 + }, + { + "epoch": 1.7444022310241696, + "grad_norm": 0.5605693459510803, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 10790 + }, + { + "epoch": 1.7460189152049148, + "grad_norm": 0.6882363557815552, + "learning_rate": 0.0002, + "loss": 0.7759, + "step": 10800 + }, + { + "epoch": 1.74763559938566, + "grad_norm": 0.6386259198188782, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 10810 + }, + { + "epoch": 1.7492522835664053, + "grad_norm": 0.6529015302658081, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 10820 + }, + { + "epoch": 1.7508689677471505, + "grad_norm": 0.5664082765579224, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 10830 + }, + { + "epoch": 1.7524856519278957, + "grad_norm": 0.7532684206962585, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 10840 + }, + { + "epoch": 1.7541023361086412, + "grad_norm": 0.77171391248703, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 10850 + }, + { + "epoch": 1.7557190202893864, + "grad_norm": 0.7255431413650513, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 10860 + }, + { + "epoch": 1.7573357044701319, + "grad_norm": 0.763083279132843, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 10870 + }, + { + "epoch": 1.758952388650877, + "grad_norm": 0.6042402982711792, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 10880 + }, + { + "epoch": 1.7605690728316223, + "grad_norm": 0.7642518281936646, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 10890 + }, + { + "epoch": 1.7621857570123676, + "grad_norm": 0.6347904801368713, + "learning_rate": 0.0002, + "loss": 0.746, + "step": 10900 + }, + { + "epoch": 1.7638024411931128, + "grad_norm": 0.5371627807617188, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 10910 + }, + { + "epoch": 1.7654191253738583, + "grad_norm": 0.6840225458145142, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 10920 + }, + { + "epoch": 1.7670358095546035, + "grad_norm": 0.5288469195365906, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 10930 + }, + { + "epoch": 1.768652493735349, + "grad_norm": 0.69020676612854, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 10940 + }, + { + "epoch": 1.7702691779160942, + "grad_norm": 0.5943242311477661, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 10950 + }, + { + "epoch": 1.7718858620968394, + "grad_norm": 0.5616418123245239, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 10960 + }, + { + "epoch": 1.7735025462775846, + "grad_norm": 0.7209470868110657, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 10970 + }, + { + "epoch": 1.7751192304583299, + "grad_norm": 0.6657957434654236, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 10980 + }, + { + "epoch": 1.776735914639075, + "grad_norm": 0.6469064950942993, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 10990 + }, + { + "epoch": 1.7783525988198206, + "grad_norm": 0.6615678071975708, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 11000 + }, + { + "epoch": 1.779969283000566, + "grad_norm": 0.6722439527511597, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 11010 + }, + { + "epoch": 1.7815859671813112, + "grad_norm": 0.634136974811554, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 11020 + }, + { + "epoch": 1.7832026513620565, + "grad_norm": 0.6024377346038818, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 11030 + }, + { + "epoch": 1.7848193355428017, + "grad_norm": 0.6909403800964355, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 11040 + }, + { + "epoch": 1.786436019723547, + "grad_norm": 0.7148767709732056, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 11050 + }, + { + "epoch": 1.7880527039042922, + "grad_norm": 0.7442979216575623, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 11060 + }, + { + "epoch": 1.7896693880850376, + "grad_norm": 0.6830431818962097, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 11070 + }, + { + "epoch": 1.7912860722657828, + "grad_norm": 0.9172667264938354, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 11080 + }, + { + "epoch": 1.7929027564465283, + "grad_norm": 0.6799490451812744, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 11090 + }, + { + "epoch": 1.7945194406272735, + "grad_norm": 0.7617024779319763, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 11100 + }, + { + "epoch": 1.7961361248080188, + "grad_norm": 0.7701810002326965, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 11110 + }, + { + "epoch": 1.797752808988764, + "grad_norm": 0.7454385757446289, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 11120 + }, + { + "epoch": 1.7993694931695092, + "grad_norm": 0.6121436953544617, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 11130 + }, + { + "epoch": 1.8009861773502547, + "grad_norm": 0.6237571835517883, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 11140 + }, + { + "epoch": 1.802602861531, + "grad_norm": 0.6818515658378601, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 11150 + }, + { + "epoch": 1.8042195457117454, + "grad_norm": 0.7768308520317078, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 11160 + }, + { + "epoch": 1.8058362298924906, + "grad_norm": 0.6875537633895874, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 11170 + }, + { + "epoch": 1.8074529140732358, + "grad_norm": 0.7950584888458252, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 11180 + }, + { + "epoch": 1.809069598253981, + "grad_norm": 0.8210248351097107, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 11190 + }, + { + "epoch": 1.8106862824347263, + "grad_norm": 0.6674110889434814, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 11200 + }, + { + "epoch": 1.8123029666154715, + "grad_norm": 0.6261674761772156, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 11210 + }, + { + "epoch": 1.813919650796217, + "grad_norm": 0.6484741568565369, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 11220 + }, + { + "epoch": 1.8155363349769622, + "grad_norm": 0.6231244206428528, + "learning_rate": 0.0002, + "loss": 0.7718, + "step": 11230 + }, + { + "epoch": 1.8171530191577077, + "grad_norm": 0.7243146896362305, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 11240 + }, + { + "epoch": 1.818769703338453, + "grad_norm": 0.6776193380355835, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 11250 + }, + { + "epoch": 1.8203863875191981, + "grad_norm": 0.5973618030548096, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 11260 + }, + { + "epoch": 1.8220030716999434, + "grad_norm": 0.6451361179351807, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 11270 + }, + { + "epoch": 1.8236197558806886, + "grad_norm": 0.5963068008422852, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 11280 + }, + { + "epoch": 1.825236440061434, + "grad_norm": 0.536902129650116, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 11290 + }, + { + "epoch": 1.8268531242421793, + "grad_norm": 0.6993787288665771, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 11300 + }, + { + "epoch": 1.8284698084229247, + "grad_norm": 0.6135255098342896, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 11310 + }, + { + "epoch": 1.83008649260367, + "grad_norm": 0.6057423949241638, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 11320 + }, + { + "epoch": 1.8317031767844152, + "grad_norm": 0.6598812341690063, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 11330 + }, + { + "epoch": 1.8333198609651604, + "grad_norm": 0.6075948476791382, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 11340 + }, + { + "epoch": 1.8349365451459057, + "grad_norm": 0.7065447568893433, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 11350 + }, + { + "epoch": 1.8365532293266509, + "grad_norm": 0.680526614189148, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 11360 + }, + { + "epoch": 1.8381699135073963, + "grad_norm": 0.6356695294380188, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 11370 + }, + { + "epoch": 1.8397865976881416, + "grad_norm": 0.6399052143096924, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 11380 + }, + { + "epoch": 1.841403281868887, + "grad_norm": 0.6125704050064087, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 11390 + }, + { + "epoch": 1.8430199660496323, + "grad_norm": 0.7124643325805664, + "learning_rate": 0.0002, + "loss": 0.755, + "step": 11400 + }, + { + "epoch": 1.8446366502303775, + "grad_norm": 0.6099604964256287, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 11410 + }, + { + "epoch": 1.8462533344111227, + "grad_norm": 0.7338208556175232, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 11420 + }, + { + "epoch": 1.847870018591868, + "grad_norm": 0.7534668445587158, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 11430 + }, + { + "epoch": 1.8494867027726134, + "grad_norm": 0.6135470271110535, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 11440 + }, + { + "epoch": 1.8511033869533586, + "grad_norm": 0.6229309439659119, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 11450 + }, + { + "epoch": 1.852720071134104, + "grad_norm": 0.706423282623291, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 11460 + }, + { + "epoch": 1.8543367553148493, + "grad_norm": 0.5460049510002136, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 11470 + }, + { + "epoch": 1.8559534394955945, + "grad_norm": 0.6616711020469666, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 11480 + }, + { + "epoch": 1.8575701236763398, + "grad_norm": 0.6372783184051514, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 11490 + }, + { + "epoch": 1.859186807857085, + "grad_norm": 0.7162668108940125, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 11500 + }, + { + "epoch": 1.8608034920378305, + "grad_norm": 0.6605209708213806, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 11510 + }, + { + "epoch": 1.8624201762185757, + "grad_norm": 0.6933956742286682, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 11520 + }, + { + "epoch": 1.8640368603993211, + "grad_norm": 0.6582090854644775, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 11530 + }, + { + "epoch": 1.8656535445800664, + "grad_norm": 0.6416500806808472, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 11540 + }, + { + "epoch": 1.8672702287608116, + "grad_norm": 0.5434312224388123, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 11550 + }, + { + "epoch": 1.8688869129415568, + "grad_norm": 0.6827567219734192, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 11560 + }, + { + "epoch": 1.870503597122302, + "grad_norm": 0.7354370951652527, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 11570 + }, + { + "epoch": 1.8721202813030473, + "grad_norm": 0.590372622013092, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 11580 + }, + { + "epoch": 1.8737369654837928, + "grad_norm": 0.853183925151825, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 11590 + }, + { + "epoch": 1.875353649664538, + "grad_norm": 0.822678804397583, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 11600 + }, + { + "epoch": 1.8769703338452834, + "grad_norm": 0.6591550707817078, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 11610 + }, + { + "epoch": 1.8785870180260287, + "grad_norm": 0.7475301623344421, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 11620 + }, + { + "epoch": 1.880203702206774, + "grad_norm": 0.6390765309333801, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 11630 + }, + { + "epoch": 1.8818203863875191, + "grad_norm": 0.6589758992195129, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 11640 + }, + { + "epoch": 1.8834370705682644, + "grad_norm": 0.6765508651733398, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 11650 + }, + { + "epoch": 1.8850537547490098, + "grad_norm": 0.6527857780456543, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 11660 + }, + { + "epoch": 1.886670438929755, + "grad_norm": 0.6642923951148987, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 11670 + }, + { + "epoch": 1.8882871231105005, + "grad_norm": 0.6945584416389465, + "learning_rate": 0.0002, + "loss": 0.7701, + "step": 11680 + }, + { + "epoch": 1.8899038072912457, + "grad_norm": 0.694018542766571, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 11690 + }, + { + "epoch": 1.891520491471991, + "grad_norm": 0.7237417101860046, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 11700 + }, + { + "epoch": 1.8931371756527362, + "grad_norm": 0.7401309609413147, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 11710 + }, + { + "epoch": 1.8947538598334814, + "grad_norm": 0.6537784337997437, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 11720 + }, + { + "epoch": 1.8963705440142267, + "grad_norm": 0.7398539185523987, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 11730 + }, + { + "epoch": 1.8979872281949721, + "grad_norm": 0.6696075797080994, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 11740 + }, + { + "epoch": 1.8996039123757174, + "grad_norm": 0.6014142036437988, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 11750 + }, + { + "epoch": 1.9012205965564628, + "grad_norm": 0.7023524641990662, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 11760 + }, + { + "epoch": 1.902837280737208, + "grad_norm": 0.739973783493042, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 11770 + }, + { + "epoch": 1.9044539649179533, + "grad_norm": 0.5576770901679993, + "learning_rate": 0.0002, + "loss": 0.7848, + "step": 11780 + }, + { + "epoch": 1.9060706490986985, + "grad_norm": 0.6907393932342529, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 11790 + }, + { + "epoch": 1.9076873332794437, + "grad_norm": 0.6934581995010376, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 11800 + }, + { + "epoch": 1.9093040174601892, + "grad_norm": 0.591774582862854, + "learning_rate": 0.0002, + "loss": 0.7199, + "step": 11810 + }, + { + "epoch": 1.9109207016409344, + "grad_norm": 0.6249791383743286, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 11820 + }, + { + "epoch": 1.9125373858216799, + "grad_norm": 0.6755744218826294, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 11830 + }, + { + "epoch": 1.914154070002425, + "grad_norm": 0.7286285161972046, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 11840 + }, + { + "epoch": 1.9157707541831703, + "grad_norm": 0.7867850065231323, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 11850 + }, + { + "epoch": 1.9173874383639156, + "grad_norm": 0.6283972859382629, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 11860 + }, + { + "epoch": 1.9190041225446608, + "grad_norm": 0.605823814868927, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 11870 + }, + { + "epoch": 1.920620806725406, + "grad_norm": 0.5927976965904236, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 11880 + }, + { + "epoch": 1.9222374909061515, + "grad_norm": 0.5974002480506897, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 11890 + }, + { + "epoch": 1.923854175086897, + "grad_norm": 0.7091866135597229, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 11900 + }, + { + "epoch": 1.9254708592676422, + "grad_norm": 0.72496497631073, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 11910 + }, + { + "epoch": 1.9270875434483874, + "grad_norm": 0.6131896376609802, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 11920 + }, + { + "epoch": 1.9287042276291326, + "grad_norm": 0.6556436419487, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 11930 + }, + { + "epoch": 1.9303209118098779, + "grad_norm": 0.622932493686676, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 11940 + }, + { + "epoch": 1.931937595990623, + "grad_norm": 0.6618631482124329, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 11950 + }, + { + "epoch": 1.9335542801713685, + "grad_norm": 0.630966305732727, + "learning_rate": 0.0002, + "loss": 0.8332, + "step": 11960 + }, + { + "epoch": 1.9351709643521138, + "grad_norm": 0.6336734890937805, + "learning_rate": 0.0002, + "loss": 0.6854, + "step": 11970 + }, + { + "epoch": 1.9367876485328592, + "grad_norm": 0.655403196811676, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 11980 + }, + { + "epoch": 1.9384043327136045, + "grad_norm": 0.5640574097633362, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 11990 + }, + { + "epoch": 1.9400210168943497, + "grad_norm": 0.6322951316833496, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 12000 + }, + { + "epoch": 1.941637701075095, + "grad_norm": 0.615703821182251, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 12010 + }, + { + "epoch": 1.9432543852558402, + "grad_norm": 0.6487536430358887, + "learning_rate": 0.0002, + "loss": 0.786, + "step": 12020 + }, + { + "epoch": 1.9448710694365856, + "grad_norm": 0.9209630489349365, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 12030 + }, + { + "epoch": 1.9464877536173308, + "grad_norm": 0.67485511302948, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 12040 + }, + { + "epoch": 1.9481044377980763, + "grad_norm": 0.6831230521202087, + "learning_rate": 0.0002, + "loss": 0.7551, + "step": 12050 + }, + { + "epoch": 1.9497211219788215, + "grad_norm": 0.6578302383422852, + "learning_rate": 0.0002, + "loss": 0.7546, + "step": 12060 + }, + { + "epoch": 1.9513378061595668, + "grad_norm": 0.9975938200950623, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 12070 + }, + { + "epoch": 1.952954490340312, + "grad_norm": 0.6637365221977234, + "learning_rate": 0.0002, + "loss": 0.7952, + "step": 12080 + }, + { + "epoch": 1.9545711745210572, + "grad_norm": 0.605707049369812, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 12090 + }, + { + "epoch": 1.9561878587018025, + "grad_norm": 0.6584440469741821, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 12100 + }, + { + "epoch": 1.957804542882548, + "grad_norm": 0.6070835590362549, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 12110 + }, + { + "epoch": 1.9594212270632931, + "grad_norm": 0.7862601280212402, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 12120 + }, + { + "epoch": 1.9610379112440386, + "grad_norm": 0.8175255060195923, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 12130 + }, + { + "epoch": 1.9626545954247838, + "grad_norm": 0.5648472905158997, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 12140 + }, + { + "epoch": 1.964271279605529, + "grad_norm": 0.6591973304748535, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 12150 + }, + { + "epoch": 1.9658879637862743, + "grad_norm": 0.5960676074028015, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 12160 + }, + { + "epoch": 1.9675046479670195, + "grad_norm": 0.7272544503211975, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 12170 + }, + { + "epoch": 1.969121332147765, + "grad_norm": 0.7176699042320251, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 12180 + }, + { + "epoch": 1.9707380163285102, + "grad_norm": 0.6927123665809631, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 12190 + }, + { + "epoch": 1.9723547005092557, + "grad_norm": 0.5536034107208252, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 12200 + }, + { + "epoch": 1.9739713846900009, + "grad_norm": 0.8348390460014343, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 12210 + }, + { + "epoch": 1.9755880688707461, + "grad_norm": 0.6591181755065918, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 12220 + }, + { + "epoch": 1.9772047530514913, + "grad_norm": 1.0624109506607056, + "learning_rate": 0.0002, + "loss": 0.763, + "step": 12230 + }, + { + "epoch": 1.9788214372322366, + "grad_norm": 0.9265586137771606, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 12240 + }, + { + "epoch": 1.9804381214129818, + "grad_norm": 0.5998196005821228, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 12250 + }, + { + "epoch": 1.9820548055937273, + "grad_norm": 0.6960851550102234, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 12260 + }, + { + "epoch": 1.9836714897744727, + "grad_norm": 0.7674502730369568, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 12270 + }, + { + "epoch": 1.985288173955218, + "grad_norm": 0.6407275795936584, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 12280 + }, + { + "epoch": 1.9869048581359632, + "grad_norm": 0.6673079133033752, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 12290 + }, + { + "epoch": 1.9885215423167084, + "grad_norm": 0.6989844441413879, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 12300 + }, + { + "epoch": 1.9901382264974536, + "grad_norm": 0.7564442157745361, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 12310 + }, + { + "epoch": 1.9917549106781989, + "grad_norm": 0.6385478973388672, + "learning_rate": 0.0002, + "loss": 0.7719, + "step": 12320 + }, + { + "epoch": 1.9933715948589443, + "grad_norm": 0.7193717956542969, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 12330 + }, + { + "epoch": 1.9949882790396896, + "grad_norm": 0.7987112402915955, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 12340 + }, + { + "epoch": 1.996604963220435, + "grad_norm": 0.7260826826095581, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 12350 + }, + { + "epoch": 1.9982216474011802, + "grad_norm": 0.7968255281448364, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 12360 + }, + { + "epoch": 1.9998383315819255, + "grad_norm": 0.6893062591552734, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 12370 + }, + { + "epoch": 2.0, + "eval_loss": 1.1044032573699951, + "eval_runtime": 122.1508, + "eval_samples_per_second": 6.001, + "eval_steps_per_second": 0.753, + "step": 12371 + }, + { + "epoch": 2.0014550157626707, + "grad_norm": 0.7775409817695618, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 12380 + }, + { + "epoch": 2.003071699943416, + "grad_norm": 0.76218581199646, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 12390 + }, + { + "epoch": 2.004688384124161, + "grad_norm": 0.5677764415740967, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 12400 + }, + { + "epoch": 2.006305068304907, + "grad_norm": 0.808442234992981, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 12410 + }, + { + "epoch": 2.007921752485652, + "grad_norm": 0.7144765257835388, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 12420 + }, + { + "epoch": 2.0095384366663973, + "grad_norm": 0.6914031505584717, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 12430 + }, + { + "epoch": 2.0111551208471425, + "grad_norm": 0.7581454515457153, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 12440 + }, + { + "epoch": 2.0127718050278878, + "grad_norm": 0.8388504981994629, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 12450 + }, + { + "epoch": 2.014388489208633, + "grad_norm": 0.6716406941413879, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 12460 + }, + { + "epoch": 2.0160051733893782, + "grad_norm": 0.898902416229248, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 12470 + }, + { + "epoch": 2.0176218575701235, + "grad_norm": 0.6432679891586304, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 12480 + }, + { + "epoch": 2.019238541750869, + "grad_norm": 0.8021109104156494, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 12490 + }, + { + "epoch": 2.0208552259316144, + "grad_norm": 0.7039216756820679, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 12500 + }, + { + "epoch": 2.0224719101123596, + "grad_norm": 0.646531879901886, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 12510 + }, + { + "epoch": 2.024088594293105, + "grad_norm": 0.783704400062561, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 12520 + }, + { + "epoch": 2.02570527847385, + "grad_norm": 0.8805046677589417, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 12530 + }, + { + "epoch": 2.0273219626545953, + "grad_norm": 0.7289270758628845, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 12540 + }, + { + "epoch": 2.0289386468353405, + "grad_norm": 0.71653151512146, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 12550 + }, + { + "epoch": 2.030555331016086, + "grad_norm": 0.73281329870224, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 12560 + }, + { + "epoch": 2.0321720151968314, + "grad_norm": 0.6657090187072754, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 12570 + }, + { + "epoch": 2.0337886993775767, + "grad_norm": 0.8241133093833923, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 12580 + }, + { + "epoch": 2.035405383558322, + "grad_norm": 0.5834135413169861, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 12590 + }, + { + "epoch": 2.037022067739067, + "grad_norm": 0.84502112865448, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 12600 + }, + { + "epoch": 2.0386387519198124, + "grad_norm": 0.8952481746673584, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 12610 + }, + { + "epoch": 2.0402554361005576, + "grad_norm": 0.7801461815834045, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 12620 + }, + { + "epoch": 2.041872120281303, + "grad_norm": 0.6788367033004761, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 12630 + }, + { + "epoch": 2.0434888044620485, + "grad_norm": 0.7241756319999695, + "learning_rate": 0.0002, + "loss": 0.6162, + "step": 12640 + }, + { + "epoch": 2.0451054886427937, + "grad_norm": 0.6933388113975525, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 12650 + }, + { + "epoch": 2.046722172823539, + "grad_norm": 0.8029746413230896, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 12660 + }, + { + "epoch": 2.048338857004284, + "grad_norm": 0.946399986743927, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 12670 + }, + { + "epoch": 2.0499555411850294, + "grad_norm": 0.7072678804397583, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 12680 + }, + { + "epoch": 2.0515722253657747, + "grad_norm": 0.6810618042945862, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 12690 + }, + { + "epoch": 2.05318890954652, + "grad_norm": 0.7661160230636597, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 12700 + }, + { + "epoch": 2.0548055937272656, + "grad_norm": 0.6350653767585754, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 12710 + }, + { + "epoch": 2.056422277908011, + "grad_norm": 0.861890971660614, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 12720 + }, + { + "epoch": 2.058038962088756, + "grad_norm": 0.6489875912666321, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 12730 + }, + { + "epoch": 2.0596556462695013, + "grad_norm": 0.8268506526947021, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 12740 + }, + { + "epoch": 2.0612723304502465, + "grad_norm": 0.607679545879364, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 12750 + }, + { + "epoch": 2.0628890146309917, + "grad_norm": 0.6754153370857239, + "learning_rate": 0.0002, + "loss": 0.6649, + "step": 12760 + }, + { + "epoch": 2.064505698811737, + "grad_norm": 0.7263124585151672, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 12770 + }, + { + "epoch": 2.0661223829924826, + "grad_norm": 0.6986154317855835, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 12780 + }, + { + "epoch": 2.067739067173228, + "grad_norm": 0.7768576741218567, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 12790 + }, + { + "epoch": 2.069355751353973, + "grad_norm": 0.7546762824058533, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 12800 + }, + { + "epoch": 2.0709724355347183, + "grad_norm": 0.7588880062103271, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 12810 + }, + { + "epoch": 2.0725891197154636, + "grad_norm": 0.7457242608070374, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 12820 + }, + { + "epoch": 2.074205803896209, + "grad_norm": 0.6983516812324524, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 12830 + }, + { + "epoch": 2.075822488076954, + "grad_norm": 0.7950928807258606, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 12840 + }, + { + "epoch": 2.0774391722576993, + "grad_norm": 0.9248087406158447, + "learning_rate": 0.0002, + "loss": 0.6603, + "step": 12850 + }, + { + "epoch": 2.079055856438445, + "grad_norm": 0.7229493260383606, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 12860 + }, + { + "epoch": 2.08067254061919, + "grad_norm": 0.5710847973823547, + "learning_rate": 0.0002, + "loss": 0.6702, + "step": 12870 + }, + { + "epoch": 2.0822892247999354, + "grad_norm": 0.9580423831939697, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 12880 + }, + { + "epoch": 2.0839059089806806, + "grad_norm": 0.7399665713310242, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 12890 + }, + { + "epoch": 2.085522593161426, + "grad_norm": 0.7981410622596741, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 12900 + }, + { + "epoch": 2.087139277342171, + "grad_norm": 0.870759904384613, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 12910 + }, + { + "epoch": 2.0887559615229163, + "grad_norm": 0.7001481652259827, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 12920 + }, + { + "epoch": 2.090372645703662, + "grad_norm": 0.6745418310165405, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 12930 + }, + { + "epoch": 2.0919893298844072, + "grad_norm": 0.7739067673683167, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 12940 + }, + { + "epoch": 2.0936060140651525, + "grad_norm": 0.6742934584617615, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 12950 + }, + { + "epoch": 2.0952226982458977, + "grad_norm": 0.7270349860191345, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 12960 + }, + { + "epoch": 2.096839382426643, + "grad_norm": 0.7150624394416809, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 12970 + }, + { + "epoch": 2.098456066607388, + "grad_norm": 0.7734767198562622, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 12980 + }, + { + "epoch": 2.1000727507881334, + "grad_norm": 0.7618662118911743, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 12990 + }, + { + "epoch": 2.101689434968879, + "grad_norm": 0.6557944416999817, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 13000 + }, + { + "epoch": 2.1033061191496243, + "grad_norm": 0.8786448240280151, + "learning_rate": 0.0002, + "loss": 0.7268, + "step": 13010 + }, + { + "epoch": 2.1049228033303695, + "grad_norm": 0.6878724098205566, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 13020 + }, + { + "epoch": 2.1065394875111147, + "grad_norm": 0.822318971157074, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 13030 + }, + { + "epoch": 2.10815617169186, + "grad_norm": 0.831468939781189, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 13040 + }, + { + "epoch": 2.109772855872605, + "grad_norm": 0.7699505686759949, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 13050 + }, + { + "epoch": 2.1113895400533504, + "grad_norm": 0.7559016346931458, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 13060 + }, + { + "epoch": 2.1130062242340957, + "grad_norm": 0.6942209601402283, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 13070 + }, + { + "epoch": 2.1146229084148414, + "grad_norm": 0.6098947525024414, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 13080 + }, + { + "epoch": 2.1162395925955866, + "grad_norm": 0.6499016284942627, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 13090 + }, + { + "epoch": 2.117856276776332, + "grad_norm": 0.7719953060150146, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 13100 + }, + { + "epoch": 2.119472960957077, + "grad_norm": 0.6708134412765503, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 13110 + }, + { + "epoch": 2.1210896451378223, + "grad_norm": 0.8119585514068604, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 13120 + }, + { + "epoch": 2.1227063293185675, + "grad_norm": 0.6947157979011536, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 13130 + }, + { + "epoch": 2.1243230134993127, + "grad_norm": 0.8831837773323059, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 13140 + }, + { + "epoch": 2.1259396976800584, + "grad_norm": 0.7266910672187805, + "learning_rate": 0.0002, + "loss": 0.6997, + "step": 13150 + }, + { + "epoch": 2.1275563818608036, + "grad_norm": 0.8864351511001587, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 13160 + }, + { + "epoch": 2.129173066041549, + "grad_norm": 0.8104248046875, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 13170 + }, + { + "epoch": 2.130789750222294, + "grad_norm": 0.6077079772949219, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 13180 + }, + { + "epoch": 2.1324064344030393, + "grad_norm": 0.6874213814735413, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 13190 + }, + { + "epoch": 2.1340231185837846, + "grad_norm": 0.7134367823600769, + "learning_rate": 0.0002, + "loss": 0.642, + "step": 13200 + }, + { + "epoch": 2.13563980276453, + "grad_norm": 0.6101235151290894, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 13210 + }, + { + "epoch": 2.137256486945275, + "grad_norm": 0.6042411923408508, + "learning_rate": 0.0002, + "loss": 0.6529, + "step": 13220 + }, + { + "epoch": 2.1388731711260207, + "grad_norm": 0.914601743221283, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 13230 + }, + { + "epoch": 2.140489855306766, + "grad_norm": 0.7104284167289734, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 13240 + }, + { + "epoch": 2.142106539487511, + "grad_norm": 0.664395272731781, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 13250 + }, + { + "epoch": 2.1437232236682564, + "grad_norm": 0.6991241574287415, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 13260 + }, + { + "epoch": 2.1453399078490016, + "grad_norm": 0.5469560623168945, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 13270 + }, + { + "epoch": 2.146956592029747, + "grad_norm": 0.8454998135566711, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 13280 + }, + { + "epoch": 2.148573276210492, + "grad_norm": 0.7088868618011475, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 13290 + }, + { + "epoch": 2.1501899603912378, + "grad_norm": 0.7002687454223633, + "learning_rate": 0.0002, + "loss": 0.6835, + "step": 13300 + }, + { + "epoch": 2.151806644571983, + "grad_norm": 0.7785214781761169, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 13310 + }, + { + "epoch": 2.1534233287527282, + "grad_norm": 0.8049132227897644, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 13320 + }, + { + "epoch": 2.1550400129334735, + "grad_norm": 0.8062595129013062, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 13330 + }, + { + "epoch": 2.1566566971142187, + "grad_norm": 0.6208319067955017, + "learning_rate": 0.0002, + "loss": 0.6603, + "step": 13340 + }, + { + "epoch": 2.158273381294964, + "grad_norm": 0.7519655823707581, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 13350 + }, + { + "epoch": 2.159890065475709, + "grad_norm": 0.7645747065544128, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 13360 + }, + { + "epoch": 2.1615067496564544, + "grad_norm": 0.6847302913665771, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 13370 + }, + { + "epoch": 2.1631234338372, + "grad_norm": 0.8630441427230835, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 13380 + }, + { + "epoch": 2.1647401180179453, + "grad_norm": 0.7947702407836914, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 13390 + }, + { + "epoch": 2.1663568021986905, + "grad_norm": 0.6836977005004883, + "learning_rate": 0.0002, + "loss": 0.7206, + "step": 13400 + }, + { + "epoch": 2.1679734863794358, + "grad_norm": 0.7340566515922546, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 13410 + }, + { + "epoch": 2.169590170560181, + "grad_norm": 0.7075738906860352, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 13420 + }, + { + "epoch": 2.1712068547409262, + "grad_norm": 0.7080879807472229, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 13430 + }, + { + "epoch": 2.1728235389216715, + "grad_norm": 0.6218613386154175, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 13440 + }, + { + "epoch": 2.174440223102417, + "grad_norm": 0.8211479187011719, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 13450 + }, + { + "epoch": 2.1760569072831624, + "grad_norm": 0.864466667175293, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 13460 + }, + { + "epoch": 2.1776735914639076, + "grad_norm": 0.7943857908248901, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 13470 + }, + { + "epoch": 2.179290275644653, + "grad_norm": 0.78728187084198, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 13480 + }, + { + "epoch": 2.180906959825398, + "grad_norm": 0.697527289390564, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 13490 + }, + { + "epoch": 2.1825236440061433, + "grad_norm": 0.8205804228782654, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 13500 + }, + { + "epoch": 2.1841403281868885, + "grad_norm": 0.8709042072296143, + "learning_rate": 0.0002, + "loss": 0.7227, + "step": 13510 + }, + { + "epoch": 2.1857570123676338, + "grad_norm": 0.6228537559509277, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 13520 + }, + { + "epoch": 2.1873736965483794, + "grad_norm": 0.9566980004310608, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 13530 + }, + { + "epoch": 2.1889903807291247, + "grad_norm": 0.7128894329071045, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 13540 + }, + { + "epoch": 2.19060706490987, + "grad_norm": 0.6888654232025146, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 13550 + }, + { + "epoch": 2.192223749090615, + "grad_norm": 0.6444337368011475, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 13560 + }, + { + "epoch": 2.1938404332713604, + "grad_norm": 0.8008806705474854, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 13570 + }, + { + "epoch": 2.1954571174521056, + "grad_norm": 0.8482748866081238, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 13580 + }, + { + "epoch": 2.197073801632851, + "grad_norm": 0.8584157228469849, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 13590 + }, + { + "epoch": 2.1986904858135965, + "grad_norm": 0.7513734698295593, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 13600 + }, + { + "epoch": 2.2003071699943417, + "grad_norm": 0.7864262461662292, + "learning_rate": 0.0002, + "loss": 0.6632, + "step": 13610 + }, + { + "epoch": 2.201923854175087, + "grad_norm": 0.8493645191192627, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 13620 + }, + { + "epoch": 2.203540538355832, + "grad_norm": 0.6902140974998474, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 13630 + }, + { + "epoch": 2.2051572225365774, + "grad_norm": 0.8711254596710205, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 13640 + }, + { + "epoch": 2.2067739067173227, + "grad_norm": 0.7832191586494446, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 13650 + }, + { + "epoch": 2.208390590898068, + "grad_norm": 0.5668176412582397, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 13660 + }, + { + "epoch": 2.2100072750788136, + "grad_norm": 0.8648375272750854, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 13670 + }, + { + "epoch": 2.211623959259559, + "grad_norm": 0.7643089890480042, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 13680 + }, + { + "epoch": 2.213240643440304, + "grad_norm": 0.6293777823448181, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 13690 + }, + { + "epoch": 2.2148573276210493, + "grad_norm": 0.6459372639656067, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 13700 + }, + { + "epoch": 2.2164740118017945, + "grad_norm": 0.7060744166374207, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 13710 + }, + { + "epoch": 2.2180906959825397, + "grad_norm": 0.674109160900116, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 13720 + }, + { + "epoch": 2.219707380163285, + "grad_norm": 0.830392062664032, + "learning_rate": 0.0002, + "loss": 0.6529, + "step": 13730 + }, + { + "epoch": 2.2213240643440306, + "grad_norm": 0.6474477052688599, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 13740 + }, + { + "epoch": 2.222940748524776, + "grad_norm": 0.7037909626960754, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 13750 + }, + { + "epoch": 2.224557432705521, + "grad_norm": 0.6554131507873535, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 13760 + }, + { + "epoch": 2.2261741168862663, + "grad_norm": 0.7822230458259583, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 13770 + }, + { + "epoch": 2.2277908010670116, + "grad_norm": 0.9082167744636536, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 13780 + }, + { + "epoch": 2.229407485247757, + "grad_norm": 0.7918276190757751, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 13790 + }, + { + "epoch": 2.231024169428502, + "grad_norm": 0.7354569435119629, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 13800 + }, + { + "epoch": 2.2326408536092472, + "grad_norm": 0.8265249133110046, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 13810 + }, + { + "epoch": 2.234257537789993, + "grad_norm": 0.6653847098350525, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 13820 + }, + { + "epoch": 2.235874221970738, + "grad_norm": 0.7157923579216003, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 13830 + }, + { + "epoch": 2.2374909061514834, + "grad_norm": 0.7110323309898376, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 13840 + }, + { + "epoch": 2.2391075903322286, + "grad_norm": 0.7155357599258423, + "learning_rate": 0.0002, + "loss": 0.6913, + "step": 13850 + }, + { + "epoch": 2.240724274512974, + "grad_norm": 1.0177817344665527, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 13860 + }, + { + "epoch": 2.242340958693719, + "grad_norm": 0.7601948380470276, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 13870 + }, + { + "epoch": 2.2439576428744643, + "grad_norm": 0.7628820538520813, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 13880 + }, + { + "epoch": 2.24557432705521, + "grad_norm": 0.7089297771453857, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 13890 + }, + { + "epoch": 2.247191011235955, + "grad_norm": 0.695178210735321, + "learning_rate": 0.0002, + "loss": 0.7236, + "step": 13900 + }, + { + "epoch": 2.2488076954167004, + "grad_norm": 0.7631948590278625, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 13910 + }, + { + "epoch": 2.2504243795974457, + "grad_norm": 0.8203101754188538, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 13920 + }, + { + "epoch": 2.252041063778191, + "grad_norm": 0.8099079728126526, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 13930 + }, + { + "epoch": 2.253657747958936, + "grad_norm": 0.6498546004295349, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 13940 + }, + { + "epoch": 2.2552744321396814, + "grad_norm": 0.7797415256500244, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 13950 + }, + { + "epoch": 2.2568911163204266, + "grad_norm": 0.8254124522209167, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 13960 + }, + { + "epoch": 2.2585078005011723, + "grad_norm": 0.6327953338623047, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 13970 + }, + { + "epoch": 2.2601244846819175, + "grad_norm": 0.734194278717041, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 13980 + }, + { + "epoch": 2.2617411688626627, + "grad_norm": 0.9014202952384949, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 13990 + }, + { + "epoch": 2.263357853043408, + "grad_norm": 0.7643631100654602, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 14000 + }, + { + "epoch": 2.264974537224153, + "grad_norm": 0.8882834911346436, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 14010 + }, + { + "epoch": 2.2665912214048984, + "grad_norm": 0.7975873351097107, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 14020 + }, + { + "epoch": 2.2682079055856437, + "grad_norm": 0.7765783071517944, + "learning_rate": 0.0002, + "loss": 0.7193, + "step": 14030 + }, + { + "epoch": 2.2698245897663893, + "grad_norm": 0.8846288323402405, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 14040 + }, + { + "epoch": 2.2714412739471346, + "grad_norm": 0.9006744027137756, + "learning_rate": 0.0002, + "loss": 0.6494, + "step": 14050 + }, + { + "epoch": 2.27305795812788, + "grad_norm": 0.7420173287391663, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 14060 + }, + { + "epoch": 2.274674642308625, + "grad_norm": 0.7956424951553345, + "learning_rate": 0.0002, + "loss": 0.7068, + "step": 14070 + }, + { + "epoch": 2.2762913264893703, + "grad_norm": 0.7783209085464478, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 14080 + }, + { + "epoch": 2.2779080106701155, + "grad_norm": 0.7597188949584961, + "learning_rate": 0.0002, + "loss": 0.7202, + "step": 14090 + }, + { + "epoch": 2.2795246948508607, + "grad_norm": 0.6718921661376953, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 14100 + }, + { + "epoch": 2.281141379031606, + "grad_norm": 0.7528082132339478, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 14110 + }, + { + "epoch": 2.2827580632123516, + "grad_norm": 0.8379864692687988, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 14120 + }, + { + "epoch": 2.284374747393097, + "grad_norm": 0.748613715171814, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 14130 + }, + { + "epoch": 2.285991431573842, + "grad_norm": 0.7435423135757446, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 14140 + }, + { + "epoch": 2.2876081157545873, + "grad_norm": 0.7580803632736206, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 14150 + }, + { + "epoch": 2.2892247999353326, + "grad_norm": 0.6278321146965027, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 14160 + }, + { + "epoch": 2.290841484116078, + "grad_norm": 0.7663896083831787, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 14170 + }, + { + "epoch": 2.292458168296823, + "grad_norm": 0.9716812372207642, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 14180 + }, + { + "epoch": 2.2940748524775687, + "grad_norm": 0.8993458151817322, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 14190 + }, + { + "epoch": 2.295691536658314, + "grad_norm": 0.6156117916107178, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 14200 + }, + { + "epoch": 2.297308220839059, + "grad_norm": 0.8911278247833252, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 14210 + }, + { + "epoch": 2.2989249050198044, + "grad_norm": 0.6422147154808044, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 14220 + }, + { + "epoch": 2.3005415892005496, + "grad_norm": 0.6866879463195801, + "learning_rate": 0.0002, + "loss": 0.6795, + "step": 14230 + }, + { + "epoch": 2.302158273381295, + "grad_norm": 0.9297130107879639, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 14240 + }, + { + "epoch": 2.30377495756204, + "grad_norm": 0.7501356601715088, + "learning_rate": 0.0002, + "loss": 0.6823, + "step": 14250 + }, + { + "epoch": 2.3053916417427853, + "grad_norm": 0.8363515138626099, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 14260 + }, + { + "epoch": 2.307008325923531, + "grad_norm": 0.9083868265151978, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 14270 + }, + { + "epoch": 2.3086250101042762, + "grad_norm": 0.7791516780853271, + "learning_rate": 0.0002, + "loss": 0.6862, + "step": 14280 + }, + { + "epoch": 2.3102416942850215, + "grad_norm": 0.8766953349113464, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 14290 + }, + { + "epoch": 2.3118583784657667, + "grad_norm": 0.7916635274887085, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 14300 + }, + { + "epoch": 2.313475062646512, + "grad_norm": 0.627525269985199, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 14310 + }, + { + "epoch": 2.315091746827257, + "grad_norm": 0.8856783509254456, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 14320 + }, + { + "epoch": 2.316708431008003, + "grad_norm": 0.6758689284324646, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 14330 + }, + { + "epoch": 2.318325115188748, + "grad_norm": 0.6428321003913879, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 14340 + }, + { + "epoch": 2.3199417993694933, + "grad_norm": 0.9032121300697327, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 14350 + }, + { + "epoch": 2.3215584835502385, + "grad_norm": 0.8035986423492432, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 14360 + }, + { + "epoch": 2.3231751677309838, + "grad_norm": 0.7974579334259033, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 14370 + }, + { + "epoch": 2.324791851911729, + "grad_norm": 0.8356034755706787, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 14380 + }, + { + "epoch": 2.326408536092474, + "grad_norm": 0.998760998249054, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 14390 + }, + { + "epoch": 2.3280252202732195, + "grad_norm": 0.6518142223358154, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 14400 + }, + { + "epoch": 2.3296419044539647, + "grad_norm": 0.7443506717681885, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 14410 + }, + { + "epoch": 2.3312585886347104, + "grad_norm": 0.8436172604560852, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 14420 + }, + { + "epoch": 2.3328752728154556, + "grad_norm": 0.7411080598831177, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 14430 + }, + { + "epoch": 2.334491956996201, + "grad_norm": 0.8839048743247986, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 14440 + }, + { + "epoch": 2.336108641176946, + "grad_norm": 0.8360885977745056, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 14450 + }, + { + "epoch": 2.3377253253576913, + "grad_norm": 0.7608986496925354, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 14460 + }, + { + "epoch": 2.3393420095384365, + "grad_norm": 0.8179867267608643, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 14470 + }, + { + "epoch": 2.340958693719182, + "grad_norm": 0.5989999771118164, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14480 + }, + { + "epoch": 2.3425753778999274, + "grad_norm": 0.9450054168701172, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 14490 + }, + { + "epoch": 2.3441920620806727, + "grad_norm": 0.7885149717330933, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 14500 + }, + { + "epoch": 2.345808746261418, + "grad_norm": 0.8152616620063782, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 14510 + }, + { + "epoch": 2.347425430442163, + "grad_norm": 0.7193838953971863, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 14520 + }, + { + "epoch": 2.3490421146229084, + "grad_norm": 0.6701092720031738, + "learning_rate": 0.0002, + "loss": 0.6594, + "step": 14530 + }, + { + "epoch": 2.3506587988036536, + "grad_norm": 0.7529364228248596, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 14540 + }, + { + "epoch": 2.352275482984399, + "grad_norm": 0.6599733829498291, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 14550 + }, + { + "epoch": 2.353892167165144, + "grad_norm": 0.9502474069595337, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 14560 + }, + { + "epoch": 2.3555088513458897, + "grad_norm": 0.7619650959968567, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 14570 + }, + { + "epoch": 2.357125535526635, + "grad_norm": 0.9854652285575867, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 14580 + }, + { + "epoch": 2.35874221970738, + "grad_norm": 0.727439284324646, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 14590 + }, + { + "epoch": 2.3603589038881254, + "grad_norm": 0.6994746327400208, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 14600 + }, + { + "epoch": 2.3619755880688706, + "grad_norm": 0.7117531299591064, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 14610 + }, + { + "epoch": 2.363592272249616, + "grad_norm": 0.6403067708015442, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 14620 + }, + { + "epoch": 2.3652089564303616, + "grad_norm": 0.8377841711044312, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14630 + }, + { + "epoch": 2.366825640611107, + "grad_norm": 0.749171257019043, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 14640 + }, + { + "epoch": 2.368442324791852, + "grad_norm": 0.8418586254119873, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 14650 + }, + { + "epoch": 2.3700590089725972, + "grad_norm": 0.6178573369979858, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 14660 + }, + { + "epoch": 2.3716756931533425, + "grad_norm": 0.6368302702903748, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 14670 + }, + { + "epoch": 2.3732923773340877, + "grad_norm": 0.9122977256774902, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 14680 + }, + { + "epoch": 2.374909061514833, + "grad_norm": 0.7086195349693298, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 14690 + }, + { + "epoch": 2.376525745695578, + "grad_norm": 0.7500800490379333, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 14700 + }, + { + "epoch": 2.378142429876324, + "grad_norm": 0.6634900569915771, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 14710 + }, + { + "epoch": 2.379759114057069, + "grad_norm": 0.839898407459259, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 14720 + }, + { + "epoch": 2.3813757982378143, + "grad_norm": 0.7578426003456116, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 14730 + }, + { + "epoch": 2.3829924824185595, + "grad_norm": 1.0213173627853394, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 14740 + }, + { + "epoch": 2.3846091665993048, + "grad_norm": 0.7855949401855469, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 14750 + }, + { + "epoch": 2.38622585078005, + "grad_norm": 0.7224128842353821, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 14760 + }, + { + "epoch": 2.3878425349607952, + "grad_norm": 0.8040381669998169, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 14770 + }, + { + "epoch": 2.389459219141541, + "grad_norm": 0.7705281376838684, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 14780 + }, + { + "epoch": 2.391075903322286, + "grad_norm": 0.667966902256012, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 14790 + }, + { + "epoch": 2.3926925875030314, + "grad_norm": 0.6611011028289795, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 14800 + }, + { + "epoch": 2.3943092716837766, + "grad_norm": 0.6862651705741882, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 14810 + }, + { + "epoch": 2.395925955864522, + "grad_norm": 0.8086010217666626, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 14820 + }, + { + "epoch": 2.397542640045267, + "grad_norm": 0.7189689874649048, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 14830 + }, + { + "epoch": 2.3991593242260123, + "grad_norm": 0.6280009150505066, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 14840 + }, + { + "epoch": 2.4007760084067575, + "grad_norm": 0.7826612591743469, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 14850 + }, + { + "epoch": 2.402392692587503, + "grad_norm": 0.7681610584259033, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 14860 + }, + { + "epoch": 2.4040093767682484, + "grad_norm": 0.720966100692749, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 14870 + }, + { + "epoch": 2.4056260609489937, + "grad_norm": 0.8202250599861145, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 14880 + }, + { + "epoch": 2.407242745129739, + "grad_norm": 0.786212682723999, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 14890 + }, + { + "epoch": 2.408859429310484, + "grad_norm": 0.6647164821624756, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 14900 + }, + { + "epoch": 2.4104761134912294, + "grad_norm": 0.7566399574279785, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 14910 + }, + { + "epoch": 2.4120927976719746, + "grad_norm": 0.748814582824707, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 14920 + }, + { + "epoch": 2.4137094818527203, + "grad_norm": 0.7624038457870483, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 14930 + }, + { + "epoch": 2.4153261660334655, + "grad_norm": 0.8267335295677185, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 14940 + }, + { + "epoch": 2.4169428502142107, + "grad_norm": 0.8785360455513, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 14950 + }, + { + "epoch": 2.418559534394956, + "grad_norm": 0.679887592792511, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 14960 + }, + { + "epoch": 2.420176218575701, + "grad_norm": 0.7218474745750427, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 14970 + }, + { + "epoch": 2.4217929027564464, + "grad_norm": 0.6342799663543701, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 14980 + }, + { + "epoch": 2.4234095869371917, + "grad_norm": 0.7098712921142578, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 14990 + }, + { + "epoch": 2.425026271117937, + "grad_norm": 0.7497431635856628, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 15000 + }, + { + "epoch": 2.4266429552986826, + "grad_norm": 0.934836208820343, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 15010 + }, + { + "epoch": 2.428259639479428, + "grad_norm": 0.8430966734886169, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 15020 + }, + { + "epoch": 2.429876323660173, + "grad_norm": 0.7032104730606079, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 15030 + }, + { + "epoch": 2.4314930078409183, + "grad_norm": 0.7746111750602722, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 15040 + }, + { + "epoch": 2.4331096920216635, + "grad_norm": 0.7661406397819519, + "learning_rate": 0.0002, + "loss": 0.7177, + "step": 15050 + }, + { + "epoch": 2.4347263762024087, + "grad_norm": 0.6941645741462708, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 15060 + }, + { + "epoch": 2.436343060383154, + "grad_norm": 0.7487249374389648, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 15070 + }, + { + "epoch": 2.4379597445638996, + "grad_norm": 0.7639912962913513, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 15080 + }, + { + "epoch": 2.439576428744645, + "grad_norm": 0.7708953619003296, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 15090 + }, + { + "epoch": 2.44119311292539, + "grad_norm": 0.9135832190513611, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 15100 + }, + { + "epoch": 2.4428097971061353, + "grad_norm": 0.8283005356788635, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 15110 + }, + { + "epoch": 2.4444264812868806, + "grad_norm": 0.925299346446991, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 15120 + }, + { + "epoch": 2.446043165467626, + "grad_norm": 0.7013528943061829, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 15130 + }, + { + "epoch": 2.447659849648371, + "grad_norm": 0.622303307056427, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 15140 + }, + { + "epoch": 2.4492765338291163, + "grad_norm": 0.876569390296936, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 15150 + }, + { + "epoch": 2.450893218009862, + "grad_norm": 0.6836351752281189, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 15160 + }, + { + "epoch": 2.452509902190607, + "grad_norm": 0.7886684536933899, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 15170 + }, + { + "epoch": 2.4541265863713524, + "grad_norm": 0.6647440791130066, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 15180 + }, + { + "epoch": 2.4557432705520976, + "grad_norm": 0.7477722764015198, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 15190 + }, + { + "epoch": 2.457359954732843, + "grad_norm": 0.8192033767700195, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 15200 + }, + { + "epoch": 2.458976638913588, + "grad_norm": 0.847537100315094, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 15210 + }, + { + "epoch": 2.4605933230943338, + "grad_norm": 0.9027776122093201, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 15220 + }, + { + "epoch": 2.462210007275079, + "grad_norm": 0.7217772006988525, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 15230 + }, + { + "epoch": 2.4638266914558242, + "grad_norm": 0.7994546294212341, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 15240 + }, + { + "epoch": 2.4654433756365695, + "grad_norm": 0.939916729927063, + "learning_rate": 0.0002, + "loss": 0.6828, + "step": 15250 + }, + { + "epoch": 2.4670600598173147, + "grad_norm": 1.0009053945541382, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 15260 + }, + { + "epoch": 2.46867674399806, + "grad_norm": 0.625555694103241, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 15270 + }, + { + "epoch": 2.470293428178805, + "grad_norm": 0.7924878597259521, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 15280 + }, + { + "epoch": 2.4719101123595504, + "grad_norm": 0.8536689877510071, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 15290 + }, + { + "epoch": 2.4735267965402956, + "grad_norm": 0.8572589755058289, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 15300 + }, + { + "epoch": 2.4751434807210413, + "grad_norm": 0.773279070854187, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 15310 + }, + { + "epoch": 2.4767601649017865, + "grad_norm": 0.7708749771118164, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 15320 + }, + { + "epoch": 2.4783768490825318, + "grad_norm": 0.770905077457428, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 15330 + }, + { + "epoch": 2.479993533263277, + "grad_norm": 0.8238571882247925, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 15340 + }, + { + "epoch": 2.481610217444022, + "grad_norm": 0.7670477032661438, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 15350 + }, + { + "epoch": 2.4832269016247674, + "grad_norm": 0.905036985874176, + "learning_rate": 0.0002, + "loss": 0.7759, + "step": 15360 + }, + { + "epoch": 2.484843585805513, + "grad_norm": 0.6672089695930481, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 15370 + }, + { + "epoch": 2.4864602699862584, + "grad_norm": 0.625095784664154, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 15380 + }, + { + "epoch": 2.4880769541670036, + "grad_norm": 0.679772675037384, + "learning_rate": 0.0002, + "loss": 0.6396, + "step": 15390 + }, + { + "epoch": 2.489693638347749, + "grad_norm": 0.711492121219635, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 15400 + }, + { + "epoch": 2.491310322528494, + "grad_norm": 0.876189112663269, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 15410 + }, + { + "epoch": 2.4929270067092393, + "grad_norm": 0.7236915230751038, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 15420 + }, + { + "epoch": 2.4945436908899845, + "grad_norm": 0.6629832983016968, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 15430 + }, + { + "epoch": 2.4961603750707297, + "grad_norm": 0.9756859540939331, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 15440 + }, + { + "epoch": 2.4977770592514754, + "grad_norm": 0.6896940469741821, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 15450 + }, + { + "epoch": 2.4993937434322206, + "grad_norm": 0.7105149626731873, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 15460 + }, + { + "epoch": 2.501010427612966, + "grad_norm": 0.8374546766281128, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 15470 + }, + { + "epoch": 2.502627111793711, + "grad_norm": 0.7320070266723633, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 15480 + }, + { + "epoch": 2.5042437959744563, + "grad_norm": 0.8306367993354797, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 15490 + }, + { + "epoch": 2.5058604801552016, + "grad_norm": 0.7472721338272095, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 15500 + }, + { + "epoch": 2.507477164335947, + "grad_norm": 0.6147692203521729, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 15510 + }, + { + "epoch": 2.5090938485166925, + "grad_norm": 0.7788505554199219, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 15520 + }, + { + "epoch": 2.5107105326974377, + "grad_norm": 0.8807527422904968, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 15530 + }, + { + "epoch": 2.512327216878183, + "grad_norm": 0.7521643042564392, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 15540 + }, + { + "epoch": 2.513943901058928, + "grad_norm": 0.6900225281715393, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 15550 + }, + { + "epoch": 2.5155605852396734, + "grad_norm": 0.6601938605308533, + "learning_rate": 0.0002, + "loss": 0.6769, + "step": 15560 + }, + { + "epoch": 2.5171772694204186, + "grad_norm": 0.8179984092712402, + "learning_rate": 0.0002, + "loss": 0.6648, + "step": 15570 + }, + { + "epoch": 2.518793953601164, + "grad_norm": 0.792556881904602, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 15580 + }, + { + "epoch": 2.520410637781909, + "grad_norm": 0.7081938982009888, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 15590 + }, + { + "epoch": 2.5220273219626543, + "grad_norm": 0.8733121156692505, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 15600 + }, + { + "epoch": 2.5236440061434, + "grad_norm": 0.7980992794036865, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 15610 + }, + { + "epoch": 2.5252606903241452, + "grad_norm": 0.883664071559906, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 15620 + }, + { + "epoch": 2.5268773745048905, + "grad_norm": 0.6963341236114502, + "learning_rate": 0.0002, + "loss": 0.7022, + "step": 15630 + }, + { + "epoch": 2.5284940586856357, + "grad_norm": 0.6433573365211487, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 15640 + }, + { + "epoch": 2.530110742866381, + "grad_norm": 0.8538183569908142, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 15650 + }, + { + "epoch": 2.5317274270471266, + "grad_norm": 0.9748201370239258, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 15660 + }, + { + "epoch": 2.533344111227872, + "grad_norm": 0.7670575380325317, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 15670 + }, + { + "epoch": 2.534960795408617, + "grad_norm": 0.8738890290260315, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 15680 + }, + { + "epoch": 2.5365774795893623, + "grad_norm": 0.8391636610031128, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 15690 + }, + { + "epoch": 2.5381941637701075, + "grad_norm": 0.7239366769790649, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 15700 + }, + { + "epoch": 2.5398108479508528, + "grad_norm": 0.8498379588127136, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 15710 + }, + { + "epoch": 2.541427532131598, + "grad_norm": 0.8029484152793884, + "learning_rate": 0.0002, + "loss": 0.6998, + "step": 15720 + }, + { + "epoch": 2.5430442163123432, + "grad_norm": 1.0639333724975586, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 15730 + }, + { + "epoch": 2.5446609004930885, + "grad_norm": 0.6401297450065613, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 15740 + }, + { + "epoch": 2.5462775846738337, + "grad_norm": 0.7123814821243286, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 15750 + }, + { + "epoch": 2.5478942688545794, + "grad_norm": 0.7874974608421326, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 15760 + }, + { + "epoch": 2.5495109530353246, + "grad_norm": 0.8046808838844299, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 15770 + }, + { + "epoch": 2.55112763721607, + "grad_norm": 0.7888661623001099, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 15780 + }, + { + "epoch": 2.552744321396815, + "grad_norm": 0.8445866107940674, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 15790 + }, + { + "epoch": 2.5543610055775603, + "grad_norm": 0.7475846409797668, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 15800 + }, + { + "epoch": 2.555977689758306, + "grad_norm": 0.7455102801322937, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 15810 + }, + { + "epoch": 2.557594373939051, + "grad_norm": 0.8226983547210693, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 15820 + }, + { + "epoch": 2.5592110581197964, + "grad_norm": 0.8920368552207947, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 15830 + }, + { + "epoch": 2.5608277423005417, + "grad_norm": 0.8413904905319214, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 15840 + }, + { + "epoch": 2.562444426481287, + "grad_norm": 0.8483649492263794, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 15850 + }, + { + "epoch": 2.564061110662032, + "grad_norm": 0.5923284292221069, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 15860 + }, + { + "epoch": 2.5656777948427774, + "grad_norm": 0.8518726229667664, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 15870 + }, + { + "epoch": 2.5672944790235226, + "grad_norm": 0.731235146522522, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 15880 + }, + { + "epoch": 2.568911163204268, + "grad_norm": 0.7517194151878357, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 15890 + }, + { + "epoch": 2.5705278473850135, + "grad_norm": 0.8378692269325256, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 15900 + }, + { + "epoch": 2.5721445315657587, + "grad_norm": 0.843701958656311, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 15910 + }, + { + "epoch": 2.573761215746504, + "grad_norm": 0.7254629731178284, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 15920 + }, + { + "epoch": 2.575377899927249, + "grad_norm": 0.8863335847854614, + "learning_rate": 0.0002, + "loss": 0.6872, + "step": 15930 + }, + { + "epoch": 2.5769945841079944, + "grad_norm": 0.7675097584724426, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 15940 + }, + { + "epoch": 2.5786112682887397, + "grad_norm": 0.82063889503479, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 15950 + }, + { + "epoch": 2.5802279524694853, + "grad_norm": 0.7729717493057251, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 15960 + }, + { + "epoch": 2.5818446366502306, + "grad_norm": 0.8301846981048584, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 15970 + }, + { + "epoch": 2.583461320830976, + "grad_norm": 0.7906861305236816, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 15980 + }, + { + "epoch": 2.585078005011721, + "grad_norm": 0.6749057173728943, + "learning_rate": 0.0002, + "loss": 0.7175, + "step": 15990 + }, + { + "epoch": 2.5866946891924663, + "grad_norm": 0.9386842846870422, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 16000 + }, + { + "epoch": 2.5883113733732115, + "grad_norm": 0.7868891358375549, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 16010 + }, + { + "epoch": 2.5899280575539567, + "grad_norm": 0.8674671053886414, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 16020 + }, + { + "epoch": 2.591544741734702, + "grad_norm": 0.7043559551239014, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 16030 + }, + { + "epoch": 2.593161425915447, + "grad_norm": 0.5846083760261536, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 16040 + }, + { + "epoch": 2.594778110096193, + "grad_norm": 0.7323982119560242, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 16050 + }, + { + "epoch": 2.596394794276938, + "grad_norm": 0.9069556593894958, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 16060 + }, + { + "epoch": 2.5980114784576833, + "grad_norm": 0.7522736191749573, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 16070 + }, + { + "epoch": 2.5996281626384286, + "grad_norm": 0.8149648308753967, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 16080 + }, + { + "epoch": 2.601244846819174, + "grad_norm": 0.6214233040809631, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 16090 + }, + { + "epoch": 2.602861530999919, + "grad_norm": 0.6803743839263916, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 16100 + }, + { + "epoch": 2.6044782151806647, + "grad_norm": 0.7223997116088867, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 16110 + }, + { + "epoch": 2.60609489936141, + "grad_norm": 0.7324174642562866, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 16120 + }, + { + "epoch": 2.607711583542155, + "grad_norm": 0.9594739675521851, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 16130 + }, + { + "epoch": 2.6093282677229004, + "grad_norm": 0.9485327005386353, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 16140 + }, + { + "epoch": 2.6109449519036456, + "grad_norm": 0.8449000120162964, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 16150 + }, + { + "epoch": 2.612561636084391, + "grad_norm": 0.8520140051841736, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 16160 + }, + { + "epoch": 2.614178320265136, + "grad_norm": 0.7456524968147278, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 16170 + }, + { + "epoch": 2.6157950044458813, + "grad_norm": 0.9912857413291931, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 16180 + }, + { + "epoch": 2.6174116886266265, + "grad_norm": 0.9001946449279785, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 16190 + }, + { + "epoch": 2.619028372807372, + "grad_norm": 0.6568667888641357, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 16200 + }, + { + "epoch": 2.6206450569881174, + "grad_norm": 1.0248128175735474, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 16210 + }, + { + "epoch": 2.6222617411688627, + "grad_norm": 0.6509039998054504, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 16220 + }, + { + "epoch": 2.623878425349608, + "grad_norm": 0.7626351118087769, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 16230 + }, + { + "epoch": 2.625495109530353, + "grad_norm": 0.6938552260398865, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 16240 + }, + { + "epoch": 2.6271117937110984, + "grad_norm": 0.6434680819511414, + "learning_rate": 0.0002, + "loss": 0.65, + "step": 16250 + }, + { + "epoch": 2.628728477891844, + "grad_norm": 0.7111515998840332, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 16260 + }, + { + "epoch": 2.6303451620725893, + "grad_norm": 0.7712395787239075, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 16270 + }, + { + "epoch": 2.6319618462533345, + "grad_norm": 0.792209267616272, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 16280 + }, + { + "epoch": 2.6335785304340797, + "grad_norm": 0.6801066398620605, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 16290 + }, + { + "epoch": 2.635195214614825, + "grad_norm": 0.7802573442459106, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 16300 + }, + { + "epoch": 2.63681189879557, + "grad_norm": 0.7742244601249695, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 16310 + }, + { + "epoch": 2.6384285829763154, + "grad_norm": 0.664184033870697, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 16320 + }, + { + "epoch": 2.6400452671570607, + "grad_norm": 0.9242228865623474, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 16330 + }, + { + "epoch": 2.641661951337806, + "grad_norm": 0.9661325216293335, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 16340 + }, + { + "epoch": 2.6432786355185516, + "grad_norm": 0.837526798248291, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 16350 + }, + { + "epoch": 2.644895319699297, + "grad_norm": 1.1834373474121094, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 16360 + }, + { + "epoch": 2.646512003880042, + "grad_norm": 0.7467831373214722, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 16370 + }, + { + "epoch": 2.6481286880607873, + "grad_norm": 0.8627146482467651, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 16380 + }, + { + "epoch": 2.6497453722415325, + "grad_norm": 0.790447473526001, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 16390 + }, + { + "epoch": 2.651362056422278, + "grad_norm": 0.8447365164756775, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 16400 + }, + { + "epoch": 2.6529787406030234, + "grad_norm": 0.7831417918205261, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 16410 + }, + { + "epoch": 2.6545954247837686, + "grad_norm": 0.6837952136993408, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 16420 + }, + { + "epoch": 2.656212108964514, + "grad_norm": 0.7031801342964172, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 16430 + }, + { + "epoch": 2.657828793145259, + "grad_norm": 0.8963770866394043, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 16440 + }, + { + "epoch": 2.6594454773260043, + "grad_norm": 0.6852328181266785, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 16450 + }, + { + "epoch": 2.6610621615067496, + "grad_norm": 0.8069294095039368, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 16460 + }, + { + "epoch": 2.662678845687495, + "grad_norm": 0.7503686547279358, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 16470 + }, + { + "epoch": 2.66429552986824, + "grad_norm": 0.6430956125259399, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 16480 + }, + { + "epoch": 2.6659122140489853, + "grad_norm": 0.7894312739372253, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 16490 + }, + { + "epoch": 2.667528898229731, + "grad_norm": 0.7277431488037109, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 16500 + }, + { + "epoch": 2.669145582410476, + "grad_norm": 0.6816153526306152, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 16510 + }, + { + "epoch": 2.6707622665912214, + "grad_norm": 0.8145235776901245, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 16520 + }, + { + "epoch": 2.6723789507719666, + "grad_norm": 0.8645890355110168, + "learning_rate": 0.0002, + "loss": 0.709, + "step": 16530 + }, + { + "epoch": 2.673995634952712, + "grad_norm": 0.704393208026886, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 16540 + }, + { + "epoch": 2.6756123191334575, + "grad_norm": 1.0120846033096313, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 16550 + }, + { + "epoch": 2.6772290033142028, + "grad_norm": 0.6919328570365906, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 16560 + }, + { + "epoch": 2.678845687494948, + "grad_norm": 0.6924574971199036, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 16570 + }, + { + "epoch": 2.6804623716756932, + "grad_norm": 0.9679301381111145, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 16580 + }, + { + "epoch": 2.6820790558564385, + "grad_norm": 0.6810211539268494, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 16590 + }, + { + "epoch": 2.6836957400371837, + "grad_norm": 0.9730555415153503, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 16600 + }, + { + "epoch": 2.685312424217929, + "grad_norm": 0.7852821350097656, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 16610 + }, + { + "epoch": 2.686929108398674, + "grad_norm": 0.6059057116508484, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 16620 + }, + { + "epoch": 2.6885457925794194, + "grad_norm": 0.9395958781242371, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 16630 + }, + { + "epoch": 2.690162476760165, + "grad_norm": 0.7473729848861694, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 16640 + }, + { + "epoch": 2.6917791609409103, + "grad_norm": 0.765934407711029, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 16650 + }, + { + "epoch": 2.6933958451216555, + "grad_norm": 0.8496677279472351, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 16660 + }, + { + "epoch": 2.6950125293024008, + "grad_norm": 0.7641879916191101, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 16670 + }, + { + "epoch": 2.696629213483146, + "grad_norm": 0.8471952676773071, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 16680 + }, + { + "epoch": 2.6982458976638912, + "grad_norm": 0.6946060657501221, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 16690 + }, + { + "epoch": 2.699862581844637, + "grad_norm": 0.7361312508583069, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 16700 + }, + { + "epoch": 2.701479266025382, + "grad_norm": 0.6605038046836853, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 16710 + }, + { + "epoch": 2.7030959502061274, + "grad_norm": 0.7164411544799805, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 16720 + }, + { + "epoch": 2.7047126343868726, + "grad_norm": 0.6496201157569885, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 16730 + }, + { + "epoch": 2.706329318567618, + "grad_norm": 0.7826663851737976, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 16740 + }, + { + "epoch": 2.707946002748363, + "grad_norm": 0.7639131546020508, + "learning_rate": 0.0002, + "loss": 0.6773, + "step": 16750 + }, + { + "epoch": 2.7095626869291083, + "grad_norm": 0.7976210713386536, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 16760 + }, + { + "epoch": 2.7111793711098535, + "grad_norm": 0.6836577653884888, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 16770 + }, + { + "epoch": 2.7127960552905988, + "grad_norm": 0.8025202751159668, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 16780 + }, + { + "epoch": 2.7144127394713444, + "grad_norm": 0.7636463642120361, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 16790 + }, + { + "epoch": 2.7160294236520897, + "grad_norm": 0.7481677532196045, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 16800 + }, + { + "epoch": 2.717646107832835, + "grad_norm": 0.7566834688186646, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 16810 + }, + { + "epoch": 2.71926279201358, + "grad_norm": 0.7931267619132996, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 16820 + }, + { + "epoch": 2.7208794761943254, + "grad_norm": 0.8811662197113037, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 16830 + }, + { + "epoch": 2.7224961603750706, + "grad_norm": 0.8561240434646606, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 16840 + }, + { + "epoch": 2.7241128445558163, + "grad_norm": 0.7121599316596985, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 16850 + }, + { + "epoch": 2.7257295287365615, + "grad_norm": 0.8066257238388062, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 16860 + }, + { + "epoch": 2.7273462129173067, + "grad_norm": 0.7699271440505981, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 16870 + }, + { + "epoch": 2.728962897098052, + "grad_norm": 1.1828432083129883, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 16880 + }, + { + "epoch": 2.730579581278797, + "grad_norm": 0.9989302754402161, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 16890 + }, + { + "epoch": 2.7321962654595424, + "grad_norm": 0.8100560307502747, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 16900 + }, + { + "epoch": 2.7338129496402876, + "grad_norm": 0.8615233898162842, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 16910 + }, + { + "epoch": 2.735429633821033, + "grad_norm": 0.8633756041526794, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 16920 + }, + { + "epoch": 2.737046318001778, + "grad_norm": 0.7769348621368408, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 16930 + }, + { + "epoch": 2.738663002182524, + "grad_norm": 0.6943058371543884, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 16940 + }, + { + "epoch": 2.740279686363269, + "grad_norm": 0.8510736227035522, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 16950 + }, + { + "epoch": 2.7418963705440142, + "grad_norm": 0.7732602953910828, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 16960 + }, + { + "epoch": 2.7435130547247595, + "grad_norm": 0.5981788635253906, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 16970 + }, + { + "epoch": 2.7451297389055047, + "grad_norm": 0.7604416012763977, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 16980 + }, + { + "epoch": 2.74674642308625, + "grad_norm": 0.7377738356590271, + "learning_rate": 0.0002, + "loss": 0.6981, + "step": 16990 + }, + { + "epoch": 2.7483631072669956, + "grad_norm": 0.9400289058685303, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 17000 + }, + { + "epoch": 2.749979791447741, + "grad_norm": 0.6340599656105042, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 17010 + }, + { + "epoch": 2.751596475628486, + "grad_norm": 0.7297601103782654, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 17020 + }, + { + "epoch": 2.7532131598092313, + "grad_norm": 0.9479979872703552, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 17030 + }, + { + "epoch": 2.7548298439899765, + "grad_norm": 0.8461511135101318, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 17040 + }, + { + "epoch": 2.7564465281707218, + "grad_norm": 0.7477551698684692, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 17050 + }, + { + "epoch": 2.758063212351467, + "grad_norm": 1.019270420074463, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 17060 + }, + { + "epoch": 2.7596798965322122, + "grad_norm": 0.7730235457420349, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 17070 + }, + { + "epoch": 2.7612965807129575, + "grad_norm": 0.8216866254806519, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 17080 + }, + { + "epoch": 2.762913264893703, + "grad_norm": 0.7235931754112244, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 17090 + }, + { + "epoch": 2.7645299490744484, + "grad_norm": 0.7352296710014343, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 17100 + }, + { + "epoch": 2.7661466332551936, + "grad_norm": 0.8129373788833618, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 17110 + }, + { + "epoch": 2.767763317435939, + "grad_norm": 0.7387019991874695, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 17120 + }, + { + "epoch": 2.769380001616684, + "grad_norm": 0.9149190187454224, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 17130 + }, + { + "epoch": 2.7709966857974297, + "grad_norm": 0.7352971434593201, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 17140 + }, + { + "epoch": 2.772613369978175, + "grad_norm": 0.7903780341148376, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 17150 + }, + { + "epoch": 2.77423005415892, + "grad_norm": 0.8255927562713623, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 17160 + }, + { + "epoch": 2.7758467383396654, + "grad_norm": 0.7235927581787109, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 17170 + }, + { + "epoch": 2.7774634225204107, + "grad_norm": 0.8281434774398804, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 17180 + }, + { + "epoch": 2.779080106701156, + "grad_norm": 0.7586921453475952, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 17190 + }, + { + "epoch": 2.780696790881901, + "grad_norm": 0.7161715030670166, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 17200 + }, + { + "epoch": 2.7823134750626464, + "grad_norm": 0.762868344783783, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 17210 + }, + { + "epoch": 2.7839301592433916, + "grad_norm": 0.9285483360290527, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 17220 + }, + { + "epoch": 2.785546843424137, + "grad_norm": 0.6900462508201599, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 17230 + }, + { + "epoch": 2.7871635276048825, + "grad_norm": 0.780384361743927, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 17240 + }, + { + "epoch": 2.7887802117856277, + "grad_norm": 0.7580406665802002, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 17250 + }, + { + "epoch": 2.790396895966373, + "grad_norm": 0.8145199418067932, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 17260 + }, + { + "epoch": 2.792013580147118, + "grad_norm": 0.9159596562385559, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 17270 + }, + { + "epoch": 2.7936302643278634, + "grad_norm": 0.9590014219284058, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 17280 + }, + { + "epoch": 2.795246948508609, + "grad_norm": 0.7603529691696167, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 17290 + }, + { + "epoch": 2.7968636326893543, + "grad_norm": 0.8039976358413696, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 17300 + }, + { + "epoch": 2.7984803168700996, + "grad_norm": 0.8364847302436829, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 17310 + }, + { + "epoch": 2.800097001050845, + "grad_norm": 0.8763046860694885, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 17320 + }, + { + "epoch": 2.80171368523159, + "grad_norm": 0.8409647941589355, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 17330 + }, + { + "epoch": 2.8033303694123353, + "grad_norm": 0.7649006247520447, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 17340 + }, + { + "epoch": 2.8049470535930805, + "grad_norm": 0.7970262169837952, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 17350 + }, + { + "epoch": 2.8065637377738257, + "grad_norm": 0.9088607430458069, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 17360 + }, + { + "epoch": 2.808180421954571, + "grad_norm": 0.6454846858978271, + "learning_rate": 0.0002, + "loss": 0.675, + "step": 17370 + }, + { + "epoch": 2.809797106135316, + "grad_norm": 0.7744787931442261, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 17380 + }, + { + "epoch": 2.811413790316062, + "grad_norm": 0.6678640842437744, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 17390 + }, + { + "epoch": 2.813030474496807, + "grad_norm": 0.772676944732666, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 17400 + }, + { + "epoch": 2.8146471586775523, + "grad_norm": 0.7088175415992737, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 17410 + }, + { + "epoch": 2.8162638428582976, + "grad_norm": 0.8280573487281799, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 17420 + }, + { + "epoch": 2.817880527039043, + "grad_norm": 0.6665388345718384, + "learning_rate": 0.0002, + "loss": 0.6732, + "step": 17430 + }, + { + "epoch": 2.8194972112197885, + "grad_norm": 0.6427883505821228, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 17440 + }, + { + "epoch": 2.8211138954005337, + "grad_norm": 0.9697760343551636, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 17450 + }, + { + "epoch": 2.822730579581279, + "grad_norm": 0.7573966383934021, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 17460 + }, + { + "epoch": 2.824347263762024, + "grad_norm": 0.878688633441925, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 17470 + }, + { + "epoch": 2.8259639479427694, + "grad_norm": 0.7752242684364319, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 17480 + }, + { + "epoch": 2.8275806321235146, + "grad_norm": 0.6135398745536804, + "learning_rate": 0.0002, + "loss": 0.6638, + "step": 17490 + }, + { + "epoch": 2.82919731630426, + "grad_norm": 0.6924924850463867, + "learning_rate": 0.0002, + "loss": 0.6829, + "step": 17500 + }, + { + "epoch": 2.830814000485005, + "grad_norm": 0.7471627593040466, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 17510 + }, + { + "epoch": 2.8324306846657503, + "grad_norm": 0.7145499587059021, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 17520 + }, + { + "epoch": 2.834047368846496, + "grad_norm": 0.7415414452552795, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 17530 + }, + { + "epoch": 2.8356640530272412, + "grad_norm": 0.7328441739082336, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 17540 + }, + { + "epoch": 2.8372807372079865, + "grad_norm": 0.8267839550971985, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 17550 + }, + { + "epoch": 2.8388974213887317, + "grad_norm": 0.8877885341644287, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 17560 + }, + { + "epoch": 2.840514105569477, + "grad_norm": 0.857138454914093, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 17570 + }, + { + "epoch": 2.842130789750222, + "grad_norm": 0.8470779657363892, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 17580 + }, + { + "epoch": 2.843747473930968, + "grad_norm": 0.8553254008293152, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 17590 + }, + { + "epoch": 2.845364158111713, + "grad_norm": 0.8033196926116943, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 17600 + }, + { + "epoch": 2.8469808422924583, + "grad_norm": 0.7949087023735046, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 17610 + }, + { + "epoch": 2.8485975264732035, + "grad_norm": 0.9241406321525574, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 17620 + }, + { + "epoch": 2.8502142106539488, + "grad_norm": 0.7721285223960876, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 17630 + }, + { + "epoch": 2.851830894834694, + "grad_norm": 1.0246692895889282, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 17640 + }, + { + "epoch": 2.853447579015439, + "grad_norm": 0.9244589805603027, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 17650 + }, + { + "epoch": 2.8550642631961844, + "grad_norm": 0.7243508696556091, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 17660 + }, + { + "epoch": 2.8566809473769297, + "grad_norm": 0.8943371176719666, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 17670 + }, + { + "epoch": 2.8582976315576754, + "grad_norm": 0.6531758904457092, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 17680 + }, + { + "epoch": 2.8599143157384206, + "grad_norm": 0.8367000818252563, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 17690 + }, + { + "epoch": 2.861530999919166, + "grad_norm": 0.7868556380271912, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 17700 + }, + { + "epoch": 2.863147684099911, + "grad_norm": 0.7213859558105469, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 17710 + }, + { + "epoch": 2.8647643682806563, + "grad_norm": 0.7383931279182434, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 17720 + }, + { + "epoch": 2.8663810524614015, + "grad_norm": 0.7566812634468079, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 17730 + }, + { + "epoch": 2.867997736642147, + "grad_norm": 0.6930373311042786, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 17740 + }, + { + "epoch": 2.8696144208228924, + "grad_norm": 0.7911090850830078, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 17750 + }, + { + "epoch": 2.8712311050036377, + "grad_norm": 0.8484548926353455, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 17760 + }, + { + "epoch": 2.872847789184383, + "grad_norm": 0.7647597193717957, + "learning_rate": 0.0002, + "loss": 0.6931, + "step": 17770 + }, + { + "epoch": 2.874464473365128, + "grad_norm": 0.8791151642799377, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 17780 + }, + { + "epoch": 2.8760811575458733, + "grad_norm": 0.7253178358078003, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 17790 + }, + { + "epoch": 2.8776978417266186, + "grad_norm": 0.7956077456474304, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 17800 + }, + { + "epoch": 2.879314525907364, + "grad_norm": 0.8657688498497009, + "learning_rate": 0.0002, + "loss": 0.6687, + "step": 17810 + }, + { + "epoch": 2.880931210088109, + "grad_norm": 0.7059141993522644, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 17820 + }, + { + "epoch": 2.8825478942688547, + "grad_norm": 0.8886896967887878, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 17830 + }, + { + "epoch": 2.8841645784496, + "grad_norm": 0.821032702922821, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 17840 + }, + { + "epoch": 2.885781262630345, + "grad_norm": 0.7183963656425476, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 17850 + }, + { + "epoch": 2.8873979468110904, + "grad_norm": 0.6222899556159973, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 17860 + }, + { + "epoch": 2.8890146309918356, + "grad_norm": 0.8187434077262878, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 17870 + }, + { + "epoch": 2.890631315172581, + "grad_norm": 0.9838479161262512, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 17880 + }, + { + "epoch": 2.8922479993533265, + "grad_norm": 0.7567742466926575, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 17890 + }, + { + "epoch": 2.893864683534072, + "grad_norm": 0.6875903606414795, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 17900 + }, + { + "epoch": 2.895481367714817, + "grad_norm": 0.8043789267539978, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 17910 + }, + { + "epoch": 2.8970980518955622, + "grad_norm": 0.8062626719474792, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 17920 + }, + { + "epoch": 2.8987147360763075, + "grad_norm": 1.0251191854476929, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 17930 + }, + { + "epoch": 2.9003314202570527, + "grad_norm": 0.882253110408783, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 17940 + }, + { + "epoch": 2.901948104437798, + "grad_norm": 0.8683299422264099, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 17950 + }, + { + "epoch": 2.903564788618543, + "grad_norm": 0.7167282104492188, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17960 + }, + { + "epoch": 2.9051814727992884, + "grad_norm": 0.7093694806098938, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 17970 + }, + { + "epoch": 2.906798156980034, + "grad_norm": 0.8549879193305969, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 17980 + }, + { + "epoch": 2.9084148411607793, + "grad_norm": 0.6989606618881226, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 17990 + }, + { + "epoch": 2.9100315253415245, + "grad_norm": 0.9482976794242859, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 18000 + }, + { + "epoch": 2.9116482095222698, + "grad_norm": 0.7182440161705017, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 18010 + }, + { + "epoch": 2.913264893703015, + "grad_norm": 0.7732226252555847, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 18020 + }, + { + "epoch": 2.9148815778837607, + "grad_norm": 0.7936875224113464, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 18030 + }, + { + "epoch": 2.916498262064506, + "grad_norm": 0.8825615644454956, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 18040 + }, + { + "epoch": 2.918114946245251, + "grad_norm": 0.6778587102890015, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 18050 + }, + { + "epoch": 2.9197316304259964, + "grad_norm": 0.7529265880584717, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 18060 + }, + { + "epoch": 2.9213483146067416, + "grad_norm": 0.7111883163452148, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 18070 + }, + { + "epoch": 2.922964998787487, + "grad_norm": 0.7214767932891846, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 18080 + }, + { + "epoch": 2.924581682968232, + "grad_norm": 0.800417423248291, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 18090 + }, + { + "epoch": 2.9261983671489773, + "grad_norm": 1.248575210571289, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 18100 + }, + { + "epoch": 2.9278150513297225, + "grad_norm": 0.757788360118866, + "learning_rate": 0.0002, + "loss": 0.7004, + "step": 18110 + }, + { + "epoch": 2.9294317355104678, + "grad_norm": 1.0583995580673218, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 18120 + }, + { + "epoch": 2.9310484196912134, + "grad_norm": 0.8228777647018433, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 18130 + }, + { + "epoch": 2.9326651038719587, + "grad_norm": 0.8374035358428955, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 18140 + }, + { + "epoch": 2.934281788052704, + "grad_norm": 0.7976473569869995, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 18150 + }, + { + "epoch": 2.935898472233449, + "grad_norm": 0.8009907603263855, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 18160 + }, + { + "epoch": 2.9375151564141944, + "grad_norm": 0.835213303565979, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 18170 + }, + { + "epoch": 2.93913184059494, + "grad_norm": 0.7982219457626343, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 18180 + }, + { + "epoch": 2.9407485247756853, + "grad_norm": 0.7070978879928589, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 18190 + }, + { + "epoch": 2.9423652089564305, + "grad_norm": 0.8619440197944641, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 18200 + }, + { + "epoch": 2.9439818931371757, + "grad_norm": 0.6693987250328064, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 18210 + }, + { + "epoch": 2.945598577317921, + "grad_norm": 0.6747021079063416, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 18220 + }, + { + "epoch": 2.947215261498666, + "grad_norm": 0.860387921333313, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 18230 + }, + { + "epoch": 2.9488319456794114, + "grad_norm": 0.799976646900177, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 18240 + }, + { + "epoch": 2.9504486298601567, + "grad_norm": 0.7864769101142883, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 18250 + }, + { + "epoch": 2.952065314040902, + "grad_norm": 0.6713884472846985, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 18260 + }, + { + "epoch": 2.9536819982216476, + "grad_norm": 0.9031508564949036, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 18270 + }, + { + "epoch": 2.955298682402393, + "grad_norm": 0.7205073237419128, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 18280 + }, + { + "epoch": 2.956915366583138, + "grad_norm": 0.7746205925941467, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 18290 + }, + { + "epoch": 2.9585320507638833, + "grad_norm": 0.6533427834510803, + "learning_rate": 0.0002, + "loss": 0.6218, + "step": 18300 + }, + { + "epoch": 2.9601487349446285, + "grad_norm": 0.9083208441734314, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 18310 + }, + { + "epoch": 2.9617654191253737, + "grad_norm": 0.7446991801261902, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 18320 + }, + { + "epoch": 2.9633821033061194, + "grad_norm": 0.6514461636543274, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 18330 + }, + { + "epoch": 2.9649987874868646, + "grad_norm": 0.8580465912818909, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 18340 + }, + { + "epoch": 2.96661547166761, + "grad_norm": 0.7074266076087952, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 18350 + }, + { + "epoch": 2.968232155848355, + "grad_norm": 0.899892270565033, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 18360 + }, + { + "epoch": 2.9698488400291003, + "grad_norm": 0.8217641711235046, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 18370 + }, + { + "epoch": 2.9714655242098456, + "grad_norm": 0.8611799478530884, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 18380 + }, + { + "epoch": 2.973082208390591, + "grad_norm": 0.6909302473068237, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 18390 + }, + { + "epoch": 2.974698892571336, + "grad_norm": 0.6554358005523682, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 18400 + }, + { + "epoch": 2.9763155767520812, + "grad_norm": 0.7803071737289429, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 18410 + }, + { + "epoch": 2.977932260932827, + "grad_norm": 0.7838954925537109, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 18420 + }, + { + "epoch": 2.979548945113572, + "grad_norm": 0.7098495364189148, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 18430 + }, + { + "epoch": 2.9811656292943174, + "grad_norm": 0.8981785774230957, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 18440 + }, + { + "epoch": 2.9827823134750626, + "grad_norm": 0.7197171449661255, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 18450 + }, + { + "epoch": 2.984398997655808, + "grad_norm": 0.793185293674469, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 18460 + }, + { + "epoch": 2.986015681836553, + "grad_norm": 0.8531473875045776, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 18470 + }, + { + "epoch": 2.9876323660172988, + "grad_norm": 0.6627361178398132, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 18480 + }, + { + "epoch": 2.989249050198044, + "grad_norm": 0.5708155035972595, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 18490 + }, + { + "epoch": 2.990865734378789, + "grad_norm": 0.8227280378341675, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 18500 + }, + { + "epoch": 2.9924824185595345, + "grad_norm": 0.7102749943733215, + "learning_rate": 0.0002, + "loss": 0.6701, + "step": 18510 + }, + { + "epoch": 2.9940991027402797, + "grad_norm": 0.839485228061676, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 18520 + }, + { + "epoch": 2.995715786921025, + "grad_norm": 0.9038704037666321, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 18530 + }, + { + "epoch": 2.99733247110177, + "grad_norm": 0.8737510442733765, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 18540 + }, + { + "epoch": 2.9989491552825154, + "grad_norm": 0.7323142886161804, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 18550 + }, + { + "epoch": 2.9999191657909625, + "eval_loss": 1.1262480020523071, + "eval_runtime": 122.0868, + "eval_samples_per_second": 6.004, + "eval_steps_per_second": 0.754, + "step": 18556 + } + ], + "logging_steps": 10, + "max_steps": 49480, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.587532703136481e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..155b12fa9acbc6e71dba75c92bfa79e152397ebf --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-18556/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28694d5564a2b5c7d6881d4ba2af103356aa22489d2c22768ebbe47283c0f4a1 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b4728a8d41cdf93d8b529cdfd1f99a5be2c754d8 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:932a3fd701339ac0d6d24098de64692c58e2f5819c1c254b5170b90f9786ef3f +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f141976366a4602a7f1e902746a2e9c591c03cdc --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4abee7ce2c656dbc963450bea7ad045f80818ca885875b7270228650d2d85e87 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8aa782afef5fb506c1085024c63d0b3f35500787 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f176776c36fdc2d6595bb77fc99e5132f347416e5d12b98e23c7bd4498df3581 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1bcab7a111927143da964bbcdfcdd3a7fb414748 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b949cbb2a6d4962c67bf28344950c416a55c88bdaab30dddd07052abcc682b3a +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..683a20286dac73f4a80befa8618856aa28edcb05 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/trainer_state.json @@ -0,0 +1,17383 @@ +{ + "best_metric": 1.0871200561523438, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 24742, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016166841807452913, + "grad_norm": 0.9894065856933594, + "learning_rate": 0.0002, + "loss": 1.6636, + "step": 10 + }, + { + "epoch": 0.0032333683614905826, + "grad_norm": 1.7810699939727783, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 20 + }, + { + "epoch": 0.004850052542235874, + "grad_norm": 0.5969577431678772, + "learning_rate": 0.0002, + "loss": 0.9767, + "step": 30 + }, + { + "epoch": 0.006466736722981165, + "grad_norm": 0.6354120969772339, + "learning_rate": 0.0002, + "loss": 0.9772, + "step": 40 + }, + { + "epoch": 0.008083420903726457, + "grad_norm": 0.5604607462882996, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 50 + }, + { + "epoch": 0.009700105084471748, + "grad_norm": 0.4676193594932556, + "learning_rate": 0.0002, + "loss": 0.8841, + "step": 60 + }, + { + "epoch": 0.01131678926521704, + "grad_norm": 0.6099211573600769, + "learning_rate": 0.0002, + "loss": 0.9022, + "step": 70 + }, + { + "epoch": 0.01293347344596233, + "grad_norm": 0.48639994859695435, + "learning_rate": 0.0002, + "loss": 0.9133, + "step": 80 + }, + { + "epoch": 0.014550157626707623, + "grad_norm": 0.4904264509677887, + "learning_rate": 0.0002, + "loss": 0.8704, + "step": 90 + }, + { + "epoch": 0.016166841807452915, + "grad_norm": 2.8334362506866455, + "learning_rate": 0.0002, + "loss": 0.8855, + "step": 100 + }, + { + "epoch": 0.017783525988198205, + "grad_norm": 0.43221670389175415, + "learning_rate": 0.0002, + "loss": 0.8958, + "step": 110 + }, + { + "epoch": 0.019400210168943496, + "grad_norm": 0.42244166135787964, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 120 + }, + { + "epoch": 0.02101689434968879, + "grad_norm": 0.45363298058509827, + "learning_rate": 0.0002, + "loss": 0.8467, + "step": 130 + }, + { + "epoch": 0.02263357853043408, + "grad_norm": 0.44816508889198303, + "learning_rate": 0.0002, + "loss": 0.8641, + "step": 140 + }, + { + "epoch": 0.02425026271117937, + "grad_norm": 0.43308213353157043, + "learning_rate": 0.0002, + "loss": 0.8496, + "step": 150 + }, + { + "epoch": 0.02586694689192466, + "grad_norm": 0.4084763526916504, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 160 + }, + { + "epoch": 0.027483631072669955, + "grad_norm": 0.5363703966140747, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 170 + }, + { + "epoch": 0.029100315253415245, + "grad_norm": 0.4619699716567993, + "learning_rate": 0.0002, + "loss": 0.8558, + "step": 180 + }, + { + "epoch": 0.030716999434160536, + "grad_norm": 0.49069908261299133, + "learning_rate": 0.0002, + "loss": 0.8878, + "step": 190 + }, + { + "epoch": 0.03233368361490583, + "grad_norm": 0.4645835757255554, + "learning_rate": 0.0002, + "loss": 0.8867, + "step": 200 + }, + { + "epoch": 0.03395036779565112, + "grad_norm": 1.2411243915557861, + "learning_rate": 0.0002, + "loss": 0.8842, + "step": 210 + }, + { + "epoch": 0.03556705197639641, + "grad_norm": 0.5211851596832275, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 220 + }, + { + "epoch": 0.037183736157141704, + "grad_norm": 0.5253691673278809, + "learning_rate": 0.0002, + "loss": 0.8194, + "step": 230 + }, + { + "epoch": 0.03880042033788699, + "grad_norm": 0.4567478895187378, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 240 + }, + { + "epoch": 0.040417104518632285, + "grad_norm": 0.5472128391265869, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 250 + }, + { + "epoch": 0.04203378869937758, + "grad_norm": 0.42978546023368835, + "learning_rate": 0.0002, + "loss": 0.8201, + "step": 260 + }, + { + "epoch": 0.043650472880122866, + "grad_norm": 0.601734459400177, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 270 + }, + { + "epoch": 0.04526715706086816, + "grad_norm": 0.4286513328552246, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 280 + }, + { + "epoch": 0.046883841241613454, + "grad_norm": 0.5230861902236938, + "learning_rate": 0.0002, + "loss": 0.8758, + "step": 290 + }, + { + "epoch": 0.04850052542235874, + "grad_norm": 0.6504611968994141, + "learning_rate": 0.0002, + "loss": 0.8636, + "step": 300 + }, + { + "epoch": 0.050117209603104035, + "grad_norm": 0.43485215306282043, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 310 + }, + { + "epoch": 0.05173389378384932, + "grad_norm": 0.4717007875442505, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 320 + }, + { + "epoch": 0.053350577964594616, + "grad_norm": 0.4059787690639496, + "learning_rate": 0.0002, + "loss": 0.8469, + "step": 330 + }, + { + "epoch": 0.05496726214533991, + "grad_norm": 0.4366913437843323, + "learning_rate": 0.0002, + "loss": 0.8866, + "step": 340 + }, + { + "epoch": 0.0565839463260852, + "grad_norm": 0.4233848452568054, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 350 + }, + { + "epoch": 0.05820063050683049, + "grad_norm": 0.4209108352661133, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 360 + }, + { + "epoch": 0.059817314687575784, + "grad_norm": 0.41637396812438965, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 370 + }, + { + "epoch": 0.06143399886832107, + "grad_norm": 0.46235376596450806, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 380 + }, + { + "epoch": 0.06305068304906636, + "grad_norm": 0.4013484716415405, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 390 + }, + { + "epoch": 0.06466736722981166, + "grad_norm": 0.47443896532058716, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 400 + }, + { + "epoch": 0.06628405141055695, + "grad_norm": 0.3942156434059143, + "learning_rate": 0.0002, + "loss": 0.8666, + "step": 410 + }, + { + "epoch": 0.06790073559130223, + "grad_norm": 0.4965320825576782, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 420 + }, + { + "epoch": 0.06951741977204753, + "grad_norm": 0.4304835796356201, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 430 + }, + { + "epoch": 0.07113410395279282, + "grad_norm": 0.511726975440979, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 440 + }, + { + "epoch": 0.07275078813353811, + "grad_norm": 0.4040689170360565, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 450 + }, + { + "epoch": 0.07436747231428341, + "grad_norm": 0.5402171015739441, + "learning_rate": 0.0002, + "loss": 0.8788, + "step": 460 + }, + { + "epoch": 0.0759841564950287, + "grad_norm": 0.4174517095088959, + "learning_rate": 0.0002, + "loss": 0.8737, + "step": 470 + }, + { + "epoch": 0.07760084067577398, + "grad_norm": 0.4306182265281677, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 480 + }, + { + "epoch": 0.07921752485651928, + "grad_norm": 0.535210132598877, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 490 + }, + { + "epoch": 0.08083420903726457, + "grad_norm": 0.5339109897613525, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 500 + }, + { + "epoch": 0.08245089321800986, + "grad_norm": 0.45754891633987427, + "learning_rate": 0.0002, + "loss": 0.8985, + "step": 510 + }, + { + "epoch": 0.08406757739875516, + "grad_norm": 0.43820783495903015, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 520 + }, + { + "epoch": 0.08568426157950045, + "grad_norm": 0.4434749186038971, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 530 + }, + { + "epoch": 0.08730094576024573, + "grad_norm": 0.43111467361450195, + "learning_rate": 0.0002, + "loss": 0.7857, + "step": 540 + }, + { + "epoch": 0.08891762994099103, + "grad_norm": 0.4378940165042877, + "learning_rate": 0.0002, + "loss": 0.8418, + "step": 550 + }, + { + "epoch": 0.09053431412173632, + "grad_norm": 0.4772215187549591, + "learning_rate": 0.0002, + "loss": 0.8361, + "step": 560 + }, + { + "epoch": 0.09215099830248161, + "grad_norm": 0.6837629079818726, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 570 + }, + { + "epoch": 0.09376768248322691, + "grad_norm": 0.42241212725639343, + "learning_rate": 0.0002, + "loss": 0.8607, + "step": 580 + }, + { + "epoch": 0.0953843666639722, + "grad_norm": 0.5165936350822449, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 590 + }, + { + "epoch": 0.09700105084471748, + "grad_norm": 0.48737478256225586, + "learning_rate": 0.0002, + "loss": 0.8664, + "step": 600 + }, + { + "epoch": 0.09861773502546278, + "grad_norm": 0.47419852018356323, + "learning_rate": 0.0002, + "loss": 0.8806, + "step": 610 + }, + { + "epoch": 0.10023441920620807, + "grad_norm": 0.4975486099720001, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 620 + }, + { + "epoch": 0.10185110338695336, + "grad_norm": 0.49123844504356384, + "learning_rate": 0.0002, + "loss": 0.8548, + "step": 630 + }, + { + "epoch": 0.10346778756769864, + "grad_norm": 0.6288952827453613, + "learning_rate": 0.0002, + "loss": 0.8911, + "step": 640 + }, + { + "epoch": 0.10508447174844394, + "grad_norm": 0.4277345836162567, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 650 + }, + { + "epoch": 0.10670115592918923, + "grad_norm": 0.4021061956882477, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 660 + }, + { + "epoch": 0.10831784010993452, + "grad_norm": 0.3492237329483032, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 670 + }, + { + "epoch": 0.10993452429067982, + "grad_norm": 0.4341012239456177, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 680 + }, + { + "epoch": 0.1115512084714251, + "grad_norm": 0.7296304106712341, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 690 + }, + { + "epoch": 0.1131678926521704, + "grad_norm": 0.397494912147522, + "learning_rate": 0.0002, + "loss": 0.8433, + "step": 700 + }, + { + "epoch": 0.1147845768329157, + "grad_norm": 0.396431028842926, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 710 + }, + { + "epoch": 0.11640126101366098, + "grad_norm": 0.48842838406562805, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 720 + }, + { + "epoch": 0.11801794519440627, + "grad_norm": 0.46322616934776306, + "learning_rate": 0.0002, + "loss": 0.8238, + "step": 730 + }, + { + "epoch": 0.11963462937515157, + "grad_norm": 0.47990912199020386, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 740 + }, + { + "epoch": 0.12125131355589686, + "grad_norm": 0.4997142255306244, + "learning_rate": 0.0002, + "loss": 0.82, + "step": 750 + }, + { + "epoch": 0.12286799773664214, + "grad_norm": 0.4040526747703552, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 760 + }, + { + "epoch": 0.12448468191738744, + "grad_norm": 0.453095942735672, + "learning_rate": 0.0002, + "loss": 0.863, + "step": 770 + }, + { + "epoch": 0.12610136609813272, + "grad_norm": 0.4636971950531006, + "learning_rate": 0.0002, + "loss": 0.8792, + "step": 780 + }, + { + "epoch": 0.12771805027887803, + "grad_norm": 0.4279276132583618, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 790 + }, + { + "epoch": 0.12933473445962332, + "grad_norm": 0.46212655305862427, + "learning_rate": 0.0002, + "loss": 0.8711, + "step": 800 + }, + { + "epoch": 0.1309514186403686, + "grad_norm": 0.43127650022506714, + "learning_rate": 0.0002, + "loss": 0.8368, + "step": 810 + }, + { + "epoch": 0.1325681028211139, + "grad_norm": 0.4201301336288452, + "learning_rate": 0.0002, + "loss": 0.8476, + "step": 820 + }, + { + "epoch": 0.13418478700185918, + "grad_norm": 0.42583167552948, + "learning_rate": 0.0002, + "loss": 0.8078, + "step": 830 + }, + { + "epoch": 0.13580147118260447, + "grad_norm": 0.4535622000694275, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 840 + }, + { + "epoch": 0.13741815536334978, + "grad_norm": 0.4116036891937256, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 850 + }, + { + "epoch": 0.13903483954409507, + "grad_norm": 0.45997580885887146, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 860 + }, + { + "epoch": 0.14065152372484035, + "grad_norm": 0.4487837255001068, + "learning_rate": 0.0002, + "loss": 0.8917, + "step": 870 + }, + { + "epoch": 0.14226820790558564, + "grad_norm": 0.43650057911872864, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 880 + }, + { + "epoch": 0.14388489208633093, + "grad_norm": 0.5335358381271362, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 890 + }, + { + "epoch": 0.14550157626707622, + "grad_norm": 0.5989000201225281, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 900 + }, + { + "epoch": 0.14711826044782153, + "grad_norm": 0.517179012298584, + "learning_rate": 0.0002, + "loss": 0.8385, + "step": 910 + }, + { + "epoch": 0.14873494462856682, + "grad_norm": 0.44435232877731323, + "learning_rate": 0.0002, + "loss": 0.8255, + "step": 920 + }, + { + "epoch": 0.1503516288093121, + "grad_norm": 0.42635923624038696, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 930 + }, + { + "epoch": 0.1519683129900574, + "grad_norm": 0.49603334069252014, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 940 + }, + { + "epoch": 0.15358499717080268, + "grad_norm": 0.40639808773994446, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 950 + }, + { + "epoch": 0.15520168135154797, + "grad_norm": 0.4850759208202362, + "learning_rate": 0.0002, + "loss": 0.8529, + "step": 960 + }, + { + "epoch": 0.15681836553229328, + "grad_norm": 0.4427442252635956, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 970 + }, + { + "epoch": 0.15843504971303857, + "grad_norm": 0.3760930001735687, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 980 + }, + { + "epoch": 0.16005173389378385, + "grad_norm": 0.4794144332408905, + "learning_rate": 0.0002, + "loss": 0.8644, + "step": 990 + }, + { + "epoch": 0.16166841807452914, + "grad_norm": 0.45828768610954285, + "learning_rate": 0.0002, + "loss": 0.8002, + "step": 1000 + }, + { + "epoch": 0.16328510225527443, + "grad_norm": 0.6313053369522095, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 1010 + }, + { + "epoch": 0.16490178643601971, + "grad_norm": 0.45041006803512573, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1020 + }, + { + "epoch": 0.166518470616765, + "grad_norm": 0.441403865814209, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1030 + }, + { + "epoch": 0.16813515479751032, + "grad_norm": 0.8171296119689941, + "learning_rate": 0.0002, + "loss": 0.8475, + "step": 1040 + }, + { + "epoch": 0.1697518389782556, + "grad_norm": 0.7137420773506165, + "learning_rate": 0.0002, + "loss": 0.845, + "step": 1050 + }, + { + "epoch": 0.1713685231590009, + "grad_norm": 0.5236809849739075, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 1060 + }, + { + "epoch": 0.17298520733974618, + "grad_norm": 0.5021864175796509, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 1070 + }, + { + "epoch": 0.17460189152049146, + "grad_norm": 0.47347521781921387, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 1080 + }, + { + "epoch": 0.17621857570123675, + "grad_norm": 0.4631653428077698, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 1090 + }, + { + "epoch": 0.17783525988198207, + "grad_norm": 0.49169182777404785, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 1100 + }, + { + "epoch": 0.17945194406272735, + "grad_norm": 0.5019739270210266, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 1110 + }, + { + "epoch": 0.18106862824347264, + "grad_norm": 0.5100422501564026, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 1120 + }, + { + "epoch": 0.18268531242421793, + "grad_norm": 0.3888324499130249, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 1130 + }, + { + "epoch": 0.18430199660496321, + "grad_norm": 0.39765217900276184, + "learning_rate": 0.0002, + "loss": 0.8533, + "step": 1140 + }, + { + "epoch": 0.1859186807857085, + "grad_norm": 0.47190186381340027, + "learning_rate": 0.0002, + "loss": 0.8541, + "step": 1150 + }, + { + "epoch": 0.18753536496645382, + "grad_norm": 0.4464188814163208, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1160 + }, + { + "epoch": 0.1891520491471991, + "grad_norm": 0.5153930187225342, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 1170 + }, + { + "epoch": 0.1907687333279444, + "grad_norm": 0.4779708683490753, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 1180 + }, + { + "epoch": 0.19238541750868968, + "grad_norm": 0.4834315776824951, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 1190 + }, + { + "epoch": 0.19400210168943496, + "grad_norm": 0.402357816696167, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 1200 + }, + { + "epoch": 0.19561878587018025, + "grad_norm": 0.45899084210395813, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 1210 + }, + { + "epoch": 0.19723547005092557, + "grad_norm": 0.5106529593467712, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 1220 + }, + { + "epoch": 0.19885215423167085, + "grad_norm": 0.45261722803115845, + "learning_rate": 0.0002, + "loss": 0.7816, + "step": 1230 + }, + { + "epoch": 0.20046883841241614, + "grad_norm": 0.4647127091884613, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 1240 + }, + { + "epoch": 0.20208552259316143, + "grad_norm": 0.4849368929862976, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 1250 + }, + { + "epoch": 0.2037022067739067, + "grad_norm": 0.4518061578273773, + "learning_rate": 0.0002, + "loss": 0.8514, + "step": 1260 + }, + { + "epoch": 0.205318890954652, + "grad_norm": 0.49535325169563293, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 1270 + }, + { + "epoch": 0.2069355751353973, + "grad_norm": 0.4835205376148224, + "learning_rate": 0.0002, + "loss": 0.8348, + "step": 1280 + }, + { + "epoch": 0.2085522593161426, + "grad_norm": 0.45308539271354675, + "learning_rate": 0.0002, + "loss": 0.8428, + "step": 1290 + }, + { + "epoch": 0.2101689434968879, + "grad_norm": 0.5369905233383179, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 1300 + }, + { + "epoch": 0.21178562767763318, + "grad_norm": 0.5031622052192688, + "learning_rate": 0.0002, + "loss": 0.8676, + "step": 1310 + }, + { + "epoch": 0.21340231185837846, + "grad_norm": 0.48010334372520447, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 1320 + }, + { + "epoch": 0.21501899603912375, + "grad_norm": 0.4905701279640198, + "learning_rate": 0.0002, + "loss": 0.806, + "step": 1330 + }, + { + "epoch": 0.21663568021986904, + "grad_norm": 0.43531742691993713, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 1340 + }, + { + "epoch": 0.21825236440061435, + "grad_norm": 0.44330692291259766, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 1350 + }, + { + "epoch": 0.21986904858135964, + "grad_norm": 0.5384416580200195, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 1360 + }, + { + "epoch": 0.22148573276210493, + "grad_norm": 0.4181833863258362, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 1370 + }, + { + "epoch": 0.2231024169428502, + "grad_norm": 0.523833692073822, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 1380 + }, + { + "epoch": 0.2247191011235955, + "grad_norm": 0.5528736710548401, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 1390 + }, + { + "epoch": 0.2263357853043408, + "grad_norm": 0.43515023589134216, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 1400 + }, + { + "epoch": 0.2279524694850861, + "grad_norm": 0.48809877038002014, + "learning_rate": 0.0002, + "loss": 0.8403, + "step": 1410 + }, + { + "epoch": 0.2295691536658314, + "grad_norm": 0.43591251969337463, + "learning_rate": 0.0002, + "loss": 0.8165, + "step": 1420 + }, + { + "epoch": 0.23118583784657668, + "grad_norm": 0.44625312089920044, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 1430 + }, + { + "epoch": 0.23280252202732196, + "grad_norm": 0.4390665292739868, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 1440 + }, + { + "epoch": 0.23441920620806725, + "grad_norm": 0.48496049642562866, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 1450 + }, + { + "epoch": 0.23603589038881254, + "grad_norm": 0.45919957756996155, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 1460 + }, + { + "epoch": 0.23765257456955785, + "grad_norm": 0.5471845865249634, + "learning_rate": 0.0002, + "loss": 0.8659, + "step": 1470 + }, + { + "epoch": 0.23926925875030314, + "grad_norm": 0.47269317507743835, + "learning_rate": 0.0002, + "loss": 0.8164, + "step": 1480 + }, + { + "epoch": 0.24088594293104842, + "grad_norm": 0.4930245578289032, + "learning_rate": 0.0002, + "loss": 0.854, + "step": 1490 + }, + { + "epoch": 0.2425026271117937, + "grad_norm": 0.5605630278587341, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 1500 + }, + { + "epoch": 0.244119311292539, + "grad_norm": 0.4435870945453644, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 1510 + }, + { + "epoch": 0.24573599547328429, + "grad_norm": 0.4941999912261963, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 1520 + }, + { + "epoch": 0.24735267965402957, + "grad_norm": 0.5100624561309814, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 1530 + }, + { + "epoch": 0.2489693638347749, + "grad_norm": 0.4638267457485199, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 1540 + }, + { + "epoch": 0.25058604801552015, + "grad_norm": 0.5071570873260498, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 1550 + }, + { + "epoch": 0.25220273219626543, + "grad_norm": 0.4291319251060486, + "learning_rate": 0.0002, + "loss": 0.7724, + "step": 1560 + }, + { + "epoch": 0.2538194163770108, + "grad_norm": 0.5388049483299255, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 1570 + }, + { + "epoch": 0.25543610055775606, + "grad_norm": 0.5083683729171753, + "learning_rate": 0.0002, + "loss": 0.8176, + "step": 1580 + }, + { + "epoch": 0.25705278473850135, + "grad_norm": 0.4824463725090027, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 1590 + }, + { + "epoch": 0.25866946891924664, + "grad_norm": 0.41177722811698914, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 1600 + }, + { + "epoch": 0.2602861530999919, + "grad_norm": 0.5656219124794006, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 1610 + }, + { + "epoch": 0.2619028372807372, + "grad_norm": 0.41063204407691956, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 1620 + }, + { + "epoch": 0.2635195214614825, + "grad_norm": 0.4897061288356781, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 1630 + }, + { + "epoch": 0.2651362056422278, + "grad_norm": 0.4454376697540283, + "learning_rate": 0.0002, + "loss": 0.8198, + "step": 1640 + }, + { + "epoch": 0.26675288982297307, + "grad_norm": 0.4355238378047943, + "learning_rate": 0.0002, + "loss": 0.8684, + "step": 1650 + }, + { + "epoch": 0.26836957400371836, + "grad_norm": 0.458310067653656, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 1660 + }, + { + "epoch": 0.26998625818446365, + "grad_norm": 0.4752083718776703, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 1670 + }, + { + "epoch": 0.27160294236520893, + "grad_norm": 0.4666106402873993, + "learning_rate": 0.0002, + "loss": 0.8267, + "step": 1680 + }, + { + "epoch": 0.2732196265459543, + "grad_norm": 0.4213818609714508, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 1690 + }, + { + "epoch": 0.27483631072669956, + "grad_norm": 0.5768913626670837, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 1700 + }, + { + "epoch": 0.27645299490744485, + "grad_norm": 0.4209914803504944, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 1710 + }, + { + "epoch": 0.27806967908819014, + "grad_norm": 0.501909613609314, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 1720 + }, + { + "epoch": 0.2796863632689354, + "grad_norm": 0.5266261100769043, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 1730 + }, + { + "epoch": 0.2813030474496807, + "grad_norm": 0.43806859850883484, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 1740 + }, + { + "epoch": 0.282919731630426, + "grad_norm": 0.46048814058303833, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 1750 + }, + { + "epoch": 0.2845364158111713, + "grad_norm": 0.44972819089889526, + "learning_rate": 0.0002, + "loss": 0.8596, + "step": 1760 + }, + { + "epoch": 0.28615309999191657, + "grad_norm": 0.5114831328392029, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 1770 + }, + { + "epoch": 0.28776978417266186, + "grad_norm": 0.47931742668151855, + "learning_rate": 0.0002, + "loss": 0.8361, + "step": 1780 + }, + { + "epoch": 0.28938646835340714, + "grad_norm": 0.5092599987983704, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 1790 + }, + { + "epoch": 0.29100315253415243, + "grad_norm": 0.37581443786621094, + "learning_rate": 0.0002, + "loss": 0.8506, + "step": 1800 + }, + { + "epoch": 0.2926198367148977, + "grad_norm": 0.47097381949424744, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 1810 + }, + { + "epoch": 0.29423652089564306, + "grad_norm": 0.48300236463546753, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 1820 + }, + { + "epoch": 0.29585320507638835, + "grad_norm": 0.5600419640541077, + "learning_rate": 0.0002, + "loss": 0.8391, + "step": 1830 + }, + { + "epoch": 0.29746988925713364, + "grad_norm": 0.48555272817611694, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 1840 + }, + { + "epoch": 0.2990865734378789, + "grad_norm": 0.3752668499946594, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 1850 + }, + { + "epoch": 0.3007032576186242, + "grad_norm": 0.5328747034072876, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 1860 + }, + { + "epoch": 0.3023199417993695, + "grad_norm": 0.48716455698013306, + "learning_rate": 0.0002, + "loss": 0.8426, + "step": 1870 + }, + { + "epoch": 0.3039366259801148, + "grad_norm": 0.5011493563652039, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 1880 + }, + { + "epoch": 0.30555331016086007, + "grad_norm": 0.46461427211761475, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 1890 + }, + { + "epoch": 0.30716999434160536, + "grad_norm": 0.36630210280418396, + "learning_rate": 0.0002, + "loss": 0.8478, + "step": 1900 + }, + { + "epoch": 0.30878667852235064, + "grad_norm": 0.4217296242713928, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 1910 + }, + { + "epoch": 0.31040336270309593, + "grad_norm": 0.4394875466823578, + "learning_rate": 0.0002, + "loss": 0.8128, + "step": 1920 + }, + { + "epoch": 0.3120200468838412, + "grad_norm": 0.6587965488433838, + "learning_rate": 0.0002, + "loss": 0.8471, + "step": 1930 + }, + { + "epoch": 0.31363673106458656, + "grad_norm": 0.5469298958778381, + "learning_rate": 0.0002, + "loss": 0.8565, + "step": 1940 + }, + { + "epoch": 0.31525341524533185, + "grad_norm": 0.4371595084667206, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 1950 + }, + { + "epoch": 0.31687009942607713, + "grad_norm": 0.4809541404247284, + "learning_rate": 0.0002, + "loss": 0.887, + "step": 1960 + }, + { + "epoch": 0.3184867836068224, + "grad_norm": 0.6061086654663086, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 1970 + }, + { + "epoch": 0.3201034677875677, + "grad_norm": 0.5342657566070557, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 1980 + }, + { + "epoch": 0.321720151968313, + "grad_norm": 0.5057743787765503, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 1990 + }, + { + "epoch": 0.3233368361490583, + "grad_norm": 0.528626024723053, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 2000 + }, + { + "epoch": 0.32495352032980357, + "grad_norm": 0.46742770075798035, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2010 + }, + { + "epoch": 0.32657020451054886, + "grad_norm": 0.515101432800293, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 2020 + }, + { + "epoch": 0.32818688869129414, + "grad_norm": 0.41941216588020325, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 2030 + }, + { + "epoch": 0.32980357287203943, + "grad_norm": 0.49902522563934326, + "learning_rate": 0.0002, + "loss": 0.7637, + "step": 2040 + }, + { + "epoch": 0.3314202570527847, + "grad_norm": 0.4120897650718689, + "learning_rate": 0.0002, + "loss": 0.7822, + "step": 2050 + }, + { + "epoch": 0.33303694123353, + "grad_norm": 0.45352041721343994, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 2060 + }, + { + "epoch": 0.33465362541427535, + "grad_norm": 0.523199737071991, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 2070 + }, + { + "epoch": 0.33627030959502063, + "grad_norm": 0.4390358626842499, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 2080 + }, + { + "epoch": 0.3378869937757659, + "grad_norm": 0.6752901077270508, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 2090 + }, + { + "epoch": 0.3395036779565112, + "grad_norm": 0.547821044921875, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 2100 + }, + { + "epoch": 0.3411203621372565, + "grad_norm": 0.5161308646202087, + "learning_rate": 0.0002, + "loss": 0.8561, + "step": 2110 + }, + { + "epoch": 0.3427370463180018, + "grad_norm": 0.4565401077270508, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 2120 + }, + { + "epoch": 0.34435373049874707, + "grad_norm": 0.4666115939617157, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 2130 + }, + { + "epoch": 0.34597041467949236, + "grad_norm": 0.4090428352355957, + "learning_rate": 0.0002, + "loss": 0.8189, + "step": 2140 + }, + { + "epoch": 0.34758709886023764, + "grad_norm": 0.510845422744751, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2150 + }, + { + "epoch": 0.34920378304098293, + "grad_norm": 0.42861923575401306, + "learning_rate": 0.0002, + "loss": 0.8398, + "step": 2160 + }, + { + "epoch": 0.3508204672217282, + "grad_norm": 0.4476332664489746, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 2170 + }, + { + "epoch": 0.3524371514024735, + "grad_norm": 0.6065791249275208, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2180 + }, + { + "epoch": 0.35405383558321885, + "grad_norm": 0.42335066199302673, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 2190 + }, + { + "epoch": 0.35567051976396413, + "grad_norm": 0.5094629526138306, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 2200 + }, + { + "epoch": 0.3572872039447094, + "grad_norm": 0.5476373434066772, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 2210 + }, + { + "epoch": 0.3589038881254547, + "grad_norm": 0.3911719024181366, + "learning_rate": 0.0002, + "loss": 0.814, + "step": 2220 + }, + { + "epoch": 0.3605205723062, + "grad_norm": 0.6599636077880859, + "learning_rate": 0.0002, + "loss": 0.8599, + "step": 2230 + }, + { + "epoch": 0.3621372564869453, + "grad_norm": 0.40381914377212524, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 2240 + }, + { + "epoch": 0.36375394066769057, + "grad_norm": 0.4433908462524414, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 2250 + }, + { + "epoch": 0.36537062484843585, + "grad_norm": 0.578326940536499, + "learning_rate": 0.0002, + "loss": 0.8503, + "step": 2260 + }, + { + "epoch": 0.36698730902918114, + "grad_norm": 0.5734784007072449, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2270 + }, + { + "epoch": 0.36860399320992643, + "grad_norm": 0.45555487275123596, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 2280 + }, + { + "epoch": 0.3702206773906717, + "grad_norm": 0.5666276216506958, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 2290 + }, + { + "epoch": 0.371837361571417, + "grad_norm": 0.5461117625236511, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 2300 + }, + { + "epoch": 0.3734540457521623, + "grad_norm": 0.6318911910057068, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 2310 + }, + { + "epoch": 0.37507072993290763, + "grad_norm": 0.493263304233551, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 2320 + }, + { + "epoch": 0.3766874141136529, + "grad_norm": 0.5888760089874268, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 2330 + }, + { + "epoch": 0.3783040982943982, + "grad_norm": 0.48671841621398926, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 2340 + }, + { + "epoch": 0.3799207824751435, + "grad_norm": 0.4385145306587219, + "learning_rate": 0.0002, + "loss": 0.8367, + "step": 2350 + }, + { + "epoch": 0.3815374666558888, + "grad_norm": 0.5523318648338318, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 2360 + }, + { + "epoch": 0.38315415083663407, + "grad_norm": 0.7308220267295837, + "learning_rate": 0.0002, + "loss": 0.8351, + "step": 2370 + }, + { + "epoch": 0.38477083501737935, + "grad_norm": 0.554214358329773, + "learning_rate": 0.0002, + "loss": 0.859, + "step": 2380 + }, + { + "epoch": 0.38638751919812464, + "grad_norm": 0.5425800085067749, + "learning_rate": 0.0002, + "loss": 0.8146, + "step": 2390 + }, + { + "epoch": 0.3880042033788699, + "grad_norm": 0.48811158537864685, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 2400 + }, + { + "epoch": 0.3896208875596152, + "grad_norm": 0.49212366342544556, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 2410 + }, + { + "epoch": 0.3912375717403605, + "grad_norm": 0.5222218632698059, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 2420 + }, + { + "epoch": 0.3928542559211058, + "grad_norm": 0.4699819087982178, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 2430 + }, + { + "epoch": 0.39447094010185113, + "grad_norm": 0.46153587102890015, + "learning_rate": 0.0002, + "loss": 0.7919, + "step": 2440 + }, + { + "epoch": 0.3960876242825964, + "grad_norm": 0.4150611162185669, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 2450 + }, + { + "epoch": 0.3977043084633417, + "grad_norm": 0.5799614787101746, + "learning_rate": 0.0002, + "loss": 0.8589, + "step": 2460 + }, + { + "epoch": 0.399320992644087, + "grad_norm": 0.56536865234375, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 2470 + }, + { + "epoch": 0.4009376768248323, + "grad_norm": 0.5451247096061707, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 2480 + }, + { + "epoch": 0.40255436100557757, + "grad_norm": 0.5914521217346191, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 2490 + }, + { + "epoch": 0.40417104518632285, + "grad_norm": 0.4428117275238037, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 2500 + }, + { + "epoch": 0.40578772936706814, + "grad_norm": 0.48580947518348694, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 2510 + }, + { + "epoch": 0.4074044135478134, + "grad_norm": 0.436734676361084, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 2520 + }, + { + "epoch": 0.4090210977285587, + "grad_norm": 0.5752223134040833, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 2530 + }, + { + "epoch": 0.410637781909304, + "grad_norm": 0.4271308183670044, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2540 + }, + { + "epoch": 0.4122544660900493, + "grad_norm": 0.46294718980789185, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 2550 + }, + { + "epoch": 0.4138711502707946, + "grad_norm": 0.49407583475112915, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 2560 + }, + { + "epoch": 0.4154878344515399, + "grad_norm": 0.4729035496711731, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 2570 + }, + { + "epoch": 0.4171045186322852, + "grad_norm": 0.4129747152328491, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 2580 + }, + { + "epoch": 0.4187212028130305, + "grad_norm": 0.5684236288070679, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 2590 + }, + { + "epoch": 0.4203378869937758, + "grad_norm": 0.4862157106399536, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 2600 + }, + { + "epoch": 0.42195457117452106, + "grad_norm": 0.46567976474761963, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 2610 + }, + { + "epoch": 0.42357125535526635, + "grad_norm": 0.5710650682449341, + "learning_rate": 0.0002, + "loss": 0.7755, + "step": 2620 + }, + { + "epoch": 0.42518793953601164, + "grad_norm": 0.5660041570663452, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 2630 + }, + { + "epoch": 0.4268046237167569, + "grad_norm": 0.47944375872612, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 2640 + }, + { + "epoch": 0.4284213078975022, + "grad_norm": 0.537223756313324, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 2650 + }, + { + "epoch": 0.4300379920782475, + "grad_norm": 0.41669997572898865, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 2660 + }, + { + "epoch": 0.4316546762589928, + "grad_norm": 0.44727686047554016, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 2670 + }, + { + "epoch": 0.4332713604397381, + "grad_norm": 0.5600888729095459, + "learning_rate": 0.0002, + "loss": 0.8241, + "step": 2680 + }, + { + "epoch": 0.4348880446204834, + "grad_norm": 0.39820605516433716, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 2690 + }, + { + "epoch": 0.4365047288012287, + "grad_norm": 0.5637655854225159, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 2700 + }, + { + "epoch": 0.438121412981974, + "grad_norm": 0.6363666653633118, + "learning_rate": 0.0002, + "loss": 0.855, + "step": 2710 + }, + { + "epoch": 0.4397380971627193, + "grad_norm": 0.5656129121780396, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 2720 + }, + { + "epoch": 0.44135478134346456, + "grad_norm": 0.5600156188011169, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2730 + }, + { + "epoch": 0.44297146552420985, + "grad_norm": 0.5506579875946045, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 2740 + }, + { + "epoch": 0.44458814970495514, + "grad_norm": 0.49878305196762085, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 2750 + }, + { + "epoch": 0.4462048338857004, + "grad_norm": 0.4569213092327118, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 2760 + }, + { + "epoch": 0.4478215180664457, + "grad_norm": 0.6056680083274841, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 2770 + }, + { + "epoch": 0.449438202247191, + "grad_norm": 0.44474557042121887, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 2780 + }, + { + "epoch": 0.4510548864279363, + "grad_norm": 0.46055394411087036, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 2790 + }, + { + "epoch": 0.4526715706086816, + "grad_norm": 0.4904133379459381, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 2800 + }, + { + "epoch": 0.45428825478942686, + "grad_norm": 0.5647031664848328, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 2810 + }, + { + "epoch": 0.4559049389701722, + "grad_norm": 0.5759473443031311, + "learning_rate": 0.0002, + "loss": 0.8622, + "step": 2820 + }, + { + "epoch": 0.4575216231509175, + "grad_norm": 0.5161895751953125, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 2830 + }, + { + "epoch": 0.4591383073316628, + "grad_norm": 0.4248254597187042, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 2840 + }, + { + "epoch": 0.46075499151240806, + "grad_norm": 0.45395001769065857, + "learning_rate": 0.0002, + "loss": 0.7838, + "step": 2850 + }, + { + "epoch": 0.46237167569315335, + "grad_norm": 0.5358697772026062, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2860 + }, + { + "epoch": 0.46398835987389864, + "grad_norm": 0.5379165410995483, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 2870 + }, + { + "epoch": 0.4656050440546439, + "grad_norm": 0.4601989686489105, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 2880 + }, + { + "epoch": 0.4672217282353892, + "grad_norm": 0.671115517616272, + "learning_rate": 0.0002, + "loss": 0.8523, + "step": 2890 + }, + { + "epoch": 0.4688384124161345, + "grad_norm": 0.4425133168697357, + "learning_rate": 0.0002, + "loss": 0.8262, + "step": 2900 + }, + { + "epoch": 0.4704550965968798, + "grad_norm": 0.5446155071258545, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2910 + }, + { + "epoch": 0.47207178077762507, + "grad_norm": 0.603306233882904, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 2920 + }, + { + "epoch": 0.47368846495837036, + "grad_norm": 0.5377997159957886, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 2930 + }, + { + "epoch": 0.4753051491391157, + "grad_norm": 0.4931027591228485, + "learning_rate": 0.0002, + "loss": 0.8075, + "step": 2940 + }, + { + "epoch": 0.476921833319861, + "grad_norm": 0.4711960256099701, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 2950 + }, + { + "epoch": 0.4785385175006063, + "grad_norm": 0.5020492672920227, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 2960 + }, + { + "epoch": 0.48015520168135156, + "grad_norm": 0.5428946614265442, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 2970 + }, + { + "epoch": 0.48177188586209685, + "grad_norm": 0.5294089317321777, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 2980 + }, + { + "epoch": 0.48338857004284214, + "grad_norm": 0.648289144039154, + "learning_rate": 0.0002, + "loss": 0.8553, + "step": 2990 + }, + { + "epoch": 0.4850052542235874, + "grad_norm": 0.47916680574417114, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 3000 + }, + { + "epoch": 0.4866219384043327, + "grad_norm": 0.43849772214889526, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 3010 + }, + { + "epoch": 0.488238622585078, + "grad_norm": 0.47007861733436584, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 3020 + }, + { + "epoch": 0.4898553067658233, + "grad_norm": 0.6314331293106079, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 3030 + }, + { + "epoch": 0.49147199094656857, + "grad_norm": 0.49211493134498596, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 3040 + }, + { + "epoch": 0.49308867512731386, + "grad_norm": 0.4537973403930664, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 3050 + }, + { + "epoch": 0.49470535930805914, + "grad_norm": 0.47326919436454773, + "learning_rate": 0.0002, + "loss": 0.8095, + "step": 3060 + }, + { + "epoch": 0.4963220434888045, + "grad_norm": 0.525874137878418, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 3070 + }, + { + "epoch": 0.4979387276695498, + "grad_norm": 0.6361091732978821, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 3080 + }, + { + "epoch": 0.49955541185029506, + "grad_norm": 0.5850642919540405, + "learning_rate": 0.0002, + "loss": 0.821, + "step": 3090 + }, + { + "epoch": 0.5011720960310403, + "grad_norm": 0.47299543023109436, + "learning_rate": 0.0002, + "loss": 0.8279, + "step": 3100 + }, + { + "epoch": 0.5027887802117856, + "grad_norm": 0.473099946975708, + "learning_rate": 0.0002, + "loss": 0.8681, + "step": 3110 + }, + { + "epoch": 0.5044054643925309, + "grad_norm": 0.48186397552490234, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 3120 + }, + { + "epoch": 0.5060221485732762, + "grad_norm": 0.5015401840209961, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 3130 + }, + { + "epoch": 0.5076388327540216, + "grad_norm": 0.5617750287055969, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 3140 + }, + { + "epoch": 0.5092555169347668, + "grad_norm": 0.5169327259063721, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 3150 + }, + { + "epoch": 0.5108722011155121, + "grad_norm": 0.545657753944397, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 3160 + }, + { + "epoch": 0.5124888852962574, + "grad_norm": 0.512864351272583, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 3170 + }, + { + "epoch": 0.5141055694770027, + "grad_norm": 0.4113546311855316, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 3180 + }, + { + "epoch": 0.5157222536577479, + "grad_norm": 0.44532445073127747, + "learning_rate": 0.0002, + "loss": 0.8206, + "step": 3190 + }, + { + "epoch": 0.5173389378384933, + "grad_norm": 0.5623497366905212, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 3200 + }, + { + "epoch": 0.5189556220192385, + "grad_norm": 0.5084741115570068, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 3210 + }, + { + "epoch": 0.5205723061999838, + "grad_norm": 0.5305403470993042, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 3220 + }, + { + "epoch": 0.5221889903807291, + "grad_norm": 0.4708254337310791, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 3230 + }, + { + "epoch": 0.5238056745614744, + "grad_norm": 0.43827131390571594, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 3240 + }, + { + "epoch": 0.5254223587422197, + "grad_norm": 0.5630002617835999, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 3250 + }, + { + "epoch": 0.527039042922965, + "grad_norm": 0.5010961890220642, + "learning_rate": 0.0002, + "loss": 0.7522, + "step": 3260 + }, + { + "epoch": 0.5286557271037103, + "grad_norm": 0.6303122043609619, + "learning_rate": 0.0002, + "loss": 0.8374, + "step": 3270 + }, + { + "epoch": 0.5302724112844556, + "grad_norm": 0.5107331275939941, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 3280 + }, + { + "epoch": 0.5318890954652009, + "grad_norm": 0.5700443387031555, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 3290 + }, + { + "epoch": 0.5335057796459461, + "grad_norm": 0.46296367049217224, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 3300 + }, + { + "epoch": 0.5351224638266915, + "grad_norm": 0.531568706035614, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 3310 + }, + { + "epoch": 0.5367391480074367, + "grad_norm": 0.4686741530895233, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 3320 + }, + { + "epoch": 0.5383558321881821, + "grad_norm": 0.5404331088066101, + "learning_rate": 0.0002, + "loss": 0.8104, + "step": 3330 + }, + { + "epoch": 0.5399725163689273, + "grad_norm": 0.6368790864944458, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 3340 + }, + { + "epoch": 0.5415892005496726, + "grad_norm": 0.42300888895988464, + "learning_rate": 0.0002, + "loss": 0.8514, + "step": 3350 + }, + { + "epoch": 0.5432058847304179, + "grad_norm": 0.5362542867660522, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 3360 + }, + { + "epoch": 0.5448225689111632, + "grad_norm": 0.497128963470459, + "learning_rate": 0.0002, + "loss": 0.858, + "step": 3370 + }, + { + "epoch": 0.5464392530919085, + "grad_norm": 0.5006386041641235, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 3380 + }, + { + "epoch": 0.5480559372726538, + "grad_norm": 0.44136837124824524, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 3390 + }, + { + "epoch": 0.5496726214533991, + "grad_norm": 0.5897833108901978, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 3400 + }, + { + "epoch": 0.5512893056341444, + "grad_norm": 0.641075611114502, + "learning_rate": 0.0002, + "loss": 0.8895, + "step": 3410 + }, + { + "epoch": 0.5529059898148897, + "grad_norm": 0.7251322269439697, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 3420 + }, + { + "epoch": 0.5545226739956349, + "grad_norm": 0.47411349415779114, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 3430 + }, + { + "epoch": 0.5561393581763803, + "grad_norm": 0.4994310438632965, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 3440 + }, + { + "epoch": 0.5577560423571255, + "grad_norm": 0.5814438462257385, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 3450 + }, + { + "epoch": 0.5593727265378708, + "grad_norm": 0.6278898119926453, + "learning_rate": 0.0002, + "loss": 0.8805, + "step": 3460 + }, + { + "epoch": 0.5609894107186161, + "grad_norm": 0.46208274364471436, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 3470 + }, + { + "epoch": 0.5626060948993614, + "grad_norm": 0.5718930959701538, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 3480 + }, + { + "epoch": 0.5642227790801067, + "grad_norm": 0.48178744316101074, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 3490 + }, + { + "epoch": 0.565839463260852, + "grad_norm": 0.47336965799331665, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3500 + }, + { + "epoch": 0.5674561474415973, + "grad_norm": 0.43442684412002563, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 3510 + }, + { + "epoch": 0.5690728316223426, + "grad_norm": 0.6463358998298645, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 3520 + }, + { + "epoch": 0.5706895158030879, + "grad_norm": 0.5286486744880676, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 3530 + }, + { + "epoch": 0.5723061999838331, + "grad_norm": 0.5405499935150146, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 3540 + }, + { + "epoch": 0.5739228841645785, + "grad_norm": 0.6654391884803772, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 3550 + }, + { + "epoch": 0.5755395683453237, + "grad_norm": 0.5081980228424072, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 3560 + }, + { + "epoch": 0.5771562525260691, + "grad_norm": 0.48978179693222046, + "learning_rate": 0.0002, + "loss": 0.7753, + "step": 3570 + }, + { + "epoch": 0.5787729367068143, + "grad_norm": 0.5840612053871155, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3580 + }, + { + "epoch": 0.5803896208875596, + "grad_norm": 0.5235261917114258, + "learning_rate": 0.0002, + "loss": 0.8937, + "step": 3590 + }, + { + "epoch": 0.5820063050683049, + "grad_norm": 0.5672075748443604, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 3600 + }, + { + "epoch": 0.5836229892490502, + "grad_norm": 0.5613429546356201, + "learning_rate": 0.0002, + "loss": 0.8347, + "step": 3610 + }, + { + "epoch": 0.5852396734297954, + "grad_norm": 0.4032273590564728, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 3620 + }, + { + "epoch": 0.5868563576105408, + "grad_norm": 0.49559324979782104, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 3630 + }, + { + "epoch": 0.5884730417912861, + "grad_norm": 0.6895697712898254, + "learning_rate": 0.0002, + "loss": 0.8332, + "step": 3640 + }, + { + "epoch": 0.5900897259720314, + "grad_norm": 0.4750136435031891, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 3650 + }, + { + "epoch": 0.5917064101527767, + "grad_norm": 0.5176819562911987, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 3660 + }, + { + "epoch": 0.5933230943335219, + "grad_norm": 0.5817760229110718, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3670 + }, + { + "epoch": 0.5949397785142673, + "grad_norm": 0.6064626574516296, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 3680 + }, + { + "epoch": 0.5965564626950125, + "grad_norm": 0.6728700995445251, + "learning_rate": 0.0002, + "loss": 0.8422, + "step": 3690 + }, + { + "epoch": 0.5981731468757578, + "grad_norm": 0.609305202960968, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 3700 + }, + { + "epoch": 0.5997898310565031, + "grad_norm": 0.4615488350391388, + "learning_rate": 0.0002, + "loss": 0.8048, + "step": 3710 + }, + { + "epoch": 0.6014065152372484, + "grad_norm": 2.0531179904937744, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 3720 + }, + { + "epoch": 0.6030231994179936, + "grad_norm": 0.5091132521629333, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 3730 + }, + { + "epoch": 0.604639883598739, + "grad_norm": 0.5951124429702759, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 3740 + }, + { + "epoch": 0.6062565677794842, + "grad_norm": 0.5870208144187927, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 3750 + }, + { + "epoch": 0.6078732519602296, + "grad_norm": 0.6254619359970093, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3760 + }, + { + "epoch": 0.6094899361409749, + "grad_norm": 0.5577626824378967, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 3770 + }, + { + "epoch": 0.6111066203217201, + "grad_norm": 0.5004405379295349, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 3780 + }, + { + "epoch": 0.6127233045024655, + "grad_norm": 0.5527383685112, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 3790 + }, + { + "epoch": 0.6143399886832107, + "grad_norm": 0.49116113781929016, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 3800 + }, + { + "epoch": 0.6159566728639561, + "grad_norm": 0.5299299359321594, + "learning_rate": 0.0002, + "loss": 0.8352, + "step": 3810 + }, + { + "epoch": 0.6175733570447013, + "grad_norm": 0.464897483587265, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 3820 + }, + { + "epoch": 0.6191900412254466, + "grad_norm": 0.6505740880966187, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 3830 + }, + { + "epoch": 0.6208067254061919, + "grad_norm": 0.5512559413909912, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 3840 + }, + { + "epoch": 0.6224234095869372, + "grad_norm": 0.49427518248558044, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 3850 + }, + { + "epoch": 0.6240400937676824, + "grad_norm": 0.3839147090911865, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 3860 + }, + { + "epoch": 0.6256567779484278, + "grad_norm": 0.5760218501091003, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3870 + }, + { + "epoch": 0.6272734621291731, + "grad_norm": 0.7226507067680359, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 3880 + }, + { + "epoch": 0.6288901463099184, + "grad_norm": 0.676781415939331, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3890 + }, + { + "epoch": 0.6305068304906637, + "grad_norm": 0.4284018278121948, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 3900 + }, + { + "epoch": 0.6321235146714089, + "grad_norm": 0.5060628056526184, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 3910 + }, + { + "epoch": 0.6337401988521543, + "grad_norm": 0.5524522066116333, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 3920 + }, + { + "epoch": 0.6353568830328995, + "grad_norm": 0.6099881529808044, + "learning_rate": 0.0002, + "loss": 0.8276, + "step": 3930 + }, + { + "epoch": 0.6369735672136448, + "grad_norm": 0.43155938386917114, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 3940 + }, + { + "epoch": 0.6385902513943901, + "grad_norm": 0.6427084803581238, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 3950 + }, + { + "epoch": 0.6402069355751354, + "grad_norm": 0.541220486164093, + "learning_rate": 0.0002, + "loss": 0.8368, + "step": 3960 + }, + { + "epoch": 0.6418236197558806, + "grad_norm": 0.5414294600486755, + "learning_rate": 0.0002, + "loss": 0.8539, + "step": 3970 + }, + { + "epoch": 0.643440303936626, + "grad_norm": 0.46344003081321716, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 3980 + }, + { + "epoch": 0.6450569881173712, + "grad_norm": 0.45209285616874695, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 3990 + }, + { + "epoch": 0.6466736722981166, + "grad_norm": 0.5417284369468689, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 4000 + }, + { + "epoch": 0.6482903564788619, + "grad_norm": 0.7995685935020447, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 4010 + }, + { + "epoch": 0.6499070406596071, + "grad_norm": 0.6384002566337585, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 4020 + }, + { + "epoch": 0.6515237248403525, + "grad_norm": 0.4472815692424774, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 4030 + }, + { + "epoch": 0.6531404090210977, + "grad_norm": 0.6834294199943542, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 4040 + }, + { + "epoch": 0.654757093201843, + "grad_norm": 0.4612339735031128, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 4050 + }, + { + "epoch": 0.6563737773825883, + "grad_norm": 0.9266576170921326, + "learning_rate": 0.0002, + "loss": 0.8141, + "step": 4060 + }, + { + "epoch": 0.6579904615633336, + "grad_norm": 0.4470861852169037, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 4070 + }, + { + "epoch": 0.6596071457440789, + "grad_norm": 0.45544925332069397, + "learning_rate": 0.0002, + "loss": 0.8293, + "step": 4080 + }, + { + "epoch": 0.6612238299248242, + "grad_norm": 0.6144481301307678, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 4090 + }, + { + "epoch": 0.6628405141055694, + "grad_norm": 0.5936288237571716, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 4100 + }, + { + "epoch": 0.6644571982863148, + "grad_norm": 0.4822963774204254, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 4110 + }, + { + "epoch": 0.66607388246706, + "grad_norm": 0.48432496190071106, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 4120 + }, + { + "epoch": 0.6676905666478054, + "grad_norm": 0.4901607930660248, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 4130 + }, + { + "epoch": 0.6693072508285507, + "grad_norm": 0.5018393397331238, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 4140 + }, + { + "epoch": 0.6709239350092959, + "grad_norm": 0.6946378946304321, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 4150 + }, + { + "epoch": 0.6725406191900413, + "grad_norm": 0.5997390747070312, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 4160 + }, + { + "epoch": 0.6741573033707865, + "grad_norm": 0.6738849878311157, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 4170 + }, + { + "epoch": 0.6757739875515318, + "grad_norm": 0.6110581159591675, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 4180 + }, + { + "epoch": 0.6773906717322771, + "grad_norm": 0.5703322291374207, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 4190 + }, + { + "epoch": 0.6790073559130224, + "grad_norm": 0.4686066210269928, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 4200 + }, + { + "epoch": 0.6806240400937676, + "grad_norm": 0.6394643783569336, + "learning_rate": 0.0002, + "loss": 0.8441, + "step": 4210 + }, + { + "epoch": 0.682240724274513, + "grad_norm": 0.5454841256141663, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 4220 + }, + { + "epoch": 0.6838574084552582, + "grad_norm": 0.4859732985496521, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 4230 + }, + { + "epoch": 0.6854740926360036, + "grad_norm": 0.5544065833091736, + "learning_rate": 0.0002, + "loss": 0.8161, + "step": 4240 + }, + { + "epoch": 0.6870907768167488, + "grad_norm": 0.4902505576610565, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 4250 + }, + { + "epoch": 0.6887074609974941, + "grad_norm": 0.4768051505088806, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 4260 + }, + { + "epoch": 0.6903241451782395, + "grad_norm": 0.49982190132141113, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 4270 + }, + { + "epoch": 0.6919408293589847, + "grad_norm": 0.6351838111877441, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 4280 + }, + { + "epoch": 0.69355751353973, + "grad_norm": 0.5647561550140381, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 4290 + }, + { + "epoch": 0.6951741977204753, + "grad_norm": 0.5340486764907837, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 4300 + }, + { + "epoch": 0.6967908819012206, + "grad_norm": 0.5649092793464661, + "learning_rate": 0.0002, + "loss": 0.8526, + "step": 4310 + }, + { + "epoch": 0.6984075660819659, + "grad_norm": 0.6183916926383972, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 4320 + }, + { + "epoch": 0.7000242502627112, + "grad_norm": 0.6154509782791138, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 4330 + }, + { + "epoch": 0.7016409344434564, + "grad_norm": 0.5156264305114746, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 4340 + }, + { + "epoch": 0.7032576186242018, + "grad_norm": 0.562171459197998, + "learning_rate": 0.0002, + "loss": 0.8512, + "step": 4350 + }, + { + "epoch": 0.704874302804947, + "grad_norm": 0.4949502646923065, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 4360 + }, + { + "epoch": 0.7064909869856923, + "grad_norm": 0.5171684622764587, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 4370 + }, + { + "epoch": 0.7081076711664377, + "grad_norm": 0.6198443174362183, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 4380 + }, + { + "epoch": 0.7097243553471829, + "grad_norm": 0.5802276134490967, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 4390 + }, + { + "epoch": 0.7113410395279283, + "grad_norm": 0.41096967458724976, + "learning_rate": 0.0002, + "loss": 0.8797, + "step": 4400 + }, + { + "epoch": 0.7129577237086735, + "grad_norm": 0.4397392272949219, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 4410 + }, + { + "epoch": 0.7145744078894188, + "grad_norm": 0.45228442549705505, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 4420 + }, + { + "epoch": 0.7161910920701641, + "grad_norm": 0.4839673936367035, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 4430 + }, + { + "epoch": 0.7178077762509094, + "grad_norm": 0.6140755414962769, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4440 + }, + { + "epoch": 0.7194244604316546, + "grad_norm": 0.6841378808021545, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 4450 + }, + { + "epoch": 0.7210411446124, + "grad_norm": 0.6664239168167114, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 4460 + }, + { + "epoch": 0.7226578287931452, + "grad_norm": 0.47552719712257385, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 4470 + }, + { + "epoch": 0.7242745129738906, + "grad_norm": 0.6649776101112366, + "learning_rate": 0.0002, + "loss": 0.8982, + "step": 4480 + }, + { + "epoch": 0.7258911971546358, + "grad_norm": 0.5159541964530945, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 4490 + }, + { + "epoch": 0.7275078813353811, + "grad_norm": 0.6693112850189209, + "learning_rate": 0.0002, + "loss": 0.7786, + "step": 4500 + }, + { + "epoch": 0.7291245655161265, + "grad_norm": 0.48870977759361267, + "learning_rate": 0.0002, + "loss": 0.8655, + "step": 4510 + }, + { + "epoch": 0.7307412496968717, + "grad_norm": 0.4857887923717499, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 4520 + }, + { + "epoch": 0.732357933877617, + "grad_norm": 0.5515662431716919, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 4530 + }, + { + "epoch": 0.7339746180583623, + "grad_norm": 0.6292222738265991, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 4540 + }, + { + "epoch": 0.7355913022391076, + "grad_norm": 0.48265689611434937, + "learning_rate": 0.0002, + "loss": 0.7749, + "step": 4550 + }, + { + "epoch": 0.7372079864198529, + "grad_norm": 0.8044266104698181, + "learning_rate": 0.0002, + "loss": 0.8499, + "step": 4560 + }, + { + "epoch": 0.7388246706005982, + "grad_norm": 0.6111769676208496, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 4570 + }, + { + "epoch": 0.7404413547813434, + "grad_norm": 0.5229553580284119, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 4580 + }, + { + "epoch": 0.7420580389620888, + "grad_norm": 0.6054152250289917, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 4590 + }, + { + "epoch": 0.743674723142834, + "grad_norm": 0.5574966669082642, + "learning_rate": 0.0002, + "loss": 0.8169, + "step": 4600 + }, + { + "epoch": 0.7452914073235793, + "grad_norm": 0.5395817160606384, + "learning_rate": 0.0002, + "loss": 0.8439, + "step": 4610 + }, + { + "epoch": 0.7469080915043246, + "grad_norm": 0.7116472721099854, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 4620 + }, + { + "epoch": 0.7485247756850699, + "grad_norm": 0.5618700981140137, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 4630 + }, + { + "epoch": 0.7501414598658153, + "grad_norm": 0.5802770853042603, + "learning_rate": 0.0002, + "loss": 0.7744, + "step": 4640 + }, + { + "epoch": 0.7517581440465605, + "grad_norm": 0.5690428018569946, + "learning_rate": 0.0002, + "loss": 0.7924, + "step": 4650 + }, + { + "epoch": 0.7533748282273058, + "grad_norm": 0.4813360273838043, + "learning_rate": 0.0002, + "loss": 0.8017, + "step": 4660 + }, + { + "epoch": 0.7549915124080511, + "grad_norm": 0.5434042811393738, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 4670 + }, + { + "epoch": 0.7566081965887964, + "grad_norm": 0.5502099990844727, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 4680 + }, + { + "epoch": 0.7582248807695416, + "grad_norm": 0.6020621061325073, + "learning_rate": 0.0002, + "loss": 0.8598, + "step": 4690 + }, + { + "epoch": 0.759841564950287, + "grad_norm": 0.4922301471233368, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 4700 + }, + { + "epoch": 0.7614582491310322, + "grad_norm": 0.6492828726768494, + "learning_rate": 0.0002, + "loss": 0.788, + "step": 4710 + }, + { + "epoch": 0.7630749333117776, + "grad_norm": 0.4865580201148987, + "learning_rate": 0.0002, + "loss": 0.8313, + "step": 4720 + }, + { + "epoch": 0.7646916174925228, + "grad_norm": 0.5971422791481018, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 4730 + }, + { + "epoch": 0.7663083016732681, + "grad_norm": 0.6832674145698547, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 4740 + }, + { + "epoch": 0.7679249858540134, + "grad_norm": 0.500908613204956, + "learning_rate": 0.0002, + "loss": 0.8156, + "step": 4750 + }, + { + "epoch": 0.7695416700347587, + "grad_norm": 0.6112465858459473, + "learning_rate": 0.0002, + "loss": 0.8383, + "step": 4760 + }, + { + "epoch": 0.771158354215504, + "grad_norm": 0.5753506422042847, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 4770 + }, + { + "epoch": 0.7727750383962493, + "grad_norm": 0.6529405117034912, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 4780 + }, + { + "epoch": 0.7743917225769946, + "grad_norm": 0.5916843414306641, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 4790 + }, + { + "epoch": 0.7760084067577399, + "grad_norm": 0.4821224510669708, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 4800 + }, + { + "epoch": 0.7776250909384852, + "grad_norm": 0.5532580018043518, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 4810 + }, + { + "epoch": 0.7792417751192304, + "grad_norm": 0.4604877233505249, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 4820 + }, + { + "epoch": 0.7808584592999758, + "grad_norm": 0.5009613037109375, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 4830 + }, + { + "epoch": 0.782475143480721, + "grad_norm": 0.6448560357093811, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4840 + }, + { + "epoch": 0.7840918276614663, + "grad_norm": 0.44327953457832336, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 4850 + }, + { + "epoch": 0.7857085118422116, + "grad_norm": 0.5355411171913147, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 4860 + }, + { + "epoch": 0.7873251960229569, + "grad_norm": 0.5635677576065063, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 4870 + }, + { + "epoch": 0.7889418802037023, + "grad_norm": 0.5417491793632507, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 4880 + }, + { + "epoch": 0.7905585643844475, + "grad_norm": 0.4567430913448334, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 4890 + }, + { + "epoch": 0.7921752485651928, + "grad_norm": 0.44651296734809875, + "learning_rate": 0.0002, + "loss": 0.8454, + "step": 4900 + }, + { + "epoch": 0.7937919327459381, + "grad_norm": 0.5741217136383057, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 4910 + }, + { + "epoch": 0.7954086169266834, + "grad_norm": 0.6605045199394226, + "learning_rate": 0.0002, + "loss": 0.8093, + "step": 4920 + }, + { + "epoch": 0.7970253011074286, + "grad_norm": 0.5126531720161438, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 4930 + }, + { + "epoch": 0.798641985288174, + "grad_norm": 0.513648271560669, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 4940 + }, + { + "epoch": 0.8002586694689192, + "grad_norm": 0.5350404381752014, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 4950 + }, + { + "epoch": 0.8018753536496646, + "grad_norm": 0.5731674432754517, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 4960 + }, + { + "epoch": 0.8034920378304098, + "grad_norm": 0.5974258184432983, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 4970 + }, + { + "epoch": 0.8051087220111551, + "grad_norm": 0.8774799704551697, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 4980 + }, + { + "epoch": 0.8067254061919004, + "grad_norm": 0.5994430184364319, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 4990 + }, + { + "epoch": 0.8083420903726457, + "grad_norm": 0.4894903004169464, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 5000 + }, + { + "epoch": 0.809958774553391, + "grad_norm": 0.5218459367752075, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5010 + }, + { + "epoch": 0.8115754587341363, + "grad_norm": 0.5232468843460083, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 5020 + }, + { + "epoch": 0.8131921429148816, + "grad_norm": 0.44358372688293457, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 5030 + }, + { + "epoch": 0.8148088270956269, + "grad_norm": 0.6202037334442139, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 5040 + }, + { + "epoch": 0.8164255112763722, + "grad_norm": 0.7721474170684814, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 5050 + }, + { + "epoch": 0.8180421954571174, + "grad_norm": 0.5568501353263855, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 5060 + }, + { + "epoch": 0.8196588796378628, + "grad_norm": 0.49148809909820557, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 5070 + }, + { + "epoch": 0.821275563818608, + "grad_norm": 0.4956012964248657, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 5080 + }, + { + "epoch": 0.8228922479993533, + "grad_norm": 0.6078833937644958, + "learning_rate": 0.0002, + "loss": 0.8201, + "step": 5090 + }, + { + "epoch": 0.8245089321800986, + "grad_norm": 0.46906954050064087, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 5100 + }, + { + "epoch": 0.8261256163608439, + "grad_norm": 0.50812166929245, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 5110 + }, + { + "epoch": 0.8277423005415891, + "grad_norm": 0.5319661498069763, + "learning_rate": 0.0002, + "loss": 0.8243, + "step": 5120 + }, + { + "epoch": 0.8293589847223345, + "grad_norm": 0.4949689209461212, + "learning_rate": 0.0002, + "loss": 0.7798, + "step": 5130 + }, + { + "epoch": 0.8309756689030798, + "grad_norm": 0.5151591300964355, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 5140 + }, + { + "epoch": 0.8325923530838251, + "grad_norm": 0.5530214309692383, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 5150 + }, + { + "epoch": 0.8342090372645704, + "grad_norm": 0.6297410130500793, + "learning_rate": 0.0002, + "loss": 0.8251, + "step": 5160 + }, + { + "epoch": 0.8358257214453156, + "grad_norm": 0.5466840267181396, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 5170 + }, + { + "epoch": 0.837442405626061, + "grad_norm": 0.652913510799408, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 5180 + }, + { + "epoch": 0.8390590898068062, + "grad_norm": 0.5811293125152588, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 5190 + }, + { + "epoch": 0.8406757739875516, + "grad_norm": 0.5109550952911377, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 5200 + }, + { + "epoch": 0.8422924581682968, + "grad_norm": 0.4551706612110138, + "learning_rate": 0.0002, + "loss": 0.8077, + "step": 5210 + }, + { + "epoch": 0.8439091423490421, + "grad_norm": 0.5813754200935364, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5220 + }, + { + "epoch": 0.8455258265297874, + "grad_norm": 0.5856947898864746, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 5230 + }, + { + "epoch": 0.8471425107105327, + "grad_norm": 0.5482739210128784, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 5240 + }, + { + "epoch": 0.8487591948912779, + "grad_norm": 0.49023720622062683, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 5250 + }, + { + "epoch": 0.8503758790720233, + "grad_norm": 0.49472475051879883, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5260 + }, + { + "epoch": 0.8519925632527686, + "grad_norm": 0.5490226745605469, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 5270 + }, + { + "epoch": 0.8536092474335139, + "grad_norm": 0.5340665578842163, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 5280 + }, + { + "epoch": 0.8552259316142592, + "grad_norm": 0.5962483882904053, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 5290 + }, + { + "epoch": 0.8568426157950044, + "grad_norm": 0.586358368396759, + "learning_rate": 0.0002, + "loss": 0.8765, + "step": 5300 + }, + { + "epoch": 0.8584592999757498, + "grad_norm": 0.49120277166366577, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 5310 + }, + { + "epoch": 0.860075984156495, + "grad_norm": 0.5887332558631897, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 5320 + }, + { + "epoch": 0.8616926683372403, + "grad_norm": 0.42496153712272644, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 5330 + }, + { + "epoch": 0.8633093525179856, + "grad_norm": 0.5489874482154846, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 5340 + }, + { + "epoch": 0.8649260366987309, + "grad_norm": 0.5850813984870911, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 5350 + }, + { + "epoch": 0.8665427208794761, + "grad_norm": 0.517487108707428, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 5360 + }, + { + "epoch": 0.8681594050602215, + "grad_norm": 0.5339142680168152, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 5370 + }, + { + "epoch": 0.8697760892409668, + "grad_norm": 0.6236387491226196, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 5380 + }, + { + "epoch": 0.8713927734217121, + "grad_norm": 0.5752192735671997, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 5390 + }, + { + "epoch": 0.8730094576024574, + "grad_norm": 0.6724614500999451, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 5400 + }, + { + "epoch": 0.8746261417832026, + "grad_norm": 0.5280613303184509, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 5410 + }, + { + "epoch": 0.876242825963948, + "grad_norm": 0.44033288955688477, + "learning_rate": 0.0002, + "loss": 0.8231, + "step": 5420 + }, + { + "epoch": 0.8778595101446932, + "grad_norm": 0.5199708342552185, + "learning_rate": 0.0002, + "loss": 0.8839, + "step": 5430 + }, + { + "epoch": 0.8794761943254386, + "grad_norm": 0.46778348088264465, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 5440 + }, + { + "epoch": 0.8810928785061838, + "grad_norm": 0.4657754898071289, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 5450 + }, + { + "epoch": 0.8827095626869291, + "grad_norm": 0.5472902655601501, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 5460 + }, + { + "epoch": 0.8843262468676744, + "grad_norm": 0.4876766800880432, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 5470 + }, + { + "epoch": 0.8859429310484197, + "grad_norm": 0.5057248473167419, + "learning_rate": 0.0002, + "loss": 0.7906, + "step": 5480 + }, + { + "epoch": 0.8875596152291649, + "grad_norm": 0.4637320637702942, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 5490 + }, + { + "epoch": 0.8891762994099103, + "grad_norm": 0.471955806016922, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 5500 + }, + { + "epoch": 0.8907929835906556, + "grad_norm": 0.5209813714027405, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 5510 + }, + { + "epoch": 0.8924096677714008, + "grad_norm": 0.6213834285736084, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 5520 + }, + { + "epoch": 0.8940263519521462, + "grad_norm": 0.5215408205986023, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 5530 + }, + { + "epoch": 0.8956430361328914, + "grad_norm": 0.580478310585022, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 5540 + }, + { + "epoch": 0.8972597203136368, + "grad_norm": 0.49102169275283813, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 5550 + }, + { + "epoch": 0.898876404494382, + "grad_norm": 0.6043479442596436, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 5560 + }, + { + "epoch": 0.9004930886751273, + "grad_norm": 0.5636463165283203, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 5570 + }, + { + "epoch": 0.9021097728558726, + "grad_norm": 0.5620124340057373, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 5580 + }, + { + "epoch": 0.9037264570366179, + "grad_norm": 0.5206354856491089, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 5590 + }, + { + "epoch": 0.9053431412173631, + "grad_norm": 0.5798229575157166, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 5600 + }, + { + "epoch": 0.9069598253981085, + "grad_norm": 0.6428212523460388, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 5610 + }, + { + "epoch": 0.9085765095788537, + "grad_norm": 0.48064687848091125, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 5620 + }, + { + "epoch": 0.9101931937595991, + "grad_norm": 0.6347860097885132, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 5630 + }, + { + "epoch": 0.9118098779403444, + "grad_norm": 0.5353913307189941, + "learning_rate": 0.0002, + "loss": 0.851, + "step": 5640 + }, + { + "epoch": 0.9134265621210896, + "grad_norm": 0.5323944091796875, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 5650 + }, + { + "epoch": 0.915043246301835, + "grad_norm": 0.5261843204498291, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 5660 + }, + { + "epoch": 0.9166599304825802, + "grad_norm": 0.5451326966285706, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 5670 + }, + { + "epoch": 0.9182766146633256, + "grad_norm": 0.5183324217796326, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 5680 + }, + { + "epoch": 0.9198932988440708, + "grad_norm": 0.47229018807411194, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 5690 + }, + { + "epoch": 0.9215099830248161, + "grad_norm": 0.49180513620376587, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 5700 + }, + { + "epoch": 0.9231266672055614, + "grad_norm": 0.5419785380363464, + "learning_rate": 0.0002, + "loss": 0.8735, + "step": 5710 + }, + { + "epoch": 0.9247433513863067, + "grad_norm": 0.5408698916435242, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 5720 + }, + { + "epoch": 0.9263600355670519, + "grad_norm": 0.5286232829093933, + "learning_rate": 0.0002, + "loss": 0.7701, + "step": 5730 + }, + { + "epoch": 0.9279767197477973, + "grad_norm": 0.7539758086204529, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 5740 + }, + { + "epoch": 0.9295934039285425, + "grad_norm": 0.5166944861412048, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 5750 + }, + { + "epoch": 0.9312100881092878, + "grad_norm": 0.6601425409317017, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 5760 + }, + { + "epoch": 0.9328267722900332, + "grad_norm": 0.5029960870742798, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 5770 + }, + { + "epoch": 0.9344434564707784, + "grad_norm": 0.4926645755767822, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 5780 + }, + { + "epoch": 0.9360601406515238, + "grad_norm": 0.5739615559577942, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 5790 + }, + { + "epoch": 0.937676824832269, + "grad_norm": 0.5058279037475586, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 5800 + }, + { + "epoch": 0.9392935090130143, + "grad_norm": 0.5260962247848511, + "learning_rate": 0.0002, + "loss": 0.8537, + "step": 5810 + }, + { + "epoch": 0.9409101931937596, + "grad_norm": 0.5768588185310364, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 5820 + }, + { + "epoch": 0.9425268773745049, + "grad_norm": 0.5170126557350159, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 5830 + }, + { + "epoch": 0.9441435615552501, + "grad_norm": 0.5745864510536194, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 5840 + }, + { + "epoch": 0.9457602457359955, + "grad_norm": 0.5551357865333557, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 5850 + }, + { + "epoch": 0.9473769299167407, + "grad_norm": 0.5776078701019287, + "learning_rate": 0.0002, + "loss": 0.8529, + "step": 5860 + }, + { + "epoch": 0.9489936140974861, + "grad_norm": 0.5340062379837036, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 5870 + }, + { + "epoch": 0.9506102982782314, + "grad_norm": 0.6447290182113647, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 5880 + }, + { + "epoch": 0.9522269824589766, + "grad_norm": 0.5123815536499023, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 5890 + }, + { + "epoch": 0.953843666639722, + "grad_norm": 0.48547613620758057, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 5900 + }, + { + "epoch": 0.9554603508204672, + "grad_norm": 0.5791414976119995, + "learning_rate": 0.0002, + "loss": 0.7896, + "step": 5910 + }, + { + "epoch": 0.9570770350012126, + "grad_norm": 0.6195011734962463, + "learning_rate": 0.0002, + "loss": 0.8408, + "step": 5920 + }, + { + "epoch": 0.9586937191819578, + "grad_norm": 0.6323803067207336, + "learning_rate": 0.0002, + "loss": 0.7805, + "step": 5930 + }, + { + "epoch": 0.9603104033627031, + "grad_norm": 0.45552879571914673, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 5940 + }, + { + "epoch": 0.9619270875434484, + "grad_norm": 0.5796473622322083, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 5950 + }, + { + "epoch": 0.9635437717241937, + "grad_norm": 0.647261381149292, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 5960 + }, + { + "epoch": 0.9651604559049389, + "grad_norm": 0.5487682819366455, + "learning_rate": 0.0002, + "loss": 0.8086, + "step": 5970 + }, + { + "epoch": 0.9667771400856843, + "grad_norm": 0.5743663907051086, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 5980 + }, + { + "epoch": 0.9683938242664295, + "grad_norm": 0.5470591187477112, + "learning_rate": 0.0002, + "loss": 0.8153, + "step": 5990 + }, + { + "epoch": 0.9700105084471748, + "grad_norm": 0.5901660323143005, + "learning_rate": 0.0002, + "loss": 0.8119, + "step": 6000 + }, + { + "epoch": 0.9716271926279202, + "grad_norm": 0.6544759273529053, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 6010 + }, + { + "epoch": 0.9732438768086654, + "grad_norm": 0.6288470029830933, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 6020 + }, + { + "epoch": 0.9748605609894108, + "grad_norm": 0.673153817653656, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 6030 + }, + { + "epoch": 0.976477245170156, + "grad_norm": 0.42854753136634827, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 6040 + }, + { + "epoch": 0.9780939293509013, + "grad_norm": 0.5227066278457642, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 6050 + }, + { + "epoch": 0.9797106135316466, + "grad_norm": 0.5372416973114014, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 6060 + }, + { + "epoch": 0.9813272977123919, + "grad_norm": 0.6026402115821838, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 6070 + }, + { + "epoch": 0.9829439818931371, + "grad_norm": 0.49547791481018066, + "learning_rate": 0.0002, + "loss": 0.8347, + "step": 6080 + }, + { + "epoch": 0.9845606660738825, + "grad_norm": 0.4641951322555542, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 6090 + }, + { + "epoch": 0.9861773502546277, + "grad_norm": 0.5818535089492798, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 6100 + }, + { + "epoch": 0.9877940344353731, + "grad_norm": 0.63955157995224, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 6110 + }, + { + "epoch": 0.9894107186161183, + "grad_norm": 0.5649438500404358, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 6120 + }, + { + "epoch": 0.9910274027968636, + "grad_norm": 0.5290433168411255, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 6130 + }, + { + "epoch": 0.992644086977609, + "grad_norm": 0.6399374008178711, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 6140 + }, + { + "epoch": 0.9942607711583542, + "grad_norm": 0.6736576557159424, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 6150 + }, + { + "epoch": 0.9958774553390995, + "grad_norm": 0.515420138835907, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 6160 + }, + { + "epoch": 0.9974941395198448, + "grad_norm": 0.562677800655365, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 6170 + }, + { + "epoch": 0.9991108237005901, + "grad_norm": 0.7113858461380005, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 6180 + }, + { + "epoch": 0.9999191657909627, + "eval_loss": 1.0871200561523438, + "eval_runtime": 122.2071, + "eval_samples_per_second": 5.998, + "eval_steps_per_second": 0.753, + "step": 6185 + }, + { + "epoch": 1.0007275078813354, + "grad_norm": 0.7111801505088806, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 6190 + }, + { + "epoch": 1.0023441920620806, + "grad_norm": 0.5402125716209412, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 6200 + }, + { + "epoch": 1.003960876242826, + "grad_norm": 0.6098830103874207, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 6210 + }, + { + "epoch": 1.0055775604235713, + "grad_norm": 0.5829983353614807, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 6220 + }, + { + "epoch": 1.0071942446043165, + "grad_norm": 0.5614621043205261, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 6230 + }, + { + "epoch": 1.0088109287850617, + "grad_norm": 0.5954238772392273, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 6240 + }, + { + "epoch": 1.0104276129658072, + "grad_norm": 0.6480574607849121, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 6250 + }, + { + "epoch": 1.0120442971465524, + "grad_norm": 0.6051128506660461, + "learning_rate": 0.0002, + "loss": 0.7514, + "step": 6260 + }, + { + "epoch": 1.0136609813272976, + "grad_norm": 0.6318870782852173, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 6270 + }, + { + "epoch": 1.015277665508043, + "grad_norm": 0.5048980116844177, + "learning_rate": 0.0002, + "loss": 0.7178, + "step": 6280 + }, + { + "epoch": 1.0168943496887883, + "grad_norm": 0.6346936225891113, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 6290 + }, + { + "epoch": 1.0185110338695336, + "grad_norm": 0.5711665749549866, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 6300 + }, + { + "epoch": 1.0201277180502788, + "grad_norm": 0.5175361037254333, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 6310 + }, + { + "epoch": 1.0217444022310243, + "grad_norm": 0.5360831618309021, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 6320 + }, + { + "epoch": 1.0233610864117695, + "grad_norm": 0.614675760269165, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 6330 + }, + { + "epoch": 1.0249777705925147, + "grad_norm": 0.5626118183135986, + "learning_rate": 0.0002, + "loss": 0.7748, + "step": 6340 + }, + { + "epoch": 1.02659445477326, + "grad_norm": 0.574897289276123, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 6350 + }, + { + "epoch": 1.0282111389540054, + "grad_norm": 0.7185447812080383, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 6360 + }, + { + "epoch": 1.0298278231347506, + "grad_norm": 0.6705799698829651, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 6370 + }, + { + "epoch": 1.0314445073154959, + "grad_norm": 0.6740428805351257, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 6380 + }, + { + "epoch": 1.0330611914962413, + "grad_norm": 0.663902759552002, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 6390 + }, + { + "epoch": 1.0346778756769865, + "grad_norm": 0.5029543042182922, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 6400 + }, + { + "epoch": 1.0362945598577318, + "grad_norm": 0.7813863158226013, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 6410 + }, + { + "epoch": 1.037911244038477, + "grad_norm": 0.5396282076835632, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 6420 + }, + { + "epoch": 1.0395279282192225, + "grad_norm": 0.5253293514251709, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 6430 + }, + { + "epoch": 1.0411446123999677, + "grad_norm": 0.7236770987510681, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 6440 + }, + { + "epoch": 1.042761296580713, + "grad_norm": 0.5670917630195618, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 6450 + }, + { + "epoch": 1.0443779807614582, + "grad_norm": 0.6031978726387024, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 6460 + }, + { + "epoch": 1.0459946649422036, + "grad_norm": 0.5309213399887085, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 6470 + }, + { + "epoch": 1.0476113491229488, + "grad_norm": 0.7114651799201965, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 6480 + }, + { + "epoch": 1.049228033303694, + "grad_norm": 0.5591610670089722, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 6490 + }, + { + "epoch": 1.0508447174844395, + "grad_norm": 0.5185961127281189, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 6500 + }, + { + "epoch": 1.0524614016651848, + "grad_norm": 0.6510552167892456, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 6510 + }, + { + "epoch": 1.05407808584593, + "grad_norm": 0.6557928919792175, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 6520 + }, + { + "epoch": 1.0556947700266752, + "grad_norm": 0.6973192691802979, + "learning_rate": 0.0002, + "loss": 0.8056, + "step": 6530 + }, + { + "epoch": 1.0573114542074207, + "grad_norm": 0.6226583123207092, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 6540 + }, + { + "epoch": 1.058928138388166, + "grad_norm": 0.5633195638656616, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 6550 + }, + { + "epoch": 1.0605448225689111, + "grad_norm": 0.7466658353805542, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 6560 + }, + { + "epoch": 1.0621615067496564, + "grad_norm": 0.6462772488594055, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 6570 + }, + { + "epoch": 1.0637781909304018, + "grad_norm": 0.5266856551170349, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 6580 + }, + { + "epoch": 1.065394875111147, + "grad_norm": 0.534392774105072, + "learning_rate": 0.0002, + "loss": 0.7157, + "step": 6590 + }, + { + "epoch": 1.0670115592918923, + "grad_norm": 0.7514177560806274, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 6600 + }, + { + "epoch": 1.0686282434726375, + "grad_norm": 0.7593035697937012, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 6610 + }, + { + "epoch": 1.070244927653383, + "grad_norm": 0.5277858972549438, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 6620 + }, + { + "epoch": 1.0718616118341282, + "grad_norm": 0.5573670268058777, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 6630 + }, + { + "epoch": 1.0734782960148734, + "grad_norm": 0.6802396774291992, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 6640 + }, + { + "epoch": 1.0750949801956189, + "grad_norm": 0.7367215752601624, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 6650 + }, + { + "epoch": 1.0767116643763641, + "grad_norm": 0.5961891412734985, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 6660 + }, + { + "epoch": 1.0783283485571094, + "grad_norm": 0.5736313462257385, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 6670 + }, + { + "epoch": 1.0799450327378546, + "grad_norm": 0.619219183921814, + "learning_rate": 0.0002, + "loss": 0.7178, + "step": 6680 + }, + { + "epoch": 1.0815617169186, + "grad_norm": 0.6214390993118286, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 6690 + }, + { + "epoch": 1.0831784010993453, + "grad_norm": 0.564536988735199, + "learning_rate": 0.0002, + "loss": 0.7554, + "step": 6700 + }, + { + "epoch": 1.0847950852800905, + "grad_norm": 0.5838140249252319, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 6710 + }, + { + "epoch": 1.0864117694608357, + "grad_norm": 0.7000553607940674, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 6720 + }, + { + "epoch": 1.0880284536415812, + "grad_norm": 0.7078263759613037, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 6730 + }, + { + "epoch": 1.0896451378223264, + "grad_norm": 0.8353848457336426, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 6740 + }, + { + "epoch": 1.0912618220030716, + "grad_norm": 0.5615518689155579, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 6750 + }, + { + "epoch": 1.0928785061838169, + "grad_norm": 0.5475581288337708, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 6760 + }, + { + "epoch": 1.0944951903645623, + "grad_norm": 0.5835978388786316, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 6770 + }, + { + "epoch": 1.0961118745453076, + "grad_norm": 0.5516105890274048, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 6780 + }, + { + "epoch": 1.0977285587260528, + "grad_norm": 0.5875251889228821, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 6790 + }, + { + "epoch": 1.0993452429067982, + "grad_norm": 0.7376947999000549, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 6800 + }, + { + "epoch": 1.1009619270875435, + "grad_norm": 0.5656165480613708, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 6810 + }, + { + "epoch": 1.1025786112682887, + "grad_norm": 0.6365954279899597, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 6820 + }, + { + "epoch": 1.104195295449034, + "grad_norm": 0.5033080577850342, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 6830 + }, + { + "epoch": 1.1058119796297794, + "grad_norm": 0.617396891117096, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 6840 + }, + { + "epoch": 1.1074286638105246, + "grad_norm": 0.6395374536514282, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 6850 + }, + { + "epoch": 1.1090453479912699, + "grad_norm": 0.6775295734405518, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 6860 + }, + { + "epoch": 1.1106620321720153, + "grad_norm": 0.6655223965644836, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 6870 + }, + { + "epoch": 1.1122787163527605, + "grad_norm": 0.676655113697052, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 6880 + }, + { + "epoch": 1.1138954005335058, + "grad_norm": 0.6062718629837036, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 6890 + }, + { + "epoch": 1.115512084714251, + "grad_norm": 0.590943455696106, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 6900 + }, + { + "epoch": 1.1171287688949965, + "grad_norm": 0.6315317153930664, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 6910 + }, + { + "epoch": 1.1187454530757417, + "grad_norm": 0.47979024052619934, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 6920 + }, + { + "epoch": 1.120362137256487, + "grad_norm": 0.647298276424408, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 6930 + }, + { + "epoch": 1.1219788214372322, + "grad_norm": 0.7336484789848328, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 6940 + }, + { + "epoch": 1.1235955056179776, + "grad_norm": 0.5071424245834351, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 6950 + }, + { + "epoch": 1.1252121897987228, + "grad_norm": 0.6527144312858582, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 6960 + }, + { + "epoch": 1.126828873979468, + "grad_norm": 0.6935935020446777, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 6970 + }, + { + "epoch": 1.1284455581602133, + "grad_norm": 0.8026931881904602, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 6980 + }, + { + "epoch": 1.1300622423409588, + "grad_norm": 0.5210393667221069, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 6990 + }, + { + "epoch": 1.131678926521704, + "grad_norm": 0.60475093126297, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 7000 + }, + { + "epoch": 1.1332956107024492, + "grad_norm": 0.6417073607444763, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 7010 + }, + { + "epoch": 1.1349122948831947, + "grad_norm": 0.6732175946235657, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 7020 + }, + { + "epoch": 1.13652897906394, + "grad_norm": 0.6719491481781006, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 7030 + }, + { + "epoch": 1.1381456632446851, + "grad_norm": 0.5708295106887817, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 7040 + }, + { + "epoch": 1.1397623474254304, + "grad_norm": 0.7141719460487366, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 7050 + }, + { + "epoch": 1.1413790316061758, + "grad_norm": 0.6187017560005188, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 7060 + }, + { + "epoch": 1.142995715786921, + "grad_norm": 0.50581294298172, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 7070 + }, + { + "epoch": 1.1446123999676663, + "grad_norm": 0.5620143413543701, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 7080 + }, + { + "epoch": 1.1462290841484115, + "grad_norm": 0.6231929659843445, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 7090 + }, + { + "epoch": 1.147845768329157, + "grad_norm": 0.5775774121284485, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 7100 + }, + { + "epoch": 1.1494624525099022, + "grad_norm": 0.6492809653282166, + "learning_rate": 0.0002, + "loss": 0.7728, + "step": 7110 + }, + { + "epoch": 1.1510791366906474, + "grad_norm": 0.6434972286224365, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 7120 + }, + { + "epoch": 1.1526958208713927, + "grad_norm": 0.6191812753677368, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 7130 + }, + { + "epoch": 1.1543125050521381, + "grad_norm": 0.6690331697463989, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 7140 + }, + { + "epoch": 1.1559291892328833, + "grad_norm": 0.5977938175201416, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 7150 + }, + { + "epoch": 1.1575458734136286, + "grad_norm": 0.6195854544639587, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 7160 + }, + { + "epoch": 1.159162557594374, + "grad_norm": 0.5752048492431641, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 7170 + }, + { + "epoch": 1.1607792417751193, + "grad_norm": 0.589081883430481, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 7180 + }, + { + "epoch": 1.1623959259558645, + "grad_norm": 0.756996750831604, + "learning_rate": 0.0002, + "loss": 0.704, + "step": 7190 + }, + { + "epoch": 1.1640126101366097, + "grad_norm": 0.7614967226982117, + "learning_rate": 0.0002, + "loss": 0.7404, + "step": 7200 + }, + { + "epoch": 1.1656292943173552, + "grad_norm": 0.6120437979698181, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 7210 + }, + { + "epoch": 1.1672459784981004, + "grad_norm": 0.6210004687309265, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 7220 + }, + { + "epoch": 1.1688626626788456, + "grad_norm": 0.6044116020202637, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 7230 + }, + { + "epoch": 1.170479346859591, + "grad_norm": 0.5418457388877869, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 7240 + }, + { + "epoch": 1.1720960310403363, + "grad_norm": 0.6413537263870239, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 7250 + }, + { + "epoch": 1.1737127152210816, + "grad_norm": 0.5777867436408997, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 7260 + }, + { + "epoch": 1.1753293994018268, + "grad_norm": 0.7092402577400208, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 7270 + }, + { + "epoch": 1.176946083582572, + "grad_norm": 0.6351709365844727, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 7280 + }, + { + "epoch": 1.1785627677633175, + "grad_norm": 0.6172189712524414, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 7290 + }, + { + "epoch": 1.1801794519440627, + "grad_norm": 0.6801714897155762, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 7300 + }, + { + "epoch": 1.181796136124808, + "grad_norm": 0.6044712066650391, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 7310 + }, + { + "epoch": 1.1834128203055534, + "grad_norm": 0.7413212060928345, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 7320 + }, + { + "epoch": 1.1850295044862986, + "grad_norm": 0.5303856134414673, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 7330 + }, + { + "epoch": 1.1866461886670439, + "grad_norm": 0.5647098422050476, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 7340 + }, + { + "epoch": 1.188262872847789, + "grad_norm": 0.7374135255813599, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 7350 + }, + { + "epoch": 1.1898795570285345, + "grad_norm": 0.5710089206695557, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 7360 + }, + { + "epoch": 1.1914962412092798, + "grad_norm": 0.6073619723320007, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7370 + }, + { + "epoch": 1.193112925390025, + "grad_norm": 0.5899916887283325, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 7380 + }, + { + "epoch": 1.1947296095707705, + "grad_norm": 0.7762434482574463, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 7390 + }, + { + "epoch": 1.1963462937515157, + "grad_norm": 0.679949939250946, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 7400 + }, + { + "epoch": 1.197962977932261, + "grad_norm": 0.6106849312782288, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 7410 + }, + { + "epoch": 1.1995796621130062, + "grad_norm": 0.682461678981781, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 7420 + }, + { + "epoch": 1.2011963462937516, + "grad_norm": 0.6087017059326172, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 7430 + }, + { + "epoch": 1.2028130304744968, + "grad_norm": 0.63739013671875, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 7440 + }, + { + "epoch": 1.204429714655242, + "grad_norm": 0.6154777407646179, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 7450 + }, + { + "epoch": 1.2060463988359873, + "grad_norm": 0.7491534948348999, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 7460 + }, + { + "epoch": 1.2076630830167328, + "grad_norm": 0.6664797067642212, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 7470 + }, + { + "epoch": 1.209279767197478, + "grad_norm": 0.6660266518592834, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 7480 + }, + { + "epoch": 1.2108964513782232, + "grad_norm": 0.6972551345825195, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 7490 + }, + { + "epoch": 1.2125131355589684, + "grad_norm": 0.6157945990562439, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 7500 + }, + { + "epoch": 1.214129819739714, + "grad_norm": 0.5199310183525085, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 7510 + }, + { + "epoch": 1.2157465039204591, + "grad_norm": 0.577610433101654, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 7520 + }, + { + "epoch": 1.2173631881012044, + "grad_norm": 0.53652423620224, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 7530 + }, + { + "epoch": 1.2189798722819498, + "grad_norm": 0.6479050517082214, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 7540 + }, + { + "epoch": 1.220596556462695, + "grad_norm": 0.618748128414154, + "learning_rate": 0.0002, + "loss": 0.7534, + "step": 7550 + }, + { + "epoch": 1.2222132406434403, + "grad_norm": 0.6311424374580383, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 7560 + }, + { + "epoch": 1.2238299248241855, + "grad_norm": 0.6595825552940369, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 7570 + }, + { + "epoch": 1.225446609004931, + "grad_norm": 0.5198960900306702, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 7580 + }, + { + "epoch": 1.2270632931856762, + "grad_norm": 0.578650712966919, + "learning_rate": 0.0002, + "loss": 0.674, + "step": 7590 + }, + { + "epoch": 1.2286799773664214, + "grad_norm": 0.6080220937728882, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 7600 + }, + { + "epoch": 1.2302966615471669, + "grad_norm": 0.7050248384475708, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 7610 + }, + { + "epoch": 1.2319133457279121, + "grad_norm": 0.6652196049690247, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 7620 + }, + { + "epoch": 1.2335300299086573, + "grad_norm": 0.7322776317596436, + "learning_rate": 0.0002, + "loss": 0.7085, + "step": 7630 + }, + { + "epoch": 1.2351467140894026, + "grad_norm": 0.4998728036880493, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 7640 + }, + { + "epoch": 1.2367633982701478, + "grad_norm": 0.6428788900375366, + "learning_rate": 0.0002, + "loss": 0.7214, + "step": 7650 + }, + { + "epoch": 1.2383800824508933, + "grad_norm": 0.585242509841919, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 7660 + }, + { + "epoch": 1.2399967666316385, + "grad_norm": 0.5211917757987976, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 7670 + }, + { + "epoch": 1.2416134508123837, + "grad_norm": 0.6490384340286255, + "learning_rate": 0.0002, + "loss": 0.746, + "step": 7680 + }, + { + "epoch": 1.2432301349931292, + "grad_norm": 0.6249763369560242, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 7690 + }, + { + "epoch": 1.2448468191738744, + "grad_norm": 0.71870356798172, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 7700 + }, + { + "epoch": 1.2464635033546196, + "grad_norm": 0.6761967539787292, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 7710 + }, + { + "epoch": 1.2480801875353649, + "grad_norm": 0.6500617265701294, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 7720 + }, + { + "epoch": 1.2496968717161103, + "grad_norm": 0.8069869875907898, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 7730 + }, + { + "epoch": 1.2513135558968556, + "grad_norm": 0.6044608950614929, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 7740 + }, + { + "epoch": 1.2529302400776008, + "grad_norm": 0.6573283076286316, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 7750 + }, + { + "epoch": 1.2545469242583462, + "grad_norm": 0.625430166721344, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 7760 + }, + { + "epoch": 1.2561636084390915, + "grad_norm": 0.5442022681236267, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 7770 + }, + { + "epoch": 1.2577802926198367, + "grad_norm": 0.6818386912345886, + "learning_rate": 0.0002, + "loss": 0.7172, + "step": 7780 + }, + { + "epoch": 1.259396976800582, + "grad_norm": 0.6381874084472656, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 7790 + }, + { + "epoch": 1.2610136609813272, + "grad_norm": 0.6269212961196899, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 7800 + }, + { + "epoch": 1.2626303451620726, + "grad_norm": 0.600121259689331, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 7810 + }, + { + "epoch": 1.2642470293428179, + "grad_norm": 0.6337703466415405, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 7820 + }, + { + "epoch": 1.2658637135235633, + "grad_norm": 0.7234963774681091, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 7830 + }, + { + "epoch": 1.2674803977043085, + "grad_norm": 0.800184965133667, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 7840 + }, + { + "epoch": 1.2690970818850538, + "grad_norm": 0.7539464831352234, + "learning_rate": 0.0002, + "loss": 0.7426, + "step": 7850 + }, + { + "epoch": 1.270713766065799, + "grad_norm": 0.5493760704994202, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 7860 + }, + { + "epoch": 1.2723304502465442, + "grad_norm": 0.7477145791053772, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 7870 + }, + { + "epoch": 1.2739471344272897, + "grad_norm": 0.6366362571716309, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 7880 + }, + { + "epoch": 1.275563818608035, + "grad_norm": 0.7419533729553223, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 7890 + }, + { + "epoch": 1.2771805027887801, + "grad_norm": 0.6141223311424255, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 7900 + }, + { + "epoch": 1.2787971869695256, + "grad_norm": 0.7522598505020142, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 7910 + }, + { + "epoch": 1.2804138711502708, + "grad_norm": 0.6935804486274719, + "learning_rate": 0.0002, + "loss": 0.7097, + "step": 7920 + }, + { + "epoch": 1.282030555331016, + "grad_norm": 0.7239290475845337, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 7930 + }, + { + "epoch": 1.2836472395117613, + "grad_norm": 0.8800187110900879, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 7940 + }, + { + "epoch": 1.2852639236925067, + "grad_norm": 0.540458083152771, + "learning_rate": 0.0002, + "loss": 0.6991, + "step": 7950 + }, + { + "epoch": 1.286880607873252, + "grad_norm": 0.6492934226989746, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 7960 + }, + { + "epoch": 1.2884972920539972, + "grad_norm": 0.6543959379196167, + "learning_rate": 0.0002, + "loss": 0.7742, + "step": 7970 + }, + { + "epoch": 1.2901139762347427, + "grad_norm": 0.5804705619812012, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 7980 + }, + { + "epoch": 1.291730660415488, + "grad_norm": 0.7074727416038513, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 7990 + }, + { + "epoch": 1.2933473445962331, + "grad_norm": 0.5347974300384521, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 8000 + }, + { + "epoch": 1.2949640287769784, + "grad_norm": 0.6457298398017883, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 8010 + }, + { + "epoch": 1.2965807129577236, + "grad_norm": 0.6407219171524048, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 8020 + }, + { + "epoch": 1.298197397138469, + "grad_norm": 0.828439474105835, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 8030 + }, + { + "epoch": 1.2998140813192143, + "grad_norm": 0.4840380549430847, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 8040 + }, + { + "epoch": 1.3014307654999595, + "grad_norm": 0.5921024680137634, + "learning_rate": 0.0002, + "loss": 0.7283, + "step": 8050 + }, + { + "epoch": 1.303047449680705, + "grad_norm": 0.6170315146446228, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 8060 + }, + { + "epoch": 1.3046641338614502, + "grad_norm": 0.5374847054481506, + "learning_rate": 0.0002, + "loss": 0.7534, + "step": 8070 + }, + { + "epoch": 1.3062808180421954, + "grad_norm": 0.545758068561554, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 8080 + }, + { + "epoch": 1.3078975022229407, + "grad_norm": 0.55641770362854, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 8090 + }, + { + "epoch": 1.309514186403686, + "grad_norm": 0.6724897027015686, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8100 + }, + { + "epoch": 1.3111308705844313, + "grad_norm": 0.6923972368240356, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 8110 + }, + { + "epoch": 1.3127475547651766, + "grad_norm": 0.5136841535568237, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 8120 + }, + { + "epoch": 1.314364238945922, + "grad_norm": 0.6766283512115479, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 8130 + }, + { + "epoch": 1.3159809231266673, + "grad_norm": 0.6283926367759705, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 8140 + }, + { + "epoch": 1.3175976073074125, + "grad_norm": 0.644216001033783, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 8150 + }, + { + "epoch": 1.3192142914881577, + "grad_norm": 0.7827503085136414, + "learning_rate": 0.0002, + "loss": 0.7125, + "step": 8160 + }, + { + "epoch": 1.320830975668903, + "grad_norm": 0.6651390790939331, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 8170 + }, + { + "epoch": 1.3224476598496484, + "grad_norm": 0.5547412633895874, + "learning_rate": 0.0002, + "loss": 0.7778, + "step": 8180 + }, + { + "epoch": 1.3240643440303936, + "grad_norm": 0.6765179634094238, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 8190 + }, + { + "epoch": 1.325681028211139, + "grad_norm": 0.6822077035903931, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 8200 + }, + { + "epoch": 1.3272977123918843, + "grad_norm": 0.5941002368927002, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 8210 + }, + { + "epoch": 1.3289143965726296, + "grad_norm": 0.4850037097930908, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 8220 + }, + { + "epoch": 1.3305310807533748, + "grad_norm": 0.6162990927696228, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 8230 + }, + { + "epoch": 1.33214776493412, + "grad_norm": 0.6665613651275635, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 8240 + }, + { + "epoch": 1.3337644491148655, + "grad_norm": 0.618192732334137, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 8250 + }, + { + "epoch": 1.3353811332956107, + "grad_norm": 0.710418701171875, + "learning_rate": 0.0002, + "loss": 0.7532, + "step": 8260 + }, + { + "epoch": 1.336997817476356, + "grad_norm": 0.5109876990318298, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 8270 + }, + { + "epoch": 1.3386145016571014, + "grad_norm": 0.6791711449623108, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 8280 + }, + { + "epoch": 1.3402311858378466, + "grad_norm": 0.6836432814598083, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8290 + }, + { + "epoch": 1.3418478700185918, + "grad_norm": 0.5579386353492737, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8300 + }, + { + "epoch": 1.343464554199337, + "grad_norm": 0.6713546514511108, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 8310 + }, + { + "epoch": 1.3450812383800825, + "grad_norm": 0.5353720188140869, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 8320 + }, + { + "epoch": 1.3466979225608278, + "grad_norm": 0.5813682675361633, + "learning_rate": 0.0002, + "loss": 0.718, + "step": 8330 + }, + { + "epoch": 1.348314606741573, + "grad_norm": 0.8158791661262512, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 8340 + }, + { + "epoch": 1.3499312909223184, + "grad_norm": 0.6193785071372986, + "learning_rate": 0.0002, + "loss": 0.6992, + "step": 8350 + }, + { + "epoch": 1.3515479751030637, + "grad_norm": 0.6353939771652222, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 8360 + }, + { + "epoch": 1.353164659283809, + "grad_norm": 0.6925048232078552, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 8370 + }, + { + "epoch": 1.3547813434645541, + "grad_norm": 0.988264799118042, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 8380 + }, + { + "epoch": 1.3563980276452994, + "grad_norm": 0.6476002931594849, + "learning_rate": 0.0002, + "loss": 0.7744, + "step": 8390 + }, + { + "epoch": 1.3580147118260448, + "grad_norm": 0.7120398879051208, + "learning_rate": 0.0002, + "loss": 0.776, + "step": 8400 + }, + { + "epoch": 1.35963139600679, + "grad_norm": 0.9048416614532471, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 8410 + }, + { + "epoch": 1.3612480801875353, + "grad_norm": 0.7000672817230225, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 8420 + }, + { + "epoch": 1.3628647643682807, + "grad_norm": 0.6015632152557373, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 8430 + }, + { + "epoch": 1.364481448549026, + "grad_norm": 0.612516462802887, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 8440 + }, + { + "epoch": 1.3660981327297712, + "grad_norm": 0.5969301462173462, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 8450 + }, + { + "epoch": 1.3677148169105164, + "grad_norm": 0.6730654239654541, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 8460 + }, + { + "epoch": 1.369331501091262, + "grad_norm": 0.6386392116546631, + "learning_rate": 0.0002, + "loss": 0.7465, + "step": 8470 + }, + { + "epoch": 1.3709481852720071, + "grad_norm": 0.739544153213501, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 8480 + }, + { + "epoch": 1.3725648694527524, + "grad_norm": 0.6462782621383667, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 8490 + }, + { + "epoch": 1.3741815536334978, + "grad_norm": 0.7346843481063843, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 8500 + }, + { + "epoch": 1.375798237814243, + "grad_norm": 0.6884821057319641, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 8510 + }, + { + "epoch": 1.3774149219949883, + "grad_norm": 0.6999333500862122, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 8520 + }, + { + "epoch": 1.3790316061757335, + "grad_norm": 0.5378713011741638, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 8530 + }, + { + "epoch": 1.3806482903564787, + "grad_norm": 0.5417906641960144, + "learning_rate": 0.0002, + "loss": 0.6797, + "step": 8540 + }, + { + "epoch": 1.3822649745372242, + "grad_norm": 0.6602526307106018, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 8550 + }, + { + "epoch": 1.3838816587179694, + "grad_norm": 0.7073674201965332, + "learning_rate": 0.0002, + "loss": 0.7356, + "step": 8560 + }, + { + "epoch": 1.3854983428987149, + "grad_norm": 0.5841707587242126, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 8570 + }, + { + "epoch": 1.38711502707946, + "grad_norm": 0.7031095027923584, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 8580 + }, + { + "epoch": 1.3887317112602053, + "grad_norm": 0.5198570489883423, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 8590 + }, + { + "epoch": 1.3903483954409506, + "grad_norm": 0.7261320352554321, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 8600 + }, + { + "epoch": 1.3919650796216958, + "grad_norm": 0.5616350173950195, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 8610 + }, + { + "epoch": 1.3935817638024413, + "grad_norm": 0.5185914635658264, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 8620 + }, + { + "epoch": 1.3951984479831865, + "grad_norm": 0.5814694762229919, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 8630 + }, + { + "epoch": 1.3968151321639317, + "grad_norm": 0.6977371573448181, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 8640 + }, + { + "epoch": 1.3984318163446772, + "grad_norm": 0.6855689883232117, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 8650 + }, + { + "epoch": 1.4000485005254224, + "grad_norm": 0.5414357781410217, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 8660 + }, + { + "epoch": 1.4016651847061676, + "grad_norm": 0.6970012784004211, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 8670 + }, + { + "epoch": 1.4032818688869129, + "grad_norm": 0.526079535484314, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 8680 + }, + { + "epoch": 1.404898553067658, + "grad_norm": 0.758712887763977, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 8690 + }, + { + "epoch": 1.4065152372484035, + "grad_norm": 0.7118762731552124, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 8700 + }, + { + "epoch": 1.4081319214291488, + "grad_norm": 0.5696909427642822, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 8710 + }, + { + "epoch": 1.4097486056098942, + "grad_norm": 0.7995436787605286, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 8720 + }, + { + "epoch": 1.4113652897906395, + "grad_norm": 0.7237521409988403, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 8730 + }, + { + "epoch": 1.4129819739713847, + "grad_norm": 0.744628369808197, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 8740 + }, + { + "epoch": 1.41459865815213, + "grad_norm": 0.6082926988601685, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 8750 + }, + { + "epoch": 1.4162153423328752, + "grad_norm": 0.5185243487358093, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 8760 + }, + { + "epoch": 1.4178320265136206, + "grad_norm": 0.5183082222938538, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 8770 + }, + { + "epoch": 1.4194487106943658, + "grad_norm": 0.7326041460037231, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 8780 + }, + { + "epoch": 1.421065394875111, + "grad_norm": 0.7174660563468933, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 8790 + }, + { + "epoch": 1.4226820790558565, + "grad_norm": 0.8080165982246399, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 8800 + }, + { + "epoch": 1.4242987632366018, + "grad_norm": 0.5061507821083069, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 8810 + }, + { + "epoch": 1.425915447417347, + "grad_norm": 0.801602840423584, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 8820 + }, + { + "epoch": 1.4275321315980922, + "grad_norm": 0.6150273084640503, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 8830 + }, + { + "epoch": 1.4291488157788377, + "grad_norm": 0.8786525726318359, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 8840 + }, + { + "epoch": 1.430765499959583, + "grad_norm": 0.6371538639068604, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 8850 + }, + { + "epoch": 1.4323821841403281, + "grad_norm": 0.6409295797348022, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 8860 + }, + { + "epoch": 1.4339988683210736, + "grad_norm": 0.6452359557151794, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 8870 + }, + { + "epoch": 1.4356155525018188, + "grad_norm": 0.5842334628105164, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 8880 + }, + { + "epoch": 1.437232236682564, + "grad_norm": 0.696761965751648, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 8890 + }, + { + "epoch": 1.4388489208633093, + "grad_norm": 0.6384600400924683, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 8900 + }, + { + "epoch": 1.4404656050440545, + "grad_norm": 0.5981136560440063, + "learning_rate": 0.0002, + "loss": 0.7049, + "step": 8910 + }, + { + "epoch": 1.4420822892248, + "grad_norm": 0.6355637907981873, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 8920 + }, + { + "epoch": 1.4436989734055452, + "grad_norm": 0.6374830603599548, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 8930 + }, + { + "epoch": 1.4453156575862904, + "grad_norm": 0.559013307094574, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 8940 + }, + { + "epoch": 1.446932341767036, + "grad_norm": 0.7289170026779175, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 8950 + }, + { + "epoch": 1.4485490259477811, + "grad_norm": 0.8649206757545471, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 8960 + }, + { + "epoch": 1.4501657101285264, + "grad_norm": 0.7664689421653748, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 8970 + }, + { + "epoch": 1.4517823943092716, + "grad_norm": 0.7109952569007874, + "learning_rate": 0.0002, + "loss": 0.7431, + "step": 8980 + }, + { + "epoch": 1.453399078490017, + "grad_norm": 0.6312844753265381, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 8990 + }, + { + "epoch": 1.4550157626707623, + "grad_norm": 0.6616617441177368, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 9000 + }, + { + "epoch": 1.4566324468515075, + "grad_norm": 0.7384068965911865, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 9010 + }, + { + "epoch": 1.458249131032253, + "grad_norm": 0.6549670100212097, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 9020 + }, + { + "epoch": 1.4598658152129982, + "grad_norm": 0.6254119277000427, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 9030 + }, + { + "epoch": 1.4614824993937434, + "grad_norm": 0.6806328892707825, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 9040 + }, + { + "epoch": 1.4630991835744886, + "grad_norm": 0.6803115010261536, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 9050 + }, + { + "epoch": 1.4647158677552339, + "grad_norm": 0.48529282212257385, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 9060 + }, + { + "epoch": 1.4663325519359793, + "grad_norm": 0.5995030999183655, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 9070 + }, + { + "epoch": 1.4679492361167246, + "grad_norm": 0.6005427837371826, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 9080 + }, + { + "epoch": 1.46956592029747, + "grad_norm": 0.718564510345459, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 9090 + }, + { + "epoch": 1.4711826044782153, + "grad_norm": 0.7003577351570129, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 9100 + }, + { + "epoch": 1.4727992886589605, + "grad_norm": 0.5888323783874512, + "learning_rate": 0.0002, + "loss": 0.8069, + "step": 9110 + }, + { + "epoch": 1.4744159728397057, + "grad_norm": 0.6417609453201294, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 9120 + }, + { + "epoch": 1.476032657020451, + "grad_norm": 0.572294294834137, + "learning_rate": 0.0002, + "loss": 0.7441, + "step": 9130 + }, + { + "epoch": 1.4776493412011964, + "grad_norm": 0.8200714588165283, + "learning_rate": 0.0002, + "loss": 0.8053, + "step": 9140 + }, + { + "epoch": 1.4792660253819416, + "grad_norm": 0.6343288421630859, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 9150 + }, + { + "epoch": 1.4808827095626869, + "grad_norm": 0.7017961144447327, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 9160 + }, + { + "epoch": 1.4824993937434323, + "grad_norm": 0.6202912926673889, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 9170 + }, + { + "epoch": 1.4841160779241775, + "grad_norm": 0.6677869558334351, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 9180 + }, + { + "epoch": 1.4857327621049228, + "grad_norm": 0.6052267551422119, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 9190 + }, + { + "epoch": 1.487349446285668, + "grad_norm": 0.6638872027397156, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 9200 + }, + { + "epoch": 1.4889661304664135, + "grad_norm": 0.6245523691177368, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 9210 + }, + { + "epoch": 1.4905828146471587, + "grad_norm": 0.5761767625808716, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 9220 + }, + { + "epoch": 1.492199498827904, + "grad_norm": 0.8175981640815735, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 9230 + }, + { + "epoch": 1.4938161830086494, + "grad_norm": 0.9144009947776794, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 9240 + }, + { + "epoch": 1.4954328671893946, + "grad_norm": 0.5742552876472473, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 9250 + }, + { + "epoch": 1.4970495513701398, + "grad_norm": 0.534534215927124, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 9260 + }, + { + "epoch": 1.498666235550885, + "grad_norm": 0.7836225032806396, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 9270 + }, + { + "epoch": 1.5002829197316303, + "grad_norm": 0.5292993187904358, + "learning_rate": 0.0002, + "loss": 0.7453, + "step": 9280 + }, + { + "epoch": 1.5018996039123758, + "grad_norm": 0.8044071793556213, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 9290 + }, + { + "epoch": 1.503516288093121, + "grad_norm": 0.6185805201530457, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 9300 + }, + { + "epoch": 1.5051329722738664, + "grad_norm": 0.6093607544898987, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 9310 + }, + { + "epoch": 1.5067496564546117, + "grad_norm": 0.5891730189323425, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 9320 + }, + { + "epoch": 1.508366340635357, + "grad_norm": 0.6331129670143127, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 9330 + }, + { + "epoch": 1.5099830248161021, + "grad_norm": 0.7690958380699158, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 9340 + }, + { + "epoch": 1.5115997089968474, + "grad_norm": 0.6548877358436584, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 9350 + }, + { + "epoch": 1.5132163931775926, + "grad_norm": 0.6545143127441406, + "learning_rate": 0.0002, + "loss": 0.7408, + "step": 9360 + }, + { + "epoch": 1.514833077358338, + "grad_norm": 0.553247332572937, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 9370 + }, + { + "epoch": 1.5164497615390833, + "grad_norm": 0.8145074844360352, + "learning_rate": 0.0002, + "loss": 0.7265, + "step": 9380 + }, + { + "epoch": 1.5180664457198287, + "grad_norm": 0.7636994123458862, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 9390 + }, + { + "epoch": 1.519683129900574, + "grad_norm": 0.6838982701301575, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 9400 + }, + { + "epoch": 1.5212998140813192, + "grad_norm": 0.8599441647529602, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 9410 + }, + { + "epoch": 1.5229164982620644, + "grad_norm": 0.7020329833030701, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 9420 + }, + { + "epoch": 1.5245331824428097, + "grad_norm": 0.6964772343635559, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 9430 + }, + { + "epoch": 1.5261498666235551, + "grad_norm": 0.6916600465774536, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 9440 + }, + { + "epoch": 1.5277665508043003, + "grad_norm": 0.7282621264457703, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 9450 + }, + { + "epoch": 1.5293832349850458, + "grad_norm": 0.5363983511924744, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 9460 + }, + { + "epoch": 1.530999919165791, + "grad_norm": 0.6184861063957214, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 9470 + }, + { + "epoch": 1.5326166033465363, + "grad_norm": 0.5991285443305969, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 9480 + }, + { + "epoch": 1.5342332875272815, + "grad_norm": 0.8176587820053101, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 9490 + }, + { + "epoch": 1.5358499717080267, + "grad_norm": 0.6473721861839294, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 9500 + }, + { + "epoch": 1.5374666558887722, + "grad_norm": 0.7319952845573425, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 9510 + }, + { + "epoch": 1.5390833400695174, + "grad_norm": 0.702900230884552, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 9520 + }, + { + "epoch": 1.5407000242502629, + "grad_norm": 0.7971600294113159, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 9530 + }, + { + "epoch": 1.542316708431008, + "grad_norm": 0.6527525186538696, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 9540 + }, + { + "epoch": 1.5439333926117533, + "grad_norm": 0.5791676044464111, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 9550 + }, + { + "epoch": 1.5455500767924986, + "grad_norm": 0.5619390606880188, + "learning_rate": 0.0002, + "loss": 0.7585, + "step": 9560 + }, + { + "epoch": 1.5471667609732438, + "grad_norm": 0.5701689124107361, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 9570 + }, + { + "epoch": 1.548783445153989, + "grad_norm": 0.47549352049827576, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 9580 + }, + { + "epoch": 1.5504001293347345, + "grad_norm": 0.8730611205101013, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 9590 + }, + { + "epoch": 1.5520168135154797, + "grad_norm": 0.6842091083526611, + "learning_rate": 0.0002, + "loss": 0.798, + "step": 9600 + }, + { + "epoch": 1.5536334976962252, + "grad_norm": 0.6675129532814026, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 9610 + }, + { + "epoch": 1.5552501818769704, + "grad_norm": 0.8173956274986267, + "learning_rate": 0.0002, + "loss": 0.7954, + "step": 9620 + }, + { + "epoch": 1.5568668660577156, + "grad_norm": 0.724947452545166, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 9630 + }, + { + "epoch": 1.5584835502384609, + "grad_norm": 0.6154758930206299, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 9640 + }, + { + "epoch": 1.560100234419206, + "grad_norm": 0.6072008013725281, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 9650 + }, + { + "epoch": 1.5617169185999515, + "grad_norm": 0.659010648727417, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 9660 + }, + { + "epoch": 1.5633336027806968, + "grad_norm": 0.65857994556427, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 9670 + }, + { + "epoch": 1.5649502869614422, + "grad_norm": 0.5914267301559448, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 9680 + }, + { + "epoch": 1.5665669711421875, + "grad_norm": 0.6248020529747009, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 9690 + }, + { + "epoch": 1.5681836553229327, + "grad_norm": 0.7147795557975769, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 9700 + }, + { + "epoch": 1.569800339503678, + "grad_norm": 0.7076232433319092, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 9710 + }, + { + "epoch": 1.5714170236844232, + "grad_norm": 0.6217400431632996, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 9720 + }, + { + "epoch": 1.5730337078651684, + "grad_norm": 0.6709911227226257, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 9730 + }, + { + "epoch": 1.5746503920459138, + "grad_norm": 0.749171257019043, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 9740 + }, + { + "epoch": 1.576267076226659, + "grad_norm": 0.6241145730018616, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 9750 + }, + { + "epoch": 1.5778837604074045, + "grad_norm": 0.4960934817790985, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 9760 + }, + { + "epoch": 1.5795004445881498, + "grad_norm": 0.6593309640884399, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 9770 + }, + { + "epoch": 1.581117128768895, + "grad_norm": 0.5814042091369629, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 9780 + }, + { + "epoch": 1.5827338129496402, + "grad_norm": 0.5936070680618286, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 9790 + }, + { + "epoch": 1.5843504971303854, + "grad_norm": 0.6454403400421143, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 9800 + }, + { + "epoch": 1.585967181311131, + "grad_norm": 0.7612107992172241, + "learning_rate": 0.0002, + "loss": 0.7677, + "step": 9810 + }, + { + "epoch": 1.5875838654918761, + "grad_norm": 0.6494482755661011, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 9820 + }, + { + "epoch": 1.5892005496726216, + "grad_norm": 0.7825694680213928, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 9830 + }, + { + "epoch": 1.5908172338533668, + "grad_norm": 0.6757757663726807, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 9840 + }, + { + "epoch": 1.592433918034112, + "grad_norm": 0.7105609178543091, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 9850 + }, + { + "epoch": 1.5940506022148573, + "grad_norm": 0.7596991062164307, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 9860 + }, + { + "epoch": 1.5956672863956025, + "grad_norm": 0.5681525468826294, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 9870 + }, + { + "epoch": 1.5972839705763477, + "grad_norm": 0.6090980768203735, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 9880 + }, + { + "epoch": 1.5989006547570932, + "grad_norm": 0.6271613240242004, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 9890 + }, + { + "epoch": 1.6005173389378387, + "grad_norm": 0.7656369805335999, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 9900 + }, + { + "epoch": 1.6021340231185839, + "grad_norm": 0.7504446506500244, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 9910 + }, + { + "epoch": 1.6037507072993291, + "grad_norm": 0.659656286239624, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 9920 + }, + { + "epoch": 1.6053673914800743, + "grad_norm": 0.6006826162338257, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 9930 + }, + { + "epoch": 1.6069840756608196, + "grad_norm": 0.7872757911682129, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 9940 + }, + { + "epoch": 1.6086007598415648, + "grad_norm": 0.5545852780342102, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 9950 + }, + { + "epoch": 1.6102174440223103, + "grad_norm": 0.7429468631744385, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 9960 + }, + { + "epoch": 1.6118341282030555, + "grad_norm": 0.6873556971549988, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 9970 + }, + { + "epoch": 1.613450812383801, + "grad_norm": 0.5874287486076355, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 9980 + }, + { + "epoch": 1.6150674965645462, + "grad_norm": 0.6039386987686157, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 9990 + }, + { + "epoch": 1.6166841807452914, + "grad_norm": 0.6233575940132141, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 10000 + }, + { + "epoch": 1.6183008649260366, + "grad_norm": 0.7676448225975037, + "learning_rate": 0.0002, + "loss": 0.7602, + "step": 10010 + }, + { + "epoch": 1.6199175491067819, + "grad_norm": 0.6565698385238647, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 10020 + }, + { + "epoch": 1.6215342332875273, + "grad_norm": 0.6787590384483337, + "learning_rate": 0.0002, + "loss": 0.7104, + "step": 10030 + }, + { + "epoch": 1.6231509174682726, + "grad_norm": 0.6137678027153015, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 10040 + }, + { + "epoch": 1.624767601649018, + "grad_norm": 0.5236800312995911, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 10050 + }, + { + "epoch": 1.6263842858297632, + "grad_norm": 0.7626367807388306, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 10060 + }, + { + "epoch": 1.6280009700105085, + "grad_norm": 0.5657260417938232, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 10070 + }, + { + "epoch": 1.6296176541912537, + "grad_norm": 0.4913991391658783, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 10080 + }, + { + "epoch": 1.631234338371999, + "grad_norm": 0.7715556621551514, + "learning_rate": 0.0002, + "loss": 0.7596, + "step": 10090 + }, + { + "epoch": 1.6328510225527442, + "grad_norm": 0.6509000062942505, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 10100 + }, + { + "epoch": 1.6344677067334896, + "grad_norm": 0.6215850114822388, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 10110 + }, + { + "epoch": 1.6360843909142349, + "grad_norm": 0.6956844329833984, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 10120 + }, + { + "epoch": 1.6377010750949803, + "grad_norm": 0.6111597418785095, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 10130 + }, + { + "epoch": 1.6393177592757255, + "grad_norm": 0.6518288850784302, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 10140 + }, + { + "epoch": 1.6409344434564708, + "grad_norm": 0.6914522051811218, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 10150 + }, + { + "epoch": 1.642551127637216, + "grad_norm": 0.63785719871521, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 10160 + }, + { + "epoch": 1.6441678118179612, + "grad_norm": 0.6379287838935852, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 10170 + }, + { + "epoch": 1.6457844959987067, + "grad_norm": 0.6793403029441833, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 10180 + }, + { + "epoch": 1.647401180179452, + "grad_norm": 0.6099132895469666, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 10190 + }, + { + "epoch": 1.6490178643601974, + "grad_norm": 0.5869854092597961, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 10200 + }, + { + "epoch": 1.6506345485409426, + "grad_norm": 0.7716999053955078, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 10210 + }, + { + "epoch": 1.6522512327216878, + "grad_norm": 0.6854110360145569, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 10220 + }, + { + "epoch": 1.653867916902433, + "grad_norm": 0.6957170367240906, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 10230 + }, + { + "epoch": 1.6554846010831783, + "grad_norm": 0.6932903528213501, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 10240 + }, + { + "epoch": 1.6571012852639235, + "grad_norm": 0.7713165283203125, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 10250 + }, + { + "epoch": 1.658717969444669, + "grad_norm": 0.7455793619155884, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 10260 + }, + { + "epoch": 1.6603346536254144, + "grad_norm": 0.5464168190956116, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 10270 + }, + { + "epoch": 1.6619513378061597, + "grad_norm": 0.6782926321029663, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 10280 + }, + { + "epoch": 1.663568021986905, + "grad_norm": 0.7962649464607239, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 10290 + }, + { + "epoch": 1.6651847061676501, + "grad_norm": 0.6814526319503784, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 10300 + }, + { + "epoch": 1.6668013903483954, + "grad_norm": 0.656895101070404, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 10310 + }, + { + "epoch": 1.6684180745291406, + "grad_norm": 0.6085672378540039, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 10320 + }, + { + "epoch": 1.670034758709886, + "grad_norm": 0.585508406162262, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 10330 + }, + { + "epoch": 1.6716514428906313, + "grad_norm": 0.6930184364318848, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 10340 + }, + { + "epoch": 1.6732681270713767, + "grad_norm": 0.575663149356842, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 10350 + }, + { + "epoch": 1.674884811252122, + "grad_norm": 0.582502543926239, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 10360 + }, + { + "epoch": 1.6765014954328672, + "grad_norm": 0.5668916702270508, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 10370 + }, + { + "epoch": 1.6781181796136124, + "grad_norm": 0.6070065498352051, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 10380 + }, + { + "epoch": 1.6797348637943577, + "grad_norm": 0.6141316294670105, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 10390 + }, + { + "epoch": 1.6813515479751031, + "grad_norm": 0.8359124064445496, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 10400 + }, + { + "epoch": 1.6829682321558483, + "grad_norm": 0.5378185510635376, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 10410 + }, + { + "epoch": 1.6845849163365938, + "grad_norm": 0.6959536075592041, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 10420 + }, + { + "epoch": 1.686201600517339, + "grad_norm": 0.6514357328414917, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 10430 + }, + { + "epoch": 1.6878182846980843, + "grad_norm": 0.7706646919250488, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 10440 + }, + { + "epoch": 1.6894349688788295, + "grad_norm": 0.6183337569236755, + "learning_rate": 0.0002, + "loss": 0.7673, + "step": 10450 + }, + { + "epoch": 1.6910516530595747, + "grad_norm": 0.6123278141021729, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 10460 + }, + { + "epoch": 1.69266833724032, + "grad_norm": 0.6894851326942444, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 10470 + }, + { + "epoch": 1.6942850214210654, + "grad_norm": 0.7497312426567078, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 10480 + }, + { + "epoch": 1.6959017056018106, + "grad_norm": 0.5968214273452759, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 10490 + }, + { + "epoch": 1.697518389782556, + "grad_norm": 0.6747927069664001, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 10500 + }, + { + "epoch": 1.6991350739633013, + "grad_norm": 0.5708310008049011, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 10510 + }, + { + "epoch": 1.7007517581440466, + "grad_norm": 0.606526792049408, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 10520 + }, + { + "epoch": 1.7023684423247918, + "grad_norm": 0.662011981010437, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 10530 + }, + { + "epoch": 1.703985126505537, + "grad_norm": 0.7583045363426208, + "learning_rate": 0.0002, + "loss": 0.7999, + "step": 10540 + }, + { + "epoch": 1.7056018106862825, + "grad_norm": 0.721632182598114, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 10550 + }, + { + "epoch": 1.7072184948670277, + "grad_norm": 0.6107715368270874, + "learning_rate": 0.0002, + "loss": 0.7407, + "step": 10560 + }, + { + "epoch": 1.7088351790477732, + "grad_norm": 0.6652471423149109, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 10570 + }, + { + "epoch": 1.7104518632285184, + "grad_norm": 0.6308087110519409, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 10580 + }, + { + "epoch": 1.7120685474092636, + "grad_norm": 0.5464386940002441, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 10590 + }, + { + "epoch": 1.7136852315900089, + "grad_norm": 0.6558911204338074, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 10600 + }, + { + "epoch": 1.715301915770754, + "grad_norm": 0.5665024518966675, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 10610 + }, + { + "epoch": 1.7169185999514993, + "grad_norm": 0.7888094186782837, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 10620 + }, + { + "epoch": 1.7185352841322448, + "grad_norm": 0.7084909081459045, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 10630 + }, + { + "epoch": 1.7201519683129902, + "grad_norm": 0.7982324361801147, + "learning_rate": 0.0002, + "loss": 0.7557, + "step": 10640 + }, + { + "epoch": 1.7217686524937355, + "grad_norm": 0.6418732404708862, + "learning_rate": 0.0002, + "loss": 0.7345, + "step": 10650 + }, + { + "epoch": 1.7233853366744807, + "grad_norm": 0.7636681795120239, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 10660 + }, + { + "epoch": 1.725002020855226, + "grad_norm": 0.5646875500679016, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 10670 + }, + { + "epoch": 1.7266187050359711, + "grad_norm": 0.5231260657310486, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10680 + }, + { + "epoch": 1.7282353892167164, + "grad_norm": 0.7635011672973633, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 10690 + }, + { + "epoch": 1.7298520733974618, + "grad_norm": 0.7518259286880493, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 10700 + }, + { + "epoch": 1.731468757578207, + "grad_norm": 0.7295602560043335, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 10710 + }, + { + "epoch": 1.7330854417589525, + "grad_norm": 0.6984632015228271, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 10720 + }, + { + "epoch": 1.7347021259396977, + "grad_norm": 0.6198219060897827, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 10730 + }, + { + "epoch": 1.736318810120443, + "grad_norm": 0.6957576274871826, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10740 + }, + { + "epoch": 1.7379354943011882, + "grad_norm": 0.6430263519287109, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 10750 + }, + { + "epoch": 1.7395521784819334, + "grad_norm": 0.6134995222091675, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 10760 + }, + { + "epoch": 1.741168862662679, + "grad_norm": 0.7209452986717224, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 10770 + }, + { + "epoch": 1.7427855468434241, + "grad_norm": 0.6735447645187378, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 10780 + }, + { + "epoch": 1.7444022310241696, + "grad_norm": 0.5605693459510803, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 10790 + }, + { + "epoch": 1.7460189152049148, + "grad_norm": 0.6882363557815552, + "learning_rate": 0.0002, + "loss": 0.7759, + "step": 10800 + }, + { + "epoch": 1.74763559938566, + "grad_norm": 0.6386259198188782, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 10810 + }, + { + "epoch": 1.7492522835664053, + "grad_norm": 0.6529015302658081, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 10820 + }, + { + "epoch": 1.7508689677471505, + "grad_norm": 0.5664082765579224, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 10830 + }, + { + "epoch": 1.7524856519278957, + "grad_norm": 0.7532684206962585, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 10840 + }, + { + "epoch": 1.7541023361086412, + "grad_norm": 0.77171391248703, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 10850 + }, + { + "epoch": 1.7557190202893864, + "grad_norm": 0.7255431413650513, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 10860 + }, + { + "epoch": 1.7573357044701319, + "grad_norm": 0.763083279132843, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 10870 + }, + { + "epoch": 1.758952388650877, + "grad_norm": 0.6042402982711792, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 10880 + }, + { + "epoch": 1.7605690728316223, + "grad_norm": 0.7642518281936646, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 10890 + }, + { + "epoch": 1.7621857570123676, + "grad_norm": 0.6347904801368713, + "learning_rate": 0.0002, + "loss": 0.746, + "step": 10900 + }, + { + "epoch": 1.7638024411931128, + "grad_norm": 0.5371627807617188, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 10910 + }, + { + "epoch": 1.7654191253738583, + "grad_norm": 0.6840225458145142, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 10920 + }, + { + "epoch": 1.7670358095546035, + "grad_norm": 0.5288469195365906, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 10930 + }, + { + "epoch": 1.768652493735349, + "grad_norm": 0.69020676612854, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 10940 + }, + { + "epoch": 1.7702691779160942, + "grad_norm": 0.5943242311477661, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 10950 + }, + { + "epoch": 1.7718858620968394, + "grad_norm": 0.5616418123245239, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 10960 + }, + { + "epoch": 1.7735025462775846, + "grad_norm": 0.7209470868110657, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 10970 + }, + { + "epoch": 1.7751192304583299, + "grad_norm": 0.6657957434654236, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 10980 + }, + { + "epoch": 1.776735914639075, + "grad_norm": 0.6469064950942993, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 10990 + }, + { + "epoch": 1.7783525988198206, + "grad_norm": 0.6615678071975708, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 11000 + }, + { + "epoch": 1.779969283000566, + "grad_norm": 0.6722439527511597, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 11010 + }, + { + "epoch": 1.7815859671813112, + "grad_norm": 0.634136974811554, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 11020 + }, + { + "epoch": 1.7832026513620565, + "grad_norm": 0.6024377346038818, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 11030 + }, + { + "epoch": 1.7848193355428017, + "grad_norm": 0.6909403800964355, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 11040 + }, + { + "epoch": 1.786436019723547, + "grad_norm": 0.7148767709732056, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 11050 + }, + { + "epoch": 1.7880527039042922, + "grad_norm": 0.7442979216575623, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 11060 + }, + { + "epoch": 1.7896693880850376, + "grad_norm": 0.6830431818962097, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 11070 + }, + { + "epoch": 1.7912860722657828, + "grad_norm": 0.9172667264938354, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 11080 + }, + { + "epoch": 1.7929027564465283, + "grad_norm": 0.6799490451812744, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 11090 + }, + { + "epoch": 1.7945194406272735, + "grad_norm": 0.7617024779319763, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 11100 + }, + { + "epoch": 1.7961361248080188, + "grad_norm": 0.7701810002326965, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 11110 + }, + { + "epoch": 1.797752808988764, + "grad_norm": 0.7454385757446289, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 11120 + }, + { + "epoch": 1.7993694931695092, + "grad_norm": 0.6121436953544617, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 11130 + }, + { + "epoch": 1.8009861773502547, + "grad_norm": 0.6237571835517883, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 11140 + }, + { + "epoch": 1.802602861531, + "grad_norm": 0.6818515658378601, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 11150 + }, + { + "epoch": 1.8042195457117454, + "grad_norm": 0.7768308520317078, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 11160 + }, + { + "epoch": 1.8058362298924906, + "grad_norm": 0.6875537633895874, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 11170 + }, + { + "epoch": 1.8074529140732358, + "grad_norm": 0.7950584888458252, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 11180 + }, + { + "epoch": 1.809069598253981, + "grad_norm": 0.8210248351097107, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 11190 + }, + { + "epoch": 1.8106862824347263, + "grad_norm": 0.6674110889434814, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 11200 + }, + { + "epoch": 1.8123029666154715, + "grad_norm": 0.6261674761772156, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 11210 + }, + { + "epoch": 1.813919650796217, + "grad_norm": 0.6484741568565369, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 11220 + }, + { + "epoch": 1.8155363349769622, + "grad_norm": 0.6231244206428528, + "learning_rate": 0.0002, + "loss": 0.7718, + "step": 11230 + }, + { + "epoch": 1.8171530191577077, + "grad_norm": 0.7243146896362305, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 11240 + }, + { + "epoch": 1.818769703338453, + "grad_norm": 0.6776193380355835, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 11250 + }, + { + "epoch": 1.8203863875191981, + "grad_norm": 0.5973618030548096, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 11260 + }, + { + "epoch": 1.8220030716999434, + "grad_norm": 0.6451361179351807, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 11270 + }, + { + "epoch": 1.8236197558806886, + "grad_norm": 0.5963068008422852, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 11280 + }, + { + "epoch": 1.825236440061434, + "grad_norm": 0.536902129650116, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 11290 + }, + { + "epoch": 1.8268531242421793, + "grad_norm": 0.6993787288665771, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 11300 + }, + { + "epoch": 1.8284698084229247, + "grad_norm": 0.6135255098342896, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 11310 + }, + { + "epoch": 1.83008649260367, + "grad_norm": 0.6057423949241638, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 11320 + }, + { + "epoch": 1.8317031767844152, + "grad_norm": 0.6598812341690063, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 11330 + }, + { + "epoch": 1.8333198609651604, + "grad_norm": 0.6075948476791382, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 11340 + }, + { + "epoch": 1.8349365451459057, + "grad_norm": 0.7065447568893433, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 11350 + }, + { + "epoch": 1.8365532293266509, + "grad_norm": 0.680526614189148, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 11360 + }, + { + "epoch": 1.8381699135073963, + "grad_norm": 0.6356695294380188, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 11370 + }, + { + "epoch": 1.8397865976881416, + "grad_norm": 0.6399052143096924, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 11380 + }, + { + "epoch": 1.841403281868887, + "grad_norm": 0.6125704050064087, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 11390 + }, + { + "epoch": 1.8430199660496323, + "grad_norm": 0.7124643325805664, + "learning_rate": 0.0002, + "loss": 0.755, + "step": 11400 + }, + { + "epoch": 1.8446366502303775, + "grad_norm": 0.6099604964256287, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 11410 + }, + { + "epoch": 1.8462533344111227, + "grad_norm": 0.7338208556175232, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 11420 + }, + { + "epoch": 1.847870018591868, + "grad_norm": 0.7534668445587158, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 11430 + }, + { + "epoch": 1.8494867027726134, + "grad_norm": 0.6135470271110535, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 11440 + }, + { + "epoch": 1.8511033869533586, + "grad_norm": 0.6229309439659119, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 11450 + }, + { + "epoch": 1.852720071134104, + "grad_norm": 0.706423282623291, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 11460 + }, + { + "epoch": 1.8543367553148493, + "grad_norm": 0.5460049510002136, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 11470 + }, + { + "epoch": 1.8559534394955945, + "grad_norm": 0.6616711020469666, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 11480 + }, + { + "epoch": 1.8575701236763398, + "grad_norm": 0.6372783184051514, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 11490 + }, + { + "epoch": 1.859186807857085, + "grad_norm": 0.7162668108940125, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 11500 + }, + { + "epoch": 1.8608034920378305, + "grad_norm": 0.6605209708213806, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 11510 + }, + { + "epoch": 1.8624201762185757, + "grad_norm": 0.6933956742286682, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 11520 + }, + { + "epoch": 1.8640368603993211, + "grad_norm": 0.6582090854644775, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 11530 + }, + { + "epoch": 1.8656535445800664, + "grad_norm": 0.6416500806808472, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 11540 + }, + { + "epoch": 1.8672702287608116, + "grad_norm": 0.5434312224388123, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 11550 + }, + { + "epoch": 1.8688869129415568, + "grad_norm": 0.6827567219734192, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 11560 + }, + { + "epoch": 1.870503597122302, + "grad_norm": 0.7354370951652527, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 11570 + }, + { + "epoch": 1.8721202813030473, + "grad_norm": 0.590372622013092, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 11580 + }, + { + "epoch": 1.8737369654837928, + "grad_norm": 0.853183925151825, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 11590 + }, + { + "epoch": 1.875353649664538, + "grad_norm": 0.822678804397583, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 11600 + }, + { + "epoch": 1.8769703338452834, + "grad_norm": 0.6591550707817078, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 11610 + }, + { + "epoch": 1.8785870180260287, + "grad_norm": 0.7475301623344421, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 11620 + }, + { + "epoch": 1.880203702206774, + "grad_norm": 0.6390765309333801, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 11630 + }, + { + "epoch": 1.8818203863875191, + "grad_norm": 0.6589758992195129, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 11640 + }, + { + "epoch": 1.8834370705682644, + "grad_norm": 0.6765508651733398, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 11650 + }, + { + "epoch": 1.8850537547490098, + "grad_norm": 0.6527857780456543, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 11660 + }, + { + "epoch": 1.886670438929755, + "grad_norm": 0.6642923951148987, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 11670 + }, + { + "epoch": 1.8882871231105005, + "grad_norm": 0.6945584416389465, + "learning_rate": 0.0002, + "loss": 0.7701, + "step": 11680 + }, + { + "epoch": 1.8899038072912457, + "grad_norm": 0.694018542766571, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 11690 + }, + { + "epoch": 1.891520491471991, + "grad_norm": 0.7237417101860046, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 11700 + }, + { + "epoch": 1.8931371756527362, + "grad_norm": 0.7401309609413147, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 11710 + }, + { + "epoch": 1.8947538598334814, + "grad_norm": 0.6537784337997437, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 11720 + }, + { + "epoch": 1.8963705440142267, + "grad_norm": 0.7398539185523987, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 11730 + }, + { + "epoch": 1.8979872281949721, + "grad_norm": 0.6696075797080994, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 11740 + }, + { + "epoch": 1.8996039123757174, + "grad_norm": 0.6014142036437988, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 11750 + }, + { + "epoch": 1.9012205965564628, + "grad_norm": 0.7023524641990662, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 11760 + }, + { + "epoch": 1.902837280737208, + "grad_norm": 0.739973783493042, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 11770 + }, + { + "epoch": 1.9044539649179533, + "grad_norm": 0.5576770901679993, + "learning_rate": 0.0002, + "loss": 0.7848, + "step": 11780 + }, + { + "epoch": 1.9060706490986985, + "grad_norm": 0.6907393932342529, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 11790 + }, + { + "epoch": 1.9076873332794437, + "grad_norm": 0.6934581995010376, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 11800 + }, + { + "epoch": 1.9093040174601892, + "grad_norm": 0.591774582862854, + "learning_rate": 0.0002, + "loss": 0.7199, + "step": 11810 + }, + { + "epoch": 1.9109207016409344, + "grad_norm": 0.6249791383743286, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 11820 + }, + { + "epoch": 1.9125373858216799, + "grad_norm": 0.6755744218826294, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 11830 + }, + { + "epoch": 1.914154070002425, + "grad_norm": 0.7286285161972046, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 11840 + }, + { + "epoch": 1.9157707541831703, + "grad_norm": 0.7867850065231323, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 11850 + }, + { + "epoch": 1.9173874383639156, + "grad_norm": 0.6283972859382629, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 11860 + }, + { + "epoch": 1.9190041225446608, + "grad_norm": 0.605823814868927, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 11870 + }, + { + "epoch": 1.920620806725406, + "grad_norm": 0.5927976965904236, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 11880 + }, + { + "epoch": 1.9222374909061515, + "grad_norm": 0.5974002480506897, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 11890 + }, + { + "epoch": 1.923854175086897, + "grad_norm": 0.7091866135597229, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 11900 + }, + { + "epoch": 1.9254708592676422, + "grad_norm": 0.72496497631073, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 11910 + }, + { + "epoch": 1.9270875434483874, + "grad_norm": 0.6131896376609802, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 11920 + }, + { + "epoch": 1.9287042276291326, + "grad_norm": 0.6556436419487, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 11930 + }, + { + "epoch": 1.9303209118098779, + "grad_norm": 0.622932493686676, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 11940 + }, + { + "epoch": 1.931937595990623, + "grad_norm": 0.6618631482124329, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 11950 + }, + { + "epoch": 1.9335542801713685, + "grad_norm": 0.630966305732727, + "learning_rate": 0.0002, + "loss": 0.8332, + "step": 11960 + }, + { + "epoch": 1.9351709643521138, + "grad_norm": 0.6336734890937805, + "learning_rate": 0.0002, + "loss": 0.6854, + "step": 11970 + }, + { + "epoch": 1.9367876485328592, + "grad_norm": 0.655403196811676, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 11980 + }, + { + "epoch": 1.9384043327136045, + "grad_norm": 0.5640574097633362, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 11990 + }, + { + "epoch": 1.9400210168943497, + "grad_norm": 0.6322951316833496, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 12000 + }, + { + "epoch": 1.941637701075095, + "grad_norm": 0.615703821182251, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 12010 + }, + { + "epoch": 1.9432543852558402, + "grad_norm": 0.6487536430358887, + "learning_rate": 0.0002, + "loss": 0.786, + "step": 12020 + }, + { + "epoch": 1.9448710694365856, + "grad_norm": 0.9209630489349365, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 12030 + }, + { + "epoch": 1.9464877536173308, + "grad_norm": 0.67485511302948, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 12040 + }, + { + "epoch": 1.9481044377980763, + "grad_norm": 0.6831230521202087, + "learning_rate": 0.0002, + "loss": 0.7551, + "step": 12050 + }, + { + "epoch": 1.9497211219788215, + "grad_norm": 0.6578302383422852, + "learning_rate": 0.0002, + "loss": 0.7546, + "step": 12060 + }, + { + "epoch": 1.9513378061595668, + "grad_norm": 0.9975938200950623, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 12070 + }, + { + "epoch": 1.952954490340312, + "grad_norm": 0.6637365221977234, + "learning_rate": 0.0002, + "loss": 0.7952, + "step": 12080 + }, + { + "epoch": 1.9545711745210572, + "grad_norm": 0.605707049369812, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 12090 + }, + { + "epoch": 1.9561878587018025, + "grad_norm": 0.6584440469741821, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 12100 + }, + { + "epoch": 1.957804542882548, + "grad_norm": 0.6070835590362549, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 12110 + }, + { + "epoch": 1.9594212270632931, + "grad_norm": 0.7862601280212402, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 12120 + }, + { + "epoch": 1.9610379112440386, + "grad_norm": 0.8175255060195923, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 12130 + }, + { + "epoch": 1.9626545954247838, + "grad_norm": 0.5648472905158997, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 12140 + }, + { + "epoch": 1.964271279605529, + "grad_norm": 0.6591973304748535, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 12150 + }, + { + "epoch": 1.9658879637862743, + "grad_norm": 0.5960676074028015, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 12160 + }, + { + "epoch": 1.9675046479670195, + "grad_norm": 0.7272544503211975, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 12170 + }, + { + "epoch": 1.969121332147765, + "grad_norm": 0.7176699042320251, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 12180 + }, + { + "epoch": 1.9707380163285102, + "grad_norm": 0.6927123665809631, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 12190 + }, + { + "epoch": 1.9723547005092557, + "grad_norm": 0.5536034107208252, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 12200 + }, + { + "epoch": 1.9739713846900009, + "grad_norm": 0.8348390460014343, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 12210 + }, + { + "epoch": 1.9755880688707461, + "grad_norm": 0.6591181755065918, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 12220 + }, + { + "epoch": 1.9772047530514913, + "grad_norm": 1.0624109506607056, + "learning_rate": 0.0002, + "loss": 0.763, + "step": 12230 + }, + { + "epoch": 1.9788214372322366, + "grad_norm": 0.9265586137771606, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 12240 + }, + { + "epoch": 1.9804381214129818, + "grad_norm": 0.5998196005821228, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 12250 + }, + { + "epoch": 1.9820548055937273, + "grad_norm": 0.6960851550102234, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 12260 + }, + { + "epoch": 1.9836714897744727, + "grad_norm": 0.7674502730369568, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 12270 + }, + { + "epoch": 1.985288173955218, + "grad_norm": 0.6407275795936584, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 12280 + }, + { + "epoch": 1.9869048581359632, + "grad_norm": 0.6673079133033752, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 12290 + }, + { + "epoch": 1.9885215423167084, + "grad_norm": 0.6989844441413879, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 12300 + }, + { + "epoch": 1.9901382264974536, + "grad_norm": 0.7564442157745361, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 12310 + }, + { + "epoch": 1.9917549106781989, + "grad_norm": 0.6385478973388672, + "learning_rate": 0.0002, + "loss": 0.7719, + "step": 12320 + }, + { + "epoch": 1.9933715948589443, + "grad_norm": 0.7193717956542969, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 12330 + }, + { + "epoch": 1.9949882790396896, + "grad_norm": 0.7987112402915955, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 12340 + }, + { + "epoch": 1.996604963220435, + "grad_norm": 0.7260826826095581, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 12350 + }, + { + "epoch": 1.9982216474011802, + "grad_norm": 0.7968255281448364, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 12360 + }, + { + "epoch": 1.9998383315819255, + "grad_norm": 0.6893062591552734, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 12370 + }, + { + "epoch": 2.0, + "eval_loss": 1.1044032573699951, + "eval_runtime": 122.1508, + "eval_samples_per_second": 6.001, + "eval_steps_per_second": 0.753, + "step": 12371 + }, + { + "epoch": 2.0014550157626707, + "grad_norm": 0.7775409817695618, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 12380 + }, + { + "epoch": 2.003071699943416, + "grad_norm": 0.76218581199646, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 12390 + }, + { + "epoch": 2.004688384124161, + "grad_norm": 0.5677764415740967, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 12400 + }, + { + "epoch": 2.006305068304907, + "grad_norm": 0.808442234992981, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 12410 + }, + { + "epoch": 2.007921752485652, + "grad_norm": 0.7144765257835388, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 12420 + }, + { + "epoch": 2.0095384366663973, + "grad_norm": 0.6914031505584717, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 12430 + }, + { + "epoch": 2.0111551208471425, + "grad_norm": 0.7581454515457153, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 12440 + }, + { + "epoch": 2.0127718050278878, + "grad_norm": 0.8388504981994629, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 12450 + }, + { + "epoch": 2.014388489208633, + "grad_norm": 0.6716406941413879, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 12460 + }, + { + "epoch": 2.0160051733893782, + "grad_norm": 0.898902416229248, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 12470 + }, + { + "epoch": 2.0176218575701235, + "grad_norm": 0.6432679891586304, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 12480 + }, + { + "epoch": 2.019238541750869, + "grad_norm": 0.8021109104156494, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 12490 + }, + { + "epoch": 2.0208552259316144, + "grad_norm": 0.7039216756820679, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 12500 + }, + { + "epoch": 2.0224719101123596, + "grad_norm": 0.646531879901886, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 12510 + }, + { + "epoch": 2.024088594293105, + "grad_norm": 0.783704400062561, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 12520 + }, + { + "epoch": 2.02570527847385, + "grad_norm": 0.8805046677589417, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 12530 + }, + { + "epoch": 2.0273219626545953, + "grad_norm": 0.7289270758628845, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 12540 + }, + { + "epoch": 2.0289386468353405, + "grad_norm": 0.71653151512146, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 12550 + }, + { + "epoch": 2.030555331016086, + "grad_norm": 0.73281329870224, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 12560 + }, + { + "epoch": 2.0321720151968314, + "grad_norm": 0.6657090187072754, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 12570 + }, + { + "epoch": 2.0337886993775767, + "grad_norm": 0.8241133093833923, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 12580 + }, + { + "epoch": 2.035405383558322, + "grad_norm": 0.5834135413169861, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 12590 + }, + { + "epoch": 2.037022067739067, + "grad_norm": 0.84502112865448, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 12600 + }, + { + "epoch": 2.0386387519198124, + "grad_norm": 0.8952481746673584, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 12610 + }, + { + "epoch": 2.0402554361005576, + "grad_norm": 0.7801461815834045, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 12620 + }, + { + "epoch": 2.041872120281303, + "grad_norm": 0.6788367033004761, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 12630 + }, + { + "epoch": 2.0434888044620485, + "grad_norm": 0.7241756319999695, + "learning_rate": 0.0002, + "loss": 0.6162, + "step": 12640 + }, + { + "epoch": 2.0451054886427937, + "grad_norm": 0.6933388113975525, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 12650 + }, + { + "epoch": 2.046722172823539, + "grad_norm": 0.8029746413230896, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 12660 + }, + { + "epoch": 2.048338857004284, + "grad_norm": 0.946399986743927, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 12670 + }, + { + "epoch": 2.0499555411850294, + "grad_norm": 0.7072678804397583, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 12680 + }, + { + "epoch": 2.0515722253657747, + "grad_norm": 0.6810618042945862, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 12690 + }, + { + "epoch": 2.05318890954652, + "grad_norm": 0.7661160230636597, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 12700 + }, + { + "epoch": 2.0548055937272656, + "grad_norm": 0.6350653767585754, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 12710 + }, + { + "epoch": 2.056422277908011, + "grad_norm": 0.861890971660614, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 12720 + }, + { + "epoch": 2.058038962088756, + "grad_norm": 0.6489875912666321, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 12730 + }, + { + "epoch": 2.0596556462695013, + "grad_norm": 0.8268506526947021, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 12740 + }, + { + "epoch": 2.0612723304502465, + "grad_norm": 0.607679545879364, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 12750 + }, + { + "epoch": 2.0628890146309917, + "grad_norm": 0.6754153370857239, + "learning_rate": 0.0002, + "loss": 0.6649, + "step": 12760 + }, + { + "epoch": 2.064505698811737, + "grad_norm": 0.7263124585151672, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 12770 + }, + { + "epoch": 2.0661223829924826, + "grad_norm": 0.6986154317855835, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 12780 + }, + { + "epoch": 2.067739067173228, + "grad_norm": 0.7768576741218567, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 12790 + }, + { + "epoch": 2.069355751353973, + "grad_norm": 0.7546762824058533, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 12800 + }, + { + "epoch": 2.0709724355347183, + "grad_norm": 0.7588880062103271, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 12810 + }, + { + "epoch": 2.0725891197154636, + "grad_norm": 0.7457242608070374, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 12820 + }, + { + "epoch": 2.074205803896209, + "grad_norm": 0.6983516812324524, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 12830 + }, + { + "epoch": 2.075822488076954, + "grad_norm": 0.7950928807258606, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 12840 + }, + { + "epoch": 2.0774391722576993, + "grad_norm": 0.9248087406158447, + "learning_rate": 0.0002, + "loss": 0.6603, + "step": 12850 + }, + { + "epoch": 2.079055856438445, + "grad_norm": 0.7229493260383606, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 12860 + }, + { + "epoch": 2.08067254061919, + "grad_norm": 0.5710847973823547, + "learning_rate": 0.0002, + "loss": 0.6702, + "step": 12870 + }, + { + "epoch": 2.0822892247999354, + "grad_norm": 0.9580423831939697, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 12880 + }, + { + "epoch": 2.0839059089806806, + "grad_norm": 0.7399665713310242, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 12890 + }, + { + "epoch": 2.085522593161426, + "grad_norm": 0.7981410622596741, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 12900 + }, + { + "epoch": 2.087139277342171, + "grad_norm": 0.870759904384613, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 12910 + }, + { + "epoch": 2.0887559615229163, + "grad_norm": 0.7001481652259827, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 12920 + }, + { + "epoch": 2.090372645703662, + "grad_norm": 0.6745418310165405, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 12930 + }, + { + "epoch": 2.0919893298844072, + "grad_norm": 0.7739067673683167, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 12940 + }, + { + "epoch": 2.0936060140651525, + "grad_norm": 0.6742934584617615, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 12950 + }, + { + "epoch": 2.0952226982458977, + "grad_norm": 0.7270349860191345, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 12960 + }, + { + "epoch": 2.096839382426643, + "grad_norm": 0.7150624394416809, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 12970 + }, + { + "epoch": 2.098456066607388, + "grad_norm": 0.7734767198562622, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 12980 + }, + { + "epoch": 2.1000727507881334, + "grad_norm": 0.7618662118911743, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 12990 + }, + { + "epoch": 2.101689434968879, + "grad_norm": 0.6557944416999817, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 13000 + }, + { + "epoch": 2.1033061191496243, + "grad_norm": 0.8786448240280151, + "learning_rate": 0.0002, + "loss": 0.7268, + "step": 13010 + }, + { + "epoch": 2.1049228033303695, + "grad_norm": 0.6878724098205566, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 13020 + }, + { + "epoch": 2.1065394875111147, + "grad_norm": 0.822318971157074, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 13030 + }, + { + "epoch": 2.10815617169186, + "grad_norm": 0.831468939781189, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 13040 + }, + { + "epoch": 2.109772855872605, + "grad_norm": 0.7699505686759949, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 13050 + }, + { + "epoch": 2.1113895400533504, + "grad_norm": 0.7559016346931458, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 13060 + }, + { + "epoch": 2.1130062242340957, + "grad_norm": 0.6942209601402283, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 13070 + }, + { + "epoch": 2.1146229084148414, + "grad_norm": 0.6098947525024414, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 13080 + }, + { + "epoch": 2.1162395925955866, + "grad_norm": 0.6499016284942627, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 13090 + }, + { + "epoch": 2.117856276776332, + "grad_norm": 0.7719953060150146, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 13100 + }, + { + "epoch": 2.119472960957077, + "grad_norm": 0.6708134412765503, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 13110 + }, + { + "epoch": 2.1210896451378223, + "grad_norm": 0.8119585514068604, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 13120 + }, + { + "epoch": 2.1227063293185675, + "grad_norm": 0.6947157979011536, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 13130 + }, + { + "epoch": 2.1243230134993127, + "grad_norm": 0.8831837773323059, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 13140 + }, + { + "epoch": 2.1259396976800584, + "grad_norm": 0.7266910672187805, + "learning_rate": 0.0002, + "loss": 0.6997, + "step": 13150 + }, + { + "epoch": 2.1275563818608036, + "grad_norm": 0.8864351511001587, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 13160 + }, + { + "epoch": 2.129173066041549, + "grad_norm": 0.8104248046875, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 13170 + }, + { + "epoch": 2.130789750222294, + "grad_norm": 0.6077079772949219, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 13180 + }, + { + "epoch": 2.1324064344030393, + "grad_norm": 0.6874213814735413, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 13190 + }, + { + "epoch": 2.1340231185837846, + "grad_norm": 0.7134367823600769, + "learning_rate": 0.0002, + "loss": 0.642, + "step": 13200 + }, + { + "epoch": 2.13563980276453, + "grad_norm": 0.6101235151290894, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 13210 + }, + { + "epoch": 2.137256486945275, + "grad_norm": 0.6042411923408508, + "learning_rate": 0.0002, + "loss": 0.6529, + "step": 13220 + }, + { + "epoch": 2.1388731711260207, + "grad_norm": 0.914601743221283, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 13230 + }, + { + "epoch": 2.140489855306766, + "grad_norm": 0.7104284167289734, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 13240 + }, + { + "epoch": 2.142106539487511, + "grad_norm": 0.664395272731781, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 13250 + }, + { + "epoch": 2.1437232236682564, + "grad_norm": 0.6991241574287415, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 13260 + }, + { + "epoch": 2.1453399078490016, + "grad_norm": 0.5469560623168945, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 13270 + }, + { + "epoch": 2.146956592029747, + "grad_norm": 0.8454998135566711, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 13280 + }, + { + "epoch": 2.148573276210492, + "grad_norm": 0.7088868618011475, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 13290 + }, + { + "epoch": 2.1501899603912378, + "grad_norm": 0.7002687454223633, + "learning_rate": 0.0002, + "loss": 0.6835, + "step": 13300 + }, + { + "epoch": 2.151806644571983, + "grad_norm": 0.7785214781761169, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 13310 + }, + { + "epoch": 2.1534233287527282, + "grad_norm": 0.8049132227897644, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 13320 + }, + { + "epoch": 2.1550400129334735, + "grad_norm": 0.8062595129013062, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 13330 + }, + { + "epoch": 2.1566566971142187, + "grad_norm": 0.6208319067955017, + "learning_rate": 0.0002, + "loss": 0.6603, + "step": 13340 + }, + { + "epoch": 2.158273381294964, + "grad_norm": 0.7519655823707581, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 13350 + }, + { + "epoch": 2.159890065475709, + "grad_norm": 0.7645747065544128, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 13360 + }, + { + "epoch": 2.1615067496564544, + "grad_norm": 0.6847302913665771, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 13370 + }, + { + "epoch": 2.1631234338372, + "grad_norm": 0.8630441427230835, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 13380 + }, + { + "epoch": 2.1647401180179453, + "grad_norm": 0.7947702407836914, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 13390 + }, + { + "epoch": 2.1663568021986905, + "grad_norm": 0.6836977005004883, + "learning_rate": 0.0002, + "loss": 0.7206, + "step": 13400 + }, + { + "epoch": 2.1679734863794358, + "grad_norm": 0.7340566515922546, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 13410 + }, + { + "epoch": 2.169590170560181, + "grad_norm": 0.7075738906860352, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 13420 + }, + { + "epoch": 2.1712068547409262, + "grad_norm": 0.7080879807472229, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 13430 + }, + { + "epoch": 2.1728235389216715, + "grad_norm": 0.6218613386154175, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 13440 + }, + { + "epoch": 2.174440223102417, + "grad_norm": 0.8211479187011719, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 13450 + }, + { + "epoch": 2.1760569072831624, + "grad_norm": 0.864466667175293, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 13460 + }, + { + "epoch": 2.1776735914639076, + "grad_norm": 0.7943857908248901, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 13470 + }, + { + "epoch": 2.179290275644653, + "grad_norm": 0.78728187084198, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 13480 + }, + { + "epoch": 2.180906959825398, + "grad_norm": 0.697527289390564, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 13490 + }, + { + "epoch": 2.1825236440061433, + "grad_norm": 0.8205804228782654, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 13500 + }, + { + "epoch": 2.1841403281868885, + "grad_norm": 0.8709042072296143, + "learning_rate": 0.0002, + "loss": 0.7227, + "step": 13510 + }, + { + "epoch": 2.1857570123676338, + "grad_norm": 0.6228537559509277, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 13520 + }, + { + "epoch": 2.1873736965483794, + "grad_norm": 0.9566980004310608, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 13530 + }, + { + "epoch": 2.1889903807291247, + "grad_norm": 0.7128894329071045, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 13540 + }, + { + "epoch": 2.19060706490987, + "grad_norm": 0.6888654232025146, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 13550 + }, + { + "epoch": 2.192223749090615, + "grad_norm": 0.6444337368011475, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 13560 + }, + { + "epoch": 2.1938404332713604, + "grad_norm": 0.8008806705474854, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 13570 + }, + { + "epoch": 2.1954571174521056, + "grad_norm": 0.8482748866081238, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 13580 + }, + { + "epoch": 2.197073801632851, + "grad_norm": 0.8584157228469849, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 13590 + }, + { + "epoch": 2.1986904858135965, + "grad_norm": 0.7513734698295593, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 13600 + }, + { + "epoch": 2.2003071699943417, + "grad_norm": 0.7864262461662292, + "learning_rate": 0.0002, + "loss": 0.6632, + "step": 13610 + }, + { + "epoch": 2.201923854175087, + "grad_norm": 0.8493645191192627, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 13620 + }, + { + "epoch": 2.203540538355832, + "grad_norm": 0.6902140974998474, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 13630 + }, + { + "epoch": 2.2051572225365774, + "grad_norm": 0.8711254596710205, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 13640 + }, + { + "epoch": 2.2067739067173227, + "grad_norm": 0.7832191586494446, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 13650 + }, + { + "epoch": 2.208390590898068, + "grad_norm": 0.5668176412582397, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 13660 + }, + { + "epoch": 2.2100072750788136, + "grad_norm": 0.8648375272750854, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 13670 + }, + { + "epoch": 2.211623959259559, + "grad_norm": 0.7643089890480042, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 13680 + }, + { + "epoch": 2.213240643440304, + "grad_norm": 0.6293777823448181, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 13690 + }, + { + "epoch": 2.2148573276210493, + "grad_norm": 0.6459372639656067, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 13700 + }, + { + "epoch": 2.2164740118017945, + "grad_norm": 0.7060744166374207, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 13710 + }, + { + "epoch": 2.2180906959825397, + "grad_norm": 0.674109160900116, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 13720 + }, + { + "epoch": 2.219707380163285, + "grad_norm": 0.830392062664032, + "learning_rate": 0.0002, + "loss": 0.6529, + "step": 13730 + }, + { + "epoch": 2.2213240643440306, + "grad_norm": 0.6474477052688599, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 13740 + }, + { + "epoch": 2.222940748524776, + "grad_norm": 0.7037909626960754, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 13750 + }, + { + "epoch": 2.224557432705521, + "grad_norm": 0.6554131507873535, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 13760 + }, + { + "epoch": 2.2261741168862663, + "grad_norm": 0.7822230458259583, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 13770 + }, + { + "epoch": 2.2277908010670116, + "grad_norm": 0.9082167744636536, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 13780 + }, + { + "epoch": 2.229407485247757, + "grad_norm": 0.7918276190757751, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 13790 + }, + { + "epoch": 2.231024169428502, + "grad_norm": 0.7354569435119629, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 13800 + }, + { + "epoch": 2.2326408536092472, + "grad_norm": 0.8265249133110046, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 13810 + }, + { + "epoch": 2.234257537789993, + "grad_norm": 0.6653847098350525, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 13820 + }, + { + "epoch": 2.235874221970738, + "grad_norm": 0.7157923579216003, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 13830 + }, + { + "epoch": 2.2374909061514834, + "grad_norm": 0.7110323309898376, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 13840 + }, + { + "epoch": 2.2391075903322286, + "grad_norm": 0.7155357599258423, + "learning_rate": 0.0002, + "loss": 0.6913, + "step": 13850 + }, + { + "epoch": 2.240724274512974, + "grad_norm": 1.0177817344665527, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 13860 + }, + { + "epoch": 2.242340958693719, + "grad_norm": 0.7601948380470276, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 13870 + }, + { + "epoch": 2.2439576428744643, + "grad_norm": 0.7628820538520813, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 13880 + }, + { + "epoch": 2.24557432705521, + "grad_norm": 0.7089297771453857, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 13890 + }, + { + "epoch": 2.247191011235955, + "grad_norm": 0.695178210735321, + "learning_rate": 0.0002, + "loss": 0.7236, + "step": 13900 + }, + { + "epoch": 2.2488076954167004, + "grad_norm": 0.7631948590278625, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 13910 + }, + { + "epoch": 2.2504243795974457, + "grad_norm": 0.8203101754188538, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 13920 + }, + { + "epoch": 2.252041063778191, + "grad_norm": 0.8099079728126526, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 13930 + }, + { + "epoch": 2.253657747958936, + "grad_norm": 0.6498546004295349, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 13940 + }, + { + "epoch": 2.2552744321396814, + "grad_norm": 0.7797415256500244, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 13950 + }, + { + "epoch": 2.2568911163204266, + "grad_norm": 0.8254124522209167, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 13960 + }, + { + "epoch": 2.2585078005011723, + "grad_norm": 0.6327953338623047, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 13970 + }, + { + "epoch": 2.2601244846819175, + "grad_norm": 0.734194278717041, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 13980 + }, + { + "epoch": 2.2617411688626627, + "grad_norm": 0.9014202952384949, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 13990 + }, + { + "epoch": 2.263357853043408, + "grad_norm": 0.7643631100654602, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 14000 + }, + { + "epoch": 2.264974537224153, + "grad_norm": 0.8882834911346436, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 14010 + }, + { + "epoch": 2.2665912214048984, + "grad_norm": 0.7975873351097107, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 14020 + }, + { + "epoch": 2.2682079055856437, + "grad_norm": 0.7765783071517944, + "learning_rate": 0.0002, + "loss": 0.7193, + "step": 14030 + }, + { + "epoch": 2.2698245897663893, + "grad_norm": 0.8846288323402405, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 14040 + }, + { + "epoch": 2.2714412739471346, + "grad_norm": 0.9006744027137756, + "learning_rate": 0.0002, + "loss": 0.6494, + "step": 14050 + }, + { + "epoch": 2.27305795812788, + "grad_norm": 0.7420173287391663, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 14060 + }, + { + "epoch": 2.274674642308625, + "grad_norm": 0.7956424951553345, + "learning_rate": 0.0002, + "loss": 0.7068, + "step": 14070 + }, + { + "epoch": 2.2762913264893703, + "grad_norm": 0.7783209085464478, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 14080 + }, + { + "epoch": 2.2779080106701155, + "grad_norm": 0.7597188949584961, + "learning_rate": 0.0002, + "loss": 0.7202, + "step": 14090 + }, + { + "epoch": 2.2795246948508607, + "grad_norm": 0.6718921661376953, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 14100 + }, + { + "epoch": 2.281141379031606, + "grad_norm": 0.7528082132339478, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 14110 + }, + { + "epoch": 2.2827580632123516, + "grad_norm": 0.8379864692687988, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 14120 + }, + { + "epoch": 2.284374747393097, + "grad_norm": 0.748613715171814, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 14130 + }, + { + "epoch": 2.285991431573842, + "grad_norm": 0.7435423135757446, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 14140 + }, + { + "epoch": 2.2876081157545873, + "grad_norm": 0.7580803632736206, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 14150 + }, + { + "epoch": 2.2892247999353326, + "grad_norm": 0.6278321146965027, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 14160 + }, + { + "epoch": 2.290841484116078, + "grad_norm": 0.7663896083831787, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 14170 + }, + { + "epoch": 2.292458168296823, + "grad_norm": 0.9716812372207642, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 14180 + }, + { + "epoch": 2.2940748524775687, + "grad_norm": 0.8993458151817322, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 14190 + }, + { + "epoch": 2.295691536658314, + "grad_norm": 0.6156117916107178, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 14200 + }, + { + "epoch": 2.297308220839059, + "grad_norm": 0.8911278247833252, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 14210 + }, + { + "epoch": 2.2989249050198044, + "grad_norm": 0.6422147154808044, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 14220 + }, + { + "epoch": 2.3005415892005496, + "grad_norm": 0.6866879463195801, + "learning_rate": 0.0002, + "loss": 0.6795, + "step": 14230 + }, + { + "epoch": 2.302158273381295, + "grad_norm": 0.9297130107879639, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 14240 + }, + { + "epoch": 2.30377495756204, + "grad_norm": 0.7501356601715088, + "learning_rate": 0.0002, + "loss": 0.6823, + "step": 14250 + }, + { + "epoch": 2.3053916417427853, + "grad_norm": 0.8363515138626099, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 14260 + }, + { + "epoch": 2.307008325923531, + "grad_norm": 0.9083868265151978, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 14270 + }, + { + "epoch": 2.3086250101042762, + "grad_norm": 0.7791516780853271, + "learning_rate": 0.0002, + "loss": 0.6862, + "step": 14280 + }, + { + "epoch": 2.3102416942850215, + "grad_norm": 0.8766953349113464, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 14290 + }, + { + "epoch": 2.3118583784657667, + "grad_norm": 0.7916635274887085, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 14300 + }, + { + "epoch": 2.313475062646512, + "grad_norm": 0.627525269985199, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 14310 + }, + { + "epoch": 2.315091746827257, + "grad_norm": 0.8856783509254456, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 14320 + }, + { + "epoch": 2.316708431008003, + "grad_norm": 0.6758689284324646, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 14330 + }, + { + "epoch": 2.318325115188748, + "grad_norm": 0.6428321003913879, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 14340 + }, + { + "epoch": 2.3199417993694933, + "grad_norm": 0.9032121300697327, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 14350 + }, + { + "epoch": 2.3215584835502385, + "grad_norm": 0.8035986423492432, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 14360 + }, + { + "epoch": 2.3231751677309838, + "grad_norm": 0.7974579334259033, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 14370 + }, + { + "epoch": 2.324791851911729, + "grad_norm": 0.8356034755706787, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 14380 + }, + { + "epoch": 2.326408536092474, + "grad_norm": 0.998760998249054, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 14390 + }, + { + "epoch": 2.3280252202732195, + "grad_norm": 0.6518142223358154, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 14400 + }, + { + "epoch": 2.3296419044539647, + "grad_norm": 0.7443506717681885, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 14410 + }, + { + "epoch": 2.3312585886347104, + "grad_norm": 0.8436172604560852, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 14420 + }, + { + "epoch": 2.3328752728154556, + "grad_norm": 0.7411080598831177, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 14430 + }, + { + "epoch": 2.334491956996201, + "grad_norm": 0.8839048743247986, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 14440 + }, + { + "epoch": 2.336108641176946, + "grad_norm": 0.8360885977745056, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 14450 + }, + { + "epoch": 2.3377253253576913, + "grad_norm": 0.7608986496925354, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 14460 + }, + { + "epoch": 2.3393420095384365, + "grad_norm": 0.8179867267608643, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 14470 + }, + { + "epoch": 2.340958693719182, + "grad_norm": 0.5989999771118164, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14480 + }, + { + "epoch": 2.3425753778999274, + "grad_norm": 0.9450054168701172, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 14490 + }, + { + "epoch": 2.3441920620806727, + "grad_norm": 0.7885149717330933, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 14500 + }, + { + "epoch": 2.345808746261418, + "grad_norm": 0.8152616620063782, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 14510 + }, + { + "epoch": 2.347425430442163, + "grad_norm": 0.7193838953971863, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 14520 + }, + { + "epoch": 2.3490421146229084, + "grad_norm": 0.6701092720031738, + "learning_rate": 0.0002, + "loss": 0.6594, + "step": 14530 + }, + { + "epoch": 2.3506587988036536, + "grad_norm": 0.7529364228248596, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 14540 + }, + { + "epoch": 2.352275482984399, + "grad_norm": 0.6599733829498291, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 14550 + }, + { + "epoch": 2.353892167165144, + "grad_norm": 0.9502474069595337, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 14560 + }, + { + "epoch": 2.3555088513458897, + "grad_norm": 0.7619650959968567, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 14570 + }, + { + "epoch": 2.357125535526635, + "grad_norm": 0.9854652285575867, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 14580 + }, + { + "epoch": 2.35874221970738, + "grad_norm": 0.727439284324646, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 14590 + }, + { + "epoch": 2.3603589038881254, + "grad_norm": 0.6994746327400208, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 14600 + }, + { + "epoch": 2.3619755880688706, + "grad_norm": 0.7117531299591064, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 14610 + }, + { + "epoch": 2.363592272249616, + "grad_norm": 0.6403067708015442, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 14620 + }, + { + "epoch": 2.3652089564303616, + "grad_norm": 0.8377841711044312, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14630 + }, + { + "epoch": 2.366825640611107, + "grad_norm": 0.749171257019043, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 14640 + }, + { + "epoch": 2.368442324791852, + "grad_norm": 0.8418586254119873, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 14650 + }, + { + "epoch": 2.3700590089725972, + "grad_norm": 0.6178573369979858, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 14660 + }, + { + "epoch": 2.3716756931533425, + "grad_norm": 0.6368302702903748, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 14670 + }, + { + "epoch": 2.3732923773340877, + "grad_norm": 0.9122977256774902, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 14680 + }, + { + "epoch": 2.374909061514833, + "grad_norm": 0.7086195349693298, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 14690 + }, + { + "epoch": 2.376525745695578, + "grad_norm": 0.7500800490379333, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 14700 + }, + { + "epoch": 2.378142429876324, + "grad_norm": 0.6634900569915771, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 14710 + }, + { + "epoch": 2.379759114057069, + "grad_norm": 0.839898407459259, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 14720 + }, + { + "epoch": 2.3813757982378143, + "grad_norm": 0.7578426003456116, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 14730 + }, + { + "epoch": 2.3829924824185595, + "grad_norm": 1.0213173627853394, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 14740 + }, + { + "epoch": 2.3846091665993048, + "grad_norm": 0.7855949401855469, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 14750 + }, + { + "epoch": 2.38622585078005, + "grad_norm": 0.7224128842353821, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 14760 + }, + { + "epoch": 2.3878425349607952, + "grad_norm": 0.8040381669998169, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 14770 + }, + { + "epoch": 2.389459219141541, + "grad_norm": 0.7705281376838684, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 14780 + }, + { + "epoch": 2.391075903322286, + "grad_norm": 0.667966902256012, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 14790 + }, + { + "epoch": 2.3926925875030314, + "grad_norm": 0.6611011028289795, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 14800 + }, + { + "epoch": 2.3943092716837766, + "grad_norm": 0.6862651705741882, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 14810 + }, + { + "epoch": 2.395925955864522, + "grad_norm": 0.8086010217666626, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 14820 + }, + { + "epoch": 2.397542640045267, + "grad_norm": 0.7189689874649048, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 14830 + }, + { + "epoch": 2.3991593242260123, + "grad_norm": 0.6280009150505066, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 14840 + }, + { + "epoch": 2.4007760084067575, + "grad_norm": 0.7826612591743469, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 14850 + }, + { + "epoch": 2.402392692587503, + "grad_norm": 0.7681610584259033, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 14860 + }, + { + "epoch": 2.4040093767682484, + "grad_norm": 0.720966100692749, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 14870 + }, + { + "epoch": 2.4056260609489937, + "grad_norm": 0.8202250599861145, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 14880 + }, + { + "epoch": 2.407242745129739, + "grad_norm": 0.786212682723999, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 14890 + }, + { + "epoch": 2.408859429310484, + "grad_norm": 0.6647164821624756, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 14900 + }, + { + "epoch": 2.4104761134912294, + "grad_norm": 0.7566399574279785, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 14910 + }, + { + "epoch": 2.4120927976719746, + "grad_norm": 0.748814582824707, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 14920 + }, + { + "epoch": 2.4137094818527203, + "grad_norm": 0.7624038457870483, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 14930 + }, + { + "epoch": 2.4153261660334655, + "grad_norm": 0.8267335295677185, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 14940 + }, + { + "epoch": 2.4169428502142107, + "grad_norm": 0.8785360455513, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 14950 + }, + { + "epoch": 2.418559534394956, + "grad_norm": 0.679887592792511, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 14960 + }, + { + "epoch": 2.420176218575701, + "grad_norm": 0.7218474745750427, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 14970 + }, + { + "epoch": 2.4217929027564464, + "grad_norm": 0.6342799663543701, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 14980 + }, + { + "epoch": 2.4234095869371917, + "grad_norm": 0.7098712921142578, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 14990 + }, + { + "epoch": 2.425026271117937, + "grad_norm": 0.7497431635856628, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 15000 + }, + { + "epoch": 2.4266429552986826, + "grad_norm": 0.934836208820343, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 15010 + }, + { + "epoch": 2.428259639479428, + "grad_norm": 0.8430966734886169, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 15020 + }, + { + "epoch": 2.429876323660173, + "grad_norm": 0.7032104730606079, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 15030 + }, + { + "epoch": 2.4314930078409183, + "grad_norm": 0.7746111750602722, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 15040 + }, + { + "epoch": 2.4331096920216635, + "grad_norm": 0.7661406397819519, + "learning_rate": 0.0002, + "loss": 0.7177, + "step": 15050 + }, + { + "epoch": 2.4347263762024087, + "grad_norm": 0.6941645741462708, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 15060 + }, + { + "epoch": 2.436343060383154, + "grad_norm": 0.7487249374389648, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 15070 + }, + { + "epoch": 2.4379597445638996, + "grad_norm": 0.7639912962913513, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 15080 + }, + { + "epoch": 2.439576428744645, + "grad_norm": 0.7708953619003296, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 15090 + }, + { + "epoch": 2.44119311292539, + "grad_norm": 0.9135832190513611, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 15100 + }, + { + "epoch": 2.4428097971061353, + "grad_norm": 0.8283005356788635, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 15110 + }, + { + "epoch": 2.4444264812868806, + "grad_norm": 0.925299346446991, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 15120 + }, + { + "epoch": 2.446043165467626, + "grad_norm": 0.7013528943061829, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 15130 + }, + { + "epoch": 2.447659849648371, + "grad_norm": 0.622303307056427, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 15140 + }, + { + "epoch": 2.4492765338291163, + "grad_norm": 0.876569390296936, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 15150 + }, + { + "epoch": 2.450893218009862, + "grad_norm": 0.6836351752281189, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 15160 + }, + { + "epoch": 2.452509902190607, + "grad_norm": 0.7886684536933899, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 15170 + }, + { + "epoch": 2.4541265863713524, + "grad_norm": 0.6647440791130066, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 15180 + }, + { + "epoch": 2.4557432705520976, + "grad_norm": 0.7477722764015198, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 15190 + }, + { + "epoch": 2.457359954732843, + "grad_norm": 0.8192033767700195, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 15200 + }, + { + "epoch": 2.458976638913588, + "grad_norm": 0.847537100315094, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 15210 + }, + { + "epoch": 2.4605933230943338, + "grad_norm": 0.9027776122093201, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 15220 + }, + { + "epoch": 2.462210007275079, + "grad_norm": 0.7217772006988525, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 15230 + }, + { + "epoch": 2.4638266914558242, + "grad_norm": 0.7994546294212341, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 15240 + }, + { + "epoch": 2.4654433756365695, + "grad_norm": 0.939916729927063, + "learning_rate": 0.0002, + "loss": 0.6828, + "step": 15250 + }, + { + "epoch": 2.4670600598173147, + "grad_norm": 1.0009053945541382, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 15260 + }, + { + "epoch": 2.46867674399806, + "grad_norm": 0.625555694103241, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 15270 + }, + { + "epoch": 2.470293428178805, + "grad_norm": 0.7924878597259521, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 15280 + }, + { + "epoch": 2.4719101123595504, + "grad_norm": 0.8536689877510071, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 15290 + }, + { + "epoch": 2.4735267965402956, + "grad_norm": 0.8572589755058289, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 15300 + }, + { + "epoch": 2.4751434807210413, + "grad_norm": 0.773279070854187, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 15310 + }, + { + "epoch": 2.4767601649017865, + "grad_norm": 0.7708749771118164, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 15320 + }, + { + "epoch": 2.4783768490825318, + "grad_norm": 0.770905077457428, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 15330 + }, + { + "epoch": 2.479993533263277, + "grad_norm": 0.8238571882247925, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 15340 + }, + { + "epoch": 2.481610217444022, + "grad_norm": 0.7670477032661438, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 15350 + }, + { + "epoch": 2.4832269016247674, + "grad_norm": 0.905036985874176, + "learning_rate": 0.0002, + "loss": 0.7759, + "step": 15360 + }, + { + "epoch": 2.484843585805513, + "grad_norm": 0.6672089695930481, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 15370 + }, + { + "epoch": 2.4864602699862584, + "grad_norm": 0.625095784664154, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 15380 + }, + { + "epoch": 2.4880769541670036, + "grad_norm": 0.679772675037384, + "learning_rate": 0.0002, + "loss": 0.6396, + "step": 15390 + }, + { + "epoch": 2.489693638347749, + "grad_norm": 0.711492121219635, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 15400 + }, + { + "epoch": 2.491310322528494, + "grad_norm": 0.876189112663269, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 15410 + }, + { + "epoch": 2.4929270067092393, + "grad_norm": 0.7236915230751038, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 15420 + }, + { + "epoch": 2.4945436908899845, + "grad_norm": 0.6629832983016968, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 15430 + }, + { + "epoch": 2.4961603750707297, + "grad_norm": 0.9756859540939331, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 15440 + }, + { + "epoch": 2.4977770592514754, + "grad_norm": 0.6896940469741821, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 15450 + }, + { + "epoch": 2.4993937434322206, + "grad_norm": 0.7105149626731873, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 15460 + }, + { + "epoch": 2.501010427612966, + "grad_norm": 0.8374546766281128, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 15470 + }, + { + "epoch": 2.502627111793711, + "grad_norm": 0.7320070266723633, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 15480 + }, + { + "epoch": 2.5042437959744563, + "grad_norm": 0.8306367993354797, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 15490 + }, + { + "epoch": 2.5058604801552016, + "grad_norm": 0.7472721338272095, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 15500 + }, + { + "epoch": 2.507477164335947, + "grad_norm": 0.6147692203521729, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 15510 + }, + { + "epoch": 2.5090938485166925, + "grad_norm": 0.7788505554199219, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 15520 + }, + { + "epoch": 2.5107105326974377, + "grad_norm": 0.8807527422904968, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 15530 + }, + { + "epoch": 2.512327216878183, + "grad_norm": 0.7521643042564392, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 15540 + }, + { + "epoch": 2.513943901058928, + "grad_norm": 0.6900225281715393, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 15550 + }, + { + "epoch": 2.5155605852396734, + "grad_norm": 0.6601938605308533, + "learning_rate": 0.0002, + "loss": 0.6769, + "step": 15560 + }, + { + "epoch": 2.5171772694204186, + "grad_norm": 0.8179984092712402, + "learning_rate": 0.0002, + "loss": 0.6648, + "step": 15570 + }, + { + "epoch": 2.518793953601164, + "grad_norm": 0.792556881904602, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 15580 + }, + { + "epoch": 2.520410637781909, + "grad_norm": 0.7081938982009888, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 15590 + }, + { + "epoch": 2.5220273219626543, + "grad_norm": 0.8733121156692505, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 15600 + }, + { + "epoch": 2.5236440061434, + "grad_norm": 0.7980992794036865, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 15610 + }, + { + "epoch": 2.5252606903241452, + "grad_norm": 0.883664071559906, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 15620 + }, + { + "epoch": 2.5268773745048905, + "grad_norm": 0.6963341236114502, + "learning_rate": 0.0002, + "loss": 0.7022, + "step": 15630 + }, + { + "epoch": 2.5284940586856357, + "grad_norm": 0.6433573365211487, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 15640 + }, + { + "epoch": 2.530110742866381, + "grad_norm": 0.8538183569908142, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 15650 + }, + { + "epoch": 2.5317274270471266, + "grad_norm": 0.9748201370239258, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 15660 + }, + { + "epoch": 2.533344111227872, + "grad_norm": 0.7670575380325317, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 15670 + }, + { + "epoch": 2.534960795408617, + "grad_norm": 0.8738890290260315, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 15680 + }, + { + "epoch": 2.5365774795893623, + "grad_norm": 0.8391636610031128, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 15690 + }, + { + "epoch": 2.5381941637701075, + "grad_norm": 0.7239366769790649, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 15700 + }, + { + "epoch": 2.5398108479508528, + "grad_norm": 0.8498379588127136, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 15710 + }, + { + "epoch": 2.541427532131598, + "grad_norm": 0.8029484152793884, + "learning_rate": 0.0002, + "loss": 0.6998, + "step": 15720 + }, + { + "epoch": 2.5430442163123432, + "grad_norm": 1.0639333724975586, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 15730 + }, + { + "epoch": 2.5446609004930885, + "grad_norm": 0.6401297450065613, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 15740 + }, + { + "epoch": 2.5462775846738337, + "grad_norm": 0.7123814821243286, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 15750 + }, + { + "epoch": 2.5478942688545794, + "grad_norm": 0.7874974608421326, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 15760 + }, + { + "epoch": 2.5495109530353246, + "grad_norm": 0.8046808838844299, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 15770 + }, + { + "epoch": 2.55112763721607, + "grad_norm": 0.7888661623001099, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 15780 + }, + { + "epoch": 2.552744321396815, + "grad_norm": 0.8445866107940674, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 15790 + }, + { + "epoch": 2.5543610055775603, + "grad_norm": 0.7475846409797668, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 15800 + }, + { + "epoch": 2.555977689758306, + "grad_norm": 0.7455102801322937, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 15810 + }, + { + "epoch": 2.557594373939051, + "grad_norm": 0.8226983547210693, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 15820 + }, + { + "epoch": 2.5592110581197964, + "grad_norm": 0.8920368552207947, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 15830 + }, + { + "epoch": 2.5608277423005417, + "grad_norm": 0.8413904905319214, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 15840 + }, + { + "epoch": 2.562444426481287, + "grad_norm": 0.8483649492263794, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 15850 + }, + { + "epoch": 2.564061110662032, + "grad_norm": 0.5923284292221069, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 15860 + }, + { + "epoch": 2.5656777948427774, + "grad_norm": 0.8518726229667664, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 15870 + }, + { + "epoch": 2.5672944790235226, + "grad_norm": 0.731235146522522, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 15880 + }, + { + "epoch": 2.568911163204268, + "grad_norm": 0.7517194151878357, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 15890 + }, + { + "epoch": 2.5705278473850135, + "grad_norm": 0.8378692269325256, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 15900 + }, + { + "epoch": 2.5721445315657587, + "grad_norm": 0.843701958656311, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 15910 + }, + { + "epoch": 2.573761215746504, + "grad_norm": 0.7254629731178284, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 15920 + }, + { + "epoch": 2.575377899927249, + "grad_norm": 0.8863335847854614, + "learning_rate": 0.0002, + "loss": 0.6872, + "step": 15930 + }, + { + "epoch": 2.5769945841079944, + "grad_norm": 0.7675097584724426, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 15940 + }, + { + "epoch": 2.5786112682887397, + "grad_norm": 0.82063889503479, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 15950 + }, + { + "epoch": 2.5802279524694853, + "grad_norm": 0.7729717493057251, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 15960 + }, + { + "epoch": 2.5818446366502306, + "grad_norm": 0.8301846981048584, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 15970 + }, + { + "epoch": 2.583461320830976, + "grad_norm": 0.7906861305236816, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 15980 + }, + { + "epoch": 2.585078005011721, + "grad_norm": 0.6749057173728943, + "learning_rate": 0.0002, + "loss": 0.7175, + "step": 15990 + }, + { + "epoch": 2.5866946891924663, + "grad_norm": 0.9386842846870422, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 16000 + }, + { + "epoch": 2.5883113733732115, + "grad_norm": 0.7868891358375549, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 16010 + }, + { + "epoch": 2.5899280575539567, + "grad_norm": 0.8674671053886414, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 16020 + }, + { + "epoch": 2.591544741734702, + "grad_norm": 0.7043559551239014, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 16030 + }, + { + "epoch": 2.593161425915447, + "grad_norm": 0.5846083760261536, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 16040 + }, + { + "epoch": 2.594778110096193, + "grad_norm": 0.7323982119560242, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 16050 + }, + { + "epoch": 2.596394794276938, + "grad_norm": 0.9069556593894958, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 16060 + }, + { + "epoch": 2.5980114784576833, + "grad_norm": 0.7522736191749573, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 16070 + }, + { + "epoch": 2.5996281626384286, + "grad_norm": 0.8149648308753967, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 16080 + }, + { + "epoch": 2.601244846819174, + "grad_norm": 0.6214233040809631, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 16090 + }, + { + "epoch": 2.602861530999919, + "grad_norm": 0.6803743839263916, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 16100 + }, + { + "epoch": 2.6044782151806647, + "grad_norm": 0.7223997116088867, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 16110 + }, + { + "epoch": 2.60609489936141, + "grad_norm": 0.7324174642562866, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 16120 + }, + { + "epoch": 2.607711583542155, + "grad_norm": 0.9594739675521851, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 16130 + }, + { + "epoch": 2.6093282677229004, + "grad_norm": 0.9485327005386353, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 16140 + }, + { + "epoch": 2.6109449519036456, + "grad_norm": 0.8449000120162964, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 16150 + }, + { + "epoch": 2.612561636084391, + "grad_norm": 0.8520140051841736, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 16160 + }, + { + "epoch": 2.614178320265136, + "grad_norm": 0.7456524968147278, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 16170 + }, + { + "epoch": 2.6157950044458813, + "grad_norm": 0.9912857413291931, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 16180 + }, + { + "epoch": 2.6174116886266265, + "grad_norm": 0.9001946449279785, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 16190 + }, + { + "epoch": 2.619028372807372, + "grad_norm": 0.6568667888641357, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 16200 + }, + { + "epoch": 2.6206450569881174, + "grad_norm": 1.0248128175735474, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 16210 + }, + { + "epoch": 2.6222617411688627, + "grad_norm": 0.6509039998054504, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 16220 + }, + { + "epoch": 2.623878425349608, + "grad_norm": 0.7626351118087769, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 16230 + }, + { + "epoch": 2.625495109530353, + "grad_norm": 0.6938552260398865, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 16240 + }, + { + "epoch": 2.6271117937110984, + "grad_norm": 0.6434680819511414, + "learning_rate": 0.0002, + "loss": 0.65, + "step": 16250 + }, + { + "epoch": 2.628728477891844, + "grad_norm": 0.7111515998840332, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 16260 + }, + { + "epoch": 2.6303451620725893, + "grad_norm": 0.7712395787239075, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 16270 + }, + { + "epoch": 2.6319618462533345, + "grad_norm": 0.792209267616272, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 16280 + }, + { + "epoch": 2.6335785304340797, + "grad_norm": 0.6801066398620605, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 16290 + }, + { + "epoch": 2.635195214614825, + "grad_norm": 0.7802573442459106, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 16300 + }, + { + "epoch": 2.63681189879557, + "grad_norm": 0.7742244601249695, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 16310 + }, + { + "epoch": 2.6384285829763154, + "grad_norm": 0.664184033870697, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 16320 + }, + { + "epoch": 2.6400452671570607, + "grad_norm": 0.9242228865623474, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 16330 + }, + { + "epoch": 2.641661951337806, + "grad_norm": 0.9661325216293335, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 16340 + }, + { + "epoch": 2.6432786355185516, + "grad_norm": 0.837526798248291, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 16350 + }, + { + "epoch": 2.644895319699297, + "grad_norm": 1.1834373474121094, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 16360 + }, + { + "epoch": 2.646512003880042, + "grad_norm": 0.7467831373214722, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 16370 + }, + { + "epoch": 2.6481286880607873, + "grad_norm": 0.8627146482467651, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 16380 + }, + { + "epoch": 2.6497453722415325, + "grad_norm": 0.790447473526001, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 16390 + }, + { + "epoch": 2.651362056422278, + "grad_norm": 0.8447365164756775, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 16400 + }, + { + "epoch": 2.6529787406030234, + "grad_norm": 0.7831417918205261, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 16410 + }, + { + "epoch": 2.6545954247837686, + "grad_norm": 0.6837952136993408, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 16420 + }, + { + "epoch": 2.656212108964514, + "grad_norm": 0.7031801342964172, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 16430 + }, + { + "epoch": 2.657828793145259, + "grad_norm": 0.8963770866394043, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 16440 + }, + { + "epoch": 2.6594454773260043, + "grad_norm": 0.6852328181266785, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 16450 + }, + { + "epoch": 2.6610621615067496, + "grad_norm": 0.8069294095039368, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 16460 + }, + { + "epoch": 2.662678845687495, + "grad_norm": 0.7503686547279358, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 16470 + }, + { + "epoch": 2.66429552986824, + "grad_norm": 0.6430956125259399, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 16480 + }, + { + "epoch": 2.6659122140489853, + "grad_norm": 0.7894312739372253, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 16490 + }, + { + "epoch": 2.667528898229731, + "grad_norm": 0.7277431488037109, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 16500 + }, + { + "epoch": 2.669145582410476, + "grad_norm": 0.6816153526306152, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 16510 + }, + { + "epoch": 2.6707622665912214, + "grad_norm": 0.8145235776901245, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 16520 + }, + { + "epoch": 2.6723789507719666, + "grad_norm": 0.8645890355110168, + "learning_rate": 0.0002, + "loss": 0.709, + "step": 16530 + }, + { + "epoch": 2.673995634952712, + "grad_norm": 0.704393208026886, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 16540 + }, + { + "epoch": 2.6756123191334575, + "grad_norm": 1.0120846033096313, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 16550 + }, + { + "epoch": 2.6772290033142028, + "grad_norm": 0.6919328570365906, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 16560 + }, + { + "epoch": 2.678845687494948, + "grad_norm": 0.6924574971199036, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 16570 + }, + { + "epoch": 2.6804623716756932, + "grad_norm": 0.9679301381111145, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 16580 + }, + { + "epoch": 2.6820790558564385, + "grad_norm": 0.6810211539268494, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 16590 + }, + { + "epoch": 2.6836957400371837, + "grad_norm": 0.9730555415153503, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 16600 + }, + { + "epoch": 2.685312424217929, + "grad_norm": 0.7852821350097656, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 16610 + }, + { + "epoch": 2.686929108398674, + "grad_norm": 0.6059057116508484, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 16620 + }, + { + "epoch": 2.6885457925794194, + "grad_norm": 0.9395958781242371, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 16630 + }, + { + "epoch": 2.690162476760165, + "grad_norm": 0.7473729848861694, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 16640 + }, + { + "epoch": 2.6917791609409103, + "grad_norm": 0.765934407711029, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 16650 + }, + { + "epoch": 2.6933958451216555, + "grad_norm": 0.8496677279472351, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 16660 + }, + { + "epoch": 2.6950125293024008, + "grad_norm": 0.7641879916191101, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 16670 + }, + { + "epoch": 2.696629213483146, + "grad_norm": 0.8471952676773071, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 16680 + }, + { + "epoch": 2.6982458976638912, + "grad_norm": 0.6946060657501221, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 16690 + }, + { + "epoch": 2.699862581844637, + "grad_norm": 0.7361312508583069, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 16700 + }, + { + "epoch": 2.701479266025382, + "grad_norm": 0.6605038046836853, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 16710 + }, + { + "epoch": 2.7030959502061274, + "grad_norm": 0.7164411544799805, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 16720 + }, + { + "epoch": 2.7047126343868726, + "grad_norm": 0.6496201157569885, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 16730 + }, + { + "epoch": 2.706329318567618, + "grad_norm": 0.7826663851737976, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 16740 + }, + { + "epoch": 2.707946002748363, + "grad_norm": 0.7639131546020508, + "learning_rate": 0.0002, + "loss": 0.6773, + "step": 16750 + }, + { + "epoch": 2.7095626869291083, + "grad_norm": 0.7976210713386536, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 16760 + }, + { + "epoch": 2.7111793711098535, + "grad_norm": 0.6836577653884888, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 16770 + }, + { + "epoch": 2.7127960552905988, + "grad_norm": 0.8025202751159668, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 16780 + }, + { + "epoch": 2.7144127394713444, + "grad_norm": 0.7636463642120361, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 16790 + }, + { + "epoch": 2.7160294236520897, + "grad_norm": 0.7481677532196045, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 16800 + }, + { + "epoch": 2.717646107832835, + "grad_norm": 0.7566834688186646, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 16810 + }, + { + "epoch": 2.71926279201358, + "grad_norm": 0.7931267619132996, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 16820 + }, + { + "epoch": 2.7208794761943254, + "grad_norm": 0.8811662197113037, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 16830 + }, + { + "epoch": 2.7224961603750706, + "grad_norm": 0.8561240434646606, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 16840 + }, + { + "epoch": 2.7241128445558163, + "grad_norm": 0.7121599316596985, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 16850 + }, + { + "epoch": 2.7257295287365615, + "grad_norm": 0.8066257238388062, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 16860 + }, + { + "epoch": 2.7273462129173067, + "grad_norm": 0.7699271440505981, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 16870 + }, + { + "epoch": 2.728962897098052, + "grad_norm": 1.1828432083129883, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 16880 + }, + { + "epoch": 2.730579581278797, + "grad_norm": 0.9989302754402161, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 16890 + }, + { + "epoch": 2.7321962654595424, + "grad_norm": 0.8100560307502747, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 16900 + }, + { + "epoch": 2.7338129496402876, + "grad_norm": 0.8615233898162842, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 16910 + }, + { + "epoch": 2.735429633821033, + "grad_norm": 0.8633756041526794, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 16920 + }, + { + "epoch": 2.737046318001778, + "grad_norm": 0.7769348621368408, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 16930 + }, + { + "epoch": 2.738663002182524, + "grad_norm": 0.6943058371543884, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 16940 + }, + { + "epoch": 2.740279686363269, + "grad_norm": 0.8510736227035522, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 16950 + }, + { + "epoch": 2.7418963705440142, + "grad_norm": 0.7732602953910828, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 16960 + }, + { + "epoch": 2.7435130547247595, + "grad_norm": 0.5981788635253906, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 16970 + }, + { + "epoch": 2.7451297389055047, + "grad_norm": 0.7604416012763977, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 16980 + }, + { + "epoch": 2.74674642308625, + "grad_norm": 0.7377738356590271, + "learning_rate": 0.0002, + "loss": 0.6981, + "step": 16990 + }, + { + "epoch": 2.7483631072669956, + "grad_norm": 0.9400289058685303, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 17000 + }, + { + "epoch": 2.749979791447741, + "grad_norm": 0.6340599656105042, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 17010 + }, + { + "epoch": 2.751596475628486, + "grad_norm": 0.7297601103782654, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 17020 + }, + { + "epoch": 2.7532131598092313, + "grad_norm": 0.9479979872703552, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 17030 + }, + { + "epoch": 2.7548298439899765, + "grad_norm": 0.8461511135101318, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 17040 + }, + { + "epoch": 2.7564465281707218, + "grad_norm": 0.7477551698684692, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 17050 + }, + { + "epoch": 2.758063212351467, + "grad_norm": 1.019270420074463, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 17060 + }, + { + "epoch": 2.7596798965322122, + "grad_norm": 0.7730235457420349, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 17070 + }, + { + "epoch": 2.7612965807129575, + "grad_norm": 0.8216866254806519, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 17080 + }, + { + "epoch": 2.762913264893703, + "grad_norm": 0.7235931754112244, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 17090 + }, + { + "epoch": 2.7645299490744484, + "grad_norm": 0.7352296710014343, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 17100 + }, + { + "epoch": 2.7661466332551936, + "grad_norm": 0.8129373788833618, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 17110 + }, + { + "epoch": 2.767763317435939, + "grad_norm": 0.7387019991874695, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 17120 + }, + { + "epoch": 2.769380001616684, + "grad_norm": 0.9149190187454224, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 17130 + }, + { + "epoch": 2.7709966857974297, + "grad_norm": 0.7352971434593201, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 17140 + }, + { + "epoch": 2.772613369978175, + "grad_norm": 0.7903780341148376, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 17150 + }, + { + "epoch": 2.77423005415892, + "grad_norm": 0.8255927562713623, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 17160 + }, + { + "epoch": 2.7758467383396654, + "grad_norm": 0.7235927581787109, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 17170 + }, + { + "epoch": 2.7774634225204107, + "grad_norm": 0.8281434774398804, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 17180 + }, + { + "epoch": 2.779080106701156, + "grad_norm": 0.7586921453475952, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 17190 + }, + { + "epoch": 2.780696790881901, + "grad_norm": 0.7161715030670166, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 17200 + }, + { + "epoch": 2.7823134750626464, + "grad_norm": 0.762868344783783, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 17210 + }, + { + "epoch": 2.7839301592433916, + "grad_norm": 0.9285483360290527, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 17220 + }, + { + "epoch": 2.785546843424137, + "grad_norm": 0.6900462508201599, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 17230 + }, + { + "epoch": 2.7871635276048825, + "grad_norm": 0.780384361743927, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 17240 + }, + { + "epoch": 2.7887802117856277, + "grad_norm": 0.7580406665802002, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 17250 + }, + { + "epoch": 2.790396895966373, + "grad_norm": 0.8145199418067932, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 17260 + }, + { + "epoch": 2.792013580147118, + "grad_norm": 0.9159596562385559, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 17270 + }, + { + "epoch": 2.7936302643278634, + "grad_norm": 0.9590014219284058, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 17280 + }, + { + "epoch": 2.795246948508609, + "grad_norm": 0.7603529691696167, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 17290 + }, + { + "epoch": 2.7968636326893543, + "grad_norm": 0.8039976358413696, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 17300 + }, + { + "epoch": 2.7984803168700996, + "grad_norm": 0.8364847302436829, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 17310 + }, + { + "epoch": 2.800097001050845, + "grad_norm": 0.8763046860694885, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 17320 + }, + { + "epoch": 2.80171368523159, + "grad_norm": 0.8409647941589355, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 17330 + }, + { + "epoch": 2.8033303694123353, + "grad_norm": 0.7649006247520447, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 17340 + }, + { + "epoch": 2.8049470535930805, + "grad_norm": 0.7970262169837952, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 17350 + }, + { + "epoch": 2.8065637377738257, + "grad_norm": 0.9088607430458069, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 17360 + }, + { + "epoch": 2.808180421954571, + "grad_norm": 0.6454846858978271, + "learning_rate": 0.0002, + "loss": 0.675, + "step": 17370 + }, + { + "epoch": 2.809797106135316, + "grad_norm": 0.7744787931442261, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 17380 + }, + { + "epoch": 2.811413790316062, + "grad_norm": 0.6678640842437744, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 17390 + }, + { + "epoch": 2.813030474496807, + "grad_norm": 0.772676944732666, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 17400 + }, + { + "epoch": 2.8146471586775523, + "grad_norm": 0.7088175415992737, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 17410 + }, + { + "epoch": 2.8162638428582976, + "grad_norm": 0.8280573487281799, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 17420 + }, + { + "epoch": 2.817880527039043, + "grad_norm": 0.6665388345718384, + "learning_rate": 0.0002, + "loss": 0.6732, + "step": 17430 + }, + { + "epoch": 2.8194972112197885, + "grad_norm": 0.6427883505821228, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 17440 + }, + { + "epoch": 2.8211138954005337, + "grad_norm": 0.9697760343551636, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 17450 + }, + { + "epoch": 2.822730579581279, + "grad_norm": 0.7573966383934021, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 17460 + }, + { + "epoch": 2.824347263762024, + "grad_norm": 0.878688633441925, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 17470 + }, + { + "epoch": 2.8259639479427694, + "grad_norm": 0.7752242684364319, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 17480 + }, + { + "epoch": 2.8275806321235146, + "grad_norm": 0.6135398745536804, + "learning_rate": 0.0002, + "loss": 0.6638, + "step": 17490 + }, + { + "epoch": 2.82919731630426, + "grad_norm": 0.6924924850463867, + "learning_rate": 0.0002, + "loss": 0.6829, + "step": 17500 + }, + { + "epoch": 2.830814000485005, + "grad_norm": 0.7471627593040466, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 17510 + }, + { + "epoch": 2.8324306846657503, + "grad_norm": 0.7145499587059021, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 17520 + }, + { + "epoch": 2.834047368846496, + "grad_norm": 0.7415414452552795, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 17530 + }, + { + "epoch": 2.8356640530272412, + "grad_norm": 0.7328441739082336, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 17540 + }, + { + "epoch": 2.8372807372079865, + "grad_norm": 0.8267839550971985, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 17550 + }, + { + "epoch": 2.8388974213887317, + "grad_norm": 0.8877885341644287, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 17560 + }, + { + "epoch": 2.840514105569477, + "grad_norm": 0.857138454914093, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 17570 + }, + { + "epoch": 2.842130789750222, + "grad_norm": 0.8470779657363892, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 17580 + }, + { + "epoch": 2.843747473930968, + "grad_norm": 0.8553254008293152, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 17590 + }, + { + "epoch": 2.845364158111713, + "grad_norm": 0.8033196926116943, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 17600 + }, + { + "epoch": 2.8469808422924583, + "grad_norm": 0.7949087023735046, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 17610 + }, + { + "epoch": 2.8485975264732035, + "grad_norm": 0.9241406321525574, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 17620 + }, + { + "epoch": 2.8502142106539488, + "grad_norm": 0.7721285223960876, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 17630 + }, + { + "epoch": 2.851830894834694, + "grad_norm": 1.0246692895889282, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 17640 + }, + { + "epoch": 2.853447579015439, + "grad_norm": 0.9244589805603027, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 17650 + }, + { + "epoch": 2.8550642631961844, + "grad_norm": 0.7243508696556091, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 17660 + }, + { + "epoch": 2.8566809473769297, + "grad_norm": 0.8943371176719666, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 17670 + }, + { + "epoch": 2.8582976315576754, + "grad_norm": 0.6531758904457092, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 17680 + }, + { + "epoch": 2.8599143157384206, + "grad_norm": 0.8367000818252563, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 17690 + }, + { + "epoch": 2.861530999919166, + "grad_norm": 0.7868556380271912, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 17700 + }, + { + "epoch": 2.863147684099911, + "grad_norm": 0.7213859558105469, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 17710 + }, + { + "epoch": 2.8647643682806563, + "grad_norm": 0.7383931279182434, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 17720 + }, + { + "epoch": 2.8663810524614015, + "grad_norm": 0.7566812634468079, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 17730 + }, + { + "epoch": 2.867997736642147, + "grad_norm": 0.6930373311042786, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 17740 + }, + { + "epoch": 2.8696144208228924, + "grad_norm": 0.7911090850830078, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 17750 + }, + { + "epoch": 2.8712311050036377, + "grad_norm": 0.8484548926353455, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 17760 + }, + { + "epoch": 2.872847789184383, + "grad_norm": 0.7647597193717957, + "learning_rate": 0.0002, + "loss": 0.6931, + "step": 17770 + }, + { + "epoch": 2.874464473365128, + "grad_norm": 0.8791151642799377, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 17780 + }, + { + "epoch": 2.8760811575458733, + "grad_norm": 0.7253178358078003, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 17790 + }, + { + "epoch": 2.8776978417266186, + "grad_norm": 0.7956077456474304, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 17800 + }, + { + "epoch": 2.879314525907364, + "grad_norm": 0.8657688498497009, + "learning_rate": 0.0002, + "loss": 0.6687, + "step": 17810 + }, + { + "epoch": 2.880931210088109, + "grad_norm": 0.7059141993522644, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 17820 + }, + { + "epoch": 2.8825478942688547, + "grad_norm": 0.8886896967887878, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 17830 + }, + { + "epoch": 2.8841645784496, + "grad_norm": 0.821032702922821, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 17840 + }, + { + "epoch": 2.885781262630345, + "grad_norm": 0.7183963656425476, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 17850 + }, + { + "epoch": 2.8873979468110904, + "grad_norm": 0.6222899556159973, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 17860 + }, + { + "epoch": 2.8890146309918356, + "grad_norm": 0.8187434077262878, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 17870 + }, + { + "epoch": 2.890631315172581, + "grad_norm": 0.9838479161262512, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 17880 + }, + { + "epoch": 2.8922479993533265, + "grad_norm": 0.7567742466926575, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 17890 + }, + { + "epoch": 2.893864683534072, + "grad_norm": 0.6875903606414795, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 17900 + }, + { + "epoch": 2.895481367714817, + "grad_norm": 0.8043789267539978, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 17910 + }, + { + "epoch": 2.8970980518955622, + "grad_norm": 0.8062626719474792, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 17920 + }, + { + "epoch": 2.8987147360763075, + "grad_norm": 1.0251191854476929, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 17930 + }, + { + "epoch": 2.9003314202570527, + "grad_norm": 0.882253110408783, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 17940 + }, + { + "epoch": 2.901948104437798, + "grad_norm": 0.8683299422264099, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 17950 + }, + { + "epoch": 2.903564788618543, + "grad_norm": 0.7167282104492188, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17960 + }, + { + "epoch": 2.9051814727992884, + "grad_norm": 0.7093694806098938, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 17970 + }, + { + "epoch": 2.906798156980034, + "grad_norm": 0.8549879193305969, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 17980 + }, + { + "epoch": 2.9084148411607793, + "grad_norm": 0.6989606618881226, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 17990 + }, + { + "epoch": 2.9100315253415245, + "grad_norm": 0.9482976794242859, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 18000 + }, + { + "epoch": 2.9116482095222698, + "grad_norm": 0.7182440161705017, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 18010 + }, + { + "epoch": 2.913264893703015, + "grad_norm": 0.7732226252555847, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 18020 + }, + { + "epoch": 2.9148815778837607, + "grad_norm": 0.7936875224113464, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 18030 + }, + { + "epoch": 2.916498262064506, + "grad_norm": 0.8825615644454956, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 18040 + }, + { + "epoch": 2.918114946245251, + "grad_norm": 0.6778587102890015, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 18050 + }, + { + "epoch": 2.9197316304259964, + "grad_norm": 0.7529265880584717, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 18060 + }, + { + "epoch": 2.9213483146067416, + "grad_norm": 0.7111883163452148, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 18070 + }, + { + "epoch": 2.922964998787487, + "grad_norm": 0.7214767932891846, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 18080 + }, + { + "epoch": 2.924581682968232, + "grad_norm": 0.800417423248291, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 18090 + }, + { + "epoch": 2.9261983671489773, + "grad_norm": 1.248575210571289, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 18100 + }, + { + "epoch": 2.9278150513297225, + "grad_norm": 0.757788360118866, + "learning_rate": 0.0002, + "loss": 0.7004, + "step": 18110 + }, + { + "epoch": 2.9294317355104678, + "grad_norm": 1.0583995580673218, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 18120 + }, + { + "epoch": 2.9310484196912134, + "grad_norm": 0.8228777647018433, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 18130 + }, + { + "epoch": 2.9326651038719587, + "grad_norm": 0.8374035358428955, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 18140 + }, + { + "epoch": 2.934281788052704, + "grad_norm": 0.7976473569869995, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 18150 + }, + { + "epoch": 2.935898472233449, + "grad_norm": 0.8009907603263855, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 18160 + }, + { + "epoch": 2.9375151564141944, + "grad_norm": 0.835213303565979, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 18170 + }, + { + "epoch": 2.93913184059494, + "grad_norm": 0.7982219457626343, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 18180 + }, + { + "epoch": 2.9407485247756853, + "grad_norm": 0.7070978879928589, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 18190 + }, + { + "epoch": 2.9423652089564305, + "grad_norm": 0.8619440197944641, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 18200 + }, + { + "epoch": 2.9439818931371757, + "grad_norm": 0.6693987250328064, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 18210 + }, + { + "epoch": 2.945598577317921, + "grad_norm": 0.6747021079063416, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 18220 + }, + { + "epoch": 2.947215261498666, + "grad_norm": 0.860387921333313, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 18230 + }, + { + "epoch": 2.9488319456794114, + "grad_norm": 0.799976646900177, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 18240 + }, + { + "epoch": 2.9504486298601567, + "grad_norm": 0.7864769101142883, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 18250 + }, + { + "epoch": 2.952065314040902, + "grad_norm": 0.6713884472846985, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 18260 + }, + { + "epoch": 2.9536819982216476, + "grad_norm": 0.9031508564949036, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 18270 + }, + { + "epoch": 2.955298682402393, + "grad_norm": 0.7205073237419128, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 18280 + }, + { + "epoch": 2.956915366583138, + "grad_norm": 0.7746205925941467, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 18290 + }, + { + "epoch": 2.9585320507638833, + "grad_norm": 0.6533427834510803, + "learning_rate": 0.0002, + "loss": 0.6218, + "step": 18300 + }, + { + "epoch": 2.9601487349446285, + "grad_norm": 0.9083208441734314, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 18310 + }, + { + "epoch": 2.9617654191253737, + "grad_norm": 0.7446991801261902, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 18320 + }, + { + "epoch": 2.9633821033061194, + "grad_norm": 0.6514461636543274, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 18330 + }, + { + "epoch": 2.9649987874868646, + "grad_norm": 0.8580465912818909, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 18340 + }, + { + "epoch": 2.96661547166761, + "grad_norm": 0.7074266076087952, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 18350 + }, + { + "epoch": 2.968232155848355, + "grad_norm": 0.899892270565033, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 18360 + }, + { + "epoch": 2.9698488400291003, + "grad_norm": 0.8217641711235046, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 18370 + }, + { + "epoch": 2.9714655242098456, + "grad_norm": 0.8611799478530884, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 18380 + }, + { + "epoch": 2.973082208390591, + "grad_norm": 0.6909302473068237, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 18390 + }, + { + "epoch": 2.974698892571336, + "grad_norm": 0.6554358005523682, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 18400 + }, + { + "epoch": 2.9763155767520812, + "grad_norm": 0.7803071737289429, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 18410 + }, + { + "epoch": 2.977932260932827, + "grad_norm": 0.7838954925537109, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 18420 + }, + { + "epoch": 2.979548945113572, + "grad_norm": 0.7098495364189148, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 18430 + }, + { + "epoch": 2.9811656292943174, + "grad_norm": 0.8981785774230957, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 18440 + }, + { + "epoch": 2.9827823134750626, + "grad_norm": 0.7197171449661255, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 18450 + }, + { + "epoch": 2.984398997655808, + "grad_norm": 0.793185293674469, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 18460 + }, + { + "epoch": 2.986015681836553, + "grad_norm": 0.8531473875045776, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 18470 + }, + { + "epoch": 2.9876323660172988, + "grad_norm": 0.6627361178398132, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 18480 + }, + { + "epoch": 2.989249050198044, + "grad_norm": 0.5708155035972595, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 18490 + }, + { + "epoch": 2.990865734378789, + "grad_norm": 0.8227280378341675, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 18500 + }, + { + "epoch": 2.9924824185595345, + "grad_norm": 0.7102749943733215, + "learning_rate": 0.0002, + "loss": 0.6701, + "step": 18510 + }, + { + "epoch": 2.9940991027402797, + "grad_norm": 0.839485228061676, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 18520 + }, + { + "epoch": 2.995715786921025, + "grad_norm": 0.9038704037666321, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 18530 + }, + { + "epoch": 2.99733247110177, + "grad_norm": 0.8737510442733765, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 18540 + }, + { + "epoch": 2.9989491552825154, + "grad_norm": 0.7323142886161804, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 18550 + }, + { + "epoch": 2.9999191657909625, + "eval_loss": 1.1262480020523071, + "eval_runtime": 122.0868, + "eval_samples_per_second": 6.004, + "eval_steps_per_second": 0.754, + "step": 18556 + }, + { + "epoch": 3.000565839463261, + "grad_norm": 0.8465463519096375, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 18560 + }, + { + "epoch": 3.0021825236440063, + "grad_norm": 0.9134138822555542, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 18570 + }, + { + "epoch": 3.0037992078247515, + "grad_norm": 0.760715126991272, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 18580 + }, + { + "epoch": 3.0054158920054967, + "grad_norm": 0.9208743572235107, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 18590 + }, + { + "epoch": 3.007032576186242, + "grad_norm": 0.9232364892959595, + "learning_rate": 0.0002, + "loss": 0.5731, + "step": 18600 + }, + { + "epoch": 3.008649260366987, + "grad_norm": 1.1881544589996338, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 18610 + }, + { + "epoch": 3.0102659445477324, + "grad_norm": 0.9372987747192383, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 18620 + }, + { + "epoch": 3.0118826287284777, + "grad_norm": 0.6900241374969482, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 18630 + }, + { + "epoch": 3.0134993129092233, + "grad_norm": 0.8451071381568909, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 18640 + }, + { + "epoch": 3.0151159970899686, + "grad_norm": 0.7763112187385559, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 18650 + }, + { + "epoch": 3.016732681270714, + "grad_norm": 1.043653964996338, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 18660 + }, + { + "epoch": 3.018349365451459, + "grad_norm": 1.0170660018920898, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 18670 + }, + { + "epoch": 3.0199660496322043, + "grad_norm": 0.7534180283546448, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 18680 + }, + { + "epoch": 3.0215827338129495, + "grad_norm": 0.7507367730140686, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 18690 + }, + { + "epoch": 3.0231994179936947, + "grad_norm": 0.7861620187759399, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 18700 + }, + { + "epoch": 3.0248161021744404, + "grad_norm": 1.0580339431762695, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 18710 + }, + { + "epoch": 3.0264327863551856, + "grad_norm": 0.7542710900306702, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 18720 + }, + { + "epoch": 3.028049470535931, + "grad_norm": 0.8189544677734375, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 18730 + }, + { + "epoch": 3.029666154716676, + "grad_norm": 0.9126611351966858, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 18740 + }, + { + "epoch": 3.0312828388974213, + "grad_norm": 0.8891341686248779, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 18750 + }, + { + "epoch": 3.0328995230781666, + "grad_norm": 0.8419283032417297, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 18760 + }, + { + "epoch": 3.034516207258912, + "grad_norm": 0.8048048615455627, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 18770 + }, + { + "epoch": 3.0361328914396575, + "grad_norm": 0.7820217609405518, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 18780 + }, + { + "epoch": 3.0377495756204027, + "grad_norm": 0.854721188545227, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 18790 + }, + { + "epoch": 3.039366259801148, + "grad_norm": 0.912092924118042, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 18800 + }, + { + "epoch": 3.040982943981893, + "grad_norm": 0.6596226096153259, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 18810 + }, + { + "epoch": 3.0425996281626384, + "grad_norm": 0.6351348757743835, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 18820 + }, + { + "epoch": 3.0442163123433836, + "grad_norm": 0.778188943862915, + "learning_rate": 0.0002, + "loss": 0.5953, + "step": 18830 + }, + { + "epoch": 3.045832996524129, + "grad_norm": 0.68234783411026, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 18840 + }, + { + "epoch": 3.047449680704874, + "grad_norm": 0.998628556728363, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 18850 + }, + { + "epoch": 3.0490663648856198, + "grad_norm": 0.7393841743469238, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 18860 + }, + { + "epoch": 3.050683049066365, + "grad_norm": 0.84438556432724, + "learning_rate": 0.0002, + "loss": 0.568, + "step": 18870 + }, + { + "epoch": 3.0522997332471102, + "grad_norm": 0.8857501745223999, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 18880 + }, + { + "epoch": 3.0539164174278555, + "grad_norm": 0.7208474278450012, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 18890 + }, + { + "epoch": 3.0555331016086007, + "grad_norm": 0.7135229110717773, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 18900 + }, + { + "epoch": 3.057149785789346, + "grad_norm": 0.9130001664161682, + "learning_rate": 0.0002, + "loss": 0.5575, + "step": 18910 + }, + { + "epoch": 3.058766469970091, + "grad_norm": 0.9001716375350952, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 18920 + }, + { + "epoch": 3.060383154150837, + "grad_norm": 0.8667559623718262, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 18930 + }, + { + "epoch": 3.061999838331582, + "grad_norm": 0.8943959474563599, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 18940 + }, + { + "epoch": 3.0636165225123273, + "grad_norm": 0.8298377990722656, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 18950 + }, + { + "epoch": 3.0652332066930725, + "grad_norm": 0.7935267686843872, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 18960 + }, + { + "epoch": 3.0668498908738178, + "grad_norm": 1.1506379842758179, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 18970 + }, + { + "epoch": 3.068466575054563, + "grad_norm": 0.7693049907684326, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 18980 + }, + { + "epoch": 3.0700832592353082, + "grad_norm": 0.8040135502815247, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 18990 + }, + { + "epoch": 3.0716999434160535, + "grad_norm": 0.828404426574707, + "learning_rate": 0.0002, + "loss": 0.5541, + "step": 19000 + }, + { + "epoch": 3.073316627596799, + "grad_norm": 0.8811164498329163, + "learning_rate": 0.0002, + "loss": 0.6048, + "step": 19010 + }, + { + "epoch": 3.0749333117775444, + "grad_norm": 1.036205768585205, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 19020 + }, + { + "epoch": 3.0765499959582896, + "grad_norm": 0.8857285976409912, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 19030 + }, + { + "epoch": 3.078166680139035, + "grad_norm": 0.8392079472541809, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 19040 + }, + { + "epoch": 3.07978336431978, + "grad_norm": 1.0287401676177979, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 19050 + }, + { + "epoch": 3.0814000485005253, + "grad_norm": 1.0086315870285034, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 19060 + }, + { + "epoch": 3.0830167326812705, + "grad_norm": 0.9245324730873108, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 19070 + }, + { + "epoch": 3.084633416862016, + "grad_norm": 0.8680877089500427, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 19080 + }, + { + "epoch": 3.0862501010427614, + "grad_norm": 0.8814793825149536, + "learning_rate": 0.0002, + "loss": 0.5611, + "step": 19090 + }, + { + "epoch": 3.0878667852235067, + "grad_norm": 0.9234458208084106, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 19100 + }, + { + "epoch": 3.089483469404252, + "grad_norm": 1.1291664838790894, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 19110 + }, + { + "epoch": 3.091100153584997, + "grad_norm": 0.9191402792930603, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 19120 + }, + { + "epoch": 3.0927168377657424, + "grad_norm": 0.7103154063224792, + "learning_rate": 0.0002, + "loss": 0.5856, + "step": 19130 + }, + { + "epoch": 3.0943335219464876, + "grad_norm": 0.9368883967399597, + "learning_rate": 0.0002, + "loss": 0.6479, + "step": 19140 + }, + { + "epoch": 3.095950206127233, + "grad_norm": 0.9676656723022461, + "learning_rate": 0.0002, + "loss": 0.6167, + "step": 19150 + }, + { + "epoch": 3.0975668903079785, + "grad_norm": 0.8739792704582214, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 19160 + }, + { + "epoch": 3.0991835744887237, + "grad_norm": 0.8530174493789673, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 19170 + }, + { + "epoch": 3.100800258669469, + "grad_norm": 0.794945478439331, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 19180 + }, + { + "epoch": 3.102416942850214, + "grad_norm": 0.9508888125419617, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 19190 + }, + { + "epoch": 3.1040336270309594, + "grad_norm": 1.0599955320358276, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 19200 + }, + { + "epoch": 3.1056503112117047, + "grad_norm": 1.0673625469207764, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 19210 + }, + { + "epoch": 3.10726699539245, + "grad_norm": 0.7739115953445435, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 19220 + }, + { + "epoch": 3.1088836795731956, + "grad_norm": 0.9884951114654541, + "learning_rate": 0.0002, + "loss": 0.6046, + "step": 19230 + }, + { + "epoch": 3.110500363753941, + "grad_norm": 0.862260103225708, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 19240 + }, + { + "epoch": 3.112117047934686, + "grad_norm": 0.7690284848213196, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 19250 + }, + { + "epoch": 3.1137337321154313, + "grad_norm": 0.8758958578109741, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 19260 + }, + { + "epoch": 3.1153504162961765, + "grad_norm": 1.0356395244598389, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 19270 + }, + { + "epoch": 3.1169671004769217, + "grad_norm": 0.6950937509536743, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 19280 + }, + { + "epoch": 3.118583784657667, + "grad_norm": 0.760998010635376, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 19290 + }, + { + "epoch": 3.1202004688384126, + "grad_norm": 0.9335789084434509, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 19300 + }, + { + "epoch": 3.121817153019158, + "grad_norm": 0.9636204242706299, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 19310 + }, + { + "epoch": 3.123433837199903, + "grad_norm": 1.0820997953414917, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 19320 + }, + { + "epoch": 3.1250505213806483, + "grad_norm": 0.7333487272262573, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 19330 + }, + { + "epoch": 3.1266672055613935, + "grad_norm": 1.0417509078979492, + "learning_rate": 0.0002, + "loss": 0.6178, + "step": 19340 + }, + { + "epoch": 3.128283889742139, + "grad_norm": 0.9267749190330505, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 19350 + }, + { + "epoch": 3.129900573922884, + "grad_norm": 0.777798593044281, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 19360 + }, + { + "epoch": 3.1315172581036297, + "grad_norm": 0.8425456881523132, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 19370 + }, + { + "epoch": 3.133133942284375, + "grad_norm": 0.9617102146148682, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 19380 + }, + { + "epoch": 3.13475062646512, + "grad_norm": 1.0052828788757324, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 19390 + }, + { + "epoch": 3.1363673106458654, + "grad_norm": 0.7637009024620056, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 19400 + }, + { + "epoch": 3.1379839948266106, + "grad_norm": 0.7958088517189026, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 19410 + }, + { + "epoch": 3.139600679007356, + "grad_norm": 0.9161727428436279, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 19420 + }, + { + "epoch": 3.141217363188101, + "grad_norm": 0.8402149677276611, + "learning_rate": 0.0002, + "loss": 0.5638, + "step": 19430 + }, + { + "epoch": 3.1428340473688463, + "grad_norm": 1.0056525468826294, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 19440 + }, + { + "epoch": 3.144450731549592, + "grad_norm": 1.0129190683364868, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 19450 + }, + { + "epoch": 3.146067415730337, + "grad_norm": 0.790825366973877, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 19460 + }, + { + "epoch": 3.1476840999110824, + "grad_norm": 1.441665530204773, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 19470 + }, + { + "epoch": 3.1493007840918277, + "grad_norm": 0.7846331596374512, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 19480 + }, + { + "epoch": 3.150917468272573, + "grad_norm": 0.7915332913398743, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 19490 + }, + { + "epoch": 3.152534152453318, + "grad_norm": 0.933982253074646, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 19500 + }, + { + "epoch": 3.1541508366340634, + "grad_norm": 1.038408637046814, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 19510 + }, + { + "epoch": 3.155767520814809, + "grad_norm": 1.018935203552246, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 19520 + }, + { + "epoch": 3.1573842049955543, + "grad_norm": 0.9618112444877625, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 19530 + }, + { + "epoch": 3.1590008891762995, + "grad_norm": 0.8900452852249146, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 19540 + }, + { + "epoch": 3.1606175733570447, + "grad_norm": 0.8254160284996033, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 19550 + }, + { + "epoch": 3.16223425753779, + "grad_norm": 1.004376769065857, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 19560 + }, + { + "epoch": 3.163850941718535, + "grad_norm": 1.0490446090698242, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 19570 + }, + { + "epoch": 3.1654676258992804, + "grad_norm": 0.7387403845787048, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 19580 + }, + { + "epoch": 3.1670843100800257, + "grad_norm": 0.7611538171768188, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 19590 + }, + { + "epoch": 3.1687009942607713, + "grad_norm": 0.8239886164665222, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 19600 + }, + { + "epoch": 3.1703176784415166, + "grad_norm": 0.9327243566513062, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 19610 + }, + { + "epoch": 3.171934362622262, + "grad_norm": 0.9662560224533081, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 19620 + }, + { + "epoch": 3.173551046803007, + "grad_norm": 0.9183341860771179, + "learning_rate": 0.0002, + "loss": 0.5988, + "step": 19630 + }, + { + "epoch": 3.1751677309837523, + "grad_norm": 0.875066876411438, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 19640 + }, + { + "epoch": 3.1767844151644975, + "grad_norm": 0.8567508459091187, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 19650 + }, + { + "epoch": 3.1784010993452427, + "grad_norm": 0.6805780529975891, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 19660 + }, + { + "epoch": 3.1800177835259884, + "grad_norm": 0.8776944279670715, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 19670 + }, + { + "epoch": 3.1816344677067336, + "grad_norm": 0.9036329984664917, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 19680 + }, + { + "epoch": 3.183251151887479, + "grad_norm": 0.8527372479438782, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 19690 + }, + { + "epoch": 3.184867836068224, + "grad_norm": 1.1045585870742798, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 19700 + }, + { + "epoch": 3.1864845202489693, + "grad_norm": 0.9213830828666687, + "learning_rate": 0.0002, + "loss": 0.5491, + "step": 19710 + }, + { + "epoch": 3.1881012044297146, + "grad_norm": 0.8865814805030823, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 19720 + }, + { + "epoch": 3.18971788861046, + "grad_norm": 0.7939388751983643, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 19730 + }, + { + "epoch": 3.191334572791205, + "grad_norm": 0.6966729760169983, + "learning_rate": 0.0002, + "loss": 0.5682, + "step": 19740 + }, + { + "epoch": 3.1929512569719507, + "grad_norm": 0.8023673295974731, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 19750 + }, + { + "epoch": 3.194567941152696, + "grad_norm": 0.7992037534713745, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 19760 + }, + { + "epoch": 3.196184625333441, + "grad_norm": 0.7412247657775879, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 19770 + }, + { + "epoch": 3.1978013095141864, + "grad_norm": 0.9598729014396667, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 19780 + }, + { + "epoch": 3.1994179936949316, + "grad_norm": 0.8331366777420044, + "learning_rate": 0.0002, + "loss": 0.5685, + "step": 19790 + }, + { + "epoch": 3.201034677875677, + "grad_norm": 0.8939169645309448, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 19800 + }, + { + "epoch": 3.202651362056422, + "grad_norm": 0.9219734072685242, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 19810 + }, + { + "epoch": 3.2042680462371678, + "grad_norm": 0.869490385055542, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 19820 + }, + { + "epoch": 3.205884730417913, + "grad_norm": 0.8989706635475159, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 19830 + }, + { + "epoch": 3.2075014145986582, + "grad_norm": 0.8477165102958679, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 19840 + }, + { + "epoch": 3.2091180987794035, + "grad_norm": 0.8720678687095642, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 19850 + }, + { + "epoch": 3.2107347829601487, + "grad_norm": 0.861406683921814, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 19860 + }, + { + "epoch": 3.212351467140894, + "grad_norm": 0.8228686451911926, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 19870 + }, + { + "epoch": 3.213968151321639, + "grad_norm": 0.7936596870422363, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 19880 + }, + { + "epoch": 3.2155848355023844, + "grad_norm": 1.097377896308899, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 19890 + }, + { + "epoch": 3.21720151968313, + "grad_norm": 0.9544782638549805, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 19900 + }, + { + "epoch": 3.2188182038638753, + "grad_norm": 0.8240751624107361, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 19910 + }, + { + "epoch": 3.2204348880446205, + "grad_norm": 0.8332096338272095, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 19920 + }, + { + "epoch": 3.2220515722253658, + "grad_norm": 1.0954567193984985, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 19930 + }, + { + "epoch": 3.223668256406111, + "grad_norm": 0.7790525555610657, + "learning_rate": 0.0002, + "loss": 0.652, + "step": 19940 + }, + { + "epoch": 3.225284940586856, + "grad_norm": 0.7966814041137695, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 19950 + }, + { + "epoch": 3.2269016247676015, + "grad_norm": 0.9751881957054138, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 19960 + }, + { + "epoch": 3.228518308948347, + "grad_norm": 0.9856047630310059, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 19970 + }, + { + "epoch": 3.2301349931290924, + "grad_norm": 1.3062353134155273, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 19980 + }, + { + "epoch": 3.2317516773098376, + "grad_norm": 0.9510692358016968, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 19990 + }, + { + "epoch": 3.233368361490583, + "grad_norm": 0.8630342483520508, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 20000 + }, + { + "epoch": 3.234985045671328, + "grad_norm": 0.8966519236564636, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 20010 + }, + { + "epoch": 3.2366017298520733, + "grad_norm": 0.7093510627746582, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 20020 + }, + { + "epoch": 3.2382184140328185, + "grad_norm": 0.7771096229553223, + "learning_rate": 0.0002, + "loss": 0.585, + "step": 20030 + }, + { + "epoch": 3.2398350982135637, + "grad_norm": 0.841058075428009, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 20040 + }, + { + "epoch": 3.2414517823943094, + "grad_norm": 0.909712553024292, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 20050 + }, + { + "epoch": 3.2430684665750547, + "grad_norm": 0.8321019411087036, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 20060 + }, + { + "epoch": 3.2446851507558, + "grad_norm": 0.779901921749115, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 20070 + }, + { + "epoch": 3.246301834936545, + "grad_norm": 0.6249170303344727, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 20080 + }, + { + "epoch": 3.2479185191172903, + "grad_norm": 0.8000940680503845, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 20090 + }, + { + "epoch": 3.2495352032980356, + "grad_norm": 0.7627735137939453, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 20100 + }, + { + "epoch": 3.2511518874787813, + "grad_norm": 0.8780747056007385, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 20110 + }, + { + "epoch": 3.2527685716595265, + "grad_norm": 0.772037148475647, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 20120 + }, + { + "epoch": 3.2543852558402717, + "grad_norm": 1.0086580514907837, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 20130 + }, + { + "epoch": 3.256001940021017, + "grad_norm": 0.9360289573669434, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 20140 + }, + { + "epoch": 3.257618624201762, + "grad_norm": 1.2099586725234985, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 20150 + }, + { + "epoch": 3.2592353083825074, + "grad_norm": 0.8368481397628784, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 20160 + }, + { + "epoch": 3.2608519925632526, + "grad_norm": 0.7391039133071899, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 20170 + }, + { + "epoch": 3.262468676743998, + "grad_norm": 0.9122273325920105, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 20180 + }, + { + "epoch": 3.264085360924743, + "grad_norm": 0.8502281904220581, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 20190 + }, + { + "epoch": 3.265702045105489, + "grad_norm": 1.0926852226257324, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 20200 + }, + { + "epoch": 3.267318729286234, + "grad_norm": 0.7902828454971313, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 20210 + }, + { + "epoch": 3.2689354134669792, + "grad_norm": 0.8724729418754578, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 20220 + }, + { + "epoch": 3.2705520976477245, + "grad_norm": 0.8469277024269104, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 20230 + }, + { + "epoch": 3.2721687818284697, + "grad_norm": 0.8865092992782593, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 20240 + }, + { + "epoch": 3.273785466009215, + "grad_norm": 1.0979334115982056, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 20250 + }, + { + "epoch": 3.2754021501899606, + "grad_norm": 1.0860793590545654, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 20260 + }, + { + "epoch": 3.277018834370706, + "grad_norm": 0.981745183467865, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 20270 + }, + { + "epoch": 3.278635518551451, + "grad_norm": 0.9155020713806152, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 20280 + }, + { + "epoch": 3.2802522027321963, + "grad_norm": 0.8436718583106995, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 20290 + }, + { + "epoch": 3.2818688869129415, + "grad_norm": 1.0329409837722778, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 20300 + }, + { + "epoch": 3.2834855710936868, + "grad_norm": 0.9876394271850586, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 20310 + }, + { + "epoch": 3.285102255274432, + "grad_norm": 0.8052917718887329, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 20320 + }, + { + "epoch": 3.2867189394551772, + "grad_norm": 0.8390680551528931, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 20330 + }, + { + "epoch": 3.288335623635923, + "grad_norm": 0.9515735507011414, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 20340 + }, + { + "epoch": 3.289952307816668, + "grad_norm": 0.8028870224952698, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 20350 + }, + { + "epoch": 3.2915689919974134, + "grad_norm": 0.862592339515686, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 20360 + }, + { + "epoch": 3.2931856761781586, + "grad_norm": 0.7451621890068054, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 20370 + }, + { + "epoch": 3.294802360358904, + "grad_norm": 0.8966776728630066, + "learning_rate": 0.0002, + "loss": 0.6458, + "step": 20380 + }, + { + "epoch": 3.296419044539649, + "grad_norm": 0.9289216995239258, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 20390 + }, + { + "epoch": 3.2980357287203943, + "grad_norm": 0.9649626612663269, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 20400 + }, + { + "epoch": 3.29965241290114, + "grad_norm": 1.1953798532485962, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 20410 + }, + { + "epoch": 3.301269097081885, + "grad_norm": 0.8929083943367004, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 20420 + }, + { + "epoch": 3.3028857812626304, + "grad_norm": 0.8922014236450195, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 20430 + }, + { + "epoch": 3.3045024654433757, + "grad_norm": 0.9754860401153564, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 20440 + }, + { + "epoch": 3.306119149624121, + "grad_norm": 0.8873140215873718, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 20450 + }, + { + "epoch": 3.307735833804866, + "grad_norm": 0.857271671295166, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 20460 + }, + { + "epoch": 3.3093525179856114, + "grad_norm": 0.9022141098976135, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 20470 + }, + { + "epoch": 3.3109692021663566, + "grad_norm": 0.8614798188209534, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 20480 + }, + { + "epoch": 3.3125858863471023, + "grad_norm": 0.8838164210319519, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 20490 + }, + { + "epoch": 3.3142025705278475, + "grad_norm": 0.8709736466407776, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 20500 + }, + { + "epoch": 3.3158192547085927, + "grad_norm": 0.9533300995826721, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 20510 + }, + { + "epoch": 3.317435938889338, + "grad_norm": 0.8259269595146179, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 20520 + }, + { + "epoch": 3.319052623070083, + "grad_norm": 0.8607608079910278, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 20530 + }, + { + "epoch": 3.3206693072508284, + "grad_norm": 1.0863020420074463, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 20540 + }, + { + "epoch": 3.3222859914315737, + "grad_norm": 1.011489987373352, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 20550 + }, + { + "epoch": 3.3239026756123193, + "grad_norm": 0.6952177882194519, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 20560 + }, + { + "epoch": 3.3255193597930646, + "grad_norm": 0.9638974070549011, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 20570 + }, + { + "epoch": 3.32713604397381, + "grad_norm": 1.0310138463974, + "learning_rate": 0.0002, + "loss": 0.5882, + "step": 20580 + }, + { + "epoch": 3.328752728154555, + "grad_norm": 0.9371318221092224, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 20590 + }, + { + "epoch": 3.3303694123353003, + "grad_norm": 0.8756691813468933, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 20600 + }, + { + "epoch": 3.3319860965160455, + "grad_norm": 1.054175853729248, + "learning_rate": 0.0002, + "loss": 0.5994, + "step": 20610 + }, + { + "epoch": 3.3336027806967907, + "grad_norm": 0.9074128270149231, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 20620 + }, + { + "epoch": 3.335219464877536, + "grad_norm": 0.906900942325592, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 20630 + }, + { + "epoch": 3.3368361490582816, + "grad_norm": 0.8689333200454712, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 20640 + }, + { + "epoch": 3.338452833239027, + "grad_norm": 0.9889747500419617, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 20650 + }, + { + "epoch": 3.340069517419772, + "grad_norm": 1.0685805082321167, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 20660 + }, + { + "epoch": 3.3416862016005173, + "grad_norm": 0.7495010495185852, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 20670 + }, + { + "epoch": 3.3433028857812626, + "grad_norm": 0.8747848272323608, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 20680 + }, + { + "epoch": 3.344919569962008, + "grad_norm": 0.9762673377990723, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 20690 + }, + { + "epoch": 3.346536254142753, + "grad_norm": 1.0284489393234253, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 20700 + }, + { + "epoch": 3.3481529383234987, + "grad_norm": 0.7293812036514282, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 20710 + }, + { + "epoch": 3.349769622504244, + "grad_norm": 0.8330199122428894, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 20720 + }, + { + "epoch": 3.351386306684989, + "grad_norm": 0.9808499217033386, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 20730 + }, + { + "epoch": 3.3530029908657344, + "grad_norm": 0.9508825540542603, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 20740 + }, + { + "epoch": 3.3546196750464796, + "grad_norm": 0.790483832359314, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 20750 + }, + { + "epoch": 3.356236359227225, + "grad_norm": 1.022793173789978, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 20760 + }, + { + "epoch": 3.35785304340797, + "grad_norm": 0.8318950533866882, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 20770 + }, + { + "epoch": 3.3594697275887153, + "grad_norm": 0.7980858087539673, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 20780 + }, + { + "epoch": 3.361086411769461, + "grad_norm": 0.8114802241325378, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 20790 + }, + { + "epoch": 3.3627030959502062, + "grad_norm": 0.8522519469261169, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 20800 + }, + { + "epoch": 3.3643197801309515, + "grad_norm": 0.9142431616783142, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 20810 + }, + { + "epoch": 3.3659364643116967, + "grad_norm": 0.771170437335968, + "learning_rate": 0.0002, + "loss": 0.6331, + "step": 20820 + }, + { + "epoch": 3.367553148492442, + "grad_norm": 1.0628231763839722, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 20830 + }, + { + "epoch": 3.369169832673187, + "grad_norm": 0.9384352564811707, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 20840 + }, + { + "epoch": 3.370786516853933, + "grad_norm": 1.1286591291427612, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 20850 + }, + { + "epoch": 3.372403201034678, + "grad_norm": 1.1349513530731201, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 20860 + }, + { + "epoch": 3.3740198852154233, + "grad_norm": 1.0127464532852173, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 20870 + }, + { + "epoch": 3.3756365693961685, + "grad_norm": 0.9111971855163574, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 20880 + }, + { + "epoch": 3.3772532535769137, + "grad_norm": 0.871356725692749, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 20890 + }, + { + "epoch": 3.378869937757659, + "grad_norm": 0.7774117588996887, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 20900 + }, + { + "epoch": 3.380486621938404, + "grad_norm": 1.0089964866638184, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 20910 + }, + { + "epoch": 3.3821033061191494, + "grad_norm": 0.7855867147445679, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 20920 + }, + { + "epoch": 3.3837199902998947, + "grad_norm": 1.3713710308074951, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 20930 + }, + { + "epoch": 3.3853366744806404, + "grad_norm": 0.8599116206169128, + "learning_rate": 0.0002, + "loss": 0.6627, + "step": 20940 + }, + { + "epoch": 3.3869533586613856, + "grad_norm": 0.9392673373222351, + "learning_rate": 0.0002, + "loss": 0.6224, + "step": 20950 + }, + { + "epoch": 3.388570042842131, + "grad_norm": 0.8764075040817261, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 20960 + }, + { + "epoch": 3.390186727022876, + "grad_norm": 0.8240136504173279, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 20970 + }, + { + "epoch": 3.3918034112036213, + "grad_norm": 1.0982369184494019, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 20980 + }, + { + "epoch": 3.3934200953843665, + "grad_norm": 1.0599013566970825, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 20990 + }, + { + "epoch": 3.395036779565112, + "grad_norm": 0.895438015460968, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 21000 + }, + { + "epoch": 3.3966534637458574, + "grad_norm": 0.6974841356277466, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 21010 + }, + { + "epoch": 3.3982701479266026, + "grad_norm": 0.9571719765663147, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 21020 + }, + { + "epoch": 3.399886832107348, + "grad_norm": 0.831912636756897, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 21030 + }, + { + "epoch": 3.401503516288093, + "grad_norm": 0.831936240196228, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 21040 + }, + { + "epoch": 3.4031202004688383, + "grad_norm": 0.7388373613357544, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 21050 + }, + { + "epoch": 3.4047368846495836, + "grad_norm": 0.938667356967926, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 21060 + }, + { + "epoch": 3.406353568830329, + "grad_norm": 0.9202313423156738, + "learning_rate": 0.0002, + "loss": 0.6121, + "step": 21070 + }, + { + "epoch": 3.4079702530110745, + "grad_norm": 0.9888381958007812, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 21080 + }, + { + "epoch": 3.4095869371918197, + "grad_norm": 0.8526970744132996, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 21090 + }, + { + "epoch": 3.411203621372565, + "grad_norm": 0.7939383387565613, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 21100 + }, + { + "epoch": 3.41282030555331, + "grad_norm": 0.9986352920532227, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 21110 + }, + { + "epoch": 3.4144369897340554, + "grad_norm": 0.8895300030708313, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 21120 + }, + { + "epoch": 3.4160536739148006, + "grad_norm": 0.9559482932090759, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 21130 + }, + { + "epoch": 3.417670358095546, + "grad_norm": 0.8351506590843201, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 21140 + }, + { + "epoch": 3.4192870422762915, + "grad_norm": 0.8224456906318665, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 21150 + }, + { + "epoch": 3.4209037264570368, + "grad_norm": 1.0110299587249756, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 21160 + }, + { + "epoch": 3.422520410637782, + "grad_norm": 0.82564777135849, + "learning_rate": 0.0002, + "loss": 0.6116, + "step": 21170 + }, + { + "epoch": 3.4241370948185272, + "grad_norm": 1.004738688468933, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 21180 + }, + { + "epoch": 3.4257537789992725, + "grad_norm": 0.7545676827430725, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 21190 + }, + { + "epoch": 3.4273704631800177, + "grad_norm": 0.8918704390525818, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 21200 + }, + { + "epoch": 3.428987147360763, + "grad_norm": 0.8336876034736633, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 21210 + }, + { + "epoch": 3.430603831541508, + "grad_norm": 0.8928771018981934, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 21220 + }, + { + "epoch": 3.432220515722254, + "grad_norm": 0.7663705945014954, + "learning_rate": 0.0002, + "loss": 0.6468, + "step": 21230 + }, + { + "epoch": 3.433837199902999, + "grad_norm": 0.8392598628997803, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 21240 + }, + { + "epoch": 3.4354538840837443, + "grad_norm": 0.8819600343704224, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 21250 + }, + { + "epoch": 3.4370705682644895, + "grad_norm": 0.9124642014503479, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 21260 + }, + { + "epoch": 3.4386872524452348, + "grad_norm": 0.8329763412475586, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 21270 + }, + { + "epoch": 3.44030393662598, + "grad_norm": 0.9982839822769165, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 21280 + }, + { + "epoch": 3.4419206208067252, + "grad_norm": 0.9105954766273499, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 21290 + }, + { + "epoch": 3.443537304987471, + "grad_norm": 0.8182359337806702, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 21300 + }, + { + "epoch": 3.445153989168216, + "grad_norm": 1.0568904876708984, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 21310 + }, + { + "epoch": 3.4467706733489614, + "grad_norm": 0.968539834022522, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 21320 + }, + { + "epoch": 3.4483873575297066, + "grad_norm": 0.8774511218070984, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 21330 + }, + { + "epoch": 3.450004041710452, + "grad_norm": 0.7598156332969666, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 21340 + }, + { + "epoch": 3.451620725891197, + "grad_norm": 1.1012897491455078, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 21350 + }, + { + "epoch": 3.4532374100719423, + "grad_norm": 0.8040637373924255, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 21360 + }, + { + "epoch": 3.4548540942526875, + "grad_norm": 0.8497496247291565, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 21370 + }, + { + "epoch": 3.456470778433433, + "grad_norm": 0.8429915904998779, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 21380 + }, + { + "epoch": 3.4580874626141784, + "grad_norm": 0.8107112646102905, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 21390 + }, + { + "epoch": 3.4597041467949237, + "grad_norm": 1.00872004032135, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 21400 + }, + { + "epoch": 3.461320830975669, + "grad_norm": 0.8266542553901672, + "learning_rate": 0.0002, + "loss": 0.5712, + "step": 21410 + }, + { + "epoch": 3.462937515156414, + "grad_norm": 0.8972568511962891, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 21420 + }, + { + "epoch": 3.4645541993371594, + "grad_norm": 1.0781476497650146, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 21430 + }, + { + "epoch": 3.4661708835179046, + "grad_norm": 0.9571592807769775, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 21440 + }, + { + "epoch": 3.4677875676986503, + "grad_norm": 0.881547212600708, + "learning_rate": 0.0002, + "loss": 0.6309, + "step": 21450 + }, + { + "epoch": 3.4694042518793955, + "grad_norm": 0.6955338716506958, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 21460 + }, + { + "epoch": 3.4710209360601407, + "grad_norm": 0.901187539100647, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 21470 + }, + { + "epoch": 3.472637620240886, + "grad_norm": 0.7063511610031128, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 21480 + }, + { + "epoch": 3.474254304421631, + "grad_norm": 0.8462792038917542, + "learning_rate": 0.0002, + "loss": 0.6154, + "step": 21490 + }, + { + "epoch": 3.4758709886023764, + "grad_norm": 1.1861060857772827, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 21500 + }, + { + "epoch": 3.4774876727831217, + "grad_norm": 0.70503169298172, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 21510 + }, + { + "epoch": 3.479104356963867, + "grad_norm": 0.9650066494941711, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 21520 + }, + { + "epoch": 3.4807210411446126, + "grad_norm": 1.0266852378845215, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 21530 + }, + { + "epoch": 3.482337725325358, + "grad_norm": 0.956372857093811, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 21540 + }, + { + "epoch": 3.483954409506103, + "grad_norm": 0.8848432898521423, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 21550 + }, + { + "epoch": 3.4855710936868483, + "grad_norm": 1.0805351734161377, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 21560 + }, + { + "epoch": 3.4871877778675935, + "grad_norm": 0.9279725551605225, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 21570 + }, + { + "epoch": 3.4888044620483387, + "grad_norm": 0.9049562215805054, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 21580 + }, + { + "epoch": 3.4904211462290844, + "grad_norm": 0.9619429111480713, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 21590 + }, + { + "epoch": 3.4920378304098296, + "grad_norm": 0.8508906960487366, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 21600 + }, + { + "epoch": 3.493654514590575, + "grad_norm": 0.8692502379417419, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 21610 + }, + { + "epoch": 3.49527119877132, + "grad_norm": 0.8187332153320312, + "learning_rate": 0.0002, + "loss": 0.5878, + "step": 21620 + }, + { + "epoch": 3.4968878829520653, + "grad_norm": 1.145400047302246, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 21630 + }, + { + "epoch": 3.4985045671328105, + "grad_norm": 0.8281388282775879, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 21640 + }, + { + "epoch": 3.500121251313556, + "grad_norm": 0.82256019115448, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 21650 + }, + { + "epoch": 3.501737935494301, + "grad_norm": 0.9315484762191772, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 21660 + }, + { + "epoch": 3.5033546196750462, + "grad_norm": 0.7626111507415771, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 21670 + }, + { + "epoch": 3.504971303855792, + "grad_norm": 0.9275059103965759, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 21680 + }, + { + "epoch": 3.506587988036537, + "grad_norm": 0.7906724810600281, + "learning_rate": 0.0002, + "loss": 0.64, + "step": 21690 + }, + { + "epoch": 3.5082046722172824, + "grad_norm": 0.8289761543273926, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 21700 + }, + { + "epoch": 3.5098213563980276, + "grad_norm": 0.8316431045532227, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 21710 + }, + { + "epoch": 3.511438040578773, + "grad_norm": 1.0451812744140625, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 21720 + }, + { + "epoch": 3.513054724759518, + "grad_norm": 0.928252637386322, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 21730 + }, + { + "epoch": 3.5146714089402638, + "grad_norm": 0.7985895276069641, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 21740 + }, + { + "epoch": 3.516288093121009, + "grad_norm": 0.6740974187850952, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 21750 + }, + { + "epoch": 3.517904777301754, + "grad_norm": 0.8482223749160767, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 21760 + }, + { + "epoch": 3.5195214614824994, + "grad_norm": 0.889947772026062, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 21770 + }, + { + "epoch": 3.5211381456632447, + "grad_norm": 0.8304598927497864, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 21780 + }, + { + "epoch": 3.52275482984399, + "grad_norm": 0.8002981543540955, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 21790 + }, + { + "epoch": 3.524371514024735, + "grad_norm": 0.8115083575248718, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 21800 + }, + { + "epoch": 3.5259881982054804, + "grad_norm": 0.9715048670768738, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 21810 + }, + { + "epoch": 3.5276048823862256, + "grad_norm": 1.0910786390304565, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 21820 + }, + { + "epoch": 3.5292215665669713, + "grad_norm": 0.8438942432403564, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 21830 + }, + { + "epoch": 3.5308382507477165, + "grad_norm": 0.8813382983207703, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 21840 + }, + { + "epoch": 3.5324549349284617, + "grad_norm": 0.7092908024787903, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 21850 + }, + { + "epoch": 3.534071619109207, + "grad_norm": 0.8332187533378601, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 21860 + }, + { + "epoch": 3.535688303289952, + "grad_norm": 0.8958209156990051, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 21870 + }, + { + "epoch": 3.5373049874706974, + "grad_norm": 0.824138879776001, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 21880 + }, + { + "epoch": 3.538921671651443, + "grad_norm": 0.8375158309936523, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 21890 + }, + { + "epoch": 3.5405383558321883, + "grad_norm": 1.0274608135223389, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 21900 + }, + { + "epoch": 3.5421550400129336, + "grad_norm": 0.7088932394981384, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 21910 + }, + { + "epoch": 3.543771724193679, + "grad_norm": 0.8172445297241211, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 21920 + }, + { + "epoch": 3.545388408374424, + "grad_norm": 0.9904135465621948, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 21930 + }, + { + "epoch": 3.5470050925551693, + "grad_norm": 0.9900432229042053, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 21940 + }, + { + "epoch": 3.5486217767359145, + "grad_norm": 0.8963301181793213, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 21950 + }, + { + "epoch": 3.5502384609166597, + "grad_norm": 0.8551464676856995, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 21960 + }, + { + "epoch": 3.551855145097405, + "grad_norm": 1.0916603803634644, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 21970 + }, + { + "epoch": 3.5534718292781506, + "grad_norm": 0.841598391532898, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 21980 + }, + { + "epoch": 3.555088513458896, + "grad_norm": 0.8566757440567017, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 21990 + }, + { + "epoch": 3.556705197639641, + "grad_norm": 1.0145052671432495, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 22000 + }, + { + "epoch": 3.5583218818203863, + "grad_norm": 0.9293754696846008, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 22010 + }, + { + "epoch": 3.5599385660011316, + "grad_norm": 0.9568536281585693, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 22020 + }, + { + "epoch": 3.5615552501818772, + "grad_norm": 0.8613139986991882, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 22030 + }, + { + "epoch": 3.5631719343626225, + "grad_norm": 0.8179237246513367, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 22040 + }, + { + "epoch": 3.5647886185433677, + "grad_norm": 0.9059830904006958, + "learning_rate": 0.0002, + "loss": 0.574, + "step": 22050 + }, + { + "epoch": 3.566405302724113, + "grad_norm": 1.0068252086639404, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 22060 + }, + { + "epoch": 3.568021986904858, + "grad_norm": 0.9682072997093201, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 22070 + }, + { + "epoch": 3.5696386710856034, + "grad_norm": 0.8514005541801453, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 22080 + }, + { + "epoch": 3.5712553552663486, + "grad_norm": 0.8327770829200745, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 22090 + }, + { + "epoch": 3.572872039447094, + "grad_norm": 1.024976372718811, + "learning_rate": 0.0002, + "loss": 0.5976, + "step": 22100 + }, + { + "epoch": 3.574488723627839, + "grad_norm": 0.7721174955368042, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 22110 + }, + { + "epoch": 3.5761054078085843, + "grad_norm": 1.0351054668426514, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 22120 + }, + { + "epoch": 3.57772209198933, + "grad_norm": 0.9680907130241394, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 22130 + }, + { + "epoch": 3.5793387761700752, + "grad_norm": 0.8016974925994873, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 22140 + }, + { + "epoch": 3.5809554603508205, + "grad_norm": 1.0109003782272339, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 22150 + }, + { + "epoch": 3.5825721445315657, + "grad_norm": 1.0473392009735107, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 22160 + }, + { + "epoch": 3.584188828712311, + "grad_norm": 0.8686613440513611, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 22170 + }, + { + "epoch": 3.5858055128930566, + "grad_norm": 0.869149923324585, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 22180 + }, + { + "epoch": 3.587422197073802, + "grad_norm": 0.9769062995910645, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 22190 + }, + { + "epoch": 3.589038881254547, + "grad_norm": 0.779636561870575, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 22200 + }, + { + "epoch": 3.5906555654352923, + "grad_norm": 0.9063841104507446, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 22210 + }, + { + "epoch": 3.5922722496160375, + "grad_norm": 0.9216037392616272, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 22220 + }, + { + "epoch": 3.5938889337967828, + "grad_norm": 1.0217336416244507, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 22230 + }, + { + "epoch": 3.595505617977528, + "grad_norm": 0.8513161540031433, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 22240 + }, + { + "epoch": 3.597122302158273, + "grad_norm": 0.8084813952445984, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 22250 + }, + { + "epoch": 3.5987389863390185, + "grad_norm": 0.8524802923202515, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 22260 + }, + { + "epoch": 3.600355670519764, + "grad_norm": 0.9356237649917603, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 22270 + }, + { + "epoch": 3.6019723547005094, + "grad_norm": 1.009600281715393, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 22280 + }, + { + "epoch": 3.6035890388812546, + "grad_norm": 0.9900581240653992, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 22290 + }, + { + "epoch": 3.605205723062, + "grad_norm": 1.062495231628418, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 22300 + }, + { + "epoch": 3.606822407242745, + "grad_norm": 0.8832381367683411, + "learning_rate": 0.0002, + "loss": 0.607, + "step": 22310 + }, + { + "epoch": 3.6084390914234903, + "grad_norm": 0.9284297823905945, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 22320 + }, + { + "epoch": 3.610055775604236, + "grad_norm": 1.2381829023361206, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 22330 + }, + { + "epoch": 3.611672459784981, + "grad_norm": 0.929434597492218, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 22340 + }, + { + "epoch": 3.6132891439657264, + "grad_norm": 0.9714490175247192, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 22350 + }, + { + "epoch": 3.6149058281464717, + "grad_norm": 0.808014988899231, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 22360 + }, + { + "epoch": 3.616522512327217, + "grad_norm": 1.0364398956298828, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 22370 + }, + { + "epoch": 3.618139196507962, + "grad_norm": 0.7858489751815796, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 22380 + }, + { + "epoch": 3.6197558806887074, + "grad_norm": 0.9920870065689087, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 22390 + }, + { + "epoch": 3.6213725648694526, + "grad_norm": 0.9183220863342285, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 22400 + }, + { + "epoch": 3.622989249050198, + "grad_norm": 0.9826246500015259, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 22410 + }, + { + "epoch": 3.6246059332309435, + "grad_norm": 0.8632931113243103, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 22420 + }, + { + "epoch": 3.6262226174116887, + "grad_norm": 0.8468965291976929, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 22430 + }, + { + "epoch": 3.627839301592434, + "grad_norm": 0.8466871976852417, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 22440 + }, + { + "epoch": 3.629455985773179, + "grad_norm": 0.9501169919967651, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 22450 + }, + { + "epoch": 3.6310726699539244, + "grad_norm": 0.8906720876693726, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 22460 + }, + { + "epoch": 3.6326893541346696, + "grad_norm": 0.7400227189064026, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 22470 + }, + { + "epoch": 3.6343060383154153, + "grad_norm": 0.9756355881690979, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 22480 + }, + { + "epoch": 3.6359227224961606, + "grad_norm": 0.7504993081092834, + "learning_rate": 0.0002, + "loss": 0.6203, + "step": 22490 + }, + { + "epoch": 3.637539406676906, + "grad_norm": 0.9270039200782776, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 22500 + }, + { + "epoch": 3.639156090857651, + "grad_norm": 0.8841686844825745, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 22510 + }, + { + "epoch": 3.6407727750383962, + "grad_norm": 0.8533213138580322, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 22520 + }, + { + "epoch": 3.6423894592191415, + "grad_norm": 1.0052043199539185, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 22530 + }, + { + "epoch": 3.6440061433998867, + "grad_norm": 1.0323461294174194, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 22540 + }, + { + "epoch": 3.645622827580632, + "grad_norm": 0.8654312491416931, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 22550 + }, + { + "epoch": 3.647239511761377, + "grad_norm": 0.6400038003921509, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 22560 + }, + { + "epoch": 3.648856195942123, + "grad_norm": 0.8061298727989197, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 22570 + }, + { + "epoch": 3.650472880122868, + "grad_norm": 0.9257854223251343, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 22580 + }, + { + "epoch": 3.6520895643036133, + "grad_norm": 0.8439396619796753, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 22590 + }, + { + "epoch": 3.6537062484843585, + "grad_norm": 0.7764544486999512, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 22600 + }, + { + "epoch": 3.6553229326651038, + "grad_norm": 1.125451683998108, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 22610 + }, + { + "epoch": 3.656939616845849, + "grad_norm": 0.7523018717765808, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 22620 + }, + { + "epoch": 3.6585563010265947, + "grad_norm": 1.071026086807251, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 22630 + }, + { + "epoch": 3.66017298520734, + "grad_norm": 0.945791482925415, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 22640 + }, + { + "epoch": 3.661789669388085, + "grad_norm": 0.8001811504364014, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 22650 + }, + { + "epoch": 3.6634063535688304, + "grad_norm": 0.9700816869735718, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 22660 + }, + { + "epoch": 3.6650230377495756, + "grad_norm": 0.9053242206573486, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 22670 + }, + { + "epoch": 3.666639721930321, + "grad_norm": 0.944362461566925, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 22680 + }, + { + "epoch": 3.668256406111066, + "grad_norm": 1.067489504814148, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 22690 + }, + { + "epoch": 3.6698730902918113, + "grad_norm": 1.0984995365142822, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 22700 + }, + { + "epoch": 3.6714897744725565, + "grad_norm": 0.9336317777633667, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 22710 + }, + { + "epoch": 3.673106458653302, + "grad_norm": 0.9261918663978577, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 22720 + }, + { + "epoch": 3.6747231428340474, + "grad_norm": 0.8648008704185486, + "learning_rate": 0.0002, + "loss": 0.6332, + "step": 22730 + }, + { + "epoch": 3.6763398270147927, + "grad_norm": 0.7225083708763123, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 22740 + }, + { + "epoch": 3.677956511195538, + "grad_norm": 0.9258282780647278, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 22750 + }, + { + "epoch": 3.679573195376283, + "grad_norm": 0.70876145362854, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 22760 + }, + { + "epoch": 3.681189879557029, + "grad_norm": 0.8780210018157959, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 22770 + }, + { + "epoch": 3.682806563737774, + "grad_norm": 0.8075440526008606, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 22780 + }, + { + "epoch": 3.6844232479185193, + "grad_norm": 0.8503130674362183, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 22790 + }, + { + "epoch": 3.6860399320992645, + "grad_norm": 0.8413618206977844, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 22800 + }, + { + "epoch": 3.6876566162800097, + "grad_norm": 0.8675165176391602, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 22810 + }, + { + "epoch": 3.689273300460755, + "grad_norm": 0.8235884308815002, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 22820 + }, + { + "epoch": 3.6908899846415, + "grad_norm": 0.9477725625038147, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 22830 + }, + { + "epoch": 3.6925066688222454, + "grad_norm": 0.7883533835411072, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 22840 + }, + { + "epoch": 3.6941233530029907, + "grad_norm": 1.047913908958435, + "learning_rate": 0.0002, + "loss": 0.5872, + "step": 22850 + }, + { + "epoch": 3.695740037183736, + "grad_norm": 0.9171528816223145, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 22860 + }, + { + "epoch": 3.6973567213644816, + "grad_norm": 0.9338192343711853, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 22870 + }, + { + "epoch": 3.698973405545227, + "grad_norm": 0.8799443244934082, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 22880 + }, + { + "epoch": 3.700590089725972, + "grad_norm": 0.8515434861183167, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 22890 + }, + { + "epoch": 3.7022067739067173, + "grad_norm": 0.7805591821670532, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 22900 + }, + { + "epoch": 3.7038234580874625, + "grad_norm": 0.8470911979675293, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 22910 + }, + { + "epoch": 3.705440142268208, + "grad_norm": 0.9452309012413025, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 22920 + }, + { + "epoch": 3.7070568264489534, + "grad_norm": 0.950243353843689, + "learning_rate": 0.0002, + "loss": 0.6529, + "step": 22930 + }, + { + "epoch": 3.7086735106296986, + "grad_norm": 0.7882499098777771, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 22940 + }, + { + "epoch": 3.710290194810444, + "grad_norm": 0.8307787775993347, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 22950 + }, + { + "epoch": 3.711906878991189, + "grad_norm": 1.0970630645751953, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 22960 + }, + { + "epoch": 3.7135235631719343, + "grad_norm": 0.8269566297531128, + "learning_rate": 0.0002, + "loss": 0.6281, + "step": 22970 + }, + { + "epoch": 3.7151402473526796, + "grad_norm": 0.8306704759597778, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 22980 + }, + { + "epoch": 3.716756931533425, + "grad_norm": 0.9710225462913513, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 22990 + }, + { + "epoch": 3.71837361571417, + "grad_norm": 0.8890530467033386, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 23000 + }, + { + "epoch": 3.7199902998949153, + "grad_norm": 0.883522629737854, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 23010 + }, + { + "epoch": 3.721606984075661, + "grad_norm": 0.8662652373313904, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 23020 + }, + { + "epoch": 3.723223668256406, + "grad_norm": 0.7228406667709351, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 23030 + }, + { + "epoch": 3.7248403524371514, + "grad_norm": 1.060792088508606, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 23040 + }, + { + "epoch": 3.7264570366178966, + "grad_norm": 1.0119613409042358, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 23050 + }, + { + "epoch": 3.728073720798642, + "grad_norm": 0.9212996959686279, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 23060 + }, + { + "epoch": 3.7296904049793875, + "grad_norm": 0.925690233707428, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 23070 + }, + { + "epoch": 3.7313070891601328, + "grad_norm": 0.8323310613632202, + "learning_rate": 0.0002, + "loss": 0.615, + "step": 23080 + }, + { + "epoch": 3.732923773340878, + "grad_norm": 0.8966048955917358, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 23090 + }, + { + "epoch": 3.7345404575216232, + "grad_norm": 0.8995837569236755, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 23100 + }, + { + "epoch": 3.7361571417023685, + "grad_norm": 0.8748890161514282, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 23110 + }, + { + "epoch": 3.7377738258831137, + "grad_norm": 0.7985540628433228, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 23120 + }, + { + "epoch": 3.739390510063859, + "grad_norm": 1.0240917205810547, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 23130 + }, + { + "epoch": 3.741007194244604, + "grad_norm": 0.9181789755821228, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 23140 + }, + { + "epoch": 3.7426238784253494, + "grad_norm": 0.8896583914756775, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 23150 + }, + { + "epoch": 3.744240562606095, + "grad_norm": 0.8635515570640564, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 23160 + }, + { + "epoch": 3.7458572467868403, + "grad_norm": 0.8873575329780579, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 23170 + }, + { + "epoch": 3.7474739309675855, + "grad_norm": 0.9807148575782776, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 23180 + }, + { + "epoch": 3.7490906151483308, + "grad_norm": 0.900477945804596, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 23190 + }, + { + "epoch": 3.750707299329076, + "grad_norm": 0.9379992485046387, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 23200 + }, + { + "epoch": 3.752323983509821, + "grad_norm": 0.9649890661239624, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 23210 + }, + { + "epoch": 3.753940667690567, + "grad_norm": 0.824442446231842, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 23220 + }, + { + "epoch": 3.755557351871312, + "grad_norm": 0.8896150588989258, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 23230 + }, + { + "epoch": 3.7571740360520574, + "grad_norm": 0.751249372959137, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 23240 + }, + { + "epoch": 3.7587907202328026, + "grad_norm": 0.9392193555831909, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 23250 + }, + { + "epoch": 3.760407404413548, + "grad_norm": 0.9284586310386658, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 23260 + }, + { + "epoch": 3.762024088594293, + "grad_norm": 0.7738175392150879, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 23270 + }, + { + "epoch": 3.7636407727750383, + "grad_norm": 0.9252978563308716, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 23280 + }, + { + "epoch": 3.7652574569557835, + "grad_norm": 0.9501895904541016, + "learning_rate": 0.0002, + "loss": 0.5984, + "step": 23290 + }, + { + "epoch": 3.7668741411365287, + "grad_norm": 0.9416276216506958, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 23300 + }, + { + "epoch": 3.7684908253172744, + "grad_norm": 0.7076631784439087, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 23310 + }, + { + "epoch": 3.7701075094980196, + "grad_norm": 0.9864492416381836, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 23320 + }, + { + "epoch": 3.771724193678765, + "grad_norm": 0.8450456261634827, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 23330 + }, + { + "epoch": 3.77334087785951, + "grad_norm": 1.0768941640853882, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 23340 + }, + { + "epoch": 3.7749575620402553, + "grad_norm": 0.9956819415092468, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 23350 + }, + { + "epoch": 3.7765742462210006, + "grad_norm": 0.9234658479690552, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 23360 + }, + { + "epoch": 3.7781909304017463, + "grad_norm": 1.0993858575820923, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 23370 + }, + { + "epoch": 3.7798076145824915, + "grad_norm": 0.923159658908844, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 23380 + }, + { + "epoch": 3.7814242987632367, + "grad_norm": 0.9311541318893433, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 23390 + }, + { + "epoch": 3.783040982943982, + "grad_norm": 0.919681191444397, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 23400 + }, + { + "epoch": 3.784657667124727, + "grad_norm": 1.7406195402145386, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 23410 + }, + { + "epoch": 3.7862743513054724, + "grad_norm": 0.7789074182510376, + "learning_rate": 0.0002, + "loss": 0.657, + "step": 23420 + }, + { + "epoch": 3.7878910354862176, + "grad_norm": 0.8302814960479736, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 23430 + }, + { + "epoch": 3.789507719666963, + "grad_norm": 0.8089349269866943, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 23440 + }, + { + "epoch": 3.791124403847708, + "grad_norm": 0.9006284475326538, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 23450 + }, + { + "epoch": 3.7927410880284538, + "grad_norm": 0.8426766991615295, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 23460 + }, + { + "epoch": 3.794357772209199, + "grad_norm": 1.2576252222061157, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 23470 + }, + { + "epoch": 3.7959744563899442, + "grad_norm": 1.0307610034942627, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 23480 + }, + { + "epoch": 3.7975911405706895, + "grad_norm": 0.8525972962379456, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 23490 + }, + { + "epoch": 3.7992078247514347, + "grad_norm": 1.159039855003357, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 23500 + }, + { + "epoch": 3.80082450893218, + "grad_norm": 1.4193549156188965, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 23510 + }, + { + "epoch": 3.8024411931129256, + "grad_norm": 0.8245543837547302, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 23520 + }, + { + "epoch": 3.804057877293671, + "grad_norm": 0.8847230076789856, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 23530 + }, + { + "epoch": 3.805674561474416, + "grad_norm": 0.9574624300003052, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 23540 + }, + { + "epoch": 3.8072912456551613, + "grad_norm": 1.048020601272583, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 23550 + }, + { + "epoch": 3.8089079298359065, + "grad_norm": 0.8302255868911743, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 23560 + }, + { + "epoch": 3.8105246140166518, + "grad_norm": 0.8269215822219849, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 23570 + }, + { + "epoch": 3.812141298197397, + "grad_norm": 0.9375753402709961, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 23580 + }, + { + "epoch": 3.8137579823781422, + "grad_norm": 1.0234097242355347, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 23590 + }, + { + "epoch": 3.8153746665588875, + "grad_norm": 0.8978445529937744, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 23600 + }, + { + "epoch": 3.816991350739633, + "grad_norm": 0.7929515838623047, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 23610 + }, + { + "epoch": 3.8186080349203784, + "grad_norm": 1.3255881071090698, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 23620 + }, + { + "epoch": 3.8202247191011236, + "grad_norm": 0.9188598990440369, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 23630 + }, + { + "epoch": 3.821841403281869, + "grad_norm": 0.8811675906181335, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 23640 + }, + { + "epoch": 3.823458087462614, + "grad_norm": 0.8061038255691528, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 23650 + }, + { + "epoch": 3.8250747716433597, + "grad_norm": 0.9975376129150391, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 23660 + }, + { + "epoch": 3.826691455824105, + "grad_norm": 0.8036105036735535, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 23670 + }, + { + "epoch": 3.82830814000485, + "grad_norm": 0.7401984333992004, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 23680 + }, + { + "epoch": 3.8299248241855954, + "grad_norm": 0.829753041267395, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 23690 + }, + { + "epoch": 3.8315415083663407, + "grad_norm": 0.8753240704536438, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 23700 + }, + { + "epoch": 3.833158192547086, + "grad_norm": 0.8157842755317688, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 23710 + }, + { + "epoch": 3.834774876727831, + "grad_norm": 0.6183798909187317, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 23720 + }, + { + "epoch": 3.8363915609085764, + "grad_norm": 0.9548442363739014, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 23730 + }, + { + "epoch": 3.8380082450893216, + "grad_norm": 0.8319669961929321, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 23740 + }, + { + "epoch": 3.839624929270067, + "grad_norm": 0.9718693494796753, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 23750 + }, + { + "epoch": 3.8412416134508125, + "grad_norm": 0.8672235012054443, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 23760 + }, + { + "epoch": 3.8428582976315577, + "grad_norm": 1.1210707426071167, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 23770 + }, + { + "epoch": 3.844474981812303, + "grad_norm": 0.9177767634391785, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 23780 + }, + { + "epoch": 3.846091665993048, + "grad_norm": 0.8714171648025513, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 23790 + }, + { + "epoch": 3.8477083501737934, + "grad_norm": 1.1853246688842773, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 23800 + }, + { + "epoch": 3.849325034354539, + "grad_norm": 0.8091260194778442, + "learning_rate": 0.0002, + "loss": 0.6144, + "step": 23810 + }, + { + "epoch": 3.8509417185352843, + "grad_norm": 0.9710774421691895, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 23820 + }, + { + "epoch": 3.8525584027160296, + "grad_norm": 0.7648707628250122, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 23830 + }, + { + "epoch": 3.854175086896775, + "grad_norm": 0.7809253931045532, + "learning_rate": 0.0002, + "loss": 0.6013, + "step": 23840 + }, + { + "epoch": 3.85579177107752, + "grad_norm": 0.8337951898574829, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 23850 + }, + { + "epoch": 3.8574084552582653, + "grad_norm": 0.9271913170814514, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 23860 + }, + { + "epoch": 3.8590251394390105, + "grad_norm": 0.985334038734436, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 23870 + }, + { + "epoch": 3.8606418236197557, + "grad_norm": 0.8458583354949951, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 23880 + }, + { + "epoch": 3.862258507800501, + "grad_norm": 1.015348196029663, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 23890 + }, + { + "epoch": 3.8638751919812466, + "grad_norm": 1.0121688842773438, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 23900 + }, + { + "epoch": 3.865491876161992, + "grad_norm": 0.8883971571922302, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 23910 + }, + { + "epoch": 3.867108560342737, + "grad_norm": 1.028086543083191, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 23920 + }, + { + "epoch": 3.8687252445234823, + "grad_norm": 0.9645734429359436, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 23930 + }, + { + "epoch": 3.8703419287042276, + "grad_norm": 0.8235350251197815, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 23940 + }, + { + "epoch": 3.871958612884973, + "grad_norm": 1.0298916101455688, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 23950 + }, + { + "epoch": 3.8735752970657185, + "grad_norm": 1.0063377618789673, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 23960 + }, + { + "epoch": 3.8751919812464637, + "grad_norm": 0.9230626821517944, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 23970 + }, + { + "epoch": 3.876808665427209, + "grad_norm": 0.9243063926696777, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 23980 + }, + { + "epoch": 3.878425349607954, + "grad_norm": 1.0211291313171387, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 23990 + }, + { + "epoch": 3.8800420337886994, + "grad_norm": 0.7800535559654236, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 24000 + }, + { + "epoch": 3.8816587179694446, + "grad_norm": 0.7904248833656311, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 24010 + }, + { + "epoch": 3.88327540215019, + "grad_norm": 1.1975988149642944, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 24020 + }, + { + "epoch": 3.884892086330935, + "grad_norm": 1.0626593828201294, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 24030 + }, + { + "epoch": 3.8865087705116803, + "grad_norm": 0.9012193083763123, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 24040 + }, + { + "epoch": 3.888125454692426, + "grad_norm": 1.1159172058105469, + "learning_rate": 0.0002, + "loss": 0.6786, + "step": 24050 + }, + { + "epoch": 3.889742138873171, + "grad_norm": 1.276838779449463, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 24060 + }, + { + "epoch": 3.8913588230539164, + "grad_norm": 0.8467690348625183, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 24070 + }, + { + "epoch": 3.8929755072346617, + "grad_norm": 0.9862841963768005, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 24080 + }, + { + "epoch": 3.894592191415407, + "grad_norm": 0.7134621739387512, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 24090 + }, + { + "epoch": 3.896208875596152, + "grad_norm": 0.8178175091743469, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 24100 + }, + { + "epoch": 3.897825559776898, + "grad_norm": 0.9229172468185425, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 24110 + }, + { + "epoch": 3.899442243957643, + "grad_norm": 1.0878316164016724, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 24120 + }, + { + "epoch": 3.9010589281383883, + "grad_norm": 0.971645712852478, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 24130 + }, + { + "epoch": 3.9026756123191335, + "grad_norm": 0.8862188458442688, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 24140 + }, + { + "epoch": 3.9042922964998787, + "grad_norm": 0.9126982688903809, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 24150 + }, + { + "epoch": 3.905908980680624, + "grad_norm": 0.8833470940589905, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 24160 + }, + { + "epoch": 3.907525664861369, + "grad_norm": 0.8320947885513306, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 24170 + }, + { + "epoch": 3.9091423490421144, + "grad_norm": 0.9156602025032043, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 24180 + }, + { + "epoch": 3.9107590332228597, + "grad_norm": 1.029181957244873, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 24190 + }, + { + "epoch": 3.9123757174036053, + "grad_norm": 0.9052802324295044, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 24200 + }, + { + "epoch": 3.9139924015843506, + "grad_norm": 0.8847255110740662, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 24210 + }, + { + "epoch": 3.915609085765096, + "grad_norm": 0.9642062187194824, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 24220 + }, + { + "epoch": 3.917225769945841, + "grad_norm": 0.8629093766212463, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 24230 + }, + { + "epoch": 3.9188424541265863, + "grad_norm": 0.8674976825714111, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 24240 + }, + { + "epoch": 3.9204591383073315, + "grad_norm": 1.104846477508545, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 24250 + }, + { + "epoch": 3.922075822488077, + "grad_norm": 1.0874955654144287, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 24260 + }, + { + "epoch": 3.9236925066688224, + "grad_norm": 0.8689812421798706, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 24270 + }, + { + "epoch": 3.9253091908495676, + "grad_norm": 0.9724617004394531, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 24280 + }, + { + "epoch": 3.926925875030313, + "grad_norm": 0.9165538549423218, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 24290 + }, + { + "epoch": 3.928542559211058, + "grad_norm": 0.9307710528373718, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 24300 + }, + { + "epoch": 3.9301592433918033, + "grad_norm": 0.8589295148849487, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 24310 + }, + { + "epoch": 3.9317759275725486, + "grad_norm": 0.9151099920272827, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 24320 + }, + { + "epoch": 3.933392611753294, + "grad_norm": 0.9633517265319824, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 24330 + }, + { + "epoch": 3.935009295934039, + "grad_norm": 0.9521116018295288, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 24340 + }, + { + "epoch": 3.9366259801147847, + "grad_norm": 0.8366776704788208, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 24350 + }, + { + "epoch": 3.93824266429553, + "grad_norm": 0.8972663283348083, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 24360 + }, + { + "epoch": 3.939859348476275, + "grad_norm": 0.8102919459342957, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 24370 + }, + { + "epoch": 3.9414760326570204, + "grad_norm": 0.8189975023269653, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 24380 + }, + { + "epoch": 3.9430927168377656, + "grad_norm": 0.9569464921951294, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 24390 + }, + { + "epoch": 3.9447094010185113, + "grad_norm": 0.7459101676940918, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 24400 + }, + { + "epoch": 3.9463260851992565, + "grad_norm": 0.8536974787712097, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 24410 + }, + { + "epoch": 3.9479427693800018, + "grad_norm": 0.8763698935508728, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 24420 + }, + { + "epoch": 3.949559453560747, + "grad_norm": 0.9381106495857239, + "learning_rate": 0.0002, + "loss": 0.6478, + "step": 24430 + }, + { + "epoch": 3.9511761377414922, + "grad_norm": 0.934440016746521, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 24440 + }, + { + "epoch": 3.9527928219222375, + "grad_norm": 0.903918981552124, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 24450 + }, + { + "epoch": 3.9544095061029827, + "grad_norm": 0.8771953582763672, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 24460 + }, + { + "epoch": 3.956026190283728, + "grad_norm": 1.0375410318374634, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 24470 + }, + { + "epoch": 3.957642874464473, + "grad_norm": 0.9439185261726379, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 24480 + }, + { + "epoch": 3.9592595586452184, + "grad_norm": 0.935467004776001, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 24490 + }, + { + "epoch": 3.960876242825964, + "grad_norm": 0.6900772452354431, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 24500 + }, + { + "epoch": 3.9624929270067093, + "grad_norm": 1.0172916650772095, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 24510 + }, + { + "epoch": 3.9641096111874545, + "grad_norm": 0.9167046546936035, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 24520 + }, + { + "epoch": 3.9657262953681998, + "grad_norm": 0.7230527997016907, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 24530 + }, + { + "epoch": 3.967342979548945, + "grad_norm": 0.8980403542518616, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 24540 + }, + { + "epoch": 3.9689596637296907, + "grad_norm": 0.8555465936660767, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 24550 + }, + { + "epoch": 3.970576347910436, + "grad_norm": 0.7825445532798767, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 24560 + }, + { + "epoch": 3.972193032091181, + "grad_norm": 0.7273133993148804, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 24570 + }, + { + "epoch": 3.9738097162719264, + "grad_norm": 0.9612047672271729, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 24580 + }, + { + "epoch": 3.9754264004526716, + "grad_norm": 0.9865460991859436, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 24590 + }, + { + "epoch": 3.977043084633417, + "grad_norm": 0.8638762831687927, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 24600 + }, + { + "epoch": 3.978659768814162, + "grad_norm": 1.0096198320388794, + "learning_rate": 0.0002, + "loss": 0.6097, + "step": 24610 + }, + { + "epoch": 3.9802764529949073, + "grad_norm": 0.8475532531738281, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 24620 + }, + { + "epoch": 3.9818931371756525, + "grad_norm": 0.9696195721626282, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 24630 + }, + { + "epoch": 3.9835098213563978, + "grad_norm": 0.7499843239784241, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 24640 + }, + { + "epoch": 3.9851265055371434, + "grad_norm": 0.8865424990653992, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 24650 + }, + { + "epoch": 3.9867431897178887, + "grad_norm": 0.8089959025382996, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 24660 + }, + { + "epoch": 3.988359873898634, + "grad_norm": 0.6946012377738953, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 24670 + }, + { + "epoch": 3.989976558079379, + "grad_norm": 0.7991759181022644, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 24680 + }, + { + "epoch": 3.9915932422601244, + "grad_norm": 0.8803931474685669, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 24690 + }, + { + "epoch": 3.99320992644087, + "grad_norm": 0.8848299980163574, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 24700 + }, + { + "epoch": 3.9948266106216153, + "grad_norm": 0.7448889017105103, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 24710 + }, + { + "epoch": 3.9964432948023605, + "grad_norm": 0.9361620545387268, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 24720 + }, + { + "epoch": 3.9980599789831057, + "grad_norm": 0.9958081245422363, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 24730 + }, + { + "epoch": 3.999676663163851, + "grad_norm": 1.026004672050476, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 24740 + }, + { + "epoch": 4.0, + "eval_loss": 1.1524168252944946, + "eval_runtime": 122.1585, + "eval_samples_per_second": 6.0, + "eval_steps_per_second": 0.753, + "step": 24742 + } + ], + "logging_steps": 10, + "max_steps": 49480, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1450043604181975e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..155b12fa9acbc6e71dba75c92bfa79e152397ebf --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-24742/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28694d5564a2b5c7d6881d4ba2af103356aa22489d2c22768ebbe47283c0f4a1 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ce80f21ed1430250bb2ab7d46153081cf1e22877 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e04d544e4567e29a67e15f9106733f490e4c61532a4c007dbebfeecbd9ff4ec +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3342d83a2f98f5c73c723fb6acad633f3bb5c270 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6c898a033af64e51f08c03ec5d2a5e2b26f6c694581bc3ab8111a68bae869bd +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1a80f7f758218b04a02a67ab397d6b0e8797938b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b421df5e4949ec6ebb6abd76b5b9cb0ccf6cbe8d92c3f93f3da670bfcfed14a +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..46397082e444ae5e9364813106f3681d944fd9b2 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3901a3f254a869f9b316e3e369e0229c1e3af371b3b635bc272919551abe800b +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a280dfb0b2cea5818094da1a41a0c216d16a5214 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/trainer_state.json @@ -0,0 +1,21717 @@ +{ + "best_metric": 1.0871200561523438, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185", + "epoch": 4.9999191657909625, + "eval_steps": 10, + "global_step": 30927, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016166841807452913, + "grad_norm": 0.9894065856933594, + "learning_rate": 0.0002, + "loss": 1.6636, + "step": 10 + }, + { + "epoch": 0.0032333683614905826, + "grad_norm": 1.7810699939727783, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 20 + }, + { + "epoch": 0.004850052542235874, + "grad_norm": 0.5969577431678772, + "learning_rate": 0.0002, + "loss": 0.9767, + "step": 30 + }, + { + "epoch": 0.006466736722981165, + "grad_norm": 0.6354120969772339, + "learning_rate": 0.0002, + "loss": 0.9772, + "step": 40 + }, + { + "epoch": 0.008083420903726457, + "grad_norm": 0.5604607462882996, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 50 + }, + { + "epoch": 0.009700105084471748, + "grad_norm": 0.4676193594932556, + "learning_rate": 0.0002, + "loss": 0.8841, + "step": 60 + }, + { + "epoch": 0.01131678926521704, + "grad_norm": 0.6099211573600769, + "learning_rate": 0.0002, + "loss": 0.9022, + "step": 70 + }, + { + "epoch": 0.01293347344596233, + "grad_norm": 0.48639994859695435, + "learning_rate": 0.0002, + "loss": 0.9133, + "step": 80 + }, + { + "epoch": 0.014550157626707623, + "grad_norm": 0.4904264509677887, + "learning_rate": 0.0002, + "loss": 0.8704, + "step": 90 + }, + { + "epoch": 0.016166841807452915, + "grad_norm": 2.8334362506866455, + "learning_rate": 0.0002, + "loss": 0.8855, + "step": 100 + }, + { + "epoch": 0.017783525988198205, + "grad_norm": 0.43221670389175415, + "learning_rate": 0.0002, + "loss": 0.8958, + "step": 110 + }, + { + "epoch": 0.019400210168943496, + "grad_norm": 0.42244166135787964, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 120 + }, + { + "epoch": 0.02101689434968879, + "grad_norm": 0.45363298058509827, + "learning_rate": 0.0002, + "loss": 0.8467, + "step": 130 + }, + { + "epoch": 0.02263357853043408, + "grad_norm": 0.44816508889198303, + "learning_rate": 0.0002, + "loss": 0.8641, + "step": 140 + }, + { + "epoch": 0.02425026271117937, + "grad_norm": 0.43308213353157043, + "learning_rate": 0.0002, + "loss": 0.8496, + "step": 150 + }, + { + "epoch": 0.02586694689192466, + "grad_norm": 0.4084763526916504, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 160 + }, + { + "epoch": 0.027483631072669955, + "grad_norm": 0.5363703966140747, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 170 + }, + { + "epoch": 0.029100315253415245, + "grad_norm": 0.4619699716567993, + "learning_rate": 0.0002, + "loss": 0.8558, + "step": 180 + }, + { + "epoch": 0.030716999434160536, + "grad_norm": 0.49069908261299133, + "learning_rate": 0.0002, + "loss": 0.8878, + "step": 190 + }, + { + "epoch": 0.03233368361490583, + "grad_norm": 0.4645835757255554, + "learning_rate": 0.0002, + "loss": 0.8867, + "step": 200 + }, + { + "epoch": 0.03395036779565112, + "grad_norm": 1.2411243915557861, + "learning_rate": 0.0002, + "loss": 0.8842, + "step": 210 + }, + { + "epoch": 0.03556705197639641, + "grad_norm": 0.5211851596832275, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 220 + }, + { + "epoch": 0.037183736157141704, + "grad_norm": 0.5253691673278809, + "learning_rate": 0.0002, + "loss": 0.8194, + "step": 230 + }, + { + "epoch": 0.03880042033788699, + "grad_norm": 0.4567478895187378, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 240 + }, + { + "epoch": 0.040417104518632285, + "grad_norm": 0.5472128391265869, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 250 + }, + { + "epoch": 0.04203378869937758, + "grad_norm": 0.42978546023368835, + "learning_rate": 0.0002, + "loss": 0.8201, + "step": 260 + }, + { + "epoch": 0.043650472880122866, + "grad_norm": 0.601734459400177, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 270 + }, + { + "epoch": 0.04526715706086816, + "grad_norm": 0.4286513328552246, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 280 + }, + { + "epoch": 0.046883841241613454, + "grad_norm": 0.5230861902236938, + "learning_rate": 0.0002, + "loss": 0.8758, + "step": 290 + }, + { + "epoch": 0.04850052542235874, + "grad_norm": 0.6504611968994141, + "learning_rate": 0.0002, + "loss": 0.8636, + "step": 300 + }, + { + "epoch": 0.050117209603104035, + "grad_norm": 0.43485215306282043, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 310 + }, + { + "epoch": 0.05173389378384932, + "grad_norm": 0.4717007875442505, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 320 + }, + { + "epoch": 0.053350577964594616, + "grad_norm": 0.4059787690639496, + "learning_rate": 0.0002, + "loss": 0.8469, + "step": 330 + }, + { + "epoch": 0.05496726214533991, + "grad_norm": 0.4366913437843323, + "learning_rate": 0.0002, + "loss": 0.8866, + "step": 340 + }, + { + "epoch": 0.0565839463260852, + "grad_norm": 0.4233848452568054, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 350 + }, + { + "epoch": 0.05820063050683049, + "grad_norm": 0.4209108352661133, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 360 + }, + { + "epoch": 0.059817314687575784, + "grad_norm": 0.41637396812438965, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 370 + }, + { + "epoch": 0.06143399886832107, + "grad_norm": 0.46235376596450806, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 380 + }, + { + "epoch": 0.06305068304906636, + "grad_norm": 0.4013484716415405, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 390 + }, + { + "epoch": 0.06466736722981166, + "grad_norm": 0.47443896532058716, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 400 + }, + { + "epoch": 0.06628405141055695, + "grad_norm": 0.3942156434059143, + "learning_rate": 0.0002, + "loss": 0.8666, + "step": 410 + }, + { + "epoch": 0.06790073559130223, + "grad_norm": 0.4965320825576782, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 420 + }, + { + "epoch": 0.06951741977204753, + "grad_norm": 0.4304835796356201, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 430 + }, + { + "epoch": 0.07113410395279282, + "grad_norm": 0.511726975440979, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 440 + }, + { + "epoch": 0.07275078813353811, + "grad_norm": 0.4040689170360565, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 450 + }, + { + "epoch": 0.07436747231428341, + "grad_norm": 0.5402171015739441, + "learning_rate": 0.0002, + "loss": 0.8788, + "step": 460 + }, + { + "epoch": 0.0759841564950287, + "grad_norm": 0.4174517095088959, + "learning_rate": 0.0002, + "loss": 0.8737, + "step": 470 + }, + { + "epoch": 0.07760084067577398, + "grad_norm": 0.4306182265281677, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 480 + }, + { + "epoch": 0.07921752485651928, + "grad_norm": 0.535210132598877, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 490 + }, + { + "epoch": 0.08083420903726457, + "grad_norm": 0.5339109897613525, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 500 + }, + { + "epoch": 0.08245089321800986, + "grad_norm": 0.45754891633987427, + "learning_rate": 0.0002, + "loss": 0.8985, + "step": 510 + }, + { + "epoch": 0.08406757739875516, + "grad_norm": 0.43820783495903015, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 520 + }, + { + "epoch": 0.08568426157950045, + "grad_norm": 0.4434749186038971, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 530 + }, + { + "epoch": 0.08730094576024573, + "grad_norm": 0.43111467361450195, + "learning_rate": 0.0002, + "loss": 0.7857, + "step": 540 + }, + { + "epoch": 0.08891762994099103, + "grad_norm": 0.4378940165042877, + "learning_rate": 0.0002, + "loss": 0.8418, + "step": 550 + }, + { + "epoch": 0.09053431412173632, + "grad_norm": 0.4772215187549591, + "learning_rate": 0.0002, + "loss": 0.8361, + "step": 560 + }, + { + "epoch": 0.09215099830248161, + "grad_norm": 0.6837629079818726, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 570 + }, + { + "epoch": 0.09376768248322691, + "grad_norm": 0.42241212725639343, + "learning_rate": 0.0002, + "loss": 0.8607, + "step": 580 + }, + { + "epoch": 0.0953843666639722, + "grad_norm": 0.5165936350822449, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 590 + }, + { + "epoch": 0.09700105084471748, + "grad_norm": 0.48737478256225586, + "learning_rate": 0.0002, + "loss": 0.8664, + "step": 600 + }, + { + "epoch": 0.09861773502546278, + "grad_norm": 0.47419852018356323, + "learning_rate": 0.0002, + "loss": 0.8806, + "step": 610 + }, + { + "epoch": 0.10023441920620807, + "grad_norm": 0.4975486099720001, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 620 + }, + { + "epoch": 0.10185110338695336, + "grad_norm": 0.49123844504356384, + "learning_rate": 0.0002, + "loss": 0.8548, + "step": 630 + }, + { + "epoch": 0.10346778756769864, + "grad_norm": 0.6288952827453613, + "learning_rate": 0.0002, + "loss": 0.8911, + "step": 640 + }, + { + "epoch": 0.10508447174844394, + "grad_norm": 0.4277345836162567, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 650 + }, + { + "epoch": 0.10670115592918923, + "grad_norm": 0.4021061956882477, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 660 + }, + { + "epoch": 0.10831784010993452, + "grad_norm": 0.3492237329483032, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 670 + }, + { + "epoch": 0.10993452429067982, + "grad_norm": 0.4341012239456177, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 680 + }, + { + "epoch": 0.1115512084714251, + "grad_norm": 0.7296304106712341, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 690 + }, + { + "epoch": 0.1131678926521704, + "grad_norm": 0.397494912147522, + "learning_rate": 0.0002, + "loss": 0.8433, + "step": 700 + }, + { + "epoch": 0.1147845768329157, + "grad_norm": 0.396431028842926, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 710 + }, + { + "epoch": 0.11640126101366098, + "grad_norm": 0.48842838406562805, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 720 + }, + { + "epoch": 0.11801794519440627, + "grad_norm": 0.46322616934776306, + "learning_rate": 0.0002, + "loss": 0.8238, + "step": 730 + }, + { + "epoch": 0.11963462937515157, + "grad_norm": 0.47990912199020386, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 740 + }, + { + "epoch": 0.12125131355589686, + "grad_norm": 0.4997142255306244, + "learning_rate": 0.0002, + "loss": 0.82, + "step": 750 + }, + { + "epoch": 0.12286799773664214, + "grad_norm": 0.4040526747703552, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 760 + }, + { + "epoch": 0.12448468191738744, + "grad_norm": 0.453095942735672, + "learning_rate": 0.0002, + "loss": 0.863, + "step": 770 + }, + { + "epoch": 0.12610136609813272, + "grad_norm": 0.4636971950531006, + "learning_rate": 0.0002, + "loss": 0.8792, + "step": 780 + }, + { + "epoch": 0.12771805027887803, + "grad_norm": 0.4279276132583618, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 790 + }, + { + "epoch": 0.12933473445962332, + "grad_norm": 0.46212655305862427, + "learning_rate": 0.0002, + "loss": 0.8711, + "step": 800 + }, + { + "epoch": 0.1309514186403686, + "grad_norm": 0.43127650022506714, + "learning_rate": 0.0002, + "loss": 0.8368, + "step": 810 + }, + { + "epoch": 0.1325681028211139, + "grad_norm": 0.4201301336288452, + "learning_rate": 0.0002, + "loss": 0.8476, + "step": 820 + }, + { + "epoch": 0.13418478700185918, + "grad_norm": 0.42583167552948, + "learning_rate": 0.0002, + "loss": 0.8078, + "step": 830 + }, + { + "epoch": 0.13580147118260447, + "grad_norm": 0.4535622000694275, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 840 + }, + { + "epoch": 0.13741815536334978, + "grad_norm": 0.4116036891937256, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 850 + }, + { + "epoch": 0.13903483954409507, + "grad_norm": 0.45997580885887146, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 860 + }, + { + "epoch": 0.14065152372484035, + "grad_norm": 0.4487837255001068, + "learning_rate": 0.0002, + "loss": 0.8917, + "step": 870 + }, + { + "epoch": 0.14226820790558564, + "grad_norm": 0.43650057911872864, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 880 + }, + { + "epoch": 0.14388489208633093, + "grad_norm": 0.5335358381271362, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 890 + }, + { + "epoch": 0.14550157626707622, + "grad_norm": 0.5989000201225281, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 900 + }, + { + "epoch": 0.14711826044782153, + "grad_norm": 0.517179012298584, + "learning_rate": 0.0002, + "loss": 0.8385, + "step": 910 + }, + { + "epoch": 0.14873494462856682, + "grad_norm": 0.44435232877731323, + "learning_rate": 0.0002, + "loss": 0.8255, + "step": 920 + }, + { + "epoch": 0.1503516288093121, + "grad_norm": 0.42635923624038696, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 930 + }, + { + "epoch": 0.1519683129900574, + "grad_norm": 0.49603334069252014, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 940 + }, + { + "epoch": 0.15358499717080268, + "grad_norm": 0.40639808773994446, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 950 + }, + { + "epoch": 0.15520168135154797, + "grad_norm": 0.4850759208202362, + "learning_rate": 0.0002, + "loss": 0.8529, + "step": 960 + }, + { + "epoch": 0.15681836553229328, + "grad_norm": 0.4427442252635956, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 970 + }, + { + "epoch": 0.15843504971303857, + "grad_norm": 0.3760930001735687, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 980 + }, + { + "epoch": 0.16005173389378385, + "grad_norm": 0.4794144332408905, + "learning_rate": 0.0002, + "loss": 0.8644, + "step": 990 + }, + { + "epoch": 0.16166841807452914, + "grad_norm": 0.45828768610954285, + "learning_rate": 0.0002, + "loss": 0.8002, + "step": 1000 + }, + { + "epoch": 0.16328510225527443, + "grad_norm": 0.6313053369522095, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 1010 + }, + { + "epoch": 0.16490178643601971, + "grad_norm": 0.45041006803512573, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1020 + }, + { + "epoch": 0.166518470616765, + "grad_norm": 0.441403865814209, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1030 + }, + { + "epoch": 0.16813515479751032, + "grad_norm": 0.8171296119689941, + "learning_rate": 0.0002, + "loss": 0.8475, + "step": 1040 + }, + { + "epoch": 0.1697518389782556, + "grad_norm": 0.7137420773506165, + "learning_rate": 0.0002, + "loss": 0.845, + "step": 1050 + }, + { + "epoch": 0.1713685231590009, + "grad_norm": 0.5236809849739075, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 1060 + }, + { + "epoch": 0.17298520733974618, + "grad_norm": 0.5021864175796509, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 1070 + }, + { + "epoch": 0.17460189152049146, + "grad_norm": 0.47347521781921387, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 1080 + }, + { + "epoch": 0.17621857570123675, + "grad_norm": 0.4631653428077698, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 1090 + }, + { + "epoch": 0.17783525988198207, + "grad_norm": 0.49169182777404785, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 1100 + }, + { + "epoch": 0.17945194406272735, + "grad_norm": 0.5019739270210266, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 1110 + }, + { + "epoch": 0.18106862824347264, + "grad_norm": 0.5100422501564026, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 1120 + }, + { + "epoch": 0.18268531242421793, + "grad_norm": 0.3888324499130249, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 1130 + }, + { + "epoch": 0.18430199660496321, + "grad_norm": 0.39765217900276184, + "learning_rate": 0.0002, + "loss": 0.8533, + "step": 1140 + }, + { + "epoch": 0.1859186807857085, + "grad_norm": 0.47190186381340027, + "learning_rate": 0.0002, + "loss": 0.8541, + "step": 1150 + }, + { + "epoch": 0.18753536496645382, + "grad_norm": 0.4464188814163208, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1160 + }, + { + "epoch": 0.1891520491471991, + "grad_norm": 0.5153930187225342, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 1170 + }, + { + "epoch": 0.1907687333279444, + "grad_norm": 0.4779708683490753, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 1180 + }, + { + "epoch": 0.19238541750868968, + "grad_norm": 0.4834315776824951, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 1190 + }, + { + "epoch": 0.19400210168943496, + "grad_norm": 0.402357816696167, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 1200 + }, + { + "epoch": 0.19561878587018025, + "grad_norm": 0.45899084210395813, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 1210 + }, + { + "epoch": 0.19723547005092557, + "grad_norm": 0.5106529593467712, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 1220 + }, + { + "epoch": 0.19885215423167085, + "grad_norm": 0.45261722803115845, + "learning_rate": 0.0002, + "loss": 0.7816, + "step": 1230 + }, + { + "epoch": 0.20046883841241614, + "grad_norm": 0.4647127091884613, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 1240 + }, + { + "epoch": 0.20208552259316143, + "grad_norm": 0.4849368929862976, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 1250 + }, + { + "epoch": 0.2037022067739067, + "grad_norm": 0.4518061578273773, + "learning_rate": 0.0002, + "loss": 0.8514, + "step": 1260 + }, + { + "epoch": 0.205318890954652, + "grad_norm": 0.49535325169563293, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 1270 + }, + { + "epoch": 0.2069355751353973, + "grad_norm": 0.4835205376148224, + "learning_rate": 0.0002, + "loss": 0.8348, + "step": 1280 + }, + { + "epoch": 0.2085522593161426, + "grad_norm": 0.45308539271354675, + "learning_rate": 0.0002, + "loss": 0.8428, + "step": 1290 + }, + { + "epoch": 0.2101689434968879, + "grad_norm": 0.5369905233383179, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 1300 + }, + { + "epoch": 0.21178562767763318, + "grad_norm": 0.5031622052192688, + "learning_rate": 0.0002, + "loss": 0.8676, + "step": 1310 + }, + { + "epoch": 0.21340231185837846, + "grad_norm": 0.48010334372520447, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 1320 + }, + { + "epoch": 0.21501899603912375, + "grad_norm": 0.4905701279640198, + "learning_rate": 0.0002, + "loss": 0.806, + "step": 1330 + }, + { + "epoch": 0.21663568021986904, + "grad_norm": 0.43531742691993713, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 1340 + }, + { + "epoch": 0.21825236440061435, + "grad_norm": 0.44330692291259766, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 1350 + }, + { + "epoch": 0.21986904858135964, + "grad_norm": 0.5384416580200195, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 1360 + }, + { + "epoch": 0.22148573276210493, + "grad_norm": 0.4181833863258362, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 1370 + }, + { + "epoch": 0.2231024169428502, + "grad_norm": 0.523833692073822, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 1380 + }, + { + "epoch": 0.2247191011235955, + "grad_norm": 0.5528736710548401, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 1390 + }, + { + "epoch": 0.2263357853043408, + "grad_norm": 0.43515023589134216, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 1400 + }, + { + "epoch": 0.2279524694850861, + "grad_norm": 0.48809877038002014, + "learning_rate": 0.0002, + "loss": 0.8403, + "step": 1410 + }, + { + "epoch": 0.2295691536658314, + "grad_norm": 0.43591251969337463, + "learning_rate": 0.0002, + "loss": 0.8165, + "step": 1420 + }, + { + "epoch": 0.23118583784657668, + "grad_norm": 0.44625312089920044, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 1430 + }, + { + "epoch": 0.23280252202732196, + "grad_norm": 0.4390665292739868, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 1440 + }, + { + "epoch": 0.23441920620806725, + "grad_norm": 0.48496049642562866, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 1450 + }, + { + "epoch": 0.23603589038881254, + "grad_norm": 0.45919957756996155, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 1460 + }, + { + "epoch": 0.23765257456955785, + "grad_norm": 0.5471845865249634, + "learning_rate": 0.0002, + "loss": 0.8659, + "step": 1470 + }, + { + "epoch": 0.23926925875030314, + "grad_norm": 0.47269317507743835, + "learning_rate": 0.0002, + "loss": 0.8164, + "step": 1480 + }, + { + "epoch": 0.24088594293104842, + "grad_norm": 0.4930245578289032, + "learning_rate": 0.0002, + "loss": 0.854, + "step": 1490 + }, + { + "epoch": 0.2425026271117937, + "grad_norm": 0.5605630278587341, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 1500 + }, + { + "epoch": 0.244119311292539, + "grad_norm": 0.4435870945453644, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 1510 + }, + { + "epoch": 0.24573599547328429, + "grad_norm": 0.4941999912261963, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 1520 + }, + { + "epoch": 0.24735267965402957, + "grad_norm": 0.5100624561309814, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 1530 + }, + { + "epoch": 0.2489693638347749, + "grad_norm": 0.4638267457485199, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 1540 + }, + { + "epoch": 0.25058604801552015, + "grad_norm": 0.5071570873260498, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 1550 + }, + { + "epoch": 0.25220273219626543, + "grad_norm": 0.4291319251060486, + "learning_rate": 0.0002, + "loss": 0.7724, + "step": 1560 + }, + { + "epoch": 0.2538194163770108, + "grad_norm": 0.5388049483299255, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 1570 + }, + { + "epoch": 0.25543610055775606, + "grad_norm": 0.5083683729171753, + "learning_rate": 0.0002, + "loss": 0.8176, + "step": 1580 + }, + { + "epoch": 0.25705278473850135, + "grad_norm": 0.4824463725090027, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 1590 + }, + { + "epoch": 0.25866946891924664, + "grad_norm": 0.41177722811698914, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 1600 + }, + { + "epoch": 0.2602861530999919, + "grad_norm": 0.5656219124794006, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 1610 + }, + { + "epoch": 0.2619028372807372, + "grad_norm": 0.41063204407691956, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 1620 + }, + { + "epoch": 0.2635195214614825, + "grad_norm": 0.4897061288356781, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 1630 + }, + { + "epoch": 0.2651362056422278, + "grad_norm": 0.4454376697540283, + "learning_rate": 0.0002, + "loss": 0.8198, + "step": 1640 + }, + { + "epoch": 0.26675288982297307, + "grad_norm": 0.4355238378047943, + "learning_rate": 0.0002, + "loss": 0.8684, + "step": 1650 + }, + { + "epoch": 0.26836957400371836, + "grad_norm": 0.458310067653656, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 1660 + }, + { + "epoch": 0.26998625818446365, + "grad_norm": 0.4752083718776703, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 1670 + }, + { + "epoch": 0.27160294236520893, + "grad_norm": 0.4666106402873993, + "learning_rate": 0.0002, + "loss": 0.8267, + "step": 1680 + }, + { + "epoch": 0.2732196265459543, + "grad_norm": 0.4213818609714508, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 1690 + }, + { + "epoch": 0.27483631072669956, + "grad_norm": 0.5768913626670837, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 1700 + }, + { + "epoch": 0.27645299490744485, + "grad_norm": 0.4209914803504944, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 1710 + }, + { + "epoch": 0.27806967908819014, + "grad_norm": 0.501909613609314, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 1720 + }, + { + "epoch": 0.2796863632689354, + "grad_norm": 0.5266261100769043, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 1730 + }, + { + "epoch": 0.2813030474496807, + "grad_norm": 0.43806859850883484, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 1740 + }, + { + "epoch": 0.282919731630426, + "grad_norm": 0.46048814058303833, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 1750 + }, + { + "epoch": 0.2845364158111713, + "grad_norm": 0.44972819089889526, + "learning_rate": 0.0002, + "loss": 0.8596, + "step": 1760 + }, + { + "epoch": 0.28615309999191657, + "grad_norm": 0.5114831328392029, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 1770 + }, + { + "epoch": 0.28776978417266186, + "grad_norm": 0.47931742668151855, + "learning_rate": 0.0002, + "loss": 0.8361, + "step": 1780 + }, + { + "epoch": 0.28938646835340714, + "grad_norm": 0.5092599987983704, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 1790 + }, + { + "epoch": 0.29100315253415243, + "grad_norm": 0.37581443786621094, + "learning_rate": 0.0002, + "loss": 0.8506, + "step": 1800 + }, + { + "epoch": 0.2926198367148977, + "grad_norm": 0.47097381949424744, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 1810 + }, + { + "epoch": 0.29423652089564306, + "grad_norm": 0.48300236463546753, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 1820 + }, + { + "epoch": 0.29585320507638835, + "grad_norm": 0.5600419640541077, + "learning_rate": 0.0002, + "loss": 0.8391, + "step": 1830 + }, + { + "epoch": 0.29746988925713364, + "grad_norm": 0.48555272817611694, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 1840 + }, + { + "epoch": 0.2990865734378789, + "grad_norm": 0.3752668499946594, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 1850 + }, + { + "epoch": 0.3007032576186242, + "grad_norm": 0.5328747034072876, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 1860 + }, + { + "epoch": 0.3023199417993695, + "grad_norm": 0.48716455698013306, + "learning_rate": 0.0002, + "loss": 0.8426, + "step": 1870 + }, + { + "epoch": 0.3039366259801148, + "grad_norm": 0.5011493563652039, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 1880 + }, + { + "epoch": 0.30555331016086007, + "grad_norm": 0.46461427211761475, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 1890 + }, + { + "epoch": 0.30716999434160536, + "grad_norm": 0.36630210280418396, + "learning_rate": 0.0002, + "loss": 0.8478, + "step": 1900 + }, + { + "epoch": 0.30878667852235064, + "grad_norm": 0.4217296242713928, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 1910 + }, + { + "epoch": 0.31040336270309593, + "grad_norm": 0.4394875466823578, + "learning_rate": 0.0002, + "loss": 0.8128, + "step": 1920 + }, + { + "epoch": 0.3120200468838412, + "grad_norm": 0.6587965488433838, + "learning_rate": 0.0002, + "loss": 0.8471, + "step": 1930 + }, + { + "epoch": 0.31363673106458656, + "grad_norm": 0.5469298958778381, + "learning_rate": 0.0002, + "loss": 0.8565, + "step": 1940 + }, + { + "epoch": 0.31525341524533185, + "grad_norm": 0.4371595084667206, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 1950 + }, + { + "epoch": 0.31687009942607713, + "grad_norm": 0.4809541404247284, + "learning_rate": 0.0002, + "loss": 0.887, + "step": 1960 + }, + { + "epoch": 0.3184867836068224, + "grad_norm": 0.6061086654663086, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 1970 + }, + { + "epoch": 0.3201034677875677, + "grad_norm": 0.5342657566070557, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 1980 + }, + { + "epoch": 0.321720151968313, + "grad_norm": 0.5057743787765503, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 1990 + }, + { + "epoch": 0.3233368361490583, + "grad_norm": 0.528626024723053, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 2000 + }, + { + "epoch": 0.32495352032980357, + "grad_norm": 0.46742770075798035, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2010 + }, + { + "epoch": 0.32657020451054886, + "grad_norm": 0.515101432800293, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 2020 + }, + { + "epoch": 0.32818688869129414, + "grad_norm": 0.41941216588020325, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 2030 + }, + { + "epoch": 0.32980357287203943, + "grad_norm": 0.49902522563934326, + "learning_rate": 0.0002, + "loss": 0.7637, + "step": 2040 + }, + { + "epoch": 0.3314202570527847, + "grad_norm": 0.4120897650718689, + "learning_rate": 0.0002, + "loss": 0.7822, + "step": 2050 + }, + { + "epoch": 0.33303694123353, + "grad_norm": 0.45352041721343994, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 2060 + }, + { + "epoch": 0.33465362541427535, + "grad_norm": 0.523199737071991, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 2070 + }, + { + "epoch": 0.33627030959502063, + "grad_norm": 0.4390358626842499, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 2080 + }, + { + "epoch": 0.3378869937757659, + "grad_norm": 0.6752901077270508, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 2090 + }, + { + "epoch": 0.3395036779565112, + "grad_norm": 0.547821044921875, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 2100 + }, + { + "epoch": 0.3411203621372565, + "grad_norm": 0.5161308646202087, + "learning_rate": 0.0002, + "loss": 0.8561, + "step": 2110 + }, + { + "epoch": 0.3427370463180018, + "grad_norm": 0.4565401077270508, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 2120 + }, + { + "epoch": 0.34435373049874707, + "grad_norm": 0.4666115939617157, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 2130 + }, + { + "epoch": 0.34597041467949236, + "grad_norm": 0.4090428352355957, + "learning_rate": 0.0002, + "loss": 0.8189, + "step": 2140 + }, + { + "epoch": 0.34758709886023764, + "grad_norm": 0.510845422744751, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2150 + }, + { + "epoch": 0.34920378304098293, + "grad_norm": 0.42861923575401306, + "learning_rate": 0.0002, + "loss": 0.8398, + "step": 2160 + }, + { + "epoch": 0.3508204672217282, + "grad_norm": 0.4476332664489746, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 2170 + }, + { + "epoch": 0.3524371514024735, + "grad_norm": 0.6065791249275208, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2180 + }, + { + "epoch": 0.35405383558321885, + "grad_norm": 0.42335066199302673, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 2190 + }, + { + "epoch": 0.35567051976396413, + "grad_norm": 0.5094629526138306, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 2200 + }, + { + "epoch": 0.3572872039447094, + "grad_norm": 0.5476373434066772, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 2210 + }, + { + "epoch": 0.3589038881254547, + "grad_norm": 0.3911719024181366, + "learning_rate": 0.0002, + "loss": 0.814, + "step": 2220 + }, + { + "epoch": 0.3605205723062, + "grad_norm": 0.6599636077880859, + "learning_rate": 0.0002, + "loss": 0.8599, + "step": 2230 + }, + { + "epoch": 0.3621372564869453, + "grad_norm": 0.40381914377212524, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 2240 + }, + { + "epoch": 0.36375394066769057, + "grad_norm": 0.4433908462524414, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 2250 + }, + { + "epoch": 0.36537062484843585, + "grad_norm": 0.578326940536499, + "learning_rate": 0.0002, + "loss": 0.8503, + "step": 2260 + }, + { + "epoch": 0.36698730902918114, + "grad_norm": 0.5734784007072449, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2270 + }, + { + "epoch": 0.36860399320992643, + "grad_norm": 0.45555487275123596, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 2280 + }, + { + "epoch": 0.3702206773906717, + "grad_norm": 0.5666276216506958, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 2290 + }, + { + "epoch": 0.371837361571417, + "grad_norm": 0.5461117625236511, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 2300 + }, + { + "epoch": 0.3734540457521623, + "grad_norm": 0.6318911910057068, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 2310 + }, + { + "epoch": 0.37507072993290763, + "grad_norm": 0.493263304233551, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 2320 + }, + { + "epoch": 0.3766874141136529, + "grad_norm": 0.5888760089874268, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 2330 + }, + { + "epoch": 0.3783040982943982, + "grad_norm": 0.48671841621398926, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 2340 + }, + { + "epoch": 0.3799207824751435, + "grad_norm": 0.4385145306587219, + "learning_rate": 0.0002, + "loss": 0.8367, + "step": 2350 + }, + { + "epoch": 0.3815374666558888, + "grad_norm": 0.5523318648338318, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 2360 + }, + { + "epoch": 0.38315415083663407, + "grad_norm": 0.7308220267295837, + "learning_rate": 0.0002, + "loss": 0.8351, + "step": 2370 + }, + { + "epoch": 0.38477083501737935, + "grad_norm": 0.554214358329773, + "learning_rate": 0.0002, + "loss": 0.859, + "step": 2380 + }, + { + "epoch": 0.38638751919812464, + "grad_norm": 0.5425800085067749, + "learning_rate": 0.0002, + "loss": 0.8146, + "step": 2390 + }, + { + "epoch": 0.3880042033788699, + "grad_norm": 0.48811158537864685, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 2400 + }, + { + "epoch": 0.3896208875596152, + "grad_norm": 0.49212366342544556, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 2410 + }, + { + "epoch": 0.3912375717403605, + "grad_norm": 0.5222218632698059, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 2420 + }, + { + "epoch": 0.3928542559211058, + "grad_norm": 0.4699819087982178, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 2430 + }, + { + "epoch": 0.39447094010185113, + "grad_norm": 0.46153587102890015, + "learning_rate": 0.0002, + "loss": 0.7919, + "step": 2440 + }, + { + "epoch": 0.3960876242825964, + "grad_norm": 0.4150611162185669, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 2450 + }, + { + "epoch": 0.3977043084633417, + "grad_norm": 0.5799614787101746, + "learning_rate": 0.0002, + "loss": 0.8589, + "step": 2460 + }, + { + "epoch": 0.399320992644087, + "grad_norm": 0.56536865234375, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 2470 + }, + { + "epoch": 0.4009376768248323, + "grad_norm": 0.5451247096061707, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 2480 + }, + { + "epoch": 0.40255436100557757, + "grad_norm": 0.5914521217346191, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 2490 + }, + { + "epoch": 0.40417104518632285, + "grad_norm": 0.4428117275238037, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 2500 + }, + { + "epoch": 0.40578772936706814, + "grad_norm": 0.48580947518348694, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 2510 + }, + { + "epoch": 0.4074044135478134, + "grad_norm": 0.436734676361084, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 2520 + }, + { + "epoch": 0.4090210977285587, + "grad_norm": 0.5752223134040833, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 2530 + }, + { + "epoch": 0.410637781909304, + "grad_norm": 0.4271308183670044, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2540 + }, + { + "epoch": 0.4122544660900493, + "grad_norm": 0.46294718980789185, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 2550 + }, + { + "epoch": 0.4138711502707946, + "grad_norm": 0.49407583475112915, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 2560 + }, + { + "epoch": 0.4154878344515399, + "grad_norm": 0.4729035496711731, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 2570 + }, + { + "epoch": 0.4171045186322852, + "grad_norm": 0.4129747152328491, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 2580 + }, + { + "epoch": 0.4187212028130305, + "grad_norm": 0.5684236288070679, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 2590 + }, + { + "epoch": 0.4203378869937758, + "grad_norm": 0.4862157106399536, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 2600 + }, + { + "epoch": 0.42195457117452106, + "grad_norm": 0.46567976474761963, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 2610 + }, + { + "epoch": 0.42357125535526635, + "grad_norm": 0.5710650682449341, + "learning_rate": 0.0002, + "loss": 0.7755, + "step": 2620 + }, + { + "epoch": 0.42518793953601164, + "grad_norm": 0.5660041570663452, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 2630 + }, + { + "epoch": 0.4268046237167569, + "grad_norm": 0.47944375872612, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 2640 + }, + { + "epoch": 0.4284213078975022, + "grad_norm": 0.537223756313324, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 2650 + }, + { + "epoch": 0.4300379920782475, + "grad_norm": 0.41669997572898865, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 2660 + }, + { + "epoch": 0.4316546762589928, + "grad_norm": 0.44727686047554016, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 2670 + }, + { + "epoch": 0.4332713604397381, + "grad_norm": 0.5600888729095459, + "learning_rate": 0.0002, + "loss": 0.8241, + "step": 2680 + }, + { + "epoch": 0.4348880446204834, + "grad_norm": 0.39820605516433716, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 2690 + }, + { + "epoch": 0.4365047288012287, + "grad_norm": 0.5637655854225159, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 2700 + }, + { + "epoch": 0.438121412981974, + "grad_norm": 0.6363666653633118, + "learning_rate": 0.0002, + "loss": 0.855, + "step": 2710 + }, + { + "epoch": 0.4397380971627193, + "grad_norm": 0.5656129121780396, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 2720 + }, + { + "epoch": 0.44135478134346456, + "grad_norm": 0.5600156188011169, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2730 + }, + { + "epoch": 0.44297146552420985, + "grad_norm": 0.5506579875946045, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 2740 + }, + { + "epoch": 0.44458814970495514, + "grad_norm": 0.49878305196762085, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 2750 + }, + { + "epoch": 0.4462048338857004, + "grad_norm": 0.4569213092327118, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 2760 + }, + { + "epoch": 0.4478215180664457, + "grad_norm": 0.6056680083274841, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 2770 + }, + { + "epoch": 0.449438202247191, + "grad_norm": 0.44474557042121887, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 2780 + }, + { + "epoch": 0.4510548864279363, + "grad_norm": 0.46055394411087036, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 2790 + }, + { + "epoch": 0.4526715706086816, + "grad_norm": 0.4904133379459381, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 2800 + }, + { + "epoch": 0.45428825478942686, + "grad_norm": 0.5647031664848328, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 2810 + }, + { + "epoch": 0.4559049389701722, + "grad_norm": 0.5759473443031311, + "learning_rate": 0.0002, + "loss": 0.8622, + "step": 2820 + }, + { + "epoch": 0.4575216231509175, + "grad_norm": 0.5161895751953125, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 2830 + }, + { + "epoch": 0.4591383073316628, + "grad_norm": 0.4248254597187042, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 2840 + }, + { + "epoch": 0.46075499151240806, + "grad_norm": 0.45395001769065857, + "learning_rate": 0.0002, + "loss": 0.7838, + "step": 2850 + }, + { + "epoch": 0.46237167569315335, + "grad_norm": 0.5358697772026062, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2860 + }, + { + "epoch": 0.46398835987389864, + "grad_norm": 0.5379165410995483, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 2870 + }, + { + "epoch": 0.4656050440546439, + "grad_norm": 0.4601989686489105, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 2880 + }, + { + "epoch": 0.4672217282353892, + "grad_norm": 0.671115517616272, + "learning_rate": 0.0002, + "loss": 0.8523, + "step": 2890 + }, + { + "epoch": 0.4688384124161345, + "grad_norm": 0.4425133168697357, + "learning_rate": 0.0002, + "loss": 0.8262, + "step": 2900 + }, + { + "epoch": 0.4704550965968798, + "grad_norm": 0.5446155071258545, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2910 + }, + { + "epoch": 0.47207178077762507, + "grad_norm": 0.603306233882904, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 2920 + }, + { + "epoch": 0.47368846495837036, + "grad_norm": 0.5377997159957886, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 2930 + }, + { + "epoch": 0.4753051491391157, + "grad_norm": 0.4931027591228485, + "learning_rate": 0.0002, + "loss": 0.8075, + "step": 2940 + }, + { + "epoch": 0.476921833319861, + "grad_norm": 0.4711960256099701, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 2950 + }, + { + "epoch": 0.4785385175006063, + "grad_norm": 0.5020492672920227, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 2960 + }, + { + "epoch": 0.48015520168135156, + "grad_norm": 0.5428946614265442, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 2970 + }, + { + "epoch": 0.48177188586209685, + "grad_norm": 0.5294089317321777, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 2980 + }, + { + "epoch": 0.48338857004284214, + "grad_norm": 0.648289144039154, + "learning_rate": 0.0002, + "loss": 0.8553, + "step": 2990 + }, + { + "epoch": 0.4850052542235874, + "grad_norm": 0.47916680574417114, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 3000 + }, + { + "epoch": 0.4866219384043327, + "grad_norm": 0.43849772214889526, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 3010 + }, + { + "epoch": 0.488238622585078, + "grad_norm": 0.47007861733436584, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 3020 + }, + { + "epoch": 0.4898553067658233, + "grad_norm": 0.6314331293106079, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 3030 + }, + { + "epoch": 0.49147199094656857, + "grad_norm": 0.49211493134498596, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 3040 + }, + { + "epoch": 0.49308867512731386, + "grad_norm": 0.4537973403930664, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 3050 + }, + { + "epoch": 0.49470535930805914, + "grad_norm": 0.47326919436454773, + "learning_rate": 0.0002, + "loss": 0.8095, + "step": 3060 + }, + { + "epoch": 0.4963220434888045, + "grad_norm": 0.525874137878418, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 3070 + }, + { + "epoch": 0.4979387276695498, + "grad_norm": 0.6361091732978821, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 3080 + }, + { + "epoch": 0.49955541185029506, + "grad_norm": 0.5850642919540405, + "learning_rate": 0.0002, + "loss": 0.821, + "step": 3090 + }, + { + "epoch": 0.5011720960310403, + "grad_norm": 0.47299543023109436, + "learning_rate": 0.0002, + "loss": 0.8279, + "step": 3100 + }, + { + "epoch": 0.5027887802117856, + "grad_norm": 0.473099946975708, + "learning_rate": 0.0002, + "loss": 0.8681, + "step": 3110 + }, + { + "epoch": 0.5044054643925309, + "grad_norm": 0.48186397552490234, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 3120 + }, + { + "epoch": 0.5060221485732762, + "grad_norm": 0.5015401840209961, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 3130 + }, + { + "epoch": 0.5076388327540216, + "grad_norm": 0.5617750287055969, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 3140 + }, + { + "epoch": 0.5092555169347668, + "grad_norm": 0.5169327259063721, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 3150 + }, + { + "epoch": 0.5108722011155121, + "grad_norm": 0.545657753944397, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 3160 + }, + { + "epoch": 0.5124888852962574, + "grad_norm": 0.512864351272583, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 3170 + }, + { + "epoch": 0.5141055694770027, + "grad_norm": 0.4113546311855316, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 3180 + }, + { + "epoch": 0.5157222536577479, + "grad_norm": 0.44532445073127747, + "learning_rate": 0.0002, + "loss": 0.8206, + "step": 3190 + }, + { + "epoch": 0.5173389378384933, + "grad_norm": 0.5623497366905212, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 3200 + }, + { + "epoch": 0.5189556220192385, + "grad_norm": 0.5084741115570068, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 3210 + }, + { + "epoch": 0.5205723061999838, + "grad_norm": 0.5305403470993042, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 3220 + }, + { + "epoch": 0.5221889903807291, + "grad_norm": 0.4708254337310791, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 3230 + }, + { + "epoch": 0.5238056745614744, + "grad_norm": 0.43827131390571594, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 3240 + }, + { + "epoch": 0.5254223587422197, + "grad_norm": 0.5630002617835999, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 3250 + }, + { + "epoch": 0.527039042922965, + "grad_norm": 0.5010961890220642, + "learning_rate": 0.0002, + "loss": 0.7522, + "step": 3260 + }, + { + "epoch": 0.5286557271037103, + "grad_norm": 0.6303122043609619, + "learning_rate": 0.0002, + "loss": 0.8374, + "step": 3270 + }, + { + "epoch": 0.5302724112844556, + "grad_norm": 0.5107331275939941, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 3280 + }, + { + "epoch": 0.5318890954652009, + "grad_norm": 0.5700443387031555, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 3290 + }, + { + "epoch": 0.5335057796459461, + "grad_norm": 0.46296367049217224, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 3300 + }, + { + "epoch": 0.5351224638266915, + "grad_norm": 0.531568706035614, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 3310 + }, + { + "epoch": 0.5367391480074367, + "grad_norm": 0.4686741530895233, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 3320 + }, + { + "epoch": 0.5383558321881821, + "grad_norm": 0.5404331088066101, + "learning_rate": 0.0002, + "loss": 0.8104, + "step": 3330 + }, + { + "epoch": 0.5399725163689273, + "grad_norm": 0.6368790864944458, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 3340 + }, + { + "epoch": 0.5415892005496726, + "grad_norm": 0.42300888895988464, + "learning_rate": 0.0002, + "loss": 0.8514, + "step": 3350 + }, + { + "epoch": 0.5432058847304179, + "grad_norm": 0.5362542867660522, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 3360 + }, + { + "epoch": 0.5448225689111632, + "grad_norm": 0.497128963470459, + "learning_rate": 0.0002, + "loss": 0.858, + "step": 3370 + }, + { + "epoch": 0.5464392530919085, + "grad_norm": 0.5006386041641235, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 3380 + }, + { + "epoch": 0.5480559372726538, + "grad_norm": 0.44136837124824524, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 3390 + }, + { + "epoch": 0.5496726214533991, + "grad_norm": 0.5897833108901978, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 3400 + }, + { + "epoch": 0.5512893056341444, + "grad_norm": 0.641075611114502, + "learning_rate": 0.0002, + "loss": 0.8895, + "step": 3410 + }, + { + "epoch": 0.5529059898148897, + "grad_norm": 0.7251322269439697, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 3420 + }, + { + "epoch": 0.5545226739956349, + "grad_norm": 0.47411349415779114, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 3430 + }, + { + "epoch": 0.5561393581763803, + "grad_norm": 0.4994310438632965, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 3440 + }, + { + "epoch": 0.5577560423571255, + "grad_norm": 0.5814438462257385, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 3450 + }, + { + "epoch": 0.5593727265378708, + "grad_norm": 0.6278898119926453, + "learning_rate": 0.0002, + "loss": 0.8805, + "step": 3460 + }, + { + "epoch": 0.5609894107186161, + "grad_norm": 0.46208274364471436, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 3470 + }, + { + "epoch": 0.5626060948993614, + "grad_norm": 0.5718930959701538, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 3480 + }, + { + "epoch": 0.5642227790801067, + "grad_norm": 0.48178744316101074, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 3490 + }, + { + "epoch": 0.565839463260852, + "grad_norm": 0.47336965799331665, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3500 + }, + { + "epoch": 0.5674561474415973, + "grad_norm": 0.43442684412002563, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 3510 + }, + { + "epoch": 0.5690728316223426, + "grad_norm": 0.6463358998298645, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 3520 + }, + { + "epoch": 0.5706895158030879, + "grad_norm": 0.5286486744880676, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 3530 + }, + { + "epoch": 0.5723061999838331, + "grad_norm": 0.5405499935150146, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 3540 + }, + { + "epoch": 0.5739228841645785, + "grad_norm": 0.6654391884803772, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 3550 + }, + { + "epoch": 0.5755395683453237, + "grad_norm": 0.5081980228424072, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 3560 + }, + { + "epoch": 0.5771562525260691, + "grad_norm": 0.48978179693222046, + "learning_rate": 0.0002, + "loss": 0.7753, + "step": 3570 + }, + { + "epoch": 0.5787729367068143, + "grad_norm": 0.5840612053871155, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3580 + }, + { + "epoch": 0.5803896208875596, + "grad_norm": 0.5235261917114258, + "learning_rate": 0.0002, + "loss": 0.8937, + "step": 3590 + }, + { + "epoch": 0.5820063050683049, + "grad_norm": 0.5672075748443604, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 3600 + }, + { + "epoch": 0.5836229892490502, + "grad_norm": 0.5613429546356201, + "learning_rate": 0.0002, + "loss": 0.8347, + "step": 3610 + }, + { + "epoch": 0.5852396734297954, + "grad_norm": 0.4032273590564728, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 3620 + }, + { + "epoch": 0.5868563576105408, + "grad_norm": 0.49559324979782104, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 3630 + }, + { + "epoch": 0.5884730417912861, + "grad_norm": 0.6895697712898254, + "learning_rate": 0.0002, + "loss": 0.8332, + "step": 3640 + }, + { + "epoch": 0.5900897259720314, + "grad_norm": 0.4750136435031891, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 3650 + }, + { + "epoch": 0.5917064101527767, + "grad_norm": 0.5176819562911987, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 3660 + }, + { + "epoch": 0.5933230943335219, + "grad_norm": 0.5817760229110718, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3670 + }, + { + "epoch": 0.5949397785142673, + "grad_norm": 0.6064626574516296, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 3680 + }, + { + "epoch": 0.5965564626950125, + "grad_norm": 0.6728700995445251, + "learning_rate": 0.0002, + "loss": 0.8422, + "step": 3690 + }, + { + "epoch": 0.5981731468757578, + "grad_norm": 0.609305202960968, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 3700 + }, + { + "epoch": 0.5997898310565031, + "grad_norm": 0.4615488350391388, + "learning_rate": 0.0002, + "loss": 0.8048, + "step": 3710 + }, + { + "epoch": 0.6014065152372484, + "grad_norm": 2.0531179904937744, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 3720 + }, + { + "epoch": 0.6030231994179936, + "grad_norm": 0.5091132521629333, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 3730 + }, + { + "epoch": 0.604639883598739, + "grad_norm": 0.5951124429702759, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 3740 + }, + { + "epoch": 0.6062565677794842, + "grad_norm": 0.5870208144187927, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 3750 + }, + { + "epoch": 0.6078732519602296, + "grad_norm": 0.6254619359970093, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3760 + }, + { + "epoch": 0.6094899361409749, + "grad_norm": 0.5577626824378967, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 3770 + }, + { + "epoch": 0.6111066203217201, + "grad_norm": 0.5004405379295349, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 3780 + }, + { + "epoch": 0.6127233045024655, + "grad_norm": 0.5527383685112, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 3790 + }, + { + "epoch": 0.6143399886832107, + "grad_norm": 0.49116113781929016, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 3800 + }, + { + "epoch": 0.6159566728639561, + "grad_norm": 0.5299299359321594, + "learning_rate": 0.0002, + "loss": 0.8352, + "step": 3810 + }, + { + "epoch": 0.6175733570447013, + "grad_norm": 0.464897483587265, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 3820 + }, + { + "epoch": 0.6191900412254466, + "grad_norm": 0.6505740880966187, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 3830 + }, + { + "epoch": 0.6208067254061919, + "grad_norm": 0.5512559413909912, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 3840 + }, + { + "epoch": 0.6224234095869372, + "grad_norm": 0.49427518248558044, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 3850 + }, + { + "epoch": 0.6240400937676824, + "grad_norm": 0.3839147090911865, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 3860 + }, + { + "epoch": 0.6256567779484278, + "grad_norm": 0.5760218501091003, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3870 + }, + { + "epoch": 0.6272734621291731, + "grad_norm": 0.7226507067680359, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 3880 + }, + { + "epoch": 0.6288901463099184, + "grad_norm": 0.676781415939331, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3890 + }, + { + "epoch": 0.6305068304906637, + "grad_norm": 0.4284018278121948, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 3900 + }, + { + "epoch": 0.6321235146714089, + "grad_norm": 0.5060628056526184, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 3910 + }, + { + "epoch": 0.6337401988521543, + "grad_norm": 0.5524522066116333, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 3920 + }, + { + "epoch": 0.6353568830328995, + "grad_norm": 0.6099881529808044, + "learning_rate": 0.0002, + "loss": 0.8276, + "step": 3930 + }, + { + "epoch": 0.6369735672136448, + "grad_norm": 0.43155938386917114, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 3940 + }, + { + "epoch": 0.6385902513943901, + "grad_norm": 0.6427084803581238, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 3950 + }, + { + "epoch": 0.6402069355751354, + "grad_norm": 0.541220486164093, + "learning_rate": 0.0002, + "loss": 0.8368, + "step": 3960 + }, + { + "epoch": 0.6418236197558806, + "grad_norm": 0.5414294600486755, + "learning_rate": 0.0002, + "loss": 0.8539, + "step": 3970 + }, + { + "epoch": 0.643440303936626, + "grad_norm": 0.46344003081321716, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 3980 + }, + { + "epoch": 0.6450569881173712, + "grad_norm": 0.45209285616874695, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 3990 + }, + { + "epoch": 0.6466736722981166, + "grad_norm": 0.5417284369468689, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 4000 + }, + { + "epoch": 0.6482903564788619, + "grad_norm": 0.7995685935020447, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 4010 + }, + { + "epoch": 0.6499070406596071, + "grad_norm": 0.6384002566337585, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 4020 + }, + { + "epoch": 0.6515237248403525, + "grad_norm": 0.4472815692424774, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 4030 + }, + { + "epoch": 0.6531404090210977, + "grad_norm": 0.6834294199943542, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 4040 + }, + { + "epoch": 0.654757093201843, + "grad_norm": 0.4612339735031128, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 4050 + }, + { + "epoch": 0.6563737773825883, + "grad_norm": 0.9266576170921326, + "learning_rate": 0.0002, + "loss": 0.8141, + "step": 4060 + }, + { + "epoch": 0.6579904615633336, + "grad_norm": 0.4470861852169037, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 4070 + }, + { + "epoch": 0.6596071457440789, + "grad_norm": 0.45544925332069397, + "learning_rate": 0.0002, + "loss": 0.8293, + "step": 4080 + }, + { + "epoch": 0.6612238299248242, + "grad_norm": 0.6144481301307678, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 4090 + }, + { + "epoch": 0.6628405141055694, + "grad_norm": 0.5936288237571716, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 4100 + }, + { + "epoch": 0.6644571982863148, + "grad_norm": 0.4822963774204254, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 4110 + }, + { + "epoch": 0.66607388246706, + "grad_norm": 0.48432496190071106, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 4120 + }, + { + "epoch": 0.6676905666478054, + "grad_norm": 0.4901607930660248, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 4130 + }, + { + "epoch": 0.6693072508285507, + "grad_norm": 0.5018393397331238, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 4140 + }, + { + "epoch": 0.6709239350092959, + "grad_norm": 0.6946378946304321, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 4150 + }, + { + "epoch": 0.6725406191900413, + "grad_norm": 0.5997390747070312, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 4160 + }, + { + "epoch": 0.6741573033707865, + "grad_norm": 0.6738849878311157, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 4170 + }, + { + "epoch": 0.6757739875515318, + "grad_norm": 0.6110581159591675, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 4180 + }, + { + "epoch": 0.6773906717322771, + "grad_norm": 0.5703322291374207, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 4190 + }, + { + "epoch": 0.6790073559130224, + "grad_norm": 0.4686066210269928, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 4200 + }, + { + "epoch": 0.6806240400937676, + "grad_norm": 0.6394643783569336, + "learning_rate": 0.0002, + "loss": 0.8441, + "step": 4210 + }, + { + "epoch": 0.682240724274513, + "grad_norm": 0.5454841256141663, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 4220 + }, + { + "epoch": 0.6838574084552582, + "grad_norm": 0.4859732985496521, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 4230 + }, + { + "epoch": 0.6854740926360036, + "grad_norm": 0.5544065833091736, + "learning_rate": 0.0002, + "loss": 0.8161, + "step": 4240 + }, + { + "epoch": 0.6870907768167488, + "grad_norm": 0.4902505576610565, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 4250 + }, + { + "epoch": 0.6887074609974941, + "grad_norm": 0.4768051505088806, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 4260 + }, + { + "epoch": 0.6903241451782395, + "grad_norm": 0.49982190132141113, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 4270 + }, + { + "epoch": 0.6919408293589847, + "grad_norm": 0.6351838111877441, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 4280 + }, + { + "epoch": 0.69355751353973, + "grad_norm": 0.5647561550140381, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 4290 + }, + { + "epoch": 0.6951741977204753, + "grad_norm": 0.5340486764907837, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 4300 + }, + { + "epoch": 0.6967908819012206, + "grad_norm": 0.5649092793464661, + "learning_rate": 0.0002, + "loss": 0.8526, + "step": 4310 + }, + { + "epoch": 0.6984075660819659, + "grad_norm": 0.6183916926383972, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 4320 + }, + { + "epoch": 0.7000242502627112, + "grad_norm": 0.6154509782791138, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 4330 + }, + { + "epoch": 0.7016409344434564, + "grad_norm": 0.5156264305114746, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 4340 + }, + { + "epoch": 0.7032576186242018, + "grad_norm": 0.562171459197998, + "learning_rate": 0.0002, + "loss": 0.8512, + "step": 4350 + }, + { + "epoch": 0.704874302804947, + "grad_norm": 0.4949502646923065, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 4360 + }, + { + "epoch": 0.7064909869856923, + "grad_norm": 0.5171684622764587, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 4370 + }, + { + "epoch": 0.7081076711664377, + "grad_norm": 0.6198443174362183, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 4380 + }, + { + "epoch": 0.7097243553471829, + "grad_norm": 0.5802276134490967, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 4390 + }, + { + "epoch": 0.7113410395279283, + "grad_norm": 0.41096967458724976, + "learning_rate": 0.0002, + "loss": 0.8797, + "step": 4400 + }, + { + "epoch": 0.7129577237086735, + "grad_norm": 0.4397392272949219, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 4410 + }, + { + "epoch": 0.7145744078894188, + "grad_norm": 0.45228442549705505, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 4420 + }, + { + "epoch": 0.7161910920701641, + "grad_norm": 0.4839673936367035, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 4430 + }, + { + "epoch": 0.7178077762509094, + "grad_norm": 0.6140755414962769, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4440 + }, + { + "epoch": 0.7194244604316546, + "grad_norm": 0.6841378808021545, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 4450 + }, + { + "epoch": 0.7210411446124, + "grad_norm": 0.6664239168167114, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 4460 + }, + { + "epoch": 0.7226578287931452, + "grad_norm": 0.47552719712257385, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 4470 + }, + { + "epoch": 0.7242745129738906, + "grad_norm": 0.6649776101112366, + "learning_rate": 0.0002, + "loss": 0.8982, + "step": 4480 + }, + { + "epoch": 0.7258911971546358, + "grad_norm": 0.5159541964530945, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 4490 + }, + { + "epoch": 0.7275078813353811, + "grad_norm": 0.6693112850189209, + "learning_rate": 0.0002, + "loss": 0.7786, + "step": 4500 + }, + { + "epoch": 0.7291245655161265, + "grad_norm": 0.48870977759361267, + "learning_rate": 0.0002, + "loss": 0.8655, + "step": 4510 + }, + { + "epoch": 0.7307412496968717, + "grad_norm": 0.4857887923717499, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 4520 + }, + { + "epoch": 0.732357933877617, + "grad_norm": 0.5515662431716919, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 4530 + }, + { + "epoch": 0.7339746180583623, + "grad_norm": 0.6292222738265991, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 4540 + }, + { + "epoch": 0.7355913022391076, + "grad_norm": 0.48265689611434937, + "learning_rate": 0.0002, + "loss": 0.7749, + "step": 4550 + }, + { + "epoch": 0.7372079864198529, + "grad_norm": 0.8044266104698181, + "learning_rate": 0.0002, + "loss": 0.8499, + "step": 4560 + }, + { + "epoch": 0.7388246706005982, + "grad_norm": 0.6111769676208496, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 4570 + }, + { + "epoch": 0.7404413547813434, + "grad_norm": 0.5229553580284119, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 4580 + }, + { + "epoch": 0.7420580389620888, + "grad_norm": 0.6054152250289917, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 4590 + }, + { + "epoch": 0.743674723142834, + "grad_norm": 0.5574966669082642, + "learning_rate": 0.0002, + "loss": 0.8169, + "step": 4600 + }, + { + "epoch": 0.7452914073235793, + "grad_norm": 0.5395817160606384, + "learning_rate": 0.0002, + "loss": 0.8439, + "step": 4610 + }, + { + "epoch": 0.7469080915043246, + "grad_norm": 0.7116472721099854, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 4620 + }, + { + "epoch": 0.7485247756850699, + "grad_norm": 0.5618700981140137, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 4630 + }, + { + "epoch": 0.7501414598658153, + "grad_norm": 0.5802770853042603, + "learning_rate": 0.0002, + "loss": 0.7744, + "step": 4640 + }, + { + "epoch": 0.7517581440465605, + "grad_norm": 0.5690428018569946, + "learning_rate": 0.0002, + "loss": 0.7924, + "step": 4650 + }, + { + "epoch": 0.7533748282273058, + "grad_norm": 0.4813360273838043, + "learning_rate": 0.0002, + "loss": 0.8017, + "step": 4660 + }, + { + "epoch": 0.7549915124080511, + "grad_norm": 0.5434042811393738, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 4670 + }, + { + "epoch": 0.7566081965887964, + "grad_norm": 0.5502099990844727, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 4680 + }, + { + "epoch": 0.7582248807695416, + "grad_norm": 0.6020621061325073, + "learning_rate": 0.0002, + "loss": 0.8598, + "step": 4690 + }, + { + "epoch": 0.759841564950287, + "grad_norm": 0.4922301471233368, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 4700 + }, + { + "epoch": 0.7614582491310322, + "grad_norm": 0.6492828726768494, + "learning_rate": 0.0002, + "loss": 0.788, + "step": 4710 + }, + { + "epoch": 0.7630749333117776, + "grad_norm": 0.4865580201148987, + "learning_rate": 0.0002, + "loss": 0.8313, + "step": 4720 + }, + { + "epoch": 0.7646916174925228, + "grad_norm": 0.5971422791481018, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 4730 + }, + { + "epoch": 0.7663083016732681, + "grad_norm": 0.6832674145698547, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 4740 + }, + { + "epoch": 0.7679249858540134, + "grad_norm": 0.500908613204956, + "learning_rate": 0.0002, + "loss": 0.8156, + "step": 4750 + }, + { + "epoch": 0.7695416700347587, + "grad_norm": 0.6112465858459473, + "learning_rate": 0.0002, + "loss": 0.8383, + "step": 4760 + }, + { + "epoch": 0.771158354215504, + "grad_norm": 0.5753506422042847, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 4770 + }, + { + "epoch": 0.7727750383962493, + "grad_norm": 0.6529405117034912, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 4780 + }, + { + "epoch": 0.7743917225769946, + "grad_norm": 0.5916843414306641, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 4790 + }, + { + "epoch": 0.7760084067577399, + "grad_norm": 0.4821224510669708, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 4800 + }, + { + "epoch": 0.7776250909384852, + "grad_norm": 0.5532580018043518, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 4810 + }, + { + "epoch": 0.7792417751192304, + "grad_norm": 0.4604877233505249, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 4820 + }, + { + "epoch": 0.7808584592999758, + "grad_norm": 0.5009613037109375, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 4830 + }, + { + "epoch": 0.782475143480721, + "grad_norm": 0.6448560357093811, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4840 + }, + { + "epoch": 0.7840918276614663, + "grad_norm": 0.44327953457832336, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 4850 + }, + { + "epoch": 0.7857085118422116, + "grad_norm": 0.5355411171913147, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 4860 + }, + { + "epoch": 0.7873251960229569, + "grad_norm": 0.5635677576065063, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 4870 + }, + { + "epoch": 0.7889418802037023, + "grad_norm": 0.5417491793632507, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 4880 + }, + { + "epoch": 0.7905585643844475, + "grad_norm": 0.4567430913448334, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 4890 + }, + { + "epoch": 0.7921752485651928, + "grad_norm": 0.44651296734809875, + "learning_rate": 0.0002, + "loss": 0.8454, + "step": 4900 + }, + { + "epoch": 0.7937919327459381, + "grad_norm": 0.5741217136383057, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 4910 + }, + { + "epoch": 0.7954086169266834, + "grad_norm": 0.6605045199394226, + "learning_rate": 0.0002, + "loss": 0.8093, + "step": 4920 + }, + { + "epoch": 0.7970253011074286, + "grad_norm": 0.5126531720161438, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 4930 + }, + { + "epoch": 0.798641985288174, + "grad_norm": 0.513648271560669, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 4940 + }, + { + "epoch": 0.8002586694689192, + "grad_norm": 0.5350404381752014, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 4950 + }, + { + "epoch": 0.8018753536496646, + "grad_norm": 0.5731674432754517, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 4960 + }, + { + "epoch": 0.8034920378304098, + "grad_norm": 0.5974258184432983, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 4970 + }, + { + "epoch": 0.8051087220111551, + "grad_norm": 0.8774799704551697, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 4980 + }, + { + "epoch": 0.8067254061919004, + "grad_norm": 0.5994430184364319, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 4990 + }, + { + "epoch": 0.8083420903726457, + "grad_norm": 0.4894903004169464, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 5000 + }, + { + "epoch": 0.809958774553391, + "grad_norm": 0.5218459367752075, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5010 + }, + { + "epoch": 0.8115754587341363, + "grad_norm": 0.5232468843460083, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 5020 + }, + { + "epoch": 0.8131921429148816, + "grad_norm": 0.44358372688293457, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 5030 + }, + { + "epoch": 0.8148088270956269, + "grad_norm": 0.6202037334442139, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 5040 + }, + { + "epoch": 0.8164255112763722, + "grad_norm": 0.7721474170684814, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 5050 + }, + { + "epoch": 0.8180421954571174, + "grad_norm": 0.5568501353263855, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 5060 + }, + { + "epoch": 0.8196588796378628, + "grad_norm": 0.49148809909820557, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 5070 + }, + { + "epoch": 0.821275563818608, + "grad_norm": 0.4956012964248657, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 5080 + }, + { + "epoch": 0.8228922479993533, + "grad_norm": 0.6078833937644958, + "learning_rate": 0.0002, + "loss": 0.8201, + "step": 5090 + }, + { + "epoch": 0.8245089321800986, + "grad_norm": 0.46906954050064087, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 5100 + }, + { + "epoch": 0.8261256163608439, + "grad_norm": 0.50812166929245, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 5110 + }, + { + "epoch": 0.8277423005415891, + "grad_norm": 0.5319661498069763, + "learning_rate": 0.0002, + "loss": 0.8243, + "step": 5120 + }, + { + "epoch": 0.8293589847223345, + "grad_norm": 0.4949689209461212, + "learning_rate": 0.0002, + "loss": 0.7798, + "step": 5130 + }, + { + "epoch": 0.8309756689030798, + "grad_norm": 0.5151591300964355, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 5140 + }, + { + "epoch": 0.8325923530838251, + "grad_norm": 0.5530214309692383, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 5150 + }, + { + "epoch": 0.8342090372645704, + "grad_norm": 0.6297410130500793, + "learning_rate": 0.0002, + "loss": 0.8251, + "step": 5160 + }, + { + "epoch": 0.8358257214453156, + "grad_norm": 0.5466840267181396, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 5170 + }, + { + "epoch": 0.837442405626061, + "grad_norm": 0.652913510799408, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 5180 + }, + { + "epoch": 0.8390590898068062, + "grad_norm": 0.5811293125152588, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 5190 + }, + { + "epoch": 0.8406757739875516, + "grad_norm": 0.5109550952911377, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 5200 + }, + { + "epoch": 0.8422924581682968, + "grad_norm": 0.4551706612110138, + "learning_rate": 0.0002, + "loss": 0.8077, + "step": 5210 + }, + { + "epoch": 0.8439091423490421, + "grad_norm": 0.5813754200935364, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5220 + }, + { + "epoch": 0.8455258265297874, + "grad_norm": 0.5856947898864746, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 5230 + }, + { + "epoch": 0.8471425107105327, + "grad_norm": 0.5482739210128784, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 5240 + }, + { + "epoch": 0.8487591948912779, + "grad_norm": 0.49023720622062683, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 5250 + }, + { + "epoch": 0.8503758790720233, + "grad_norm": 0.49472475051879883, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5260 + }, + { + "epoch": 0.8519925632527686, + "grad_norm": 0.5490226745605469, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 5270 + }, + { + "epoch": 0.8536092474335139, + "grad_norm": 0.5340665578842163, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 5280 + }, + { + "epoch": 0.8552259316142592, + "grad_norm": 0.5962483882904053, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 5290 + }, + { + "epoch": 0.8568426157950044, + "grad_norm": 0.586358368396759, + "learning_rate": 0.0002, + "loss": 0.8765, + "step": 5300 + }, + { + "epoch": 0.8584592999757498, + "grad_norm": 0.49120277166366577, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 5310 + }, + { + "epoch": 0.860075984156495, + "grad_norm": 0.5887332558631897, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 5320 + }, + { + "epoch": 0.8616926683372403, + "grad_norm": 0.42496153712272644, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 5330 + }, + { + "epoch": 0.8633093525179856, + "grad_norm": 0.5489874482154846, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 5340 + }, + { + "epoch": 0.8649260366987309, + "grad_norm": 0.5850813984870911, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 5350 + }, + { + "epoch": 0.8665427208794761, + "grad_norm": 0.517487108707428, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 5360 + }, + { + "epoch": 0.8681594050602215, + "grad_norm": 0.5339142680168152, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 5370 + }, + { + "epoch": 0.8697760892409668, + "grad_norm": 0.6236387491226196, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 5380 + }, + { + "epoch": 0.8713927734217121, + "grad_norm": 0.5752192735671997, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 5390 + }, + { + "epoch": 0.8730094576024574, + "grad_norm": 0.6724614500999451, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 5400 + }, + { + "epoch": 0.8746261417832026, + "grad_norm": 0.5280613303184509, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 5410 + }, + { + "epoch": 0.876242825963948, + "grad_norm": 0.44033288955688477, + "learning_rate": 0.0002, + "loss": 0.8231, + "step": 5420 + }, + { + "epoch": 0.8778595101446932, + "grad_norm": 0.5199708342552185, + "learning_rate": 0.0002, + "loss": 0.8839, + "step": 5430 + }, + { + "epoch": 0.8794761943254386, + "grad_norm": 0.46778348088264465, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 5440 + }, + { + "epoch": 0.8810928785061838, + "grad_norm": 0.4657754898071289, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 5450 + }, + { + "epoch": 0.8827095626869291, + "grad_norm": 0.5472902655601501, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 5460 + }, + { + "epoch": 0.8843262468676744, + "grad_norm": 0.4876766800880432, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 5470 + }, + { + "epoch": 0.8859429310484197, + "grad_norm": 0.5057248473167419, + "learning_rate": 0.0002, + "loss": 0.7906, + "step": 5480 + }, + { + "epoch": 0.8875596152291649, + "grad_norm": 0.4637320637702942, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 5490 + }, + { + "epoch": 0.8891762994099103, + "grad_norm": 0.471955806016922, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 5500 + }, + { + "epoch": 0.8907929835906556, + "grad_norm": 0.5209813714027405, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 5510 + }, + { + "epoch": 0.8924096677714008, + "grad_norm": 0.6213834285736084, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 5520 + }, + { + "epoch": 0.8940263519521462, + "grad_norm": 0.5215408205986023, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 5530 + }, + { + "epoch": 0.8956430361328914, + "grad_norm": 0.580478310585022, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 5540 + }, + { + "epoch": 0.8972597203136368, + "grad_norm": 0.49102169275283813, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 5550 + }, + { + "epoch": 0.898876404494382, + "grad_norm": 0.6043479442596436, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 5560 + }, + { + "epoch": 0.9004930886751273, + "grad_norm": 0.5636463165283203, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 5570 + }, + { + "epoch": 0.9021097728558726, + "grad_norm": 0.5620124340057373, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 5580 + }, + { + "epoch": 0.9037264570366179, + "grad_norm": 0.5206354856491089, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 5590 + }, + { + "epoch": 0.9053431412173631, + "grad_norm": 0.5798229575157166, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 5600 + }, + { + "epoch": 0.9069598253981085, + "grad_norm": 0.6428212523460388, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 5610 + }, + { + "epoch": 0.9085765095788537, + "grad_norm": 0.48064687848091125, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 5620 + }, + { + "epoch": 0.9101931937595991, + "grad_norm": 0.6347860097885132, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 5630 + }, + { + "epoch": 0.9118098779403444, + "grad_norm": 0.5353913307189941, + "learning_rate": 0.0002, + "loss": 0.851, + "step": 5640 + }, + { + "epoch": 0.9134265621210896, + "grad_norm": 0.5323944091796875, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 5650 + }, + { + "epoch": 0.915043246301835, + "grad_norm": 0.5261843204498291, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 5660 + }, + { + "epoch": 0.9166599304825802, + "grad_norm": 0.5451326966285706, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 5670 + }, + { + "epoch": 0.9182766146633256, + "grad_norm": 0.5183324217796326, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 5680 + }, + { + "epoch": 0.9198932988440708, + "grad_norm": 0.47229018807411194, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 5690 + }, + { + "epoch": 0.9215099830248161, + "grad_norm": 0.49180513620376587, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 5700 + }, + { + "epoch": 0.9231266672055614, + "grad_norm": 0.5419785380363464, + "learning_rate": 0.0002, + "loss": 0.8735, + "step": 5710 + }, + { + "epoch": 0.9247433513863067, + "grad_norm": 0.5408698916435242, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 5720 + }, + { + "epoch": 0.9263600355670519, + "grad_norm": 0.5286232829093933, + "learning_rate": 0.0002, + "loss": 0.7701, + "step": 5730 + }, + { + "epoch": 0.9279767197477973, + "grad_norm": 0.7539758086204529, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 5740 + }, + { + "epoch": 0.9295934039285425, + "grad_norm": 0.5166944861412048, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 5750 + }, + { + "epoch": 0.9312100881092878, + "grad_norm": 0.6601425409317017, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 5760 + }, + { + "epoch": 0.9328267722900332, + "grad_norm": 0.5029960870742798, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 5770 + }, + { + "epoch": 0.9344434564707784, + "grad_norm": 0.4926645755767822, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 5780 + }, + { + "epoch": 0.9360601406515238, + "grad_norm": 0.5739615559577942, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 5790 + }, + { + "epoch": 0.937676824832269, + "grad_norm": 0.5058279037475586, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 5800 + }, + { + "epoch": 0.9392935090130143, + "grad_norm": 0.5260962247848511, + "learning_rate": 0.0002, + "loss": 0.8537, + "step": 5810 + }, + { + "epoch": 0.9409101931937596, + "grad_norm": 0.5768588185310364, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 5820 + }, + { + "epoch": 0.9425268773745049, + "grad_norm": 0.5170126557350159, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 5830 + }, + { + "epoch": 0.9441435615552501, + "grad_norm": 0.5745864510536194, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 5840 + }, + { + "epoch": 0.9457602457359955, + "grad_norm": 0.5551357865333557, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 5850 + }, + { + "epoch": 0.9473769299167407, + "grad_norm": 0.5776078701019287, + "learning_rate": 0.0002, + "loss": 0.8529, + "step": 5860 + }, + { + "epoch": 0.9489936140974861, + "grad_norm": 0.5340062379837036, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 5870 + }, + { + "epoch": 0.9506102982782314, + "grad_norm": 0.6447290182113647, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 5880 + }, + { + "epoch": 0.9522269824589766, + "grad_norm": 0.5123815536499023, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 5890 + }, + { + "epoch": 0.953843666639722, + "grad_norm": 0.48547613620758057, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 5900 + }, + { + "epoch": 0.9554603508204672, + "grad_norm": 0.5791414976119995, + "learning_rate": 0.0002, + "loss": 0.7896, + "step": 5910 + }, + { + "epoch": 0.9570770350012126, + "grad_norm": 0.6195011734962463, + "learning_rate": 0.0002, + "loss": 0.8408, + "step": 5920 + }, + { + "epoch": 0.9586937191819578, + "grad_norm": 0.6323803067207336, + "learning_rate": 0.0002, + "loss": 0.7805, + "step": 5930 + }, + { + "epoch": 0.9603104033627031, + "grad_norm": 0.45552879571914673, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 5940 + }, + { + "epoch": 0.9619270875434484, + "grad_norm": 0.5796473622322083, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 5950 + }, + { + "epoch": 0.9635437717241937, + "grad_norm": 0.647261381149292, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 5960 + }, + { + "epoch": 0.9651604559049389, + "grad_norm": 0.5487682819366455, + "learning_rate": 0.0002, + "loss": 0.8086, + "step": 5970 + }, + { + "epoch": 0.9667771400856843, + "grad_norm": 0.5743663907051086, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 5980 + }, + { + "epoch": 0.9683938242664295, + "grad_norm": 0.5470591187477112, + "learning_rate": 0.0002, + "loss": 0.8153, + "step": 5990 + }, + { + "epoch": 0.9700105084471748, + "grad_norm": 0.5901660323143005, + "learning_rate": 0.0002, + "loss": 0.8119, + "step": 6000 + }, + { + "epoch": 0.9716271926279202, + "grad_norm": 0.6544759273529053, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 6010 + }, + { + "epoch": 0.9732438768086654, + "grad_norm": 0.6288470029830933, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 6020 + }, + { + "epoch": 0.9748605609894108, + "grad_norm": 0.673153817653656, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 6030 + }, + { + "epoch": 0.976477245170156, + "grad_norm": 0.42854753136634827, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 6040 + }, + { + "epoch": 0.9780939293509013, + "grad_norm": 0.5227066278457642, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 6050 + }, + { + "epoch": 0.9797106135316466, + "grad_norm": 0.5372416973114014, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 6060 + }, + { + "epoch": 0.9813272977123919, + "grad_norm": 0.6026402115821838, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 6070 + }, + { + "epoch": 0.9829439818931371, + "grad_norm": 0.49547791481018066, + "learning_rate": 0.0002, + "loss": 0.8347, + "step": 6080 + }, + { + "epoch": 0.9845606660738825, + "grad_norm": 0.4641951322555542, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 6090 + }, + { + "epoch": 0.9861773502546277, + "grad_norm": 0.5818535089492798, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 6100 + }, + { + "epoch": 0.9877940344353731, + "grad_norm": 0.63955157995224, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 6110 + }, + { + "epoch": 0.9894107186161183, + "grad_norm": 0.5649438500404358, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 6120 + }, + { + "epoch": 0.9910274027968636, + "grad_norm": 0.5290433168411255, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 6130 + }, + { + "epoch": 0.992644086977609, + "grad_norm": 0.6399374008178711, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 6140 + }, + { + "epoch": 0.9942607711583542, + "grad_norm": 0.6736576557159424, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 6150 + }, + { + "epoch": 0.9958774553390995, + "grad_norm": 0.515420138835907, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 6160 + }, + { + "epoch": 0.9974941395198448, + "grad_norm": 0.562677800655365, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 6170 + }, + { + "epoch": 0.9991108237005901, + "grad_norm": 0.7113858461380005, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 6180 + }, + { + "epoch": 0.9999191657909627, + "eval_loss": 1.0871200561523438, + "eval_runtime": 122.2071, + "eval_samples_per_second": 5.998, + "eval_steps_per_second": 0.753, + "step": 6185 + }, + { + "epoch": 1.0007275078813354, + "grad_norm": 0.7111801505088806, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 6190 + }, + { + "epoch": 1.0023441920620806, + "grad_norm": 0.5402125716209412, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 6200 + }, + { + "epoch": 1.003960876242826, + "grad_norm": 0.6098830103874207, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 6210 + }, + { + "epoch": 1.0055775604235713, + "grad_norm": 0.5829983353614807, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 6220 + }, + { + "epoch": 1.0071942446043165, + "grad_norm": 0.5614621043205261, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 6230 + }, + { + "epoch": 1.0088109287850617, + "grad_norm": 0.5954238772392273, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 6240 + }, + { + "epoch": 1.0104276129658072, + "grad_norm": 0.6480574607849121, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 6250 + }, + { + "epoch": 1.0120442971465524, + "grad_norm": 0.6051128506660461, + "learning_rate": 0.0002, + "loss": 0.7514, + "step": 6260 + }, + { + "epoch": 1.0136609813272976, + "grad_norm": 0.6318870782852173, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 6270 + }, + { + "epoch": 1.015277665508043, + "grad_norm": 0.5048980116844177, + "learning_rate": 0.0002, + "loss": 0.7178, + "step": 6280 + }, + { + "epoch": 1.0168943496887883, + "grad_norm": 0.6346936225891113, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 6290 + }, + { + "epoch": 1.0185110338695336, + "grad_norm": 0.5711665749549866, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 6300 + }, + { + "epoch": 1.0201277180502788, + "grad_norm": 0.5175361037254333, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 6310 + }, + { + "epoch": 1.0217444022310243, + "grad_norm": 0.5360831618309021, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 6320 + }, + { + "epoch": 1.0233610864117695, + "grad_norm": 0.614675760269165, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 6330 + }, + { + "epoch": 1.0249777705925147, + "grad_norm": 0.5626118183135986, + "learning_rate": 0.0002, + "loss": 0.7748, + "step": 6340 + }, + { + "epoch": 1.02659445477326, + "grad_norm": 0.574897289276123, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 6350 + }, + { + "epoch": 1.0282111389540054, + "grad_norm": 0.7185447812080383, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 6360 + }, + { + "epoch": 1.0298278231347506, + "grad_norm": 0.6705799698829651, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 6370 + }, + { + "epoch": 1.0314445073154959, + "grad_norm": 0.6740428805351257, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 6380 + }, + { + "epoch": 1.0330611914962413, + "grad_norm": 0.663902759552002, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 6390 + }, + { + "epoch": 1.0346778756769865, + "grad_norm": 0.5029543042182922, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 6400 + }, + { + "epoch": 1.0362945598577318, + "grad_norm": 0.7813863158226013, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 6410 + }, + { + "epoch": 1.037911244038477, + "grad_norm": 0.5396282076835632, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 6420 + }, + { + "epoch": 1.0395279282192225, + "grad_norm": 0.5253293514251709, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 6430 + }, + { + "epoch": 1.0411446123999677, + "grad_norm": 0.7236770987510681, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 6440 + }, + { + "epoch": 1.042761296580713, + "grad_norm": 0.5670917630195618, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 6450 + }, + { + "epoch": 1.0443779807614582, + "grad_norm": 0.6031978726387024, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 6460 + }, + { + "epoch": 1.0459946649422036, + "grad_norm": 0.5309213399887085, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 6470 + }, + { + "epoch": 1.0476113491229488, + "grad_norm": 0.7114651799201965, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 6480 + }, + { + "epoch": 1.049228033303694, + "grad_norm": 0.5591610670089722, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 6490 + }, + { + "epoch": 1.0508447174844395, + "grad_norm": 0.5185961127281189, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 6500 + }, + { + "epoch": 1.0524614016651848, + "grad_norm": 0.6510552167892456, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 6510 + }, + { + "epoch": 1.05407808584593, + "grad_norm": 0.6557928919792175, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 6520 + }, + { + "epoch": 1.0556947700266752, + "grad_norm": 0.6973192691802979, + "learning_rate": 0.0002, + "loss": 0.8056, + "step": 6530 + }, + { + "epoch": 1.0573114542074207, + "grad_norm": 0.6226583123207092, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 6540 + }, + { + "epoch": 1.058928138388166, + "grad_norm": 0.5633195638656616, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 6550 + }, + { + "epoch": 1.0605448225689111, + "grad_norm": 0.7466658353805542, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 6560 + }, + { + "epoch": 1.0621615067496564, + "grad_norm": 0.6462772488594055, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 6570 + }, + { + "epoch": 1.0637781909304018, + "grad_norm": 0.5266856551170349, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 6580 + }, + { + "epoch": 1.065394875111147, + "grad_norm": 0.534392774105072, + "learning_rate": 0.0002, + "loss": 0.7157, + "step": 6590 + }, + { + "epoch": 1.0670115592918923, + "grad_norm": 0.7514177560806274, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 6600 + }, + { + "epoch": 1.0686282434726375, + "grad_norm": 0.7593035697937012, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 6610 + }, + { + "epoch": 1.070244927653383, + "grad_norm": 0.5277858972549438, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 6620 + }, + { + "epoch": 1.0718616118341282, + "grad_norm": 0.5573670268058777, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 6630 + }, + { + "epoch": 1.0734782960148734, + "grad_norm": 0.6802396774291992, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 6640 + }, + { + "epoch": 1.0750949801956189, + "grad_norm": 0.7367215752601624, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 6650 + }, + { + "epoch": 1.0767116643763641, + "grad_norm": 0.5961891412734985, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 6660 + }, + { + "epoch": 1.0783283485571094, + "grad_norm": 0.5736313462257385, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 6670 + }, + { + "epoch": 1.0799450327378546, + "grad_norm": 0.619219183921814, + "learning_rate": 0.0002, + "loss": 0.7178, + "step": 6680 + }, + { + "epoch": 1.0815617169186, + "grad_norm": 0.6214390993118286, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 6690 + }, + { + "epoch": 1.0831784010993453, + "grad_norm": 0.564536988735199, + "learning_rate": 0.0002, + "loss": 0.7554, + "step": 6700 + }, + { + "epoch": 1.0847950852800905, + "grad_norm": 0.5838140249252319, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 6710 + }, + { + "epoch": 1.0864117694608357, + "grad_norm": 0.7000553607940674, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 6720 + }, + { + "epoch": 1.0880284536415812, + "grad_norm": 0.7078263759613037, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 6730 + }, + { + "epoch": 1.0896451378223264, + "grad_norm": 0.8353848457336426, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 6740 + }, + { + "epoch": 1.0912618220030716, + "grad_norm": 0.5615518689155579, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 6750 + }, + { + "epoch": 1.0928785061838169, + "grad_norm": 0.5475581288337708, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 6760 + }, + { + "epoch": 1.0944951903645623, + "grad_norm": 0.5835978388786316, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 6770 + }, + { + "epoch": 1.0961118745453076, + "grad_norm": 0.5516105890274048, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 6780 + }, + { + "epoch": 1.0977285587260528, + "grad_norm": 0.5875251889228821, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 6790 + }, + { + "epoch": 1.0993452429067982, + "grad_norm": 0.7376947999000549, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 6800 + }, + { + "epoch": 1.1009619270875435, + "grad_norm": 0.5656165480613708, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 6810 + }, + { + "epoch": 1.1025786112682887, + "grad_norm": 0.6365954279899597, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 6820 + }, + { + "epoch": 1.104195295449034, + "grad_norm": 0.5033080577850342, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 6830 + }, + { + "epoch": 1.1058119796297794, + "grad_norm": 0.617396891117096, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 6840 + }, + { + "epoch": 1.1074286638105246, + "grad_norm": 0.6395374536514282, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 6850 + }, + { + "epoch": 1.1090453479912699, + "grad_norm": 0.6775295734405518, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 6860 + }, + { + "epoch": 1.1106620321720153, + "grad_norm": 0.6655223965644836, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 6870 + }, + { + "epoch": 1.1122787163527605, + "grad_norm": 0.676655113697052, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 6880 + }, + { + "epoch": 1.1138954005335058, + "grad_norm": 0.6062718629837036, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 6890 + }, + { + "epoch": 1.115512084714251, + "grad_norm": 0.590943455696106, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 6900 + }, + { + "epoch": 1.1171287688949965, + "grad_norm": 0.6315317153930664, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 6910 + }, + { + "epoch": 1.1187454530757417, + "grad_norm": 0.47979024052619934, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 6920 + }, + { + "epoch": 1.120362137256487, + "grad_norm": 0.647298276424408, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 6930 + }, + { + "epoch": 1.1219788214372322, + "grad_norm": 0.7336484789848328, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 6940 + }, + { + "epoch": 1.1235955056179776, + "grad_norm": 0.5071424245834351, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 6950 + }, + { + "epoch": 1.1252121897987228, + "grad_norm": 0.6527144312858582, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 6960 + }, + { + "epoch": 1.126828873979468, + "grad_norm": 0.6935935020446777, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 6970 + }, + { + "epoch": 1.1284455581602133, + "grad_norm": 0.8026931881904602, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 6980 + }, + { + "epoch": 1.1300622423409588, + "grad_norm": 0.5210393667221069, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 6990 + }, + { + "epoch": 1.131678926521704, + "grad_norm": 0.60475093126297, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 7000 + }, + { + "epoch": 1.1332956107024492, + "grad_norm": 0.6417073607444763, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 7010 + }, + { + "epoch": 1.1349122948831947, + "grad_norm": 0.6732175946235657, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 7020 + }, + { + "epoch": 1.13652897906394, + "grad_norm": 0.6719491481781006, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 7030 + }, + { + "epoch": 1.1381456632446851, + "grad_norm": 0.5708295106887817, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 7040 + }, + { + "epoch": 1.1397623474254304, + "grad_norm": 0.7141719460487366, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 7050 + }, + { + "epoch": 1.1413790316061758, + "grad_norm": 0.6187017560005188, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 7060 + }, + { + "epoch": 1.142995715786921, + "grad_norm": 0.50581294298172, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 7070 + }, + { + "epoch": 1.1446123999676663, + "grad_norm": 0.5620143413543701, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 7080 + }, + { + "epoch": 1.1462290841484115, + "grad_norm": 0.6231929659843445, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 7090 + }, + { + "epoch": 1.147845768329157, + "grad_norm": 0.5775774121284485, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 7100 + }, + { + "epoch": 1.1494624525099022, + "grad_norm": 0.6492809653282166, + "learning_rate": 0.0002, + "loss": 0.7728, + "step": 7110 + }, + { + "epoch": 1.1510791366906474, + "grad_norm": 0.6434972286224365, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 7120 + }, + { + "epoch": 1.1526958208713927, + "grad_norm": 0.6191812753677368, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 7130 + }, + { + "epoch": 1.1543125050521381, + "grad_norm": 0.6690331697463989, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 7140 + }, + { + "epoch": 1.1559291892328833, + "grad_norm": 0.5977938175201416, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 7150 + }, + { + "epoch": 1.1575458734136286, + "grad_norm": 0.6195854544639587, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 7160 + }, + { + "epoch": 1.159162557594374, + "grad_norm": 0.5752048492431641, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 7170 + }, + { + "epoch": 1.1607792417751193, + "grad_norm": 0.589081883430481, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 7180 + }, + { + "epoch": 1.1623959259558645, + "grad_norm": 0.756996750831604, + "learning_rate": 0.0002, + "loss": 0.704, + "step": 7190 + }, + { + "epoch": 1.1640126101366097, + "grad_norm": 0.7614967226982117, + "learning_rate": 0.0002, + "loss": 0.7404, + "step": 7200 + }, + { + "epoch": 1.1656292943173552, + "grad_norm": 0.6120437979698181, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 7210 + }, + { + "epoch": 1.1672459784981004, + "grad_norm": 0.6210004687309265, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 7220 + }, + { + "epoch": 1.1688626626788456, + "grad_norm": 0.6044116020202637, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 7230 + }, + { + "epoch": 1.170479346859591, + "grad_norm": 0.5418457388877869, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 7240 + }, + { + "epoch": 1.1720960310403363, + "grad_norm": 0.6413537263870239, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 7250 + }, + { + "epoch": 1.1737127152210816, + "grad_norm": 0.5777867436408997, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 7260 + }, + { + "epoch": 1.1753293994018268, + "grad_norm": 0.7092402577400208, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 7270 + }, + { + "epoch": 1.176946083582572, + "grad_norm": 0.6351709365844727, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 7280 + }, + { + "epoch": 1.1785627677633175, + "grad_norm": 0.6172189712524414, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 7290 + }, + { + "epoch": 1.1801794519440627, + "grad_norm": 0.6801714897155762, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 7300 + }, + { + "epoch": 1.181796136124808, + "grad_norm": 0.6044712066650391, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 7310 + }, + { + "epoch": 1.1834128203055534, + "grad_norm": 0.7413212060928345, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 7320 + }, + { + "epoch": 1.1850295044862986, + "grad_norm": 0.5303856134414673, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 7330 + }, + { + "epoch": 1.1866461886670439, + "grad_norm": 0.5647098422050476, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 7340 + }, + { + "epoch": 1.188262872847789, + "grad_norm": 0.7374135255813599, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 7350 + }, + { + "epoch": 1.1898795570285345, + "grad_norm": 0.5710089206695557, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 7360 + }, + { + "epoch": 1.1914962412092798, + "grad_norm": 0.6073619723320007, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7370 + }, + { + "epoch": 1.193112925390025, + "grad_norm": 0.5899916887283325, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 7380 + }, + { + "epoch": 1.1947296095707705, + "grad_norm": 0.7762434482574463, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 7390 + }, + { + "epoch": 1.1963462937515157, + "grad_norm": 0.679949939250946, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 7400 + }, + { + "epoch": 1.197962977932261, + "grad_norm": 0.6106849312782288, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 7410 + }, + { + "epoch": 1.1995796621130062, + "grad_norm": 0.682461678981781, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 7420 + }, + { + "epoch": 1.2011963462937516, + "grad_norm": 0.6087017059326172, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 7430 + }, + { + "epoch": 1.2028130304744968, + "grad_norm": 0.63739013671875, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 7440 + }, + { + "epoch": 1.204429714655242, + "grad_norm": 0.6154777407646179, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 7450 + }, + { + "epoch": 1.2060463988359873, + "grad_norm": 0.7491534948348999, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 7460 + }, + { + "epoch": 1.2076630830167328, + "grad_norm": 0.6664797067642212, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 7470 + }, + { + "epoch": 1.209279767197478, + "grad_norm": 0.6660266518592834, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 7480 + }, + { + "epoch": 1.2108964513782232, + "grad_norm": 0.6972551345825195, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 7490 + }, + { + "epoch": 1.2125131355589684, + "grad_norm": 0.6157945990562439, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 7500 + }, + { + "epoch": 1.214129819739714, + "grad_norm": 0.5199310183525085, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 7510 + }, + { + "epoch": 1.2157465039204591, + "grad_norm": 0.577610433101654, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 7520 + }, + { + "epoch": 1.2173631881012044, + "grad_norm": 0.53652423620224, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 7530 + }, + { + "epoch": 1.2189798722819498, + "grad_norm": 0.6479050517082214, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 7540 + }, + { + "epoch": 1.220596556462695, + "grad_norm": 0.618748128414154, + "learning_rate": 0.0002, + "loss": 0.7534, + "step": 7550 + }, + { + "epoch": 1.2222132406434403, + "grad_norm": 0.6311424374580383, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 7560 + }, + { + "epoch": 1.2238299248241855, + "grad_norm": 0.6595825552940369, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 7570 + }, + { + "epoch": 1.225446609004931, + "grad_norm": 0.5198960900306702, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 7580 + }, + { + "epoch": 1.2270632931856762, + "grad_norm": 0.578650712966919, + "learning_rate": 0.0002, + "loss": 0.674, + "step": 7590 + }, + { + "epoch": 1.2286799773664214, + "grad_norm": 0.6080220937728882, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 7600 + }, + { + "epoch": 1.2302966615471669, + "grad_norm": 0.7050248384475708, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 7610 + }, + { + "epoch": 1.2319133457279121, + "grad_norm": 0.6652196049690247, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 7620 + }, + { + "epoch": 1.2335300299086573, + "grad_norm": 0.7322776317596436, + "learning_rate": 0.0002, + "loss": 0.7085, + "step": 7630 + }, + { + "epoch": 1.2351467140894026, + "grad_norm": 0.4998728036880493, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 7640 + }, + { + "epoch": 1.2367633982701478, + "grad_norm": 0.6428788900375366, + "learning_rate": 0.0002, + "loss": 0.7214, + "step": 7650 + }, + { + "epoch": 1.2383800824508933, + "grad_norm": 0.585242509841919, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 7660 + }, + { + "epoch": 1.2399967666316385, + "grad_norm": 0.5211917757987976, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 7670 + }, + { + "epoch": 1.2416134508123837, + "grad_norm": 0.6490384340286255, + "learning_rate": 0.0002, + "loss": 0.746, + "step": 7680 + }, + { + "epoch": 1.2432301349931292, + "grad_norm": 0.6249763369560242, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 7690 + }, + { + "epoch": 1.2448468191738744, + "grad_norm": 0.71870356798172, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 7700 + }, + { + "epoch": 1.2464635033546196, + "grad_norm": 0.6761967539787292, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 7710 + }, + { + "epoch": 1.2480801875353649, + "grad_norm": 0.6500617265701294, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 7720 + }, + { + "epoch": 1.2496968717161103, + "grad_norm": 0.8069869875907898, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 7730 + }, + { + "epoch": 1.2513135558968556, + "grad_norm": 0.6044608950614929, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 7740 + }, + { + "epoch": 1.2529302400776008, + "grad_norm": 0.6573283076286316, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 7750 + }, + { + "epoch": 1.2545469242583462, + "grad_norm": 0.625430166721344, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 7760 + }, + { + "epoch": 1.2561636084390915, + "grad_norm": 0.5442022681236267, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 7770 + }, + { + "epoch": 1.2577802926198367, + "grad_norm": 0.6818386912345886, + "learning_rate": 0.0002, + "loss": 0.7172, + "step": 7780 + }, + { + "epoch": 1.259396976800582, + "grad_norm": 0.6381874084472656, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 7790 + }, + { + "epoch": 1.2610136609813272, + "grad_norm": 0.6269212961196899, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 7800 + }, + { + "epoch": 1.2626303451620726, + "grad_norm": 0.600121259689331, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 7810 + }, + { + "epoch": 1.2642470293428179, + "grad_norm": 0.6337703466415405, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 7820 + }, + { + "epoch": 1.2658637135235633, + "grad_norm": 0.7234963774681091, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 7830 + }, + { + "epoch": 1.2674803977043085, + "grad_norm": 0.800184965133667, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 7840 + }, + { + "epoch": 1.2690970818850538, + "grad_norm": 0.7539464831352234, + "learning_rate": 0.0002, + "loss": 0.7426, + "step": 7850 + }, + { + "epoch": 1.270713766065799, + "grad_norm": 0.5493760704994202, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 7860 + }, + { + "epoch": 1.2723304502465442, + "grad_norm": 0.7477145791053772, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 7870 + }, + { + "epoch": 1.2739471344272897, + "grad_norm": 0.6366362571716309, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 7880 + }, + { + "epoch": 1.275563818608035, + "grad_norm": 0.7419533729553223, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 7890 + }, + { + "epoch": 1.2771805027887801, + "grad_norm": 0.6141223311424255, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 7900 + }, + { + "epoch": 1.2787971869695256, + "grad_norm": 0.7522598505020142, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 7910 + }, + { + "epoch": 1.2804138711502708, + "grad_norm": 0.6935804486274719, + "learning_rate": 0.0002, + "loss": 0.7097, + "step": 7920 + }, + { + "epoch": 1.282030555331016, + "grad_norm": 0.7239290475845337, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 7930 + }, + { + "epoch": 1.2836472395117613, + "grad_norm": 0.8800187110900879, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 7940 + }, + { + "epoch": 1.2852639236925067, + "grad_norm": 0.540458083152771, + "learning_rate": 0.0002, + "loss": 0.6991, + "step": 7950 + }, + { + "epoch": 1.286880607873252, + "grad_norm": 0.6492934226989746, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 7960 + }, + { + "epoch": 1.2884972920539972, + "grad_norm": 0.6543959379196167, + "learning_rate": 0.0002, + "loss": 0.7742, + "step": 7970 + }, + { + "epoch": 1.2901139762347427, + "grad_norm": 0.5804705619812012, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 7980 + }, + { + "epoch": 1.291730660415488, + "grad_norm": 0.7074727416038513, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 7990 + }, + { + "epoch": 1.2933473445962331, + "grad_norm": 0.5347974300384521, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 8000 + }, + { + "epoch": 1.2949640287769784, + "grad_norm": 0.6457298398017883, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 8010 + }, + { + "epoch": 1.2965807129577236, + "grad_norm": 0.6407219171524048, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 8020 + }, + { + "epoch": 1.298197397138469, + "grad_norm": 0.828439474105835, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 8030 + }, + { + "epoch": 1.2998140813192143, + "grad_norm": 0.4840380549430847, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 8040 + }, + { + "epoch": 1.3014307654999595, + "grad_norm": 0.5921024680137634, + "learning_rate": 0.0002, + "loss": 0.7283, + "step": 8050 + }, + { + "epoch": 1.303047449680705, + "grad_norm": 0.6170315146446228, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 8060 + }, + { + "epoch": 1.3046641338614502, + "grad_norm": 0.5374847054481506, + "learning_rate": 0.0002, + "loss": 0.7534, + "step": 8070 + }, + { + "epoch": 1.3062808180421954, + "grad_norm": 0.545758068561554, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 8080 + }, + { + "epoch": 1.3078975022229407, + "grad_norm": 0.55641770362854, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 8090 + }, + { + "epoch": 1.309514186403686, + "grad_norm": 0.6724897027015686, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8100 + }, + { + "epoch": 1.3111308705844313, + "grad_norm": 0.6923972368240356, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 8110 + }, + { + "epoch": 1.3127475547651766, + "grad_norm": 0.5136841535568237, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 8120 + }, + { + "epoch": 1.314364238945922, + "grad_norm": 0.6766283512115479, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 8130 + }, + { + "epoch": 1.3159809231266673, + "grad_norm": 0.6283926367759705, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 8140 + }, + { + "epoch": 1.3175976073074125, + "grad_norm": 0.644216001033783, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 8150 + }, + { + "epoch": 1.3192142914881577, + "grad_norm": 0.7827503085136414, + "learning_rate": 0.0002, + "loss": 0.7125, + "step": 8160 + }, + { + "epoch": 1.320830975668903, + "grad_norm": 0.6651390790939331, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 8170 + }, + { + "epoch": 1.3224476598496484, + "grad_norm": 0.5547412633895874, + "learning_rate": 0.0002, + "loss": 0.7778, + "step": 8180 + }, + { + "epoch": 1.3240643440303936, + "grad_norm": 0.6765179634094238, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 8190 + }, + { + "epoch": 1.325681028211139, + "grad_norm": 0.6822077035903931, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 8200 + }, + { + "epoch": 1.3272977123918843, + "grad_norm": 0.5941002368927002, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 8210 + }, + { + "epoch": 1.3289143965726296, + "grad_norm": 0.4850037097930908, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 8220 + }, + { + "epoch": 1.3305310807533748, + "grad_norm": 0.6162990927696228, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 8230 + }, + { + "epoch": 1.33214776493412, + "grad_norm": 0.6665613651275635, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 8240 + }, + { + "epoch": 1.3337644491148655, + "grad_norm": 0.618192732334137, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 8250 + }, + { + "epoch": 1.3353811332956107, + "grad_norm": 0.710418701171875, + "learning_rate": 0.0002, + "loss": 0.7532, + "step": 8260 + }, + { + "epoch": 1.336997817476356, + "grad_norm": 0.5109876990318298, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 8270 + }, + { + "epoch": 1.3386145016571014, + "grad_norm": 0.6791711449623108, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 8280 + }, + { + "epoch": 1.3402311858378466, + "grad_norm": 0.6836432814598083, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8290 + }, + { + "epoch": 1.3418478700185918, + "grad_norm": 0.5579386353492737, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8300 + }, + { + "epoch": 1.343464554199337, + "grad_norm": 0.6713546514511108, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 8310 + }, + { + "epoch": 1.3450812383800825, + "grad_norm": 0.5353720188140869, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 8320 + }, + { + "epoch": 1.3466979225608278, + "grad_norm": 0.5813682675361633, + "learning_rate": 0.0002, + "loss": 0.718, + "step": 8330 + }, + { + "epoch": 1.348314606741573, + "grad_norm": 0.8158791661262512, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 8340 + }, + { + "epoch": 1.3499312909223184, + "grad_norm": 0.6193785071372986, + "learning_rate": 0.0002, + "loss": 0.6992, + "step": 8350 + }, + { + "epoch": 1.3515479751030637, + "grad_norm": 0.6353939771652222, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 8360 + }, + { + "epoch": 1.353164659283809, + "grad_norm": 0.6925048232078552, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 8370 + }, + { + "epoch": 1.3547813434645541, + "grad_norm": 0.988264799118042, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 8380 + }, + { + "epoch": 1.3563980276452994, + "grad_norm": 0.6476002931594849, + "learning_rate": 0.0002, + "loss": 0.7744, + "step": 8390 + }, + { + "epoch": 1.3580147118260448, + "grad_norm": 0.7120398879051208, + "learning_rate": 0.0002, + "loss": 0.776, + "step": 8400 + }, + { + "epoch": 1.35963139600679, + "grad_norm": 0.9048416614532471, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 8410 + }, + { + "epoch": 1.3612480801875353, + "grad_norm": 0.7000672817230225, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 8420 + }, + { + "epoch": 1.3628647643682807, + "grad_norm": 0.6015632152557373, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 8430 + }, + { + "epoch": 1.364481448549026, + "grad_norm": 0.612516462802887, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 8440 + }, + { + "epoch": 1.3660981327297712, + "grad_norm": 0.5969301462173462, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 8450 + }, + { + "epoch": 1.3677148169105164, + "grad_norm": 0.6730654239654541, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 8460 + }, + { + "epoch": 1.369331501091262, + "grad_norm": 0.6386392116546631, + "learning_rate": 0.0002, + "loss": 0.7465, + "step": 8470 + }, + { + "epoch": 1.3709481852720071, + "grad_norm": 0.739544153213501, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 8480 + }, + { + "epoch": 1.3725648694527524, + "grad_norm": 0.6462782621383667, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 8490 + }, + { + "epoch": 1.3741815536334978, + "grad_norm": 0.7346843481063843, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 8500 + }, + { + "epoch": 1.375798237814243, + "grad_norm": 0.6884821057319641, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 8510 + }, + { + "epoch": 1.3774149219949883, + "grad_norm": 0.6999333500862122, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 8520 + }, + { + "epoch": 1.3790316061757335, + "grad_norm": 0.5378713011741638, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 8530 + }, + { + "epoch": 1.3806482903564787, + "grad_norm": 0.5417906641960144, + "learning_rate": 0.0002, + "loss": 0.6797, + "step": 8540 + }, + { + "epoch": 1.3822649745372242, + "grad_norm": 0.6602526307106018, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 8550 + }, + { + "epoch": 1.3838816587179694, + "grad_norm": 0.7073674201965332, + "learning_rate": 0.0002, + "loss": 0.7356, + "step": 8560 + }, + { + "epoch": 1.3854983428987149, + "grad_norm": 0.5841707587242126, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 8570 + }, + { + "epoch": 1.38711502707946, + "grad_norm": 0.7031095027923584, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 8580 + }, + { + "epoch": 1.3887317112602053, + "grad_norm": 0.5198570489883423, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 8590 + }, + { + "epoch": 1.3903483954409506, + "grad_norm": 0.7261320352554321, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 8600 + }, + { + "epoch": 1.3919650796216958, + "grad_norm": 0.5616350173950195, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 8610 + }, + { + "epoch": 1.3935817638024413, + "grad_norm": 0.5185914635658264, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 8620 + }, + { + "epoch": 1.3951984479831865, + "grad_norm": 0.5814694762229919, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 8630 + }, + { + "epoch": 1.3968151321639317, + "grad_norm": 0.6977371573448181, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 8640 + }, + { + "epoch": 1.3984318163446772, + "grad_norm": 0.6855689883232117, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 8650 + }, + { + "epoch": 1.4000485005254224, + "grad_norm": 0.5414357781410217, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 8660 + }, + { + "epoch": 1.4016651847061676, + "grad_norm": 0.6970012784004211, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 8670 + }, + { + "epoch": 1.4032818688869129, + "grad_norm": 0.526079535484314, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 8680 + }, + { + "epoch": 1.404898553067658, + "grad_norm": 0.758712887763977, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 8690 + }, + { + "epoch": 1.4065152372484035, + "grad_norm": 0.7118762731552124, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 8700 + }, + { + "epoch": 1.4081319214291488, + "grad_norm": 0.5696909427642822, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 8710 + }, + { + "epoch": 1.4097486056098942, + "grad_norm": 0.7995436787605286, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 8720 + }, + { + "epoch": 1.4113652897906395, + "grad_norm": 0.7237521409988403, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 8730 + }, + { + "epoch": 1.4129819739713847, + "grad_norm": 0.744628369808197, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 8740 + }, + { + "epoch": 1.41459865815213, + "grad_norm": 0.6082926988601685, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 8750 + }, + { + "epoch": 1.4162153423328752, + "grad_norm": 0.5185243487358093, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 8760 + }, + { + "epoch": 1.4178320265136206, + "grad_norm": 0.5183082222938538, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 8770 + }, + { + "epoch": 1.4194487106943658, + "grad_norm": 0.7326041460037231, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 8780 + }, + { + "epoch": 1.421065394875111, + "grad_norm": 0.7174660563468933, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 8790 + }, + { + "epoch": 1.4226820790558565, + "grad_norm": 0.8080165982246399, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 8800 + }, + { + "epoch": 1.4242987632366018, + "grad_norm": 0.5061507821083069, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 8810 + }, + { + "epoch": 1.425915447417347, + "grad_norm": 0.801602840423584, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 8820 + }, + { + "epoch": 1.4275321315980922, + "grad_norm": 0.6150273084640503, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 8830 + }, + { + "epoch": 1.4291488157788377, + "grad_norm": 0.8786525726318359, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 8840 + }, + { + "epoch": 1.430765499959583, + "grad_norm": 0.6371538639068604, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 8850 + }, + { + "epoch": 1.4323821841403281, + "grad_norm": 0.6409295797348022, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 8860 + }, + { + "epoch": 1.4339988683210736, + "grad_norm": 0.6452359557151794, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 8870 + }, + { + "epoch": 1.4356155525018188, + "grad_norm": 0.5842334628105164, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 8880 + }, + { + "epoch": 1.437232236682564, + "grad_norm": 0.696761965751648, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 8890 + }, + { + "epoch": 1.4388489208633093, + "grad_norm": 0.6384600400924683, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 8900 + }, + { + "epoch": 1.4404656050440545, + "grad_norm": 0.5981136560440063, + "learning_rate": 0.0002, + "loss": 0.7049, + "step": 8910 + }, + { + "epoch": 1.4420822892248, + "grad_norm": 0.6355637907981873, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 8920 + }, + { + "epoch": 1.4436989734055452, + "grad_norm": 0.6374830603599548, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 8930 + }, + { + "epoch": 1.4453156575862904, + "grad_norm": 0.559013307094574, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 8940 + }, + { + "epoch": 1.446932341767036, + "grad_norm": 0.7289170026779175, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 8950 + }, + { + "epoch": 1.4485490259477811, + "grad_norm": 0.8649206757545471, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 8960 + }, + { + "epoch": 1.4501657101285264, + "grad_norm": 0.7664689421653748, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 8970 + }, + { + "epoch": 1.4517823943092716, + "grad_norm": 0.7109952569007874, + "learning_rate": 0.0002, + "loss": 0.7431, + "step": 8980 + }, + { + "epoch": 1.453399078490017, + "grad_norm": 0.6312844753265381, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 8990 + }, + { + "epoch": 1.4550157626707623, + "grad_norm": 0.6616617441177368, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 9000 + }, + { + "epoch": 1.4566324468515075, + "grad_norm": 0.7384068965911865, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 9010 + }, + { + "epoch": 1.458249131032253, + "grad_norm": 0.6549670100212097, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 9020 + }, + { + "epoch": 1.4598658152129982, + "grad_norm": 0.6254119277000427, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 9030 + }, + { + "epoch": 1.4614824993937434, + "grad_norm": 0.6806328892707825, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 9040 + }, + { + "epoch": 1.4630991835744886, + "grad_norm": 0.6803115010261536, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 9050 + }, + { + "epoch": 1.4647158677552339, + "grad_norm": 0.48529282212257385, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 9060 + }, + { + "epoch": 1.4663325519359793, + "grad_norm": 0.5995030999183655, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 9070 + }, + { + "epoch": 1.4679492361167246, + "grad_norm": 0.6005427837371826, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 9080 + }, + { + "epoch": 1.46956592029747, + "grad_norm": 0.718564510345459, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 9090 + }, + { + "epoch": 1.4711826044782153, + "grad_norm": 0.7003577351570129, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 9100 + }, + { + "epoch": 1.4727992886589605, + "grad_norm": 0.5888323783874512, + "learning_rate": 0.0002, + "loss": 0.8069, + "step": 9110 + }, + { + "epoch": 1.4744159728397057, + "grad_norm": 0.6417609453201294, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 9120 + }, + { + "epoch": 1.476032657020451, + "grad_norm": 0.572294294834137, + "learning_rate": 0.0002, + "loss": 0.7441, + "step": 9130 + }, + { + "epoch": 1.4776493412011964, + "grad_norm": 0.8200714588165283, + "learning_rate": 0.0002, + "loss": 0.8053, + "step": 9140 + }, + { + "epoch": 1.4792660253819416, + "grad_norm": 0.6343288421630859, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 9150 + }, + { + "epoch": 1.4808827095626869, + "grad_norm": 0.7017961144447327, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 9160 + }, + { + "epoch": 1.4824993937434323, + "grad_norm": 0.6202912926673889, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 9170 + }, + { + "epoch": 1.4841160779241775, + "grad_norm": 0.6677869558334351, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 9180 + }, + { + "epoch": 1.4857327621049228, + "grad_norm": 0.6052267551422119, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 9190 + }, + { + "epoch": 1.487349446285668, + "grad_norm": 0.6638872027397156, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 9200 + }, + { + "epoch": 1.4889661304664135, + "grad_norm": 0.6245523691177368, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 9210 + }, + { + "epoch": 1.4905828146471587, + "grad_norm": 0.5761767625808716, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 9220 + }, + { + "epoch": 1.492199498827904, + "grad_norm": 0.8175981640815735, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 9230 + }, + { + "epoch": 1.4938161830086494, + "grad_norm": 0.9144009947776794, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 9240 + }, + { + "epoch": 1.4954328671893946, + "grad_norm": 0.5742552876472473, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 9250 + }, + { + "epoch": 1.4970495513701398, + "grad_norm": 0.534534215927124, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 9260 + }, + { + "epoch": 1.498666235550885, + "grad_norm": 0.7836225032806396, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 9270 + }, + { + "epoch": 1.5002829197316303, + "grad_norm": 0.5292993187904358, + "learning_rate": 0.0002, + "loss": 0.7453, + "step": 9280 + }, + { + "epoch": 1.5018996039123758, + "grad_norm": 0.8044071793556213, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 9290 + }, + { + "epoch": 1.503516288093121, + "grad_norm": 0.6185805201530457, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 9300 + }, + { + "epoch": 1.5051329722738664, + "grad_norm": 0.6093607544898987, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 9310 + }, + { + "epoch": 1.5067496564546117, + "grad_norm": 0.5891730189323425, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 9320 + }, + { + "epoch": 1.508366340635357, + "grad_norm": 0.6331129670143127, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 9330 + }, + { + "epoch": 1.5099830248161021, + "grad_norm": 0.7690958380699158, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 9340 + }, + { + "epoch": 1.5115997089968474, + "grad_norm": 0.6548877358436584, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 9350 + }, + { + "epoch": 1.5132163931775926, + "grad_norm": 0.6545143127441406, + "learning_rate": 0.0002, + "loss": 0.7408, + "step": 9360 + }, + { + "epoch": 1.514833077358338, + "grad_norm": 0.553247332572937, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 9370 + }, + { + "epoch": 1.5164497615390833, + "grad_norm": 0.8145074844360352, + "learning_rate": 0.0002, + "loss": 0.7265, + "step": 9380 + }, + { + "epoch": 1.5180664457198287, + "grad_norm": 0.7636994123458862, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 9390 + }, + { + "epoch": 1.519683129900574, + "grad_norm": 0.6838982701301575, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 9400 + }, + { + "epoch": 1.5212998140813192, + "grad_norm": 0.8599441647529602, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 9410 + }, + { + "epoch": 1.5229164982620644, + "grad_norm": 0.7020329833030701, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 9420 + }, + { + "epoch": 1.5245331824428097, + "grad_norm": 0.6964772343635559, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 9430 + }, + { + "epoch": 1.5261498666235551, + "grad_norm": 0.6916600465774536, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 9440 + }, + { + "epoch": 1.5277665508043003, + "grad_norm": 0.7282621264457703, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 9450 + }, + { + "epoch": 1.5293832349850458, + "grad_norm": 0.5363983511924744, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 9460 + }, + { + "epoch": 1.530999919165791, + "grad_norm": 0.6184861063957214, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 9470 + }, + { + "epoch": 1.5326166033465363, + "grad_norm": 0.5991285443305969, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 9480 + }, + { + "epoch": 1.5342332875272815, + "grad_norm": 0.8176587820053101, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 9490 + }, + { + "epoch": 1.5358499717080267, + "grad_norm": 0.6473721861839294, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 9500 + }, + { + "epoch": 1.5374666558887722, + "grad_norm": 0.7319952845573425, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 9510 + }, + { + "epoch": 1.5390833400695174, + "grad_norm": 0.702900230884552, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 9520 + }, + { + "epoch": 1.5407000242502629, + "grad_norm": 0.7971600294113159, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 9530 + }, + { + "epoch": 1.542316708431008, + "grad_norm": 0.6527525186538696, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 9540 + }, + { + "epoch": 1.5439333926117533, + "grad_norm": 0.5791676044464111, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 9550 + }, + { + "epoch": 1.5455500767924986, + "grad_norm": 0.5619390606880188, + "learning_rate": 0.0002, + "loss": 0.7585, + "step": 9560 + }, + { + "epoch": 1.5471667609732438, + "grad_norm": 0.5701689124107361, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 9570 + }, + { + "epoch": 1.548783445153989, + "grad_norm": 0.47549352049827576, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 9580 + }, + { + "epoch": 1.5504001293347345, + "grad_norm": 0.8730611205101013, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 9590 + }, + { + "epoch": 1.5520168135154797, + "grad_norm": 0.6842091083526611, + "learning_rate": 0.0002, + "loss": 0.798, + "step": 9600 + }, + { + "epoch": 1.5536334976962252, + "grad_norm": 0.6675129532814026, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 9610 + }, + { + "epoch": 1.5552501818769704, + "grad_norm": 0.8173956274986267, + "learning_rate": 0.0002, + "loss": 0.7954, + "step": 9620 + }, + { + "epoch": 1.5568668660577156, + "grad_norm": 0.724947452545166, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 9630 + }, + { + "epoch": 1.5584835502384609, + "grad_norm": 0.6154758930206299, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 9640 + }, + { + "epoch": 1.560100234419206, + "grad_norm": 0.6072008013725281, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 9650 + }, + { + "epoch": 1.5617169185999515, + "grad_norm": 0.659010648727417, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 9660 + }, + { + "epoch": 1.5633336027806968, + "grad_norm": 0.65857994556427, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 9670 + }, + { + "epoch": 1.5649502869614422, + "grad_norm": 0.5914267301559448, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 9680 + }, + { + "epoch": 1.5665669711421875, + "grad_norm": 0.6248020529747009, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 9690 + }, + { + "epoch": 1.5681836553229327, + "grad_norm": 0.7147795557975769, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 9700 + }, + { + "epoch": 1.569800339503678, + "grad_norm": 0.7076232433319092, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 9710 + }, + { + "epoch": 1.5714170236844232, + "grad_norm": 0.6217400431632996, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 9720 + }, + { + "epoch": 1.5730337078651684, + "grad_norm": 0.6709911227226257, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 9730 + }, + { + "epoch": 1.5746503920459138, + "grad_norm": 0.749171257019043, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 9740 + }, + { + "epoch": 1.576267076226659, + "grad_norm": 0.6241145730018616, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 9750 + }, + { + "epoch": 1.5778837604074045, + "grad_norm": 0.4960934817790985, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 9760 + }, + { + "epoch": 1.5795004445881498, + "grad_norm": 0.6593309640884399, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 9770 + }, + { + "epoch": 1.581117128768895, + "grad_norm": 0.5814042091369629, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 9780 + }, + { + "epoch": 1.5827338129496402, + "grad_norm": 0.5936070680618286, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 9790 + }, + { + "epoch": 1.5843504971303854, + "grad_norm": 0.6454403400421143, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 9800 + }, + { + "epoch": 1.585967181311131, + "grad_norm": 0.7612107992172241, + "learning_rate": 0.0002, + "loss": 0.7677, + "step": 9810 + }, + { + "epoch": 1.5875838654918761, + "grad_norm": 0.6494482755661011, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 9820 + }, + { + "epoch": 1.5892005496726216, + "grad_norm": 0.7825694680213928, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 9830 + }, + { + "epoch": 1.5908172338533668, + "grad_norm": 0.6757757663726807, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 9840 + }, + { + "epoch": 1.592433918034112, + "grad_norm": 0.7105609178543091, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 9850 + }, + { + "epoch": 1.5940506022148573, + "grad_norm": 0.7596991062164307, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 9860 + }, + { + "epoch": 1.5956672863956025, + "grad_norm": 0.5681525468826294, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 9870 + }, + { + "epoch": 1.5972839705763477, + "grad_norm": 0.6090980768203735, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 9880 + }, + { + "epoch": 1.5989006547570932, + "grad_norm": 0.6271613240242004, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 9890 + }, + { + "epoch": 1.6005173389378387, + "grad_norm": 0.7656369805335999, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 9900 + }, + { + "epoch": 1.6021340231185839, + "grad_norm": 0.7504446506500244, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 9910 + }, + { + "epoch": 1.6037507072993291, + "grad_norm": 0.659656286239624, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 9920 + }, + { + "epoch": 1.6053673914800743, + "grad_norm": 0.6006826162338257, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 9930 + }, + { + "epoch": 1.6069840756608196, + "grad_norm": 0.7872757911682129, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 9940 + }, + { + "epoch": 1.6086007598415648, + "grad_norm": 0.5545852780342102, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 9950 + }, + { + "epoch": 1.6102174440223103, + "grad_norm": 0.7429468631744385, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 9960 + }, + { + "epoch": 1.6118341282030555, + "grad_norm": 0.6873556971549988, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 9970 + }, + { + "epoch": 1.613450812383801, + "grad_norm": 0.5874287486076355, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 9980 + }, + { + "epoch": 1.6150674965645462, + "grad_norm": 0.6039386987686157, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 9990 + }, + { + "epoch": 1.6166841807452914, + "grad_norm": 0.6233575940132141, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 10000 + }, + { + "epoch": 1.6183008649260366, + "grad_norm": 0.7676448225975037, + "learning_rate": 0.0002, + "loss": 0.7602, + "step": 10010 + }, + { + "epoch": 1.6199175491067819, + "grad_norm": 0.6565698385238647, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 10020 + }, + { + "epoch": 1.6215342332875273, + "grad_norm": 0.6787590384483337, + "learning_rate": 0.0002, + "loss": 0.7104, + "step": 10030 + }, + { + "epoch": 1.6231509174682726, + "grad_norm": 0.6137678027153015, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 10040 + }, + { + "epoch": 1.624767601649018, + "grad_norm": 0.5236800312995911, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 10050 + }, + { + "epoch": 1.6263842858297632, + "grad_norm": 0.7626367807388306, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 10060 + }, + { + "epoch": 1.6280009700105085, + "grad_norm": 0.5657260417938232, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 10070 + }, + { + "epoch": 1.6296176541912537, + "grad_norm": 0.4913991391658783, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 10080 + }, + { + "epoch": 1.631234338371999, + "grad_norm": 0.7715556621551514, + "learning_rate": 0.0002, + "loss": 0.7596, + "step": 10090 + }, + { + "epoch": 1.6328510225527442, + "grad_norm": 0.6509000062942505, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 10100 + }, + { + "epoch": 1.6344677067334896, + "grad_norm": 0.6215850114822388, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 10110 + }, + { + "epoch": 1.6360843909142349, + "grad_norm": 0.6956844329833984, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 10120 + }, + { + "epoch": 1.6377010750949803, + "grad_norm": 0.6111597418785095, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 10130 + }, + { + "epoch": 1.6393177592757255, + "grad_norm": 0.6518288850784302, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 10140 + }, + { + "epoch": 1.6409344434564708, + "grad_norm": 0.6914522051811218, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 10150 + }, + { + "epoch": 1.642551127637216, + "grad_norm": 0.63785719871521, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 10160 + }, + { + "epoch": 1.6441678118179612, + "grad_norm": 0.6379287838935852, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 10170 + }, + { + "epoch": 1.6457844959987067, + "grad_norm": 0.6793403029441833, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 10180 + }, + { + "epoch": 1.647401180179452, + "grad_norm": 0.6099132895469666, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 10190 + }, + { + "epoch": 1.6490178643601974, + "grad_norm": 0.5869854092597961, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 10200 + }, + { + "epoch": 1.6506345485409426, + "grad_norm": 0.7716999053955078, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 10210 + }, + { + "epoch": 1.6522512327216878, + "grad_norm": 0.6854110360145569, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 10220 + }, + { + "epoch": 1.653867916902433, + "grad_norm": 0.6957170367240906, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 10230 + }, + { + "epoch": 1.6554846010831783, + "grad_norm": 0.6932903528213501, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 10240 + }, + { + "epoch": 1.6571012852639235, + "grad_norm": 0.7713165283203125, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 10250 + }, + { + "epoch": 1.658717969444669, + "grad_norm": 0.7455793619155884, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 10260 + }, + { + "epoch": 1.6603346536254144, + "grad_norm": 0.5464168190956116, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 10270 + }, + { + "epoch": 1.6619513378061597, + "grad_norm": 0.6782926321029663, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 10280 + }, + { + "epoch": 1.663568021986905, + "grad_norm": 0.7962649464607239, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 10290 + }, + { + "epoch": 1.6651847061676501, + "grad_norm": 0.6814526319503784, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 10300 + }, + { + "epoch": 1.6668013903483954, + "grad_norm": 0.656895101070404, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 10310 + }, + { + "epoch": 1.6684180745291406, + "grad_norm": 0.6085672378540039, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 10320 + }, + { + "epoch": 1.670034758709886, + "grad_norm": 0.585508406162262, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 10330 + }, + { + "epoch": 1.6716514428906313, + "grad_norm": 0.6930184364318848, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 10340 + }, + { + "epoch": 1.6732681270713767, + "grad_norm": 0.575663149356842, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 10350 + }, + { + "epoch": 1.674884811252122, + "grad_norm": 0.582502543926239, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 10360 + }, + { + "epoch": 1.6765014954328672, + "grad_norm": 0.5668916702270508, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 10370 + }, + { + "epoch": 1.6781181796136124, + "grad_norm": 0.6070065498352051, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 10380 + }, + { + "epoch": 1.6797348637943577, + "grad_norm": 0.6141316294670105, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 10390 + }, + { + "epoch": 1.6813515479751031, + "grad_norm": 0.8359124064445496, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 10400 + }, + { + "epoch": 1.6829682321558483, + "grad_norm": 0.5378185510635376, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 10410 + }, + { + "epoch": 1.6845849163365938, + "grad_norm": 0.6959536075592041, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 10420 + }, + { + "epoch": 1.686201600517339, + "grad_norm": 0.6514357328414917, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 10430 + }, + { + "epoch": 1.6878182846980843, + "grad_norm": 0.7706646919250488, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 10440 + }, + { + "epoch": 1.6894349688788295, + "grad_norm": 0.6183337569236755, + "learning_rate": 0.0002, + "loss": 0.7673, + "step": 10450 + }, + { + "epoch": 1.6910516530595747, + "grad_norm": 0.6123278141021729, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 10460 + }, + { + "epoch": 1.69266833724032, + "grad_norm": 0.6894851326942444, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 10470 + }, + { + "epoch": 1.6942850214210654, + "grad_norm": 0.7497312426567078, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 10480 + }, + { + "epoch": 1.6959017056018106, + "grad_norm": 0.5968214273452759, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 10490 + }, + { + "epoch": 1.697518389782556, + "grad_norm": 0.6747927069664001, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 10500 + }, + { + "epoch": 1.6991350739633013, + "grad_norm": 0.5708310008049011, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 10510 + }, + { + "epoch": 1.7007517581440466, + "grad_norm": 0.606526792049408, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 10520 + }, + { + "epoch": 1.7023684423247918, + "grad_norm": 0.662011981010437, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 10530 + }, + { + "epoch": 1.703985126505537, + "grad_norm": 0.7583045363426208, + "learning_rate": 0.0002, + "loss": 0.7999, + "step": 10540 + }, + { + "epoch": 1.7056018106862825, + "grad_norm": 0.721632182598114, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 10550 + }, + { + "epoch": 1.7072184948670277, + "grad_norm": 0.6107715368270874, + "learning_rate": 0.0002, + "loss": 0.7407, + "step": 10560 + }, + { + "epoch": 1.7088351790477732, + "grad_norm": 0.6652471423149109, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 10570 + }, + { + "epoch": 1.7104518632285184, + "grad_norm": 0.6308087110519409, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 10580 + }, + { + "epoch": 1.7120685474092636, + "grad_norm": 0.5464386940002441, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 10590 + }, + { + "epoch": 1.7136852315900089, + "grad_norm": 0.6558911204338074, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 10600 + }, + { + "epoch": 1.715301915770754, + "grad_norm": 0.5665024518966675, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 10610 + }, + { + "epoch": 1.7169185999514993, + "grad_norm": 0.7888094186782837, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 10620 + }, + { + "epoch": 1.7185352841322448, + "grad_norm": 0.7084909081459045, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 10630 + }, + { + "epoch": 1.7201519683129902, + "grad_norm": 0.7982324361801147, + "learning_rate": 0.0002, + "loss": 0.7557, + "step": 10640 + }, + { + "epoch": 1.7217686524937355, + "grad_norm": 0.6418732404708862, + "learning_rate": 0.0002, + "loss": 0.7345, + "step": 10650 + }, + { + "epoch": 1.7233853366744807, + "grad_norm": 0.7636681795120239, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 10660 + }, + { + "epoch": 1.725002020855226, + "grad_norm": 0.5646875500679016, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 10670 + }, + { + "epoch": 1.7266187050359711, + "grad_norm": 0.5231260657310486, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10680 + }, + { + "epoch": 1.7282353892167164, + "grad_norm": 0.7635011672973633, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 10690 + }, + { + "epoch": 1.7298520733974618, + "grad_norm": 0.7518259286880493, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 10700 + }, + { + "epoch": 1.731468757578207, + "grad_norm": 0.7295602560043335, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 10710 + }, + { + "epoch": 1.7330854417589525, + "grad_norm": 0.6984632015228271, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 10720 + }, + { + "epoch": 1.7347021259396977, + "grad_norm": 0.6198219060897827, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 10730 + }, + { + "epoch": 1.736318810120443, + "grad_norm": 0.6957576274871826, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10740 + }, + { + "epoch": 1.7379354943011882, + "grad_norm": 0.6430263519287109, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 10750 + }, + { + "epoch": 1.7395521784819334, + "grad_norm": 0.6134995222091675, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 10760 + }, + { + "epoch": 1.741168862662679, + "grad_norm": 0.7209452986717224, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 10770 + }, + { + "epoch": 1.7427855468434241, + "grad_norm": 0.6735447645187378, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 10780 + }, + { + "epoch": 1.7444022310241696, + "grad_norm": 0.5605693459510803, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 10790 + }, + { + "epoch": 1.7460189152049148, + "grad_norm": 0.6882363557815552, + "learning_rate": 0.0002, + "loss": 0.7759, + "step": 10800 + }, + { + "epoch": 1.74763559938566, + "grad_norm": 0.6386259198188782, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 10810 + }, + { + "epoch": 1.7492522835664053, + "grad_norm": 0.6529015302658081, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 10820 + }, + { + "epoch": 1.7508689677471505, + "grad_norm": 0.5664082765579224, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 10830 + }, + { + "epoch": 1.7524856519278957, + "grad_norm": 0.7532684206962585, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 10840 + }, + { + "epoch": 1.7541023361086412, + "grad_norm": 0.77171391248703, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 10850 + }, + { + "epoch": 1.7557190202893864, + "grad_norm": 0.7255431413650513, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 10860 + }, + { + "epoch": 1.7573357044701319, + "grad_norm": 0.763083279132843, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 10870 + }, + { + "epoch": 1.758952388650877, + "grad_norm": 0.6042402982711792, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 10880 + }, + { + "epoch": 1.7605690728316223, + "grad_norm": 0.7642518281936646, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 10890 + }, + { + "epoch": 1.7621857570123676, + "grad_norm": 0.6347904801368713, + "learning_rate": 0.0002, + "loss": 0.746, + "step": 10900 + }, + { + "epoch": 1.7638024411931128, + "grad_norm": 0.5371627807617188, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 10910 + }, + { + "epoch": 1.7654191253738583, + "grad_norm": 0.6840225458145142, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 10920 + }, + { + "epoch": 1.7670358095546035, + "grad_norm": 0.5288469195365906, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 10930 + }, + { + "epoch": 1.768652493735349, + "grad_norm": 0.69020676612854, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 10940 + }, + { + "epoch": 1.7702691779160942, + "grad_norm": 0.5943242311477661, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 10950 + }, + { + "epoch": 1.7718858620968394, + "grad_norm": 0.5616418123245239, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 10960 + }, + { + "epoch": 1.7735025462775846, + "grad_norm": 0.7209470868110657, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 10970 + }, + { + "epoch": 1.7751192304583299, + "grad_norm": 0.6657957434654236, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 10980 + }, + { + "epoch": 1.776735914639075, + "grad_norm": 0.6469064950942993, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 10990 + }, + { + "epoch": 1.7783525988198206, + "grad_norm": 0.6615678071975708, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 11000 + }, + { + "epoch": 1.779969283000566, + "grad_norm": 0.6722439527511597, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 11010 + }, + { + "epoch": 1.7815859671813112, + "grad_norm": 0.634136974811554, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 11020 + }, + { + "epoch": 1.7832026513620565, + "grad_norm": 0.6024377346038818, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 11030 + }, + { + "epoch": 1.7848193355428017, + "grad_norm": 0.6909403800964355, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 11040 + }, + { + "epoch": 1.786436019723547, + "grad_norm": 0.7148767709732056, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 11050 + }, + { + "epoch": 1.7880527039042922, + "grad_norm": 0.7442979216575623, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 11060 + }, + { + "epoch": 1.7896693880850376, + "grad_norm": 0.6830431818962097, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 11070 + }, + { + "epoch": 1.7912860722657828, + "grad_norm": 0.9172667264938354, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 11080 + }, + { + "epoch": 1.7929027564465283, + "grad_norm": 0.6799490451812744, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 11090 + }, + { + "epoch": 1.7945194406272735, + "grad_norm": 0.7617024779319763, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 11100 + }, + { + "epoch": 1.7961361248080188, + "grad_norm": 0.7701810002326965, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 11110 + }, + { + "epoch": 1.797752808988764, + "grad_norm": 0.7454385757446289, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 11120 + }, + { + "epoch": 1.7993694931695092, + "grad_norm": 0.6121436953544617, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 11130 + }, + { + "epoch": 1.8009861773502547, + "grad_norm": 0.6237571835517883, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 11140 + }, + { + "epoch": 1.802602861531, + "grad_norm": 0.6818515658378601, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 11150 + }, + { + "epoch": 1.8042195457117454, + "grad_norm": 0.7768308520317078, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 11160 + }, + { + "epoch": 1.8058362298924906, + "grad_norm": 0.6875537633895874, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 11170 + }, + { + "epoch": 1.8074529140732358, + "grad_norm": 0.7950584888458252, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 11180 + }, + { + "epoch": 1.809069598253981, + "grad_norm": 0.8210248351097107, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 11190 + }, + { + "epoch": 1.8106862824347263, + "grad_norm": 0.6674110889434814, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 11200 + }, + { + "epoch": 1.8123029666154715, + "grad_norm": 0.6261674761772156, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 11210 + }, + { + "epoch": 1.813919650796217, + "grad_norm": 0.6484741568565369, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 11220 + }, + { + "epoch": 1.8155363349769622, + "grad_norm": 0.6231244206428528, + "learning_rate": 0.0002, + "loss": 0.7718, + "step": 11230 + }, + { + "epoch": 1.8171530191577077, + "grad_norm": 0.7243146896362305, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 11240 + }, + { + "epoch": 1.818769703338453, + "grad_norm": 0.6776193380355835, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 11250 + }, + { + "epoch": 1.8203863875191981, + "grad_norm": 0.5973618030548096, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 11260 + }, + { + "epoch": 1.8220030716999434, + "grad_norm": 0.6451361179351807, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 11270 + }, + { + "epoch": 1.8236197558806886, + "grad_norm": 0.5963068008422852, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 11280 + }, + { + "epoch": 1.825236440061434, + "grad_norm": 0.536902129650116, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 11290 + }, + { + "epoch": 1.8268531242421793, + "grad_norm": 0.6993787288665771, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 11300 + }, + { + "epoch": 1.8284698084229247, + "grad_norm": 0.6135255098342896, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 11310 + }, + { + "epoch": 1.83008649260367, + "grad_norm": 0.6057423949241638, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 11320 + }, + { + "epoch": 1.8317031767844152, + "grad_norm": 0.6598812341690063, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 11330 + }, + { + "epoch": 1.8333198609651604, + "grad_norm": 0.6075948476791382, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 11340 + }, + { + "epoch": 1.8349365451459057, + "grad_norm": 0.7065447568893433, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 11350 + }, + { + "epoch": 1.8365532293266509, + "grad_norm": 0.680526614189148, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 11360 + }, + { + "epoch": 1.8381699135073963, + "grad_norm": 0.6356695294380188, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 11370 + }, + { + "epoch": 1.8397865976881416, + "grad_norm": 0.6399052143096924, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 11380 + }, + { + "epoch": 1.841403281868887, + "grad_norm": 0.6125704050064087, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 11390 + }, + { + "epoch": 1.8430199660496323, + "grad_norm": 0.7124643325805664, + "learning_rate": 0.0002, + "loss": 0.755, + "step": 11400 + }, + { + "epoch": 1.8446366502303775, + "grad_norm": 0.6099604964256287, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 11410 + }, + { + "epoch": 1.8462533344111227, + "grad_norm": 0.7338208556175232, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 11420 + }, + { + "epoch": 1.847870018591868, + "grad_norm": 0.7534668445587158, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 11430 + }, + { + "epoch": 1.8494867027726134, + "grad_norm": 0.6135470271110535, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 11440 + }, + { + "epoch": 1.8511033869533586, + "grad_norm": 0.6229309439659119, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 11450 + }, + { + "epoch": 1.852720071134104, + "grad_norm": 0.706423282623291, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 11460 + }, + { + "epoch": 1.8543367553148493, + "grad_norm": 0.5460049510002136, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 11470 + }, + { + "epoch": 1.8559534394955945, + "grad_norm": 0.6616711020469666, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 11480 + }, + { + "epoch": 1.8575701236763398, + "grad_norm": 0.6372783184051514, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 11490 + }, + { + "epoch": 1.859186807857085, + "grad_norm": 0.7162668108940125, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 11500 + }, + { + "epoch": 1.8608034920378305, + "grad_norm": 0.6605209708213806, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 11510 + }, + { + "epoch": 1.8624201762185757, + "grad_norm": 0.6933956742286682, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 11520 + }, + { + "epoch": 1.8640368603993211, + "grad_norm": 0.6582090854644775, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 11530 + }, + { + "epoch": 1.8656535445800664, + "grad_norm": 0.6416500806808472, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 11540 + }, + { + "epoch": 1.8672702287608116, + "grad_norm": 0.5434312224388123, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 11550 + }, + { + "epoch": 1.8688869129415568, + "grad_norm": 0.6827567219734192, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 11560 + }, + { + "epoch": 1.870503597122302, + "grad_norm": 0.7354370951652527, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 11570 + }, + { + "epoch": 1.8721202813030473, + "grad_norm": 0.590372622013092, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 11580 + }, + { + "epoch": 1.8737369654837928, + "grad_norm": 0.853183925151825, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 11590 + }, + { + "epoch": 1.875353649664538, + "grad_norm": 0.822678804397583, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 11600 + }, + { + "epoch": 1.8769703338452834, + "grad_norm": 0.6591550707817078, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 11610 + }, + { + "epoch": 1.8785870180260287, + "grad_norm": 0.7475301623344421, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 11620 + }, + { + "epoch": 1.880203702206774, + "grad_norm": 0.6390765309333801, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 11630 + }, + { + "epoch": 1.8818203863875191, + "grad_norm": 0.6589758992195129, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 11640 + }, + { + "epoch": 1.8834370705682644, + "grad_norm": 0.6765508651733398, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 11650 + }, + { + "epoch": 1.8850537547490098, + "grad_norm": 0.6527857780456543, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 11660 + }, + { + "epoch": 1.886670438929755, + "grad_norm": 0.6642923951148987, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 11670 + }, + { + "epoch": 1.8882871231105005, + "grad_norm": 0.6945584416389465, + "learning_rate": 0.0002, + "loss": 0.7701, + "step": 11680 + }, + { + "epoch": 1.8899038072912457, + "grad_norm": 0.694018542766571, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 11690 + }, + { + "epoch": 1.891520491471991, + "grad_norm": 0.7237417101860046, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 11700 + }, + { + "epoch": 1.8931371756527362, + "grad_norm": 0.7401309609413147, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 11710 + }, + { + "epoch": 1.8947538598334814, + "grad_norm": 0.6537784337997437, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 11720 + }, + { + "epoch": 1.8963705440142267, + "grad_norm": 0.7398539185523987, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 11730 + }, + { + "epoch": 1.8979872281949721, + "grad_norm": 0.6696075797080994, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 11740 + }, + { + "epoch": 1.8996039123757174, + "grad_norm": 0.6014142036437988, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 11750 + }, + { + "epoch": 1.9012205965564628, + "grad_norm": 0.7023524641990662, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 11760 + }, + { + "epoch": 1.902837280737208, + "grad_norm": 0.739973783493042, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 11770 + }, + { + "epoch": 1.9044539649179533, + "grad_norm": 0.5576770901679993, + "learning_rate": 0.0002, + "loss": 0.7848, + "step": 11780 + }, + { + "epoch": 1.9060706490986985, + "grad_norm": 0.6907393932342529, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 11790 + }, + { + "epoch": 1.9076873332794437, + "grad_norm": 0.6934581995010376, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 11800 + }, + { + "epoch": 1.9093040174601892, + "grad_norm": 0.591774582862854, + "learning_rate": 0.0002, + "loss": 0.7199, + "step": 11810 + }, + { + "epoch": 1.9109207016409344, + "grad_norm": 0.6249791383743286, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 11820 + }, + { + "epoch": 1.9125373858216799, + "grad_norm": 0.6755744218826294, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 11830 + }, + { + "epoch": 1.914154070002425, + "grad_norm": 0.7286285161972046, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 11840 + }, + { + "epoch": 1.9157707541831703, + "grad_norm": 0.7867850065231323, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 11850 + }, + { + "epoch": 1.9173874383639156, + "grad_norm": 0.6283972859382629, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 11860 + }, + { + "epoch": 1.9190041225446608, + "grad_norm": 0.605823814868927, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 11870 + }, + { + "epoch": 1.920620806725406, + "grad_norm": 0.5927976965904236, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 11880 + }, + { + "epoch": 1.9222374909061515, + "grad_norm": 0.5974002480506897, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 11890 + }, + { + "epoch": 1.923854175086897, + "grad_norm": 0.7091866135597229, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 11900 + }, + { + "epoch": 1.9254708592676422, + "grad_norm": 0.72496497631073, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 11910 + }, + { + "epoch": 1.9270875434483874, + "grad_norm": 0.6131896376609802, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 11920 + }, + { + "epoch": 1.9287042276291326, + "grad_norm": 0.6556436419487, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 11930 + }, + { + "epoch": 1.9303209118098779, + "grad_norm": 0.622932493686676, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 11940 + }, + { + "epoch": 1.931937595990623, + "grad_norm": 0.6618631482124329, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 11950 + }, + { + "epoch": 1.9335542801713685, + "grad_norm": 0.630966305732727, + "learning_rate": 0.0002, + "loss": 0.8332, + "step": 11960 + }, + { + "epoch": 1.9351709643521138, + "grad_norm": 0.6336734890937805, + "learning_rate": 0.0002, + "loss": 0.6854, + "step": 11970 + }, + { + "epoch": 1.9367876485328592, + "grad_norm": 0.655403196811676, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 11980 + }, + { + "epoch": 1.9384043327136045, + "grad_norm": 0.5640574097633362, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 11990 + }, + { + "epoch": 1.9400210168943497, + "grad_norm": 0.6322951316833496, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 12000 + }, + { + "epoch": 1.941637701075095, + "grad_norm": 0.615703821182251, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 12010 + }, + { + "epoch": 1.9432543852558402, + "grad_norm": 0.6487536430358887, + "learning_rate": 0.0002, + "loss": 0.786, + "step": 12020 + }, + { + "epoch": 1.9448710694365856, + "grad_norm": 0.9209630489349365, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 12030 + }, + { + "epoch": 1.9464877536173308, + "grad_norm": 0.67485511302948, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 12040 + }, + { + "epoch": 1.9481044377980763, + "grad_norm": 0.6831230521202087, + "learning_rate": 0.0002, + "loss": 0.7551, + "step": 12050 + }, + { + "epoch": 1.9497211219788215, + "grad_norm": 0.6578302383422852, + "learning_rate": 0.0002, + "loss": 0.7546, + "step": 12060 + }, + { + "epoch": 1.9513378061595668, + "grad_norm": 0.9975938200950623, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 12070 + }, + { + "epoch": 1.952954490340312, + "grad_norm": 0.6637365221977234, + "learning_rate": 0.0002, + "loss": 0.7952, + "step": 12080 + }, + { + "epoch": 1.9545711745210572, + "grad_norm": 0.605707049369812, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 12090 + }, + { + "epoch": 1.9561878587018025, + "grad_norm": 0.6584440469741821, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 12100 + }, + { + "epoch": 1.957804542882548, + "grad_norm": 0.6070835590362549, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 12110 + }, + { + "epoch": 1.9594212270632931, + "grad_norm": 0.7862601280212402, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 12120 + }, + { + "epoch": 1.9610379112440386, + "grad_norm": 0.8175255060195923, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 12130 + }, + { + "epoch": 1.9626545954247838, + "grad_norm": 0.5648472905158997, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 12140 + }, + { + "epoch": 1.964271279605529, + "grad_norm": 0.6591973304748535, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 12150 + }, + { + "epoch": 1.9658879637862743, + "grad_norm": 0.5960676074028015, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 12160 + }, + { + "epoch": 1.9675046479670195, + "grad_norm": 0.7272544503211975, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 12170 + }, + { + "epoch": 1.969121332147765, + "grad_norm": 0.7176699042320251, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 12180 + }, + { + "epoch": 1.9707380163285102, + "grad_norm": 0.6927123665809631, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 12190 + }, + { + "epoch": 1.9723547005092557, + "grad_norm": 0.5536034107208252, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 12200 + }, + { + "epoch": 1.9739713846900009, + "grad_norm": 0.8348390460014343, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 12210 + }, + { + "epoch": 1.9755880688707461, + "grad_norm": 0.6591181755065918, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 12220 + }, + { + "epoch": 1.9772047530514913, + "grad_norm": 1.0624109506607056, + "learning_rate": 0.0002, + "loss": 0.763, + "step": 12230 + }, + { + "epoch": 1.9788214372322366, + "grad_norm": 0.9265586137771606, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 12240 + }, + { + "epoch": 1.9804381214129818, + "grad_norm": 0.5998196005821228, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 12250 + }, + { + "epoch": 1.9820548055937273, + "grad_norm": 0.6960851550102234, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 12260 + }, + { + "epoch": 1.9836714897744727, + "grad_norm": 0.7674502730369568, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 12270 + }, + { + "epoch": 1.985288173955218, + "grad_norm": 0.6407275795936584, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 12280 + }, + { + "epoch": 1.9869048581359632, + "grad_norm": 0.6673079133033752, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 12290 + }, + { + "epoch": 1.9885215423167084, + "grad_norm": 0.6989844441413879, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 12300 + }, + { + "epoch": 1.9901382264974536, + "grad_norm": 0.7564442157745361, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 12310 + }, + { + "epoch": 1.9917549106781989, + "grad_norm": 0.6385478973388672, + "learning_rate": 0.0002, + "loss": 0.7719, + "step": 12320 + }, + { + "epoch": 1.9933715948589443, + "grad_norm": 0.7193717956542969, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 12330 + }, + { + "epoch": 1.9949882790396896, + "grad_norm": 0.7987112402915955, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 12340 + }, + { + "epoch": 1.996604963220435, + "grad_norm": 0.7260826826095581, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 12350 + }, + { + "epoch": 1.9982216474011802, + "grad_norm": 0.7968255281448364, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 12360 + }, + { + "epoch": 1.9998383315819255, + "grad_norm": 0.6893062591552734, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 12370 + }, + { + "epoch": 2.0, + "eval_loss": 1.1044032573699951, + "eval_runtime": 122.1508, + "eval_samples_per_second": 6.001, + "eval_steps_per_second": 0.753, + "step": 12371 + }, + { + "epoch": 2.0014550157626707, + "grad_norm": 0.7775409817695618, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 12380 + }, + { + "epoch": 2.003071699943416, + "grad_norm": 0.76218581199646, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 12390 + }, + { + "epoch": 2.004688384124161, + "grad_norm": 0.5677764415740967, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 12400 + }, + { + "epoch": 2.006305068304907, + "grad_norm": 0.808442234992981, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 12410 + }, + { + "epoch": 2.007921752485652, + "grad_norm": 0.7144765257835388, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 12420 + }, + { + "epoch": 2.0095384366663973, + "grad_norm": 0.6914031505584717, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 12430 + }, + { + "epoch": 2.0111551208471425, + "grad_norm": 0.7581454515457153, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 12440 + }, + { + "epoch": 2.0127718050278878, + "grad_norm": 0.8388504981994629, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 12450 + }, + { + "epoch": 2.014388489208633, + "grad_norm": 0.6716406941413879, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 12460 + }, + { + "epoch": 2.0160051733893782, + "grad_norm": 0.898902416229248, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 12470 + }, + { + "epoch": 2.0176218575701235, + "grad_norm": 0.6432679891586304, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 12480 + }, + { + "epoch": 2.019238541750869, + "grad_norm": 0.8021109104156494, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 12490 + }, + { + "epoch": 2.0208552259316144, + "grad_norm": 0.7039216756820679, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 12500 + }, + { + "epoch": 2.0224719101123596, + "grad_norm": 0.646531879901886, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 12510 + }, + { + "epoch": 2.024088594293105, + "grad_norm": 0.783704400062561, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 12520 + }, + { + "epoch": 2.02570527847385, + "grad_norm": 0.8805046677589417, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 12530 + }, + { + "epoch": 2.0273219626545953, + "grad_norm": 0.7289270758628845, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 12540 + }, + { + "epoch": 2.0289386468353405, + "grad_norm": 0.71653151512146, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 12550 + }, + { + "epoch": 2.030555331016086, + "grad_norm": 0.73281329870224, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 12560 + }, + { + "epoch": 2.0321720151968314, + "grad_norm": 0.6657090187072754, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 12570 + }, + { + "epoch": 2.0337886993775767, + "grad_norm": 0.8241133093833923, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 12580 + }, + { + "epoch": 2.035405383558322, + "grad_norm": 0.5834135413169861, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 12590 + }, + { + "epoch": 2.037022067739067, + "grad_norm": 0.84502112865448, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 12600 + }, + { + "epoch": 2.0386387519198124, + "grad_norm": 0.8952481746673584, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 12610 + }, + { + "epoch": 2.0402554361005576, + "grad_norm": 0.7801461815834045, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 12620 + }, + { + "epoch": 2.041872120281303, + "grad_norm": 0.6788367033004761, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 12630 + }, + { + "epoch": 2.0434888044620485, + "grad_norm": 0.7241756319999695, + "learning_rate": 0.0002, + "loss": 0.6162, + "step": 12640 + }, + { + "epoch": 2.0451054886427937, + "grad_norm": 0.6933388113975525, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 12650 + }, + { + "epoch": 2.046722172823539, + "grad_norm": 0.8029746413230896, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 12660 + }, + { + "epoch": 2.048338857004284, + "grad_norm": 0.946399986743927, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 12670 + }, + { + "epoch": 2.0499555411850294, + "grad_norm": 0.7072678804397583, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 12680 + }, + { + "epoch": 2.0515722253657747, + "grad_norm": 0.6810618042945862, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 12690 + }, + { + "epoch": 2.05318890954652, + "grad_norm": 0.7661160230636597, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 12700 + }, + { + "epoch": 2.0548055937272656, + "grad_norm": 0.6350653767585754, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 12710 + }, + { + "epoch": 2.056422277908011, + "grad_norm": 0.861890971660614, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 12720 + }, + { + "epoch": 2.058038962088756, + "grad_norm": 0.6489875912666321, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 12730 + }, + { + "epoch": 2.0596556462695013, + "grad_norm": 0.8268506526947021, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 12740 + }, + { + "epoch": 2.0612723304502465, + "grad_norm": 0.607679545879364, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 12750 + }, + { + "epoch": 2.0628890146309917, + "grad_norm": 0.6754153370857239, + "learning_rate": 0.0002, + "loss": 0.6649, + "step": 12760 + }, + { + "epoch": 2.064505698811737, + "grad_norm": 0.7263124585151672, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 12770 + }, + { + "epoch": 2.0661223829924826, + "grad_norm": 0.6986154317855835, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 12780 + }, + { + "epoch": 2.067739067173228, + "grad_norm": 0.7768576741218567, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 12790 + }, + { + "epoch": 2.069355751353973, + "grad_norm": 0.7546762824058533, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 12800 + }, + { + "epoch": 2.0709724355347183, + "grad_norm": 0.7588880062103271, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 12810 + }, + { + "epoch": 2.0725891197154636, + "grad_norm": 0.7457242608070374, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 12820 + }, + { + "epoch": 2.074205803896209, + "grad_norm": 0.6983516812324524, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 12830 + }, + { + "epoch": 2.075822488076954, + "grad_norm": 0.7950928807258606, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 12840 + }, + { + "epoch": 2.0774391722576993, + "grad_norm": 0.9248087406158447, + "learning_rate": 0.0002, + "loss": 0.6603, + "step": 12850 + }, + { + "epoch": 2.079055856438445, + "grad_norm": 0.7229493260383606, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 12860 + }, + { + "epoch": 2.08067254061919, + "grad_norm": 0.5710847973823547, + "learning_rate": 0.0002, + "loss": 0.6702, + "step": 12870 + }, + { + "epoch": 2.0822892247999354, + "grad_norm": 0.9580423831939697, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 12880 + }, + { + "epoch": 2.0839059089806806, + "grad_norm": 0.7399665713310242, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 12890 + }, + { + "epoch": 2.085522593161426, + "grad_norm": 0.7981410622596741, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 12900 + }, + { + "epoch": 2.087139277342171, + "grad_norm": 0.870759904384613, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 12910 + }, + { + "epoch": 2.0887559615229163, + "grad_norm": 0.7001481652259827, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 12920 + }, + { + "epoch": 2.090372645703662, + "grad_norm": 0.6745418310165405, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 12930 + }, + { + "epoch": 2.0919893298844072, + "grad_norm": 0.7739067673683167, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 12940 + }, + { + "epoch": 2.0936060140651525, + "grad_norm": 0.6742934584617615, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 12950 + }, + { + "epoch": 2.0952226982458977, + "grad_norm": 0.7270349860191345, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 12960 + }, + { + "epoch": 2.096839382426643, + "grad_norm": 0.7150624394416809, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 12970 + }, + { + "epoch": 2.098456066607388, + "grad_norm": 0.7734767198562622, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 12980 + }, + { + "epoch": 2.1000727507881334, + "grad_norm": 0.7618662118911743, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 12990 + }, + { + "epoch": 2.101689434968879, + "grad_norm": 0.6557944416999817, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 13000 + }, + { + "epoch": 2.1033061191496243, + "grad_norm": 0.8786448240280151, + "learning_rate": 0.0002, + "loss": 0.7268, + "step": 13010 + }, + { + "epoch": 2.1049228033303695, + "grad_norm": 0.6878724098205566, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 13020 + }, + { + "epoch": 2.1065394875111147, + "grad_norm": 0.822318971157074, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 13030 + }, + { + "epoch": 2.10815617169186, + "grad_norm": 0.831468939781189, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 13040 + }, + { + "epoch": 2.109772855872605, + "grad_norm": 0.7699505686759949, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 13050 + }, + { + "epoch": 2.1113895400533504, + "grad_norm": 0.7559016346931458, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 13060 + }, + { + "epoch": 2.1130062242340957, + "grad_norm": 0.6942209601402283, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 13070 + }, + { + "epoch": 2.1146229084148414, + "grad_norm": 0.6098947525024414, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 13080 + }, + { + "epoch": 2.1162395925955866, + "grad_norm": 0.6499016284942627, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 13090 + }, + { + "epoch": 2.117856276776332, + "grad_norm": 0.7719953060150146, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 13100 + }, + { + "epoch": 2.119472960957077, + "grad_norm": 0.6708134412765503, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 13110 + }, + { + "epoch": 2.1210896451378223, + "grad_norm": 0.8119585514068604, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 13120 + }, + { + "epoch": 2.1227063293185675, + "grad_norm": 0.6947157979011536, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 13130 + }, + { + "epoch": 2.1243230134993127, + "grad_norm": 0.8831837773323059, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 13140 + }, + { + "epoch": 2.1259396976800584, + "grad_norm": 0.7266910672187805, + "learning_rate": 0.0002, + "loss": 0.6997, + "step": 13150 + }, + { + "epoch": 2.1275563818608036, + "grad_norm": 0.8864351511001587, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 13160 + }, + { + "epoch": 2.129173066041549, + "grad_norm": 0.8104248046875, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 13170 + }, + { + "epoch": 2.130789750222294, + "grad_norm": 0.6077079772949219, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 13180 + }, + { + "epoch": 2.1324064344030393, + "grad_norm": 0.6874213814735413, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 13190 + }, + { + "epoch": 2.1340231185837846, + "grad_norm": 0.7134367823600769, + "learning_rate": 0.0002, + "loss": 0.642, + "step": 13200 + }, + { + "epoch": 2.13563980276453, + "grad_norm": 0.6101235151290894, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 13210 + }, + { + "epoch": 2.137256486945275, + "grad_norm": 0.6042411923408508, + "learning_rate": 0.0002, + "loss": 0.6529, + "step": 13220 + }, + { + "epoch": 2.1388731711260207, + "grad_norm": 0.914601743221283, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 13230 + }, + { + "epoch": 2.140489855306766, + "grad_norm": 0.7104284167289734, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 13240 + }, + { + "epoch": 2.142106539487511, + "grad_norm": 0.664395272731781, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 13250 + }, + { + "epoch": 2.1437232236682564, + "grad_norm": 0.6991241574287415, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 13260 + }, + { + "epoch": 2.1453399078490016, + "grad_norm": 0.5469560623168945, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 13270 + }, + { + "epoch": 2.146956592029747, + "grad_norm": 0.8454998135566711, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 13280 + }, + { + "epoch": 2.148573276210492, + "grad_norm": 0.7088868618011475, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 13290 + }, + { + "epoch": 2.1501899603912378, + "grad_norm": 0.7002687454223633, + "learning_rate": 0.0002, + "loss": 0.6835, + "step": 13300 + }, + { + "epoch": 2.151806644571983, + "grad_norm": 0.7785214781761169, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 13310 + }, + { + "epoch": 2.1534233287527282, + "grad_norm": 0.8049132227897644, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 13320 + }, + { + "epoch": 2.1550400129334735, + "grad_norm": 0.8062595129013062, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 13330 + }, + { + "epoch": 2.1566566971142187, + "grad_norm": 0.6208319067955017, + "learning_rate": 0.0002, + "loss": 0.6603, + "step": 13340 + }, + { + "epoch": 2.158273381294964, + "grad_norm": 0.7519655823707581, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 13350 + }, + { + "epoch": 2.159890065475709, + "grad_norm": 0.7645747065544128, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 13360 + }, + { + "epoch": 2.1615067496564544, + "grad_norm": 0.6847302913665771, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 13370 + }, + { + "epoch": 2.1631234338372, + "grad_norm": 0.8630441427230835, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 13380 + }, + { + "epoch": 2.1647401180179453, + "grad_norm": 0.7947702407836914, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 13390 + }, + { + "epoch": 2.1663568021986905, + "grad_norm": 0.6836977005004883, + "learning_rate": 0.0002, + "loss": 0.7206, + "step": 13400 + }, + { + "epoch": 2.1679734863794358, + "grad_norm": 0.7340566515922546, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 13410 + }, + { + "epoch": 2.169590170560181, + "grad_norm": 0.7075738906860352, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 13420 + }, + { + "epoch": 2.1712068547409262, + "grad_norm": 0.7080879807472229, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 13430 + }, + { + "epoch": 2.1728235389216715, + "grad_norm": 0.6218613386154175, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 13440 + }, + { + "epoch": 2.174440223102417, + "grad_norm": 0.8211479187011719, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 13450 + }, + { + "epoch": 2.1760569072831624, + "grad_norm": 0.864466667175293, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 13460 + }, + { + "epoch": 2.1776735914639076, + "grad_norm": 0.7943857908248901, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 13470 + }, + { + "epoch": 2.179290275644653, + "grad_norm": 0.78728187084198, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 13480 + }, + { + "epoch": 2.180906959825398, + "grad_norm": 0.697527289390564, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 13490 + }, + { + "epoch": 2.1825236440061433, + "grad_norm": 0.8205804228782654, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 13500 + }, + { + "epoch": 2.1841403281868885, + "grad_norm": 0.8709042072296143, + "learning_rate": 0.0002, + "loss": 0.7227, + "step": 13510 + }, + { + "epoch": 2.1857570123676338, + "grad_norm": 0.6228537559509277, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 13520 + }, + { + "epoch": 2.1873736965483794, + "grad_norm": 0.9566980004310608, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 13530 + }, + { + "epoch": 2.1889903807291247, + "grad_norm": 0.7128894329071045, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 13540 + }, + { + "epoch": 2.19060706490987, + "grad_norm": 0.6888654232025146, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 13550 + }, + { + "epoch": 2.192223749090615, + "grad_norm": 0.6444337368011475, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 13560 + }, + { + "epoch": 2.1938404332713604, + "grad_norm": 0.8008806705474854, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 13570 + }, + { + "epoch": 2.1954571174521056, + "grad_norm": 0.8482748866081238, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 13580 + }, + { + "epoch": 2.197073801632851, + "grad_norm": 0.8584157228469849, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 13590 + }, + { + "epoch": 2.1986904858135965, + "grad_norm": 0.7513734698295593, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 13600 + }, + { + "epoch": 2.2003071699943417, + "grad_norm": 0.7864262461662292, + "learning_rate": 0.0002, + "loss": 0.6632, + "step": 13610 + }, + { + "epoch": 2.201923854175087, + "grad_norm": 0.8493645191192627, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 13620 + }, + { + "epoch": 2.203540538355832, + "grad_norm": 0.6902140974998474, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 13630 + }, + { + "epoch": 2.2051572225365774, + "grad_norm": 0.8711254596710205, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 13640 + }, + { + "epoch": 2.2067739067173227, + "grad_norm": 0.7832191586494446, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 13650 + }, + { + "epoch": 2.208390590898068, + "grad_norm": 0.5668176412582397, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 13660 + }, + { + "epoch": 2.2100072750788136, + "grad_norm": 0.8648375272750854, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 13670 + }, + { + "epoch": 2.211623959259559, + "grad_norm": 0.7643089890480042, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 13680 + }, + { + "epoch": 2.213240643440304, + "grad_norm": 0.6293777823448181, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 13690 + }, + { + "epoch": 2.2148573276210493, + "grad_norm": 0.6459372639656067, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 13700 + }, + { + "epoch": 2.2164740118017945, + "grad_norm": 0.7060744166374207, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 13710 + }, + { + "epoch": 2.2180906959825397, + "grad_norm": 0.674109160900116, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 13720 + }, + { + "epoch": 2.219707380163285, + "grad_norm": 0.830392062664032, + "learning_rate": 0.0002, + "loss": 0.6529, + "step": 13730 + }, + { + "epoch": 2.2213240643440306, + "grad_norm": 0.6474477052688599, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 13740 + }, + { + "epoch": 2.222940748524776, + "grad_norm": 0.7037909626960754, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 13750 + }, + { + "epoch": 2.224557432705521, + "grad_norm": 0.6554131507873535, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 13760 + }, + { + "epoch": 2.2261741168862663, + "grad_norm": 0.7822230458259583, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 13770 + }, + { + "epoch": 2.2277908010670116, + "grad_norm": 0.9082167744636536, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 13780 + }, + { + "epoch": 2.229407485247757, + "grad_norm": 0.7918276190757751, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 13790 + }, + { + "epoch": 2.231024169428502, + "grad_norm": 0.7354569435119629, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 13800 + }, + { + "epoch": 2.2326408536092472, + "grad_norm": 0.8265249133110046, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 13810 + }, + { + "epoch": 2.234257537789993, + "grad_norm": 0.6653847098350525, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 13820 + }, + { + "epoch": 2.235874221970738, + "grad_norm": 0.7157923579216003, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 13830 + }, + { + "epoch": 2.2374909061514834, + "grad_norm": 0.7110323309898376, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 13840 + }, + { + "epoch": 2.2391075903322286, + "grad_norm": 0.7155357599258423, + "learning_rate": 0.0002, + "loss": 0.6913, + "step": 13850 + }, + { + "epoch": 2.240724274512974, + "grad_norm": 1.0177817344665527, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 13860 + }, + { + "epoch": 2.242340958693719, + "grad_norm": 0.7601948380470276, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 13870 + }, + { + "epoch": 2.2439576428744643, + "grad_norm": 0.7628820538520813, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 13880 + }, + { + "epoch": 2.24557432705521, + "grad_norm": 0.7089297771453857, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 13890 + }, + { + "epoch": 2.247191011235955, + "grad_norm": 0.695178210735321, + "learning_rate": 0.0002, + "loss": 0.7236, + "step": 13900 + }, + { + "epoch": 2.2488076954167004, + "grad_norm": 0.7631948590278625, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 13910 + }, + { + "epoch": 2.2504243795974457, + "grad_norm": 0.8203101754188538, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 13920 + }, + { + "epoch": 2.252041063778191, + "grad_norm": 0.8099079728126526, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 13930 + }, + { + "epoch": 2.253657747958936, + "grad_norm": 0.6498546004295349, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 13940 + }, + { + "epoch": 2.2552744321396814, + "grad_norm": 0.7797415256500244, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 13950 + }, + { + "epoch": 2.2568911163204266, + "grad_norm": 0.8254124522209167, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 13960 + }, + { + "epoch": 2.2585078005011723, + "grad_norm": 0.6327953338623047, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 13970 + }, + { + "epoch": 2.2601244846819175, + "grad_norm": 0.734194278717041, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 13980 + }, + { + "epoch": 2.2617411688626627, + "grad_norm": 0.9014202952384949, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 13990 + }, + { + "epoch": 2.263357853043408, + "grad_norm": 0.7643631100654602, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 14000 + }, + { + "epoch": 2.264974537224153, + "grad_norm": 0.8882834911346436, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 14010 + }, + { + "epoch": 2.2665912214048984, + "grad_norm": 0.7975873351097107, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 14020 + }, + { + "epoch": 2.2682079055856437, + "grad_norm": 0.7765783071517944, + "learning_rate": 0.0002, + "loss": 0.7193, + "step": 14030 + }, + { + "epoch": 2.2698245897663893, + "grad_norm": 0.8846288323402405, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 14040 + }, + { + "epoch": 2.2714412739471346, + "grad_norm": 0.9006744027137756, + "learning_rate": 0.0002, + "loss": 0.6494, + "step": 14050 + }, + { + "epoch": 2.27305795812788, + "grad_norm": 0.7420173287391663, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 14060 + }, + { + "epoch": 2.274674642308625, + "grad_norm": 0.7956424951553345, + "learning_rate": 0.0002, + "loss": 0.7068, + "step": 14070 + }, + { + "epoch": 2.2762913264893703, + "grad_norm": 0.7783209085464478, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 14080 + }, + { + "epoch": 2.2779080106701155, + "grad_norm": 0.7597188949584961, + "learning_rate": 0.0002, + "loss": 0.7202, + "step": 14090 + }, + { + "epoch": 2.2795246948508607, + "grad_norm": 0.6718921661376953, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 14100 + }, + { + "epoch": 2.281141379031606, + "grad_norm": 0.7528082132339478, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 14110 + }, + { + "epoch": 2.2827580632123516, + "grad_norm": 0.8379864692687988, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 14120 + }, + { + "epoch": 2.284374747393097, + "grad_norm": 0.748613715171814, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 14130 + }, + { + "epoch": 2.285991431573842, + "grad_norm": 0.7435423135757446, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 14140 + }, + { + "epoch": 2.2876081157545873, + "grad_norm": 0.7580803632736206, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 14150 + }, + { + "epoch": 2.2892247999353326, + "grad_norm": 0.6278321146965027, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 14160 + }, + { + "epoch": 2.290841484116078, + "grad_norm": 0.7663896083831787, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 14170 + }, + { + "epoch": 2.292458168296823, + "grad_norm": 0.9716812372207642, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 14180 + }, + { + "epoch": 2.2940748524775687, + "grad_norm": 0.8993458151817322, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 14190 + }, + { + "epoch": 2.295691536658314, + "grad_norm": 0.6156117916107178, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 14200 + }, + { + "epoch": 2.297308220839059, + "grad_norm": 0.8911278247833252, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 14210 + }, + { + "epoch": 2.2989249050198044, + "grad_norm": 0.6422147154808044, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 14220 + }, + { + "epoch": 2.3005415892005496, + "grad_norm": 0.6866879463195801, + "learning_rate": 0.0002, + "loss": 0.6795, + "step": 14230 + }, + { + "epoch": 2.302158273381295, + "grad_norm": 0.9297130107879639, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 14240 + }, + { + "epoch": 2.30377495756204, + "grad_norm": 0.7501356601715088, + "learning_rate": 0.0002, + "loss": 0.6823, + "step": 14250 + }, + { + "epoch": 2.3053916417427853, + "grad_norm": 0.8363515138626099, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 14260 + }, + { + "epoch": 2.307008325923531, + "grad_norm": 0.9083868265151978, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 14270 + }, + { + "epoch": 2.3086250101042762, + "grad_norm": 0.7791516780853271, + "learning_rate": 0.0002, + "loss": 0.6862, + "step": 14280 + }, + { + "epoch": 2.3102416942850215, + "grad_norm": 0.8766953349113464, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 14290 + }, + { + "epoch": 2.3118583784657667, + "grad_norm": 0.7916635274887085, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 14300 + }, + { + "epoch": 2.313475062646512, + "grad_norm": 0.627525269985199, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 14310 + }, + { + "epoch": 2.315091746827257, + "grad_norm": 0.8856783509254456, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 14320 + }, + { + "epoch": 2.316708431008003, + "grad_norm": 0.6758689284324646, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 14330 + }, + { + "epoch": 2.318325115188748, + "grad_norm": 0.6428321003913879, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 14340 + }, + { + "epoch": 2.3199417993694933, + "grad_norm": 0.9032121300697327, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 14350 + }, + { + "epoch": 2.3215584835502385, + "grad_norm": 0.8035986423492432, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 14360 + }, + { + "epoch": 2.3231751677309838, + "grad_norm": 0.7974579334259033, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 14370 + }, + { + "epoch": 2.324791851911729, + "grad_norm": 0.8356034755706787, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 14380 + }, + { + "epoch": 2.326408536092474, + "grad_norm": 0.998760998249054, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 14390 + }, + { + "epoch": 2.3280252202732195, + "grad_norm": 0.6518142223358154, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 14400 + }, + { + "epoch": 2.3296419044539647, + "grad_norm": 0.7443506717681885, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 14410 + }, + { + "epoch": 2.3312585886347104, + "grad_norm": 0.8436172604560852, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 14420 + }, + { + "epoch": 2.3328752728154556, + "grad_norm": 0.7411080598831177, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 14430 + }, + { + "epoch": 2.334491956996201, + "grad_norm": 0.8839048743247986, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 14440 + }, + { + "epoch": 2.336108641176946, + "grad_norm": 0.8360885977745056, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 14450 + }, + { + "epoch": 2.3377253253576913, + "grad_norm": 0.7608986496925354, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 14460 + }, + { + "epoch": 2.3393420095384365, + "grad_norm": 0.8179867267608643, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 14470 + }, + { + "epoch": 2.340958693719182, + "grad_norm": 0.5989999771118164, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14480 + }, + { + "epoch": 2.3425753778999274, + "grad_norm": 0.9450054168701172, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 14490 + }, + { + "epoch": 2.3441920620806727, + "grad_norm": 0.7885149717330933, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 14500 + }, + { + "epoch": 2.345808746261418, + "grad_norm": 0.8152616620063782, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 14510 + }, + { + "epoch": 2.347425430442163, + "grad_norm": 0.7193838953971863, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 14520 + }, + { + "epoch": 2.3490421146229084, + "grad_norm": 0.6701092720031738, + "learning_rate": 0.0002, + "loss": 0.6594, + "step": 14530 + }, + { + "epoch": 2.3506587988036536, + "grad_norm": 0.7529364228248596, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 14540 + }, + { + "epoch": 2.352275482984399, + "grad_norm": 0.6599733829498291, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 14550 + }, + { + "epoch": 2.353892167165144, + "grad_norm": 0.9502474069595337, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 14560 + }, + { + "epoch": 2.3555088513458897, + "grad_norm": 0.7619650959968567, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 14570 + }, + { + "epoch": 2.357125535526635, + "grad_norm": 0.9854652285575867, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 14580 + }, + { + "epoch": 2.35874221970738, + "grad_norm": 0.727439284324646, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 14590 + }, + { + "epoch": 2.3603589038881254, + "grad_norm": 0.6994746327400208, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 14600 + }, + { + "epoch": 2.3619755880688706, + "grad_norm": 0.7117531299591064, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 14610 + }, + { + "epoch": 2.363592272249616, + "grad_norm": 0.6403067708015442, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 14620 + }, + { + "epoch": 2.3652089564303616, + "grad_norm": 0.8377841711044312, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14630 + }, + { + "epoch": 2.366825640611107, + "grad_norm": 0.749171257019043, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 14640 + }, + { + "epoch": 2.368442324791852, + "grad_norm": 0.8418586254119873, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 14650 + }, + { + "epoch": 2.3700590089725972, + "grad_norm": 0.6178573369979858, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 14660 + }, + { + "epoch": 2.3716756931533425, + "grad_norm": 0.6368302702903748, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 14670 + }, + { + "epoch": 2.3732923773340877, + "grad_norm": 0.9122977256774902, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 14680 + }, + { + "epoch": 2.374909061514833, + "grad_norm": 0.7086195349693298, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 14690 + }, + { + "epoch": 2.376525745695578, + "grad_norm": 0.7500800490379333, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 14700 + }, + { + "epoch": 2.378142429876324, + "grad_norm": 0.6634900569915771, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 14710 + }, + { + "epoch": 2.379759114057069, + "grad_norm": 0.839898407459259, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 14720 + }, + { + "epoch": 2.3813757982378143, + "grad_norm": 0.7578426003456116, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 14730 + }, + { + "epoch": 2.3829924824185595, + "grad_norm": 1.0213173627853394, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 14740 + }, + { + "epoch": 2.3846091665993048, + "grad_norm": 0.7855949401855469, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 14750 + }, + { + "epoch": 2.38622585078005, + "grad_norm": 0.7224128842353821, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 14760 + }, + { + "epoch": 2.3878425349607952, + "grad_norm": 0.8040381669998169, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 14770 + }, + { + "epoch": 2.389459219141541, + "grad_norm": 0.7705281376838684, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 14780 + }, + { + "epoch": 2.391075903322286, + "grad_norm": 0.667966902256012, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 14790 + }, + { + "epoch": 2.3926925875030314, + "grad_norm": 0.6611011028289795, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 14800 + }, + { + "epoch": 2.3943092716837766, + "grad_norm": 0.6862651705741882, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 14810 + }, + { + "epoch": 2.395925955864522, + "grad_norm": 0.8086010217666626, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 14820 + }, + { + "epoch": 2.397542640045267, + "grad_norm": 0.7189689874649048, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 14830 + }, + { + "epoch": 2.3991593242260123, + "grad_norm": 0.6280009150505066, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 14840 + }, + { + "epoch": 2.4007760084067575, + "grad_norm": 0.7826612591743469, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 14850 + }, + { + "epoch": 2.402392692587503, + "grad_norm": 0.7681610584259033, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 14860 + }, + { + "epoch": 2.4040093767682484, + "grad_norm": 0.720966100692749, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 14870 + }, + { + "epoch": 2.4056260609489937, + "grad_norm": 0.8202250599861145, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 14880 + }, + { + "epoch": 2.407242745129739, + "grad_norm": 0.786212682723999, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 14890 + }, + { + "epoch": 2.408859429310484, + "grad_norm": 0.6647164821624756, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 14900 + }, + { + "epoch": 2.4104761134912294, + "grad_norm": 0.7566399574279785, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 14910 + }, + { + "epoch": 2.4120927976719746, + "grad_norm": 0.748814582824707, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 14920 + }, + { + "epoch": 2.4137094818527203, + "grad_norm": 0.7624038457870483, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 14930 + }, + { + "epoch": 2.4153261660334655, + "grad_norm": 0.8267335295677185, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 14940 + }, + { + "epoch": 2.4169428502142107, + "grad_norm": 0.8785360455513, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 14950 + }, + { + "epoch": 2.418559534394956, + "grad_norm": 0.679887592792511, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 14960 + }, + { + "epoch": 2.420176218575701, + "grad_norm": 0.7218474745750427, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 14970 + }, + { + "epoch": 2.4217929027564464, + "grad_norm": 0.6342799663543701, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 14980 + }, + { + "epoch": 2.4234095869371917, + "grad_norm": 0.7098712921142578, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 14990 + }, + { + "epoch": 2.425026271117937, + "grad_norm": 0.7497431635856628, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 15000 + }, + { + "epoch": 2.4266429552986826, + "grad_norm": 0.934836208820343, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 15010 + }, + { + "epoch": 2.428259639479428, + "grad_norm": 0.8430966734886169, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 15020 + }, + { + "epoch": 2.429876323660173, + "grad_norm": 0.7032104730606079, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 15030 + }, + { + "epoch": 2.4314930078409183, + "grad_norm": 0.7746111750602722, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 15040 + }, + { + "epoch": 2.4331096920216635, + "grad_norm": 0.7661406397819519, + "learning_rate": 0.0002, + "loss": 0.7177, + "step": 15050 + }, + { + "epoch": 2.4347263762024087, + "grad_norm": 0.6941645741462708, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 15060 + }, + { + "epoch": 2.436343060383154, + "grad_norm": 0.7487249374389648, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 15070 + }, + { + "epoch": 2.4379597445638996, + "grad_norm": 0.7639912962913513, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 15080 + }, + { + "epoch": 2.439576428744645, + "grad_norm": 0.7708953619003296, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 15090 + }, + { + "epoch": 2.44119311292539, + "grad_norm": 0.9135832190513611, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 15100 + }, + { + "epoch": 2.4428097971061353, + "grad_norm": 0.8283005356788635, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 15110 + }, + { + "epoch": 2.4444264812868806, + "grad_norm": 0.925299346446991, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 15120 + }, + { + "epoch": 2.446043165467626, + "grad_norm": 0.7013528943061829, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 15130 + }, + { + "epoch": 2.447659849648371, + "grad_norm": 0.622303307056427, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 15140 + }, + { + "epoch": 2.4492765338291163, + "grad_norm": 0.876569390296936, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 15150 + }, + { + "epoch": 2.450893218009862, + "grad_norm": 0.6836351752281189, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 15160 + }, + { + "epoch": 2.452509902190607, + "grad_norm": 0.7886684536933899, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 15170 + }, + { + "epoch": 2.4541265863713524, + "grad_norm": 0.6647440791130066, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 15180 + }, + { + "epoch": 2.4557432705520976, + "grad_norm": 0.7477722764015198, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 15190 + }, + { + "epoch": 2.457359954732843, + "grad_norm": 0.8192033767700195, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 15200 + }, + { + "epoch": 2.458976638913588, + "grad_norm": 0.847537100315094, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 15210 + }, + { + "epoch": 2.4605933230943338, + "grad_norm": 0.9027776122093201, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 15220 + }, + { + "epoch": 2.462210007275079, + "grad_norm": 0.7217772006988525, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 15230 + }, + { + "epoch": 2.4638266914558242, + "grad_norm": 0.7994546294212341, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 15240 + }, + { + "epoch": 2.4654433756365695, + "grad_norm": 0.939916729927063, + "learning_rate": 0.0002, + "loss": 0.6828, + "step": 15250 + }, + { + "epoch": 2.4670600598173147, + "grad_norm": 1.0009053945541382, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 15260 + }, + { + "epoch": 2.46867674399806, + "grad_norm": 0.625555694103241, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 15270 + }, + { + "epoch": 2.470293428178805, + "grad_norm": 0.7924878597259521, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 15280 + }, + { + "epoch": 2.4719101123595504, + "grad_norm": 0.8536689877510071, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 15290 + }, + { + "epoch": 2.4735267965402956, + "grad_norm": 0.8572589755058289, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 15300 + }, + { + "epoch": 2.4751434807210413, + "grad_norm": 0.773279070854187, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 15310 + }, + { + "epoch": 2.4767601649017865, + "grad_norm": 0.7708749771118164, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 15320 + }, + { + "epoch": 2.4783768490825318, + "grad_norm": 0.770905077457428, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 15330 + }, + { + "epoch": 2.479993533263277, + "grad_norm": 0.8238571882247925, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 15340 + }, + { + "epoch": 2.481610217444022, + "grad_norm": 0.7670477032661438, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 15350 + }, + { + "epoch": 2.4832269016247674, + "grad_norm": 0.905036985874176, + "learning_rate": 0.0002, + "loss": 0.7759, + "step": 15360 + }, + { + "epoch": 2.484843585805513, + "grad_norm": 0.6672089695930481, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 15370 + }, + { + "epoch": 2.4864602699862584, + "grad_norm": 0.625095784664154, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 15380 + }, + { + "epoch": 2.4880769541670036, + "grad_norm": 0.679772675037384, + "learning_rate": 0.0002, + "loss": 0.6396, + "step": 15390 + }, + { + "epoch": 2.489693638347749, + "grad_norm": 0.711492121219635, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 15400 + }, + { + "epoch": 2.491310322528494, + "grad_norm": 0.876189112663269, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 15410 + }, + { + "epoch": 2.4929270067092393, + "grad_norm": 0.7236915230751038, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 15420 + }, + { + "epoch": 2.4945436908899845, + "grad_norm": 0.6629832983016968, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 15430 + }, + { + "epoch": 2.4961603750707297, + "grad_norm": 0.9756859540939331, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 15440 + }, + { + "epoch": 2.4977770592514754, + "grad_norm": 0.6896940469741821, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 15450 + }, + { + "epoch": 2.4993937434322206, + "grad_norm": 0.7105149626731873, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 15460 + }, + { + "epoch": 2.501010427612966, + "grad_norm": 0.8374546766281128, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 15470 + }, + { + "epoch": 2.502627111793711, + "grad_norm": 0.7320070266723633, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 15480 + }, + { + "epoch": 2.5042437959744563, + "grad_norm": 0.8306367993354797, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 15490 + }, + { + "epoch": 2.5058604801552016, + "grad_norm": 0.7472721338272095, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 15500 + }, + { + "epoch": 2.507477164335947, + "grad_norm": 0.6147692203521729, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 15510 + }, + { + "epoch": 2.5090938485166925, + "grad_norm": 0.7788505554199219, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 15520 + }, + { + "epoch": 2.5107105326974377, + "grad_norm": 0.8807527422904968, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 15530 + }, + { + "epoch": 2.512327216878183, + "grad_norm": 0.7521643042564392, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 15540 + }, + { + "epoch": 2.513943901058928, + "grad_norm": 0.6900225281715393, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 15550 + }, + { + "epoch": 2.5155605852396734, + "grad_norm": 0.6601938605308533, + "learning_rate": 0.0002, + "loss": 0.6769, + "step": 15560 + }, + { + "epoch": 2.5171772694204186, + "grad_norm": 0.8179984092712402, + "learning_rate": 0.0002, + "loss": 0.6648, + "step": 15570 + }, + { + "epoch": 2.518793953601164, + "grad_norm": 0.792556881904602, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 15580 + }, + { + "epoch": 2.520410637781909, + "grad_norm": 0.7081938982009888, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 15590 + }, + { + "epoch": 2.5220273219626543, + "grad_norm": 0.8733121156692505, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 15600 + }, + { + "epoch": 2.5236440061434, + "grad_norm": 0.7980992794036865, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 15610 + }, + { + "epoch": 2.5252606903241452, + "grad_norm": 0.883664071559906, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 15620 + }, + { + "epoch": 2.5268773745048905, + "grad_norm": 0.6963341236114502, + "learning_rate": 0.0002, + "loss": 0.7022, + "step": 15630 + }, + { + "epoch": 2.5284940586856357, + "grad_norm": 0.6433573365211487, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 15640 + }, + { + "epoch": 2.530110742866381, + "grad_norm": 0.8538183569908142, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 15650 + }, + { + "epoch": 2.5317274270471266, + "grad_norm": 0.9748201370239258, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 15660 + }, + { + "epoch": 2.533344111227872, + "grad_norm": 0.7670575380325317, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 15670 + }, + { + "epoch": 2.534960795408617, + "grad_norm": 0.8738890290260315, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 15680 + }, + { + "epoch": 2.5365774795893623, + "grad_norm": 0.8391636610031128, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 15690 + }, + { + "epoch": 2.5381941637701075, + "grad_norm": 0.7239366769790649, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 15700 + }, + { + "epoch": 2.5398108479508528, + "grad_norm": 0.8498379588127136, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 15710 + }, + { + "epoch": 2.541427532131598, + "grad_norm": 0.8029484152793884, + "learning_rate": 0.0002, + "loss": 0.6998, + "step": 15720 + }, + { + "epoch": 2.5430442163123432, + "grad_norm": 1.0639333724975586, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 15730 + }, + { + "epoch": 2.5446609004930885, + "grad_norm": 0.6401297450065613, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 15740 + }, + { + "epoch": 2.5462775846738337, + "grad_norm": 0.7123814821243286, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 15750 + }, + { + "epoch": 2.5478942688545794, + "grad_norm": 0.7874974608421326, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 15760 + }, + { + "epoch": 2.5495109530353246, + "grad_norm": 0.8046808838844299, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 15770 + }, + { + "epoch": 2.55112763721607, + "grad_norm": 0.7888661623001099, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 15780 + }, + { + "epoch": 2.552744321396815, + "grad_norm": 0.8445866107940674, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 15790 + }, + { + "epoch": 2.5543610055775603, + "grad_norm": 0.7475846409797668, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 15800 + }, + { + "epoch": 2.555977689758306, + "grad_norm": 0.7455102801322937, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 15810 + }, + { + "epoch": 2.557594373939051, + "grad_norm": 0.8226983547210693, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 15820 + }, + { + "epoch": 2.5592110581197964, + "grad_norm": 0.8920368552207947, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 15830 + }, + { + "epoch": 2.5608277423005417, + "grad_norm": 0.8413904905319214, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 15840 + }, + { + "epoch": 2.562444426481287, + "grad_norm": 0.8483649492263794, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 15850 + }, + { + "epoch": 2.564061110662032, + "grad_norm": 0.5923284292221069, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 15860 + }, + { + "epoch": 2.5656777948427774, + "grad_norm": 0.8518726229667664, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 15870 + }, + { + "epoch": 2.5672944790235226, + "grad_norm": 0.731235146522522, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 15880 + }, + { + "epoch": 2.568911163204268, + "grad_norm": 0.7517194151878357, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 15890 + }, + { + "epoch": 2.5705278473850135, + "grad_norm": 0.8378692269325256, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 15900 + }, + { + "epoch": 2.5721445315657587, + "grad_norm": 0.843701958656311, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 15910 + }, + { + "epoch": 2.573761215746504, + "grad_norm": 0.7254629731178284, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 15920 + }, + { + "epoch": 2.575377899927249, + "grad_norm": 0.8863335847854614, + "learning_rate": 0.0002, + "loss": 0.6872, + "step": 15930 + }, + { + "epoch": 2.5769945841079944, + "grad_norm": 0.7675097584724426, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 15940 + }, + { + "epoch": 2.5786112682887397, + "grad_norm": 0.82063889503479, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 15950 + }, + { + "epoch": 2.5802279524694853, + "grad_norm": 0.7729717493057251, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 15960 + }, + { + "epoch": 2.5818446366502306, + "grad_norm": 0.8301846981048584, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 15970 + }, + { + "epoch": 2.583461320830976, + "grad_norm": 0.7906861305236816, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 15980 + }, + { + "epoch": 2.585078005011721, + "grad_norm": 0.6749057173728943, + "learning_rate": 0.0002, + "loss": 0.7175, + "step": 15990 + }, + { + "epoch": 2.5866946891924663, + "grad_norm": 0.9386842846870422, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 16000 + }, + { + "epoch": 2.5883113733732115, + "grad_norm": 0.7868891358375549, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 16010 + }, + { + "epoch": 2.5899280575539567, + "grad_norm": 0.8674671053886414, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 16020 + }, + { + "epoch": 2.591544741734702, + "grad_norm": 0.7043559551239014, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 16030 + }, + { + "epoch": 2.593161425915447, + "grad_norm": 0.5846083760261536, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 16040 + }, + { + "epoch": 2.594778110096193, + "grad_norm": 0.7323982119560242, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 16050 + }, + { + "epoch": 2.596394794276938, + "grad_norm": 0.9069556593894958, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 16060 + }, + { + "epoch": 2.5980114784576833, + "grad_norm": 0.7522736191749573, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 16070 + }, + { + "epoch": 2.5996281626384286, + "grad_norm": 0.8149648308753967, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 16080 + }, + { + "epoch": 2.601244846819174, + "grad_norm": 0.6214233040809631, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 16090 + }, + { + "epoch": 2.602861530999919, + "grad_norm": 0.6803743839263916, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 16100 + }, + { + "epoch": 2.6044782151806647, + "grad_norm": 0.7223997116088867, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 16110 + }, + { + "epoch": 2.60609489936141, + "grad_norm": 0.7324174642562866, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 16120 + }, + { + "epoch": 2.607711583542155, + "grad_norm": 0.9594739675521851, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 16130 + }, + { + "epoch": 2.6093282677229004, + "grad_norm": 0.9485327005386353, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 16140 + }, + { + "epoch": 2.6109449519036456, + "grad_norm": 0.8449000120162964, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 16150 + }, + { + "epoch": 2.612561636084391, + "grad_norm": 0.8520140051841736, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 16160 + }, + { + "epoch": 2.614178320265136, + "grad_norm": 0.7456524968147278, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 16170 + }, + { + "epoch": 2.6157950044458813, + "grad_norm": 0.9912857413291931, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 16180 + }, + { + "epoch": 2.6174116886266265, + "grad_norm": 0.9001946449279785, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 16190 + }, + { + "epoch": 2.619028372807372, + "grad_norm": 0.6568667888641357, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 16200 + }, + { + "epoch": 2.6206450569881174, + "grad_norm": 1.0248128175735474, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 16210 + }, + { + "epoch": 2.6222617411688627, + "grad_norm": 0.6509039998054504, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 16220 + }, + { + "epoch": 2.623878425349608, + "grad_norm": 0.7626351118087769, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 16230 + }, + { + "epoch": 2.625495109530353, + "grad_norm": 0.6938552260398865, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 16240 + }, + { + "epoch": 2.6271117937110984, + "grad_norm": 0.6434680819511414, + "learning_rate": 0.0002, + "loss": 0.65, + "step": 16250 + }, + { + "epoch": 2.628728477891844, + "grad_norm": 0.7111515998840332, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 16260 + }, + { + "epoch": 2.6303451620725893, + "grad_norm": 0.7712395787239075, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 16270 + }, + { + "epoch": 2.6319618462533345, + "grad_norm": 0.792209267616272, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 16280 + }, + { + "epoch": 2.6335785304340797, + "grad_norm": 0.6801066398620605, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 16290 + }, + { + "epoch": 2.635195214614825, + "grad_norm": 0.7802573442459106, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 16300 + }, + { + "epoch": 2.63681189879557, + "grad_norm": 0.7742244601249695, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 16310 + }, + { + "epoch": 2.6384285829763154, + "grad_norm": 0.664184033870697, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 16320 + }, + { + "epoch": 2.6400452671570607, + "grad_norm": 0.9242228865623474, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 16330 + }, + { + "epoch": 2.641661951337806, + "grad_norm": 0.9661325216293335, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 16340 + }, + { + "epoch": 2.6432786355185516, + "grad_norm": 0.837526798248291, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 16350 + }, + { + "epoch": 2.644895319699297, + "grad_norm": 1.1834373474121094, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 16360 + }, + { + "epoch": 2.646512003880042, + "grad_norm": 0.7467831373214722, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 16370 + }, + { + "epoch": 2.6481286880607873, + "grad_norm": 0.8627146482467651, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 16380 + }, + { + "epoch": 2.6497453722415325, + "grad_norm": 0.790447473526001, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 16390 + }, + { + "epoch": 2.651362056422278, + "grad_norm": 0.8447365164756775, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 16400 + }, + { + "epoch": 2.6529787406030234, + "grad_norm": 0.7831417918205261, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 16410 + }, + { + "epoch": 2.6545954247837686, + "grad_norm": 0.6837952136993408, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 16420 + }, + { + "epoch": 2.656212108964514, + "grad_norm": 0.7031801342964172, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 16430 + }, + { + "epoch": 2.657828793145259, + "grad_norm": 0.8963770866394043, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 16440 + }, + { + "epoch": 2.6594454773260043, + "grad_norm": 0.6852328181266785, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 16450 + }, + { + "epoch": 2.6610621615067496, + "grad_norm": 0.8069294095039368, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 16460 + }, + { + "epoch": 2.662678845687495, + "grad_norm": 0.7503686547279358, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 16470 + }, + { + "epoch": 2.66429552986824, + "grad_norm": 0.6430956125259399, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 16480 + }, + { + "epoch": 2.6659122140489853, + "grad_norm": 0.7894312739372253, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 16490 + }, + { + "epoch": 2.667528898229731, + "grad_norm": 0.7277431488037109, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 16500 + }, + { + "epoch": 2.669145582410476, + "grad_norm": 0.6816153526306152, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 16510 + }, + { + "epoch": 2.6707622665912214, + "grad_norm": 0.8145235776901245, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 16520 + }, + { + "epoch": 2.6723789507719666, + "grad_norm": 0.8645890355110168, + "learning_rate": 0.0002, + "loss": 0.709, + "step": 16530 + }, + { + "epoch": 2.673995634952712, + "grad_norm": 0.704393208026886, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 16540 + }, + { + "epoch": 2.6756123191334575, + "grad_norm": 1.0120846033096313, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 16550 + }, + { + "epoch": 2.6772290033142028, + "grad_norm": 0.6919328570365906, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 16560 + }, + { + "epoch": 2.678845687494948, + "grad_norm": 0.6924574971199036, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 16570 + }, + { + "epoch": 2.6804623716756932, + "grad_norm": 0.9679301381111145, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 16580 + }, + { + "epoch": 2.6820790558564385, + "grad_norm": 0.6810211539268494, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 16590 + }, + { + "epoch": 2.6836957400371837, + "grad_norm": 0.9730555415153503, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 16600 + }, + { + "epoch": 2.685312424217929, + "grad_norm": 0.7852821350097656, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 16610 + }, + { + "epoch": 2.686929108398674, + "grad_norm": 0.6059057116508484, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 16620 + }, + { + "epoch": 2.6885457925794194, + "grad_norm": 0.9395958781242371, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 16630 + }, + { + "epoch": 2.690162476760165, + "grad_norm": 0.7473729848861694, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 16640 + }, + { + "epoch": 2.6917791609409103, + "grad_norm": 0.765934407711029, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 16650 + }, + { + "epoch": 2.6933958451216555, + "grad_norm": 0.8496677279472351, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 16660 + }, + { + "epoch": 2.6950125293024008, + "grad_norm": 0.7641879916191101, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 16670 + }, + { + "epoch": 2.696629213483146, + "grad_norm": 0.8471952676773071, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 16680 + }, + { + "epoch": 2.6982458976638912, + "grad_norm": 0.6946060657501221, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 16690 + }, + { + "epoch": 2.699862581844637, + "grad_norm": 0.7361312508583069, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 16700 + }, + { + "epoch": 2.701479266025382, + "grad_norm": 0.6605038046836853, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 16710 + }, + { + "epoch": 2.7030959502061274, + "grad_norm": 0.7164411544799805, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 16720 + }, + { + "epoch": 2.7047126343868726, + "grad_norm": 0.6496201157569885, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 16730 + }, + { + "epoch": 2.706329318567618, + "grad_norm": 0.7826663851737976, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 16740 + }, + { + "epoch": 2.707946002748363, + "grad_norm": 0.7639131546020508, + "learning_rate": 0.0002, + "loss": 0.6773, + "step": 16750 + }, + { + "epoch": 2.7095626869291083, + "grad_norm": 0.7976210713386536, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 16760 + }, + { + "epoch": 2.7111793711098535, + "grad_norm": 0.6836577653884888, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 16770 + }, + { + "epoch": 2.7127960552905988, + "grad_norm": 0.8025202751159668, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 16780 + }, + { + "epoch": 2.7144127394713444, + "grad_norm": 0.7636463642120361, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 16790 + }, + { + "epoch": 2.7160294236520897, + "grad_norm": 0.7481677532196045, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 16800 + }, + { + "epoch": 2.717646107832835, + "grad_norm": 0.7566834688186646, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 16810 + }, + { + "epoch": 2.71926279201358, + "grad_norm": 0.7931267619132996, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 16820 + }, + { + "epoch": 2.7208794761943254, + "grad_norm": 0.8811662197113037, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 16830 + }, + { + "epoch": 2.7224961603750706, + "grad_norm": 0.8561240434646606, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 16840 + }, + { + "epoch": 2.7241128445558163, + "grad_norm": 0.7121599316596985, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 16850 + }, + { + "epoch": 2.7257295287365615, + "grad_norm": 0.8066257238388062, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 16860 + }, + { + "epoch": 2.7273462129173067, + "grad_norm": 0.7699271440505981, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 16870 + }, + { + "epoch": 2.728962897098052, + "grad_norm": 1.1828432083129883, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 16880 + }, + { + "epoch": 2.730579581278797, + "grad_norm": 0.9989302754402161, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 16890 + }, + { + "epoch": 2.7321962654595424, + "grad_norm": 0.8100560307502747, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 16900 + }, + { + "epoch": 2.7338129496402876, + "grad_norm": 0.8615233898162842, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 16910 + }, + { + "epoch": 2.735429633821033, + "grad_norm": 0.8633756041526794, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 16920 + }, + { + "epoch": 2.737046318001778, + "grad_norm": 0.7769348621368408, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 16930 + }, + { + "epoch": 2.738663002182524, + "grad_norm": 0.6943058371543884, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 16940 + }, + { + "epoch": 2.740279686363269, + "grad_norm": 0.8510736227035522, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 16950 + }, + { + "epoch": 2.7418963705440142, + "grad_norm": 0.7732602953910828, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 16960 + }, + { + "epoch": 2.7435130547247595, + "grad_norm": 0.5981788635253906, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 16970 + }, + { + "epoch": 2.7451297389055047, + "grad_norm": 0.7604416012763977, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 16980 + }, + { + "epoch": 2.74674642308625, + "grad_norm": 0.7377738356590271, + "learning_rate": 0.0002, + "loss": 0.6981, + "step": 16990 + }, + { + "epoch": 2.7483631072669956, + "grad_norm": 0.9400289058685303, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 17000 + }, + { + "epoch": 2.749979791447741, + "grad_norm": 0.6340599656105042, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 17010 + }, + { + "epoch": 2.751596475628486, + "grad_norm": 0.7297601103782654, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 17020 + }, + { + "epoch": 2.7532131598092313, + "grad_norm": 0.9479979872703552, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 17030 + }, + { + "epoch": 2.7548298439899765, + "grad_norm": 0.8461511135101318, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 17040 + }, + { + "epoch": 2.7564465281707218, + "grad_norm": 0.7477551698684692, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 17050 + }, + { + "epoch": 2.758063212351467, + "grad_norm": 1.019270420074463, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 17060 + }, + { + "epoch": 2.7596798965322122, + "grad_norm": 0.7730235457420349, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 17070 + }, + { + "epoch": 2.7612965807129575, + "grad_norm": 0.8216866254806519, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 17080 + }, + { + "epoch": 2.762913264893703, + "grad_norm": 0.7235931754112244, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 17090 + }, + { + "epoch": 2.7645299490744484, + "grad_norm": 0.7352296710014343, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 17100 + }, + { + "epoch": 2.7661466332551936, + "grad_norm": 0.8129373788833618, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 17110 + }, + { + "epoch": 2.767763317435939, + "grad_norm": 0.7387019991874695, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 17120 + }, + { + "epoch": 2.769380001616684, + "grad_norm": 0.9149190187454224, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 17130 + }, + { + "epoch": 2.7709966857974297, + "grad_norm": 0.7352971434593201, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 17140 + }, + { + "epoch": 2.772613369978175, + "grad_norm": 0.7903780341148376, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 17150 + }, + { + "epoch": 2.77423005415892, + "grad_norm": 0.8255927562713623, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 17160 + }, + { + "epoch": 2.7758467383396654, + "grad_norm": 0.7235927581787109, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 17170 + }, + { + "epoch": 2.7774634225204107, + "grad_norm": 0.8281434774398804, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 17180 + }, + { + "epoch": 2.779080106701156, + "grad_norm": 0.7586921453475952, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 17190 + }, + { + "epoch": 2.780696790881901, + "grad_norm": 0.7161715030670166, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 17200 + }, + { + "epoch": 2.7823134750626464, + "grad_norm": 0.762868344783783, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 17210 + }, + { + "epoch": 2.7839301592433916, + "grad_norm": 0.9285483360290527, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 17220 + }, + { + "epoch": 2.785546843424137, + "grad_norm": 0.6900462508201599, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 17230 + }, + { + "epoch": 2.7871635276048825, + "grad_norm": 0.780384361743927, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 17240 + }, + { + "epoch": 2.7887802117856277, + "grad_norm": 0.7580406665802002, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 17250 + }, + { + "epoch": 2.790396895966373, + "grad_norm": 0.8145199418067932, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 17260 + }, + { + "epoch": 2.792013580147118, + "grad_norm": 0.9159596562385559, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 17270 + }, + { + "epoch": 2.7936302643278634, + "grad_norm": 0.9590014219284058, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 17280 + }, + { + "epoch": 2.795246948508609, + "grad_norm": 0.7603529691696167, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 17290 + }, + { + "epoch": 2.7968636326893543, + "grad_norm": 0.8039976358413696, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 17300 + }, + { + "epoch": 2.7984803168700996, + "grad_norm": 0.8364847302436829, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 17310 + }, + { + "epoch": 2.800097001050845, + "grad_norm": 0.8763046860694885, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 17320 + }, + { + "epoch": 2.80171368523159, + "grad_norm": 0.8409647941589355, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 17330 + }, + { + "epoch": 2.8033303694123353, + "grad_norm": 0.7649006247520447, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 17340 + }, + { + "epoch": 2.8049470535930805, + "grad_norm": 0.7970262169837952, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 17350 + }, + { + "epoch": 2.8065637377738257, + "grad_norm": 0.9088607430458069, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 17360 + }, + { + "epoch": 2.808180421954571, + "grad_norm": 0.6454846858978271, + "learning_rate": 0.0002, + "loss": 0.675, + "step": 17370 + }, + { + "epoch": 2.809797106135316, + "grad_norm": 0.7744787931442261, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 17380 + }, + { + "epoch": 2.811413790316062, + "grad_norm": 0.6678640842437744, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 17390 + }, + { + "epoch": 2.813030474496807, + "grad_norm": 0.772676944732666, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 17400 + }, + { + "epoch": 2.8146471586775523, + "grad_norm": 0.7088175415992737, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 17410 + }, + { + "epoch": 2.8162638428582976, + "grad_norm": 0.8280573487281799, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 17420 + }, + { + "epoch": 2.817880527039043, + "grad_norm": 0.6665388345718384, + "learning_rate": 0.0002, + "loss": 0.6732, + "step": 17430 + }, + { + "epoch": 2.8194972112197885, + "grad_norm": 0.6427883505821228, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 17440 + }, + { + "epoch": 2.8211138954005337, + "grad_norm": 0.9697760343551636, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 17450 + }, + { + "epoch": 2.822730579581279, + "grad_norm": 0.7573966383934021, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 17460 + }, + { + "epoch": 2.824347263762024, + "grad_norm": 0.878688633441925, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 17470 + }, + { + "epoch": 2.8259639479427694, + "grad_norm": 0.7752242684364319, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 17480 + }, + { + "epoch": 2.8275806321235146, + "grad_norm": 0.6135398745536804, + "learning_rate": 0.0002, + "loss": 0.6638, + "step": 17490 + }, + { + "epoch": 2.82919731630426, + "grad_norm": 0.6924924850463867, + "learning_rate": 0.0002, + "loss": 0.6829, + "step": 17500 + }, + { + "epoch": 2.830814000485005, + "grad_norm": 0.7471627593040466, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 17510 + }, + { + "epoch": 2.8324306846657503, + "grad_norm": 0.7145499587059021, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 17520 + }, + { + "epoch": 2.834047368846496, + "grad_norm": 0.7415414452552795, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 17530 + }, + { + "epoch": 2.8356640530272412, + "grad_norm": 0.7328441739082336, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 17540 + }, + { + "epoch": 2.8372807372079865, + "grad_norm": 0.8267839550971985, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 17550 + }, + { + "epoch": 2.8388974213887317, + "grad_norm": 0.8877885341644287, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 17560 + }, + { + "epoch": 2.840514105569477, + "grad_norm": 0.857138454914093, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 17570 + }, + { + "epoch": 2.842130789750222, + "grad_norm": 0.8470779657363892, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 17580 + }, + { + "epoch": 2.843747473930968, + "grad_norm": 0.8553254008293152, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 17590 + }, + { + "epoch": 2.845364158111713, + "grad_norm": 0.8033196926116943, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 17600 + }, + { + "epoch": 2.8469808422924583, + "grad_norm": 0.7949087023735046, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 17610 + }, + { + "epoch": 2.8485975264732035, + "grad_norm": 0.9241406321525574, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 17620 + }, + { + "epoch": 2.8502142106539488, + "grad_norm": 0.7721285223960876, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 17630 + }, + { + "epoch": 2.851830894834694, + "grad_norm": 1.0246692895889282, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 17640 + }, + { + "epoch": 2.853447579015439, + "grad_norm": 0.9244589805603027, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 17650 + }, + { + "epoch": 2.8550642631961844, + "grad_norm": 0.7243508696556091, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 17660 + }, + { + "epoch": 2.8566809473769297, + "grad_norm": 0.8943371176719666, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 17670 + }, + { + "epoch": 2.8582976315576754, + "grad_norm": 0.6531758904457092, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 17680 + }, + { + "epoch": 2.8599143157384206, + "grad_norm": 0.8367000818252563, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 17690 + }, + { + "epoch": 2.861530999919166, + "grad_norm": 0.7868556380271912, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 17700 + }, + { + "epoch": 2.863147684099911, + "grad_norm": 0.7213859558105469, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 17710 + }, + { + "epoch": 2.8647643682806563, + "grad_norm": 0.7383931279182434, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 17720 + }, + { + "epoch": 2.8663810524614015, + "grad_norm": 0.7566812634468079, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 17730 + }, + { + "epoch": 2.867997736642147, + "grad_norm": 0.6930373311042786, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 17740 + }, + { + "epoch": 2.8696144208228924, + "grad_norm": 0.7911090850830078, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 17750 + }, + { + "epoch": 2.8712311050036377, + "grad_norm": 0.8484548926353455, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 17760 + }, + { + "epoch": 2.872847789184383, + "grad_norm": 0.7647597193717957, + "learning_rate": 0.0002, + "loss": 0.6931, + "step": 17770 + }, + { + "epoch": 2.874464473365128, + "grad_norm": 0.8791151642799377, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 17780 + }, + { + "epoch": 2.8760811575458733, + "grad_norm": 0.7253178358078003, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 17790 + }, + { + "epoch": 2.8776978417266186, + "grad_norm": 0.7956077456474304, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 17800 + }, + { + "epoch": 2.879314525907364, + "grad_norm": 0.8657688498497009, + "learning_rate": 0.0002, + "loss": 0.6687, + "step": 17810 + }, + { + "epoch": 2.880931210088109, + "grad_norm": 0.7059141993522644, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 17820 + }, + { + "epoch": 2.8825478942688547, + "grad_norm": 0.8886896967887878, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 17830 + }, + { + "epoch": 2.8841645784496, + "grad_norm": 0.821032702922821, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 17840 + }, + { + "epoch": 2.885781262630345, + "grad_norm": 0.7183963656425476, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 17850 + }, + { + "epoch": 2.8873979468110904, + "grad_norm": 0.6222899556159973, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 17860 + }, + { + "epoch": 2.8890146309918356, + "grad_norm": 0.8187434077262878, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 17870 + }, + { + "epoch": 2.890631315172581, + "grad_norm": 0.9838479161262512, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 17880 + }, + { + "epoch": 2.8922479993533265, + "grad_norm": 0.7567742466926575, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 17890 + }, + { + "epoch": 2.893864683534072, + "grad_norm": 0.6875903606414795, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 17900 + }, + { + "epoch": 2.895481367714817, + "grad_norm": 0.8043789267539978, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 17910 + }, + { + "epoch": 2.8970980518955622, + "grad_norm": 0.8062626719474792, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 17920 + }, + { + "epoch": 2.8987147360763075, + "grad_norm": 1.0251191854476929, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 17930 + }, + { + "epoch": 2.9003314202570527, + "grad_norm": 0.882253110408783, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 17940 + }, + { + "epoch": 2.901948104437798, + "grad_norm": 0.8683299422264099, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 17950 + }, + { + "epoch": 2.903564788618543, + "grad_norm": 0.7167282104492188, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17960 + }, + { + "epoch": 2.9051814727992884, + "grad_norm": 0.7093694806098938, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 17970 + }, + { + "epoch": 2.906798156980034, + "grad_norm": 0.8549879193305969, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 17980 + }, + { + "epoch": 2.9084148411607793, + "grad_norm": 0.6989606618881226, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 17990 + }, + { + "epoch": 2.9100315253415245, + "grad_norm": 0.9482976794242859, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 18000 + }, + { + "epoch": 2.9116482095222698, + "grad_norm": 0.7182440161705017, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 18010 + }, + { + "epoch": 2.913264893703015, + "grad_norm": 0.7732226252555847, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 18020 + }, + { + "epoch": 2.9148815778837607, + "grad_norm": 0.7936875224113464, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 18030 + }, + { + "epoch": 2.916498262064506, + "grad_norm": 0.8825615644454956, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 18040 + }, + { + "epoch": 2.918114946245251, + "grad_norm": 0.6778587102890015, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 18050 + }, + { + "epoch": 2.9197316304259964, + "grad_norm": 0.7529265880584717, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 18060 + }, + { + "epoch": 2.9213483146067416, + "grad_norm": 0.7111883163452148, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 18070 + }, + { + "epoch": 2.922964998787487, + "grad_norm": 0.7214767932891846, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 18080 + }, + { + "epoch": 2.924581682968232, + "grad_norm": 0.800417423248291, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 18090 + }, + { + "epoch": 2.9261983671489773, + "grad_norm": 1.248575210571289, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 18100 + }, + { + "epoch": 2.9278150513297225, + "grad_norm": 0.757788360118866, + "learning_rate": 0.0002, + "loss": 0.7004, + "step": 18110 + }, + { + "epoch": 2.9294317355104678, + "grad_norm": 1.0583995580673218, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 18120 + }, + { + "epoch": 2.9310484196912134, + "grad_norm": 0.8228777647018433, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 18130 + }, + { + "epoch": 2.9326651038719587, + "grad_norm": 0.8374035358428955, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 18140 + }, + { + "epoch": 2.934281788052704, + "grad_norm": 0.7976473569869995, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 18150 + }, + { + "epoch": 2.935898472233449, + "grad_norm": 0.8009907603263855, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 18160 + }, + { + "epoch": 2.9375151564141944, + "grad_norm": 0.835213303565979, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 18170 + }, + { + "epoch": 2.93913184059494, + "grad_norm": 0.7982219457626343, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 18180 + }, + { + "epoch": 2.9407485247756853, + "grad_norm": 0.7070978879928589, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 18190 + }, + { + "epoch": 2.9423652089564305, + "grad_norm": 0.8619440197944641, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 18200 + }, + { + "epoch": 2.9439818931371757, + "grad_norm": 0.6693987250328064, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 18210 + }, + { + "epoch": 2.945598577317921, + "grad_norm": 0.6747021079063416, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 18220 + }, + { + "epoch": 2.947215261498666, + "grad_norm": 0.860387921333313, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 18230 + }, + { + "epoch": 2.9488319456794114, + "grad_norm": 0.799976646900177, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 18240 + }, + { + "epoch": 2.9504486298601567, + "grad_norm": 0.7864769101142883, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 18250 + }, + { + "epoch": 2.952065314040902, + "grad_norm": 0.6713884472846985, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 18260 + }, + { + "epoch": 2.9536819982216476, + "grad_norm": 0.9031508564949036, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 18270 + }, + { + "epoch": 2.955298682402393, + "grad_norm": 0.7205073237419128, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 18280 + }, + { + "epoch": 2.956915366583138, + "grad_norm": 0.7746205925941467, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 18290 + }, + { + "epoch": 2.9585320507638833, + "grad_norm": 0.6533427834510803, + "learning_rate": 0.0002, + "loss": 0.6218, + "step": 18300 + }, + { + "epoch": 2.9601487349446285, + "grad_norm": 0.9083208441734314, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 18310 + }, + { + "epoch": 2.9617654191253737, + "grad_norm": 0.7446991801261902, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 18320 + }, + { + "epoch": 2.9633821033061194, + "grad_norm": 0.6514461636543274, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 18330 + }, + { + "epoch": 2.9649987874868646, + "grad_norm": 0.8580465912818909, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 18340 + }, + { + "epoch": 2.96661547166761, + "grad_norm": 0.7074266076087952, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 18350 + }, + { + "epoch": 2.968232155848355, + "grad_norm": 0.899892270565033, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 18360 + }, + { + "epoch": 2.9698488400291003, + "grad_norm": 0.8217641711235046, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 18370 + }, + { + "epoch": 2.9714655242098456, + "grad_norm": 0.8611799478530884, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 18380 + }, + { + "epoch": 2.973082208390591, + "grad_norm": 0.6909302473068237, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 18390 + }, + { + "epoch": 2.974698892571336, + "grad_norm": 0.6554358005523682, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 18400 + }, + { + "epoch": 2.9763155767520812, + "grad_norm": 0.7803071737289429, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 18410 + }, + { + "epoch": 2.977932260932827, + "grad_norm": 0.7838954925537109, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 18420 + }, + { + "epoch": 2.979548945113572, + "grad_norm": 0.7098495364189148, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 18430 + }, + { + "epoch": 2.9811656292943174, + "grad_norm": 0.8981785774230957, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 18440 + }, + { + "epoch": 2.9827823134750626, + "grad_norm": 0.7197171449661255, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 18450 + }, + { + "epoch": 2.984398997655808, + "grad_norm": 0.793185293674469, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 18460 + }, + { + "epoch": 2.986015681836553, + "grad_norm": 0.8531473875045776, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 18470 + }, + { + "epoch": 2.9876323660172988, + "grad_norm": 0.6627361178398132, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 18480 + }, + { + "epoch": 2.989249050198044, + "grad_norm": 0.5708155035972595, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 18490 + }, + { + "epoch": 2.990865734378789, + "grad_norm": 0.8227280378341675, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 18500 + }, + { + "epoch": 2.9924824185595345, + "grad_norm": 0.7102749943733215, + "learning_rate": 0.0002, + "loss": 0.6701, + "step": 18510 + }, + { + "epoch": 2.9940991027402797, + "grad_norm": 0.839485228061676, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 18520 + }, + { + "epoch": 2.995715786921025, + "grad_norm": 0.9038704037666321, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 18530 + }, + { + "epoch": 2.99733247110177, + "grad_norm": 0.8737510442733765, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 18540 + }, + { + "epoch": 2.9989491552825154, + "grad_norm": 0.7323142886161804, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 18550 + }, + { + "epoch": 2.9999191657909625, + "eval_loss": 1.1262480020523071, + "eval_runtime": 122.0868, + "eval_samples_per_second": 6.004, + "eval_steps_per_second": 0.754, + "step": 18556 + }, + { + "epoch": 3.000565839463261, + "grad_norm": 0.8465463519096375, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 18560 + }, + { + "epoch": 3.0021825236440063, + "grad_norm": 0.9134138822555542, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 18570 + }, + { + "epoch": 3.0037992078247515, + "grad_norm": 0.760715126991272, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 18580 + }, + { + "epoch": 3.0054158920054967, + "grad_norm": 0.9208743572235107, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 18590 + }, + { + "epoch": 3.007032576186242, + "grad_norm": 0.9232364892959595, + "learning_rate": 0.0002, + "loss": 0.5731, + "step": 18600 + }, + { + "epoch": 3.008649260366987, + "grad_norm": 1.1881544589996338, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 18610 + }, + { + "epoch": 3.0102659445477324, + "grad_norm": 0.9372987747192383, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 18620 + }, + { + "epoch": 3.0118826287284777, + "grad_norm": 0.6900241374969482, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 18630 + }, + { + "epoch": 3.0134993129092233, + "grad_norm": 0.8451071381568909, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 18640 + }, + { + "epoch": 3.0151159970899686, + "grad_norm": 0.7763112187385559, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 18650 + }, + { + "epoch": 3.016732681270714, + "grad_norm": 1.043653964996338, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 18660 + }, + { + "epoch": 3.018349365451459, + "grad_norm": 1.0170660018920898, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 18670 + }, + { + "epoch": 3.0199660496322043, + "grad_norm": 0.7534180283546448, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 18680 + }, + { + "epoch": 3.0215827338129495, + "grad_norm": 0.7507367730140686, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 18690 + }, + { + "epoch": 3.0231994179936947, + "grad_norm": 0.7861620187759399, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 18700 + }, + { + "epoch": 3.0248161021744404, + "grad_norm": 1.0580339431762695, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 18710 + }, + { + "epoch": 3.0264327863551856, + "grad_norm": 0.7542710900306702, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 18720 + }, + { + "epoch": 3.028049470535931, + "grad_norm": 0.8189544677734375, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 18730 + }, + { + "epoch": 3.029666154716676, + "grad_norm": 0.9126611351966858, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 18740 + }, + { + "epoch": 3.0312828388974213, + "grad_norm": 0.8891341686248779, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 18750 + }, + { + "epoch": 3.0328995230781666, + "grad_norm": 0.8419283032417297, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 18760 + }, + { + "epoch": 3.034516207258912, + "grad_norm": 0.8048048615455627, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 18770 + }, + { + "epoch": 3.0361328914396575, + "grad_norm": 0.7820217609405518, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 18780 + }, + { + "epoch": 3.0377495756204027, + "grad_norm": 0.854721188545227, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 18790 + }, + { + "epoch": 3.039366259801148, + "grad_norm": 0.912092924118042, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 18800 + }, + { + "epoch": 3.040982943981893, + "grad_norm": 0.6596226096153259, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 18810 + }, + { + "epoch": 3.0425996281626384, + "grad_norm": 0.6351348757743835, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 18820 + }, + { + "epoch": 3.0442163123433836, + "grad_norm": 0.778188943862915, + "learning_rate": 0.0002, + "loss": 0.5953, + "step": 18830 + }, + { + "epoch": 3.045832996524129, + "grad_norm": 0.68234783411026, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 18840 + }, + { + "epoch": 3.047449680704874, + "grad_norm": 0.998628556728363, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 18850 + }, + { + "epoch": 3.0490663648856198, + "grad_norm": 0.7393841743469238, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 18860 + }, + { + "epoch": 3.050683049066365, + "grad_norm": 0.84438556432724, + "learning_rate": 0.0002, + "loss": 0.568, + "step": 18870 + }, + { + "epoch": 3.0522997332471102, + "grad_norm": 0.8857501745223999, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 18880 + }, + { + "epoch": 3.0539164174278555, + "grad_norm": 0.7208474278450012, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 18890 + }, + { + "epoch": 3.0555331016086007, + "grad_norm": 0.7135229110717773, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 18900 + }, + { + "epoch": 3.057149785789346, + "grad_norm": 0.9130001664161682, + "learning_rate": 0.0002, + "loss": 0.5575, + "step": 18910 + }, + { + "epoch": 3.058766469970091, + "grad_norm": 0.9001716375350952, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 18920 + }, + { + "epoch": 3.060383154150837, + "grad_norm": 0.8667559623718262, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 18930 + }, + { + "epoch": 3.061999838331582, + "grad_norm": 0.8943959474563599, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 18940 + }, + { + "epoch": 3.0636165225123273, + "grad_norm": 0.8298377990722656, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 18950 + }, + { + "epoch": 3.0652332066930725, + "grad_norm": 0.7935267686843872, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 18960 + }, + { + "epoch": 3.0668498908738178, + "grad_norm": 1.1506379842758179, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 18970 + }, + { + "epoch": 3.068466575054563, + "grad_norm": 0.7693049907684326, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 18980 + }, + { + "epoch": 3.0700832592353082, + "grad_norm": 0.8040135502815247, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 18990 + }, + { + "epoch": 3.0716999434160535, + "grad_norm": 0.828404426574707, + "learning_rate": 0.0002, + "loss": 0.5541, + "step": 19000 + }, + { + "epoch": 3.073316627596799, + "grad_norm": 0.8811164498329163, + "learning_rate": 0.0002, + "loss": 0.6048, + "step": 19010 + }, + { + "epoch": 3.0749333117775444, + "grad_norm": 1.036205768585205, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 19020 + }, + { + "epoch": 3.0765499959582896, + "grad_norm": 0.8857285976409912, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 19030 + }, + { + "epoch": 3.078166680139035, + "grad_norm": 0.8392079472541809, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 19040 + }, + { + "epoch": 3.07978336431978, + "grad_norm": 1.0287401676177979, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 19050 + }, + { + "epoch": 3.0814000485005253, + "grad_norm": 1.0086315870285034, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 19060 + }, + { + "epoch": 3.0830167326812705, + "grad_norm": 0.9245324730873108, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 19070 + }, + { + "epoch": 3.084633416862016, + "grad_norm": 0.8680877089500427, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 19080 + }, + { + "epoch": 3.0862501010427614, + "grad_norm": 0.8814793825149536, + "learning_rate": 0.0002, + "loss": 0.5611, + "step": 19090 + }, + { + "epoch": 3.0878667852235067, + "grad_norm": 0.9234458208084106, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 19100 + }, + { + "epoch": 3.089483469404252, + "grad_norm": 1.1291664838790894, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 19110 + }, + { + "epoch": 3.091100153584997, + "grad_norm": 0.9191402792930603, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 19120 + }, + { + "epoch": 3.0927168377657424, + "grad_norm": 0.7103154063224792, + "learning_rate": 0.0002, + "loss": 0.5856, + "step": 19130 + }, + { + "epoch": 3.0943335219464876, + "grad_norm": 0.9368883967399597, + "learning_rate": 0.0002, + "loss": 0.6479, + "step": 19140 + }, + { + "epoch": 3.095950206127233, + "grad_norm": 0.9676656723022461, + "learning_rate": 0.0002, + "loss": 0.6167, + "step": 19150 + }, + { + "epoch": 3.0975668903079785, + "grad_norm": 0.8739792704582214, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 19160 + }, + { + "epoch": 3.0991835744887237, + "grad_norm": 0.8530174493789673, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 19170 + }, + { + "epoch": 3.100800258669469, + "grad_norm": 0.794945478439331, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 19180 + }, + { + "epoch": 3.102416942850214, + "grad_norm": 0.9508888125419617, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 19190 + }, + { + "epoch": 3.1040336270309594, + "grad_norm": 1.0599955320358276, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 19200 + }, + { + "epoch": 3.1056503112117047, + "grad_norm": 1.0673625469207764, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 19210 + }, + { + "epoch": 3.10726699539245, + "grad_norm": 0.7739115953445435, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 19220 + }, + { + "epoch": 3.1088836795731956, + "grad_norm": 0.9884951114654541, + "learning_rate": 0.0002, + "loss": 0.6046, + "step": 19230 + }, + { + "epoch": 3.110500363753941, + "grad_norm": 0.862260103225708, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 19240 + }, + { + "epoch": 3.112117047934686, + "grad_norm": 0.7690284848213196, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 19250 + }, + { + "epoch": 3.1137337321154313, + "grad_norm": 0.8758958578109741, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 19260 + }, + { + "epoch": 3.1153504162961765, + "grad_norm": 1.0356395244598389, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 19270 + }, + { + "epoch": 3.1169671004769217, + "grad_norm": 0.6950937509536743, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 19280 + }, + { + "epoch": 3.118583784657667, + "grad_norm": 0.760998010635376, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 19290 + }, + { + "epoch": 3.1202004688384126, + "grad_norm": 0.9335789084434509, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 19300 + }, + { + "epoch": 3.121817153019158, + "grad_norm": 0.9636204242706299, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 19310 + }, + { + "epoch": 3.123433837199903, + "grad_norm": 1.0820997953414917, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 19320 + }, + { + "epoch": 3.1250505213806483, + "grad_norm": 0.7333487272262573, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 19330 + }, + { + "epoch": 3.1266672055613935, + "grad_norm": 1.0417509078979492, + "learning_rate": 0.0002, + "loss": 0.6178, + "step": 19340 + }, + { + "epoch": 3.128283889742139, + "grad_norm": 0.9267749190330505, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 19350 + }, + { + "epoch": 3.129900573922884, + "grad_norm": 0.777798593044281, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 19360 + }, + { + "epoch": 3.1315172581036297, + "grad_norm": 0.8425456881523132, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 19370 + }, + { + "epoch": 3.133133942284375, + "grad_norm": 0.9617102146148682, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 19380 + }, + { + "epoch": 3.13475062646512, + "grad_norm": 1.0052828788757324, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 19390 + }, + { + "epoch": 3.1363673106458654, + "grad_norm": 0.7637009024620056, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 19400 + }, + { + "epoch": 3.1379839948266106, + "grad_norm": 0.7958088517189026, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 19410 + }, + { + "epoch": 3.139600679007356, + "grad_norm": 0.9161727428436279, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 19420 + }, + { + "epoch": 3.141217363188101, + "grad_norm": 0.8402149677276611, + "learning_rate": 0.0002, + "loss": 0.5638, + "step": 19430 + }, + { + "epoch": 3.1428340473688463, + "grad_norm": 1.0056525468826294, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 19440 + }, + { + "epoch": 3.144450731549592, + "grad_norm": 1.0129190683364868, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 19450 + }, + { + "epoch": 3.146067415730337, + "grad_norm": 0.790825366973877, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 19460 + }, + { + "epoch": 3.1476840999110824, + "grad_norm": 1.441665530204773, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 19470 + }, + { + "epoch": 3.1493007840918277, + "grad_norm": 0.7846331596374512, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 19480 + }, + { + "epoch": 3.150917468272573, + "grad_norm": 0.7915332913398743, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 19490 + }, + { + "epoch": 3.152534152453318, + "grad_norm": 0.933982253074646, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 19500 + }, + { + "epoch": 3.1541508366340634, + "grad_norm": 1.038408637046814, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 19510 + }, + { + "epoch": 3.155767520814809, + "grad_norm": 1.018935203552246, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 19520 + }, + { + "epoch": 3.1573842049955543, + "grad_norm": 0.9618112444877625, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 19530 + }, + { + "epoch": 3.1590008891762995, + "grad_norm": 0.8900452852249146, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 19540 + }, + { + "epoch": 3.1606175733570447, + "grad_norm": 0.8254160284996033, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 19550 + }, + { + "epoch": 3.16223425753779, + "grad_norm": 1.004376769065857, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 19560 + }, + { + "epoch": 3.163850941718535, + "grad_norm": 1.0490446090698242, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 19570 + }, + { + "epoch": 3.1654676258992804, + "grad_norm": 0.7387403845787048, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 19580 + }, + { + "epoch": 3.1670843100800257, + "grad_norm": 0.7611538171768188, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 19590 + }, + { + "epoch": 3.1687009942607713, + "grad_norm": 0.8239886164665222, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 19600 + }, + { + "epoch": 3.1703176784415166, + "grad_norm": 0.9327243566513062, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 19610 + }, + { + "epoch": 3.171934362622262, + "grad_norm": 0.9662560224533081, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 19620 + }, + { + "epoch": 3.173551046803007, + "grad_norm": 0.9183341860771179, + "learning_rate": 0.0002, + "loss": 0.5988, + "step": 19630 + }, + { + "epoch": 3.1751677309837523, + "grad_norm": 0.875066876411438, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 19640 + }, + { + "epoch": 3.1767844151644975, + "grad_norm": 0.8567508459091187, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 19650 + }, + { + "epoch": 3.1784010993452427, + "grad_norm": 0.6805780529975891, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 19660 + }, + { + "epoch": 3.1800177835259884, + "grad_norm": 0.8776944279670715, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 19670 + }, + { + "epoch": 3.1816344677067336, + "grad_norm": 0.9036329984664917, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 19680 + }, + { + "epoch": 3.183251151887479, + "grad_norm": 0.8527372479438782, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 19690 + }, + { + "epoch": 3.184867836068224, + "grad_norm": 1.1045585870742798, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 19700 + }, + { + "epoch": 3.1864845202489693, + "grad_norm": 0.9213830828666687, + "learning_rate": 0.0002, + "loss": 0.5491, + "step": 19710 + }, + { + "epoch": 3.1881012044297146, + "grad_norm": 0.8865814805030823, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 19720 + }, + { + "epoch": 3.18971788861046, + "grad_norm": 0.7939388751983643, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 19730 + }, + { + "epoch": 3.191334572791205, + "grad_norm": 0.6966729760169983, + "learning_rate": 0.0002, + "loss": 0.5682, + "step": 19740 + }, + { + "epoch": 3.1929512569719507, + "grad_norm": 0.8023673295974731, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 19750 + }, + { + "epoch": 3.194567941152696, + "grad_norm": 0.7992037534713745, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 19760 + }, + { + "epoch": 3.196184625333441, + "grad_norm": 0.7412247657775879, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 19770 + }, + { + "epoch": 3.1978013095141864, + "grad_norm": 0.9598729014396667, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 19780 + }, + { + "epoch": 3.1994179936949316, + "grad_norm": 0.8331366777420044, + "learning_rate": 0.0002, + "loss": 0.5685, + "step": 19790 + }, + { + "epoch": 3.201034677875677, + "grad_norm": 0.8939169645309448, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 19800 + }, + { + "epoch": 3.202651362056422, + "grad_norm": 0.9219734072685242, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 19810 + }, + { + "epoch": 3.2042680462371678, + "grad_norm": 0.869490385055542, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 19820 + }, + { + "epoch": 3.205884730417913, + "grad_norm": 0.8989706635475159, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 19830 + }, + { + "epoch": 3.2075014145986582, + "grad_norm": 0.8477165102958679, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 19840 + }, + { + "epoch": 3.2091180987794035, + "grad_norm": 0.8720678687095642, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 19850 + }, + { + "epoch": 3.2107347829601487, + "grad_norm": 0.861406683921814, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 19860 + }, + { + "epoch": 3.212351467140894, + "grad_norm": 0.8228686451911926, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 19870 + }, + { + "epoch": 3.213968151321639, + "grad_norm": 0.7936596870422363, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 19880 + }, + { + "epoch": 3.2155848355023844, + "grad_norm": 1.097377896308899, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 19890 + }, + { + "epoch": 3.21720151968313, + "grad_norm": 0.9544782638549805, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 19900 + }, + { + "epoch": 3.2188182038638753, + "grad_norm": 0.8240751624107361, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 19910 + }, + { + "epoch": 3.2204348880446205, + "grad_norm": 0.8332096338272095, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 19920 + }, + { + "epoch": 3.2220515722253658, + "grad_norm": 1.0954567193984985, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 19930 + }, + { + "epoch": 3.223668256406111, + "grad_norm": 0.7790525555610657, + "learning_rate": 0.0002, + "loss": 0.652, + "step": 19940 + }, + { + "epoch": 3.225284940586856, + "grad_norm": 0.7966814041137695, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 19950 + }, + { + "epoch": 3.2269016247676015, + "grad_norm": 0.9751881957054138, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 19960 + }, + { + "epoch": 3.228518308948347, + "grad_norm": 0.9856047630310059, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 19970 + }, + { + "epoch": 3.2301349931290924, + "grad_norm": 1.3062353134155273, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 19980 + }, + { + "epoch": 3.2317516773098376, + "grad_norm": 0.9510692358016968, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 19990 + }, + { + "epoch": 3.233368361490583, + "grad_norm": 0.8630342483520508, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 20000 + }, + { + "epoch": 3.234985045671328, + "grad_norm": 0.8966519236564636, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 20010 + }, + { + "epoch": 3.2366017298520733, + "grad_norm": 0.7093510627746582, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 20020 + }, + { + "epoch": 3.2382184140328185, + "grad_norm": 0.7771096229553223, + "learning_rate": 0.0002, + "loss": 0.585, + "step": 20030 + }, + { + "epoch": 3.2398350982135637, + "grad_norm": 0.841058075428009, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 20040 + }, + { + "epoch": 3.2414517823943094, + "grad_norm": 0.909712553024292, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 20050 + }, + { + "epoch": 3.2430684665750547, + "grad_norm": 0.8321019411087036, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 20060 + }, + { + "epoch": 3.2446851507558, + "grad_norm": 0.779901921749115, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 20070 + }, + { + "epoch": 3.246301834936545, + "grad_norm": 0.6249170303344727, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 20080 + }, + { + "epoch": 3.2479185191172903, + "grad_norm": 0.8000940680503845, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 20090 + }, + { + "epoch": 3.2495352032980356, + "grad_norm": 0.7627735137939453, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 20100 + }, + { + "epoch": 3.2511518874787813, + "grad_norm": 0.8780747056007385, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 20110 + }, + { + "epoch": 3.2527685716595265, + "grad_norm": 0.772037148475647, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 20120 + }, + { + "epoch": 3.2543852558402717, + "grad_norm": 1.0086580514907837, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 20130 + }, + { + "epoch": 3.256001940021017, + "grad_norm": 0.9360289573669434, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 20140 + }, + { + "epoch": 3.257618624201762, + "grad_norm": 1.2099586725234985, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 20150 + }, + { + "epoch": 3.2592353083825074, + "grad_norm": 0.8368481397628784, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 20160 + }, + { + "epoch": 3.2608519925632526, + "grad_norm": 0.7391039133071899, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 20170 + }, + { + "epoch": 3.262468676743998, + "grad_norm": 0.9122273325920105, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 20180 + }, + { + "epoch": 3.264085360924743, + "grad_norm": 0.8502281904220581, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 20190 + }, + { + "epoch": 3.265702045105489, + "grad_norm": 1.0926852226257324, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 20200 + }, + { + "epoch": 3.267318729286234, + "grad_norm": 0.7902828454971313, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 20210 + }, + { + "epoch": 3.2689354134669792, + "grad_norm": 0.8724729418754578, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 20220 + }, + { + "epoch": 3.2705520976477245, + "grad_norm": 0.8469277024269104, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 20230 + }, + { + "epoch": 3.2721687818284697, + "grad_norm": 0.8865092992782593, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 20240 + }, + { + "epoch": 3.273785466009215, + "grad_norm": 1.0979334115982056, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 20250 + }, + { + "epoch": 3.2754021501899606, + "grad_norm": 1.0860793590545654, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 20260 + }, + { + "epoch": 3.277018834370706, + "grad_norm": 0.981745183467865, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 20270 + }, + { + "epoch": 3.278635518551451, + "grad_norm": 0.9155020713806152, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 20280 + }, + { + "epoch": 3.2802522027321963, + "grad_norm": 0.8436718583106995, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 20290 + }, + { + "epoch": 3.2818688869129415, + "grad_norm": 1.0329409837722778, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 20300 + }, + { + "epoch": 3.2834855710936868, + "grad_norm": 0.9876394271850586, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 20310 + }, + { + "epoch": 3.285102255274432, + "grad_norm": 0.8052917718887329, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 20320 + }, + { + "epoch": 3.2867189394551772, + "grad_norm": 0.8390680551528931, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 20330 + }, + { + "epoch": 3.288335623635923, + "grad_norm": 0.9515735507011414, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 20340 + }, + { + "epoch": 3.289952307816668, + "grad_norm": 0.8028870224952698, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 20350 + }, + { + "epoch": 3.2915689919974134, + "grad_norm": 0.862592339515686, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 20360 + }, + { + "epoch": 3.2931856761781586, + "grad_norm": 0.7451621890068054, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 20370 + }, + { + "epoch": 3.294802360358904, + "grad_norm": 0.8966776728630066, + "learning_rate": 0.0002, + "loss": 0.6458, + "step": 20380 + }, + { + "epoch": 3.296419044539649, + "grad_norm": 0.9289216995239258, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 20390 + }, + { + "epoch": 3.2980357287203943, + "grad_norm": 0.9649626612663269, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 20400 + }, + { + "epoch": 3.29965241290114, + "grad_norm": 1.1953798532485962, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 20410 + }, + { + "epoch": 3.301269097081885, + "grad_norm": 0.8929083943367004, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 20420 + }, + { + "epoch": 3.3028857812626304, + "grad_norm": 0.8922014236450195, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 20430 + }, + { + "epoch": 3.3045024654433757, + "grad_norm": 0.9754860401153564, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 20440 + }, + { + "epoch": 3.306119149624121, + "grad_norm": 0.8873140215873718, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 20450 + }, + { + "epoch": 3.307735833804866, + "grad_norm": 0.857271671295166, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 20460 + }, + { + "epoch": 3.3093525179856114, + "grad_norm": 0.9022141098976135, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 20470 + }, + { + "epoch": 3.3109692021663566, + "grad_norm": 0.8614798188209534, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 20480 + }, + { + "epoch": 3.3125858863471023, + "grad_norm": 0.8838164210319519, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 20490 + }, + { + "epoch": 3.3142025705278475, + "grad_norm": 0.8709736466407776, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 20500 + }, + { + "epoch": 3.3158192547085927, + "grad_norm": 0.9533300995826721, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 20510 + }, + { + "epoch": 3.317435938889338, + "grad_norm": 0.8259269595146179, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 20520 + }, + { + "epoch": 3.319052623070083, + "grad_norm": 0.8607608079910278, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 20530 + }, + { + "epoch": 3.3206693072508284, + "grad_norm": 1.0863020420074463, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 20540 + }, + { + "epoch": 3.3222859914315737, + "grad_norm": 1.011489987373352, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 20550 + }, + { + "epoch": 3.3239026756123193, + "grad_norm": 0.6952177882194519, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 20560 + }, + { + "epoch": 3.3255193597930646, + "grad_norm": 0.9638974070549011, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 20570 + }, + { + "epoch": 3.32713604397381, + "grad_norm": 1.0310138463974, + "learning_rate": 0.0002, + "loss": 0.5882, + "step": 20580 + }, + { + "epoch": 3.328752728154555, + "grad_norm": 0.9371318221092224, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 20590 + }, + { + "epoch": 3.3303694123353003, + "grad_norm": 0.8756691813468933, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 20600 + }, + { + "epoch": 3.3319860965160455, + "grad_norm": 1.054175853729248, + "learning_rate": 0.0002, + "loss": 0.5994, + "step": 20610 + }, + { + "epoch": 3.3336027806967907, + "grad_norm": 0.9074128270149231, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 20620 + }, + { + "epoch": 3.335219464877536, + "grad_norm": 0.906900942325592, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 20630 + }, + { + "epoch": 3.3368361490582816, + "grad_norm": 0.8689333200454712, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 20640 + }, + { + "epoch": 3.338452833239027, + "grad_norm": 0.9889747500419617, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 20650 + }, + { + "epoch": 3.340069517419772, + "grad_norm": 1.0685805082321167, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 20660 + }, + { + "epoch": 3.3416862016005173, + "grad_norm": 0.7495010495185852, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 20670 + }, + { + "epoch": 3.3433028857812626, + "grad_norm": 0.8747848272323608, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 20680 + }, + { + "epoch": 3.344919569962008, + "grad_norm": 0.9762673377990723, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 20690 + }, + { + "epoch": 3.346536254142753, + "grad_norm": 1.0284489393234253, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 20700 + }, + { + "epoch": 3.3481529383234987, + "grad_norm": 0.7293812036514282, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 20710 + }, + { + "epoch": 3.349769622504244, + "grad_norm": 0.8330199122428894, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 20720 + }, + { + "epoch": 3.351386306684989, + "grad_norm": 0.9808499217033386, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 20730 + }, + { + "epoch": 3.3530029908657344, + "grad_norm": 0.9508825540542603, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 20740 + }, + { + "epoch": 3.3546196750464796, + "grad_norm": 0.790483832359314, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 20750 + }, + { + "epoch": 3.356236359227225, + "grad_norm": 1.022793173789978, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 20760 + }, + { + "epoch": 3.35785304340797, + "grad_norm": 0.8318950533866882, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 20770 + }, + { + "epoch": 3.3594697275887153, + "grad_norm": 0.7980858087539673, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 20780 + }, + { + "epoch": 3.361086411769461, + "grad_norm": 0.8114802241325378, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 20790 + }, + { + "epoch": 3.3627030959502062, + "grad_norm": 0.8522519469261169, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 20800 + }, + { + "epoch": 3.3643197801309515, + "grad_norm": 0.9142431616783142, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 20810 + }, + { + "epoch": 3.3659364643116967, + "grad_norm": 0.771170437335968, + "learning_rate": 0.0002, + "loss": 0.6331, + "step": 20820 + }, + { + "epoch": 3.367553148492442, + "grad_norm": 1.0628231763839722, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 20830 + }, + { + "epoch": 3.369169832673187, + "grad_norm": 0.9384352564811707, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 20840 + }, + { + "epoch": 3.370786516853933, + "grad_norm": 1.1286591291427612, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 20850 + }, + { + "epoch": 3.372403201034678, + "grad_norm": 1.1349513530731201, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 20860 + }, + { + "epoch": 3.3740198852154233, + "grad_norm": 1.0127464532852173, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 20870 + }, + { + "epoch": 3.3756365693961685, + "grad_norm": 0.9111971855163574, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 20880 + }, + { + "epoch": 3.3772532535769137, + "grad_norm": 0.871356725692749, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 20890 + }, + { + "epoch": 3.378869937757659, + "grad_norm": 0.7774117588996887, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 20900 + }, + { + "epoch": 3.380486621938404, + "grad_norm": 1.0089964866638184, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 20910 + }, + { + "epoch": 3.3821033061191494, + "grad_norm": 0.7855867147445679, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 20920 + }, + { + "epoch": 3.3837199902998947, + "grad_norm": 1.3713710308074951, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 20930 + }, + { + "epoch": 3.3853366744806404, + "grad_norm": 0.8599116206169128, + "learning_rate": 0.0002, + "loss": 0.6627, + "step": 20940 + }, + { + "epoch": 3.3869533586613856, + "grad_norm": 0.9392673373222351, + "learning_rate": 0.0002, + "loss": 0.6224, + "step": 20950 + }, + { + "epoch": 3.388570042842131, + "grad_norm": 0.8764075040817261, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 20960 + }, + { + "epoch": 3.390186727022876, + "grad_norm": 0.8240136504173279, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 20970 + }, + { + "epoch": 3.3918034112036213, + "grad_norm": 1.0982369184494019, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 20980 + }, + { + "epoch": 3.3934200953843665, + "grad_norm": 1.0599013566970825, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 20990 + }, + { + "epoch": 3.395036779565112, + "grad_norm": 0.895438015460968, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 21000 + }, + { + "epoch": 3.3966534637458574, + "grad_norm": 0.6974841356277466, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 21010 + }, + { + "epoch": 3.3982701479266026, + "grad_norm": 0.9571719765663147, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 21020 + }, + { + "epoch": 3.399886832107348, + "grad_norm": 0.831912636756897, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 21030 + }, + { + "epoch": 3.401503516288093, + "grad_norm": 0.831936240196228, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 21040 + }, + { + "epoch": 3.4031202004688383, + "grad_norm": 0.7388373613357544, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 21050 + }, + { + "epoch": 3.4047368846495836, + "grad_norm": 0.938667356967926, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 21060 + }, + { + "epoch": 3.406353568830329, + "grad_norm": 0.9202313423156738, + "learning_rate": 0.0002, + "loss": 0.6121, + "step": 21070 + }, + { + "epoch": 3.4079702530110745, + "grad_norm": 0.9888381958007812, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 21080 + }, + { + "epoch": 3.4095869371918197, + "grad_norm": 0.8526970744132996, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 21090 + }, + { + "epoch": 3.411203621372565, + "grad_norm": 0.7939383387565613, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 21100 + }, + { + "epoch": 3.41282030555331, + "grad_norm": 0.9986352920532227, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 21110 + }, + { + "epoch": 3.4144369897340554, + "grad_norm": 0.8895300030708313, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 21120 + }, + { + "epoch": 3.4160536739148006, + "grad_norm": 0.9559482932090759, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 21130 + }, + { + "epoch": 3.417670358095546, + "grad_norm": 0.8351506590843201, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 21140 + }, + { + "epoch": 3.4192870422762915, + "grad_norm": 0.8224456906318665, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 21150 + }, + { + "epoch": 3.4209037264570368, + "grad_norm": 1.0110299587249756, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 21160 + }, + { + "epoch": 3.422520410637782, + "grad_norm": 0.82564777135849, + "learning_rate": 0.0002, + "loss": 0.6116, + "step": 21170 + }, + { + "epoch": 3.4241370948185272, + "grad_norm": 1.004738688468933, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 21180 + }, + { + "epoch": 3.4257537789992725, + "grad_norm": 0.7545676827430725, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 21190 + }, + { + "epoch": 3.4273704631800177, + "grad_norm": 0.8918704390525818, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 21200 + }, + { + "epoch": 3.428987147360763, + "grad_norm": 0.8336876034736633, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 21210 + }, + { + "epoch": 3.430603831541508, + "grad_norm": 0.8928771018981934, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 21220 + }, + { + "epoch": 3.432220515722254, + "grad_norm": 0.7663705945014954, + "learning_rate": 0.0002, + "loss": 0.6468, + "step": 21230 + }, + { + "epoch": 3.433837199902999, + "grad_norm": 0.8392598628997803, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 21240 + }, + { + "epoch": 3.4354538840837443, + "grad_norm": 0.8819600343704224, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 21250 + }, + { + "epoch": 3.4370705682644895, + "grad_norm": 0.9124642014503479, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 21260 + }, + { + "epoch": 3.4386872524452348, + "grad_norm": 0.8329763412475586, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 21270 + }, + { + "epoch": 3.44030393662598, + "grad_norm": 0.9982839822769165, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 21280 + }, + { + "epoch": 3.4419206208067252, + "grad_norm": 0.9105954766273499, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 21290 + }, + { + "epoch": 3.443537304987471, + "grad_norm": 0.8182359337806702, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 21300 + }, + { + "epoch": 3.445153989168216, + "grad_norm": 1.0568904876708984, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 21310 + }, + { + "epoch": 3.4467706733489614, + "grad_norm": 0.968539834022522, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 21320 + }, + { + "epoch": 3.4483873575297066, + "grad_norm": 0.8774511218070984, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 21330 + }, + { + "epoch": 3.450004041710452, + "grad_norm": 0.7598156332969666, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 21340 + }, + { + "epoch": 3.451620725891197, + "grad_norm": 1.1012897491455078, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 21350 + }, + { + "epoch": 3.4532374100719423, + "grad_norm": 0.8040637373924255, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 21360 + }, + { + "epoch": 3.4548540942526875, + "grad_norm": 0.8497496247291565, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 21370 + }, + { + "epoch": 3.456470778433433, + "grad_norm": 0.8429915904998779, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 21380 + }, + { + "epoch": 3.4580874626141784, + "grad_norm": 0.8107112646102905, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 21390 + }, + { + "epoch": 3.4597041467949237, + "grad_norm": 1.00872004032135, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 21400 + }, + { + "epoch": 3.461320830975669, + "grad_norm": 0.8266542553901672, + "learning_rate": 0.0002, + "loss": 0.5712, + "step": 21410 + }, + { + "epoch": 3.462937515156414, + "grad_norm": 0.8972568511962891, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 21420 + }, + { + "epoch": 3.4645541993371594, + "grad_norm": 1.0781476497650146, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 21430 + }, + { + "epoch": 3.4661708835179046, + "grad_norm": 0.9571592807769775, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 21440 + }, + { + "epoch": 3.4677875676986503, + "grad_norm": 0.881547212600708, + "learning_rate": 0.0002, + "loss": 0.6309, + "step": 21450 + }, + { + "epoch": 3.4694042518793955, + "grad_norm": 0.6955338716506958, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 21460 + }, + { + "epoch": 3.4710209360601407, + "grad_norm": 0.901187539100647, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 21470 + }, + { + "epoch": 3.472637620240886, + "grad_norm": 0.7063511610031128, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 21480 + }, + { + "epoch": 3.474254304421631, + "grad_norm": 0.8462792038917542, + "learning_rate": 0.0002, + "loss": 0.6154, + "step": 21490 + }, + { + "epoch": 3.4758709886023764, + "grad_norm": 1.1861060857772827, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 21500 + }, + { + "epoch": 3.4774876727831217, + "grad_norm": 0.70503169298172, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 21510 + }, + { + "epoch": 3.479104356963867, + "grad_norm": 0.9650066494941711, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 21520 + }, + { + "epoch": 3.4807210411446126, + "grad_norm": 1.0266852378845215, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 21530 + }, + { + "epoch": 3.482337725325358, + "grad_norm": 0.956372857093811, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 21540 + }, + { + "epoch": 3.483954409506103, + "grad_norm": 0.8848432898521423, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 21550 + }, + { + "epoch": 3.4855710936868483, + "grad_norm": 1.0805351734161377, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 21560 + }, + { + "epoch": 3.4871877778675935, + "grad_norm": 0.9279725551605225, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 21570 + }, + { + "epoch": 3.4888044620483387, + "grad_norm": 0.9049562215805054, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 21580 + }, + { + "epoch": 3.4904211462290844, + "grad_norm": 0.9619429111480713, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 21590 + }, + { + "epoch": 3.4920378304098296, + "grad_norm": 0.8508906960487366, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 21600 + }, + { + "epoch": 3.493654514590575, + "grad_norm": 0.8692502379417419, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 21610 + }, + { + "epoch": 3.49527119877132, + "grad_norm": 0.8187332153320312, + "learning_rate": 0.0002, + "loss": 0.5878, + "step": 21620 + }, + { + "epoch": 3.4968878829520653, + "grad_norm": 1.145400047302246, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 21630 + }, + { + "epoch": 3.4985045671328105, + "grad_norm": 0.8281388282775879, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 21640 + }, + { + "epoch": 3.500121251313556, + "grad_norm": 0.82256019115448, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 21650 + }, + { + "epoch": 3.501737935494301, + "grad_norm": 0.9315484762191772, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 21660 + }, + { + "epoch": 3.5033546196750462, + "grad_norm": 0.7626111507415771, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 21670 + }, + { + "epoch": 3.504971303855792, + "grad_norm": 0.9275059103965759, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 21680 + }, + { + "epoch": 3.506587988036537, + "grad_norm": 0.7906724810600281, + "learning_rate": 0.0002, + "loss": 0.64, + "step": 21690 + }, + { + "epoch": 3.5082046722172824, + "grad_norm": 0.8289761543273926, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 21700 + }, + { + "epoch": 3.5098213563980276, + "grad_norm": 0.8316431045532227, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 21710 + }, + { + "epoch": 3.511438040578773, + "grad_norm": 1.0451812744140625, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 21720 + }, + { + "epoch": 3.513054724759518, + "grad_norm": 0.928252637386322, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 21730 + }, + { + "epoch": 3.5146714089402638, + "grad_norm": 0.7985895276069641, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 21740 + }, + { + "epoch": 3.516288093121009, + "grad_norm": 0.6740974187850952, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 21750 + }, + { + "epoch": 3.517904777301754, + "grad_norm": 0.8482223749160767, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 21760 + }, + { + "epoch": 3.5195214614824994, + "grad_norm": 0.889947772026062, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 21770 + }, + { + "epoch": 3.5211381456632447, + "grad_norm": 0.8304598927497864, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 21780 + }, + { + "epoch": 3.52275482984399, + "grad_norm": 0.8002981543540955, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 21790 + }, + { + "epoch": 3.524371514024735, + "grad_norm": 0.8115083575248718, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 21800 + }, + { + "epoch": 3.5259881982054804, + "grad_norm": 0.9715048670768738, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 21810 + }, + { + "epoch": 3.5276048823862256, + "grad_norm": 1.0910786390304565, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 21820 + }, + { + "epoch": 3.5292215665669713, + "grad_norm": 0.8438942432403564, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 21830 + }, + { + "epoch": 3.5308382507477165, + "grad_norm": 0.8813382983207703, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 21840 + }, + { + "epoch": 3.5324549349284617, + "grad_norm": 0.7092908024787903, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 21850 + }, + { + "epoch": 3.534071619109207, + "grad_norm": 0.8332187533378601, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 21860 + }, + { + "epoch": 3.535688303289952, + "grad_norm": 0.8958209156990051, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 21870 + }, + { + "epoch": 3.5373049874706974, + "grad_norm": 0.824138879776001, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 21880 + }, + { + "epoch": 3.538921671651443, + "grad_norm": 0.8375158309936523, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 21890 + }, + { + "epoch": 3.5405383558321883, + "grad_norm": 1.0274608135223389, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 21900 + }, + { + "epoch": 3.5421550400129336, + "grad_norm": 0.7088932394981384, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 21910 + }, + { + "epoch": 3.543771724193679, + "grad_norm": 0.8172445297241211, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 21920 + }, + { + "epoch": 3.545388408374424, + "grad_norm": 0.9904135465621948, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 21930 + }, + { + "epoch": 3.5470050925551693, + "grad_norm": 0.9900432229042053, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 21940 + }, + { + "epoch": 3.5486217767359145, + "grad_norm": 0.8963301181793213, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 21950 + }, + { + "epoch": 3.5502384609166597, + "grad_norm": 0.8551464676856995, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 21960 + }, + { + "epoch": 3.551855145097405, + "grad_norm": 1.0916603803634644, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 21970 + }, + { + "epoch": 3.5534718292781506, + "grad_norm": 0.841598391532898, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 21980 + }, + { + "epoch": 3.555088513458896, + "grad_norm": 0.8566757440567017, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 21990 + }, + { + "epoch": 3.556705197639641, + "grad_norm": 1.0145052671432495, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 22000 + }, + { + "epoch": 3.5583218818203863, + "grad_norm": 0.9293754696846008, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 22010 + }, + { + "epoch": 3.5599385660011316, + "grad_norm": 0.9568536281585693, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 22020 + }, + { + "epoch": 3.5615552501818772, + "grad_norm": 0.8613139986991882, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 22030 + }, + { + "epoch": 3.5631719343626225, + "grad_norm": 0.8179237246513367, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 22040 + }, + { + "epoch": 3.5647886185433677, + "grad_norm": 0.9059830904006958, + "learning_rate": 0.0002, + "loss": 0.574, + "step": 22050 + }, + { + "epoch": 3.566405302724113, + "grad_norm": 1.0068252086639404, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 22060 + }, + { + "epoch": 3.568021986904858, + "grad_norm": 0.9682072997093201, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 22070 + }, + { + "epoch": 3.5696386710856034, + "grad_norm": 0.8514005541801453, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 22080 + }, + { + "epoch": 3.5712553552663486, + "grad_norm": 0.8327770829200745, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 22090 + }, + { + "epoch": 3.572872039447094, + "grad_norm": 1.024976372718811, + "learning_rate": 0.0002, + "loss": 0.5976, + "step": 22100 + }, + { + "epoch": 3.574488723627839, + "grad_norm": 0.7721174955368042, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 22110 + }, + { + "epoch": 3.5761054078085843, + "grad_norm": 1.0351054668426514, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 22120 + }, + { + "epoch": 3.57772209198933, + "grad_norm": 0.9680907130241394, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 22130 + }, + { + "epoch": 3.5793387761700752, + "grad_norm": 0.8016974925994873, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 22140 + }, + { + "epoch": 3.5809554603508205, + "grad_norm": 1.0109003782272339, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 22150 + }, + { + "epoch": 3.5825721445315657, + "grad_norm": 1.0473392009735107, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 22160 + }, + { + "epoch": 3.584188828712311, + "grad_norm": 0.8686613440513611, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 22170 + }, + { + "epoch": 3.5858055128930566, + "grad_norm": 0.869149923324585, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 22180 + }, + { + "epoch": 3.587422197073802, + "grad_norm": 0.9769062995910645, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 22190 + }, + { + "epoch": 3.589038881254547, + "grad_norm": 0.779636561870575, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 22200 + }, + { + "epoch": 3.5906555654352923, + "grad_norm": 0.9063841104507446, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 22210 + }, + { + "epoch": 3.5922722496160375, + "grad_norm": 0.9216037392616272, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 22220 + }, + { + "epoch": 3.5938889337967828, + "grad_norm": 1.0217336416244507, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 22230 + }, + { + "epoch": 3.595505617977528, + "grad_norm": 0.8513161540031433, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 22240 + }, + { + "epoch": 3.597122302158273, + "grad_norm": 0.8084813952445984, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 22250 + }, + { + "epoch": 3.5987389863390185, + "grad_norm": 0.8524802923202515, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 22260 + }, + { + "epoch": 3.600355670519764, + "grad_norm": 0.9356237649917603, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 22270 + }, + { + "epoch": 3.6019723547005094, + "grad_norm": 1.009600281715393, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 22280 + }, + { + "epoch": 3.6035890388812546, + "grad_norm": 0.9900581240653992, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 22290 + }, + { + "epoch": 3.605205723062, + "grad_norm": 1.062495231628418, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 22300 + }, + { + "epoch": 3.606822407242745, + "grad_norm": 0.8832381367683411, + "learning_rate": 0.0002, + "loss": 0.607, + "step": 22310 + }, + { + "epoch": 3.6084390914234903, + "grad_norm": 0.9284297823905945, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 22320 + }, + { + "epoch": 3.610055775604236, + "grad_norm": 1.2381829023361206, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 22330 + }, + { + "epoch": 3.611672459784981, + "grad_norm": 0.929434597492218, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 22340 + }, + { + "epoch": 3.6132891439657264, + "grad_norm": 0.9714490175247192, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 22350 + }, + { + "epoch": 3.6149058281464717, + "grad_norm": 0.808014988899231, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 22360 + }, + { + "epoch": 3.616522512327217, + "grad_norm": 1.0364398956298828, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 22370 + }, + { + "epoch": 3.618139196507962, + "grad_norm": 0.7858489751815796, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 22380 + }, + { + "epoch": 3.6197558806887074, + "grad_norm": 0.9920870065689087, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 22390 + }, + { + "epoch": 3.6213725648694526, + "grad_norm": 0.9183220863342285, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 22400 + }, + { + "epoch": 3.622989249050198, + "grad_norm": 0.9826246500015259, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 22410 + }, + { + "epoch": 3.6246059332309435, + "grad_norm": 0.8632931113243103, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 22420 + }, + { + "epoch": 3.6262226174116887, + "grad_norm": 0.8468965291976929, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 22430 + }, + { + "epoch": 3.627839301592434, + "grad_norm": 0.8466871976852417, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 22440 + }, + { + "epoch": 3.629455985773179, + "grad_norm": 0.9501169919967651, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 22450 + }, + { + "epoch": 3.6310726699539244, + "grad_norm": 0.8906720876693726, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 22460 + }, + { + "epoch": 3.6326893541346696, + "grad_norm": 0.7400227189064026, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 22470 + }, + { + "epoch": 3.6343060383154153, + "grad_norm": 0.9756355881690979, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 22480 + }, + { + "epoch": 3.6359227224961606, + "grad_norm": 0.7504993081092834, + "learning_rate": 0.0002, + "loss": 0.6203, + "step": 22490 + }, + { + "epoch": 3.637539406676906, + "grad_norm": 0.9270039200782776, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 22500 + }, + { + "epoch": 3.639156090857651, + "grad_norm": 0.8841686844825745, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 22510 + }, + { + "epoch": 3.6407727750383962, + "grad_norm": 0.8533213138580322, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 22520 + }, + { + "epoch": 3.6423894592191415, + "grad_norm": 1.0052043199539185, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 22530 + }, + { + "epoch": 3.6440061433998867, + "grad_norm": 1.0323461294174194, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 22540 + }, + { + "epoch": 3.645622827580632, + "grad_norm": 0.8654312491416931, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 22550 + }, + { + "epoch": 3.647239511761377, + "grad_norm": 0.6400038003921509, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 22560 + }, + { + "epoch": 3.648856195942123, + "grad_norm": 0.8061298727989197, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 22570 + }, + { + "epoch": 3.650472880122868, + "grad_norm": 0.9257854223251343, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 22580 + }, + { + "epoch": 3.6520895643036133, + "grad_norm": 0.8439396619796753, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 22590 + }, + { + "epoch": 3.6537062484843585, + "grad_norm": 0.7764544486999512, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 22600 + }, + { + "epoch": 3.6553229326651038, + "grad_norm": 1.125451683998108, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 22610 + }, + { + "epoch": 3.656939616845849, + "grad_norm": 0.7523018717765808, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 22620 + }, + { + "epoch": 3.6585563010265947, + "grad_norm": 1.071026086807251, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 22630 + }, + { + "epoch": 3.66017298520734, + "grad_norm": 0.945791482925415, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 22640 + }, + { + "epoch": 3.661789669388085, + "grad_norm": 0.8001811504364014, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 22650 + }, + { + "epoch": 3.6634063535688304, + "grad_norm": 0.9700816869735718, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 22660 + }, + { + "epoch": 3.6650230377495756, + "grad_norm": 0.9053242206573486, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 22670 + }, + { + "epoch": 3.666639721930321, + "grad_norm": 0.944362461566925, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 22680 + }, + { + "epoch": 3.668256406111066, + "grad_norm": 1.067489504814148, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 22690 + }, + { + "epoch": 3.6698730902918113, + "grad_norm": 1.0984995365142822, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 22700 + }, + { + "epoch": 3.6714897744725565, + "grad_norm": 0.9336317777633667, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 22710 + }, + { + "epoch": 3.673106458653302, + "grad_norm": 0.9261918663978577, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 22720 + }, + { + "epoch": 3.6747231428340474, + "grad_norm": 0.8648008704185486, + "learning_rate": 0.0002, + "loss": 0.6332, + "step": 22730 + }, + { + "epoch": 3.6763398270147927, + "grad_norm": 0.7225083708763123, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 22740 + }, + { + "epoch": 3.677956511195538, + "grad_norm": 0.9258282780647278, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 22750 + }, + { + "epoch": 3.679573195376283, + "grad_norm": 0.70876145362854, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 22760 + }, + { + "epoch": 3.681189879557029, + "grad_norm": 0.8780210018157959, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 22770 + }, + { + "epoch": 3.682806563737774, + "grad_norm": 0.8075440526008606, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 22780 + }, + { + "epoch": 3.6844232479185193, + "grad_norm": 0.8503130674362183, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 22790 + }, + { + "epoch": 3.6860399320992645, + "grad_norm": 0.8413618206977844, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 22800 + }, + { + "epoch": 3.6876566162800097, + "grad_norm": 0.8675165176391602, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 22810 + }, + { + "epoch": 3.689273300460755, + "grad_norm": 0.8235884308815002, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 22820 + }, + { + "epoch": 3.6908899846415, + "grad_norm": 0.9477725625038147, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 22830 + }, + { + "epoch": 3.6925066688222454, + "grad_norm": 0.7883533835411072, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 22840 + }, + { + "epoch": 3.6941233530029907, + "grad_norm": 1.047913908958435, + "learning_rate": 0.0002, + "loss": 0.5872, + "step": 22850 + }, + { + "epoch": 3.695740037183736, + "grad_norm": 0.9171528816223145, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 22860 + }, + { + "epoch": 3.6973567213644816, + "grad_norm": 0.9338192343711853, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 22870 + }, + { + "epoch": 3.698973405545227, + "grad_norm": 0.8799443244934082, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 22880 + }, + { + "epoch": 3.700590089725972, + "grad_norm": 0.8515434861183167, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 22890 + }, + { + "epoch": 3.7022067739067173, + "grad_norm": 0.7805591821670532, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 22900 + }, + { + "epoch": 3.7038234580874625, + "grad_norm": 0.8470911979675293, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 22910 + }, + { + "epoch": 3.705440142268208, + "grad_norm": 0.9452309012413025, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 22920 + }, + { + "epoch": 3.7070568264489534, + "grad_norm": 0.950243353843689, + "learning_rate": 0.0002, + "loss": 0.6529, + "step": 22930 + }, + { + "epoch": 3.7086735106296986, + "grad_norm": 0.7882499098777771, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 22940 + }, + { + "epoch": 3.710290194810444, + "grad_norm": 0.8307787775993347, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 22950 + }, + { + "epoch": 3.711906878991189, + "grad_norm": 1.0970630645751953, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 22960 + }, + { + "epoch": 3.7135235631719343, + "grad_norm": 0.8269566297531128, + "learning_rate": 0.0002, + "loss": 0.6281, + "step": 22970 + }, + { + "epoch": 3.7151402473526796, + "grad_norm": 0.8306704759597778, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 22980 + }, + { + "epoch": 3.716756931533425, + "grad_norm": 0.9710225462913513, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 22990 + }, + { + "epoch": 3.71837361571417, + "grad_norm": 0.8890530467033386, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 23000 + }, + { + "epoch": 3.7199902998949153, + "grad_norm": 0.883522629737854, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 23010 + }, + { + "epoch": 3.721606984075661, + "grad_norm": 0.8662652373313904, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 23020 + }, + { + "epoch": 3.723223668256406, + "grad_norm": 0.7228406667709351, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 23030 + }, + { + "epoch": 3.7248403524371514, + "grad_norm": 1.060792088508606, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 23040 + }, + { + "epoch": 3.7264570366178966, + "grad_norm": 1.0119613409042358, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 23050 + }, + { + "epoch": 3.728073720798642, + "grad_norm": 0.9212996959686279, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 23060 + }, + { + "epoch": 3.7296904049793875, + "grad_norm": 0.925690233707428, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 23070 + }, + { + "epoch": 3.7313070891601328, + "grad_norm": 0.8323310613632202, + "learning_rate": 0.0002, + "loss": 0.615, + "step": 23080 + }, + { + "epoch": 3.732923773340878, + "grad_norm": 0.8966048955917358, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 23090 + }, + { + "epoch": 3.7345404575216232, + "grad_norm": 0.8995837569236755, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 23100 + }, + { + "epoch": 3.7361571417023685, + "grad_norm": 0.8748890161514282, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 23110 + }, + { + "epoch": 3.7377738258831137, + "grad_norm": 0.7985540628433228, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 23120 + }, + { + "epoch": 3.739390510063859, + "grad_norm": 1.0240917205810547, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 23130 + }, + { + "epoch": 3.741007194244604, + "grad_norm": 0.9181789755821228, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 23140 + }, + { + "epoch": 3.7426238784253494, + "grad_norm": 0.8896583914756775, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 23150 + }, + { + "epoch": 3.744240562606095, + "grad_norm": 0.8635515570640564, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 23160 + }, + { + "epoch": 3.7458572467868403, + "grad_norm": 0.8873575329780579, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 23170 + }, + { + "epoch": 3.7474739309675855, + "grad_norm": 0.9807148575782776, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 23180 + }, + { + "epoch": 3.7490906151483308, + "grad_norm": 0.900477945804596, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 23190 + }, + { + "epoch": 3.750707299329076, + "grad_norm": 0.9379992485046387, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 23200 + }, + { + "epoch": 3.752323983509821, + "grad_norm": 0.9649890661239624, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 23210 + }, + { + "epoch": 3.753940667690567, + "grad_norm": 0.824442446231842, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 23220 + }, + { + "epoch": 3.755557351871312, + "grad_norm": 0.8896150588989258, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 23230 + }, + { + "epoch": 3.7571740360520574, + "grad_norm": 0.751249372959137, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 23240 + }, + { + "epoch": 3.7587907202328026, + "grad_norm": 0.9392193555831909, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 23250 + }, + { + "epoch": 3.760407404413548, + "grad_norm": 0.9284586310386658, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 23260 + }, + { + "epoch": 3.762024088594293, + "grad_norm": 0.7738175392150879, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 23270 + }, + { + "epoch": 3.7636407727750383, + "grad_norm": 0.9252978563308716, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 23280 + }, + { + "epoch": 3.7652574569557835, + "grad_norm": 0.9501895904541016, + "learning_rate": 0.0002, + "loss": 0.5984, + "step": 23290 + }, + { + "epoch": 3.7668741411365287, + "grad_norm": 0.9416276216506958, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 23300 + }, + { + "epoch": 3.7684908253172744, + "grad_norm": 0.7076631784439087, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 23310 + }, + { + "epoch": 3.7701075094980196, + "grad_norm": 0.9864492416381836, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 23320 + }, + { + "epoch": 3.771724193678765, + "grad_norm": 0.8450456261634827, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 23330 + }, + { + "epoch": 3.77334087785951, + "grad_norm": 1.0768941640853882, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 23340 + }, + { + "epoch": 3.7749575620402553, + "grad_norm": 0.9956819415092468, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 23350 + }, + { + "epoch": 3.7765742462210006, + "grad_norm": 0.9234658479690552, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 23360 + }, + { + "epoch": 3.7781909304017463, + "grad_norm": 1.0993858575820923, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 23370 + }, + { + "epoch": 3.7798076145824915, + "grad_norm": 0.923159658908844, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 23380 + }, + { + "epoch": 3.7814242987632367, + "grad_norm": 0.9311541318893433, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 23390 + }, + { + "epoch": 3.783040982943982, + "grad_norm": 0.919681191444397, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 23400 + }, + { + "epoch": 3.784657667124727, + "grad_norm": 1.7406195402145386, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 23410 + }, + { + "epoch": 3.7862743513054724, + "grad_norm": 0.7789074182510376, + "learning_rate": 0.0002, + "loss": 0.657, + "step": 23420 + }, + { + "epoch": 3.7878910354862176, + "grad_norm": 0.8302814960479736, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 23430 + }, + { + "epoch": 3.789507719666963, + "grad_norm": 0.8089349269866943, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 23440 + }, + { + "epoch": 3.791124403847708, + "grad_norm": 0.9006284475326538, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 23450 + }, + { + "epoch": 3.7927410880284538, + "grad_norm": 0.8426766991615295, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 23460 + }, + { + "epoch": 3.794357772209199, + "grad_norm": 1.2576252222061157, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 23470 + }, + { + "epoch": 3.7959744563899442, + "grad_norm": 1.0307610034942627, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 23480 + }, + { + "epoch": 3.7975911405706895, + "grad_norm": 0.8525972962379456, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 23490 + }, + { + "epoch": 3.7992078247514347, + "grad_norm": 1.159039855003357, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 23500 + }, + { + "epoch": 3.80082450893218, + "grad_norm": 1.4193549156188965, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 23510 + }, + { + "epoch": 3.8024411931129256, + "grad_norm": 0.8245543837547302, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 23520 + }, + { + "epoch": 3.804057877293671, + "grad_norm": 0.8847230076789856, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 23530 + }, + { + "epoch": 3.805674561474416, + "grad_norm": 0.9574624300003052, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 23540 + }, + { + "epoch": 3.8072912456551613, + "grad_norm": 1.048020601272583, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 23550 + }, + { + "epoch": 3.8089079298359065, + "grad_norm": 0.8302255868911743, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 23560 + }, + { + "epoch": 3.8105246140166518, + "grad_norm": 0.8269215822219849, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 23570 + }, + { + "epoch": 3.812141298197397, + "grad_norm": 0.9375753402709961, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 23580 + }, + { + "epoch": 3.8137579823781422, + "grad_norm": 1.0234097242355347, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 23590 + }, + { + "epoch": 3.8153746665588875, + "grad_norm": 0.8978445529937744, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 23600 + }, + { + "epoch": 3.816991350739633, + "grad_norm": 0.7929515838623047, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 23610 + }, + { + "epoch": 3.8186080349203784, + "grad_norm": 1.3255881071090698, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 23620 + }, + { + "epoch": 3.8202247191011236, + "grad_norm": 0.9188598990440369, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 23630 + }, + { + "epoch": 3.821841403281869, + "grad_norm": 0.8811675906181335, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 23640 + }, + { + "epoch": 3.823458087462614, + "grad_norm": 0.8061038255691528, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 23650 + }, + { + "epoch": 3.8250747716433597, + "grad_norm": 0.9975376129150391, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 23660 + }, + { + "epoch": 3.826691455824105, + "grad_norm": 0.8036105036735535, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 23670 + }, + { + "epoch": 3.82830814000485, + "grad_norm": 0.7401984333992004, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 23680 + }, + { + "epoch": 3.8299248241855954, + "grad_norm": 0.829753041267395, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 23690 + }, + { + "epoch": 3.8315415083663407, + "grad_norm": 0.8753240704536438, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 23700 + }, + { + "epoch": 3.833158192547086, + "grad_norm": 0.8157842755317688, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 23710 + }, + { + "epoch": 3.834774876727831, + "grad_norm": 0.6183798909187317, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 23720 + }, + { + "epoch": 3.8363915609085764, + "grad_norm": 0.9548442363739014, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 23730 + }, + { + "epoch": 3.8380082450893216, + "grad_norm": 0.8319669961929321, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 23740 + }, + { + "epoch": 3.839624929270067, + "grad_norm": 0.9718693494796753, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 23750 + }, + { + "epoch": 3.8412416134508125, + "grad_norm": 0.8672235012054443, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 23760 + }, + { + "epoch": 3.8428582976315577, + "grad_norm": 1.1210707426071167, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 23770 + }, + { + "epoch": 3.844474981812303, + "grad_norm": 0.9177767634391785, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 23780 + }, + { + "epoch": 3.846091665993048, + "grad_norm": 0.8714171648025513, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 23790 + }, + { + "epoch": 3.8477083501737934, + "grad_norm": 1.1853246688842773, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 23800 + }, + { + "epoch": 3.849325034354539, + "grad_norm": 0.8091260194778442, + "learning_rate": 0.0002, + "loss": 0.6144, + "step": 23810 + }, + { + "epoch": 3.8509417185352843, + "grad_norm": 0.9710774421691895, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 23820 + }, + { + "epoch": 3.8525584027160296, + "grad_norm": 0.7648707628250122, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 23830 + }, + { + "epoch": 3.854175086896775, + "grad_norm": 0.7809253931045532, + "learning_rate": 0.0002, + "loss": 0.6013, + "step": 23840 + }, + { + "epoch": 3.85579177107752, + "grad_norm": 0.8337951898574829, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 23850 + }, + { + "epoch": 3.8574084552582653, + "grad_norm": 0.9271913170814514, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 23860 + }, + { + "epoch": 3.8590251394390105, + "grad_norm": 0.985334038734436, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 23870 + }, + { + "epoch": 3.8606418236197557, + "grad_norm": 0.8458583354949951, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 23880 + }, + { + "epoch": 3.862258507800501, + "grad_norm": 1.015348196029663, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 23890 + }, + { + "epoch": 3.8638751919812466, + "grad_norm": 1.0121688842773438, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 23900 + }, + { + "epoch": 3.865491876161992, + "grad_norm": 0.8883971571922302, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 23910 + }, + { + "epoch": 3.867108560342737, + "grad_norm": 1.028086543083191, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 23920 + }, + { + "epoch": 3.8687252445234823, + "grad_norm": 0.9645734429359436, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 23930 + }, + { + "epoch": 3.8703419287042276, + "grad_norm": 0.8235350251197815, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 23940 + }, + { + "epoch": 3.871958612884973, + "grad_norm": 1.0298916101455688, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 23950 + }, + { + "epoch": 3.8735752970657185, + "grad_norm": 1.0063377618789673, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 23960 + }, + { + "epoch": 3.8751919812464637, + "grad_norm": 0.9230626821517944, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 23970 + }, + { + "epoch": 3.876808665427209, + "grad_norm": 0.9243063926696777, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 23980 + }, + { + "epoch": 3.878425349607954, + "grad_norm": 1.0211291313171387, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 23990 + }, + { + "epoch": 3.8800420337886994, + "grad_norm": 0.7800535559654236, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 24000 + }, + { + "epoch": 3.8816587179694446, + "grad_norm": 0.7904248833656311, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 24010 + }, + { + "epoch": 3.88327540215019, + "grad_norm": 1.1975988149642944, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 24020 + }, + { + "epoch": 3.884892086330935, + "grad_norm": 1.0626593828201294, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 24030 + }, + { + "epoch": 3.8865087705116803, + "grad_norm": 0.9012193083763123, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 24040 + }, + { + "epoch": 3.888125454692426, + "grad_norm": 1.1159172058105469, + "learning_rate": 0.0002, + "loss": 0.6786, + "step": 24050 + }, + { + "epoch": 3.889742138873171, + "grad_norm": 1.276838779449463, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 24060 + }, + { + "epoch": 3.8913588230539164, + "grad_norm": 0.8467690348625183, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 24070 + }, + { + "epoch": 3.8929755072346617, + "grad_norm": 0.9862841963768005, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 24080 + }, + { + "epoch": 3.894592191415407, + "grad_norm": 0.7134621739387512, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 24090 + }, + { + "epoch": 3.896208875596152, + "grad_norm": 0.8178175091743469, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 24100 + }, + { + "epoch": 3.897825559776898, + "grad_norm": 0.9229172468185425, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 24110 + }, + { + "epoch": 3.899442243957643, + "grad_norm": 1.0878316164016724, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 24120 + }, + { + "epoch": 3.9010589281383883, + "grad_norm": 0.971645712852478, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 24130 + }, + { + "epoch": 3.9026756123191335, + "grad_norm": 0.8862188458442688, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 24140 + }, + { + "epoch": 3.9042922964998787, + "grad_norm": 0.9126982688903809, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 24150 + }, + { + "epoch": 3.905908980680624, + "grad_norm": 0.8833470940589905, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 24160 + }, + { + "epoch": 3.907525664861369, + "grad_norm": 0.8320947885513306, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 24170 + }, + { + "epoch": 3.9091423490421144, + "grad_norm": 0.9156602025032043, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 24180 + }, + { + "epoch": 3.9107590332228597, + "grad_norm": 1.029181957244873, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 24190 + }, + { + "epoch": 3.9123757174036053, + "grad_norm": 0.9052802324295044, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 24200 + }, + { + "epoch": 3.9139924015843506, + "grad_norm": 0.8847255110740662, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 24210 + }, + { + "epoch": 3.915609085765096, + "grad_norm": 0.9642062187194824, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 24220 + }, + { + "epoch": 3.917225769945841, + "grad_norm": 0.8629093766212463, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 24230 + }, + { + "epoch": 3.9188424541265863, + "grad_norm": 0.8674976825714111, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 24240 + }, + { + "epoch": 3.9204591383073315, + "grad_norm": 1.104846477508545, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 24250 + }, + { + "epoch": 3.922075822488077, + "grad_norm": 1.0874955654144287, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 24260 + }, + { + "epoch": 3.9236925066688224, + "grad_norm": 0.8689812421798706, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 24270 + }, + { + "epoch": 3.9253091908495676, + "grad_norm": 0.9724617004394531, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 24280 + }, + { + "epoch": 3.926925875030313, + "grad_norm": 0.9165538549423218, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 24290 + }, + { + "epoch": 3.928542559211058, + "grad_norm": 0.9307710528373718, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 24300 + }, + { + "epoch": 3.9301592433918033, + "grad_norm": 0.8589295148849487, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 24310 + }, + { + "epoch": 3.9317759275725486, + "grad_norm": 0.9151099920272827, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 24320 + }, + { + "epoch": 3.933392611753294, + "grad_norm": 0.9633517265319824, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 24330 + }, + { + "epoch": 3.935009295934039, + "grad_norm": 0.9521116018295288, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 24340 + }, + { + "epoch": 3.9366259801147847, + "grad_norm": 0.8366776704788208, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 24350 + }, + { + "epoch": 3.93824266429553, + "grad_norm": 0.8972663283348083, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 24360 + }, + { + "epoch": 3.939859348476275, + "grad_norm": 0.8102919459342957, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 24370 + }, + { + "epoch": 3.9414760326570204, + "grad_norm": 0.8189975023269653, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 24380 + }, + { + "epoch": 3.9430927168377656, + "grad_norm": 0.9569464921951294, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 24390 + }, + { + "epoch": 3.9447094010185113, + "grad_norm": 0.7459101676940918, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 24400 + }, + { + "epoch": 3.9463260851992565, + "grad_norm": 0.8536974787712097, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 24410 + }, + { + "epoch": 3.9479427693800018, + "grad_norm": 0.8763698935508728, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 24420 + }, + { + "epoch": 3.949559453560747, + "grad_norm": 0.9381106495857239, + "learning_rate": 0.0002, + "loss": 0.6478, + "step": 24430 + }, + { + "epoch": 3.9511761377414922, + "grad_norm": 0.934440016746521, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 24440 + }, + { + "epoch": 3.9527928219222375, + "grad_norm": 0.903918981552124, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 24450 + }, + { + "epoch": 3.9544095061029827, + "grad_norm": 0.8771953582763672, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 24460 + }, + { + "epoch": 3.956026190283728, + "grad_norm": 1.0375410318374634, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 24470 + }, + { + "epoch": 3.957642874464473, + "grad_norm": 0.9439185261726379, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 24480 + }, + { + "epoch": 3.9592595586452184, + "grad_norm": 0.935467004776001, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 24490 + }, + { + "epoch": 3.960876242825964, + "grad_norm": 0.6900772452354431, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 24500 + }, + { + "epoch": 3.9624929270067093, + "grad_norm": 1.0172916650772095, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 24510 + }, + { + "epoch": 3.9641096111874545, + "grad_norm": 0.9167046546936035, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 24520 + }, + { + "epoch": 3.9657262953681998, + "grad_norm": 0.7230527997016907, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 24530 + }, + { + "epoch": 3.967342979548945, + "grad_norm": 0.8980403542518616, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 24540 + }, + { + "epoch": 3.9689596637296907, + "grad_norm": 0.8555465936660767, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 24550 + }, + { + "epoch": 3.970576347910436, + "grad_norm": 0.7825445532798767, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 24560 + }, + { + "epoch": 3.972193032091181, + "grad_norm": 0.7273133993148804, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 24570 + }, + { + "epoch": 3.9738097162719264, + "grad_norm": 0.9612047672271729, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 24580 + }, + { + "epoch": 3.9754264004526716, + "grad_norm": 0.9865460991859436, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 24590 + }, + { + "epoch": 3.977043084633417, + "grad_norm": 0.8638762831687927, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 24600 + }, + { + "epoch": 3.978659768814162, + "grad_norm": 1.0096198320388794, + "learning_rate": 0.0002, + "loss": 0.6097, + "step": 24610 + }, + { + "epoch": 3.9802764529949073, + "grad_norm": 0.8475532531738281, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 24620 + }, + { + "epoch": 3.9818931371756525, + "grad_norm": 0.9696195721626282, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 24630 + }, + { + "epoch": 3.9835098213563978, + "grad_norm": 0.7499843239784241, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 24640 + }, + { + "epoch": 3.9851265055371434, + "grad_norm": 0.8865424990653992, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 24650 + }, + { + "epoch": 3.9867431897178887, + "grad_norm": 0.8089959025382996, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 24660 + }, + { + "epoch": 3.988359873898634, + "grad_norm": 0.6946012377738953, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 24670 + }, + { + "epoch": 3.989976558079379, + "grad_norm": 0.7991759181022644, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 24680 + }, + { + "epoch": 3.9915932422601244, + "grad_norm": 0.8803931474685669, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 24690 + }, + { + "epoch": 3.99320992644087, + "grad_norm": 0.8848299980163574, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 24700 + }, + { + "epoch": 3.9948266106216153, + "grad_norm": 0.7448889017105103, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 24710 + }, + { + "epoch": 3.9964432948023605, + "grad_norm": 0.9361620545387268, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 24720 + }, + { + "epoch": 3.9980599789831057, + "grad_norm": 0.9958081245422363, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 24730 + }, + { + "epoch": 3.999676663163851, + "grad_norm": 1.026004672050476, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 24740 + }, + { + "epoch": 4.0, + "eval_loss": 1.1524168252944946, + "eval_runtime": 122.1585, + "eval_samples_per_second": 6.0, + "eval_steps_per_second": 0.753, + "step": 24742 + }, + { + "epoch": 4.001293347344596, + "grad_norm": 1.0664808750152588, + "learning_rate": 0.0002, + "loss": 0.6057, + "step": 24750 + }, + { + "epoch": 4.002910031525341, + "grad_norm": 1.0113720893859863, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 24760 + }, + { + "epoch": 4.004526715706087, + "grad_norm": 0.991486668586731, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 24770 + }, + { + "epoch": 4.006143399886832, + "grad_norm": 0.951754629611969, + "learning_rate": 0.0002, + "loss": 0.508, + "step": 24780 + }, + { + "epoch": 4.007760084067577, + "grad_norm": 1.13059401512146, + "learning_rate": 0.0002, + "loss": 0.5314, + "step": 24790 + }, + { + "epoch": 4.009376768248322, + "grad_norm": 0.9343926310539246, + "learning_rate": 0.0002, + "loss": 0.5323, + "step": 24800 + }, + { + "epoch": 4.010993452429068, + "grad_norm": 1.0680590867996216, + "learning_rate": 0.0002, + "loss": 0.5161, + "step": 24810 + }, + { + "epoch": 4.012610136609814, + "grad_norm": 1.0022706985473633, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 24820 + }, + { + "epoch": 4.014226820790559, + "grad_norm": 1.0285297632217407, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 24830 + }, + { + "epoch": 4.015843504971304, + "grad_norm": 0.8347002863883972, + "learning_rate": 0.0002, + "loss": 0.5311, + "step": 24840 + }, + { + "epoch": 4.017460189152049, + "grad_norm": 0.9675396680831909, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 24850 + }, + { + "epoch": 4.019076873332795, + "grad_norm": 0.9238511323928833, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 24860 + }, + { + "epoch": 4.02069355751354, + "grad_norm": 1.1576941013336182, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 24870 + }, + { + "epoch": 4.022310241694285, + "grad_norm": 0.8583757281303406, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 24880 + }, + { + "epoch": 4.02392692587503, + "grad_norm": 0.9816817045211792, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 24890 + }, + { + "epoch": 4.0255436100557755, + "grad_norm": 0.955073893070221, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 24900 + }, + { + "epoch": 4.027160294236521, + "grad_norm": 1.1054974794387817, + "learning_rate": 0.0002, + "loss": 0.4896, + "step": 24910 + }, + { + "epoch": 4.028776978417266, + "grad_norm": 1.1240060329437256, + "learning_rate": 0.0002, + "loss": 0.5246, + "step": 24920 + }, + { + "epoch": 4.030393662598011, + "grad_norm": 0.9512825012207031, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 24930 + }, + { + "epoch": 4.0320103467787565, + "grad_norm": 0.85965496301651, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 24940 + }, + { + "epoch": 4.033627030959502, + "grad_norm": 0.9378061294555664, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 24950 + }, + { + "epoch": 4.035243715140247, + "grad_norm": 0.9655424356460571, + "learning_rate": 0.0002, + "loss": 0.5008, + "step": 24960 + }, + { + "epoch": 4.036860399320993, + "grad_norm": 1.1393707990646362, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 24970 + }, + { + "epoch": 4.038477083501738, + "grad_norm": 1.0220451354980469, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 24980 + }, + { + "epoch": 4.0400937676824835, + "grad_norm": 0.9785808324813843, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 24990 + }, + { + "epoch": 4.041710451863229, + "grad_norm": 1.0257649421691895, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 25000 + }, + { + "epoch": 4.043327136043974, + "grad_norm": 0.9737892150878906, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 25010 + }, + { + "epoch": 4.044943820224719, + "grad_norm": 0.7416959404945374, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 25020 + }, + { + "epoch": 4.046560504405464, + "grad_norm": 0.7909596562385559, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 25030 + }, + { + "epoch": 4.04817718858621, + "grad_norm": 0.8923130631446838, + "learning_rate": 0.0002, + "loss": 0.5265, + "step": 25040 + }, + { + "epoch": 4.049793872766955, + "grad_norm": 0.9044941663742065, + "learning_rate": 0.0002, + "loss": 0.5035, + "step": 25050 + }, + { + "epoch": 4.0514105569477, + "grad_norm": 0.866352379322052, + "learning_rate": 0.0002, + "loss": 0.5135, + "step": 25060 + }, + { + "epoch": 4.053027241128445, + "grad_norm": 1.544549822807312, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 25070 + }, + { + "epoch": 4.054643925309191, + "grad_norm": 0.8426995277404785, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 25080 + }, + { + "epoch": 4.056260609489936, + "grad_norm": 0.9797548651695251, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 25090 + }, + { + "epoch": 4.057877293670681, + "grad_norm": 0.8468434810638428, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 25100 + }, + { + "epoch": 4.059493977851426, + "grad_norm": 0.9294559955596924, + "learning_rate": 0.0002, + "loss": 0.5242, + "step": 25110 + }, + { + "epoch": 4.061110662032172, + "grad_norm": 0.9686688780784607, + "learning_rate": 0.0002, + "loss": 0.5295, + "step": 25120 + }, + { + "epoch": 4.062727346212918, + "grad_norm": 0.8042728304862976, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 25130 + }, + { + "epoch": 4.064344030393663, + "grad_norm": 1.165160894393921, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 25140 + }, + { + "epoch": 4.065960714574408, + "grad_norm": 1.2161961793899536, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 25150 + }, + { + "epoch": 4.067577398755153, + "grad_norm": 1.0762810707092285, + "learning_rate": 0.0002, + "loss": 0.5217, + "step": 25160 + }, + { + "epoch": 4.069194082935899, + "grad_norm": 0.7580869793891907, + "learning_rate": 0.0002, + "loss": 0.5886, + "step": 25170 + }, + { + "epoch": 4.070810767116644, + "grad_norm": 0.9630117416381836, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 25180 + }, + { + "epoch": 4.072427451297389, + "grad_norm": 0.9049716591835022, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 25190 + }, + { + "epoch": 4.074044135478134, + "grad_norm": 1.1536930799484253, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 25200 + }, + { + "epoch": 4.0756608196588795, + "grad_norm": 0.901461124420166, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 25210 + }, + { + "epoch": 4.077277503839625, + "grad_norm": 1.3318437337875366, + "learning_rate": 0.0002, + "loss": 0.5132, + "step": 25220 + }, + { + "epoch": 4.07889418802037, + "grad_norm": 0.8811455368995667, + "learning_rate": 0.0002, + "loss": 0.5317, + "step": 25230 + }, + { + "epoch": 4.080510872201115, + "grad_norm": 1.0564165115356445, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 25240 + }, + { + "epoch": 4.08212755638186, + "grad_norm": 1.1008027791976929, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 25250 + }, + { + "epoch": 4.083744240562606, + "grad_norm": 1.150097131729126, + "learning_rate": 0.0002, + "loss": 0.5195, + "step": 25260 + }, + { + "epoch": 4.085360924743352, + "grad_norm": 0.9339924454689026, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 25270 + }, + { + "epoch": 4.086977608924097, + "grad_norm": 1.0902045965194702, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 25280 + }, + { + "epoch": 4.088594293104842, + "grad_norm": 0.8483911156654358, + "learning_rate": 0.0002, + "loss": 0.5203, + "step": 25290 + }, + { + "epoch": 4.0902109772855875, + "grad_norm": 0.9477024674415588, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 25300 + }, + { + "epoch": 4.091827661466333, + "grad_norm": 0.9500215649604797, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 25310 + }, + { + "epoch": 4.093444345647078, + "grad_norm": 1.040468454360962, + "learning_rate": 0.0002, + "loss": 0.5045, + "step": 25320 + }, + { + "epoch": 4.095061029827823, + "grad_norm": 0.7457592487335205, + "learning_rate": 0.0002, + "loss": 0.5488, + "step": 25330 + }, + { + "epoch": 4.096677714008568, + "grad_norm": 1.2092097997665405, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 25340 + }, + { + "epoch": 4.098294398189314, + "grad_norm": 0.9652107954025269, + "learning_rate": 0.0002, + "loss": 0.5174, + "step": 25350 + }, + { + "epoch": 4.099911082370059, + "grad_norm": 0.8464955687522888, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 25360 + }, + { + "epoch": 4.101527766550804, + "grad_norm": 0.875026285648346, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 25370 + }, + { + "epoch": 4.103144450731549, + "grad_norm": 0.9241740107536316, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 25380 + }, + { + "epoch": 4.1047611349122946, + "grad_norm": 0.9769546389579773, + "learning_rate": 0.0002, + "loss": 0.5578, + "step": 25390 + }, + { + "epoch": 4.10637781909304, + "grad_norm": 1.1501960754394531, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 25400 + }, + { + "epoch": 4.107994503273786, + "grad_norm": 0.9135243892669678, + "learning_rate": 0.0002, + "loss": 0.5241, + "step": 25410 + }, + { + "epoch": 4.109611187454531, + "grad_norm": 0.9905396103858948, + "learning_rate": 0.0002, + "loss": 0.5152, + "step": 25420 + }, + { + "epoch": 4.111227871635276, + "grad_norm": 0.9845104217529297, + "learning_rate": 0.0002, + "loss": 0.5064, + "step": 25430 + }, + { + "epoch": 4.112844555816022, + "grad_norm": 0.8326883912086487, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 25440 + }, + { + "epoch": 4.114461239996767, + "grad_norm": 0.9264556765556335, + "learning_rate": 0.0002, + "loss": 0.5312, + "step": 25450 + }, + { + "epoch": 4.116077924177512, + "grad_norm": 1.043080449104309, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 25460 + }, + { + "epoch": 4.117694608358257, + "grad_norm": 0.8533386588096619, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 25470 + }, + { + "epoch": 4.1193112925390025, + "grad_norm": 1.0133965015411377, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 25480 + }, + { + "epoch": 4.120927976719748, + "grad_norm": 0.7476310133934021, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 25490 + }, + { + "epoch": 4.122544660900493, + "grad_norm": 1.1247259378433228, + "learning_rate": 0.0002, + "loss": 0.5189, + "step": 25500 + }, + { + "epoch": 4.124161345081238, + "grad_norm": 1.0764678716659546, + "learning_rate": 0.0002, + "loss": 0.5751, + "step": 25510 + }, + { + "epoch": 4.1257780292619834, + "grad_norm": 0.7679798007011414, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 25520 + }, + { + "epoch": 4.127394713442729, + "grad_norm": 0.8877071142196655, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 25530 + }, + { + "epoch": 4.129011397623474, + "grad_norm": 1.0440239906311035, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 25540 + }, + { + "epoch": 4.130628081804219, + "grad_norm": 0.984145998954773, + "learning_rate": 0.0002, + "loss": 0.5723, + "step": 25550 + }, + { + "epoch": 4.132244765984965, + "grad_norm": 0.8667055368423462, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 25560 + }, + { + "epoch": 4.1338614501657105, + "grad_norm": 1.1300835609436035, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 25570 + }, + { + "epoch": 4.135478134346456, + "grad_norm": 0.9314348101615906, + "learning_rate": 0.0002, + "loss": 0.524, + "step": 25580 + }, + { + "epoch": 4.137094818527201, + "grad_norm": 0.7731879949569702, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 25590 + }, + { + "epoch": 4.138711502707946, + "grad_norm": 1.0080097913742065, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 25600 + }, + { + "epoch": 4.140328186888691, + "grad_norm": 1.2475038766860962, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 25610 + }, + { + "epoch": 4.141944871069437, + "grad_norm": 0.9912930727005005, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 25620 + }, + { + "epoch": 4.143561555250182, + "grad_norm": 0.9088651537895203, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 25630 + }, + { + "epoch": 4.145178239430927, + "grad_norm": 0.8940697312355042, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 25640 + }, + { + "epoch": 4.146794923611672, + "grad_norm": 1.0798203945159912, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 25650 + }, + { + "epoch": 4.148411607792418, + "grad_norm": 0.955172061920166, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 25660 + }, + { + "epoch": 4.150028291973163, + "grad_norm": 0.9692716002464294, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 25670 + }, + { + "epoch": 4.151644976153908, + "grad_norm": 1.0813939571380615, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 25680 + }, + { + "epoch": 4.153261660334653, + "grad_norm": 1.135675072669983, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 25690 + }, + { + "epoch": 4.1548783445153985, + "grad_norm": 1.0392236709594727, + "learning_rate": 0.0002, + "loss": 0.5247, + "step": 25700 + }, + { + "epoch": 4.156495028696145, + "grad_norm": 0.9473116993904114, + "learning_rate": 0.0002, + "loss": 0.5204, + "step": 25710 + }, + { + "epoch": 4.15811171287689, + "grad_norm": 0.712493896484375, + "learning_rate": 0.0002, + "loss": 0.5339, + "step": 25720 + }, + { + "epoch": 4.159728397057635, + "grad_norm": 0.8724465370178223, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 25730 + }, + { + "epoch": 4.16134508123838, + "grad_norm": 0.9870015978813171, + "learning_rate": 0.0002, + "loss": 0.5325, + "step": 25740 + }, + { + "epoch": 4.1629617654191255, + "grad_norm": 1.025273084640503, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 25750 + }, + { + "epoch": 4.164578449599871, + "grad_norm": 0.9243090152740479, + "learning_rate": 0.0002, + "loss": 0.5223, + "step": 25760 + }, + { + "epoch": 4.166195133780616, + "grad_norm": 1.1656451225280762, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 25770 + }, + { + "epoch": 4.167811817961361, + "grad_norm": 0.936358630657196, + "learning_rate": 0.0002, + "loss": 0.5334, + "step": 25780 + }, + { + "epoch": 4.1694285021421065, + "grad_norm": 0.8618208169937134, + "learning_rate": 0.0002, + "loss": 0.5236, + "step": 25790 + }, + { + "epoch": 4.171045186322852, + "grad_norm": 0.8580600023269653, + "learning_rate": 0.0002, + "loss": 0.5186, + "step": 25800 + }, + { + "epoch": 4.172661870503597, + "grad_norm": 1.0128562450408936, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 25810 + }, + { + "epoch": 4.174278554684342, + "grad_norm": 0.854865312576294, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 25820 + }, + { + "epoch": 4.175895238865087, + "grad_norm": 1.235082745552063, + "learning_rate": 0.0002, + "loss": 0.5377, + "step": 25830 + }, + { + "epoch": 4.177511923045833, + "grad_norm": 0.9796220660209656, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 25840 + }, + { + "epoch": 4.179128607226578, + "grad_norm": 0.8922094702720642, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 25850 + }, + { + "epoch": 4.180745291407324, + "grad_norm": 0.9672530293464661, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 25860 + }, + { + "epoch": 4.182361975588069, + "grad_norm": 0.8662548661231995, + "learning_rate": 0.0002, + "loss": 0.5074, + "step": 25870 + }, + { + "epoch": 4.1839786597688144, + "grad_norm": 0.7938798069953918, + "learning_rate": 0.0002, + "loss": 0.5329, + "step": 25880 + }, + { + "epoch": 4.18559534394956, + "grad_norm": 1.0517958402633667, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 25890 + }, + { + "epoch": 4.187212028130305, + "grad_norm": 0.8939275145530701, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 25900 + }, + { + "epoch": 4.18882871231105, + "grad_norm": 1.0296672582626343, + "learning_rate": 0.0002, + "loss": 0.5199, + "step": 25910 + }, + { + "epoch": 4.190445396491795, + "grad_norm": 0.8104017972946167, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 25920 + }, + { + "epoch": 4.192062080672541, + "grad_norm": 0.9984509944915771, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 25930 + }, + { + "epoch": 4.193678764853286, + "grad_norm": 0.9844784736633301, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 25940 + }, + { + "epoch": 4.195295449034031, + "grad_norm": 0.8168622255325317, + "learning_rate": 0.0002, + "loss": 0.5198, + "step": 25950 + }, + { + "epoch": 4.196912133214776, + "grad_norm": 1.0878913402557373, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 25960 + }, + { + "epoch": 4.1985288173955215, + "grad_norm": 0.927126407623291, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 25970 + }, + { + "epoch": 4.200145501576267, + "grad_norm": 0.838586688041687, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 25980 + }, + { + "epoch": 4.201762185757012, + "grad_norm": 1.2572145462036133, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 25990 + }, + { + "epoch": 4.203378869937758, + "grad_norm": 1.0476740598678589, + "learning_rate": 0.0002, + "loss": 0.5048, + "step": 26000 + }, + { + "epoch": 4.204995554118503, + "grad_norm": 1.0873368978500366, + "learning_rate": 0.0002, + "loss": 0.5127, + "step": 26010 + }, + { + "epoch": 4.206612238299249, + "grad_norm": 1.2664896249771118, + "learning_rate": 0.0002, + "loss": 0.5679, + "step": 26020 + }, + { + "epoch": 4.208228922479994, + "grad_norm": 1.0312391519546509, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 26030 + }, + { + "epoch": 4.209845606660739, + "grad_norm": 1.0235042572021484, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 26040 + }, + { + "epoch": 4.211462290841484, + "grad_norm": 0.8882219195365906, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 26050 + }, + { + "epoch": 4.2130789750222295, + "grad_norm": 0.9115961790084839, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 26060 + }, + { + "epoch": 4.214695659202975, + "grad_norm": 1.0218228101730347, + "learning_rate": 0.0002, + "loss": 0.5455, + "step": 26070 + }, + { + "epoch": 4.21631234338372, + "grad_norm": 1.0802232027053833, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 26080 + }, + { + "epoch": 4.217929027564465, + "grad_norm": 1.1488053798675537, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 26090 + }, + { + "epoch": 4.21954571174521, + "grad_norm": 1.0487725734710693, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 26100 + }, + { + "epoch": 4.221162395925956, + "grad_norm": 0.9131165742874146, + "learning_rate": 0.0002, + "loss": 0.5568, + "step": 26110 + }, + { + "epoch": 4.222779080106701, + "grad_norm": 0.9012845158576965, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 26120 + }, + { + "epoch": 4.224395764287446, + "grad_norm": 0.8389840126037598, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 26130 + }, + { + "epoch": 4.226012448468191, + "grad_norm": 0.8924660682678223, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 26140 + }, + { + "epoch": 4.2276291326489375, + "grad_norm": 0.8556463718414307, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 26150 + }, + { + "epoch": 4.229245816829683, + "grad_norm": 0.9643129110336304, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 26160 + }, + { + "epoch": 4.230862501010428, + "grad_norm": 0.9865712523460388, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 26170 + }, + { + "epoch": 4.232479185191173, + "grad_norm": 1.152641773223877, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 26180 + }, + { + "epoch": 4.234095869371918, + "grad_norm": 0.9157698154449463, + "learning_rate": 0.0002, + "loss": 0.5632, + "step": 26190 + }, + { + "epoch": 4.235712553552664, + "grad_norm": 0.8418048620223999, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 26200 + }, + { + "epoch": 4.237329237733409, + "grad_norm": 0.9430168867111206, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 26210 + }, + { + "epoch": 4.238945921914154, + "grad_norm": 1.012582778930664, + "learning_rate": 0.0002, + "loss": 0.5574, + "step": 26220 + }, + { + "epoch": 4.240562606094899, + "grad_norm": 1.112619400024414, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 26230 + }, + { + "epoch": 4.2421792902756446, + "grad_norm": 0.9243621826171875, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 26240 + }, + { + "epoch": 4.24379597445639, + "grad_norm": 0.6977595686912537, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 26250 + }, + { + "epoch": 4.245412658637135, + "grad_norm": 0.9600721597671509, + "learning_rate": 0.0002, + "loss": 0.5379, + "step": 26260 + }, + { + "epoch": 4.24702934281788, + "grad_norm": 0.882641613483429, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 26270 + }, + { + "epoch": 4.2486460269986255, + "grad_norm": 1.010920763015747, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 26280 + }, + { + "epoch": 4.250262711179371, + "grad_norm": 0.9289400577545166, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 26290 + }, + { + "epoch": 4.251879395360117, + "grad_norm": 1.137397289276123, + "learning_rate": 0.0002, + "loss": 0.541, + "step": 26300 + }, + { + "epoch": 4.253496079540862, + "grad_norm": 1.0136182308197021, + "learning_rate": 0.0002, + "loss": 0.5204, + "step": 26310 + }, + { + "epoch": 4.255112763721607, + "grad_norm": 0.9387356042861938, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 26320 + }, + { + "epoch": 4.2567294479023525, + "grad_norm": 1.1833957433700562, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 26330 + }, + { + "epoch": 4.258346132083098, + "grad_norm": 0.9415934681892395, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 26340 + }, + { + "epoch": 4.259962816263843, + "grad_norm": 0.8550165891647339, + "learning_rate": 0.0002, + "loss": 0.5539, + "step": 26350 + }, + { + "epoch": 4.261579500444588, + "grad_norm": 9.924622535705566, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 26360 + }, + { + "epoch": 4.2631961846253335, + "grad_norm": 1.0104902982711792, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 26370 + }, + { + "epoch": 4.264812868806079, + "grad_norm": 0.890794038772583, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 26380 + }, + { + "epoch": 4.266429552986824, + "grad_norm": 1.0560191869735718, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 26390 + }, + { + "epoch": 4.268046237167569, + "grad_norm": 1.0135581493377686, + "learning_rate": 0.0002, + "loss": 0.5119, + "step": 26400 + }, + { + "epoch": 4.269662921348314, + "grad_norm": 1.1304140090942383, + "learning_rate": 0.0002, + "loss": 0.5359, + "step": 26410 + }, + { + "epoch": 4.27127960552906, + "grad_norm": 0.9899303913116455, + "learning_rate": 0.0002, + "loss": 0.5615, + "step": 26420 + }, + { + "epoch": 4.272896289709805, + "grad_norm": 1.0505329370498657, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 26430 + }, + { + "epoch": 4.27451297389055, + "grad_norm": 0.9389396905899048, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 26440 + }, + { + "epoch": 4.276129658071296, + "grad_norm": 0.875328779220581, + "learning_rate": 0.0002, + "loss": 0.5558, + "step": 26450 + }, + { + "epoch": 4.277746342252041, + "grad_norm": 1.0689256191253662, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 26460 + }, + { + "epoch": 4.279363026432787, + "grad_norm": 0.9988957643508911, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 26470 + }, + { + "epoch": 4.280979710613532, + "grad_norm": 0.8721813559532166, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 26480 + }, + { + "epoch": 4.282596394794277, + "grad_norm": 1.100109577178955, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 26490 + }, + { + "epoch": 4.284213078975022, + "grad_norm": 1.1607271432876587, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 26500 + }, + { + "epoch": 4.285829763155768, + "grad_norm": 0.879088819026947, + "learning_rate": 0.0002, + "loss": 0.6287, + "step": 26510 + }, + { + "epoch": 4.287446447336513, + "grad_norm": 0.9891700744628906, + "learning_rate": 0.0002, + "loss": 0.573, + "step": 26520 + }, + { + "epoch": 4.289063131517258, + "grad_norm": 1.0831127166748047, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 26530 + }, + { + "epoch": 4.290679815698003, + "grad_norm": 1.4108285903930664, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 26540 + }, + { + "epoch": 4.2922964998787485, + "grad_norm": 1.0630289316177368, + "learning_rate": 0.0002, + "loss": 0.5888, + "step": 26550 + }, + { + "epoch": 4.293913184059494, + "grad_norm": 1.0854572057724, + "learning_rate": 0.0002, + "loss": 0.5817, + "step": 26560 + }, + { + "epoch": 4.295529868240239, + "grad_norm": 0.9561646580696106, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 26570 + }, + { + "epoch": 4.297146552420984, + "grad_norm": 0.9064981937408447, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 26580 + }, + { + "epoch": 4.298763236601729, + "grad_norm": 1.0082972049713135, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 26590 + }, + { + "epoch": 4.3003799207824756, + "grad_norm": 1.1613214015960693, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 26600 + }, + { + "epoch": 4.301996604963221, + "grad_norm": 0.9847695231437683, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 26610 + }, + { + "epoch": 4.303613289143966, + "grad_norm": 1.0980697870254517, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 26620 + }, + { + "epoch": 4.305229973324711, + "grad_norm": 0.8861175179481506, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 26630 + }, + { + "epoch": 4.3068466575054565, + "grad_norm": 0.8917363286018372, + "learning_rate": 0.0002, + "loss": 0.5716, + "step": 26640 + }, + { + "epoch": 4.308463341686202, + "grad_norm": 1.0458378791809082, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 26650 + }, + { + "epoch": 4.310080025866947, + "grad_norm": 1.4859240055084229, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 26660 + }, + { + "epoch": 4.311696710047692, + "grad_norm": 1.1376359462738037, + "learning_rate": 0.0002, + "loss": 0.5296, + "step": 26670 + }, + { + "epoch": 4.313313394228437, + "grad_norm": 0.991349995136261, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 26680 + }, + { + "epoch": 4.314930078409183, + "grad_norm": 0.9995543956756592, + "learning_rate": 0.0002, + "loss": 0.5338, + "step": 26690 + }, + { + "epoch": 4.316546762589928, + "grad_norm": 1.0515851974487305, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 26700 + }, + { + "epoch": 4.318163446770673, + "grad_norm": 1.008023977279663, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 26710 + }, + { + "epoch": 4.319780130951418, + "grad_norm": 1.0184582471847534, + "learning_rate": 0.0002, + "loss": 0.5506, + "step": 26720 + }, + { + "epoch": 4.321396815132164, + "grad_norm": 1.161071538925171, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 26730 + }, + { + "epoch": 4.323013499312909, + "grad_norm": 0.9580779671669006, + "learning_rate": 0.0002, + "loss": 0.5633, + "step": 26740 + }, + { + "epoch": 4.324630183493655, + "grad_norm": 1.0189911127090454, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 26750 + }, + { + "epoch": 4.3262468676744, + "grad_norm": 0.7484358549118042, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 26760 + }, + { + "epoch": 4.327863551855145, + "grad_norm": 1.0015908479690552, + "learning_rate": 0.0002, + "loss": 0.5728, + "step": 26770 + }, + { + "epoch": 4.329480236035891, + "grad_norm": 0.8972945809364319, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 26780 + }, + { + "epoch": 4.331096920216636, + "grad_norm": 1.01099693775177, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 26790 + }, + { + "epoch": 4.332713604397381, + "grad_norm": 0.846958339214325, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 26800 + }, + { + "epoch": 4.334330288578126, + "grad_norm": 1.0792603492736816, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 26810 + }, + { + "epoch": 4.3359469727588715, + "grad_norm": 1.0373345613479614, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 26820 + }, + { + "epoch": 4.337563656939617, + "grad_norm": 0.9779167771339417, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 26830 + }, + { + "epoch": 4.339180341120362, + "grad_norm": 1.0235520601272583, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 26840 + }, + { + "epoch": 4.340797025301107, + "grad_norm": 1.04195237159729, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 26850 + }, + { + "epoch": 4.3424137094818525, + "grad_norm": 0.9479565620422363, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 26860 + }, + { + "epoch": 4.344030393662598, + "grad_norm": 0.9526172280311584, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 26870 + }, + { + "epoch": 4.345647077843343, + "grad_norm": 0.8571456074714661, + "learning_rate": 0.0002, + "loss": 0.521, + "step": 26880 + }, + { + "epoch": 4.347263762024088, + "grad_norm": 0.9475828409194946, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 26890 + }, + { + "epoch": 4.348880446204834, + "grad_norm": 1.0529576539993286, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 26900 + }, + { + "epoch": 4.3504971303855795, + "grad_norm": 0.9648140072822571, + "learning_rate": 0.0002, + "loss": 0.56, + "step": 26910 + }, + { + "epoch": 4.352113814566325, + "grad_norm": 1.0488841533660889, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 26920 + }, + { + "epoch": 4.35373049874707, + "grad_norm": 0.8771942257881165, + "learning_rate": 0.0002, + "loss": 0.5842, + "step": 26930 + }, + { + "epoch": 4.355347182927815, + "grad_norm": 0.9411202073097229, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 26940 + }, + { + "epoch": 4.35696386710856, + "grad_norm": 1.0997588634490967, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 26950 + }, + { + "epoch": 4.358580551289306, + "grad_norm": 0.968754768371582, + "learning_rate": 0.0002, + "loss": 0.5528, + "step": 26960 + }, + { + "epoch": 4.360197235470051, + "grad_norm": 0.9990773797035217, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 26970 + }, + { + "epoch": 4.361813919650796, + "grad_norm": 1.0210620164871216, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 26980 + }, + { + "epoch": 4.363430603831541, + "grad_norm": 0.855462908744812, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 26990 + }, + { + "epoch": 4.365047288012287, + "grad_norm": 0.9169660806655884, + "learning_rate": 0.0002, + "loss": 0.5493, + "step": 27000 + }, + { + "epoch": 4.366663972193032, + "grad_norm": 1.089629888534546, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 27010 + }, + { + "epoch": 4.368280656373777, + "grad_norm": 1.0932867527008057, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 27020 + }, + { + "epoch": 4.369897340554522, + "grad_norm": 0.9290956854820251, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 27030 + }, + { + "epoch": 4.3715140247352675, + "grad_norm": 1.2800624370574951, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 27040 + }, + { + "epoch": 4.373130708916014, + "grad_norm": 0.8993493318557739, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 27050 + }, + { + "epoch": 4.374747393096759, + "grad_norm": 1.1566431522369385, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 27060 + }, + { + "epoch": 4.376364077277504, + "grad_norm": 0.9479052424430847, + "learning_rate": 0.0002, + "loss": 0.5902, + "step": 27070 + }, + { + "epoch": 4.377980761458249, + "grad_norm": 1.0063648223876953, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 27080 + }, + { + "epoch": 4.379597445638995, + "grad_norm": 0.8342045545578003, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 27090 + }, + { + "epoch": 4.38121412981974, + "grad_norm": 1.1390739679336548, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 27100 + }, + { + "epoch": 4.382830814000485, + "grad_norm": 0.9547637104988098, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 27110 + }, + { + "epoch": 4.38444749818123, + "grad_norm": 1.0503804683685303, + "learning_rate": 0.0002, + "loss": 0.5728, + "step": 27120 + }, + { + "epoch": 4.3860641823619755, + "grad_norm": 0.9064017534255981, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 27130 + }, + { + "epoch": 4.387680866542721, + "grad_norm": 0.9382519125938416, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 27140 + }, + { + "epoch": 4.389297550723466, + "grad_norm": 1.0410341024398804, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 27150 + }, + { + "epoch": 4.390914234904211, + "grad_norm": 0.9218655824661255, + "learning_rate": 0.0002, + "loss": 0.6034, + "step": 27160 + }, + { + "epoch": 4.392530919084956, + "grad_norm": 0.8119737505912781, + "learning_rate": 0.0002, + "loss": 0.5204, + "step": 27170 + }, + { + "epoch": 4.394147603265702, + "grad_norm": 0.8584722876548767, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 27180 + }, + { + "epoch": 4.395764287446447, + "grad_norm": 0.9668293595314026, + "learning_rate": 0.0002, + "loss": 0.5772, + "step": 27190 + }, + { + "epoch": 4.397380971627193, + "grad_norm": 1.022334098815918, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 27200 + }, + { + "epoch": 4.398997655807938, + "grad_norm": 0.9553216099739075, + "learning_rate": 0.0002, + "loss": 0.5573, + "step": 27210 + }, + { + "epoch": 4.4006143399886835, + "grad_norm": 0.9282339215278625, + "learning_rate": 0.0002, + "loss": 0.5604, + "step": 27220 + }, + { + "epoch": 4.402231024169429, + "grad_norm": 1.0232292413711548, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 27230 + }, + { + "epoch": 4.403847708350174, + "grad_norm": 0.9915700554847717, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 27240 + }, + { + "epoch": 4.405464392530919, + "grad_norm": 1.0014961957931519, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 27250 + }, + { + "epoch": 4.407081076711664, + "grad_norm": 1.1172103881835938, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 27260 + }, + { + "epoch": 4.40869776089241, + "grad_norm": 0.8583093285560608, + "learning_rate": 0.0002, + "loss": 0.5286, + "step": 27270 + }, + { + "epoch": 4.410314445073155, + "grad_norm": 0.7609201669692993, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 27280 + }, + { + "epoch": 4.4119311292539, + "grad_norm": 1.0619351863861084, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 27290 + }, + { + "epoch": 4.413547813434645, + "grad_norm": 1.0177674293518066, + "learning_rate": 0.0002, + "loss": 0.5579, + "step": 27300 + }, + { + "epoch": 4.4151644976153905, + "grad_norm": 0.9921218156814575, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 27310 + }, + { + "epoch": 4.416781181796136, + "grad_norm": 1.126244306564331, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 27320 + }, + { + "epoch": 4.418397865976881, + "grad_norm": 1.0678540468215942, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 27330 + }, + { + "epoch": 4.420014550157627, + "grad_norm": 0.8705704212188721, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 27340 + }, + { + "epoch": 4.421631234338372, + "grad_norm": 1.272074818611145, + "learning_rate": 0.0002, + "loss": 0.5763, + "step": 27350 + }, + { + "epoch": 4.423247918519118, + "grad_norm": 0.8740444183349609, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 27360 + }, + { + "epoch": 4.424864602699863, + "grad_norm": 1.0584250688552856, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 27370 + }, + { + "epoch": 4.426481286880608, + "grad_norm": 1.059870719909668, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 27380 + }, + { + "epoch": 4.428097971061353, + "grad_norm": 1.072265863418579, + "learning_rate": 0.0002, + "loss": 0.5551, + "step": 27390 + }, + { + "epoch": 4.4297146552420985, + "grad_norm": 0.871481716632843, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 27400 + }, + { + "epoch": 4.431331339422844, + "grad_norm": 0.9555448293685913, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 27410 + }, + { + "epoch": 4.432948023603589, + "grad_norm": 1.0402292013168335, + "learning_rate": 0.0002, + "loss": 0.5593, + "step": 27420 + }, + { + "epoch": 4.434564707784334, + "grad_norm": 1.12587571144104, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 27430 + }, + { + "epoch": 4.436181391965079, + "grad_norm": 1.0783193111419678, + "learning_rate": 0.0002, + "loss": 0.5403, + "step": 27440 + }, + { + "epoch": 4.437798076145825, + "grad_norm": 1.024133563041687, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 27450 + }, + { + "epoch": 4.43941476032657, + "grad_norm": 0.9156768918037415, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 27460 + }, + { + "epoch": 4.441031444507315, + "grad_norm": 1.0215224027633667, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 27470 + }, + { + "epoch": 4.442648128688061, + "grad_norm": 1.082116961479187, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 27480 + }, + { + "epoch": 4.4442648128688065, + "grad_norm": 1.0412873029708862, + "learning_rate": 0.0002, + "loss": 0.6203, + "step": 27490 + }, + { + "epoch": 4.445881497049552, + "grad_norm": 1.0509289503097534, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 27500 + }, + { + "epoch": 4.447498181230297, + "grad_norm": 0.9291498064994812, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 27510 + }, + { + "epoch": 4.449114865411042, + "grad_norm": 0.970184326171875, + "learning_rate": 0.0002, + "loss": 0.5408, + "step": 27520 + }, + { + "epoch": 4.450731549591787, + "grad_norm": 0.8418883681297302, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 27530 + }, + { + "epoch": 4.452348233772533, + "grad_norm": 0.8823825120925903, + "learning_rate": 0.0002, + "loss": 0.5124, + "step": 27540 + }, + { + "epoch": 4.453964917953278, + "grad_norm": 1.1909019947052002, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 27550 + }, + { + "epoch": 4.455581602134023, + "grad_norm": 1.0317302942276, + "learning_rate": 0.0002, + "loss": 0.5685, + "step": 27560 + }, + { + "epoch": 4.457198286314768, + "grad_norm": 0.9977751970291138, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 27570 + }, + { + "epoch": 4.458814970495514, + "grad_norm": 0.8909519910812378, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 27580 + }, + { + "epoch": 4.460431654676259, + "grad_norm": 0.8653029799461365, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 27590 + }, + { + "epoch": 4.462048338857004, + "grad_norm": 1.0783653259277344, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 27600 + }, + { + "epoch": 4.463665023037749, + "grad_norm": 1.1235394477844238, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 27610 + }, + { + "epoch": 4.4652817072184945, + "grad_norm": 0.9386643767356873, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 27620 + }, + { + "epoch": 4.466898391399241, + "grad_norm": 1.0605148077011108, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 27630 + }, + { + "epoch": 4.468515075579986, + "grad_norm": 1.1283893585205078, + "learning_rate": 0.0002, + "loss": 0.5886, + "step": 27640 + }, + { + "epoch": 4.470131759760731, + "grad_norm": 1.0583468675613403, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 27650 + }, + { + "epoch": 4.471748443941476, + "grad_norm": 0.9563992023468018, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 27660 + }, + { + "epoch": 4.4733651281222215, + "grad_norm": 1.100598931312561, + "learning_rate": 0.0002, + "loss": 0.5687, + "step": 27670 + }, + { + "epoch": 4.474981812302967, + "grad_norm": 0.9386957287788391, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 27680 + }, + { + "epoch": 4.476598496483712, + "grad_norm": 1.2946288585662842, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 27690 + }, + { + "epoch": 4.478215180664457, + "grad_norm": 1.0325199365615845, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 27700 + }, + { + "epoch": 4.4798318648452025, + "grad_norm": 1.0318928956985474, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 27710 + }, + { + "epoch": 4.481448549025948, + "grad_norm": 0.8721024394035339, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 27720 + }, + { + "epoch": 4.483065233206693, + "grad_norm": 1.17376708984375, + "learning_rate": 0.0002, + "loss": 0.5421, + "step": 27730 + }, + { + "epoch": 4.484681917387438, + "grad_norm": 1.0926326513290405, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 27740 + }, + { + "epoch": 4.486298601568183, + "grad_norm": 0.9043852686882019, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 27750 + }, + { + "epoch": 4.487915285748929, + "grad_norm": 1.064600944519043, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 27760 + }, + { + "epoch": 4.489531969929674, + "grad_norm": 0.7833460569381714, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 27770 + }, + { + "epoch": 4.49114865411042, + "grad_norm": 1.1073496341705322, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 27780 + }, + { + "epoch": 4.492765338291165, + "grad_norm": 1.0799397230148315, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 27790 + }, + { + "epoch": 4.49438202247191, + "grad_norm": 1.1062238216400146, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 27800 + }, + { + "epoch": 4.495998706652656, + "grad_norm": 1.0568242073059082, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 27810 + }, + { + "epoch": 4.497615390833401, + "grad_norm": 0.8861091732978821, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 27820 + }, + { + "epoch": 4.499232075014146, + "grad_norm": 1.2297543287277222, + "learning_rate": 0.0002, + "loss": 0.5858, + "step": 27830 + }, + { + "epoch": 4.500848759194891, + "grad_norm": 0.9600302577018738, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 27840 + }, + { + "epoch": 4.502465443375637, + "grad_norm": 1.057051181793213, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 27850 + }, + { + "epoch": 4.504082127556382, + "grad_norm": 0.9839690923690796, + "learning_rate": 0.0002, + "loss": 0.5788, + "step": 27860 + }, + { + "epoch": 4.505698811737127, + "grad_norm": 1.1479853391647339, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 27870 + }, + { + "epoch": 4.507315495917872, + "grad_norm": 1.0550768375396729, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 27880 + }, + { + "epoch": 4.5089321800986175, + "grad_norm": 0.898209273815155, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 27890 + }, + { + "epoch": 4.510548864279363, + "grad_norm": 0.9460315108299255, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 27900 + }, + { + "epoch": 4.512165548460108, + "grad_norm": 0.9499884247779846, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 27910 + }, + { + "epoch": 4.513782232640853, + "grad_norm": 0.7801318764686584, + "learning_rate": 0.0002, + "loss": 0.5385, + "step": 27920 + }, + { + "epoch": 4.515398916821599, + "grad_norm": 0.9286966323852539, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 27930 + }, + { + "epoch": 4.517015601002345, + "grad_norm": 0.9539980292320251, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 27940 + }, + { + "epoch": 4.51863228518309, + "grad_norm": 1.1053401231765747, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 27950 + }, + { + "epoch": 4.520248969363835, + "grad_norm": 0.7535534501075745, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 27960 + }, + { + "epoch": 4.52186565354458, + "grad_norm": 1.076926589012146, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 27970 + }, + { + "epoch": 4.5234823377253255, + "grad_norm": 1.181935429573059, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 27980 + }, + { + "epoch": 4.525099021906071, + "grad_norm": 0.9293407201766968, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 27990 + }, + { + "epoch": 4.526715706086816, + "grad_norm": 0.8953009247779846, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 28000 + }, + { + "epoch": 4.528332390267561, + "grad_norm": 1.0850225687026978, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 28010 + }, + { + "epoch": 4.529949074448306, + "grad_norm": 0.9125663042068481, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 28020 + }, + { + "epoch": 4.531565758629052, + "grad_norm": 0.8745216727256775, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 28030 + }, + { + "epoch": 4.533182442809797, + "grad_norm": 1.0783463716506958, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 28040 + }, + { + "epoch": 4.534799126990542, + "grad_norm": 0.7513844966888428, + "learning_rate": 0.0002, + "loss": 0.5754, + "step": 28050 + }, + { + "epoch": 4.536415811171287, + "grad_norm": 1.0135776996612549, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 28060 + }, + { + "epoch": 4.538032495352033, + "grad_norm": 0.8886825442314148, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 28070 + }, + { + "epoch": 4.539649179532779, + "grad_norm": 0.8153995275497437, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 28080 + }, + { + "epoch": 4.541265863713524, + "grad_norm": 0.9853341579437256, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 28090 + }, + { + "epoch": 4.542882547894269, + "grad_norm": 0.9365800023078918, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 28100 + }, + { + "epoch": 4.544499232075014, + "grad_norm": 0.9765017628669739, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 28110 + }, + { + "epoch": 4.54611591625576, + "grad_norm": 0.9811279773712158, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 28120 + }, + { + "epoch": 4.547732600436505, + "grad_norm": 1.0387924909591675, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 28130 + }, + { + "epoch": 4.54934928461725, + "grad_norm": 1.0684878826141357, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 28140 + }, + { + "epoch": 4.550965968797995, + "grad_norm": 1.0000102519989014, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 28150 + }, + { + "epoch": 4.5525826529787405, + "grad_norm": 1.0717930793762207, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 28160 + }, + { + "epoch": 4.554199337159486, + "grad_norm": 0.990074634552002, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 28170 + }, + { + "epoch": 4.555816021340231, + "grad_norm": 0.8673754930496216, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 28180 + }, + { + "epoch": 4.557432705520976, + "grad_norm": 0.864247739315033, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 28190 + }, + { + "epoch": 4.5590493897017215, + "grad_norm": 0.8280200958251953, + "learning_rate": 0.0002, + "loss": 0.5516, + "step": 28200 + }, + { + "epoch": 4.560666073882467, + "grad_norm": 1.1312172412872314, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 28210 + }, + { + "epoch": 4.562282758063212, + "grad_norm": 0.9147403240203857, + "learning_rate": 0.0002, + "loss": 0.5776, + "step": 28220 + }, + { + "epoch": 4.563899442243958, + "grad_norm": 1.0321218967437744, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 28230 + }, + { + "epoch": 4.565516126424703, + "grad_norm": 1.168332815170288, + "learning_rate": 0.0002, + "loss": 0.5508, + "step": 28240 + }, + { + "epoch": 4.5671328106054485, + "grad_norm": 1.0067222118377686, + "learning_rate": 0.0002, + "loss": 0.5649, + "step": 28250 + }, + { + "epoch": 4.568749494786194, + "grad_norm": 1.0283393859863281, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 28260 + }, + { + "epoch": 4.570366178966939, + "grad_norm": 0.9912363886833191, + "learning_rate": 0.0002, + "loss": 0.5772, + "step": 28270 + }, + { + "epoch": 4.571982863147684, + "grad_norm": 1.108032464981079, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 28280 + }, + { + "epoch": 4.573599547328429, + "grad_norm": 0.8260078430175781, + "learning_rate": 0.0002, + "loss": 0.5529, + "step": 28290 + }, + { + "epoch": 4.575216231509175, + "grad_norm": 0.8946247100830078, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 28300 + }, + { + "epoch": 4.57683291568992, + "grad_norm": 0.8273587822914124, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 28310 + }, + { + "epoch": 4.578449599870665, + "grad_norm": 0.9040093421936035, + "learning_rate": 0.0002, + "loss": 0.6058, + "step": 28320 + }, + { + "epoch": 4.58006628405141, + "grad_norm": 0.8435290455818176, + "learning_rate": 0.0002, + "loss": 0.5521, + "step": 28330 + }, + { + "epoch": 4.581682968232156, + "grad_norm": 1.164088249206543, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 28340 + }, + { + "epoch": 4.583299652412901, + "grad_norm": 0.9861085414886475, + "learning_rate": 0.0002, + "loss": 0.5603, + "step": 28350 + }, + { + "epoch": 4.584916336593646, + "grad_norm": 0.8892980813980103, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 28360 + }, + { + "epoch": 4.586533020774391, + "grad_norm": 1.240574836730957, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 28370 + }, + { + "epoch": 4.588149704955137, + "grad_norm": 0.8669408559799194, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 28380 + }, + { + "epoch": 4.589766389135883, + "grad_norm": 0.9145985841751099, + "learning_rate": 0.0002, + "loss": 0.5603, + "step": 28390 + }, + { + "epoch": 4.591383073316628, + "grad_norm": 0.8584614992141724, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 28400 + }, + { + "epoch": 4.592999757497373, + "grad_norm": 1.118829369544983, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 28410 + }, + { + "epoch": 4.594616441678118, + "grad_norm": 1.1411553621292114, + "learning_rate": 0.0002, + "loss": 0.5641, + "step": 28420 + }, + { + "epoch": 4.596233125858864, + "grad_norm": 0.9433278441429138, + "learning_rate": 0.0002, + "loss": 0.549, + "step": 28430 + }, + { + "epoch": 4.597849810039609, + "grad_norm": 0.816830039024353, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 28440 + }, + { + "epoch": 4.599466494220354, + "grad_norm": 1.2124968767166138, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 28450 + }, + { + "epoch": 4.601083178401099, + "grad_norm": 0.9658762216567993, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 28460 + }, + { + "epoch": 4.6026998625818445, + "grad_norm": 0.836100161075592, + "learning_rate": 0.0002, + "loss": 0.5902, + "step": 28470 + }, + { + "epoch": 4.60431654676259, + "grad_norm": 0.9989104270935059, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 28480 + }, + { + "epoch": 4.605933230943335, + "grad_norm": 1.1298956871032715, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 28490 + }, + { + "epoch": 4.60754991512408, + "grad_norm": 1.1731704473495483, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 28500 + }, + { + "epoch": 4.609166599304825, + "grad_norm": 0.9624714255332947, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 28510 + }, + { + "epoch": 4.610783283485571, + "grad_norm": 1.364073634147644, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 28520 + }, + { + "epoch": 4.612399967666317, + "grad_norm": 1.1827356815338135, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 28530 + }, + { + "epoch": 4.614016651847062, + "grad_norm": 0.6651531457901001, + "learning_rate": 0.0002, + "loss": 0.5643, + "step": 28540 + }, + { + "epoch": 4.615633336027807, + "grad_norm": 1.1640995740890503, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 28550 + }, + { + "epoch": 4.6172500202085525, + "grad_norm": 1.028918743133545, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 28560 + }, + { + "epoch": 4.618866704389298, + "grad_norm": 0.8252120614051819, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 28570 + }, + { + "epoch": 4.620483388570043, + "grad_norm": 1.3536735773086548, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 28580 + }, + { + "epoch": 4.622100072750788, + "grad_norm": 1.2146915197372437, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 28590 + }, + { + "epoch": 4.623716756931533, + "grad_norm": 1.0122549533843994, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 28600 + }, + { + "epoch": 4.625333441112279, + "grad_norm": 0.9977872967720032, + "learning_rate": 0.0002, + "loss": 0.6072, + "step": 28610 + }, + { + "epoch": 4.626950125293024, + "grad_norm": 1.0159751176834106, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 28620 + }, + { + "epoch": 4.628566809473769, + "grad_norm": 1.0028325319290161, + "learning_rate": 0.0002, + "loss": 0.5935, + "step": 28630 + }, + { + "epoch": 4.630183493654514, + "grad_norm": 0.901638388633728, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 28640 + }, + { + "epoch": 4.6318001778352595, + "grad_norm": 0.9450507164001465, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 28650 + }, + { + "epoch": 4.633416862016006, + "grad_norm": 0.9987545013427734, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 28660 + }, + { + "epoch": 4.63503354619675, + "grad_norm": 0.9574332237243652, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 28670 + }, + { + "epoch": 4.636650230377496, + "grad_norm": 1.2215653657913208, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 28680 + }, + { + "epoch": 4.638266914558241, + "grad_norm": 0.9798858761787415, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 28690 + }, + { + "epoch": 4.639883598738987, + "grad_norm": 1.0648466348648071, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 28700 + }, + { + "epoch": 4.641500282919732, + "grad_norm": 1.0606504678726196, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 28710 + }, + { + "epoch": 4.643116967100477, + "grad_norm": 1.0892442464828491, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 28720 + }, + { + "epoch": 4.644733651281222, + "grad_norm": 0.914391040802002, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 28730 + }, + { + "epoch": 4.6463503354619675, + "grad_norm": 0.9782370328903198, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 28740 + }, + { + "epoch": 4.647967019642713, + "grad_norm": 1.0344339609146118, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 28750 + }, + { + "epoch": 4.649583703823458, + "grad_norm": 1.0513931512832642, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 28760 + }, + { + "epoch": 4.651200388004203, + "grad_norm": 0.9711475968360901, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 28770 + }, + { + "epoch": 4.652817072184948, + "grad_norm": 0.977519690990448, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 28780 + }, + { + "epoch": 4.654433756365694, + "grad_norm": 0.9150224924087524, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 28790 + }, + { + "epoch": 4.656050440546439, + "grad_norm": 1.0973542928695679, + "learning_rate": 0.0002, + "loss": 0.5382, + "step": 28800 + }, + { + "epoch": 4.657667124727185, + "grad_norm": 0.944877564907074, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 28810 + }, + { + "epoch": 4.659283808907929, + "grad_norm": 0.9508748650550842, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 28820 + }, + { + "epoch": 4.6609004930886755, + "grad_norm": 0.9681721329689026, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 28830 + }, + { + "epoch": 4.662517177269421, + "grad_norm": 1.0214351415634155, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 28840 + }, + { + "epoch": 4.664133861450166, + "grad_norm": 0.9748611450195312, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 28850 + }, + { + "epoch": 4.665750545630911, + "grad_norm": 0.8484147191047668, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 28860 + }, + { + "epoch": 4.667367229811656, + "grad_norm": 1.1252986192703247, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 28870 + }, + { + "epoch": 4.668983913992402, + "grad_norm": 0.8706206679344177, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 28880 + }, + { + "epoch": 4.670600598173147, + "grad_norm": 1.1432424783706665, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 28890 + }, + { + "epoch": 4.672217282353892, + "grad_norm": 1.017029047012329, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 28900 + }, + { + "epoch": 4.673833966534637, + "grad_norm": 1.085597038269043, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 28910 + }, + { + "epoch": 4.675450650715383, + "grad_norm": 0.9275796413421631, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 28920 + }, + { + "epoch": 4.677067334896128, + "grad_norm": 0.9518964886665344, + "learning_rate": 0.0002, + "loss": 0.5603, + "step": 28930 + }, + { + "epoch": 4.678684019076873, + "grad_norm": 1.0352122783660889, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 28940 + }, + { + "epoch": 4.680300703257618, + "grad_norm": 1.090124249458313, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 28950 + }, + { + "epoch": 4.681917387438364, + "grad_norm": 0.8799563050270081, + "learning_rate": 0.0002, + "loss": 0.5728, + "step": 28960 + }, + { + "epoch": 4.683534071619109, + "grad_norm": 1.0929821729660034, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 28970 + }, + { + "epoch": 4.685150755799855, + "grad_norm": 0.903727650642395, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 28980 + }, + { + "epoch": 4.6867674399806, + "grad_norm": 0.9752424955368042, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 28990 + }, + { + "epoch": 4.688384124161345, + "grad_norm": 0.9351571202278137, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 29000 + }, + { + "epoch": 4.6900008083420905, + "grad_norm": 0.923877477645874, + "learning_rate": 0.0002, + "loss": 0.5811, + "step": 29010 + }, + { + "epoch": 4.691617492522836, + "grad_norm": 1.045389175415039, + "learning_rate": 0.0002, + "loss": 0.5682, + "step": 29020 + }, + { + "epoch": 4.693234176703581, + "grad_norm": 1.0200831890106201, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 29030 + }, + { + "epoch": 4.694850860884326, + "grad_norm": 1.1499706506729126, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 29040 + }, + { + "epoch": 4.6964675450650715, + "grad_norm": 0.860118567943573, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 29050 + }, + { + "epoch": 4.698084229245817, + "grad_norm": 0.9774864315986633, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 29060 + }, + { + "epoch": 4.699700913426562, + "grad_norm": 1.0323210954666138, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 29070 + }, + { + "epoch": 4.701317597607307, + "grad_norm": 0.8492481112480164, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 29080 + }, + { + "epoch": 4.702934281788052, + "grad_norm": 1.131951093673706, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 29090 + }, + { + "epoch": 4.704550965968798, + "grad_norm": 0.8763113021850586, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 29100 + }, + { + "epoch": 4.706167650149544, + "grad_norm": 1.045028805732727, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 29110 + }, + { + "epoch": 4.707784334330288, + "grad_norm": 0.9961401224136353, + "learning_rate": 0.0002, + "loss": 0.5548, + "step": 29120 + }, + { + "epoch": 4.709401018511034, + "grad_norm": 0.9282503724098206, + "learning_rate": 0.0002, + "loss": 0.559, + "step": 29130 + }, + { + "epoch": 4.711017702691779, + "grad_norm": 1.1418932676315308, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 29140 + }, + { + "epoch": 4.712634386872525, + "grad_norm": 0.9950099587440491, + "learning_rate": 0.0002, + "loss": 0.5394, + "step": 29150 + }, + { + "epoch": 4.71425107105327, + "grad_norm": 0.8304893374443054, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 29160 + }, + { + "epoch": 4.715867755234015, + "grad_norm": 1.115626335144043, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 29170 + }, + { + "epoch": 4.71748443941476, + "grad_norm": 1.079818606376648, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 29180 + }, + { + "epoch": 4.719101123595506, + "grad_norm": 1.1929082870483398, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 29190 + }, + { + "epoch": 4.720717807776251, + "grad_norm": 0.9621080756187439, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 29200 + }, + { + "epoch": 4.722334491956996, + "grad_norm": 0.8549222350120544, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 29210 + }, + { + "epoch": 4.723951176137741, + "grad_norm": 0.9341941475868225, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 29220 + }, + { + "epoch": 4.7255678603184865, + "grad_norm": 1.075406789779663, + "learning_rate": 0.0002, + "loss": 0.5436, + "step": 29230 + }, + { + "epoch": 4.727184544499232, + "grad_norm": 1.0859880447387695, + "learning_rate": 0.0002, + "loss": 0.576, + "step": 29240 + }, + { + "epoch": 4.728801228679977, + "grad_norm": 0.8475605249404907, + "learning_rate": 0.0002, + "loss": 0.5525, + "step": 29250 + }, + { + "epoch": 4.730417912860723, + "grad_norm": 0.9331845641136169, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 29260 + }, + { + "epoch": 4.7320345970414674, + "grad_norm": 0.9279314279556274, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 29270 + }, + { + "epoch": 4.733651281222214, + "grad_norm": 0.7803558707237244, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 29280 + }, + { + "epoch": 4.735267965402959, + "grad_norm": 1.0159329175949097, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 29290 + }, + { + "epoch": 4.736884649583704, + "grad_norm": 0.9448670744895935, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 29300 + }, + { + "epoch": 4.738501333764449, + "grad_norm": 1.0732197761535645, + "learning_rate": 0.0002, + "loss": 0.574, + "step": 29310 + }, + { + "epoch": 4.7401180179451945, + "grad_norm": 0.901830792427063, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 29320 + }, + { + "epoch": 4.74173470212594, + "grad_norm": 0.9141789674758911, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 29330 + }, + { + "epoch": 4.743351386306685, + "grad_norm": 0.9733418226242065, + "learning_rate": 0.0002, + "loss": 0.5481, + "step": 29340 + }, + { + "epoch": 4.74496807048743, + "grad_norm": 0.909810483455658, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 29350 + }, + { + "epoch": 4.746584754668175, + "grad_norm": 0.909541666507721, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 29360 + }, + { + "epoch": 4.748201438848921, + "grad_norm": 0.9383015632629395, + "learning_rate": 0.0002, + "loss": 0.5579, + "step": 29370 + }, + { + "epoch": 4.749818123029666, + "grad_norm": 0.9275668263435364, + "learning_rate": 0.0002, + "loss": 0.5529, + "step": 29380 + }, + { + "epoch": 4.751434807210411, + "grad_norm": 1.1146225929260254, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 29390 + }, + { + "epoch": 4.753051491391156, + "grad_norm": 1.0062453746795654, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 29400 + }, + { + "epoch": 4.7546681755719025, + "grad_norm": 0.9451895952224731, + "learning_rate": 0.0002, + "loss": 0.5872, + "step": 29410 + }, + { + "epoch": 4.756284859752648, + "grad_norm": 0.870457649230957, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 29420 + }, + { + "epoch": 4.757901543933393, + "grad_norm": 1.0411282777786255, + "learning_rate": 0.0002, + "loss": 0.57, + "step": 29430 + }, + { + "epoch": 4.759518228114138, + "grad_norm": 1.1648986339569092, + "learning_rate": 0.0002, + "loss": 0.5688, + "step": 29440 + }, + { + "epoch": 4.761134912294883, + "grad_norm": 0.8999572992324829, + "learning_rate": 0.0002, + "loss": 0.5432, + "step": 29450 + }, + { + "epoch": 4.762751596475629, + "grad_norm": 0.9863559007644653, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 29460 + }, + { + "epoch": 4.764368280656374, + "grad_norm": 0.9676542282104492, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 29470 + }, + { + "epoch": 4.765984964837119, + "grad_norm": 1.004775047302246, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 29480 + }, + { + "epoch": 4.767601649017864, + "grad_norm": 1.0937515497207642, + "learning_rate": 0.0002, + "loss": 0.6044, + "step": 29490 + }, + { + "epoch": 4.7692183331986095, + "grad_norm": 0.9551598429679871, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 29500 + }, + { + "epoch": 4.770835017379355, + "grad_norm": 1.0757228136062622, + "learning_rate": 0.0002, + "loss": 0.5609, + "step": 29510 + }, + { + "epoch": 4.7724517015601, + "grad_norm": 1.0588841438293457, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 29520 + }, + { + "epoch": 4.774068385740845, + "grad_norm": 1.0744032859802246, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 29530 + }, + { + "epoch": 4.7756850699215905, + "grad_norm": 1.0066277980804443, + "learning_rate": 0.0002, + "loss": 0.5681, + "step": 29540 + }, + { + "epoch": 4.777301754102336, + "grad_norm": 1.082319736480713, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 29550 + }, + { + "epoch": 4.778918438283082, + "grad_norm": 0.8252472877502441, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 29560 + }, + { + "epoch": 4.780535122463827, + "grad_norm": 0.9855340123176575, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 29570 + }, + { + "epoch": 4.782151806644572, + "grad_norm": 0.9991421699523926, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 29580 + }, + { + "epoch": 4.7837684908253175, + "grad_norm": 1.316841959953308, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 29590 + }, + { + "epoch": 4.785385175006063, + "grad_norm": 1.1513035297393799, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 29600 + }, + { + "epoch": 4.787001859186808, + "grad_norm": 0.9767683744430542, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 29610 + }, + { + "epoch": 4.788618543367553, + "grad_norm": 0.9786278605461121, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 29620 + }, + { + "epoch": 4.7902352275482984, + "grad_norm": 0.8004973530769348, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 29630 + }, + { + "epoch": 4.791851911729044, + "grad_norm": 1.0997767448425293, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 29640 + }, + { + "epoch": 4.793468595909789, + "grad_norm": 0.9752856492996216, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 29650 + }, + { + "epoch": 4.795085280090534, + "grad_norm": 1.0518392324447632, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 29660 + }, + { + "epoch": 4.796701964271279, + "grad_norm": 1.1050055027008057, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 29670 + }, + { + "epoch": 4.798318648452025, + "grad_norm": 0.9933857917785645, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 29680 + }, + { + "epoch": 4.79993533263277, + "grad_norm": 1.2804018259048462, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 29690 + }, + { + "epoch": 4.801552016813515, + "grad_norm": 1.0133371353149414, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 29700 + }, + { + "epoch": 4.803168700994261, + "grad_norm": 1.080350637435913, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 29710 + }, + { + "epoch": 4.804785385175006, + "grad_norm": 0.9986529350280762, + "learning_rate": 0.0002, + "loss": 0.5603, + "step": 29720 + }, + { + "epoch": 4.806402069355752, + "grad_norm": 0.975665807723999, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 29730 + }, + { + "epoch": 4.808018753536497, + "grad_norm": 0.8458138704299927, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 29740 + }, + { + "epoch": 4.809635437717242, + "grad_norm": 0.99330073595047, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 29750 + }, + { + "epoch": 4.811252121897987, + "grad_norm": 0.898274302482605, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 29760 + }, + { + "epoch": 4.812868806078733, + "grad_norm": 1.0504480600357056, + "learning_rate": 0.0002, + "loss": 0.5842, + "step": 29770 + }, + { + "epoch": 4.814485490259478, + "grad_norm": 0.937919020652771, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 29780 + }, + { + "epoch": 4.816102174440223, + "grad_norm": 0.9593307971954346, + "learning_rate": 0.0002, + "loss": 0.5885, + "step": 29790 + }, + { + "epoch": 4.817718858620968, + "grad_norm": 0.9431198835372925, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 29800 + }, + { + "epoch": 4.8193355428017135, + "grad_norm": 1.2729957103729248, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 29810 + }, + { + "epoch": 4.820952226982459, + "grad_norm": 0.8876838684082031, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 29820 + }, + { + "epoch": 4.822568911163204, + "grad_norm": 1.0185000896453857, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 29830 + }, + { + "epoch": 4.824185595343949, + "grad_norm": 1.064276099205017, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 29840 + }, + { + "epoch": 4.825802279524694, + "grad_norm": 0.9774803519248962, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 29850 + }, + { + "epoch": 4.8274189637054405, + "grad_norm": 1.131646990776062, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 29860 + }, + { + "epoch": 4.829035647886186, + "grad_norm": 1.081455945968628, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 29870 + }, + { + "epoch": 4.830652332066931, + "grad_norm": 0.990538477897644, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 29880 + }, + { + "epoch": 4.832269016247676, + "grad_norm": 0.9750600457191467, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 29890 + }, + { + "epoch": 4.8338857004284215, + "grad_norm": 1.0600621700286865, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 29900 + }, + { + "epoch": 4.835502384609167, + "grad_norm": 0.9237320423126221, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 29910 + }, + { + "epoch": 4.837119068789912, + "grad_norm": 0.9739177227020264, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 29920 + }, + { + "epoch": 4.838735752970657, + "grad_norm": 1.128677248954773, + "learning_rate": 0.0002, + "loss": 0.587, + "step": 29930 + }, + { + "epoch": 4.840352437151402, + "grad_norm": 1.042604923248291, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 29940 + }, + { + "epoch": 4.841969121332148, + "grad_norm": 0.849758505821228, + "learning_rate": 0.0002, + "loss": 0.5885, + "step": 29950 + }, + { + "epoch": 4.843585805512893, + "grad_norm": 1.2809888124465942, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 29960 + }, + { + "epoch": 4.845202489693638, + "grad_norm": 1.0177865028381348, + "learning_rate": 0.0002, + "loss": 0.5703, + "step": 29970 + }, + { + "epoch": 4.846819173874383, + "grad_norm": 1.0026639699935913, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 29980 + }, + { + "epoch": 4.8484358580551286, + "grad_norm": 0.9679505228996277, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 29990 + }, + { + "epoch": 4.850052542235874, + "grad_norm": 0.8939532041549683, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 30000 + }, + { + "epoch": 4.85166922641662, + "grad_norm": 0.9957457780838013, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 30010 + }, + { + "epoch": 4.853285910597365, + "grad_norm": 1.1646790504455566, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 30020 + }, + { + "epoch": 4.85490259477811, + "grad_norm": 0.8804680705070496, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 30030 + }, + { + "epoch": 4.856519278958856, + "grad_norm": 1.161970853805542, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 30040 + }, + { + "epoch": 4.858135963139601, + "grad_norm": 0.9081037640571594, + "learning_rate": 0.0002, + "loss": 0.5552, + "step": 30050 + }, + { + "epoch": 4.859752647320346, + "grad_norm": 0.9402848482131958, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 30060 + }, + { + "epoch": 4.861369331501091, + "grad_norm": 0.9023865461349487, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 30070 + }, + { + "epoch": 4.8629860156818365, + "grad_norm": 1.0173414945602417, + "learning_rate": 0.0002, + "loss": 0.5926, + "step": 30080 + }, + { + "epoch": 4.864602699862582, + "grad_norm": 1.084402322769165, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 30090 + }, + { + "epoch": 4.866219384043327, + "grad_norm": 0.9577937126159668, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 30100 + }, + { + "epoch": 4.867836068224072, + "grad_norm": 0.9807606935501099, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 30110 + }, + { + "epoch": 4.8694527524048175, + "grad_norm": 0.978784441947937, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 30120 + }, + { + "epoch": 4.871069436585563, + "grad_norm": 0.9762914776802063, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 30130 + }, + { + "epoch": 4.872686120766308, + "grad_norm": 0.9404871463775635, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 30140 + }, + { + "epoch": 4.874302804947053, + "grad_norm": 1.0069509744644165, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 30150 + }, + { + "epoch": 4.875919489127799, + "grad_norm": 1.1770923137664795, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 30160 + }, + { + "epoch": 4.8775361733085445, + "grad_norm": 1.021210789680481, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 30170 + }, + { + "epoch": 4.87915285748929, + "grad_norm": 0.8512648940086365, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 30180 + }, + { + "epoch": 4.880769541670035, + "grad_norm": 0.9345870018005371, + "learning_rate": 0.0002, + "loss": 0.5519, + "step": 30190 + }, + { + "epoch": 4.88238622585078, + "grad_norm": 1.0224418640136719, + "learning_rate": 0.0002, + "loss": 0.5682, + "step": 30200 + }, + { + "epoch": 4.884002910031525, + "grad_norm": 1.0316044092178345, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 30210 + }, + { + "epoch": 4.885619594212271, + "grad_norm": 1.102437973022461, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 30220 + }, + { + "epoch": 4.887236278393016, + "grad_norm": 1.0220023393630981, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 30230 + }, + { + "epoch": 4.888852962573761, + "grad_norm": 1.0934523344039917, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 30240 + }, + { + "epoch": 4.890469646754506, + "grad_norm": 1.264630913734436, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 30250 + }, + { + "epoch": 4.892086330935252, + "grad_norm": 1.0999879837036133, + "learning_rate": 0.0002, + "loss": 0.5712, + "step": 30260 + }, + { + "epoch": 4.893703015115997, + "grad_norm": 0.9124550223350525, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 30270 + }, + { + "epoch": 4.895319699296742, + "grad_norm": 0.9853624105453491, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 30280 + }, + { + "epoch": 4.896936383477488, + "grad_norm": 1.0589802265167236, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 30290 + }, + { + "epoch": 4.8985530676582325, + "grad_norm": 0.8487226366996765, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 30300 + }, + { + "epoch": 4.900169751838979, + "grad_norm": 1.0212191343307495, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 30310 + }, + { + "epoch": 4.901786436019724, + "grad_norm": 1.0187491178512573, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 30320 + }, + { + "epoch": 4.903403120200469, + "grad_norm": 1.0013091564178467, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 30330 + }, + { + "epoch": 4.905019804381214, + "grad_norm": 1.0017542839050293, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 30340 + }, + { + "epoch": 4.9066364885619596, + "grad_norm": 0.9665151238441467, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 30350 + }, + { + "epoch": 4.908253172742705, + "grad_norm": 0.8774822950363159, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 30360 + }, + { + "epoch": 4.90986985692345, + "grad_norm": 0.9449850916862488, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 30370 + }, + { + "epoch": 4.911486541104195, + "grad_norm": 0.7368341088294983, + "learning_rate": 0.0002, + "loss": 0.58, + "step": 30380 + }, + { + "epoch": 4.9131032252849405, + "grad_norm": 0.9669167995452881, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 30390 + }, + { + "epoch": 4.914719909465686, + "grad_norm": 1.1227794885635376, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 30400 + }, + { + "epoch": 4.916336593646431, + "grad_norm": 0.9884361028671265, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 30410 + }, + { + "epoch": 4.917953277827176, + "grad_norm": 0.9949551224708557, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 30420 + }, + { + "epoch": 4.919569962007921, + "grad_norm": 0.9491621851921082, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 30430 + }, + { + "epoch": 4.9211866461886675, + "grad_norm": 0.78848797082901, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 30440 + }, + { + "epoch": 4.922803330369412, + "grad_norm": 1.0693835020065308, + "learning_rate": 0.0002, + "loss": 0.5561, + "step": 30450 + }, + { + "epoch": 4.924420014550158, + "grad_norm": 0.9573729634284973, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 30460 + }, + { + "epoch": 4.926036698730903, + "grad_norm": 0.9975152611732483, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 30470 + }, + { + "epoch": 4.9276533829116484, + "grad_norm": 0.8695693016052246, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 30480 + }, + { + "epoch": 4.929270067092394, + "grad_norm": 1.145394206047058, + "learning_rate": 0.0002, + "loss": 0.6144, + "step": 30490 + }, + { + "epoch": 4.930886751273139, + "grad_norm": 0.7668989896774292, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 30500 + }, + { + "epoch": 4.932503435453884, + "grad_norm": 0.9630151391029358, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 30510 + }, + { + "epoch": 4.934120119634629, + "grad_norm": 0.940705418586731, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 30520 + }, + { + "epoch": 4.935736803815375, + "grad_norm": 1.3243348598480225, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 30530 + }, + { + "epoch": 4.93735348799612, + "grad_norm": 1.004347801208496, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 30540 + }, + { + "epoch": 4.938970172176865, + "grad_norm": 0.8711541295051575, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 30550 + }, + { + "epoch": 4.94058685635761, + "grad_norm": 0.8980631828308105, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 30560 + }, + { + "epoch": 4.9422035405383555, + "grad_norm": 0.8388893604278564, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 30570 + }, + { + "epoch": 4.943820224719101, + "grad_norm": 1.0991183519363403, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 30580 + }, + { + "epoch": 4.945436908899847, + "grad_norm": 0.9731075763702393, + "learning_rate": 0.0002, + "loss": 0.5906, + "step": 30590 + }, + { + "epoch": 4.947053593080591, + "grad_norm": 1.3904452323913574, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 30600 + }, + { + "epoch": 4.948670277261337, + "grad_norm": 1.2489882707595825, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 30610 + }, + { + "epoch": 4.950286961442083, + "grad_norm": 1.240072250366211, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 30620 + }, + { + "epoch": 4.951903645622828, + "grad_norm": 0.9191411733627319, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 30630 + }, + { + "epoch": 4.953520329803573, + "grad_norm": 0.8888895511627197, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 30640 + }, + { + "epoch": 4.955137013984318, + "grad_norm": 0.9001450538635254, + "learning_rate": 0.0002, + "loss": 0.6594, + "step": 30650 + }, + { + "epoch": 4.9567536981650635, + "grad_norm": 1.053971767425537, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 30660 + }, + { + "epoch": 4.958370382345809, + "grad_norm": 1.2224042415618896, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 30670 + }, + { + "epoch": 4.959987066526554, + "grad_norm": 0.8855111598968506, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 30680 + }, + { + "epoch": 4.961603750707299, + "grad_norm": 0.9489575624465942, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 30690 + }, + { + "epoch": 4.963220434888044, + "grad_norm": 0.9635404944419861, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 30700 + }, + { + "epoch": 4.96483711906879, + "grad_norm": 1.1784121990203857, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 30710 + }, + { + "epoch": 4.966453803249535, + "grad_norm": 1.0059462785720825, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 30720 + }, + { + "epoch": 4.96807048743028, + "grad_norm": 0.9479738473892212, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 30730 + }, + { + "epoch": 4.969687171611026, + "grad_norm": 1.0624593496322632, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 30740 + }, + { + "epoch": 4.971303855791771, + "grad_norm": 1.1429259777069092, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 30750 + }, + { + "epoch": 4.972920539972517, + "grad_norm": 0.9102491140365601, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 30760 + }, + { + "epoch": 4.974537224153262, + "grad_norm": 1.1262688636779785, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 30770 + }, + { + "epoch": 4.976153908334007, + "grad_norm": 1.1415393352508545, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 30780 + }, + { + "epoch": 4.977770592514752, + "grad_norm": 1.083078384399414, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 30790 + }, + { + "epoch": 4.979387276695498, + "grad_norm": 0.964859127998352, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 30800 + }, + { + "epoch": 4.981003960876243, + "grad_norm": 0.8704743385314941, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 30810 + }, + { + "epoch": 4.982620645056988, + "grad_norm": 1.0714856386184692, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 30820 + }, + { + "epoch": 4.984237329237733, + "grad_norm": 0.6818771362304688, + "learning_rate": 0.0002, + "loss": 0.565, + "step": 30830 + }, + { + "epoch": 4.985854013418479, + "grad_norm": 1.0454156398773193, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 30840 + }, + { + "epoch": 4.987470697599224, + "grad_norm": 0.9410776495933533, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 30850 + }, + { + "epoch": 4.989087381779969, + "grad_norm": 1.0878902673721313, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 30860 + }, + { + "epoch": 4.990704065960714, + "grad_norm": 0.8916727304458618, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 30870 + }, + { + "epoch": 4.9923207501414595, + "grad_norm": 1.045776128768921, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 30880 + }, + { + "epoch": 4.993937434322206, + "grad_norm": 0.9861903786659241, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 30890 + }, + { + "epoch": 4.995554118502951, + "grad_norm": 0.9275050759315491, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 30900 + }, + { + "epoch": 4.997170802683696, + "grad_norm": 0.94013911485672, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 30910 + }, + { + "epoch": 4.998787486864441, + "grad_norm": 0.9771268367767334, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 30920 + }, + { + "epoch": 4.9999191657909625, + "eval_loss": 1.1968598365783691, + "eval_runtime": 122.2519, + "eval_samples_per_second": 5.996, + "eval_steps_per_second": 0.753, + "step": 30927 + } + ], + "logging_steps": 10, + "max_steps": 49480, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.431255450522747e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..155b12fa9acbc6e71dba75c92bfa79e152397ebf --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-30927/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28694d5564a2b5c7d6881d4ba2af103356aa22489d2c22768ebbe47283c0f4a1 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bf74e91baddd9bf12e2d15e86e17b59b8709c514 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae20f88a8da1f25d309771826b573d955d59bf638bfd7ebe56d06432eb2bcd57 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..000e7d4eeba61672fe5b70b5762a146650730c8a --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d770a15916660cc35507ef27edaa08ece197ee3ae29b8d7c8422d0dc1def1a85 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f6f721479c01f456f61ca73426e82ddb0ae7df41 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59aa160eb003909b59eb576bcfcd615685360c81e70469d31aeea715df40d745 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..10639d46460f861cbf1977ce29feebed3e09bd6f --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a13545656639db3126c168548c9c958ee9173553480b5b708d0d8b5933af97ad +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..da5362fe48a08695048e61a59e11aefcd7700e25 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/trainer_state.json @@ -0,0 +1,26058 @@ +{ + "best_metric": 1.0871200561523438, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 37113, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016166841807452913, + "grad_norm": 0.9894065856933594, + "learning_rate": 0.0002, + "loss": 1.6636, + "step": 10 + }, + { + "epoch": 0.0032333683614905826, + "grad_norm": 1.7810699939727783, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 20 + }, + { + "epoch": 0.004850052542235874, + "grad_norm": 0.5969577431678772, + "learning_rate": 0.0002, + "loss": 0.9767, + "step": 30 + }, + { + "epoch": 0.006466736722981165, + "grad_norm": 0.6354120969772339, + "learning_rate": 0.0002, + "loss": 0.9772, + "step": 40 + }, + { + "epoch": 0.008083420903726457, + "grad_norm": 0.5604607462882996, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 50 + }, + { + "epoch": 0.009700105084471748, + "grad_norm": 0.4676193594932556, + "learning_rate": 0.0002, + "loss": 0.8841, + "step": 60 + }, + { + "epoch": 0.01131678926521704, + "grad_norm": 0.6099211573600769, + "learning_rate": 0.0002, + "loss": 0.9022, + "step": 70 + }, + { + "epoch": 0.01293347344596233, + "grad_norm": 0.48639994859695435, + "learning_rate": 0.0002, + "loss": 0.9133, + "step": 80 + }, + { + "epoch": 0.014550157626707623, + "grad_norm": 0.4904264509677887, + "learning_rate": 0.0002, + "loss": 0.8704, + "step": 90 + }, + { + "epoch": 0.016166841807452915, + "grad_norm": 2.8334362506866455, + "learning_rate": 0.0002, + "loss": 0.8855, + "step": 100 + }, + { + "epoch": 0.017783525988198205, + "grad_norm": 0.43221670389175415, + "learning_rate": 0.0002, + "loss": 0.8958, + "step": 110 + }, + { + "epoch": 0.019400210168943496, + "grad_norm": 0.42244166135787964, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 120 + }, + { + "epoch": 0.02101689434968879, + "grad_norm": 0.45363298058509827, + "learning_rate": 0.0002, + "loss": 0.8467, + "step": 130 + }, + { + "epoch": 0.02263357853043408, + "grad_norm": 0.44816508889198303, + "learning_rate": 0.0002, + "loss": 0.8641, + "step": 140 + }, + { + "epoch": 0.02425026271117937, + "grad_norm": 0.43308213353157043, + "learning_rate": 0.0002, + "loss": 0.8496, + "step": 150 + }, + { + "epoch": 0.02586694689192466, + "grad_norm": 0.4084763526916504, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 160 + }, + { + "epoch": 0.027483631072669955, + "grad_norm": 0.5363703966140747, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 170 + }, + { + "epoch": 0.029100315253415245, + "grad_norm": 0.4619699716567993, + "learning_rate": 0.0002, + "loss": 0.8558, + "step": 180 + }, + { + "epoch": 0.030716999434160536, + "grad_norm": 0.49069908261299133, + "learning_rate": 0.0002, + "loss": 0.8878, + "step": 190 + }, + { + "epoch": 0.03233368361490583, + "grad_norm": 0.4645835757255554, + "learning_rate": 0.0002, + "loss": 0.8867, + "step": 200 + }, + { + "epoch": 0.03395036779565112, + "grad_norm": 1.2411243915557861, + "learning_rate": 0.0002, + "loss": 0.8842, + "step": 210 + }, + { + "epoch": 0.03556705197639641, + "grad_norm": 0.5211851596832275, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 220 + }, + { + "epoch": 0.037183736157141704, + "grad_norm": 0.5253691673278809, + "learning_rate": 0.0002, + "loss": 0.8194, + "step": 230 + }, + { + "epoch": 0.03880042033788699, + "grad_norm": 0.4567478895187378, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 240 + }, + { + "epoch": 0.040417104518632285, + "grad_norm": 0.5472128391265869, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 250 + }, + { + "epoch": 0.04203378869937758, + "grad_norm": 0.42978546023368835, + "learning_rate": 0.0002, + "loss": 0.8201, + "step": 260 + }, + { + "epoch": 0.043650472880122866, + "grad_norm": 0.601734459400177, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 270 + }, + { + "epoch": 0.04526715706086816, + "grad_norm": 0.4286513328552246, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 280 + }, + { + "epoch": 0.046883841241613454, + "grad_norm": 0.5230861902236938, + "learning_rate": 0.0002, + "loss": 0.8758, + "step": 290 + }, + { + "epoch": 0.04850052542235874, + "grad_norm": 0.6504611968994141, + "learning_rate": 0.0002, + "loss": 0.8636, + "step": 300 + }, + { + "epoch": 0.050117209603104035, + "grad_norm": 0.43485215306282043, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 310 + }, + { + "epoch": 0.05173389378384932, + "grad_norm": 0.4717007875442505, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 320 + }, + { + "epoch": 0.053350577964594616, + "grad_norm": 0.4059787690639496, + "learning_rate": 0.0002, + "loss": 0.8469, + "step": 330 + }, + { + "epoch": 0.05496726214533991, + "grad_norm": 0.4366913437843323, + "learning_rate": 0.0002, + "loss": 0.8866, + "step": 340 + }, + { + "epoch": 0.0565839463260852, + "grad_norm": 0.4233848452568054, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 350 + }, + { + "epoch": 0.05820063050683049, + "grad_norm": 0.4209108352661133, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 360 + }, + { + "epoch": 0.059817314687575784, + "grad_norm": 0.41637396812438965, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 370 + }, + { + "epoch": 0.06143399886832107, + "grad_norm": 0.46235376596450806, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 380 + }, + { + "epoch": 0.06305068304906636, + "grad_norm": 0.4013484716415405, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 390 + }, + { + "epoch": 0.06466736722981166, + "grad_norm": 0.47443896532058716, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 400 + }, + { + "epoch": 0.06628405141055695, + "grad_norm": 0.3942156434059143, + "learning_rate": 0.0002, + "loss": 0.8666, + "step": 410 + }, + { + "epoch": 0.06790073559130223, + "grad_norm": 0.4965320825576782, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 420 + }, + { + "epoch": 0.06951741977204753, + "grad_norm": 0.4304835796356201, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 430 + }, + { + "epoch": 0.07113410395279282, + "grad_norm": 0.511726975440979, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 440 + }, + { + "epoch": 0.07275078813353811, + "grad_norm": 0.4040689170360565, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 450 + }, + { + "epoch": 0.07436747231428341, + "grad_norm": 0.5402171015739441, + "learning_rate": 0.0002, + "loss": 0.8788, + "step": 460 + }, + { + "epoch": 0.0759841564950287, + "grad_norm": 0.4174517095088959, + "learning_rate": 0.0002, + "loss": 0.8737, + "step": 470 + }, + { + "epoch": 0.07760084067577398, + "grad_norm": 0.4306182265281677, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 480 + }, + { + "epoch": 0.07921752485651928, + "grad_norm": 0.535210132598877, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 490 + }, + { + "epoch": 0.08083420903726457, + "grad_norm": 0.5339109897613525, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 500 + }, + { + "epoch": 0.08245089321800986, + "grad_norm": 0.45754891633987427, + "learning_rate": 0.0002, + "loss": 0.8985, + "step": 510 + }, + { + "epoch": 0.08406757739875516, + "grad_norm": 0.43820783495903015, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 520 + }, + { + "epoch": 0.08568426157950045, + "grad_norm": 0.4434749186038971, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 530 + }, + { + "epoch": 0.08730094576024573, + "grad_norm": 0.43111467361450195, + "learning_rate": 0.0002, + "loss": 0.7857, + "step": 540 + }, + { + "epoch": 0.08891762994099103, + "grad_norm": 0.4378940165042877, + "learning_rate": 0.0002, + "loss": 0.8418, + "step": 550 + }, + { + "epoch": 0.09053431412173632, + "grad_norm": 0.4772215187549591, + "learning_rate": 0.0002, + "loss": 0.8361, + "step": 560 + }, + { + "epoch": 0.09215099830248161, + "grad_norm": 0.6837629079818726, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 570 + }, + { + "epoch": 0.09376768248322691, + "grad_norm": 0.42241212725639343, + "learning_rate": 0.0002, + "loss": 0.8607, + "step": 580 + }, + { + "epoch": 0.0953843666639722, + "grad_norm": 0.5165936350822449, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 590 + }, + { + "epoch": 0.09700105084471748, + "grad_norm": 0.48737478256225586, + "learning_rate": 0.0002, + "loss": 0.8664, + "step": 600 + }, + { + "epoch": 0.09861773502546278, + "grad_norm": 0.47419852018356323, + "learning_rate": 0.0002, + "loss": 0.8806, + "step": 610 + }, + { + "epoch": 0.10023441920620807, + "grad_norm": 0.4975486099720001, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 620 + }, + { + "epoch": 0.10185110338695336, + "grad_norm": 0.49123844504356384, + "learning_rate": 0.0002, + "loss": 0.8548, + "step": 630 + }, + { + "epoch": 0.10346778756769864, + "grad_norm": 0.6288952827453613, + "learning_rate": 0.0002, + "loss": 0.8911, + "step": 640 + }, + { + "epoch": 0.10508447174844394, + "grad_norm": 0.4277345836162567, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 650 + }, + { + "epoch": 0.10670115592918923, + "grad_norm": 0.4021061956882477, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 660 + }, + { + "epoch": 0.10831784010993452, + "grad_norm": 0.3492237329483032, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 670 + }, + { + "epoch": 0.10993452429067982, + "grad_norm": 0.4341012239456177, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 680 + }, + { + "epoch": 0.1115512084714251, + "grad_norm": 0.7296304106712341, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 690 + }, + { + "epoch": 0.1131678926521704, + "grad_norm": 0.397494912147522, + "learning_rate": 0.0002, + "loss": 0.8433, + "step": 700 + }, + { + "epoch": 0.1147845768329157, + "grad_norm": 0.396431028842926, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 710 + }, + { + "epoch": 0.11640126101366098, + "grad_norm": 0.48842838406562805, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 720 + }, + { + "epoch": 0.11801794519440627, + "grad_norm": 0.46322616934776306, + "learning_rate": 0.0002, + "loss": 0.8238, + "step": 730 + }, + { + "epoch": 0.11963462937515157, + "grad_norm": 0.47990912199020386, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 740 + }, + { + "epoch": 0.12125131355589686, + "grad_norm": 0.4997142255306244, + "learning_rate": 0.0002, + "loss": 0.82, + "step": 750 + }, + { + "epoch": 0.12286799773664214, + "grad_norm": 0.4040526747703552, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 760 + }, + { + "epoch": 0.12448468191738744, + "grad_norm": 0.453095942735672, + "learning_rate": 0.0002, + "loss": 0.863, + "step": 770 + }, + { + "epoch": 0.12610136609813272, + "grad_norm": 0.4636971950531006, + "learning_rate": 0.0002, + "loss": 0.8792, + "step": 780 + }, + { + "epoch": 0.12771805027887803, + "grad_norm": 0.4279276132583618, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 790 + }, + { + "epoch": 0.12933473445962332, + "grad_norm": 0.46212655305862427, + "learning_rate": 0.0002, + "loss": 0.8711, + "step": 800 + }, + { + "epoch": 0.1309514186403686, + "grad_norm": 0.43127650022506714, + "learning_rate": 0.0002, + "loss": 0.8368, + "step": 810 + }, + { + "epoch": 0.1325681028211139, + "grad_norm": 0.4201301336288452, + "learning_rate": 0.0002, + "loss": 0.8476, + "step": 820 + }, + { + "epoch": 0.13418478700185918, + "grad_norm": 0.42583167552948, + "learning_rate": 0.0002, + "loss": 0.8078, + "step": 830 + }, + { + "epoch": 0.13580147118260447, + "grad_norm": 0.4535622000694275, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 840 + }, + { + "epoch": 0.13741815536334978, + "grad_norm": 0.4116036891937256, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 850 + }, + { + "epoch": 0.13903483954409507, + "grad_norm": 0.45997580885887146, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 860 + }, + { + "epoch": 0.14065152372484035, + "grad_norm": 0.4487837255001068, + "learning_rate": 0.0002, + "loss": 0.8917, + "step": 870 + }, + { + "epoch": 0.14226820790558564, + "grad_norm": 0.43650057911872864, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 880 + }, + { + "epoch": 0.14388489208633093, + "grad_norm": 0.5335358381271362, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 890 + }, + { + "epoch": 0.14550157626707622, + "grad_norm": 0.5989000201225281, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 900 + }, + { + "epoch": 0.14711826044782153, + "grad_norm": 0.517179012298584, + "learning_rate": 0.0002, + "loss": 0.8385, + "step": 910 + }, + { + "epoch": 0.14873494462856682, + "grad_norm": 0.44435232877731323, + "learning_rate": 0.0002, + "loss": 0.8255, + "step": 920 + }, + { + "epoch": 0.1503516288093121, + "grad_norm": 0.42635923624038696, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 930 + }, + { + "epoch": 0.1519683129900574, + "grad_norm": 0.49603334069252014, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 940 + }, + { + "epoch": 0.15358499717080268, + "grad_norm": 0.40639808773994446, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 950 + }, + { + "epoch": 0.15520168135154797, + "grad_norm": 0.4850759208202362, + "learning_rate": 0.0002, + "loss": 0.8529, + "step": 960 + }, + { + "epoch": 0.15681836553229328, + "grad_norm": 0.4427442252635956, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 970 + }, + { + "epoch": 0.15843504971303857, + "grad_norm": 0.3760930001735687, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 980 + }, + { + "epoch": 0.16005173389378385, + "grad_norm": 0.4794144332408905, + "learning_rate": 0.0002, + "loss": 0.8644, + "step": 990 + }, + { + "epoch": 0.16166841807452914, + "grad_norm": 0.45828768610954285, + "learning_rate": 0.0002, + "loss": 0.8002, + "step": 1000 + }, + { + "epoch": 0.16328510225527443, + "grad_norm": 0.6313053369522095, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 1010 + }, + { + "epoch": 0.16490178643601971, + "grad_norm": 0.45041006803512573, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1020 + }, + { + "epoch": 0.166518470616765, + "grad_norm": 0.441403865814209, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1030 + }, + { + "epoch": 0.16813515479751032, + "grad_norm": 0.8171296119689941, + "learning_rate": 0.0002, + "loss": 0.8475, + "step": 1040 + }, + { + "epoch": 0.1697518389782556, + "grad_norm": 0.7137420773506165, + "learning_rate": 0.0002, + "loss": 0.845, + "step": 1050 + }, + { + "epoch": 0.1713685231590009, + "grad_norm": 0.5236809849739075, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 1060 + }, + { + "epoch": 0.17298520733974618, + "grad_norm": 0.5021864175796509, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 1070 + }, + { + "epoch": 0.17460189152049146, + "grad_norm": 0.47347521781921387, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 1080 + }, + { + "epoch": 0.17621857570123675, + "grad_norm": 0.4631653428077698, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 1090 + }, + { + "epoch": 0.17783525988198207, + "grad_norm": 0.49169182777404785, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 1100 + }, + { + "epoch": 0.17945194406272735, + "grad_norm": 0.5019739270210266, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 1110 + }, + { + "epoch": 0.18106862824347264, + "grad_norm": 0.5100422501564026, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 1120 + }, + { + "epoch": 0.18268531242421793, + "grad_norm": 0.3888324499130249, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 1130 + }, + { + "epoch": 0.18430199660496321, + "grad_norm": 0.39765217900276184, + "learning_rate": 0.0002, + "loss": 0.8533, + "step": 1140 + }, + { + "epoch": 0.1859186807857085, + "grad_norm": 0.47190186381340027, + "learning_rate": 0.0002, + "loss": 0.8541, + "step": 1150 + }, + { + "epoch": 0.18753536496645382, + "grad_norm": 0.4464188814163208, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1160 + }, + { + "epoch": 0.1891520491471991, + "grad_norm": 0.5153930187225342, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 1170 + }, + { + "epoch": 0.1907687333279444, + "grad_norm": 0.4779708683490753, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 1180 + }, + { + "epoch": 0.19238541750868968, + "grad_norm": 0.4834315776824951, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 1190 + }, + { + "epoch": 0.19400210168943496, + "grad_norm": 0.402357816696167, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 1200 + }, + { + "epoch": 0.19561878587018025, + "grad_norm": 0.45899084210395813, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 1210 + }, + { + "epoch": 0.19723547005092557, + "grad_norm": 0.5106529593467712, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 1220 + }, + { + "epoch": 0.19885215423167085, + "grad_norm": 0.45261722803115845, + "learning_rate": 0.0002, + "loss": 0.7816, + "step": 1230 + }, + { + "epoch": 0.20046883841241614, + "grad_norm": 0.4647127091884613, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 1240 + }, + { + "epoch": 0.20208552259316143, + "grad_norm": 0.4849368929862976, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 1250 + }, + { + "epoch": 0.2037022067739067, + "grad_norm": 0.4518061578273773, + "learning_rate": 0.0002, + "loss": 0.8514, + "step": 1260 + }, + { + "epoch": 0.205318890954652, + "grad_norm": 0.49535325169563293, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 1270 + }, + { + "epoch": 0.2069355751353973, + "grad_norm": 0.4835205376148224, + "learning_rate": 0.0002, + "loss": 0.8348, + "step": 1280 + }, + { + "epoch": 0.2085522593161426, + "grad_norm": 0.45308539271354675, + "learning_rate": 0.0002, + "loss": 0.8428, + "step": 1290 + }, + { + "epoch": 0.2101689434968879, + "grad_norm": 0.5369905233383179, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 1300 + }, + { + "epoch": 0.21178562767763318, + "grad_norm": 0.5031622052192688, + "learning_rate": 0.0002, + "loss": 0.8676, + "step": 1310 + }, + { + "epoch": 0.21340231185837846, + "grad_norm": 0.48010334372520447, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 1320 + }, + { + "epoch": 0.21501899603912375, + "grad_norm": 0.4905701279640198, + "learning_rate": 0.0002, + "loss": 0.806, + "step": 1330 + }, + { + "epoch": 0.21663568021986904, + "grad_norm": 0.43531742691993713, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 1340 + }, + { + "epoch": 0.21825236440061435, + "grad_norm": 0.44330692291259766, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 1350 + }, + { + "epoch": 0.21986904858135964, + "grad_norm": 0.5384416580200195, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 1360 + }, + { + "epoch": 0.22148573276210493, + "grad_norm": 0.4181833863258362, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 1370 + }, + { + "epoch": 0.2231024169428502, + "grad_norm": 0.523833692073822, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 1380 + }, + { + "epoch": 0.2247191011235955, + "grad_norm": 0.5528736710548401, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 1390 + }, + { + "epoch": 0.2263357853043408, + "grad_norm": 0.43515023589134216, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 1400 + }, + { + "epoch": 0.2279524694850861, + "grad_norm": 0.48809877038002014, + "learning_rate": 0.0002, + "loss": 0.8403, + "step": 1410 + }, + { + "epoch": 0.2295691536658314, + "grad_norm": 0.43591251969337463, + "learning_rate": 0.0002, + "loss": 0.8165, + "step": 1420 + }, + { + "epoch": 0.23118583784657668, + "grad_norm": 0.44625312089920044, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 1430 + }, + { + "epoch": 0.23280252202732196, + "grad_norm": 0.4390665292739868, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 1440 + }, + { + "epoch": 0.23441920620806725, + "grad_norm": 0.48496049642562866, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 1450 + }, + { + "epoch": 0.23603589038881254, + "grad_norm": 0.45919957756996155, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 1460 + }, + { + "epoch": 0.23765257456955785, + "grad_norm": 0.5471845865249634, + "learning_rate": 0.0002, + "loss": 0.8659, + "step": 1470 + }, + { + "epoch": 0.23926925875030314, + "grad_norm": 0.47269317507743835, + "learning_rate": 0.0002, + "loss": 0.8164, + "step": 1480 + }, + { + "epoch": 0.24088594293104842, + "grad_norm": 0.4930245578289032, + "learning_rate": 0.0002, + "loss": 0.854, + "step": 1490 + }, + { + "epoch": 0.2425026271117937, + "grad_norm": 0.5605630278587341, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 1500 + }, + { + "epoch": 0.244119311292539, + "grad_norm": 0.4435870945453644, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 1510 + }, + { + "epoch": 0.24573599547328429, + "grad_norm": 0.4941999912261963, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 1520 + }, + { + "epoch": 0.24735267965402957, + "grad_norm": 0.5100624561309814, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 1530 + }, + { + "epoch": 0.2489693638347749, + "grad_norm": 0.4638267457485199, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 1540 + }, + { + "epoch": 0.25058604801552015, + "grad_norm": 0.5071570873260498, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 1550 + }, + { + "epoch": 0.25220273219626543, + "grad_norm": 0.4291319251060486, + "learning_rate": 0.0002, + "loss": 0.7724, + "step": 1560 + }, + { + "epoch": 0.2538194163770108, + "grad_norm": 0.5388049483299255, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 1570 + }, + { + "epoch": 0.25543610055775606, + "grad_norm": 0.5083683729171753, + "learning_rate": 0.0002, + "loss": 0.8176, + "step": 1580 + }, + { + "epoch": 0.25705278473850135, + "grad_norm": 0.4824463725090027, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 1590 + }, + { + "epoch": 0.25866946891924664, + "grad_norm": 0.41177722811698914, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 1600 + }, + { + "epoch": 0.2602861530999919, + "grad_norm": 0.5656219124794006, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 1610 + }, + { + "epoch": 0.2619028372807372, + "grad_norm": 0.41063204407691956, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 1620 + }, + { + "epoch": 0.2635195214614825, + "grad_norm": 0.4897061288356781, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 1630 + }, + { + "epoch": 0.2651362056422278, + "grad_norm": 0.4454376697540283, + "learning_rate": 0.0002, + "loss": 0.8198, + "step": 1640 + }, + { + "epoch": 0.26675288982297307, + "grad_norm": 0.4355238378047943, + "learning_rate": 0.0002, + "loss": 0.8684, + "step": 1650 + }, + { + "epoch": 0.26836957400371836, + "grad_norm": 0.458310067653656, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 1660 + }, + { + "epoch": 0.26998625818446365, + "grad_norm": 0.4752083718776703, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 1670 + }, + { + "epoch": 0.27160294236520893, + "grad_norm": 0.4666106402873993, + "learning_rate": 0.0002, + "loss": 0.8267, + "step": 1680 + }, + { + "epoch": 0.2732196265459543, + "grad_norm": 0.4213818609714508, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 1690 + }, + { + "epoch": 0.27483631072669956, + "grad_norm": 0.5768913626670837, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 1700 + }, + { + "epoch": 0.27645299490744485, + "grad_norm": 0.4209914803504944, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 1710 + }, + { + "epoch": 0.27806967908819014, + "grad_norm": 0.501909613609314, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 1720 + }, + { + "epoch": 0.2796863632689354, + "grad_norm": 0.5266261100769043, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 1730 + }, + { + "epoch": 0.2813030474496807, + "grad_norm": 0.43806859850883484, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 1740 + }, + { + "epoch": 0.282919731630426, + "grad_norm": 0.46048814058303833, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 1750 + }, + { + "epoch": 0.2845364158111713, + "grad_norm": 0.44972819089889526, + "learning_rate": 0.0002, + "loss": 0.8596, + "step": 1760 + }, + { + "epoch": 0.28615309999191657, + "grad_norm": 0.5114831328392029, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 1770 + }, + { + "epoch": 0.28776978417266186, + "grad_norm": 0.47931742668151855, + "learning_rate": 0.0002, + "loss": 0.8361, + "step": 1780 + }, + { + "epoch": 0.28938646835340714, + "grad_norm": 0.5092599987983704, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 1790 + }, + { + "epoch": 0.29100315253415243, + "grad_norm": 0.37581443786621094, + "learning_rate": 0.0002, + "loss": 0.8506, + "step": 1800 + }, + { + "epoch": 0.2926198367148977, + "grad_norm": 0.47097381949424744, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 1810 + }, + { + "epoch": 0.29423652089564306, + "grad_norm": 0.48300236463546753, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 1820 + }, + { + "epoch": 0.29585320507638835, + "grad_norm": 0.5600419640541077, + "learning_rate": 0.0002, + "loss": 0.8391, + "step": 1830 + }, + { + "epoch": 0.29746988925713364, + "grad_norm": 0.48555272817611694, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 1840 + }, + { + "epoch": 0.2990865734378789, + "grad_norm": 0.3752668499946594, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 1850 + }, + { + "epoch": 0.3007032576186242, + "grad_norm": 0.5328747034072876, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 1860 + }, + { + "epoch": 0.3023199417993695, + "grad_norm": 0.48716455698013306, + "learning_rate": 0.0002, + "loss": 0.8426, + "step": 1870 + }, + { + "epoch": 0.3039366259801148, + "grad_norm": 0.5011493563652039, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 1880 + }, + { + "epoch": 0.30555331016086007, + "grad_norm": 0.46461427211761475, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 1890 + }, + { + "epoch": 0.30716999434160536, + "grad_norm": 0.36630210280418396, + "learning_rate": 0.0002, + "loss": 0.8478, + "step": 1900 + }, + { + "epoch": 0.30878667852235064, + "grad_norm": 0.4217296242713928, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 1910 + }, + { + "epoch": 0.31040336270309593, + "grad_norm": 0.4394875466823578, + "learning_rate": 0.0002, + "loss": 0.8128, + "step": 1920 + }, + { + "epoch": 0.3120200468838412, + "grad_norm": 0.6587965488433838, + "learning_rate": 0.0002, + "loss": 0.8471, + "step": 1930 + }, + { + "epoch": 0.31363673106458656, + "grad_norm": 0.5469298958778381, + "learning_rate": 0.0002, + "loss": 0.8565, + "step": 1940 + }, + { + "epoch": 0.31525341524533185, + "grad_norm": 0.4371595084667206, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 1950 + }, + { + "epoch": 0.31687009942607713, + "grad_norm": 0.4809541404247284, + "learning_rate": 0.0002, + "loss": 0.887, + "step": 1960 + }, + { + "epoch": 0.3184867836068224, + "grad_norm": 0.6061086654663086, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 1970 + }, + { + "epoch": 0.3201034677875677, + "grad_norm": 0.5342657566070557, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 1980 + }, + { + "epoch": 0.321720151968313, + "grad_norm": 0.5057743787765503, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 1990 + }, + { + "epoch": 0.3233368361490583, + "grad_norm": 0.528626024723053, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 2000 + }, + { + "epoch": 0.32495352032980357, + "grad_norm": 0.46742770075798035, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2010 + }, + { + "epoch": 0.32657020451054886, + "grad_norm": 0.515101432800293, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 2020 + }, + { + "epoch": 0.32818688869129414, + "grad_norm": 0.41941216588020325, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 2030 + }, + { + "epoch": 0.32980357287203943, + "grad_norm": 0.49902522563934326, + "learning_rate": 0.0002, + "loss": 0.7637, + "step": 2040 + }, + { + "epoch": 0.3314202570527847, + "grad_norm": 0.4120897650718689, + "learning_rate": 0.0002, + "loss": 0.7822, + "step": 2050 + }, + { + "epoch": 0.33303694123353, + "grad_norm": 0.45352041721343994, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 2060 + }, + { + "epoch": 0.33465362541427535, + "grad_norm": 0.523199737071991, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 2070 + }, + { + "epoch": 0.33627030959502063, + "grad_norm": 0.4390358626842499, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 2080 + }, + { + "epoch": 0.3378869937757659, + "grad_norm": 0.6752901077270508, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 2090 + }, + { + "epoch": 0.3395036779565112, + "grad_norm": 0.547821044921875, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 2100 + }, + { + "epoch": 0.3411203621372565, + "grad_norm": 0.5161308646202087, + "learning_rate": 0.0002, + "loss": 0.8561, + "step": 2110 + }, + { + "epoch": 0.3427370463180018, + "grad_norm": 0.4565401077270508, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 2120 + }, + { + "epoch": 0.34435373049874707, + "grad_norm": 0.4666115939617157, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 2130 + }, + { + "epoch": 0.34597041467949236, + "grad_norm": 0.4090428352355957, + "learning_rate": 0.0002, + "loss": 0.8189, + "step": 2140 + }, + { + "epoch": 0.34758709886023764, + "grad_norm": 0.510845422744751, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2150 + }, + { + "epoch": 0.34920378304098293, + "grad_norm": 0.42861923575401306, + "learning_rate": 0.0002, + "loss": 0.8398, + "step": 2160 + }, + { + "epoch": 0.3508204672217282, + "grad_norm": 0.4476332664489746, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 2170 + }, + { + "epoch": 0.3524371514024735, + "grad_norm": 0.6065791249275208, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2180 + }, + { + "epoch": 0.35405383558321885, + "grad_norm": 0.42335066199302673, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 2190 + }, + { + "epoch": 0.35567051976396413, + "grad_norm": 0.5094629526138306, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 2200 + }, + { + "epoch": 0.3572872039447094, + "grad_norm": 0.5476373434066772, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 2210 + }, + { + "epoch": 0.3589038881254547, + "grad_norm": 0.3911719024181366, + "learning_rate": 0.0002, + "loss": 0.814, + "step": 2220 + }, + { + "epoch": 0.3605205723062, + "grad_norm": 0.6599636077880859, + "learning_rate": 0.0002, + "loss": 0.8599, + "step": 2230 + }, + { + "epoch": 0.3621372564869453, + "grad_norm": 0.40381914377212524, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 2240 + }, + { + "epoch": 0.36375394066769057, + "grad_norm": 0.4433908462524414, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 2250 + }, + { + "epoch": 0.36537062484843585, + "grad_norm": 0.578326940536499, + "learning_rate": 0.0002, + "loss": 0.8503, + "step": 2260 + }, + { + "epoch": 0.36698730902918114, + "grad_norm": 0.5734784007072449, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2270 + }, + { + "epoch": 0.36860399320992643, + "grad_norm": 0.45555487275123596, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 2280 + }, + { + "epoch": 0.3702206773906717, + "grad_norm": 0.5666276216506958, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 2290 + }, + { + "epoch": 0.371837361571417, + "grad_norm": 0.5461117625236511, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 2300 + }, + { + "epoch": 0.3734540457521623, + "grad_norm": 0.6318911910057068, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 2310 + }, + { + "epoch": 0.37507072993290763, + "grad_norm": 0.493263304233551, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 2320 + }, + { + "epoch": 0.3766874141136529, + "grad_norm": 0.5888760089874268, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 2330 + }, + { + "epoch": 0.3783040982943982, + "grad_norm": 0.48671841621398926, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 2340 + }, + { + "epoch": 0.3799207824751435, + "grad_norm": 0.4385145306587219, + "learning_rate": 0.0002, + "loss": 0.8367, + "step": 2350 + }, + { + "epoch": 0.3815374666558888, + "grad_norm": 0.5523318648338318, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 2360 + }, + { + "epoch": 0.38315415083663407, + "grad_norm": 0.7308220267295837, + "learning_rate": 0.0002, + "loss": 0.8351, + "step": 2370 + }, + { + "epoch": 0.38477083501737935, + "grad_norm": 0.554214358329773, + "learning_rate": 0.0002, + "loss": 0.859, + "step": 2380 + }, + { + "epoch": 0.38638751919812464, + "grad_norm": 0.5425800085067749, + "learning_rate": 0.0002, + "loss": 0.8146, + "step": 2390 + }, + { + "epoch": 0.3880042033788699, + "grad_norm": 0.48811158537864685, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 2400 + }, + { + "epoch": 0.3896208875596152, + "grad_norm": 0.49212366342544556, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 2410 + }, + { + "epoch": 0.3912375717403605, + "grad_norm": 0.5222218632698059, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 2420 + }, + { + "epoch": 0.3928542559211058, + "grad_norm": 0.4699819087982178, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 2430 + }, + { + "epoch": 0.39447094010185113, + "grad_norm": 0.46153587102890015, + "learning_rate": 0.0002, + "loss": 0.7919, + "step": 2440 + }, + { + "epoch": 0.3960876242825964, + "grad_norm": 0.4150611162185669, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 2450 + }, + { + "epoch": 0.3977043084633417, + "grad_norm": 0.5799614787101746, + "learning_rate": 0.0002, + "loss": 0.8589, + "step": 2460 + }, + { + "epoch": 0.399320992644087, + "grad_norm": 0.56536865234375, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 2470 + }, + { + "epoch": 0.4009376768248323, + "grad_norm": 0.5451247096061707, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 2480 + }, + { + "epoch": 0.40255436100557757, + "grad_norm": 0.5914521217346191, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 2490 + }, + { + "epoch": 0.40417104518632285, + "grad_norm": 0.4428117275238037, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 2500 + }, + { + "epoch": 0.40578772936706814, + "grad_norm": 0.48580947518348694, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 2510 + }, + { + "epoch": 0.4074044135478134, + "grad_norm": 0.436734676361084, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 2520 + }, + { + "epoch": 0.4090210977285587, + "grad_norm": 0.5752223134040833, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 2530 + }, + { + "epoch": 0.410637781909304, + "grad_norm": 0.4271308183670044, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2540 + }, + { + "epoch": 0.4122544660900493, + "grad_norm": 0.46294718980789185, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 2550 + }, + { + "epoch": 0.4138711502707946, + "grad_norm": 0.49407583475112915, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 2560 + }, + { + "epoch": 0.4154878344515399, + "grad_norm": 0.4729035496711731, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 2570 + }, + { + "epoch": 0.4171045186322852, + "grad_norm": 0.4129747152328491, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 2580 + }, + { + "epoch": 0.4187212028130305, + "grad_norm": 0.5684236288070679, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 2590 + }, + { + "epoch": 0.4203378869937758, + "grad_norm": 0.4862157106399536, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 2600 + }, + { + "epoch": 0.42195457117452106, + "grad_norm": 0.46567976474761963, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 2610 + }, + { + "epoch": 0.42357125535526635, + "grad_norm": 0.5710650682449341, + "learning_rate": 0.0002, + "loss": 0.7755, + "step": 2620 + }, + { + "epoch": 0.42518793953601164, + "grad_norm": 0.5660041570663452, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 2630 + }, + { + "epoch": 0.4268046237167569, + "grad_norm": 0.47944375872612, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 2640 + }, + { + "epoch": 0.4284213078975022, + "grad_norm": 0.537223756313324, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 2650 + }, + { + "epoch": 0.4300379920782475, + "grad_norm": 0.41669997572898865, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 2660 + }, + { + "epoch": 0.4316546762589928, + "grad_norm": 0.44727686047554016, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 2670 + }, + { + "epoch": 0.4332713604397381, + "grad_norm": 0.5600888729095459, + "learning_rate": 0.0002, + "loss": 0.8241, + "step": 2680 + }, + { + "epoch": 0.4348880446204834, + "grad_norm": 0.39820605516433716, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 2690 + }, + { + "epoch": 0.4365047288012287, + "grad_norm": 0.5637655854225159, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 2700 + }, + { + "epoch": 0.438121412981974, + "grad_norm": 0.6363666653633118, + "learning_rate": 0.0002, + "loss": 0.855, + "step": 2710 + }, + { + "epoch": 0.4397380971627193, + "grad_norm": 0.5656129121780396, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 2720 + }, + { + "epoch": 0.44135478134346456, + "grad_norm": 0.5600156188011169, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2730 + }, + { + "epoch": 0.44297146552420985, + "grad_norm": 0.5506579875946045, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 2740 + }, + { + "epoch": 0.44458814970495514, + "grad_norm": 0.49878305196762085, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 2750 + }, + { + "epoch": 0.4462048338857004, + "grad_norm": 0.4569213092327118, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 2760 + }, + { + "epoch": 0.4478215180664457, + "grad_norm": 0.6056680083274841, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 2770 + }, + { + "epoch": 0.449438202247191, + "grad_norm": 0.44474557042121887, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 2780 + }, + { + "epoch": 0.4510548864279363, + "grad_norm": 0.46055394411087036, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 2790 + }, + { + "epoch": 0.4526715706086816, + "grad_norm": 0.4904133379459381, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 2800 + }, + { + "epoch": 0.45428825478942686, + "grad_norm": 0.5647031664848328, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 2810 + }, + { + "epoch": 0.4559049389701722, + "grad_norm": 0.5759473443031311, + "learning_rate": 0.0002, + "loss": 0.8622, + "step": 2820 + }, + { + "epoch": 0.4575216231509175, + "grad_norm": 0.5161895751953125, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 2830 + }, + { + "epoch": 0.4591383073316628, + "grad_norm": 0.4248254597187042, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 2840 + }, + { + "epoch": 0.46075499151240806, + "grad_norm": 0.45395001769065857, + "learning_rate": 0.0002, + "loss": 0.7838, + "step": 2850 + }, + { + "epoch": 0.46237167569315335, + "grad_norm": 0.5358697772026062, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2860 + }, + { + "epoch": 0.46398835987389864, + "grad_norm": 0.5379165410995483, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 2870 + }, + { + "epoch": 0.4656050440546439, + "grad_norm": 0.4601989686489105, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 2880 + }, + { + "epoch": 0.4672217282353892, + "grad_norm": 0.671115517616272, + "learning_rate": 0.0002, + "loss": 0.8523, + "step": 2890 + }, + { + "epoch": 0.4688384124161345, + "grad_norm": 0.4425133168697357, + "learning_rate": 0.0002, + "loss": 0.8262, + "step": 2900 + }, + { + "epoch": 0.4704550965968798, + "grad_norm": 0.5446155071258545, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2910 + }, + { + "epoch": 0.47207178077762507, + "grad_norm": 0.603306233882904, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 2920 + }, + { + "epoch": 0.47368846495837036, + "grad_norm": 0.5377997159957886, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 2930 + }, + { + "epoch": 0.4753051491391157, + "grad_norm": 0.4931027591228485, + "learning_rate": 0.0002, + "loss": 0.8075, + "step": 2940 + }, + { + "epoch": 0.476921833319861, + "grad_norm": 0.4711960256099701, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 2950 + }, + { + "epoch": 0.4785385175006063, + "grad_norm": 0.5020492672920227, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 2960 + }, + { + "epoch": 0.48015520168135156, + "grad_norm": 0.5428946614265442, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 2970 + }, + { + "epoch": 0.48177188586209685, + "grad_norm": 0.5294089317321777, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 2980 + }, + { + "epoch": 0.48338857004284214, + "grad_norm": 0.648289144039154, + "learning_rate": 0.0002, + "loss": 0.8553, + "step": 2990 + }, + { + "epoch": 0.4850052542235874, + "grad_norm": 0.47916680574417114, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 3000 + }, + { + "epoch": 0.4866219384043327, + "grad_norm": 0.43849772214889526, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 3010 + }, + { + "epoch": 0.488238622585078, + "grad_norm": 0.47007861733436584, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 3020 + }, + { + "epoch": 0.4898553067658233, + "grad_norm": 0.6314331293106079, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 3030 + }, + { + "epoch": 0.49147199094656857, + "grad_norm": 0.49211493134498596, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 3040 + }, + { + "epoch": 0.49308867512731386, + "grad_norm": 0.4537973403930664, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 3050 + }, + { + "epoch": 0.49470535930805914, + "grad_norm": 0.47326919436454773, + "learning_rate": 0.0002, + "loss": 0.8095, + "step": 3060 + }, + { + "epoch": 0.4963220434888045, + "grad_norm": 0.525874137878418, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 3070 + }, + { + "epoch": 0.4979387276695498, + "grad_norm": 0.6361091732978821, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 3080 + }, + { + "epoch": 0.49955541185029506, + "grad_norm": 0.5850642919540405, + "learning_rate": 0.0002, + "loss": 0.821, + "step": 3090 + }, + { + "epoch": 0.5011720960310403, + "grad_norm": 0.47299543023109436, + "learning_rate": 0.0002, + "loss": 0.8279, + "step": 3100 + }, + { + "epoch": 0.5027887802117856, + "grad_norm": 0.473099946975708, + "learning_rate": 0.0002, + "loss": 0.8681, + "step": 3110 + }, + { + "epoch": 0.5044054643925309, + "grad_norm": 0.48186397552490234, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 3120 + }, + { + "epoch": 0.5060221485732762, + "grad_norm": 0.5015401840209961, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 3130 + }, + { + "epoch": 0.5076388327540216, + "grad_norm": 0.5617750287055969, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 3140 + }, + { + "epoch": 0.5092555169347668, + "grad_norm": 0.5169327259063721, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 3150 + }, + { + "epoch": 0.5108722011155121, + "grad_norm": 0.545657753944397, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 3160 + }, + { + "epoch": 0.5124888852962574, + "grad_norm": 0.512864351272583, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 3170 + }, + { + "epoch": 0.5141055694770027, + "grad_norm": 0.4113546311855316, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 3180 + }, + { + "epoch": 0.5157222536577479, + "grad_norm": 0.44532445073127747, + "learning_rate": 0.0002, + "loss": 0.8206, + "step": 3190 + }, + { + "epoch": 0.5173389378384933, + "grad_norm": 0.5623497366905212, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 3200 + }, + { + "epoch": 0.5189556220192385, + "grad_norm": 0.5084741115570068, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 3210 + }, + { + "epoch": 0.5205723061999838, + "grad_norm": 0.5305403470993042, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 3220 + }, + { + "epoch": 0.5221889903807291, + "grad_norm": 0.4708254337310791, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 3230 + }, + { + "epoch": 0.5238056745614744, + "grad_norm": 0.43827131390571594, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 3240 + }, + { + "epoch": 0.5254223587422197, + "grad_norm": 0.5630002617835999, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 3250 + }, + { + "epoch": 0.527039042922965, + "grad_norm": 0.5010961890220642, + "learning_rate": 0.0002, + "loss": 0.7522, + "step": 3260 + }, + { + "epoch": 0.5286557271037103, + "grad_norm": 0.6303122043609619, + "learning_rate": 0.0002, + "loss": 0.8374, + "step": 3270 + }, + { + "epoch": 0.5302724112844556, + "grad_norm": 0.5107331275939941, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 3280 + }, + { + "epoch": 0.5318890954652009, + "grad_norm": 0.5700443387031555, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 3290 + }, + { + "epoch": 0.5335057796459461, + "grad_norm": 0.46296367049217224, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 3300 + }, + { + "epoch": 0.5351224638266915, + "grad_norm": 0.531568706035614, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 3310 + }, + { + "epoch": 0.5367391480074367, + "grad_norm": 0.4686741530895233, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 3320 + }, + { + "epoch": 0.5383558321881821, + "grad_norm": 0.5404331088066101, + "learning_rate": 0.0002, + "loss": 0.8104, + "step": 3330 + }, + { + "epoch": 0.5399725163689273, + "grad_norm": 0.6368790864944458, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 3340 + }, + { + "epoch": 0.5415892005496726, + "grad_norm": 0.42300888895988464, + "learning_rate": 0.0002, + "loss": 0.8514, + "step": 3350 + }, + { + "epoch": 0.5432058847304179, + "grad_norm": 0.5362542867660522, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 3360 + }, + { + "epoch": 0.5448225689111632, + "grad_norm": 0.497128963470459, + "learning_rate": 0.0002, + "loss": 0.858, + "step": 3370 + }, + { + "epoch": 0.5464392530919085, + "grad_norm": 0.5006386041641235, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 3380 + }, + { + "epoch": 0.5480559372726538, + "grad_norm": 0.44136837124824524, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 3390 + }, + { + "epoch": 0.5496726214533991, + "grad_norm": 0.5897833108901978, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 3400 + }, + { + "epoch": 0.5512893056341444, + "grad_norm": 0.641075611114502, + "learning_rate": 0.0002, + "loss": 0.8895, + "step": 3410 + }, + { + "epoch": 0.5529059898148897, + "grad_norm": 0.7251322269439697, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 3420 + }, + { + "epoch": 0.5545226739956349, + "grad_norm": 0.47411349415779114, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 3430 + }, + { + "epoch": 0.5561393581763803, + "grad_norm": 0.4994310438632965, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 3440 + }, + { + "epoch": 0.5577560423571255, + "grad_norm": 0.5814438462257385, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 3450 + }, + { + "epoch": 0.5593727265378708, + "grad_norm": 0.6278898119926453, + "learning_rate": 0.0002, + "loss": 0.8805, + "step": 3460 + }, + { + "epoch": 0.5609894107186161, + "grad_norm": 0.46208274364471436, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 3470 + }, + { + "epoch": 0.5626060948993614, + "grad_norm": 0.5718930959701538, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 3480 + }, + { + "epoch": 0.5642227790801067, + "grad_norm": 0.48178744316101074, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 3490 + }, + { + "epoch": 0.565839463260852, + "grad_norm": 0.47336965799331665, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3500 + }, + { + "epoch": 0.5674561474415973, + "grad_norm": 0.43442684412002563, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 3510 + }, + { + "epoch": 0.5690728316223426, + "grad_norm": 0.6463358998298645, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 3520 + }, + { + "epoch": 0.5706895158030879, + "grad_norm": 0.5286486744880676, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 3530 + }, + { + "epoch": 0.5723061999838331, + "grad_norm": 0.5405499935150146, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 3540 + }, + { + "epoch": 0.5739228841645785, + "grad_norm": 0.6654391884803772, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 3550 + }, + { + "epoch": 0.5755395683453237, + "grad_norm": 0.5081980228424072, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 3560 + }, + { + "epoch": 0.5771562525260691, + "grad_norm": 0.48978179693222046, + "learning_rate": 0.0002, + "loss": 0.7753, + "step": 3570 + }, + { + "epoch": 0.5787729367068143, + "grad_norm": 0.5840612053871155, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3580 + }, + { + "epoch": 0.5803896208875596, + "grad_norm": 0.5235261917114258, + "learning_rate": 0.0002, + "loss": 0.8937, + "step": 3590 + }, + { + "epoch": 0.5820063050683049, + "grad_norm": 0.5672075748443604, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 3600 + }, + { + "epoch": 0.5836229892490502, + "grad_norm": 0.5613429546356201, + "learning_rate": 0.0002, + "loss": 0.8347, + "step": 3610 + }, + { + "epoch": 0.5852396734297954, + "grad_norm": 0.4032273590564728, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 3620 + }, + { + "epoch": 0.5868563576105408, + "grad_norm": 0.49559324979782104, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 3630 + }, + { + "epoch": 0.5884730417912861, + "grad_norm": 0.6895697712898254, + "learning_rate": 0.0002, + "loss": 0.8332, + "step": 3640 + }, + { + "epoch": 0.5900897259720314, + "grad_norm": 0.4750136435031891, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 3650 + }, + { + "epoch": 0.5917064101527767, + "grad_norm": 0.5176819562911987, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 3660 + }, + { + "epoch": 0.5933230943335219, + "grad_norm": 0.5817760229110718, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3670 + }, + { + "epoch": 0.5949397785142673, + "grad_norm": 0.6064626574516296, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 3680 + }, + { + "epoch": 0.5965564626950125, + "grad_norm": 0.6728700995445251, + "learning_rate": 0.0002, + "loss": 0.8422, + "step": 3690 + }, + { + "epoch": 0.5981731468757578, + "grad_norm": 0.609305202960968, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 3700 + }, + { + "epoch": 0.5997898310565031, + "grad_norm": 0.4615488350391388, + "learning_rate": 0.0002, + "loss": 0.8048, + "step": 3710 + }, + { + "epoch": 0.6014065152372484, + "grad_norm": 2.0531179904937744, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 3720 + }, + { + "epoch": 0.6030231994179936, + "grad_norm": 0.5091132521629333, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 3730 + }, + { + "epoch": 0.604639883598739, + "grad_norm": 0.5951124429702759, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 3740 + }, + { + "epoch": 0.6062565677794842, + "grad_norm": 0.5870208144187927, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 3750 + }, + { + "epoch": 0.6078732519602296, + "grad_norm": 0.6254619359970093, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3760 + }, + { + "epoch": 0.6094899361409749, + "grad_norm": 0.5577626824378967, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 3770 + }, + { + "epoch": 0.6111066203217201, + "grad_norm": 0.5004405379295349, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 3780 + }, + { + "epoch": 0.6127233045024655, + "grad_norm": 0.5527383685112, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 3790 + }, + { + "epoch": 0.6143399886832107, + "grad_norm": 0.49116113781929016, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 3800 + }, + { + "epoch": 0.6159566728639561, + "grad_norm": 0.5299299359321594, + "learning_rate": 0.0002, + "loss": 0.8352, + "step": 3810 + }, + { + "epoch": 0.6175733570447013, + "grad_norm": 0.464897483587265, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 3820 + }, + { + "epoch": 0.6191900412254466, + "grad_norm": 0.6505740880966187, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 3830 + }, + { + "epoch": 0.6208067254061919, + "grad_norm": 0.5512559413909912, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 3840 + }, + { + "epoch": 0.6224234095869372, + "grad_norm": 0.49427518248558044, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 3850 + }, + { + "epoch": 0.6240400937676824, + "grad_norm": 0.3839147090911865, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 3860 + }, + { + "epoch": 0.6256567779484278, + "grad_norm": 0.5760218501091003, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3870 + }, + { + "epoch": 0.6272734621291731, + "grad_norm": 0.7226507067680359, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 3880 + }, + { + "epoch": 0.6288901463099184, + "grad_norm": 0.676781415939331, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3890 + }, + { + "epoch": 0.6305068304906637, + "grad_norm": 0.4284018278121948, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 3900 + }, + { + "epoch": 0.6321235146714089, + "grad_norm": 0.5060628056526184, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 3910 + }, + { + "epoch": 0.6337401988521543, + "grad_norm": 0.5524522066116333, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 3920 + }, + { + "epoch": 0.6353568830328995, + "grad_norm": 0.6099881529808044, + "learning_rate": 0.0002, + "loss": 0.8276, + "step": 3930 + }, + { + "epoch": 0.6369735672136448, + "grad_norm": 0.43155938386917114, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 3940 + }, + { + "epoch": 0.6385902513943901, + "grad_norm": 0.6427084803581238, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 3950 + }, + { + "epoch": 0.6402069355751354, + "grad_norm": 0.541220486164093, + "learning_rate": 0.0002, + "loss": 0.8368, + "step": 3960 + }, + { + "epoch": 0.6418236197558806, + "grad_norm": 0.5414294600486755, + "learning_rate": 0.0002, + "loss": 0.8539, + "step": 3970 + }, + { + "epoch": 0.643440303936626, + "grad_norm": 0.46344003081321716, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 3980 + }, + { + "epoch": 0.6450569881173712, + "grad_norm": 0.45209285616874695, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 3990 + }, + { + "epoch": 0.6466736722981166, + "grad_norm": 0.5417284369468689, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 4000 + }, + { + "epoch": 0.6482903564788619, + "grad_norm": 0.7995685935020447, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 4010 + }, + { + "epoch": 0.6499070406596071, + "grad_norm": 0.6384002566337585, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 4020 + }, + { + "epoch": 0.6515237248403525, + "grad_norm": 0.4472815692424774, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 4030 + }, + { + "epoch": 0.6531404090210977, + "grad_norm": 0.6834294199943542, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 4040 + }, + { + "epoch": 0.654757093201843, + "grad_norm": 0.4612339735031128, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 4050 + }, + { + "epoch": 0.6563737773825883, + "grad_norm": 0.9266576170921326, + "learning_rate": 0.0002, + "loss": 0.8141, + "step": 4060 + }, + { + "epoch": 0.6579904615633336, + "grad_norm": 0.4470861852169037, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 4070 + }, + { + "epoch": 0.6596071457440789, + "grad_norm": 0.45544925332069397, + "learning_rate": 0.0002, + "loss": 0.8293, + "step": 4080 + }, + { + "epoch": 0.6612238299248242, + "grad_norm": 0.6144481301307678, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 4090 + }, + { + "epoch": 0.6628405141055694, + "grad_norm": 0.5936288237571716, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 4100 + }, + { + "epoch": 0.6644571982863148, + "grad_norm": 0.4822963774204254, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 4110 + }, + { + "epoch": 0.66607388246706, + "grad_norm": 0.48432496190071106, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 4120 + }, + { + "epoch": 0.6676905666478054, + "grad_norm": 0.4901607930660248, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 4130 + }, + { + "epoch": 0.6693072508285507, + "grad_norm": 0.5018393397331238, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 4140 + }, + { + "epoch": 0.6709239350092959, + "grad_norm": 0.6946378946304321, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 4150 + }, + { + "epoch": 0.6725406191900413, + "grad_norm": 0.5997390747070312, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 4160 + }, + { + "epoch": 0.6741573033707865, + "grad_norm": 0.6738849878311157, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 4170 + }, + { + "epoch": 0.6757739875515318, + "grad_norm": 0.6110581159591675, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 4180 + }, + { + "epoch": 0.6773906717322771, + "grad_norm": 0.5703322291374207, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 4190 + }, + { + "epoch": 0.6790073559130224, + "grad_norm": 0.4686066210269928, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 4200 + }, + { + "epoch": 0.6806240400937676, + "grad_norm": 0.6394643783569336, + "learning_rate": 0.0002, + "loss": 0.8441, + "step": 4210 + }, + { + "epoch": 0.682240724274513, + "grad_norm": 0.5454841256141663, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 4220 + }, + { + "epoch": 0.6838574084552582, + "grad_norm": 0.4859732985496521, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 4230 + }, + { + "epoch": 0.6854740926360036, + "grad_norm": 0.5544065833091736, + "learning_rate": 0.0002, + "loss": 0.8161, + "step": 4240 + }, + { + "epoch": 0.6870907768167488, + "grad_norm": 0.4902505576610565, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 4250 + }, + { + "epoch": 0.6887074609974941, + "grad_norm": 0.4768051505088806, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 4260 + }, + { + "epoch": 0.6903241451782395, + "grad_norm": 0.49982190132141113, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 4270 + }, + { + "epoch": 0.6919408293589847, + "grad_norm": 0.6351838111877441, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 4280 + }, + { + "epoch": 0.69355751353973, + "grad_norm": 0.5647561550140381, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 4290 + }, + { + "epoch": 0.6951741977204753, + "grad_norm": 0.5340486764907837, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 4300 + }, + { + "epoch": 0.6967908819012206, + "grad_norm": 0.5649092793464661, + "learning_rate": 0.0002, + "loss": 0.8526, + "step": 4310 + }, + { + "epoch": 0.6984075660819659, + "grad_norm": 0.6183916926383972, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 4320 + }, + { + "epoch": 0.7000242502627112, + "grad_norm": 0.6154509782791138, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 4330 + }, + { + "epoch": 0.7016409344434564, + "grad_norm": 0.5156264305114746, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 4340 + }, + { + "epoch": 0.7032576186242018, + "grad_norm": 0.562171459197998, + "learning_rate": 0.0002, + "loss": 0.8512, + "step": 4350 + }, + { + "epoch": 0.704874302804947, + "grad_norm": 0.4949502646923065, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 4360 + }, + { + "epoch": 0.7064909869856923, + "grad_norm": 0.5171684622764587, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 4370 + }, + { + "epoch": 0.7081076711664377, + "grad_norm": 0.6198443174362183, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 4380 + }, + { + "epoch": 0.7097243553471829, + "grad_norm": 0.5802276134490967, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 4390 + }, + { + "epoch": 0.7113410395279283, + "grad_norm": 0.41096967458724976, + "learning_rate": 0.0002, + "loss": 0.8797, + "step": 4400 + }, + { + "epoch": 0.7129577237086735, + "grad_norm": 0.4397392272949219, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 4410 + }, + { + "epoch": 0.7145744078894188, + "grad_norm": 0.45228442549705505, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 4420 + }, + { + "epoch": 0.7161910920701641, + "grad_norm": 0.4839673936367035, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 4430 + }, + { + "epoch": 0.7178077762509094, + "grad_norm": 0.6140755414962769, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4440 + }, + { + "epoch": 0.7194244604316546, + "grad_norm": 0.6841378808021545, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 4450 + }, + { + "epoch": 0.7210411446124, + "grad_norm": 0.6664239168167114, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 4460 + }, + { + "epoch": 0.7226578287931452, + "grad_norm": 0.47552719712257385, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 4470 + }, + { + "epoch": 0.7242745129738906, + "grad_norm": 0.6649776101112366, + "learning_rate": 0.0002, + "loss": 0.8982, + "step": 4480 + }, + { + "epoch": 0.7258911971546358, + "grad_norm": 0.5159541964530945, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 4490 + }, + { + "epoch": 0.7275078813353811, + "grad_norm": 0.6693112850189209, + "learning_rate": 0.0002, + "loss": 0.7786, + "step": 4500 + }, + { + "epoch": 0.7291245655161265, + "grad_norm": 0.48870977759361267, + "learning_rate": 0.0002, + "loss": 0.8655, + "step": 4510 + }, + { + "epoch": 0.7307412496968717, + "grad_norm": 0.4857887923717499, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 4520 + }, + { + "epoch": 0.732357933877617, + "grad_norm": 0.5515662431716919, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 4530 + }, + { + "epoch": 0.7339746180583623, + "grad_norm": 0.6292222738265991, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 4540 + }, + { + "epoch": 0.7355913022391076, + "grad_norm": 0.48265689611434937, + "learning_rate": 0.0002, + "loss": 0.7749, + "step": 4550 + }, + { + "epoch": 0.7372079864198529, + "grad_norm": 0.8044266104698181, + "learning_rate": 0.0002, + "loss": 0.8499, + "step": 4560 + }, + { + "epoch": 0.7388246706005982, + "grad_norm": 0.6111769676208496, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 4570 + }, + { + "epoch": 0.7404413547813434, + "grad_norm": 0.5229553580284119, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 4580 + }, + { + "epoch": 0.7420580389620888, + "grad_norm": 0.6054152250289917, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 4590 + }, + { + "epoch": 0.743674723142834, + "grad_norm": 0.5574966669082642, + "learning_rate": 0.0002, + "loss": 0.8169, + "step": 4600 + }, + { + "epoch": 0.7452914073235793, + "grad_norm": 0.5395817160606384, + "learning_rate": 0.0002, + "loss": 0.8439, + "step": 4610 + }, + { + "epoch": 0.7469080915043246, + "grad_norm": 0.7116472721099854, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 4620 + }, + { + "epoch": 0.7485247756850699, + "grad_norm": 0.5618700981140137, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 4630 + }, + { + "epoch": 0.7501414598658153, + "grad_norm": 0.5802770853042603, + "learning_rate": 0.0002, + "loss": 0.7744, + "step": 4640 + }, + { + "epoch": 0.7517581440465605, + "grad_norm": 0.5690428018569946, + "learning_rate": 0.0002, + "loss": 0.7924, + "step": 4650 + }, + { + "epoch": 0.7533748282273058, + "grad_norm": 0.4813360273838043, + "learning_rate": 0.0002, + "loss": 0.8017, + "step": 4660 + }, + { + "epoch": 0.7549915124080511, + "grad_norm": 0.5434042811393738, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 4670 + }, + { + "epoch": 0.7566081965887964, + "grad_norm": 0.5502099990844727, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 4680 + }, + { + "epoch": 0.7582248807695416, + "grad_norm": 0.6020621061325073, + "learning_rate": 0.0002, + "loss": 0.8598, + "step": 4690 + }, + { + "epoch": 0.759841564950287, + "grad_norm": 0.4922301471233368, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 4700 + }, + { + "epoch": 0.7614582491310322, + "grad_norm": 0.6492828726768494, + "learning_rate": 0.0002, + "loss": 0.788, + "step": 4710 + }, + { + "epoch": 0.7630749333117776, + "grad_norm": 0.4865580201148987, + "learning_rate": 0.0002, + "loss": 0.8313, + "step": 4720 + }, + { + "epoch": 0.7646916174925228, + "grad_norm": 0.5971422791481018, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 4730 + }, + { + "epoch": 0.7663083016732681, + "grad_norm": 0.6832674145698547, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 4740 + }, + { + "epoch": 0.7679249858540134, + "grad_norm": 0.500908613204956, + "learning_rate": 0.0002, + "loss": 0.8156, + "step": 4750 + }, + { + "epoch": 0.7695416700347587, + "grad_norm": 0.6112465858459473, + "learning_rate": 0.0002, + "loss": 0.8383, + "step": 4760 + }, + { + "epoch": 0.771158354215504, + "grad_norm": 0.5753506422042847, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 4770 + }, + { + "epoch": 0.7727750383962493, + "grad_norm": 0.6529405117034912, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 4780 + }, + { + "epoch": 0.7743917225769946, + "grad_norm": 0.5916843414306641, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 4790 + }, + { + "epoch": 0.7760084067577399, + "grad_norm": 0.4821224510669708, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 4800 + }, + { + "epoch": 0.7776250909384852, + "grad_norm": 0.5532580018043518, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 4810 + }, + { + "epoch": 0.7792417751192304, + "grad_norm": 0.4604877233505249, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 4820 + }, + { + "epoch": 0.7808584592999758, + "grad_norm": 0.5009613037109375, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 4830 + }, + { + "epoch": 0.782475143480721, + "grad_norm": 0.6448560357093811, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4840 + }, + { + "epoch": 0.7840918276614663, + "grad_norm": 0.44327953457832336, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 4850 + }, + { + "epoch": 0.7857085118422116, + "grad_norm": 0.5355411171913147, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 4860 + }, + { + "epoch": 0.7873251960229569, + "grad_norm": 0.5635677576065063, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 4870 + }, + { + "epoch": 0.7889418802037023, + "grad_norm": 0.5417491793632507, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 4880 + }, + { + "epoch": 0.7905585643844475, + "grad_norm": 0.4567430913448334, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 4890 + }, + { + "epoch": 0.7921752485651928, + "grad_norm": 0.44651296734809875, + "learning_rate": 0.0002, + "loss": 0.8454, + "step": 4900 + }, + { + "epoch": 0.7937919327459381, + "grad_norm": 0.5741217136383057, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 4910 + }, + { + "epoch": 0.7954086169266834, + "grad_norm": 0.6605045199394226, + "learning_rate": 0.0002, + "loss": 0.8093, + "step": 4920 + }, + { + "epoch": 0.7970253011074286, + "grad_norm": 0.5126531720161438, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 4930 + }, + { + "epoch": 0.798641985288174, + "grad_norm": 0.513648271560669, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 4940 + }, + { + "epoch": 0.8002586694689192, + "grad_norm": 0.5350404381752014, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 4950 + }, + { + "epoch": 0.8018753536496646, + "grad_norm": 0.5731674432754517, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 4960 + }, + { + "epoch": 0.8034920378304098, + "grad_norm": 0.5974258184432983, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 4970 + }, + { + "epoch": 0.8051087220111551, + "grad_norm": 0.8774799704551697, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 4980 + }, + { + "epoch": 0.8067254061919004, + "grad_norm": 0.5994430184364319, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 4990 + }, + { + "epoch": 0.8083420903726457, + "grad_norm": 0.4894903004169464, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 5000 + }, + { + "epoch": 0.809958774553391, + "grad_norm": 0.5218459367752075, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5010 + }, + { + "epoch": 0.8115754587341363, + "grad_norm": 0.5232468843460083, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 5020 + }, + { + "epoch": 0.8131921429148816, + "grad_norm": 0.44358372688293457, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 5030 + }, + { + "epoch": 0.8148088270956269, + "grad_norm": 0.6202037334442139, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 5040 + }, + { + "epoch": 0.8164255112763722, + "grad_norm": 0.7721474170684814, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 5050 + }, + { + "epoch": 0.8180421954571174, + "grad_norm": 0.5568501353263855, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 5060 + }, + { + "epoch": 0.8196588796378628, + "grad_norm": 0.49148809909820557, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 5070 + }, + { + "epoch": 0.821275563818608, + "grad_norm": 0.4956012964248657, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 5080 + }, + { + "epoch": 0.8228922479993533, + "grad_norm": 0.6078833937644958, + "learning_rate": 0.0002, + "loss": 0.8201, + "step": 5090 + }, + { + "epoch": 0.8245089321800986, + "grad_norm": 0.46906954050064087, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 5100 + }, + { + "epoch": 0.8261256163608439, + "grad_norm": 0.50812166929245, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 5110 + }, + { + "epoch": 0.8277423005415891, + "grad_norm": 0.5319661498069763, + "learning_rate": 0.0002, + "loss": 0.8243, + "step": 5120 + }, + { + "epoch": 0.8293589847223345, + "grad_norm": 0.4949689209461212, + "learning_rate": 0.0002, + "loss": 0.7798, + "step": 5130 + }, + { + "epoch": 0.8309756689030798, + "grad_norm": 0.5151591300964355, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 5140 + }, + { + "epoch": 0.8325923530838251, + "grad_norm": 0.5530214309692383, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 5150 + }, + { + "epoch": 0.8342090372645704, + "grad_norm": 0.6297410130500793, + "learning_rate": 0.0002, + "loss": 0.8251, + "step": 5160 + }, + { + "epoch": 0.8358257214453156, + "grad_norm": 0.5466840267181396, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 5170 + }, + { + "epoch": 0.837442405626061, + "grad_norm": 0.652913510799408, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 5180 + }, + { + "epoch": 0.8390590898068062, + "grad_norm": 0.5811293125152588, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 5190 + }, + { + "epoch": 0.8406757739875516, + "grad_norm": 0.5109550952911377, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 5200 + }, + { + "epoch": 0.8422924581682968, + "grad_norm": 0.4551706612110138, + "learning_rate": 0.0002, + "loss": 0.8077, + "step": 5210 + }, + { + "epoch": 0.8439091423490421, + "grad_norm": 0.5813754200935364, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5220 + }, + { + "epoch": 0.8455258265297874, + "grad_norm": 0.5856947898864746, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 5230 + }, + { + "epoch": 0.8471425107105327, + "grad_norm": 0.5482739210128784, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 5240 + }, + { + "epoch": 0.8487591948912779, + "grad_norm": 0.49023720622062683, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 5250 + }, + { + "epoch": 0.8503758790720233, + "grad_norm": 0.49472475051879883, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5260 + }, + { + "epoch": 0.8519925632527686, + "grad_norm": 0.5490226745605469, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 5270 + }, + { + "epoch": 0.8536092474335139, + "grad_norm": 0.5340665578842163, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 5280 + }, + { + "epoch": 0.8552259316142592, + "grad_norm": 0.5962483882904053, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 5290 + }, + { + "epoch": 0.8568426157950044, + "grad_norm": 0.586358368396759, + "learning_rate": 0.0002, + "loss": 0.8765, + "step": 5300 + }, + { + "epoch": 0.8584592999757498, + "grad_norm": 0.49120277166366577, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 5310 + }, + { + "epoch": 0.860075984156495, + "grad_norm": 0.5887332558631897, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 5320 + }, + { + "epoch": 0.8616926683372403, + "grad_norm": 0.42496153712272644, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 5330 + }, + { + "epoch": 0.8633093525179856, + "grad_norm": 0.5489874482154846, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 5340 + }, + { + "epoch": 0.8649260366987309, + "grad_norm": 0.5850813984870911, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 5350 + }, + { + "epoch": 0.8665427208794761, + "grad_norm": 0.517487108707428, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 5360 + }, + { + "epoch": 0.8681594050602215, + "grad_norm": 0.5339142680168152, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 5370 + }, + { + "epoch": 0.8697760892409668, + "grad_norm": 0.6236387491226196, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 5380 + }, + { + "epoch": 0.8713927734217121, + "grad_norm": 0.5752192735671997, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 5390 + }, + { + "epoch": 0.8730094576024574, + "grad_norm": 0.6724614500999451, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 5400 + }, + { + "epoch": 0.8746261417832026, + "grad_norm": 0.5280613303184509, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 5410 + }, + { + "epoch": 0.876242825963948, + "grad_norm": 0.44033288955688477, + "learning_rate": 0.0002, + "loss": 0.8231, + "step": 5420 + }, + { + "epoch": 0.8778595101446932, + "grad_norm": 0.5199708342552185, + "learning_rate": 0.0002, + "loss": 0.8839, + "step": 5430 + }, + { + "epoch": 0.8794761943254386, + "grad_norm": 0.46778348088264465, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 5440 + }, + { + "epoch": 0.8810928785061838, + "grad_norm": 0.4657754898071289, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 5450 + }, + { + "epoch": 0.8827095626869291, + "grad_norm": 0.5472902655601501, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 5460 + }, + { + "epoch": 0.8843262468676744, + "grad_norm": 0.4876766800880432, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 5470 + }, + { + "epoch": 0.8859429310484197, + "grad_norm": 0.5057248473167419, + "learning_rate": 0.0002, + "loss": 0.7906, + "step": 5480 + }, + { + "epoch": 0.8875596152291649, + "grad_norm": 0.4637320637702942, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 5490 + }, + { + "epoch": 0.8891762994099103, + "grad_norm": 0.471955806016922, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 5500 + }, + { + "epoch": 0.8907929835906556, + "grad_norm": 0.5209813714027405, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 5510 + }, + { + "epoch": 0.8924096677714008, + "grad_norm": 0.6213834285736084, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 5520 + }, + { + "epoch": 0.8940263519521462, + "grad_norm": 0.5215408205986023, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 5530 + }, + { + "epoch": 0.8956430361328914, + "grad_norm": 0.580478310585022, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 5540 + }, + { + "epoch": 0.8972597203136368, + "grad_norm": 0.49102169275283813, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 5550 + }, + { + "epoch": 0.898876404494382, + "grad_norm": 0.6043479442596436, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 5560 + }, + { + "epoch": 0.9004930886751273, + "grad_norm": 0.5636463165283203, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 5570 + }, + { + "epoch": 0.9021097728558726, + "grad_norm": 0.5620124340057373, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 5580 + }, + { + "epoch": 0.9037264570366179, + "grad_norm": 0.5206354856491089, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 5590 + }, + { + "epoch": 0.9053431412173631, + "grad_norm": 0.5798229575157166, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 5600 + }, + { + "epoch": 0.9069598253981085, + "grad_norm": 0.6428212523460388, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 5610 + }, + { + "epoch": 0.9085765095788537, + "grad_norm": 0.48064687848091125, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 5620 + }, + { + "epoch": 0.9101931937595991, + "grad_norm": 0.6347860097885132, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 5630 + }, + { + "epoch": 0.9118098779403444, + "grad_norm": 0.5353913307189941, + "learning_rate": 0.0002, + "loss": 0.851, + "step": 5640 + }, + { + "epoch": 0.9134265621210896, + "grad_norm": 0.5323944091796875, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 5650 + }, + { + "epoch": 0.915043246301835, + "grad_norm": 0.5261843204498291, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 5660 + }, + { + "epoch": 0.9166599304825802, + "grad_norm": 0.5451326966285706, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 5670 + }, + { + "epoch": 0.9182766146633256, + "grad_norm": 0.5183324217796326, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 5680 + }, + { + "epoch": 0.9198932988440708, + "grad_norm": 0.47229018807411194, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 5690 + }, + { + "epoch": 0.9215099830248161, + "grad_norm": 0.49180513620376587, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 5700 + }, + { + "epoch": 0.9231266672055614, + "grad_norm": 0.5419785380363464, + "learning_rate": 0.0002, + "loss": 0.8735, + "step": 5710 + }, + { + "epoch": 0.9247433513863067, + "grad_norm": 0.5408698916435242, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 5720 + }, + { + "epoch": 0.9263600355670519, + "grad_norm": 0.5286232829093933, + "learning_rate": 0.0002, + "loss": 0.7701, + "step": 5730 + }, + { + "epoch": 0.9279767197477973, + "grad_norm": 0.7539758086204529, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 5740 + }, + { + "epoch": 0.9295934039285425, + "grad_norm": 0.5166944861412048, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 5750 + }, + { + "epoch": 0.9312100881092878, + "grad_norm": 0.6601425409317017, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 5760 + }, + { + "epoch": 0.9328267722900332, + "grad_norm": 0.5029960870742798, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 5770 + }, + { + "epoch": 0.9344434564707784, + "grad_norm": 0.4926645755767822, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 5780 + }, + { + "epoch": 0.9360601406515238, + "grad_norm": 0.5739615559577942, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 5790 + }, + { + "epoch": 0.937676824832269, + "grad_norm": 0.5058279037475586, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 5800 + }, + { + "epoch": 0.9392935090130143, + "grad_norm": 0.5260962247848511, + "learning_rate": 0.0002, + "loss": 0.8537, + "step": 5810 + }, + { + "epoch": 0.9409101931937596, + "grad_norm": 0.5768588185310364, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 5820 + }, + { + "epoch": 0.9425268773745049, + "grad_norm": 0.5170126557350159, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 5830 + }, + { + "epoch": 0.9441435615552501, + "grad_norm": 0.5745864510536194, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 5840 + }, + { + "epoch": 0.9457602457359955, + "grad_norm": 0.5551357865333557, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 5850 + }, + { + "epoch": 0.9473769299167407, + "grad_norm": 0.5776078701019287, + "learning_rate": 0.0002, + "loss": 0.8529, + "step": 5860 + }, + { + "epoch": 0.9489936140974861, + "grad_norm": 0.5340062379837036, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 5870 + }, + { + "epoch": 0.9506102982782314, + "grad_norm": 0.6447290182113647, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 5880 + }, + { + "epoch": 0.9522269824589766, + "grad_norm": 0.5123815536499023, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 5890 + }, + { + "epoch": 0.953843666639722, + "grad_norm": 0.48547613620758057, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 5900 + }, + { + "epoch": 0.9554603508204672, + "grad_norm": 0.5791414976119995, + "learning_rate": 0.0002, + "loss": 0.7896, + "step": 5910 + }, + { + "epoch": 0.9570770350012126, + "grad_norm": 0.6195011734962463, + "learning_rate": 0.0002, + "loss": 0.8408, + "step": 5920 + }, + { + "epoch": 0.9586937191819578, + "grad_norm": 0.6323803067207336, + "learning_rate": 0.0002, + "loss": 0.7805, + "step": 5930 + }, + { + "epoch": 0.9603104033627031, + "grad_norm": 0.45552879571914673, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 5940 + }, + { + "epoch": 0.9619270875434484, + "grad_norm": 0.5796473622322083, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 5950 + }, + { + "epoch": 0.9635437717241937, + "grad_norm": 0.647261381149292, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 5960 + }, + { + "epoch": 0.9651604559049389, + "grad_norm": 0.5487682819366455, + "learning_rate": 0.0002, + "loss": 0.8086, + "step": 5970 + }, + { + "epoch": 0.9667771400856843, + "grad_norm": 0.5743663907051086, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 5980 + }, + { + "epoch": 0.9683938242664295, + "grad_norm": 0.5470591187477112, + "learning_rate": 0.0002, + "loss": 0.8153, + "step": 5990 + }, + { + "epoch": 0.9700105084471748, + "grad_norm": 0.5901660323143005, + "learning_rate": 0.0002, + "loss": 0.8119, + "step": 6000 + }, + { + "epoch": 0.9716271926279202, + "grad_norm": 0.6544759273529053, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 6010 + }, + { + "epoch": 0.9732438768086654, + "grad_norm": 0.6288470029830933, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 6020 + }, + { + "epoch": 0.9748605609894108, + "grad_norm": 0.673153817653656, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 6030 + }, + { + "epoch": 0.976477245170156, + "grad_norm": 0.42854753136634827, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 6040 + }, + { + "epoch": 0.9780939293509013, + "grad_norm": 0.5227066278457642, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 6050 + }, + { + "epoch": 0.9797106135316466, + "grad_norm": 0.5372416973114014, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 6060 + }, + { + "epoch": 0.9813272977123919, + "grad_norm": 0.6026402115821838, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 6070 + }, + { + "epoch": 0.9829439818931371, + "grad_norm": 0.49547791481018066, + "learning_rate": 0.0002, + "loss": 0.8347, + "step": 6080 + }, + { + "epoch": 0.9845606660738825, + "grad_norm": 0.4641951322555542, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 6090 + }, + { + "epoch": 0.9861773502546277, + "grad_norm": 0.5818535089492798, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 6100 + }, + { + "epoch": 0.9877940344353731, + "grad_norm": 0.63955157995224, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 6110 + }, + { + "epoch": 0.9894107186161183, + "grad_norm": 0.5649438500404358, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 6120 + }, + { + "epoch": 0.9910274027968636, + "grad_norm": 0.5290433168411255, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 6130 + }, + { + "epoch": 0.992644086977609, + "grad_norm": 0.6399374008178711, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 6140 + }, + { + "epoch": 0.9942607711583542, + "grad_norm": 0.6736576557159424, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 6150 + }, + { + "epoch": 0.9958774553390995, + "grad_norm": 0.515420138835907, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 6160 + }, + { + "epoch": 0.9974941395198448, + "grad_norm": 0.562677800655365, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 6170 + }, + { + "epoch": 0.9991108237005901, + "grad_norm": 0.7113858461380005, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 6180 + }, + { + "epoch": 0.9999191657909627, + "eval_loss": 1.0871200561523438, + "eval_runtime": 122.2071, + "eval_samples_per_second": 5.998, + "eval_steps_per_second": 0.753, + "step": 6185 + }, + { + "epoch": 1.0007275078813354, + "grad_norm": 0.7111801505088806, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 6190 + }, + { + "epoch": 1.0023441920620806, + "grad_norm": 0.5402125716209412, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 6200 + }, + { + "epoch": 1.003960876242826, + "grad_norm": 0.6098830103874207, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 6210 + }, + { + "epoch": 1.0055775604235713, + "grad_norm": 0.5829983353614807, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 6220 + }, + { + "epoch": 1.0071942446043165, + "grad_norm": 0.5614621043205261, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 6230 + }, + { + "epoch": 1.0088109287850617, + "grad_norm": 0.5954238772392273, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 6240 + }, + { + "epoch": 1.0104276129658072, + "grad_norm": 0.6480574607849121, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 6250 + }, + { + "epoch": 1.0120442971465524, + "grad_norm": 0.6051128506660461, + "learning_rate": 0.0002, + "loss": 0.7514, + "step": 6260 + }, + { + "epoch": 1.0136609813272976, + "grad_norm": 0.6318870782852173, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 6270 + }, + { + "epoch": 1.015277665508043, + "grad_norm": 0.5048980116844177, + "learning_rate": 0.0002, + "loss": 0.7178, + "step": 6280 + }, + { + "epoch": 1.0168943496887883, + "grad_norm": 0.6346936225891113, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 6290 + }, + { + "epoch": 1.0185110338695336, + "grad_norm": 0.5711665749549866, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 6300 + }, + { + "epoch": 1.0201277180502788, + "grad_norm": 0.5175361037254333, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 6310 + }, + { + "epoch": 1.0217444022310243, + "grad_norm": 0.5360831618309021, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 6320 + }, + { + "epoch": 1.0233610864117695, + "grad_norm": 0.614675760269165, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 6330 + }, + { + "epoch": 1.0249777705925147, + "grad_norm": 0.5626118183135986, + "learning_rate": 0.0002, + "loss": 0.7748, + "step": 6340 + }, + { + "epoch": 1.02659445477326, + "grad_norm": 0.574897289276123, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 6350 + }, + { + "epoch": 1.0282111389540054, + "grad_norm": 0.7185447812080383, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 6360 + }, + { + "epoch": 1.0298278231347506, + "grad_norm": 0.6705799698829651, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 6370 + }, + { + "epoch": 1.0314445073154959, + "grad_norm": 0.6740428805351257, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 6380 + }, + { + "epoch": 1.0330611914962413, + "grad_norm": 0.663902759552002, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 6390 + }, + { + "epoch": 1.0346778756769865, + "grad_norm": 0.5029543042182922, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 6400 + }, + { + "epoch": 1.0362945598577318, + "grad_norm": 0.7813863158226013, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 6410 + }, + { + "epoch": 1.037911244038477, + "grad_norm": 0.5396282076835632, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 6420 + }, + { + "epoch": 1.0395279282192225, + "grad_norm": 0.5253293514251709, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 6430 + }, + { + "epoch": 1.0411446123999677, + "grad_norm": 0.7236770987510681, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 6440 + }, + { + "epoch": 1.042761296580713, + "grad_norm": 0.5670917630195618, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 6450 + }, + { + "epoch": 1.0443779807614582, + "grad_norm": 0.6031978726387024, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 6460 + }, + { + "epoch": 1.0459946649422036, + "grad_norm": 0.5309213399887085, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 6470 + }, + { + "epoch": 1.0476113491229488, + "grad_norm": 0.7114651799201965, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 6480 + }, + { + "epoch": 1.049228033303694, + "grad_norm": 0.5591610670089722, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 6490 + }, + { + "epoch": 1.0508447174844395, + "grad_norm": 0.5185961127281189, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 6500 + }, + { + "epoch": 1.0524614016651848, + "grad_norm": 0.6510552167892456, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 6510 + }, + { + "epoch": 1.05407808584593, + "grad_norm": 0.6557928919792175, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 6520 + }, + { + "epoch": 1.0556947700266752, + "grad_norm": 0.6973192691802979, + "learning_rate": 0.0002, + "loss": 0.8056, + "step": 6530 + }, + { + "epoch": 1.0573114542074207, + "grad_norm": 0.6226583123207092, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 6540 + }, + { + "epoch": 1.058928138388166, + "grad_norm": 0.5633195638656616, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 6550 + }, + { + "epoch": 1.0605448225689111, + "grad_norm": 0.7466658353805542, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 6560 + }, + { + "epoch": 1.0621615067496564, + "grad_norm": 0.6462772488594055, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 6570 + }, + { + "epoch": 1.0637781909304018, + "grad_norm": 0.5266856551170349, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 6580 + }, + { + "epoch": 1.065394875111147, + "grad_norm": 0.534392774105072, + "learning_rate": 0.0002, + "loss": 0.7157, + "step": 6590 + }, + { + "epoch": 1.0670115592918923, + "grad_norm": 0.7514177560806274, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 6600 + }, + { + "epoch": 1.0686282434726375, + "grad_norm": 0.7593035697937012, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 6610 + }, + { + "epoch": 1.070244927653383, + "grad_norm": 0.5277858972549438, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 6620 + }, + { + "epoch": 1.0718616118341282, + "grad_norm": 0.5573670268058777, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 6630 + }, + { + "epoch": 1.0734782960148734, + "grad_norm": 0.6802396774291992, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 6640 + }, + { + "epoch": 1.0750949801956189, + "grad_norm": 0.7367215752601624, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 6650 + }, + { + "epoch": 1.0767116643763641, + "grad_norm": 0.5961891412734985, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 6660 + }, + { + "epoch": 1.0783283485571094, + "grad_norm": 0.5736313462257385, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 6670 + }, + { + "epoch": 1.0799450327378546, + "grad_norm": 0.619219183921814, + "learning_rate": 0.0002, + "loss": 0.7178, + "step": 6680 + }, + { + "epoch": 1.0815617169186, + "grad_norm": 0.6214390993118286, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 6690 + }, + { + "epoch": 1.0831784010993453, + "grad_norm": 0.564536988735199, + "learning_rate": 0.0002, + "loss": 0.7554, + "step": 6700 + }, + { + "epoch": 1.0847950852800905, + "grad_norm": 0.5838140249252319, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 6710 + }, + { + "epoch": 1.0864117694608357, + "grad_norm": 0.7000553607940674, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 6720 + }, + { + "epoch": 1.0880284536415812, + "grad_norm": 0.7078263759613037, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 6730 + }, + { + "epoch": 1.0896451378223264, + "grad_norm": 0.8353848457336426, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 6740 + }, + { + "epoch": 1.0912618220030716, + "grad_norm": 0.5615518689155579, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 6750 + }, + { + "epoch": 1.0928785061838169, + "grad_norm": 0.5475581288337708, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 6760 + }, + { + "epoch": 1.0944951903645623, + "grad_norm": 0.5835978388786316, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 6770 + }, + { + "epoch": 1.0961118745453076, + "grad_norm": 0.5516105890274048, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 6780 + }, + { + "epoch": 1.0977285587260528, + "grad_norm": 0.5875251889228821, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 6790 + }, + { + "epoch": 1.0993452429067982, + "grad_norm": 0.7376947999000549, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 6800 + }, + { + "epoch": 1.1009619270875435, + "grad_norm": 0.5656165480613708, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 6810 + }, + { + "epoch": 1.1025786112682887, + "grad_norm": 0.6365954279899597, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 6820 + }, + { + "epoch": 1.104195295449034, + "grad_norm": 0.5033080577850342, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 6830 + }, + { + "epoch": 1.1058119796297794, + "grad_norm": 0.617396891117096, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 6840 + }, + { + "epoch": 1.1074286638105246, + "grad_norm": 0.6395374536514282, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 6850 + }, + { + "epoch": 1.1090453479912699, + "grad_norm": 0.6775295734405518, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 6860 + }, + { + "epoch": 1.1106620321720153, + "grad_norm": 0.6655223965644836, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 6870 + }, + { + "epoch": 1.1122787163527605, + "grad_norm": 0.676655113697052, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 6880 + }, + { + "epoch": 1.1138954005335058, + "grad_norm": 0.6062718629837036, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 6890 + }, + { + "epoch": 1.115512084714251, + "grad_norm": 0.590943455696106, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 6900 + }, + { + "epoch": 1.1171287688949965, + "grad_norm": 0.6315317153930664, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 6910 + }, + { + "epoch": 1.1187454530757417, + "grad_norm": 0.47979024052619934, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 6920 + }, + { + "epoch": 1.120362137256487, + "grad_norm": 0.647298276424408, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 6930 + }, + { + "epoch": 1.1219788214372322, + "grad_norm": 0.7336484789848328, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 6940 + }, + { + "epoch": 1.1235955056179776, + "grad_norm": 0.5071424245834351, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 6950 + }, + { + "epoch": 1.1252121897987228, + "grad_norm": 0.6527144312858582, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 6960 + }, + { + "epoch": 1.126828873979468, + "grad_norm": 0.6935935020446777, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 6970 + }, + { + "epoch": 1.1284455581602133, + "grad_norm": 0.8026931881904602, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 6980 + }, + { + "epoch": 1.1300622423409588, + "grad_norm": 0.5210393667221069, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 6990 + }, + { + "epoch": 1.131678926521704, + "grad_norm": 0.60475093126297, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 7000 + }, + { + "epoch": 1.1332956107024492, + "grad_norm": 0.6417073607444763, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 7010 + }, + { + "epoch": 1.1349122948831947, + "grad_norm": 0.6732175946235657, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 7020 + }, + { + "epoch": 1.13652897906394, + "grad_norm": 0.6719491481781006, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 7030 + }, + { + "epoch": 1.1381456632446851, + "grad_norm": 0.5708295106887817, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 7040 + }, + { + "epoch": 1.1397623474254304, + "grad_norm": 0.7141719460487366, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 7050 + }, + { + "epoch": 1.1413790316061758, + "grad_norm": 0.6187017560005188, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 7060 + }, + { + "epoch": 1.142995715786921, + "grad_norm": 0.50581294298172, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 7070 + }, + { + "epoch": 1.1446123999676663, + "grad_norm": 0.5620143413543701, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 7080 + }, + { + "epoch": 1.1462290841484115, + "grad_norm": 0.6231929659843445, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 7090 + }, + { + "epoch": 1.147845768329157, + "grad_norm": 0.5775774121284485, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 7100 + }, + { + "epoch": 1.1494624525099022, + "grad_norm": 0.6492809653282166, + "learning_rate": 0.0002, + "loss": 0.7728, + "step": 7110 + }, + { + "epoch": 1.1510791366906474, + "grad_norm": 0.6434972286224365, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 7120 + }, + { + "epoch": 1.1526958208713927, + "grad_norm": 0.6191812753677368, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 7130 + }, + { + "epoch": 1.1543125050521381, + "grad_norm": 0.6690331697463989, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 7140 + }, + { + "epoch": 1.1559291892328833, + "grad_norm": 0.5977938175201416, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 7150 + }, + { + "epoch": 1.1575458734136286, + "grad_norm": 0.6195854544639587, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 7160 + }, + { + "epoch": 1.159162557594374, + "grad_norm": 0.5752048492431641, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 7170 + }, + { + "epoch": 1.1607792417751193, + "grad_norm": 0.589081883430481, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 7180 + }, + { + "epoch": 1.1623959259558645, + "grad_norm": 0.756996750831604, + "learning_rate": 0.0002, + "loss": 0.704, + "step": 7190 + }, + { + "epoch": 1.1640126101366097, + "grad_norm": 0.7614967226982117, + "learning_rate": 0.0002, + "loss": 0.7404, + "step": 7200 + }, + { + "epoch": 1.1656292943173552, + "grad_norm": 0.6120437979698181, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 7210 + }, + { + "epoch": 1.1672459784981004, + "grad_norm": 0.6210004687309265, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 7220 + }, + { + "epoch": 1.1688626626788456, + "grad_norm": 0.6044116020202637, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 7230 + }, + { + "epoch": 1.170479346859591, + "grad_norm": 0.5418457388877869, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 7240 + }, + { + "epoch": 1.1720960310403363, + "grad_norm": 0.6413537263870239, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 7250 + }, + { + "epoch": 1.1737127152210816, + "grad_norm": 0.5777867436408997, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 7260 + }, + { + "epoch": 1.1753293994018268, + "grad_norm": 0.7092402577400208, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 7270 + }, + { + "epoch": 1.176946083582572, + "grad_norm": 0.6351709365844727, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 7280 + }, + { + "epoch": 1.1785627677633175, + "grad_norm": 0.6172189712524414, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 7290 + }, + { + "epoch": 1.1801794519440627, + "grad_norm": 0.6801714897155762, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 7300 + }, + { + "epoch": 1.181796136124808, + "grad_norm": 0.6044712066650391, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 7310 + }, + { + "epoch": 1.1834128203055534, + "grad_norm": 0.7413212060928345, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 7320 + }, + { + "epoch": 1.1850295044862986, + "grad_norm": 0.5303856134414673, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 7330 + }, + { + "epoch": 1.1866461886670439, + "grad_norm": 0.5647098422050476, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 7340 + }, + { + "epoch": 1.188262872847789, + "grad_norm": 0.7374135255813599, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 7350 + }, + { + "epoch": 1.1898795570285345, + "grad_norm": 0.5710089206695557, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 7360 + }, + { + "epoch": 1.1914962412092798, + "grad_norm": 0.6073619723320007, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7370 + }, + { + "epoch": 1.193112925390025, + "grad_norm": 0.5899916887283325, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 7380 + }, + { + "epoch": 1.1947296095707705, + "grad_norm": 0.7762434482574463, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 7390 + }, + { + "epoch": 1.1963462937515157, + "grad_norm": 0.679949939250946, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 7400 + }, + { + "epoch": 1.197962977932261, + "grad_norm": 0.6106849312782288, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 7410 + }, + { + "epoch": 1.1995796621130062, + "grad_norm": 0.682461678981781, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 7420 + }, + { + "epoch": 1.2011963462937516, + "grad_norm": 0.6087017059326172, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 7430 + }, + { + "epoch": 1.2028130304744968, + "grad_norm": 0.63739013671875, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 7440 + }, + { + "epoch": 1.204429714655242, + "grad_norm": 0.6154777407646179, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 7450 + }, + { + "epoch": 1.2060463988359873, + "grad_norm": 0.7491534948348999, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 7460 + }, + { + "epoch": 1.2076630830167328, + "grad_norm": 0.6664797067642212, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 7470 + }, + { + "epoch": 1.209279767197478, + "grad_norm": 0.6660266518592834, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 7480 + }, + { + "epoch": 1.2108964513782232, + "grad_norm": 0.6972551345825195, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 7490 + }, + { + "epoch": 1.2125131355589684, + "grad_norm": 0.6157945990562439, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 7500 + }, + { + "epoch": 1.214129819739714, + "grad_norm": 0.5199310183525085, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 7510 + }, + { + "epoch": 1.2157465039204591, + "grad_norm": 0.577610433101654, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 7520 + }, + { + "epoch": 1.2173631881012044, + "grad_norm": 0.53652423620224, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 7530 + }, + { + "epoch": 1.2189798722819498, + "grad_norm": 0.6479050517082214, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 7540 + }, + { + "epoch": 1.220596556462695, + "grad_norm": 0.618748128414154, + "learning_rate": 0.0002, + "loss": 0.7534, + "step": 7550 + }, + { + "epoch": 1.2222132406434403, + "grad_norm": 0.6311424374580383, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 7560 + }, + { + "epoch": 1.2238299248241855, + "grad_norm": 0.6595825552940369, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 7570 + }, + { + "epoch": 1.225446609004931, + "grad_norm": 0.5198960900306702, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 7580 + }, + { + "epoch": 1.2270632931856762, + "grad_norm": 0.578650712966919, + "learning_rate": 0.0002, + "loss": 0.674, + "step": 7590 + }, + { + "epoch": 1.2286799773664214, + "grad_norm": 0.6080220937728882, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 7600 + }, + { + "epoch": 1.2302966615471669, + "grad_norm": 0.7050248384475708, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 7610 + }, + { + "epoch": 1.2319133457279121, + "grad_norm": 0.6652196049690247, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 7620 + }, + { + "epoch": 1.2335300299086573, + "grad_norm": 0.7322776317596436, + "learning_rate": 0.0002, + "loss": 0.7085, + "step": 7630 + }, + { + "epoch": 1.2351467140894026, + "grad_norm": 0.4998728036880493, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 7640 + }, + { + "epoch": 1.2367633982701478, + "grad_norm": 0.6428788900375366, + "learning_rate": 0.0002, + "loss": 0.7214, + "step": 7650 + }, + { + "epoch": 1.2383800824508933, + "grad_norm": 0.585242509841919, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 7660 + }, + { + "epoch": 1.2399967666316385, + "grad_norm": 0.5211917757987976, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 7670 + }, + { + "epoch": 1.2416134508123837, + "grad_norm": 0.6490384340286255, + "learning_rate": 0.0002, + "loss": 0.746, + "step": 7680 + }, + { + "epoch": 1.2432301349931292, + "grad_norm": 0.6249763369560242, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 7690 + }, + { + "epoch": 1.2448468191738744, + "grad_norm": 0.71870356798172, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 7700 + }, + { + "epoch": 1.2464635033546196, + "grad_norm": 0.6761967539787292, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 7710 + }, + { + "epoch": 1.2480801875353649, + "grad_norm": 0.6500617265701294, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 7720 + }, + { + "epoch": 1.2496968717161103, + "grad_norm": 0.8069869875907898, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 7730 + }, + { + "epoch": 1.2513135558968556, + "grad_norm": 0.6044608950614929, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 7740 + }, + { + "epoch": 1.2529302400776008, + "grad_norm": 0.6573283076286316, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 7750 + }, + { + "epoch": 1.2545469242583462, + "grad_norm": 0.625430166721344, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 7760 + }, + { + "epoch": 1.2561636084390915, + "grad_norm": 0.5442022681236267, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 7770 + }, + { + "epoch": 1.2577802926198367, + "grad_norm": 0.6818386912345886, + "learning_rate": 0.0002, + "loss": 0.7172, + "step": 7780 + }, + { + "epoch": 1.259396976800582, + "grad_norm": 0.6381874084472656, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 7790 + }, + { + "epoch": 1.2610136609813272, + "grad_norm": 0.6269212961196899, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 7800 + }, + { + "epoch": 1.2626303451620726, + "grad_norm": 0.600121259689331, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 7810 + }, + { + "epoch": 1.2642470293428179, + "grad_norm": 0.6337703466415405, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 7820 + }, + { + "epoch": 1.2658637135235633, + "grad_norm": 0.7234963774681091, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 7830 + }, + { + "epoch": 1.2674803977043085, + "grad_norm": 0.800184965133667, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 7840 + }, + { + "epoch": 1.2690970818850538, + "grad_norm": 0.7539464831352234, + "learning_rate": 0.0002, + "loss": 0.7426, + "step": 7850 + }, + { + "epoch": 1.270713766065799, + "grad_norm": 0.5493760704994202, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 7860 + }, + { + "epoch": 1.2723304502465442, + "grad_norm": 0.7477145791053772, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 7870 + }, + { + "epoch": 1.2739471344272897, + "grad_norm": 0.6366362571716309, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 7880 + }, + { + "epoch": 1.275563818608035, + "grad_norm": 0.7419533729553223, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 7890 + }, + { + "epoch": 1.2771805027887801, + "grad_norm": 0.6141223311424255, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 7900 + }, + { + "epoch": 1.2787971869695256, + "grad_norm": 0.7522598505020142, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 7910 + }, + { + "epoch": 1.2804138711502708, + "grad_norm": 0.6935804486274719, + "learning_rate": 0.0002, + "loss": 0.7097, + "step": 7920 + }, + { + "epoch": 1.282030555331016, + "grad_norm": 0.7239290475845337, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 7930 + }, + { + "epoch": 1.2836472395117613, + "grad_norm": 0.8800187110900879, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 7940 + }, + { + "epoch": 1.2852639236925067, + "grad_norm": 0.540458083152771, + "learning_rate": 0.0002, + "loss": 0.6991, + "step": 7950 + }, + { + "epoch": 1.286880607873252, + "grad_norm": 0.6492934226989746, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 7960 + }, + { + "epoch": 1.2884972920539972, + "grad_norm": 0.6543959379196167, + "learning_rate": 0.0002, + "loss": 0.7742, + "step": 7970 + }, + { + "epoch": 1.2901139762347427, + "grad_norm": 0.5804705619812012, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 7980 + }, + { + "epoch": 1.291730660415488, + "grad_norm": 0.7074727416038513, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 7990 + }, + { + "epoch": 1.2933473445962331, + "grad_norm": 0.5347974300384521, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 8000 + }, + { + "epoch": 1.2949640287769784, + "grad_norm": 0.6457298398017883, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 8010 + }, + { + "epoch": 1.2965807129577236, + "grad_norm": 0.6407219171524048, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 8020 + }, + { + "epoch": 1.298197397138469, + "grad_norm": 0.828439474105835, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 8030 + }, + { + "epoch": 1.2998140813192143, + "grad_norm": 0.4840380549430847, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 8040 + }, + { + "epoch": 1.3014307654999595, + "grad_norm": 0.5921024680137634, + "learning_rate": 0.0002, + "loss": 0.7283, + "step": 8050 + }, + { + "epoch": 1.303047449680705, + "grad_norm": 0.6170315146446228, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 8060 + }, + { + "epoch": 1.3046641338614502, + "grad_norm": 0.5374847054481506, + "learning_rate": 0.0002, + "loss": 0.7534, + "step": 8070 + }, + { + "epoch": 1.3062808180421954, + "grad_norm": 0.545758068561554, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 8080 + }, + { + "epoch": 1.3078975022229407, + "grad_norm": 0.55641770362854, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 8090 + }, + { + "epoch": 1.309514186403686, + "grad_norm": 0.6724897027015686, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8100 + }, + { + "epoch": 1.3111308705844313, + "grad_norm": 0.6923972368240356, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 8110 + }, + { + "epoch": 1.3127475547651766, + "grad_norm": 0.5136841535568237, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 8120 + }, + { + "epoch": 1.314364238945922, + "grad_norm": 0.6766283512115479, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 8130 + }, + { + "epoch": 1.3159809231266673, + "grad_norm": 0.6283926367759705, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 8140 + }, + { + "epoch": 1.3175976073074125, + "grad_norm": 0.644216001033783, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 8150 + }, + { + "epoch": 1.3192142914881577, + "grad_norm": 0.7827503085136414, + "learning_rate": 0.0002, + "loss": 0.7125, + "step": 8160 + }, + { + "epoch": 1.320830975668903, + "grad_norm": 0.6651390790939331, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 8170 + }, + { + "epoch": 1.3224476598496484, + "grad_norm": 0.5547412633895874, + "learning_rate": 0.0002, + "loss": 0.7778, + "step": 8180 + }, + { + "epoch": 1.3240643440303936, + "grad_norm": 0.6765179634094238, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 8190 + }, + { + "epoch": 1.325681028211139, + "grad_norm": 0.6822077035903931, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 8200 + }, + { + "epoch": 1.3272977123918843, + "grad_norm": 0.5941002368927002, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 8210 + }, + { + "epoch": 1.3289143965726296, + "grad_norm": 0.4850037097930908, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 8220 + }, + { + "epoch": 1.3305310807533748, + "grad_norm": 0.6162990927696228, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 8230 + }, + { + "epoch": 1.33214776493412, + "grad_norm": 0.6665613651275635, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 8240 + }, + { + "epoch": 1.3337644491148655, + "grad_norm": 0.618192732334137, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 8250 + }, + { + "epoch": 1.3353811332956107, + "grad_norm": 0.710418701171875, + "learning_rate": 0.0002, + "loss": 0.7532, + "step": 8260 + }, + { + "epoch": 1.336997817476356, + "grad_norm": 0.5109876990318298, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 8270 + }, + { + "epoch": 1.3386145016571014, + "grad_norm": 0.6791711449623108, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 8280 + }, + { + "epoch": 1.3402311858378466, + "grad_norm": 0.6836432814598083, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8290 + }, + { + "epoch": 1.3418478700185918, + "grad_norm": 0.5579386353492737, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8300 + }, + { + "epoch": 1.343464554199337, + "grad_norm": 0.6713546514511108, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 8310 + }, + { + "epoch": 1.3450812383800825, + "grad_norm": 0.5353720188140869, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 8320 + }, + { + "epoch": 1.3466979225608278, + "grad_norm": 0.5813682675361633, + "learning_rate": 0.0002, + "loss": 0.718, + "step": 8330 + }, + { + "epoch": 1.348314606741573, + "grad_norm": 0.8158791661262512, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 8340 + }, + { + "epoch": 1.3499312909223184, + "grad_norm": 0.6193785071372986, + "learning_rate": 0.0002, + "loss": 0.6992, + "step": 8350 + }, + { + "epoch": 1.3515479751030637, + "grad_norm": 0.6353939771652222, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 8360 + }, + { + "epoch": 1.353164659283809, + "grad_norm": 0.6925048232078552, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 8370 + }, + { + "epoch": 1.3547813434645541, + "grad_norm": 0.988264799118042, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 8380 + }, + { + "epoch": 1.3563980276452994, + "grad_norm": 0.6476002931594849, + "learning_rate": 0.0002, + "loss": 0.7744, + "step": 8390 + }, + { + "epoch": 1.3580147118260448, + "grad_norm": 0.7120398879051208, + "learning_rate": 0.0002, + "loss": 0.776, + "step": 8400 + }, + { + "epoch": 1.35963139600679, + "grad_norm": 0.9048416614532471, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 8410 + }, + { + "epoch": 1.3612480801875353, + "grad_norm": 0.7000672817230225, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 8420 + }, + { + "epoch": 1.3628647643682807, + "grad_norm": 0.6015632152557373, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 8430 + }, + { + "epoch": 1.364481448549026, + "grad_norm": 0.612516462802887, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 8440 + }, + { + "epoch": 1.3660981327297712, + "grad_norm": 0.5969301462173462, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 8450 + }, + { + "epoch": 1.3677148169105164, + "grad_norm": 0.6730654239654541, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 8460 + }, + { + "epoch": 1.369331501091262, + "grad_norm": 0.6386392116546631, + "learning_rate": 0.0002, + "loss": 0.7465, + "step": 8470 + }, + { + "epoch": 1.3709481852720071, + "grad_norm": 0.739544153213501, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 8480 + }, + { + "epoch": 1.3725648694527524, + "grad_norm": 0.6462782621383667, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 8490 + }, + { + "epoch": 1.3741815536334978, + "grad_norm": 0.7346843481063843, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 8500 + }, + { + "epoch": 1.375798237814243, + "grad_norm": 0.6884821057319641, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 8510 + }, + { + "epoch": 1.3774149219949883, + "grad_norm": 0.6999333500862122, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 8520 + }, + { + "epoch": 1.3790316061757335, + "grad_norm": 0.5378713011741638, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 8530 + }, + { + "epoch": 1.3806482903564787, + "grad_norm": 0.5417906641960144, + "learning_rate": 0.0002, + "loss": 0.6797, + "step": 8540 + }, + { + "epoch": 1.3822649745372242, + "grad_norm": 0.6602526307106018, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 8550 + }, + { + "epoch": 1.3838816587179694, + "grad_norm": 0.7073674201965332, + "learning_rate": 0.0002, + "loss": 0.7356, + "step": 8560 + }, + { + "epoch": 1.3854983428987149, + "grad_norm": 0.5841707587242126, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 8570 + }, + { + "epoch": 1.38711502707946, + "grad_norm": 0.7031095027923584, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 8580 + }, + { + "epoch": 1.3887317112602053, + "grad_norm": 0.5198570489883423, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 8590 + }, + { + "epoch": 1.3903483954409506, + "grad_norm": 0.7261320352554321, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 8600 + }, + { + "epoch": 1.3919650796216958, + "grad_norm": 0.5616350173950195, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 8610 + }, + { + "epoch": 1.3935817638024413, + "grad_norm": 0.5185914635658264, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 8620 + }, + { + "epoch": 1.3951984479831865, + "grad_norm": 0.5814694762229919, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 8630 + }, + { + "epoch": 1.3968151321639317, + "grad_norm": 0.6977371573448181, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 8640 + }, + { + "epoch": 1.3984318163446772, + "grad_norm": 0.6855689883232117, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 8650 + }, + { + "epoch": 1.4000485005254224, + "grad_norm": 0.5414357781410217, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 8660 + }, + { + "epoch": 1.4016651847061676, + "grad_norm": 0.6970012784004211, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 8670 + }, + { + "epoch": 1.4032818688869129, + "grad_norm": 0.526079535484314, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 8680 + }, + { + "epoch": 1.404898553067658, + "grad_norm": 0.758712887763977, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 8690 + }, + { + "epoch": 1.4065152372484035, + "grad_norm": 0.7118762731552124, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 8700 + }, + { + "epoch": 1.4081319214291488, + "grad_norm": 0.5696909427642822, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 8710 + }, + { + "epoch": 1.4097486056098942, + "grad_norm": 0.7995436787605286, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 8720 + }, + { + "epoch": 1.4113652897906395, + "grad_norm": 0.7237521409988403, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 8730 + }, + { + "epoch": 1.4129819739713847, + "grad_norm": 0.744628369808197, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 8740 + }, + { + "epoch": 1.41459865815213, + "grad_norm": 0.6082926988601685, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 8750 + }, + { + "epoch": 1.4162153423328752, + "grad_norm": 0.5185243487358093, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 8760 + }, + { + "epoch": 1.4178320265136206, + "grad_norm": 0.5183082222938538, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 8770 + }, + { + "epoch": 1.4194487106943658, + "grad_norm": 0.7326041460037231, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 8780 + }, + { + "epoch": 1.421065394875111, + "grad_norm": 0.7174660563468933, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 8790 + }, + { + "epoch": 1.4226820790558565, + "grad_norm": 0.8080165982246399, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 8800 + }, + { + "epoch": 1.4242987632366018, + "grad_norm": 0.5061507821083069, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 8810 + }, + { + "epoch": 1.425915447417347, + "grad_norm": 0.801602840423584, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 8820 + }, + { + "epoch": 1.4275321315980922, + "grad_norm": 0.6150273084640503, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 8830 + }, + { + "epoch": 1.4291488157788377, + "grad_norm": 0.8786525726318359, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 8840 + }, + { + "epoch": 1.430765499959583, + "grad_norm": 0.6371538639068604, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 8850 + }, + { + "epoch": 1.4323821841403281, + "grad_norm": 0.6409295797348022, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 8860 + }, + { + "epoch": 1.4339988683210736, + "grad_norm": 0.6452359557151794, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 8870 + }, + { + "epoch": 1.4356155525018188, + "grad_norm": 0.5842334628105164, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 8880 + }, + { + "epoch": 1.437232236682564, + "grad_norm": 0.696761965751648, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 8890 + }, + { + "epoch": 1.4388489208633093, + "grad_norm": 0.6384600400924683, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 8900 + }, + { + "epoch": 1.4404656050440545, + "grad_norm": 0.5981136560440063, + "learning_rate": 0.0002, + "loss": 0.7049, + "step": 8910 + }, + { + "epoch": 1.4420822892248, + "grad_norm": 0.6355637907981873, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 8920 + }, + { + "epoch": 1.4436989734055452, + "grad_norm": 0.6374830603599548, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 8930 + }, + { + "epoch": 1.4453156575862904, + "grad_norm": 0.559013307094574, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 8940 + }, + { + "epoch": 1.446932341767036, + "grad_norm": 0.7289170026779175, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 8950 + }, + { + "epoch": 1.4485490259477811, + "grad_norm": 0.8649206757545471, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 8960 + }, + { + "epoch": 1.4501657101285264, + "grad_norm": 0.7664689421653748, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 8970 + }, + { + "epoch": 1.4517823943092716, + "grad_norm": 0.7109952569007874, + "learning_rate": 0.0002, + "loss": 0.7431, + "step": 8980 + }, + { + "epoch": 1.453399078490017, + "grad_norm": 0.6312844753265381, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 8990 + }, + { + "epoch": 1.4550157626707623, + "grad_norm": 0.6616617441177368, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 9000 + }, + { + "epoch": 1.4566324468515075, + "grad_norm": 0.7384068965911865, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 9010 + }, + { + "epoch": 1.458249131032253, + "grad_norm": 0.6549670100212097, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 9020 + }, + { + "epoch": 1.4598658152129982, + "grad_norm": 0.6254119277000427, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 9030 + }, + { + "epoch": 1.4614824993937434, + "grad_norm": 0.6806328892707825, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 9040 + }, + { + "epoch": 1.4630991835744886, + "grad_norm": 0.6803115010261536, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 9050 + }, + { + "epoch": 1.4647158677552339, + "grad_norm": 0.48529282212257385, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 9060 + }, + { + "epoch": 1.4663325519359793, + "grad_norm": 0.5995030999183655, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 9070 + }, + { + "epoch": 1.4679492361167246, + "grad_norm": 0.6005427837371826, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 9080 + }, + { + "epoch": 1.46956592029747, + "grad_norm": 0.718564510345459, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 9090 + }, + { + "epoch": 1.4711826044782153, + "grad_norm": 0.7003577351570129, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 9100 + }, + { + "epoch": 1.4727992886589605, + "grad_norm": 0.5888323783874512, + "learning_rate": 0.0002, + "loss": 0.8069, + "step": 9110 + }, + { + "epoch": 1.4744159728397057, + "grad_norm": 0.6417609453201294, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 9120 + }, + { + "epoch": 1.476032657020451, + "grad_norm": 0.572294294834137, + "learning_rate": 0.0002, + "loss": 0.7441, + "step": 9130 + }, + { + "epoch": 1.4776493412011964, + "grad_norm": 0.8200714588165283, + "learning_rate": 0.0002, + "loss": 0.8053, + "step": 9140 + }, + { + "epoch": 1.4792660253819416, + "grad_norm": 0.6343288421630859, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 9150 + }, + { + "epoch": 1.4808827095626869, + "grad_norm": 0.7017961144447327, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 9160 + }, + { + "epoch": 1.4824993937434323, + "grad_norm": 0.6202912926673889, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 9170 + }, + { + "epoch": 1.4841160779241775, + "grad_norm": 0.6677869558334351, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 9180 + }, + { + "epoch": 1.4857327621049228, + "grad_norm": 0.6052267551422119, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 9190 + }, + { + "epoch": 1.487349446285668, + "grad_norm": 0.6638872027397156, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 9200 + }, + { + "epoch": 1.4889661304664135, + "grad_norm": 0.6245523691177368, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 9210 + }, + { + "epoch": 1.4905828146471587, + "grad_norm": 0.5761767625808716, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 9220 + }, + { + "epoch": 1.492199498827904, + "grad_norm": 0.8175981640815735, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 9230 + }, + { + "epoch": 1.4938161830086494, + "grad_norm": 0.9144009947776794, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 9240 + }, + { + "epoch": 1.4954328671893946, + "grad_norm": 0.5742552876472473, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 9250 + }, + { + "epoch": 1.4970495513701398, + "grad_norm": 0.534534215927124, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 9260 + }, + { + "epoch": 1.498666235550885, + "grad_norm": 0.7836225032806396, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 9270 + }, + { + "epoch": 1.5002829197316303, + "grad_norm": 0.5292993187904358, + "learning_rate": 0.0002, + "loss": 0.7453, + "step": 9280 + }, + { + "epoch": 1.5018996039123758, + "grad_norm": 0.8044071793556213, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 9290 + }, + { + "epoch": 1.503516288093121, + "grad_norm": 0.6185805201530457, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 9300 + }, + { + "epoch": 1.5051329722738664, + "grad_norm": 0.6093607544898987, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 9310 + }, + { + "epoch": 1.5067496564546117, + "grad_norm": 0.5891730189323425, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 9320 + }, + { + "epoch": 1.508366340635357, + "grad_norm": 0.6331129670143127, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 9330 + }, + { + "epoch": 1.5099830248161021, + "grad_norm": 0.7690958380699158, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 9340 + }, + { + "epoch": 1.5115997089968474, + "grad_norm": 0.6548877358436584, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 9350 + }, + { + "epoch": 1.5132163931775926, + "grad_norm": 0.6545143127441406, + "learning_rate": 0.0002, + "loss": 0.7408, + "step": 9360 + }, + { + "epoch": 1.514833077358338, + "grad_norm": 0.553247332572937, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 9370 + }, + { + "epoch": 1.5164497615390833, + "grad_norm": 0.8145074844360352, + "learning_rate": 0.0002, + "loss": 0.7265, + "step": 9380 + }, + { + "epoch": 1.5180664457198287, + "grad_norm": 0.7636994123458862, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 9390 + }, + { + "epoch": 1.519683129900574, + "grad_norm": 0.6838982701301575, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 9400 + }, + { + "epoch": 1.5212998140813192, + "grad_norm": 0.8599441647529602, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 9410 + }, + { + "epoch": 1.5229164982620644, + "grad_norm": 0.7020329833030701, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 9420 + }, + { + "epoch": 1.5245331824428097, + "grad_norm": 0.6964772343635559, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 9430 + }, + { + "epoch": 1.5261498666235551, + "grad_norm": 0.6916600465774536, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 9440 + }, + { + "epoch": 1.5277665508043003, + "grad_norm": 0.7282621264457703, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 9450 + }, + { + "epoch": 1.5293832349850458, + "grad_norm": 0.5363983511924744, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 9460 + }, + { + "epoch": 1.530999919165791, + "grad_norm": 0.6184861063957214, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 9470 + }, + { + "epoch": 1.5326166033465363, + "grad_norm": 0.5991285443305969, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 9480 + }, + { + "epoch": 1.5342332875272815, + "grad_norm": 0.8176587820053101, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 9490 + }, + { + "epoch": 1.5358499717080267, + "grad_norm": 0.6473721861839294, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 9500 + }, + { + "epoch": 1.5374666558887722, + "grad_norm": 0.7319952845573425, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 9510 + }, + { + "epoch": 1.5390833400695174, + "grad_norm": 0.702900230884552, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 9520 + }, + { + "epoch": 1.5407000242502629, + "grad_norm": 0.7971600294113159, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 9530 + }, + { + "epoch": 1.542316708431008, + "grad_norm": 0.6527525186538696, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 9540 + }, + { + "epoch": 1.5439333926117533, + "grad_norm": 0.5791676044464111, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 9550 + }, + { + "epoch": 1.5455500767924986, + "grad_norm": 0.5619390606880188, + "learning_rate": 0.0002, + "loss": 0.7585, + "step": 9560 + }, + { + "epoch": 1.5471667609732438, + "grad_norm": 0.5701689124107361, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 9570 + }, + { + "epoch": 1.548783445153989, + "grad_norm": 0.47549352049827576, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 9580 + }, + { + "epoch": 1.5504001293347345, + "grad_norm": 0.8730611205101013, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 9590 + }, + { + "epoch": 1.5520168135154797, + "grad_norm": 0.6842091083526611, + "learning_rate": 0.0002, + "loss": 0.798, + "step": 9600 + }, + { + "epoch": 1.5536334976962252, + "grad_norm": 0.6675129532814026, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 9610 + }, + { + "epoch": 1.5552501818769704, + "grad_norm": 0.8173956274986267, + "learning_rate": 0.0002, + "loss": 0.7954, + "step": 9620 + }, + { + "epoch": 1.5568668660577156, + "grad_norm": 0.724947452545166, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 9630 + }, + { + "epoch": 1.5584835502384609, + "grad_norm": 0.6154758930206299, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 9640 + }, + { + "epoch": 1.560100234419206, + "grad_norm": 0.6072008013725281, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 9650 + }, + { + "epoch": 1.5617169185999515, + "grad_norm": 0.659010648727417, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 9660 + }, + { + "epoch": 1.5633336027806968, + "grad_norm": 0.65857994556427, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 9670 + }, + { + "epoch": 1.5649502869614422, + "grad_norm": 0.5914267301559448, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 9680 + }, + { + "epoch": 1.5665669711421875, + "grad_norm": 0.6248020529747009, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 9690 + }, + { + "epoch": 1.5681836553229327, + "grad_norm": 0.7147795557975769, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 9700 + }, + { + "epoch": 1.569800339503678, + "grad_norm": 0.7076232433319092, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 9710 + }, + { + "epoch": 1.5714170236844232, + "grad_norm": 0.6217400431632996, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 9720 + }, + { + "epoch": 1.5730337078651684, + "grad_norm": 0.6709911227226257, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 9730 + }, + { + "epoch": 1.5746503920459138, + "grad_norm": 0.749171257019043, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 9740 + }, + { + "epoch": 1.576267076226659, + "grad_norm": 0.6241145730018616, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 9750 + }, + { + "epoch": 1.5778837604074045, + "grad_norm": 0.4960934817790985, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 9760 + }, + { + "epoch": 1.5795004445881498, + "grad_norm": 0.6593309640884399, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 9770 + }, + { + "epoch": 1.581117128768895, + "grad_norm": 0.5814042091369629, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 9780 + }, + { + "epoch": 1.5827338129496402, + "grad_norm": 0.5936070680618286, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 9790 + }, + { + "epoch": 1.5843504971303854, + "grad_norm": 0.6454403400421143, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 9800 + }, + { + "epoch": 1.585967181311131, + "grad_norm": 0.7612107992172241, + "learning_rate": 0.0002, + "loss": 0.7677, + "step": 9810 + }, + { + "epoch": 1.5875838654918761, + "grad_norm": 0.6494482755661011, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 9820 + }, + { + "epoch": 1.5892005496726216, + "grad_norm": 0.7825694680213928, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 9830 + }, + { + "epoch": 1.5908172338533668, + "grad_norm": 0.6757757663726807, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 9840 + }, + { + "epoch": 1.592433918034112, + "grad_norm": 0.7105609178543091, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 9850 + }, + { + "epoch": 1.5940506022148573, + "grad_norm": 0.7596991062164307, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 9860 + }, + { + "epoch": 1.5956672863956025, + "grad_norm": 0.5681525468826294, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 9870 + }, + { + "epoch": 1.5972839705763477, + "grad_norm": 0.6090980768203735, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 9880 + }, + { + "epoch": 1.5989006547570932, + "grad_norm": 0.6271613240242004, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 9890 + }, + { + "epoch": 1.6005173389378387, + "grad_norm": 0.7656369805335999, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 9900 + }, + { + "epoch": 1.6021340231185839, + "grad_norm": 0.7504446506500244, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 9910 + }, + { + "epoch": 1.6037507072993291, + "grad_norm": 0.659656286239624, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 9920 + }, + { + "epoch": 1.6053673914800743, + "grad_norm": 0.6006826162338257, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 9930 + }, + { + "epoch": 1.6069840756608196, + "grad_norm": 0.7872757911682129, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 9940 + }, + { + "epoch": 1.6086007598415648, + "grad_norm": 0.5545852780342102, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 9950 + }, + { + "epoch": 1.6102174440223103, + "grad_norm": 0.7429468631744385, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 9960 + }, + { + "epoch": 1.6118341282030555, + "grad_norm": 0.6873556971549988, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 9970 + }, + { + "epoch": 1.613450812383801, + "grad_norm": 0.5874287486076355, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 9980 + }, + { + "epoch": 1.6150674965645462, + "grad_norm": 0.6039386987686157, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 9990 + }, + { + "epoch": 1.6166841807452914, + "grad_norm": 0.6233575940132141, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 10000 + }, + { + "epoch": 1.6183008649260366, + "grad_norm": 0.7676448225975037, + "learning_rate": 0.0002, + "loss": 0.7602, + "step": 10010 + }, + { + "epoch": 1.6199175491067819, + "grad_norm": 0.6565698385238647, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 10020 + }, + { + "epoch": 1.6215342332875273, + "grad_norm": 0.6787590384483337, + "learning_rate": 0.0002, + "loss": 0.7104, + "step": 10030 + }, + { + "epoch": 1.6231509174682726, + "grad_norm": 0.6137678027153015, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 10040 + }, + { + "epoch": 1.624767601649018, + "grad_norm": 0.5236800312995911, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 10050 + }, + { + "epoch": 1.6263842858297632, + "grad_norm": 0.7626367807388306, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 10060 + }, + { + "epoch": 1.6280009700105085, + "grad_norm": 0.5657260417938232, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 10070 + }, + { + "epoch": 1.6296176541912537, + "grad_norm": 0.4913991391658783, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 10080 + }, + { + "epoch": 1.631234338371999, + "grad_norm": 0.7715556621551514, + "learning_rate": 0.0002, + "loss": 0.7596, + "step": 10090 + }, + { + "epoch": 1.6328510225527442, + "grad_norm": 0.6509000062942505, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 10100 + }, + { + "epoch": 1.6344677067334896, + "grad_norm": 0.6215850114822388, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 10110 + }, + { + "epoch": 1.6360843909142349, + "grad_norm": 0.6956844329833984, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 10120 + }, + { + "epoch": 1.6377010750949803, + "grad_norm": 0.6111597418785095, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 10130 + }, + { + "epoch": 1.6393177592757255, + "grad_norm": 0.6518288850784302, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 10140 + }, + { + "epoch": 1.6409344434564708, + "grad_norm": 0.6914522051811218, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 10150 + }, + { + "epoch": 1.642551127637216, + "grad_norm": 0.63785719871521, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 10160 + }, + { + "epoch": 1.6441678118179612, + "grad_norm": 0.6379287838935852, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 10170 + }, + { + "epoch": 1.6457844959987067, + "grad_norm": 0.6793403029441833, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 10180 + }, + { + "epoch": 1.647401180179452, + "grad_norm": 0.6099132895469666, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 10190 + }, + { + "epoch": 1.6490178643601974, + "grad_norm": 0.5869854092597961, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 10200 + }, + { + "epoch": 1.6506345485409426, + "grad_norm": 0.7716999053955078, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 10210 + }, + { + "epoch": 1.6522512327216878, + "grad_norm": 0.6854110360145569, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 10220 + }, + { + "epoch": 1.653867916902433, + "grad_norm": 0.6957170367240906, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 10230 + }, + { + "epoch": 1.6554846010831783, + "grad_norm": 0.6932903528213501, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 10240 + }, + { + "epoch": 1.6571012852639235, + "grad_norm": 0.7713165283203125, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 10250 + }, + { + "epoch": 1.658717969444669, + "grad_norm": 0.7455793619155884, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 10260 + }, + { + "epoch": 1.6603346536254144, + "grad_norm": 0.5464168190956116, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 10270 + }, + { + "epoch": 1.6619513378061597, + "grad_norm": 0.6782926321029663, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 10280 + }, + { + "epoch": 1.663568021986905, + "grad_norm": 0.7962649464607239, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 10290 + }, + { + "epoch": 1.6651847061676501, + "grad_norm": 0.6814526319503784, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 10300 + }, + { + "epoch": 1.6668013903483954, + "grad_norm": 0.656895101070404, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 10310 + }, + { + "epoch": 1.6684180745291406, + "grad_norm": 0.6085672378540039, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 10320 + }, + { + "epoch": 1.670034758709886, + "grad_norm": 0.585508406162262, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 10330 + }, + { + "epoch": 1.6716514428906313, + "grad_norm": 0.6930184364318848, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 10340 + }, + { + "epoch": 1.6732681270713767, + "grad_norm": 0.575663149356842, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 10350 + }, + { + "epoch": 1.674884811252122, + "grad_norm": 0.582502543926239, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 10360 + }, + { + "epoch": 1.6765014954328672, + "grad_norm": 0.5668916702270508, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 10370 + }, + { + "epoch": 1.6781181796136124, + "grad_norm": 0.6070065498352051, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 10380 + }, + { + "epoch": 1.6797348637943577, + "grad_norm": 0.6141316294670105, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 10390 + }, + { + "epoch": 1.6813515479751031, + "grad_norm": 0.8359124064445496, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 10400 + }, + { + "epoch": 1.6829682321558483, + "grad_norm": 0.5378185510635376, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 10410 + }, + { + "epoch": 1.6845849163365938, + "grad_norm": 0.6959536075592041, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 10420 + }, + { + "epoch": 1.686201600517339, + "grad_norm": 0.6514357328414917, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 10430 + }, + { + "epoch": 1.6878182846980843, + "grad_norm": 0.7706646919250488, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 10440 + }, + { + "epoch": 1.6894349688788295, + "grad_norm": 0.6183337569236755, + "learning_rate": 0.0002, + "loss": 0.7673, + "step": 10450 + }, + { + "epoch": 1.6910516530595747, + "grad_norm": 0.6123278141021729, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 10460 + }, + { + "epoch": 1.69266833724032, + "grad_norm": 0.6894851326942444, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 10470 + }, + { + "epoch": 1.6942850214210654, + "grad_norm": 0.7497312426567078, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 10480 + }, + { + "epoch": 1.6959017056018106, + "grad_norm": 0.5968214273452759, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 10490 + }, + { + "epoch": 1.697518389782556, + "grad_norm": 0.6747927069664001, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 10500 + }, + { + "epoch": 1.6991350739633013, + "grad_norm": 0.5708310008049011, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 10510 + }, + { + "epoch": 1.7007517581440466, + "grad_norm": 0.606526792049408, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 10520 + }, + { + "epoch": 1.7023684423247918, + "grad_norm": 0.662011981010437, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 10530 + }, + { + "epoch": 1.703985126505537, + "grad_norm": 0.7583045363426208, + "learning_rate": 0.0002, + "loss": 0.7999, + "step": 10540 + }, + { + "epoch": 1.7056018106862825, + "grad_norm": 0.721632182598114, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 10550 + }, + { + "epoch": 1.7072184948670277, + "grad_norm": 0.6107715368270874, + "learning_rate": 0.0002, + "loss": 0.7407, + "step": 10560 + }, + { + "epoch": 1.7088351790477732, + "grad_norm": 0.6652471423149109, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 10570 + }, + { + "epoch": 1.7104518632285184, + "grad_norm": 0.6308087110519409, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 10580 + }, + { + "epoch": 1.7120685474092636, + "grad_norm": 0.5464386940002441, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 10590 + }, + { + "epoch": 1.7136852315900089, + "grad_norm": 0.6558911204338074, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 10600 + }, + { + "epoch": 1.715301915770754, + "grad_norm": 0.5665024518966675, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 10610 + }, + { + "epoch": 1.7169185999514993, + "grad_norm": 0.7888094186782837, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 10620 + }, + { + "epoch": 1.7185352841322448, + "grad_norm": 0.7084909081459045, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 10630 + }, + { + "epoch": 1.7201519683129902, + "grad_norm": 0.7982324361801147, + "learning_rate": 0.0002, + "loss": 0.7557, + "step": 10640 + }, + { + "epoch": 1.7217686524937355, + "grad_norm": 0.6418732404708862, + "learning_rate": 0.0002, + "loss": 0.7345, + "step": 10650 + }, + { + "epoch": 1.7233853366744807, + "grad_norm": 0.7636681795120239, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 10660 + }, + { + "epoch": 1.725002020855226, + "grad_norm": 0.5646875500679016, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 10670 + }, + { + "epoch": 1.7266187050359711, + "grad_norm": 0.5231260657310486, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10680 + }, + { + "epoch": 1.7282353892167164, + "grad_norm": 0.7635011672973633, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 10690 + }, + { + "epoch": 1.7298520733974618, + "grad_norm": 0.7518259286880493, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 10700 + }, + { + "epoch": 1.731468757578207, + "grad_norm": 0.7295602560043335, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 10710 + }, + { + "epoch": 1.7330854417589525, + "grad_norm": 0.6984632015228271, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 10720 + }, + { + "epoch": 1.7347021259396977, + "grad_norm": 0.6198219060897827, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 10730 + }, + { + "epoch": 1.736318810120443, + "grad_norm": 0.6957576274871826, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10740 + }, + { + "epoch": 1.7379354943011882, + "grad_norm": 0.6430263519287109, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 10750 + }, + { + "epoch": 1.7395521784819334, + "grad_norm": 0.6134995222091675, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 10760 + }, + { + "epoch": 1.741168862662679, + "grad_norm": 0.7209452986717224, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 10770 + }, + { + "epoch": 1.7427855468434241, + "grad_norm": 0.6735447645187378, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 10780 + }, + { + "epoch": 1.7444022310241696, + "grad_norm": 0.5605693459510803, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 10790 + }, + { + "epoch": 1.7460189152049148, + "grad_norm": 0.6882363557815552, + "learning_rate": 0.0002, + "loss": 0.7759, + "step": 10800 + }, + { + "epoch": 1.74763559938566, + "grad_norm": 0.6386259198188782, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 10810 + }, + { + "epoch": 1.7492522835664053, + "grad_norm": 0.6529015302658081, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 10820 + }, + { + "epoch": 1.7508689677471505, + "grad_norm": 0.5664082765579224, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 10830 + }, + { + "epoch": 1.7524856519278957, + "grad_norm": 0.7532684206962585, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 10840 + }, + { + "epoch": 1.7541023361086412, + "grad_norm": 0.77171391248703, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 10850 + }, + { + "epoch": 1.7557190202893864, + "grad_norm": 0.7255431413650513, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 10860 + }, + { + "epoch": 1.7573357044701319, + "grad_norm": 0.763083279132843, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 10870 + }, + { + "epoch": 1.758952388650877, + "grad_norm": 0.6042402982711792, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 10880 + }, + { + "epoch": 1.7605690728316223, + "grad_norm": 0.7642518281936646, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 10890 + }, + { + "epoch": 1.7621857570123676, + "grad_norm": 0.6347904801368713, + "learning_rate": 0.0002, + "loss": 0.746, + "step": 10900 + }, + { + "epoch": 1.7638024411931128, + "grad_norm": 0.5371627807617188, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 10910 + }, + { + "epoch": 1.7654191253738583, + "grad_norm": 0.6840225458145142, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 10920 + }, + { + "epoch": 1.7670358095546035, + "grad_norm": 0.5288469195365906, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 10930 + }, + { + "epoch": 1.768652493735349, + "grad_norm": 0.69020676612854, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 10940 + }, + { + "epoch": 1.7702691779160942, + "grad_norm": 0.5943242311477661, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 10950 + }, + { + "epoch": 1.7718858620968394, + "grad_norm": 0.5616418123245239, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 10960 + }, + { + "epoch": 1.7735025462775846, + "grad_norm": 0.7209470868110657, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 10970 + }, + { + "epoch": 1.7751192304583299, + "grad_norm": 0.6657957434654236, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 10980 + }, + { + "epoch": 1.776735914639075, + "grad_norm": 0.6469064950942993, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 10990 + }, + { + "epoch": 1.7783525988198206, + "grad_norm": 0.6615678071975708, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 11000 + }, + { + "epoch": 1.779969283000566, + "grad_norm": 0.6722439527511597, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 11010 + }, + { + "epoch": 1.7815859671813112, + "grad_norm": 0.634136974811554, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 11020 + }, + { + "epoch": 1.7832026513620565, + "grad_norm": 0.6024377346038818, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 11030 + }, + { + "epoch": 1.7848193355428017, + "grad_norm": 0.6909403800964355, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 11040 + }, + { + "epoch": 1.786436019723547, + "grad_norm": 0.7148767709732056, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 11050 + }, + { + "epoch": 1.7880527039042922, + "grad_norm": 0.7442979216575623, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 11060 + }, + { + "epoch": 1.7896693880850376, + "grad_norm": 0.6830431818962097, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 11070 + }, + { + "epoch": 1.7912860722657828, + "grad_norm": 0.9172667264938354, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 11080 + }, + { + "epoch": 1.7929027564465283, + "grad_norm": 0.6799490451812744, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 11090 + }, + { + "epoch": 1.7945194406272735, + "grad_norm": 0.7617024779319763, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 11100 + }, + { + "epoch": 1.7961361248080188, + "grad_norm": 0.7701810002326965, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 11110 + }, + { + "epoch": 1.797752808988764, + "grad_norm": 0.7454385757446289, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 11120 + }, + { + "epoch": 1.7993694931695092, + "grad_norm": 0.6121436953544617, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 11130 + }, + { + "epoch": 1.8009861773502547, + "grad_norm": 0.6237571835517883, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 11140 + }, + { + "epoch": 1.802602861531, + "grad_norm": 0.6818515658378601, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 11150 + }, + { + "epoch": 1.8042195457117454, + "grad_norm": 0.7768308520317078, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 11160 + }, + { + "epoch": 1.8058362298924906, + "grad_norm": 0.6875537633895874, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 11170 + }, + { + "epoch": 1.8074529140732358, + "grad_norm": 0.7950584888458252, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 11180 + }, + { + "epoch": 1.809069598253981, + "grad_norm": 0.8210248351097107, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 11190 + }, + { + "epoch": 1.8106862824347263, + "grad_norm": 0.6674110889434814, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 11200 + }, + { + "epoch": 1.8123029666154715, + "grad_norm": 0.6261674761772156, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 11210 + }, + { + "epoch": 1.813919650796217, + "grad_norm": 0.6484741568565369, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 11220 + }, + { + "epoch": 1.8155363349769622, + "grad_norm": 0.6231244206428528, + "learning_rate": 0.0002, + "loss": 0.7718, + "step": 11230 + }, + { + "epoch": 1.8171530191577077, + "grad_norm": 0.7243146896362305, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 11240 + }, + { + "epoch": 1.818769703338453, + "grad_norm": 0.6776193380355835, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 11250 + }, + { + "epoch": 1.8203863875191981, + "grad_norm": 0.5973618030548096, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 11260 + }, + { + "epoch": 1.8220030716999434, + "grad_norm": 0.6451361179351807, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 11270 + }, + { + "epoch": 1.8236197558806886, + "grad_norm": 0.5963068008422852, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 11280 + }, + { + "epoch": 1.825236440061434, + "grad_norm": 0.536902129650116, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 11290 + }, + { + "epoch": 1.8268531242421793, + "grad_norm": 0.6993787288665771, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 11300 + }, + { + "epoch": 1.8284698084229247, + "grad_norm": 0.6135255098342896, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 11310 + }, + { + "epoch": 1.83008649260367, + "grad_norm": 0.6057423949241638, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 11320 + }, + { + "epoch": 1.8317031767844152, + "grad_norm": 0.6598812341690063, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 11330 + }, + { + "epoch": 1.8333198609651604, + "grad_norm": 0.6075948476791382, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 11340 + }, + { + "epoch": 1.8349365451459057, + "grad_norm": 0.7065447568893433, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 11350 + }, + { + "epoch": 1.8365532293266509, + "grad_norm": 0.680526614189148, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 11360 + }, + { + "epoch": 1.8381699135073963, + "grad_norm": 0.6356695294380188, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 11370 + }, + { + "epoch": 1.8397865976881416, + "grad_norm": 0.6399052143096924, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 11380 + }, + { + "epoch": 1.841403281868887, + "grad_norm": 0.6125704050064087, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 11390 + }, + { + "epoch": 1.8430199660496323, + "grad_norm": 0.7124643325805664, + "learning_rate": 0.0002, + "loss": 0.755, + "step": 11400 + }, + { + "epoch": 1.8446366502303775, + "grad_norm": 0.6099604964256287, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 11410 + }, + { + "epoch": 1.8462533344111227, + "grad_norm": 0.7338208556175232, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 11420 + }, + { + "epoch": 1.847870018591868, + "grad_norm": 0.7534668445587158, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 11430 + }, + { + "epoch": 1.8494867027726134, + "grad_norm": 0.6135470271110535, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 11440 + }, + { + "epoch": 1.8511033869533586, + "grad_norm": 0.6229309439659119, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 11450 + }, + { + "epoch": 1.852720071134104, + "grad_norm": 0.706423282623291, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 11460 + }, + { + "epoch": 1.8543367553148493, + "grad_norm": 0.5460049510002136, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 11470 + }, + { + "epoch": 1.8559534394955945, + "grad_norm": 0.6616711020469666, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 11480 + }, + { + "epoch": 1.8575701236763398, + "grad_norm": 0.6372783184051514, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 11490 + }, + { + "epoch": 1.859186807857085, + "grad_norm": 0.7162668108940125, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 11500 + }, + { + "epoch": 1.8608034920378305, + "grad_norm": 0.6605209708213806, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 11510 + }, + { + "epoch": 1.8624201762185757, + "grad_norm": 0.6933956742286682, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 11520 + }, + { + "epoch": 1.8640368603993211, + "grad_norm": 0.6582090854644775, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 11530 + }, + { + "epoch": 1.8656535445800664, + "grad_norm": 0.6416500806808472, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 11540 + }, + { + "epoch": 1.8672702287608116, + "grad_norm": 0.5434312224388123, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 11550 + }, + { + "epoch": 1.8688869129415568, + "grad_norm": 0.6827567219734192, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 11560 + }, + { + "epoch": 1.870503597122302, + "grad_norm": 0.7354370951652527, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 11570 + }, + { + "epoch": 1.8721202813030473, + "grad_norm": 0.590372622013092, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 11580 + }, + { + "epoch": 1.8737369654837928, + "grad_norm": 0.853183925151825, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 11590 + }, + { + "epoch": 1.875353649664538, + "grad_norm": 0.822678804397583, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 11600 + }, + { + "epoch": 1.8769703338452834, + "grad_norm": 0.6591550707817078, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 11610 + }, + { + "epoch": 1.8785870180260287, + "grad_norm": 0.7475301623344421, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 11620 + }, + { + "epoch": 1.880203702206774, + "grad_norm": 0.6390765309333801, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 11630 + }, + { + "epoch": 1.8818203863875191, + "grad_norm": 0.6589758992195129, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 11640 + }, + { + "epoch": 1.8834370705682644, + "grad_norm": 0.6765508651733398, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 11650 + }, + { + "epoch": 1.8850537547490098, + "grad_norm": 0.6527857780456543, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 11660 + }, + { + "epoch": 1.886670438929755, + "grad_norm": 0.6642923951148987, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 11670 + }, + { + "epoch": 1.8882871231105005, + "grad_norm": 0.6945584416389465, + "learning_rate": 0.0002, + "loss": 0.7701, + "step": 11680 + }, + { + "epoch": 1.8899038072912457, + "grad_norm": 0.694018542766571, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 11690 + }, + { + "epoch": 1.891520491471991, + "grad_norm": 0.7237417101860046, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 11700 + }, + { + "epoch": 1.8931371756527362, + "grad_norm": 0.7401309609413147, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 11710 + }, + { + "epoch": 1.8947538598334814, + "grad_norm": 0.6537784337997437, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 11720 + }, + { + "epoch": 1.8963705440142267, + "grad_norm": 0.7398539185523987, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 11730 + }, + { + "epoch": 1.8979872281949721, + "grad_norm": 0.6696075797080994, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 11740 + }, + { + "epoch": 1.8996039123757174, + "grad_norm": 0.6014142036437988, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 11750 + }, + { + "epoch": 1.9012205965564628, + "grad_norm": 0.7023524641990662, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 11760 + }, + { + "epoch": 1.902837280737208, + "grad_norm": 0.739973783493042, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 11770 + }, + { + "epoch": 1.9044539649179533, + "grad_norm": 0.5576770901679993, + "learning_rate": 0.0002, + "loss": 0.7848, + "step": 11780 + }, + { + "epoch": 1.9060706490986985, + "grad_norm": 0.6907393932342529, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 11790 + }, + { + "epoch": 1.9076873332794437, + "grad_norm": 0.6934581995010376, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 11800 + }, + { + "epoch": 1.9093040174601892, + "grad_norm": 0.591774582862854, + "learning_rate": 0.0002, + "loss": 0.7199, + "step": 11810 + }, + { + "epoch": 1.9109207016409344, + "grad_norm": 0.6249791383743286, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 11820 + }, + { + "epoch": 1.9125373858216799, + "grad_norm": 0.6755744218826294, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 11830 + }, + { + "epoch": 1.914154070002425, + "grad_norm": 0.7286285161972046, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 11840 + }, + { + "epoch": 1.9157707541831703, + "grad_norm": 0.7867850065231323, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 11850 + }, + { + "epoch": 1.9173874383639156, + "grad_norm": 0.6283972859382629, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 11860 + }, + { + "epoch": 1.9190041225446608, + "grad_norm": 0.605823814868927, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 11870 + }, + { + "epoch": 1.920620806725406, + "grad_norm": 0.5927976965904236, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 11880 + }, + { + "epoch": 1.9222374909061515, + "grad_norm": 0.5974002480506897, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 11890 + }, + { + "epoch": 1.923854175086897, + "grad_norm": 0.7091866135597229, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 11900 + }, + { + "epoch": 1.9254708592676422, + "grad_norm": 0.72496497631073, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 11910 + }, + { + "epoch": 1.9270875434483874, + "grad_norm": 0.6131896376609802, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 11920 + }, + { + "epoch": 1.9287042276291326, + "grad_norm": 0.6556436419487, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 11930 + }, + { + "epoch": 1.9303209118098779, + "grad_norm": 0.622932493686676, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 11940 + }, + { + "epoch": 1.931937595990623, + "grad_norm": 0.6618631482124329, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 11950 + }, + { + "epoch": 1.9335542801713685, + "grad_norm": 0.630966305732727, + "learning_rate": 0.0002, + "loss": 0.8332, + "step": 11960 + }, + { + "epoch": 1.9351709643521138, + "grad_norm": 0.6336734890937805, + "learning_rate": 0.0002, + "loss": 0.6854, + "step": 11970 + }, + { + "epoch": 1.9367876485328592, + "grad_norm": 0.655403196811676, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 11980 + }, + { + "epoch": 1.9384043327136045, + "grad_norm": 0.5640574097633362, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 11990 + }, + { + "epoch": 1.9400210168943497, + "grad_norm": 0.6322951316833496, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 12000 + }, + { + "epoch": 1.941637701075095, + "grad_norm": 0.615703821182251, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 12010 + }, + { + "epoch": 1.9432543852558402, + "grad_norm": 0.6487536430358887, + "learning_rate": 0.0002, + "loss": 0.786, + "step": 12020 + }, + { + "epoch": 1.9448710694365856, + "grad_norm": 0.9209630489349365, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 12030 + }, + { + "epoch": 1.9464877536173308, + "grad_norm": 0.67485511302948, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 12040 + }, + { + "epoch": 1.9481044377980763, + "grad_norm": 0.6831230521202087, + "learning_rate": 0.0002, + "loss": 0.7551, + "step": 12050 + }, + { + "epoch": 1.9497211219788215, + "grad_norm": 0.6578302383422852, + "learning_rate": 0.0002, + "loss": 0.7546, + "step": 12060 + }, + { + "epoch": 1.9513378061595668, + "grad_norm": 0.9975938200950623, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 12070 + }, + { + "epoch": 1.952954490340312, + "grad_norm": 0.6637365221977234, + "learning_rate": 0.0002, + "loss": 0.7952, + "step": 12080 + }, + { + "epoch": 1.9545711745210572, + "grad_norm": 0.605707049369812, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 12090 + }, + { + "epoch": 1.9561878587018025, + "grad_norm": 0.6584440469741821, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 12100 + }, + { + "epoch": 1.957804542882548, + "grad_norm": 0.6070835590362549, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 12110 + }, + { + "epoch": 1.9594212270632931, + "grad_norm": 0.7862601280212402, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 12120 + }, + { + "epoch": 1.9610379112440386, + "grad_norm": 0.8175255060195923, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 12130 + }, + { + "epoch": 1.9626545954247838, + "grad_norm": 0.5648472905158997, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 12140 + }, + { + "epoch": 1.964271279605529, + "grad_norm": 0.6591973304748535, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 12150 + }, + { + "epoch": 1.9658879637862743, + "grad_norm": 0.5960676074028015, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 12160 + }, + { + "epoch": 1.9675046479670195, + "grad_norm": 0.7272544503211975, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 12170 + }, + { + "epoch": 1.969121332147765, + "grad_norm": 0.7176699042320251, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 12180 + }, + { + "epoch": 1.9707380163285102, + "grad_norm": 0.6927123665809631, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 12190 + }, + { + "epoch": 1.9723547005092557, + "grad_norm": 0.5536034107208252, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 12200 + }, + { + "epoch": 1.9739713846900009, + "grad_norm": 0.8348390460014343, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 12210 + }, + { + "epoch": 1.9755880688707461, + "grad_norm": 0.6591181755065918, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 12220 + }, + { + "epoch": 1.9772047530514913, + "grad_norm": 1.0624109506607056, + "learning_rate": 0.0002, + "loss": 0.763, + "step": 12230 + }, + { + "epoch": 1.9788214372322366, + "grad_norm": 0.9265586137771606, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 12240 + }, + { + "epoch": 1.9804381214129818, + "grad_norm": 0.5998196005821228, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 12250 + }, + { + "epoch": 1.9820548055937273, + "grad_norm": 0.6960851550102234, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 12260 + }, + { + "epoch": 1.9836714897744727, + "grad_norm": 0.7674502730369568, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 12270 + }, + { + "epoch": 1.985288173955218, + "grad_norm": 0.6407275795936584, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 12280 + }, + { + "epoch": 1.9869048581359632, + "grad_norm": 0.6673079133033752, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 12290 + }, + { + "epoch": 1.9885215423167084, + "grad_norm": 0.6989844441413879, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 12300 + }, + { + "epoch": 1.9901382264974536, + "grad_norm": 0.7564442157745361, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 12310 + }, + { + "epoch": 1.9917549106781989, + "grad_norm": 0.6385478973388672, + "learning_rate": 0.0002, + "loss": 0.7719, + "step": 12320 + }, + { + "epoch": 1.9933715948589443, + "grad_norm": 0.7193717956542969, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 12330 + }, + { + "epoch": 1.9949882790396896, + "grad_norm": 0.7987112402915955, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 12340 + }, + { + "epoch": 1.996604963220435, + "grad_norm": 0.7260826826095581, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 12350 + }, + { + "epoch": 1.9982216474011802, + "grad_norm": 0.7968255281448364, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 12360 + }, + { + "epoch": 1.9998383315819255, + "grad_norm": 0.6893062591552734, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 12370 + }, + { + "epoch": 2.0, + "eval_loss": 1.1044032573699951, + "eval_runtime": 122.1508, + "eval_samples_per_second": 6.001, + "eval_steps_per_second": 0.753, + "step": 12371 + }, + { + "epoch": 2.0014550157626707, + "grad_norm": 0.7775409817695618, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 12380 + }, + { + "epoch": 2.003071699943416, + "grad_norm": 0.76218581199646, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 12390 + }, + { + "epoch": 2.004688384124161, + "grad_norm": 0.5677764415740967, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 12400 + }, + { + "epoch": 2.006305068304907, + "grad_norm": 0.808442234992981, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 12410 + }, + { + "epoch": 2.007921752485652, + "grad_norm": 0.7144765257835388, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 12420 + }, + { + "epoch": 2.0095384366663973, + "grad_norm": 0.6914031505584717, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 12430 + }, + { + "epoch": 2.0111551208471425, + "grad_norm": 0.7581454515457153, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 12440 + }, + { + "epoch": 2.0127718050278878, + "grad_norm": 0.8388504981994629, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 12450 + }, + { + "epoch": 2.014388489208633, + "grad_norm": 0.6716406941413879, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 12460 + }, + { + "epoch": 2.0160051733893782, + "grad_norm": 0.898902416229248, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 12470 + }, + { + "epoch": 2.0176218575701235, + "grad_norm": 0.6432679891586304, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 12480 + }, + { + "epoch": 2.019238541750869, + "grad_norm": 0.8021109104156494, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 12490 + }, + { + "epoch": 2.0208552259316144, + "grad_norm": 0.7039216756820679, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 12500 + }, + { + "epoch": 2.0224719101123596, + "grad_norm": 0.646531879901886, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 12510 + }, + { + "epoch": 2.024088594293105, + "grad_norm": 0.783704400062561, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 12520 + }, + { + "epoch": 2.02570527847385, + "grad_norm": 0.8805046677589417, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 12530 + }, + { + "epoch": 2.0273219626545953, + "grad_norm": 0.7289270758628845, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 12540 + }, + { + "epoch": 2.0289386468353405, + "grad_norm": 0.71653151512146, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 12550 + }, + { + "epoch": 2.030555331016086, + "grad_norm": 0.73281329870224, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 12560 + }, + { + "epoch": 2.0321720151968314, + "grad_norm": 0.6657090187072754, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 12570 + }, + { + "epoch": 2.0337886993775767, + "grad_norm": 0.8241133093833923, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 12580 + }, + { + "epoch": 2.035405383558322, + "grad_norm": 0.5834135413169861, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 12590 + }, + { + "epoch": 2.037022067739067, + "grad_norm": 0.84502112865448, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 12600 + }, + { + "epoch": 2.0386387519198124, + "grad_norm": 0.8952481746673584, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 12610 + }, + { + "epoch": 2.0402554361005576, + "grad_norm": 0.7801461815834045, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 12620 + }, + { + "epoch": 2.041872120281303, + "grad_norm": 0.6788367033004761, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 12630 + }, + { + "epoch": 2.0434888044620485, + "grad_norm": 0.7241756319999695, + "learning_rate": 0.0002, + "loss": 0.6162, + "step": 12640 + }, + { + "epoch": 2.0451054886427937, + "grad_norm": 0.6933388113975525, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 12650 + }, + { + "epoch": 2.046722172823539, + "grad_norm": 0.8029746413230896, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 12660 + }, + { + "epoch": 2.048338857004284, + "grad_norm": 0.946399986743927, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 12670 + }, + { + "epoch": 2.0499555411850294, + "grad_norm": 0.7072678804397583, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 12680 + }, + { + "epoch": 2.0515722253657747, + "grad_norm": 0.6810618042945862, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 12690 + }, + { + "epoch": 2.05318890954652, + "grad_norm": 0.7661160230636597, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 12700 + }, + { + "epoch": 2.0548055937272656, + "grad_norm": 0.6350653767585754, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 12710 + }, + { + "epoch": 2.056422277908011, + "grad_norm": 0.861890971660614, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 12720 + }, + { + "epoch": 2.058038962088756, + "grad_norm": 0.6489875912666321, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 12730 + }, + { + "epoch": 2.0596556462695013, + "grad_norm": 0.8268506526947021, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 12740 + }, + { + "epoch": 2.0612723304502465, + "grad_norm": 0.607679545879364, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 12750 + }, + { + "epoch": 2.0628890146309917, + "grad_norm": 0.6754153370857239, + "learning_rate": 0.0002, + "loss": 0.6649, + "step": 12760 + }, + { + "epoch": 2.064505698811737, + "grad_norm": 0.7263124585151672, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 12770 + }, + { + "epoch": 2.0661223829924826, + "grad_norm": 0.6986154317855835, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 12780 + }, + { + "epoch": 2.067739067173228, + "grad_norm": 0.7768576741218567, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 12790 + }, + { + "epoch": 2.069355751353973, + "grad_norm": 0.7546762824058533, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 12800 + }, + { + "epoch": 2.0709724355347183, + "grad_norm": 0.7588880062103271, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 12810 + }, + { + "epoch": 2.0725891197154636, + "grad_norm": 0.7457242608070374, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 12820 + }, + { + "epoch": 2.074205803896209, + "grad_norm": 0.6983516812324524, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 12830 + }, + { + "epoch": 2.075822488076954, + "grad_norm": 0.7950928807258606, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 12840 + }, + { + "epoch": 2.0774391722576993, + "grad_norm": 0.9248087406158447, + "learning_rate": 0.0002, + "loss": 0.6603, + "step": 12850 + }, + { + "epoch": 2.079055856438445, + "grad_norm": 0.7229493260383606, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 12860 + }, + { + "epoch": 2.08067254061919, + "grad_norm": 0.5710847973823547, + "learning_rate": 0.0002, + "loss": 0.6702, + "step": 12870 + }, + { + "epoch": 2.0822892247999354, + "grad_norm": 0.9580423831939697, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 12880 + }, + { + "epoch": 2.0839059089806806, + "grad_norm": 0.7399665713310242, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 12890 + }, + { + "epoch": 2.085522593161426, + "grad_norm": 0.7981410622596741, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 12900 + }, + { + "epoch": 2.087139277342171, + "grad_norm": 0.870759904384613, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 12910 + }, + { + "epoch": 2.0887559615229163, + "grad_norm": 0.7001481652259827, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 12920 + }, + { + "epoch": 2.090372645703662, + "grad_norm": 0.6745418310165405, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 12930 + }, + { + "epoch": 2.0919893298844072, + "grad_norm": 0.7739067673683167, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 12940 + }, + { + "epoch": 2.0936060140651525, + "grad_norm": 0.6742934584617615, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 12950 + }, + { + "epoch": 2.0952226982458977, + "grad_norm": 0.7270349860191345, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 12960 + }, + { + "epoch": 2.096839382426643, + "grad_norm": 0.7150624394416809, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 12970 + }, + { + "epoch": 2.098456066607388, + "grad_norm": 0.7734767198562622, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 12980 + }, + { + "epoch": 2.1000727507881334, + "grad_norm": 0.7618662118911743, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 12990 + }, + { + "epoch": 2.101689434968879, + "grad_norm": 0.6557944416999817, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 13000 + }, + { + "epoch": 2.1033061191496243, + "grad_norm": 0.8786448240280151, + "learning_rate": 0.0002, + "loss": 0.7268, + "step": 13010 + }, + { + "epoch": 2.1049228033303695, + "grad_norm": 0.6878724098205566, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 13020 + }, + { + "epoch": 2.1065394875111147, + "grad_norm": 0.822318971157074, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 13030 + }, + { + "epoch": 2.10815617169186, + "grad_norm": 0.831468939781189, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 13040 + }, + { + "epoch": 2.109772855872605, + "grad_norm": 0.7699505686759949, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 13050 + }, + { + "epoch": 2.1113895400533504, + "grad_norm": 0.7559016346931458, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 13060 + }, + { + "epoch": 2.1130062242340957, + "grad_norm": 0.6942209601402283, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 13070 + }, + { + "epoch": 2.1146229084148414, + "grad_norm": 0.6098947525024414, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 13080 + }, + { + "epoch": 2.1162395925955866, + "grad_norm": 0.6499016284942627, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 13090 + }, + { + "epoch": 2.117856276776332, + "grad_norm": 0.7719953060150146, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 13100 + }, + { + "epoch": 2.119472960957077, + "grad_norm": 0.6708134412765503, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 13110 + }, + { + "epoch": 2.1210896451378223, + "grad_norm": 0.8119585514068604, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 13120 + }, + { + "epoch": 2.1227063293185675, + "grad_norm": 0.6947157979011536, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 13130 + }, + { + "epoch": 2.1243230134993127, + "grad_norm": 0.8831837773323059, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 13140 + }, + { + "epoch": 2.1259396976800584, + "grad_norm": 0.7266910672187805, + "learning_rate": 0.0002, + "loss": 0.6997, + "step": 13150 + }, + { + "epoch": 2.1275563818608036, + "grad_norm": 0.8864351511001587, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 13160 + }, + { + "epoch": 2.129173066041549, + "grad_norm": 0.8104248046875, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 13170 + }, + { + "epoch": 2.130789750222294, + "grad_norm": 0.6077079772949219, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 13180 + }, + { + "epoch": 2.1324064344030393, + "grad_norm": 0.6874213814735413, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 13190 + }, + { + "epoch": 2.1340231185837846, + "grad_norm": 0.7134367823600769, + "learning_rate": 0.0002, + "loss": 0.642, + "step": 13200 + }, + { + "epoch": 2.13563980276453, + "grad_norm": 0.6101235151290894, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 13210 + }, + { + "epoch": 2.137256486945275, + "grad_norm": 0.6042411923408508, + "learning_rate": 0.0002, + "loss": 0.6529, + "step": 13220 + }, + { + "epoch": 2.1388731711260207, + "grad_norm": 0.914601743221283, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 13230 + }, + { + "epoch": 2.140489855306766, + "grad_norm": 0.7104284167289734, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 13240 + }, + { + "epoch": 2.142106539487511, + "grad_norm": 0.664395272731781, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 13250 + }, + { + "epoch": 2.1437232236682564, + "grad_norm": 0.6991241574287415, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 13260 + }, + { + "epoch": 2.1453399078490016, + "grad_norm": 0.5469560623168945, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 13270 + }, + { + "epoch": 2.146956592029747, + "grad_norm": 0.8454998135566711, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 13280 + }, + { + "epoch": 2.148573276210492, + "grad_norm": 0.7088868618011475, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 13290 + }, + { + "epoch": 2.1501899603912378, + "grad_norm": 0.7002687454223633, + "learning_rate": 0.0002, + "loss": 0.6835, + "step": 13300 + }, + { + "epoch": 2.151806644571983, + "grad_norm": 0.7785214781761169, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 13310 + }, + { + "epoch": 2.1534233287527282, + "grad_norm": 0.8049132227897644, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 13320 + }, + { + "epoch": 2.1550400129334735, + "grad_norm": 0.8062595129013062, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 13330 + }, + { + "epoch": 2.1566566971142187, + "grad_norm": 0.6208319067955017, + "learning_rate": 0.0002, + "loss": 0.6603, + "step": 13340 + }, + { + "epoch": 2.158273381294964, + "grad_norm": 0.7519655823707581, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 13350 + }, + { + "epoch": 2.159890065475709, + "grad_norm": 0.7645747065544128, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 13360 + }, + { + "epoch": 2.1615067496564544, + "grad_norm": 0.6847302913665771, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 13370 + }, + { + "epoch": 2.1631234338372, + "grad_norm": 0.8630441427230835, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 13380 + }, + { + "epoch": 2.1647401180179453, + "grad_norm": 0.7947702407836914, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 13390 + }, + { + "epoch": 2.1663568021986905, + "grad_norm": 0.6836977005004883, + "learning_rate": 0.0002, + "loss": 0.7206, + "step": 13400 + }, + { + "epoch": 2.1679734863794358, + "grad_norm": 0.7340566515922546, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 13410 + }, + { + "epoch": 2.169590170560181, + "grad_norm": 0.7075738906860352, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 13420 + }, + { + "epoch": 2.1712068547409262, + "grad_norm": 0.7080879807472229, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 13430 + }, + { + "epoch": 2.1728235389216715, + "grad_norm": 0.6218613386154175, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 13440 + }, + { + "epoch": 2.174440223102417, + "grad_norm": 0.8211479187011719, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 13450 + }, + { + "epoch": 2.1760569072831624, + "grad_norm": 0.864466667175293, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 13460 + }, + { + "epoch": 2.1776735914639076, + "grad_norm": 0.7943857908248901, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 13470 + }, + { + "epoch": 2.179290275644653, + "grad_norm": 0.78728187084198, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 13480 + }, + { + "epoch": 2.180906959825398, + "grad_norm": 0.697527289390564, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 13490 + }, + { + "epoch": 2.1825236440061433, + "grad_norm": 0.8205804228782654, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 13500 + }, + { + "epoch": 2.1841403281868885, + "grad_norm": 0.8709042072296143, + "learning_rate": 0.0002, + "loss": 0.7227, + "step": 13510 + }, + { + "epoch": 2.1857570123676338, + "grad_norm": 0.6228537559509277, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 13520 + }, + { + "epoch": 2.1873736965483794, + "grad_norm": 0.9566980004310608, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 13530 + }, + { + "epoch": 2.1889903807291247, + "grad_norm": 0.7128894329071045, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 13540 + }, + { + "epoch": 2.19060706490987, + "grad_norm": 0.6888654232025146, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 13550 + }, + { + "epoch": 2.192223749090615, + "grad_norm": 0.6444337368011475, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 13560 + }, + { + "epoch": 2.1938404332713604, + "grad_norm": 0.8008806705474854, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 13570 + }, + { + "epoch": 2.1954571174521056, + "grad_norm": 0.8482748866081238, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 13580 + }, + { + "epoch": 2.197073801632851, + "grad_norm": 0.8584157228469849, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 13590 + }, + { + "epoch": 2.1986904858135965, + "grad_norm": 0.7513734698295593, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 13600 + }, + { + "epoch": 2.2003071699943417, + "grad_norm": 0.7864262461662292, + "learning_rate": 0.0002, + "loss": 0.6632, + "step": 13610 + }, + { + "epoch": 2.201923854175087, + "grad_norm": 0.8493645191192627, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 13620 + }, + { + "epoch": 2.203540538355832, + "grad_norm": 0.6902140974998474, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 13630 + }, + { + "epoch": 2.2051572225365774, + "grad_norm": 0.8711254596710205, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 13640 + }, + { + "epoch": 2.2067739067173227, + "grad_norm": 0.7832191586494446, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 13650 + }, + { + "epoch": 2.208390590898068, + "grad_norm": 0.5668176412582397, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 13660 + }, + { + "epoch": 2.2100072750788136, + "grad_norm": 0.8648375272750854, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 13670 + }, + { + "epoch": 2.211623959259559, + "grad_norm": 0.7643089890480042, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 13680 + }, + { + "epoch": 2.213240643440304, + "grad_norm": 0.6293777823448181, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 13690 + }, + { + "epoch": 2.2148573276210493, + "grad_norm": 0.6459372639656067, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 13700 + }, + { + "epoch": 2.2164740118017945, + "grad_norm": 0.7060744166374207, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 13710 + }, + { + "epoch": 2.2180906959825397, + "grad_norm": 0.674109160900116, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 13720 + }, + { + "epoch": 2.219707380163285, + "grad_norm": 0.830392062664032, + "learning_rate": 0.0002, + "loss": 0.6529, + "step": 13730 + }, + { + "epoch": 2.2213240643440306, + "grad_norm": 0.6474477052688599, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 13740 + }, + { + "epoch": 2.222940748524776, + "grad_norm": 0.7037909626960754, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 13750 + }, + { + "epoch": 2.224557432705521, + "grad_norm": 0.6554131507873535, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 13760 + }, + { + "epoch": 2.2261741168862663, + "grad_norm": 0.7822230458259583, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 13770 + }, + { + "epoch": 2.2277908010670116, + "grad_norm": 0.9082167744636536, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 13780 + }, + { + "epoch": 2.229407485247757, + "grad_norm": 0.7918276190757751, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 13790 + }, + { + "epoch": 2.231024169428502, + "grad_norm": 0.7354569435119629, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 13800 + }, + { + "epoch": 2.2326408536092472, + "grad_norm": 0.8265249133110046, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 13810 + }, + { + "epoch": 2.234257537789993, + "grad_norm": 0.6653847098350525, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 13820 + }, + { + "epoch": 2.235874221970738, + "grad_norm": 0.7157923579216003, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 13830 + }, + { + "epoch": 2.2374909061514834, + "grad_norm": 0.7110323309898376, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 13840 + }, + { + "epoch": 2.2391075903322286, + "grad_norm": 0.7155357599258423, + "learning_rate": 0.0002, + "loss": 0.6913, + "step": 13850 + }, + { + "epoch": 2.240724274512974, + "grad_norm": 1.0177817344665527, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 13860 + }, + { + "epoch": 2.242340958693719, + "grad_norm": 0.7601948380470276, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 13870 + }, + { + "epoch": 2.2439576428744643, + "grad_norm": 0.7628820538520813, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 13880 + }, + { + "epoch": 2.24557432705521, + "grad_norm": 0.7089297771453857, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 13890 + }, + { + "epoch": 2.247191011235955, + "grad_norm": 0.695178210735321, + "learning_rate": 0.0002, + "loss": 0.7236, + "step": 13900 + }, + { + "epoch": 2.2488076954167004, + "grad_norm": 0.7631948590278625, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 13910 + }, + { + "epoch": 2.2504243795974457, + "grad_norm": 0.8203101754188538, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 13920 + }, + { + "epoch": 2.252041063778191, + "grad_norm": 0.8099079728126526, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 13930 + }, + { + "epoch": 2.253657747958936, + "grad_norm": 0.6498546004295349, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 13940 + }, + { + "epoch": 2.2552744321396814, + "grad_norm": 0.7797415256500244, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 13950 + }, + { + "epoch": 2.2568911163204266, + "grad_norm": 0.8254124522209167, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 13960 + }, + { + "epoch": 2.2585078005011723, + "grad_norm": 0.6327953338623047, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 13970 + }, + { + "epoch": 2.2601244846819175, + "grad_norm": 0.734194278717041, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 13980 + }, + { + "epoch": 2.2617411688626627, + "grad_norm": 0.9014202952384949, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 13990 + }, + { + "epoch": 2.263357853043408, + "grad_norm": 0.7643631100654602, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 14000 + }, + { + "epoch": 2.264974537224153, + "grad_norm": 0.8882834911346436, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 14010 + }, + { + "epoch": 2.2665912214048984, + "grad_norm": 0.7975873351097107, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 14020 + }, + { + "epoch": 2.2682079055856437, + "grad_norm": 0.7765783071517944, + "learning_rate": 0.0002, + "loss": 0.7193, + "step": 14030 + }, + { + "epoch": 2.2698245897663893, + "grad_norm": 0.8846288323402405, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 14040 + }, + { + "epoch": 2.2714412739471346, + "grad_norm": 0.9006744027137756, + "learning_rate": 0.0002, + "loss": 0.6494, + "step": 14050 + }, + { + "epoch": 2.27305795812788, + "grad_norm": 0.7420173287391663, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 14060 + }, + { + "epoch": 2.274674642308625, + "grad_norm": 0.7956424951553345, + "learning_rate": 0.0002, + "loss": 0.7068, + "step": 14070 + }, + { + "epoch": 2.2762913264893703, + "grad_norm": 0.7783209085464478, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 14080 + }, + { + "epoch": 2.2779080106701155, + "grad_norm": 0.7597188949584961, + "learning_rate": 0.0002, + "loss": 0.7202, + "step": 14090 + }, + { + "epoch": 2.2795246948508607, + "grad_norm": 0.6718921661376953, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 14100 + }, + { + "epoch": 2.281141379031606, + "grad_norm": 0.7528082132339478, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 14110 + }, + { + "epoch": 2.2827580632123516, + "grad_norm": 0.8379864692687988, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 14120 + }, + { + "epoch": 2.284374747393097, + "grad_norm": 0.748613715171814, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 14130 + }, + { + "epoch": 2.285991431573842, + "grad_norm": 0.7435423135757446, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 14140 + }, + { + "epoch": 2.2876081157545873, + "grad_norm": 0.7580803632736206, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 14150 + }, + { + "epoch": 2.2892247999353326, + "grad_norm": 0.6278321146965027, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 14160 + }, + { + "epoch": 2.290841484116078, + "grad_norm": 0.7663896083831787, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 14170 + }, + { + "epoch": 2.292458168296823, + "grad_norm": 0.9716812372207642, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 14180 + }, + { + "epoch": 2.2940748524775687, + "grad_norm": 0.8993458151817322, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 14190 + }, + { + "epoch": 2.295691536658314, + "grad_norm": 0.6156117916107178, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 14200 + }, + { + "epoch": 2.297308220839059, + "grad_norm": 0.8911278247833252, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 14210 + }, + { + "epoch": 2.2989249050198044, + "grad_norm": 0.6422147154808044, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 14220 + }, + { + "epoch": 2.3005415892005496, + "grad_norm": 0.6866879463195801, + "learning_rate": 0.0002, + "loss": 0.6795, + "step": 14230 + }, + { + "epoch": 2.302158273381295, + "grad_norm": 0.9297130107879639, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 14240 + }, + { + "epoch": 2.30377495756204, + "grad_norm": 0.7501356601715088, + "learning_rate": 0.0002, + "loss": 0.6823, + "step": 14250 + }, + { + "epoch": 2.3053916417427853, + "grad_norm": 0.8363515138626099, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 14260 + }, + { + "epoch": 2.307008325923531, + "grad_norm": 0.9083868265151978, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 14270 + }, + { + "epoch": 2.3086250101042762, + "grad_norm": 0.7791516780853271, + "learning_rate": 0.0002, + "loss": 0.6862, + "step": 14280 + }, + { + "epoch": 2.3102416942850215, + "grad_norm": 0.8766953349113464, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 14290 + }, + { + "epoch": 2.3118583784657667, + "grad_norm": 0.7916635274887085, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 14300 + }, + { + "epoch": 2.313475062646512, + "grad_norm": 0.627525269985199, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 14310 + }, + { + "epoch": 2.315091746827257, + "grad_norm": 0.8856783509254456, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 14320 + }, + { + "epoch": 2.316708431008003, + "grad_norm": 0.6758689284324646, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 14330 + }, + { + "epoch": 2.318325115188748, + "grad_norm": 0.6428321003913879, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 14340 + }, + { + "epoch": 2.3199417993694933, + "grad_norm": 0.9032121300697327, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 14350 + }, + { + "epoch": 2.3215584835502385, + "grad_norm": 0.8035986423492432, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 14360 + }, + { + "epoch": 2.3231751677309838, + "grad_norm": 0.7974579334259033, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 14370 + }, + { + "epoch": 2.324791851911729, + "grad_norm": 0.8356034755706787, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 14380 + }, + { + "epoch": 2.326408536092474, + "grad_norm": 0.998760998249054, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 14390 + }, + { + "epoch": 2.3280252202732195, + "grad_norm": 0.6518142223358154, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 14400 + }, + { + "epoch": 2.3296419044539647, + "grad_norm": 0.7443506717681885, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 14410 + }, + { + "epoch": 2.3312585886347104, + "grad_norm": 0.8436172604560852, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 14420 + }, + { + "epoch": 2.3328752728154556, + "grad_norm": 0.7411080598831177, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 14430 + }, + { + "epoch": 2.334491956996201, + "grad_norm": 0.8839048743247986, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 14440 + }, + { + "epoch": 2.336108641176946, + "grad_norm": 0.8360885977745056, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 14450 + }, + { + "epoch": 2.3377253253576913, + "grad_norm": 0.7608986496925354, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 14460 + }, + { + "epoch": 2.3393420095384365, + "grad_norm": 0.8179867267608643, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 14470 + }, + { + "epoch": 2.340958693719182, + "grad_norm": 0.5989999771118164, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14480 + }, + { + "epoch": 2.3425753778999274, + "grad_norm": 0.9450054168701172, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 14490 + }, + { + "epoch": 2.3441920620806727, + "grad_norm": 0.7885149717330933, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 14500 + }, + { + "epoch": 2.345808746261418, + "grad_norm": 0.8152616620063782, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 14510 + }, + { + "epoch": 2.347425430442163, + "grad_norm": 0.7193838953971863, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 14520 + }, + { + "epoch": 2.3490421146229084, + "grad_norm": 0.6701092720031738, + "learning_rate": 0.0002, + "loss": 0.6594, + "step": 14530 + }, + { + "epoch": 2.3506587988036536, + "grad_norm": 0.7529364228248596, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 14540 + }, + { + "epoch": 2.352275482984399, + "grad_norm": 0.6599733829498291, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 14550 + }, + { + "epoch": 2.353892167165144, + "grad_norm": 0.9502474069595337, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 14560 + }, + { + "epoch": 2.3555088513458897, + "grad_norm": 0.7619650959968567, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 14570 + }, + { + "epoch": 2.357125535526635, + "grad_norm": 0.9854652285575867, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 14580 + }, + { + "epoch": 2.35874221970738, + "grad_norm": 0.727439284324646, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 14590 + }, + { + "epoch": 2.3603589038881254, + "grad_norm": 0.6994746327400208, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 14600 + }, + { + "epoch": 2.3619755880688706, + "grad_norm": 0.7117531299591064, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 14610 + }, + { + "epoch": 2.363592272249616, + "grad_norm": 0.6403067708015442, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 14620 + }, + { + "epoch": 2.3652089564303616, + "grad_norm": 0.8377841711044312, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14630 + }, + { + "epoch": 2.366825640611107, + "grad_norm": 0.749171257019043, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 14640 + }, + { + "epoch": 2.368442324791852, + "grad_norm": 0.8418586254119873, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 14650 + }, + { + "epoch": 2.3700590089725972, + "grad_norm": 0.6178573369979858, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 14660 + }, + { + "epoch": 2.3716756931533425, + "grad_norm": 0.6368302702903748, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 14670 + }, + { + "epoch": 2.3732923773340877, + "grad_norm": 0.9122977256774902, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 14680 + }, + { + "epoch": 2.374909061514833, + "grad_norm": 0.7086195349693298, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 14690 + }, + { + "epoch": 2.376525745695578, + "grad_norm": 0.7500800490379333, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 14700 + }, + { + "epoch": 2.378142429876324, + "grad_norm": 0.6634900569915771, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 14710 + }, + { + "epoch": 2.379759114057069, + "grad_norm": 0.839898407459259, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 14720 + }, + { + "epoch": 2.3813757982378143, + "grad_norm": 0.7578426003456116, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 14730 + }, + { + "epoch": 2.3829924824185595, + "grad_norm": 1.0213173627853394, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 14740 + }, + { + "epoch": 2.3846091665993048, + "grad_norm": 0.7855949401855469, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 14750 + }, + { + "epoch": 2.38622585078005, + "grad_norm": 0.7224128842353821, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 14760 + }, + { + "epoch": 2.3878425349607952, + "grad_norm": 0.8040381669998169, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 14770 + }, + { + "epoch": 2.389459219141541, + "grad_norm": 0.7705281376838684, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 14780 + }, + { + "epoch": 2.391075903322286, + "grad_norm": 0.667966902256012, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 14790 + }, + { + "epoch": 2.3926925875030314, + "grad_norm": 0.6611011028289795, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 14800 + }, + { + "epoch": 2.3943092716837766, + "grad_norm": 0.6862651705741882, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 14810 + }, + { + "epoch": 2.395925955864522, + "grad_norm": 0.8086010217666626, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 14820 + }, + { + "epoch": 2.397542640045267, + "grad_norm": 0.7189689874649048, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 14830 + }, + { + "epoch": 2.3991593242260123, + "grad_norm": 0.6280009150505066, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 14840 + }, + { + "epoch": 2.4007760084067575, + "grad_norm": 0.7826612591743469, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 14850 + }, + { + "epoch": 2.402392692587503, + "grad_norm": 0.7681610584259033, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 14860 + }, + { + "epoch": 2.4040093767682484, + "grad_norm": 0.720966100692749, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 14870 + }, + { + "epoch": 2.4056260609489937, + "grad_norm": 0.8202250599861145, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 14880 + }, + { + "epoch": 2.407242745129739, + "grad_norm": 0.786212682723999, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 14890 + }, + { + "epoch": 2.408859429310484, + "grad_norm": 0.6647164821624756, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 14900 + }, + { + "epoch": 2.4104761134912294, + "grad_norm": 0.7566399574279785, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 14910 + }, + { + "epoch": 2.4120927976719746, + "grad_norm": 0.748814582824707, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 14920 + }, + { + "epoch": 2.4137094818527203, + "grad_norm": 0.7624038457870483, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 14930 + }, + { + "epoch": 2.4153261660334655, + "grad_norm": 0.8267335295677185, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 14940 + }, + { + "epoch": 2.4169428502142107, + "grad_norm": 0.8785360455513, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 14950 + }, + { + "epoch": 2.418559534394956, + "grad_norm": 0.679887592792511, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 14960 + }, + { + "epoch": 2.420176218575701, + "grad_norm": 0.7218474745750427, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 14970 + }, + { + "epoch": 2.4217929027564464, + "grad_norm": 0.6342799663543701, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 14980 + }, + { + "epoch": 2.4234095869371917, + "grad_norm": 0.7098712921142578, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 14990 + }, + { + "epoch": 2.425026271117937, + "grad_norm": 0.7497431635856628, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 15000 + }, + { + "epoch": 2.4266429552986826, + "grad_norm": 0.934836208820343, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 15010 + }, + { + "epoch": 2.428259639479428, + "grad_norm": 0.8430966734886169, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 15020 + }, + { + "epoch": 2.429876323660173, + "grad_norm": 0.7032104730606079, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 15030 + }, + { + "epoch": 2.4314930078409183, + "grad_norm": 0.7746111750602722, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 15040 + }, + { + "epoch": 2.4331096920216635, + "grad_norm": 0.7661406397819519, + "learning_rate": 0.0002, + "loss": 0.7177, + "step": 15050 + }, + { + "epoch": 2.4347263762024087, + "grad_norm": 0.6941645741462708, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 15060 + }, + { + "epoch": 2.436343060383154, + "grad_norm": 0.7487249374389648, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 15070 + }, + { + "epoch": 2.4379597445638996, + "grad_norm": 0.7639912962913513, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 15080 + }, + { + "epoch": 2.439576428744645, + "grad_norm": 0.7708953619003296, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 15090 + }, + { + "epoch": 2.44119311292539, + "grad_norm": 0.9135832190513611, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 15100 + }, + { + "epoch": 2.4428097971061353, + "grad_norm": 0.8283005356788635, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 15110 + }, + { + "epoch": 2.4444264812868806, + "grad_norm": 0.925299346446991, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 15120 + }, + { + "epoch": 2.446043165467626, + "grad_norm": 0.7013528943061829, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 15130 + }, + { + "epoch": 2.447659849648371, + "grad_norm": 0.622303307056427, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 15140 + }, + { + "epoch": 2.4492765338291163, + "grad_norm": 0.876569390296936, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 15150 + }, + { + "epoch": 2.450893218009862, + "grad_norm": 0.6836351752281189, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 15160 + }, + { + "epoch": 2.452509902190607, + "grad_norm": 0.7886684536933899, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 15170 + }, + { + "epoch": 2.4541265863713524, + "grad_norm": 0.6647440791130066, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 15180 + }, + { + "epoch": 2.4557432705520976, + "grad_norm": 0.7477722764015198, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 15190 + }, + { + "epoch": 2.457359954732843, + "grad_norm": 0.8192033767700195, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 15200 + }, + { + "epoch": 2.458976638913588, + "grad_norm": 0.847537100315094, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 15210 + }, + { + "epoch": 2.4605933230943338, + "grad_norm": 0.9027776122093201, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 15220 + }, + { + "epoch": 2.462210007275079, + "grad_norm": 0.7217772006988525, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 15230 + }, + { + "epoch": 2.4638266914558242, + "grad_norm": 0.7994546294212341, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 15240 + }, + { + "epoch": 2.4654433756365695, + "grad_norm": 0.939916729927063, + "learning_rate": 0.0002, + "loss": 0.6828, + "step": 15250 + }, + { + "epoch": 2.4670600598173147, + "grad_norm": 1.0009053945541382, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 15260 + }, + { + "epoch": 2.46867674399806, + "grad_norm": 0.625555694103241, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 15270 + }, + { + "epoch": 2.470293428178805, + "grad_norm": 0.7924878597259521, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 15280 + }, + { + "epoch": 2.4719101123595504, + "grad_norm": 0.8536689877510071, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 15290 + }, + { + "epoch": 2.4735267965402956, + "grad_norm": 0.8572589755058289, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 15300 + }, + { + "epoch": 2.4751434807210413, + "grad_norm": 0.773279070854187, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 15310 + }, + { + "epoch": 2.4767601649017865, + "grad_norm": 0.7708749771118164, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 15320 + }, + { + "epoch": 2.4783768490825318, + "grad_norm": 0.770905077457428, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 15330 + }, + { + "epoch": 2.479993533263277, + "grad_norm": 0.8238571882247925, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 15340 + }, + { + "epoch": 2.481610217444022, + "grad_norm": 0.7670477032661438, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 15350 + }, + { + "epoch": 2.4832269016247674, + "grad_norm": 0.905036985874176, + "learning_rate": 0.0002, + "loss": 0.7759, + "step": 15360 + }, + { + "epoch": 2.484843585805513, + "grad_norm": 0.6672089695930481, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 15370 + }, + { + "epoch": 2.4864602699862584, + "grad_norm": 0.625095784664154, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 15380 + }, + { + "epoch": 2.4880769541670036, + "grad_norm": 0.679772675037384, + "learning_rate": 0.0002, + "loss": 0.6396, + "step": 15390 + }, + { + "epoch": 2.489693638347749, + "grad_norm": 0.711492121219635, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 15400 + }, + { + "epoch": 2.491310322528494, + "grad_norm": 0.876189112663269, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 15410 + }, + { + "epoch": 2.4929270067092393, + "grad_norm": 0.7236915230751038, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 15420 + }, + { + "epoch": 2.4945436908899845, + "grad_norm": 0.6629832983016968, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 15430 + }, + { + "epoch": 2.4961603750707297, + "grad_norm": 0.9756859540939331, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 15440 + }, + { + "epoch": 2.4977770592514754, + "grad_norm": 0.6896940469741821, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 15450 + }, + { + "epoch": 2.4993937434322206, + "grad_norm": 0.7105149626731873, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 15460 + }, + { + "epoch": 2.501010427612966, + "grad_norm": 0.8374546766281128, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 15470 + }, + { + "epoch": 2.502627111793711, + "grad_norm": 0.7320070266723633, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 15480 + }, + { + "epoch": 2.5042437959744563, + "grad_norm": 0.8306367993354797, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 15490 + }, + { + "epoch": 2.5058604801552016, + "grad_norm": 0.7472721338272095, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 15500 + }, + { + "epoch": 2.507477164335947, + "grad_norm": 0.6147692203521729, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 15510 + }, + { + "epoch": 2.5090938485166925, + "grad_norm": 0.7788505554199219, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 15520 + }, + { + "epoch": 2.5107105326974377, + "grad_norm": 0.8807527422904968, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 15530 + }, + { + "epoch": 2.512327216878183, + "grad_norm": 0.7521643042564392, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 15540 + }, + { + "epoch": 2.513943901058928, + "grad_norm": 0.6900225281715393, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 15550 + }, + { + "epoch": 2.5155605852396734, + "grad_norm": 0.6601938605308533, + "learning_rate": 0.0002, + "loss": 0.6769, + "step": 15560 + }, + { + "epoch": 2.5171772694204186, + "grad_norm": 0.8179984092712402, + "learning_rate": 0.0002, + "loss": 0.6648, + "step": 15570 + }, + { + "epoch": 2.518793953601164, + "grad_norm": 0.792556881904602, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 15580 + }, + { + "epoch": 2.520410637781909, + "grad_norm": 0.7081938982009888, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 15590 + }, + { + "epoch": 2.5220273219626543, + "grad_norm": 0.8733121156692505, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 15600 + }, + { + "epoch": 2.5236440061434, + "grad_norm": 0.7980992794036865, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 15610 + }, + { + "epoch": 2.5252606903241452, + "grad_norm": 0.883664071559906, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 15620 + }, + { + "epoch": 2.5268773745048905, + "grad_norm": 0.6963341236114502, + "learning_rate": 0.0002, + "loss": 0.7022, + "step": 15630 + }, + { + "epoch": 2.5284940586856357, + "grad_norm": 0.6433573365211487, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 15640 + }, + { + "epoch": 2.530110742866381, + "grad_norm": 0.8538183569908142, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 15650 + }, + { + "epoch": 2.5317274270471266, + "grad_norm": 0.9748201370239258, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 15660 + }, + { + "epoch": 2.533344111227872, + "grad_norm": 0.7670575380325317, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 15670 + }, + { + "epoch": 2.534960795408617, + "grad_norm": 0.8738890290260315, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 15680 + }, + { + "epoch": 2.5365774795893623, + "grad_norm": 0.8391636610031128, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 15690 + }, + { + "epoch": 2.5381941637701075, + "grad_norm": 0.7239366769790649, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 15700 + }, + { + "epoch": 2.5398108479508528, + "grad_norm": 0.8498379588127136, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 15710 + }, + { + "epoch": 2.541427532131598, + "grad_norm": 0.8029484152793884, + "learning_rate": 0.0002, + "loss": 0.6998, + "step": 15720 + }, + { + "epoch": 2.5430442163123432, + "grad_norm": 1.0639333724975586, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 15730 + }, + { + "epoch": 2.5446609004930885, + "grad_norm": 0.6401297450065613, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 15740 + }, + { + "epoch": 2.5462775846738337, + "grad_norm": 0.7123814821243286, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 15750 + }, + { + "epoch": 2.5478942688545794, + "grad_norm": 0.7874974608421326, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 15760 + }, + { + "epoch": 2.5495109530353246, + "grad_norm": 0.8046808838844299, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 15770 + }, + { + "epoch": 2.55112763721607, + "grad_norm": 0.7888661623001099, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 15780 + }, + { + "epoch": 2.552744321396815, + "grad_norm": 0.8445866107940674, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 15790 + }, + { + "epoch": 2.5543610055775603, + "grad_norm": 0.7475846409797668, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 15800 + }, + { + "epoch": 2.555977689758306, + "grad_norm": 0.7455102801322937, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 15810 + }, + { + "epoch": 2.557594373939051, + "grad_norm": 0.8226983547210693, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 15820 + }, + { + "epoch": 2.5592110581197964, + "grad_norm": 0.8920368552207947, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 15830 + }, + { + "epoch": 2.5608277423005417, + "grad_norm": 0.8413904905319214, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 15840 + }, + { + "epoch": 2.562444426481287, + "grad_norm": 0.8483649492263794, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 15850 + }, + { + "epoch": 2.564061110662032, + "grad_norm": 0.5923284292221069, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 15860 + }, + { + "epoch": 2.5656777948427774, + "grad_norm": 0.8518726229667664, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 15870 + }, + { + "epoch": 2.5672944790235226, + "grad_norm": 0.731235146522522, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 15880 + }, + { + "epoch": 2.568911163204268, + "grad_norm": 0.7517194151878357, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 15890 + }, + { + "epoch": 2.5705278473850135, + "grad_norm": 0.8378692269325256, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 15900 + }, + { + "epoch": 2.5721445315657587, + "grad_norm": 0.843701958656311, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 15910 + }, + { + "epoch": 2.573761215746504, + "grad_norm": 0.7254629731178284, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 15920 + }, + { + "epoch": 2.575377899927249, + "grad_norm": 0.8863335847854614, + "learning_rate": 0.0002, + "loss": 0.6872, + "step": 15930 + }, + { + "epoch": 2.5769945841079944, + "grad_norm": 0.7675097584724426, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 15940 + }, + { + "epoch": 2.5786112682887397, + "grad_norm": 0.82063889503479, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 15950 + }, + { + "epoch": 2.5802279524694853, + "grad_norm": 0.7729717493057251, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 15960 + }, + { + "epoch": 2.5818446366502306, + "grad_norm": 0.8301846981048584, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 15970 + }, + { + "epoch": 2.583461320830976, + "grad_norm": 0.7906861305236816, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 15980 + }, + { + "epoch": 2.585078005011721, + "grad_norm": 0.6749057173728943, + "learning_rate": 0.0002, + "loss": 0.7175, + "step": 15990 + }, + { + "epoch": 2.5866946891924663, + "grad_norm": 0.9386842846870422, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 16000 + }, + { + "epoch": 2.5883113733732115, + "grad_norm": 0.7868891358375549, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 16010 + }, + { + "epoch": 2.5899280575539567, + "grad_norm": 0.8674671053886414, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 16020 + }, + { + "epoch": 2.591544741734702, + "grad_norm": 0.7043559551239014, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 16030 + }, + { + "epoch": 2.593161425915447, + "grad_norm": 0.5846083760261536, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 16040 + }, + { + "epoch": 2.594778110096193, + "grad_norm": 0.7323982119560242, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 16050 + }, + { + "epoch": 2.596394794276938, + "grad_norm": 0.9069556593894958, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 16060 + }, + { + "epoch": 2.5980114784576833, + "grad_norm": 0.7522736191749573, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 16070 + }, + { + "epoch": 2.5996281626384286, + "grad_norm": 0.8149648308753967, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 16080 + }, + { + "epoch": 2.601244846819174, + "grad_norm": 0.6214233040809631, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 16090 + }, + { + "epoch": 2.602861530999919, + "grad_norm": 0.6803743839263916, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 16100 + }, + { + "epoch": 2.6044782151806647, + "grad_norm": 0.7223997116088867, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 16110 + }, + { + "epoch": 2.60609489936141, + "grad_norm": 0.7324174642562866, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 16120 + }, + { + "epoch": 2.607711583542155, + "grad_norm": 0.9594739675521851, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 16130 + }, + { + "epoch": 2.6093282677229004, + "grad_norm": 0.9485327005386353, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 16140 + }, + { + "epoch": 2.6109449519036456, + "grad_norm": 0.8449000120162964, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 16150 + }, + { + "epoch": 2.612561636084391, + "grad_norm": 0.8520140051841736, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 16160 + }, + { + "epoch": 2.614178320265136, + "grad_norm": 0.7456524968147278, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 16170 + }, + { + "epoch": 2.6157950044458813, + "grad_norm": 0.9912857413291931, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 16180 + }, + { + "epoch": 2.6174116886266265, + "grad_norm": 0.9001946449279785, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 16190 + }, + { + "epoch": 2.619028372807372, + "grad_norm": 0.6568667888641357, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 16200 + }, + { + "epoch": 2.6206450569881174, + "grad_norm": 1.0248128175735474, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 16210 + }, + { + "epoch": 2.6222617411688627, + "grad_norm": 0.6509039998054504, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 16220 + }, + { + "epoch": 2.623878425349608, + "grad_norm": 0.7626351118087769, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 16230 + }, + { + "epoch": 2.625495109530353, + "grad_norm": 0.6938552260398865, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 16240 + }, + { + "epoch": 2.6271117937110984, + "grad_norm": 0.6434680819511414, + "learning_rate": 0.0002, + "loss": 0.65, + "step": 16250 + }, + { + "epoch": 2.628728477891844, + "grad_norm": 0.7111515998840332, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 16260 + }, + { + "epoch": 2.6303451620725893, + "grad_norm": 0.7712395787239075, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 16270 + }, + { + "epoch": 2.6319618462533345, + "grad_norm": 0.792209267616272, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 16280 + }, + { + "epoch": 2.6335785304340797, + "grad_norm": 0.6801066398620605, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 16290 + }, + { + "epoch": 2.635195214614825, + "grad_norm": 0.7802573442459106, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 16300 + }, + { + "epoch": 2.63681189879557, + "grad_norm": 0.7742244601249695, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 16310 + }, + { + "epoch": 2.6384285829763154, + "grad_norm": 0.664184033870697, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 16320 + }, + { + "epoch": 2.6400452671570607, + "grad_norm": 0.9242228865623474, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 16330 + }, + { + "epoch": 2.641661951337806, + "grad_norm": 0.9661325216293335, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 16340 + }, + { + "epoch": 2.6432786355185516, + "grad_norm": 0.837526798248291, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 16350 + }, + { + "epoch": 2.644895319699297, + "grad_norm": 1.1834373474121094, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 16360 + }, + { + "epoch": 2.646512003880042, + "grad_norm": 0.7467831373214722, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 16370 + }, + { + "epoch": 2.6481286880607873, + "grad_norm": 0.8627146482467651, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 16380 + }, + { + "epoch": 2.6497453722415325, + "grad_norm": 0.790447473526001, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 16390 + }, + { + "epoch": 2.651362056422278, + "grad_norm": 0.8447365164756775, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 16400 + }, + { + "epoch": 2.6529787406030234, + "grad_norm": 0.7831417918205261, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 16410 + }, + { + "epoch": 2.6545954247837686, + "grad_norm": 0.6837952136993408, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 16420 + }, + { + "epoch": 2.656212108964514, + "grad_norm": 0.7031801342964172, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 16430 + }, + { + "epoch": 2.657828793145259, + "grad_norm": 0.8963770866394043, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 16440 + }, + { + "epoch": 2.6594454773260043, + "grad_norm": 0.6852328181266785, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 16450 + }, + { + "epoch": 2.6610621615067496, + "grad_norm": 0.8069294095039368, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 16460 + }, + { + "epoch": 2.662678845687495, + "grad_norm": 0.7503686547279358, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 16470 + }, + { + "epoch": 2.66429552986824, + "grad_norm": 0.6430956125259399, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 16480 + }, + { + "epoch": 2.6659122140489853, + "grad_norm": 0.7894312739372253, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 16490 + }, + { + "epoch": 2.667528898229731, + "grad_norm": 0.7277431488037109, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 16500 + }, + { + "epoch": 2.669145582410476, + "grad_norm": 0.6816153526306152, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 16510 + }, + { + "epoch": 2.6707622665912214, + "grad_norm": 0.8145235776901245, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 16520 + }, + { + "epoch": 2.6723789507719666, + "grad_norm": 0.8645890355110168, + "learning_rate": 0.0002, + "loss": 0.709, + "step": 16530 + }, + { + "epoch": 2.673995634952712, + "grad_norm": 0.704393208026886, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 16540 + }, + { + "epoch": 2.6756123191334575, + "grad_norm": 1.0120846033096313, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 16550 + }, + { + "epoch": 2.6772290033142028, + "grad_norm": 0.6919328570365906, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 16560 + }, + { + "epoch": 2.678845687494948, + "grad_norm": 0.6924574971199036, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 16570 + }, + { + "epoch": 2.6804623716756932, + "grad_norm": 0.9679301381111145, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 16580 + }, + { + "epoch": 2.6820790558564385, + "grad_norm": 0.6810211539268494, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 16590 + }, + { + "epoch": 2.6836957400371837, + "grad_norm": 0.9730555415153503, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 16600 + }, + { + "epoch": 2.685312424217929, + "grad_norm": 0.7852821350097656, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 16610 + }, + { + "epoch": 2.686929108398674, + "grad_norm": 0.6059057116508484, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 16620 + }, + { + "epoch": 2.6885457925794194, + "grad_norm": 0.9395958781242371, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 16630 + }, + { + "epoch": 2.690162476760165, + "grad_norm": 0.7473729848861694, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 16640 + }, + { + "epoch": 2.6917791609409103, + "grad_norm": 0.765934407711029, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 16650 + }, + { + "epoch": 2.6933958451216555, + "grad_norm": 0.8496677279472351, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 16660 + }, + { + "epoch": 2.6950125293024008, + "grad_norm": 0.7641879916191101, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 16670 + }, + { + "epoch": 2.696629213483146, + "grad_norm": 0.8471952676773071, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 16680 + }, + { + "epoch": 2.6982458976638912, + "grad_norm": 0.6946060657501221, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 16690 + }, + { + "epoch": 2.699862581844637, + "grad_norm": 0.7361312508583069, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 16700 + }, + { + "epoch": 2.701479266025382, + "grad_norm": 0.6605038046836853, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 16710 + }, + { + "epoch": 2.7030959502061274, + "grad_norm": 0.7164411544799805, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 16720 + }, + { + "epoch": 2.7047126343868726, + "grad_norm": 0.6496201157569885, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 16730 + }, + { + "epoch": 2.706329318567618, + "grad_norm": 0.7826663851737976, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 16740 + }, + { + "epoch": 2.707946002748363, + "grad_norm": 0.7639131546020508, + "learning_rate": 0.0002, + "loss": 0.6773, + "step": 16750 + }, + { + "epoch": 2.7095626869291083, + "grad_norm": 0.7976210713386536, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 16760 + }, + { + "epoch": 2.7111793711098535, + "grad_norm": 0.6836577653884888, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 16770 + }, + { + "epoch": 2.7127960552905988, + "grad_norm": 0.8025202751159668, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 16780 + }, + { + "epoch": 2.7144127394713444, + "grad_norm": 0.7636463642120361, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 16790 + }, + { + "epoch": 2.7160294236520897, + "grad_norm": 0.7481677532196045, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 16800 + }, + { + "epoch": 2.717646107832835, + "grad_norm": 0.7566834688186646, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 16810 + }, + { + "epoch": 2.71926279201358, + "grad_norm": 0.7931267619132996, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 16820 + }, + { + "epoch": 2.7208794761943254, + "grad_norm": 0.8811662197113037, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 16830 + }, + { + "epoch": 2.7224961603750706, + "grad_norm": 0.8561240434646606, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 16840 + }, + { + "epoch": 2.7241128445558163, + "grad_norm": 0.7121599316596985, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 16850 + }, + { + "epoch": 2.7257295287365615, + "grad_norm": 0.8066257238388062, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 16860 + }, + { + "epoch": 2.7273462129173067, + "grad_norm": 0.7699271440505981, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 16870 + }, + { + "epoch": 2.728962897098052, + "grad_norm": 1.1828432083129883, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 16880 + }, + { + "epoch": 2.730579581278797, + "grad_norm": 0.9989302754402161, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 16890 + }, + { + "epoch": 2.7321962654595424, + "grad_norm": 0.8100560307502747, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 16900 + }, + { + "epoch": 2.7338129496402876, + "grad_norm": 0.8615233898162842, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 16910 + }, + { + "epoch": 2.735429633821033, + "grad_norm": 0.8633756041526794, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 16920 + }, + { + "epoch": 2.737046318001778, + "grad_norm": 0.7769348621368408, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 16930 + }, + { + "epoch": 2.738663002182524, + "grad_norm": 0.6943058371543884, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 16940 + }, + { + "epoch": 2.740279686363269, + "grad_norm": 0.8510736227035522, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 16950 + }, + { + "epoch": 2.7418963705440142, + "grad_norm": 0.7732602953910828, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 16960 + }, + { + "epoch": 2.7435130547247595, + "grad_norm": 0.5981788635253906, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 16970 + }, + { + "epoch": 2.7451297389055047, + "grad_norm": 0.7604416012763977, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 16980 + }, + { + "epoch": 2.74674642308625, + "grad_norm": 0.7377738356590271, + "learning_rate": 0.0002, + "loss": 0.6981, + "step": 16990 + }, + { + "epoch": 2.7483631072669956, + "grad_norm": 0.9400289058685303, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 17000 + }, + { + "epoch": 2.749979791447741, + "grad_norm": 0.6340599656105042, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 17010 + }, + { + "epoch": 2.751596475628486, + "grad_norm": 0.7297601103782654, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 17020 + }, + { + "epoch": 2.7532131598092313, + "grad_norm": 0.9479979872703552, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 17030 + }, + { + "epoch": 2.7548298439899765, + "grad_norm": 0.8461511135101318, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 17040 + }, + { + "epoch": 2.7564465281707218, + "grad_norm": 0.7477551698684692, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 17050 + }, + { + "epoch": 2.758063212351467, + "grad_norm": 1.019270420074463, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 17060 + }, + { + "epoch": 2.7596798965322122, + "grad_norm": 0.7730235457420349, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 17070 + }, + { + "epoch": 2.7612965807129575, + "grad_norm": 0.8216866254806519, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 17080 + }, + { + "epoch": 2.762913264893703, + "grad_norm": 0.7235931754112244, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 17090 + }, + { + "epoch": 2.7645299490744484, + "grad_norm": 0.7352296710014343, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 17100 + }, + { + "epoch": 2.7661466332551936, + "grad_norm": 0.8129373788833618, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 17110 + }, + { + "epoch": 2.767763317435939, + "grad_norm": 0.7387019991874695, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 17120 + }, + { + "epoch": 2.769380001616684, + "grad_norm": 0.9149190187454224, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 17130 + }, + { + "epoch": 2.7709966857974297, + "grad_norm": 0.7352971434593201, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 17140 + }, + { + "epoch": 2.772613369978175, + "grad_norm": 0.7903780341148376, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 17150 + }, + { + "epoch": 2.77423005415892, + "grad_norm": 0.8255927562713623, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 17160 + }, + { + "epoch": 2.7758467383396654, + "grad_norm": 0.7235927581787109, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 17170 + }, + { + "epoch": 2.7774634225204107, + "grad_norm": 0.8281434774398804, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 17180 + }, + { + "epoch": 2.779080106701156, + "grad_norm": 0.7586921453475952, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 17190 + }, + { + "epoch": 2.780696790881901, + "grad_norm": 0.7161715030670166, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 17200 + }, + { + "epoch": 2.7823134750626464, + "grad_norm": 0.762868344783783, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 17210 + }, + { + "epoch": 2.7839301592433916, + "grad_norm": 0.9285483360290527, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 17220 + }, + { + "epoch": 2.785546843424137, + "grad_norm": 0.6900462508201599, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 17230 + }, + { + "epoch": 2.7871635276048825, + "grad_norm": 0.780384361743927, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 17240 + }, + { + "epoch": 2.7887802117856277, + "grad_norm": 0.7580406665802002, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 17250 + }, + { + "epoch": 2.790396895966373, + "grad_norm": 0.8145199418067932, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 17260 + }, + { + "epoch": 2.792013580147118, + "grad_norm": 0.9159596562385559, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 17270 + }, + { + "epoch": 2.7936302643278634, + "grad_norm": 0.9590014219284058, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 17280 + }, + { + "epoch": 2.795246948508609, + "grad_norm": 0.7603529691696167, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 17290 + }, + { + "epoch": 2.7968636326893543, + "grad_norm": 0.8039976358413696, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 17300 + }, + { + "epoch": 2.7984803168700996, + "grad_norm": 0.8364847302436829, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 17310 + }, + { + "epoch": 2.800097001050845, + "grad_norm": 0.8763046860694885, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 17320 + }, + { + "epoch": 2.80171368523159, + "grad_norm": 0.8409647941589355, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 17330 + }, + { + "epoch": 2.8033303694123353, + "grad_norm": 0.7649006247520447, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 17340 + }, + { + "epoch": 2.8049470535930805, + "grad_norm": 0.7970262169837952, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 17350 + }, + { + "epoch": 2.8065637377738257, + "grad_norm": 0.9088607430458069, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 17360 + }, + { + "epoch": 2.808180421954571, + "grad_norm": 0.6454846858978271, + "learning_rate": 0.0002, + "loss": 0.675, + "step": 17370 + }, + { + "epoch": 2.809797106135316, + "grad_norm": 0.7744787931442261, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 17380 + }, + { + "epoch": 2.811413790316062, + "grad_norm": 0.6678640842437744, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 17390 + }, + { + "epoch": 2.813030474496807, + "grad_norm": 0.772676944732666, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 17400 + }, + { + "epoch": 2.8146471586775523, + "grad_norm": 0.7088175415992737, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 17410 + }, + { + "epoch": 2.8162638428582976, + "grad_norm": 0.8280573487281799, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 17420 + }, + { + "epoch": 2.817880527039043, + "grad_norm": 0.6665388345718384, + "learning_rate": 0.0002, + "loss": 0.6732, + "step": 17430 + }, + { + "epoch": 2.8194972112197885, + "grad_norm": 0.6427883505821228, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 17440 + }, + { + "epoch": 2.8211138954005337, + "grad_norm": 0.9697760343551636, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 17450 + }, + { + "epoch": 2.822730579581279, + "grad_norm": 0.7573966383934021, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 17460 + }, + { + "epoch": 2.824347263762024, + "grad_norm": 0.878688633441925, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 17470 + }, + { + "epoch": 2.8259639479427694, + "grad_norm": 0.7752242684364319, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 17480 + }, + { + "epoch": 2.8275806321235146, + "grad_norm": 0.6135398745536804, + "learning_rate": 0.0002, + "loss": 0.6638, + "step": 17490 + }, + { + "epoch": 2.82919731630426, + "grad_norm": 0.6924924850463867, + "learning_rate": 0.0002, + "loss": 0.6829, + "step": 17500 + }, + { + "epoch": 2.830814000485005, + "grad_norm": 0.7471627593040466, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 17510 + }, + { + "epoch": 2.8324306846657503, + "grad_norm": 0.7145499587059021, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 17520 + }, + { + "epoch": 2.834047368846496, + "grad_norm": 0.7415414452552795, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 17530 + }, + { + "epoch": 2.8356640530272412, + "grad_norm": 0.7328441739082336, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 17540 + }, + { + "epoch": 2.8372807372079865, + "grad_norm": 0.8267839550971985, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 17550 + }, + { + "epoch": 2.8388974213887317, + "grad_norm": 0.8877885341644287, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 17560 + }, + { + "epoch": 2.840514105569477, + "grad_norm": 0.857138454914093, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 17570 + }, + { + "epoch": 2.842130789750222, + "grad_norm": 0.8470779657363892, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 17580 + }, + { + "epoch": 2.843747473930968, + "grad_norm": 0.8553254008293152, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 17590 + }, + { + "epoch": 2.845364158111713, + "grad_norm": 0.8033196926116943, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 17600 + }, + { + "epoch": 2.8469808422924583, + "grad_norm": 0.7949087023735046, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 17610 + }, + { + "epoch": 2.8485975264732035, + "grad_norm": 0.9241406321525574, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 17620 + }, + { + "epoch": 2.8502142106539488, + "grad_norm": 0.7721285223960876, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 17630 + }, + { + "epoch": 2.851830894834694, + "grad_norm": 1.0246692895889282, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 17640 + }, + { + "epoch": 2.853447579015439, + "grad_norm": 0.9244589805603027, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 17650 + }, + { + "epoch": 2.8550642631961844, + "grad_norm": 0.7243508696556091, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 17660 + }, + { + "epoch": 2.8566809473769297, + "grad_norm": 0.8943371176719666, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 17670 + }, + { + "epoch": 2.8582976315576754, + "grad_norm": 0.6531758904457092, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 17680 + }, + { + "epoch": 2.8599143157384206, + "grad_norm": 0.8367000818252563, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 17690 + }, + { + "epoch": 2.861530999919166, + "grad_norm": 0.7868556380271912, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 17700 + }, + { + "epoch": 2.863147684099911, + "grad_norm": 0.7213859558105469, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 17710 + }, + { + "epoch": 2.8647643682806563, + "grad_norm": 0.7383931279182434, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 17720 + }, + { + "epoch": 2.8663810524614015, + "grad_norm": 0.7566812634468079, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 17730 + }, + { + "epoch": 2.867997736642147, + "grad_norm": 0.6930373311042786, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 17740 + }, + { + "epoch": 2.8696144208228924, + "grad_norm": 0.7911090850830078, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 17750 + }, + { + "epoch": 2.8712311050036377, + "grad_norm": 0.8484548926353455, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 17760 + }, + { + "epoch": 2.872847789184383, + "grad_norm": 0.7647597193717957, + "learning_rate": 0.0002, + "loss": 0.6931, + "step": 17770 + }, + { + "epoch": 2.874464473365128, + "grad_norm": 0.8791151642799377, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 17780 + }, + { + "epoch": 2.8760811575458733, + "grad_norm": 0.7253178358078003, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 17790 + }, + { + "epoch": 2.8776978417266186, + "grad_norm": 0.7956077456474304, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 17800 + }, + { + "epoch": 2.879314525907364, + "grad_norm": 0.8657688498497009, + "learning_rate": 0.0002, + "loss": 0.6687, + "step": 17810 + }, + { + "epoch": 2.880931210088109, + "grad_norm": 0.7059141993522644, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 17820 + }, + { + "epoch": 2.8825478942688547, + "grad_norm": 0.8886896967887878, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 17830 + }, + { + "epoch": 2.8841645784496, + "grad_norm": 0.821032702922821, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 17840 + }, + { + "epoch": 2.885781262630345, + "grad_norm": 0.7183963656425476, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 17850 + }, + { + "epoch": 2.8873979468110904, + "grad_norm": 0.6222899556159973, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 17860 + }, + { + "epoch": 2.8890146309918356, + "grad_norm": 0.8187434077262878, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 17870 + }, + { + "epoch": 2.890631315172581, + "grad_norm": 0.9838479161262512, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 17880 + }, + { + "epoch": 2.8922479993533265, + "grad_norm": 0.7567742466926575, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 17890 + }, + { + "epoch": 2.893864683534072, + "grad_norm": 0.6875903606414795, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 17900 + }, + { + "epoch": 2.895481367714817, + "grad_norm": 0.8043789267539978, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 17910 + }, + { + "epoch": 2.8970980518955622, + "grad_norm": 0.8062626719474792, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 17920 + }, + { + "epoch": 2.8987147360763075, + "grad_norm": 1.0251191854476929, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 17930 + }, + { + "epoch": 2.9003314202570527, + "grad_norm": 0.882253110408783, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 17940 + }, + { + "epoch": 2.901948104437798, + "grad_norm": 0.8683299422264099, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 17950 + }, + { + "epoch": 2.903564788618543, + "grad_norm": 0.7167282104492188, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17960 + }, + { + "epoch": 2.9051814727992884, + "grad_norm": 0.7093694806098938, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 17970 + }, + { + "epoch": 2.906798156980034, + "grad_norm": 0.8549879193305969, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 17980 + }, + { + "epoch": 2.9084148411607793, + "grad_norm": 0.6989606618881226, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 17990 + }, + { + "epoch": 2.9100315253415245, + "grad_norm": 0.9482976794242859, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 18000 + }, + { + "epoch": 2.9116482095222698, + "grad_norm": 0.7182440161705017, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 18010 + }, + { + "epoch": 2.913264893703015, + "grad_norm": 0.7732226252555847, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 18020 + }, + { + "epoch": 2.9148815778837607, + "grad_norm": 0.7936875224113464, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 18030 + }, + { + "epoch": 2.916498262064506, + "grad_norm": 0.8825615644454956, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 18040 + }, + { + "epoch": 2.918114946245251, + "grad_norm": 0.6778587102890015, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 18050 + }, + { + "epoch": 2.9197316304259964, + "grad_norm": 0.7529265880584717, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 18060 + }, + { + "epoch": 2.9213483146067416, + "grad_norm": 0.7111883163452148, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 18070 + }, + { + "epoch": 2.922964998787487, + "grad_norm": 0.7214767932891846, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 18080 + }, + { + "epoch": 2.924581682968232, + "grad_norm": 0.800417423248291, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 18090 + }, + { + "epoch": 2.9261983671489773, + "grad_norm": 1.248575210571289, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 18100 + }, + { + "epoch": 2.9278150513297225, + "grad_norm": 0.757788360118866, + "learning_rate": 0.0002, + "loss": 0.7004, + "step": 18110 + }, + { + "epoch": 2.9294317355104678, + "grad_norm": 1.0583995580673218, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 18120 + }, + { + "epoch": 2.9310484196912134, + "grad_norm": 0.8228777647018433, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 18130 + }, + { + "epoch": 2.9326651038719587, + "grad_norm": 0.8374035358428955, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 18140 + }, + { + "epoch": 2.934281788052704, + "grad_norm": 0.7976473569869995, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 18150 + }, + { + "epoch": 2.935898472233449, + "grad_norm": 0.8009907603263855, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 18160 + }, + { + "epoch": 2.9375151564141944, + "grad_norm": 0.835213303565979, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 18170 + }, + { + "epoch": 2.93913184059494, + "grad_norm": 0.7982219457626343, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 18180 + }, + { + "epoch": 2.9407485247756853, + "grad_norm": 0.7070978879928589, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 18190 + }, + { + "epoch": 2.9423652089564305, + "grad_norm": 0.8619440197944641, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 18200 + }, + { + "epoch": 2.9439818931371757, + "grad_norm": 0.6693987250328064, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 18210 + }, + { + "epoch": 2.945598577317921, + "grad_norm": 0.6747021079063416, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 18220 + }, + { + "epoch": 2.947215261498666, + "grad_norm": 0.860387921333313, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 18230 + }, + { + "epoch": 2.9488319456794114, + "grad_norm": 0.799976646900177, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 18240 + }, + { + "epoch": 2.9504486298601567, + "grad_norm": 0.7864769101142883, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 18250 + }, + { + "epoch": 2.952065314040902, + "grad_norm": 0.6713884472846985, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 18260 + }, + { + "epoch": 2.9536819982216476, + "grad_norm": 0.9031508564949036, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 18270 + }, + { + "epoch": 2.955298682402393, + "grad_norm": 0.7205073237419128, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 18280 + }, + { + "epoch": 2.956915366583138, + "grad_norm": 0.7746205925941467, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 18290 + }, + { + "epoch": 2.9585320507638833, + "grad_norm": 0.6533427834510803, + "learning_rate": 0.0002, + "loss": 0.6218, + "step": 18300 + }, + { + "epoch": 2.9601487349446285, + "grad_norm": 0.9083208441734314, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 18310 + }, + { + "epoch": 2.9617654191253737, + "grad_norm": 0.7446991801261902, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 18320 + }, + { + "epoch": 2.9633821033061194, + "grad_norm": 0.6514461636543274, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 18330 + }, + { + "epoch": 2.9649987874868646, + "grad_norm": 0.8580465912818909, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 18340 + }, + { + "epoch": 2.96661547166761, + "grad_norm": 0.7074266076087952, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 18350 + }, + { + "epoch": 2.968232155848355, + "grad_norm": 0.899892270565033, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 18360 + }, + { + "epoch": 2.9698488400291003, + "grad_norm": 0.8217641711235046, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 18370 + }, + { + "epoch": 2.9714655242098456, + "grad_norm": 0.8611799478530884, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 18380 + }, + { + "epoch": 2.973082208390591, + "grad_norm": 0.6909302473068237, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 18390 + }, + { + "epoch": 2.974698892571336, + "grad_norm": 0.6554358005523682, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 18400 + }, + { + "epoch": 2.9763155767520812, + "grad_norm": 0.7803071737289429, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 18410 + }, + { + "epoch": 2.977932260932827, + "grad_norm": 0.7838954925537109, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 18420 + }, + { + "epoch": 2.979548945113572, + "grad_norm": 0.7098495364189148, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 18430 + }, + { + "epoch": 2.9811656292943174, + "grad_norm": 0.8981785774230957, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 18440 + }, + { + "epoch": 2.9827823134750626, + "grad_norm": 0.7197171449661255, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 18450 + }, + { + "epoch": 2.984398997655808, + "grad_norm": 0.793185293674469, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 18460 + }, + { + "epoch": 2.986015681836553, + "grad_norm": 0.8531473875045776, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 18470 + }, + { + "epoch": 2.9876323660172988, + "grad_norm": 0.6627361178398132, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 18480 + }, + { + "epoch": 2.989249050198044, + "grad_norm": 0.5708155035972595, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 18490 + }, + { + "epoch": 2.990865734378789, + "grad_norm": 0.8227280378341675, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 18500 + }, + { + "epoch": 2.9924824185595345, + "grad_norm": 0.7102749943733215, + "learning_rate": 0.0002, + "loss": 0.6701, + "step": 18510 + }, + { + "epoch": 2.9940991027402797, + "grad_norm": 0.839485228061676, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 18520 + }, + { + "epoch": 2.995715786921025, + "grad_norm": 0.9038704037666321, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 18530 + }, + { + "epoch": 2.99733247110177, + "grad_norm": 0.8737510442733765, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 18540 + }, + { + "epoch": 2.9989491552825154, + "grad_norm": 0.7323142886161804, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 18550 + }, + { + "epoch": 2.9999191657909625, + "eval_loss": 1.1262480020523071, + "eval_runtime": 122.0868, + "eval_samples_per_second": 6.004, + "eval_steps_per_second": 0.754, + "step": 18556 + }, + { + "epoch": 3.000565839463261, + "grad_norm": 0.8465463519096375, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 18560 + }, + { + "epoch": 3.0021825236440063, + "grad_norm": 0.9134138822555542, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 18570 + }, + { + "epoch": 3.0037992078247515, + "grad_norm": 0.760715126991272, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 18580 + }, + { + "epoch": 3.0054158920054967, + "grad_norm": 0.9208743572235107, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 18590 + }, + { + "epoch": 3.007032576186242, + "grad_norm": 0.9232364892959595, + "learning_rate": 0.0002, + "loss": 0.5731, + "step": 18600 + }, + { + "epoch": 3.008649260366987, + "grad_norm": 1.1881544589996338, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 18610 + }, + { + "epoch": 3.0102659445477324, + "grad_norm": 0.9372987747192383, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 18620 + }, + { + "epoch": 3.0118826287284777, + "grad_norm": 0.6900241374969482, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 18630 + }, + { + "epoch": 3.0134993129092233, + "grad_norm": 0.8451071381568909, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 18640 + }, + { + "epoch": 3.0151159970899686, + "grad_norm": 0.7763112187385559, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 18650 + }, + { + "epoch": 3.016732681270714, + "grad_norm": 1.043653964996338, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 18660 + }, + { + "epoch": 3.018349365451459, + "grad_norm": 1.0170660018920898, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 18670 + }, + { + "epoch": 3.0199660496322043, + "grad_norm": 0.7534180283546448, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 18680 + }, + { + "epoch": 3.0215827338129495, + "grad_norm": 0.7507367730140686, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 18690 + }, + { + "epoch": 3.0231994179936947, + "grad_norm": 0.7861620187759399, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 18700 + }, + { + "epoch": 3.0248161021744404, + "grad_norm": 1.0580339431762695, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 18710 + }, + { + "epoch": 3.0264327863551856, + "grad_norm": 0.7542710900306702, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 18720 + }, + { + "epoch": 3.028049470535931, + "grad_norm": 0.8189544677734375, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 18730 + }, + { + "epoch": 3.029666154716676, + "grad_norm": 0.9126611351966858, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 18740 + }, + { + "epoch": 3.0312828388974213, + "grad_norm": 0.8891341686248779, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 18750 + }, + { + "epoch": 3.0328995230781666, + "grad_norm": 0.8419283032417297, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 18760 + }, + { + "epoch": 3.034516207258912, + "grad_norm": 0.8048048615455627, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 18770 + }, + { + "epoch": 3.0361328914396575, + "grad_norm": 0.7820217609405518, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 18780 + }, + { + "epoch": 3.0377495756204027, + "grad_norm": 0.854721188545227, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 18790 + }, + { + "epoch": 3.039366259801148, + "grad_norm": 0.912092924118042, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 18800 + }, + { + "epoch": 3.040982943981893, + "grad_norm": 0.6596226096153259, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 18810 + }, + { + "epoch": 3.0425996281626384, + "grad_norm": 0.6351348757743835, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 18820 + }, + { + "epoch": 3.0442163123433836, + "grad_norm": 0.778188943862915, + "learning_rate": 0.0002, + "loss": 0.5953, + "step": 18830 + }, + { + "epoch": 3.045832996524129, + "grad_norm": 0.68234783411026, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 18840 + }, + { + "epoch": 3.047449680704874, + "grad_norm": 0.998628556728363, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 18850 + }, + { + "epoch": 3.0490663648856198, + "grad_norm": 0.7393841743469238, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 18860 + }, + { + "epoch": 3.050683049066365, + "grad_norm": 0.84438556432724, + "learning_rate": 0.0002, + "loss": 0.568, + "step": 18870 + }, + { + "epoch": 3.0522997332471102, + "grad_norm": 0.8857501745223999, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 18880 + }, + { + "epoch": 3.0539164174278555, + "grad_norm": 0.7208474278450012, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 18890 + }, + { + "epoch": 3.0555331016086007, + "grad_norm": 0.7135229110717773, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 18900 + }, + { + "epoch": 3.057149785789346, + "grad_norm": 0.9130001664161682, + "learning_rate": 0.0002, + "loss": 0.5575, + "step": 18910 + }, + { + "epoch": 3.058766469970091, + "grad_norm": 0.9001716375350952, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 18920 + }, + { + "epoch": 3.060383154150837, + "grad_norm": 0.8667559623718262, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 18930 + }, + { + "epoch": 3.061999838331582, + "grad_norm": 0.8943959474563599, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 18940 + }, + { + "epoch": 3.0636165225123273, + "grad_norm": 0.8298377990722656, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 18950 + }, + { + "epoch": 3.0652332066930725, + "grad_norm": 0.7935267686843872, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 18960 + }, + { + "epoch": 3.0668498908738178, + "grad_norm": 1.1506379842758179, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 18970 + }, + { + "epoch": 3.068466575054563, + "grad_norm": 0.7693049907684326, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 18980 + }, + { + "epoch": 3.0700832592353082, + "grad_norm": 0.8040135502815247, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 18990 + }, + { + "epoch": 3.0716999434160535, + "grad_norm": 0.828404426574707, + "learning_rate": 0.0002, + "loss": 0.5541, + "step": 19000 + }, + { + "epoch": 3.073316627596799, + "grad_norm": 0.8811164498329163, + "learning_rate": 0.0002, + "loss": 0.6048, + "step": 19010 + }, + { + "epoch": 3.0749333117775444, + "grad_norm": 1.036205768585205, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 19020 + }, + { + "epoch": 3.0765499959582896, + "grad_norm": 0.8857285976409912, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 19030 + }, + { + "epoch": 3.078166680139035, + "grad_norm": 0.8392079472541809, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 19040 + }, + { + "epoch": 3.07978336431978, + "grad_norm": 1.0287401676177979, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 19050 + }, + { + "epoch": 3.0814000485005253, + "grad_norm": 1.0086315870285034, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 19060 + }, + { + "epoch": 3.0830167326812705, + "grad_norm": 0.9245324730873108, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 19070 + }, + { + "epoch": 3.084633416862016, + "grad_norm": 0.8680877089500427, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 19080 + }, + { + "epoch": 3.0862501010427614, + "grad_norm": 0.8814793825149536, + "learning_rate": 0.0002, + "loss": 0.5611, + "step": 19090 + }, + { + "epoch": 3.0878667852235067, + "grad_norm": 0.9234458208084106, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 19100 + }, + { + "epoch": 3.089483469404252, + "grad_norm": 1.1291664838790894, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 19110 + }, + { + "epoch": 3.091100153584997, + "grad_norm": 0.9191402792930603, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 19120 + }, + { + "epoch": 3.0927168377657424, + "grad_norm": 0.7103154063224792, + "learning_rate": 0.0002, + "loss": 0.5856, + "step": 19130 + }, + { + "epoch": 3.0943335219464876, + "grad_norm": 0.9368883967399597, + "learning_rate": 0.0002, + "loss": 0.6479, + "step": 19140 + }, + { + "epoch": 3.095950206127233, + "grad_norm": 0.9676656723022461, + "learning_rate": 0.0002, + "loss": 0.6167, + "step": 19150 + }, + { + "epoch": 3.0975668903079785, + "grad_norm": 0.8739792704582214, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 19160 + }, + { + "epoch": 3.0991835744887237, + "grad_norm": 0.8530174493789673, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 19170 + }, + { + "epoch": 3.100800258669469, + "grad_norm": 0.794945478439331, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 19180 + }, + { + "epoch": 3.102416942850214, + "grad_norm": 0.9508888125419617, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 19190 + }, + { + "epoch": 3.1040336270309594, + "grad_norm": 1.0599955320358276, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 19200 + }, + { + "epoch": 3.1056503112117047, + "grad_norm": 1.0673625469207764, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 19210 + }, + { + "epoch": 3.10726699539245, + "grad_norm": 0.7739115953445435, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 19220 + }, + { + "epoch": 3.1088836795731956, + "grad_norm": 0.9884951114654541, + "learning_rate": 0.0002, + "loss": 0.6046, + "step": 19230 + }, + { + "epoch": 3.110500363753941, + "grad_norm": 0.862260103225708, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 19240 + }, + { + "epoch": 3.112117047934686, + "grad_norm": 0.7690284848213196, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 19250 + }, + { + "epoch": 3.1137337321154313, + "grad_norm": 0.8758958578109741, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 19260 + }, + { + "epoch": 3.1153504162961765, + "grad_norm": 1.0356395244598389, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 19270 + }, + { + "epoch": 3.1169671004769217, + "grad_norm": 0.6950937509536743, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 19280 + }, + { + "epoch": 3.118583784657667, + "grad_norm": 0.760998010635376, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 19290 + }, + { + "epoch": 3.1202004688384126, + "grad_norm": 0.9335789084434509, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 19300 + }, + { + "epoch": 3.121817153019158, + "grad_norm": 0.9636204242706299, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 19310 + }, + { + "epoch": 3.123433837199903, + "grad_norm": 1.0820997953414917, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 19320 + }, + { + "epoch": 3.1250505213806483, + "grad_norm": 0.7333487272262573, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 19330 + }, + { + "epoch": 3.1266672055613935, + "grad_norm": 1.0417509078979492, + "learning_rate": 0.0002, + "loss": 0.6178, + "step": 19340 + }, + { + "epoch": 3.128283889742139, + "grad_norm": 0.9267749190330505, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 19350 + }, + { + "epoch": 3.129900573922884, + "grad_norm": 0.777798593044281, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 19360 + }, + { + "epoch": 3.1315172581036297, + "grad_norm": 0.8425456881523132, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 19370 + }, + { + "epoch": 3.133133942284375, + "grad_norm": 0.9617102146148682, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 19380 + }, + { + "epoch": 3.13475062646512, + "grad_norm": 1.0052828788757324, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 19390 + }, + { + "epoch": 3.1363673106458654, + "grad_norm": 0.7637009024620056, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 19400 + }, + { + "epoch": 3.1379839948266106, + "grad_norm": 0.7958088517189026, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 19410 + }, + { + "epoch": 3.139600679007356, + "grad_norm": 0.9161727428436279, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 19420 + }, + { + "epoch": 3.141217363188101, + "grad_norm": 0.8402149677276611, + "learning_rate": 0.0002, + "loss": 0.5638, + "step": 19430 + }, + { + "epoch": 3.1428340473688463, + "grad_norm": 1.0056525468826294, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 19440 + }, + { + "epoch": 3.144450731549592, + "grad_norm": 1.0129190683364868, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 19450 + }, + { + "epoch": 3.146067415730337, + "grad_norm": 0.790825366973877, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 19460 + }, + { + "epoch": 3.1476840999110824, + "grad_norm": 1.441665530204773, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 19470 + }, + { + "epoch": 3.1493007840918277, + "grad_norm": 0.7846331596374512, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 19480 + }, + { + "epoch": 3.150917468272573, + "grad_norm": 0.7915332913398743, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 19490 + }, + { + "epoch": 3.152534152453318, + "grad_norm": 0.933982253074646, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 19500 + }, + { + "epoch": 3.1541508366340634, + "grad_norm": 1.038408637046814, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 19510 + }, + { + "epoch": 3.155767520814809, + "grad_norm": 1.018935203552246, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 19520 + }, + { + "epoch": 3.1573842049955543, + "grad_norm": 0.9618112444877625, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 19530 + }, + { + "epoch": 3.1590008891762995, + "grad_norm": 0.8900452852249146, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 19540 + }, + { + "epoch": 3.1606175733570447, + "grad_norm": 0.8254160284996033, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 19550 + }, + { + "epoch": 3.16223425753779, + "grad_norm": 1.004376769065857, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 19560 + }, + { + "epoch": 3.163850941718535, + "grad_norm": 1.0490446090698242, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 19570 + }, + { + "epoch": 3.1654676258992804, + "grad_norm": 0.7387403845787048, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 19580 + }, + { + "epoch": 3.1670843100800257, + "grad_norm": 0.7611538171768188, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 19590 + }, + { + "epoch": 3.1687009942607713, + "grad_norm": 0.8239886164665222, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 19600 + }, + { + "epoch": 3.1703176784415166, + "grad_norm": 0.9327243566513062, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 19610 + }, + { + "epoch": 3.171934362622262, + "grad_norm": 0.9662560224533081, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 19620 + }, + { + "epoch": 3.173551046803007, + "grad_norm": 0.9183341860771179, + "learning_rate": 0.0002, + "loss": 0.5988, + "step": 19630 + }, + { + "epoch": 3.1751677309837523, + "grad_norm": 0.875066876411438, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 19640 + }, + { + "epoch": 3.1767844151644975, + "grad_norm": 0.8567508459091187, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 19650 + }, + { + "epoch": 3.1784010993452427, + "grad_norm": 0.6805780529975891, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 19660 + }, + { + "epoch": 3.1800177835259884, + "grad_norm": 0.8776944279670715, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 19670 + }, + { + "epoch": 3.1816344677067336, + "grad_norm": 0.9036329984664917, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 19680 + }, + { + "epoch": 3.183251151887479, + "grad_norm": 0.8527372479438782, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 19690 + }, + { + "epoch": 3.184867836068224, + "grad_norm": 1.1045585870742798, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 19700 + }, + { + "epoch": 3.1864845202489693, + "grad_norm": 0.9213830828666687, + "learning_rate": 0.0002, + "loss": 0.5491, + "step": 19710 + }, + { + "epoch": 3.1881012044297146, + "grad_norm": 0.8865814805030823, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 19720 + }, + { + "epoch": 3.18971788861046, + "grad_norm": 0.7939388751983643, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 19730 + }, + { + "epoch": 3.191334572791205, + "grad_norm": 0.6966729760169983, + "learning_rate": 0.0002, + "loss": 0.5682, + "step": 19740 + }, + { + "epoch": 3.1929512569719507, + "grad_norm": 0.8023673295974731, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 19750 + }, + { + "epoch": 3.194567941152696, + "grad_norm": 0.7992037534713745, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 19760 + }, + { + "epoch": 3.196184625333441, + "grad_norm": 0.7412247657775879, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 19770 + }, + { + "epoch": 3.1978013095141864, + "grad_norm": 0.9598729014396667, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 19780 + }, + { + "epoch": 3.1994179936949316, + "grad_norm": 0.8331366777420044, + "learning_rate": 0.0002, + "loss": 0.5685, + "step": 19790 + }, + { + "epoch": 3.201034677875677, + "grad_norm": 0.8939169645309448, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 19800 + }, + { + "epoch": 3.202651362056422, + "grad_norm": 0.9219734072685242, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 19810 + }, + { + "epoch": 3.2042680462371678, + "grad_norm": 0.869490385055542, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 19820 + }, + { + "epoch": 3.205884730417913, + "grad_norm": 0.8989706635475159, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 19830 + }, + { + "epoch": 3.2075014145986582, + "grad_norm": 0.8477165102958679, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 19840 + }, + { + "epoch": 3.2091180987794035, + "grad_norm": 0.8720678687095642, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 19850 + }, + { + "epoch": 3.2107347829601487, + "grad_norm": 0.861406683921814, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 19860 + }, + { + "epoch": 3.212351467140894, + "grad_norm": 0.8228686451911926, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 19870 + }, + { + "epoch": 3.213968151321639, + "grad_norm": 0.7936596870422363, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 19880 + }, + { + "epoch": 3.2155848355023844, + "grad_norm": 1.097377896308899, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 19890 + }, + { + "epoch": 3.21720151968313, + "grad_norm": 0.9544782638549805, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 19900 + }, + { + "epoch": 3.2188182038638753, + "grad_norm": 0.8240751624107361, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 19910 + }, + { + "epoch": 3.2204348880446205, + "grad_norm": 0.8332096338272095, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 19920 + }, + { + "epoch": 3.2220515722253658, + "grad_norm": 1.0954567193984985, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 19930 + }, + { + "epoch": 3.223668256406111, + "grad_norm": 0.7790525555610657, + "learning_rate": 0.0002, + "loss": 0.652, + "step": 19940 + }, + { + "epoch": 3.225284940586856, + "grad_norm": 0.7966814041137695, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 19950 + }, + { + "epoch": 3.2269016247676015, + "grad_norm": 0.9751881957054138, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 19960 + }, + { + "epoch": 3.228518308948347, + "grad_norm": 0.9856047630310059, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 19970 + }, + { + "epoch": 3.2301349931290924, + "grad_norm": 1.3062353134155273, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 19980 + }, + { + "epoch": 3.2317516773098376, + "grad_norm": 0.9510692358016968, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 19990 + }, + { + "epoch": 3.233368361490583, + "grad_norm": 0.8630342483520508, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 20000 + }, + { + "epoch": 3.234985045671328, + "grad_norm": 0.8966519236564636, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 20010 + }, + { + "epoch": 3.2366017298520733, + "grad_norm": 0.7093510627746582, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 20020 + }, + { + "epoch": 3.2382184140328185, + "grad_norm": 0.7771096229553223, + "learning_rate": 0.0002, + "loss": 0.585, + "step": 20030 + }, + { + "epoch": 3.2398350982135637, + "grad_norm": 0.841058075428009, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 20040 + }, + { + "epoch": 3.2414517823943094, + "grad_norm": 0.909712553024292, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 20050 + }, + { + "epoch": 3.2430684665750547, + "grad_norm": 0.8321019411087036, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 20060 + }, + { + "epoch": 3.2446851507558, + "grad_norm": 0.779901921749115, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 20070 + }, + { + "epoch": 3.246301834936545, + "grad_norm": 0.6249170303344727, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 20080 + }, + { + "epoch": 3.2479185191172903, + "grad_norm": 0.8000940680503845, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 20090 + }, + { + "epoch": 3.2495352032980356, + "grad_norm": 0.7627735137939453, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 20100 + }, + { + "epoch": 3.2511518874787813, + "grad_norm": 0.8780747056007385, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 20110 + }, + { + "epoch": 3.2527685716595265, + "grad_norm": 0.772037148475647, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 20120 + }, + { + "epoch": 3.2543852558402717, + "grad_norm": 1.0086580514907837, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 20130 + }, + { + "epoch": 3.256001940021017, + "grad_norm": 0.9360289573669434, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 20140 + }, + { + "epoch": 3.257618624201762, + "grad_norm": 1.2099586725234985, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 20150 + }, + { + "epoch": 3.2592353083825074, + "grad_norm": 0.8368481397628784, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 20160 + }, + { + "epoch": 3.2608519925632526, + "grad_norm": 0.7391039133071899, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 20170 + }, + { + "epoch": 3.262468676743998, + "grad_norm": 0.9122273325920105, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 20180 + }, + { + "epoch": 3.264085360924743, + "grad_norm": 0.8502281904220581, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 20190 + }, + { + "epoch": 3.265702045105489, + "grad_norm": 1.0926852226257324, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 20200 + }, + { + "epoch": 3.267318729286234, + "grad_norm": 0.7902828454971313, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 20210 + }, + { + "epoch": 3.2689354134669792, + "grad_norm": 0.8724729418754578, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 20220 + }, + { + "epoch": 3.2705520976477245, + "grad_norm": 0.8469277024269104, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 20230 + }, + { + "epoch": 3.2721687818284697, + "grad_norm": 0.8865092992782593, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 20240 + }, + { + "epoch": 3.273785466009215, + "grad_norm": 1.0979334115982056, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 20250 + }, + { + "epoch": 3.2754021501899606, + "grad_norm": 1.0860793590545654, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 20260 + }, + { + "epoch": 3.277018834370706, + "grad_norm": 0.981745183467865, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 20270 + }, + { + "epoch": 3.278635518551451, + "grad_norm": 0.9155020713806152, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 20280 + }, + { + "epoch": 3.2802522027321963, + "grad_norm": 0.8436718583106995, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 20290 + }, + { + "epoch": 3.2818688869129415, + "grad_norm": 1.0329409837722778, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 20300 + }, + { + "epoch": 3.2834855710936868, + "grad_norm": 0.9876394271850586, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 20310 + }, + { + "epoch": 3.285102255274432, + "grad_norm": 0.8052917718887329, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 20320 + }, + { + "epoch": 3.2867189394551772, + "grad_norm": 0.8390680551528931, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 20330 + }, + { + "epoch": 3.288335623635923, + "grad_norm": 0.9515735507011414, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 20340 + }, + { + "epoch": 3.289952307816668, + "grad_norm": 0.8028870224952698, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 20350 + }, + { + "epoch": 3.2915689919974134, + "grad_norm": 0.862592339515686, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 20360 + }, + { + "epoch": 3.2931856761781586, + "grad_norm": 0.7451621890068054, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 20370 + }, + { + "epoch": 3.294802360358904, + "grad_norm": 0.8966776728630066, + "learning_rate": 0.0002, + "loss": 0.6458, + "step": 20380 + }, + { + "epoch": 3.296419044539649, + "grad_norm": 0.9289216995239258, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 20390 + }, + { + "epoch": 3.2980357287203943, + "grad_norm": 0.9649626612663269, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 20400 + }, + { + "epoch": 3.29965241290114, + "grad_norm": 1.1953798532485962, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 20410 + }, + { + "epoch": 3.301269097081885, + "grad_norm": 0.8929083943367004, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 20420 + }, + { + "epoch": 3.3028857812626304, + "grad_norm": 0.8922014236450195, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 20430 + }, + { + "epoch": 3.3045024654433757, + "grad_norm": 0.9754860401153564, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 20440 + }, + { + "epoch": 3.306119149624121, + "grad_norm": 0.8873140215873718, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 20450 + }, + { + "epoch": 3.307735833804866, + "grad_norm": 0.857271671295166, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 20460 + }, + { + "epoch": 3.3093525179856114, + "grad_norm": 0.9022141098976135, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 20470 + }, + { + "epoch": 3.3109692021663566, + "grad_norm": 0.8614798188209534, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 20480 + }, + { + "epoch": 3.3125858863471023, + "grad_norm": 0.8838164210319519, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 20490 + }, + { + "epoch": 3.3142025705278475, + "grad_norm": 0.8709736466407776, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 20500 + }, + { + "epoch": 3.3158192547085927, + "grad_norm": 0.9533300995826721, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 20510 + }, + { + "epoch": 3.317435938889338, + "grad_norm": 0.8259269595146179, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 20520 + }, + { + "epoch": 3.319052623070083, + "grad_norm": 0.8607608079910278, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 20530 + }, + { + "epoch": 3.3206693072508284, + "grad_norm": 1.0863020420074463, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 20540 + }, + { + "epoch": 3.3222859914315737, + "grad_norm": 1.011489987373352, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 20550 + }, + { + "epoch": 3.3239026756123193, + "grad_norm": 0.6952177882194519, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 20560 + }, + { + "epoch": 3.3255193597930646, + "grad_norm": 0.9638974070549011, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 20570 + }, + { + "epoch": 3.32713604397381, + "grad_norm": 1.0310138463974, + "learning_rate": 0.0002, + "loss": 0.5882, + "step": 20580 + }, + { + "epoch": 3.328752728154555, + "grad_norm": 0.9371318221092224, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 20590 + }, + { + "epoch": 3.3303694123353003, + "grad_norm": 0.8756691813468933, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 20600 + }, + { + "epoch": 3.3319860965160455, + "grad_norm": 1.054175853729248, + "learning_rate": 0.0002, + "loss": 0.5994, + "step": 20610 + }, + { + "epoch": 3.3336027806967907, + "grad_norm": 0.9074128270149231, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 20620 + }, + { + "epoch": 3.335219464877536, + "grad_norm": 0.906900942325592, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 20630 + }, + { + "epoch": 3.3368361490582816, + "grad_norm": 0.8689333200454712, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 20640 + }, + { + "epoch": 3.338452833239027, + "grad_norm": 0.9889747500419617, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 20650 + }, + { + "epoch": 3.340069517419772, + "grad_norm": 1.0685805082321167, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 20660 + }, + { + "epoch": 3.3416862016005173, + "grad_norm": 0.7495010495185852, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 20670 + }, + { + "epoch": 3.3433028857812626, + "grad_norm": 0.8747848272323608, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 20680 + }, + { + "epoch": 3.344919569962008, + "grad_norm": 0.9762673377990723, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 20690 + }, + { + "epoch": 3.346536254142753, + "grad_norm": 1.0284489393234253, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 20700 + }, + { + "epoch": 3.3481529383234987, + "grad_norm": 0.7293812036514282, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 20710 + }, + { + "epoch": 3.349769622504244, + "grad_norm": 0.8330199122428894, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 20720 + }, + { + "epoch": 3.351386306684989, + "grad_norm": 0.9808499217033386, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 20730 + }, + { + "epoch": 3.3530029908657344, + "grad_norm": 0.9508825540542603, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 20740 + }, + { + "epoch": 3.3546196750464796, + "grad_norm": 0.790483832359314, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 20750 + }, + { + "epoch": 3.356236359227225, + "grad_norm": 1.022793173789978, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 20760 + }, + { + "epoch": 3.35785304340797, + "grad_norm": 0.8318950533866882, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 20770 + }, + { + "epoch": 3.3594697275887153, + "grad_norm": 0.7980858087539673, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 20780 + }, + { + "epoch": 3.361086411769461, + "grad_norm": 0.8114802241325378, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 20790 + }, + { + "epoch": 3.3627030959502062, + "grad_norm": 0.8522519469261169, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 20800 + }, + { + "epoch": 3.3643197801309515, + "grad_norm": 0.9142431616783142, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 20810 + }, + { + "epoch": 3.3659364643116967, + "grad_norm": 0.771170437335968, + "learning_rate": 0.0002, + "loss": 0.6331, + "step": 20820 + }, + { + "epoch": 3.367553148492442, + "grad_norm": 1.0628231763839722, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 20830 + }, + { + "epoch": 3.369169832673187, + "grad_norm": 0.9384352564811707, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 20840 + }, + { + "epoch": 3.370786516853933, + "grad_norm": 1.1286591291427612, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 20850 + }, + { + "epoch": 3.372403201034678, + "grad_norm": 1.1349513530731201, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 20860 + }, + { + "epoch": 3.3740198852154233, + "grad_norm": 1.0127464532852173, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 20870 + }, + { + "epoch": 3.3756365693961685, + "grad_norm": 0.9111971855163574, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 20880 + }, + { + "epoch": 3.3772532535769137, + "grad_norm": 0.871356725692749, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 20890 + }, + { + "epoch": 3.378869937757659, + "grad_norm": 0.7774117588996887, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 20900 + }, + { + "epoch": 3.380486621938404, + "grad_norm": 1.0089964866638184, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 20910 + }, + { + "epoch": 3.3821033061191494, + "grad_norm": 0.7855867147445679, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 20920 + }, + { + "epoch": 3.3837199902998947, + "grad_norm": 1.3713710308074951, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 20930 + }, + { + "epoch": 3.3853366744806404, + "grad_norm": 0.8599116206169128, + "learning_rate": 0.0002, + "loss": 0.6627, + "step": 20940 + }, + { + "epoch": 3.3869533586613856, + "grad_norm": 0.9392673373222351, + "learning_rate": 0.0002, + "loss": 0.6224, + "step": 20950 + }, + { + "epoch": 3.388570042842131, + "grad_norm": 0.8764075040817261, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 20960 + }, + { + "epoch": 3.390186727022876, + "grad_norm": 0.8240136504173279, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 20970 + }, + { + "epoch": 3.3918034112036213, + "grad_norm": 1.0982369184494019, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 20980 + }, + { + "epoch": 3.3934200953843665, + "grad_norm": 1.0599013566970825, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 20990 + }, + { + "epoch": 3.395036779565112, + "grad_norm": 0.895438015460968, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 21000 + }, + { + "epoch": 3.3966534637458574, + "grad_norm": 0.6974841356277466, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 21010 + }, + { + "epoch": 3.3982701479266026, + "grad_norm": 0.9571719765663147, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 21020 + }, + { + "epoch": 3.399886832107348, + "grad_norm": 0.831912636756897, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 21030 + }, + { + "epoch": 3.401503516288093, + "grad_norm": 0.831936240196228, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 21040 + }, + { + "epoch": 3.4031202004688383, + "grad_norm": 0.7388373613357544, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 21050 + }, + { + "epoch": 3.4047368846495836, + "grad_norm": 0.938667356967926, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 21060 + }, + { + "epoch": 3.406353568830329, + "grad_norm": 0.9202313423156738, + "learning_rate": 0.0002, + "loss": 0.6121, + "step": 21070 + }, + { + "epoch": 3.4079702530110745, + "grad_norm": 0.9888381958007812, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 21080 + }, + { + "epoch": 3.4095869371918197, + "grad_norm": 0.8526970744132996, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 21090 + }, + { + "epoch": 3.411203621372565, + "grad_norm": 0.7939383387565613, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 21100 + }, + { + "epoch": 3.41282030555331, + "grad_norm": 0.9986352920532227, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 21110 + }, + { + "epoch": 3.4144369897340554, + "grad_norm": 0.8895300030708313, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 21120 + }, + { + "epoch": 3.4160536739148006, + "grad_norm": 0.9559482932090759, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 21130 + }, + { + "epoch": 3.417670358095546, + "grad_norm": 0.8351506590843201, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 21140 + }, + { + "epoch": 3.4192870422762915, + "grad_norm": 0.8224456906318665, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 21150 + }, + { + "epoch": 3.4209037264570368, + "grad_norm": 1.0110299587249756, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 21160 + }, + { + "epoch": 3.422520410637782, + "grad_norm": 0.82564777135849, + "learning_rate": 0.0002, + "loss": 0.6116, + "step": 21170 + }, + { + "epoch": 3.4241370948185272, + "grad_norm": 1.004738688468933, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 21180 + }, + { + "epoch": 3.4257537789992725, + "grad_norm": 0.7545676827430725, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 21190 + }, + { + "epoch": 3.4273704631800177, + "grad_norm": 0.8918704390525818, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 21200 + }, + { + "epoch": 3.428987147360763, + "grad_norm": 0.8336876034736633, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 21210 + }, + { + "epoch": 3.430603831541508, + "grad_norm": 0.8928771018981934, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 21220 + }, + { + "epoch": 3.432220515722254, + "grad_norm": 0.7663705945014954, + "learning_rate": 0.0002, + "loss": 0.6468, + "step": 21230 + }, + { + "epoch": 3.433837199902999, + "grad_norm": 0.8392598628997803, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 21240 + }, + { + "epoch": 3.4354538840837443, + "grad_norm": 0.8819600343704224, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 21250 + }, + { + "epoch": 3.4370705682644895, + "grad_norm": 0.9124642014503479, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 21260 + }, + { + "epoch": 3.4386872524452348, + "grad_norm": 0.8329763412475586, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 21270 + }, + { + "epoch": 3.44030393662598, + "grad_norm": 0.9982839822769165, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 21280 + }, + { + "epoch": 3.4419206208067252, + "grad_norm": 0.9105954766273499, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 21290 + }, + { + "epoch": 3.443537304987471, + "grad_norm": 0.8182359337806702, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 21300 + }, + { + "epoch": 3.445153989168216, + "grad_norm": 1.0568904876708984, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 21310 + }, + { + "epoch": 3.4467706733489614, + "grad_norm": 0.968539834022522, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 21320 + }, + { + "epoch": 3.4483873575297066, + "grad_norm": 0.8774511218070984, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 21330 + }, + { + "epoch": 3.450004041710452, + "grad_norm": 0.7598156332969666, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 21340 + }, + { + "epoch": 3.451620725891197, + "grad_norm": 1.1012897491455078, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 21350 + }, + { + "epoch": 3.4532374100719423, + "grad_norm": 0.8040637373924255, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 21360 + }, + { + "epoch": 3.4548540942526875, + "grad_norm": 0.8497496247291565, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 21370 + }, + { + "epoch": 3.456470778433433, + "grad_norm": 0.8429915904998779, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 21380 + }, + { + "epoch": 3.4580874626141784, + "grad_norm": 0.8107112646102905, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 21390 + }, + { + "epoch": 3.4597041467949237, + "grad_norm": 1.00872004032135, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 21400 + }, + { + "epoch": 3.461320830975669, + "grad_norm": 0.8266542553901672, + "learning_rate": 0.0002, + "loss": 0.5712, + "step": 21410 + }, + { + "epoch": 3.462937515156414, + "grad_norm": 0.8972568511962891, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 21420 + }, + { + "epoch": 3.4645541993371594, + "grad_norm": 1.0781476497650146, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 21430 + }, + { + "epoch": 3.4661708835179046, + "grad_norm": 0.9571592807769775, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 21440 + }, + { + "epoch": 3.4677875676986503, + "grad_norm": 0.881547212600708, + "learning_rate": 0.0002, + "loss": 0.6309, + "step": 21450 + }, + { + "epoch": 3.4694042518793955, + "grad_norm": 0.6955338716506958, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 21460 + }, + { + "epoch": 3.4710209360601407, + "grad_norm": 0.901187539100647, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 21470 + }, + { + "epoch": 3.472637620240886, + "grad_norm": 0.7063511610031128, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 21480 + }, + { + "epoch": 3.474254304421631, + "grad_norm": 0.8462792038917542, + "learning_rate": 0.0002, + "loss": 0.6154, + "step": 21490 + }, + { + "epoch": 3.4758709886023764, + "grad_norm": 1.1861060857772827, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 21500 + }, + { + "epoch": 3.4774876727831217, + "grad_norm": 0.70503169298172, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 21510 + }, + { + "epoch": 3.479104356963867, + "grad_norm": 0.9650066494941711, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 21520 + }, + { + "epoch": 3.4807210411446126, + "grad_norm": 1.0266852378845215, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 21530 + }, + { + "epoch": 3.482337725325358, + "grad_norm": 0.956372857093811, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 21540 + }, + { + "epoch": 3.483954409506103, + "grad_norm": 0.8848432898521423, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 21550 + }, + { + "epoch": 3.4855710936868483, + "grad_norm": 1.0805351734161377, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 21560 + }, + { + "epoch": 3.4871877778675935, + "grad_norm": 0.9279725551605225, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 21570 + }, + { + "epoch": 3.4888044620483387, + "grad_norm": 0.9049562215805054, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 21580 + }, + { + "epoch": 3.4904211462290844, + "grad_norm": 0.9619429111480713, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 21590 + }, + { + "epoch": 3.4920378304098296, + "grad_norm": 0.8508906960487366, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 21600 + }, + { + "epoch": 3.493654514590575, + "grad_norm": 0.8692502379417419, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 21610 + }, + { + "epoch": 3.49527119877132, + "grad_norm": 0.8187332153320312, + "learning_rate": 0.0002, + "loss": 0.5878, + "step": 21620 + }, + { + "epoch": 3.4968878829520653, + "grad_norm": 1.145400047302246, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 21630 + }, + { + "epoch": 3.4985045671328105, + "grad_norm": 0.8281388282775879, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 21640 + }, + { + "epoch": 3.500121251313556, + "grad_norm": 0.82256019115448, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 21650 + }, + { + "epoch": 3.501737935494301, + "grad_norm": 0.9315484762191772, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 21660 + }, + { + "epoch": 3.5033546196750462, + "grad_norm": 0.7626111507415771, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 21670 + }, + { + "epoch": 3.504971303855792, + "grad_norm": 0.9275059103965759, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 21680 + }, + { + "epoch": 3.506587988036537, + "grad_norm": 0.7906724810600281, + "learning_rate": 0.0002, + "loss": 0.64, + "step": 21690 + }, + { + "epoch": 3.5082046722172824, + "grad_norm": 0.8289761543273926, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 21700 + }, + { + "epoch": 3.5098213563980276, + "grad_norm": 0.8316431045532227, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 21710 + }, + { + "epoch": 3.511438040578773, + "grad_norm": 1.0451812744140625, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 21720 + }, + { + "epoch": 3.513054724759518, + "grad_norm": 0.928252637386322, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 21730 + }, + { + "epoch": 3.5146714089402638, + "grad_norm": 0.7985895276069641, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 21740 + }, + { + "epoch": 3.516288093121009, + "grad_norm": 0.6740974187850952, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 21750 + }, + { + "epoch": 3.517904777301754, + "grad_norm": 0.8482223749160767, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 21760 + }, + { + "epoch": 3.5195214614824994, + "grad_norm": 0.889947772026062, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 21770 + }, + { + "epoch": 3.5211381456632447, + "grad_norm": 0.8304598927497864, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 21780 + }, + { + "epoch": 3.52275482984399, + "grad_norm": 0.8002981543540955, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 21790 + }, + { + "epoch": 3.524371514024735, + "grad_norm": 0.8115083575248718, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 21800 + }, + { + "epoch": 3.5259881982054804, + "grad_norm": 0.9715048670768738, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 21810 + }, + { + "epoch": 3.5276048823862256, + "grad_norm": 1.0910786390304565, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 21820 + }, + { + "epoch": 3.5292215665669713, + "grad_norm": 0.8438942432403564, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 21830 + }, + { + "epoch": 3.5308382507477165, + "grad_norm": 0.8813382983207703, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 21840 + }, + { + "epoch": 3.5324549349284617, + "grad_norm": 0.7092908024787903, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 21850 + }, + { + "epoch": 3.534071619109207, + "grad_norm": 0.8332187533378601, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 21860 + }, + { + "epoch": 3.535688303289952, + "grad_norm": 0.8958209156990051, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 21870 + }, + { + "epoch": 3.5373049874706974, + "grad_norm": 0.824138879776001, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 21880 + }, + { + "epoch": 3.538921671651443, + "grad_norm": 0.8375158309936523, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 21890 + }, + { + "epoch": 3.5405383558321883, + "grad_norm": 1.0274608135223389, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 21900 + }, + { + "epoch": 3.5421550400129336, + "grad_norm": 0.7088932394981384, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 21910 + }, + { + "epoch": 3.543771724193679, + "grad_norm": 0.8172445297241211, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 21920 + }, + { + "epoch": 3.545388408374424, + "grad_norm": 0.9904135465621948, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 21930 + }, + { + "epoch": 3.5470050925551693, + "grad_norm": 0.9900432229042053, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 21940 + }, + { + "epoch": 3.5486217767359145, + "grad_norm": 0.8963301181793213, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 21950 + }, + { + "epoch": 3.5502384609166597, + "grad_norm": 0.8551464676856995, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 21960 + }, + { + "epoch": 3.551855145097405, + "grad_norm": 1.0916603803634644, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 21970 + }, + { + "epoch": 3.5534718292781506, + "grad_norm": 0.841598391532898, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 21980 + }, + { + "epoch": 3.555088513458896, + "grad_norm": 0.8566757440567017, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 21990 + }, + { + "epoch": 3.556705197639641, + "grad_norm": 1.0145052671432495, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 22000 + }, + { + "epoch": 3.5583218818203863, + "grad_norm": 0.9293754696846008, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 22010 + }, + { + "epoch": 3.5599385660011316, + "grad_norm": 0.9568536281585693, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 22020 + }, + { + "epoch": 3.5615552501818772, + "grad_norm": 0.8613139986991882, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 22030 + }, + { + "epoch": 3.5631719343626225, + "grad_norm": 0.8179237246513367, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 22040 + }, + { + "epoch": 3.5647886185433677, + "grad_norm": 0.9059830904006958, + "learning_rate": 0.0002, + "loss": 0.574, + "step": 22050 + }, + { + "epoch": 3.566405302724113, + "grad_norm": 1.0068252086639404, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 22060 + }, + { + "epoch": 3.568021986904858, + "grad_norm": 0.9682072997093201, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 22070 + }, + { + "epoch": 3.5696386710856034, + "grad_norm": 0.8514005541801453, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 22080 + }, + { + "epoch": 3.5712553552663486, + "grad_norm": 0.8327770829200745, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 22090 + }, + { + "epoch": 3.572872039447094, + "grad_norm": 1.024976372718811, + "learning_rate": 0.0002, + "loss": 0.5976, + "step": 22100 + }, + { + "epoch": 3.574488723627839, + "grad_norm": 0.7721174955368042, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 22110 + }, + { + "epoch": 3.5761054078085843, + "grad_norm": 1.0351054668426514, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 22120 + }, + { + "epoch": 3.57772209198933, + "grad_norm": 0.9680907130241394, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 22130 + }, + { + "epoch": 3.5793387761700752, + "grad_norm": 0.8016974925994873, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 22140 + }, + { + "epoch": 3.5809554603508205, + "grad_norm": 1.0109003782272339, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 22150 + }, + { + "epoch": 3.5825721445315657, + "grad_norm": 1.0473392009735107, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 22160 + }, + { + "epoch": 3.584188828712311, + "grad_norm": 0.8686613440513611, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 22170 + }, + { + "epoch": 3.5858055128930566, + "grad_norm": 0.869149923324585, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 22180 + }, + { + "epoch": 3.587422197073802, + "grad_norm": 0.9769062995910645, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 22190 + }, + { + "epoch": 3.589038881254547, + "grad_norm": 0.779636561870575, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 22200 + }, + { + "epoch": 3.5906555654352923, + "grad_norm": 0.9063841104507446, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 22210 + }, + { + "epoch": 3.5922722496160375, + "grad_norm": 0.9216037392616272, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 22220 + }, + { + "epoch": 3.5938889337967828, + "grad_norm": 1.0217336416244507, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 22230 + }, + { + "epoch": 3.595505617977528, + "grad_norm": 0.8513161540031433, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 22240 + }, + { + "epoch": 3.597122302158273, + "grad_norm": 0.8084813952445984, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 22250 + }, + { + "epoch": 3.5987389863390185, + "grad_norm": 0.8524802923202515, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 22260 + }, + { + "epoch": 3.600355670519764, + "grad_norm": 0.9356237649917603, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 22270 + }, + { + "epoch": 3.6019723547005094, + "grad_norm": 1.009600281715393, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 22280 + }, + { + "epoch": 3.6035890388812546, + "grad_norm": 0.9900581240653992, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 22290 + }, + { + "epoch": 3.605205723062, + "grad_norm": 1.062495231628418, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 22300 + }, + { + "epoch": 3.606822407242745, + "grad_norm": 0.8832381367683411, + "learning_rate": 0.0002, + "loss": 0.607, + "step": 22310 + }, + { + "epoch": 3.6084390914234903, + "grad_norm": 0.9284297823905945, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 22320 + }, + { + "epoch": 3.610055775604236, + "grad_norm": 1.2381829023361206, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 22330 + }, + { + "epoch": 3.611672459784981, + "grad_norm": 0.929434597492218, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 22340 + }, + { + "epoch": 3.6132891439657264, + "grad_norm": 0.9714490175247192, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 22350 + }, + { + "epoch": 3.6149058281464717, + "grad_norm": 0.808014988899231, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 22360 + }, + { + "epoch": 3.616522512327217, + "grad_norm": 1.0364398956298828, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 22370 + }, + { + "epoch": 3.618139196507962, + "grad_norm": 0.7858489751815796, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 22380 + }, + { + "epoch": 3.6197558806887074, + "grad_norm": 0.9920870065689087, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 22390 + }, + { + "epoch": 3.6213725648694526, + "grad_norm": 0.9183220863342285, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 22400 + }, + { + "epoch": 3.622989249050198, + "grad_norm": 0.9826246500015259, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 22410 + }, + { + "epoch": 3.6246059332309435, + "grad_norm": 0.8632931113243103, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 22420 + }, + { + "epoch": 3.6262226174116887, + "grad_norm": 0.8468965291976929, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 22430 + }, + { + "epoch": 3.627839301592434, + "grad_norm": 0.8466871976852417, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 22440 + }, + { + "epoch": 3.629455985773179, + "grad_norm": 0.9501169919967651, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 22450 + }, + { + "epoch": 3.6310726699539244, + "grad_norm": 0.8906720876693726, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 22460 + }, + { + "epoch": 3.6326893541346696, + "grad_norm": 0.7400227189064026, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 22470 + }, + { + "epoch": 3.6343060383154153, + "grad_norm": 0.9756355881690979, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 22480 + }, + { + "epoch": 3.6359227224961606, + "grad_norm": 0.7504993081092834, + "learning_rate": 0.0002, + "loss": 0.6203, + "step": 22490 + }, + { + "epoch": 3.637539406676906, + "grad_norm": 0.9270039200782776, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 22500 + }, + { + "epoch": 3.639156090857651, + "grad_norm": 0.8841686844825745, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 22510 + }, + { + "epoch": 3.6407727750383962, + "grad_norm": 0.8533213138580322, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 22520 + }, + { + "epoch": 3.6423894592191415, + "grad_norm": 1.0052043199539185, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 22530 + }, + { + "epoch": 3.6440061433998867, + "grad_norm": 1.0323461294174194, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 22540 + }, + { + "epoch": 3.645622827580632, + "grad_norm": 0.8654312491416931, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 22550 + }, + { + "epoch": 3.647239511761377, + "grad_norm": 0.6400038003921509, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 22560 + }, + { + "epoch": 3.648856195942123, + "grad_norm": 0.8061298727989197, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 22570 + }, + { + "epoch": 3.650472880122868, + "grad_norm": 0.9257854223251343, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 22580 + }, + { + "epoch": 3.6520895643036133, + "grad_norm": 0.8439396619796753, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 22590 + }, + { + "epoch": 3.6537062484843585, + "grad_norm": 0.7764544486999512, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 22600 + }, + { + "epoch": 3.6553229326651038, + "grad_norm": 1.125451683998108, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 22610 + }, + { + "epoch": 3.656939616845849, + "grad_norm": 0.7523018717765808, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 22620 + }, + { + "epoch": 3.6585563010265947, + "grad_norm": 1.071026086807251, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 22630 + }, + { + "epoch": 3.66017298520734, + "grad_norm": 0.945791482925415, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 22640 + }, + { + "epoch": 3.661789669388085, + "grad_norm": 0.8001811504364014, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 22650 + }, + { + "epoch": 3.6634063535688304, + "grad_norm": 0.9700816869735718, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 22660 + }, + { + "epoch": 3.6650230377495756, + "grad_norm": 0.9053242206573486, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 22670 + }, + { + "epoch": 3.666639721930321, + "grad_norm": 0.944362461566925, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 22680 + }, + { + "epoch": 3.668256406111066, + "grad_norm": 1.067489504814148, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 22690 + }, + { + "epoch": 3.6698730902918113, + "grad_norm": 1.0984995365142822, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 22700 + }, + { + "epoch": 3.6714897744725565, + "grad_norm": 0.9336317777633667, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 22710 + }, + { + "epoch": 3.673106458653302, + "grad_norm": 0.9261918663978577, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 22720 + }, + { + "epoch": 3.6747231428340474, + "grad_norm": 0.8648008704185486, + "learning_rate": 0.0002, + "loss": 0.6332, + "step": 22730 + }, + { + "epoch": 3.6763398270147927, + "grad_norm": 0.7225083708763123, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 22740 + }, + { + "epoch": 3.677956511195538, + "grad_norm": 0.9258282780647278, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 22750 + }, + { + "epoch": 3.679573195376283, + "grad_norm": 0.70876145362854, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 22760 + }, + { + "epoch": 3.681189879557029, + "grad_norm": 0.8780210018157959, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 22770 + }, + { + "epoch": 3.682806563737774, + "grad_norm": 0.8075440526008606, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 22780 + }, + { + "epoch": 3.6844232479185193, + "grad_norm": 0.8503130674362183, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 22790 + }, + { + "epoch": 3.6860399320992645, + "grad_norm": 0.8413618206977844, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 22800 + }, + { + "epoch": 3.6876566162800097, + "grad_norm": 0.8675165176391602, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 22810 + }, + { + "epoch": 3.689273300460755, + "grad_norm": 0.8235884308815002, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 22820 + }, + { + "epoch": 3.6908899846415, + "grad_norm": 0.9477725625038147, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 22830 + }, + { + "epoch": 3.6925066688222454, + "grad_norm": 0.7883533835411072, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 22840 + }, + { + "epoch": 3.6941233530029907, + "grad_norm": 1.047913908958435, + "learning_rate": 0.0002, + "loss": 0.5872, + "step": 22850 + }, + { + "epoch": 3.695740037183736, + "grad_norm": 0.9171528816223145, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 22860 + }, + { + "epoch": 3.6973567213644816, + "grad_norm": 0.9338192343711853, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 22870 + }, + { + "epoch": 3.698973405545227, + "grad_norm": 0.8799443244934082, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 22880 + }, + { + "epoch": 3.700590089725972, + "grad_norm": 0.8515434861183167, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 22890 + }, + { + "epoch": 3.7022067739067173, + "grad_norm": 0.7805591821670532, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 22900 + }, + { + "epoch": 3.7038234580874625, + "grad_norm": 0.8470911979675293, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 22910 + }, + { + "epoch": 3.705440142268208, + "grad_norm": 0.9452309012413025, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 22920 + }, + { + "epoch": 3.7070568264489534, + "grad_norm": 0.950243353843689, + "learning_rate": 0.0002, + "loss": 0.6529, + "step": 22930 + }, + { + "epoch": 3.7086735106296986, + "grad_norm": 0.7882499098777771, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 22940 + }, + { + "epoch": 3.710290194810444, + "grad_norm": 0.8307787775993347, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 22950 + }, + { + "epoch": 3.711906878991189, + "grad_norm": 1.0970630645751953, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 22960 + }, + { + "epoch": 3.7135235631719343, + "grad_norm": 0.8269566297531128, + "learning_rate": 0.0002, + "loss": 0.6281, + "step": 22970 + }, + { + "epoch": 3.7151402473526796, + "grad_norm": 0.8306704759597778, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 22980 + }, + { + "epoch": 3.716756931533425, + "grad_norm": 0.9710225462913513, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 22990 + }, + { + "epoch": 3.71837361571417, + "grad_norm": 0.8890530467033386, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 23000 + }, + { + "epoch": 3.7199902998949153, + "grad_norm": 0.883522629737854, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 23010 + }, + { + "epoch": 3.721606984075661, + "grad_norm": 0.8662652373313904, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 23020 + }, + { + "epoch": 3.723223668256406, + "grad_norm": 0.7228406667709351, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 23030 + }, + { + "epoch": 3.7248403524371514, + "grad_norm": 1.060792088508606, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 23040 + }, + { + "epoch": 3.7264570366178966, + "grad_norm": 1.0119613409042358, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 23050 + }, + { + "epoch": 3.728073720798642, + "grad_norm": 0.9212996959686279, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 23060 + }, + { + "epoch": 3.7296904049793875, + "grad_norm": 0.925690233707428, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 23070 + }, + { + "epoch": 3.7313070891601328, + "grad_norm": 0.8323310613632202, + "learning_rate": 0.0002, + "loss": 0.615, + "step": 23080 + }, + { + "epoch": 3.732923773340878, + "grad_norm": 0.8966048955917358, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 23090 + }, + { + "epoch": 3.7345404575216232, + "grad_norm": 0.8995837569236755, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 23100 + }, + { + "epoch": 3.7361571417023685, + "grad_norm": 0.8748890161514282, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 23110 + }, + { + "epoch": 3.7377738258831137, + "grad_norm": 0.7985540628433228, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 23120 + }, + { + "epoch": 3.739390510063859, + "grad_norm": 1.0240917205810547, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 23130 + }, + { + "epoch": 3.741007194244604, + "grad_norm": 0.9181789755821228, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 23140 + }, + { + "epoch": 3.7426238784253494, + "grad_norm": 0.8896583914756775, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 23150 + }, + { + "epoch": 3.744240562606095, + "grad_norm": 0.8635515570640564, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 23160 + }, + { + "epoch": 3.7458572467868403, + "grad_norm": 0.8873575329780579, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 23170 + }, + { + "epoch": 3.7474739309675855, + "grad_norm": 0.9807148575782776, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 23180 + }, + { + "epoch": 3.7490906151483308, + "grad_norm": 0.900477945804596, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 23190 + }, + { + "epoch": 3.750707299329076, + "grad_norm": 0.9379992485046387, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 23200 + }, + { + "epoch": 3.752323983509821, + "grad_norm": 0.9649890661239624, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 23210 + }, + { + "epoch": 3.753940667690567, + "grad_norm": 0.824442446231842, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 23220 + }, + { + "epoch": 3.755557351871312, + "grad_norm": 0.8896150588989258, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 23230 + }, + { + "epoch": 3.7571740360520574, + "grad_norm": 0.751249372959137, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 23240 + }, + { + "epoch": 3.7587907202328026, + "grad_norm": 0.9392193555831909, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 23250 + }, + { + "epoch": 3.760407404413548, + "grad_norm": 0.9284586310386658, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 23260 + }, + { + "epoch": 3.762024088594293, + "grad_norm": 0.7738175392150879, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 23270 + }, + { + "epoch": 3.7636407727750383, + "grad_norm": 0.9252978563308716, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 23280 + }, + { + "epoch": 3.7652574569557835, + "grad_norm": 0.9501895904541016, + "learning_rate": 0.0002, + "loss": 0.5984, + "step": 23290 + }, + { + "epoch": 3.7668741411365287, + "grad_norm": 0.9416276216506958, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 23300 + }, + { + "epoch": 3.7684908253172744, + "grad_norm": 0.7076631784439087, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 23310 + }, + { + "epoch": 3.7701075094980196, + "grad_norm": 0.9864492416381836, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 23320 + }, + { + "epoch": 3.771724193678765, + "grad_norm": 0.8450456261634827, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 23330 + }, + { + "epoch": 3.77334087785951, + "grad_norm": 1.0768941640853882, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 23340 + }, + { + "epoch": 3.7749575620402553, + "grad_norm": 0.9956819415092468, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 23350 + }, + { + "epoch": 3.7765742462210006, + "grad_norm": 0.9234658479690552, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 23360 + }, + { + "epoch": 3.7781909304017463, + "grad_norm": 1.0993858575820923, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 23370 + }, + { + "epoch": 3.7798076145824915, + "grad_norm": 0.923159658908844, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 23380 + }, + { + "epoch": 3.7814242987632367, + "grad_norm": 0.9311541318893433, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 23390 + }, + { + "epoch": 3.783040982943982, + "grad_norm": 0.919681191444397, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 23400 + }, + { + "epoch": 3.784657667124727, + "grad_norm": 1.7406195402145386, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 23410 + }, + { + "epoch": 3.7862743513054724, + "grad_norm": 0.7789074182510376, + "learning_rate": 0.0002, + "loss": 0.657, + "step": 23420 + }, + { + "epoch": 3.7878910354862176, + "grad_norm": 0.8302814960479736, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 23430 + }, + { + "epoch": 3.789507719666963, + "grad_norm": 0.8089349269866943, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 23440 + }, + { + "epoch": 3.791124403847708, + "grad_norm": 0.9006284475326538, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 23450 + }, + { + "epoch": 3.7927410880284538, + "grad_norm": 0.8426766991615295, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 23460 + }, + { + "epoch": 3.794357772209199, + "grad_norm": 1.2576252222061157, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 23470 + }, + { + "epoch": 3.7959744563899442, + "grad_norm": 1.0307610034942627, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 23480 + }, + { + "epoch": 3.7975911405706895, + "grad_norm": 0.8525972962379456, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 23490 + }, + { + "epoch": 3.7992078247514347, + "grad_norm": 1.159039855003357, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 23500 + }, + { + "epoch": 3.80082450893218, + "grad_norm": 1.4193549156188965, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 23510 + }, + { + "epoch": 3.8024411931129256, + "grad_norm": 0.8245543837547302, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 23520 + }, + { + "epoch": 3.804057877293671, + "grad_norm": 0.8847230076789856, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 23530 + }, + { + "epoch": 3.805674561474416, + "grad_norm": 0.9574624300003052, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 23540 + }, + { + "epoch": 3.8072912456551613, + "grad_norm": 1.048020601272583, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 23550 + }, + { + "epoch": 3.8089079298359065, + "grad_norm": 0.8302255868911743, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 23560 + }, + { + "epoch": 3.8105246140166518, + "grad_norm": 0.8269215822219849, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 23570 + }, + { + "epoch": 3.812141298197397, + "grad_norm": 0.9375753402709961, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 23580 + }, + { + "epoch": 3.8137579823781422, + "grad_norm": 1.0234097242355347, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 23590 + }, + { + "epoch": 3.8153746665588875, + "grad_norm": 0.8978445529937744, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 23600 + }, + { + "epoch": 3.816991350739633, + "grad_norm": 0.7929515838623047, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 23610 + }, + { + "epoch": 3.8186080349203784, + "grad_norm": 1.3255881071090698, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 23620 + }, + { + "epoch": 3.8202247191011236, + "grad_norm": 0.9188598990440369, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 23630 + }, + { + "epoch": 3.821841403281869, + "grad_norm": 0.8811675906181335, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 23640 + }, + { + "epoch": 3.823458087462614, + "grad_norm": 0.8061038255691528, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 23650 + }, + { + "epoch": 3.8250747716433597, + "grad_norm": 0.9975376129150391, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 23660 + }, + { + "epoch": 3.826691455824105, + "grad_norm": 0.8036105036735535, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 23670 + }, + { + "epoch": 3.82830814000485, + "grad_norm": 0.7401984333992004, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 23680 + }, + { + "epoch": 3.8299248241855954, + "grad_norm": 0.829753041267395, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 23690 + }, + { + "epoch": 3.8315415083663407, + "grad_norm": 0.8753240704536438, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 23700 + }, + { + "epoch": 3.833158192547086, + "grad_norm": 0.8157842755317688, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 23710 + }, + { + "epoch": 3.834774876727831, + "grad_norm": 0.6183798909187317, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 23720 + }, + { + "epoch": 3.8363915609085764, + "grad_norm": 0.9548442363739014, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 23730 + }, + { + "epoch": 3.8380082450893216, + "grad_norm": 0.8319669961929321, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 23740 + }, + { + "epoch": 3.839624929270067, + "grad_norm": 0.9718693494796753, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 23750 + }, + { + "epoch": 3.8412416134508125, + "grad_norm": 0.8672235012054443, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 23760 + }, + { + "epoch": 3.8428582976315577, + "grad_norm": 1.1210707426071167, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 23770 + }, + { + "epoch": 3.844474981812303, + "grad_norm": 0.9177767634391785, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 23780 + }, + { + "epoch": 3.846091665993048, + "grad_norm": 0.8714171648025513, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 23790 + }, + { + "epoch": 3.8477083501737934, + "grad_norm": 1.1853246688842773, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 23800 + }, + { + "epoch": 3.849325034354539, + "grad_norm": 0.8091260194778442, + "learning_rate": 0.0002, + "loss": 0.6144, + "step": 23810 + }, + { + "epoch": 3.8509417185352843, + "grad_norm": 0.9710774421691895, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 23820 + }, + { + "epoch": 3.8525584027160296, + "grad_norm": 0.7648707628250122, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 23830 + }, + { + "epoch": 3.854175086896775, + "grad_norm": 0.7809253931045532, + "learning_rate": 0.0002, + "loss": 0.6013, + "step": 23840 + }, + { + "epoch": 3.85579177107752, + "grad_norm": 0.8337951898574829, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 23850 + }, + { + "epoch": 3.8574084552582653, + "grad_norm": 0.9271913170814514, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 23860 + }, + { + "epoch": 3.8590251394390105, + "grad_norm": 0.985334038734436, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 23870 + }, + { + "epoch": 3.8606418236197557, + "grad_norm": 0.8458583354949951, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 23880 + }, + { + "epoch": 3.862258507800501, + "grad_norm": 1.015348196029663, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 23890 + }, + { + "epoch": 3.8638751919812466, + "grad_norm": 1.0121688842773438, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 23900 + }, + { + "epoch": 3.865491876161992, + "grad_norm": 0.8883971571922302, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 23910 + }, + { + "epoch": 3.867108560342737, + "grad_norm": 1.028086543083191, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 23920 + }, + { + "epoch": 3.8687252445234823, + "grad_norm": 0.9645734429359436, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 23930 + }, + { + "epoch": 3.8703419287042276, + "grad_norm": 0.8235350251197815, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 23940 + }, + { + "epoch": 3.871958612884973, + "grad_norm": 1.0298916101455688, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 23950 + }, + { + "epoch": 3.8735752970657185, + "grad_norm": 1.0063377618789673, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 23960 + }, + { + "epoch": 3.8751919812464637, + "grad_norm": 0.9230626821517944, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 23970 + }, + { + "epoch": 3.876808665427209, + "grad_norm": 0.9243063926696777, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 23980 + }, + { + "epoch": 3.878425349607954, + "grad_norm": 1.0211291313171387, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 23990 + }, + { + "epoch": 3.8800420337886994, + "grad_norm": 0.7800535559654236, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 24000 + }, + { + "epoch": 3.8816587179694446, + "grad_norm": 0.7904248833656311, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 24010 + }, + { + "epoch": 3.88327540215019, + "grad_norm": 1.1975988149642944, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 24020 + }, + { + "epoch": 3.884892086330935, + "grad_norm": 1.0626593828201294, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 24030 + }, + { + "epoch": 3.8865087705116803, + "grad_norm": 0.9012193083763123, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 24040 + }, + { + "epoch": 3.888125454692426, + "grad_norm": 1.1159172058105469, + "learning_rate": 0.0002, + "loss": 0.6786, + "step": 24050 + }, + { + "epoch": 3.889742138873171, + "grad_norm": 1.276838779449463, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 24060 + }, + { + "epoch": 3.8913588230539164, + "grad_norm": 0.8467690348625183, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 24070 + }, + { + "epoch": 3.8929755072346617, + "grad_norm": 0.9862841963768005, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 24080 + }, + { + "epoch": 3.894592191415407, + "grad_norm": 0.7134621739387512, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 24090 + }, + { + "epoch": 3.896208875596152, + "grad_norm": 0.8178175091743469, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 24100 + }, + { + "epoch": 3.897825559776898, + "grad_norm": 0.9229172468185425, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 24110 + }, + { + "epoch": 3.899442243957643, + "grad_norm": 1.0878316164016724, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 24120 + }, + { + "epoch": 3.9010589281383883, + "grad_norm": 0.971645712852478, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 24130 + }, + { + "epoch": 3.9026756123191335, + "grad_norm": 0.8862188458442688, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 24140 + }, + { + "epoch": 3.9042922964998787, + "grad_norm": 0.9126982688903809, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 24150 + }, + { + "epoch": 3.905908980680624, + "grad_norm": 0.8833470940589905, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 24160 + }, + { + "epoch": 3.907525664861369, + "grad_norm": 0.8320947885513306, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 24170 + }, + { + "epoch": 3.9091423490421144, + "grad_norm": 0.9156602025032043, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 24180 + }, + { + "epoch": 3.9107590332228597, + "grad_norm": 1.029181957244873, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 24190 + }, + { + "epoch": 3.9123757174036053, + "grad_norm": 0.9052802324295044, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 24200 + }, + { + "epoch": 3.9139924015843506, + "grad_norm": 0.8847255110740662, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 24210 + }, + { + "epoch": 3.915609085765096, + "grad_norm": 0.9642062187194824, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 24220 + }, + { + "epoch": 3.917225769945841, + "grad_norm": 0.8629093766212463, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 24230 + }, + { + "epoch": 3.9188424541265863, + "grad_norm": 0.8674976825714111, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 24240 + }, + { + "epoch": 3.9204591383073315, + "grad_norm": 1.104846477508545, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 24250 + }, + { + "epoch": 3.922075822488077, + "grad_norm": 1.0874955654144287, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 24260 + }, + { + "epoch": 3.9236925066688224, + "grad_norm": 0.8689812421798706, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 24270 + }, + { + "epoch": 3.9253091908495676, + "grad_norm": 0.9724617004394531, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 24280 + }, + { + "epoch": 3.926925875030313, + "grad_norm": 0.9165538549423218, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 24290 + }, + { + "epoch": 3.928542559211058, + "grad_norm": 0.9307710528373718, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 24300 + }, + { + "epoch": 3.9301592433918033, + "grad_norm": 0.8589295148849487, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 24310 + }, + { + "epoch": 3.9317759275725486, + "grad_norm": 0.9151099920272827, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 24320 + }, + { + "epoch": 3.933392611753294, + "grad_norm": 0.9633517265319824, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 24330 + }, + { + "epoch": 3.935009295934039, + "grad_norm": 0.9521116018295288, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 24340 + }, + { + "epoch": 3.9366259801147847, + "grad_norm": 0.8366776704788208, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 24350 + }, + { + "epoch": 3.93824266429553, + "grad_norm": 0.8972663283348083, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 24360 + }, + { + "epoch": 3.939859348476275, + "grad_norm": 0.8102919459342957, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 24370 + }, + { + "epoch": 3.9414760326570204, + "grad_norm": 0.8189975023269653, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 24380 + }, + { + "epoch": 3.9430927168377656, + "grad_norm": 0.9569464921951294, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 24390 + }, + { + "epoch": 3.9447094010185113, + "grad_norm": 0.7459101676940918, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 24400 + }, + { + "epoch": 3.9463260851992565, + "grad_norm": 0.8536974787712097, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 24410 + }, + { + "epoch": 3.9479427693800018, + "grad_norm": 0.8763698935508728, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 24420 + }, + { + "epoch": 3.949559453560747, + "grad_norm": 0.9381106495857239, + "learning_rate": 0.0002, + "loss": 0.6478, + "step": 24430 + }, + { + "epoch": 3.9511761377414922, + "grad_norm": 0.934440016746521, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 24440 + }, + { + "epoch": 3.9527928219222375, + "grad_norm": 0.903918981552124, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 24450 + }, + { + "epoch": 3.9544095061029827, + "grad_norm": 0.8771953582763672, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 24460 + }, + { + "epoch": 3.956026190283728, + "grad_norm": 1.0375410318374634, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 24470 + }, + { + "epoch": 3.957642874464473, + "grad_norm": 0.9439185261726379, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 24480 + }, + { + "epoch": 3.9592595586452184, + "grad_norm": 0.935467004776001, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 24490 + }, + { + "epoch": 3.960876242825964, + "grad_norm": 0.6900772452354431, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 24500 + }, + { + "epoch": 3.9624929270067093, + "grad_norm": 1.0172916650772095, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 24510 + }, + { + "epoch": 3.9641096111874545, + "grad_norm": 0.9167046546936035, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 24520 + }, + { + "epoch": 3.9657262953681998, + "grad_norm": 0.7230527997016907, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 24530 + }, + { + "epoch": 3.967342979548945, + "grad_norm": 0.8980403542518616, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 24540 + }, + { + "epoch": 3.9689596637296907, + "grad_norm": 0.8555465936660767, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 24550 + }, + { + "epoch": 3.970576347910436, + "grad_norm": 0.7825445532798767, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 24560 + }, + { + "epoch": 3.972193032091181, + "grad_norm": 0.7273133993148804, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 24570 + }, + { + "epoch": 3.9738097162719264, + "grad_norm": 0.9612047672271729, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 24580 + }, + { + "epoch": 3.9754264004526716, + "grad_norm": 0.9865460991859436, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 24590 + }, + { + "epoch": 3.977043084633417, + "grad_norm": 0.8638762831687927, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 24600 + }, + { + "epoch": 3.978659768814162, + "grad_norm": 1.0096198320388794, + "learning_rate": 0.0002, + "loss": 0.6097, + "step": 24610 + }, + { + "epoch": 3.9802764529949073, + "grad_norm": 0.8475532531738281, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 24620 + }, + { + "epoch": 3.9818931371756525, + "grad_norm": 0.9696195721626282, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 24630 + }, + { + "epoch": 3.9835098213563978, + "grad_norm": 0.7499843239784241, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 24640 + }, + { + "epoch": 3.9851265055371434, + "grad_norm": 0.8865424990653992, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 24650 + }, + { + "epoch": 3.9867431897178887, + "grad_norm": 0.8089959025382996, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 24660 + }, + { + "epoch": 3.988359873898634, + "grad_norm": 0.6946012377738953, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 24670 + }, + { + "epoch": 3.989976558079379, + "grad_norm": 0.7991759181022644, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 24680 + }, + { + "epoch": 3.9915932422601244, + "grad_norm": 0.8803931474685669, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 24690 + }, + { + "epoch": 3.99320992644087, + "grad_norm": 0.8848299980163574, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 24700 + }, + { + "epoch": 3.9948266106216153, + "grad_norm": 0.7448889017105103, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 24710 + }, + { + "epoch": 3.9964432948023605, + "grad_norm": 0.9361620545387268, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 24720 + }, + { + "epoch": 3.9980599789831057, + "grad_norm": 0.9958081245422363, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 24730 + }, + { + "epoch": 3.999676663163851, + "grad_norm": 1.026004672050476, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 24740 + }, + { + "epoch": 4.0, + "eval_loss": 1.1524168252944946, + "eval_runtime": 122.1585, + "eval_samples_per_second": 6.0, + "eval_steps_per_second": 0.753, + "step": 24742 + }, + { + "epoch": 4.001293347344596, + "grad_norm": 1.0664808750152588, + "learning_rate": 0.0002, + "loss": 0.6057, + "step": 24750 + }, + { + "epoch": 4.002910031525341, + "grad_norm": 1.0113720893859863, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 24760 + }, + { + "epoch": 4.004526715706087, + "grad_norm": 0.991486668586731, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 24770 + }, + { + "epoch": 4.006143399886832, + "grad_norm": 0.951754629611969, + "learning_rate": 0.0002, + "loss": 0.508, + "step": 24780 + }, + { + "epoch": 4.007760084067577, + "grad_norm": 1.13059401512146, + "learning_rate": 0.0002, + "loss": 0.5314, + "step": 24790 + }, + { + "epoch": 4.009376768248322, + "grad_norm": 0.9343926310539246, + "learning_rate": 0.0002, + "loss": 0.5323, + "step": 24800 + }, + { + "epoch": 4.010993452429068, + "grad_norm": 1.0680590867996216, + "learning_rate": 0.0002, + "loss": 0.5161, + "step": 24810 + }, + { + "epoch": 4.012610136609814, + "grad_norm": 1.0022706985473633, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 24820 + }, + { + "epoch": 4.014226820790559, + "grad_norm": 1.0285297632217407, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 24830 + }, + { + "epoch": 4.015843504971304, + "grad_norm": 0.8347002863883972, + "learning_rate": 0.0002, + "loss": 0.5311, + "step": 24840 + }, + { + "epoch": 4.017460189152049, + "grad_norm": 0.9675396680831909, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 24850 + }, + { + "epoch": 4.019076873332795, + "grad_norm": 0.9238511323928833, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 24860 + }, + { + "epoch": 4.02069355751354, + "grad_norm": 1.1576941013336182, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 24870 + }, + { + "epoch": 4.022310241694285, + "grad_norm": 0.8583757281303406, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 24880 + }, + { + "epoch": 4.02392692587503, + "grad_norm": 0.9816817045211792, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 24890 + }, + { + "epoch": 4.0255436100557755, + "grad_norm": 0.955073893070221, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 24900 + }, + { + "epoch": 4.027160294236521, + "grad_norm": 1.1054974794387817, + "learning_rate": 0.0002, + "loss": 0.4896, + "step": 24910 + }, + { + "epoch": 4.028776978417266, + "grad_norm": 1.1240060329437256, + "learning_rate": 0.0002, + "loss": 0.5246, + "step": 24920 + }, + { + "epoch": 4.030393662598011, + "grad_norm": 0.9512825012207031, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 24930 + }, + { + "epoch": 4.0320103467787565, + "grad_norm": 0.85965496301651, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 24940 + }, + { + "epoch": 4.033627030959502, + "grad_norm": 0.9378061294555664, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 24950 + }, + { + "epoch": 4.035243715140247, + "grad_norm": 0.9655424356460571, + "learning_rate": 0.0002, + "loss": 0.5008, + "step": 24960 + }, + { + "epoch": 4.036860399320993, + "grad_norm": 1.1393707990646362, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 24970 + }, + { + "epoch": 4.038477083501738, + "grad_norm": 1.0220451354980469, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 24980 + }, + { + "epoch": 4.0400937676824835, + "grad_norm": 0.9785808324813843, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 24990 + }, + { + "epoch": 4.041710451863229, + "grad_norm": 1.0257649421691895, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 25000 + }, + { + "epoch": 4.043327136043974, + "grad_norm": 0.9737892150878906, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 25010 + }, + { + "epoch": 4.044943820224719, + "grad_norm": 0.7416959404945374, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 25020 + }, + { + "epoch": 4.046560504405464, + "grad_norm": 0.7909596562385559, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 25030 + }, + { + "epoch": 4.04817718858621, + "grad_norm": 0.8923130631446838, + "learning_rate": 0.0002, + "loss": 0.5265, + "step": 25040 + }, + { + "epoch": 4.049793872766955, + "grad_norm": 0.9044941663742065, + "learning_rate": 0.0002, + "loss": 0.5035, + "step": 25050 + }, + { + "epoch": 4.0514105569477, + "grad_norm": 0.866352379322052, + "learning_rate": 0.0002, + "loss": 0.5135, + "step": 25060 + }, + { + "epoch": 4.053027241128445, + "grad_norm": 1.544549822807312, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 25070 + }, + { + "epoch": 4.054643925309191, + "grad_norm": 0.8426995277404785, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 25080 + }, + { + "epoch": 4.056260609489936, + "grad_norm": 0.9797548651695251, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 25090 + }, + { + "epoch": 4.057877293670681, + "grad_norm": 0.8468434810638428, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 25100 + }, + { + "epoch": 4.059493977851426, + "grad_norm": 0.9294559955596924, + "learning_rate": 0.0002, + "loss": 0.5242, + "step": 25110 + }, + { + "epoch": 4.061110662032172, + "grad_norm": 0.9686688780784607, + "learning_rate": 0.0002, + "loss": 0.5295, + "step": 25120 + }, + { + "epoch": 4.062727346212918, + "grad_norm": 0.8042728304862976, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 25130 + }, + { + "epoch": 4.064344030393663, + "grad_norm": 1.165160894393921, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 25140 + }, + { + "epoch": 4.065960714574408, + "grad_norm": 1.2161961793899536, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 25150 + }, + { + "epoch": 4.067577398755153, + "grad_norm": 1.0762810707092285, + "learning_rate": 0.0002, + "loss": 0.5217, + "step": 25160 + }, + { + "epoch": 4.069194082935899, + "grad_norm": 0.7580869793891907, + "learning_rate": 0.0002, + "loss": 0.5886, + "step": 25170 + }, + { + "epoch": 4.070810767116644, + "grad_norm": 0.9630117416381836, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 25180 + }, + { + "epoch": 4.072427451297389, + "grad_norm": 0.9049716591835022, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 25190 + }, + { + "epoch": 4.074044135478134, + "grad_norm": 1.1536930799484253, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 25200 + }, + { + "epoch": 4.0756608196588795, + "grad_norm": 0.901461124420166, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 25210 + }, + { + "epoch": 4.077277503839625, + "grad_norm": 1.3318437337875366, + "learning_rate": 0.0002, + "loss": 0.5132, + "step": 25220 + }, + { + "epoch": 4.07889418802037, + "grad_norm": 0.8811455368995667, + "learning_rate": 0.0002, + "loss": 0.5317, + "step": 25230 + }, + { + "epoch": 4.080510872201115, + "grad_norm": 1.0564165115356445, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 25240 + }, + { + "epoch": 4.08212755638186, + "grad_norm": 1.1008027791976929, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 25250 + }, + { + "epoch": 4.083744240562606, + "grad_norm": 1.150097131729126, + "learning_rate": 0.0002, + "loss": 0.5195, + "step": 25260 + }, + { + "epoch": 4.085360924743352, + "grad_norm": 0.9339924454689026, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 25270 + }, + { + "epoch": 4.086977608924097, + "grad_norm": 1.0902045965194702, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 25280 + }, + { + "epoch": 4.088594293104842, + "grad_norm": 0.8483911156654358, + "learning_rate": 0.0002, + "loss": 0.5203, + "step": 25290 + }, + { + "epoch": 4.0902109772855875, + "grad_norm": 0.9477024674415588, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 25300 + }, + { + "epoch": 4.091827661466333, + "grad_norm": 0.9500215649604797, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 25310 + }, + { + "epoch": 4.093444345647078, + "grad_norm": 1.040468454360962, + "learning_rate": 0.0002, + "loss": 0.5045, + "step": 25320 + }, + { + "epoch": 4.095061029827823, + "grad_norm": 0.7457592487335205, + "learning_rate": 0.0002, + "loss": 0.5488, + "step": 25330 + }, + { + "epoch": 4.096677714008568, + "grad_norm": 1.2092097997665405, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 25340 + }, + { + "epoch": 4.098294398189314, + "grad_norm": 0.9652107954025269, + "learning_rate": 0.0002, + "loss": 0.5174, + "step": 25350 + }, + { + "epoch": 4.099911082370059, + "grad_norm": 0.8464955687522888, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 25360 + }, + { + "epoch": 4.101527766550804, + "grad_norm": 0.875026285648346, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 25370 + }, + { + "epoch": 4.103144450731549, + "grad_norm": 0.9241740107536316, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 25380 + }, + { + "epoch": 4.1047611349122946, + "grad_norm": 0.9769546389579773, + "learning_rate": 0.0002, + "loss": 0.5578, + "step": 25390 + }, + { + "epoch": 4.10637781909304, + "grad_norm": 1.1501960754394531, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 25400 + }, + { + "epoch": 4.107994503273786, + "grad_norm": 0.9135243892669678, + "learning_rate": 0.0002, + "loss": 0.5241, + "step": 25410 + }, + { + "epoch": 4.109611187454531, + "grad_norm": 0.9905396103858948, + "learning_rate": 0.0002, + "loss": 0.5152, + "step": 25420 + }, + { + "epoch": 4.111227871635276, + "grad_norm": 0.9845104217529297, + "learning_rate": 0.0002, + "loss": 0.5064, + "step": 25430 + }, + { + "epoch": 4.112844555816022, + "grad_norm": 0.8326883912086487, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 25440 + }, + { + "epoch": 4.114461239996767, + "grad_norm": 0.9264556765556335, + "learning_rate": 0.0002, + "loss": 0.5312, + "step": 25450 + }, + { + "epoch": 4.116077924177512, + "grad_norm": 1.043080449104309, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 25460 + }, + { + "epoch": 4.117694608358257, + "grad_norm": 0.8533386588096619, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 25470 + }, + { + "epoch": 4.1193112925390025, + "grad_norm": 1.0133965015411377, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 25480 + }, + { + "epoch": 4.120927976719748, + "grad_norm": 0.7476310133934021, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 25490 + }, + { + "epoch": 4.122544660900493, + "grad_norm": 1.1247259378433228, + "learning_rate": 0.0002, + "loss": 0.5189, + "step": 25500 + }, + { + "epoch": 4.124161345081238, + "grad_norm": 1.0764678716659546, + "learning_rate": 0.0002, + "loss": 0.5751, + "step": 25510 + }, + { + "epoch": 4.1257780292619834, + "grad_norm": 0.7679798007011414, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 25520 + }, + { + "epoch": 4.127394713442729, + "grad_norm": 0.8877071142196655, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 25530 + }, + { + "epoch": 4.129011397623474, + "grad_norm": 1.0440239906311035, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 25540 + }, + { + "epoch": 4.130628081804219, + "grad_norm": 0.984145998954773, + "learning_rate": 0.0002, + "loss": 0.5723, + "step": 25550 + }, + { + "epoch": 4.132244765984965, + "grad_norm": 0.8667055368423462, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 25560 + }, + { + "epoch": 4.1338614501657105, + "grad_norm": 1.1300835609436035, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 25570 + }, + { + "epoch": 4.135478134346456, + "grad_norm": 0.9314348101615906, + "learning_rate": 0.0002, + "loss": 0.524, + "step": 25580 + }, + { + "epoch": 4.137094818527201, + "grad_norm": 0.7731879949569702, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 25590 + }, + { + "epoch": 4.138711502707946, + "grad_norm": 1.0080097913742065, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 25600 + }, + { + "epoch": 4.140328186888691, + "grad_norm": 1.2475038766860962, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 25610 + }, + { + "epoch": 4.141944871069437, + "grad_norm": 0.9912930727005005, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 25620 + }, + { + "epoch": 4.143561555250182, + "grad_norm": 0.9088651537895203, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 25630 + }, + { + "epoch": 4.145178239430927, + "grad_norm": 0.8940697312355042, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 25640 + }, + { + "epoch": 4.146794923611672, + "grad_norm": 1.0798203945159912, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 25650 + }, + { + "epoch": 4.148411607792418, + "grad_norm": 0.955172061920166, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 25660 + }, + { + "epoch": 4.150028291973163, + "grad_norm": 0.9692716002464294, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 25670 + }, + { + "epoch": 4.151644976153908, + "grad_norm": 1.0813939571380615, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 25680 + }, + { + "epoch": 4.153261660334653, + "grad_norm": 1.135675072669983, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 25690 + }, + { + "epoch": 4.1548783445153985, + "grad_norm": 1.0392236709594727, + "learning_rate": 0.0002, + "loss": 0.5247, + "step": 25700 + }, + { + "epoch": 4.156495028696145, + "grad_norm": 0.9473116993904114, + "learning_rate": 0.0002, + "loss": 0.5204, + "step": 25710 + }, + { + "epoch": 4.15811171287689, + "grad_norm": 0.712493896484375, + "learning_rate": 0.0002, + "loss": 0.5339, + "step": 25720 + }, + { + "epoch": 4.159728397057635, + "grad_norm": 0.8724465370178223, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 25730 + }, + { + "epoch": 4.16134508123838, + "grad_norm": 0.9870015978813171, + "learning_rate": 0.0002, + "loss": 0.5325, + "step": 25740 + }, + { + "epoch": 4.1629617654191255, + "grad_norm": 1.025273084640503, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 25750 + }, + { + "epoch": 4.164578449599871, + "grad_norm": 0.9243090152740479, + "learning_rate": 0.0002, + "loss": 0.5223, + "step": 25760 + }, + { + "epoch": 4.166195133780616, + "grad_norm": 1.1656451225280762, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 25770 + }, + { + "epoch": 4.167811817961361, + "grad_norm": 0.936358630657196, + "learning_rate": 0.0002, + "loss": 0.5334, + "step": 25780 + }, + { + "epoch": 4.1694285021421065, + "grad_norm": 0.8618208169937134, + "learning_rate": 0.0002, + "loss": 0.5236, + "step": 25790 + }, + { + "epoch": 4.171045186322852, + "grad_norm": 0.8580600023269653, + "learning_rate": 0.0002, + "loss": 0.5186, + "step": 25800 + }, + { + "epoch": 4.172661870503597, + "grad_norm": 1.0128562450408936, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 25810 + }, + { + "epoch": 4.174278554684342, + "grad_norm": 0.854865312576294, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 25820 + }, + { + "epoch": 4.175895238865087, + "grad_norm": 1.235082745552063, + "learning_rate": 0.0002, + "loss": 0.5377, + "step": 25830 + }, + { + "epoch": 4.177511923045833, + "grad_norm": 0.9796220660209656, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 25840 + }, + { + "epoch": 4.179128607226578, + "grad_norm": 0.8922094702720642, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 25850 + }, + { + "epoch": 4.180745291407324, + "grad_norm": 0.9672530293464661, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 25860 + }, + { + "epoch": 4.182361975588069, + "grad_norm": 0.8662548661231995, + "learning_rate": 0.0002, + "loss": 0.5074, + "step": 25870 + }, + { + "epoch": 4.1839786597688144, + "grad_norm": 0.7938798069953918, + "learning_rate": 0.0002, + "loss": 0.5329, + "step": 25880 + }, + { + "epoch": 4.18559534394956, + "grad_norm": 1.0517958402633667, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 25890 + }, + { + "epoch": 4.187212028130305, + "grad_norm": 0.8939275145530701, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 25900 + }, + { + "epoch": 4.18882871231105, + "grad_norm": 1.0296672582626343, + "learning_rate": 0.0002, + "loss": 0.5199, + "step": 25910 + }, + { + "epoch": 4.190445396491795, + "grad_norm": 0.8104017972946167, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 25920 + }, + { + "epoch": 4.192062080672541, + "grad_norm": 0.9984509944915771, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 25930 + }, + { + "epoch": 4.193678764853286, + "grad_norm": 0.9844784736633301, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 25940 + }, + { + "epoch": 4.195295449034031, + "grad_norm": 0.8168622255325317, + "learning_rate": 0.0002, + "loss": 0.5198, + "step": 25950 + }, + { + "epoch": 4.196912133214776, + "grad_norm": 1.0878913402557373, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 25960 + }, + { + "epoch": 4.1985288173955215, + "grad_norm": 0.927126407623291, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 25970 + }, + { + "epoch": 4.200145501576267, + "grad_norm": 0.838586688041687, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 25980 + }, + { + "epoch": 4.201762185757012, + "grad_norm": 1.2572145462036133, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 25990 + }, + { + "epoch": 4.203378869937758, + "grad_norm": 1.0476740598678589, + "learning_rate": 0.0002, + "loss": 0.5048, + "step": 26000 + }, + { + "epoch": 4.204995554118503, + "grad_norm": 1.0873368978500366, + "learning_rate": 0.0002, + "loss": 0.5127, + "step": 26010 + }, + { + "epoch": 4.206612238299249, + "grad_norm": 1.2664896249771118, + "learning_rate": 0.0002, + "loss": 0.5679, + "step": 26020 + }, + { + "epoch": 4.208228922479994, + "grad_norm": 1.0312391519546509, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 26030 + }, + { + "epoch": 4.209845606660739, + "grad_norm": 1.0235042572021484, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 26040 + }, + { + "epoch": 4.211462290841484, + "grad_norm": 0.8882219195365906, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 26050 + }, + { + "epoch": 4.2130789750222295, + "grad_norm": 0.9115961790084839, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 26060 + }, + { + "epoch": 4.214695659202975, + "grad_norm": 1.0218228101730347, + "learning_rate": 0.0002, + "loss": 0.5455, + "step": 26070 + }, + { + "epoch": 4.21631234338372, + "grad_norm": 1.0802232027053833, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 26080 + }, + { + "epoch": 4.217929027564465, + "grad_norm": 1.1488053798675537, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 26090 + }, + { + "epoch": 4.21954571174521, + "grad_norm": 1.0487725734710693, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 26100 + }, + { + "epoch": 4.221162395925956, + "grad_norm": 0.9131165742874146, + "learning_rate": 0.0002, + "loss": 0.5568, + "step": 26110 + }, + { + "epoch": 4.222779080106701, + "grad_norm": 0.9012845158576965, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 26120 + }, + { + "epoch": 4.224395764287446, + "grad_norm": 0.8389840126037598, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 26130 + }, + { + "epoch": 4.226012448468191, + "grad_norm": 0.8924660682678223, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 26140 + }, + { + "epoch": 4.2276291326489375, + "grad_norm": 0.8556463718414307, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 26150 + }, + { + "epoch": 4.229245816829683, + "grad_norm": 0.9643129110336304, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 26160 + }, + { + "epoch": 4.230862501010428, + "grad_norm": 0.9865712523460388, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 26170 + }, + { + "epoch": 4.232479185191173, + "grad_norm": 1.152641773223877, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 26180 + }, + { + "epoch": 4.234095869371918, + "grad_norm": 0.9157698154449463, + "learning_rate": 0.0002, + "loss": 0.5632, + "step": 26190 + }, + { + "epoch": 4.235712553552664, + "grad_norm": 0.8418048620223999, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 26200 + }, + { + "epoch": 4.237329237733409, + "grad_norm": 0.9430168867111206, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 26210 + }, + { + "epoch": 4.238945921914154, + "grad_norm": 1.012582778930664, + "learning_rate": 0.0002, + "loss": 0.5574, + "step": 26220 + }, + { + "epoch": 4.240562606094899, + "grad_norm": 1.112619400024414, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 26230 + }, + { + "epoch": 4.2421792902756446, + "grad_norm": 0.9243621826171875, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 26240 + }, + { + "epoch": 4.24379597445639, + "grad_norm": 0.6977595686912537, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 26250 + }, + { + "epoch": 4.245412658637135, + "grad_norm": 0.9600721597671509, + "learning_rate": 0.0002, + "loss": 0.5379, + "step": 26260 + }, + { + "epoch": 4.24702934281788, + "grad_norm": 0.882641613483429, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 26270 + }, + { + "epoch": 4.2486460269986255, + "grad_norm": 1.010920763015747, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 26280 + }, + { + "epoch": 4.250262711179371, + "grad_norm": 0.9289400577545166, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 26290 + }, + { + "epoch": 4.251879395360117, + "grad_norm": 1.137397289276123, + "learning_rate": 0.0002, + "loss": 0.541, + "step": 26300 + }, + { + "epoch": 4.253496079540862, + "grad_norm": 1.0136182308197021, + "learning_rate": 0.0002, + "loss": 0.5204, + "step": 26310 + }, + { + "epoch": 4.255112763721607, + "grad_norm": 0.9387356042861938, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 26320 + }, + { + "epoch": 4.2567294479023525, + "grad_norm": 1.1833957433700562, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 26330 + }, + { + "epoch": 4.258346132083098, + "grad_norm": 0.9415934681892395, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 26340 + }, + { + "epoch": 4.259962816263843, + "grad_norm": 0.8550165891647339, + "learning_rate": 0.0002, + "loss": 0.5539, + "step": 26350 + }, + { + "epoch": 4.261579500444588, + "grad_norm": 9.924622535705566, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 26360 + }, + { + "epoch": 4.2631961846253335, + "grad_norm": 1.0104902982711792, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 26370 + }, + { + "epoch": 4.264812868806079, + "grad_norm": 0.890794038772583, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 26380 + }, + { + "epoch": 4.266429552986824, + "grad_norm": 1.0560191869735718, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 26390 + }, + { + "epoch": 4.268046237167569, + "grad_norm": 1.0135581493377686, + "learning_rate": 0.0002, + "loss": 0.5119, + "step": 26400 + }, + { + "epoch": 4.269662921348314, + "grad_norm": 1.1304140090942383, + "learning_rate": 0.0002, + "loss": 0.5359, + "step": 26410 + }, + { + "epoch": 4.27127960552906, + "grad_norm": 0.9899303913116455, + "learning_rate": 0.0002, + "loss": 0.5615, + "step": 26420 + }, + { + "epoch": 4.272896289709805, + "grad_norm": 1.0505329370498657, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 26430 + }, + { + "epoch": 4.27451297389055, + "grad_norm": 0.9389396905899048, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 26440 + }, + { + "epoch": 4.276129658071296, + "grad_norm": 0.875328779220581, + "learning_rate": 0.0002, + "loss": 0.5558, + "step": 26450 + }, + { + "epoch": 4.277746342252041, + "grad_norm": 1.0689256191253662, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 26460 + }, + { + "epoch": 4.279363026432787, + "grad_norm": 0.9988957643508911, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 26470 + }, + { + "epoch": 4.280979710613532, + "grad_norm": 0.8721813559532166, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 26480 + }, + { + "epoch": 4.282596394794277, + "grad_norm": 1.100109577178955, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 26490 + }, + { + "epoch": 4.284213078975022, + "grad_norm": 1.1607271432876587, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 26500 + }, + { + "epoch": 4.285829763155768, + "grad_norm": 0.879088819026947, + "learning_rate": 0.0002, + "loss": 0.6287, + "step": 26510 + }, + { + "epoch": 4.287446447336513, + "grad_norm": 0.9891700744628906, + "learning_rate": 0.0002, + "loss": 0.573, + "step": 26520 + }, + { + "epoch": 4.289063131517258, + "grad_norm": 1.0831127166748047, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 26530 + }, + { + "epoch": 4.290679815698003, + "grad_norm": 1.4108285903930664, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 26540 + }, + { + "epoch": 4.2922964998787485, + "grad_norm": 1.0630289316177368, + "learning_rate": 0.0002, + "loss": 0.5888, + "step": 26550 + }, + { + "epoch": 4.293913184059494, + "grad_norm": 1.0854572057724, + "learning_rate": 0.0002, + "loss": 0.5817, + "step": 26560 + }, + { + "epoch": 4.295529868240239, + "grad_norm": 0.9561646580696106, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 26570 + }, + { + "epoch": 4.297146552420984, + "grad_norm": 0.9064981937408447, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 26580 + }, + { + "epoch": 4.298763236601729, + "grad_norm": 1.0082972049713135, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 26590 + }, + { + "epoch": 4.3003799207824756, + "grad_norm": 1.1613214015960693, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 26600 + }, + { + "epoch": 4.301996604963221, + "grad_norm": 0.9847695231437683, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 26610 + }, + { + "epoch": 4.303613289143966, + "grad_norm": 1.0980697870254517, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 26620 + }, + { + "epoch": 4.305229973324711, + "grad_norm": 0.8861175179481506, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 26630 + }, + { + "epoch": 4.3068466575054565, + "grad_norm": 0.8917363286018372, + "learning_rate": 0.0002, + "loss": 0.5716, + "step": 26640 + }, + { + "epoch": 4.308463341686202, + "grad_norm": 1.0458378791809082, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 26650 + }, + { + "epoch": 4.310080025866947, + "grad_norm": 1.4859240055084229, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 26660 + }, + { + "epoch": 4.311696710047692, + "grad_norm": 1.1376359462738037, + "learning_rate": 0.0002, + "loss": 0.5296, + "step": 26670 + }, + { + "epoch": 4.313313394228437, + "grad_norm": 0.991349995136261, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 26680 + }, + { + "epoch": 4.314930078409183, + "grad_norm": 0.9995543956756592, + "learning_rate": 0.0002, + "loss": 0.5338, + "step": 26690 + }, + { + "epoch": 4.316546762589928, + "grad_norm": 1.0515851974487305, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 26700 + }, + { + "epoch": 4.318163446770673, + "grad_norm": 1.008023977279663, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 26710 + }, + { + "epoch": 4.319780130951418, + "grad_norm": 1.0184582471847534, + "learning_rate": 0.0002, + "loss": 0.5506, + "step": 26720 + }, + { + "epoch": 4.321396815132164, + "grad_norm": 1.161071538925171, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 26730 + }, + { + "epoch": 4.323013499312909, + "grad_norm": 0.9580779671669006, + "learning_rate": 0.0002, + "loss": 0.5633, + "step": 26740 + }, + { + "epoch": 4.324630183493655, + "grad_norm": 1.0189911127090454, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 26750 + }, + { + "epoch": 4.3262468676744, + "grad_norm": 0.7484358549118042, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 26760 + }, + { + "epoch": 4.327863551855145, + "grad_norm": 1.0015908479690552, + "learning_rate": 0.0002, + "loss": 0.5728, + "step": 26770 + }, + { + "epoch": 4.329480236035891, + "grad_norm": 0.8972945809364319, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 26780 + }, + { + "epoch": 4.331096920216636, + "grad_norm": 1.01099693775177, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 26790 + }, + { + "epoch": 4.332713604397381, + "grad_norm": 0.846958339214325, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 26800 + }, + { + "epoch": 4.334330288578126, + "grad_norm": 1.0792603492736816, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 26810 + }, + { + "epoch": 4.3359469727588715, + "grad_norm": 1.0373345613479614, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 26820 + }, + { + "epoch": 4.337563656939617, + "grad_norm": 0.9779167771339417, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 26830 + }, + { + "epoch": 4.339180341120362, + "grad_norm": 1.0235520601272583, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 26840 + }, + { + "epoch": 4.340797025301107, + "grad_norm": 1.04195237159729, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 26850 + }, + { + "epoch": 4.3424137094818525, + "grad_norm": 0.9479565620422363, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 26860 + }, + { + "epoch": 4.344030393662598, + "grad_norm": 0.9526172280311584, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 26870 + }, + { + "epoch": 4.345647077843343, + "grad_norm": 0.8571456074714661, + "learning_rate": 0.0002, + "loss": 0.521, + "step": 26880 + }, + { + "epoch": 4.347263762024088, + "grad_norm": 0.9475828409194946, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 26890 + }, + { + "epoch": 4.348880446204834, + "grad_norm": 1.0529576539993286, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 26900 + }, + { + "epoch": 4.3504971303855795, + "grad_norm": 0.9648140072822571, + "learning_rate": 0.0002, + "loss": 0.56, + "step": 26910 + }, + { + "epoch": 4.352113814566325, + "grad_norm": 1.0488841533660889, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 26920 + }, + { + "epoch": 4.35373049874707, + "grad_norm": 0.8771942257881165, + "learning_rate": 0.0002, + "loss": 0.5842, + "step": 26930 + }, + { + "epoch": 4.355347182927815, + "grad_norm": 0.9411202073097229, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 26940 + }, + { + "epoch": 4.35696386710856, + "grad_norm": 1.0997588634490967, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 26950 + }, + { + "epoch": 4.358580551289306, + "grad_norm": 0.968754768371582, + "learning_rate": 0.0002, + "loss": 0.5528, + "step": 26960 + }, + { + "epoch": 4.360197235470051, + "grad_norm": 0.9990773797035217, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 26970 + }, + { + "epoch": 4.361813919650796, + "grad_norm": 1.0210620164871216, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 26980 + }, + { + "epoch": 4.363430603831541, + "grad_norm": 0.855462908744812, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 26990 + }, + { + "epoch": 4.365047288012287, + "grad_norm": 0.9169660806655884, + "learning_rate": 0.0002, + "loss": 0.5493, + "step": 27000 + }, + { + "epoch": 4.366663972193032, + "grad_norm": 1.089629888534546, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 27010 + }, + { + "epoch": 4.368280656373777, + "grad_norm": 1.0932867527008057, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 27020 + }, + { + "epoch": 4.369897340554522, + "grad_norm": 0.9290956854820251, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 27030 + }, + { + "epoch": 4.3715140247352675, + "grad_norm": 1.2800624370574951, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 27040 + }, + { + "epoch": 4.373130708916014, + "grad_norm": 0.8993493318557739, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 27050 + }, + { + "epoch": 4.374747393096759, + "grad_norm": 1.1566431522369385, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 27060 + }, + { + "epoch": 4.376364077277504, + "grad_norm": 0.9479052424430847, + "learning_rate": 0.0002, + "loss": 0.5902, + "step": 27070 + }, + { + "epoch": 4.377980761458249, + "grad_norm": 1.0063648223876953, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 27080 + }, + { + "epoch": 4.379597445638995, + "grad_norm": 0.8342045545578003, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 27090 + }, + { + "epoch": 4.38121412981974, + "grad_norm": 1.1390739679336548, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 27100 + }, + { + "epoch": 4.382830814000485, + "grad_norm": 0.9547637104988098, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 27110 + }, + { + "epoch": 4.38444749818123, + "grad_norm": 1.0503804683685303, + "learning_rate": 0.0002, + "loss": 0.5728, + "step": 27120 + }, + { + "epoch": 4.3860641823619755, + "grad_norm": 0.9064017534255981, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 27130 + }, + { + "epoch": 4.387680866542721, + "grad_norm": 0.9382519125938416, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 27140 + }, + { + "epoch": 4.389297550723466, + "grad_norm": 1.0410341024398804, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 27150 + }, + { + "epoch": 4.390914234904211, + "grad_norm": 0.9218655824661255, + "learning_rate": 0.0002, + "loss": 0.6034, + "step": 27160 + }, + { + "epoch": 4.392530919084956, + "grad_norm": 0.8119737505912781, + "learning_rate": 0.0002, + "loss": 0.5204, + "step": 27170 + }, + { + "epoch": 4.394147603265702, + "grad_norm": 0.8584722876548767, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 27180 + }, + { + "epoch": 4.395764287446447, + "grad_norm": 0.9668293595314026, + "learning_rate": 0.0002, + "loss": 0.5772, + "step": 27190 + }, + { + "epoch": 4.397380971627193, + "grad_norm": 1.022334098815918, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 27200 + }, + { + "epoch": 4.398997655807938, + "grad_norm": 0.9553216099739075, + "learning_rate": 0.0002, + "loss": 0.5573, + "step": 27210 + }, + { + "epoch": 4.4006143399886835, + "grad_norm": 0.9282339215278625, + "learning_rate": 0.0002, + "loss": 0.5604, + "step": 27220 + }, + { + "epoch": 4.402231024169429, + "grad_norm": 1.0232292413711548, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 27230 + }, + { + "epoch": 4.403847708350174, + "grad_norm": 0.9915700554847717, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 27240 + }, + { + "epoch": 4.405464392530919, + "grad_norm": 1.0014961957931519, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 27250 + }, + { + "epoch": 4.407081076711664, + "grad_norm": 1.1172103881835938, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 27260 + }, + { + "epoch": 4.40869776089241, + "grad_norm": 0.8583093285560608, + "learning_rate": 0.0002, + "loss": 0.5286, + "step": 27270 + }, + { + "epoch": 4.410314445073155, + "grad_norm": 0.7609201669692993, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 27280 + }, + { + "epoch": 4.4119311292539, + "grad_norm": 1.0619351863861084, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 27290 + }, + { + "epoch": 4.413547813434645, + "grad_norm": 1.0177674293518066, + "learning_rate": 0.0002, + "loss": 0.5579, + "step": 27300 + }, + { + "epoch": 4.4151644976153905, + "grad_norm": 0.9921218156814575, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 27310 + }, + { + "epoch": 4.416781181796136, + "grad_norm": 1.126244306564331, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 27320 + }, + { + "epoch": 4.418397865976881, + "grad_norm": 1.0678540468215942, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 27330 + }, + { + "epoch": 4.420014550157627, + "grad_norm": 0.8705704212188721, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 27340 + }, + { + "epoch": 4.421631234338372, + "grad_norm": 1.272074818611145, + "learning_rate": 0.0002, + "loss": 0.5763, + "step": 27350 + }, + { + "epoch": 4.423247918519118, + "grad_norm": 0.8740444183349609, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 27360 + }, + { + "epoch": 4.424864602699863, + "grad_norm": 1.0584250688552856, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 27370 + }, + { + "epoch": 4.426481286880608, + "grad_norm": 1.059870719909668, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 27380 + }, + { + "epoch": 4.428097971061353, + "grad_norm": 1.072265863418579, + "learning_rate": 0.0002, + "loss": 0.5551, + "step": 27390 + }, + { + "epoch": 4.4297146552420985, + "grad_norm": 0.871481716632843, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 27400 + }, + { + "epoch": 4.431331339422844, + "grad_norm": 0.9555448293685913, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 27410 + }, + { + "epoch": 4.432948023603589, + "grad_norm": 1.0402292013168335, + "learning_rate": 0.0002, + "loss": 0.5593, + "step": 27420 + }, + { + "epoch": 4.434564707784334, + "grad_norm": 1.12587571144104, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 27430 + }, + { + "epoch": 4.436181391965079, + "grad_norm": 1.0783193111419678, + "learning_rate": 0.0002, + "loss": 0.5403, + "step": 27440 + }, + { + "epoch": 4.437798076145825, + "grad_norm": 1.024133563041687, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 27450 + }, + { + "epoch": 4.43941476032657, + "grad_norm": 0.9156768918037415, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 27460 + }, + { + "epoch": 4.441031444507315, + "grad_norm": 1.0215224027633667, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 27470 + }, + { + "epoch": 4.442648128688061, + "grad_norm": 1.082116961479187, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 27480 + }, + { + "epoch": 4.4442648128688065, + "grad_norm": 1.0412873029708862, + "learning_rate": 0.0002, + "loss": 0.6203, + "step": 27490 + }, + { + "epoch": 4.445881497049552, + "grad_norm": 1.0509289503097534, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 27500 + }, + { + "epoch": 4.447498181230297, + "grad_norm": 0.9291498064994812, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 27510 + }, + { + "epoch": 4.449114865411042, + "grad_norm": 0.970184326171875, + "learning_rate": 0.0002, + "loss": 0.5408, + "step": 27520 + }, + { + "epoch": 4.450731549591787, + "grad_norm": 0.8418883681297302, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 27530 + }, + { + "epoch": 4.452348233772533, + "grad_norm": 0.8823825120925903, + "learning_rate": 0.0002, + "loss": 0.5124, + "step": 27540 + }, + { + "epoch": 4.453964917953278, + "grad_norm": 1.1909019947052002, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 27550 + }, + { + "epoch": 4.455581602134023, + "grad_norm": 1.0317302942276, + "learning_rate": 0.0002, + "loss": 0.5685, + "step": 27560 + }, + { + "epoch": 4.457198286314768, + "grad_norm": 0.9977751970291138, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 27570 + }, + { + "epoch": 4.458814970495514, + "grad_norm": 0.8909519910812378, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 27580 + }, + { + "epoch": 4.460431654676259, + "grad_norm": 0.8653029799461365, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 27590 + }, + { + "epoch": 4.462048338857004, + "grad_norm": 1.0783653259277344, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 27600 + }, + { + "epoch": 4.463665023037749, + "grad_norm": 1.1235394477844238, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 27610 + }, + { + "epoch": 4.4652817072184945, + "grad_norm": 0.9386643767356873, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 27620 + }, + { + "epoch": 4.466898391399241, + "grad_norm": 1.0605148077011108, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 27630 + }, + { + "epoch": 4.468515075579986, + "grad_norm": 1.1283893585205078, + "learning_rate": 0.0002, + "loss": 0.5886, + "step": 27640 + }, + { + "epoch": 4.470131759760731, + "grad_norm": 1.0583468675613403, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 27650 + }, + { + "epoch": 4.471748443941476, + "grad_norm": 0.9563992023468018, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 27660 + }, + { + "epoch": 4.4733651281222215, + "grad_norm": 1.100598931312561, + "learning_rate": 0.0002, + "loss": 0.5687, + "step": 27670 + }, + { + "epoch": 4.474981812302967, + "grad_norm": 0.9386957287788391, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 27680 + }, + { + "epoch": 4.476598496483712, + "grad_norm": 1.2946288585662842, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 27690 + }, + { + "epoch": 4.478215180664457, + "grad_norm": 1.0325199365615845, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 27700 + }, + { + "epoch": 4.4798318648452025, + "grad_norm": 1.0318928956985474, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 27710 + }, + { + "epoch": 4.481448549025948, + "grad_norm": 0.8721024394035339, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 27720 + }, + { + "epoch": 4.483065233206693, + "grad_norm": 1.17376708984375, + "learning_rate": 0.0002, + "loss": 0.5421, + "step": 27730 + }, + { + "epoch": 4.484681917387438, + "grad_norm": 1.0926326513290405, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 27740 + }, + { + "epoch": 4.486298601568183, + "grad_norm": 0.9043852686882019, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 27750 + }, + { + "epoch": 4.487915285748929, + "grad_norm": 1.064600944519043, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 27760 + }, + { + "epoch": 4.489531969929674, + "grad_norm": 0.7833460569381714, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 27770 + }, + { + "epoch": 4.49114865411042, + "grad_norm": 1.1073496341705322, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 27780 + }, + { + "epoch": 4.492765338291165, + "grad_norm": 1.0799397230148315, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 27790 + }, + { + "epoch": 4.49438202247191, + "grad_norm": 1.1062238216400146, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 27800 + }, + { + "epoch": 4.495998706652656, + "grad_norm": 1.0568242073059082, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 27810 + }, + { + "epoch": 4.497615390833401, + "grad_norm": 0.8861091732978821, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 27820 + }, + { + "epoch": 4.499232075014146, + "grad_norm": 1.2297543287277222, + "learning_rate": 0.0002, + "loss": 0.5858, + "step": 27830 + }, + { + "epoch": 4.500848759194891, + "grad_norm": 0.9600302577018738, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 27840 + }, + { + "epoch": 4.502465443375637, + "grad_norm": 1.057051181793213, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 27850 + }, + { + "epoch": 4.504082127556382, + "grad_norm": 0.9839690923690796, + "learning_rate": 0.0002, + "loss": 0.5788, + "step": 27860 + }, + { + "epoch": 4.505698811737127, + "grad_norm": 1.1479853391647339, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 27870 + }, + { + "epoch": 4.507315495917872, + "grad_norm": 1.0550768375396729, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 27880 + }, + { + "epoch": 4.5089321800986175, + "grad_norm": 0.898209273815155, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 27890 + }, + { + "epoch": 4.510548864279363, + "grad_norm": 0.9460315108299255, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 27900 + }, + { + "epoch": 4.512165548460108, + "grad_norm": 0.9499884247779846, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 27910 + }, + { + "epoch": 4.513782232640853, + "grad_norm": 0.7801318764686584, + "learning_rate": 0.0002, + "loss": 0.5385, + "step": 27920 + }, + { + "epoch": 4.515398916821599, + "grad_norm": 0.9286966323852539, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 27930 + }, + { + "epoch": 4.517015601002345, + "grad_norm": 0.9539980292320251, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 27940 + }, + { + "epoch": 4.51863228518309, + "grad_norm": 1.1053401231765747, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 27950 + }, + { + "epoch": 4.520248969363835, + "grad_norm": 0.7535534501075745, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 27960 + }, + { + "epoch": 4.52186565354458, + "grad_norm": 1.076926589012146, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 27970 + }, + { + "epoch": 4.5234823377253255, + "grad_norm": 1.181935429573059, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 27980 + }, + { + "epoch": 4.525099021906071, + "grad_norm": 0.9293407201766968, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 27990 + }, + { + "epoch": 4.526715706086816, + "grad_norm": 0.8953009247779846, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 28000 + }, + { + "epoch": 4.528332390267561, + "grad_norm": 1.0850225687026978, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 28010 + }, + { + "epoch": 4.529949074448306, + "grad_norm": 0.9125663042068481, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 28020 + }, + { + "epoch": 4.531565758629052, + "grad_norm": 0.8745216727256775, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 28030 + }, + { + "epoch": 4.533182442809797, + "grad_norm": 1.0783463716506958, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 28040 + }, + { + "epoch": 4.534799126990542, + "grad_norm": 0.7513844966888428, + "learning_rate": 0.0002, + "loss": 0.5754, + "step": 28050 + }, + { + "epoch": 4.536415811171287, + "grad_norm": 1.0135776996612549, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 28060 + }, + { + "epoch": 4.538032495352033, + "grad_norm": 0.8886825442314148, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 28070 + }, + { + "epoch": 4.539649179532779, + "grad_norm": 0.8153995275497437, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 28080 + }, + { + "epoch": 4.541265863713524, + "grad_norm": 0.9853341579437256, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 28090 + }, + { + "epoch": 4.542882547894269, + "grad_norm": 0.9365800023078918, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 28100 + }, + { + "epoch": 4.544499232075014, + "grad_norm": 0.9765017628669739, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 28110 + }, + { + "epoch": 4.54611591625576, + "grad_norm": 0.9811279773712158, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 28120 + }, + { + "epoch": 4.547732600436505, + "grad_norm": 1.0387924909591675, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 28130 + }, + { + "epoch": 4.54934928461725, + "grad_norm": 1.0684878826141357, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 28140 + }, + { + "epoch": 4.550965968797995, + "grad_norm": 1.0000102519989014, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 28150 + }, + { + "epoch": 4.5525826529787405, + "grad_norm": 1.0717930793762207, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 28160 + }, + { + "epoch": 4.554199337159486, + "grad_norm": 0.990074634552002, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 28170 + }, + { + "epoch": 4.555816021340231, + "grad_norm": 0.8673754930496216, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 28180 + }, + { + "epoch": 4.557432705520976, + "grad_norm": 0.864247739315033, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 28190 + }, + { + "epoch": 4.5590493897017215, + "grad_norm": 0.8280200958251953, + "learning_rate": 0.0002, + "loss": 0.5516, + "step": 28200 + }, + { + "epoch": 4.560666073882467, + "grad_norm": 1.1312172412872314, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 28210 + }, + { + "epoch": 4.562282758063212, + "grad_norm": 0.9147403240203857, + "learning_rate": 0.0002, + "loss": 0.5776, + "step": 28220 + }, + { + "epoch": 4.563899442243958, + "grad_norm": 1.0321218967437744, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 28230 + }, + { + "epoch": 4.565516126424703, + "grad_norm": 1.168332815170288, + "learning_rate": 0.0002, + "loss": 0.5508, + "step": 28240 + }, + { + "epoch": 4.5671328106054485, + "grad_norm": 1.0067222118377686, + "learning_rate": 0.0002, + "loss": 0.5649, + "step": 28250 + }, + { + "epoch": 4.568749494786194, + "grad_norm": 1.0283393859863281, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 28260 + }, + { + "epoch": 4.570366178966939, + "grad_norm": 0.9912363886833191, + "learning_rate": 0.0002, + "loss": 0.5772, + "step": 28270 + }, + { + "epoch": 4.571982863147684, + "grad_norm": 1.108032464981079, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 28280 + }, + { + "epoch": 4.573599547328429, + "grad_norm": 0.8260078430175781, + "learning_rate": 0.0002, + "loss": 0.5529, + "step": 28290 + }, + { + "epoch": 4.575216231509175, + "grad_norm": 0.8946247100830078, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 28300 + }, + { + "epoch": 4.57683291568992, + "grad_norm": 0.8273587822914124, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 28310 + }, + { + "epoch": 4.578449599870665, + "grad_norm": 0.9040093421936035, + "learning_rate": 0.0002, + "loss": 0.6058, + "step": 28320 + }, + { + "epoch": 4.58006628405141, + "grad_norm": 0.8435290455818176, + "learning_rate": 0.0002, + "loss": 0.5521, + "step": 28330 + }, + { + "epoch": 4.581682968232156, + "grad_norm": 1.164088249206543, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 28340 + }, + { + "epoch": 4.583299652412901, + "grad_norm": 0.9861085414886475, + "learning_rate": 0.0002, + "loss": 0.5603, + "step": 28350 + }, + { + "epoch": 4.584916336593646, + "grad_norm": 0.8892980813980103, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 28360 + }, + { + "epoch": 4.586533020774391, + "grad_norm": 1.240574836730957, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 28370 + }, + { + "epoch": 4.588149704955137, + "grad_norm": 0.8669408559799194, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 28380 + }, + { + "epoch": 4.589766389135883, + "grad_norm": 0.9145985841751099, + "learning_rate": 0.0002, + "loss": 0.5603, + "step": 28390 + }, + { + "epoch": 4.591383073316628, + "grad_norm": 0.8584614992141724, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 28400 + }, + { + "epoch": 4.592999757497373, + "grad_norm": 1.118829369544983, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 28410 + }, + { + "epoch": 4.594616441678118, + "grad_norm": 1.1411553621292114, + "learning_rate": 0.0002, + "loss": 0.5641, + "step": 28420 + }, + { + "epoch": 4.596233125858864, + "grad_norm": 0.9433278441429138, + "learning_rate": 0.0002, + "loss": 0.549, + "step": 28430 + }, + { + "epoch": 4.597849810039609, + "grad_norm": 0.816830039024353, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 28440 + }, + { + "epoch": 4.599466494220354, + "grad_norm": 1.2124968767166138, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 28450 + }, + { + "epoch": 4.601083178401099, + "grad_norm": 0.9658762216567993, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 28460 + }, + { + "epoch": 4.6026998625818445, + "grad_norm": 0.836100161075592, + "learning_rate": 0.0002, + "loss": 0.5902, + "step": 28470 + }, + { + "epoch": 4.60431654676259, + "grad_norm": 0.9989104270935059, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 28480 + }, + { + "epoch": 4.605933230943335, + "grad_norm": 1.1298956871032715, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 28490 + }, + { + "epoch": 4.60754991512408, + "grad_norm": 1.1731704473495483, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 28500 + }, + { + "epoch": 4.609166599304825, + "grad_norm": 0.9624714255332947, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 28510 + }, + { + "epoch": 4.610783283485571, + "grad_norm": 1.364073634147644, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 28520 + }, + { + "epoch": 4.612399967666317, + "grad_norm": 1.1827356815338135, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 28530 + }, + { + "epoch": 4.614016651847062, + "grad_norm": 0.6651531457901001, + "learning_rate": 0.0002, + "loss": 0.5643, + "step": 28540 + }, + { + "epoch": 4.615633336027807, + "grad_norm": 1.1640995740890503, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 28550 + }, + { + "epoch": 4.6172500202085525, + "grad_norm": 1.028918743133545, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 28560 + }, + { + "epoch": 4.618866704389298, + "grad_norm": 0.8252120614051819, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 28570 + }, + { + "epoch": 4.620483388570043, + "grad_norm": 1.3536735773086548, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 28580 + }, + { + "epoch": 4.622100072750788, + "grad_norm": 1.2146915197372437, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 28590 + }, + { + "epoch": 4.623716756931533, + "grad_norm": 1.0122549533843994, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 28600 + }, + { + "epoch": 4.625333441112279, + "grad_norm": 0.9977872967720032, + "learning_rate": 0.0002, + "loss": 0.6072, + "step": 28610 + }, + { + "epoch": 4.626950125293024, + "grad_norm": 1.0159751176834106, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 28620 + }, + { + "epoch": 4.628566809473769, + "grad_norm": 1.0028325319290161, + "learning_rate": 0.0002, + "loss": 0.5935, + "step": 28630 + }, + { + "epoch": 4.630183493654514, + "grad_norm": 0.901638388633728, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 28640 + }, + { + "epoch": 4.6318001778352595, + "grad_norm": 0.9450507164001465, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 28650 + }, + { + "epoch": 4.633416862016006, + "grad_norm": 0.9987545013427734, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 28660 + }, + { + "epoch": 4.63503354619675, + "grad_norm": 0.9574332237243652, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 28670 + }, + { + "epoch": 4.636650230377496, + "grad_norm": 1.2215653657913208, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 28680 + }, + { + "epoch": 4.638266914558241, + "grad_norm": 0.9798858761787415, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 28690 + }, + { + "epoch": 4.639883598738987, + "grad_norm": 1.0648466348648071, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 28700 + }, + { + "epoch": 4.641500282919732, + "grad_norm": 1.0606504678726196, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 28710 + }, + { + "epoch": 4.643116967100477, + "grad_norm": 1.0892442464828491, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 28720 + }, + { + "epoch": 4.644733651281222, + "grad_norm": 0.914391040802002, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 28730 + }, + { + "epoch": 4.6463503354619675, + "grad_norm": 0.9782370328903198, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 28740 + }, + { + "epoch": 4.647967019642713, + "grad_norm": 1.0344339609146118, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 28750 + }, + { + "epoch": 4.649583703823458, + "grad_norm": 1.0513931512832642, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 28760 + }, + { + "epoch": 4.651200388004203, + "grad_norm": 0.9711475968360901, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 28770 + }, + { + "epoch": 4.652817072184948, + "grad_norm": 0.977519690990448, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 28780 + }, + { + "epoch": 4.654433756365694, + "grad_norm": 0.9150224924087524, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 28790 + }, + { + "epoch": 4.656050440546439, + "grad_norm": 1.0973542928695679, + "learning_rate": 0.0002, + "loss": 0.5382, + "step": 28800 + }, + { + "epoch": 4.657667124727185, + "grad_norm": 0.944877564907074, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 28810 + }, + { + "epoch": 4.659283808907929, + "grad_norm": 0.9508748650550842, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 28820 + }, + { + "epoch": 4.6609004930886755, + "grad_norm": 0.9681721329689026, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 28830 + }, + { + "epoch": 4.662517177269421, + "grad_norm": 1.0214351415634155, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 28840 + }, + { + "epoch": 4.664133861450166, + "grad_norm": 0.9748611450195312, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 28850 + }, + { + "epoch": 4.665750545630911, + "grad_norm": 0.8484147191047668, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 28860 + }, + { + "epoch": 4.667367229811656, + "grad_norm": 1.1252986192703247, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 28870 + }, + { + "epoch": 4.668983913992402, + "grad_norm": 0.8706206679344177, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 28880 + }, + { + "epoch": 4.670600598173147, + "grad_norm": 1.1432424783706665, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 28890 + }, + { + "epoch": 4.672217282353892, + "grad_norm": 1.017029047012329, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 28900 + }, + { + "epoch": 4.673833966534637, + "grad_norm": 1.085597038269043, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 28910 + }, + { + "epoch": 4.675450650715383, + "grad_norm": 0.9275796413421631, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 28920 + }, + { + "epoch": 4.677067334896128, + "grad_norm": 0.9518964886665344, + "learning_rate": 0.0002, + "loss": 0.5603, + "step": 28930 + }, + { + "epoch": 4.678684019076873, + "grad_norm": 1.0352122783660889, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 28940 + }, + { + "epoch": 4.680300703257618, + "grad_norm": 1.090124249458313, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 28950 + }, + { + "epoch": 4.681917387438364, + "grad_norm": 0.8799563050270081, + "learning_rate": 0.0002, + "loss": 0.5728, + "step": 28960 + }, + { + "epoch": 4.683534071619109, + "grad_norm": 1.0929821729660034, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 28970 + }, + { + "epoch": 4.685150755799855, + "grad_norm": 0.903727650642395, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 28980 + }, + { + "epoch": 4.6867674399806, + "grad_norm": 0.9752424955368042, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 28990 + }, + { + "epoch": 4.688384124161345, + "grad_norm": 0.9351571202278137, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 29000 + }, + { + "epoch": 4.6900008083420905, + "grad_norm": 0.923877477645874, + "learning_rate": 0.0002, + "loss": 0.5811, + "step": 29010 + }, + { + "epoch": 4.691617492522836, + "grad_norm": 1.045389175415039, + "learning_rate": 0.0002, + "loss": 0.5682, + "step": 29020 + }, + { + "epoch": 4.693234176703581, + "grad_norm": 1.0200831890106201, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 29030 + }, + { + "epoch": 4.694850860884326, + "grad_norm": 1.1499706506729126, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 29040 + }, + { + "epoch": 4.6964675450650715, + "grad_norm": 0.860118567943573, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 29050 + }, + { + "epoch": 4.698084229245817, + "grad_norm": 0.9774864315986633, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 29060 + }, + { + "epoch": 4.699700913426562, + "grad_norm": 1.0323210954666138, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 29070 + }, + { + "epoch": 4.701317597607307, + "grad_norm": 0.8492481112480164, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 29080 + }, + { + "epoch": 4.702934281788052, + "grad_norm": 1.131951093673706, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 29090 + }, + { + "epoch": 4.704550965968798, + "grad_norm": 0.8763113021850586, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 29100 + }, + { + "epoch": 4.706167650149544, + "grad_norm": 1.045028805732727, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 29110 + }, + { + "epoch": 4.707784334330288, + "grad_norm": 0.9961401224136353, + "learning_rate": 0.0002, + "loss": 0.5548, + "step": 29120 + }, + { + "epoch": 4.709401018511034, + "grad_norm": 0.9282503724098206, + "learning_rate": 0.0002, + "loss": 0.559, + "step": 29130 + }, + { + "epoch": 4.711017702691779, + "grad_norm": 1.1418932676315308, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 29140 + }, + { + "epoch": 4.712634386872525, + "grad_norm": 0.9950099587440491, + "learning_rate": 0.0002, + "loss": 0.5394, + "step": 29150 + }, + { + "epoch": 4.71425107105327, + "grad_norm": 0.8304893374443054, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 29160 + }, + { + "epoch": 4.715867755234015, + "grad_norm": 1.115626335144043, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 29170 + }, + { + "epoch": 4.71748443941476, + "grad_norm": 1.079818606376648, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 29180 + }, + { + "epoch": 4.719101123595506, + "grad_norm": 1.1929082870483398, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 29190 + }, + { + "epoch": 4.720717807776251, + "grad_norm": 0.9621080756187439, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 29200 + }, + { + "epoch": 4.722334491956996, + "grad_norm": 0.8549222350120544, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 29210 + }, + { + "epoch": 4.723951176137741, + "grad_norm": 0.9341941475868225, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 29220 + }, + { + "epoch": 4.7255678603184865, + "grad_norm": 1.075406789779663, + "learning_rate": 0.0002, + "loss": 0.5436, + "step": 29230 + }, + { + "epoch": 4.727184544499232, + "grad_norm": 1.0859880447387695, + "learning_rate": 0.0002, + "loss": 0.576, + "step": 29240 + }, + { + "epoch": 4.728801228679977, + "grad_norm": 0.8475605249404907, + "learning_rate": 0.0002, + "loss": 0.5525, + "step": 29250 + }, + { + "epoch": 4.730417912860723, + "grad_norm": 0.9331845641136169, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 29260 + }, + { + "epoch": 4.7320345970414674, + "grad_norm": 0.9279314279556274, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 29270 + }, + { + "epoch": 4.733651281222214, + "grad_norm": 0.7803558707237244, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 29280 + }, + { + "epoch": 4.735267965402959, + "grad_norm": 1.0159329175949097, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 29290 + }, + { + "epoch": 4.736884649583704, + "grad_norm": 0.9448670744895935, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 29300 + }, + { + "epoch": 4.738501333764449, + "grad_norm": 1.0732197761535645, + "learning_rate": 0.0002, + "loss": 0.574, + "step": 29310 + }, + { + "epoch": 4.7401180179451945, + "grad_norm": 0.901830792427063, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 29320 + }, + { + "epoch": 4.74173470212594, + "grad_norm": 0.9141789674758911, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 29330 + }, + { + "epoch": 4.743351386306685, + "grad_norm": 0.9733418226242065, + "learning_rate": 0.0002, + "loss": 0.5481, + "step": 29340 + }, + { + "epoch": 4.74496807048743, + "grad_norm": 0.909810483455658, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 29350 + }, + { + "epoch": 4.746584754668175, + "grad_norm": 0.909541666507721, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 29360 + }, + { + "epoch": 4.748201438848921, + "grad_norm": 0.9383015632629395, + "learning_rate": 0.0002, + "loss": 0.5579, + "step": 29370 + }, + { + "epoch": 4.749818123029666, + "grad_norm": 0.9275668263435364, + "learning_rate": 0.0002, + "loss": 0.5529, + "step": 29380 + }, + { + "epoch": 4.751434807210411, + "grad_norm": 1.1146225929260254, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 29390 + }, + { + "epoch": 4.753051491391156, + "grad_norm": 1.0062453746795654, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 29400 + }, + { + "epoch": 4.7546681755719025, + "grad_norm": 0.9451895952224731, + "learning_rate": 0.0002, + "loss": 0.5872, + "step": 29410 + }, + { + "epoch": 4.756284859752648, + "grad_norm": 0.870457649230957, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 29420 + }, + { + "epoch": 4.757901543933393, + "grad_norm": 1.0411282777786255, + "learning_rate": 0.0002, + "loss": 0.57, + "step": 29430 + }, + { + "epoch": 4.759518228114138, + "grad_norm": 1.1648986339569092, + "learning_rate": 0.0002, + "loss": 0.5688, + "step": 29440 + }, + { + "epoch": 4.761134912294883, + "grad_norm": 0.8999572992324829, + "learning_rate": 0.0002, + "loss": 0.5432, + "step": 29450 + }, + { + "epoch": 4.762751596475629, + "grad_norm": 0.9863559007644653, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 29460 + }, + { + "epoch": 4.764368280656374, + "grad_norm": 0.9676542282104492, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 29470 + }, + { + "epoch": 4.765984964837119, + "grad_norm": 1.004775047302246, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 29480 + }, + { + "epoch": 4.767601649017864, + "grad_norm": 1.0937515497207642, + "learning_rate": 0.0002, + "loss": 0.6044, + "step": 29490 + }, + { + "epoch": 4.7692183331986095, + "grad_norm": 0.9551598429679871, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 29500 + }, + { + "epoch": 4.770835017379355, + "grad_norm": 1.0757228136062622, + "learning_rate": 0.0002, + "loss": 0.5609, + "step": 29510 + }, + { + "epoch": 4.7724517015601, + "grad_norm": 1.0588841438293457, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 29520 + }, + { + "epoch": 4.774068385740845, + "grad_norm": 1.0744032859802246, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 29530 + }, + { + "epoch": 4.7756850699215905, + "grad_norm": 1.0066277980804443, + "learning_rate": 0.0002, + "loss": 0.5681, + "step": 29540 + }, + { + "epoch": 4.777301754102336, + "grad_norm": 1.082319736480713, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 29550 + }, + { + "epoch": 4.778918438283082, + "grad_norm": 0.8252472877502441, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 29560 + }, + { + "epoch": 4.780535122463827, + "grad_norm": 0.9855340123176575, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 29570 + }, + { + "epoch": 4.782151806644572, + "grad_norm": 0.9991421699523926, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 29580 + }, + { + "epoch": 4.7837684908253175, + "grad_norm": 1.316841959953308, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 29590 + }, + { + "epoch": 4.785385175006063, + "grad_norm": 1.1513035297393799, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 29600 + }, + { + "epoch": 4.787001859186808, + "grad_norm": 0.9767683744430542, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 29610 + }, + { + "epoch": 4.788618543367553, + "grad_norm": 0.9786278605461121, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 29620 + }, + { + "epoch": 4.7902352275482984, + "grad_norm": 0.8004973530769348, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 29630 + }, + { + "epoch": 4.791851911729044, + "grad_norm": 1.0997767448425293, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 29640 + }, + { + "epoch": 4.793468595909789, + "grad_norm": 0.9752856492996216, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 29650 + }, + { + "epoch": 4.795085280090534, + "grad_norm": 1.0518392324447632, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 29660 + }, + { + "epoch": 4.796701964271279, + "grad_norm": 1.1050055027008057, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 29670 + }, + { + "epoch": 4.798318648452025, + "grad_norm": 0.9933857917785645, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 29680 + }, + { + "epoch": 4.79993533263277, + "grad_norm": 1.2804018259048462, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 29690 + }, + { + "epoch": 4.801552016813515, + "grad_norm": 1.0133371353149414, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 29700 + }, + { + "epoch": 4.803168700994261, + "grad_norm": 1.080350637435913, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 29710 + }, + { + "epoch": 4.804785385175006, + "grad_norm": 0.9986529350280762, + "learning_rate": 0.0002, + "loss": 0.5603, + "step": 29720 + }, + { + "epoch": 4.806402069355752, + "grad_norm": 0.975665807723999, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 29730 + }, + { + "epoch": 4.808018753536497, + "grad_norm": 0.8458138704299927, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 29740 + }, + { + "epoch": 4.809635437717242, + "grad_norm": 0.99330073595047, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 29750 + }, + { + "epoch": 4.811252121897987, + "grad_norm": 0.898274302482605, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 29760 + }, + { + "epoch": 4.812868806078733, + "grad_norm": 1.0504480600357056, + "learning_rate": 0.0002, + "loss": 0.5842, + "step": 29770 + }, + { + "epoch": 4.814485490259478, + "grad_norm": 0.937919020652771, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 29780 + }, + { + "epoch": 4.816102174440223, + "grad_norm": 0.9593307971954346, + "learning_rate": 0.0002, + "loss": 0.5885, + "step": 29790 + }, + { + "epoch": 4.817718858620968, + "grad_norm": 0.9431198835372925, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 29800 + }, + { + "epoch": 4.8193355428017135, + "grad_norm": 1.2729957103729248, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 29810 + }, + { + "epoch": 4.820952226982459, + "grad_norm": 0.8876838684082031, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 29820 + }, + { + "epoch": 4.822568911163204, + "grad_norm": 1.0185000896453857, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 29830 + }, + { + "epoch": 4.824185595343949, + "grad_norm": 1.064276099205017, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 29840 + }, + { + "epoch": 4.825802279524694, + "grad_norm": 0.9774803519248962, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 29850 + }, + { + "epoch": 4.8274189637054405, + "grad_norm": 1.131646990776062, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 29860 + }, + { + "epoch": 4.829035647886186, + "grad_norm": 1.081455945968628, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 29870 + }, + { + "epoch": 4.830652332066931, + "grad_norm": 0.990538477897644, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 29880 + }, + { + "epoch": 4.832269016247676, + "grad_norm": 0.9750600457191467, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 29890 + }, + { + "epoch": 4.8338857004284215, + "grad_norm": 1.0600621700286865, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 29900 + }, + { + "epoch": 4.835502384609167, + "grad_norm": 0.9237320423126221, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 29910 + }, + { + "epoch": 4.837119068789912, + "grad_norm": 0.9739177227020264, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 29920 + }, + { + "epoch": 4.838735752970657, + "grad_norm": 1.128677248954773, + "learning_rate": 0.0002, + "loss": 0.587, + "step": 29930 + }, + { + "epoch": 4.840352437151402, + "grad_norm": 1.042604923248291, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 29940 + }, + { + "epoch": 4.841969121332148, + "grad_norm": 0.849758505821228, + "learning_rate": 0.0002, + "loss": 0.5885, + "step": 29950 + }, + { + "epoch": 4.843585805512893, + "grad_norm": 1.2809888124465942, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 29960 + }, + { + "epoch": 4.845202489693638, + "grad_norm": 1.0177865028381348, + "learning_rate": 0.0002, + "loss": 0.5703, + "step": 29970 + }, + { + "epoch": 4.846819173874383, + "grad_norm": 1.0026639699935913, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 29980 + }, + { + "epoch": 4.8484358580551286, + "grad_norm": 0.9679505228996277, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 29990 + }, + { + "epoch": 4.850052542235874, + "grad_norm": 0.8939532041549683, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 30000 + }, + { + "epoch": 4.85166922641662, + "grad_norm": 0.9957457780838013, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 30010 + }, + { + "epoch": 4.853285910597365, + "grad_norm": 1.1646790504455566, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 30020 + }, + { + "epoch": 4.85490259477811, + "grad_norm": 0.8804680705070496, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 30030 + }, + { + "epoch": 4.856519278958856, + "grad_norm": 1.161970853805542, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 30040 + }, + { + "epoch": 4.858135963139601, + "grad_norm": 0.9081037640571594, + "learning_rate": 0.0002, + "loss": 0.5552, + "step": 30050 + }, + { + "epoch": 4.859752647320346, + "grad_norm": 0.9402848482131958, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 30060 + }, + { + "epoch": 4.861369331501091, + "grad_norm": 0.9023865461349487, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 30070 + }, + { + "epoch": 4.8629860156818365, + "grad_norm": 1.0173414945602417, + "learning_rate": 0.0002, + "loss": 0.5926, + "step": 30080 + }, + { + "epoch": 4.864602699862582, + "grad_norm": 1.084402322769165, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 30090 + }, + { + "epoch": 4.866219384043327, + "grad_norm": 0.9577937126159668, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 30100 + }, + { + "epoch": 4.867836068224072, + "grad_norm": 0.9807606935501099, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 30110 + }, + { + "epoch": 4.8694527524048175, + "grad_norm": 0.978784441947937, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 30120 + }, + { + "epoch": 4.871069436585563, + "grad_norm": 0.9762914776802063, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 30130 + }, + { + "epoch": 4.872686120766308, + "grad_norm": 0.9404871463775635, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 30140 + }, + { + "epoch": 4.874302804947053, + "grad_norm": 1.0069509744644165, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 30150 + }, + { + "epoch": 4.875919489127799, + "grad_norm": 1.1770923137664795, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 30160 + }, + { + "epoch": 4.8775361733085445, + "grad_norm": 1.021210789680481, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 30170 + }, + { + "epoch": 4.87915285748929, + "grad_norm": 0.8512648940086365, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 30180 + }, + { + "epoch": 4.880769541670035, + "grad_norm": 0.9345870018005371, + "learning_rate": 0.0002, + "loss": 0.5519, + "step": 30190 + }, + { + "epoch": 4.88238622585078, + "grad_norm": 1.0224418640136719, + "learning_rate": 0.0002, + "loss": 0.5682, + "step": 30200 + }, + { + "epoch": 4.884002910031525, + "grad_norm": 1.0316044092178345, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 30210 + }, + { + "epoch": 4.885619594212271, + "grad_norm": 1.102437973022461, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 30220 + }, + { + "epoch": 4.887236278393016, + "grad_norm": 1.0220023393630981, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 30230 + }, + { + "epoch": 4.888852962573761, + "grad_norm": 1.0934523344039917, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 30240 + }, + { + "epoch": 4.890469646754506, + "grad_norm": 1.264630913734436, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 30250 + }, + { + "epoch": 4.892086330935252, + "grad_norm": 1.0999879837036133, + "learning_rate": 0.0002, + "loss": 0.5712, + "step": 30260 + }, + { + "epoch": 4.893703015115997, + "grad_norm": 0.9124550223350525, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 30270 + }, + { + "epoch": 4.895319699296742, + "grad_norm": 0.9853624105453491, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 30280 + }, + { + "epoch": 4.896936383477488, + "grad_norm": 1.0589802265167236, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 30290 + }, + { + "epoch": 4.8985530676582325, + "grad_norm": 0.8487226366996765, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 30300 + }, + { + "epoch": 4.900169751838979, + "grad_norm": 1.0212191343307495, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 30310 + }, + { + "epoch": 4.901786436019724, + "grad_norm": 1.0187491178512573, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 30320 + }, + { + "epoch": 4.903403120200469, + "grad_norm": 1.0013091564178467, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 30330 + }, + { + "epoch": 4.905019804381214, + "grad_norm": 1.0017542839050293, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 30340 + }, + { + "epoch": 4.9066364885619596, + "grad_norm": 0.9665151238441467, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 30350 + }, + { + "epoch": 4.908253172742705, + "grad_norm": 0.8774822950363159, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 30360 + }, + { + "epoch": 4.90986985692345, + "grad_norm": 0.9449850916862488, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 30370 + }, + { + "epoch": 4.911486541104195, + "grad_norm": 0.7368341088294983, + "learning_rate": 0.0002, + "loss": 0.58, + "step": 30380 + }, + { + "epoch": 4.9131032252849405, + "grad_norm": 0.9669167995452881, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 30390 + }, + { + "epoch": 4.914719909465686, + "grad_norm": 1.1227794885635376, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 30400 + }, + { + "epoch": 4.916336593646431, + "grad_norm": 0.9884361028671265, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 30410 + }, + { + "epoch": 4.917953277827176, + "grad_norm": 0.9949551224708557, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 30420 + }, + { + "epoch": 4.919569962007921, + "grad_norm": 0.9491621851921082, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 30430 + }, + { + "epoch": 4.9211866461886675, + "grad_norm": 0.78848797082901, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 30440 + }, + { + "epoch": 4.922803330369412, + "grad_norm": 1.0693835020065308, + "learning_rate": 0.0002, + "loss": 0.5561, + "step": 30450 + }, + { + "epoch": 4.924420014550158, + "grad_norm": 0.9573729634284973, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 30460 + }, + { + "epoch": 4.926036698730903, + "grad_norm": 0.9975152611732483, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 30470 + }, + { + "epoch": 4.9276533829116484, + "grad_norm": 0.8695693016052246, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 30480 + }, + { + "epoch": 4.929270067092394, + "grad_norm": 1.145394206047058, + "learning_rate": 0.0002, + "loss": 0.6144, + "step": 30490 + }, + { + "epoch": 4.930886751273139, + "grad_norm": 0.7668989896774292, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 30500 + }, + { + "epoch": 4.932503435453884, + "grad_norm": 0.9630151391029358, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 30510 + }, + { + "epoch": 4.934120119634629, + "grad_norm": 0.940705418586731, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 30520 + }, + { + "epoch": 4.935736803815375, + "grad_norm": 1.3243348598480225, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 30530 + }, + { + "epoch": 4.93735348799612, + "grad_norm": 1.004347801208496, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 30540 + }, + { + "epoch": 4.938970172176865, + "grad_norm": 0.8711541295051575, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 30550 + }, + { + "epoch": 4.94058685635761, + "grad_norm": 0.8980631828308105, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 30560 + }, + { + "epoch": 4.9422035405383555, + "grad_norm": 0.8388893604278564, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 30570 + }, + { + "epoch": 4.943820224719101, + "grad_norm": 1.0991183519363403, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 30580 + }, + { + "epoch": 4.945436908899847, + "grad_norm": 0.9731075763702393, + "learning_rate": 0.0002, + "loss": 0.5906, + "step": 30590 + }, + { + "epoch": 4.947053593080591, + "grad_norm": 1.3904452323913574, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 30600 + }, + { + "epoch": 4.948670277261337, + "grad_norm": 1.2489882707595825, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 30610 + }, + { + "epoch": 4.950286961442083, + "grad_norm": 1.240072250366211, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 30620 + }, + { + "epoch": 4.951903645622828, + "grad_norm": 0.9191411733627319, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 30630 + }, + { + "epoch": 4.953520329803573, + "grad_norm": 0.8888895511627197, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 30640 + }, + { + "epoch": 4.955137013984318, + "grad_norm": 0.9001450538635254, + "learning_rate": 0.0002, + "loss": 0.6594, + "step": 30650 + }, + { + "epoch": 4.9567536981650635, + "grad_norm": 1.053971767425537, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 30660 + }, + { + "epoch": 4.958370382345809, + "grad_norm": 1.2224042415618896, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 30670 + }, + { + "epoch": 4.959987066526554, + "grad_norm": 0.8855111598968506, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 30680 + }, + { + "epoch": 4.961603750707299, + "grad_norm": 0.9489575624465942, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 30690 + }, + { + "epoch": 4.963220434888044, + "grad_norm": 0.9635404944419861, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 30700 + }, + { + "epoch": 4.96483711906879, + "grad_norm": 1.1784121990203857, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 30710 + }, + { + "epoch": 4.966453803249535, + "grad_norm": 1.0059462785720825, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 30720 + }, + { + "epoch": 4.96807048743028, + "grad_norm": 0.9479738473892212, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 30730 + }, + { + "epoch": 4.969687171611026, + "grad_norm": 1.0624593496322632, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 30740 + }, + { + "epoch": 4.971303855791771, + "grad_norm": 1.1429259777069092, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 30750 + }, + { + "epoch": 4.972920539972517, + "grad_norm": 0.9102491140365601, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 30760 + }, + { + "epoch": 4.974537224153262, + "grad_norm": 1.1262688636779785, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 30770 + }, + { + "epoch": 4.976153908334007, + "grad_norm": 1.1415393352508545, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 30780 + }, + { + "epoch": 4.977770592514752, + "grad_norm": 1.083078384399414, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 30790 + }, + { + "epoch": 4.979387276695498, + "grad_norm": 0.964859127998352, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 30800 + }, + { + "epoch": 4.981003960876243, + "grad_norm": 0.8704743385314941, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 30810 + }, + { + "epoch": 4.982620645056988, + "grad_norm": 1.0714856386184692, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 30820 + }, + { + "epoch": 4.984237329237733, + "grad_norm": 0.6818771362304688, + "learning_rate": 0.0002, + "loss": 0.565, + "step": 30830 + }, + { + "epoch": 4.985854013418479, + "grad_norm": 1.0454156398773193, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 30840 + }, + { + "epoch": 4.987470697599224, + "grad_norm": 0.9410776495933533, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 30850 + }, + { + "epoch": 4.989087381779969, + "grad_norm": 1.0878902673721313, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 30860 + }, + { + "epoch": 4.990704065960714, + "grad_norm": 0.8916727304458618, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 30870 + }, + { + "epoch": 4.9923207501414595, + "grad_norm": 1.045776128768921, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 30880 + }, + { + "epoch": 4.993937434322206, + "grad_norm": 0.9861903786659241, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 30890 + }, + { + "epoch": 4.995554118502951, + "grad_norm": 0.9275050759315491, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 30900 + }, + { + "epoch": 4.997170802683696, + "grad_norm": 0.94013911485672, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 30910 + }, + { + "epoch": 4.998787486864441, + "grad_norm": 0.9771268367767334, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 30920 + }, + { + "epoch": 4.9999191657909625, + "eval_loss": 1.1968598365783691, + "eval_runtime": 122.2519, + "eval_samples_per_second": 5.996, + "eval_steps_per_second": 0.753, + "step": 30927 + }, + { + "epoch": 5.0004041710451865, + "grad_norm": 0.8021580576896667, + "learning_rate": 0.0002, + "loss": 0.5238, + "step": 30930 + }, + { + "epoch": 5.002020855225932, + "grad_norm": 1.0807327032089233, + "learning_rate": 0.0002, + "loss": 0.4984, + "step": 30940 + }, + { + "epoch": 5.003637539406677, + "grad_norm": 1.1638425588607788, + "learning_rate": 0.0002, + "loss": 0.514, + "step": 30950 + }, + { + "epoch": 5.005254223587422, + "grad_norm": 1.1700230836868286, + "learning_rate": 0.0002, + "loss": 0.4621, + "step": 30960 + }, + { + "epoch": 5.0068709077681675, + "grad_norm": 0.9053420424461365, + "learning_rate": 0.0002, + "loss": 0.4657, + "step": 30970 + }, + { + "epoch": 5.008487591948913, + "grad_norm": 0.9226111769676208, + "learning_rate": 0.0002, + "loss": 0.4865, + "step": 30980 + }, + { + "epoch": 5.010104276129658, + "grad_norm": 1.238669514656067, + "learning_rate": 0.0002, + "loss": 0.5011, + "step": 30990 + }, + { + "epoch": 5.011720960310403, + "grad_norm": 1.0668327808380127, + "learning_rate": 0.0002, + "loss": 0.4754, + "step": 31000 + }, + { + "epoch": 5.013337644491148, + "grad_norm": 1.0903944969177246, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 31010 + }, + { + "epoch": 5.014954328671894, + "grad_norm": 1.0763911008834839, + "learning_rate": 0.0002, + "loss": 0.5117, + "step": 31020 + }, + { + "epoch": 5.016571012852639, + "grad_norm": 1.0108771324157715, + "learning_rate": 0.0002, + "loss": 0.4908, + "step": 31030 + }, + { + "epoch": 5.018187697033385, + "grad_norm": 0.8816103935241699, + "learning_rate": 0.0002, + "loss": 0.5052, + "step": 31040 + }, + { + "epoch": 5.01980438121413, + "grad_norm": 1.11434805393219, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 31050 + }, + { + "epoch": 5.021421065394875, + "grad_norm": 1.0727789402008057, + "learning_rate": 0.0002, + "loss": 0.5074, + "step": 31060 + }, + { + "epoch": 5.023037749575621, + "grad_norm": 1.1480379104614258, + "learning_rate": 0.0002, + "loss": 0.4938, + "step": 31070 + }, + { + "epoch": 5.024654433756366, + "grad_norm": 1.0913071632385254, + "learning_rate": 0.0002, + "loss": 0.491, + "step": 31080 + }, + { + "epoch": 5.026271117937111, + "grad_norm": 0.9891864657402039, + "learning_rate": 0.0002, + "loss": 0.4896, + "step": 31090 + }, + { + "epoch": 5.027887802117856, + "grad_norm": 0.9167473912239075, + "learning_rate": 0.0002, + "loss": 0.4965, + "step": 31100 + }, + { + "epoch": 5.029504486298602, + "grad_norm": 1.2259035110473633, + "learning_rate": 0.0002, + "loss": 0.5098, + "step": 31110 + }, + { + "epoch": 5.031121170479347, + "grad_norm": 1.1812787055969238, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 31120 + }, + { + "epoch": 5.032737854660092, + "grad_norm": 1.0890522003173828, + "learning_rate": 0.0002, + "loss": 0.4725, + "step": 31130 + }, + { + "epoch": 5.034354538840837, + "grad_norm": 1.0521091222763062, + "learning_rate": 0.0002, + "loss": 0.4768, + "step": 31140 + }, + { + "epoch": 5.0359712230215825, + "grad_norm": 1.1274569034576416, + "learning_rate": 0.0002, + "loss": 0.4718, + "step": 31150 + }, + { + "epoch": 5.037587907202328, + "grad_norm": 1.140974998474121, + "learning_rate": 0.0002, + "loss": 0.4604, + "step": 31160 + }, + { + "epoch": 5.039204591383073, + "grad_norm": 1.1215609312057495, + "learning_rate": 0.0002, + "loss": 0.5077, + "step": 31170 + }, + { + "epoch": 5.040821275563818, + "grad_norm": 1.0107218027114868, + "learning_rate": 0.0002, + "loss": 0.4746, + "step": 31180 + }, + { + "epoch": 5.042437959744564, + "grad_norm": 1.0198770761489868, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 31190 + }, + { + "epoch": 5.0440546439253096, + "grad_norm": 1.1613430976867676, + "learning_rate": 0.0002, + "loss": 0.5004, + "step": 31200 + }, + { + "epoch": 5.045671328106055, + "grad_norm": 0.8555458188056946, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 31210 + }, + { + "epoch": 5.0472880122868, + "grad_norm": 1.0235545635223389, + "learning_rate": 0.0002, + "loss": 0.4878, + "step": 31220 + }, + { + "epoch": 5.048904696467545, + "grad_norm": 1.0228750705718994, + "learning_rate": 0.0002, + "loss": 0.499, + "step": 31230 + }, + { + "epoch": 5.0505213806482905, + "grad_norm": 0.8216419816017151, + "learning_rate": 0.0002, + "loss": 0.4544, + "step": 31240 + }, + { + "epoch": 5.052138064829036, + "grad_norm": 0.925828218460083, + "learning_rate": 0.0002, + "loss": 0.4947, + "step": 31250 + }, + { + "epoch": 5.053754749009781, + "grad_norm": 0.9229369759559631, + "learning_rate": 0.0002, + "loss": 0.4835, + "step": 31260 + }, + { + "epoch": 5.055371433190526, + "grad_norm": 0.9531727433204651, + "learning_rate": 0.0002, + "loss": 0.5136, + "step": 31270 + }, + { + "epoch": 5.056988117371271, + "grad_norm": 0.7738548517227173, + "learning_rate": 0.0002, + "loss": 0.5161, + "step": 31280 + }, + { + "epoch": 5.058604801552017, + "grad_norm": 1.0551451444625854, + "learning_rate": 0.0002, + "loss": 0.5166, + "step": 31290 + }, + { + "epoch": 5.060221485732762, + "grad_norm": 0.9782299399375916, + "learning_rate": 0.0002, + "loss": 0.4953, + "step": 31300 + }, + { + "epoch": 5.061838169913507, + "grad_norm": 1.0220632553100586, + "learning_rate": 0.0002, + "loss": 0.4776, + "step": 31310 + }, + { + "epoch": 5.063454854094252, + "grad_norm": 0.9808892607688904, + "learning_rate": 0.0002, + "loss": 0.5117, + "step": 31320 + }, + { + "epoch": 5.065071538274998, + "grad_norm": 1.0662003755569458, + "learning_rate": 0.0002, + "loss": 0.501, + "step": 31330 + }, + { + "epoch": 5.066688222455744, + "grad_norm": 1.0036940574645996, + "learning_rate": 0.0002, + "loss": 0.4844, + "step": 31340 + }, + { + "epoch": 5.068304906636489, + "grad_norm": 1.1931052207946777, + "learning_rate": 0.0002, + "loss": 0.5299, + "step": 31350 + }, + { + "epoch": 5.069921590817234, + "grad_norm": 0.9370693564414978, + "learning_rate": 0.0002, + "loss": 0.4646, + "step": 31360 + }, + { + "epoch": 5.071538274997979, + "grad_norm": 0.9589039087295532, + "learning_rate": 0.0002, + "loss": 0.5274, + "step": 31370 + }, + { + "epoch": 5.073154959178725, + "grad_norm": 1.0052711963653564, + "learning_rate": 0.0002, + "loss": 0.4669, + "step": 31380 + }, + { + "epoch": 5.07477164335947, + "grad_norm": 0.9991368651390076, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 31390 + }, + { + "epoch": 5.076388327540215, + "grad_norm": 0.8539695739746094, + "learning_rate": 0.0002, + "loss": 0.4579, + "step": 31400 + }, + { + "epoch": 5.07800501172096, + "grad_norm": 1.048775553703308, + "learning_rate": 0.0002, + "loss": 0.4609, + "step": 31410 + }, + { + "epoch": 5.0796216959017055, + "grad_norm": 0.9983724355697632, + "learning_rate": 0.0002, + "loss": 0.4915, + "step": 31420 + }, + { + "epoch": 5.081238380082451, + "grad_norm": 1.0189813375473022, + "learning_rate": 0.0002, + "loss": 0.4594, + "step": 31430 + }, + { + "epoch": 5.082855064263196, + "grad_norm": 0.9781646728515625, + "learning_rate": 0.0002, + "loss": 0.5449, + "step": 31440 + }, + { + "epoch": 5.084471748443941, + "grad_norm": 0.9424566030502319, + "learning_rate": 0.0002, + "loss": 0.4698, + "step": 31450 + }, + { + "epoch": 5.0860884326246865, + "grad_norm": 1.0036484003067017, + "learning_rate": 0.0002, + "loss": 0.4768, + "step": 31460 + }, + { + "epoch": 5.087705116805432, + "grad_norm": 1.0983147621154785, + "learning_rate": 0.0002, + "loss": 0.487, + "step": 31470 + }, + { + "epoch": 5.089321800986177, + "grad_norm": 1.0856730937957764, + "learning_rate": 0.0002, + "loss": 0.5236, + "step": 31480 + }, + { + "epoch": 5.090938485166923, + "grad_norm": 1.2191699743270874, + "learning_rate": 0.0002, + "loss": 0.485, + "step": 31490 + }, + { + "epoch": 5.092555169347668, + "grad_norm": 0.939346194267273, + "learning_rate": 0.0002, + "loss": 0.4936, + "step": 31500 + }, + { + "epoch": 5.0941718535284135, + "grad_norm": 0.9730121493339539, + "learning_rate": 0.0002, + "loss": 0.5107, + "step": 31510 + }, + { + "epoch": 5.095788537709159, + "grad_norm": 0.923686146736145, + "learning_rate": 0.0002, + "loss": 0.4973, + "step": 31520 + }, + { + "epoch": 5.097405221889904, + "grad_norm": 1.1734349727630615, + "learning_rate": 0.0002, + "loss": 0.4906, + "step": 31530 + }, + { + "epoch": 5.099021906070649, + "grad_norm": 1.084509015083313, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 31540 + }, + { + "epoch": 5.100638590251394, + "grad_norm": 1.0144678354263306, + "learning_rate": 0.0002, + "loss": 0.5078, + "step": 31550 + }, + { + "epoch": 5.10225527443214, + "grad_norm": 0.9958019256591797, + "learning_rate": 0.0002, + "loss": 0.4719, + "step": 31560 + }, + { + "epoch": 5.103871958612885, + "grad_norm": 0.8900736570358276, + "learning_rate": 0.0002, + "loss": 0.4876, + "step": 31570 + }, + { + "epoch": 5.10548864279363, + "grad_norm": 1.0921649932861328, + "learning_rate": 0.0002, + "loss": 0.463, + "step": 31580 + }, + { + "epoch": 5.107105326974375, + "grad_norm": 1.1613792181015015, + "learning_rate": 0.0002, + "loss": 0.5148, + "step": 31590 + }, + { + "epoch": 5.108722011155121, + "grad_norm": 0.9211367964744568, + "learning_rate": 0.0002, + "loss": 0.5055, + "step": 31600 + }, + { + "epoch": 5.110338695335866, + "grad_norm": 1.3315813541412354, + "learning_rate": 0.0002, + "loss": 0.5364, + "step": 31610 + }, + { + "epoch": 5.111955379516611, + "grad_norm": 1.3765019178390503, + "learning_rate": 0.0002, + "loss": 0.5336, + "step": 31620 + }, + { + "epoch": 5.113572063697356, + "grad_norm": 1.070198893547058, + "learning_rate": 0.0002, + "loss": 0.4861, + "step": 31630 + }, + { + "epoch": 5.115188747878102, + "grad_norm": 0.947631299495697, + "learning_rate": 0.0002, + "loss": 0.5046, + "step": 31640 + }, + { + "epoch": 5.116805432058848, + "grad_norm": 1.0197371244430542, + "learning_rate": 0.0002, + "loss": 0.5297, + "step": 31650 + }, + { + "epoch": 5.118422116239593, + "grad_norm": 0.8647911548614502, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 31660 + }, + { + "epoch": 5.120038800420338, + "grad_norm": 0.8944075107574463, + "learning_rate": 0.0002, + "loss": 0.4705, + "step": 31670 + }, + { + "epoch": 5.121655484601083, + "grad_norm": 1.124497652053833, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 31680 + }, + { + "epoch": 5.123272168781829, + "grad_norm": 0.893131673336029, + "learning_rate": 0.0002, + "loss": 0.5109, + "step": 31690 + }, + { + "epoch": 5.124888852962574, + "grad_norm": 1.0122284889221191, + "learning_rate": 0.0002, + "loss": 0.4937, + "step": 31700 + }, + { + "epoch": 5.126505537143319, + "grad_norm": 0.9493719935417175, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 31710 + }, + { + "epoch": 5.128122221324064, + "grad_norm": 0.9700539112091064, + "learning_rate": 0.0002, + "loss": 0.5031, + "step": 31720 + }, + { + "epoch": 5.1297389055048095, + "grad_norm": 1.111677646636963, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 31730 + }, + { + "epoch": 5.131355589685555, + "grad_norm": 0.8204274773597717, + "learning_rate": 0.0002, + "loss": 0.5272, + "step": 31740 + }, + { + "epoch": 5.1329722738663, + "grad_norm": 1.1029267311096191, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 31750 + }, + { + "epoch": 5.134588958047045, + "grad_norm": 1.065575122833252, + "learning_rate": 0.0002, + "loss": 0.505, + "step": 31760 + }, + { + "epoch": 5.13620564222779, + "grad_norm": 0.8208706974983215, + "learning_rate": 0.0002, + "loss": 0.502, + "step": 31770 + }, + { + "epoch": 5.137822326408536, + "grad_norm": 1.0520979166030884, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 31780 + }, + { + "epoch": 5.139439010589282, + "grad_norm": 0.8585538268089294, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 31790 + }, + { + "epoch": 5.141055694770027, + "grad_norm": 1.1491447687149048, + "learning_rate": 0.0002, + "loss": 0.5159, + "step": 31800 + }, + { + "epoch": 5.142672378950772, + "grad_norm": 0.9441081285476685, + "learning_rate": 0.0002, + "loss": 0.5157, + "step": 31810 + }, + { + "epoch": 5.1442890631315175, + "grad_norm": 1.4146889448165894, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 31820 + }, + { + "epoch": 5.145905747312263, + "grad_norm": 1.0326547622680664, + "learning_rate": 0.0002, + "loss": 0.5159, + "step": 31830 + }, + { + "epoch": 5.147522431493008, + "grad_norm": 0.9879202842712402, + "learning_rate": 0.0002, + "loss": 0.5348, + "step": 31840 + }, + { + "epoch": 5.149139115673753, + "grad_norm": 1.0374281406402588, + "learning_rate": 0.0002, + "loss": 0.5083, + "step": 31850 + }, + { + "epoch": 5.150755799854498, + "grad_norm": 1.181229591369629, + "learning_rate": 0.0002, + "loss": 0.4827, + "step": 31860 + }, + { + "epoch": 5.152372484035244, + "grad_norm": 1.2078537940979004, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 31870 + }, + { + "epoch": 5.153989168215989, + "grad_norm": 0.9599190354347229, + "learning_rate": 0.0002, + "loss": 0.5329, + "step": 31880 + }, + { + "epoch": 5.155605852396734, + "grad_norm": 1.0378568172454834, + "learning_rate": 0.0002, + "loss": 0.4953, + "step": 31890 + }, + { + "epoch": 5.157222536577479, + "grad_norm": 0.8746536374092102, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 31900 + }, + { + "epoch": 5.1588392207582245, + "grad_norm": 1.0232136249542236, + "learning_rate": 0.0002, + "loss": 0.5272, + "step": 31910 + }, + { + "epoch": 5.16045590493897, + "grad_norm": 0.9827565550804138, + "learning_rate": 0.0002, + "loss": 0.4844, + "step": 31920 + }, + { + "epoch": 5.162072589119716, + "grad_norm": 1.342657208442688, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 31930 + }, + { + "epoch": 5.163689273300461, + "grad_norm": 1.18390691280365, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 31940 + }, + { + "epoch": 5.165305957481206, + "grad_norm": 0.996350109577179, + "learning_rate": 0.0002, + "loss": 0.5267, + "step": 31950 + }, + { + "epoch": 5.166922641661952, + "grad_norm": 0.9710391163825989, + "learning_rate": 0.0002, + "loss": 0.5063, + "step": 31960 + }, + { + "epoch": 5.168539325842697, + "grad_norm": 1.0264002084732056, + "learning_rate": 0.0002, + "loss": 0.5115, + "step": 31970 + }, + { + "epoch": 5.170156010023442, + "grad_norm": 1.0028311014175415, + "learning_rate": 0.0002, + "loss": 0.4972, + "step": 31980 + }, + { + "epoch": 5.171772694204187, + "grad_norm": 1.1078234910964966, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 31990 + }, + { + "epoch": 5.1733893783849325, + "grad_norm": 0.9659610390663147, + "learning_rate": 0.0002, + "loss": 0.495, + "step": 32000 + }, + { + "epoch": 5.175006062565678, + "grad_norm": 0.841986894607544, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 32010 + }, + { + "epoch": 5.176622746746423, + "grad_norm": 1.095332384109497, + "learning_rate": 0.0002, + "loss": 0.48, + "step": 32020 + }, + { + "epoch": 5.178239430927168, + "grad_norm": 1.1242377758026123, + "learning_rate": 0.0002, + "loss": 0.4741, + "step": 32030 + }, + { + "epoch": 5.179856115107913, + "grad_norm": 0.9872292280197144, + "learning_rate": 0.0002, + "loss": 0.5573, + "step": 32040 + }, + { + "epoch": 5.181472799288659, + "grad_norm": 0.936161994934082, + "learning_rate": 0.0002, + "loss": 0.48, + "step": 32050 + }, + { + "epoch": 5.183089483469404, + "grad_norm": 1.166100025177002, + "learning_rate": 0.0002, + "loss": 0.5093, + "step": 32060 + }, + { + "epoch": 5.184706167650149, + "grad_norm": 1.0764425992965698, + "learning_rate": 0.0002, + "loss": 0.5438, + "step": 32070 + }, + { + "epoch": 5.186322851830895, + "grad_norm": 1.0480051040649414, + "learning_rate": 0.0002, + "loss": 0.4843, + "step": 32080 + }, + { + "epoch": 5.1879395360116405, + "grad_norm": 1.0874916315078735, + "learning_rate": 0.0002, + "loss": 0.5386, + "step": 32090 + }, + { + "epoch": 5.189556220192386, + "grad_norm": 1.0817396640777588, + "learning_rate": 0.0002, + "loss": 0.4975, + "step": 32100 + }, + { + "epoch": 5.191172904373131, + "grad_norm": 1.054111361503601, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 32110 + }, + { + "epoch": 5.192789588553876, + "grad_norm": 0.9655823707580566, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 32120 + }, + { + "epoch": 5.194406272734621, + "grad_norm": 1.1384109258651733, + "learning_rate": 0.0002, + "loss": 0.5105, + "step": 32130 + }, + { + "epoch": 5.196022956915367, + "grad_norm": 1.0149348974227905, + "learning_rate": 0.0002, + "loss": 0.5073, + "step": 32140 + }, + { + "epoch": 5.197639641096112, + "grad_norm": 1.1084046363830566, + "learning_rate": 0.0002, + "loss": 0.5293, + "step": 32150 + }, + { + "epoch": 5.199256325276857, + "grad_norm": 1.1209309101104736, + "learning_rate": 0.0002, + "loss": 0.4936, + "step": 32160 + }, + { + "epoch": 5.200873009457602, + "grad_norm": 1.133089542388916, + "learning_rate": 0.0002, + "loss": 0.5101, + "step": 32170 + }, + { + "epoch": 5.202489693638348, + "grad_norm": 1.0893020629882812, + "learning_rate": 0.0002, + "loss": 0.5242, + "step": 32180 + }, + { + "epoch": 5.204106377819093, + "grad_norm": 0.90018630027771, + "learning_rate": 0.0002, + "loss": 0.4872, + "step": 32190 + }, + { + "epoch": 5.205723061999838, + "grad_norm": 0.977622926235199, + "learning_rate": 0.0002, + "loss": 0.4999, + "step": 32200 + }, + { + "epoch": 5.207339746180583, + "grad_norm": 1.2940177917480469, + "learning_rate": 0.0002, + "loss": 0.5028, + "step": 32210 + }, + { + "epoch": 5.2089564303613285, + "grad_norm": 1.2131710052490234, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 32220 + }, + { + "epoch": 5.210573114542075, + "grad_norm": 1.0234841108322144, + "learning_rate": 0.0002, + "loss": 0.5189, + "step": 32230 + }, + { + "epoch": 5.21218979872282, + "grad_norm": 1.157975435256958, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 32240 + }, + { + "epoch": 5.213806482903565, + "grad_norm": 1.0381282567977905, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 32250 + }, + { + "epoch": 5.21542316708431, + "grad_norm": 1.0125395059585571, + "learning_rate": 0.0002, + "loss": 0.5192, + "step": 32260 + }, + { + "epoch": 5.2170398512650555, + "grad_norm": 1.272691011428833, + "learning_rate": 0.0002, + "loss": 0.5216, + "step": 32270 + }, + { + "epoch": 5.218656535445801, + "grad_norm": 1.0061250925064087, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 32280 + }, + { + "epoch": 5.220273219626546, + "grad_norm": 0.9752234816551208, + "learning_rate": 0.0002, + "loss": 0.4739, + "step": 32290 + }, + { + "epoch": 5.221889903807291, + "grad_norm": 1.1193140745162964, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 32300 + }, + { + "epoch": 5.2235065879880365, + "grad_norm": 1.0126434564590454, + "learning_rate": 0.0002, + "loss": 0.4976, + "step": 32310 + }, + { + "epoch": 5.225123272168782, + "grad_norm": 1.4338394403457642, + "learning_rate": 0.0002, + "loss": 0.5257, + "step": 32320 + }, + { + "epoch": 5.226739956349527, + "grad_norm": 1.004101276397705, + "learning_rate": 0.0002, + "loss": 0.5235, + "step": 32330 + }, + { + "epoch": 5.228356640530272, + "grad_norm": 0.8744166493415833, + "learning_rate": 0.0002, + "loss": 0.5091, + "step": 32340 + }, + { + "epoch": 5.229973324711017, + "grad_norm": 1.0165376663208008, + "learning_rate": 0.0002, + "loss": 0.5388, + "step": 32350 + }, + { + "epoch": 5.231590008891763, + "grad_norm": 0.8635954260826111, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 32360 + }, + { + "epoch": 5.233206693072509, + "grad_norm": 1.1392399072647095, + "learning_rate": 0.0002, + "loss": 0.5609, + "step": 32370 + }, + { + "epoch": 5.234823377253254, + "grad_norm": 1.0202113389968872, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 32380 + }, + { + "epoch": 5.236440061433999, + "grad_norm": 1.0417983531951904, + "learning_rate": 0.0002, + "loss": 0.4983, + "step": 32390 + }, + { + "epoch": 5.238056745614744, + "grad_norm": 0.8729333877563477, + "learning_rate": 0.0002, + "loss": 0.507, + "step": 32400 + }, + { + "epoch": 5.23967342979549, + "grad_norm": 1.1626229286193848, + "learning_rate": 0.0002, + "loss": 0.5426, + "step": 32410 + }, + { + "epoch": 5.241290113976235, + "grad_norm": 0.9086161851882935, + "learning_rate": 0.0002, + "loss": 0.5355, + "step": 32420 + }, + { + "epoch": 5.24290679815698, + "grad_norm": 1.3999892473220825, + "learning_rate": 0.0002, + "loss": 0.4927, + "step": 32430 + }, + { + "epoch": 5.244523482337725, + "grad_norm": 1.0356311798095703, + "learning_rate": 0.0002, + "loss": 0.4795, + "step": 32440 + }, + { + "epoch": 5.246140166518471, + "grad_norm": 0.9655531644821167, + "learning_rate": 0.0002, + "loss": 0.5035, + "step": 32450 + }, + { + "epoch": 5.247756850699216, + "grad_norm": 1.0411828756332397, + "learning_rate": 0.0002, + "loss": 0.5166, + "step": 32460 + }, + { + "epoch": 5.249373534879961, + "grad_norm": 1.1199816465377808, + "learning_rate": 0.0002, + "loss": 0.5141, + "step": 32470 + }, + { + "epoch": 5.250990219060706, + "grad_norm": 1.260321855545044, + "learning_rate": 0.0002, + "loss": 0.4864, + "step": 32480 + }, + { + "epoch": 5.2526069032414515, + "grad_norm": 1.2950857877731323, + "learning_rate": 0.0002, + "loss": 0.4893, + "step": 32490 + }, + { + "epoch": 5.254223587422197, + "grad_norm": 0.8982820510864258, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 32500 + }, + { + "epoch": 5.255840271602942, + "grad_norm": 0.8512987494468689, + "learning_rate": 0.0002, + "loss": 0.5138, + "step": 32510 + }, + { + "epoch": 5.257456955783688, + "grad_norm": 1.067443609237671, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 32520 + }, + { + "epoch": 5.259073639964433, + "grad_norm": 1.0957417488098145, + "learning_rate": 0.0002, + "loss": 0.4928, + "step": 32530 + }, + { + "epoch": 5.260690324145179, + "grad_norm": 1.4161807298660278, + "learning_rate": 0.0002, + "loss": 0.5169, + "step": 32540 + }, + { + "epoch": 5.262307008325924, + "grad_norm": 1.2264093160629272, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 32550 + }, + { + "epoch": 5.263923692506669, + "grad_norm": 1.0015931129455566, + "learning_rate": 0.0002, + "loss": 0.5221, + "step": 32560 + }, + { + "epoch": 5.265540376687414, + "grad_norm": 1.0743094682693481, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 32570 + }, + { + "epoch": 5.2671570608681595, + "grad_norm": 1.1386840343475342, + "learning_rate": 0.0002, + "loss": 0.5289, + "step": 32580 + }, + { + "epoch": 5.268773745048905, + "grad_norm": 1.0093860626220703, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 32590 + }, + { + "epoch": 5.27039042922965, + "grad_norm": 0.9593744874000549, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 32600 + }, + { + "epoch": 5.272007113410395, + "grad_norm": 1.146021842956543, + "learning_rate": 0.0002, + "loss": 0.528, + "step": 32610 + }, + { + "epoch": 5.27362379759114, + "grad_norm": 0.9579031467437744, + "learning_rate": 0.0002, + "loss": 0.4983, + "step": 32620 + }, + { + "epoch": 5.275240481771886, + "grad_norm": 1.0548793077468872, + "learning_rate": 0.0002, + "loss": 0.5376, + "step": 32630 + }, + { + "epoch": 5.276857165952631, + "grad_norm": 1.0380561351776123, + "learning_rate": 0.0002, + "loss": 0.5267, + "step": 32640 + }, + { + "epoch": 5.278473850133376, + "grad_norm": 1.2119969129562378, + "learning_rate": 0.0002, + "loss": 0.5182, + "step": 32650 + }, + { + "epoch": 5.280090534314121, + "grad_norm": 1.0507797002792358, + "learning_rate": 0.0002, + "loss": 0.5298, + "step": 32660 + }, + { + "epoch": 5.2817072184948675, + "grad_norm": 1.0185176134109497, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 32670 + }, + { + "epoch": 5.283323902675613, + "grad_norm": 1.2358098030090332, + "learning_rate": 0.0002, + "loss": 0.4904, + "step": 32680 + }, + { + "epoch": 5.284940586856358, + "grad_norm": 0.7937114238739014, + "learning_rate": 0.0002, + "loss": 0.5169, + "step": 32690 + }, + { + "epoch": 5.286557271037103, + "grad_norm": 0.9825124740600586, + "learning_rate": 0.0002, + "loss": 0.495, + "step": 32700 + }, + { + "epoch": 5.288173955217848, + "grad_norm": 1.2059301137924194, + "learning_rate": 0.0002, + "loss": 0.5149, + "step": 32710 + }, + { + "epoch": 5.289790639398594, + "grad_norm": 1.0828571319580078, + "learning_rate": 0.0002, + "loss": 0.5272, + "step": 32720 + }, + { + "epoch": 5.291407323579339, + "grad_norm": 1.0129735469818115, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 32730 + }, + { + "epoch": 5.293024007760084, + "grad_norm": 1.0591634511947632, + "learning_rate": 0.0002, + "loss": 0.5216, + "step": 32740 + }, + { + "epoch": 5.294640691940829, + "grad_norm": 0.9256815910339355, + "learning_rate": 0.0002, + "loss": 0.522, + "step": 32750 + }, + { + "epoch": 5.2962573761215745, + "grad_norm": 1.0928633213043213, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 32760 + }, + { + "epoch": 5.29787406030232, + "grad_norm": 0.9415594935417175, + "learning_rate": 0.0002, + "loss": 0.5093, + "step": 32770 + }, + { + "epoch": 5.299490744483065, + "grad_norm": 1.141316294670105, + "learning_rate": 0.0002, + "loss": 0.5252, + "step": 32780 + }, + { + "epoch": 5.30110742866381, + "grad_norm": 1.0646510124206543, + "learning_rate": 0.0002, + "loss": 0.4837, + "step": 32790 + }, + { + "epoch": 5.3027241128445555, + "grad_norm": 1.189661979675293, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 32800 + }, + { + "epoch": 5.304340797025301, + "grad_norm": 0.9568731188774109, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 32810 + }, + { + "epoch": 5.305957481206047, + "grad_norm": 1.1556824445724487, + "learning_rate": 0.0002, + "loss": 0.5344, + "step": 32820 + }, + { + "epoch": 5.307574165386792, + "grad_norm": 0.9353463649749756, + "learning_rate": 0.0002, + "loss": 0.4894, + "step": 32830 + }, + { + "epoch": 5.309190849567537, + "grad_norm": 1.1208295822143555, + "learning_rate": 0.0002, + "loss": 0.5052, + "step": 32840 + }, + { + "epoch": 5.3108075337482825, + "grad_norm": 1.0894153118133545, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 32850 + }, + { + "epoch": 5.312424217929028, + "grad_norm": 1.090329647064209, + "learning_rate": 0.0002, + "loss": 0.5046, + "step": 32860 + }, + { + "epoch": 5.314040902109773, + "grad_norm": 1.0781712532043457, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 32870 + }, + { + "epoch": 5.315657586290518, + "grad_norm": 1.1785295009613037, + "learning_rate": 0.0002, + "loss": 0.57, + "step": 32880 + }, + { + "epoch": 5.317274270471263, + "grad_norm": 1.0406851768493652, + "learning_rate": 0.0002, + "loss": 0.4953, + "step": 32890 + }, + { + "epoch": 5.318890954652009, + "grad_norm": 1.0982953310012817, + "learning_rate": 0.0002, + "loss": 0.514, + "step": 32900 + }, + { + "epoch": 5.320507638832754, + "grad_norm": 1.2969383001327515, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 32910 + }, + { + "epoch": 5.322124323013499, + "grad_norm": 0.9687288999557495, + "learning_rate": 0.0002, + "loss": 0.4786, + "step": 32920 + }, + { + "epoch": 5.323741007194244, + "grad_norm": 1.136760950088501, + "learning_rate": 0.0002, + "loss": 0.5286, + "step": 32930 + }, + { + "epoch": 5.32535769137499, + "grad_norm": 1.3045495748519897, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 32940 + }, + { + "epoch": 5.326974375555735, + "grad_norm": 1.221675992012024, + "learning_rate": 0.0002, + "loss": 0.5413, + "step": 32950 + }, + { + "epoch": 5.32859105973648, + "grad_norm": 1.1380633115768433, + "learning_rate": 0.0002, + "loss": 0.4999, + "step": 32960 + }, + { + "epoch": 5.330207743917226, + "grad_norm": 1.1065956354141235, + "learning_rate": 0.0002, + "loss": 0.5037, + "step": 32970 + }, + { + "epoch": 5.331824428097971, + "grad_norm": 1.0187175273895264, + "learning_rate": 0.0002, + "loss": 0.4913, + "step": 32980 + }, + { + "epoch": 5.333441112278717, + "grad_norm": 0.9077118039131165, + "learning_rate": 0.0002, + "loss": 0.5234, + "step": 32990 + }, + { + "epoch": 5.335057796459462, + "grad_norm": 1.0092815160751343, + "learning_rate": 0.0002, + "loss": 0.5071, + "step": 33000 + }, + { + "epoch": 5.336674480640207, + "grad_norm": 1.0168777704238892, + "learning_rate": 0.0002, + "loss": 0.498, + "step": 33010 + }, + { + "epoch": 5.338291164820952, + "grad_norm": 0.996161937713623, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 33020 + }, + { + "epoch": 5.339907849001698, + "grad_norm": 0.794463038444519, + "learning_rate": 0.0002, + "loss": 0.5024, + "step": 33030 + }, + { + "epoch": 5.341524533182443, + "grad_norm": 0.9750674962997437, + "learning_rate": 0.0002, + "loss": 0.5112, + "step": 33040 + }, + { + "epoch": 5.343141217363188, + "grad_norm": 1.2770029306411743, + "learning_rate": 0.0002, + "loss": 0.528, + "step": 33050 + }, + { + "epoch": 5.344757901543933, + "grad_norm": 1.1500186920166016, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 33060 + }, + { + "epoch": 5.3463745857246785, + "grad_norm": 1.0726377964019775, + "learning_rate": 0.0002, + "loss": 0.4906, + "step": 33070 + }, + { + "epoch": 5.347991269905424, + "grad_norm": 0.9314153790473938, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 33080 + }, + { + "epoch": 5.349607954086169, + "grad_norm": 1.344988465309143, + "learning_rate": 0.0002, + "loss": 0.5434, + "step": 33090 + }, + { + "epoch": 5.351224638266914, + "grad_norm": 0.863196611404419, + "learning_rate": 0.0002, + "loss": 0.4874, + "step": 33100 + }, + { + "epoch": 5.352841322447659, + "grad_norm": 1.128100037574768, + "learning_rate": 0.0002, + "loss": 0.534, + "step": 33110 + }, + { + "epoch": 5.3544580066284055, + "grad_norm": 1.1673583984375, + "learning_rate": 0.0002, + "loss": 0.5293, + "step": 33120 + }, + { + "epoch": 5.356074690809151, + "grad_norm": 0.9416789412498474, + "learning_rate": 0.0002, + "loss": 0.4787, + "step": 33130 + }, + { + "epoch": 5.357691374989896, + "grad_norm": 1.1855236291885376, + "learning_rate": 0.0002, + "loss": 0.5155, + "step": 33140 + }, + { + "epoch": 5.359308059170641, + "grad_norm": 1.0415170192718506, + "learning_rate": 0.0002, + "loss": 0.515, + "step": 33150 + }, + { + "epoch": 5.3609247433513865, + "grad_norm": 0.9953004121780396, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 33160 + }, + { + "epoch": 5.362541427532132, + "grad_norm": 0.96138596534729, + "learning_rate": 0.0002, + "loss": 0.5305, + "step": 33170 + }, + { + "epoch": 5.364158111712877, + "grad_norm": 1.341979742050171, + "learning_rate": 0.0002, + "loss": 0.5064, + "step": 33180 + }, + { + "epoch": 5.365774795893622, + "grad_norm": 1.0136911869049072, + "learning_rate": 0.0002, + "loss": 0.4986, + "step": 33190 + }, + { + "epoch": 5.367391480074367, + "grad_norm": 0.8685575127601624, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 33200 + }, + { + "epoch": 5.369008164255113, + "grad_norm": 0.8833574652671814, + "learning_rate": 0.0002, + "loss": 0.5146, + "step": 33210 + }, + { + "epoch": 5.370624848435858, + "grad_norm": 0.9123612642288208, + "learning_rate": 0.0002, + "loss": 0.4982, + "step": 33220 + }, + { + "epoch": 5.372241532616603, + "grad_norm": 1.2720599174499512, + "learning_rate": 0.0002, + "loss": 0.5047, + "step": 33230 + }, + { + "epoch": 5.373858216797348, + "grad_norm": 1.0596648454666138, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 33240 + }, + { + "epoch": 5.3754749009780936, + "grad_norm": 1.119701623916626, + "learning_rate": 0.0002, + "loss": 0.5284, + "step": 33250 + }, + { + "epoch": 5.377091585158839, + "grad_norm": 1.3000061511993408, + "learning_rate": 0.0002, + "loss": 0.5217, + "step": 33260 + }, + { + "epoch": 5.378708269339585, + "grad_norm": 1.083891749382019, + "learning_rate": 0.0002, + "loss": 0.5125, + "step": 33270 + }, + { + "epoch": 5.38032495352033, + "grad_norm": 0.9402718544006348, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 33280 + }, + { + "epoch": 5.381941637701075, + "grad_norm": 1.3376892805099487, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 33290 + }, + { + "epoch": 5.383558321881821, + "grad_norm": 1.1600074768066406, + "learning_rate": 0.0002, + "loss": 0.5193, + "step": 33300 + }, + { + "epoch": 5.385175006062566, + "grad_norm": 1.1449427604675293, + "learning_rate": 0.0002, + "loss": 0.4907, + "step": 33310 + }, + { + "epoch": 5.386791690243311, + "grad_norm": 1.3118891716003418, + "learning_rate": 0.0002, + "loss": 0.5449, + "step": 33320 + }, + { + "epoch": 5.388408374424056, + "grad_norm": 0.743449866771698, + "learning_rate": 0.0002, + "loss": 0.547, + "step": 33330 + }, + { + "epoch": 5.3900250586048015, + "grad_norm": 0.9358304142951965, + "learning_rate": 0.0002, + "loss": 0.5555, + "step": 33340 + }, + { + "epoch": 5.391641742785547, + "grad_norm": 1.0447142124176025, + "learning_rate": 0.0002, + "loss": 0.5558, + "step": 33350 + }, + { + "epoch": 5.393258426966292, + "grad_norm": 1.1088626384735107, + "learning_rate": 0.0002, + "loss": 0.5106, + "step": 33360 + }, + { + "epoch": 5.394875111147037, + "grad_norm": 1.1267958879470825, + "learning_rate": 0.0002, + "loss": 0.4929, + "step": 33370 + }, + { + "epoch": 5.3964917953277824, + "grad_norm": 0.9709370136260986, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 33380 + }, + { + "epoch": 5.398108479508528, + "grad_norm": 1.0939103364944458, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 33390 + }, + { + "epoch": 5.399725163689273, + "grad_norm": 0.9559304714202881, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 33400 + }, + { + "epoch": 5.401341847870018, + "grad_norm": 1.199580430984497, + "learning_rate": 0.0002, + "loss": 0.5064, + "step": 33410 + }, + { + "epoch": 5.402958532050764, + "grad_norm": 0.9097000360488892, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 33420 + }, + { + "epoch": 5.4045752162315095, + "grad_norm": 1.1940981149673462, + "learning_rate": 0.0002, + "loss": 0.514, + "step": 33430 + }, + { + "epoch": 5.406191900412255, + "grad_norm": 1.0530916452407837, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 33440 + }, + { + "epoch": 5.407808584593, + "grad_norm": 1.0482549667358398, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 33450 + }, + { + "epoch": 5.409425268773745, + "grad_norm": 1.2524714469909668, + "learning_rate": 0.0002, + "loss": 0.501, + "step": 33460 + }, + { + "epoch": 5.41104195295449, + "grad_norm": 1.1091666221618652, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 33470 + }, + { + "epoch": 5.412658637135236, + "grad_norm": 0.9981587529182434, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 33480 + }, + { + "epoch": 5.414275321315981, + "grad_norm": 1.016681432723999, + "learning_rate": 0.0002, + "loss": 0.4977, + "step": 33490 + }, + { + "epoch": 5.415892005496726, + "grad_norm": 1.1456854343414307, + "learning_rate": 0.0002, + "loss": 0.5388, + "step": 33500 + }, + { + "epoch": 5.417508689677471, + "grad_norm": 1.1454259157180786, + "learning_rate": 0.0002, + "loss": 0.5292, + "step": 33510 + }, + { + "epoch": 5.419125373858217, + "grad_norm": 0.9858416318893433, + "learning_rate": 0.0002, + "loss": 0.5061, + "step": 33520 + }, + { + "epoch": 5.420742058038962, + "grad_norm": 0.9764766693115234, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 33530 + }, + { + "epoch": 5.422358742219707, + "grad_norm": 1.199920892715454, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 33540 + }, + { + "epoch": 5.423975426400452, + "grad_norm": 1.3107370138168335, + "learning_rate": 0.0002, + "loss": 0.5182, + "step": 33550 + }, + { + "epoch": 5.4255921105811975, + "grad_norm": 0.9637970328330994, + "learning_rate": 0.0002, + "loss": 0.5149, + "step": 33560 + }, + { + "epoch": 5.427208794761944, + "grad_norm": 1.023359775543213, + "learning_rate": 0.0002, + "loss": 0.526, + "step": 33570 + }, + { + "epoch": 5.428825478942689, + "grad_norm": 1.060417652130127, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 33580 + }, + { + "epoch": 5.430442163123434, + "grad_norm": 0.9971120953559875, + "learning_rate": 0.0002, + "loss": 0.5052, + "step": 33590 + }, + { + "epoch": 5.432058847304179, + "grad_norm": 0.9213743209838867, + "learning_rate": 0.0002, + "loss": 0.5044, + "step": 33600 + }, + { + "epoch": 5.4336755314849245, + "grad_norm": 1.1512309312820435, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 33610 + }, + { + "epoch": 5.43529221566567, + "grad_norm": 1.2198847532272339, + "learning_rate": 0.0002, + "loss": 0.5317, + "step": 33620 + }, + { + "epoch": 5.436908899846415, + "grad_norm": 1.0329595804214478, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 33630 + }, + { + "epoch": 5.43852558402716, + "grad_norm": 1.1075750589370728, + "learning_rate": 0.0002, + "loss": 0.5364, + "step": 33640 + }, + { + "epoch": 5.4401422682079055, + "grad_norm": 1.006342887878418, + "learning_rate": 0.0002, + "loss": 0.5295, + "step": 33650 + }, + { + "epoch": 5.441758952388651, + "grad_norm": 0.9179885983467102, + "learning_rate": 0.0002, + "loss": 0.5394, + "step": 33660 + }, + { + "epoch": 5.443375636569396, + "grad_norm": 1.2799493074417114, + "learning_rate": 0.0002, + "loss": 0.5124, + "step": 33670 + }, + { + "epoch": 5.444992320750141, + "grad_norm": 1.1153863668441772, + "learning_rate": 0.0002, + "loss": 0.5426, + "step": 33680 + }, + { + "epoch": 5.446609004930886, + "grad_norm": 1.0681028366088867, + "learning_rate": 0.0002, + "loss": 0.5087, + "step": 33690 + }, + { + "epoch": 5.448225689111632, + "grad_norm": 0.9788817167282104, + "learning_rate": 0.0002, + "loss": 0.5272, + "step": 33700 + }, + { + "epoch": 5.449842373292377, + "grad_norm": 0.8481608629226685, + "learning_rate": 0.0002, + "loss": 0.5308, + "step": 33710 + }, + { + "epoch": 5.451459057473123, + "grad_norm": 1.113756537437439, + "learning_rate": 0.0002, + "loss": 0.5225, + "step": 33720 + }, + { + "epoch": 5.453075741653868, + "grad_norm": 0.8425475358963013, + "learning_rate": 0.0002, + "loss": 0.5213, + "step": 33730 + }, + { + "epoch": 5.4546924258346134, + "grad_norm": 1.0852208137512207, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 33740 + }, + { + "epoch": 5.456309110015359, + "grad_norm": 1.1664748191833496, + "learning_rate": 0.0002, + "loss": 0.5535, + "step": 33750 + }, + { + "epoch": 5.457925794196104, + "grad_norm": 1.217241644859314, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 33760 + }, + { + "epoch": 5.459542478376849, + "grad_norm": 1.1572928428649902, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 33770 + }, + { + "epoch": 5.461159162557594, + "grad_norm": 1.0437318086624146, + "learning_rate": 0.0002, + "loss": 0.5161, + "step": 33780 + }, + { + "epoch": 5.46277584673834, + "grad_norm": 0.9807571768760681, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 33790 + }, + { + "epoch": 5.464392530919085, + "grad_norm": 1.1436342000961304, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 33800 + }, + { + "epoch": 5.46600921509983, + "grad_norm": 1.1004794836044312, + "learning_rate": 0.0002, + "loss": 0.5338, + "step": 33810 + }, + { + "epoch": 5.467625899280575, + "grad_norm": 1.2130268812179565, + "learning_rate": 0.0002, + "loss": 0.4868, + "step": 33820 + }, + { + "epoch": 5.4692425834613205, + "grad_norm": 1.3154419660568237, + "learning_rate": 0.0002, + "loss": 0.516, + "step": 33830 + }, + { + "epoch": 5.470859267642066, + "grad_norm": 0.7934383749961853, + "learning_rate": 0.0002, + "loss": 0.4934, + "step": 33840 + }, + { + "epoch": 5.472475951822812, + "grad_norm": 0.7838410139083862, + "learning_rate": 0.0002, + "loss": 0.5133, + "step": 33850 + }, + { + "epoch": 5.474092636003557, + "grad_norm": 1.0415139198303223, + "learning_rate": 0.0002, + "loss": 0.4926, + "step": 33860 + }, + { + "epoch": 5.475709320184302, + "grad_norm": 0.9213164448738098, + "learning_rate": 0.0002, + "loss": 0.5323, + "step": 33870 + }, + { + "epoch": 5.477326004365048, + "grad_norm": 1.0364776849746704, + "learning_rate": 0.0002, + "loss": 0.5125, + "step": 33880 + }, + { + "epoch": 5.478942688545793, + "grad_norm": 0.9994072318077087, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 33890 + }, + { + "epoch": 5.480559372726538, + "grad_norm": 1.196730136871338, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 33900 + }, + { + "epoch": 5.482176056907283, + "grad_norm": 0.9955780506134033, + "learning_rate": 0.0002, + "loss": 0.538, + "step": 33910 + }, + { + "epoch": 5.4837927410880285, + "grad_norm": 1.168188214302063, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 33920 + }, + { + "epoch": 5.485409425268774, + "grad_norm": 1.1816450357437134, + "learning_rate": 0.0002, + "loss": 0.5548, + "step": 33930 + }, + { + "epoch": 5.487026109449519, + "grad_norm": 1.079715609550476, + "learning_rate": 0.0002, + "loss": 0.5535, + "step": 33940 + }, + { + "epoch": 5.488642793630264, + "grad_norm": 1.153850793838501, + "learning_rate": 0.0002, + "loss": 0.5262, + "step": 33950 + }, + { + "epoch": 5.490259477811009, + "grad_norm": 1.0207297801971436, + "learning_rate": 0.0002, + "loss": 0.5248, + "step": 33960 + }, + { + "epoch": 5.491876161991755, + "grad_norm": 1.1290855407714844, + "learning_rate": 0.0002, + "loss": 0.5142, + "step": 33970 + }, + { + "epoch": 5.4934928461725, + "grad_norm": 1.068058967590332, + "learning_rate": 0.0002, + "loss": 0.5168, + "step": 33980 + }, + { + "epoch": 5.495109530353245, + "grad_norm": 0.9789979457855225, + "learning_rate": 0.0002, + "loss": 0.5317, + "step": 33990 + }, + { + "epoch": 5.496726214533991, + "grad_norm": 0.9696692824363708, + "learning_rate": 0.0002, + "loss": 0.5113, + "step": 34000 + }, + { + "epoch": 5.4983428987147365, + "grad_norm": 1.0539981126785278, + "learning_rate": 0.0002, + "loss": 0.5413, + "step": 34010 + }, + { + "epoch": 5.499959582895482, + "grad_norm": 1.0249929428100586, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 34020 + }, + { + "epoch": 5.501576267076227, + "grad_norm": 0.9577504992485046, + "learning_rate": 0.0002, + "loss": 0.4888, + "step": 34030 + }, + { + "epoch": 5.503192951256972, + "grad_norm": 1.0963513851165771, + "learning_rate": 0.0002, + "loss": 0.5291, + "step": 34040 + }, + { + "epoch": 5.504809635437717, + "grad_norm": 0.8339345455169678, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 34050 + }, + { + "epoch": 5.506426319618463, + "grad_norm": 1.0138782262802124, + "learning_rate": 0.0002, + "loss": 0.5191, + "step": 34060 + }, + { + "epoch": 5.508043003799208, + "grad_norm": 1.0180109739303589, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 34070 + }, + { + "epoch": 5.509659687979953, + "grad_norm": 1.2790818214416504, + "learning_rate": 0.0002, + "loss": 0.5083, + "step": 34080 + }, + { + "epoch": 5.511276372160698, + "grad_norm": 1.428247332572937, + "learning_rate": 0.0002, + "loss": 0.5195, + "step": 34090 + }, + { + "epoch": 5.5128930563414436, + "grad_norm": 1.0926059484481812, + "learning_rate": 0.0002, + "loss": 0.5291, + "step": 34100 + }, + { + "epoch": 5.514509740522189, + "grad_norm": 1.2353343963623047, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 34110 + }, + { + "epoch": 5.516126424702934, + "grad_norm": 0.935587465763092, + "learning_rate": 0.0002, + "loss": 0.5331, + "step": 34120 + }, + { + "epoch": 5.517743108883679, + "grad_norm": 0.9767586588859558, + "learning_rate": 0.0002, + "loss": 0.5512, + "step": 34130 + }, + { + "epoch": 5.5193597930644245, + "grad_norm": 1.1660610437393188, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 34140 + }, + { + "epoch": 5.520976477245171, + "grad_norm": 0.9828870892524719, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 34150 + }, + { + "epoch": 5.522593161425916, + "grad_norm": 1.0097278356552124, + "learning_rate": 0.0002, + "loss": 0.5198, + "step": 34160 + }, + { + "epoch": 5.524209845606661, + "grad_norm": 1.1766167879104614, + "learning_rate": 0.0002, + "loss": 0.5293, + "step": 34170 + }, + { + "epoch": 5.525826529787406, + "grad_norm": 0.982292115688324, + "learning_rate": 0.0002, + "loss": 0.5258, + "step": 34180 + }, + { + "epoch": 5.5274432139681515, + "grad_norm": 1.0744609832763672, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 34190 + }, + { + "epoch": 5.529059898148897, + "grad_norm": 1.3831160068511963, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 34200 + }, + { + "epoch": 5.530676582329642, + "grad_norm": 1.074771761894226, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 34210 + }, + { + "epoch": 5.532293266510387, + "grad_norm": 1.016652226448059, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 34220 + }, + { + "epoch": 5.5339099506911325, + "grad_norm": 1.2231552600860596, + "learning_rate": 0.0002, + "loss": 0.5158, + "step": 34230 + }, + { + "epoch": 5.535526634871878, + "grad_norm": 0.8051198720932007, + "learning_rate": 0.0002, + "loss": 0.5091, + "step": 34240 + }, + { + "epoch": 5.537143319052623, + "grad_norm": 1.1779674291610718, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 34250 + }, + { + "epoch": 5.538760003233368, + "grad_norm": 1.2468291521072388, + "learning_rate": 0.0002, + "loss": 0.5044, + "step": 34260 + }, + { + "epoch": 5.540376687414113, + "grad_norm": 1.14818274974823, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 34270 + }, + { + "epoch": 5.541993371594859, + "grad_norm": 1.2362616062164307, + "learning_rate": 0.0002, + "loss": 0.5375, + "step": 34280 + }, + { + "epoch": 5.543610055775604, + "grad_norm": 1.0206977128982544, + "learning_rate": 0.0002, + "loss": 0.4996, + "step": 34290 + }, + { + "epoch": 5.54522673995635, + "grad_norm": 1.2018457651138306, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 34300 + }, + { + "epoch": 5.546843424137095, + "grad_norm": 1.0349043607711792, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 34310 + }, + { + "epoch": 5.54846010831784, + "grad_norm": 1.2022006511688232, + "learning_rate": 0.0002, + "loss": 0.5231, + "step": 34320 + }, + { + "epoch": 5.550076792498586, + "grad_norm": 1.0810624361038208, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 34330 + }, + { + "epoch": 5.551693476679331, + "grad_norm": 1.3297529220581055, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 34340 + }, + { + "epoch": 5.553310160860076, + "grad_norm": 0.9722549915313721, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 34350 + }, + { + "epoch": 5.554926845040821, + "grad_norm": 0.9903425574302673, + "learning_rate": 0.0002, + "loss": 0.4823, + "step": 34360 + }, + { + "epoch": 5.556543529221567, + "grad_norm": 0.9568067789077759, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 34370 + }, + { + "epoch": 5.558160213402312, + "grad_norm": 1.113870620727539, + "learning_rate": 0.0002, + "loss": 0.5242, + "step": 34380 + }, + { + "epoch": 5.559776897583057, + "grad_norm": 1.0557632446289062, + "learning_rate": 0.0002, + "loss": 0.5278, + "step": 34390 + }, + { + "epoch": 5.561393581763802, + "grad_norm": 0.9615673422813416, + "learning_rate": 0.0002, + "loss": 0.5501, + "step": 34400 + }, + { + "epoch": 5.5630102659445475, + "grad_norm": 0.9536027312278748, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 34410 + }, + { + "epoch": 5.564626950125293, + "grad_norm": 0.8808749318122864, + "learning_rate": 0.0002, + "loss": 0.4949, + "step": 34420 + }, + { + "epoch": 5.566243634306038, + "grad_norm": 1.286132574081421, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 34430 + }, + { + "epoch": 5.567860318486783, + "grad_norm": 1.259644865989685, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 34440 + }, + { + "epoch": 5.569477002667529, + "grad_norm": 0.9920216798782349, + "learning_rate": 0.0002, + "loss": 0.4922, + "step": 34450 + }, + { + "epoch": 5.5710936868482746, + "grad_norm": 1.182926893234253, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 34460 + }, + { + "epoch": 5.57271037102902, + "grad_norm": 1.1434749364852905, + "learning_rate": 0.0002, + "loss": 0.5185, + "step": 34470 + }, + { + "epoch": 5.574327055209765, + "grad_norm": 1.2420979738235474, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 34480 + }, + { + "epoch": 5.57594373939051, + "grad_norm": 0.9338384866714478, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 34490 + }, + { + "epoch": 5.5775604235712555, + "grad_norm": 1.0196425914764404, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 34500 + }, + { + "epoch": 5.579177107752001, + "grad_norm": 0.9586997032165527, + "learning_rate": 0.0002, + "loss": 0.5132, + "step": 34510 + }, + { + "epoch": 5.580793791932746, + "grad_norm": 1.2409086227416992, + "learning_rate": 0.0002, + "loss": 0.5336, + "step": 34520 + }, + { + "epoch": 5.582410476113491, + "grad_norm": 1.1483757495880127, + "learning_rate": 0.0002, + "loss": 0.5364, + "step": 34530 + }, + { + "epoch": 5.584027160294236, + "grad_norm": 1.1624305248260498, + "learning_rate": 0.0002, + "loss": 0.5325, + "step": 34540 + }, + { + "epoch": 5.585643844474982, + "grad_norm": 1.2635223865509033, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 34550 + }, + { + "epoch": 5.587260528655727, + "grad_norm": 0.9824051856994629, + "learning_rate": 0.0002, + "loss": 0.4924, + "step": 34560 + }, + { + "epoch": 5.588877212836472, + "grad_norm": 1.0858620405197144, + "learning_rate": 0.0002, + "loss": 0.5395, + "step": 34570 + }, + { + "epoch": 5.590493897017217, + "grad_norm": 1.1452655792236328, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 34580 + }, + { + "epoch": 5.592110581197963, + "grad_norm": 1.110610842704773, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 34590 + }, + { + "epoch": 5.593727265378709, + "grad_norm": 0.9976194500923157, + "learning_rate": 0.0002, + "loss": 0.5285, + "step": 34600 + }, + { + "epoch": 5.595343949559454, + "grad_norm": 1.0698920488357544, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 34610 + }, + { + "epoch": 5.596960633740199, + "grad_norm": 1.1505171060562134, + "learning_rate": 0.0002, + "loss": 0.5311, + "step": 34620 + }, + { + "epoch": 5.598577317920944, + "grad_norm": 1.1014643907546997, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 34630 + }, + { + "epoch": 5.60019400210169, + "grad_norm": 0.915595293045044, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 34640 + }, + { + "epoch": 5.601810686282435, + "grad_norm": 1.1856765747070312, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 34650 + }, + { + "epoch": 5.60342737046318, + "grad_norm": 1.1357687711715698, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 34660 + }, + { + "epoch": 5.605044054643925, + "grad_norm": 1.0232492685317993, + "learning_rate": 0.0002, + "loss": 0.5034, + "step": 34670 + }, + { + "epoch": 5.6066607388246705, + "grad_norm": 0.9375017881393433, + "learning_rate": 0.0002, + "loss": 0.5357, + "step": 34680 + }, + { + "epoch": 5.608277423005416, + "grad_norm": 1.0796529054641724, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 34690 + }, + { + "epoch": 5.609894107186161, + "grad_norm": 1.1383336782455444, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 34700 + }, + { + "epoch": 5.611510791366906, + "grad_norm": 1.0248544216156006, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 34710 + }, + { + "epoch": 5.6131274755476515, + "grad_norm": 1.0986040830612183, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 34720 + }, + { + "epoch": 5.614744159728397, + "grad_norm": 1.2689568996429443, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 34730 + }, + { + "epoch": 5.616360843909142, + "grad_norm": 1.4044264554977417, + "learning_rate": 0.0002, + "loss": 0.5136, + "step": 34740 + }, + { + "epoch": 5.617977528089888, + "grad_norm": 1.2084474563598633, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 34750 + }, + { + "epoch": 5.619594212270633, + "grad_norm": 1.061248540878296, + "learning_rate": 0.0002, + "loss": 0.5377, + "step": 34760 + }, + { + "epoch": 5.6212108964513785, + "grad_norm": 1.0220764875411987, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 34770 + }, + { + "epoch": 5.622827580632124, + "grad_norm": 1.0859092473983765, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 34780 + }, + { + "epoch": 5.624444264812869, + "grad_norm": 0.9049732089042664, + "learning_rate": 0.0002, + "loss": 0.5308, + "step": 34790 + }, + { + "epoch": 5.626060948993614, + "grad_norm": 1.2103937864303589, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 34800 + }, + { + "epoch": 5.627677633174359, + "grad_norm": 0.9854230284690857, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 34810 + }, + { + "epoch": 5.629294317355105, + "grad_norm": 0.9316635131835938, + "learning_rate": 0.0002, + "loss": 0.5274, + "step": 34820 + }, + { + "epoch": 5.63091100153585, + "grad_norm": 1.105296015739441, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 34830 + }, + { + "epoch": 5.632527685716595, + "grad_norm": 0.993383526802063, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 34840 + }, + { + "epoch": 5.63414436989734, + "grad_norm": 1.1544116735458374, + "learning_rate": 0.0002, + "loss": 0.5375, + "step": 34850 + }, + { + "epoch": 5.635761054078086, + "grad_norm": 1.284475326538086, + "learning_rate": 0.0002, + "loss": 0.5448, + "step": 34860 + }, + { + "epoch": 5.637377738258831, + "grad_norm": 1.121997594833374, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 34870 + }, + { + "epoch": 5.638994422439576, + "grad_norm": 1.213040828704834, + "learning_rate": 0.0002, + "loss": 0.5335, + "step": 34880 + }, + { + "epoch": 5.640611106620321, + "grad_norm": 1.23222017288208, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 34890 + }, + { + "epoch": 5.642227790801067, + "grad_norm": 0.9793637990951538, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 34900 + }, + { + "epoch": 5.643844474981813, + "grad_norm": 1.38919997215271, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 34910 + }, + { + "epoch": 5.645461159162558, + "grad_norm": 0.8390951156616211, + "learning_rate": 0.0002, + "loss": 0.5007, + "step": 34920 + }, + { + "epoch": 5.647077843343303, + "grad_norm": 0.9465909004211426, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 34930 + }, + { + "epoch": 5.648694527524048, + "grad_norm": 1.066957712173462, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 34940 + }, + { + "epoch": 5.650311211704794, + "grad_norm": 0.9842154383659363, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 34950 + }, + { + "epoch": 5.651927895885539, + "grad_norm": 1.1766440868377686, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 34960 + }, + { + "epoch": 5.653544580066284, + "grad_norm": 0.9061306118965149, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 34970 + }, + { + "epoch": 5.655161264247029, + "grad_norm": 1.2941309213638306, + "learning_rate": 0.0002, + "loss": 0.5446, + "step": 34980 + }, + { + "epoch": 5.6567779484277745, + "grad_norm": 0.9741247892379761, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 34990 + }, + { + "epoch": 5.65839463260852, + "grad_norm": 1.0784187316894531, + "learning_rate": 0.0002, + "loss": 0.5152, + "step": 35000 + }, + { + "epoch": 5.660011316789265, + "grad_norm": 0.937889814376831, + "learning_rate": 0.0002, + "loss": 0.5363, + "step": 35010 + }, + { + "epoch": 5.66162800097001, + "grad_norm": 0.9667879939079285, + "learning_rate": 0.0002, + "loss": 0.5019, + "step": 35020 + }, + { + "epoch": 5.663244685150756, + "grad_norm": 1.0554876327514648, + "learning_rate": 0.0002, + "loss": 0.5209, + "step": 35030 + }, + { + "epoch": 5.664861369331501, + "grad_norm": 1.2030539512634277, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 35040 + }, + { + "epoch": 5.666478053512247, + "grad_norm": 1.0849953889846802, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 35050 + }, + { + "epoch": 5.668094737692992, + "grad_norm": 1.1598973274230957, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 35060 + }, + { + "epoch": 5.669711421873737, + "grad_norm": 1.0233359336853027, + "learning_rate": 0.0002, + "loss": 0.5488, + "step": 35070 + }, + { + "epoch": 5.6713281060544825, + "grad_norm": 1.1124799251556396, + "learning_rate": 0.0002, + "loss": 0.5409, + "step": 35080 + }, + { + "epoch": 5.672944790235228, + "grad_norm": 1.2351475954055786, + "learning_rate": 0.0002, + "loss": 0.5578, + "step": 35090 + }, + { + "epoch": 5.674561474415973, + "grad_norm": 1.0240728855133057, + "learning_rate": 0.0002, + "loss": 0.5638, + "step": 35100 + }, + { + "epoch": 5.676178158596718, + "grad_norm": 1.0223692655563354, + "learning_rate": 0.0002, + "loss": 0.5192, + "step": 35110 + }, + { + "epoch": 5.677794842777463, + "grad_norm": 1.4569132328033447, + "learning_rate": 0.0002, + "loss": 0.524, + "step": 35120 + }, + { + "epoch": 5.679411526958209, + "grad_norm": 0.8983587026596069, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 35130 + }, + { + "epoch": 5.681028211138954, + "grad_norm": 1.0775383710861206, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 35140 + }, + { + "epoch": 5.682644895319699, + "grad_norm": 0.9800270795822144, + "learning_rate": 0.0002, + "loss": 0.5289, + "step": 35150 + }, + { + "epoch": 5.684261579500444, + "grad_norm": 0.9858237504959106, + "learning_rate": 0.0002, + "loss": 0.533, + "step": 35160 + }, + { + "epoch": 5.6858782636811895, + "grad_norm": 1.031087040901184, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 35170 + }, + { + "epoch": 5.687494947861936, + "grad_norm": 1.0294365882873535, + "learning_rate": 0.0002, + "loss": 0.5528, + "step": 35180 + }, + { + "epoch": 5.68911163204268, + "grad_norm": 1.108144760131836, + "learning_rate": 0.0002, + "loss": 0.5581, + "step": 35190 + }, + { + "epoch": 5.690728316223426, + "grad_norm": 1.0813100337982178, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 35200 + }, + { + "epoch": 5.692345000404171, + "grad_norm": 1.3146867752075195, + "learning_rate": 0.0002, + "loss": 0.5429, + "step": 35210 + }, + { + "epoch": 5.693961684584917, + "grad_norm": 1.16780424118042, + "learning_rate": 0.0002, + "loss": 0.5297, + "step": 35220 + }, + { + "epoch": 5.695578368765662, + "grad_norm": 0.9929125905036926, + "learning_rate": 0.0002, + "loss": 0.577, + "step": 35230 + }, + { + "epoch": 5.697195052946407, + "grad_norm": 0.9049441814422607, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 35240 + }, + { + "epoch": 5.698811737127152, + "grad_norm": 0.9768866300582886, + "learning_rate": 0.0002, + "loss": 0.5349, + "step": 35250 + }, + { + "epoch": 5.7004284213078975, + "grad_norm": 0.8306029438972473, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 35260 + }, + { + "epoch": 5.702045105488643, + "grad_norm": 0.8417280316352844, + "learning_rate": 0.0002, + "loss": 0.4771, + "step": 35270 + }, + { + "epoch": 5.703661789669388, + "grad_norm": 0.9954485893249512, + "learning_rate": 0.0002, + "loss": 0.574, + "step": 35280 + }, + { + "epoch": 5.705278473850133, + "grad_norm": 1.2417993545532227, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 35290 + }, + { + "epoch": 5.706895158030878, + "grad_norm": 1.1696544885635376, + "learning_rate": 0.0002, + "loss": 0.5275, + "step": 35300 + }, + { + "epoch": 5.708511842211624, + "grad_norm": 1.2424817085266113, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 35310 + }, + { + "epoch": 5.710128526392369, + "grad_norm": 1.1791106462478638, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 35320 + }, + { + "epoch": 5.711745210573115, + "grad_norm": 1.202181339263916, + "learning_rate": 0.0002, + "loss": 0.5076, + "step": 35330 + }, + { + "epoch": 5.713361894753859, + "grad_norm": 1.1006861925125122, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 35340 + }, + { + "epoch": 5.7149785789346055, + "grad_norm": 1.0918344259262085, + "learning_rate": 0.0002, + "loss": 0.5627, + "step": 35350 + }, + { + "epoch": 5.716595263115351, + "grad_norm": 1.0427305698394775, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 35360 + }, + { + "epoch": 5.718211947296096, + "grad_norm": 1.0818872451782227, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 35370 + }, + { + "epoch": 5.719828631476841, + "grad_norm": 1.186006784439087, + "learning_rate": 0.0002, + "loss": 0.5296, + "step": 35380 + }, + { + "epoch": 5.721445315657586, + "grad_norm": 1.2073674201965332, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 35390 + }, + { + "epoch": 5.723061999838332, + "grad_norm": 1.065338134765625, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 35400 + }, + { + "epoch": 5.724678684019077, + "grad_norm": 0.9448973536491394, + "learning_rate": 0.0002, + "loss": 0.5195, + "step": 35410 + }, + { + "epoch": 5.726295368199822, + "grad_norm": 1.1487499475479126, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 35420 + }, + { + "epoch": 5.727912052380567, + "grad_norm": 1.1334216594696045, + "learning_rate": 0.0002, + "loss": 0.5435, + "step": 35430 + }, + { + "epoch": 5.729528736561313, + "grad_norm": 1.1932826042175293, + "learning_rate": 0.0002, + "loss": 0.5074, + "step": 35440 + }, + { + "epoch": 5.731145420742058, + "grad_norm": 1.2615786790847778, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 35450 + }, + { + "epoch": 5.732762104922803, + "grad_norm": 1.2803694009780884, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 35460 + }, + { + "epoch": 5.734378789103548, + "grad_norm": 0.9271906614303589, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 35470 + }, + { + "epoch": 5.735995473284294, + "grad_norm": 1.0958917140960693, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 35480 + }, + { + "epoch": 5.737612157465039, + "grad_norm": 1.1072784662246704, + "learning_rate": 0.0002, + "loss": 0.538, + "step": 35490 + }, + { + "epoch": 5.739228841645785, + "grad_norm": 1.1641002893447876, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 35500 + }, + { + "epoch": 5.74084552582653, + "grad_norm": 1.0246447324752808, + "learning_rate": 0.0002, + "loss": 0.5252, + "step": 35510 + }, + { + "epoch": 5.742462210007275, + "grad_norm": 1.032474398612976, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 35520 + }, + { + "epoch": 5.7440788941880205, + "grad_norm": 1.1600854396820068, + "learning_rate": 0.0002, + "loss": 0.4965, + "step": 35530 + }, + { + "epoch": 5.745695578368766, + "grad_norm": 1.0686054229736328, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 35540 + }, + { + "epoch": 5.747312262549511, + "grad_norm": 1.2314637899398804, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 35550 + }, + { + "epoch": 5.748928946730256, + "grad_norm": 0.922134280204773, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 35560 + }, + { + "epoch": 5.7505456309110015, + "grad_norm": 0.933043360710144, + "learning_rate": 0.0002, + "loss": 0.5495, + "step": 35570 + }, + { + "epoch": 5.752162315091747, + "grad_norm": 1.1911931037902832, + "learning_rate": 0.0002, + "loss": 0.5007, + "step": 35580 + }, + { + "epoch": 5.753778999272492, + "grad_norm": 0.8984857797622681, + "learning_rate": 0.0002, + "loss": 0.5244, + "step": 35590 + }, + { + "epoch": 5.755395683453237, + "grad_norm": 0.9495107531547546, + "learning_rate": 0.0002, + "loss": 0.5493, + "step": 35600 + }, + { + "epoch": 5.757012367633982, + "grad_norm": 1.2805472612380981, + "learning_rate": 0.0002, + "loss": 0.5326, + "step": 35610 + }, + { + "epoch": 5.758629051814728, + "grad_norm": 1.1236625909805298, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 35620 + }, + { + "epoch": 5.760245735995474, + "grad_norm": 1.0552798509597778, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 35630 + }, + { + "epoch": 5.761862420176218, + "grad_norm": 1.119909644126892, + "learning_rate": 0.0002, + "loss": 0.5479, + "step": 35640 + }, + { + "epoch": 5.763479104356964, + "grad_norm": 0.8786116242408752, + "learning_rate": 0.0002, + "loss": 0.5282, + "step": 35650 + }, + { + "epoch": 5.765095788537709, + "grad_norm": 1.2417117357254028, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 35660 + }, + { + "epoch": 5.766712472718455, + "grad_norm": 1.255200982093811, + "learning_rate": 0.0002, + "loss": 0.537, + "step": 35670 + }, + { + "epoch": 5.7683291568992, + "grad_norm": 1.0611358880996704, + "learning_rate": 0.0002, + "loss": 0.5308, + "step": 35680 + }, + { + "epoch": 5.769945841079945, + "grad_norm": 1.1443911790847778, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 35690 + }, + { + "epoch": 5.77156252526069, + "grad_norm": 1.1437989473342896, + "learning_rate": 0.0002, + "loss": 0.5386, + "step": 35700 + }, + { + "epoch": 5.773179209441436, + "grad_norm": 1.1375046968460083, + "learning_rate": 0.0002, + "loss": 0.537, + "step": 35710 + }, + { + "epoch": 5.774795893622181, + "grad_norm": 1.0777729749679565, + "learning_rate": 0.0002, + "loss": 0.5198, + "step": 35720 + }, + { + "epoch": 5.776412577802926, + "grad_norm": 1.1160215139389038, + "learning_rate": 0.0002, + "loss": 0.5521, + "step": 35730 + }, + { + "epoch": 5.778029261983671, + "grad_norm": 1.1268514394760132, + "learning_rate": 0.0002, + "loss": 0.5569, + "step": 35740 + }, + { + "epoch": 5.7796459461644165, + "grad_norm": 1.2752262353897095, + "learning_rate": 0.0002, + "loss": 0.5311, + "step": 35750 + }, + { + "epoch": 5.781262630345162, + "grad_norm": 1.0416184663772583, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 35760 + }, + { + "epoch": 5.782879314525907, + "grad_norm": 1.0622444152832031, + "learning_rate": 0.0002, + "loss": 0.5438, + "step": 35770 + }, + { + "epoch": 5.784495998706653, + "grad_norm": 1.1217877864837646, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 35780 + }, + { + "epoch": 5.786112682887398, + "grad_norm": 0.9363139867782593, + "learning_rate": 0.0002, + "loss": 0.5225, + "step": 35790 + }, + { + "epoch": 5.787729367068144, + "grad_norm": 0.96628737449646, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 35800 + }, + { + "epoch": 5.789346051248889, + "grad_norm": 0.9572572112083435, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 35810 + }, + { + "epoch": 5.790962735429634, + "grad_norm": 0.938724935054779, + "learning_rate": 0.0002, + "loss": 0.5615, + "step": 35820 + }, + { + "epoch": 5.792579419610379, + "grad_norm": 1.3314417600631714, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 35830 + }, + { + "epoch": 5.7941961037911245, + "grad_norm": 1.0097602605819702, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 35840 + }, + { + "epoch": 5.79581278797187, + "grad_norm": 1.1265122890472412, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 35850 + }, + { + "epoch": 5.797429472152615, + "grad_norm": 1.2191909551620483, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 35860 + }, + { + "epoch": 5.79904615633336, + "grad_norm": 0.9690808057785034, + "learning_rate": 0.0002, + "loss": 0.5274, + "step": 35870 + }, + { + "epoch": 5.800662840514105, + "grad_norm": 1.0871665477752686, + "learning_rate": 0.0002, + "loss": 0.5425, + "step": 35880 + }, + { + "epoch": 5.802279524694851, + "grad_norm": 1.1093597412109375, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 35890 + }, + { + "epoch": 5.803896208875596, + "grad_norm": 1.2434282302856445, + "learning_rate": 0.0002, + "loss": 0.5475, + "step": 35900 + }, + { + "epoch": 5.805512893056341, + "grad_norm": 1.2933623790740967, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 35910 + }, + { + "epoch": 5.807129577237086, + "grad_norm": 1.0005441904067993, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 35920 + }, + { + "epoch": 5.8087462614178325, + "grad_norm": 1.2373108863830566, + "learning_rate": 0.0002, + "loss": 0.5318, + "step": 35930 + }, + { + "epoch": 5.810362945598578, + "grad_norm": 1.2622692584991455, + "learning_rate": 0.0002, + "loss": 0.5413, + "step": 35940 + }, + { + "epoch": 5.811979629779323, + "grad_norm": 1.0112963914871216, + "learning_rate": 0.0002, + "loss": 0.5558, + "step": 35950 + }, + { + "epoch": 5.813596313960068, + "grad_norm": 1.050572395324707, + "learning_rate": 0.0002, + "loss": 0.5115, + "step": 35960 + }, + { + "epoch": 5.815212998140813, + "grad_norm": 0.9774560928344727, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 35970 + }, + { + "epoch": 5.816829682321559, + "grad_norm": 1.19438898563385, + "learning_rate": 0.0002, + "loss": 0.585, + "step": 35980 + }, + { + "epoch": 5.818446366502304, + "grad_norm": 1.0267130136489868, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 35990 + }, + { + "epoch": 5.820063050683049, + "grad_norm": 0.9813851714134216, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 36000 + }, + { + "epoch": 5.821679734863794, + "grad_norm": 0.9177457094192505, + "learning_rate": 0.0002, + "loss": 0.5138, + "step": 36010 + }, + { + "epoch": 5.8232964190445395, + "grad_norm": 1.0020731687545776, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 36020 + }, + { + "epoch": 5.824913103225285, + "grad_norm": 1.073222041130066, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 36030 + }, + { + "epoch": 5.82652978740603, + "grad_norm": 1.016337513923645, + "learning_rate": 0.0002, + "loss": 0.5539, + "step": 36040 + }, + { + "epoch": 5.828146471586775, + "grad_norm": 1.267364263534546, + "learning_rate": 0.0002, + "loss": 0.5592, + "step": 36050 + }, + { + "epoch": 5.8297631557675205, + "grad_norm": 1.2730127573013306, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 36060 + }, + { + "epoch": 5.831379839948266, + "grad_norm": 1.108442783355713, + "learning_rate": 0.0002, + "loss": 0.5247, + "step": 36070 + }, + { + "epoch": 5.832996524129012, + "grad_norm": 1.198072075843811, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 36080 + }, + { + "epoch": 5.834613208309757, + "grad_norm": 1.0458786487579346, + "learning_rate": 0.0002, + "loss": 0.5479, + "step": 36090 + }, + { + "epoch": 5.836229892490502, + "grad_norm": 0.9096664786338806, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 36100 + }, + { + "epoch": 5.8378465766712475, + "grad_norm": 0.9957793951034546, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 36110 + }, + { + "epoch": 5.839463260851993, + "grad_norm": 1.3693058490753174, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 36120 + }, + { + "epoch": 5.841079945032738, + "grad_norm": 1.268608808517456, + "learning_rate": 0.0002, + "loss": 0.5425, + "step": 36130 + }, + { + "epoch": 5.842696629213483, + "grad_norm": 0.8516020178794861, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 36140 + }, + { + "epoch": 5.844313313394228, + "grad_norm": 0.90385502576828, + "learning_rate": 0.0002, + "loss": 0.5475, + "step": 36150 + }, + { + "epoch": 5.845929997574974, + "grad_norm": 1.0910571813583374, + "learning_rate": 0.0002, + "loss": 0.5274, + "step": 36160 + }, + { + "epoch": 5.847546681755719, + "grad_norm": 0.9417795538902283, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 36170 + }, + { + "epoch": 5.849163365936464, + "grad_norm": 1.0027360916137695, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 36180 + }, + { + "epoch": 5.850780050117209, + "grad_norm": 1.1480516195297241, + "learning_rate": 0.0002, + "loss": 0.5423, + "step": 36190 + }, + { + "epoch": 5.852396734297955, + "grad_norm": 1.2431457042694092, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 36200 + }, + { + "epoch": 5.8540134184787, + "grad_norm": 1.091465950012207, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 36210 + }, + { + "epoch": 5.855630102659445, + "grad_norm": 0.9693930745124817, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 36220 + }, + { + "epoch": 5.857246786840191, + "grad_norm": 0.9937465190887451, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 36230 + }, + { + "epoch": 5.858863471020936, + "grad_norm": 1.0731011629104614, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 36240 + }, + { + "epoch": 5.860480155201682, + "grad_norm": 1.0869048833847046, + "learning_rate": 0.0002, + "loss": 0.5687, + "step": 36250 + }, + { + "epoch": 5.862096839382427, + "grad_norm": 0.9226390719413757, + "learning_rate": 0.0002, + "loss": 0.5576, + "step": 36260 + }, + { + "epoch": 5.863713523563172, + "grad_norm": 1.1755430698394775, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 36270 + }, + { + "epoch": 5.865330207743917, + "grad_norm": 0.8815974593162537, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 36280 + }, + { + "epoch": 5.866946891924663, + "grad_norm": 1.3648751974105835, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 36290 + }, + { + "epoch": 5.868563576105408, + "grad_norm": 0.8729211091995239, + "learning_rate": 0.0002, + "loss": 0.536, + "step": 36300 + }, + { + "epoch": 5.870180260286153, + "grad_norm": 1.0870907306671143, + "learning_rate": 0.0002, + "loss": 0.5192, + "step": 36310 + }, + { + "epoch": 5.871796944466898, + "grad_norm": 1.1164259910583496, + "learning_rate": 0.0002, + "loss": 0.5609, + "step": 36320 + }, + { + "epoch": 5.8734136286476435, + "grad_norm": 1.1572535037994385, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 36330 + }, + { + "epoch": 5.875030312828389, + "grad_norm": 1.0456238985061646, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 36340 + }, + { + "epoch": 5.876646997009134, + "grad_norm": 1.1310722827911377, + "learning_rate": 0.0002, + "loss": 0.5008, + "step": 36350 + }, + { + "epoch": 5.878263681189879, + "grad_norm": 1.0004712343215942, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 36360 + }, + { + "epoch": 5.879880365370624, + "grad_norm": 1.0991777181625366, + "learning_rate": 0.0002, + "loss": 0.5632, + "step": 36370 + }, + { + "epoch": 5.8814970495513705, + "grad_norm": 1.2789239883422852, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 36380 + }, + { + "epoch": 5.883113733732116, + "grad_norm": 0.9524819850921631, + "learning_rate": 0.0002, + "loss": 0.56, + "step": 36390 + }, + { + "epoch": 5.884730417912861, + "grad_norm": 1.1115771532058716, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 36400 + }, + { + "epoch": 5.886347102093606, + "grad_norm": 1.37419855594635, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 36410 + }, + { + "epoch": 5.8879637862743515, + "grad_norm": 1.1449527740478516, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 36420 + }, + { + "epoch": 5.889580470455097, + "grad_norm": 1.198046326637268, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 36430 + }, + { + "epoch": 5.891197154635842, + "grad_norm": 1.0180530548095703, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 36440 + }, + { + "epoch": 5.892813838816587, + "grad_norm": 1.0516417026519775, + "learning_rate": 0.0002, + "loss": 0.5319, + "step": 36450 + }, + { + "epoch": 5.894430522997332, + "grad_norm": 1.1658052206039429, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 36460 + }, + { + "epoch": 5.896047207178078, + "grad_norm": 1.190699577331543, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 36470 + }, + { + "epoch": 5.897663891358823, + "grad_norm": 1.1235495805740356, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 36480 + }, + { + "epoch": 5.899280575539568, + "grad_norm": 1.1926926374435425, + "learning_rate": 0.0002, + "loss": 0.5284, + "step": 36490 + }, + { + "epoch": 5.900897259720313, + "grad_norm": 1.1184662580490112, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 36500 + }, + { + "epoch": 5.9025139439010585, + "grad_norm": 1.000970721244812, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 36510 + }, + { + "epoch": 5.904130628081804, + "grad_norm": 1.0373306274414062, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 36520 + }, + { + "epoch": 5.90574731226255, + "grad_norm": 1.0840669870376587, + "learning_rate": 0.0002, + "loss": 0.535, + "step": 36530 + }, + { + "epoch": 5.907363996443295, + "grad_norm": 0.9908381104469299, + "learning_rate": 0.0002, + "loss": 0.538, + "step": 36540 + }, + { + "epoch": 5.90898068062404, + "grad_norm": 1.0456029176712036, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 36550 + }, + { + "epoch": 5.910597364804786, + "grad_norm": 1.1381454467773438, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 36560 + }, + { + "epoch": 5.912214048985531, + "grad_norm": 0.9440900087356567, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 36570 + }, + { + "epoch": 5.913830733166276, + "grad_norm": 1.1674573421478271, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 36580 + }, + { + "epoch": 5.915447417347021, + "grad_norm": 1.1226966381072998, + "learning_rate": 0.0002, + "loss": 0.526, + "step": 36590 + }, + { + "epoch": 5.9170641015277665, + "grad_norm": 0.9696915745735168, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 36600 + }, + { + "epoch": 5.918680785708512, + "grad_norm": 0.9593005180358887, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 36610 + }, + { + "epoch": 5.920297469889257, + "grad_norm": 1.122169852256775, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 36620 + }, + { + "epoch": 5.921914154070002, + "grad_norm": 0.9923415780067444, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 36630 + }, + { + "epoch": 5.923530838250747, + "grad_norm": 1.063838005065918, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 36640 + }, + { + "epoch": 5.925147522431493, + "grad_norm": 0.9083505272865295, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 36650 + }, + { + "epoch": 5.926764206612239, + "grad_norm": 0.9439437985420227, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 36660 + }, + { + "epoch": 5.928380890792983, + "grad_norm": 0.9778534173965454, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 36670 + }, + { + "epoch": 5.929997574973729, + "grad_norm": 0.9723961353302002, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 36680 + }, + { + "epoch": 5.9316142591544745, + "grad_norm": 1.162333607673645, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 36690 + }, + { + "epoch": 5.93323094333522, + "grad_norm": 1.2784897089004517, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 36700 + }, + { + "epoch": 5.934847627515965, + "grad_norm": 1.0924867391586304, + "learning_rate": 0.0002, + "loss": 0.5343, + "step": 36710 + }, + { + "epoch": 5.93646431169671, + "grad_norm": 1.046922206878662, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 36720 + }, + { + "epoch": 5.938080995877455, + "grad_norm": 0.8632535338401794, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 36730 + }, + { + "epoch": 5.939697680058201, + "grad_norm": 1.358762502670288, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 36740 + }, + { + "epoch": 5.941314364238946, + "grad_norm": 1.2058624029159546, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 36750 + }, + { + "epoch": 5.942931048419691, + "grad_norm": 1.1396408081054688, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 36760 + }, + { + "epoch": 5.944547732600436, + "grad_norm": 1.1510354280471802, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 36770 + }, + { + "epoch": 5.946164416781182, + "grad_norm": 1.1401607990264893, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 36780 + }, + { + "epoch": 5.947781100961927, + "grad_norm": 1.1871325969696045, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 36790 + }, + { + "epoch": 5.949397785142672, + "grad_norm": 0.9928333163261414, + "learning_rate": 0.0002, + "loss": 0.4945, + "step": 36800 + }, + { + "epoch": 5.951014469323418, + "grad_norm": 1.0549445152282715, + "learning_rate": 0.0002, + "loss": 0.5303, + "step": 36810 + }, + { + "epoch": 5.9526311535041625, + "grad_norm": 0.9791563749313354, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 36820 + }, + { + "epoch": 5.954247837684909, + "grad_norm": 1.1268441677093506, + "learning_rate": 0.0002, + "loss": 0.5317, + "step": 36830 + }, + { + "epoch": 5.955864521865654, + "grad_norm": 1.0533992052078247, + "learning_rate": 0.0002, + "loss": 0.5585, + "step": 36840 + }, + { + "epoch": 5.957481206046399, + "grad_norm": 1.023358941078186, + "learning_rate": 0.0002, + "loss": 0.4972, + "step": 36850 + }, + { + "epoch": 5.959097890227144, + "grad_norm": 1.2631961107254028, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 36860 + }, + { + "epoch": 5.9607145744078895, + "grad_norm": 0.9397698640823364, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 36870 + }, + { + "epoch": 5.962331258588635, + "grad_norm": 1.1678427457809448, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 36880 + }, + { + "epoch": 5.96394794276938, + "grad_norm": 1.1403759717941284, + "learning_rate": 0.0002, + "loss": 0.5435, + "step": 36890 + }, + { + "epoch": 5.965564626950125, + "grad_norm": 1.030572772026062, + "learning_rate": 0.0002, + "loss": 0.5479, + "step": 36900 + }, + { + "epoch": 5.9671813111308705, + "grad_norm": 1.0992497205734253, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 36910 + }, + { + "epoch": 5.968797995311616, + "grad_norm": 1.075466275215149, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 36920 + }, + { + "epoch": 5.970414679492361, + "grad_norm": 1.0153694152832031, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 36930 + }, + { + "epoch": 5.972031363673106, + "grad_norm": 0.973193883895874, + "learning_rate": 0.0002, + "loss": 0.5672, + "step": 36940 + }, + { + "epoch": 5.973648047853851, + "grad_norm": 0.8294678926467896, + "learning_rate": 0.0002, + "loss": 0.5585, + "step": 36950 + }, + { + "epoch": 5.9752647320345975, + "grad_norm": 1.0048716068267822, + "learning_rate": 0.0002, + "loss": 0.5631, + "step": 36960 + }, + { + "epoch": 5.976881416215342, + "grad_norm": 0.9714070558547974, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 36970 + }, + { + "epoch": 5.978498100396088, + "grad_norm": 0.8667682409286499, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 36980 + }, + { + "epoch": 5.980114784576833, + "grad_norm": 1.0461409091949463, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 36990 + }, + { + "epoch": 5.981731468757578, + "grad_norm": 0.9229754209518433, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 37000 + }, + { + "epoch": 5.983348152938324, + "grad_norm": 1.0406876802444458, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 37010 + }, + { + "epoch": 5.984964837119069, + "grad_norm": 0.8993828296661377, + "learning_rate": 0.0002, + "loss": 0.5569, + "step": 37020 + }, + { + "epoch": 5.986581521299814, + "grad_norm": 1.2260479927062988, + "learning_rate": 0.0002, + "loss": 0.5611, + "step": 37030 + }, + { + "epoch": 5.988198205480559, + "grad_norm": 1.0107380151748657, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 37040 + }, + { + "epoch": 5.989814889661305, + "grad_norm": 1.0240139961242676, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 37050 + }, + { + "epoch": 5.99143157384205, + "grad_norm": 1.0185275077819824, + "learning_rate": 0.0002, + "loss": 0.5209, + "step": 37060 + }, + { + "epoch": 5.993048258022795, + "grad_norm": 1.1361802816390991, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 37070 + }, + { + "epoch": 5.99466494220354, + "grad_norm": 1.0395532846450806, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 37080 + }, + { + "epoch": 5.9962816263842855, + "grad_norm": 0.9463558197021484, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 37090 + }, + { + "epoch": 5.997898310565031, + "grad_norm": 1.2066948413848877, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 37100 + }, + { + "epoch": 5.999514994745777, + "grad_norm": 0.9749386310577393, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 37110 + }, + { + "epoch": 6.0, + "eval_loss": 1.2270219326019287, + "eval_runtime": 122.2047, + "eval_samples_per_second": 5.998, + "eval_steps_per_second": 0.753, + "step": 37113 + } + ], + "logging_steps": 10, + "max_steps": 49480, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.7175065406272963e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..155b12fa9acbc6e71dba75c92bfa79e152397ebf --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-37113/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28694d5564a2b5c7d6881d4ba2af103356aa22489d2c22768ebbe47283c0f4a1 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..28601f2be0b06ae9321ec6060321b6f2ec7a7237 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c1406cb62687223f2383ed0c258f0e930e79fed0698d70c29b14f26bb411939 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce132b138ff281db21e3012de036863389b9c56b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85f6de3550fb5c5e303b3a93ee6be0a88e1684e070e6e05bbc1b53f188580a27 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..eb65ab3ffed153e0604aeb2a485badf41287aa40 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a404304c9f5ba871a0af32ddd9cb5af2e5087b5984920bd41be7f2405c4de25 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..497e899a7927d55a3579d08a7fa7106328985e8c --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75fc1866fc3d98969c78ad715fdff0568f69b0d75b9c4a54d77c7ffc5e910d3b +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1cca2ea5ee533ef48bb3612e2d7c4deebbfa9104 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/trainer_state.json @@ -0,0 +1,30392 @@ +{ + "best_metric": 1.0871200561523438, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185", + "epoch": 6.9999191657909625, + "eval_steps": 10, + "global_step": 43298, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016166841807452913, + "grad_norm": 0.9894065856933594, + "learning_rate": 0.0002, + "loss": 1.6636, + "step": 10 + }, + { + "epoch": 0.0032333683614905826, + "grad_norm": 1.7810699939727783, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 20 + }, + { + "epoch": 0.004850052542235874, + "grad_norm": 0.5969577431678772, + "learning_rate": 0.0002, + "loss": 0.9767, + "step": 30 + }, + { + "epoch": 0.006466736722981165, + "grad_norm": 0.6354120969772339, + "learning_rate": 0.0002, + "loss": 0.9772, + "step": 40 + }, + { + "epoch": 0.008083420903726457, + "grad_norm": 0.5604607462882996, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 50 + }, + { + "epoch": 0.009700105084471748, + "grad_norm": 0.4676193594932556, + "learning_rate": 0.0002, + "loss": 0.8841, + "step": 60 + }, + { + "epoch": 0.01131678926521704, + "grad_norm": 0.6099211573600769, + "learning_rate": 0.0002, + "loss": 0.9022, + "step": 70 + }, + { + "epoch": 0.01293347344596233, + "grad_norm": 0.48639994859695435, + "learning_rate": 0.0002, + "loss": 0.9133, + "step": 80 + }, + { + "epoch": 0.014550157626707623, + "grad_norm": 0.4904264509677887, + "learning_rate": 0.0002, + "loss": 0.8704, + "step": 90 + }, + { + "epoch": 0.016166841807452915, + "grad_norm": 2.8334362506866455, + "learning_rate": 0.0002, + "loss": 0.8855, + "step": 100 + }, + { + "epoch": 0.017783525988198205, + "grad_norm": 0.43221670389175415, + "learning_rate": 0.0002, + "loss": 0.8958, + "step": 110 + }, + { + "epoch": 0.019400210168943496, + "grad_norm": 0.42244166135787964, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 120 + }, + { + "epoch": 0.02101689434968879, + "grad_norm": 0.45363298058509827, + "learning_rate": 0.0002, + "loss": 0.8467, + "step": 130 + }, + { + "epoch": 0.02263357853043408, + "grad_norm": 0.44816508889198303, + "learning_rate": 0.0002, + "loss": 0.8641, + "step": 140 + }, + { + "epoch": 0.02425026271117937, + "grad_norm": 0.43308213353157043, + "learning_rate": 0.0002, + "loss": 0.8496, + "step": 150 + }, + { + "epoch": 0.02586694689192466, + "grad_norm": 0.4084763526916504, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 160 + }, + { + "epoch": 0.027483631072669955, + "grad_norm": 0.5363703966140747, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 170 + }, + { + "epoch": 0.029100315253415245, + "grad_norm": 0.4619699716567993, + "learning_rate": 0.0002, + "loss": 0.8558, + "step": 180 + }, + { + "epoch": 0.030716999434160536, + "grad_norm": 0.49069908261299133, + "learning_rate": 0.0002, + "loss": 0.8878, + "step": 190 + }, + { + "epoch": 0.03233368361490583, + "grad_norm": 0.4645835757255554, + "learning_rate": 0.0002, + "loss": 0.8867, + "step": 200 + }, + { + "epoch": 0.03395036779565112, + "grad_norm": 1.2411243915557861, + "learning_rate": 0.0002, + "loss": 0.8842, + "step": 210 + }, + { + "epoch": 0.03556705197639641, + "grad_norm": 0.5211851596832275, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 220 + }, + { + "epoch": 0.037183736157141704, + "grad_norm": 0.5253691673278809, + "learning_rate": 0.0002, + "loss": 0.8194, + "step": 230 + }, + { + "epoch": 0.03880042033788699, + "grad_norm": 0.4567478895187378, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 240 + }, + { + "epoch": 0.040417104518632285, + "grad_norm": 0.5472128391265869, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 250 + }, + { + "epoch": 0.04203378869937758, + "grad_norm": 0.42978546023368835, + "learning_rate": 0.0002, + "loss": 0.8201, + "step": 260 + }, + { + "epoch": 0.043650472880122866, + "grad_norm": 0.601734459400177, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 270 + }, + { + "epoch": 0.04526715706086816, + "grad_norm": 0.4286513328552246, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 280 + }, + { + "epoch": 0.046883841241613454, + "grad_norm": 0.5230861902236938, + "learning_rate": 0.0002, + "loss": 0.8758, + "step": 290 + }, + { + "epoch": 0.04850052542235874, + "grad_norm": 0.6504611968994141, + "learning_rate": 0.0002, + "loss": 0.8636, + "step": 300 + }, + { + "epoch": 0.050117209603104035, + "grad_norm": 0.43485215306282043, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 310 + }, + { + "epoch": 0.05173389378384932, + "grad_norm": 0.4717007875442505, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 320 + }, + { + "epoch": 0.053350577964594616, + "grad_norm": 0.4059787690639496, + "learning_rate": 0.0002, + "loss": 0.8469, + "step": 330 + }, + { + "epoch": 0.05496726214533991, + "grad_norm": 0.4366913437843323, + "learning_rate": 0.0002, + "loss": 0.8866, + "step": 340 + }, + { + "epoch": 0.0565839463260852, + "grad_norm": 0.4233848452568054, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 350 + }, + { + "epoch": 0.05820063050683049, + "grad_norm": 0.4209108352661133, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 360 + }, + { + "epoch": 0.059817314687575784, + "grad_norm": 0.41637396812438965, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 370 + }, + { + "epoch": 0.06143399886832107, + "grad_norm": 0.46235376596450806, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 380 + }, + { + "epoch": 0.06305068304906636, + "grad_norm": 0.4013484716415405, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 390 + }, + { + "epoch": 0.06466736722981166, + "grad_norm": 0.47443896532058716, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 400 + }, + { + "epoch": 0.06628405141055695, + "grad_norm": 0.3942156434059143, + "learning_rate": 0.0002, + "loss": 0.8666, + "step": 410 + }, + { + "epoch": 0.06790073559130223, + "grad_norm": 0.4965320825576782, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 420 + }, + { + "epoch": 0.06951741977204753, + "grad_norm": 0.4304835796356201, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 430 + }, + { + "epoch": 0.07113410395279282, + "grad_norm": 0.511726975440979, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 440 + }, + { + "epoch": 0.07275078813353811, + "grad_norm": 0.4040689170360565, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 450 + }, + { + "epoch": 0.07436747231428341, + "grad_norm": 0.5402171015739441, + "learning_rate": 0.0002, + "loss": 0.8788, + "step": 460 + }, + { + "epoch": 0.0759841564950287, + "grad_norm": 0.4174517095088959, + "learning_rate": 0.0002, + "loss": 0.8737, + "step": 470 + }, + { + "epoch": 0.07760084067577398, + "grad_norm": 0.4306182265281677, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 480 + }, + { + "epoch": 0.07921752485651928, + "grad_norm": 0.535210132598877, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 490 + }, + { + "epoch": 0.08083420903726457, + "grad_norm": 0.5339109897613525, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 500 + }, + { + "epoch": 0.08245089321800986, + "grad_norm": 0.45754891633987427, + "learning_rate": 0.0002, + "loss": 0.8985, + "step": 510 + }, + { + "epoch": 0.08406757739875516, + "grad_norm": 0.43820783495903015, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 520 + }, + { + "epoch": 0.08568426157950045, + "grad_norm": 0.4434749186038971, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 530 + }, + { + "epoch": 0.08730094576024573, + "grad_norm": 0.43111467361450195, + "learning_rate": 0.0002, + "loss": 0.7857, + "step": 540 + }, + { + "epoch": 0.08891762994099103, + "grad_norm": 0.4378940165042877, + "learning_rate": 0.0002, + "loss": 0.8418, + "step": 550 + }, + { + "epoch": 0.09053431412173632, + "grad_norm": 0.4772215187549591, + "learning_rate": 0.0002, + "loss": 0.8361, + "step": 560 + }, + { + "epoch": 0.09215099830248161, + "grad_norm": 0.6837629079818726, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 570 + }, + { + "epoch": 0.09376768248322691, + "grad_norm": 0.42241212725639343, + "learning_rate": 0.0002, + "loss": 0.8607, + "step": 580 + }, + { + "epoch": 0.0953843666639722, + "grad_norm": 0.5165936350822449, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 590 + }, + { + "epoch": 0.09700105084471748, + "grad_norm": 0.48737478256225586, + "learning_rate": 0.0002, + "loss": 0.8664, + "step": 600 + }, + { + "epoch": 0.09861773502546278, + "grad_norm": 0.47419852018356323, + "learning_rate": 0.0002, + "loss": 0.8806, + "step": 610 + }, + { + "epoch": 0.10023441920620807, + "grad_norm": 0.4975486099720001, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 620 + }, + { + "epoch": 0.10185110338695336, + "grad_norm": 0.49123844504356384, + "learning_rate": 0.0002, + "loss": 0.8548, + "step": 630 + }, + { + "epoch": 0.10346778756769864, + "grad_norm": 0.6288952827453613, + "learning_rate": 0.0002, + "loss": 0.8911, + "step": 640 + }, + { + "epoch": 0.10508447174844394, + "grad_norm": 0.4277345836162567, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 650 + }, + { + "epoch": 0.10670115592918923, + "grad_norm": 0.4021061956882477, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 660 + }, + { + "epoch": 0.10831784010993452, + "grad_norm": 0.3492237329483032, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 670 + }, + { + "epoch": 0.10993452429067982, + "grad_norm": 0.4341012239456177, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 680 + }, + { + "epoch": 0.1115512084714251, + "grad_norm": 0.7296304106712341, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 690 + }, + { + "epoch": 0.1131678926521704, + "grad_norm": 0.397494912147522, + "learning_rate": 0.0002, + "loss": 0.8433, + "step": 700 + }, + { + "epoch": 0.1147845768329157, + "grad_norm": 0.396431028842926, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 710 + }, + { + "epoch": 0.11640126101366098, + "grad_norm": 0.48842838406562805, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 720 + }, + { + "epoch": 0.11801794519440627, + "grad_norm": 0.46322616934776306, + "learning_rate": 0.0002, + "loss": 0.8238, + "step": 730 + }, + { + "epoch": 0.11963462937515157, + "grad_norm": 0.47990912199020386, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 740 + }, + { + "epoch": 0.12125131355589686, + "grad_norm": 0.4997142255306244, + "learning_rate": 0.0002, + "loss": 0.82, + "step": 750 + }, + { + "epoch": 0.12286799773664214, + "grad_norm": 0.4040526747703552, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 760 + }, + { + "epoch": 0.12448468191738744, + "grad_norm": 0.453095942735672, + "learning_rate": 0.0002, + "loss": 0.863, + "step": 770 + }, + { + "epoch": 0.12610136609813272, + "grad_norm": 0.4636971950531006, + "learning_rate": 0.0002, + "loss": 0.8792, + "step": 780 + }, + { + "epoch": 0.12771805027887803, + "grad_norm": 0.4279276132583618, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 790 + }, + { + "epoch": 0.12933473445962332, + "grad_norm": 0.46212655305862427, + "learning_rate": 0.0002, + "loss": 0.8711, + "step": 800 + }, + { + "epoch": 0.1309514186403686, + "grad_norm": 0.43127650022506714, + "learning_rate": 0.0002, + "loss": 0.8368, + "step": 810 + }, + { + "epoch": 0.1325681028211139, + "grad_norm": 0.4201301336288452, + "learning_rate": 0.0002, + "loss": 0.8476, + "step": 820 + }, + { + "epoch": 0.13418478700185918, + "grad_norm": 0.42583167552948, + "learning_rate": 0.0002, + "loss": 0.8078, + "step": 830 + }, + { + "epoch": 0.13580147118260447, + "grad_norm": 0.4535622000694275, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 840 + }, + { + "epoch": 0.13741815536334978, + "grad_norm": 0.4116036891937256, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 850 + }, + { + "epoch": 0.13903483954409507, + "grad_norm": 0.45997580885887146, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 860 + }, + { + "epoch": 0.14065152372484035, + "grad_norm": 0.4487837255001068, + "learning_rate": 0.0002, + "loss": 0.8917, + "step": 870 + }, + { + "epoch": 0.14226820790558564, + "grad_norm": 0.43650057911872864, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 880 + }, + { + "epoch": 0.14388489208633093, + "grad_norm": 0.5335358381271362, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 890 + }, + { + "epoch": 0.14550157626707622, + "grad_norm": 0.5989000201225281, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 900 + }, + { + "epoch": 0.14711826044782153, + "grad_norm": 0.517179012298584, + "learning_rate": 0.0002, + "loss": 0.8385, + "step": 910 + }, + { + "epoch": 0.14873494462856682, + "grad_norm": 0.44435232877731323, + "learning_rate": 0.0002, + "loss": 0.8255, + "step": 920 + }, + { + "epoch": 0.1503516288093121, + "grad_norm": 0.42635923624038696, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 930 + }, + { + "epoch": 0.1519683129900574, + "grad_norm": 0.49603334069252014, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 940 + }, + { + "epoch": 0.15358499717080268, + "grad_norm": 0.40639808773994446, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 950 + }, + { + "epoch": 0.15520168135154797, + "grad_norm": 0.4850759208202362, + "learning_rate": 0.0002, + "loss": 0.8529, + "step": 960 + }, + { + "epoch": 0.15681836553229328, + "grad_norm": 0.4427442252635956, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 970 + }, + { + "epoch": 0.15843504971303857, + "grad_norm": 0.3760930001735687, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 980 + }, + { + "epoch": 0.16005173389378385, + "grad_norm": 0.4794144332408905, + "learning_rate": 0.0002, + "loss": 0.8644, + "step": 990 + }, + { + "epoch": 0.16166841807452914, + "grad_norm": 0.45828768610954285, + "learning_rate": 0.0002, + "loss": 0.8002, + "step": 1000 + }, + { + "epoch": 0.16328510225527443, + "grad_norm": 0.6313053369522095, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 1010 + }, + { + "epoch": 0.16490178643601971, + "grad_norm": 0.45041006803512573, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1020 + }, + { + "epoch": 0.166518470616765, + "grad_norm": 0.441403865814209, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1030 + }, + { + "epoch": 0.16813515479751032, + "grad_norm": 0.8171296119689941, + "learning_rate": 0.0002, + "loss": 0.8475, + "step": 1040 + }, + { + "epoch": 0.1697518389782556, + "grad_norm": 0.7137420773506165, + "learning_rate": 0.0002, + "loss": 0.845, + "step": 1050 + }, + { + "epoch": 0.1713685231590009, + "grad_norm": 0.5236809849739075, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 1060 + }, + { + "epoch": 0.17298520733974618, + "grad_norm": 0.5021864175796509, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 1070 + }, + { + "epoch": 0.17460189152049146, + "grad_norm": 0.47347521781921387, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 1080 + }, + { + "epoch": 0.17621857570123675, + "grad_norm": 0.4631653428077698, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 1090 + }, + { + "epoch": 0.17783525988198207, + "grad_norm": 0.49169182777404785, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 1100 + }, + { + "epoch": 0.17945194406272735, + "grad_norm": 0.5019739270210266, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 1110 + }, + { + "epoch": 0.18106862824347264, + "grad_norm": 0.5100422501564026, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 1120 + }, + { + "epoch": 0.18268531242421793, + "grad_norm": 0.3888324499130249, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 1130 + }, + { + "epoch": 0.18430199660496321, + "grad_norm": 0.39765217900276184, + "learning_rate": 0.0002, + "loss": 0.8533, + "step": 1140 + }, + { + "epoch": 0.1859186807857085, + "grad_norm": 0.47190186381340027, + "learning_rate": 0.0002, + "loss": 0.8541, + "step": 1150 + }, + { + "epoch": 0.18753536496645382, + "grad_norm": 0.4464188814163208, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1160 + }, + { + "epoch": 0.1891520491471991, + "grad_norm": 0.5153930187225342, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 1170 + }, + { + "epoch": 0.1907687333279444, + "grad_norm": 0.4779708683490753, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 1180 + }, + { + "epoch": 0.19238541750868968, + "grad_norm": 0.4834315776824951, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 1190 + }, + { + "epoch": 0.19400210168943496, + "grad_norm": 0.402357816696167, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 1200 + }, + { + "epoch": 0.19561878587018025, + "grad_norm": 0.45899084210395813, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 1210 + }, + { + "epoch": 0.19723547005092557, + "grad_norm": 0.5106529593467712, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 1220 + }, + { + "epoch": 0.19885215423167085, + "grad_norm": 0.45261722803115845, + "learning_rate": 0.0002, + "loss": 0.7816, + "step": 1230 + }, + { + "epoch": 0.20046883841241614, + "grad_norm": 0.4647127091884613, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 1240 + }, + { + "epoch": 0.20208552259316143, + "grad_norm": 0.4849368929862976, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 1250 + }, + { + "epoch": 0.2037022067739067, + "grad_norm": 0.4518061578273773, + "learning_rate": 0.0002, + "loss": 0.8514, + "step": 1260 + }, + { + "epoch": 0.205318890954652, + "grad_norm": 0.49535325169563293, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 1270 + }, + { + "epoch": 0.2069355751353973, + "grad_norm": 0.4835205376148224, + "learning_rate": 0.0002, + "loss": 0.8348, + "step": 1280 + }, + { + "epoch": 0.2085522593161426, + "grad_norm": 0.45308539271354675, + "learning_rate": 0.0002, + "loss": 0.8428, + "step": 1290 + }, + { + "epoch": 0.2101689434968879, + "grad_norm": 0.5369905233383179, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 1300 + }, + { + "epoch": 0.21178562767763318, + "grad_norm": 0.5031622052192688, + "learning_rate": 0.0002, + "loss": 0.8676, + "step": 1310 + }, + { + "epoch": 0.21340231185837846, + "grad_norm": 0.48010334372520447, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 1320 + }, + { + "epoch": 0.21501899603912375, + "grad_norm": 0.4905701279640198, + "learning_rate": 0.0002, + "loss": 0.806, + "step": 1330 + }, + { + "epoch": 0.21663568021986904, + "grad_norm": 0.43531742691993713, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 1340 + }, + { + "epoch": 0.21825236440061435, + "grad_norm": 0.44330692291259766, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 1350 + }, + { + "epoch": 0.21986904858135964, + "grad_norm": 0.5384416580200195, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 1360 + }, + { + "epoch": 0.22148573276210493, + "grad_norm": 0.4181833863258362, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 1370 + }, + { + "epoch": 0.2231024169428502, + "grad_norm": 0.523833692073822, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 1380 + }, + { + "epoch": 0.2247191011235955, + "grad_norm": 0.5528736710548401, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 1390 + }, + { + "epoch": 0.2263357853043408, + "grad_norm": 0.43515023589134216, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 1400 + }, + { + "epoch": 0.2279524694850861, + "grad_norm": 0.48809877038002014, + "learning_rate": 0.0002, + "loss": 0.8403, + "step": 1410 + }, + { + "epoch": 0.2295691536658314, + "grad_norm": 0.43591251969337463, + "learning_rate": 0.0002, + "loss": 0.8165, + "step": 1420 + }, + { + "epoch": 0.23118583784657668, + "grad_norm": 0.44625312089920044, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 1430 + }, + { + "epoch": 0.23280252202732196, + "grad_norm": 0.4390665292739868, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 1440 + }, + { + "epoch": 0.23441920620806725, + "grad_norm": 0.48496049642562866, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 1450 + }, + { + "epoch": 0.23603589038881254, + "grad_norm": 0.45919957756996155, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 1460 + }, + { + "epoch": 0.23765257456955785, + "grad_norm": 0.5471845865249634, + "learning_rate": 0.0002, + "loss": 0.8659, + "step": 1470 + }, + { + "epoch": 0.23926925875030314, + "grad_norm": 0.47269317507743835, + "learning_rate": 0.0002, + "loss": 0.8164, + "step": 1480 + }, + { + "epoch": 0.24088594293104842, + "grad_norm": 0.4930245578289032, + "learning_rate": 0.0002, + "loss": 0.854, + "step": 1490 + }, + { + "epoch": 0.2425026271117937, + "grad_norm": 0.5605630278587341, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 1500 + }, + { + "epoch": 0.244119311292539, + "grad_norm": 0.4435870945453644, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 1510 + }, + { + "epoch": 0.24573599547328429, + "grad_norm": 0.4941999912261963, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 1520 + }, + { + "epoch": 0.24735267965402957, + "grad_norm": 0.5100624561309814, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 1530 + }, + { + "epoch": 0.2489693638347749, + "grad_norm": 0.4638267457485199, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 1540 + }, + { + "epoch": 0.25058604801552015, + "grad_norm": 0.5071570873260498, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 1550 + }, + { + "epoch": 0.25220273219626543, + "grad_norm": 0.4291319251060486, + "learning_rate": 0.0002, + "loss": 0.7724, + "step": 1560 + }, + { + "epoch": 0.2538194163770108, + "grad_norm": 0.5388049483299255, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 1570 + }, + { + "epoch": 0.25543610055775606, + "grad_norm": 0.5083683729171753, + "learning_rate": 0.0002, + "loss": 0.8176, + "step": 1580 + }, + { + "epoch": 0.25705278473850135, + "grad_norm": 0.4824463725090027, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 1590 + }, + { + "epoch": 0.25866946891924664, + "grad_norm": 0.41177722811698914, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 1600 + }, + { + "epoch": 0.2602861530999919, + "grad_norm": 0.5656219124794006, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 1610 + }, + { + "epoch": 0.2619028372807372, + "grad_norm": 0.41063204407691956, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 1620 + }, + { + "epoch": 0.2635195214614825, + "grad_norm": 0.4897061288356781, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 1630 + }, + { + "epoch": 0.2651362056422278, + "grad_norm": 0.4454376697540283, + "learning_rate": 0.0002, + "loss": 0.8198, + "step": 1640 + }, + { + "epoch": 0.26675288982297307, + "grad_norm": 0.4355238378047943, + "learning_rate": 0.0002, + "loss": 0.8684, + "step": 1650 + }, + { + "epoch": 0.26836957400371836, + "grad_norm": 0.458310067653656, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 1660 + }, + { + "epoch": 0.26998625818446365, + "grad_norm": 0.4752083718776703, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 1670 + }, + { + "epoch": 0.27160294236520893, + "grad_norm": 0.4666106402873993, + "learning_rate": 0.0002, + "loss": 0.8267, + "step": 1680 + }, + { + "epoch": 0.2732196265459543, + "grad_norm": 0.4213818609714508, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 1690 + }, + { + "epoch": 0.27483631072669956, + "grad_norm": 0.5768913626670837, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 1700 + }, + { + "epoch": 0.27645299490744485, + "grad_norm": 0.4209914803504944, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 1710 + }, + { + "epoch": 0.27806967908819014, + "grad_norm": 0.501909613609314, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 1720 + }, + { + "epoch": 0.2796863632689354, + "grad_norm": 0.5266261100769043, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 1730 + }, + { + "epoch": 0.2813030474496807, + "grad_norm": 0.43806859850883484, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 1740 + }, + { + "epoch": 0.282919731630426, + "grad_norm": 0.46048814058303833, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 1750 + }, + { + "epoch": 0.2845364158111713, + "grad_norm": 0.44972819089889526, + "learning_rate": 0.0002, + "loss": 0.8596, + "step": 1760 + }, + { + "epoch": 0.28615309999191657, + "grad_norm": 0.5114831328392029, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 1770 + }, + { + "epoch": 0.28776978417266186, + "grad_norm": 0.47931742668151855, + "learning_rate": 0.0002, + "loss": 0.8361, + "step": 1780 + }, + { + "epoch": 0.28938646835340714, + "grad_norm": 0.5092599987983704, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 1790 + }, + { + "epoch": 0.29100315253415243, + "grad_norm": 0.37581443786621094, + "learning_rate": 0.0002, + "loss": 0.8506, + "step": 1800 + }, + { + "epoch": 0.2926198367148977, + "grad_norm": 0.47097381949424744, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 1810 + }, + { + "epoch": 0.29423652089564306, + "grad_norm": 0.48300236463546753, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 1820 + }, + { + "epoch": 0.29585320507638835, + "grad_norm": 0.5600419640541077, + "learning_rate": 0.0002, + "loss": 0.8391, + "step": 1830 + }, + { + "epoch": 0.29746988925713364, + "grad_norm": 0.48555272817611694, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 1840 + }, + { + "epoch": 0.2990865734378789, + "grad_norm": 0.3752668499946594, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 1850 + }, + { + "epoch": 0.3007032576186242, + "grad_norm": 0.5328747034072876, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 1860 + }, + { + "epoch": 0.3023199417993695, + "grad_norm": 0.48716455698013306, + "learning_rate": 0.0002, + "loss": 0.8426, + "step": 1870 + }, + { + "epoch": 0.3039366259801148, + "grad_norm": 0.5011493563652039, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 1880 + }, + { + "epoch": 0.30555331016086007, + "grad_norm": 0.46461427211761475, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 1890 + }, + { + "epoch": 0.30716999434160536, + "grad_norm": 0.36630210280418396, + "learning_rate": 0.0002, + "loss": 0.8478, + "step": 1900 + }, + { + "epoch": 0.30878667852235064, + "grad_norm": 0.4217296242713928, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 1910 + }, + { + "epoch": 0.31040336270309593, + "grad_norm": 0.4394875466823578, + "learning_rate": 0.0002, + "loss": 0.8128, + "step": 1920 + }, + { + "epoch": 0.3120200468838412, + "grad_norm": 0.6587965488433838, + "learning_rate": 0.0002, + "loss": 0.8471, + "step": 1930 + }, + { + "epoch": 0.31363673106458656, + "grad_norm": 0.5469298958778381, + "learning_rate": 0.0002, + "loss": 0.8565, + "step": 1940 + }, + { + "epoch": 0.31525341524533185, + "grad_norm": 0.4371595084667206, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 1950 + }, + { + "epoch": 0.31687009942607713, + "grad_norm": 0.4809541404247284, + "learning_rate": 0.0002, + "loss": 0.887, + "step": 1960 + }, + { + "epoch": 0.3184867836068224, + "grad_norm": 0.6061086654663086, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 1970 + }, + { + "epoch": 0.3201034677875677, + "grad_norm": 0.5342657566070557, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 1980 + }, + { + "epoch": 0.321720151968313, + "grad_norm": 0.5057743787765503, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 1990 + }, + { + "epoch": 0.3233368361490583, + "grad_norm": 0.528626024723053, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 2000 + }, + { + "epoch": 0.32495352032980357, + "grad_norm": 0.46742770075798035, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2010 + }, + { + "epoch": 0.32657020451054886, + "grad_norm": 0.515101432800293, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 2020 + }, + { + "epoch": 0.32818688869129414, + "grad_norm": 0.41941216588020325, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 2030 + }, + { + "epoch": 0.32980357287203943, + "grad_norm": 0.49902522563934326, + "learning_rate": 0.0002, + "loss": 0.7637, + "step": 2040 + }, + { + "epoch": 0.3314202570527847, + "grad_norm": 0.4120897650718689, + "learning_rate": 0.0002, + "loss": 0.7822, + "step": 2050 + }, + { + "epoch": 0.33303694123353, + "grad_norm": 0.45352041721343994, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 2060 + }, + { + "epoch": 0.33465362541427535, + "grad_norm": 0.523199737071991, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 2070 + }, + { + "epoch": 0.33627030959502063, + "grad_norm": 0.4390358626842499, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 2080 + }, + { + "epoch": 0.3378869937757659, + "grad_norm": 0.6752901077270508, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 2090 + }, + { + "epoch": 0.3395036779565112, + "grad_norm": 0.547821044921875, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 2100 + }, + { + "epoch": 0.3411203621372565, + "grad_norm": 0.5161308646202087, + "learning_rate": 0.0002, + "loss": 0.8561, + "step": 2110 + }, + { + "epoch": 0.3427370463180018, + "grad_norm": 0.4565401077270508, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 2120 + }, + { + "epoch": 0.34435373049874707, + "grad_norm": 0.4666115939617157, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 2130 + }, + { + "epoch": 0.34597041467949236, + "grad_norm": 0.4090428352355957, + "learning_rate": 0.0002, + "loss": 0.8189, + "step": 2140 + }, + { + "epoch": 0.34758709886023764, + "grad_norm": 0.510845422744751, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2150 + }, + { + "epoch": 0.34920378304098293, + "grad_norm": 0.42861923575401306, + "learning_rate": 0.0002, + "loss": 0.8398, + "step": 2160 + }, + { + "epoch": 0.3508204672217282, + "grad_norm": 0.4476332664489746, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 2170 + }, + { + "epoch": 0.3524371514024735, + "grad_norm": 0.6065791249275208, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2180 + }, + { + "epoch": 0.35405383558321885, + "grad_norm": 0.42335066199302673, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 2190 + }, + { + "epoch": 0.35567051976396413, + "grad_norm": 0.5094629526138306, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 2200 + }, + { + "epoch": 0.3572872039447094, + "grad_norm": 0.5476373434066772, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 2210 + }, + { + "epoch": 0.3589038881254547, + "grad_norm": 0.3911719024181366, + "learning_rate": 0.0002, + "loss": 0.814, + "step": 2220 + }, + { + "epoch": 0.3605205723062, + "grad_norm": 0.6599636077880859, + "learning_rate": 0.0002, + "loss": 0.8599, + "step": 2230 + }, + { + "epoch": 0.3621372564869453, + "grad_norm": 0.40381914377212524, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 2240 + }, + { + "epoch": 0.36375394066769057, + "grad_norm": 0.4433908462524414, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 2250 + }, + { + "epoch": 0.36537062484843585, + "grad_norm": 0.578326940536499, + "learning_rate": 0.0002, + "loss": 0.8503, + "step": 2260 + }, + { + "epoch": 0.36698730902918114, + "grad_norm": 0.5734784007072449, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2270 + }, + { + "epoch": 0.36860399320992643, + "grad_norm": 0.45555487275123596, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 2280 + }, + { + "epoch": 0.3702206773906717, + "grad_norm": 0.5666276216506958, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 2290 + }, + { + "epoch": 0.371837361571417, + "grad_norm": 0.5461117625236511, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 2300 + }, + { + "epoch": 0.3734540457521623, + "grad_norm": 0.6318911910057068, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 2310 + }, + { + "epoch": 0.37507072993290763, + "grad_norm": 0.493263304233551, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 2320 + }, + { + "epoch": 0.3766874141136529, + "grad_norm": 0.5888760089874268, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 2330 + }, + { + "epoch": 0.3783040982943982, + "grad_norm": 0.48671841621398926, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 2340 + }, + { + "epoch": 0.3799207824751435, + "grad_norm": 0.4385145306587219, + "learning_rate": 0.0002, + "loss": 0.8367, + "step": 2350 + }, + { + "epoch": 0.3815374666558888, + "grad_norm": 0.5523318648338318, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 2360 + }, + { + "epoch": 0.38315415083663407, + "grad_norm": 0.7308220267295837, + "learning_rate": 0.0002, + "loss": 0.8351, + "step": 2370 + }, + { + "epoch": 0.38477083501737935, + "grad_norm": 0.554214358329773, + "learning_rate": 0.0002, + "loss": 0.859, + "step": 2380 + }, + { + "epoch": 0.38638751919812464, + "grad_norm": 0.5425800085067749, + "learning_rate": 0.0002, + "loss": 0.8146, + "step": 2390 + }, + { + "epoch": 0.3880042033788699, + "grad_norm": 0.48811158537864685, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 2400 + }, + { + "epoch": 0.3896208875596152, + "grad_norm": 0.49212366342544556, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 2410 + }, + { + "epoch": 0.3912375717403605, + "grad_norm": 0.5222218632698059, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 2420 + }, + { + "epoch": 0.3928542559211058, + "grad_norm": 0.4699819087982178, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 2430 + }, + { + "epoch": 0.39447094010185113, + "grad_norm": 0.46153587102890015, + "learning_rate": 0.0002, + "loss": 0.7919, + "step": 2440 + }, + { + "epoch": 0.3960876242825964, + "grad_norm": 0.4150611162185669, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 2450 + }, + { + "epoch": 0.3977043084633417, + "grad_norm": 0.5799614787101746, + "learning_rate": 0.0002, + "loss": 0.8589, + "step": 2460 + }, + { + "epoch": 0.399320992644087, + "grad_norm": 0.56536865234375, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 2470 + }, + { + "epoch": 0.4009376768248323, + "grad_norm": 0.5451247096061707, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 2480 + }, + { + "epoch": 0.40255436100557757, + "grad_norm": 0.5914521217346191, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 2490 + }, + { + "epoch": 0.40417104518632285, + "grad_norm": 0.4428117275238037, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 2500 + }, + { + "epoch": 0.40578772936706814, + "grad_norm": 0.48580947518348694, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 2510 + }, + { + "epoch": 0.4074044135478134, + "grad_norm": 0.436734676361084, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 2520 + }, + { + "epoch": 0.4090210977285587, + "grad_norm": 0.5752223134040833, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 2530 + }, + { + "epoch": 0.410637781909304, + "grad_norm": 0.4271308183670044, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2540 + }, + { + "epoch": 0.4122544660900493, + "grad_norm": 0.46294718980789185, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 2550 + }, + { + "epoch": 0.4138711502707946, + "grad_norm": 0.49407583475112915, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 2560 + }, + { + "epoch": 0.4154878344515399, + "grad_norm": 0.4729035496711731, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 2570 + }, + { + "epoch": 0.4171045186322852, + "grad_norm": 0.4129747152328491, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 2580 + }, + { + "epoch": 0.4187212028130305, + "grad_norm": 0.5684236288070679, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 2590 + }, + { + "epoch": 0.4203378869937758, + "grad_norm": 0.4862157106399536, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 2600 + }, + { + "epoch": 0.42195457117452106, + "grad_norm": 0.46567976474761963, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 2610 + }, + { + "epoch": 0.42357125535526635, + "grad_norm": 0.5710650682449341, + "learning_rate": 0.0002, + "loss": 0.7755, + "step": 2620 + }, + { + "epoch": 0.42518793953601164, + "grad_norm": 0.5660041570663452, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 2630 + }, + { + "epoch": 0.4268046237167569, + "grad_norm": 0.47944375872612, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 2640 + }, + { + "epoch": 0.4284213078975022, + "grad_norm": 0.537223756313324, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 2650 + }, + { + "epoch": 0.4300379920782475, + "grad_norm": 0.41669997572898865, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 2660 + }, + { + "epoch": 0.4316546762589928, + "grad_norm": 0.44727686047554016, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 2670 + }, + { + "epoch": 0.4332713604397381, + "grad_norm": 0.5600888729095459, + "learning_rate": 0.0002, + "loss": 0.8241, + "step": 2680 + }, + { + "epoch": 0.4348880446204834, + "grad_norm": 0.39820605516433716, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 2690 + }, + { + "epoch": 0.4365047288012287, + "grad_norm": 0.5637655854225159, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 2700 + }, + { + "epoch": 0.438121412981974, + "grad_norm": 0.6363666653633118, + "learning_rate": 0.0002, + "loss": 0.855, + "step": 2710 + }, + { + "epoch": 0.4397380971627193, + "grad_norm": 0.5656129121780396, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 2720 + }, + { + "epoch": 0.44135478134346456, + "grad_norm": 0.5600156188011169, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2730 + }, + { + "epoch": 0.44297146552420985, + "grad_norm": 0.5506579875946045, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 2740 + }, + { + "epoch": 0.44458814970495514, + "grad_norm": 0.49878305196762085, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 2750 + }, + { + "epoch": 0.4462048338857004, + "grad_norm": 0.4569213092327118, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 2760 + }, + { + "epoch": 0.4478215180664457, + "grad_norm": 0.6056680083274841, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 2770 + }, + { + "epoch": 0.449438202247191, + "grad_norm": 0.44474557042121887, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 2780 + }, + { + "epoch": 0.4510548864279363, + "grad_norm": 0.46055394411087036, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 2790 + }, + { + "epoch": 0.4526715706086816, + "grad_norm": 0.4904133379459381, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 2800 + }, + { + "epoch": 0.45428825478942686, + "grad_norm": 0.5647031664848328, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 2810 + }, + { + "epoch": 0.4559049389701722, + "grad_norm": 0.5759473443031311, + "learning_rate": 0.0002, + "loss": 0.8622, + "step": 2820 + }, + { + "epoch": 0.4575216231509175, + "grad_norm": 0.5161895751953125, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 2830 + }, + { + "epoch": 0.4591383073316628, + "grad_norm": 0.4248254597187042, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 2840 + }, + { + "epoch": 0.46075499151240806, + "grad_norm": 0.45395001769065857, + "learning_rate": 0.0002, + "loss": 0.7838, + "step": 2850 + }, + { + "epoch": 0.46237167569315335, + "grad_norm": 0.5358697772026062, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2860 + }, + { + "epoch": 0.46398835987389864, + "grad_norm": 0.5379165410995483, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 2870 + }, + { + "epoch": 0.4656050440546439, + "grad_norm": 0.4601989686489105, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 2880 + }, + { + "epoch": 0.4672217282353892, + "grad_norm": 0.671115517616272, + "learning_rate": 0.0002, + "loss": 0.8523, + "step": 2890 + }, + { + "epoch": 0.4688384124161345, + "grad_norm": 0.4425133168697357, + "learning_rate": 0.0002, + "loss": 0.8262, + "step": 2900 + }, + { + "epoch": 0.4704550965968798, + "grad_norm": 0.5446155071258545, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2910 + }, + { + "epoch": 0.47207178077762507, + "grad_norm": 0.603306233882904, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 2920 + }, + { + "epoch": 0.47368846495837036, + "grad_norm": 0.5377997159957886, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 2930 + }, + { + "epoch": 0.4753051491391157, + "grad_norm": 0.4931027591228485, + "learning_rate": 0.0002, + "loss": 0.8075, + "step": 2940 + }, + { + "epoch": 0.476921833319861, + "grad_norm": 0.4711960256099701, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 2950 + }, + { + "epoch": 0.4785385175006063, + "grad_norm": 0.5020492672920227, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 2960 + }, + { + "epoch": 0.48015520168135156, + "grad_norm": 0.5428946614265442, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 2970 + }, + { + "epoch": 0.48177188586209685, + "grad_norm": 0.5294089317321777, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 2980 + }, + { + "epoch": 0.48338857004284214, + "grad_norm": 0.648289144039154, + "learning_rate": 0.0002, + "loss": 0.8553, + "step": 2990 + }, + { + "epoch": 0.4850052542235874, + "grad_norm": 0.47916680574417114, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 3000 + }, + { + "epoch": 0.4866219384043327, + "grad_norm": 0.43849772214889526, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 3010 + }, + { + "epoch": 0.488238622585078, + "grad_norm": 0.47007861733436584, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 3020 + }, + { + "epoch": 0.4898553067658233, + "grad_norm": 0.6314331293106079, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 3030 + }, + { + "epoch": 0.49147199094656857, + "grad_norm": 0.49211493134498596, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 3040 + }, + { + "epoch": 0.49308867512731386, + "grad_norm": 0.4537973403930664, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 3050 + }, + { + "epoch": 0.49470535930805914, + "grad_norm": 0.47326919436454773, + "learning_rate": 0.0002, + "loss": 0.8095, + "step": 3060 + }, + { + "epoch": 0.4963220434888045, + "grad_norm": 0.525874137878418, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 3070 + }, + { + "epoch": 0.4979387276695498, + "grad_norm": 0.6361091732978821, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 3080 + }, + { + "epoch": 0.49955541185029506, + "grad_norm": 0.5850642919540405, + "learning_rate": 0.0002, + "loss": 0.821, + "step": 3090 + }, + { + "epoch": 0.5011720960310403, + "grad_norm": 0.47299543023109436, + "learning_rate": 0.0002, + "loss": 0.8279, + "step": 3100 + }, + { + "epoch": 0.5027887802117856, + "grad_norm": 0.473099946975708, + "learning_rate": 0.0002, + "loss": 0.8681, + "step": 3110 + }, + { + "epoch": 0.5044054643925309, + "grad_norm": 0.48186397552490234, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 3120 + }, + { + "epoch": 0.5060221485732762, + "grad_norm": 0.5015401840209961, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 3130 + }, + { + "epoch": 0.5076388327540216, + "grad_norm": 0.5617750287055969, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 3140 + }, + { + "epoch": 0.5092555169347668, + "grad_norm": 0.5169327259063721, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 3150 + }, + { + "epoch": 0.5108722011155121, + "grad_norm": 0.545657753944397, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 3160 + }, + { + "epoch": 0.5124888852962574, + "grad_norm": 0.512864351272583, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 3170 + }, + { + "epoch": 0.5141055694770027, + "grad_norm": 0.4113546311855316, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 3180 + }, + { + "epoch": 0.5157222536577479, + "grad_norm": 0.44532445073127747, + "learning_rate": 0.0002, + "loss": 0.8206, + "step": 3190 + }, + { + "epoch": 0.5173389378384933, + "grad_norm": 0.5623497366905212, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 3200 + }, + { + "epoch": 0.5189556220192385, + "grad_norm": 0.5084741115570068, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 3210 + }, + { + "epoch": 0.5205723061999838, + "grad_norm": 0.5305403470993042, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 3220 + }, + { + "epoch": 0.5221889903807291, + "grad_norm": 0.4708254337310791, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 3230 + }, + { + "epoch": 0.5238056745614744, + "grad_norm": 0.43827131390571594, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 3240 + }, + { + "epoch": 0.5254223587422197, + "grad_norm": 0.5630002617835999, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 3250 + }, + { + "epoch": 0.527039042922965, + "grad_norm": 0.5010961890220642, + "learning_rate": 0.0002, + "loss": 0.7522, + "step": 3260 + }, + { + "epoch": 0.5286557271037103, + "grad_norm": 0.6303122043609619, + "learning_rate": 0.0002, + "loss": 0.8374, + "step": 3270 + }, + { + "epoch": 0.5302724112844556, + "grad_norm": 0.5107331275939941, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 3280 + }, + { + "epoch": 0.5318890954652009, + "grad_norm": 0.5700443387031555, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 3290 + }, + { + "epoch": 0.5335057796459461, + "grad_norm": 0.46296367049217224, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 3300 + }, + { + "epoch": 0.5351224638266915, + "grad_norm": 0.531568706035614, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 3310 + }, + { + "epoch": 0.5367391480074367, + "grad_norm": 0.4686741530895233, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 3320 + }, + { + "epoch": 0.5383558321881821, + "grad_norm": 0.5404331088066101, + "learning_rate": 0.0002, + "loss": 0.8104, + "step": 3330 + }, + { + "epoch": 0.5399725163689273, + "grad_norm": 0.6368790864944458, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 3340 + }, + { + "epoch": 0.5415892005496726, + "grad_norm": 0.42300888895988464, + "learning_rate": 0.0002, + "loss": 0.8514, + "step": 3350 + }, + { + "epoch": 0.5432058847304179, + "grad_norm": 0.5362542867660522, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 3360 + }, + { + "epoch": 0.5448225689111632, + "grad_norm": 0.497128963470459, + "learning_rate": 0.0002, + "loss": 0.858, + "step": 3370 + }, + { + "epoch": 0.5464392530919085, + "grad_norm": 0.5006386041641235, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 3380 + }, + { + "epoch": 0.5480559372726538, + "grad_norm": 0.44136837124824524, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 3390 + }, + { + "epoch": 0.5496726214533991, + "grad_norm": 0.5897833108901978, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 3400 + }, + { + "epoch": 0.5512893056341444, + "grad_norm": 0.641075611114502, + "learning_rate": 0.0002, + "loss": 0.8895, + "step": 3410 + }, + { + "epoch": 0.5529059898148897, + "grad_norm": 0.7251322269439697, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 3420 + }, + { + "epoch": 0.5545226739956349, + "grad_norm": 0.47411349415779114, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 3430 + }, + { + "epoch": 0.5561393581763803, + "grad_norm": 0.4994310438632965, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 3440 + }, + { + "epoch": 0.5577560423571255, + "grad_norm": 0.5814438462257385, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 3450 + }, + { + "epoch": 0.5593727265378708, + "grad_norm": 0.6278898119926453, + "learning_rate": 0.0002, + "loss": 0.8805, + "step": 3460 + }, + { + "epoch": 0.5609894107186161, + "grad_norm": 0.46208274364471436, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 3470 + }, + { + "epoch": 0.5626060948993614, + "grad_norm": 0.5718930959701538, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 3480 + }, + { + "epoch": 0.5642227790801067, + "grad_norm": 0.48178744316101074, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 3490 + }, + { + "epoch": 0.565839463260852, + "grad_norm": 0.47336965799331665, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3500 + }, + { + "epoch": 0.5674561474415973, + "grad_norm": 0.43442684412002563, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 3510 + }, + { + "epoch": 0.5690728316223426, + "grad_norm": 0.6463358998298645, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 3520 + }, + { + "epoch": 0.5706895158030879, + "grad_norm": 0.5286486744880676, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 3530 + }, + { + "epoch": 0.5723061999838331, + "grad_norm": 0.5405499935150146, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 3540 + }, + { + "epoch": 0.5739228841645785, + "grad_norm": 0.6654391884803772, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 3550 + }, + { + "epoch": 0.5755395683453237, + "grad_norm": 0.5081980228424072, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 3560 + }, + { + "epoch": 0.5771562525260691, + "grad_norm": 0.48978179693222046, + "learning_rate": 0.0002, + "loss": 0.7753, + "step": 3570 + }, + { + "epoch": 0.5787729367068143, + "grad_norm": 0.5840612053871155, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3580 + }, + { + "epoch": 0.5803896208875596, + "grad_norm": 0.5235261917114258, + "learning_rate": 0.0002, + "loss": 0.8937, + "step": 3590 + }, + { + "epoch": 0.5820063050683049, + "grad_norm": 0.5672075748443604, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 3600 + }, + { + "epoch": 0.5836229892490502, + "grad_norm": 0.5613429546356201, + "learning_rate": 0.0002, + "loss": 0.8347, + "step": 3610 + }, + { + "epoch": 0.5852396734297954, + "grad_norm": 0.4032273590564728, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 3620 + }, + { + "epoch": 0.5868563576105408, + "grad_norm": 0.49559324979782104, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 3630 + }, + { + "epoch": 0.5884730417912861, + "grad_norm": 0.6895697712898254, + "learning_rate": 0.0002, + "loss": 0.8332, + "step": 3640 + }, + { + "epoch": 0.5900897259720314, + "grad_norm": 0.4750136435031891, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 3650 + }, + { + "epoch": 0.5917064101527767, + "grad_norm": 0.5176819562911987, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 3660 + }, + { + "epoch": 0.5933230943335219, + "grad_norm": 0.5817760229110718, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3670 + }, + { + "epoch": 0.5949397785142673, + "grad_norm": 0.6064626574516296, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 3680 + }, + { + "epoch": 0.5965564626950125, + "grad_norm": 0.6728700995445251, + "learning_rate": 0.0002, + "loss": 0.8422, + "step": 3690 + }, + { + "epoch": 0.5981731468757578, + "grad_norm": 0.609305202960968, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 3700 + }, + { + "epoch": 0.5997898310565031, + "grad_norm": 0.4615488350391388, + "learning_rate": 0.0002, + "loss": 0.8048, + "step": 3710 + }, + { + "epoch": 0.6014065152372484, + "grad_norm": 2.0531179904937744, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 3720 + }, + { + "epoch": 0.6030231994179936, + "grad_norm": 0.5091132521629333, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 3730 + }, + { + "epoch": 0.604639883598739, + "grad_norm": 0.5951124429702759, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 3740 + }, + { + "epoch": 0.6062565677794842, + "grad_norm": 0.5870208144187927, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 3750 + }, + { + "epoch": 0.6078732519602296, + "grad_norm": 0.6254619359970093, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3760 + }, + { + "epoch": 0.6094899361409749, + "grad_norm": 0.5577626824378967, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 3770 + }, + { + "epoch": 0.6111066203217201, + "grad_norm": 0.5004405379295349, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 3780 + }, + { + "epoch": 0.6127233045024655, + "grad_norm": 0.5527383685112, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 3790 + }, + { + "epoch": 0.6143399886832107, + "grad_norm": 0.49116113781929016, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 3800 + }, + { + "epoch": 0.6159566728639561, + "grad_norm": 0.5299299359321594, + "learning_rate": 0.0002, + "loss": 0.8352, + "step": 3810 + }, + { + "epoch": 0.6175733570447013, + "grad_norm": 0.464897483587265, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 3820 + }, + { + "epoch": 0.6191900412254466, + "grad_norm": 0.6505740880966187, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 3830 + }, + { + "epoch": 0.6208067254061919, + "grad_norm": 0.5512559413909912, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 3840 + }, + { + "epoch": 0.6224234095869372, + "grad_norm": 0.49427518248558044, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 3850 + }, + { + "epoch": 0.6240400937676824, + "grad_norm": 0.3839147090911865, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 3860 + }, + { + "epoch": 0.6256567779484278, + "grad_norm": 0.5760218501091003, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3870 + }, + { + "epoch": 0.6272734621291731, + "grad_norm": 0.7226507067680359, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 3880 + }, + { + "epoch": 0.6288901463099184, + "grad_norm": 0.676781415939331, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3890 + }, + { + "epoch": 0.6305068304906637, + "grad_norm": 0.4284018278121948, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 3900 + }, + { + "epoch": 0.6321235146714089, + "grad_norm": 0.5060628056526184, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 3910 + }, + { + "epoch": 0.6337401988521543, + "grad_norm": 0.5524522066116333, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 3920 + }, + { + "epoch": 0.6353568830328995, + "grad_norm": 0.6099881529808044, + "learning_rate": 0.0002, + "loss": 0.8276, + "step": 3930 + }, + { + "epoch": 0.6369735672136448, + "grad_norm": 0.43155938386917114, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 3940 + }, + { + "epoch": 0.6385902513943901, + "grad_norm": 0.6427084803581238, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 3950 + }, + { + "epoch": 0.6402069355751354, + "grad_norm": 0.541220486164093, + "learning_rate": 0.0002, + "loss": 0.8368, + "step": 3960 + }, + { + "epoch": 0.6418236197558806, + "grad_norm": 0.5414294600486755, + "learning_rate": 0.0002, + "loss": 0.8539, + "step": 3970 + }, + { + "epoch": 0.643440303936626, + "grad_norm": 0.46344003081321716, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 3980 + }, + { + "epoch": 0.6450569881173712, + "grad_norm": 0.45209285616874695, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 3990 + }, + { + "epoch": 0.6466736722981166, + "grad_norm": 0.5417284369468689, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 4000 + }, + { + "epoch": 0.6482903564788619, + "grad_norm": 0.7995685935020447, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 4010 + }, + { + "epoch": 0.6499070406596071, + "grad_norm": 0.6384002566337585, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 4020 + }, + { + "epoch": 0.6515237248403525, + "grad_norm": 0.4472815692424774, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 4030 + }, + { + "epoch": 0.6531404090210977, + "grad_norm": 0.6834294199943542, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 4040 + }, + { + "epoch": 0.654757093201843, + "grad_norm": 0.4612339735031128, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 4050 + }, + { + "epoch": 0.6563737773825883, + "grad_norm": 0.9266576170921326, + "learning_rate": 0.0002, + "loss": 0.8141, + "step": 4060 + }, + { + "epoch": 0.6579904615633336, + "grad_norm": 0.4470861852169037, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 4070 + }, + { + "epoch": 0.6596071457440789, + "grad_norm": 0.45544925332069397, + "learning_rate": 0.0002, + "loss": 0.8293, + "step": 4080 + }, + { + "epoch": 0.6612238299248242, + "grad_norm": 0.6144481301307678, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 4090 + }, + { + "epoch": 0.6628405141055694, + "grad_norm": 0.5936288237571716, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 4100 + }, + { + "epoch": 0.6644571982863148, + "grad_norm": 0.4822963774204254, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 4110 + }, + { + "epoch": 0.66607388246706, + "grad_norm": 0.48432496190071106, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 4120 + }, + { + "epoch": 0.6676905666478054, + "grad_norm": 0.4901607930660248, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 4130 + }, + { + "epoch": 0.6693072508285507, + "grad_norm": 0.5018393397331238, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 4140 + }, + { + "epoch": 0.6709239350092959, + "grad_norm": 0.6946378946304321, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 4150 + }, + { + "epoch": 0.6725406191900413, + "grad_norm": 0.5997390747070312, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 4160 + }, + { + "epoch": 0.6741573033707865, + "grad_norm": 0.6738849878311157, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 4170 + }, + { + "epoch": 0.6757739875515318, + "grad_norm": 0.6110581159591675, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 4180 + }, + { + "epoch": 0.6773906717322771, + "grad_norm": 0.5703322291374207, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 4190 + }, + { + "epoch": 0.6790073559130224, + "grad_norm": 0.4686066210269928, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 4200 + }, + { + "epoch": 0.6806240400937676, + "grad_norm": 0.6394643783569336, + "learning_rate": 0.0002, + "loss": 0.8441, + "step": 4210 + }, + { + "epoch": 0.682240724274513, + "grad_norm": 0.5454841256141663, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 4220 + }, + { + "epoch": 0.6838574084552582, + "grad_norm": 0.4859732985496521, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 4230 + }, + { + "epoch": 0.6854740926360036, + "grad_norm": 0.5544065833091736, + "learning_rate": 0.0002, + "loss": 0.8161, + "step": 4240 + }, + { + "epoch": 0.6870907768167488, + "grad_norm": 0.4902505576610565, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 4250 + }, + { + "epoch": 0.6887074609974941, + "grad_norm": 0.4768051505088806, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 4260 + }, + { + "epoch": 0.6903241451782395, + "grad_norm": 0.49982190132141113, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 4270 + }, + { + "epoch": 0.6919408293589847, + "grad_norm": 0.6351838111877441, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 4280 + }, + { + "epoch": 0.69355751353973, + "grad_norm": 0.5647561550140381, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 4290 + }, + { + "epoch": 0.6951741977204753, + "grad_norm": 0.5340486764907837, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 4300 + }, + { + "epoch": 0.6967908819012206, + "grad_norm": 0.5649092793464661, + "learning_rate": 0.0002, + "loss": 0.8526, + "step": 4310 + }, + { + "epoch": 0.6984075660819659, + "grad_norm": 0.6183916926383972, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 4320 + }, + { + "epoch": 0.7000242502627112, + "grad_norm": 0.6154509782791138, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 4330 + }, + { + "epoch": 0.7016409344434564, + "grad_norm": 0.5156264305114746, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 4340 + }, + { + "epoch": 0.7032576186242018, + "grad_norm": 0.562171459197998, + "learning_rate": 0.0002, + "loss": 0.8512, + "step": 4350 + }, + { + "epoch": 0.704874302804947, + "grad_norm": 0.4949502646923065, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 4360 + }, + { + "epoch": 0.7064909869856923, + "grad_norm": 0.5171684622764587, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 4370 + }, + { + "epoch": 0.7081076711664377, + "grad_norm": 0.6198443174362183, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 4380 + }, + { + "epoch": 0.7097243553471829, + "grad_norm": 0.5802276134490967, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 4390 + }, + { + "epoch": 0.7113410395279283, + "grad_norm": 0.41096967458724976, + "learning_rate": 0.0002, + "loss": 0.8797, + "step": 4400 + }, + { + "epoch": 0.7129577237086735, + "grad_norm": 0.4397392272949219, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 4410 + }, + { + "epoch": 0.7145744078894188, + "grad_norm": 0.45228442549705505, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 4420 + }, + { + "epoch": 0.7161910920701641, + "grad_norm": 0.4839673936367035, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 4430 + }, + { + "epoch": 0.7178077762509094, + "grad_norm": 0.6140755414962769, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4440 + }, + { + "epoch": 0.7194244604316546, + "grad_norm": 0.6841378808021545, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 4450 + }, + { + "epoch": 0.7210411446124, + "grad_norm": 0.6664239168167114, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 4460 + }, + { + "epoch": 0.7226578287931452, + "grad_norm": 0.47552719712257385, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 4470 + }, + { + "epoch": 0.7242745129738906, + "grad_norm": 0.6649776101112366, + "learning_rate": 0.0002, + "loss": 0.8982, + "step": 4480 + }, + { + "epoch": 0.7258911971546358, + "grad_norm": 0.5159541964530945, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 4490 + }, + { + "epoch": 0.7275078813353811, + "grad_norm": 0.6693112850189209, + "learning_rate": 0.0002, + "loss": 0.7786, + "step": 4500 + }, + { + "epoch": 0.7291245655161265, + "grad_norm": 0.48870977759361267, + "learning_rate": 0.0002, + "loss": 0.8655, + "step": 4510 + }, + { + "epoch": 0.7307412496968717, + "grad_norm": 0.4857887923717499, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 4520 + }, + { + "epoch": 0.732357933877617, + "grad_norm": 0.5515662431716919, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 4530 + }, + { + "epoch": 0.7339746180583623, + "grad_norm": 0.6292222738265991, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 4540 + }, + { + "epoch": 0.7355913022391076, + "grad_norm": 0.48265689611434937, + "learning_rate": 0.0002, + "loss": 0.7749, + "step": 4550 + }, + { + "epoch": 0.7372079864198529, + "grad_norm": 0.8044266104698181, + "learning_rate": 0.0002, + "loss": 0.8499, + "step": 4560 + }, + { + "epoch": 0.7388246706005982, + "grad_norm": 0.6111769676208496, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 4570 + }, + { + "epoch": 0.7404413547813434, + "grad_norm": 0.5229553580284119, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 4580 + }, + { + "epoch": 0.7420580389620888, + "grad_norm": 0.6054152250289917, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 4590 + }, + { + "epoch": 0.743674723142834, + "grad_norm": 0.5574966669082642, + "learning_rate": 0.0002, + "loss": 0.8169, + "step": 4600 + }, + { + "epoch": 0.7452914073235793, + "grad_norm": 0.5395817160606384, + "learning_rate": 0.0002, + "loss": 0.8439, + "step": 4610 + }, + { + "epoch": 0.7469080915043246, + "grad_norm": 0.7116472721099854, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 4620 + }, + { + "epoch": 0.7485247756850699, + "grad_norm": 0.5618700981140137, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 4630 + }, + { + "epoch": 0.7501414598658153, + "grad_norm": 0.5802770853042603, + "learning_rate": 0.0002, + "loss": 0.7744, + "step": 4640 + }, + { + "epoch": 0.7517581440465605, + "grad_norm": 0.5690428018569946, + "learning_rate": 0.0002, + "loss": 0.7924, + "step": 4650 + }, + { + "epoch": 0.7533748282273058, + "grad_norm": 0.4813360273838043, + "learning_rate": 0.0002, + "loss": 0.8017, + "step": 4660 + }, + { + "epoch": 0.7549915124080511, + "grad_norm": 0.5434042811393738, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 4670 + }, + { + "epoch": 0.7566081965887964, + "grad_norm": 0.5502099990844727, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 4680 + }, + { + "epoch": 0.7582248807695416, + "grad_norm": 0.6020621061325073, + "learning_rate": 0.0002, + "loss": 0.8598, + "step": 4690 + }, + { + "epoch": 0.759841564950287, + "grad_norm": 0.4922301471233368, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 4700 + }, + { + "epoch": 0.7614582491310322, + "grad_norm": 0.6492828726768494, + "learning_rate": 0.0002, + "loss": 0.788, + "step": 4710 + }, + { + "epoch": 0.7630749333117776, + "grad_norm": 0.4865580201148987, + "learning_rate": 0.0002, + "loss": 0.8313, + "step": 4720 + }, + { + "epoch": 0.7646916174925228, + "grad_norm": 0.5971422791481018, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 4730 + }, + { + "epoch": 0.7663083016732681, + "grad_norm": 0.6832674145698547, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 4740 + }, + { + "epoch": 0.7679249858540134, + "grad_norm": 0.500908613204956, + "learning_rate": 0.0002, + "loss": 0.8156, + "step": 4750 + }, + { + "epoch": 0.7695416700347587, + "grad_norm": 0.6112465858459473, + "learning_rate": 0.0002, + "loss": 0.8383, + "step": 4760 + }, + { + "epoch": 0.771158354215504, + "grad_norm": 0.5753506422042847, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 4770 + }, + { + "epoch": 0.7727750383962493, + "grad_norm": 0.6529405117034912, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 4780 + }, + { + "epoch": 0.7743917225769946, + "grad_norm": 0.5916843414306641, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 4790 + }, + { + "epoch": 0.7760084067577399, + "grad_norm": 0.4821224510669708, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 4800 + }, + { + "epoch": 0.7776250909384852, + "grad_norm": 0.5532580018043518, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 4810 + }, + { + "epoch": 0.7792417751192304, + "grad_norm": 0.4604877233505249, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 4820 + }, + { + "epoch": 0.7808584592999758, + "grad_norm": 0.5009613037109375, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 4830 + }, + { + "epoch": 0.782475143480721, + "grad_norm": 0.6448560357093811, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4840 + }, + { + "epoch": 0.7840918276614663, + "grad_norm": 0.44327953457832336, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 4850 + }, + { + "epoch": 0.7857085118422116, + "grad_norm": 0.5355411171913147, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 4860 + }, + { + "epoch": 0.7873251960229569, + "grad_norm": 0.5635677576065063, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 4870 + }, + { + "epoch": 0.7889418802037023, + "grad_norm": 0.5417491793632507, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 4880 + }, + { + "epoch": 0.7905585643844475, + "grad_norm": 0.4567430913448334, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 4890 + }, + { + "epoch": 0.7921752485651928, + "grad_norm": 0.44651296734809875, + "learning_rate": 0.0002, + "loss": 0.8454, + "step": 4900 + }, + { + "epoch": 0.7937919327459381, + "grad_norm": 0.5741217136383057, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 4910 + }, + { + "epoch": 0.7954086169266834, + "grad_norm": 0.6605045199394226, + "learning_rate": 0.0002, + "loss": 0.8093, + "step": 4920 + }, + { + "epoch": 0.7970253011074286, + "grad_norm": 0.5126531720161438, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 4930 + }, + { + "epoch": 0.798641985288174, + "grad_norm": 0.513648271560669, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 4940 + }, + { + "epoch": 0.8002586694689192, + "grad_norm": 0.5350404381752014, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 4950 + }, + { + "epoch": 0.8018753536496646, + "grad_norm": 0.5731674432754517, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 4960 + }, + { + "epoch": 0.8034920378304098, + "grad_norm": 0.5974258184432983, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 4970 + }, + { + "epoch": 0.8051087220111551, + "grad_norm": 0.8774799704551697, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 4980 + }, + { + "epoch": 0.8067254061919004, + "grad_norm": 0.5994430184364319, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 4990 + }, + { + "epoch": 0.8083420903726457, + "grad_norm": 0.4894903004169464, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 5000 + }, + { + "epoch": 0.809958774553391, + "grad_norm": 0.5218459367752075, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5010 + }, + { + "epoch": 0.8115754587341363, + "grad_norm": 0.5232468843460083, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 5020 + }, + { + "epoch": 0.8131921429148816, + "grad_norm": 0.44358372688293457, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 5030 + }, + { + "epoch": 0.8148088270956269, + "grad_norm": 0.6202037334442139, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 5040 + }, + { + "epoch": 0.8164255112763722, + "grad_norm": 0.7721474170684814, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 5050 + }, + { + "epoch": 0.8180421954571174, + "grad_norm": 0.5568501353263855, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 5060 + }, + { + "epoch": 0.8196588796378628, + "grad_norm": 0.49148809909820557, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 5070 + }, + { + "epoch": 0.821275563818608, + "grad_norm": 0.4956012964248657, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 5080 + }, + { + "epoch": 0.8228922479993533, + "grad_norm": 0.6078833937644958, + "learning_rate": 0.0002, + "loss": 0.8201, + "step": 5090 + }, + { + "epoch": 0.8245089321800986, + "grad_norm": 0.46906954050064087, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 5100 + }, + { + "epoch": 0.8261256163608439, + "grad_norm": 0.50812166929245, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 5110 + }, + { + "epoch": 0.8277423005415891, + "grad_norm": 0.5319661498069763, + "learning_rate": 0.0002, + "loss": 0.8243, + "step": 5120 + }, + { + "epoch": 0.8293589847223345, + "grad_norm": 0.4949689209461212, + "learning_rate": 0.0002, + "loss": 0.7798, + "step": 5130 + }, + { + "epoch": 0.8309756689030798, + "grad_norm": 0.5151591300964355, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 5140 + }, + { + "epoch": 0.8325923530838251, + "grad_norm": 0.5530214309692383, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 5150 + }, + { + "epoch": 0.8342090372645704, + "grad_norm": 0.6297410130500793, + "learning_rate": 0.0002, + "loss": 0.8251, + "step": 5160 + }, + { + "epoch": 0.8358257214453156, + "grad_norm": 0.5466840267181396, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 5170 + }, + { + "epoch": 0.837442405626061, + "grad_norm": 0.652913510799408, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 5180 + }, + { + "epoch": 0.8390590898068062, + "grad_norm": 0.5811293125152588, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 5190 + }, + { + "epoch": 0.8406757739875516, + "grad_norm": 0.5109550952911377, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 5200 + }, + { + "epoch": 0.8422924581682968, + "grad_norm": 0.4551706612110138, + "learning_rate": 0.0002, + "loss": 0.8077, + "step": 5210 + }, + { + "epoch": 0.8439091423490421, + "grad_norm": 0.5813754200935364, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5220 + }, + { + "epoch": 0.8455258265297874, + "grad_norm": 0.5856947898864746, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 5230 + }, + { + "epoch": 0.8471425107105327, + "grad_norm": 0.5482739210128784, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 5240 + }, + { + "epoch": 0.8487591948912779, + "grad_norm": 0.49023720622062683, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 5250 + }, + { + "epoch": 0.8503758790720233, + "grad_norm": 0.49472475051879883, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5260 + }, + { + "epoch": 0.8519925632527686, + "grad_norm": 0.5490226745605469, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 5270 + }, + { + "epoch": 0.8536092474335139, + "grad_norm": 0.5340665578842163, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 5280 + }, + { + "epoch": 0.8552259316142592, + "grad_norm": 0.5962483882904053, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 5290 + }, + { + "epoch": 0.8568426157950044, + "grad_norm": 0.586358368396759, + "learning_rate": 0.0002, + "loss": 0.8765, + "step": 5300 + }, + { + "epoch": 0.8584592999757498, + "grad_norm": 0.49120277166366577, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 5310 + }, + { + "epoch": 0.860075984156495, + "grad_norm": 0.5887332558631897, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 5320 + }, + { + "epoch": 0.8616926683372403, + "grad_norm": 0.42496153712272644, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 5330 + }, + { + "epoch": 0.8633093525179856, + "grad_norm": 0.5489874482154846, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 5340 + }, + { + "epoch": 0.8649260366987309, + "grad_norm": 0.5850813984870911, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 5350 + }, + { + "epoch": 0.8665427208794761, + "grad_norm": 0.517487108707428, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 5360 + }, + { + "epoch": 0.8681594050602215, + "grad_norm": 0.5339142680168152, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 5370 + }, + { + "epoch": 0.8697760892409668, + "grad_norm": 0.6236387491226196, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 5380 + }, + { + "epoch": 0.8713927734217121, + "grad_norm": 0.5752192735671997, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 5390 + }, + { + "epoch": 0.8730094576024574, + "grad_norm": 0.6724614500999451, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 5400 + }, + { + "epoch": 0.8746261417832026, + "grad_norm": 0.5280613303184509, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 5410 + }, + { + "epoch": 0.876242825963948, + "grad_norm": 0.44033288955688477, + "learning_rate": 0.0002, + "loss": 0.8231, + "step": 5420 + }, + { + "epoch": 0.8778595101446932, + "grad_norm": 0.5199708342552185, + "learning_rate": 0.0002, + "loss": 0.8839, + "step": 5430 + }, + { + "epoch": 0.8794761943254386, + "grad_norm": 0.46778348088264465, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 5440 + }, + { + "epoch": 0.8810928785061838, + "grad_norm": 0.4657754898071289, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 5450 + }, + { + "epoch": 0.8827095626869291, + "grad_norm": 0.5472902655601501, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 5460 + }, + { + "epoch": 0.8843262468676744, + "grad_norm": 0.4876766800880432, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 5470 + }, + { + "epoch": 0.8859429310484197, + "grad_norm": 0.5057248473167419, + "learning_rate": 0.0002, + "loss": 0.7906, + "step": 5480 + }, + { + "epoch": 0.8875596152291649, + "grad_norm": 0.4637320637702942, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 5490 + }, + { + "epoch": 0.8891762994099103, + "grad_norm": 0.471955806016922, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 5500 + }, + { + "epoch": 0.8907929835906556, + "grad_norm": 0.5209813714027405, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 5510 + }, + { + "epoch": 0.8924096677714008, + "grad_norm": 0.6213834285736084, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 5520 + }, + { + "epoch": 0.8940263519521462, + "grad_norm": 0.5215408205986023, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 5530 + }, + { + "epoch": 0.8956430361328914, + "grad_norm": 0.580478310585022, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 5540 + }, + { + "epoch": 0.8972597203136368, + "grad_norm": 0.49102169275283813, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 5550 + }, + { + "epoch": 0.898876404494382, + "grad_norm": 0.6043479442596436, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 5560 + }, + { + "epoch": 0.9004930886751273, + "grad_norm": 0.5636463165283203, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 5570 + }, + { + "epoch": 0.9021097728558726, + "grad_norm": 0.5620124340057373, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 5580 + }, + { + "epoch": 0.9037264570366179, + "grad_norm": 0.5206354856491089, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 5590 + }, + { + "epoch": 0.9053431412173631, + "grad_norm": 0.5798229575157166, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 5600 + }, + { + "epoch": 0.9069598253981085, + "grad_norm": 0.6428212523460388, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 5610 + }, + { + "epoch": 0.9085765095788537, + "grad_norm": 0.48064687848091125, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 5620 + }, + { + "epoch": 0.9101931937595991, + "grad_norm": 0.6347860097885132, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 5630 + }, + { + "epoch": 0.9118098779403444, + "grad_norm": 0.5353913307189941, + "learning_rate": 0.0002, + "loss": 0.851, + "step": 5640 + }, + { + "epoch": 0.9134265621210896, + "grad_norm": 0.5323944091796875, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 5650 + }, + { + "epoch": 0.915043246301835, + "grad_norm": 0.5261843204498291, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 5660 + }, + { + "epoch": 0.9166599304825802, + "grad_norm": 0.5451326966285706, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 5670 + }, + { + "epoch": 0.9182766146633256, + "grad_norm": 0.5183324217796326, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 5680 + }, + { + "epoch": 0.9198932988440708, + "grad_norm": 0.47229018807411194, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 5690 + }, + { + "epoch": 0.9215099830248161, + "grad_norm": 0.49180513620376587, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 5700 + }, + { + "epoch": 0.9231266672055614, + "grad_norm": 0.5419785380363464, + "learning_rate": 0.0002, + "loss": 0.8735, + "step": 5710 + }, + { + "epoch": 0.9247433513863067, + "grad_norm": 0.5408698916435242, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 5720 + }, + { + "epoch": 0.9263600355670519, + "grad_norm": 0.5286232829093933, + "learning_rate": 0.0002, + "loss": 0.7701, + "step": 5730 + }, + { + "epoch": 0.9279767197477973, + "grad_norm": 0.7539758086204529, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 5740 + }, + { + "epoch": 0.9295934039285425, + "grad_norm": 0.5166944861412048, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 5750 + }, + { + "epoch": 0.9312100881092878, + "grad_norm": 0.6601425409317017, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 5760 + }, + { + "epoch": 0.9328267722900332, + "grad_norm": 0.5029960870742798, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 5770 + }, + { + "epoch": 0.9344434564707784, + "grad_norm": 0.4926645755767822, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 5780 + }, + { + "epoch": 0.9360601406515238, + "grad_norm": 0.5739615559577942, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 5790 + }, + { + "epoch": 0.937676824832269, + "grad_norm": 0.5058279037475586, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 5800 + }, + { + "epoch": 0.9392935090130143, + "grad_norm": 0.5260962247848511, + "learning_rate": 0.0002, + "loss": 0.8537, + "step": 5810 + }, + { + "epoch": 0.9409101931937596, + "grad_norm": 0.5768588185310364, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 5820 + }, + { + "epoch": 0.9425268773745049, + "grad_norm": 0.5170126557350159, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 5830 + }, + { + "epoch": 0.9441435615552501, + "grad_norm": 0.5745864510536194, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 5840 + }, + { + "epoch": 0.9457602457359955, + "grad_norm": 0.5551357865333557, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 5850 + }, + { + "epoch": 0.9473769299167407, + "grad_norm": 0.5776078701019287, + "learning_rate": 0.0002, + "loss": 0.8529, + "step": 5860 + }, + { + "epoch": 0.9489936140974861, + "grad_norm": 0.5340062379837036, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 5870 + }, + { + "epoch": 0.9506102982782314, + "grad_norm": 0.6447290182113647, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 5880 + }, + { + "epoch": 0.9522269824589766, + "grad_norm": 0.5123815536499023, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 5890 + }, + { + "epoch": 0.953843666639722, + "grad_norm": 0.48547613620758057, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 5900 + }, + { + "epoch": 0.9554603508204672, + "grad_norm": 0.5791414976119995, + "learning_rate": 0.0002, + "loss": 0.7896, + "step": 5910 + }, + { + "epoch": 0.9570770350012126, + "grad_norm": 0.6195011734962463, + "learning_rate": 0.0002, + "loss": 0.8408, + "step": 5920 + }, + { + "epoch": 0.9586937191819578, + "grad_norm": 0.6323803067207336, + "learning_rate": 0.0002, + "loss": 0.7805, + "step": 5930 + }, + { + "epoch": 0.9603104033627031, + "grad_norm": 0.45552879571914673, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 5940 + }, + { + "epoch": 0.9619270875434484, + "grad_norm": 0.5796473622322083, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 5950 + }, + { + "epoch": 0.9635437717241937, + "grad_norm": 0.647261381149292, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 5960 + }, + { + "epoch": 0.9651604559049389, + "grad_norm": 0.5487682819366455, + "learning_rate": 0.0002, + "loss": 0.8086, + "step": 5970 + }, + { + "epoch": 0.9667771400856843, + "grad_norm": 0.5743663907051086, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 5980 + }, + { + "epoch": 0.9683938242664295, + "grad_norm": 0.5470591187477112, + "learning_rate": 0.0002, + "loss": 0.8153, + "step": 5990 + }, + { + "epoch": 0.9700105084471748, + "grad_norm": 0.5901660323143005, + "learning_rate": 0.0002, + "loss": 0.8119, + "step": 6000 + }, + { + "epoch": 0.9716271926279202, + "grad_norm": 0.6544759273529053, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 6010 + }, + { + "epoch": 0.9732438768086654, + "grad_norm": 0.6288470029830933, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 6020 + }, + { + "epoch": 0.9748605609894108, + "grad_norm": 0.673153817653656, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 6030 + }, + { + "epoch": 0.976477245170156, + "grad_norm": 0.42854753136634827, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 6040 + }, + { + "epoch": 0.9780939293509013, + "grad_norm": 0.5227066278457642, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 6050 + }, + { + "epoch": 0.9797106135316466, + "grad_norm": 0.5372416973114014, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 6060 + }, + { + "epoch": 0.9813272977123919, + "grad_norm": 0.6026402115821838, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 6070 + }, + { + "epoch": 0.9829439818931371, + "grad_norm": 0.49547791481018066, + "learning_rate": 0.0002, + "loss": 0.8347, + "step": 6080 + }, + { + "epoch": 0.9845606660738825, + "grad_norm": 0.4641951322555542, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 6090 + }, + { + "epoch": 0.9861773502546277, + "grad_norm": 0.5818535089492798, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 6100 + }, + { + "epoch": 0.9877940344353731, + "grad_norm": 0.63955157995224, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 6110 + }, + { + "epoch": 0.9894107186161183, + "grad_norm": 0.5649438500404358, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 6120 + }, + { + "epoch": 0.9910274027968636, + "grad_norm": 0.5290433168411255, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 6130 + }, + { + "epoch": 0.992644086977609, + "grad_norm": 0.6399374008178711, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 6140 + }, + { + "epoch": 0.9942607711583542, + "grad_norm": 0.6736576557159424, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 6150 + }, + { + "epoch": 0.9958774553390995, + "grad_norm": 0.515420138835907, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 6160 + }, + { + "epoch": 0.9974941395198448, + "grad_norm": 0.562677800655365, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 6170 + }, + { + "epoch": 0.9991108237005901, + "grad_norm": 0.7113858461380005, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 6180 + }, + { + "epoch": 0.9999191657909627, + "eval_loss": 1.0871200561523438, + "eval_runtime": 122.2071, + "eval_samples_per_second": 5.998, + "eval_steps_per_second": 0.753, + "step": 6185 + }, + { + "epoch": 1.0007275078813354, + "grad_norm": 0.7111801505088806, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 6190 + }, + { + "epoch": 1.0023441920620806, + "grad_norm": 0.5402125716209412, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 6200 + }, + { + "epoch": 1.003960876242826, + "grad_norm": 0.6098830103874207, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 6210 + }, + { + "epoch": 1.0055775604235713, + "grad_norm": 0.5829983353614807, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 6220 + }, + { + "epoch": 1.0071942446043165, + "grad_norm": 0.5614621043205261, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 6230 + }, + { + "epoch": 1.0088109287850617, + "grad_norm": 0.5954238772392273, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 6240 + }, + { + "epoch": 1.0104276129658072, + "grad_norm": 0.6480574607849121, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 6250 + }, + { + "epoch": 1.0120442971465524, + "grad_norm": 0.6051128506660461, + "learning_rate": 0.0002, + "loss": 0.7514, + "step": 6260 + }, + { + "epoch": 1.0136609813272976, + "grad_norm": 0.6318870782852173, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 6270 + }, + { + "epoch": 1.015277665508043, + "grad_norm": 0.5048980116844177, + "learning_rate": 0.0002, + "loss": 0.7178, + "step": 6280 + }, + { + "epoch": 1.0168943496887883, + "grad_norm": 0.6346936225891113, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 6290 + }, + { + "epoch": 1.0185110338695336, + "grad_norm": 0.5711665749549866, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 6300 + }, + { + "epoch": 1.0201277180502788, + "grad_norm": 0.5175361037254333, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 6310 + }, + { + "epoch": 1.0217444022310243, + "grad_norm": 0.5360831618309021, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 6320 + }, + { + "epoch": 1.0233610864117695, + "grad_norm": 0.614675760269165, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 6330 + }, + { + "epoch": 1.0249777705925147, + "grad_norm": 0.5626118183135986, + "learning_rate": 0.0002, + "loss": 0.7748, + "step": 6340 + }, + { + "epoch": 1.02659445477326, + "grad_norm": 0.574897289276123, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 6350 + }, + { + "epoch": 1.0282111389540054, + "grad_norm": 0.7185447812080383, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 6360 + }, + { + "epoch": 1.0298278231347506, + "grad_norm": 0.6705799698829651, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 6370 + }, + { + "epoch": 1.0314445073154959, + "grad_norm": 0.6740428805351257, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 6380 + }, + { + "epoch": 1.0330611914962413, + "grad_norm": 0.663902759552002, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 6390 + }, + { + "epoch": 1.0346778756769865, + "grad_norm": 0.5029543042182922, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 6400 + }, + { + "epoch": 1.0362945598577318, + "grad_norm": 0.7813863158226013, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 6410 + }, + { + "epoch": 1.037911244038477, + "grad_norm": 0.5396282076835632, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 6420 + }, + { + "epoch": 1.0395279282192225, + "grad_norm": 0.5253293514251709, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 6430 + }, + { + "epoch": 1.0411446123999677, + "grad_norm": 0.7236770987510681, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 6440 + }, + { + "epoch": 1.042761296580713, + "grad_norm": 0.5670917630195618, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 6450 + }, + { + "epoch": 1.0443779807614582, + "grad_norm": 0.6031978726387024, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 6460 + }, + { + "epoch": 1.0459946649422036, + "grad_norm": 0.5309213399887085, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 6470 + }, + { + "epoch": 1.0476113491229488, + "grad_norm": 0.7114651799201965, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 6480 + }, + { + "epoch": 1.049228033303694, + "grad_norm": 0.5591610670089722, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 6490 + }, + { + "epoch": 1.0508447174844395, + "grad_norm": 0.5185961127281189, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 6500 + }, + { + "epoch": 1.0524614016651848, + "grad_norm": 0.6510552167892456, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 6510 + }, + { + "epoch": 1.05407808584593, + "grad_norm": 0.6557928919792175, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 6520 + }, + { + "epoch": 1.0556947700266752, + "grad_norm": 0.6973192691802979, + "learning_rate": 0.0002, + "loss": 0.8056, + "step": 6530 + }, + { + "epoch": 1.0573114542074207, + "grad_norm": 0.6226583123207092, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 6540 + }, + { + "epoch": 1.058928138388166, + "grad_norm": 0.5633195638656616, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 6550 + }, + { + "epoch": 1.0605448225689111, + "grad_norm": 0.7466658353805542, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 6560 + }, + { + "epoch": 1.0621615067496564, + "grad_norm": 0.6462772488594055, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 6570 + }, + { + "epoch": 1.0637781909304018, + "grad_norm": 0.5266856551170349, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 6580 + }, + { + "epoch": 1.065394875111147, + "grad_norm": 0.534392774105072, + "learning_rate": 0.0002, + "loss": 0.7157, + "step": 6590 + }, + { + "epoch": 1.0670115592918923, + "grad_norm": 0.7514177560806274, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 6600 + }, + { + "epoch": 1.0686282434726375, + "grad_norm": 0.7593035697937012, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 6610 + }, + { + "epoch": 1.070244927653383, + "grad_norm": 0.5277858972549438, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 6620 + }, + { + "epoch": 1.0718616118341282, + "grad_norm": 0.5573670268058777, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 6630 + }, + { + "epoch": 1.0734782960148734, + "grad_norm": 0.6802396774291992, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 6640 + }, + { + "epoch": 1.0750949801956189, + "grad_norm": 0.7367215752601624, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 6650 + }, + { + "epoch": 1.0767116643763641, + "grad_norm": 0.5961891412734985, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 6660 + }, + { + "epoch": 1.0783283485571094, + "grad_norm": 0.5736313462257385, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 6670 + }, + { + "epoch": 1.0799450327378546, + "grad_norm": 0.619219183921814, + "learning_rate": 0.0002, + "loss": 0.7178, + "step": 6680 + }, + { + "epoch": 1.0815617169186, + "grad_norm": 0.6214390993118286, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 6690 + }, + { + "epoch": 1.0831784010993453, + "grad_norm": 0.564536988735199, + "learning_rate": 0.0002, + "loss": 0.7554, + "step": 6700 + }, + { + "epoch": 1.0847950852800905, + "grad_norm": 0.5838140249252319, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 6710 + }, + { + "epoch": 1.0864117694608357, + "grad_norm": 0.7000553607940674, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 6720 + }, + { + "epoch": 1.0880284536415812, + "grad_norm": 0.7078263759613037, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 6730 + }, + { + "epoch": 1.0896451378223264, + "grad_norm": 0.8353848457336426, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 6740 + }, + { + "epoch": 1.0912618220030716, + "grad_norm": 0.5615518689155579, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 6750 + }, + { + "epoch": 1.0928785061838169, + "grad_norm": 0.5475581288337708, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 6760 + }, + { + "epoch": 1.0944951903645623, + "grad_norm": 0.5835978388786316, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 6770 + }, + { + "epoch": 1.0961118745453076, + "grad_norm": 0.5516105890274048, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 6780 + }, + { + "epoch": 1.0977285587260528, + "grad_norm": 0.5875251889228821, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 6790 + }, + { + "epoch": 1.0993452429067982, + "grad_norm": 0.7376947999000549, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 6800 + }, + { + "epoch": 1.1009619270875435, + "grad_norm": 0.5656165480613708, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 6810 + }, + { + "epoch": 1.1025786112682887, + "grad_norm": 0.6365954279899597, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 6820 + }, + { + "epoch": 1.104195295449034, + "grad_norm": 0.5033080577850342, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 6830 + }, + { + "epoch": 1.1058119796297794, + "grad_norm": 0.617396891117096, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 6840 + }, + { + "epoch": 1.1074286638105246, + "grad_norm": 0.6395374536514282, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 6850 + }, + { + "epoch": 1.1090453479912699, + "grad_norm": 0.6775295734405518, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 6860 + }, + { + "epoch": 1.1106620321720153, + "grad_norm": 0.6655223965644836, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 6870 + }, + { + "epoch": 1.1122787163527605, + "grad_norm": 0.676655113697052, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 6880 + }, + { + "epoch": 1.1138954005335058, + "grad_norm": 0.6062718629837036, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 6890 + }, + { + "epoch": 1.115512084714251, + "grad_norm": 0.590943455696106, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 6900 + }, + { + "epoch": 1.1171287688949965, + "grad_norm": 0.6315317153930664, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 6910 + }, + { + "epoch": 1.1187454530757417, + "grad_norm": 0.47979024052619934, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 6920 + }, + { + "epoch": 1.120362137256487, + "grad_norm": 0.647298276424408, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 6930 + }, + { + "epoch": 1.1219788214372322, + "grad_norm": 0.7336484789848328, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 6940 + }, + { + "epoch": 1.1235955056179776, + "grad_norm": 0.5071424245834351, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 6950 + }, + { + "epoch": 1.1252121897987228, + "grad_norm": 0.6527144312858582, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 6960 + }, + { + "epoch": 1.126828873979468, + "grad_norm": 0.6935935020446777, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 6970 + }, + { + "epoch": 1.1284455581602133, + "grad_norm": 0.8026931881904602, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 6980 + }, + { + "epoch": 1.1300622423409588, + "grad_norm": 0.5210393667221069, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 6990 + }, + { + "epoch": 1.131678926521704, + "grad_norm": 0.60475093126297, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 7000 + }, + { + "epoch": 1.1332956107024492, + "grad_norm": 0.6417073607444763, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 7010 + }, + { + "epoch": 1.1349122948831947, + "grad_norm": 0.6732175946235657, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 7020 + }, + { + "epoch": 1.13652897906394, + "grad_norm": 0.6719491481781006, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 7030 + }, + { + "epoch": 1.1381456632446851, + "grad_norm": 0.5708295106887817, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 7040 + }, + { + "epoch": 1.1397623474254304, + "grad_norm": 0.7141719460487366, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 7050 + }, + { + "epoch": 1.1413790316061758, + "grad_norm": 0.6187017560005188, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 7060 + }, + { + "epoch": 1.142995715786921, + "grad_norm": 0.50581294298172, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 7070 + }, + { + "epoch": 1.1446123999676663, + "grad_norm": 0.5620143413543701, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 7080 + }, + { + "epoch": 1.1462290841484115, + "grad_norm": 0.6231929659843445, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 7090 + }, + { + "epoch": 1.147845768329157, + "grad_norm": 0.5775774121284485, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 7100 + }, + { + "epoch": 1.1494624525099022, + "grad_norm": 0.6492809653282166, + "learning_rate": 0.0002, + "loss": 0.7728, + "step": 7110 + }, + { + "epoch": 1.1510791366906474, + "grad_norm": 0.6434972286224365, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 7120 + }, + { + "epoch": 1.1526958208713927, + "grad_norm": 0.6191812753677368, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 7130 + }, + { + "epoch": 1.1543125050521381, + "grad_norm": 0.6690331697463989, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 7140 + }, + { + "epoch": 1.1559291892328833, + "grad_norm": 0.5977938175201416, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 7150 + }, + { + "epoch": 1.1575458734136286, + "grad_norm": 0.6195854544639587, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 7160 + }, + { + "epoch": 1.159162557594374, + "grad_norm": 0.5752048492431641, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 7170 + }, + { + "epoch": 1.1607792417751193, + "grad_norm": 0.589081883430481, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 7180 + }, + { + "epoch": 1.1623959259558645, + "grad_norm": 0.756996750831604, + "learning_rate": 0.0002, + "loss": 0.704, + "step": 7190 + }, + { + "epoch": 1.1640126101366097, + "grad_norm": 0.7614967226982117, + "learning_rate": 0.0002, + "loss": 0.7404, + "step": 7200 + }, + { + "epoch": 1.1656292943173552, + "grad_norm": 0.6120437979698181, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 7210 + }, + { + "epoch": 1.1672459784981004, + "grad_norm": 0.6210004687309265, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 7220 + }, + { + "epoch": 1.1688626626788456, + "grad_norm": 0.6044116020202637, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 7230 + }, + { + "epoch": 1.170479346859591, + "grad_norm": 0.5418457388877869, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 7240 + }, + { + "epoch": 1.1720960310403363, + "grad_norm": 0.6413537263870239, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 7250 + }, + { + "epoch": 1.1737127152210816, + "grad_norm": 0.5777867436408997, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 7260 + }, + { + "epoch": 1.1753293994018268, + "grad_norm": 0.7092402577400208, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 7270 + }, + { + "epoch": 1.176946083582572, + "grad_norm": 0.6351709365844727, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 7280 + }, + { + "epoch": 1.1785627677633175, + "grad_norm": 0.6172189712524414, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 7290 + }, + { + "epoch": 1.1801794519440627, + "grad_norm": 0.6801714897155762, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 7300 + }, + { + "epoch": 1.181796136124808, + "grad_norm": 0.6044712066650391, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 7310 + }, + { + "epoch": 1.1834128203055534, + "grad_norm": 0.7413212060928345, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 7320 + }, + { + "epoch": 1.1850295044862986, + "grad_norm": 0.5303856134414673, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 7330 + }, + { + "epoch": 1.1866461886670439, + "grad_norm": 0.5647098422050476, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 7340 + }, + { + "epoch": 1.188262872847789, + "grad_norm": 0.7374135255813599, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 7350 + }, + { + "epoch": 1.1898795570285345, + "grad_norm": 0.5710089206695557, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 7360 + }, + { + "epoch": 1.1914962412092798, + "grad_norm": 0.6073619723320007, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7370 + }, + { + "epoch": 1.193112925390025, + "grad_norm": 0.5899916887283325, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 7380 + }, + { + "epoch": 1.1947296095707705, + "grad_norm": 0.7762434482574463, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 7390 + }, + { + "epoch": 1.1963462937515157, + "grad_norm": 0.679949939250946, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 7400 + }, + { + "epoch": 1.197962977932261, + "grad_norm": 0.6106849312782288, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 7410 + }, + { + "epoch": 1.1995796621130062, + "grad_norm": 0.682461678981781, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 7420 + }, + { + "epoch": 1.2011963462937516, + "grad_norm": 0.6087017059326172, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 7430 + }, + { + "epoch": 1.2028130304744968, + "grad_norm": 0.63739013671875, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 7440 + }, + { + "epoch": 1.204429714655242, + "grad_norm": 0.6154777407646179, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 7450 + }, + { + "epoch": 1.2060463988359873, + "grad_norm": 0.7491534948348999, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 7460 + }, + { + "epoch": 1.2076630830167328, + "grad_norm": 0.6664797067642212, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 7470 + }, + { + "epoch": 1.209279767197478, + "grad_norm": 0.6660266518592834, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 7480 + }, + { + "epoch": 1.2108964513782232, + "grad_norm": 0.6972551345825195, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 7490 + }, + { + "epoch": 1.2125131355589684, + "grad_norm": 0.6157945990562439, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 7500 + }, + { + "epoch": 1.214129819739714, + "grad_norm": 0.5199310183525085, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 7510 + }, + { + "epoch": 1.2157465039204591, + "grad_norm": 0.577610433101654, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 7520 + }, + { + "epoch": 1.2173631881012044, + "grad_norm": 0.53652423620224, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 7530 + }, + { + "epoch": 1.2189798722819498, + "grad_norm": 0.6479050517082214, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 7540 + }, + { + "epoch": 1.220596556462695, + "grad_norm": 0.618748128414154, + "learning_rate": 0.0002, + "loss": 0.7534, + "step": 7550 + }, + { + "epoch": 1.2222132406434403, + "grad_norm": 0.6311424374580383, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 7560 + }, + { + "epoch": 1.2238299248241855, + "grad_norm": 0.6595825552940369, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 7570 + }, + { + "epoch": 1.225446609004931, + "grad_norm": 0.5198960900306702, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 7580 + }, + { + "epoch": 1.2270632931856762, + "grad_norm": 0.578650712966919, + "learning_rate": 0.0002, + "loss": 0.674, + "step": 7590 + }, + { + "epoch": 1.2286799773664214, + "grad_norm": 0.6080220937728882, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 7600 + }, + { + "epoch": 1.2302966615471669, + "grad_norm": 0.7050248384475708, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 7610 + }, + { + "epoch": 1.2319133457279121, + "grad_norm": 0.6652196049690247, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 7620 + }, + { + "epoch": 1.2335300299086573, + "grad_norm": 0.7322776317596436, + "learning_rate": 0.0002, + "loss": 0.7085, + "step": 7630 + }, + { + "epoch": 1.2351467140894026, + "grad_norm": 0.4998728036880493, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 7640 + }, + { + "epoch": 1.2367633982701478, + "grad_norm": 0.6428788900375366, + "learning_rate": 0.0002, + "loss": 0.7214, + "step": 7650 + }, + { + "epoch": 1.2383800824508933, + "grad_norm": 0.585242509841919, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 7660 + }, + { + "epoch": 1.2399967666316385, + "grad_norm": 0.5211917757987976, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 7670 + }, + { + "epoch": 1.2416134508123837, + "grad_norm": 0.6490384340286255, + "learning_rate": 0.0002, + "loss": 0.746, + "step": 7680 + }, + { + "epoch": 1.2432301349931292, + "grad_norm": 0.6249763369560242, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 7690 + }, + { + "epoch": 1.2448468191738744, + "grad_norm": 0.71870356798172, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 7700 + }, + { + "epoch": 1.2464635033546196, + "grad_norm": 0.6761967539787292, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 7710 + }, + { + "epoch": 1.2480801875353649, + "grad_norm": 0.6500617265701294, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 7720 + }, + { + "epoch": 1.2496968717161103, + "grad_norm": 0.8069869875907898, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 7730 + }, + { + "epoch": 1.2513135558968556, + "grad_norm": 0.6044608950614929, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 7740 + }, + { + "epoch": 1.2529302400776008, + "grad_norm": 0.6573283076286316, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 7750 + }, + { + "epoch": 1.2545469242583462, + "grad_norm": 0.625430166721344, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 7760 + }, + { + "epoch": 1.2561636084390915, + "grad_norm": 0.5442022681236267, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 7770 + }, + { + "epoch": 1.2577802926198367, + "grad_norm": 0.6818386912345886, + "learning_rate": 0.0002, + "loss": 0.7172, + "step": 7780 + }, + { + "epoch": 1.259396976800582, + "grad_norm": 0.6381874084472656, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 7790 + }, + { + "epoch": 1.2610136609813272, + "grad_norm": 0.6269212961196899, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 7800 + }, + { + "epoch": 1.2626303451620726, + "grad_norm": 0.600121259689331, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 7810 + }, + { + "epoch": 1.2642470293428179, + "grad_norm": 0.6337703466415405, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 7820 + }, + { + "epoch": 1.2658637135235633, + "grad_norm": 0.7234963774681091, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 7830 + }, + { + "epoch": 1.2674803977043085, + "grad_norm": 0.800184965133667, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 7840 + }, + { + "epoch": 1.2690970818850538, + "grad_norm": 0.7539464831352234, + "learning_rate": 0.0002, + "loss": 0.7426, + "step": 7850 + }, + { + "epoch": 1.270713766065799, + "grad_norm": 0.5493760704994202, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 7860 + }, + { + "epoch": 1.2723304502465442, + "grad_norm": 0.7477145791053772, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 7870 + }, + { + "epoch": 1.2739471344272897, + "grad_norm": 0.6366362571716309, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 7880 + }, + { + "epoch": 1.275563818608035, + "grad_norm": 0.7419533729553223, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 7890 + }, + { + "epoch": 1.2771805027887801, + "grad_norm": 0.6141223311424255, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 7900 + }, + { + "epoch": 1.2787971869695256, + "grad_norm": 0.7522598505020142, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 7910 + }, + { + "epoch": 1.2804138711502708, + "grad_norm": 0.6935804486274719, + "learning_rate": 0.0002, + "loss": 0.7097, + "step": 7920 + }, + { + "epoch": 1.282030555331016, + "grad_norm": 0.7239290475845337, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 7930 + }, + { + "epoch": 1.2836472395117613, + "grad_norm": 0.8800187110900879, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 7940 + }, + { + "epoch": 1.2852639236925067, + "grad_norm": 0.540458083152771, + "learning_rate": 0.0002, + "loss": 0.6991, + "step": 7950 + }, + { + "epoch": 1.286880607873252, + "grad_norm": 0.6492934226989746, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 7960 + }, + { + "epoch": 1.2884972920539972, + "grad_norm": 0.6543959379196167, + "learning_rate": 0.0002, + "loss": 0.7742, + "step": 7970 + }, + { + "epoch": 1.2901139762347427, + "grad_norm": 0.5804705619812012, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 7980 + }, + { + "epoch": 1.291730660415488, + "grad_norm": 0.7074727416038513, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 7990 + }, + { + "epoch": 1.2933473445962331, + "grad_norm": 0.5347974300384521, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 8000 + }, + { + "epoch": 1.2949640287769784, + "grad_norm": 0.6457298398017883, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 8010 + }, + { + "epoch": 1.2965807129577236, + "grad_norm": 0.6407219171524048, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 8020 + }, + { + "epoch": 1.298197397138469, + "grad_norm": 0.828439474105835, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 8030 + }, + { + "epoch": 1.2998140813192143, + "grad_norm": 0.4840380549430847, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 8040 + }, + { + "epoch": 1.3014307654999595, + "grad_norm": 0.5921024680137634, + "learning_rate": 0.0002, + "loss": 0.7283, + "step": 8050 + }, + { + "epoch": 1.303047449680705, + "grad_norm": 0.6170315146446228, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 8060 + }, + { + "epoch": 1.3046641338614502, + "grad_norm": 0.5374847054481506, + "learning_rate": 0.0002, + "loss": 0.7534, + "step": 8070 + }, + { + "epoch": 1.3062808180421954, + "grad_norm": 0.545758068561554, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 8080 + }, + { + "epoch": 1.3078975022229407, + "grad_norm": 0.55641770362854, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 8090 + }, + { + "epoch": 1.309514186403686, + "grad_norm": 0.6724897027015686, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8100 + }, + { + "epoch": 1.3111308705844313, + "grad_norm": 0.6923972368240356, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 8110 + }, + { + "epoch": 1.3127475547651766, + "grad_norm": 0.5136841535568237, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 8120 + }, + { + "epoch": 1.314364238945922, + "grad_norm": 0.6766283512115479, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 8130 + }, + { + "epoch": 1.3159809231266673, + "grad_norm": 0.6283926367759705, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 8140 + }, + { + "epoch": 1.3175976073074125, + "grad_norm": 0.644216001033783, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 8150 + }, + { + "epoch": 1.3192142914881577, + "grad_norm": 0.7827503085136414, + "learning_rate": 0.0002, + "loss": 0.7125, + "step": 8160 + }, + { + "epoch": 1.320830975668903, + "grad_norm": 0.6651390790939331, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 8170 + }, + { + "epoch": 1.3224476598496484, + "grad_norm": 0.5547412633895874, + "learning_rate": 0.0002, + "loss": 0.7778, + "step": 8180 + }, + { + "epoch": 1.3240643440303936, + "grad_norm": 0.6765179634094238, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 8190 + }, + { + "epoch": 1.325681028211139, + "grad_norm": 0.6822077035903931, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 8200 + }, + { + "epoch": 1.3272977123918843, + "grad_norm": 0.5941002368927002, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 8210 + }, + { + "epoch": 1.3289143965726296, + "grad_norm": 0.4850037097930908, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 8220 + }, + { + "epoch": 1.3305310807533748, + "grad_norm": 0.6162990927696228, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 8230 + }, + { + "epoch": 1.33214776493412, + "grad_norm": 0.6665613651275635, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 8240 + }, + { + "epoch": 1.3337644491148655, + "grad_norm": 0.618192732334137, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 8250 + }, + { + "epoch": 1.3353811332956107, + "grad_norm": 0.710418701171875, + "learning_rate": 0.0002, + "loss": 0.7532, + "step": 8260 + }, + { + "epoch": 1.336997817476356, + "grad_norm": 0.5109876990318298, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 8270 + }, + { + "epoch": 1.3386145016571014, + "grad_norm": 0.6791711449623108, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 8280 + }, + { + "epoch": 1.3402311858378466, + "grad_norm": 0.6836432814598083, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8290 + }, + { + "epoch": 1.3418478700185918, + "grad_norm": 0.5579386353492737, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8300 + }, + { + "epoch": 1.343464554199337, + "grad_norm": 0.6713546514511108, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 8310 + }, + { + "epoch": 1.3450812383800825, + "grad_norm": 0.5353720188140869, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 8320 + }, + { + "epoch": 1.3466979225608278, + "grad_norm": 0.5813682675361633, + "learning_rate": 0.0002, + "loss": 0.718, + "step": 8330 + }, + { + "epoch": 1.348314606741573, + "grad_norm": 0.8158791661262512, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 8340 + }, + { + "epoch": 1.3499312909223184, + "grad_norm": 0.6193785071372986, + "learning_rate": 0.0002, + "loss": 0.6992, + "step": 8350 + }, + { + "epoch": 1.3515479751030637, + "grad_norm": 0.6353939771652222, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 8360 + }, + { + "epoch": 1.353164659283809, + "grad_norm": 0.6925048232078552, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 8370 + }, + { + "epoch": 1.3547813434645541, + "grad_norm": 0.988264799118042, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 8380 + }, + { + "epoch": 1.3563980276452994, + "grad_norm": 0.6476002931594849, + "learning_rate": 0.0002, + "loss": 0.7744, + "step": 8390 + }, + { + "epoch": 1.3580147118260448, + "grad_norm": 0.7120398879051208, + "learning_rate": 0.0002, + "loss": 0.776, + "step": 8400 + }, + { + "epoch": 1.35963139600679, + "grad_norm": 0.9048416614532471, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 8410 + }, + { + "epoch": 1.3612480801875353, + "grad_norm": 0.7000672817230225, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 8420 + }, + { + "epoch": 1.3628647643682807, + "grad_norm": 0.6015632152557373, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 8430 + }, + { + "epoch": 1.364481448549026, + "grad_norm": 0.612516462802887, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 8440 + }, + { + "epoch": 1.3660981327297712, + "grad_norm": 0.5969301462173462, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 8450 + }, + { + "epoch": 1.3677148169105164, + "grad_norm": 0.6730654239654541, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 8460 + }, + { + "epoch": 1.369331501091262, + "grad_norm": 0.6386392116546631, + "learning_rate": 0.0002, + "loss": 0.7465, + "step": 8470 + }, + { + "epoch": 1.3709481852720071, + "grad_norm": 0.739544153213501, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 8480 + }, + { + "epoch": 1.3725648694527524, + "grad_norm": 0.6462782621383667, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 8490 + }, + { + "epoch": 1.3741815536334978, + "grad_norm": 0.7346843481063843, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 8500 + }, + { + "epoch": 1.375798237814243, + "grad_norm": 0.6884821057319641, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 8510 + }, + { + "epoch": 1.3774149219949883, + "grad_norm": 0.6999333500862122, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 8520 + }, + { + "epoch": 1.3790316061757335, + "grad_norm": 0.5378713011741638, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 8530 + }, + { + "epoch": 1.3806482903564787, + "grad_norm": 0.5417906641960144, + "learning_rate": 0.0002, + "loss": 0.6797, + "step": 8540 + }, + { + "epoch": 1.3822649745372242, + "grad_norm": 0.6602526307106018, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 8550 + }, + { + "epoch": 1.3838816587179694, + "grad_norm": 0.7073674201965332, + "learning_rate": 0.0002, + "loss": 0.7356, + "step": 8560 + }, + { + "epoch": 1.3854983428987149, + "grad_norm": 0.5841707587242126, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 8570 + }, + { + "epoch": 1.38711502707946, + "grad_norm": 0.7031095027923584, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 8580 + }, + { + "epoch": 1.3887317112602053, + "grad_norm": 0.5198570489883423, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 8590 + }, + { + "epoch": 1.3903483954409506, + "grad_norm": 0.7261320352554321, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 8600 + }, + { + "epoch": 1.3919650796216958, + "grad_norm": 0.5616350173950195, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 8610 + }, + { + "epoch": 1.3935817638024413, + "grad_norm": 0.5185914635658264, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 8620 + }, + { + "epoch": 1.3951984479831865, + "grad_norm": 0.5814694762229919, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 8630 + }, + { + "epoch": 1.3968151321639317, + "grad_norm": 0.6977371573448181, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 8640 + }, + { + "epoch": 1.3984318163446772, + "grad_norm": 0.6855689883232117, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 8650 + }, + { + "epoch": 1.4000485005254224, + "grad_norm": 0.5414357781410217, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 8660 + }, + { + "epoch": 1.4016651847061676, + "grad_norm": 0.6970012784004211, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 8670 + }, + { + "epoch": 1.4032818688869129, + "grad_norm": 0.526079535484314, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 8680 + }, + { + "epoch": 1.404898553067658, + "grad_norm": 0.758712887763977, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 8690 + }, + { + "epoch": 1.4065152372484035, + "grad_norm": 0.7118762731552124, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 8700 + }, + { + "epoch": 1.4081319214291488, + "grad_norm": 0.5696909427642822, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 8710 + }, + { + "epoch": 1.4097486056098942, + "grad_norm": 0.7995436787605286, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 8720 + }, + { + "epoch": 1.4113652897906395, + "grad_norm": 0.7237521409988403, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 8730 + }, + { + "epoch": 1.4129819739713847, + "grad_norm": 0.744628369808197, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 8740 + }, + { + "epoch": 1.41459865815213, + "grad_norm": 0.6082926988601685, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 8750 + }, + { + "epoch": 1.4162153423328752, + "grad_norm": 0.5185243487358093, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 8760 + }, + { + "epoch": 1.4178320265136206, + "grad_norm": 0.5183082222938538, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 8770 + }, + { + "epoch": 1.4194487106943658, + "grad_norm": 0.7326041460037231, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 8780 + }, + { + "epoch": 1.421065394875111, + "grad_norm": 0.7174660563468933, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 8790 + }, + { + "epoch": 1.4226820790558565, + "grad_norm": 0.8080165982246399, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 8800 + }, + { + "epoch": 1.4242987632366018, + "grad_norm": 0.5061507821083069, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 8810 + }, + { + "epoch": 1.425915447417347, + "grad_norm": 0.801602840423584, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 8820 + }, + { + "epoch": 1.4275321315980922, + "grad_norm": 0.6150273084640503, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 8830 + }, + { + "epoch": 1.4291488157788377, + "grad_norm": 0.8786525726318359, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 8840 + }, + { + "epoch": 1.430765499959583, + "grad_norm": 0.6371538639068604, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 8850 + }, + { + "epoch": 1.4323821841403281, + "grad_norm": 0.6409295797348022, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 8860 + }, + { + "epoch": 1.4339988683210736, + "grad_norm": 0.6452359557151794, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 8870 + }, + { + "epoch": 1.4356155525018188, + "grad_norm": 0.5842334628105164, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 8880 + }, + { + "epoch": 1.437232236682564, + "grad_norm": 0.696761965751648, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 8890 + }, + { + "epoch": 1.4388489208633093, + "grad_norm": 0.6384600400924683, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 8900 + }, + { + "epoch": 1.4404656050440545, + "grad_norm": 0.5981136560440063, + "learning_rate": 0.0002, + "loss": 0.7049, + "step": 8910 + }, + { + "epoch": 1.4420822892248, + "grad_norm": 0.6355637907981873, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 8920 + }, + { + "epoch": 1.4436989734055452, + "grad_norm": 0.6374830603599548, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 8930 + }, + { + "epoch": 1.4453156575862904, + "grad_norm": 0.559013307094574, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 8940 + }, + { + "epoch": 1.446932341767036, + "grad_norm": 0.7289170026779175, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 8950 + }, + { + "epoch": 1.4485490259477811, + "grad_norm": 0.8649206757545471, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 8960 + }, + { + "epoch": 1.4501657101285264, + "grad_norm": 0.7664689421653748, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 8970 + }, + { + "epoch": 1.4517823943092716, + "grad_norm": 0.7109952569007874, + "learning_rate": 0.0002, + "loss": 0.7431, + "step": 8980 + }, + { + "epoch": 1.453399078490017, + "grad_norm": 0.6312844753265381, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 8990 + }, + { + "epoch": 1.4550157626707623, + "grad_norm": 0.6616617441177368, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 9000 + }, + { + "epoch": 1.4566324468515075, + "grad_norm": 0.7384068965911865, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 9010 + }, + { + "epoch": 1.458249131032253, + "grad_norm": 0.6549670100212097, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 9020 + }, + { + "epoch": 1.4598658152129982, + "grad_norm": 0.6254119277000427, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 9030 + }, + { + "epoch": 1.4614824993937434, + "grad_norm": 0.6806328892707825, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 9040 + }, + { + "epoch": 1.4630991835744886, + "grad_norm": 0.6803115010261536, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 9050 + }, + { + "epoch": 1.4647158677552339, + "grad_norm": 0.48529282212257385, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 9060 + }, + { + "epoch": 1.4663325519359793, + "grad_norm": 0.5995030999183655, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 9070 + }, + { + "epoch": 1.4679492361167246, + "grad_norm": 0.6005427837371826, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 9080 + }, + { + "epoch": 1.46956592029747, + "grad_norm": 0.718564510345459, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 9090 + }, + { + "epoch": 1.4711826044782153, + "grad_norm": 0.7003577351570129, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 9100 + }, + { + "epoch": 1.4727992886589605, + "grad_norm": 0.5888323783874512, + "learning_rate": 0.0002, + "loss": 0.8069, + "step": 9110 + }, + { + "epoch": 1.4744159728397057, + "grad_norm": 0.6417609453201294, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 9120 + }, + { + "epoch": 1.476032657020451, + "grad_norm": 0.572294294834137, + "learning_rate": 0.0002, + "loss": 0.7441, + "step": 9130 + }, + { + "epoch": 1.4776493412011964, + "grad_norm": 0.8200714588165283, + "learning_rate": 0.0002, + "loss": 0.8053, + "step": 9140 + }, + { + "epoch": 1.4792660253819416, + "grad_norm": 0.6343288421630859, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 9150 + }, + { + "epoch": 1.4808827095626869, + "grad_norm": 0.7017961144447327, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 9160 + }, + { + "epoch": 1.4824993937434323, + "grad_norm": 0.6202912926673889, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 9170 + }, + { + "epoch": 1.4841160779241775, + "grad_norm": 0.6677869558334351, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 9180 + }, + { + "epoch": 1.4857327621049228, + "grad_norm": 0.6052267551422119, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 9190 + }, + { + "epoch": 1.487349446285668, + "grad_norm": 0.6638872027397156, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 9200 + }, + { + "epoch": 1.4889661304664135, + "grad_norm": 0.6245523691177368, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 9210 + }, + { + "epoch": 1.4905828146471587, + "grad_norm": 0.5761767625808716, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 9220 + }, + { + "epoch": 1.492199498827904, + "grad_norm": 0.8175981640815735, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 9230 + }, + { + "epoch": 1.4938161830086494, + "grad_norm": 0.9144009947776794, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 9240 + }, + { + "epoch": 1.4954328671893946, + "grad_norm": 0.5742552876472473, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 9250 + }, + { + "epoch": 1.4970495513701398, + "grad_norm": 0.534534215927124, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 9260 + }, + { + "epoch": 1.498666235550885, + "grad_norm": 0.7836225032806396, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 9270 + }, + { + "epoch": 1.5002829197316303, + "grad_norm": 0.5292993187904358, + "learning_rate": 0.0002, + "loss": 0.7453, + "step": 9280 + }, + { + "epoch": 1.5018996039123758, + "grad_norm": 0.8044071793556213, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 9290 + }, + { + "epoch": 1.503516288093121, + "grad_norm": 0.6185805201530457, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 9300 + }, + { + "epoch": 1.5051329722738664, + "grad_norm": 0.6093607544898987, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 9310 + }, + { + "epoch": 1.5067496564546117, + "grad_norm": 0.5891730189323425, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 9320 + }, + { + "epoch": 1.508366340635357, + "grad_norm": 0.6331129670143127, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 9330 + }, + { + "epoch": 1.5099830248161021, + "grad_norm": 0.7690958380699158, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 9340 + }, + { + "epoch": 1.5115997089968474, + "grad_norm": 0.6548877358436584, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 9350 + }, + { + "epoch": 1.5132163931775926, + "grad_norm": 0.6545143127441406, + "learning_rate": 0.0002, + "loss": 0.7408, + "step": 9360 + }, + { + "epoch": 1.514833077358338, + "grad_norm": 0.553247332572937, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 9370 + }, + { + "epoch": 1.5164497615390833, + "grad_norm": 0.8145074844360352, + "learning_rate": 0.0002, + "loss": 0.7265, + "step": 9380 + }, + { + "epoch": 1.5180664457198287, + "grad_norm": 0.7636994123458862, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 9390 + }, + { + "epoch": 1.519683129900574, + "grad_norm": 0.6838982701301575, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 9400 + }, + { + "epoch": 1.5212998140813192, + "grad_norm": 0.8599441647529602, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 9410 + }, + { + "epoch": 1.5229164982620644, + "grad_norm": 0.7020329833030701, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 9420 + }, + { + "epoch": 1.5245331824428097, + "grad_norm": 0.6964772343635559, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 9430 + }, + { + "epoch": 1.5261498666235551, + "grad_norm": 0.6916600465774536, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 9440 + }, + { + "epoch": 1.5277665508043003, + "grad_norm": 0.7282621264457703, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 9450 + }, + { + "epoch": 1.5293832349850458, + "grad_norm": 0.5363983511924744, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 9460 + }, + { + "epoch": 1.530999919165791, + "grad_norm": 0.6184861063957214, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 9470 + }, + { + "epoch": 1.5326166033465363, + "grad_norm": 0.5991285443305969, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 9480 + }, + { + "epoch": 1.5342332875272815, + "grad_norm": 0.8176587820053101, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 9490 + }, + { + "epoch": 1.5358499717080267, + "grad_norm": 0.6473721861839294, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 9500 + }, + { + "epoch": 1.5374666558887722, + "grad_norm": 0.7319952845573425, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 9510 + }, + { + "epoch": 1.5390833400695174, + "grad_norm": 0.702900230884552, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 9520 + }, + { + "epoch": 1.5407000242502629, + "grad_norm": 0.7971600294113159, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 9530 + }, + { + "epoch": 1.542316708431008, + "grad_norm": 0.6527525186538696, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 9540 + }, + { + "epoch": 1.5439333926117533, + "grad_norm": 0.5791676044464111, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 9550 + }, + { + "epoch": 1.5455500767924986, + "grad_norm": 0.5619390606880188, + "learning_rate": 0.0002, + "loss": 0.7585, + "step": 9560 + }, + { + "epoch": 1.5471667609732438, + "grad_norm": 0.5701689124107361, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 9570 + }, + { + "epoch": 1.548783445153989, + "grad_norm": 0.47549352049827576, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 9580 + }, + { + "epoch": 1.5504001293347345, + "grad_norm": 0.8730611205101013, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 9590 + }, + { + "epoch": 1.5520168135154797, + "grad_norm": 0.6842091083526611, + "learning_rate": 0.0002, + "loss": 0.798, + "step": 9600 + }, + { + "epoch": 1.5536334976962252, + "grad_norm": 0.6675129532814026, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 9610 + }, + { + "epoch": 1.5552501818769704, + "grad_norm": 0.8173956274986267, + "learning_rate": 0.0002, + "loss": 0.7954, + "step": 9620 + }, + { + "epoch": 1.5568668660577156, + "grad_norm": 0.724947452545166, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 9630 + }, + { + "epoch": 1.5584835502384609, + "grad_norm": 0.6154758930206299, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 9640 + }, + { + "epoch": 1.560100234419206, + "grad_norm": 0.6072008013725281, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 9650 + }, + { + "epoch": 1.5617169185999515, + "grad_norm": 0.659010648727417, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 9660 + }, + { + "epoch": 1.5633336027806968, + "grad_norm": 0.65857994556427, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 9670 + }, + { + "epoch": 1.5649502869614422, + "grad_norm": 0.5914267301559448, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 9680 + }, + { + "epoch": 1.5665669711421875, + "grad_norm": 0.6248020529747009, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 9690 + }, + { + "epoch": 1.5681836553229327, + "grad_norm": 0.7147795557975769, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 9700 + }, + { + "epoch": 1.569800339503678, + "grad_norm": 0.7076232433319092, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 9710 + }, + { + "epoch": 1.5714170236844232, + "grad_norm": 0.6217400431632996, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 9720 + }, + { + "epoch": 1.5730337078651684, + "grad_norm": 0.6709911227226257, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 9730 + }, + { + "epoch": 1.5746503920459138, + "grad_norm": 0.749171257019043, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 9740 + }, + { + "epoch": 1.576267076226659, + "grad_norm": 0.6241145730018616, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 9750 + }, + { + "epoch": 1.5778837604074045, + "grad_norm": 0.4960934817790985, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 9760 + }, + { + "epoch": 1.5795004445881498, + "grad_norm": 0.6593309640884399, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 9770 + }, + { + "epoch": 1.581117128768895, + "grad_norm": 0.5814042091369629, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 9780 + }, + { + "epoch": 1.5827338129496402, + "grad_norm": 0.5936070680618286, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 9790 + }, + { + "epoch": 1.5843504971303854, + "grad_norm": 0.6454403400421143, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 9800 + }, + { + "epoch": 1.585967181311131, + "grad_norm": 0.7612107992172241, + "learning_rate": 0.0002, + "loss": 0.7677, + "step": 9810 + }, + { + "epoch": 1.5875838654918761, + "grad_norm": 0.6494482755661011, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 9820 + }, + { + "epoch": 1.5892005496726216, + "grad_norm": 0.7825694680213928, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 9830 + }, + { + "epoch": 1.5908172338533668, + "grad_norm": 0.6757757663726807, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 9840 + }, + { + "epoch": 1.592433918034112, + "grad_norm": 0.7105609178543091, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 9850 + }, + { + "epoch": 1.5940506022148573, + "grad_norm": 0.7596991062164307, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 9860 + }, + { + "epoch": 1.5956672863956025, + "grad_norm": 0.5681525468826294, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 9870 + }, + { + "epoch": 1.5972839705763477, + "grad_norm": 0.6090980768203735, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 9880 + }, + { + "epoch": 1.5989006547570932, + "grad_norm": 0.6271613240242004, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 9890 + }, + { + "epoch": 1.6005173389378387, + "grad_norm": 0.7656369805335999, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 9900 + }, + { + "epoch": 1.6021340231185839, + "grad_norm": 0.7504446506500244, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 9910 + }, + { + "epoch": 1.6037507072993291, + "grad_norm": 0.659656286239624, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 9920 + }, + { + "epoch": 1.6053673914800743, + "grad_norm": 0.6006826162338257, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 9930 + }, + { + "epoch": 1.6069840756608196, + "grad_norm": 0.7872757911682129, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 9940 + }, + { + "epoch": 1.6086007598415648, + "grad_norm": 0.5545852780342102, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 9950 + }, + { + "epoch": 1.6102174440223103, + "grad_norm": 0.7429468631744385, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 9960 + }, + { + "epoch": 1.6118341282030555, + "grad_norm": 0.6873556971549988, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 9970 + }, + { + "epoch": 1.613450812383801, + "grad_norm": 0.5874287486076355, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 9980 + }, + { + "epoch": 1.6150674965645462, + "grad_norm": 0.6039386987686157, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 9990 + }, + { + "epoch": 1.6166841807452914, + "grad_norm": 0.6233575940132141, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 10000 + }, + { + "epoch": 1.6183008649260366, + "grad_norm": 0.7676448225975037, + "learning_rate": 0.0002, + "loss": 0.7602, + "step": 10010 + }, + { + "epoch": 1.6199175491067819, + "grad_norm": 0.6565698385238647, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 10020 + }, + { + "epoch": 1.6215342332875273, + "grad_norm": 0.6787590384483337, + "learning_rate": 0.0002, + "loss": 0.7104, + "step": 10030 + }, + { + "epoch": 1.6231509174682726, + "grad_norm": 0.6137678027153015, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 10040 + }, + { + "epoch": 1.624767601649018, + "grad_norm": 0.5236800312995911, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 10050 + }, + { + "epoch": 1.6263842858297632, + "grad_norm": 0.7626367807388306, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 10060 + }, + { + "epoch": 1.6280009700105085, + "grad_norm": 0.5657260417938232, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 10070 + }, + { + "epoch": 1.6296176541912537, + "grad_norm": 0.4913991391658783, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 10080 + }, + { + "epoch": 1.631234338371999, + "grad_norm": 0.7715556621551514, + "learning_rate": 0.0002, + "loss": 0.7596, + "step": 10090 + }, + { + "epoch": 1.6328510225527442, + "grad_norm": 0.6509000062942505, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 10100 + }, + { + "epoch": 1.6344677067334896, + "grad_norm": 0.6215850114822388, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 10110 + }, + { + "epoch": 1.6360843909142349, + "grad_norm": 0.6956844329833984, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 10120 + }, + { + "epoch": 1.6377010750949803, + "grad_norm": 0.6111597418785095, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 10130 + }, + { + "epoch": 1.6393177592757255, + "grad_norm": 0.6518288850784302, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 10140 + }, + { + "epoch": 1.6409344434564708, + "grad_norm": 0.6914522051811218, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 10150 + }, + { + "epoch": 1.642551127637216, + "grad_norm": 0.63785719871521, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 10160 + }, + { + "epoch": 1.6441678118179612, + "grad_norm": 0.6379287838935852, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 10170 + }, + { + "epoch": 1.6457844959987067, + "grad_norm": 0.6793403029441833, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 10180 + }, + { + "epoch": 1.647401180179452, + "grad_norm": 0.6099132895469666, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 10190 + }, + { + "epoch": 1.6490178643601974, + "grad_norm": 0.5869854092597961, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 10200 + }, + { + "epoch": 1.6506345485409426, + "grad_norm": 0.7716999053955078, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 10210 + }, + { + "epoch": 1.6522512327216878, + "grad_norm": 0.6854110360145569, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 10220 + }, + { + "epoch": 1.653867916902433, + "grad_norm": 0.6957170367240906, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 10230 + }, + { + "epoch": 1.6554846010831783, + "grad_norm": 0.6932903528213501, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 10240 + }, + { + "epoch": 1.6571012852639235, + "grad_norm": 0.7713165283203125, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 10250 + }, + { + "epoch": 1.658717969444669, + "grad_norm": 0.7455793619155884, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 10260 + }, + { + "epoch": 1.6603346536254144, + "grad_norm": 0.5464168190956116, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 10270 + }, + { + "epoch": 1.6619513378061597, + "grad_norm": 0.6782926321029663, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 10280 + }, + { + "epoch": 1.663568021986905, + "grad_norm": 0.7962649464607239, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 10290 + }, + { + "epoch": 1.6651847061676501, + "grad_norm": 0.6814526319503784, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 10300 + }, + { + "epoch": 1.6668013903483954, + "grad_norm": 0.656895101070404, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 10310 + }, + { + "epoch": 1.6684180745291406, + "grad_norm": 0.6085672378540039, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 10320 + }, + { + "epoch": 1.670034758709886, + "grad_norm": 0.585508406162262, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 10330 + }, + { + "epoch": 1.6716514428906313, + "grad_norm": 0.6930184364318848, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 10340 + }, + { + "epoch": 1.6732681270713767, + "grad_norm": 0.575663149356842, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 10350 + }, + { + "epoch": 1.674884811252122, + "grad_norm": 0.582502543926239, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 10360 + }, + { + "epoch": 1.6765014954328672, + "grad_norm": 0.5668916702270508, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 10370 + }, + { + "epoch": 1.6781181796136124, + "grad_norm": 0.6070065498352051, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 10380 + }, + { + "epoch": 1.6797348637943577, + "grad_norm": 0.6141316294670105, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 10390 + }, + { + "epoch": 1.6813515479751031, + "grad_norm": 0.8359124064445496, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 10400 + }, + { + "epoch": 1.6829682321558483, + "grad_norm": 0.5378185510635376, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 10410 + }, + { + "epoch": 1.6845849163365938, + "grad_norm": 0.6959536075592041, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 10420 + }, + { + "epoch": 1.686201600517339, + "grad_norm": 0.6514357328414917, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 10430 + }, + { + "epoch": 1.6878182846980843, + "grad_norm": 0.7706646919250488, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 10440 + }, + { + "epoch": 1.6894349688788295, + "grad_norm": 0.6183337569236755, + "learning_rate": 0.0002, + "loss": 0.7673, + "step": 10450 + }, + { + "epoch": 1.6910516530595747, + "grad_norm": 0.6123278141021729, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 10460 + }, + { + "epoch": 1.69266833724032, + "grad_norm": 0.6894851326942444, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 10470 + }, + { + "epoch": 1.6942850214210654, + "grad_norm": 0.7497312426567078, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 10480 + }, + { + "epoch": 1.6959017056018106, + "grad_norm": 0.5968214273452759, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 10490 + }, + { + "epoch": 1.697518389782556, + "grad_norm": 0.6747927069664001, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 10500 + }, + { + "epoch": 1.6991350739633013, + "grad_norm": 0.5708310008049011, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 10510 + }, + { + "epoch": 1.7007517581440466, + "grad_norm": 0.606526792049408, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 10520 + }, + { + "epoch": 1.7023684423247918, + "grad_norm": 0.662011981010437, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 10530 + }, + { + "epoch": 1.703985126505537, + "grad_norm": 0.7583045363426208, + "learning_rate": 0.0002, + "loss": 0.7999, + "step": 10540 + }, + { + "epoch": 1.7056018106862825, + "grad_norm": 0.721632182598114, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 10550 + }, + { + "epoch": 1.7072184948670277, + "grad_norm": 0.6107715368270874, + "learning_rate": 0.0002, + "loss": 0.7407, + "step": 10560 + }, + { + "epoch": 1.7088351790477732, + "grad_norm": 0.6652471423149109, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 10570 + }, + { + "epoch": 1.7104518632285184, + "grad_norm": 0.6308087110519409, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 10580 + }, + { + "epoch": 1.7120685474092636, + "grad_norm": 0.5464386940002441, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 10590 + }, + { + "epoch": 1.7136852315900089, + "grad_norm": 0.6558911204338074, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 10600 + }, + { + "epoch": 1.715301915770754, + "grad_norm": 0.5665024518966675, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 10610 + }, + { + "epoch": 1.7169185999514993, + "grad_norm": 0.7888094186782837, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 10620 + }, + { + "epoch": 1.7185352841322448, + "grad_norm": 0.7084909081459045, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 10630 + }, + { + "epoch": 1.7201519683129902, + "grad_norm": 0.7982324361801147, + "learning_rate": 0.0002, + "loss": 0.7557, + "step": 10640 + }, + { + "epoch": 1.7217686524937355, + "grad_norm": 0.6418732404708862, + "learning_rate": 0.0002, + "loss": 0.7345, + "step": 10650 + }, + { + "epoch": 1.7233853366744807, + "grad_norm": 0.7636681795120239, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 10660 + }, + { + "epoch": 1.725002020855226, + "grad_norm": 0.5646875500679016, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 10670 + }, + { + "epoch": 1.7266187050359711, + "grad_norm": 0.5231260657310486, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10680 + }, + { + "epoch": 1.7282353892167164, + "grad_norm": 0.7635011672973633, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 10690 + }, + { + "epoch": 1.7298520733974618, + "grad_norm": 0.7518259286880493, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 10700 + }, + { + "epoch": 1.731468757578207, + "grad_norm": 0.7295602560043335, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 10710 + }, + { + "epoch": 1.7330854417589525, + "grad_norm": 0.6984632015228271, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 10720 + }, + { + "epoch": 1.7347021259396977, + "grad_norm": 0.6198219060897827, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 10730 + }, + { + "epoch": 1.736318810120443, + "grad_norm": 0.6957576274871826, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10740 + }, + { + "epoch": 1.7379354943011882, + "grad_norm": 0.6430263519287109, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 10750 + }, + { + "epoch": 1.7395521784819334, + "grad_norm": 0.6134995222091675, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 10760 + }, + { + "epoch": 1.741168862662679, + "grad_norm": 0.7209452986717224, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 10770 + }, + { + "epoch": 1.7427855468434241, + "grad_norm": 0.6735447645187378, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 10780 + }, + { + "epoch": 1.7444022310241696, + "grad_norm": 0.5605693459510803, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 10790 + }, + { + "epoch": 1.7460189152049148, + "grad_norm": 0.6882363557815552, + "learning_rate": 0.0002, + "loss": 0.7759, + "step": 10800 + }, + { + "epoch": 1.74763559938566, + "grad_norm": 0.6386259198188782, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 10810 + }, + { + "epoch": 1.7492522835664053, + "grad_norm": 0.6529015302658081, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 10820 + }, + { + "epoch": 1.7508689677471505, + "grad_norm": 0.5664082765579224, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 10830 + }, + { + "epoch": 1.7524856519278957, + "grad_norm": 0.7532684206962585, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 10840 + }, + { + "epoch": 1.7541023361086412, + "grad_norm": 0.77171391248703, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 10850 + }, + { + "epoch": 1.7557190202893864, + "grad_norm": 0.7255431413650513, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 10860 + }, + { + "epoch": 1.7573357044701319, + "grad_norm": 0.763083279132843, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 10870 + }, + { + "epoch": 1.758952388650877, + "grad_norm": 0.6042402982711792, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 10880 + }, + { + "epoch": 1.7605690728316223, + "grad_norm": 0.7642518281936646, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 10890 + }, + { + "epoch": 1.7621857570123676, + "grad_norm": 0.6347904801368713, + "learning_rate": 0.0002, + "loss": 0.746, + "step": 10900 + }, + { + "epoch": 1.7638024411931128, + "grad_norm": 0.5371627807617188, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 10910 + }, + { + "epoch": 1.7654191253738583, + "grad_norm": 0.6840225458145142, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 10920 + }, + { + "epoch": 1.7670358095546035, + "grad_norm": 0.5288469195365906, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 10930 + }, + { + "epoch": 1.768652493735349, + "grad_norm": 0.69020676612854, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 10940 + }, + { + "epoch": 1.7702691779160942, + "grad_norm": 0.5943242311477661, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 10950 + }, + { + "epoch": 1.7718858620968394, + "grad_norm": 0.5616418123245239, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 10960 + }, + { + "epoch": 1.7735025462775846, + "grad_norm": 0.7209470868110657, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 10970 + }, + { + "epoch": 1.7751192304583299, + "grad_norm": 0.6657957434654236, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 10980 + }, + { + "epoch": 1.776735914639075, + "grad_norm": 0.6469064950942993, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 10990 + }, + { + "epoch": 1.7783525988198206, + "grad_norm": 0.6615678071975708, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 11000 + }, + { + "epoch": 1.779969283000566, + "grad_norm": 0.6722439527511597, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 11010 + }, + { + "epoch": 1.7815859671813112, + "grad_norm": 0.634136974811554, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 11020 + }, + { + "epoch": 1.7832026513620565, + "grad_norm": 0.6024377346038818, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 11030 + }, + { + "epoch": 1.7848193355428017, + "grad_norm": 0.6909403800964355, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 11040 + }, + { + "epoch": 1.786436019723547, + "grad_norm": 0.7148767709732056, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 11050 + }, + { + "epoch": 1.7880527039042922, + "grad_norm": 0.7442979216575623, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 11060 + }, + { + "epoch": 1.7896693880850376, + "grad_norm": 0.6830431818962097, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 11070 + }, + { + "epoch": 1.7912860722657828, + "grad_norm": 0.9172667264938354, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 11080 + }, + { + "epoch": 1.7929027564465283, + "grad_norm": 0.6799490451812744, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 11090 + }, + { + "epoch": 1.7945194406272735, + "grad_norm": 0.7617024779319763, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 11100 + }, + { + "epoch": 1.7961361248080188, + "grad_norm": 0.7701810002326965, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 11110 + }, + { + "epoch": 1.797752808988764, + "grad_norm": 0.7454385757446289, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 11120 + }, + { + "epoch": 1.7993694931695092, + "grad_norm": 0.6121436953544617, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 11130 + }, + { + "epoch": 1.8009861773502547, + "grad_norm": 0.6237571835517883, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 11140 + }, + { + "epoch": 1.802602861531, + "grad_norm": 0.6818515658378601, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 11150 + }, + { + "epoch": 1.8042195457117454, + "grad_norm": 0.7768308520317078, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 11160 + }, + { + "epoch": 1.8058362298924906, + "grad_norm": 0.6875537633895874, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 11170 + }, + { + "epoch": 1.8074529140732358, + "grad_norm": 0.7950584888458252, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 11180 + }, + { + "epoch": 1.809069598253981, + "grad_norm": 0.8210248351097107, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 11190 + }, + { + "epoch": 1.8106862824347263, + "grad_norm": 0.6674110889434814, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 11200 + }, + { + "epoch": 1.8123029666154715, + "grad_norm": 0.6261674761772156, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 11210 + }, + { + "epoch": 1.813919650796217, + "grad_norm": 0.6484741568565369, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 11220 + }, + { + "epoch": 1.8155363349769622, + "grad_norm": 0.6231244206428528, + "learning_rate": 0.0002, + "loss": 0.7718, + "step": 11230 + }, + { + "epoch": 1.8171530191577077, + "grad_norm": 0.7243146896362305, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 11240 + }, + { + "epoch": 1.818769703338453, + "grad_norm": 0.6776193380355835, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 11250 + }, + { + "epoch": 1.8203863875191981, + "grad_norm": 0.5973618030548096, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 11260 + }, + { + "epoch": 1.8220030716999434, + "grad_norm": 0.6451361179351807, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 11270 + }, + { + "epoch": 1.8236197558806886, + "grad_norm": 0.5963068008422852, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 11280 + }, + { + "epoch": 1.825236440061434, + "grad_norm": 0.536902129650116, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 11290 + }, + { + "epoch": 1.8268531242421793, + "grad_norm": 0.6993787288665771, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 11300 + }, + { + "epoch": 1.8284698084229247, + "grad_norm": 0.6135255098342896, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 11310 + }, + { + "epoch": 1.83008649260367, + "grad_norm": 0.6057423949241638, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 11320 + }, + { + "epoch": 1.8317031767844152, + "grad_norm": 0.6598812341690063, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 11330 + }, + { + "epoch": 1.8333198609651604, + "grad_norm": 0.6075948476791382, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 11340 + }, + { + "epoch": 1.8349365451459057, + "grad_norm": 0.7065447568893433, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 11350 + }, + { + "epoch": 1.8365532293266509, + "grad_norm": 0.680526614189148, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 11360 + }, + { + "epoch": 1.8381699135073963, + "grad_norm": 0.6356695294380188, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 11370 + }, + { + "epoch": 1.8397865976881416, + "grad_norm": 0.6399052143096924, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 11380 + }, + { + "epoch": 1.841403281868887, + "grad_norm": 0.6125704050064087, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 11390 + }, + { + "epoch": 1.8430199660496323, + "grad_norm": 0.7124643325805664, + "learning_rate": 0.0002, + "loss": 0.755, + "step": 11400 + }, + { + "epoch": 1.8446366502303775, + "grad_norm": 0.6099604964256287, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 11410 + }, + { + "epoch": 1.8462533344111227, + "grad_norm": 0.7338208556175232, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 11420 + }, + { + "epoch": 1.847870018591868, + "grad_norm": 0.7534668445587158, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 11430 + }, + { + "epoch": 1.8494867027726134, + "grad_norm": 0.6135470271110535, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 11440 + }, + { + "epoch": 1.8511033869533586, + "grad_norm": 0.6229309439659119, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 11450 + }, + { + "epoch": 1.852720071134104, + "grad_norm": 0.706423282623291, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 11460 + }, + { + "epoch": 1.8543367553148493, + "grad_norm": 0.5460049510002136, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 11470 + }, + { + "epoch": 1.8559534394955945, + "grad_norm": 0.6616711020469666, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 11480 + }, + { + "epoch": 1.8575701236763398, + "grad_norm": 0.6372783184051514, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 11490 + }, + { + "epoch": 1.859186807857085, + "grad_norm": 0.7162668108940125, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 11500 + }, + { + "epoch": 1.8608034920378305, + "grad_norm": 0.6605209708213806, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 11510 + }, + { + "epoch": 1.8624201762185757, + "grad_norm": 0.6933956742286682, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 11520 + }, + { + "epoch": 1.8640368603993211, + "grad_norm": 0.6582090854644775, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 11530 + }, + { + "epoch": 1.8656535445800664, + "grad_norm": 0.6416500806808472, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 11540 + }, + { + "epoch": 1.8672702287608116, + "grad_norm": 0.5434312224388123, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 11550 + }, + { + "epoch": 1.8688869129415568, + "grad_norm": 0.6827567219734192, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 11560 + }, + { + "epoch": 1.870503597122302, + "grad_norm": 0.7354370951652527, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 11570 + }, + { + "epoch": 1.8721202813030473, + "grad_norm": 0.590372622013092, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 11580 + }, + { + "epoch": 1.8737369654837928, + "grad_norm": 0.853183925151825, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 11590 + }, + { + "epoch": 1.875353649664538, + "grad_norm": 0.822678804397583, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 11600 + }, + { + "epoch": 1.8769703338452834, + "grad_norm": 0.6591550707817078, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 11610 + }, + { + "epoch": 1.8785870180260287, + "grad_norm": 0.7475301623344421, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 11620 + }, + { + "epoch": 1.880203702206774, + "grad_norm": 0.6390765309333801, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 11630 + }, + { + "epoch": 1.8818203863875191, + "grad_norm": 0.6589758992195129, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 11640 + }, + { + "epoch": 1.8834370705682644, + "grad_norm": 0.6765508651733398, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 11650 + }, + { + "epoch": 1.8850537547490098, + "grad_norm": 0.6527857780456543, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 11660 + }, + { + "epoch": 1.886670438929755, + "grad_norm": 0.6642923951148987, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 11670 + }, + { + "epoch": 1.8882871231105005, + "grad_norm": 0.6945584416389465, + "learning_rate": 0.0002, + "loss": 0.7701, + "step": 11680 + }, + { + "epoch": 1.8899038072912457, + "grad_norm": 0.694018542766571, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 11690 + }, + { + "epoch": 1.891520491471991, + "grad_norm": 0.7237417101860046, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 11700 + }, + { + "epoch": 1.8931371756527362, + "grad_norm": 0.7401309609413147, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 11710 + }, + { + "epoch": 1.8947538598334814, + "grad_norm": 0.6537784337997437, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 11720 + }, + { + "epoch": 1.8963705440142267, + "grad_norm": 0.7398539185523987, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 11730 + }, + { + "epoch": 1.8979872281949721, + "grad_norm": 0.6696075797080994, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 11740 + }, + { + "epoch": 1.8996039123757174, + "grad_norm": 0.6014142036437988, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 11750 + }, + { + "epoch": 1.9012205965564628, + "grad_norm": 0.7023524641990662, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 11760 + }, + { + "epoch": 1.902837280737208, + "grad_norm": 0.739973783493042, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 11770 + }, + { + "epoch": 1.9044539649179533, + "grad_norm": 0.5576770901679993, + "learning_rate": 0.0002, + "loss": 0.7848, + "step": 11780 + }, + { + "epoch": 1.9060706490986985, + "grad_norm": 0.6907393932342529, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 11790 + }, + { + "epoch": 1.9076873332794437, + "grad_norm": 0.6934581995010376, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 11800 + }, + { + "epoch": 1.9093040174601892, + "grad_norm": 0.591774582862854, + "learning_rate": 0.0002, + "loss": 0.7199, + "step": 11810 + }, + { + "epoch": 1.9109207016409344, + "grad_norm": 0.6249791383743286, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 11820 + }, + { + "epoch": 1.9125373858216799, + "grad_norm": 0.6755744218826294, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 11830 + }, + { + "epoch": 1.914154070002425, + "grad_norm": 0.7286285161972046, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 11840 + }, + { + "epoch": 1.9157707541831703, + "grad_norm": 0.7867850065231323, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 11850 + }, + { + "epoch": 1.9173874383639156, + "grad_norm": 0.6283972859382629, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 11860 + }, + { + "epoch": 1.9190041225446608, + "grad_norm": 0.605823814868927, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 11870 + }, + { + "epoch": 1.920620806725406, + "grad_norm": 0.5927976965904236, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 11880 + }, + { + "epoch": 1.9222374909061515, + "grad_norm": 0.5974002480506897, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 11890 + }, + { + "epoch": 1.923854175086897, + "grad_norm": 0.7091866135597229, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 11900 + }, + { + "epoch": 1.9254708592676422, + "grad_norm": 0.72496497631073, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 11910 + }, + { + "epoch": 1.9270875434483874, + "grad_norm": 0.6131896376609802, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 11920 + }, + { + "epoch": 1.9287042276291326, + "grad_norm": 0.6556436419487, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 11930 + }, + { + "epoch": 1.9303209118098779, + "grad_norm": 0.622932493686676, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 11940 + }, + { + "epoch": 1.931937595990623, + "grad_norm": 0.6618631482124329, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 11950 + }, + { + "epoch": 1.9335542801713685, + "grad_norm": 0.630966305732727, + "learning_rate": 0.0002, + "loss": 0.8332, + "step": 11960 + }, + { + "epoch": 1.9351709643521138, + "grad_norm": 0.6336734890937805, + "learning_rate": 0.0002, + "loss": 0.6854, + "step": 11970 + }, + { + "epoch": 1.9367876485328592, + "grad_norm": 0.655403196811676, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 11980 + }, + { + "epoch": 1.9384043327136045, + "grad_norm": 0.5640574097633362, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 11990 + }, + { + "epoch": 1.9400210168943497, + "grad_norm": 0.6322951316833496, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 12000 + }, + { + "epoch": 1.941637701075095, + "grad_norm": 0.615703821182251, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 12010 + }, + { + "epoch": 1.9432543852558402, + "grad_norm": 0.6487536430358887, + "learning_rate": 0.0002, + "loss": 0.786, + "step": 12020 + }, + { + "epoch": 1.9448710694365856, + "grad_norm": 0.9209630489349365, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 12030 + }, + { + "epoch": 1.9464877536173308, + "grad_norm": 0.67485511302948, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 12040 + }, + { + "epoch": 1.9481044377980763, + "grad_norm": 0.6831230521202087, + "learning_rate": 0.0002, + "loss": 0.7551, + "step": 12050 + }, + { + "epoch": 1.9497211219788215, + "grad_norm": 0.6578302383422852, + "learning_rate": 0.0002, + "loss": 0.7546, + "step": 12060 + }, + { + "epoch": 1.9513378061595668, + "grad_norm": 0.9975938200950623, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 12070 + }, + { + "epoch": 1.952954490340312, + "grad_norm": 0.6637365221977234, + "learning_rate": 0.0002, + "loss": 0.7952, + "step": 12080 + }, + { + "epoch": 1.9545711745210572, + "grad_norm": 0.605707049369812, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 12090 + }, + { + "epoch": 1.9561878587018025, + "grad_norm": 0.6584440469741821, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 12100 + }, + { + "epoch": 1.957804542882548, + "grad_norm": 0.6070835590362549, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 12110 + }, + { + "epoch": 1.9594212270632931, + "grad_norm": 0.7862601280212402, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 12120 + }, + { + "epoch": 1.9610379112440386, + "grad_norm": 0.8175255060195923, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 12130 + }, + { + "epoch": 1.9626545954247838, + "grad_norm": 0.5648472905158997, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 12140 + }, + { + "epoch": 1.964271279605529, + "grad_norm": 0.6591973304748535, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 12150 + }, + { + "epoch": 1.9658879637862743, + "grad_norm": 0.5960676074028015, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 12160 + }, + { + "epoch": 1.9675046479670195, + "grad_norm": 0.7272544503211975, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 12170 + }, + { + "epoch": 1.969121332147765, + "grad_norm": 0.7176699042320251, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 12180 + }, + { + "epoch": 1.9707380163285102, + "grad_norm": 0.6927123665809631, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 12190 + }, + { + "epoch": 1.9723547005092557, + "grad_norm": 0.5536034107208252, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 12200 + }, + { + "epoch": 1.9739713846900009, + "grad_norm": 0.8348390460014343, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 12210 + }, + { + "epoch": 1.9755880688707461, + "grad_norm": 0.6591181755065918, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 12220 + }, + { + "epoch": 1.9772047530514913, + "grad_norm": 1.0624109506607056, + "learning_rate": 0.0002, + "loss": 0.763, + "step": 12230 + }, + { + "epoch": 1.9788214372322366, + "grad_norm": 0.9265586137771606, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 12240 + }, + { + "epoch": 1.9804381214129818, + "grad_norm": 0.5998196005821228, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 12250 + }, + { + "epoch": 1.9820548055937273, + "grad_norm": 0.6960851550102234, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 12260 + }, + { + "epoch": 1.9836714897744727, + "grad_norm": 0.7674502730369568, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 12270 + }, + { + "epoch": 1.985288173955218, + "grad_norm": 0.6407275795936584, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 12280 + }, + { + "epoch": 1.9869048581359632, + "grad_norm": 0.6673079133033752, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 12290 + }, + { + "epoch": 1.9885215423167084, + "grad_norm": 0.6989844441413879, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 12300 + }, + { + "epoch": 1.9901382264974536, + "grad_norm": 0.7564442157745361, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 12310 + }, + { + "epoch": 1.9917549106781989, + "grad_norm": 0.6385478973388672, + "learning_rate": 0.0002, + "loss": 0.7719, + "step": 12320 + }, + { + "epoch": 1.9933715948589443, + "grad_norm": 0.7193717956542969, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 12330 + }, + { + "epoch": 1.9949882790396896, + "grad_norm": 0.7987112402915955, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 12340 + }, + { + "epoch": 1.996604963220435, + "grad_norm": 0.7260826826095581, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 12350 + }, + { + "epoch": 1.9982216474011802, + "grad_norm": 0.7968255281448364, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 12360 + }, + { + "epoch": 1.9998383315819255, + "grad_norm": 0.6893062591552734, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 12370 + }, + { + "epoch": 2.0, + "eval_loss": 1.1044032573699951, + "eval_runtime": 122.1508, + "eval_samples_per_second": 6.001, + "eval_steps_per_second": 0.753, + "step": 12371 + }, + { + "epoch": 2.0014550157626707, + "grad_norm": 0.7775409817695618, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 12380 + }, + { + "epoch": 2.003071699943416, + "grad_norm": 0.76218581199646, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 12390 + }, + { + "epoch": 2.004688384124161, + "grad_norm": 0.5677764415740967, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 12400 + }, + { + "epoch": 2.006305068304907, + "grad_norm": 0.808442234992981, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 12410 + }, + { + "epoch": 2.007921752485652, + "grad_norm": 0.7144765257835388, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 12420 + }, + { + "epoch": 2.0095384366663973, + "grad_norm": 0.6914031505584717, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 12430 + }, + { + "epoch": 2.0111551208471425, + "grad_norm": 0.7581454515457153, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 12440 + }, + { + "epoch": 2.0127718050278878, + "grad_norm": 0.8388504981994629, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 12450 + }, + { + "epoch": 2.014388489208633, + "grad_norm": 0.6716406941413879, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 12460 + }, + { + "epoch": 2.0160051733893782, + "grad_norm": 0.898902416229248, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 12470 + }, + { + "epoch": 2.0176218575701235, + "grad_norm": 0.6432679891586304, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 12480 + }, + { + "epoch": 2.019238541750869, + "grad_norm": 0.8021109104156494, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 12490 + }, + { + "epoch": 2.0208552259316144, + "grad_norm": 0.7039216756820679, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 12500 + }, + { + "epoch": 2.0224719101123596, + "grad_norm": 0.646531879901886, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 12510 + }, + { + "epoch": 2.024088594293105, + "grad_norm": 0.783704400062561, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 12520 + }, + { + "epoch": 2.02570527847385, + "grad_norm": 0.8805046677589417, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 12530 + }, + { + "epoch": 2.0273219626545953, + "grad_norm": 0.7289270758628845, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 12540 + }, + { + "epoch": 2.0289386468353405, + "grad_norm": 0.71653151512146, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 12550 + }, + { + "epoch": 2.030555331016086, + "grad_norm": 0.73281329870224, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 12560 + }, + { + "epoch": 2.0321720151968314, + "grad_norm": 0.6657090187072754, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 12570 + }, + { + "epoch": 2.0337886993775767, + "grad_norm": 0.8241133093833923, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 12580 + }, + { + "epoch": 2.035405383558322, + "grad_norm": 0.5834135413169861, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 12590 + }, + { + "epoch": 2.037022067739067, + "grad_norm": 0.84502112865448, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 12600 + }, + { + "epoch": 2.0386387519198124, + "grad_norm": 0.8952481746673584, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 12610 + }, + { + "epoch": 2.0402554361005576, + "grad_norm": 0.7801461815834045, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 12620 + }, + { + "epoch": 2.041872120281303, + "grad_norm": 0.6788367033004761, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 12630 + }, + { + "epoch": 2.0434888044620485, + "grad_norm": 0.7241756319999695, + "learning_rate": 0.0002, + "loss": 0.6162, + "step": 12640 + }, + { + "epoch": 2.0451054886427937, + "grad_norm": 0.6933388113975525, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 12650 + }, + { + "epoch": 2.046722172823539, + "grad_norm": 0.8029746413230896, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 12660 + }, + { + "epoch": 2.048338857004284, + "grad_norm": 0.946399986743927, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 12670 + }, + { + "epoch": 2.0499555411850294, + "grad_norm": 0.7072678804397583, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 12680 + }, + { + "epoch": 2.0515722253657747, + "grad_norm": 0.6810618042945862, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 12690 + }, + { + "epoch": 2.05318890954652, + "grad_norm": 0.7661160230636597, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 12700 + }, + { + "epoch": 2.0548055937272656, + "grad_norm": 0.6350653767585754, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 12710 + }, + { + "epoch": 2.056422277908011, + "grad_norm": 0.861890971660614, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 12720 + }, + { + "epoch": 2.058038962088756, + "grad_norm": 0.6489875912666321, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 12730 + }, + { + "epoch": 2.0596556462695013, + "grad_norm": 0.8268506526947021, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 12740 + }, + { + "epoch": 2.0612723304502465, + "grad_norm": 0.607679545879364, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 12750 + }, + { + "epoch": 2.0628890146309917, + "grad_norm": 0.6754153370857239, + "learning_rate": 0.0002, + "loss": 0.6649, + "step": 12760 + }, + { + "epoch": 2.064505698811737, + "grad_norm": 0.7263124585151672, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 12770 + }, + { + "epoch": 2.0661223829924826, + "grad_norm": 0.6986154317855835, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 12780 + }, + { + "epoch": 2.067739067173228, + "grad_norm": 0.7768576741218567, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 12790 + }, + { + "epoch": 2.069355751353973, + "grad_norm": 0.7546762824058533, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 12800 + }, + { + "epoch": 2.0709724355347183, + "grad_norm": 0.7588880062103271, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 12810 + }, + { + "epoch": 2.0725891197154636, + "grad_norm": 0.7457242608070374, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 12820 + }, + { + "epoch": 2.074205803896209, + "grad_norm": 0.6983516812324524, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 12830 + }, + { + "epoch": 2.075822488076954, + "grad_norm": 0.7950928807258606, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 12840 + }, + { + "epoch": 2.0774391722576993, + "grad_norm": 0.9248087406158447, + "learning_rate": 0.0002, + "loss": 0.6603, + "step": 12850 + }, + { + "epoch": 2.079055856438445, + "grad_norm": 0.7229493260383606, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 12860 + }, + { + "epoch": 2.08067254061919, + "grad_norm": 0.5710847973823547, + "learning_rate": 0.0002, + "loss": 0.6702, + "step": 12870 + }, + { + "epoch": 2.0822892247999354, + "grad_norm": 0.9580423831939697, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 12880 + }, + { + "epoch": 2.0839059089806806, + "grad_norm": 0.7399665713310242, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 12890 + }, + { + "epoch": 2.085522593161426, + "grad_norm": 0.7981410622596741, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 12900 + }, + { + "epoch": 2.087139277342171, + "grad_norm": 0.870759904384613, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 12910 + }, + { + "epoch": 2.0887559615229163, + "grad_norm": 0.7001481652259827, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 12920 + }, + { + "epoch": 2.090372645703662, + "grad_norm": 0.6745418310165405, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 12930 + }, + { + "epoch": 2.0919893298844072, + "grad_norm": 0.7739067673683167, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 12940 + }, + { + "epoch": 2.0936060140651525, + "grad_norm": 0.6742934584617615, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 12950 + }, + { + "epoch": 2.0952226982458977, + "grad_norm": 0.7270349860191345, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 12960 + }, + { + "epoch": 2.096839382426643, + "grad_norm": 0.7150624394416809, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 12970 + }, + { + "epoch": 2.098456066607388, + "grad_norm": 0.7734767198562622, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 12980 + }, + { + "epoch": 2.1000727507881334, + "grad_norm": 0.7618662118911743, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 12990 + }, + { + "epoch": 2.101689434968879, + "grad_norm": 0.6557944416999817, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 13000 + }, + { + "epoch": 2.1033061191496243, + "grad_norm": 0.8786448240280151, + "learning_rate": 0.0002, + "loss": 0.7268, + "step": 13010 + }, + { + "epoch": 2.1049228033303695, + "grad_norm": 0.6878724098205566, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 13020 + }, + { + "epoch": 2.1065394875111147, + "grad_norm": 0.822318971157074, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 13030 + }, + { + "epoch": 2.10815617169186, + "grad_norm": 0.831468939781189, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 13040 + }, + { + "epoch": 2.109772855872605, + "grad_norm": 0.7699505686759949, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 13050 + }, + { + "epoch": 2.1113895400533504, + "grad_norm": 0.7559016346931458, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 13060 + }, + { + "epoch": 2.1130062242340957, + "grad_norm": 0.6942209601402283, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 13070 + }, + { + "epoch": 2.1146229084148414, + "grad_norm": 0.6098947525024414, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 13080 + }, + { + "epoch": 2.1162395925955866, + "grad_norm": 0.6499016284942627, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 13090 + }, + { + "epoch": 2.117856276776332, + "grad_norm": 0.7719953060150146, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 13100 + }, + { + "epoch": 2.119472960957077, + "grad_norm": 0.6708134412765503, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 13110 + }, + { + "epoch": 2.1210896451378223, + "grad_norm": 0.8119585514068604, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 13120 + }, + { + "epoch": 2.1227063293185675, + "grad_norm": 0.6947157979011536, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 13130 + }, + { + "epoch": 2.1243230134993127, + "grad_norm": 0.8831837773323059, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 13140 + }, + { + "epoch": 2.1259396976800584, + "grad_norm": 0.7266910672187805, + "learning_rate": 0.0002, + "loss": 0.6997, + "step": 13150 + }, + { + "epoch": 2.1275563818608036, + "grad_norm": 0.8864351511001587, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 13160 + }, + { + "epoch": 2.129173066041549, + "grad_norm": 0.8104248046875, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 13170 + }, + { + "epoch": 2.130789750222294, + "grad_norm": 0.6077079772949219, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 13180 + }, + { + "epoch": 2.1324064344030393, + "grad_norm": 0.6874213814735413, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 13190 + }, + { + "epoch": 2.1340231185837846, + "grad_norm": 0.7134367823600769, + "learning_rate": 0.0002, + "loss": 0.642, + "step": 13200 + }, + { + "epoch": 2.13563980276453, + "grad_norm": 0.6101235151290894, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 13210 + }, + { + "epoch": 2.137256486945275, + "grad_norm": 0.6042411923408508, + "learning_rate": 0.0002, + "loss": 0.6529, + "step": 13220 + }, + { + "epoch": 2.1388731711260207, + "grad_norm": 0.914601743221283, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 13230 + }, + { + "epoch": 2.140489855306766, + "grad_norm": 0.7104284167289734, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 13240 + }, + { + "epoch": 2.142106539487511, + "grad_norm": 0.664395272731781, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 13250 + }, + { + "epoch": 2.1437232236682564, + "grad_norm": 0.6991241574287415, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 13260 + }, + { + "epoch": 2.1453399078490016, + "grad_norm": 0.5469560623168945, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 13270 + }, + { + "epoch": 2.146956592029747, + "grad_norm": 0.8454998135566711, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 13280 + }, + { + "epoch": 2.148573276210492, + "grad_norm": 0.7088868618011475, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 13290 + }, + { + "epoch": 2.1501899603912378, + "grad_norm": 0.7002687454223633, + "learning_rate": 0.0002, + "loss": 0.6835, + "step": 13300 + }, + { + "epoch": 2.151806644571983, + "grad_norm": 0.7785214781761169, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 13310 + }, + { + "epoch": 2.1534233287527282, + "grad_norm": 0.8049132227897644, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 13320 + }, + { + "epoch": 2.1550400129334735, + "grad_norm": 0.8062595129013062, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 13330 + }, + { + "epoch": 2.1566566971142187, + "grad_norm": 0.6208319067955017, + "learning_rate": 0.0002, + "loss": 0.6603, + "step": 13340 + }, + { + "epoch": 2.158273381294964, + "grad_norm": 0.7519655823707581, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 13350 + }, + { + "epoch": 2.159890065475709, + "grad_norm": 0.7645747065544128, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 13360 + }, + { + "epoch": 2.1615067496564544, + "grad_norm": 0.6847302913665771, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 13370 + }, + { + "epoch": 2.1631234338372, + "grad_norm": 0.8630441427230835, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 13380 + }, + { + "epoch": 2.1647401180179453, + "grad_norm": 0.7947702407836914, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 13390 + }, + { + "epoch": 2.1663568021986905, + "grad_norm": 0.6836977005004883, + "learning_rate": 0.0002, + "loss": 0.7206, + "step": 13400 + }, + { + "epoch": 2.1679734863794358, + "grad_norm": 0.7340566515922546, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 13410 + }, + { + "epoch": 2.169590170560181, + "grad_norm": 0.7075738906860352, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 13420 + }, + { + "epoch": 2.1712068547409262, + "grad_norm": 0.7080879807472229, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 13430 + }, + { + "epoch": 2.1728235389216715, + "grad_norm": 0.6218613386154175, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 13440 + }, + { + "epoch": 2.174440223102417, + "grad_norm": 0.8211479187011719, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 13450 + }, + { + "epoch": 2.1760569072831624, + "grad_norm": 0.864466667175293, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 13460 + }, + { + "epoch": 2.1776735914639076, + "grad_norm": 0.7943857908248901, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 13470 + }, + { + "epoch": 2.179290275644653, + "grad_norm": 0.78728187084198, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 13480 + }, + { + "epoch": 2.180906959825398, + "grad_norm": 0.697527289390564, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 13490 + }, + { + "epoch": 2.1825236440061433, + "grad_norm": 0.8205804228782654, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 13500 + }, + { + "epoch": 2.1841403281868885, + "grad_norm": 0.8709042072296143, + "learning_rate": 0.0002, + "loss": 0.7227, + "step": 13510 + }, + { + "epoch": 2.1857570123676338, + "grad_norm": 0.6228537559509277, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 13520 + }, + { + "epoch": 2.1873736965483794, + "grad_norm": 0.9566980004310608, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 13530 + }, + { + "epoch": 2.1889903807291247, + "grad_norm": 0.7128894329071045, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 13540 + }, + { + "epoch": 2.19060706490987, + "grad_norm": 0.6888654232025146, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 13550 + }, + { + "epoch": 2.192223749090615, + "grad_norm": 0.6444337368011475, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 13560 + }, + { + "epoch": 2.1938404332713604, + "grad_norm": 0.8008806705474854, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 13570 + }, + { + "epoch": 2.1954571174521056, + "grad_norm": 0.8482748866081238, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 13580 + }, + { + "epoch": 2.197073801632851, + "grad_norm": 0.8584157228469849, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 13590 + }, + { + "epoch": 2.1986904858135965, + "grad_norm": 0.7513734698295593, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 13600 + }, + { + "epoch": 2.2003071699943417, + "grad_norm": 0.7864262461662292, + "learning_rate": 0.0002, + "loss": 0.6632, + "step": 13610 + }, + { + "epoch": 2.201923854175087, + "grad_norm": 0.8493645191192627, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 13620 + }, + { + "epoch": 2.203540538355832, + "grad_norm": 0.6902140974998474, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 13630 + }, + { + "epoch": 2.2051572225365774, + "grad_norm": 0.8711254596710205, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 13640 + }, + { + "epoch": 2.2067739067173227, + "grad_norm": 0.7832191586494446, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 13650 + }, + { + "epoch": 2.208390590898068, + "grad_norm": 0.5668176412582397, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 13660 + }, + { + "epoch": 2.2100072750788136, + "grad_norm": 0.8648375272750854, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 13670 + }, + { + "epoch": 2.211623959259559, + "grad_norm": 0.7643089890480042, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 13680 + }, + { + "epoch": 2.213240643440304, + "grad_norm": 0.6293777823448181, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 13690 + }, + { + "epoch": 2.2148573276210493, + "grad_norm": 0.6459372639656067, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 13700 + }, + { + "epoch": 2.2164740118017945, + "grad_norm": 0.7060744166374207, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 13710 + }, + { + "epoch": 2.2180906959825397, + "grad_norm": 0.674109160900116, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 13720 + }, + { + "epoch": 2.219707380163285, + "grad_norm": 0.830392062664032, + "learning_rate": 0.0002, + "loss": 0.6529, + "step": 13730 + }, + { + "epoch": 2.2213240643440306, + "grad_norm": 0.6474477052688599, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 13740 + }, + { + "epoch": 2.222940748524776, + "grad_norm": 0.7037909626960754, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 13750 + }, + { + "epoch": 2.224557432705521, + "grad_norm": 0.6554131507873535, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 13760 + }, + { + "epoch": 2.2261741168862663, + "grad_norm": 0.7822230458259583, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 13770 + }, + { + "epoch": 2.2277908010670116, + "grad_norm": 0.9082167744636536, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 13780 + }, + { + "epoch": 2.229407485247757, + "grad_norm": 0.7918276190757751, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 13790 + }, + { + "epoch": 2.231024169428502, + "grad_norm": 0.7354569435119629, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 13800 + }, + { + "epoch": 2.2326408536092472, + "grad_norm": 0.8265249133110046, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 13810 + }, + { + "epoch": 2.234257537789993, + "grad_norm": 0.6653847098350525, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 13820 + }, + { + "epoch": 2.235874221970738, + "grad_norm": 0.7157923579216003, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 13830 + }, + { + "epoch": 2.2374909061514834, + "grad_norm": 0.7110323309898376, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 13840 + }, + { + "epoch": 2.2391075903322286, + "grad_norm": 0.7155357599258423, + "learning_rate": 0.0002, + "loss": 0.6913, + "step": 13850 + }, + { + "epoch": 2.240724274512974, + "grad_norm": 1.0177817344665527, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 13860 + }, + { + "epoch": 2.242340958693719, + "grad_norm": 0.7601948380470276, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 13870 + }, + { + "epoch": 2.2439576428744643, + "grad_norm": 0.7628820538520813, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 13880 + }, + { + "epoch": 2.24557432705521, + "grad_norm": 0.7089297771453857, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 13890 + }, + { + "epoch": 2.247191011235955, + "grad_norm": 0.695178210735321, + "learning_rate": 0.0002, + "loss": 0.7236, + "step": 13900 + }, + { + "epoch": 2.2488076954167004, + "grad_norm": 0.7631948590278625, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 13910 + }, + { + "epoch": 2.2504243795974457, + "grad_norm": 0.8203101754188538, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 13920 + }, + { + "epoch": 2.252041063778191, + "grad_norm": 0.8099079728126526, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 13930 + }, + { + "epoch": 2.253657747958936, + "grad_norm": 0.6498546004295349, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 13940 + }, + { + "epoch": 2.2552744321396814, + "grad_norm": 0.7797415256500244, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 13950 + }, + { + "epoch": 2.2568911163204266, + "grad_norm": 0.8254124522209167, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 13960 + }, + { + "epoch": 2.2585078005011723, + "grad_norm": 0.6327953338623047, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 13970 + }, + { + "epoch": 2.2601244846819175, + "grad_norm": 0.734194278717041, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 13980 + }, + { + "epoch": 2.2617411688626627, + "grad_norm": 0.9014202952384949, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 13990 + }, + { + "epoch": 2.263357853043408, + "grad_norm": 0.7643631100654602, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 14000 + }, + { + "epoch": 2.264974537224153, + "grad_norm": 0.8882834911346436, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 14010 + }, + { + "epoch": 2.2665912214048984, + "grad_norm": 0.7975873351097107, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 14020 + }, + { + "epoch": 2.2682079055856437, + "grad_norm": 0.7765783071517944, + "learning_rate": 0.0002, + "loss": 0.7193, + "step": 14030 + }, + { + "epoch": 2.2698245897663893, + "grad_norm": 0.8846288323402405, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 14040 + }, + { + "epoch": 2.2714412739471346, + "grad_norm": 0.9006744027137756, + "learning_rate": 0.0002, + "loss": 0.6494, + "step": 14050 + }, + { + "epoch": 2.27305795812788, + "grad_norm": 0.7420173287391663, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 14060 + }, + { + "epoch": 2.274674642308625, + "grad_norm": 0.7956424951553345, + "learning_rate": 0.0002, + "loss": 0.7068, + "step": 14070 + }, + { + "epoch": 2.2762913264893703, + "grad_norm": 0.7783209085464478, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 14080 + }, + { + "epoch": 2.2779080106701155, + "grad_norm": 0.7597188949584961, + "learning_rate": 0.0002, + "loss": 0.7202, + "step": 14090 + }, + { + "epoch": 2.2795246948508607, + "grad_norm": 0.6718921661376953, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 14100 + }, + { + "epoch": 2.281141379031606, + "grad_norm": 0.7528082132339478, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 14110 + }, + { + "epoch": 2.2827580632123516, + "grad_norm": 0.8379864692687988, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 14120 + }, + { + "epoch": 2.284374747393097, + "grad_norm": 0.748613715171814, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 14130 + }, + { + "epoch": 2.285991431573842, + "grad_norm": 0.7435423135757446, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 14140 + }, + { + "epoch": 2.2876081157545873, + "grad_norm": 0.7580803632736206, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 14150 + }, + { + "epoch": 2.2892247999353326, + "grad_norm": 0.6278321146965027, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 14160 + }, + { + "epoch": 2.290841484116078, + "grad_norm": 0.7663896083831787, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 14170 + }, + { + "epoch": 2.292458168296823, + "grad_norm": 0.9716812372207642, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 14180 + }, + { + "epoch": 2.2940748524775687, + "grad_norm": 0.8993458151817322, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 14190 + }, + { + "epoch": 2.295691536658314, + "grad_norm": 0.6156117916107178, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 14200 + }, + { + "epoch": 2.297308220839059, + "grad_norm": 0.8911278247833252, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 14210 + }, + { + "epoch": 2.2989249050198044, + "grad_norm": 0.6422147154808044, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 14220 + }, + { + "epoch": 2.3005415892005496, + "grad_norm": 0.6866879463195801, + "learning_rate": 0.0002, + "loss": 0.6795, + "step": 14230 + }, + { + "epoch": 2.302158273381295, + "grad_norm": 0.9297130107879639, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 14240 + }, + { + "epoch": 2.30377495756204, + "grad_norm": 0.7501356601715088, + "learning_rate": 0.0002, + "loss": 0.6823, + "step": 14250 + }, + { + "epoch": 2.3053916417427853, + "grad_norm": 0.8363515138626099, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 14260 + }, + { + "epoch": 2.307008325923531, + "grad_norm": 0.9083868265151978, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 14270 + }, + { + "epoch": 2.3086250101042762, + "grad_norm": 0.7791516780853271, + "learning_rate": 0.0002, + "loss": 0.6862, + "step": 14280 + }, + { + "epoch": 2.3102416942850215, + "grad_norm": 0.8766953349113464, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 14290 + }, + { + "epoch": 2.3118583784657667, + "grad_norm": 0.7916635274887085, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 14300 + }, + { + "epoch": 2.313475062646512, + "grad_norm": 0.627525269985199, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 14310 + }, + { + "epoch": 2.315091746827257, + "grad_norm": 0.8856783509254456, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 14320 + }, + { + "epoch": 2.316708431008003, + "grad_norm": 0.6758689284324646, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 14330 + }, + { + "epoch": 2.318325115188748, + "grad_norm": 0.6428321003913879, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 14340 + }, + { + "epoch": 2.3199417993694933, + "grad_norm": 0.9032121300697327, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 14350 + }, + { + "epoch": 2.3215584835502385, + "grad_norm": 0.8035986423492432, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 14360 + }, + { + "epoch": 2.3231751677309838, + "grad_norm": 0.7974579334259033, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 14370 + }, + { + "epoch": 2.324791851911729, + "grad_norm": 0.8356034755706787, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 14380 + }, + { + "epoch": 2.326408536092474, + "grad_norm": 0.998760998249054, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 14390 + }, + { + "epoch": 2.3280252202732195, + "grad_norm": 0.6518142223358154, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 14400 + }, + { + "epoch": 2.3296419044539647, + "grad_norm": 0.7443506717681885, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 14410 + }, + { + "epoch": 2.3312585886347104, + "grad_norm": 0.8436172604560852, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 14420 + }, + { + "epoch": 2.3328752728154556, + "grad_norm": 0.7411080598831177, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 14430 + }, + { + "epoch": 2.334491956996201, + "grad_norm": 0.8839048743247986, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 14440 + }, + { + "epoch": 2.336108641176946, + "grad_norm": 0.8360885977745056, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 14450 + }, + { + "epoch": 2.3377253253576913, + "grad_norm": 0.7608986496925354, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 14460 + }, + { + "epoch": 2.3393420095384365, + "grad_norm": 0.8179867267608643, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 14470 + }, + { + "epoch": 2.340958693719182, + "grad_norm": 0.5989999771118164, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14480 + }, + { + "epoch": 2.3425753778999274, + "grad_norm": 0.9450054168701172, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 14490 + }, + { + "epoch": 2.3441920620806727, + "grad_norm": 0.7885149717330933, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 14500 + }, + { + "epoch": 2.345808746261418, + "grad_norm": 0.8152616620063782, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 14510 + }, + { + "epoch": 2.347425430442163, + "grad_norm": 0.7193838953971863, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 14520 + }, + { + "epoch": 2.3490421146229084, + "grad_norm": 0.6701092720031738, + "learning_rate": 0.0002, + "loss": 0.6594, + "step": 14530 + }, + { + "epoch": 2.3506587988036536, + "grad_norm": 0.7529364228248596, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 14540 + }, + { + "epoch": 2.352275482984399, + "grad_norm": 0.6599733829498291, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 14550 + }, + { + "epoch": 2.353892167165144, + "grad_norm": 0.9502474069595337, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 14560 + }, + { + "epoch": 2.3555088513458897, + "grad_norm": 0.7619650959968567, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 14570 + }, + { + "epoch": 2.357125535526635, + "grad_norm": 0.9854652285575867, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 14580 + }, + { + "epoch": 2.35874221970738, + "grad_norm": 0.727439284324646, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 14590 + }, + { + "epoch": 2.3603589038881254, + "grad_norm": 0.6994746327400208, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 14600 + }, + { + "epoch": 2.3619755880688706, + "grad_norm": 0.7117531299591064, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 14610 + }, + { + "epoch": 2.363592272249616, + "grad_norm": 0.6403067708015442, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 14620 + }, + { + "epoch": 2.3652089564303616, + "grad_norm": 0.8377841711044312, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14630 + }, + { + "epoch": 2.366825640611107, + "grad_norm": 0.749171257019043, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 14640 + }, + { + "epoch": 2.368442324791852, + "grad_norm": 0.8418586254119873, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 14650 + }, + { + "epoch": 2.3700590089725972, + "grad_norm": 0.6178573369979858, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 14660 + }, + { + "epoch": 2.3716756931533425, + "grad_norm": 0.6368302702903748, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 14670 + }, + { + "epoch": 2.3732923773340877, + "grad_norm": 0.9122977256774902, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 14680 + }, + { + "epoch": 2.374909061514833, + "grad_norm": 0.7086195349693298, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 14690 + }, + { + "epoch": 2.376525745695578, + "grad_norm": 0.7500800490379333, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 14700 + }, + { + "epoch": 2.378142429876324, + "grad_norm": 0.6634900569915771, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 14710 + }, + { + "epoch": 2.379759114057069, + "grad_norm": 0.839898407459259, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 14720 + }, + { + "epoch": 2.3813757982378143, + "grad_norm": 0.7578426003456116, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 14730 + }, + { + "epoch": 2.3829924824185595, + "grad_norm": 1.0213173627853394, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 14740 + }, + { + "epoch": 2.3846091665993048, + "grad_norm": 0.7855949401855469, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 14750 + }, + { + "epoch": 2.38622585078005, + "grad_norm": 0.7224128842353821, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 14760 + }, + { + "epoch": 2.3878425349607952, + "grad_norm": 0.8040381669998169, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 14770 + }, + { + "epoch": 2.389459219141541, + "grad_norm": 0.7705281376838684, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 14780 + }, + { + "epoch": 2.391075903322286, + "grad_norm": 0.667966902256012, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 14790 + }, + { + "epoch": 2.3926925875030314, + "grad_norm": 0.6611011028289795, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 14800 + }, + { + "epoch": 2.3943092716837766, + "grad_norm": 0.6862651705741882, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 14810 + }, + { + "epoch": 2.395925955864522, + "grad_norm": 0.8086010217666626, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 14820 + }, + { + "epoch": 2.397542640045267, + "grad_norm": 0.7189689874649048, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 14830 + }, + { + "epoch": 2.3991593242260123, + "grad_norm": 0.6280009150505066, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 14840 + }, + { + "epoch": 2.4007760084067575, + "grad_norm": 0.7826612591743469, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 14850 + }, + { + "epoch": 2.402392692587503, + "grad_norm": 0.7681610584259033, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 14860 + }, + { + "epoch": 2.4040093767682484, + "grad_norm": 0.720966100692749, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 14870 + }, + { + "epoch": 2.4056260609489937, + "grad_norm": 0.8202250599861145, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 14880 + }, + { + "epoch": 2.407242745129739, + "grad_norm": 0.786212682723999, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 14890 + }, + { + "epoch": 2.408859429310484, + "grad_norm": 0.6647164821624756, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 14900 + }, + { + "epoch": 2.4104761134912294, + "grad_norm": 0.7566399574279785, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 14910 + }, + { + "epoch": 2.4120927976719746, + "grad_norm": 0.748814582824707, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 14920 + }, + { + "epoch": 2.4137094818527203, + "grad_norm": 0.7624038457870483, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 14930 + }, + { + "epoch": 2.4153261660334655, + "grad_norm": 0.8267335295677185, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 14940 + }, + { + "epoch": 2.4169428502142107, + "grad_norm": 0.8785360455513, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 14950 + }, + { + "epoch": 2.418559534394956, + "grad_norm": 0.679887592792511, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 14960 + }, + { + "epoch": 2.420176218575701, + "grad_norm": 0.7218474745750427, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 14970 + }, + { + "epoch": 2.4217929027564464, + "grad_norm": 0.6342799663543701, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 14980 + }, + { + "epoch": 2.4234095869371917, + "grad_norm": 0.7098712921142578, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 14990 + }, + { + "epoch": 2.425026271117937, + "grad_norm": 0.7497431635856628, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 15000 + }, + { + "epoch": 2.4266429552986826, + "grad_norm": 0.934836208820343, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 15010 + }, + { + "epoch": 2.428259639479428, + "grad_norm": 0.8430966734886169, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 15020 + }, + { + "epoch": 2.429876323660173, + "grad_norm": 0.7032104730606079, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 15030 + }, + { + "epoch": 2.4314930078409183, + "grad_norm": 0.7746111750602722, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 15040 + }, + { + "epoch": 2.4331096920216635, + "grad_norm": 0.7661406397819519, + "learning_rate": 0.0002, + "loss": 0.7177, + "step": 15050 + }, + { + "epoch": 2.4347263762024087, + "grad_norm": 0.6941645741462708, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 15060 + }, + { + "epoch": 2.436343060383154, + "grad_norm": 0.7487249374389648, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 15070 + }, + { + "epoch": 2.4379597445638996, + "grad_norm": 0.7639912962913513, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 15080 + }, + { + "epoch": 2.439576428744645, + "grad_norm": 0.7708953619003296, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 15090 + }, + { + "epoch": 2.44119311292539, + "grad_norm": 0.9135832190513611, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 15100 + }, + { + "epoch": 2.4428097971061353, + "grad_norm": 0.8283005356788635, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 15110 + }, + { + "epoch": 2.4444264812868806, + "grad_norm": 0.925299346446991, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 15120 + }, + { + "epoch": 2.446043165467626, + "grad_norm": 0.7013528943061829, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 15130 + }, + { + "epoch": 2.447659849648371, + "grad_norm": 0.622303307056427, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 15140 + }, + { + "epoch": 2.4492765338291163, + "grad_norm": 0.876569390296936, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 15150 + }, + { + "epoch": 2.450893218009862, + "grad_norm": 0.6836351752281189, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 15160 + }, + { + "epoch": 2.452509902190607, + "grad_norm": 0.7886684536933899, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 15170 + }, + { + "epoch": 2.4541265863713524, + "grad_norm": 0.6647440791130066, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 15180 + }, + { + "epoch": 2.4557432705520976, + "grad_norm": 0.7477722764015198, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 15190 + }, + { + "epoch": 2.457359954732843, + "grad_norm": 0.8192033767700195, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 15200 + }, + { + "epoch": 2.458976638913588, + "grad_norm": 0.847537100315094, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 15210 + }, + { + "epoch": 2.4605933230943338, + "grad_norm": 0.9027776122093201, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 15220 + }, + { + "epoch": 2.462210007275079, + "grad_norm": 0.7217772006988525, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 15230 + }, + { + "epoch": 2.4638266914558242, + "grad_norm": 0.7994546294212341, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 15240 + }, + { + "epoch": 2.4654433756365695, + "grad_norm": 0.939916729927063, + "learning_rate": 0.0002, + "loss": 0.6828, + "step": 15250 + }, + { + "epoch": 2.4670600598173147, + "grad_norm": 1.0009053945541382, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 15260 + }, + { + "epoch": 2.46867674399806, + "grad_norm": 0.625555694103241, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 15270 + }, + { + "epoch": 2.470293428178805, + "grad_norm": 0.7924878597259521, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 15280 + }, + { + "epoch": 2.4719101123595504, + "grad_norm": 0.8536689877510071, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 15290 + }, + { + "epoch": 2.4735267965402956, + "grad_norm": 0.8572589755058289, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 15300 + }, + { + "epoch": 2.4751434807210413, + "grad_norm": 0.773279070854187, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 15310 + }, + { + "epoch": 2.4767601649017865, + "grad_norm": 0.7708749771118164, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 15320 + }, + { + "epoch": 2.4783768490825318, + "grad_norm": 0.770905077457428, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 15330 + }, + { + "epoch": 2.479993533263277, + "grad_norm": 0.8238571882247925, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 15340 + }, + { + "epoch": 2.481610217444022, + "grad_norm": 0.7670477032661438, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 15350 + }, + { + "epoch": 2.4832269016247674, + "grad_norm": 0.905036985874176, + "learning_rate": 0.0002, + "loss": 0.7759, + "step": 15360 + }, + { + "epoch": 2.484843585805513, + "grad_norm": 0.6672089695930481, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 15370 + }, + { + "epoch": 2.4864602699862584, + "grad_norm": 0.625095784664154, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 15380 + }, + { + "epoch": 2.4880769541670036, + "grad_norm": 0.679772675037384, + "learning_rate": 0.0002, + "loss": 0.6396, + "step": 15390 + }, + { + "epoch": 2.489693638347749, + "grad_norm": 0.711492121219635, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 15400 + }, + { + "epoch": 2.491310322528494, + "grad_norm": 0.876189112663269, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 15410 + }, + { + "epoch": 2.4929270067092393, + "grad_norm": 0.7236915230751038, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 15420 + }, + { + "epoch": 2.4945436908899845, + "grad_norm": 0.6629832983016968, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 15430 + }, + { + "epoch": 2.4961603750707297, + "grad_norm": 0.9756859540939331, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 15440 + }, + { + "epoch": 2.4977770592514754, + "grad_norm": 0.6896940469741821, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 15450 + }, + { + "epoch": 2.4993937434322206, + "grad_norm": 0.7105149626731873, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 15460 + }, + { + "epoch": 2.501010427612966, + "grad_norm": 0.8374546766281128, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 15470 + }, + { + "epoch": 2.502627111793711, + "grad_norm": 0.7320070266723633, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 15480 + }, + { + "epoch": 2.5042437959744563, + "grad_norm": 0.8306367993354797, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 15490 + }, + { + "epoch": 2.5058604801552016, + "grad_norm": 0.7472721338272095, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 15500 + }, + { + "epoch": 2.507477164335947, + "grad_norm": 0.6147692203521729, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 15510 + }, + { + "epoch": 2.5090938485166925, + "grad_norm": 0.7788505554199219, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 15520 + }, + { + "epoch": 2.5107105326974377, + "grad_norm": 0.8807527422904968, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 15530 + }, + { + "epoch": 2.512327216878183, + "grad_norm": 0.7521643042564392, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 15540 + }, + { + "epoch": 2.513943901058928, + "grad_norm": 0.6900225281715393, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 15550 + }, + { + "epoch": 2.5155605852396734, + "grad_norm": 0.6601938605308533, + "learning_rate": 0.0002, + "loss": 0.6769, + "step": 15560 + }, + { + "epoch": 2.5171772694204186, + "grad_norm": 0.8179984092712402, + "learning_rate": 0.0002, + "loss": 0.6648, + "step": 15570 + }, + { + "epoch": 2.518793953601164, + "grad_norm": 0.792556881904602, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 15580 + }, + { + "epoch": 2.520410637781909, + "grad_norm": 0.7081938982009888, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 15590 + }, + { + "epoch": 2.5220273219626543, + "grad_norm": 0.8733121156692505, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 15600 + }, + { + "epoch": 2.5236440061434, + "grad_norm": 0.7980992794036865, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 15610 + }, + { + "epoch": 2.5252606903241452, + "grad_norm": 0.883664071559906, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 15620 + }, + { + "epoch": 2.5268773745048905, + "grad_norm": 0.6963341236114502, + "learning_rate": 0.0002, + "loss": 0.7022, + "step": 15630 + }, + { + "epoch": 2.5284940586856357, + "grad_norm": 0.6433573365211487, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 15640 + }, + { + "epoch": 2.530110742866381, + "grad_norm": 0.8538183569908142, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 15650 + }, + { + "epoch": 2.5317274270471266, + "grad_norm": 0.9748201370239258, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 15660 + }, + { + "epoch": 2.533344111227872, + "grad_norm": 0.7670575380325317, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 15670 + }, + { + "epoch": 2.534960795408617, + "grad_norm": 0.8738890290260315, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 15680 + }, + { + "epoch": 2.5365774795893623, + "grad_norm": 0.8391636610031128, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 15690 + }, + { + "epoch": 2.5381941637701075, + "grad_norm": 0.7239366769790649, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 15700 + }, + { + "epoch": 2.5398108479508528, + "grad_norm": 0.8498379588127136, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 15710 + }, + { + "epoch": 2.541427532131598, + "grad_norm": 0.8029484152793884, + "learning_rate": 0.0002, + "loss": 0.6998, + "step": 15720 + }, + { + "epoch": 2.5430442163123432, + "grad_norm": 1.0639333724975586, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 15730 + }, + { + "epoch": 2.5446609004930885, + "grad_norm": 0.6401297450065613, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 15740 + }, + { + "epoch": 2.5462775846738337, + "grad_norm": 0.7123814821243286, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 15750 + }, + { + "epoch": 2.5478942688545794, + "grad_norm": 0.7874974608421326, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 15760 + }, + { + "epoch": 2.5495109530353246, + "grad_norm": 0.8046808838844299, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 15770 + }, + { + "epoch": 2.55112763721607, + "grad_norm": 0.7888661623001099, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 15780 + }, + { + "epoch": 2.552744321396815, + "grad_norm": 0.8445866107940674, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 15790 + }, + { + "epoch": 2.5543610055775603, + "grad_norm": 0.7475846409797668, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 15800 + }, + { + "epoch": 2.555977689758306, + "grad_norm": 0.7455102801322937, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 15810 + }, + { + "epoch": 2.557594373939051, + "grad_norm": 0.8226983547210693, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 15820 + }, + { + "epoch": 2.5592110581197964, + "grad_norm": 0.8920368552207947, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 15830 + }, + { + "epoch": 2.5608277423005417, + "grad_norm": 0.8413904905319214, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 15840 + }, + { + "epoch": 2.562444426481287, + "grad_norm": 0.8483649492263794, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 15850 + }, + { + "epoch": 2.564061110662032, + "grad_norm": 0.5923284292221069, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 15860 + }, + { + "epoch": 2.5656777948427774, + "grad_norm": 0.8518726229667664, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 15870 + }, + { + "epoch": 2.5672944790235226, + "grad_norm": 0.731235146522522, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 15880 + }, + { + "epoch": 2.568911163204268, + "grad_norm": 0.7517194151878357, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 15890 + }, + { + "epoch": 2.5705278473850135, + "grad_norm": 0.8378692269325256, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 15900 + }, + { + "epoch": 2.5721445315657587, + "grad_norm": 0.843701958656311, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 15910 + }, + { + "epoch": 2.573761215746504, + "grad_norm": 0.7254629731178284, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 15920 + }, + { + "epoch": 2.575377899927249, + "grad_norm": 0.8863335847854614, + "learning_rate": 0.0002, + "loss": 0.6872, + "step": 15930 + }, + { + "epoch": 2.5769945841079944, + "grad_norm": 0.7675097584724426, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 15940 + }, + { + "epoch": 2.5786112682887397, + "grad_norm": 0.82063889503479, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 15950 + }, + { + "epoch": 2.5802279524694853, + "grad_norm": 0.7729717493057251, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 15960 + }, + { + "epoch": 2.5818446366502306, + "grad_norm": 0.8301846981048584, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 15970 + }, + { + "epoch": 2.583461320830976, + "grad_norm": 0.7906861305236816, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 15980 + }, + { + "epoch": 2.585078005011721, + "grad_norm": 0.6749057173728943, + "learning_rate": 0.0002, + "loss": 0.7175, + "step": 15990 + }, + { + "epoch": 2.5866946891924663, + "grad_norm": 0.9386842846870422, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 16000 + }, + { + "epoch": 2.5883113733732115, + "grad_norm": 0.7868891358375549, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 16010 + }, + { + "epoch": 2.5899280575539567, + "grad_norm": 0.8674671053886414, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 16020 + }, + { + "epoch": 2.591544741734702, + "grad_norm": 0.7043559551239014, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 16030 + }, + { + "epoch": 2.593161425915447, + "grad_norm": 0.5846083760261536, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 16040 + }, + { + "epoch": 2.594778110096193, + "grad_norm": 0.7323982119560242, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 16050 + }, + { + "epoch": 2.596394794276938, + "grad_norm": 0.9069556593894958, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 16060 + }, + { + "epoch": 2.5980114784576833, + "grad_norm": 0.7522736191749573, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 16070 + }, + { + "epoch": 2.5996281626384286, + "grad_norm": 0.8149648308753967, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 16080 + }, + { + "epoch": 2.601244846819174, + "grad_norm": 0.6214233040809631, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 16090 + }, + { + "epoch": 2.602861530999919, + "grad_norm": 0.6803743839263916, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 16100 + }, + { + "epoch": 2.6044782151806647, + "grad_norm": 0.7223997116088867, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 16110 + }, + { + "epoch": 2.60609489936141, + "grad_norm": 0.7324174642562866, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 16120 + }, + { + "epoch": 2.607711583542155, + "grad_norm": 0.9594739675521851, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 16130 + }, + { + "epoch": 2.6093282677229004, + "grad_norm": 0.9485327005386353, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 16140 + }, + { + "epoch": 2.6109449519036456, + "grad_norm": 0.8449000120162964, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 16150 + }, + { + "epoch": 2.612561636084391, + "grad_norm": 0.8520140051841736, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 16160 + }, + { + "epoch": 2.614178320265136, + "grad_norm": 0.7456524968147278, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 16170 + }, + { + "epoch": 2.6157950044458813, + "grad_norm": 0.9912857413291931, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 16180 + }, + { + "epoch": 2.6174116886266265, + "grad_norm": 0.9001946449279785, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 16190 + }, + { + "epoch": 2.619028372807372, + "grad_norm": 0.6568667888641357, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 16200 + }, + { + "epoch": 2.6206450569881174, + "grad_norm": 1.0248128175735474, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 16210 + }, + { + "epoch": 2.6222617411688627, + "grad_norm": 0.6509039998054504, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 16220 + }, + { + "epoch": 2.623878425349608, + "grad_norm": 0.7626351118087769, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 16230 + }, + { + "epoch": 2.625495109530353, + "grad_norm": 0.6938552260398865, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 16240 + }, + { + "epoch": 2.6271117937110984, + "grad_norm": 0.6434680819511414, + "learning_rate": 0.0002, + "loss": 0.65, + "step": 16250 + }, + { + "epoch": 2.628728477891844, + "grad_norm": 0.7111515998840332, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 16260 + }, + { + "epoch": 2.6303451620725893, + "grad_norm": 0.7712395787239075, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 16270 + }, + { + "epoch": 2.6319618462533345, + "grad_norm": 0.792209267616272, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 16280 + }, + { + "epoch": 2.6335785304340797, + "grad_norm": 0.6801066398620605, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 16290 + }, + { + "epoch": 2.635195214614825, + "grad_norm": 0.7802573442459106, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 16300 + }, + { + "epoch": 2.63681189879557, + "grad_norm": 0.7742244601249695, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 16310 + }, + { + "epoch": 2.6384285829763154, + "grad_norm": 0.664184033870697, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 16320 + }, + { + "epoch": 2.6400452671570607, + "grad_norm": 0.9242228865623474, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 16330 + }, + { + "epoch": 2.641661951337806, + "grad_norm": 0.9661325216293335, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 16340 + }, + { + "epoch": 2.6432786355185516, + "grad_norm": 0.837526798248291, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 16350 + }, + { + "epoch": 2.644895319699297, + "grad_norm": 1.1834373474121094, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 16360 + }, + { + "epoch": 2.646512003880042, + "grad_norm": 0.7467831373214722, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 16370 + }, + { + "epoch": 2.6481286880607873, + "grad_norm": 0.8627146482467651, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 16380 + }, + { + "epoch": 2.6497453722415325, + "grad_norm": 0.790447473526001, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 16390 + }, + { + "epoch": 2.651362056422278, + "grad_norm": 0.8447365164756775, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 16400 + }, + { + "epoch": 2.6529787406030234, + "grad_norm": 0.7831417918205261, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 16410 + }, + { + "epoch": 2.6545954247837686, + "grad_norm": 0.6837952136993408, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 16420 + }, + { + "epoch": 2.656212108964514, + "grad_norm": 0.7031801342964172, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 16430 + }, + { + "epoch": 2.657828793145259, + "grad_norm": 0.8963770866394043, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 16440 + }, + { + "epoch": 2.6594454773260043, + "grad_norm": 0.6852328181266785, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 16450 + }, + { + "epoch": 2.6610621615067496, + "grad_norm": 0.8069294095039368, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 16460 + }, + { + "epoch": 2.662678845687495, + "grad_norm": 0.7503686547279358, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 16470 + }, + { + "epoch": 2.66429552986824, + "grad_norm": 0.6430956125259399, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 16480 + }, + { + "epoch": 2.6659122140489853, + "grad_norm": 0.7894312739372253, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 16490 + }, + { + "epoch": 2.667528898229731, + "grad_norm": 0.7277431488037109, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 16500 + }, + { + "epoch": 2.669145582410476, + "grad_norm": 0.6816153526306152, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 16510 + }, + { + "epoch": 2.6707622665912214, + "grad_norm": 0.8145235776901245, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 16520 + }, + { + "epoch": 2.6723789507719666, + "grad_norm": 0.8645890355110168, + "learning_rate": 0.0002, + "loss": 0.709, + "step": 16530 + }, + { + "epoch": 2.673995634952712, + "grad_norm": 0.704393208026886, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 16540 + }, + { + "epoch": 2.6756123191334575, + "grad_norm": 1.0120846033096313, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 16550 + }, + { + "epoch": 2.6772290033142028, + "grad_norm": 0.6919328570365906, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 16560 + }, + { + "epoch": 2.678845687494948, + "grad_norm": 0.6924574971199036, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 16570 + }, + { + "epoch": 2.6804623716756932, + "grad_norm": 0.9679301381111145, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 16580 + }, + { + "epoch": 2.6820790558564385, + "grad_norm": 0.6810211539268494, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 16590 + }, + { + "epoch": 2.6836957400371837, + "grad_norm": 0.9730555415153503, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 16600 + }, + { + "epoch": 2.685312424217929, + "grad_norm": 0.7852821350097656, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 16610 + }, + { + "epoch": 2.686929108398674, + "grad_norm": 0.6059057116508484, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 16620 + }, + { + "epoch": 2.6885457925794194, + "grad_norm": 0.9395958781242371, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 16630 + }, + { + "epoch": 2.690162476760165, + "grad_norm": 0.7473729848861694, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 16640 + }, + { + "epoch": 2.6917791609409103, + "grad_norm": 0.765934407711029, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 16650 + }, + { + "epoch": 2.6933958451216555, + "grad_norm": 0.8496677279472351, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 16660 + }, + { + "epoch": 2.6950125293024008, + "grad_norm": 0.7641879916191101, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 16670 + }, + { + "epoch": 2.696629213483146, + "grad_norm": 0.8471952676773071, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 16680 + }, + { + "epoch": 2.6982458976638912, + "grad_norm": 0.6946060657501221, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 16690 + }, + { + "epoch": 2.699862581844637, + "grad_norm": 0.7361312508583069, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 16700 + }, + { + "epoch": 2.701479266025382, + "grad_norm": 0.6605038046836853, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 16710 + }, + { + "epoch": 2.7030959502061274, + "grad_norm": 0.7164411544799805, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 16720 + }, + { + "epoch": 2.7047126343868726, + "grad_norm": 0.6496201157569885, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 16730 + }, + { + "epoch": 2.706329318567618, + "grad_norm": 0.7826663851737976, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 16740 + }, + { + "epoch": 2.707946002748363, + "grad_norm": 0.7639131546020508, + "learning_rate": 0.0002, + "loss": 0.6773, + "step": 16750 + }, + { + "epoch": 2.7095626869291083, + "grad_norm": 0.7976210713386536, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 16760 + }, + { + "epoch": 2.7111793711098535, + "grad_norm": 0.6836577653884888, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 16770 + }, + { + "epoch": 2.7127960552905988, + "grad_norm": 0.8025202751159668, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 16780 + }, + { + "epoch": 2.7144127394713444, + "grad_norm": 0.7636463642120361, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 16790 + }, + { + "epoch": 2.7160294236520897, + "grad_norm": 0.7481677532196045, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 16800 + }, + { + "epoch": 2.717646107832835, + "grad_norm": 0.7566834688186646, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 16810 + }, + { + "epoch": 2.71926279201358, + "grad_norm": 0.7931267619132996, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 16820 + }, + { + "epoch": 2.7208794761943254, + "grad_norm": 0.8811662197113037, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 16830 + }, + { + "epoch": 2.7224961603750706, + "grad_norm": 0.8561240434646606, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 16840 + }, + { + "epoch": 2.7241128445558163, + "grad_norm": 0.7121599316596985, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 16850 + }, + { + "epoch": 2.7257295287365615, + "grad_norm": 0.8066257238388062, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 16860 + }, + { + "epoch": 2.7273462129173067, + "grad_norm": 0.7699271440505981, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 16870 + }, + { + "epoch": 2.728962897098052, + "grad_norm": 1.1828432083129883, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 16880 + }, + { + "epoch": 2.730579581278797, + "grad_norm": 0.9989302754402161, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 16890 + }, + { + "epoch": 2.7321962654595424, + "grad_norm": 0.8100560307502747, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 16900 + }, + { + "epoch": 2.7338129496402876, + "grad_norm": 0.8615233898162842, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 16910 + }, + { + "epoch": 2.735429633821033, + "grad_norm": 0.8633756041526794, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 16920 + }, + { + "epoch": 2.737046318001778, + "grad_norm": 0.7769348621368408, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 16930 + }, + { + "epoch": 2.738663002182524, + "grad_norm": 0.6943058371543884, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 16940 + }, + { + "epoch": 2.740279686363269, + "grad_norm": 0.8510736227035522, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 16950 + }, + { + "epoch": 2.7418963705440142, + "grad_norm": 0.7732602953910828, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 16960 + }, + { + "epoch": 2.7435130547247595, + "grad_norm": 0.5981788635253906, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 16970 + }, + { + "epoch": 2.7451297389055047, + "grad_norm": 0.7604416012763977, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 16980 + }, + { + "epoch": 2.74674642308625, + "grad_norm": 0.7377738356590271, + "learning_rate": 0.0002, + "loss": 0.6981, + "step": 16990 + }, + { + "epoch": 2.7483631072669956, + "grad_norm": 0.9400289058685303, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 17000 + }, + { + "epoch": 2.749979791447741, + "grad_norm": 0.6340599656105042, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 17010 + }, + { + "epoch": 2.751596475628486, + "grad_norm": 0.7297601103782654, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 17020 + }, + { + "epoch": 2.7532131598092313, + "grad_norm": 0.9479979872703552, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 17030 + }, + { + "epoch": 2.7548298439899765, + "grad_norm": 0.8461511135101318, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 17040 + }, + { + "epoch": 2.7564465281707218, + "grad_norm": 0.7477551698684692, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 17050 + }, + { + "epoch": 2.758063212351467, + "grad_norm": 1.019270420074463, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 17060 + }, + { + "epoch": 2.7596798965322122, + "grad_norm": 0.7730235457420349, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 17070 + }, + { + "epoch": 2.7612965807129575, + "grad_norm": 0.8216866254806519, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 17080 + }, + { + "epoch": 2.762913264893703, + "grad_norm": 0.7235931754112244, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 17090 + }, + { + "epoch": 2.7645299490744484, + "grad_norm": 0.7352296710014343, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 17100 + }, + { + "epoch": 2.7661466332551936, + "grad_norm": 0.8129373788833618, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 17110 + }, + { + "epoch": 2.767763317435939, + "grad_norm": 0.7387019991874695, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 17120 + }, + { + "epoch": 2.769380001616684, + "grad_norm": 0.9149190187454224, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 17130 + }, + { + "epoch": 2.7709966857974297, + "grad_norm": 0.7352971434593201, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 17140 + }, + { + "epoch": 2.772613369978175, + "grad_norm": 0.7903780341148376, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 17150 + }, + { + "epoch": 2.77423005415892, + "grad_norm": 0.8255927562713623, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 17160 + }, + { + "epoch": 2.7758467383396654, + "grad_norm": 0.7235927581787109, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 17170 + }, + { + "epoch": 2.7774634225204107, + "grad_norm": 0.8281434774398804, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 17180 + }, + { + "epoch": 2.779080106701156, + "grad_norm": 0.7586921453475952, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 17190 + }, + { + "epoch": 2.780696790881901, + "grad_norm": 0.7161715030670166, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 17200 + }, + { + "epoch": 2.7823134750626464, + "grad_norm": 0.762868344783783, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 17210 + }, + { + "epoch": 2.7839301592433916, + "grad_norm": 0.9285483360290527, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 17220 + }, + { + "epoch": 2.785546843424137, + "grad_norm": 0.6900462508201599, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 17230 + }, + { + "epoch": 2.7871635276048825, + "grad_norm": 0.780384361743927, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 17240 + }, + { + "epoch": 2.7887802117856277, + "grad_norm": 0.7580406665802002, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 17250 + }, + { + "epoch": 2.790396895966373, + "grad_norm": 0.8145199418067932, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 17260 + }, + { + "epoch": 2.792013580147118, + "grad_norm": 0.9159596562385559, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 17270 + }, + { + "epoch": 2.7936302643278634, + "grad_norm": 0.9590014219284058, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 17280 + }, + { + "epoch": 2.795246948508609, + "grad_norm": 0.7603529691696167, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 17290 + }, + { + "epoch": 2.7968636326893543, + "grad_norm": 0.8039976358413696, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 17300 + }, + { + "epoch": 2.7984803168700996, + "grad_norm": 0.8364847302436829, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 17310 + }, + { + "epoch": 2.800097001050845, + "grad_norm": 0.8763046860694885, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 17320 + }, + { + "epoch": 2.80171368523159, + "grad_norm": 0.8409647941589355, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 17330 + }, + { + "epoch": 2.8033303694123353, + "grad_norm": 0.7649006247520447, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 17340 + }, + { + "epoch": 2.8049470535930805, + "grad_norm": 0.7970262169837952, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 17350 + }, + { + "epoch": 2.8065637377738257, + "grad_norm": 0.9088607430458069, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 17360 + }, + { + "epoch": 2.808180421954571, + "grad_norm": 0.6454846858978271, + "learning_rate": 0.0002, + "loss": 0.675, + "step": 17370 + }, + { + "epoch": 2.809797106135316, + "grad_norm": 0.7744787931442261, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 17380 + }, + { + "epoch": 2.811413790316062, + "grad_norm": 0.6678640842437744, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 17390 + }, + { + "epoch": 2.813030474496807, + "grad_norm": 0.772676944732666, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 17400 + }, + { + "epoch": 2.8146471586775523, + "grad_norm": 0.7088175415992737, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 17410 + }, + { + "epoch": 2.8162638428582976, + "grad_norm": 0.8280573487281799, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 17420 + }, + { + "epoch": 2.817880527039043, + "grad_norm": 0.6665388345718384, + "learning_rate": 0.0002, + "loss": 0.6732, + "step": 17430 + }, + { + "epoch": 2.8194972112197885, + "grad_norm": 0.6427883505821228, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 17440 + }, + { + "epoch": 2.8211138954005337, + "grad_norm": 0.9697760343551636, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 17450 + }, + { + "epoch": 2.822730579581279, + "grad_norm": 0.7573966383934021, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 17460 + }, + { + "epoch": 2.824347263762024, + "grad_norm": 0.878688633441925, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 17470 + }, + { + "epoch": 2.8259639479427694, + "grad_norm": 0.7752242684364319, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 17480 + }, + { + "epoch": 2.8275806321235146, + "grad_norm": 0.6135398745536804, + "learning_rate": 0.0002, + "loss": 0.6638, + "step": 17490 + }, + { + "epoch": 2.82919731630426, + "grad_norm": 0.6924924850463867, + "learning_rate": 0.0002, + "loss": 0.6829, + "step": 17500 + }, + { + "epoch": 2.830814000485005, + "grad_norm": 0.7471627593040466, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 17510 + }, + { + "epoch": 2.8324306846657503, + "grad_norm": 0.7145499587059021, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 17520 + }, + { + "epoch": 2.834047368846496, + "grad_norm": 0.7415414452552795, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 17530 + }, + { + "epoch": 2.8356640530272412, + "grad_norm": 0.7328441739082336, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 17540 + }, + { + "epoch": 2.8372807372079865, + "grad_norm": 0.8267839550971985, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 17550 + }, + { + "epoch": 2.8388974213887317, + "grad_norm": 0.8877885341644287, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 17560 + }, + { + "epoch": 2.840514105569477, + "grad_norm": 0.857138454914093, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 17570 + }, + { + "epoch": 2.842130789750222, + "grad_norm": 0.8470779657363892, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 17580 + }, + { + "epoch": 2.843747473930968, + "grad_norm": 0.8553254008293152, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 17590 + }, + { + "epoch": 2.845364158111713, + "grad_norm": 0.8033196926116943, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 17600 + }, + { + "epoch": 2.8469808422924583, + "grad_norm": 0.7949087023735046, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 17610 + }, + { + "epoch": 2.8485975264732035, + "grad_norm": 0.9241406321525574, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 17620 + }, + { + "epoch": 2.8502142106539488, + "grad_norm": 0.7721285223960876, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 17630 + }, + { + "epoch": 2.851830894834694, + "grad_norm": 1.0246692895889282, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 17640 + }, + { + "epoch": 2.853447579015439, + "grad_norm": 0.9244589805603027, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 17650 + }, + { + "epoch": 2.8550642631961844, + "grad_norm": 0.7243508696556091, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 17660 + }, + { + "epoch": 2.8566809473769297, + "grad_norm": 0.8943371176719666, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 17670 + }, + { + "epoch": 2.8582976315576754, + "grad_norm": 0.6531758904457092, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 17680 + }, + { + "epoch": 2.8599143157384206, + "grad_norm": 0.8367000818252563, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 17690 + }, + { + "epoch": 2.861530999919166, + "grad_norm": 0.7868556380271912, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 17700 + }, + { + "epoch": 2.863147684099911, + "grad_norm": 0.7213859558105469, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 17710 + }, + { + "epoch": 2.8647643682806563, + "grad_norm": 0.7383931279182434, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 17720 + }, + { + "epoch": 2.8663810524614015, + "grad_norm": 0.7566812634468079, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 17730 + }, + { + "epoch": 2.867997736642147, + "grad_norm": 0.6930373311042786, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 17740 + }, + { + "epoch": 2.8696144208228924, + "grad_norm": 0.7911090850830078, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 17750 + }, + { + "epoch": 2.8712311050036377, + "grad_norm": 0.8484548926353455, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 17760 + }, + { + "epoch": 2.872847789184383, + "grad_norm": 0.7647597193717957, + "learning_rate": 0.0002, + "loss": 0.6931, + "step": 17770 + }, + { + "epoch": 2.874464473365128, + "grad_norm": 0.8791151642799377, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 17780 + }, + { + "epoch": 2.8760811575458733, + "grad_norm": 0.7253178358078003, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 17790 + }, + { + "epoch": 2.8776978417266186, + "grad_norm": 0.7956077456474304, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 17800 + }, + { + "epoch": 2.879314525907364, + "grad_norm": 0.8657688498497009, + "learning_rate": 0.0002, + "loss": 0.6687, + "step": 17810 + }, + { + "epoch": 2.880931210088109, + "grad_norm": 0.7059141993522644, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 17820 + }, + { + "epoch": 2.8825478942688547, + "grad_norm": 0.8886896967887878, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 17830 + }, + { + "epoch": 2.8841645784496, + "grad_norm": 0.821032702922821, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 17840 + }, + { + "epoch": 2.885781262630345, + "grad_norm": 0.7183963656425476, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 17850 + }, + { + "epoch": 2.8873979468110904, + "grad_norm": 0.6222899556159973, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 17860 + }, + { + "epoch": 2.8890146309918356, + "grad_norm": 0.8187434077262878, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 17870 + }, + { + "epoch": 2.890631315172581, + "grad_norm": 0.9838479161262512, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 17880 + }, + { + "epoch": 2.8922479993533265, + "grad_norm": 0.7567742466926575, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 17890 + }, + { + "epoch": 2.893864683534072, + "grad_norm": 0.6875903606414795, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 17900 + }, + { + "epoch": 2.895481367714817, + "grad_norm": 0.8043789267539978, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 17910 + }, + { + "epoch": 2.8970980518955622, + "grad_norm": 0.8062626719474792, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 17920 + }, + { + "epoch": 2.8987147360763075, + "grad_norm": 1.0251191854476929, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 17930 + }, + { + "epoch": 2.9003314202570527, + "grad_norm": 0.882253110408783, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 17940 + }, + { + "epoch": 2.901948104437798, + "grad_norm": 0.8683299422264099, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 17950 + }, + { + "epoch": 2.903564788618543, + "grad_norm": 0.7167282104492188, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17960 + }, + { + "epoch": 2.9051814727992884, + "grad_norm": 0.7093694806098938, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 17970 + }, + { + "epoch": 2.906798156980034, + "grad_norm": 0.8549879193305969, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 17980 + }, + { + "epoch": 2.9084148411607793, + "grad_norm": 0.6989606618881226, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 17990 + }, + { + "epoch": 2.9100315253415245, + "grad_norm": 0.9482976794242859, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 18000 + }, + { + "epoch": 2.9116482095222698, + "grad_norm": 0.7182440161705017, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 18010 + }, + { + "epoch": 2.913264893703015, + "grad_norm": 0.7732226252555847, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 18020 + }, + { + "epoch": 2.9148815778837607, + "grad_norm": 0.7936875224113464, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 18030 + }, + { + "epoch": 2.916498262064506, + "grad_norm": 0.8825615644454956, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 18040 + }, + { + "epoch": 2.918114946245251, + "grad_norm": 0.6778587102890015, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 18050 + }, + { + "epoch": 2.9197316304259964, + "grad_norm": 0.7529265880584717, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 18060 + }, + { + "epoch": 2.9213483146067416, + "grad_norm": 0.7111883163452148, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 18070 + }, + { + "epoch": 2.922964998787487, + "grad_norm": 0.7214767932891846, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 18080 + }, + { + "epoch": 2.924581682968232, + "grad_norm": 0.800417423248291, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 18090 + }, + { + "epoch": 2.9261983671489773, + "grad_norm": 1.248575210571289, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 18100 + }, + { + "epoch": 2.9278150513297225, + "grad_norm": 0.757788360118866, + "learning_rate": 0.0002, + "loss": 0.7004, + "step": 18110 + }, + { + "epoch": 2.9294317355104678, + "grad_norm": 1.0583995580673218, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 18120 + }, + { + "epoch": 2.9310484196912134, + "grad_norm": 0.8228777647018433, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 18130 + }, + { + "epoch": 2.9326651038719587, + "grad_norm": 0.8374035358428955, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 18140 + }, + { + "epoch": 2.934281788052704, + "grad_norm": 0.7976473569869995, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 18150 + }, + { + "epoch": 2.935898472233449, + "grad_norm": 0.8009907603263855, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 18160 + }, + { + "epoch": 2.9375151564141944, + "grad_norm": 0.835213303565979, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 18170 + }, + { + "epoch": 2.93913184059494, + "grad_norm": 0.7982219457626343, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 18180 + }, + { + "epoch": 2.9407485247756853, + "grad_norm": 0.7070978879928589, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 18190 + }, + { + "epoch": 2.9423652089564305, + "grad_norm": 0.8619440197944641, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 18200 + }, + { + "epoch": 2.9439818931371757, + "grad_norm": 0.6693987250328064, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 18210 + }, + { + "epoch": 2.945598577317921, + "grad_norm": 0.6747021079063416, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 18220 + }, + { + "epoch": 2.947215261498666, + "grad_norm": 0.860387921333313, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 18230 + }, + { + "epoch": 2.9488319456794114, + "grad_norm": 0.799976646900177, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 18240 + }, + { + "epoch": 2.9504486298601567, + "grad_norm": 0.7864769101142883, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 18250 + }, + { + "epoch": 2.952065314040902, + "grad_norm": 0.6713884472846985, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 18260 + }, + { + "epoch": 2.9536819982216476, + "grad_norm": 0.9031508564949036, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 18270 + }, + { + "epoch": 2.955298682402393, + "grad_norm": 0.7205073237419128, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 18280 + }, + { + "epoch": 2.956915366583138, + "grad_norm": 0.7746205925941467, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 18290 + }, + { + "epoch": 2.9585320507638833, + "grad_norm": 0.6533427834510803, + "learning_rate": 0.0002, + "loss": 0.6218, + "step": 18300 + }, + { + "epoch": 2.9601487349446285, + "grad_norm": 0.9083208441734314, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 18310 + }, + { + "epoch": 2.9617654191253737, + "grad_norm": 0.7446991801261902, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 18320 + }, + { + "epoch": 2.9633821033061194, + "grad_norm": 0.6514461636543274, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 18330 + }, + { + "epoch": 2.9649987874868646, + "grad_norm": 0.8580465912818909, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 18340 + }, + { + "epoch": 2.96661547166761, + "grad_norm": 0.7074266076087952, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 18350 + }, + { + "epoch": 2.968232155848355, + "grad_norm": 0.899892270565033, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 18360 + }, + { + "epoch": 2.9698488400291003, + "grad_norm": 0.8217641711235046, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 18370 + }, + { + "epoch": 2.9714655242098456, + "grad_norm": 0.8611799478530884, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 18380 + }, + { + "epoch": 2.973082208390591, + "grad_norm": 0.6909302473068237, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 18390 + }, + { + "epoch": 2.974698892571336, + "grad_norm": 0.6554358005523682, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 18400 + }, + { + "epoch": 2.9763155767520812, + "grad_norm": 0.7803071737289429, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 18410 + }, + { + "epoch": 2.977932260932827, + "grad_norm": 0.7838954925537109, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 18420 + }, + { + "epoch": 2.979548945113572, + "grad_norm": 0.7098495364189148, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 18430 + }, + { + "epoch": 2.9811656292943174, + "grad_norm": 0.8981785774230957, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 18440 + }, + { + "epoch": 2.9827823134750626, + "grad_norm": 0.7197171449661255, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 18450 + }, + { + "epoch": 2.984398997655808, + "grad_norm": 0.793185293674469, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 18460 + }, + { + "epoch": 2.986015681836553, + "grad_norm": 0.8531473875045776, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 18470 + }, + { + "epoch": 2.9876323660172988, + "grad_norm": 0.6627361178398132, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 18480 + }, + { + "epoch": 2.989249050198044, + "grad_norm": 0.5708155035972595, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 18490 + }, + { + "epoch": 2.990865734378789, + "grad_norm": 0.8227280378341675, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 18500 + }, + { + "epoch": 2.9924824185595345, + "grad_norm": 0.7102749943733215, + "learning_rate": 0.0002, + "loss": 0.6701, + "step": 18510 + }, + { + "epoch": 2.9940991027402797, + "grad_norm": 0.839485228061676, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 18520 + }, + { + "epoch": 2.995715786921025, + "grad_norm": 0.9038704037666321, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 18530 + }, + { + "epoch": 2.99733247110177, + "grad_norm": 0.8737510442733765, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 18540 + }, + { + "epoch": 2.9989491552825154, + "grad_norm": 0.7323142886161804, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 18550 + }, + { + "epoch": 2.9999191657909625, + "eval_loss": 1.1262480020523071, + "eval_runtime": 122.0868, + "eval_samples_per_second": 6.004, + "eval_steps_per_second": 0.754, + "step": 18556 + }, + { + "epoch": 3.000565839463261, + "grad_norm": 0.8465463519096375, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 18560 + }, + { + "epoch": 3.0021825236440063, + "grad_norm": 0.9134138822555542, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 18570 + }, + { + "epoch": 3.0037992078247515, + "grad_norm": 0.760715126991272, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 18580 + }, + { + "epoch": 3.0054158920054967, + "grad_norm": 0.9208743572235107, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 18590 + }, + { + "epoch": 3.007032576186242, + "grad_norm": 0.9232364892959595, + "learning_rate": 0.0002, + "loss": 0.5731, + "step": 18600 + }, + { + "epoch": 3.008649260366987, + "grad_norm": 1.1881544589996338, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 18610 + }, + { + "epoch": 3.0102659445477324, + "grad_norm": 0.9372987747192383, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 18620 + }, + { + "epoch": 3.0118826287284777, + "grad_norm": 0.6900241374969482, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 18630 + }, + { + "epoch": 3.0134993129092233, + "grad_norm": 0.8451071381568909, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 18640 + }, + { + "epoch": 3.0151159970899686, + "grad_norm": 0.7763112187385559, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 18650 + }, + { + "epoch": 3.016732681270714, + "grad_norm": 1.043653964996338, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 18660 + }, + { + "epoch": 3.018349365451459, + "grad_norm": 1.0170660018920898, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 18670 + }, + { + "epoch": 3.0199660496322043, + "grad_norm": 0.7534180283546448, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 18680 + }, + { + "epoch": 3.0215827338129495, + "grad_norm": 0.7507367730140686, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 18690 + }, + { + "epoch": 3.0231994179936947, + "grad_norm": 0.7861620187759399, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 18700 + }, + { + "epoch": 3.0248161021744404, + "grad_norm": 1.0580339431762695, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 18710 + }, + { + "epoch": 3.0264327863551856, + "grad_norm": 0.7542710900306702, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 18720 + }, + { + "epoch": 3.028049470535931, + "grad_norm": 0.8189544677734375, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 18730 + }, + { + "epoch": 3.029666154716676, + "grad_norm": 0.9126611351966858, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 18740 + }, + { + "epoch": 3.0312828388974213, + "grad_norm": 0.8891341686248779, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 18750 + }, + { + "epoch": 3.0328995230781666, + "grad_norm": 0.8419283032417297, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 18760 + }, + { + "epoch": 3.034516207258912, + "grad_norm": 0.8048048615455627, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 18770 + }, + { + "epoch": 3.0361328914396575, + "grad_norm": 0.7820217609405518, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 18780 + }, + { + "epoch": 3.0377495756204027, + "grad_norm": 0.854721188545227, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 18790 + }, + { + "epoch": 3.039366259801148, + "grad_norm": 0.912092924118042, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 18800 + }, + { + "epoch": 3.040982943981893, + "grad_norm": 0.6596226096153259, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 18810 + }, + { + "epoch": 3.0425996281626384, + "grad_norm": 0.6351348757743835, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 18820 + }, + { + "epoch": 3.0442163123433836, + "grad_norm": 0.778188943862915, + "learning_rate": 0.0002, + "loss": 0.5953, + "step": 18830 + }, + { + "epoch": 3.045832996524129, + "grad_norm": 0.68234783411026, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 18840 + }, + { + "epoch": 3.047449680704874, + "grad_norm": 0.998628556728363, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 18850 + }, + { + "epoch": 3.0490663648856198, + "grad_norm": 0.7393841743469238, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 18860 + }, + { + "epoch": 3.050683049066365, + "grad_norm": 0.84438556432724, + "learning_rate": 0.0002, + "loss": 0.568, + "step": 18870 + }, + { + "epoch": 3.0522997332471102, + "grad_norm": 0.8857501745223999, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 18880 + }, + { + "epoch": 3.0539164174278555, + "grad_norm": 0.7208474278450012, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 18890 + }, + { + "epoch": 3.0555331016086007, + "grad_norm": 0.7135229110717773, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 18900 + }, + { + "epoch": 3.057149785789346, + "grad_norm": 0.9130001664161682, + "learning_rate": 0.0002, + "loss": 0.5575, + "step": 18910 + }, + { + "epoch": 3.058766469970091, + "grad_norm": 0.9001716375350952, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 18920 + }, + { + "epoch": 3.060383154150837, + "grad_norm": 0.8667559623718262, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 18930 + }, + { + "epoch": 3.061999838331582, + "grad_norm": 0.8943959474563599, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 18940 + }, + { + "epoch": 3.0636165225123273, + "grad_norm": 0.8298377990722656, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 18950 + }, + { + "epoch": 3.0652332066930725, + "grad_norm": 0.7935267686843872, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 18960 + }, + { + "epoch": 3.0668498908738178, + "grad_norm": 1.1506379842758179, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 18970 + }, + { + "epoch": 3.068466575054563, + "grad_norm": 0.7693049907684326, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 18980 + }, + { + "epoch": 3.0700832592353082, + "grad_norm": 0.8040135502815247, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 18990 + }, + { + "epoch": 3.0716999434160535, + "grad_norm": 0.828404426574707, + "learning_rate": 0.0002, + "loss": 0.5541, + "step": 19000 + }, + { + "epoch": 3.073316627596799, + "grad_norm": 0.8811164498329163, + "learning_rate": 0.0002, + "loss": 0.6048, + "step": 19010 + }, + { + "epoch": 3.0749333117775444, + "grad_norm": 1.036205768585205, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 19020 + }, + { + "epoch": 3.0765499959582896, + "grad_norm": 0.8857285976409912, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 19030 + }, + { + "epoch": 3.078166680139035, + "grad_norm": 0.8392079472541809, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 19040 + }, + { + "epoch": 3.07978336431978, + "grad_norm": 1.0287401676177979, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 19050 + }, + { + "epoch": 3.0814000485005253, + "grad_norm": 1.0086315870285034, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 19060 + }, + { + "epoch": 3.0830167326812705, + "grad_norm": 0.9245324730873108, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 19070 + }, + { + "epoch": 3.084633416862016, + "grad_norm": 0.8680877089500427, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 19080 + }, + { + "epoch": 3.0862501010427614, + "grad_norm": 0.8814793825149536, + "learning_rate": 0.0002, + "loss": 0.5611, + "step": 19090 + }, + { + "epoch": 3.0878667852235067, + "grad_norm": 0.9234458208084106, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 19100 + }, + { + "epoch": 3.089483469404252, + "grad_norm": 1.1291664838790894, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 19110 + }, + { + "epoch": 3.091100153584997, + "grad_norm": 0.9191402792930603, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 19120 + }, + { + "epoch": 3.0927168377657424, + "grad_norm": 0.7103154063224792, + "learning_rate": 0.0002, + "loss": 0.5856, + "step": 19130 + }, + { + "epoch": 3.0943335219464876, + "grad_norm": 0.9368883967399597, + "learning_rate": 0.0002, + "loss": 0.6479, + "step": 19140 + }, + { + "epoch": 3.095950206127233, + "grad_norm": 0.9676656723022461, + "learning_rate": 0.0002, + "loss": 0.6167, + "step": 19150 + }, + { + "epoch": 3.0975668903079785, + "grad_norm": 0.8739792704582214, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 19160 + }, + { + "epoch": 3.0991835744887237, + "grad_norm": 0.8530174493789673, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 19170 + }, + { + "epoch": 3.100800258669469, + "grad_norm": 0.794945478439331, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 19180 + }, + { + "epoch": 3.102416942850214, + "grad_norm": 0.9508888125419617, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 19190 + }, + { + "epoch": 3.1040336270309594, + "grad_norm": 1.0599955320358276, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 19200 + }, + { + "epoch": 3.1056503112117047, + "grad_norm": 1.0673625469207764, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 19210 + }, + { + "epoch": 3.10726699539245, + "grad_norm": 0.7739115953445435, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 19220 + }, + { + "epoch": 3.1088836795731956, + "grad_norm": 0.9884951114654541, + "learning_rate": 0.0002, + "loss": 0.6046, + "step": 19230 + }, + { + "epoch": 3.110500363753941, + "grad_norm": 0.862260103225708, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 19240 + }, + { + "epoch": 3.112117047934686, + "grad_norm": 0.7690284848213196, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 19250 + }, + { + "epoch": 3.1137337321154313, + "grad_norm": 0.8758958578109741, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 19260 + }, + { + "epoch": 3.1153504162961765, + "grad_norm": 1.0356395244598389, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 19270 + }, + { + "epoch": 3.1169671004769217, + "grad_norm": 0.6950937509536743, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 19280 + }, + { + "epoch": 3.118583784657667, + "grad_norm": 0.760998010635376, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 19290 + }, + { + "epoch": 3.1202004688384126, + "grad_norm": 0.9335789084434509, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 19300 + }, + { + "epoch": 3.121817153019158, + "grad_norm": 0.9636204242706299, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 19310 + }, + { + "epoch": 3.123433837199903, + "grad_norm": 1.0820997953414917, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 19320 + }, + { + "epoch": 3.1250505213806483, + "grad_norm": 0.7333487272262573, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 19330 + }, + { + "epoch": 3.1266672055613935, + "grad_norm": 1.0417509078979492, + "learning_rate": 0.0002, + "loss": 0.6178, + "step": 19340 + }, + { + "epoch": 3.128283889742139, + "grad_norm": 0.9267749190330505, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 19350 + }, + { + "epoch": 3.129900573922884, + "grad_norm": 0.777798593044281, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 19360 + }, + { + "epoch": 3.1315172581036297, + "grad_norm": 0.8425456881523132, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 19370 + }, + { + "epoch": 3.133133942284375, + "grad_norm": 0.9617102146148682, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 19380 + }, + { + "epoch": 3.13475062646512, + "grad_norm": 1.0052828788757324, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 19390 + }, + { + "epoch": 3.1363673106458654, + "grad_norm": 0.7637009024620056, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 19400 + }, + { + "epoch": 3.1379839948266106, + "grad_norm": 0.7958088517189026, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 19410 + }, + { + "epoch": 3.139600679007356, + "grad_norm": 0.9161727428436279, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 19420 + }, + { + "epoch": 3.141217363188101, + "grad_norm": 0.8402149677276611, + "learning_rate": 0.0002, + "loss": 0.5638, + "step": 19430 + }, + { + "epoch": 3.1428340473688463, + "grad_norm": 1.0056525468826294, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 19440 + }, + { + "epoch": 3.144450731549592, + "grad_norm": 1.0129190683364868, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 19450 + }, + { + "epoch": 3.146067415730337, + "grad_norm": 0.790825366973877, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 19460 + }, + { + "epoch": 3.1476840999110824, + "grad_norm": 1.441665530204773, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 19470 + }, + { + "epoch": 3.1493007840918277, + "grad_norm": 0.7846331596374512, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 19480 + }, + { + "epoch": 3.150917468272573, + "grad_norm": 0.7915332913398743, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 19490 + }, + { + "epoch": 3.152534152453318, + "grad_norm": 0.933982253074646, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 19500 + }, + { + "epoch": 3.1541508366340634, + "grad_norm": 1.038408637046814, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 19510 + }, + { + "epoch": 3.155767520814809, + "grad_norm": 1.018935203552246, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 19520 + }, + { + "epoch": 3.1573842049955543, + "grad_norm": 0.9618112444877625, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 19530 + }, + { + "epoch": 3.1590008891762995, + "grad_norm": 0.8900452852249146, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 19540 + }, + { + "epoch": 3.1606175733570447, + "grad_norm": 0.8254160284996033, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 19550 + }, + { + "epoch": 3.16223425753779, + "grad_norm": 1.004376769065857, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 19560 + }, + { + "epoch": 3.163850941718535, + "grad_norm": 1.0490446090698242, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 19570 + }, + { + "epoch": 3.1654676258992804, + "grad_norm": 0.7387403845787048, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 19580 + }, + { + "epoch": 3.1670843100800257, + "grad_norm": 0.7611538171768188, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 19590 + }, + { + "epoch": 3.1687009942607713, + "grad_norm": 0.8239886164665222, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 19600 + }, + { + "epoch": 3.1703176784415166, + "grad_norm": 0.9327243566513062, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 19610 + }, + { + "epoch": 3.171934362622262, + "grad_norm": 0.9662560224533081, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 19620 + }, + { + "epoch": 3.173551046803007, + "grad_norm": 0.9183341860771179, + "learning_rate": 0.0002, + "loss": 0.5988, + "step": 19630 + }, + { + "epoch": 3.1751677309837523, + "grad_norm": 0.875066876411438, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 19640 + }, + { + "epoch": 3.1767844151644975, + "grad_norm": 0.8567508459091187, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 19650 + }, + { + "epoch": 3.1784010993452427, + "grad_norm": 0.6805780529975891, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 19660 + }, + { + "epoch": 3.1800177835259884, + "grad_norm": 0.8776944279670715, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 19670 + }, + { + "epoch": 3.1816344677067336, + "grad_norm": 0.9036329984664917, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 19680 + }, + { + "epoch": 3.183251151887479, + "grad_norm": 0.8527372479438782, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 19690 + }, + { + "epoch": 3.184867836068224, + "grad_norm": 1.1045585870742798, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 19700 + }, + { + "epoch": 3.1864845202489693, + "grad_norm": 0.9213830828666687, + "learning_rate": 0.0002, + "loss": 0.5491, + "step": 19710 + }, + { + "epoch": 3.1881012044297146, + "grad_norm": 0.8865814805030823, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 19720 + }, + { + "epoch": 3.18971788861046, + "grad_norm": 0.7939388751983643, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 19730 + }, + { + "epoch": 3.191334572791205, + "grad_norm": 0.6966729760169983, + "learning_rate": 0.0002, + "loss": 0.5682, + "step": 19740 + }, + { + "epoch": 3.1929512569719507, + "grad_norm": 0.8023673295974731, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 19750 + }, + { + "epoch": 3.194567941152696, + "grad_norm": 0.7992037534713745, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 19760 + }, + { + "epoch": 3.196184625333441, + "grad_norm": 0.7412247657775879, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 19770 + }, + { + "epoch": 3.1978013095141864, + "grad_norm": 0.9598729014396667, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 19780 + }, + { + "epoch": 3.1994179936949316, + "grad_norm": 0.8331366777420044, + "learning_rate": 0.0002, + "loss": 0.5685, + "step": 19790 + }, + { + "epoch": 3.201034677875677, + "grad_norm": 0.8939169645309448, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 19800 + }, + { + "epoch": 3.202651362056422, + "grad_norm": 0.9219734072685242, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 19810 + }, + { + "epoch": 3.2042680462371678, + "grad_norm": 0.869490385055542, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 19820 + }, + { + "epoch": 3.205884730417913, + "grad_norm": 0.8989706635475159, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 19830 + }, + { + "epoch": 3.2075014145986582, + "grad_norm": 0.8477165102958679, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 19840 + }, + { + "epoch": 3.2091180987794035, + "grad_norm": 0.8720678687095642, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 19850 + }, + { + "epoch": 3.2107347829601487, + "grad_norm": 0.861406683921814, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 19860 + }, + { + "epoch": 3.212351467140894, + "grad_norm": 0.8228686451911926, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 19870 + }, + { + "epoch": 3.213968151321639, + "grad_norm": 0.7936596870422363, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 19880 + }, + { + "epoch": 3.2155848355023844, + "grad_norm": 1.097377896308899, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 19890 + }, + { + "epoch": 3.21720151968313, + "grad_norm": 0.9544782638549805, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 19900 + }, + { + "epoch": 3.2188182038638753, + "grad_norm": 0.8240751624107361, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 19910 + }, + { + "epoch": 3.2204348880446205, + "grad_norm": 0.8332096338272095, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 19920 + }, + { + "epoch": 3.2220515722253658, + "grad_norm": 1.0954567193984985, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 19930 + }, + { + "epoch": 3.223668256406111, + "grad_norm": 0.7790525555610657, + "learning_rate": 0.0002, + "loss": 0.652, + "step": 19940 + }, + { + "epoch": 3.225284940586856, + "grad_norm": 0.7966814041137695, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 19950 + }, + { + "epoch": 3.2269016247676015, + "grad_norm": 0.9751881957054138, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 19960 + }, + { + "epoch": 3.228518308948347, + "grad_norm": 0.9856047630310059, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 19970 + }, + { + "epoch": 3.2301349931290924, + "grad_norm": 1.3062353134155273, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 19980 + }, + { + "epoch": 3.2317516773098376, + "grad_norm": 0.9510692358016968, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 19990 + }, + { + "epoch": 3.233368361490583, + "grad_norm": 0.8630342483520508, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 20000 + }, + { + "epoch": 3.234985045671328, + "grad_norm": 0.8966519236564636, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 20010 + }, + { + "epoch": 3.2366017298520733, + "grad_norm": 0.7093510627746582, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 20020 + }, + { + "epoch": 3.2382184140328185, + "grad_norm": 0.7771096229553223, + "learning_rate": 0.0002, + "loss": 0.585, + "step": 20030 + }, + { + "epoch": 3.2398350982135637, + "grad_norm": 0.841058075428009, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 20040 + }, + { + "epoch": 3.2414517823943094, + "grad_norm": 0.909712553024292, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 20050 + }, + { + "epoch": 3.2430684665750547, + "grad_norm": 0.8321019411087036, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 20060 + }, + { + "epoch": 3.2446851507558, + "grad_norm": 0.779901921749115, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 20070 + }, + { + "epoch": 3.246301834936545, + "grad_norm": 0.6249170303344727, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 20080 + }, + { + "epoch": 3.2479185191172903, + "grad_norm": 0.8000940680503845, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 20090 + }, + { + "epoch": 3.2495352032980356, + "grad_norm": 0.7627735137939453, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 20100 + }, + { + "epoch": 3.2511518874787813, + "grad_norm": 0.8780747056007385, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 20110 + }, + { + "epoch": 3.2527685716595265, + "grad_norm": 0.772037148475647, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 20120 + }, + { + "epoch": 3.2543852558402717, + "grad_norm": 1.0086580514907837, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 20130 + }, + { + "epoch": 3.256001940021017, + "grad_norm": 0.9360289573669434, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 20140 + }, + { + "epoch": 3.257618624201762, + "grad_norm": 1.2099586725234985, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 20150 + }, + { + "epoch": 3.2592353083825074, + "grad_norm": 0.8368481397628784, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 20160 + }, + { + "epoch": 3.2608519925632526, + "grad_norm": 0.7391039133071899, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 20170 + }, + { + "epoch": 3.262468676743998, + "grad_norm": 0.9122273325920105, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 20180 + }, + { + "epoch": 3.264085360924743, + "grad_norm": 0.8502281904220581, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 20190 + }, + { + "epoch": 3.265702045105489, + "grad_norm": 1.0926852226257324, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 20200 + }, + { + "epoch": 3.267318729286234, + "grad_norm": 0.7902828454971313, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 20210 + }, + { + "epoch": 3.2689354134669792, + "grad_norm": 0.8724729418754578, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 20220 + }, + { + "epoch": 3.2705520976477245, + "grad_norm": 0.8469277024269104, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 20230 + }, + { + "epoch": 3.2721687818284697, + "grad_norm": 0.8865092992782593, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 20240 + }, + { + "epoch": 3.273785466009215, + "grad_norm": 1.0979334115982056, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 20250 + }, + { + "epoch": 3.2754021501899606, + "grad_norm": 1.0860793590545654, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 20260 + }, + { + "epoch": 3.277018834370706, + "grad_norm": 0.981745183467865, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 20270 + }, + { + "epoch": 3.278635518551451, + "grad_norm": 0.9155020713806152, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 20280 + }, + { + "epoch": 3.2802522027321963, + "grad_norm": 0.8436718583106995, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 20290 + }, + { + "epoch": 3.2818688869129415, + "grad_norm": 1.0329409837722778, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 20300 + }, + { + "epoch": 3.2834855710936868, + "grad_norm": 0.9876394271850586, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 20310 + }, + { + "epoch": 3.285102255274432, + "grad_norm": 0.8052917718887329, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 20320 + }, + { + "epoch": 3.2867189394551772, + "grad_norm": 0.8390680551528931, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 20330 + }, + { + "epoch": 3.288335623635923, + "grad_norm": 0.9515735507011414, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 20340 + }, + { + "epoch": 3.289952307816668, + "grad_norm": 0.8028870224952698, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 20350 + }, + { + "epoch": 3.2915689919974134, + "grad_norm": 0.862592339515686, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 20360 + }, + { + "epoch": 3.2931856761781586, + "grad_norm": 0.7451621890068054, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 20370 + }, + { + "epoch": 3.294802360358904, + "grad_norm": 0.8966776728630066, + "learning_rate": 0.0002, + "loss": 0.6458, + "step": 20380 + }, + { + "epoch": 3.296419044539649, + "grad_norm": 0.9289216995239258, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 20390 + }, + { + "epoch": 3.2980357287203943, + "grad_norm": 0.9649626612663269, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 20400 + }, + { + "epoch": 3.29965241290114, + "grad_norm": 1.1953798532485962, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 20410 + }, + { + "epoch": 3.301269097081885, + "grad_norm": 0.8929083943367004, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 20420 + }, + { + "epoch": 3.3028857812626304, + "grad_norm": 0.8922014236450195, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 20430 + }, + { + "epoch": 3.3045024654433757, + "grad_norm": 0.9754860401153564, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 20440 + }, + { + "epoch": 3.306119149624121, + "grad_norm": 0.8873140215873718, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 20450 + }, + { + "epoch": 3.307735833804866, + "grad_norm": 0.857271671295166, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 20460 + }, + { + "epoch": 3.3093525179856114, + "grad_norm": 0.9022141098976135, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 20470 + }, + { + "epoch": 3.3109692021663566, + "grad_norm": 0.8614798188209534, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 20480 + }, + { + "epoch": 3.3125858863471023, + "grad_norm": 0.8838164210319519, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 20490 + }, + { + "epoch": 3.3142025705278475, + "grad_norm": 0.8709736466407776, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 20500 + }, + { + "epoch": 3.3158192547085927, + "grad_norm": 0.9533300995826721, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 20510 + }, + { + "epoch": 3.317435938889338, + "grad_norm": 0.8259269595146179, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 20520 + }, + { + "epoch": 3.319052623070083, + "grad_norm": 0.8607608079910278, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 20530 + }, + { + "epoch": 3.3206693072508284, + "grad_norm": 1.0863020420074463, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 20540 + }, + { + "epoch": 3.3222859914315737, + "grad_norm": 1.011489987373352, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 20550 + }, + { + "epoch": 3.3239026756123193, + "grad_norm": 0.6952177882194519, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 20560 + }, + { + "epoch": 3.3255193597930646, + "grad_norm": 0.9638974070549011, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 20570 + }, + { + "epoch": 3.32713604397381, + "grad_norm": 1.0310138463974, + "learning_rate": 0.0002, + "loss": 0.5882, + "step": 20580 + }, + { + "epoch": 3.328752728154555, + "grad_norm": 0.9371318221092224, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 20590 + }, + { + "epoch": 3.3303694123353003, + "grad_norm": 0.8756691813468933, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 20600 + }, + { + "epoch": 3.3319860965160455, + "grad_norm": 1.054175853729248, + "learning_rate": 0.0002, + "loss": 0.5994, + "step": 20610 + }, + { + "epoch": 3.3336027806967907, + "grad_norm": 0.9074128270149231, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 20620 + }, + { + "epoch": 3.335219464877536, + "grad_norm": 0.906900942325592, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 20630 + }, + { + "epoch": 3.3368361490582816, + "grad_norm": 0.8689333200454712, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 20640 + }, + { + "epoch": 3.338452833239027, + "grad_norm": 0.9889747500419617, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 20650 + }, + { + "epoch": 3.340069517419772, + "grad_norm": 1.0685805082321167, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 20660 + }, + { + "epoch": 3.3416862016005173, + "grad_norm": 0.7495010495185852, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 20670 + }, + { + "epoch": 3.3433028857812626, + "grad_norm": 0.8747848272323608, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 20680 + }, + { + "epoch": 3.344919569962008, + "grad_norm": 0.9762673377990723, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 20690 + }, + { + "epoch": 3.346536254142753, + "grad_norm": 1.0284489393234253, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 20700 + }, + { + "epoch": 3.3481529383234987, + "grad_norm": 0.7293812036514282, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 20710 + }, + { + "epoch": 3.349769622504244, + "grad_norm": 0.8330199122428894, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 20720 + }, + { + "epoch": 3.351386306684989, + "grad_norm": 0.9808499217033386, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 20730 + }, + { + "epoch": 3.3530029908657344, + "grad_norm": 0.9508825540542603, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 20740 + }, + { + "epoch": 3.3546196750464796, + "grad_norm": 0.790483832359314, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 20750 + }, + { + "epoch": 3.356236359227225, + "grad_norm": 1.022793173789978, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 20760 + }, + { + "epoch": 3.35785304340797, + "grad_norm": 0.8318950533866882, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 20770 + }, + { + "epoch": 3.3594697275887153, + "grad_norm": 0.7980858087539673, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 20780 + }, + { + "epoch": 3.361086411769461, + "grad_norm": 0.8114802241325378, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 20790 + }, + { + "epoch": 3.3627030959502062, + "grad_norm": 0.8522519469261169, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 20800 + }, + { + "epoch": 3.3643197801309515, + "grad_norm": 0.9142431616783142, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 20810 + }, + { + "epoch": 3.3659364643116967, + "grad_norm": 0.771170437335968, + "learning_rate": 0.0002, + "loss": 0.6331, + "step": 20820 + }, + { + "epoch": 3.367553148492442, + "grad_norm": 1.0628231763839722, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 20830 + }, + { + "epoch": 3.369169832673187, + "grad_norm": 0.9384352564811707, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 20840 + }, + { + "epoch": 3.370786516853933, + "grad_norm": 1.1286591291427612, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 20850 + }, + { + "epoch": 3.372403201034678, + "grad_norm": 1.1349513530731201, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 20860 + }, + { + "epoch": 3.3740198852154233, + "grad_norm": 1.0127464532852173, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 20870 + }, + { + "epoch": 3.3756365693961685, + "grad_norm": 0.9111971855163574, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 20880 + }, + { + "epoch": 3.3772532535769137, + "grad_norm": 0.871356725692749, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 20890 + }, + { + "epoch": 3.378869937757659, + "grad_norm": 0.7774117588996887, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 20900 + }, + { + "epoch": 3.380486621938404, + "grad_norm": 1.0089964866638184, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 20910 + }, + { + "epoch": 3.3821033061191494, + "grad_norm": 0.7855867147445679, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 20920 + }, + { + "epoch": 3.3837199902998947, + "grad_norm": 1.3713710308074951, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 20930 + }, + { + "epoch": 3.3853366744806404, + "grad_norm": 0.8599116206169128, + "learning_rate": 0.0002, + "loss": 0.6627, + "step": 20940 + }, + { + "epoch": 3.3869533586613856, + "grad_norm": 0.9392673373222351, + "learning_rate": 0.0002, + "loss": 0.6224, + "step": 20950 + }, + { + "epoch": 3.388570042842131, + "grad_norm": 0.8764075040817261, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 20960 + }, + { + "epoch": 3.390186727022876, + "grad_norm": 0.8240136504173279, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 20970 + }, + { + "epoch": 3.3918034112036213, + "grad_norm": 1.0982369184494019, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 20980 + }, + { + "epoch": 3.3934200953843665, + "grad_norm": 1.0599013566970825, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 20990 + }, + { + "epoch": 3.395036779565112, + "grad_norm": 0.895438015460968, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 21000 + }, + { + "epoch": 3.3966534637458574, + "grad_norm": 0.6974841356277466, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 21010 + }, + { + "epoch": 3.3982701479266026, + "grad_norm": 0.9571719765663147, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 21020 + }, + { + "epoch": 3.399886832107348, + "grad_norm": 0.831912636756897, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 21030 + }, + { + "epoch": 3.401503516288093, + "grad_norm": 0.831936240196228, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 21040 + }, + { + "epoch": 3.4031202004688383, + "grad_norm": 0.7388373613357544, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 21050 + }, + { + "epoch": 3.4047368846495836, + "grad_norm": 0.938667356967926, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 21060 + }, + { + "epoch": 3.406353568830329, + "grad_norm": 0.9202313423156738, + "learning_rate": 0.0002, + "loss": 0.6121, + "step": 21070 + }, + { + "epoch": 3.4079702530110745, + "grad_norm": 0.9888381958007812, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 21080 + }, + { + "epoch": 3.4095869371918197, + "grad_norm": 0.8526970744132996, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 21090 + }, + { + "epoch": 3.411203621372565, + "grad_norm": 0.7939383387565613, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 21100 + }, + { + "epoch": 3.41282030555331, + "grad_norm": 0.9986352920532227, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 21110 + }, + { + "epoch": 3.4144369897340554, + "grad_norm": 0.8895300030708313, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 21120 + }, + { + "epoch": 3.4160536739148006, + "grad_norm": 0.9559482932090759, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 21130 + }, + { + "epoch": 3.417670358095546, + "grad_norm": 0.8351506590843201, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 21140 + }, + { + "epoch": 3.4192870422762915, + "grad_norm": 0.8224456906318665, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 21150 + }, + { + "epoch": 3.4209037264570368, + "grad_norm": 1.0110299587249756, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 21160 + }, + { + "epoch": 3.422520410637782, + "grad_norm": 0.82564777135849, + "learning_rate": 0.0002, + "loss": 0.6116, + "step": 21170 + }, + { + "epoch": 3.4241370948185272, + "grad_norm": 1.004738688468933, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 21180 + }, + { + "epoch": 3.4257537789992725, + "grad_norm": 0.7545676827430725, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 21190 + }, + { + "epoch": 3.4273704631800177, + "grad_norm": 0.8918704390525818, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 21200 + }, + { + "epoch": 3.428987147360763, + "grad_norm": 0.8336876034736633, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 21210 + }, + { + "epoch": 3.430603831541508, + "grad_norm": 0.8928771018981934, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 21220 + }, + { + "epoch": 3.432220515722254, + "grad_norm": 0.7663705945014954, + "learning_rate": 0.0002, + "loss": 0.6468, + "step": 21230 + }, + { + "epoch": 3.433837199902999, + "grad_norm": 0.8392598628997803, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 21240 + }, + { + "epoch": 3.4354538840837443, + "grad_norm": 0.8819600343704224, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 21250 + }, + { + "epoch": 3.4370705682644895, + "grad_norm": 0.9124642014503479, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 21260 + }, + { + "epoch": 3.4386872524452348, + "grad_norm": 0.8329763412475586, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 21270 + }, + { + "epoch": 3.44030393662598, + "grad_norm": 0.9982839822769165, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 21280 + }, + { + "epoch": 3.4419206208067252, + "grad_norm": 0.9105954766273499, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 21290 + }, + { + "epoch": 3.443537304987471, + "grad_norm": 0.8182359337806702, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 21300 + }, + { + "epoch": 3.445153989168216, + "grad_norm": 1.0568904876708984, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 21310 + }, + { + "epoch": 3.4467706733489614, + "grad_norm": 0.968539834022522, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 21320 + }, + { + "epoch": 3.4483873575297066, + "grad_norm": 0.8774511218070984, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 21330 + }, + { + "epoch": 3.450004041710452, + "grad_norm": 0.7598156332969666, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 21340 + }, + { + "epoch": 3.451620725891197, + "grad_norm": 1.1012897491455078, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 21350 + }, + { + "epoch": 3.4532374100719423, + "grad_norm": 0.8040637373924255, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 21360 + }, + { + "epoch": 3.4548540942526875, + "grad_norm": 0.8497496247291565, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 21370 + }, + { + "epoch": 3.456470778433433, + "grad_norm": 0.8429915904998779, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 21380 + }, + { + "epoch": 3.4580874626141784, + "grad_norm": 0.8107112646102905, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 21390 + }, + { + "epoch": 3.4597041467949237, + "grad_norm": 1.00872004032135, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 21400 + }, + { + "epoch": 3.461320830975669, + "grad_norm": 0.8266542553901672, + "learning_rate": 0.0002, + "loss": 0.5712, + "step": 21410 + }, + { + "epoch": 3.462937515156414, + "grad_norm": 0.8972568511962891, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 21420 + }, + { + "epoch": 3.4645541993371594, + "grad_norm": 1.0781476497650146, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 21430 + }, + { + "epoch": 3.4661708835179046, + "grad_norm": 0.9571592807769775, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 21440 + }, + { + "epoch": 3.4677875676986503, + "grad_norm": 0.881547212600708, + "learning_rate": 0.0002, + "loss": 0.6309, + "step": 21450 + }, + { + "epoch": 3.4694042518793955, + "grad_norm": 0.6955338716506958, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 21460 + }, + { + "epoch": 3.4710209360601407, + "grad_norm": 0.901187539100647, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 21470 + }, + { + "epoch": 3.472637620240886, + "grad_norm": 0.7063511610031128, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 21480 + }, + { + "epoch": 3.474254304421631, + "grad_norm": 0.8462792038917542, + "learning_rate": 0.0002, + "loss": 0.6154, + "step": 21490 + }, + { + "epoch": 3.4758709886023764, + "grad_norm": 1.1861060857772827, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 21500 + }, + { + "epoch": 3.4774876727831217, + "grad_norm": 0.70503169298172, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 21510 + }, + { + "epoch": 3.479104356963867, + "grad_norm": 0.9650066494941711, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 21520 + }, + { + "epoch": 3.4807210411446126, + "grad_norm": 1.0266852378845215, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 21530 + }, + { + "epoch": 3.482337725325358, + "grad_norm": 0.956372857093811, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 21540 + }, + { + "epoch": 3.483954409506103, + "grad_norm": 0.8848432898521423, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 21550 + }, + { + "epoch": 3.4855710936868483, + "grad_norm": 1.0805351734161377, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 21560 + }, + { + "epoch": 3.4871877778675935, + "grad_norm": 0.9279725551605225, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 21570 + }, + { + "epoch": 3.4888044620483387, + "grad_norm": 0.9049562215805054, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 21580 + }, + { + "epoch": 3.4904211462290844, + "grad_norm": 0.9619429111480713, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 21590 + }, + { + "epoch": 3.4920378304098296, + "grad_norm": 0.8508906960487366, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 21600 + }, + { + "epoch": 3.493654514590575, + "grad_norm": 0.8692502379417419, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 21610 + }, + { + "epoch": 3.49527119877132, + "grad_norm": 0.8187332153320312, + "learning_rate": 0.0002, + "loss": 0.5878, + "step": 21620 + }, + { + "epoch": 3.4968878829520653, + "grad_norm": 1.145400047302246, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 21630 + }, + { + "epoch": 3.4985045671328105, + "grad_norm": 0.8281388282775879, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 21640 + }, + { + "epoch": 3.500121251313556, + "grad_norm": 0.82256019115448, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 21650 + }, + { + "epoch": 3.501737935494301, + "grad_norm": 0.9315484762191772, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 21660 + }, + { + "epoch": 3.5033546196750462, + "grad_norm": 0.7626111507415771, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 21670 + }, + { + "epoch": 3.504971303855792, + "grad_norm": 0.9275059103965759, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 21680 + }, + { + "epoch": 3.506587988036537, + "grad_norm": 0.7906724810600281, + "learning_rate": 0.0002, + "loss": 0.64, + "step": 21690 + }, + { + "epoch": 3.5082046722172824, + "grad_norm": 0.8289761543273926, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 21700 + }, + { + "epoch": 3.5098213563980276, + "grad_norm": 0.8316431045532227, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 21710 + }, + { + "epoch": 3.511438040578773, + "grad_norm": 1.0451812744140625, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 21720 + }, + { + "epoch": 3.513054724759518, + "grad_norm": 0.928252637386322, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 21730 + }, + { + "epoch": 3.5146714089402638, + "grad_norm": 0.7985895276069641, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 21740 + }, + { + "epoch": 3.516288093121009, + "grad_norm": 0.6740974187850952, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 21750 + }, + { + "epoch": 3.517904777301754, + "grad_norm": 0.8482223749160767, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 21760 + }, + { + "epoch": 3.5195214614824994, + "grad_norm": 0.889947772026062, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 21770 + }, + { + "epoch": 3.5211381456632447, + "grad_norm": 0.8304598927497864, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 21780 + }, + { + "epoch": 3.52275482984399, + "grad_norm": 0.8002981543540955, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 21790 + }, + { + "epoch": 3.524371514024735, + "grad_norm": 0.8115083575248718, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 21800 + }, + { + "epoch": 3.5259881982054804, + "grad_norm": 0.9715048670768738, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 21810 + }, + { + "epoch": 3.5276048823862256, + "grad_norm": 1.0910786390304565, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 21820 + }, + { + "epoch": 3.5292215665669713, + "grad_norm": 0.8438942432403564, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 21830 + }, + { + "epoch": 3.5308382507477165, + "grad_norm": 0.8813382983207703, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 21840 + }, + { + "epoch": 3.5324549349284617, + "grad_norm": 0.7092908024787903, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 21850 + }, + { + "epoch": 3.534071619109207, + "grad_norm": 0.8332187533378601, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 21860 + }, + { + "epoch": 3.535688303289952, + "grad_norm": 0.8958209156990051, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 21870 + }, + { + "epoch": 3.5373049874706974, + "grad_norm": 0.824138879776001, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 21880 + }, + { + "epoch": 3.538921671651443, + "grad_norm": 0.8375158309936523, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 21890 + }, + { + "epoch": 3.5405383558321883, + "grad_norm": 1.0274608135223389, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 21900 + }, + { + "epoch": 3.5421550400129336, + "grad_norm": 0.7088932394981384, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 21910 + }, + { + "epoch": 3.543771724193679, + "grad_norm": 0.8172445297241211, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 21920 + }, + { + "epoch": 3.545388408374424, + "grad_norm": 0.9904135465621948, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 21930 + }, + { + "epoch": 3.5470050925551693, + "grad_norm": 0.9900432229042053, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 21940 + }, + { + "epoch": 3.5486217767359145, + "grad_norm": 0.8963301181793213, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 21950 + }, + { + "epoch": 3.5502384609166597, + "grad_norm": 0.8551464676856995, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 21960 + }, + { + "epoch": 3.551855145097405, + "grad_norm": 1.0916603803634644, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 21970 + }, + { + "epoch": 3.5534718292781506, + "grad_norm": 0.841598391532898, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 21980 + }, + { + "epoch": 3.555088513458896, + "grad_norm": 0.8566757440567017, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 21990 + }, + { + "epoch": 3.556705197639641, + "grad_norm": 1.0145052671432495, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 22000 + }, + { + "epoch": 3.5583218818203863, + "grad_norm": 0.9293754696846008, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 22010 + }, + { + "epoch": 3.5599385660011316, + "grad_norm": 0.9568536281585693, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 22020 + }, + { + "epoch": 3.5615552501818772, + "grad_norm": 0.8613139986991882, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 22030 + }, + { + "epoch": 3.5631719343626225, + "grad_norm": 0.8179237246513367, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 22040 + }, + { + "epoch": 3.5647886185433677, + "grad_norm": 0.9059830904006958, + "learning_rate": 0.0002, + "loss": 0.574, + "step": 22050 + }, + { + "epoch": 3.566405302724113, + "grad_norm": 1.0068252086639404, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 22060 + }, + { + "epoch": 3.568021986904858, + "grad_norm": 0.9682072997093201, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 22070 + }, + { + "epoch": 3.5696386710856034, + "grad_norm": 0.8514005541801453, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 22080 + }, + { + "epoch": 3.5712553552663486, + "grad_norm": 0.8327770829200745, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 22090 + }, + { + "epoch": 3.572872039447094, + "grad_norm": 1.024976372718811, + "learning_rate": 0.0002, + "loss": 0.5976, + "step": 22100 + }, + { + "epoch": 3.574488723627839, + "grad_norm": 0.7721174955368042, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 22110 + }, + { + "epoch": 3.5761054078085843, + "grad_norm": 1.0351054668426514, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 22120 + }, + { + "epoch": 3.57772209198933, + "grad_norm": 0.9680907130241394, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 22130 + }, + { + "epoch": 3.5793387761700752, + "grad_norm": 0.8016974925994873, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 22140 + }, + { + "epoch": 3.5809554603508205, + "grad_norm": 1.0109003782272339, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 22150 + }, + { + "epoch": 3.5825721445315657, + "grad_norm": 1.0473392009735107, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 22160 + }, + { + "epoch": 3.584188828712311, + "grad_norm": 0.8686613440513611, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 22170 + }, + { + "epoch": 3.5858055128930566, + "grad_norm": 0.869149923324585, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 22180 + }, + { + "epoch": 3.587422197073802, + "grad_norm": 0.9769062995910645, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 22190 + }, + { + "epoch": 3.589038881254547, + "grad_norm": 0.779636561870575, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 22200 + }, + { + "epoch": 3.5906555654352923, + "grad_norm": 0.9063841104507446, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 22210 + }, + { + "epoch": 3.5922722496160375, + "grad_norm": 0.9216037392616272, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 22220 + }, + { + "epoch": 3.5938889337967828, + "grad_norm": 1.0217336416244507, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 22230 + }, + { + "epoch": 3.595505617977528, + "grad_norm": 0.8513161540031433, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 22240 + }, + { + "epoch": 3.597122302158273, + "grad_norm": 0.8084813952445984, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 22250 + }, + { + "epoch": 3.5987389863390185, + "grad_norm": 0.8524802923202515, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 22260 + }, + { + "epoch": 3.600355670519764, + "grad_norm": 0.9356237649917603, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 22270 + }, + { + "epoch": 3.6019723547005094, + "grad_norm": 1.009600281715393, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 22280 + }, + { + "epoch": 3.6035890388812546, + "grad_norm": 0.9900581240653992, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 22290 + }, + { + "epoch": 3.605205723062, + "grad_norm": 1.062495231628418, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 22300 + }, + { + "epoch": 3.606822407242745, + "grad_norm": 0.8832381367683411, + "learning_rate": 0.0002, + "loss": 0.607, + "step": 22310 + }, + { + "epoch": 3.6084390914234903, + "grad_norm": 0.9284297823905945, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 22320 + }, + { + "epoch": 3.610055775604236, + "grad_norm": 1.2381829023361206, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 22330 + }, + { + "epoch": 3.611672459784981, + "grad_norm": 0.929434597492218, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 22340 + }, + { + "epoch": 3.6132891439657264, + "grad_norm": 0.9714490175247192, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 22350 + }, + { + "epoch": 3.6149058281464717, + "grad_norm": 0.808014988899231, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 22360 + }, + { + "epoch": 3.616522512327217, + "grad_norm": 1.0364398956298828, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 22370 + }, + { + "epoch": 3.618139196507962, + "grad_norm": 0.7858489751815796, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 22380 + }, + { + "epoch": 3.6197558806887074, + "grad_norm": 0.9920870065689087, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 22390 + }, + { + "epoch": 3.6213725648694526, + "grad_norm": 0.9183220863342285, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 22400 + }, + { + "epoch": 3.622989249050198, + "grad_norm": 0.9826246500015259, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 22410 + }, + { + "epoch": 3.6246059332309435, + "grad_norm": 0.8632931113243103, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 22420 + }, + { + "epoch": 3.6262226174116887, + "grad_norm": 0.8468965291976929, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 22430 + }, + { + "epoch": 3.627839301592434, + "grad_norm": 0.8466871976852417, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 22440 + }, + { + "epoch": 3.629455985773179, + "grad_norm": 0.9501169919967651, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 22450 + }, + { + "epoch": 3.6310726699539244, + "grad_norm": 0.8906720876693726, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 22460 + }, + { + "epoch": 3.6326893541346696, + "grad_norm": 0.7400227189064026, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 22470 + }, + { + "epoch": 3.6343060383154153, + "grad_norm": 0.9756355881690979, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 22480 + }, + { + "epoch": 3.6359227224961606, + "grad_norm": 0.7504993081092834, + "learning_rate": 0.0002, + "loss": 0.6203, + "step": 22490 + }, + { + "epoch": 3.637539406676906, + "grad_norm": 0.9270039200782776, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 22500 + }, + { + "epoch": 3.639156090857651, + "grad_norm": 0.8841686844825745, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 22510 + }, + { + "epoch": 3.6407727750383962, + "grad_norm": 0.8533213138580322, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 22520 + }, + { + "epoch": 3.6423894592191415, + "grad_norm": 1.0052043199539185, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 22530 + }, + { + "epoch": 3.6440061433998867, + "grad_norm": 1.0323461294174194, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 22540 + }, + { + "epoch": 3.645622827580632, + "grad_norm": 0.8654312491416931, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 22550 + }, + { + "epoch": 3.647239511761377, + "grad_norm": 0.6400038003921509, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 22560 + }, + { + "epoch": 3.648856195942123, + "grad_norm": 0.8061298727989197, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 22570 + }, + { + "epoch": 3.650472880122868, + "grad_norm": 0.9257854223251343, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 22580 + }, + { + "epoch": 3.6520895643036133, + "grad_norm": 0.8439396619796753, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 22590 + }, + { + "epoch": 3.6537062484843585, + "grad_norm": 0.7764544486999512, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 22600 + }, + { + "epoch": 3.6553229326651038, + "grad_norm": 1.125451683998108, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 22610 + }, + { + "epoch": 3.656939616845849, + "grad_norm": 0.7523018717765808, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 22620 + }, + { + "epoch": 3.6585563010265947, + "grad_norm": 1.071026086807251, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 22630 + }, + { + "epoch": 3.66017298520734, + "grad_norm": 0.945791482925415, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 22640 + }, + { + "epoch": 3.661789669388085, + "grad_norm": 0.8001811504364014, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 22650 + }, + { + "epoch": 3.6634063535688304, + "grad_norm": 0.9700816869735718, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 22660 + }, + { + "epoch": 3.6650230377495756, + "grad_norm": 0.9053242206573486, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 22670 + }, + { + "epoch": 3.666639721930321, + "grad_norm": 0.944362461566925, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 22680 + }, + { + "epoch": 3.668256406111066, + "grad_norm": 1.067489504814148, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 22690 + }, + { + "epoch": 3.6698730902918113, + "grad_norm": 1.0984995365142822, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 22700 + }, + { + "epoch": 3.6714897744725565, + "grad_norm": 0.9336317777633667, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 22710 + }, + { + "epoch": 3.673106458653302, + "grad_norm": 0.9261918663978577, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 22720 + }, + { + "epoch": 3.6747231428340474, + "grad_norm": 0.8648008704185486, + "learning_rate": 0.0002, + "loss": 0.6332, + "step": 22730 + }, + { + "epoch": 3.6763398270147927, + "grad_norm": 0.7225083708763123, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 22740 + }, + { + "epoch": 3.677956511195538, + "grad_norm": 0.9258282780647278, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 22750 + }, + { + "epoch": 3.679573195376283, + "grad_norm": 0.70876145362854, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 22760 + }, + { + "epoch": 3.681189879557029, + "grad_norm": 0.8780210018157959, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 22770 + }, + { + "epoch": 3.682806563737774, + "grad_norm": 0.8075440526008606, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 22780 + }, + { + "epoch": 3.6844232479185193, + "grad_norm": 0.8503130674362183, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 22790 + }, + { + "epoch": 3.6860399320992645, + "grad_norm": 0.8413618206977844, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 22800 + }, + { + "epoch": 3.6876566162800097, + "grad_norm": 0.8675165176391602, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 22810 + }, + { + "epoch": 3.689273300460755, + "grad_norm": 0.8235884308815002, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 22820 + }, + { + "epoch": 3.6908899846415, + "grad_norm": 0.9477725625038147, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 22830 + }, + { + "epoch": 3.6925066688222454, + "grad_norm": 0.7883533835411072, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 22840 + }, + { + "epoch": 3.6941233530029907, + "grad_norm": 1.047913908958435, + "learning_rate": 0.0002, + "loss": 0.5872, + "step": 22850 + }, + { + "epoch": 3.695740037183736, + "grad_norm": 0.9171528816223145, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 22860 + }, + { + "epoch": 3.6973567213644816, + "grad_norm": 0.9338192343711853, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 22870 + }, + { + "epoch": 3.698973405545227, + "grad_norm": 0.8799443244934082, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 22880 + }, + { + "epoch": 3.700590089725972, + "grad_norm": 0.8515434861183167, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 22890 + }, + { + "epoch": 3.7022067739067173, + "grad_norm": 0.7805591821670532, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 22900 + }, + { + "epoch": 3.7038234580874625, + "grad_norm": 0.8470911979675293, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 22910 + }, + { + "epoch": 3.705440142268208, + "grad_norm": 0.9452309012413025, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 22920 + }, + { + "epoch": 3.7070568264489534, + "grad_norm": 0.950243353843689, + "learning_rate": 0.0002, + "loss": 0.6529, + "step": 22930 + }, + { + "epoch": 3.7086735106296986, + "grad_norm": 0.7882499098777771, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 22940 + }, + { + "epoch": 3.710290194810444, + "grad_norm": 0.8307787775993347, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 22950 + }, + { + "epoch": 3.711906878991189, + "grad_norm": 1.0970630645751953, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 22960 + }, + { + "epoch": 3.7135235631719343, + "grad_norm": 0.8269566297531128, + "learning_rate": 0.0002, + "loss": 0.6281, + "step": 22970 + }, + { + "epoch": 3.7151402473526796, + "grad_norm": 0.8306704759597778, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 22980 + }, + { + "epoch": 3.716756931533425, + "grad_norm": 0.9710225462913513, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 22990 + }, + { + "epoch": 3.71837361571417, + "grad_norm": 0.8890530467033386, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 23000 + }, + { + "epoch": 3.7199902998949153, + "grad_norm": 0.883522629737854, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 23010 + }, + { + "epoch": 3.721606984075661, + "grad_norm": 0.8662652373313904, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 23020 + }, + { + "epoch": 3.723223668256406, + "grad_norm": 0.7228406667709351, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 23030 + }, + { + "epoch": 3.7248403524371514, + "grad_norm": 1.060792088508606, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 23040 + }, + { + "epoch": 3.7264570366178966, + "grad_norm": 1.0119613409042358, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 23050 + }, + { + "epoch": 3.728073720798642, + "grad_norm": 0.9212996959686279, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 23060 + }, + { + "epoch": 3.7296904049793875, + "grad_norm": 0.925690233707428, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 23070 + }, + { + "epoch": 3.7313070891601328, + "grad_norm": 0.8323310613632202, + "learning_rate": 0.0002, + "loss": 0.615, + "step": 23080 + }, + { + "epoch": 3.732923773340878, + "grad_norm": 0.8966048955917358, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 23090 + }, + { + "epoch": 3.7345404575216232, + "grad_norm": 0.8995837569236755, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 23100 + }, + { + "epoch": 3.7361571417023685, + "grad_norm": 0.8748890161514282, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 23110 + }, + { + "epoch": 3.7377738258831137, + "grad_norm": 0.7985540628433228, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 23120 + }, + { + "epoch": 3.739390510063859, + "grad_norm": 1.0240917205810547, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 23130 + }, + { + "epoch": 3.741007194244604, + "grad_norm": 0.9181789755821228, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 23140 + }, + { + "epoch": 3.7426238784253494, + "grad_norm": 0.8896583914756775, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 23150 + }, + { + "epoch": 3.744240562606095, + "grad_norm": 0.8635515570640564, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 23160 + }, + { + "epoch": 3.7458572467868403, + "grad_norm": 0.8873575329780579, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 23170 + }, + { + "epoch": 3.7474739309675855, + "grad_norm": 0.9807148575782776, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 23180 + }, + { + "epoch": 3.7490906151483308, + "grad_norm": 0.900477945804596, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 23190 + }, + { + "epoch": 3.750707299329076, + "grad_norm": 0.9379992485046387, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 23200 + }, + { + "epoch": 3.752323983509821, + "grad_norm": 0.9649890661239624, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 23210 + }, + { + "epoch": 3.753940667690567, + "grad_norm": 0.824442446231842, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 23220 + }, + { + "epoch": 3.755557351871312, + "grad_norm": 0.8896150588989258, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 23230 + }, + { + "epoch": 3.7571740360520574, + "grad_norm": 0.751249372959137, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 23240 + }, + { + "epoch": 3.7587907202328026, + "grad_norm": 0.9392193555831909, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 23250 + }, + { + "epoch": 3.760407404413548, + "grad_norm": 0.9284586310386658, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 23260 + }, + { + "epoch": 3.762024088594293, + "grad_norm": 0.7738175392150879, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 23270 + }, + { + "epoch": 3.7636407727750383, + "grad_norm": 0.9252978563308716, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 23280 + }, + { + "epoch": 3.7652574569557835, + "grad_norm": 0.9501895904541016, + "learning_rate": 0.0002, + "loss": 0.5984, + "step": 23290 + }, + { + "epoch": 3.7668741411365287, + "grad_norm": 0.9416276216506958, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 23300 + }, + { + "epoch": 3.7684908253172744, + "grad_norm": 0.7076631784439087, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 23310 + }, + { + "epoch": 3.7701075094980196, + "grad_norm": 0.9864492416381836, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 23320 + }, + { + "epoch": 3.771724193678765, + "grad_norm": 0.8450456261634827, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 23330 + }, + { + "epoch": 3.77334087785951, + "grad_norm": 1.0768941640853882, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 23340 + }, + { + "epoch": 3.7749575620402553, + "grad_norm": 0.9956819415092468, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 23350 + }, + { + "epoch": 3.7765742462210006, + "grad_norm": 0.9234658479690552, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 23360 + }, + { + "epoch": 3.7781909304017463, + "grad_norm": 1.0993858575820923, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 23370 + }, + { + "epoch": 3.7798076145824915, + "grad_norm": 0.923159658908844, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 23380 + }, + { + "epoch": 3.7814242987632367, + "grad_norm": 0.9311541318893433, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 23390 + }, + { + "epoch": 3.783040982943982, + "grad_norm": 0.919681191444397, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 23400 + }, + { + "epoch": 3.784657667124727, + "grad_norm": 1.7406195402145386, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 23410 + }, + { + "epoch": 3.7862743513054724, + "grad_norm": 0.7789074182510376, + "learning_rate": 0.0002, + "loss": 0.657, + "step": 23420 + }, + { + "epoch": 3.7878910354862176, + "grad_norm": 0.8302814960479736, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 23430 + }, + { + "epoch": 3.789507719666963, + "grad_norm": 0.8089349269866943, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 23440 + }, + { + "epoch": 3.791124403847708, + "grad_norm": 0.9006284475326538, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 23450 + }, + { + "epoch": 3.7927410880284538, + "grad_norm": 0.8426766991615295, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 23460 + }, + { + "epoch": 3.794357772209199, + "grad_norm": 1.2576252222061157, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 23470 + }, + { + "epoch": 3.7959744563899442, + "grad_norm": 1.0307610034942627, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 23480 + }, + { + "epoch": 3.7975911405706895, + "grad_norm": 0.8525972962379456, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 23490 + }, + { + "epoch": 3.7992078247514347, + "grad_norm": 1.159039855003357, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 23500 + }, + { + "epoch": 3.80082450893218, + "grad_norm": 1.4193549156188965, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 23510 + }, + { + "epoch": 3.8024411931129256, + "grad_norm": 0.8245543837547302, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 23520 + }, + { + "epoch": 3.804057877293671, + "grad_norm": 0.8847230076789856, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 23530 + }, + { + "epoch": 3.805674561474416, + "grad_norm": 0.9574624300003052, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 23540 + }, + { + "epoch": 3.8072912456551613, + "grad_norm": 1.048020601272583, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 23550 + }, + { + "epoch": 3.8089079298359065, + "grad_norm": 0.8302255868911743, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 23560 + }, + { + "epoch": 3.8105246140166518, + "grad_norm": 0.8269215822219849, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 23570 + }, + { + "epoch": 3.812141298197397, + "grad_norm": 0.9375753402709961, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 23580 + }, + { + "epoch": 3.8137579823781422, + "grad_norm": 1.0234097242355347, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 23590 + }, + { + "epoch": 3.8153746665588875, + "grad_norm": 0.8978445529937744, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 23600 + }, + { + "epoch": 3.816991350739633, + "grad_norm": 0.7929515838623047, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 23610 + }, + { + "epoch": 3.8186080349203784, + "grad_norm": 1.3255881071090698, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 23620 + }, + { + "epoch": 3.8202247191011236, + "grad_norm": 0.9188598990440369, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 23630 + }, + { + "epoch": 3.821841403281869, + "grad_norm": 0.8811675906181335, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 23640 + }, + { + "epoch": 3.823458087462614, + "grad_norm": 0.8061038255691528, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 23650 + }, + { + "epoch": 3.8250747716433597, + "grad_norm": 0.9975376129150391, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 23660 + }, + { + "epoch": 3.826691455824105, + "grad_norm": 0.8036105036735535, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 23670 + }, + { + "epoch": 3.82830814000485, + "grad_norm": 0.7401984333992004, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 23680 + }, + { + "epoch": 3.8299248241855954, + "grad_norm": 0.829753041267395, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 23690 + }, + { + "epoch": 3.8315415083663407, + "grad_norm": 0.8753240704536438, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 23700 + }, + { + "epoch": 3.833158192547086, + "grad_norm": 0.8157842755317688, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 23710 + }, + { + "epoch": 3.834774876727831, + "grad_norm": 0.6183798909187317, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 23720 + }, + { + "epoch": 3.8363915609085764, + "grad_norm": 0.9548442363739014, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 23730 + }, + { + "epoch": 3.8380082450893216, + "grad_norm": 0.8319669961929321, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 23740 + }, + { + "epoch": 3.839624929270067, + "grad_norm": 0.9718693494796753, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 23750 + }, + { + "epoch": 3.8412416134508125, + "grad_norm": 0.8672235012054443, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 23760 + }, + { + "epoch": 3.8428582976315577, + "grad_norm": 1.1210707426071167, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 23770 + }, + { + "epoch": 3.844474981812303, + "grad_norm": 0.9177767634391785, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 23780 + }, + { + "epoch": 3.846091665993048, + "grad_norm": 0.8714171648025513, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 23790 + }, + { + "epoch": 3.8477083501737934, + "grad_norm": 1.1853246688842773, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 23800 + }, + { + "epoch": 3.849325034354539, + "grad_norm": 0.8091260194778442, + "learning_rate": 0.0002, + "loss": 0.6144, + "step": 23810 + }, + { + "epoch": 3.8509417185352843, + "grad_norm": 0.9710774421691895, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 23820 + }, + { + "epoch": 3.8525584027160296, + "grad_norm": 0.7648707628250122, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 23830 + }, + { + "epoch": 3.854175086896775, + "grad_norm": 0.7809253931045532, + "learning_rate": 0.0002, + "loss": 0.6013, + "step": 23840 + }, + { + "epoch": 3.85579177107752, + "grad_norm": 0.8337951898574829, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 23850 + }, + { + "epoch": 3.8574084552582653, + "grad_norm": 0.9271913170814514, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 23860 + }, + { + "epoch": 3.8590251394390105, + "grad_norm": 0.985334038734436, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 23870 + }, + { + "epoch": 3.8606418236197557, + "grad_norm": 0.8458583354949951, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 23880 + }, + { + "epoch": 3.862258507800501, + "grad_norm": 1.015348196029663, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 23890 + }, + { + "epoch": 3.8638751919812466, + "grad_norm": 1.0121688842773438, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 23900 + }, + { + "epoch": 3.865491876161992, + "grad_norm": 0.8883971571922302, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 23910 + }, + { + "epoch": 3.867108560342737, + "grad_norm": 1.028086543083191, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 23920 + }, + { + "epoch": 3.8687252445234823, + "grad_norm": 0.9645734429359436, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 23930 + }, + { + "epoch": 3.8703419287042276, + "grad_norm": 0.8235350251197815, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 23940 + }, + { + "epoch": 3.871958612884973, + "grad_norm": 1.0298916101455688, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 23950 + }, + { + "epoch": 3.8735752970657185, + "grad_norm": 1.0063377618789673, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 23960 + }, + { + "epoch": 3.8751919812464637, + "grad_norm": 0.9230626821517944, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 23970 + }, + { + "epoch": 3.876808665427209, + "grad_norm": 0.9243063926696777, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 23980 + }, + { + "epoch": 3.878425349607954, + "grad_norm": 1.0211291313171387, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 23990 + }, + { + "epoch": 3.8800420337886994, + "grad_norm": 0.7800535559654236, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 24000 + }, + { + "epoch": 3.8816587179694446, + "grad_norm": 0.7904248833656311, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 24010 + }, + { + "epoch": 3.88327540215019, + "grad_norm": 1.1975988149642944, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 24020 + }, + { + "epoch": 3.884892086330935, + "grad_norm": 1.0626593828201294, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 24030 + }, + { + "epoch": 3.8865087705116803, + "grad_norm": 0.9012193083763123, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 24040 + }, + { + "epoch": 3.888125454692426, + "grad_norm": 1.1159172058105469, + "learning_rate": 0.0002, + "loss": 0.6786, + "step": 24050 + }, + { + "epoch": 3.889742138873171, + "grad_norm": 1.276838779449463, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 24060 + }, + { + "epoch": 3.8913588230539164, + "grad_norm": 0.8467690348625183, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 24070 + }, + { + "epoch": 3.8929755072346617, + "grad_norm": 0.9862841963768005, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 24080 + }, + { + "epoch": 3.894592191415407, + "grad_norm": 0.7134621739387512, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 24090 + }, + { + "epoch": 3.896208875596152, + "grad_norm": 0.8178175091743469, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 24100 + }, + { + "epoch": 3.897825559776898, + "grad_norm": 0.9229172468185425, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 24110 + }, + { + "epoch": 3.899442243957643, + "grad_norm": 1.0878316164016724, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 24120 + }, + { + "epoch": 3.9010589281383883, + "grad_norm": 0.971645712852478, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 24130 + }, + { + "epoch": 3.9026756123191335, + "grad_norm": 0.8862188458442688, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 24140 + }, + { + "epoch": 3.9042922964998787, + "grad_norm": 0.9126982688903809, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 24150 + }, + { + "epoch": 3.905908980680624, + "grad_norm": 0.8833470940589905, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 24160 + }, + { + "epoch": 3.907525664861369, + "grad_norm": 0.8320947885513306, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 24170 + }, + { + "epoch": 3.9091423490421144, + "grad_norm": 0.9156602025032043, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 24180 + }, + { + "epoch": 3.9107590332228597, + "grad_norm": 1.029181957244873, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 24190 + }, + { + "epoch": 3.9123757174036053, + "grad_norm": 0.9052802324295044, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 24200 + }, + { + "epoch": 3.9139924015843506, + "grad_norm": 0.8847255110740662, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 24210 + }, + { + "epoch": 3.915609085765096, + "grad_norm": 0.9642062187194824, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 24220 + }, + { + "epoch": 3.917225769945841, + "grad_norm": 0.8629093766212463, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 24230 + }, + { + "epoch": 3.9188424541265863, + "grad_norm": 0.8674976825714111, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 24240 + }, + { + "epoch": 3.9204591383073315, + "grad_norm": 1.104846477508545, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 24250 + }, + { + "epoch": 3.922075822488077, + "grad_norm": 1.0874955654144287, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 24260 + }, + { + "epoch": 3.9236925066688224, + "grad_norm": 0.8689812421798706, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 24270 + }, + { + "epoch": 3.9253091908495676, + "grad_norm": 0.9724617004394531, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 24280 + }, + { + "epoch": 3.926925875030313, + "grad_norm": 0.9165538549423218, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 24290 + }, + { + "epoch": 3.928542559211058, + "grad_norm": 0.9307710528373718, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 24300 + }, + { + "epoch": 3.9301592433918033, + "grad_norm": 0.8589295148849487, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 24310 + }, + { + "epoch": 3.9317759275725486, + "grad_norm": 0.9151099920272827, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 24320 + }, + { + "epoch": 3.933392611753294, + "grad_norm": 0.9633517265319824, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 24330 + }, + { + "epoch": 3.935009295934039, + "grad_norm": 0.9521116018295288, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 24340 + }, + { + "epoch": 3.9366259801147847, + "grad_norm": 0.8366776704788208, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 24350 + }, + { + "epoch": 3.93824266429553, + "grad_norm": 0.8972663283348083, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 24360 + }, + { + "epoch": 3.939859348476275, + "grad_norm": 0.8102919459342957, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 24370 + }, + { + "epoch": 3.9414760326570204, + "grad_norm": 0.8189975023269653, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 24380 + }, + { + "epoch": 3.9430927168377656, + "grad_norm": 0.9569464921951294, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 24390 + }, + { + "epoch": 3.9447094010185113, + "grad_norm": 0.7459101676940918, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 24400 + }, + { + "epoch": 3.9463260851992565, + "grad_norm": 0.8536974787712097, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 24410 + }, + { + "epoch": 3.9479427693800018, + "grad_norm": 0.8763698935508728, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 24420 + }, + { + "epoch": 3.949559453560747, + "grad_norm": 0.9381106495857239, + "learning_rate": 0.0002, + "loss": 0.6478, + "step": 24430 + }, + { + "epoch": 3.9511761377414922, + "grad_norm": 0.934440016746521, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 24440 + }, + { + "epoch": 3.9527928219222375, + "grad_norm": 0.903918981552124, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 24450 + }, + { + "epoch": 3.9544095061029827, + "grad_norm": 0.8771953582763672, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 24460 + }, + { + "epoch": 3.956026190283728, + "grad_norm": 1.0375410318374634, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 24470 + }, + { + "epoch": 3.957642874464473, + "grad_norm": 0.9439185261726379, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 24480 + }, + { + "epoch": 3.9592595586452184, + "grad_norm": 0.935467004776001, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 24490 + }, + { + "epoch": 3.960876242825964, + "grad_norm": 0.6900772452354431, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 24500 + }, + { + "epoch": 3.9624929270067093, + "grad_norm": 1.0172916650772095, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 24510 + }, + { + "epoch": 3.9641096111874545, + "grad_norm": 0.9167046546936035, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 24520 + }, + { + "epoch": 3.9657262953681998, + "grad_norm": 0.7230527997016907, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 24530 + }, + { + "epoch": 3.967342979548945, + "grad_norm": 0.8980403542518616, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 24540 + }, + { + "epoch": 3.9689596637296907, + "grad_norm": 0.8555465936660767, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 24550 + }, + { + "epoch": 3.970576347910436, + "grad_norm": 0.7825445532798767, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 24560 + }, + { + "epoch": 3.972193032091181, + "grad_norm": 0.7273133993148804, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 24570 + }, + { + "epoch": 3.9738097162719264, + "grad_norm": 0.9612047672271729, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 24580 + }, + { + "epoch": 3.9754264004526716, + "grad_norm": 0.9865460991859436, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 24590 + }, + { + "epoch": 3.977043084633417, + "grad_norm": 0.8638762831687927, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 24600 + }, + { + "epoch": 3.978659768814162, + "grad_norm": 1.0096198320388794, + "learning_rate": 0.0002, + "loss": 0.6097, + "step": 24610 + }, + { + "epoch": 3.9802764529949073, + "grad_norm": 0.8475532531738281, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 24620 + }, + { + "epoch": 3.9818931371756525, + "grad_norm": 0.9696195721626282, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 24630 + }, + { + "epoch": 3.9835098213563978, + "grad_norm": 0.7499843239784241, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 24640 + }, + { + "epoch": 3.9851265055371434, + "grad_norm": 0.8865424990653992, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 24650 + }, + { + "epoch": 3.9867431897178887, + "grad_norm": 0.8089959025382996, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 24660 + }, + { + "epoch": 3.988359873898634, + "grad_norm": 0.6946012377738953, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 24670 + }, + { + "epoch": 3.989976558079379, + "grad_norm": 0.7991759181022644, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 24680 + }, + { + "epoch": 3.9915932422601244, + "grad_norm": 0.8803931474685669, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 24690 + }, + { + "epoch": 3.99320992644087, + "grad_norm": 0.8848299980163574, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 24700 + }, + { + "epoch": 3.9948266106216153, + "grad_norm": 0.7448889017105103, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 24710 + }, + { + "epoch": 3.9964432948023605, + "grad_norm": 0.9361620545387268, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 24720 + }, + { + "epoch": 3.9980599789831057, + "grad_norm": 0.9958081245422363, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 24730 + }, + { + "epoch": 3.999676663163851, + "grad_norm": 1.026004672050476, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 24740 + }, + { + "epoch": 4.0, + "eval_loss": 1.1524168252944946, + "eval_runtime": 122.1585, + "eval_samples_per_second": 6.0, + "eval_steps_per_second": 0.753, + "step": 24742 + }, + { + "epoch": 4.001293347344596, + "grad_norm": 1.0664808750152588, + "learning_rate": 0.0002, + "loss": 0.6057, + "step": 24750 + }, + { + "epoch": 4.002910031525341, + "grad_norm": 1.0113720893859863, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 24760 + }, + { + "epoch": 4.004526715706087, + "grad_norm": 0.991486668586731, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 24770 + }, + { + "epoch": 4.006143399886832, + "grad_norm": 0.951754629611969, + "learning_rate": 0.0002, + "loss": 0.508, + "step": 24780 + }, + { + "epoch": 4.007760084067577, + "grad_norm": 1.13059401512146, + "learning_rate": 0.0002, + "loss": 0.5314, + "step": 24790 + }, + { + "epoch": 4.009376768248322, + "grad_norm": 0.9343926310539246, + "learning_rate": 0.0002, + "loss": 0.5323, + "step": 24800 + }, + { + "epoch": 4.010993452429068, + "grad_norm": 1.0680590867996216, + "learning_rate": 0.0002, + "loss": 0.5161, + "step": 24810 + }, + { + "epoch": 4.012610136609814, + "grad_norm": 1.0022706985473633, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 24820 + }, + { + "epoch": 4.014226820790559, + "grad_norm": 1.0285297632217407, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 24830 + }, + { + "epoch": 4.015843504971304, + "grad_norm": 0.8347002863883972, + "learning_rate": 0.0002, + "loss": 0.5311, + "step": 24840 + }, + { + "epoch": 4.017460189152049, + "grad_norm": 0.9675396680831909, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 24850 + }, + { + "epoch": 4.019076873332795, + "grad_norm": 0.9238511323928833, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 24860 + }, + { + "epoch": 4.02069355751354, + "grad_norm": 1.1576941013336182, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 24870 + }, + { + "epoch": 4.022310241694285, + "grad_norm": 0.8583757281303406, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 24880 + }, + { + "epoch": 4.02392692587503, + "grad_norm": 0.9816817045211792, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 24890 + }, + { + "epoch": 4.0255436100557755, + "grad_norm": 0.955073893070221, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 24900 + }, + { + "epoch": 4.027160294236521, + "grad_norm": 1.1054974794387817, + "learning_rate": 0.0002, + "loss": 0.4896, + "step": 24910 + }, + { + "epoch": 4.028776978417266, + "grad_norm": 1.1240060329437256, + "learning_rate": 0.0002, + "loss": 0.5246, + "step": 24920 + }, + { + "epoch": 4.030393662598011, + "grad_norm": 0.9512825012207031, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 24930 + }, + { + "epoch": 4.0320103467787565, + "grad_norm": 0.85965496301651, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 24940 + }, + { + "epoch": 4.033627030959502, + "grad_norm": 0.9378061294555664, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 24950 + }, + { + "epoch": 4.035243715140247, + "grad_norm": 0.9655424356460571, + "learning_rate": 0.0002, + "loss": 0.5008, + "step": 24960 + }, + { + "epoch": 4.036860399320993, + "grad_norm": 1.1393707990646362, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 24970 + }, + { + "epoch": 4.038477083501738, + "grad_norm": 1.0220451354980469, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 24980 + }, + { + "epoch": 4.0400937676824835, + "grad_norm": 0.9785808324813843, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 24990 + }, + { + "epoch": 4.041710451863229, + "grad_norm": 1.0257649421691895, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 25000 + }, + { + "epoch": 4.043327136043974, + "grad_norm": 0.9737892150878906, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 25010 + }, + { + "epoch": 4.044943820224719, + "grad_norm": 0.7416959404945374, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 25020 + }, + { + "epoch": 4.046560504405464, + "grad_norm": 0.7909596562385559, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 25030 + }, + { + "epoch": 4.04817718858621, + "grad_norm": 0.8923130631446838, + "learning_rate": 0.0002, + "loss": 0.5265, + "step": 25040 + }, + { + "epoch": 4.049793872766955, + "grad_norm": 0.9044941663742065, + "learning_rate": 0.0002, + "loss": 0.5035, + "step": 25050 + }, + { + "epoch": 4.0514105569477, + "grad_norm": 0.866352379322052, + "learning_rate": 0.0002, + "loss": 0.5135, + "step": 25060 + }, + { + "epoch": 4.053027241128445, + "grad_norm": 1.544549822807312, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 25070 + }, + { + "epoch": 4.054643925309191, + "grad_norm": 0.8426995277404785, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 25080 + }, + { + "epoch": 4.056260609489936, + "grad_norm": 0.9797548651695251, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 25090 + }, + { + "epoch": 4.057877293670681, + "grad_norm": 0.8468434810638428, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 25100 + }, + { + "epoch": 4.059493977851426, + "grad_norm": 0.9294559955596924, + "learning_rate": 0.0002, + "loss": 0.5242, + "step": 25110 + }, + { + "epoch": 4.061110662032172, + "grad_norm": 0.9686688780784607, + "learning_rate": 0.0002, + "loss": 0.5295, + "step": 25120 + }, + { + "epoch": 4.062727346212918, + "grad_norm": 0.8042728304862976, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 25130 + }, + { + "epoch": 4.064344030393663, + "grad_norm": 1.165160894393921, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 25140 + }, + { + "epoch": 4.065960714574408, + "grad_norm": 1.2161961793899536, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 25150 + }, + { + "epoch": 4.067577398755153, + "grad_norm": 1.0762810707092285, + "learning_rate": 0.0002, + "loss": 0.5217, + "step": 25160 + }, + { + "epoch": 4.069194082935899, + "grad_norm": 0.7580869793891907, + "learning_rate": 0.0002, + "loss": 0.5886, + "step": 25170 + }, + { + "epoch": 4.070810767116644, + "grad_norm": 0.9630117416381836, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 25180 + }, + { + "epoch": 4.072427451297389, + "grad_norm": 0.9049716591835022, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 25190 + }, + { + "epoch": 4.074044135478134, + "grad_norm": 1.1536930799484253, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 25200 + }, + { + "epoch": 4.0756608196588795, + "grad_norm": 0.901461124420166, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 25210 + }, + { + "epoch": 4.077277503839625, + "grad_norm": 1.3318437337875366, + "learning_rate": 0.0002, + "loss": 0.5132, + "step": 25220 + }, + { + "epoch": 4.07889418802037, + "grad_norm": 0.8811455368995667, + "learning_rate": 0.0002, + "loss": 0.5317, + "step": 25230 + }, + { + "epoch": 4.080510872201115, + "grad_norm": 1.0564165115356445, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 25240 + }, + { + "epoch": 4.08212755638186, + "grad_norm": 1.1008027791976929, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 25250 + }, + { + "epoch": 4.083744240562606, + "grad_norm": 1.150097131729126, + "learning_rate": 0.0002, + "loss": 0.5195, + "step": 25260 + }, + { + "epoch": 4.085360924743352, + "grad_norm": 0.9339924454689026, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 25270 + }, + { + "epoch": 4.086977608924097, + "grad_norm": 1.0902045965194702, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 25280 + }, + { + "epoch": 4.088594293104842, + "grad_norm": 0.8483911156654358, + "learning_rate": 0.0002, + "loss": 0.5203, + "step": 25290 + }, + { + "epoch": 4.0902109772855875, + "grad_norm": 0.9477024674415588, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 25300 + }, + { + "epoch": 4.091827661466333, + "grad_norm": 0.9500215649604797, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 25310 + }, + { + "epoch": 4.093444345647078, + "grad_norm": 1.040468454360962, + "learning_rate": 0.0002, + "loss": 0.5045, + "step": 25320 + }, + { + "epoch": 4.095061029827823, + "grad_norm": 0.7457592487335205, + "learning_rate": 0.0002, + "loss": 0.5488, + "step": 25330 + }, + { + "epoch": 4.096677714008568, + "grad_norm": 1.2092097997665405, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 25340 + }, + { + "epoch": 4.098294398189314, + "grad_norm": 0.9652107954025269, + "learning_rate": 0.0002, + "loss": 0.5174, + "step": 25350 + }, + { + "epoch": 4.099911082370059, + "grad_norm": 0.8464955687522888, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 25360 + }, + { + "epoch": 4.101527766550804, + "grad_norm": 0.875026285648346, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 25370 + }, + { + "epoch": 4.103144450731549, + "grad_norm": 0.9241740107536316, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 25380 + }, + { + "epoch": 4.1047611349122946, + "grad_norm": 0.9769546389579773, + "learning_rate": 0.0002, + "loss": 0.5578, + "step": 25390 + }, + { + "epoch": 4.10637781909304, + "grad_norm": 1.1501960754394531, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 25400 + }, + { + "epoch": 4.107994503273786, + "grad_norm": 0.9135243892669678, + "learning_rate": 0.0002, + "loss": 0.5241, + "step": 25410 + }, + { + "epoch": 4.109611187454531, + "grad_norm": 0.9905396103858948, + "learning_rate": 0.0002, + "loss": 0.5152, + "step": 25420 + }, + { + "epoch": 4.111227871635276, + "grad_norm": 0.9845104217529297, + "learning_rate": 0.0002, + "loss": 0.5064, + "step": 25430 + }, + { + "epoch": 4.112844555816022, + "grad_norm": 0.8326883912086487, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 25440 + }, + { + "epoch": 4.114461239996767, + "grad_norm": 0.9264556765556335, + "learning_rate": 0.0002, + "loss": 0.5312, + "step": 25450 + }, + { + "epoch": 4.116077924177512, + "grad_norm": 1.043080449104309, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 25460 + }, + { + "epoch": 4.117694608358257, + "grad_norm": 0.8533386588096619, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 25470 + }, + { + "epoch": 4.1193112925390025, + "grad_norm": 1.0133965015411377, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 25480 + }, + { + "epoch": 4.120927976719748, + "grad_norm": 0.7476310133934021, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 25490 + }, + { + "epoch": 4.122544660900493, + "grad_norm": 1.1247259378433228, + "learning_rate": 0.0002, + "loss": 0.5189, + "step": 25500 + }, + { + "epoch": 4.124161345081238, + "grad_norm": 1.0764678716659546, + "learning_rate": 0.0002, + "loss": 0.5751, + "step": 25510 + }, + { + "epoch": 4.1257780292619834, + "grad_norm": 0.7679798007011414, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 25520 + }, + { + "epoch": 4.127394713442729, + "grad_norm": 0.8877071142196655, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 25530 + }, + { + "epoch": 4.129011397623474, + "grad_norm": 1.0440239906311035, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 25540 + }, + { + "epoch": 4.130628081804219, + "grad_norm": 0.984145998954773, + "learning_rate": 0.0002, + "loss": 0.5723, + "step": 25550 + }, + { + "epoch": 4.132244765984965, + "grad_norm": 0.8667055368423462, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 25560 + }, + { + "epoch": 4.1338614501657105, + "grad_norm": 1.1300835609436035, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 25570 + }, + { + "epoch": 4.135478134346456, + "grad_norm": 0.9314348101615906, + "learning_rate": 0.0002, + "loss": 0.524, + "step": 25580 + }, + { + "epoch": 4.137094818527201, + "grad_norm": 0.7731879949569702, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 25590 + }, + { + "epoch": 4.138711502707946, + "grad_norm": 1.0080097913742065, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 25600 + }, + { + "epoch": 4.140328186888691, + "grad_norm": 1.2475038766860962, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 25610 + }, + { + "epoch": 4.141944871069437, + "grad_norm": 0.9912930727005005, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 25620 + }, + { + "epoch": 4.143561555250182, + "grad_norm": 0.9088651537895203, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 25630 + }, + { + "epoch": 4.145178239430927, + "grad_norm": 0.8940697312355042, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 25640 + }, + { + "epoch": 4.146794923611672, + "grad_norm": 1.0798203945159912, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 25650 + }, + { + "epoch": 4.148411607792418, + "grad_norm": 0.955172061920166, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 25660 + }, + { + "epoch": 4.150028291973163, + "grad_norm": 0.9692716002464294, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 25670 + }, + { + "epoch": 4.151644976153908, + "grad_norm": 1.0813939571380615, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 25680 + }, + { + "epoch": 4.153261660334653, + "grad_norm": 1.135675072669983, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 25690 + }, + { + "epoch": 4.1548783445153985, + "grad_norm": 1.0392236709594727, + "learning_rate": 0.0002, + "loss": 0.5247, + "step": 25700 + }, + { + "epoch": 4.156495028696145, + "grad_norm": 0.9473116993904114, + "learning_rate": 0.0002, + "loss": 0.5204, + "step": 25710 + }, + { + "epoch": 4.15811171287689, + "grad_norm": 0.712493896484375, + "learning_rate": 0.0002, + "loss": 0.5339, + "step": 25720 + }, + { + "epoch": 4.159728397057635, + "grad_norm": 0.8724465370178223, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 25730 + }, + { + "epoch": 4.16134508123838, + "grad_norm": 0.9870015978813171, + "learning_rate": 0.0002, + "loss": 0.5325, + "step": 25740 + }, + { + "epoch": 4.1629617654191255, + "grad_norm": 1.025273084640503, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 25750 + }, + { + "epoch": 4.164578449599871, + "grad_norm": 0.9243090152740479, + "learning_rate": 0.0002, + "loss": 0.5223, + "step": 25760 + }, + { + "epoch": 4.166195133780616, + "grad_norm": 1.1656451225280762, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 25770 + }, + { + "epoch": 4.167811817961361, + "grad_norm": 0.936358630657196, + "learning_rate": 0.0002, + "loss": 0.5334, + "step": 25780 + }, + { + "epoch": 4.1694285021421065, + "grad_norm": 0.8618208169937134, + "learning_rate": 0.0002, + "loss": 0.5236, + "step": 25790 + }, + { + "epoch": 4.171045186322852, + "grad_norm": 0.8580600023269653, + "learning_rate": 0.0002, + "loss": 0.5186, + "step": 25800 + }, + { + "epoch": 4.172661870503597, + "grad_norm": 1.0128562450408936, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 25810 + }, + { + "epoch": 4.174278554684342, + "grad_norm": 0.854865312576294, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 25820 + }, + { + "epoch": 4.175895238865087, + "grad_norm": 1.235082745552063, + "learning_rate": 0.0002, + "loss": 0.5377, + "step": 25830 + }, + { + "epoch": 4.177511923045833, + "grad_norm": 0.9796220660209656, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 25840 + }, + { + "epoch": 4.179128607226578, + "grad_norm": 0.8922094702720642, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 25850 + }, + { + "epoch": 4.180745291407324, + "grad_norm": 0.9672530293464661, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 25860 + }, + { + "epoch": 4.182361975588069, + "grad_norm": 0.8662548661231995, + "learning_rate": 0.0002, + "loss": 0.5074, + "step": 25870 + }, + { + "epoch": 4.1839786597688144, + "grad_norm": 0.7938798069953918, + "learning_rate": 0.0002, + "loss": 0.5329, + "step": 25880 + }, + { + "epoch": 4.18559534394956, + "grad_norm": 1.0517958402633667, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 25890 + }, + { + "epoch": 4.187212028130305, + "grad_norm": 0.8939275145530701, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 25900 + }, + { + "epoch": 4.18882871231105, + "grad_norm": 1.0296672582626343, + "learning_rate": 0.0002, + "loss": 0.5199, + "step": 25910 + }, + { + "epoch": 4.190445396491795, + "grad_norm": 0.8104017972946167, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 25920 + }, + { + "epoch": 4.192062080672541, + "grad_norm": 0.9984509944915771, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 25930 + }, + { + "epoch": 4.193678764853286, + "grad_norm": 0.9844784736633301, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 25940 + }, + { + "epoch": 4.195295449034031, + "grad_norm": 0.8168622255325317, + "learning_rate": 0.0002, + "loss": 0.5198, + "step": 25950 + }, + { + "epoch": 4.196912133214776, + "grad_norm": 1.0878913402557373, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 25960 + }, + { + "epoch": 4.1985288173955215, + "grad_norm": 0.927126407623291, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 25970 + }, + { + "epoch": 4.200145501576267, + "grad_norm": 0.838586688041687, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 25980 + }, + { + "epoch": 4.201762185757012, + "grad_norm": 1.2572145462036133, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 25990 + }, + { + "epoch": 4.203378869937758, + "grad_norm": 1.0476740598678589, + "learning_rate": 0.0002, + "loss": 0.5048, + "step": 26000 + }, + { + "epoch": 4.204995554118503, + "grad_norm": 1.0873368978500366, + "learning_rate": 0.0002, + "loss": 0.5127, + "step": 26010 + }, + { + "epoch": 4.206612238299249, + "grad_norm": 1.2664896249771118, + "learning_rate": 0.0002, + "loss": 0.5679, + "step": 26020 + }, + { + "epoch": 4.208228922479994, + "grad_norm": 1.0312391519546509, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 26030 + }, + { + "epoch": 4.209845606660739, + "grad_norm": 1.0235042572021484, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 26040 + }, + { + "epoch": 4.211462290841484, + "grad_norm": 0.8882219195365906, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 26050 + }, + { + "epoch": 4.2130789750222295, + "grad_norm": 0.9115961790084839, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 26060 + }, + { + "epoch": 4.214695659202975, + "grad_norm": 1.0218228101730347, + "learning_rate": 0.0002, + "loss": 0.5455, + "step": 26070 + }, + { + "epoch": 4.21631234338372, + "grad_norm": 1.0802232027053833, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 26080 + }, + { + "epoch": 4.217929027564465, + "grad_norm": 1.1488053798675537, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 26090 + }, + { + "epoch": 4.21954571174521, + "grad_norm": 1.0487725734710693, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 26100 + }, + { + "epoch": 4.221162395925956, + "grad_norm": 0.9131165742874146, + "learning_rate": 0.0002, + "loss": 0.5568, + "step": 26110 + }, + { + "epoch": 4.222779080106701, + "grad_norm": 0.9012845158576965, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 26120 + }, + { + "epoch": 4.224395764287446, + "grad_norm": 0.8389840126037598, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 26130 + }, + { + "epoch": 4.226012448468191, + "grad_norm": 0.8924660682678223, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 26140 + }, + { + "epoch": 4.2276291326489375, + "grad_norm": 0.8556463718414307, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 26150 + }, + { + "epoch": 4.229245816829683, + "grad_norm": 0.9643129110336304, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 26160 + }, + { + "epoch": 4.230862501010428, + "grad_norm": 0.9865712523460388, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 26170 + }, + { + "epoch": 4.232479185191173, + "grad_norm": 1.152641773223877, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 26180 + }, + { + "epoch": 4.234095869371918, + "grad_norm": 0.9157698154449463, + "learning_rate": 0.0002, + "loss": 0.5632, + "step": 26190 + }, + { + "epoch": 4.235712553552664, + "grad_norm": 0.8418048620223999, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 26200 + }, + { + "epoch": 4.237329237733409, + "grad_norm": 0.9430168867111206, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 26210 + }, + { + "epoch": 4.238945921914154, + "grad_norm": 1.012582778930664, + "learning_rate": 0.0002, + "loss": 0.5574, + "step": 26220 + }, + { + "epoch": 4.240562606094899, + "grad_norm": 1.112619400024414, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 26230 + }, + { + "epoch": 4.2421792902756446, + "grad_norm": 0.9243621826171875, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 26240 + }, + { + "epoch": 4.24379597445639, + "grad_norm": 0.6977595686912537, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 26250 + }, + { + "epoch": 4.245412658637135, + "grad_norm": 0.9600721597671509, + "learning_rate": 0.0002, + "loss": 0.5379, + "step": 26260 + }, + { + "epoch": 4.24702934281788, + "grad_norm": 0.882641613483429, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 26270 + }, + { + "epoch": 4.2486460269986255, + "grad_norm": 1.010920763015747, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 26280 + }, + { + "epoch": 4.250262711179371, + "grad_norm": 0.9289400577545166, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 26290 + }, + { + "epoch": 4.251879395360117, + "grad_norm": 1.137397289276123, + "learning_rate": 0.0002, + "loss": 0.541, + "step": 26300 + }, + { + "epoch": 4.253496079540862, + "grad_norm": 1.0136182308197021, + "learning_rate": 0.0002, + "loss": 0.5204, + "step": 26310 + }, + { + "epoch": 4.255112763721607, + "grad_norm": 0.9387356042861938, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 26320 + }, + { + "epoch": 4.2567294479023525, + "grad_norm": 1.1833957433700562, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 26330 + }, + { + "epoch": 4.258346132083098, + "grad_norm": 0.9415934681892395, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 26340 + }, + { + "epoch": 4.259962816263843, + "grad_norm": 0.8550165891647339, + "learning_rate": 0.0002, + "loss": 0.5539, + "step": 26350 + }, + { + "epoch": 4.261579500444588, + "grad_norm": 9.924622535705566, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 26360 + }, + { + "epoch": 4.2631961846253335, + "grad_norm": 1.0104902982711792, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 26370 + }, + { + "epoch": 4.264812868806079, + "grad_norm": 0.890794038772583, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 26380 + }, + { + "epoch": 4.266429552986824, + "grad_norm": 1.0560191869735718, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 26390 + }, + { + "epoch": 4.268046237167569, + "grad_norm": 1.0135581493377686, + "learning_rate": 0.0002, + "loss": 0.5119, + "step": 26400 + }, + { + "epoch": 4.269662921348314, + "grad_norm": 1.1304140090942383, + "learning_rate": 0.0002, + "loss": 0.5359, + "step": 26410 + }, + { + "epoch": 4.27127960552906, + "grad_norm": 0.9899303913116455, + "learning_rate": 0.0002, + "loss": 0.5615, + "step": 26420 + }, + { + "epoch": 4.272896289709805, + "grad_norm": 1.0505329370498657, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 26430 + }, + { + "epoch": 4.27451297389055, + "grad_norm": 0.9389396905899048, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 26440 + }, + { + "epoch": 4.276129658071296, + "grad_norm": 0.875328779220581, + "learning_rate": 0.0002, + "loss": 0.5558, + "step": 26450 + }, + { + "epoch": 4.277746342252041, + "grad_norm": 1.0689256191253662, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 26460 + }, + { + "epoch": 4.279363026432787, + "grad_norm": 0.9988957643508911, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 26470 + }, + { + "epoch": 4.280979710613532, + "grad_norm": 0.8721813559532166, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 26480 + }, + { + "epoch": 4.282596394794277, + "grad_norm": 1.100109577178955, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 26490 + }, + { + "epoch": 4.284213078975022, + "grad_norm": 1.1607271432876587, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 26500 + }, + { + "epoch": 4.285829763155768, + "grad_norm": 0.879088819026947, + "learning_rate": 0.0002, + "loss": 0.6287, + "step": 26510 + }, + { + "epoch": 4.287446447336513, + "grad_norm": 0.9891700744628906, + "learning_rate": 0.0002, + "loss": 0.573, + "step": 26520 + }, + { + "epoch": 4.289063131517258, + "grad_norm": 1.0831127166748047, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 26530 + }, + { + "epoch": 4.290679815698003, + "grad_norm": 1.4108285903930664, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 26540 + }, + { + "epoch": 4.2922964998787485, + "grad_norm": 1.0630289316177368, + "learning_rate": 0.0002, + "loss": 0.5888, + "step": 26550 + }, + { + "epoch": 4.293913184059494, + "grad_norm": 1.0854572057724, + "learning_rate": 0.0002, + "loss": 0.5817, + "step": 26560 + }, + { + "epoch": 4.295529868240239, + "grad_norm": 0.9561646580696106, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 26570 + }, + { + "epoch": 4.297146552420984, + "grad_norm": 0.9064981937408447, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 26580 + }, + { + "epoch": 4.298763236601729, + "grad_norm": 1.0082972049713135, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 26590 + }, + { + "epoch": 4.3003799207824756, + "grad_norm": 1.1613214015960693, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 26600 + }, + { + "epoch": 4.301996604963221, + "grad_norm": 0.9847695231437683, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 26610 + }, + { + "epoch": 4.303613289143966, + "grad_norm": 1.0980697870254517, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 26620 + }, + { + "epoch": 4.305229973324711, + "grad_norm": 0.8861175179481506, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 26630 + }, + { + "epoch": 4.3068466575054565, + "grad_norm": 0.8917363286018372, + "learning_rate": 0.0002, + "loss": 0.5716, + "step": 26640 + }, + { + "epoch": 4.308463341686202, + "grad_norm": 1.0458378791809082, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 26650 + }, + { + "epoch": 4.310080025866947, + "grad_norm": 1.4859240055084229, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 26660 + }, + { + "epoch": 4.311696710047692, + "grad_norm": 1.1376359462738037, + "learning_rate": 0.0002, + "loss": 0.5296, + "step": 26670 + }, + { + "epoch": 4.313313394228437, + "grad_norm": 0.991349995136261, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 26680 + }, + { + "epoch": 4.314930078409183, + "grad_norm": 0.9995543956756592, + "learning_rate": 0.0002, + "loss": 0.5338, + "step": 26690 + }, + { + "epoch": 4.316546762589928, + "grad_norm": 1.0515851974487305, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 26700 + }, + { + "epoch": 4.318163446770673, + "grad_norm": 1.008023977279663, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 26710 + }, + { + "epoch": 4.319780130951418, + "grad_norm": 1.0184582471847534, + "learning_rate": 0.0002, + "loss": 0.5506, + "step": 26720 + }, + { + "epoch": 4.321396815132164, + "grad_norm": 1.161071538925171, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 26730 + }, + { + "epoch": 4.323013499312909, + "grad_norm": 0.9580779671669006, + "learning_rate": 0.0002, + "loss": 0.5633, + "step": 26740 + }, + { + "epoch": 4.324630183493655, + "grad_norm": 1.0189911127090454, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 26750 + }, + { + "epoch": 4.3262468676744, + "grad_norm": 0.7484358549118042, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 26760 + }, + { + "epoch": 4.327863551855145, + "grad_norm": 1.0015908479690552, + "learning_rate": 0.0002, + "loss": 0.5728, + "step": 26770 + }, + { + "epoch": 4.329480236035891, + "grad_norm": 0.8972945809364319, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 26780 + }, + { + "epoch": 4.331096920216636, + "grad_norm": 1.01099693775177, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 26790 + }, + { + "epoch": 4.332713604397381, + "grad_norm": 0.846958339214325, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 26800 + }, + { + "epoch": 4.334330288578126, + "grad_norm": 1.0792603492736816, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 26810 + }, + { + "epoch": 4.3359469727588715, + "grad_norm": 1.0373345613479614, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 26820 + }, + { + "epoch": 4.337563656939617, + "grad_norm": 0.9779167771339417, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 26830 + }, + { + "epoch": 4.339180341120362, + "grad_norm": 1.0235520601272583, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 26840 + }, + { + "epoch": 4.340797025301107, + "grad_norm": 1.04195237159729, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 26850 + }, + { + "epoch": 4.3424137094818525, + "grad_norm": 0.9479565620422363, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 26860 + }, + { + "epoch": 4.344030393662598, + "grad_norm": 0.9526172280311584, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 26870 + }, + { + "epoch": 4.345647077843343, + "grad_norm": 0.8571456074714661, + "learning_rate": 0.0002, + "loss": 0.521, + "step": 26880 + }, + { + "epoch": 4.347263762024088, + "grad_norm": 0.9475828409194946, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 26890 + }, + { + "epoch": 4.348880446204834, + "grad_norm": 1.0529576539993286, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 26900 + }, + { + "epoch": 4.3504971303855795, + "grad_norm": 0.9648140072822571, + "learning_rate": 0.0002, + "loss": 0.56, + "step": 26910 + }, + { + "epoch": 4.352113814566325, + "grad_norm": 1.0488841533660889, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 26920 + }, + { + "epoch": 4.35373049874707, + "grad_norm": 0.8771942257881165, + "learning_rate": 0.0002, + "loss": 0.5842, + "step": 26930 + }, + { + "epoch": 4.355347182927815, + "grad_norm": 0.9411202073097229, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 26940 + }, + { + "epoch": 4.35696386710856, + "grad_norm": 1.0997588634490967, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 26950 + }, + { + "epoch": 4.358580551289306, + "grad_norm": 0.968754768371582, + "learning_rate": 0.0002, + "loss": 0.5528, + "step": 26960 + }, + { + "epoch": 4.360197235470051, + "grad_norm": 0.9990773797035217, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 26970 + }, + { + "epoch": 4.361813919650796, + "grad_norm": 1.0210620164871216, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 26980 + }, + { + "epoch": 4.363430603831541, + "grad_norm": 0.855462908744812, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 26990 + }, + { + "epoch": 4.365047288012287, + "grad_norm": 0.9169660806655884, + "learning_rate": 0.0002, + "loss": 0.5493, + "step": 27000 + }, + { + "epoch": 4.366663972193032, + "grad_norm": 1.089629888534546, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 27010 + }, + { + "epoch": 4.368280656373777, + "grad_norm": 1.0932867527008057, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 27020 + }, + { + "epoch": 4.369897340554522, + "grad_norm": 0.9290956854820251, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 27030 + }, + { + "epoch": 4.3715140247352675, + "grad_norm": 1.2800624370574951, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 27040 + }, + { + "epoch": 4.373130708916014, + "grad_norm": 0.8993493318557739, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 27050 + }, + { + "epoch": 4.374747393096759, + "grad_norm": 1.1566431522369385, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 27060 + }, + { + "epoch": 4.376364077277504, + "grad_norm": 0.9479052424430847, + "learning_rate": 0.0002, + "loss": 0.5902, + "step": 27070 + }, + { + "epoch": 4.377980761458249, + "grad_norm": 1.0063648223876953, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 27080 + }, + { + "epoch": 4.379597445638995, + "grad_norm": 0.8342045545578003, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 27090 + }, + { + "epoch": 4.38121412981974, + "grad_norm": 1.1390739679336548, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 27100 + }, + { + "epoch": 4.382830814000485, + "grad_norm": 0.9547637104988098, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 27110 + }, + { + "epoch": 4.38444749818123, + "grad_norm": 1.0503804683685303, + "learning_rate": 0.0002, + "loss": 0.5728, + "step": 27120 + }, + { + "epoch": 4.3860641823619755, + "grad_norm": 0.9064017534255981, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 27130 + }, + { + "epoch": 4.387680866542721, + "grad_norm": 0.9382519125938416, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 27140 + }, + { + "epoch": 4.389297550723466, + "grad_norm": 1.0410341024398804, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 27150 + }, + { + "epoch": 4.390914234904211, + "grad_norm": 0.9218655824661255, + "learning_rate": 0.0002, + "loss": 0.6034, + "step": 27160 + }, + { + "epoch": 4.392530919084956, + "grad_norm": 0.8119737505912781, + "learning_rate": 0.0002, + "loss": 0.5204, + "step": 27170 + }, + { + "epoch": 4.394147603265702, + "grad_norm": 0.8584722876548767, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 27180 + }, + { + "epoch": 4.395764287446447, + "grad_norm": 0.9668293595314026, + "learning_rate": 0.0002, + "loss": 0.5772, + "step": 27190 + }, + { + "epoch": 4.397380971627193, + "grad_norm": 1.022334098815918, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 27200 + }, + { + "epoch": 4.398997655807938, + "grad_norm": 0.9553216099739075, + "learning_rate": 0.0002, + "loss": 0.5573, + "step": 27210 + }, + { + "epoch": 4.4006143399886835, + "grad_norm": 0.9282339215278625, + "learning_rate": 0.0002, + "loss": 0.5604, + "step": 27220 + }, + { + "epoch": 4.402231024169429, + "grad_norm": 1.0232292413711548, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 27230 + }, + { + "epoch": 4.403847708350174, + "grad_norm": 0.9915700554847717, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 27240 + }, + { + "epoch": 4.405464392530919, + "grad_norm": 1.0014961957931519, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 27250 + }, + { + "epoch": 4.407081076711664, + "grad_norm": 1.1172103881835938, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 27260 + }, + { + "epoch": 4.40869776089241, + "grad_norm": 0.8583093285560608, + "learning_rate": 0.0002, + "loss": 0.5286, + "step": 27270 + }, + { + "epoch": 4.410314445073155, + "grad_norm": 0.7609201669692993, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 27280 + }, + { + "epoch": 4.4119311292539, + "grad_norm": 1.0619351863861084, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 27290 + }, + { + "epoch": 4.413547813434645, + "grad_norm": 1.0177674293518066, + "learning_rate": 0.0002, + "loss": 0.5579, + "step": 27300 + }, + { + "epoch": 4.4151644976153905, + "grad_norm": 0.9921218156814575, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 27310 + }, + { + "epoch": 4.416781181796136, + "grad_norm": 1.126244306564331, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 27320 + }, + { + "epoch": 4.418397865976881, + "grad_norm": 1.0678540468215942, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 27330 + }, + { + "epoch": 4.420014550157627, + "grad_norm": 0.8705704212188721, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 27340 + }, + { + "epoch": 4.421631234338372, + "grad_norm": 1.272074818611145, + "learning_rate": 0.0002, + "loss": 0.5763, + "step": 27350 + }, + { + "epoch": 4.423247918519118, + "grad_norm": 0.8740444183349609, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 27360 + }, + { + "epoch": 4.424864602699863, + "grad_norm": 1.0584250688552856, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 27370 + }, + { + "epoch": 4.426481286880608, + "grad_norm": 1.059870719909668, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 27380 + }, + { + "epoch": 4.428097971061353, + "grad_norm": 1.072265863418579, + "learning_rate": 0.0002, + "loss": 0.5551, + "step": 27390 + }, + { + "epoch": 4.4297146552420985, + "grad_norm": 0.871481716632843, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 27400 + }, + { + "epoch": 4.431331339422844, + "grad_norm": 0.9555448293685913, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 27410 + }, + { + "epoch": 4.432948023603589, + "grad_norm": 1.0402292013168335, + "learning_rate": 0.0002, + "loss": 0.5593, + "step": 27420 + }, + { + "epoch": 4.434564707784334, + "grad_norm": 1.12587571144104, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 27430 + }, + { + "epoch": 4.436181391965079, + "grad_norm": 1.0783193111419678, + "learning_rate": 0.0002, + "loss": 0.5403, + "step": 27440 + }, + { + "epoch": 4.437798076145825, + "grad_norm": 1.024133563041687, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 27450 + }, + { + "epoch": 4.43941476032657, + "grad_norm": 0.9156768918037415, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 27460 + }, + { + "epoch": 4.441031444507315, + "grad_norm": 1.0215224027633667, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 27470 + }, + { + "epoch": 4.442648128688061, + "grad_norm": 1.082116961479187, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 27480 + }, + { + "epoch": 4.4442648128688065, + "grad_norm": 1.0412873029708862, + "learning_rate": 0.0002, + "loss": 0.6203, + "step": 27490 + }, + { + "epoch": 4.445881497049552, + "grad_norm": 1.0509289503097534, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 27500 + }, + { + "epoch": 4.447498181230297, + "grad_norm": 0.9291498064994812, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 27510 + }, + { + "epoch": 4.449114865411042, + "grad_norm": 0.970184326171875, + "learning_rate": 0.0002, + "loss": 0.5408, + "step": 27520 + }, + { + "epoch": 4.450731549591787, + "grad_norm": 0.8418883681297302, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 27530 + }, + { + "epoch": 4.452348233772533, + "grad_norm": 0.8823825120925903, + "learning_rate": 0.0002, + "loss": 0.5124, + "step": 27540 + }, + { + "epoch": 4.453964917953278, + "grad_norm": 1.1909019947052002, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 27550 + }, + { + "epoch": 4.455581602134023, + "grad_norm": 1.0317302942276, + "learning_rate": 0.0002, + "loss": 0.5685, + "step": 27560 + }, + { + "epoch": 4.457198286314768, + "grad_norm": 0.9977751970291138, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 27570 + }, + { + "epoch": 4.458814970495514, + "grad_norm": 0.8909519910812378, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 27580 + }, + { + "epoch": 4.460431654676259, + "grad_norm": 0.8653029799461365, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 27590 + }, + { + "epoch": 4.462048338857004, + "grad_norm": 1.0783653259277344, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 27600 + }, + { + "epoch": 4.463665023037749, + "grad_norm": 1.1235394477844238, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 27610 + }, + { + "epoch": 4.4652817072184945, + "grad_norm": 0.9386643767356873, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 27620 + }, + { + "epoch": 4.466898391399241, + "grad_norm": 1.0605148077011108, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 27630 + }, + { + "epoch": 4.468515075579986, + "grad_norm": 1.1283893585205078, + "learning_rate": 0.0002, + "loss": 0.5886, + "step": 27640 + }, + { + "epoch": 4.470131759760731, + "grad_norm": 1.0583468675613403, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 27650 + }, + { + "epoch": 4.471748443941476, + "grad_norm": 0.9563992023468018, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 27660 + }, + { + "epoch": 4.4733651281222215, + "grad_norm": 1.100598931312561, + "learning_rate": 0.0002, + "loss": 0.5687, + "step": 27670 + }, + { + "epoch": 4.474981812302967, + "grad_norm": 0.9386957287788391, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 27680 + }, + { + "epoch": 4.476598496483712, + "grad_norm": 1.2946288585662842, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 27690 + }, + { + "epoch": 4.478215180664457, + "grad_norm": 1.0325199365615845, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 27700 + }, + { + "epoch": 4.4798318648452025, + "grad_norm": 1.0318928956985474, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 27710 + }, + { + "epoch": 4.481448549025948, + "grad_norm": 0.8721024394035339, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 27720 + }, + { + "epoch": 4.483065233206693, + "grad_norm": 1.17376708984375, + "learning_rate": 0.0002, + "loss": 0.5421, + "step": 27730 + }, + { + "epoch": 4.484681917387438, + "grad_norm": 1.0926326513290405, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 27740 + }, + { + "epoch": 4.486298601568183, + "grad_norm": 0.9043852686882019, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 27750 + }, + { + "epoch": 4.487915285748929, + "grad_norm": 1.064600944519043, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 27760 + }, + { + "epoch": 4.489531969929674, + "grad_norm": 0.7833460569381714, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 27770 + }, + { + "epoch": 4.49114865411042, + "grad_norm": 1.1073496341705322, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 27780 + }, + { + "epoch": 4.492765338291165, + "grad_norm": 1.0799397230148315, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 27790 + }, + { + "epoch": 4.49438202247191, + "grad_norm": 1.1062238216400146, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 27800 + }, + { + "epoch": 4.495998706652656, + "grad_norm": 1.0568242073059082, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 27810 + }, + { + "epoch": 4.497615390833401, + "grad_norm": 0.8861091732978821, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 27820 + }, + { + "epoch": 4.499232075014146, + "grad_norm": 1.2297543287277222, + "learning_rate": 0.0002, + "loss": 0.5858, + "step": 27830 + }, + { + "epoch": 4.500848759194891, + "grad_norm": 0.9600302577018738, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 27840 + }, + { + "epoch": 4.502465443375637, + "grad_norm": 1.057051181793213, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 27850 + }, + { + "epoch": 4.504082127556382, + "grad_norm": 0.9839690923690796, + "learning_rate": 0.0002, + "loss": 0.5788, + "step": 27860 + }, + { + "epoch": 4.505698811737127, + "grad_norm": 1.1479853391647339, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 27870 + }, + { + "epoch": 4.507315495917872, + "grad_norm": 1.0550768375396729, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 27880 + }, + { + "epoch": 4.5089321800986175, + "grad_norm": 0.898209273815155, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 27890 + }, + { + "epoch": 4.510548864279363, + "grad_norm": 0.9460315108299255, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 27900 + }, + { + "epoch": 4.512165548460108, + "grad_norm": 0.9499884247779846, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 27910 + }, + { + "epoch": 4.513782232640853, + "grad_norm": 0.7801318764686584, + "learning_rate": 0.0002, + "loss": 0.5385, + "step": 27920 + }, + { + "epoch": 4.515398916821599, + "grad_norm": 0.9286966323852539, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 27930 + }, + { + "epoch": 4.517015601002345, + "grad_norm": 0.9539980292320251, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 27940 + }, + { + "epoch": 4.51863228518309, + "grad_norm": 1.1053401231765747, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 27950 + }, + { + "epoch": 4.520248969363835, + "grad_norm": 0.7535534501075745, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 27960 + }, + { + "epoch": 4.52186565354458, + "grad_norm": 1.076926589012146, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 27970 + }, + { + "epoch": 4.5234823377253255, + "grad_norm": 1.181935429573059, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 27980 + }, + { + "epoch": 4.525099021906071, + "grad_norm": 0.9293407201766968, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 27990 + }, + { + "epoch": 4.526715706086816, + "grad_norm": 0.8953009247779846, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 28000 + }, + { + "epoch": 4.528332390267561, + "grad_norm": 1.0850225687026978, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 28010 + }, + { + "epoch": 4.529949074448306, + "grad_norm": 0.9125663042068481, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 28020 + }, + { + "epoch": 4.531565758629052, + "grad_norm": 0.8745216727256775, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 28030 + }, + { + "epoch": 4.533182442809797, + "grad_norm": 1.0783463716506958, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 28040 + }, + { + "epoch": 4.534799126990542, + "grad_norm": 0.7513844966888428, + "learning_rate": 0.0002, + "loss": 0.5754, + "step": 28050 + }, + { + "epoch": 4.536415811171287, + "grad_norm": 1.0135776996612549, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 28060 + }, + { + "epoch": 4.538032495352033, + "grad_norm": 0.8886825442314148, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 28070 + }, + { + "epoch": 4.539649179532779, + "grad_norm": 0.8153995275497437, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 28080 + }, + { + "epoch": 4.541265863713524, + "grad_norm": 0.9853341579437256, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 28090 + }, + { + "epoch": 4.542882547894269, + "grad_norm": 0.9365800023078918, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 28100 + }, + { + "epoch": 4.544499232075014, + "grad_norm": 0.9765017628669739, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 28110 + }, + { + "epoch": 4.54611591625576, + "grad_norm": 0.9811279773712158, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 28120 + }, + { + "epoch": 4.547732600436505, + "grad_norm": 1.0387924909591675, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 28130 + }, + { + "epoch": 4.54934928461725, + "grad_norm": 1.0684878826141357, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 28140 + }, + { + "epoch": 4.550965968797995, + "grad_norm": 1.0000102519989014, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 28150 + }, + { + "epoch": 4.5525826529787405, + "grad_norm": 1.0717930793762207, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 28160 + }, + { + "epoch": 4.554199337159486, + "grad_norm": 0.990074634552002, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 28170 + }, + { + "epoch": 4.555816021340231, + "grad_norm": 0.8673754930496216, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 28180 + }, + { + "epoch": 4.557432705520976, + "grad_norm": 0.864247739315033, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 28190 + }, + { + "epoch": 4.5590493897017215, + "grad_norm": 0.8280200958251953, + "learning_rate": 0.0002, + "loss": 0.5516, + "step": 28200 + }, + { + "epoch": 4.560666073882467, + "grad_norm": 1.1312172412872314, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 28210 + }, + { + "epoch": 4.562282758063212, + "grad_norm": 0.9147403240203857, + "learning_rate": 0.0002, + "loss": 0.5776, + "step": 28220 + }, + { + "epoch": 4.563899442243958, + "grad_norm": 1.0321218967437744, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 28230 + }, + { + "epoch": 4.565516126424703, + "grad_norm": 1.168332815170288, + "learning_rate": 0.0002, + "loss": 0.5508, + "step": 28240 + }, + { + "epoch": 4.5671328106054485, + "grad_norm": 1.0067222118377686, + "learning_rate": 0.0002, + "loss": 0.5649, + "step": 28250 + }, + { + "epoch": 4.568749494786194, + "grad_norm": 1.0283393859863281, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 28260 + }, + { + "epoch": 4.570366178966939, + "grad_norm": 0.9912363886833191, + "learning_rate": 0.0002, + "loss": 0.5772, + "step": 28270 + }, + { + "epoch": 4.571982863147684, + "grad_norm": 1.108032464981079, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 28280 + }, + { + "epoch": 4.573599547328429, + "grad_norm": 0.8260078430175781, + "learning_rate": 0.0002, + "loss": 0.5529, + "step": 28290 + }, + { + "epoch": 4.575216231509175, + "grad_norm": 0.8946247100830078, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 28300 + }, + { + "epoch": 4.57683291568992, + "grad_norm": 0.8273587822914124, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 28310 + }, + { + "epoch": 4.578449599870665, + "grad_norm": 0.9040093421936035, + "learning_rate": 0.0002, + "loss": 0.6058, + "step": 28320 + }, + { + "epoch": 4.58006628405141, + "grad_norm": 0.8435290455818176, + "learning_rate": 0.0002, + "loss": 0.5521, + "step": 28330 + }, + { + "epoch": 4.581682968232156, + "grad_norm": 1.164088249206543, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 28340 + }, + { + "epoch": 4.583299652412901, + "grad_norm": 0.9861085414886475, + "learning_rate": 0.0002, + "loss": 0.5603, + "step": 28350 + }, + { + "epoch": 4.584916336593646, + "grad_norm": 0.8892980813980103, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 28360 + }, + { + "epoch": 4.586533020774391, + "grad_norm": 1.240574836730957, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 28370 + }, + { + "epoch": 4.588149704955137, + "grad_norm": 0.8669408559799194, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 28380 + }, + { + "epoch": 4.589766389135883, + "grad_norm": 0.9145985841751099, + "learning_rate": 0.0002, + "loss": 0.5603, + "step": 28390 + }, + { + "epoch": 4.591383073316628, + "grad_norm": 0.8584614992141724, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 28400 + }, + { + "epoch": 4.592999757497373, + "grad_norm": 1.118829369544983, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 28410 + }, + { + "epoch": 4.594616441678118, + "grad_norm": 1.1411553621292114, + "learning_rate": 0.0002, + "loss": 0.5641, + "step": 28420 + }, + { + "epoch": 4.596233125858864, + "grad_norm": 0.9433278441429138, + "learning_rate": 0.0002, + "loss": 0.549, + "step": 28430 + }, + { + "epoch": 4.597849810039609, + "grad_norm": 0.816830039024353, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 28440 + }, + { + "epoch": 4.599466494220354, + "grad_norm": 1.2124968767166138, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 28450 + }, + { + "epoch": 4.601083178401099, + "grad_norm": 0.9658762216567993, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 28460 + }, + { + "epoch": 4.6026998625818445, + "grad_norm": 0.836100161075592, + "learning_rate": 0.0002, + "loss": 0.5902, + "step": 28470 + }, + { + "epoch": 4.60431654676259, + "grad_norm": 0.9989104270935059, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 28480 + }, + { + "epoch": 4.605933230943335, + "grad_norm": 1.1298956871032715, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 28490 + }, + { + "epoch": 4.60754991512408, + "grad_norm": 1.1731704473495483, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 28500 + }, + { + "epoch": 4.609166599304825, + "grad_norm": 0.9624714255332947, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 28510 + }, + { + "epoch": 4.610783283485571, + "grad_norm": 1.364073634147644, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 28520 + }, + { + "epoch": 4.612399967666317, + "grad_norm": 1.1827356815338135, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 28530 + }, + { + "epoch": 4.614016651847062, + "grad_norm": 0.6651531457901001, + "learning_rate": 0.0002, + "loss": 0.5643, + "step": 28540 + }, + { + "epoch": 4.615633336027807, + "grad_norm": 1.1640995740890503, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 28550 + }, + { + "epoch": 4.6172500202085525, + "grad_norm": 1.028918743133545, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 28560 + }, + { + "epoch": 4.618866704389298, + "grad_norm": 0.8252120614051819, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 28570 + }, + { + "epoch": 4.620483388570043, + "grad_norm": 1.3536735773086548, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 28580 + }, + { + "epoch": 4.622100072750788, + "grad_norm": 1.2146915197372437, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 28590 + }, + { + "epoch": 4.623716756931533, + "grad_norm": 1.0122549533843994, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 28600 + }, + { + "epoch": 4.625333441112279, + "grad_norm": 0.9977872967720032, + "learning_rate": 0.0002, + "loss": 0.6072, + "step": 28610 + }, + { + "epoch": 4.626950125293024, + "grad_norm": 1.0159751176834106, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 28620 + }, + { + "epoch": 4.628566809473769, + "grad_norm": 1.0028325319290161, + "learning_rate": 0.0002, + "loss": 0.5935, + "step": 28630 + }, + { + "epoch": 4.630183493654514, + "grad_norm": 0.901638388633728, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 28640 + }, + { + "epoch": 4.6318001778352595, + "grad_norm": 0.9450507164001465, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 28650 + }, + { + "epoch": 4.633416862016006, + "grad_norm": 0.9987545013427734, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 28660 + }, + { + "epoch": 4.63503354619675, + "grad_norm": 0.9574332237243652, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 28670 + }, + { + "epoch": 4.636650230377496, + "grad_norm": 1.2215653657913208, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 28680 + }, + { + "epoch": 4.638266914558241, + "grad_norm": 0.9798858761787415, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 28690 + }, + { + "epoch": 4.639883598738987, + "grad_norm": 1.0648466348648071, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 28700 + }, + { + "epoch": 4.641500282919732, + "grad_norm": 1.0606504678726196, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 28710 + }, + { + "epoch": 4.643116967100477, + "grad_norm": 1.0892442464828491, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 28720 + }, + { + "epoch": 4.644733651281222, + "grad_norm": 0.914391040802002, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 28730 + }, + { + "epoch": 4.6463503354619675, + "grad_norm": 0.9782370328903198, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 28740 + }, + { + "epoch": 4.647967019642713, + "grad_norm": 1.0344339609146118, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 28750 + }, + { + "epoch": 4.649583703823458, + "grad_norm": 1.0513931512832642, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 28760 + }, + { + "epoch": 4.651200388004203, + "grad_norm": 0.9711475968360901, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 28770 + }, + { + "epoch": 4.652817072184948, + "grad_norm": 0.977519690990448, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 28780 + }, + { + "epoch": 4.654433756365694, + "grad_norm": 0.9150224924087524, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 28790 + }, + { + "epoch": 4.656050440546439, + "grad_norm": 1.0973542928695679, + "learning_rate": 0.0002, + "loss": 0.5382, + "step": 28800 + }, + { + "epoch": 4.657667124727185, + "grad_norm": 0.944877564907074, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 28810 + }, + { + "epoch": 4.659283808907929, + "grad_norm": 0.9508748650550842, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 28820 + }, + { + "epoch": 4.6609004930886755, + "grad_norm": 0.9681721329689026, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 28830 + }, + { + "epoch": 4.662517177269421, + "grad_norm": 1.0214351415634155, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 28840 + }, + { + "epoch": 4.664133861450166, + "grad_norm": 0.9748611450195312, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 28850 + }, + { + "epoch": 4.665750545630911, + "grad_norm": 0.8484147191047668, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 28860 + }, + { + "epoch": 4.667367229811656, + "grad_norm": 1.1252986192703247, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 28870 + }, + { + "epoch": 4.668983913992402, + "grad_norm": 0.8706206679344177, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 28880 + }, + { + "epoch": 4.670600598173147, + "grad_norm": 1.1432424783706665, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 28890 + }, + { + "epoch": 4.672217282353892, + "grad_norm": 1.017029047012329, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 28900 + }, + { + "epoch": 4.673833966534637, + "grad_norm": 1.085597038269043, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 28910 + }, + { + "epoch": 4.675450650715383, + "grad_norm": 0.9275796413421631, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 28920 + }, + { + "epoch": 4.677067334896128, + "grad_norm": 0.9518964886665344, + "learning_rate": 0.0002, + "loss": 0.5603, + "step": 28930 + }, + { + "epoch": 4.678684019076873, + "grad_norm": 1.0352122783660889, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 28940 + }, + { + "epoch": 4.680300703257618, + "grad_norm": 1.090124249458313, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 28950 + }, + { + "epoch": 4.681917387438364, + "grad_norm": 0.8799563050270081, + "learning_rate": 0.0002, + "loss": 0.5728, + "step": 28960 + }, + { + "epoch": 4.683534071619109, + "grad_norm": 1.0929821729660034, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 28970 + }, + { + "epoch": 4.685150755799855, + "grad_norm": 0.903727650642395, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 28980 + }, + { + "epoch": 4.6867674399806, + "grad_norm": 0.9752424955368042, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 28990 + }, + { + "epoch": 4.688384124161345, + "grad_norm": 0.9351571202278137, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 29000 + }, + { + "epoch": 4.6900008083420905, + "grad_norm": 0.923877477645874, + "learning_rate": 0.0002, + "loss": 0.5811, + "step": 29010 + }, + { + "epoch": 4.691617492522836, + "grad_norm": 1.045389175415039, + "learning_rate": 0.0002, + "loss": 0.5682, + "step": 29020 + }, + { + "epoch": 4.693234176703581, + "grad_norm": 1.0200831890106201, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 29030 + }, + { + "epoch": 4.694850860884326, + "grad_norm": 1.1499706506729126, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 29040 + }, + { + "epoch": 4.6964675450650715, + "grad_norm": 0.860118567943573, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 29050 + }, + { + "epoch": 4.698084229245817, + "grad_norm": 0.9774864315986633, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 29060 + }, + { + "epoch": 4.699700913426562, + "grad_norm": 1.0323210954666138, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 29070 + }, + { + "epoch": 4.701317597607307, + "grad_norm": 0.8492481112480164, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 29080 + }, + { + "epoch": 4.702934281788052, + "grad_norm": 1.131951093673706, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 29090 + }, + { + "epoch": 4.704550965968798, + "grad_norm": 0.8763113021850586, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 29100 + }, + { + "epoch": 4.706167650149544, + "grad_norm": 1.045028805732727, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 29110 + }, + { + "epoch": 4.707784334330288, + "grad_norm": 0.9961401224136353, + "learning_rate": 0.0002, + "loss": 0.5548, + "step": 29120 + }, + { + "epoch": 4.709401018511034, + "grad_norm": 0.9282503724098206, + "learning_rate": 0.0002, + "loss": 0.559, + "step": 29130 + }, + { + "epoch": 4.711017702691779, + "grad_norm": 1.1418932676315308, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 29140 + }, + { + "epoch": 4.712634386872525, + "grad_norm": 0.9950099587440491, + "learning_rate": 0.0002, + "loss": 0.5394, + "step": 29150 + }, + { + "epoch": 4.71425107105327, + "grad_norm": 0.8304893374443054, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 29160 + }, + { + "epoch": 4.715867755234015, + "grad_norm": 1.115626335144043, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 29170 + }, + { + "epoch": 4.71748443941476, + "grad_norm": 1.079818606376648, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 29180 + }, + { + "epoch": 4.719101123595506, + "grad_norm": 1.1929082870483398, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 29190 + }, + { + "epoch": 4.720717807776251, + "grad_norm": 0.9621080756187439, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 29200 + }, + { + "epoch": 4.722334491956996, + "grad_norm": 0.8549222350120544, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 29210 + }, + { + "epoch": 4.723951176137741, + "grad_norm": 0.9341941475868225, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 29220 + }, + { + "epoch": 4.7255678603184865, + "grad_norm": 1.075406789779663, + "learning_rate": 0.0002, + "loss": 0.5436, + "step": 29230 + }, + { + "epoch": 4.727184544499232, + "grad_norm": 1.0859880447387695, + "learning_rate": 0.0002, + "loss": 0.576, + "step": 29240 + }, + { + "epoch": 4.728801228679977, + "grad_norm": 0.8475605249404907, + "learning_rate": 0.0002, + "loss": 0.5525, + "step": 29250 + }, + { + "epoch": 4.730417912860723, + "grad_norm": 0.9331845641136169, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 29260 + }, + { + "epoch": 4.7320345970414674, + "grad_norm": 0.9279314279556274, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 29270 + }, + { + "epoch": 4.733651281222214, + "grad_norm": 0.7803558707237244, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 29280 + }, + { + "epoch": 4.735267965402959, + "grad_norm": 1.0159329175949097, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 29290 + }, + { + "epoch": 4.736884649583704, + "grad_norm": 0.9448670744895935, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 29300 + }, + { + "epoch": 4.738501333764449, + "grad_norm": 1.0732197761535645, + "learning_rate": 0.0002, + "loss": 0.574, + "step": 29310 + }, + { + "epoch": 4.7401180179451945, + "grad_norm": 0.901830792427063, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 29320 + }, + { + "epoch": 4.74173470212594, + "grad_norm": 0.9141789674758911, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 29330 + }, + { + "epoch": 4.743351386306685, + "grad_norm": 0.9733418226242065, + "learning_rate": 0.0002, + "loss": 0.5481, + "step": 29340 + }, + { + "epoch": 4.74496807048743, + "grad_norm": 0.909810483455658, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 29350 + }, + { + "epoch": 4.746584754668175, + "grad_norm": 0.909541666507721, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 29360 + }, + { + "epoch": 4.748201438848921, + "grad_norm": 0.9383015632629395, + "learning_rate": 0.0002, + "loss": 0.5579, + "step": 29370 + }, + { + "epoch": 4.749818123029666, + "grad_norm": 0.9275668263435364, + "learning_rate": 0.0002, + "loss": 0.5529, + "step": 29380 + }, + { + "epoch": 4.751434807210411, + "grad_norm": 1.1146225929260254, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 29390 + }, + { + "epoch": 4.753051491391156, + "grad_norm": 1.0062453746795654, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 29400 + }, + { + "epoch": 4.7546681755719025, + "grad_norm": 0.9451895952224731, + "learning_rate": 0.0002, + "loss": 0.5872, + "step": 29410 + }, + { + "epoch": 4.756284859752648, + "grad_norm": 0.870457649230957, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 29420 + }, + { + "epoch": 4.757901543933393, + "grad_norm": 1.0411282777786255, + "learning_rate": 0.0002, + "loss": 0.57, + "step": 29430 + }, + { + "epoch": 4.759518228114138, + "grad_norm": 1.1648986339569092, + "learning_rate": 0.0002, + "loss": 0.5688, + "step": 29440 + }, + { + "epoch": 4.761134912294883, + "grad_norm": 0.8999572992324829, + "learning_rate": 0.0002, + "loss": 0.5432, + "step": 29450 + }, + { + "epoch": 4.762751596475629, + "grad_norm": 0.9863559007644653, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 29460 + }, + { + "epoch": 4.764368280656374, + "grad_norm": 0.9676542282104492, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 29470 + }, + { + "epoch": 4.765984964837119, + "grad_norm": 1.004775047302246, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 29480 + }, + { + "epoch": 4.767601649017864, + "grad_norm": 1.0937515497207642, + "learning_rate": 0.0002, + "loss": 0.6044, + "step": 29490 + }, + { + "epoch": 4.7692183331986095, + "grad_norm": 0.9551598429679871, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 29500 + }, + { + "epoch": 4.770835017379355, + "grad_norm": 1.0757228136062622, + "learning_rate": 0.0002, + "loss": 0.5609, + "step": 29510 + }, + { + "epoch": 4.7724517015601, + "grad_norm": 1.0588841438293457, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 29520 + }, + { + "epoch": 4.774068385740845, + "grad_norm": 1.0744032859802246, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 29530 + }, + { + "epoch": 4.7756850699215905, + "grad_norm": 1.0066277980804443, + "learning_rate": 0.0002, + "loss": 0.5681, + "step": 29540 + }, + { + "epoch": 4.777301754102336, + "grad_norm": 1.082319736480713, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 29550 + }, + { + "epoch": 4.778918438283082, + "grad_norm": 0.8252472877502441, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 29560 + }, + { + "epoch": 4.780535122463827, + "grad_norm": 0.9855340123176575, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 29570 + }, + { + "epoch": 4.782151806644572, + "grad_norm": 0.9991421699523926, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 29580 + }, + { + "epoch": 4.7837684908253175, + "grad_norm": 1.316841959953308, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 29590 + }, + { + "epoch": 4.785385175006063, + "grad_norm": 1.1513035297393799, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 29600 + }, + { + "epoch": 4.787001859186808, + "grad_norm": 0.9767683744430542, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 29610 + }, + { + "epoch": 4.788618543367553, + "grad_norm": 0.9786278605461121, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 29620 + }, + { + "epoch": 4.7902352275482984, + "grad_norm": 0.8004973530769348, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 29630 + }, + { + "epoch": 4.791851911729044, + "grad_norm": 1.0997767448425293, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 29640 + }, + { + "epoch": 4.793468595909789, + "grad_norm": 0.9752856492996216, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 29650 + }, + { + "epoch": 4.795085280090534, + "grad_norm": 1.0518392324447632, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 29660 + }, + { + "epoch": 4.796701964271279, + "grad_norm": 1.1050055027008057, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 29670 + }, + { + "epoch": 4.798318648452025, + "grad_norm": 0.9933857917785645, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 29680 + }, + { + "epoch": 4.79993533263277, + "grad_norm": 1.2804018259048462, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 29690 + }, + { + "epoch": 4.801552016813515, + "grad_norm": 1.0133371353149414, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 29700 + }, + { + "epoch": 4.803168700994261, + "grad_norm": 1.080350637435913, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 29710 + }, + { + "epoch": 4.804785385175006, + "grad_norm": 0.9986529350280762, + "learning_rate": 0.0002, + "loss": 0.5603, + "step": 29720 + }, + { + "epoch": 4.806402069355752, + "grad_norm": 0.975665807723999, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 29730 + }, + { + "epoch": 4.808018753536497, + "grad_norm": 0.8458138704299927, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 29740 + }, + { + "epoch": 4.809635437717242, + "grad_norm": 0.99330073595047, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 29750 + }, + { + "epoch": 4.811252121897987, + "grad_norm": 0.898274302482605, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 29760 + }, + { + "epoch": 4.812868806078733, + "grad_norm": 1.0504480600357056, + "learning_rate": 0.0002, + "loss": 0.5842, + "step": 29770 + }, + { + "epoch": 4.814485490259478, + "grad_norm": 0.937919020652771, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 29780 + }, + { + "epoch": 4.816102174440223, + "grad_norm": 0.9593307971954346, + "learning_rate": 0.0002, + "loss": 0.5885, + "step": 29790 + }, + { + "epoch": 4.817718858620968, + "grad_norm": 0.9431198835372925, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 29800 + }, + { + "epoch": 4.8193355428017135, + "grad_norm": 1.2729957103729248, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 29810 + }, + { + "epoch": 4.820952226982459, + "grad_norm": 0.8876838684082031, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 29820 + }, + { + "epoch": 4.822568911163204, + "grad_norm": 1.0185000896453857, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 29830 + }, + { + "epoch": 4.824185595343949, + "grad_norm": 1.064276099205017, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 29840 + }, + { + "epoch": 4.825802279524694, + "grad_norm": 0.9774803519248962, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 29850 + }, + { + "epoch": 4.8274189637054405, + "grad_norm": 1.131646990776062, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 29860 + }, + { + "epoch": 4.829035647886186, + "grad_norm": 1.081455945968628, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 29870 + }, + { + "epoch": 4.830652332066931, + "grad_norm": 0.990538477897644, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 29880 + }, + { + "epoch": 4.832269016247676, + "grad_norm": 0.9750600457191467, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 29890 + }, + { + "epoch": 4.8338857004284215, + "grad_norm": 1.0600621700286865, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 29900 + }, + { + "epoch": 4.835502384609167, + "grad_norm": 0.9237320423126221, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 29910 + }, + { + "epoch": 4.837119068789912, + "grad_norm": 0.9739177227020264, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 29920 + }, + { + "epoch": 4.838735752970657, + "grad_norm": 1.128677248954773, + "learning_rate": 0.0002, + "loss": 0.587, + "step": 29930 + }, + { + "epoch": 4.840352437151402, + "grad_norm": 1.042604923248291, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 29940 + }, + { + "epoch": 4.841969121332148, + "grad_norm": 0.849758505821228, + "learning_rate": 0.0002, + "loss": 0.5885, + "step": 29950 + }, + { + "epoch": 4.843585805512893, + "grad_norm": 1.2809888124465942, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 29960 + }, + { + "epoch": 4.845202489693638, + "grad_norm": 1.0177865028381348, + "learning_rate": 0.0002, + "loss": 0.5703, + "step": 29970 + }, + { + "epoch": 4.846819173874383, + "grad_norm": 1.0026639699935913, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 29980 + }, + { + "epoch": 4.8484358580551286, + "grad_norm": 0.9679505228996277, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 29990 + }, + { + "epoch": 4.850052542235874, + "grad_norm": 0.8939532041549683, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 30000 + }, + { + "epoch": 4.85166922641662, + "grad_norm": 0.9957457780838013, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 30010 + }, + { + "epoch": 4.853285910597365, + "grad_norm": 1.1646790504455566, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 30020 + }, + { + "epoch": 4.85490259477811, + "grad_norm": 0.8804680705070496, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 30030 + }, + { + "epoch": 4.856519278958856, + "grad_norm": 1.161970853805542, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 30040 + }, + { + "epoch": 4.858135963139601, + "grad_norm": 0.9081037640571594, + "learning_rate": 0.0002, + "loss": 0.5552, + "step": 30050 + }, + { + "epoch": 4.859752647320346, + "grad_norm": 0.9402848482131958, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 30060 + }, + { + "epoch": 4.861369331501091, + "grad_norm": 0.9023865461349487, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 30070 + }, + { + "epoch": 4.8629860156818365, + "grad_norm": 1.0173414945602417, + "learning_rate": 0.0002, + "loss": 0.5926, + "step": 30080 + }, + { + "epoch": 4.864602699862582, + "grad_norm": 1.084402322769165, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 30090 + }, + { + "epoch": 4.866219384043327, + "grad_norm": 0.9577937126159668, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 30100 + }, + { + "epoch": 4.867836068224072, + "grad_norm": 0.9807606935501099, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 30110 + }, + { + "epoch": 4.8694527524048175, + "grad_norm": 0.978784441947937, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 30120 + }, + { + "epoch": 4.871069436585563, + "grad_norm": 0.9762914776802063, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 30130 + }, + { + "epoch": 4.872686120766308, + "grad_norm": 0.9404871463775635, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 30140 + }, + { + "epoch": 4.874302804947053, + "grad_norm": 1.0069509744644165, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 30150 + }, + { + "epoch": 4.875919489127799, + "grad_norm": 1.1770923137664795, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 30160 + }, + { + "epoch": 4.8775361733085445, + "grad_norm": 1.021210789680481, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 30170 + }, + { + "epoch": 4.87915285748929, + "grad_norm": 0.8512648940086365, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 30180 + }, + { + "epoch": 4.880769541670035, + "grad_norm": 0.9345870018005371, + "learning_rate": 0.0002, + "loss": 0.5519, + "step": 30190 + }, + { + "epoch": 4.88238622585078, + "grad_norm": 1.0224418640136719, + "learning_rate": 0.0002, + "loss": 0.5682, + "step": 30200 + }, + { + "epoch": 4.884002910031525, + "grad_norm": 1.0316044092178345, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 30210 + }, + { + "epoch": 4.885619594212271, + "grad_norm": 1.102437973022461, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 30220 + }, + { + "epoch": 4.887236278393016, + "grad_norm": 1.0220023393630981, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 30230 + }, + { + "epoch": 4.888852962573761, + "grad_norm": 1.0934523344039917, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 30240 + }, + { + "epoch": 4.890469646754506, + "grad_norm": 1.264630913734436, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 30250 + }, + { + "epoch": 4.892086330935252, + "grad_norm": 1.0999879837036133, + "learning_rate": 0.0002, + "loss": 0.5712, + "step": 30260 + }, + { + "epoch": 4.893703015115997, + "grad_norm": 0.9124550223350525, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 30270 + }, + { + "epoch": 4.895319699296742, + "grad_norm": 0.9853624105453491, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 30280 + }, + { + "epoch": 4.896936383477488, + "grad_norm": 1.0589802265167236, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 30290 + }, + { + "epoch": 4.8985530676582325, + "grad_norm": 0.8487226366996765, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 30300 + }, + { + "epoch": 4.900169751838979, + "grad_norm": 1.0212191343307495, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 30310 + }, + { + "epoch": 4.901786436019724, + "grad_norm": 1.0187491178512573, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 30320 + }, + { + "epoch": 4.903403120200469, + "grad_norm": 1.0013091564178467, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 30330 + }, + { + "epoch": 4.905019804381214, + "grad_norm": 1.0017542839050293, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 30340 + }, + { + "epoch": 4.9066364885619596, + "grad_norm": 0.9665151238441467, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 30350 + }, + { + "epoch": 4.908253172742705, + "grad_norm": 0.8774822950363159, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 30360 + }, + { + "epoch": 4.90986985692345, + "grad_norm": 0.9449850916862488, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 30370 + }, + { + "epoch": 4.911486541104195, + "grad_norm": 0.7368341088294983, + "learning_rate": 0.0002, + "loss": 0.58, + "step": 30380 + }, + { + "epoch": 4.9131032252849405, + "grad_norm": 0.9669167995452881, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 30390 + }, + { + "epoch": 4.914719909465686, + "grad_norm": 1.1227794885635376, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 30400 + }, + { + "epoch": 4.916336593646431, + "grad_norm": 0.9884361028671265, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 30410 + }, + { + "epoch": 4.917953277827176, + "grad_norm": 0.9949551224708557, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 30420 + }, + { + "epoch": 4.919569962007921, + "grad_norm": 0.9491621851921082, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 30430 + }, + { + "epoch": 4.9211866461886675, + "grad_norm": 0.78848797082901, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 30440 + }, + { + "epoch": 4.922803330369412, + "grad_norm": 1.0693835020065308, + "learning_rate": 0.0002, + "loss": 0.5561, + "step": 30450 + }, + { + "epoch": 4.924420014550158, + "grad_norm": 0.9573729634284973, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 30460 + }, + { + "epoch": 4.926036698730903, + "grad_norm": 0.9975152611732483, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 30470 + }, + { + "epoch": 4.9276533829116484, + "grad_norm": 0.8695693016052246, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 30480 + }, + { + "epoch": 4.929270067092394, + "grad_norm": 1.145394206047058, + "learning_rate": 0.0002, + "loss": 0.6144, + "step": 30490 + }, + { + "epoch": 4.930886751273139, + "grad_norm": 0.7668989896774292, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 30500 + }, + { + "epoch": 4.932503435453884, + "grad_norm": 0.9630151391029358, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 30510 + }, + { + "epoch": 4.934120119634629, + "grad_norm": 0.940705418586731, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 30520 + }, + { + "epoch": 4.935736803815375, + "grad_norm": 1.3243348598480225, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 30530 + }, + { + "epoch": 4.93735348799612, + "grad_norm": 1.004347801208496, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 30540 + }, + { + "epoch": 4.938970172176865, + "grad_norm": 0.8711541295051575, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 30550 + }, + { + "epoch": 4.94058685635761, + "grad_norm": 0.8980631828308105, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 30560 + }, + { + "epoch": 4.9422035405383555, + "grad_norm": 0.8388893604278564, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 30570 + }, + { + "epoch": 4.943820224719101, + "grad_norm": 1.0991183519363403, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 30580 + }, + { + "epoch": 4.945436908899847, + "grad_norm": 0.9731075763702393, + "learning_rate": 0.0002, + "loss": 0.5906, + "step": 30590 + }, + { + "epoch": 4.947053593080591, + "grad_norm": 1.3904452323913574, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 30600 + }, + { + "epoch": 4.948670277261337, + "grad_norm": 1.2489882707595825, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 30610 + }, + { + "epoch": 4.950286961442083, + "grad_norm": 1.240072250366211, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 30620 + }, + { + "epoch": 4.951903645622828, + "grad_norm": 0.9191411733627319, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 30630 + }, + { + "epoch": 4.953520329803573, + "grad_norm": 0.8888895511627197, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 30640 + }, + { + "epoch": 4.955137013984318, + "grad_norm": 0.9001450538635254, + "learning_rate": 0.0002, + "loss": 0.6594, + "step": 30650 + }, + { + "epoch": 4.9567536981650635, + "grad_norm": 1.053971767425537, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 30660 + }, + { + "epoch": 4.958370382345809, + "grad_norm": 1.2224042415618896, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 30670 + }, + { + "epoch": 4.959987066526554, + "grad_norm": 0.8855111598968506, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 30680 + }, + { + "epoch": 4.961603750707299, + "grad_norm": 0.9489575624465942, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 30690 + }, + { + "epoch": 4.963220434888044, + "grad_norm": 0.9635404944419861, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 30700 + }, + { + "epoch": 4.96483711906879, + "grad_norm": 1.1784121990203857, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 30710 + }, + { + "epoch": 4.966453803249535, + "grad_norm": 1.0059462785720825, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 30720 + }, + { + "epoch": 4.96807048743028, + "grad_norm": 0.9479738473892212, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 30730 + }, + { + "epoch": 4.969687171611026, + "grad_norm": 1.0624593496322632, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 30740 + }, + { + "epoch": 4.971303855791771, + "grad_norm": 1.1429259777069092, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 30750 + }, + { + "epoch": 4.972920539972517, + "grad_norm": 0.9102491140365601, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 30760 + }, + { + "epoch": 4.974537224153262, + "grad_norm": 1.1262688636779785, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 30770 + }, + { + "epoch": 4.976153908334007, + "grad_norm": 1.1415393352508545, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 30780 + }, + { + "epoch": 4.977770592514752, + "grad_norm": 1.083078384399414, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 30790 + }, + { + "epoch": 4.979387276695498, + "grad_norm": 0.964859127998352, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 30800 + }, + { + "epoch": 4.981003960876243, + "grad_norm": 0.8704743385314941, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 30810 + }, + { + "epoch": 4.982620645056988, + "grad_norm": 1.0714856386184692, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 30820 + }, + { + "epoch": 4.984237329237733, + "grad_norm": 0.6818771362304688, + "learning_rate": 0.0002, + "loss": 0.565, + "step": 30830 + }, + { + "epoch": 4.985854013418479, + "grad_norm": 1.0454156398773193, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 30840 + }, + { + "epoch": 4.987470697599224, + "grad_norm": 0.9410776495933533, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 30850 + }, + { + "epoch": 4.989087381779969, + "grad_norm": 1.0878902673721313, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 30860 + }, + { + "epoch": 4.990704065960714, + "grad_norm": 0.8916727304458618, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 30870 + }, + { + "epoch": 4.9923207501414595, + "grad_norm": 1.045776128768921, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 30880 + }, + { + "epoch": 4.993937434322206, + "grad_norm": 0.9861903786659241, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 30890 + }, + { + "epoch": 4.995554118502951, + "grad_norm": 0.9275050759315491, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 30900 + }, + { + "epoch": 4.997170802683696, + "grad_norm": 0.94013911485672, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 30910 + }, + { + "epoch": 4.998787486864441, + "grad_norm": 0.9771268367767334, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 30920 + }, + { + "epoch": 4.9999191657909625, + "eval_loss": 1.1968598365783691, + "eval_runtime": 122.2519, + "eval_samples_per_second": 5.996, + "eval_steps_per_second": 0.753, + "step": 30927 + }, + { + "epoch": 5.0004041710451865, + "grad_norm": 0.8021580576896667, + "learning_rate": 0.0002, + "loss": 0.5238, + "step": 30930 + }, + { + "epoch": 5.002020855225932, + "grad_norm": 1.0807327032089233, + "learning_rate": 0.0002, + "loss": 0.4984, + "step": 30940 + }, + { + "epoch": 5.003637539406677, + "grad_norm": 1.1638425588607788, + "learning_rate": 0.0002, + "loss": 0.514, + "step": 30950 + }, + { + "epoch": 5.005254223587422, + "grad_norm": 1.1700230836868286, + "learning_rate": 0.0002, + "loss": 0.4621, + "step": 30960 + }, + { + "epoch": 5.0068709077681675, + "grad_norm": 0.9053420424461365, + "learning_rate": 0.0002, + "loss": 0.4657, + "step": 30970 + }, + { + "epoch": 5.008487591948913, + "grad_norm": 0.9226111769676208, + "learning_rate": 0.0002, + "loss": 0.4865, + "step": 30980 + }, + { + "epoch": 5.010104276129658, + "grad_norm": 1.238669514656067, + "learning_rate": 0.0002, + "loss": 0.5011, + "step": 30990 + }, + { + "epoch": 5.011720960310403, + "grad_norm": 1.0668327808380127, + "learning_rate": 0.0002, + "loss": 0.4754, + "step": 31000 + }, + { + "epoch": 5.013337644491148, + "grad_norm": 1.0903944969177246, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 31010 + }, + { + "epoch": 5.014954328671894, + "grad_norm": 1.0763911008834839, + "learning_rate": 0.0002, + "loss": 0.5117, + "step": 31020 + }, + { + "epoch": 5.016571012852639, + "grad_norm": 1.0108771324157715, + "learning_rate": 0.0002, + "loss": 0.4908, + "step": 31030 + }, + { + "epoch": 5.018187697033385, + "grad_norm": 0.8816103935241699, + "learning_rate": 0.0002, + "loss": 0.5052, + "step": 31040 + }, + { + "epoch": 5.01980438121413, + "grad_norm": 1.11434805393219, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 31050 + }, + { + "epoch": 5.021421065394875, + "grad_norm": 1.0727789402008057, + "learning_rate": 0.0002, + "loss": 0.5074, + "step": 31060 + }, + { + "epoch": 5.023037749575621, + "grad_norm": 1.1480379104614258, + "learning_rate": 0.0002, + "loss": 0.4938, + "step": 31070 + }, + { + "epoch": 5.024654433756366, + "grad_norm": 1.0913071632385254, + "learning_rate": 0.0002, + "loss": 0.491, + "step": 31080 + }, + { + "epoch": 5.026271117937111, + "grad_norm": 0.9891864657402039, + "learning_rate": 0.0002, + "loss": 0.4896, + "step": 31090 + }, + { + "epoch": 5.027887802117856, + "grad_norm": 0.9167473912239075, + "learning_rate": 0.0002, + "loss": 0.4965, + "step": 31100 + }, + { + "epoch": 5.029504486298602, + "grad_norm": 1.2259035110473633, + "learning_rate": 0.0002, + "loss": 0.5098, + "step": 31110 + }, + { + "epoch": 5.031121170479347, + "grad_norm": 1.1812787055969238, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 31120 + }, + { + "epoch": 5.032737854660092, + "grad_norm": 1.0890522003173828, + "learning_rate": 0.0002, + "loss": 0.4725, + "step": 31130 + }, + { + "epoch": 5.034354538840837, + "grad_norm": 1.0521091222763062, + "learning_rate": 0.0002, + "loss": 0.4768, + "step": 31140 + }, + { + "epoch": 5.0359712230215825, + "grad_norm": 1.1274569034576416, + "learning_rate": 0.0002, + "loss": 0.4718, + "step": 31150 + }, + { + "epoch": 5.037587907202328, + "grad_norm": 1.140974998474121, + "learning_rate": 0.0002, + "loss": 0.4604, + "step": 31160 + }, + { + "epoch": 5.039204591383073, + "grad_norm": 1.1215609312057495, + "learning_rate": 0.0002, + "loss": 0.5077, + "step": 31170 + }, + { + "epoch": 5.040821275563818, + "grad_norm": 1.0107218027114868, + "learning_rate": 0.0002, + "loss": 0.4746, + "step": 31180 + }, + { + "epoch": 5.042437959744564, + "grad_norm": 1.0198770761489868, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 31190 + }, + { + "epoch": 5.0440546439253096, + "grad_norm": 1.1613430976867676, + "learning_rate": 0.0002, + "loss": 0.5004, + "step": 31200 + }, + { + "epoch": 5.045671328106055, + "grad_norm": 0.8555458188056946, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 31210 + }, + { + "epoch": 5.0472880122868, + "grad_norm": 1.0235545635223389, + "learning_rate": 0.0002, + "loss": 0.4878, + "step": 31220 + }, + { + "epoch": 5.048904696467545, + "grad_norm": 1.0228750705718994, + "learning_rate": 0.0002, + "loss": 0.499, + "step": 31230 + }, + { + "epoch": 5.0505213806482905, + "grad_norm": 0.8216419816017151, + "learning_rate": 0.0002, + "loss": 0.4544, + "step": 31240 + }, + { + "epoch": 5.052138064829036, + "grad_norm": 0.925828218460083, + "learning_rate": 0.0002, + "loss": 0.4947, + "step": 31250 + }, + { + "epoch": 5.053754749009781, + "grad_norm": 0.9229369759559631, + "learning_rate": 0.0002, + "loss": 0.4835, + "step": 31260 + }, + { + "epoch": 5.055371433190526, + "grad_norm": 0.9531727433204651, + "learning_rate": 0.0002, + "loss": 0.5136, + "step": 31270 + }, + { + "epoch": 5.056988117371271, + "grad_norm": 0.7738548517227173, + "learning_rate": 0.0002, + "loss": 0.5161, + "step": 31280 + }, + { + "epoch": 5.058604801552017, + "grad_norm": 1.0551451444625854, + "learning_rate": 0.0002, + "loss": 0.5166, + "step": 31290 + }, + { + "epoch": 5.060221485732762, + "grad_norm": 0.9782299399375916, + "learning_rate": 0.0002, + "loss": 0.4953, + "step": 31300 + }, + { + "epoch": 5.061838169913507, + "grad_norm": 1.0220632553100586, + "learning_rate": 0.0002, + "loss": 0.4776, + "step": 31310 + }, + { + "epoch": 5.063454854094252, + "grad_norm": 0.9808892607688904, + "learning_rate": 0.0002, + "loss": 0.5117, + "step": 31320 + }, + { + "epoch": 5.065071538274998, + "grad_norm": 1.0662003755569458, + "learning_rate": 0.0002, + "loss": 0.501, + "step": 31330 + }, + { + "epoch": 5.066688222455744, + "grad_norm": 1.0036940574645996, + "learning_rate": 0.0002, + "loss": 0.4844, + "step": 31340 + }, + { + "epoch": 5.068304906636489, + "grad_norm": 1.1931052207946777, + "learning_rate": 0.0002, + "loss": 0.5299, + "step": 31350 + }, + { + "epoch": 5.069921590817234, + "grad_norm": 0.9370693564414978, + "learning_rate": 0.0002, + "loss": 0.4646, + "step": 31360 + }, + { + "epoch": 5.071538274997979, + "grad_norm": 0.9589039087295532, + "learning_rate": 0.0002, + "loss": 0.5274, + "step": 31370 + }, + { + "epoch": 5.073154959178725, + "grad_norm": 1.0052711963653564, + "learning_rate": 0.0002, + "loss": 0.4669, + "step": 31380 + }, + { + "epoch": 5.07477164335947, + "grad_norm": 0.9991368651390076, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 31390 + }, + { + "epoch": 5.076388327540215, + "grad_norm": 0.8539695739746094, + "learning_rate": 0.0002, + "loss": 0.4579, + "step": 31400 + }, + { + "epoch": 5.07800501172096, + "grad_norm": 1.048775553703308, + "learning_rate": 0.0002, + "loss": 0.4609, + "step": 31410 + }, + { + "epoch": 5.0796216959017055, + "grad_norm": 0.9983724355697632, + "learning_rate": 0.0002, + "loss": 0.4915, + "step": 31420 + }, + { + "epoch": 5.081238380082451, + "grad_norm": 1.0189813375473022, + "learning_rate": 0.0002, + "loss": 0.4594, + "step": 31430 + }, + { + "epoch": 5.082855064263196, + "grad_norm": 0.9781646728515625, + "learning_rate": 0.0002, + "loss": 0.5449, + "step": 31440 + }, + { + "epoch": 5.084471748443941, + "grad_norm": 0.9424566030502319, + "learning_rate": 0.0002, + "loss": 0.4698, + "step": 31450 + }, + { + "epoch": 5.0860884326246865, + "grad_norm": 1.0036484003067017, + "learning_rate": 0.0002, + "loss": 0.4768, + "step": 31460 + }, + { + "epoch": 5.087705116805432, + "grad_norm": 1.0983147621154785, + "learning_rate": 0.0002, + "loss": 0.487, + "step": 31470 + }, + { + "epoch": 5.089321800986177, + "grad_norm": 1.0856730937957764, + "learning_rate": 0.0002, + "loss": 0.5236, + "step": 31480 + }, + { + "epoch": 5.090938485166923, + "grad_norm": 1.2191699743270874, + "learning_rate": 0.0002, + "loss": 0.485, + "step": 31490 + }, + { + "epoch": 5.092555169347668, + "grad_norm": 0.939346194267273, + "learning_rate": 0.0002, + "loss": 0.4936, + "step": 31500 + }, + { + "epoch": 5.0941718535284135, + "grad_norm": 0.9730121493339539, + "learning_rate": 0.0002, + "loss": 0.5107, + "step": 31510 + }, + { + "epoch": 5.095788537709159, + "grad_norm": 0.923686146736145, + "learning_rate": 0.0002, + "loss": 0.4973, + "step": 31520 + }, + { + "epoch": 5.097405221889904, + "grad_norm": 1.1734349727630615, + "learning_rate": 0.0002, + "loss": 0.4906, + "step": 31530 + }, + { + "epoch": 5.099021906070649, + "grad_norm": 1.084509015083313, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 31540 + }, + { + "epoch": 5.100638590251394, + "grad_norm": 1.0144678354263306, + "learning_rate": 0.0002, + "loss": 0.5078, + "step": 31550 + }, + { + "epoch": 5.10225527443214, + "grad_norm": 0.9958019256591797, + "learning_rate": 0.0002, + "loss": 0.4719, + "step": 31560 + }, + { + "epoch": 5.103871958612885, + "grad_norm": 0.8900736570358276, + "learning_rate": 0.0002, + "loss": 0.4876, + "step": 31570 + }, + { + "epoch": 5.10548864279363, + "grad_norm": 1.0921649932861328, + "learning_rate": 0.0002, + "loss": 0.463, + "step": 31580 + }, + { + "epoch": 5.107105326974375, + "grad_norm": 1.1613792181015015, + "learning_rate": 0.0002, + "loss": 0.5148, + "step": 31590 + }, + { + "epoch": 5.108722011155121, + "grad_norm": 0.9211367964744568, + "learning_rate": 0.0002, + "loss": 0.5055, + "step": 31600 + }, + { + "epoch": 5.110338695335866, + "grad_norm": 1.3315813541412354, + "learning_rate": 0.0002, + "loss": 0.5364, + "step": 31610 + }, + { + "epoch": 5.111955379516611, + "grad_norm": 1.3765019178390503, + "learning_rate": 0.0002, + "loss": 0.5336, + "step": 31620 + }, + { + "epoch": 5.113572063697356, + "grad_norm": 1.070198893547058, + "learning_rate": 0.0002, + "loss": 0.4861, + "step": 31630 + }, + { + "epoch": 5.115188747878102, + "grad_norm": 0.947631299495697, + "learning_rate": 0.0002, + "loss": 0.5046, + "step": 31640 + }, + { + "epoch": 5.116805432058848, + "grad_norm": 1.0197371244430542, + "learning_rate": 0.0002, + "loss": 0.5297, + "step": 31650 + }, + { + "epoch": 5.118422116239593, + "grad_norm": 0.8647911548614502, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 31660 + }, + { + "epoch": 5.120038800420338, + "grad_norm": 0.8944075107574463, + "learning_rate": 0.0002, + "loss": 0.4705, + "step": 31670 + }, + { + "epoch": 5.121655484601083, + "grad_norm": 1.124497652053833, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 31680 + }, + { + "epoch": 5.123272168781829, + "grad_norm": 0.893131673336029, + "learning_rate": 0.0002, + "loss": 0.5109, + "step": 31690 + }, + { + "epoch": 5.124888852962574, + "grad_norm": 1.0122284889221191, + "learning_rate": 0.0002, + "loss": 0.4937, + "step": 31700 + }, + { + "epoch": 5.126505537143319, + "grad_norm": 0.9493719935417175, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 31710 + }, + { + "epoch": 5.128122221324064, + "grad_norm": 0.9700539112091064, + "learning_rate": 0.0002, + "loss": 0.5031, + "step": 31720 + }, + { + "epoch": 5.1297389055048095, + "grad_norm": 1.111677646636963, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 31730 + }, + { + "epoch": 5.131355589685555, + "grad_norm": 0.8204274773597717, + "learning_rate": 0.0002, + "loss": 0.5272, + "step": 31740 + }, + { + "epoch": 5.1329722738663, + "grad_norm": 1.1029267311096191, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 31750 + }, + { + "epoch": 5.134588958047045, + "grad_norm": 1.065575122833252, + "learning_rate": 0.0002, + "loss": 0.505, + "step": 31760 + }, + { + "epoch": 5.13620564222779, + "grad_norm": 0.8208706974983215, + "learning_rate": 0.0002, + "loss": 0.502, + "step": 31770 + }, + { + "epoch": 5.137822326408536, + "grad_norm": 1.0520979166030884, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 31780 + }, + { + "epoch": 5.139439010589282, + "grad_norm": 0.8585538268089294, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 31790 + }, + { + "epoch": 5.141055694770027, + "grad_norm": 1.1491447687149048, + "learning_rate": 0.0002, + "loss": 0.5159, + "step": 31800 + }, + { + "epoch": 5.142672378950772, + "grad_norm": 0.9441081285476685, + "learning_rate": 0.0002, + "loss": 0.5157, + "step": 31810 + }, + { + "epoch": 5.1442890631315175, + "grad_norm": 1.4146889448165894, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 31820 + }, + { + "epoch": 5.145905747312263, + "grad_norm": 1.0326547622680664, + "learning_rate": 0.0002, + "loss": 0.5159, + "step": 31830 + }, + { + "epoch": 5.147522431493008, + "grad_norm": 0.9879202842712402, + "learning_rate": 0.0002, + "loss": 0.5348, + "step": 31840 + }, + { + "epoch": 5.149139115673753, + "grad_norm": 1.0374281406402588, + "learning_rate": 0.0002, + "loss": 0.5083, + "step": 31850 + }, + { + "epoch": 5.150755799854498, + "grad_norm": 1.181229591369629, + "learning_rate": 0.0002, + "loss": 0.4827, + "step": 31860 + }, + { + "epoch": 5.152372484035244, + "grad_norm": 1.2078537940979004, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 31870 + }, + { + "epoch": 5.153989168215989, + "grad_norm": 0.9599190354347229, + "learning_rate": 0.0002, + "loss": 0.5329, + "step": 31880 + }, + { + "epoch": 5.155605852396734, + "grad_norm": 1.0378568172454834, + "learning_rate": 0.0002, + "loss": 0.4953, + "step": 31890 + }, + { + "epoch": 5.157222536577479, + "grad_norm": 0.8746536374092102, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 31900 + }, + { + "epoch": 5.1588392207582245, + "grad_norm": 1.0232136249542236, + "learning_rate": 0.0002, + "loss": 0.5272, + "step": 31910 + }, + { + "epoch": 5.16045590493897, + "grad_norm": 0.9827565550804138, + "learning_rate": 0.0002, + "loss": 0.4844, + "step": 31920 + }, + { + "epoch": 5.162072589119716, + "grad_norm": 1.342657208442688, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 31930 + }, + { + "epoch": 5.163689273300461, + "grad_norm": 1.18390691280365, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 31940 + }, + { + "epoch": 5.165305957481206, + "grad_norm": 0.996350109577179, + "learning_rate": 0.0002, + "loss": 0.5267, + "step": 31950 + }, + { + "epoch": 5.166922641661952, + "grad_norm": 0.9710391163825989, + "learning_rate": 0.0002, + "loss": 0.5063, + "step": 31960 + }, + { + "epoch": 5.168539325842697, + "grad_norm": 1.0264002084732056, + "learning_rate": 0.0002, + "loss": 0.5115, + "step": 31970 + }, + { + "epoch": 5.170156010023442, + "grad_norm": 1.0028311014175415, + "learning_rate": 0.0002, + "loss": 0.4972, + "step": 31980 + }, + { + "epoch": 5.171772694204187, + "grad_norm": 1.1078234910964966, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 31990 + }, + { + "epoch": 5.1733893783849325, + "grad_norm": 0.9659610390663147, + "learning_rate": 0.0002, + "loss": 0.495, + "step": 32000 + }, + { + "epoch": 5.175006062565678, + "grad_norm": 0.841986894607544, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 32010 + }, + { + "epoch": 5.176622746746423, + "grad_norm": 1.095332384109497, + "learning_rate": 0.0002, + "loss": 0.48, + "step": 32020 + }, + { + "epoch": 5.178239430927168, + "grad_norm": 1.1242377758026123, + "learning_rate": 0.0002, + "loss": 0.4741, + "step": 32030 + }, + { + "epoch": 5.179856115107913, + "grad_norm": 0.9872292280197144, + "learning_rate": 0.0002, + "loss": 0.5573, + "step": 32040 + }, + { + "epoch": 5.181472799288659, + "grad_norm": 0.936161994934082, + "learning_rate": 0.0002, + "loss": 0.48, + "step": 32050 + }, + { + "epoch": 5.183089483469404, + "grad_norm": 1.166100025177002, + "learning_rate": 0.0002, + "loss": 0.5093, + "step": 32060 + }, + { + "epoch": 5.184706167650149, + "grad_norm": 1.0764425992965698, + "learning_rate": 0.0002, + "loss": 0.5438, + "step": 32070 + }, + { + "epoch": 5.186322851830895, + "grad_norm": 1.0480051040649414, + "learning_rate": 0.0002, + "loss": 0.4843, + "step": 32080 + }, + { + "epoch": 5.1879395360116405, + "grad_norm": 1.0874916315078735, + "learning_rate": 0.0002, + "loss": 0.5386, + "step": 32090 + }, + { + "epoch": 5.189556220192386, + "grad_norm": 1.0817396640777588, + "learning_rate": 0.0002, + "loss": 0.4975, + "step": 32100 + }, + { + "epoch": 5.191172904373131, + "grad_norm": 1.054111361503601, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 32110 + }, + { + "epoch": 5.192789588553876, + "grad_norm": 0.9655823707580566, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 32120 + }, + { + "epoch": 5.194406272734621, + "grad_norm": 1.1384109258651733, + "learning_rate": 0.0002, + "loss": 0.5105, + "step": 32130 + }, + { + "epoch": 5.196022956915367, + "grad_norm": 1.0149348974227905, + "learning_rate": 0.0002, + "loss": 0.5073, + "step": 32140 + }, + { + "epoch": 5.197639641096112, + "grad_norm": 1.1084046363830566, + "learning_rate": 0.0002, + "loss": 0.5293, + "step": 32150 + }, + { + "epoch": 5.199256325276857, + "grad_norm": 1.1209309101104736, + "learning_rate": 0.0002, + "loss": 0.4936, + "step": 32160 + }, + { + "epoch": 5.200873009457602, + "grad_norm": 1.133089542388916, + "learning_rate": 0.0002, + "loss": 0.5101, + "step": 32170 + }, + { + "epoch": 5.202489693638348, + "grad_norm": 1.0893020629882812, + "learning_rate": 0.0002, + "loss": 0.5242, + "step": 32180 + }, + { + "epoch": 5.204106377819093, + "grad_norm": 0.90018630027771, + "learning_rate": 0.0002, + "loss": 0.4872, + "step": 32190 + }, + { + "epoch": 5.205723061999838, + "grad_norm": 0.977622926235199, + "learning_rate": 0.0002, + "loss": 0.4999, + "step": 32200 + }, + { + "epoch": 5.207339746180583, + "grad_norm": 1.2940177917480469, + "learning_rate": 0.0002, + "loss": 0.5028, + "step": 32210 + }, + { + "epoch": 5.2089564303613285, + "grad_norm": 1.2131710052490234, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 32220 + }, + { + "epoch": 5.210573114542075, + "grad_norm": 1.0234841108322144, + "learning_rate": 0.0002, + "loss": 0.5189, + "step": 32230 + }, + { + "epoch": 5.21218979872282, + "grad_norm": 1.157975435256958, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 32240 + }, + { + "epoch": 5.213806482903565, + "grad_norm": 1.0381282567977905, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 32250 + }, + { + "epoch": 5.21542316708431, + "grad_norm": 1.0125395059585571, + "learning_rate": 0.0002, + "loss": 0.5192, + "step": 32260 + }, + { + "epoch": 5.2170398512650555, + "grad_norm": 1.272691011428833, + "learning_rate": 0.0002, + "loss": 0.5216, + "step": 32270 + }, + { + "epoch": 5.218656535445801, + "grad_norm": 1.0061250925064087, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 32280 + }, + { + "epoch": 5.220273219626546, + "grad_norm": 0.9752234816551208, + "learning_rate": 0.0002, + "loss": 0.4739, + "step": 32290 + }, + { + "epoch": 5.221889903807291, + "grad_norm": 1.1193140745162964, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 32300 + }, + { + "epoch": 5.2235065879880365, + "grad_norm": 1.0126434564590454, + "learning_rate": 0.0002, + "loss": 0.4976, + "step": 32310 + }, + { + "epoch": 5.225123272168782, + "grad_norm": 1.4338394403457642, + "learning_rate": 0.0002, + "loss": 0.5257, + "step": 32320 + }, + { + "epoch": 5.226739956349527, + "grad_norm": 1.004101276397705, + "learning_rate": 0.0002, + "loss": 0.5235, + "step": 32330 + }, + { + "epoch": 5.228356640530272, + "grad_norm": 0.8744166493415833, + "learning_rate": 0.0002, + "loss": 0.5091, + "step": 32340 + }, + { + "epoch": 5.229973324711017, + "grad_norm": 1.0165376663208008, + "learning_rate": 0.0002, + "loss": 0.5388, + "step": 32350 + }, + { + "epoch": 5.231590008891763, + "grad_norm": 0.8635954260826111, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 32360 + }, + { + "epoch": 5.233206693072509, + "grad_norm": 1.1392399072647095, + "learning_rate": 0.0002, + "loss": 0.5609, + "step": 32370 + }, + { + "epoch": 5.234823377253254, + "grad_norm": 1.0202113389968872, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 32380 + }, + { + "epoch": 5.236440061433999, + "grad_norm": 1.0417983531951904, + "learning_rate": 0.0002, + "loss": 0.4983, + "step": 32390 + }, + { + "epoch": 5.238056745614744, + "grad_norm": 0.8729333877563477, + "learning_rate": 0.0002, + "loss": 0.507, + "step": 32400 + }, + { + "epoch": 5.23967342979549, + "grad_norm": 1.1626229286193848, + "learning_rate": 0.0002, + "loss": 0.5426, + "step": 32410 + }, + { + "epoch": 5.241290113976235, + "grad_norm": 0.9086161851882935, + "learning_rate": 0.0002, + "loss": 0.5355, + "step": 32420 + }, + { + "epoch": 5.24290679815698, + "grad_norm": 1.3999892473220825, + "learning_rate": 0.0002, + "loss": 0.4927, + "step": 32430 + }, + { + "epoch": 5.244523482337725, + "grad_norm": 1.0356311798095703, + "learning_rate": 0.0002, + "loss": 0.4795, + "step": 32440 + }, + { + "epoch": 5.246140166518471, + "grad_norm": 0.9655531644821167, + "learning_rate": 0.0002, + "loss": 0.5035, + "step": 32450 + }, + { + "epoch": 5.247756850699216, + "grad_norm": 1.0411828756332397, + "learning_rate": 0.0002, + "loss": 0.5166, + "step": 32460 + }, + { + "epoch": 5.249373534879961, + "grad_norm": 1.1199816465377808, + "learning_rate": 0.0002, + "loss": 0.5141, + "step": 32470 + }, + { + "epoch": 5.250990219060706, + "grad_norm": 1.260321855545044, + "learning_rate": 0.0002, + "loss": 0.4864, + "step": 32480 + }, + { + "epoch": 5.2526069032414515, + "grad_norm": 1.2950857877731323, + "learning_rate": 0.0002, + "loss": 0.4893, + "step": 32490 + }, + { + "epoch": 5.254223587422197, + "grad_norm": 0.8982820510864258, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 32500 + }, + { + "epoch": 5.255840271602942, + "grad_norm": 0.8512987494468689, + "learning_rate": 0.0002, + "loss": 0.5138, + "step": 32510 + }, + { + "epoch": 5.257456955783688, + "grad_norm": 1.067443609237671, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 32520 + }, + { + "epoch": 5.259073639964433, + "grad_norm": 1.0957417488098145, + "learning_rate": 0.0002, + "loss": 0.4928, + "step": 32530 + }, + { + "epoch": 5.260690324145179, + "grad_norm": 1.4161807298660278, + "learning_rate": 0.0002, + "loss": 0.5169, + "step": 32540 + }, + { + "epoch": 5.262307008325924, + "grad_norm": 1.2264093160629272, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 32550 + }, + { + "epoch": 5.263923692506669, + "grad_norm": 1.0015931129455566, + "learning_rate": 0.0002, + "loss": 0.5221, + "step": 32560 + }, + { + "epoch": 5.265540376687414, + "grad_norm": 1.0743094682693481, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 32570 + }, + { + "epoch": 5.2671570608681595, + "grad_norm": 1.1386840343475342, + "learning_rate": 0.0002, + "loss": 0.5289, + "step": 32580 + }, + { + "epoch": 5.268773745048905, + "grad_norm": 1.0093860626220703, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 32590 + }, + { + "epoch": 5.27039042922965, + "grad_norm": 0.9593744874000549, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 32600 + }, + { + "epoch": 5.272007113410395, + "grad_norm": 1.146021842956543, + "learning_rate": 0.0002, + "loss": 0.528, + "step": 32610 + }, + { + "epoch": 5.27362379759114, + "grad_norm": 0.9579031467437744, + "learning_rate": 0.0002, + "loss": 0.4983, + "step": 32620 + }, + { + "epoch": 5.275240481771886, + "grad_norm": 1.0548793077468872, + "learning_rate": 0.0002, + "loss": 0.5376, + "step": 32630 + }, + { + "epoch": 5.276857165952631, + "grad_norm": 1.0380561351776123, + "learning_rate": 0.0002, + "loss": 0.5267, + "step": 32640 + }, + { + "epoch": 5.278473850133376, + "grad_norm": 1.2119969129562378, + "learning_rate": 0.0002, + "loss": 0.5182, + "step": 32650 + }, + { + "epoch": 5.280090534314121, + "grad_norm": 1.0507797002792358, + "learning_rate": 0.0002, + "loss": 0.5298, + "step": 32660 + }, + { + "epoch": 5.2817072184948675, + "grad_norm": 1.0185176134109497, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 32670 + }, + { + "epoch": 5.283323902675613, + "grad_norm": 1.2358098030090332, + "learning_rate": 0.0002, + "loss": 0.4904, + "step": 32680 + }, + { + "epoch": 5.284940586856358, + "grad_norm": 0.7937114238739014, + "learning_rate": 0.0002, + "loss": 0.5169, + "step": 32690 + }, + { + "epoch": 5.286557271037103, + "grad_norm": 0.9825124740600586, + "learning_rate": 0.0002, + "loss": 0.495, + "step": 32700 + }, + { + "epoch": 5.288173955217848, + "grad_norm": 1.2059301137924194, + "learning_rate": 0.0002, + "loss": 0.5149, + "step": 32710 + }, + { + "epoch": 5.289790639398594, + "grad_norm": 1.0828571319580078, + "learning_rate": 0.0002, + "loss": 0.5272, + "step": 32720 + }, + { + "epoch": 5.291407323579339, + "grad_norm": 1.0129735469818115, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 32730 + }, + { + "epoch": 5.293024007760084, + "grad_norm": 1.0591634511947632, + "learning_rate": 0.0002, + "loss": 0.5216, + "step": 32740 + }, + { + "epoch": 5.294640691940829, + "grad_norm": 0.9256815910339355, + "learning_rate": 0.0002, + "loss": 0.522, + "step": 32750 + }, + { + "epoch": 5.2962573761215745, + "grad_norm": 1.0928633213043213, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 32760 + }, + { + "epoch": 5.29787406030232, + "grad_norm": 0.9415594935417175, + "learning_rate": 0.0002, + "loss": 0.5093, + "step": 32770 + }, + { + "epoch": 5.299490744483065, + "grad_norm": 1.141316294670105, + "learning_rate": 0.0002, + "loss": 0.5252, + "step": 32780 + }, + { + "epoch": 5.30110742866381, + "grad_norm": 1.0646510124206543, + "learning_rate": 0.0002, + "loss": 0.4837, + "step": 32790 + }, + { + "epoch": 5.3027241128445555, + "grad_norm": 1.189661979675293, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 32800 + }, + { + "epoch": 5.304340797025301, + "grad_norm": 0.9568731188774109, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 32810 + }, + { + "epoch": 5.305957481206047, + "grad_norm": 1.1556824445724487, + "learning_rate": 0.0002, + "loss": 0.5344, + "step": 32820 + }, + { + "epoch": 5.307574165386792, + "grad_norm": 0.9353463649749756, + "learning_rate": 0.0002, + "loss": 0.4894, + "step": 32830 + }, + { + "epoch": 5.309190849567537, + "grad_norm": 1.1208295822143555, + "learning_rate": 0.0002, + "loss": 0.5052, + "step": 32840 + }, + { + "epoch": 5.3108075337482825, + "grad_norm": 1.0894153118133545, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 32850 + }, + { + "epoch": 5.312424217929028, + "grad_norm": 1.090329647064209, + "learning_rate": 0.0002, + "loss": 0.5046, + "step": 32860 + }, + { + "epoch": 5.314040902109773, + "grad_norm": 1.0781712532043457, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 32870 + }, + { + "epoch": 5.315657586290518, + "grad_norm": 1.1785295009613037, + "learning_rate": 0.0002, + "loss": 0.57, + "step": 32880 + }, + { + "epoch": 5.317274270471263, + "grad_norm": 1.0406851768493652, + "learning_rate": 0.0002, + "loss": 0.4953, + "step": 32890 + }, + { + "epoch": 5.318890954652009, + "grad_norm": 1.0982953310012817, + "learning_rate": 0.0002, + "loss": 0.514, + "step": 32900 + }, + { + "epoch": 5.320507638832754, + "grad_norm": 1.2969383001327515, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 32910 + }, + { + "epoch": 5.322124323013499, + "grad_norm": 0.9687288999557495, + "learning_rate": 0.0002, + "loss": 0.4786, + "step": 32920 + }, + { + "epoch": 5.323741007194244, + "grad_norm": 1.136760950088501, + "learning_rate": 0.0002, + "loss": 0.5286, + "step": 32930 + }, + { + "epoch": 5.32535769137499, + "grad_norm": 1.3045495748519897, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 32940 + }, + { + "epoch": 5.326974375555735, + "grad_norm": 1.221675992012024, + "learning_rate": 0.0002, + "loss": 0.5413, + "step": 32950 + }, + { + "epoch": 5.32859105973648, + "grad_norm": 1.1380633115768433, + "learning_rate": 0.0002, + "loss": 0.4999, + "step": 32960 + }, + { + "epoch": 5.330207743917226, + "grad_norm": 1.1065956354141235, + "learning_rate": 0.0002, + "loss": 0.5037, + "step": 32970 + }, + { + "epoch": 5.331824428097971, + "grad_norm": 1.0187175273895264, + "learning_rate": 0.0002, + "loss": 0.4913, + "step": 32980 + }, + { + "epoch": 5.333441112278717, + "grad_norm": 0.9077118039131165, + "learning_rate": 0.0002, + "loss": 0.5234, + "step": 32990 + }, + { + "epoch": 5.335057796459462, + "grad_norm": 1.0092815160751343, + "learning_rate": 0.0002, + "loss": 0.5071, + "step": 33000 + }, + { + "epoch": 5.336674480640207, + "grad_norm": 1.0168777704238892, + "learning_rate": 0.0002, + "loss": 0.498, + "step": 33010 + }, + { + "epoch": 5.338291164820952, + "grad_norm": 0.996161937713623, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 33020 + }, + { + "epoch": 5.339907849001698, + "grad_norm": 0.794463038444519, + "learning_rate": 0.0002, + "loss": 0.5024, + "step": 33030 + }, + { + "epoch": 5.341524533182443, + "grad_norm": 0.9750674962997437, + "learning_rate": 0.0002, + "loss": 0.5112, + "step": 33040 + }, + { + "epoch": 5.343141217363188, + "grad_norm": 1.2770029306411743, + "learning_rate": 0.0002, + "loss": 0.528, + "step": 33050 + }, + { + "epoch": 5.344757901543933, + "grad_norm": 1.1500186920166016, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 33060 + }, + { + "epoch": 5.3463745857246785, + "grad_norm": 1.0726377964019775, + "learning_rate": 0.0002, + "loss": 0.4906, + "step": 33070 + }, + { + "epoch": 5.347991269905424, + "grad_norm": 0.9314153790473938, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 33080 + }, + { + "epoch": 5.349607954086169, + "grad_norm": 1.344988465309143, + "learning_rate": 0.0002, + "loss": 0.5434, + "step": 33090 + }, + { + "epoch": 5.351224638266914, + "grad_norm": 0.863196611404419, + "learning_rate": 0.0002, + "loss": 0.4874, + "step": 33100 + }, + { + "epoch": 5.352841322447659, + "grad_norm": 1.128100037574768, + "learning_rate": 0.0002, + "loss": 0.534, + "step": 33110 + }, + { + "epoch": 5.3544580066284055, + "grad_norm": 1.1673583984375, + "learning_rate": 0.0002, + "loss": 0.5293, + "step": 33120 + }, + { + "epoch": 5.356074690809151, + "grad_norm": 0.9416789412498474, + "learning_rate": 0.0002, + "loss": 0.4787, + "step": 33130 + }, + { + "epoch": 5.357691374989896, + "grad_norm": 1.1855236291885376, + "learning_rate": 0.0002, + "loss": 0.5155, + "step": 33140 + }, + { + "epoch": 5.359308059170641, + "grad_norm": 1.0415170192718506, + "learning_rate": 0.0002, + "loss": 0.515, + "step": 33150 + }, + { + "epoch": 5.3609247433513865, + "grad_norm": 0.9953004121780396, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 33160 + }, + { + "epoch": 5.362541427532132, + "grad_norm": 0.96138596534729, + "learning_rate": 0.0002, + "loss": 0.5305, + "step": 33170 + }, + { + "epoch": 5.364158111712877, + "grad_norm": 1.341979742050171, + "learning_rate": 0.0002, + "loss": 0.5064, + "step": 33180 + }, + { + "epoch": 5.365774795893622, + "grad_norm": 1.0136911869049072, + "learning_rate": 0.0002, + "loss": 0.4986, + "step": 33190 + }, + { + "epoch": 5.367391480074367, + "grad_norm": 0.8685575127601624, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 33200 + }, + { + "epoch": 5.369008164255113, + "grad_norm": 0.8833574652671814, + "learning_rate": 0.0002, + "loss": 0.5146, + "step": 33210 + }, + { + "epoch": 5.370624848435858, + "grad_norm": 0.9123612642288208, + "learning_rate": 0.0002, + "loss": 0.4982, + "step": 33220 + }, + { + "epoch": 5.372241532616603, + "grad_norm": 1.2720599174499512, + "learning_rate": 0.0002, + "loss": 0.5047, + "step": 33230 + }, + { + "epoch": 5.373858216797348, + "grad_norm": 1.0596648454666138, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 33240 + }, + { + "epoch": 5.3754749009780936, + "grad_norm": 1.119701623916626, + "learning_rate": 0.0002, + "loss": 0.5284, + "step": 33250 + }, + { + "epoch": 5.377091585158839, + "grad_norm": 1.3000061511993408, + "learning_rate": 0.0002, + "loss": 0.5217, + "step": 33260 + }, + { + "epoch": 5.378708269339585, + "grad_norm": 1.083891749382019, + "learning_rate": 0.0002, + "loss": 0.5125, + "step": 33270 + }, + { + "epoch": 5.38032495352033, + "grad_norm": 0.9402718544006348, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 33280 + }, + { + "epoch": 5.381941637701075, + "grad_norm": 1.3376892805099487, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 33290 + }, + { + "epoch": 5.383558321881821, + "grad_norm": 1.1600074768066406, + "learning_rate": 0.0002, + "loss": 0.5193, + "step": 33300 + }, + { + "epoch": 5.385175006062566, + "grad_norm": 1.1449427604675293, + "learning_rate": 0.0002, + "loss": 0.4907, + "step": 33310 + }, + { + "epoch": 5.386791690243311, + "grad_norm": 1.3118891716003418, + "learning_rate": 0.0002, + "loss": 0.5449, + "step": 33320 + }, + { + "epoch": 5.388408374424056, + "grad_norm": 0.743449866771698, + "learning_rate": 0.0002, + "loss": 0.547, + "step": 33330 + }, + { + "epoch": 5.3900250586048015, + "grad_norm": 0.9358304142951965, + "learning_rate": 0.0002, + "loss": 0.5555, + "step": 33340 + }, + { + "epoch": 5.391641742785547, + "grad_norm": 1.0447142124176025, + "learning_rate": 0.0002, + "loss": 0.5558, + "step": 33350 + }, + { + "epoch": 5.393258426966292, + "grad_norm": 1.1088626384735107, + "learning_rate": 0.0002, + "loss": 0.5106, + "step": 33360 + }, + { + "epoch": 5.394875111147037, + "grad_norm": 1.1267958879470825, + "learning_rate": 0.0002, + "loss": 0.4929, + "step": 33370 + }, + { + "epoch": 5.3964917953277824, + "grad_norm": 0.9709370136260986, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 33380 + }, + { + "epoch": 5.398108479508528, + "grad_norm": 1.0939103364944458, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 33390 + }, + { + "epoch": 5.399725163689273, + "grad_norm": 0.9559304714202881, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 33400 + }, + { + "epoch": 5.401341847870018, + "grad_norm": 1.199580430984497, + "learning_rate": 0.0002, + "loss": 0.5064, + "step": 33410 + }, + { + "epoch": 5.402958532050764, + "grad_norm": 0.9097000360488892, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 33420 + }, + { + "epoch": 5.4045752162315095, + "grad_norm": 1.1940981149673462, + "learning_rate": 0.0002, + "loss": 0.514, + "step": 33430 + }, + { + "epoch": 5.406191900412255, + "grad_norm": 1.0530916452407837, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 33440 + }, + { + "epoch": 5.407808584593, + "grad_norm": 1.0482549667358398, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 33450 + }, + { + "epoch": 5.409425268773745, + "grad_norm": 1.2524714469909668, + "learning_rate": 0.0002, + "loss": 0.501, + "step": 33460 + }, + { + "epoch": 5.41104195295449, + "grad_norm": 1.1091666221618652, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 33470 + }, + { + "epoch": 5.412658637135236, + "grad_norm": 0.9981587529182434, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 33480 + }, + { + "epoch": 5.414275321315981, + "grad_norm": 1.016681432723999, + "learning_rate": 0.0002, + "loss": 0.4977, + "step": 33490 + }, + { + "epoch": 5.415892005496726, + "grad_norm": 1.1456854343414307, + "learning_rate": 0.0002, + "loss": 0.5388, + "step": 33500 + }, + { + "epoch": 5.417508689677471, + "grad_norm": 1.1454259157180786, + "learning_rate": 0.0002, + "loss": 0.5292, + "step": 33510 + }, + { + "epoch": 5.419125373858217, + "grad_norm": 0.9858416318893433, + "learning_rate": 0.0002, + "loss": 0.5061, + "step": 33520 + }, + { + "epoch": 5.420742058038962, + "grad_norm": 0.9764766693115234, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 33530 + }, + { + "epoch": 5.422358742219707, + "grad_norm": 1.199920892715454, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 33540 + }, + { + "epoch": 5.423975426400452, + "grad_norm": 1.3107370138168335, + "learning_rate": 0.0002, + "loss": 0.5182, + "step": 33550 + }, + { + "epoch": 5.4255921105811975, + "grad_norm": 0.9637970328330994, + "learning_rate": 0.0002, + "loss": 0.5149, + "step": 33560 + }, + { + "epoch": 5.427208794761944, + "grad_norm": 1.023359775543213, + "learning_rate": 0.0002, + "loss": 0.526, + "step": 33570 + }, + { + "epoch": 5.428825478942689, + "grad_norm": 1.060417652130127, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 33580 + }, + { + "epoch": 5.430442163123434, + "grad_norm": 0.9971120953559875, + "learning_rate": 0.0002, + "loss": 0.5052, + "step": 33590 + }, + { + "epoch": 5.432058847304179, + "grad_norm": 0.9213743209838867, + "learning_rate": 0.0002, + "loss": 0.5044, + "step": 33600 + }, + { + "epoch": 5.4336755314849245, + "grad_norm": 1.1512309312820435, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 33610 + }, + { + "epoch": 5.43529221566567, + "grad_norm": 1.2198847532272339, + "learning_rate": 0.0002, + "loss": 0.5317, + "step": 33620 + }, + { + "epoch": 5.436908899846415, + "grad_norm": 1.0329595804214478, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 33630 + }, + { + "epoch": 5.43852558402716, + "grad_norm": 1.1075750589370728, + "learning_rate": 0.0002, + "loss": 0.5364, + "step": 33640 + }, + { + "epoch": 5.4401422682079055, + "grad_norm": 1.006342887878418, + "learning_rate": 0.0002, + "loss": 0.5295, + "step": 33650 + }, + { + "epoch": 5.441758952388651, + "grad_norm": 0.9179885983467102, + "learning_rate": 0.0002, + "loss": 0.5394, + "step": 33660 + }, + { + "epoch": 5.443375636569396, + "grad_norm": 1.2799493074417114, + "learning_rate": 0.0002, + "loss": 0.5124, + "step": 33670 + }, + { + "epoch": 5.444992320750141, + "grad_norm": 1.1153863668441772, + "learning_rate": 0.0002, + "loss": 0.5426, + "step": 33680 + }, + { + "epoch": 5.446609004930886, + "grad_norm": 1.0681028366088867, + "learning_rate": 0.0002, + "loss": 0.5087, + "step": 33690 + }, + { + "epoch": 5.448225689111632, + "grad_norm": 0.9788817167282104, + "learning_rate": 0.0002, + "loss": 0.5272, + "step": 33700 + }, + { + "epoch": 5.449842373292377, + "grad_norm": 0.8481608629226685, + "learning_rate": 0.0002, + "loss": 0.5308, + "step": 33710 + }, + { + "epoch": 5.451459057473123, + "grad_norm": 1.113756537437439, + "learning_rate": 0.0002, + "loss": 0.5225, + "step": 33720 + }, + { + "epoch": 5.453075741653868, + "grad_norm": 0.8425475358963013, + "learning_rate": 0.0002, + "loss": 0.5213, + "step": 33730 + }, + { + "epoch": 5.4546924258346134, + "grad_norm": 1.0852208137512207, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 33740 + }, + { + "epoch": 5.456309110015359, + "grad_norm": 1.1664748191833496, + "learning_rate": 0.0002, + "loss": 0.5535, + "step": 33750 + }, + { + "epoch": 5.457925794196104, + "grad_norm": 1.217241644859314, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 33760 + }, + { + "epoch": 5.459542478376849, + "grad_norm": 1.1572928428649902, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 33770 + }, + { + "epoch": 5.461159162557594, + "grad_norm": 1.0437318086624146, + "learning_rate": 0.0002, + "loss": 0.5161, + "step": 33780 + }, + { + "epoch": 5.46277584673834, + "grad_norm": 0.9807571768760681, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 33790 + }, + { + "epoch": 5.464392530919085, + "grad_norm": 1.1436342000961304, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 33800 + }, + { + "epoch": 5.46600921509983, + "grad_norm": 1.1004794836044312, + "learning_rate": 0.0002, + "loss": 0.5338, + "step": 33810 + }, + { + "epoch": 5.467625899280575, + "grad_norm": 1.2130268812179565, + "learning_rate": 0.0002, + "loss": 0.4868, + "step": 33820 + }, + { + "epoch": 5.4692425834613205, + "grad_norm": 1.3154419660568237, + "learning_rate": 0.0002, + "loss": 0.516, + "step": 33830 + }, + { + "epoch": 5.470859267642066, + "grad_norm": 0.7934383749961853, + "learning_rate": 0.0002, + "loss": 0.4934, + "step": 33840 + }, + { + "epoch": 5.472475951822812, + "grad_norm": 0.7838410139083862, + "learning_rate": 0.0002, + "loss": 0.5133, + "step": 33850 + }, + { + "epoch": 5.474092636003557, + "grad_norm": 1.0415139198303223, + "learning_rate": 0.0002, + "loss": 0.4926, + "step": 33860 + }, + { + "epoch": 5.475709320184302, + "grad_norm": 0.9213164448738098, + "learning_rate": 0.0002, + "loss": 0.5323, + "step": 33870 + }, + { + "epoch": 5.477326004365048, + "grad_norm": 1.0364776849746704, + "learning_rate": 0.0002, + "loss": 0.5125, + "step": 33880 + }, + { + "epoch": 5.478942688545793, + "grad_norm": 0.9994072318077087, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 33890 + }, + { + "epoch": 5.480559372726538, + "grad_norm": 1.196730136871338, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 33900 + }, + { + "epoch": 5.482176056907283, + "grad_norm": 0.9955780506134033, + "learning_rate": 0.0002, + "loss": 0.538, + "step": 33910 + }, + { + "epoch": 5.4837927410880285, + "grad_norm": 1.168188214302063, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 33920 + }, + { + "epoch": 5.485409425268774, + "grad_norm": 1.1816450357437134, + "learning_rate": 0.0002, + "loss": 0.5548, + "step": 33930 + }, + { + "epoch": 5.487026109449519, + "grad_norm": 1.079715609550476, + "learning_rate": 0.0002, + "loss": 0.5535, + "step": 33940 + }, + { + "epoch": 5.488642793630264, + "grad_norm": 1.153850793838501, + "learning_rate": 0.0002, + "loss": 0.5262, + "step": 33950 + }, + { + "epoch": 5.490259477811009, + "grad_norm": 1.0207297801971436, + "learning_rate": 0.0002, + "loss": 0.5248, + "step": 33960 + }, + { + "epoch": 5.491876161991755, + "grad_norm": 1.1290855407714844, + "learning_rate": 0.0002, + "loss": 0.5142, + "step": 33970 + }, + { + "epoch": 5.4934928461725, + "grad_norm": 1.068058967590332, + "learning_rate": 0.0002, + "loss": 0.5168, + "step": 33980 + }, + { + "epoch": 5.495109530353245, + "grad_norm": 0.9789979457855225, + "learning_rate": 0.0002, + "loss": 0.5317, + "step": 33990 + }, + { + "epoch": 5.496726214533991, + "grad_norm": 0.9696692824363708, + "learning_rate": 0.0002, + "loss": 0.5113, + "step": 34000 + }, + { + "epoch": 5.4983428987147365, + "grad_norm": 1.0539981126785278, + "learning_rate": 0.0002, + "loss": 0.5413, + "step": 34010 + }, + { + "epoch": 5.499959582895482, + "grad_norm": 1.0249929428100586, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 34020 + }, + { + "epoch": 5.501576267076227, + "grad_norm": 0.9577504992485046, + "learning_rate": 0.0002, + "loss": 0.4888, + "step": 34030 + }, + { + "epoch": 5.503192951256972, + "grad_norm": 1.0963513851165771, + "learning_rate": 0.0002, + "loss": 0.5291, + "step": 34040 + }, + { + "epoch": 5.504809635437717, + "grad_norm": 0.8339345455169678, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 34050 + }, + { + "epoch": 5.506426319618463, + "grad_norm": 1.0138782262802124, + "learning_rate": 0.0002, + "loss": 0.5191, + "step": 34060 + }, + { + "epoch": 5.508043003799208, + "grad_norm": 1.0180109739303589, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 34070 + }, + { + "epoch": 5.509659687979953, + "grad_norm": 1.2790818214416504, + "learning_rate": 0.0002, + "loss": 0.5083, + "step": 34080 + }, + { + "epoch": 5.511276372160698, + "grad_norm": 1.428247332572937, + "learning_rate": 0.0002, + "loss": 0.5195, + "step": 34090 + }, + { + "epoch": 5.5128930563414436, + "grad_norm": 1.0926059484481812, + "learning_rate": 0.0002, + "loss": 0.5291, + "step": 34100 + }, + { + "epoch": 5.514509740522189, + "grad_norm": 1.2353343963623047, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 34110 + }, + { + "epoch": 5.516126424702934, + "grad_norm": 0.935587465763092, + "learning_rate": 0.0002, + "loss": 0.5331, + "step": 34120 + }, + { + "epoch": 5.517743108883679, + "grad_norm": 0.9767586588859558, + "learning_rate": 0.0002, + "loss": 0.5512, + "step": 34130 + }, + { + "epoch": 5.5193597930644245, + "grad_norm": 1.1660610437393188, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 34140 + }, + { + "epoch": 5.520976477245171, + "grad_norm": 0.9828870892524719, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 34150 + }, + { + "epoch": 5.522593161425916, + "grad_norm": 1.0097278356552124, + "learning_rate": 0.0002, + "loss": 0.5198, + "step": 34160 + }, + { + "epoch": 5.524209845606661, + "grad_norm": 1.1766167879104614, + "learning_rate": 0.0002, + "loss": 0.5293, + "step": 34170 + }, + { + "epoch": 5.525826529787406, + "grad_norm": 0.982292115688324, + "learning_rate": 0.0002, + "loss": 0.5258, + "step": 34180 + }, + { + "epoch": 5.5274432139681515, + "grad_norm": 1.0744609832763672, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 34190 + }, + { + "epoch": 5.529059898148897, + "grad_norm": 1.3831160068511963, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 34200 + }, + { + "epoch": 5.530676582329642, + "grad_norm": 1.074771761894226, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 34210 + }, + { + "epoch": 5.532293266510387, + "grad_norm": 1.016652226448059, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 34220 + }, + { + "epoch": 5.5339099506911325, + "grad_norm": 1.2231552600860596, + "learning_rate": 0.0002, + "loss": 0.5158, + "step": 34230 + }, + { + "epoch": 5.535526634871878, + "grad_norm": 0.8051198720932007, + "learning_rate": 0.0002, + "loss": 0.5091, + "step": 34240 + }, + { + "epoch": 5.537143319052623, + "grad_norm": 1.1779674291610718, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 34250 + }, + { + "epoch": 5.538760003233368, + "grad_norm": 1.2468291521072388, + "learning_rate": 0.0002, + "loss": 0.5044, + "step": 34260 + }, + { + "epoch": 5.540376687414113, + "grad_norm": 1.14818274974823, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 34270 + }, + { + "epoch": 5.541993371594859, + "grad_norm": 1.2362616062164307, + "learning_rate": 0.0002, + "loss": 0.5375, + "step": 34280 + }, + { + "epoch": 5.543610055775604, + "grad_norm": 1.0206977128982544, + "learning_rate": 0.0002, + "loss": 0.4996, + "step": 34290 + }, + { + "epoch": 5.54522673995635, + "grad_norm": 1.2018457651138306, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 34300 + }, + { + "epoch": 5.546843424137095, + "grad_norm": 1.0349043607711792, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 34310 + }, + { + "epoch": 5.54846010831784, + "grad_norm": 1.2022006511688232, + "learning_rate": 0.0002, + "loss": 0.5231, + "step": 34320 + }, + { + "epoch": 5.550076792498586, + "grad_norm": 1.0810624361038208, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 34330 + }, + { + "epoch": 5.551693476679331, + "grad_norm": 1.3297529220581055, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 34340 + }, + { + "epoch": 5.553310160860076, + "grad_norm": 0.9722549915313721, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 34350 + }, + { + "epoch": 5.554926845040821, + "grad_norm": 0.9903425574302673, + "learning_rate": 0.0002, + "loss": 0.4823, + "step": 34360 + }, + { + "epoch": 5.556543529221567, + "grad_norm": 0.9568067789077759, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 34370 + }, + { + "epoch": 5.558160213402312, + "grad_norm": 1.113870620727539, + "learning_rate": 0.0002, + "loss": 0.5242, + "step": 34380 + }, + { + "epoch": 5.559776897583057, + "grad_norm": 1.0557632446289062, + "learning_rate": 0.0002, + "loss": 0.5278, + "step": 34390 + }, + { + "epoch": 5.561393581763802, + "grad_norm": 0.9615673422813416, + "learning_rate": 0.0002, + "loss": 0.5501, + "step": 34400 + }, + { + "epoch": 5.5630102659445475, + "grad_norm": 0.9536027312278748, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 34410 + }, + { + "epoch": 5.564626950125293, + "grad_norm": 0.8808749318122864, + "learning_rate": 0.0002, + "loss": 0.4949, + "step": 34420 + }, + { + "epoch": 5.566243634306038, + "grad_norm": 1.286132574081421, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 34430 + }, + { + "epoch": 5.567860318486783, + "grad_norm": 1.259644865989685, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 34440 + }, + { + "epoch": 5.569477002667529, + "grad_norm": 0.9920216798782349, + "learning_rate": 0.0002, + "loss": 0.4922, + "step": 34450 + }, + { + "epoch": 5.5710936868482746, + "grad_norm": 1.182926893234253, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 34460 + }, + { + "epoch": 5.57271037102902, + "grad_norm": 1.1434749364852905, + "learning_rate": 0.0002, + "loss": 0.5185, + "step": 34470 + }, + { + "epoch": 5.574327055209765, + "grad_norm": 1.2420979738235474, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 34480 + }, + { + "epoch": 5.57594373939051, + "grad_norm": 0.9338384866714478, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 34490 + }, + { + "epoch": 5.5775604235712555, + "grad_norm": 1.0196425914764404, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 34500 + }, + { + "epoch": 5.579177107752001, + "grad_norm": 0.9586997032165527, + "learning_rate": 0.0002, + "loss": 0.5132, + "step": 34510 + }, + { + "epoch": 5.580793791932746, + "grad_norm": 1.2409086227416992, + "learning_rate": 0.0002, + "loss": 0.5336, + "step": 34520 + }, + { + "epoch": 5.582410476113491, + "grad_norm": 1.1483757495880127, + "learning_rate": 0.0002, + "loss": 0.5364, + "step": 34530 + }, + { + "epoch": 5.584027160294236, + "grad_norm": 1.1624305248260498, + "learning_rate": 0.0002, + "loss": 0.5325, + "step": 34540 + }, + { + "epoch": 5.585643844474982, + "grad_norm": 1.2635223865509033, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 34550 + }, + { + "epoch": 5.587260528655727, + "grad_norm": 0.9824051856994629, + "learning_rate": 0.0002, + "loss": 0.4924, + "step": 34560 + }, + { + "epoch": 5.588877212836472, + "grad_norm": 1.0858620405197144, + "learning_rate": 0.0002, + "loss": 0.5395, + "step": 34570 + }, + { + "epoch": 5.590493897017217, + "grad_norm": 1.1452655792236328, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 34580 + }, + { + "epoch": 5.592110581197963, + "grad_norm": 1.110610842704773, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 34590 + }, + { + "epoch": 5.593727265378709, + "grad_norm": 0.9976194500923157, + "learning_rate": 0.0002, + "loss": 0.5285, + "step": 34600 + }, + { + "epoch": 5.595343949559454, + "grad_norm": 1.0698920488357544, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 34610 + }, + { + "epoch": 5.596960633740199, + "grad_norm": 1.1505171060562134, + "learning_rate": 0.0002, + "loss": 0.5311, + "step": 34620 + }, + { + "epoch": 5.598577317920944, + "grad_norm": 1.1014643907546997, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 34630 + }, + { + "epoch": 5.60019400210169, + "grad_norm": 0.915595293045044, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 34640 + }, + { + "epoch": 5.601810686282435, + "grad_norm": 1.1856765747070312, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 34650 + }, + { + "epoch": 5.60342737046318, + "grad_norm": 1.1357687711715698, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 34660 + }, + { + "epoch": 5.605044054643925, + "grad_norm": 1.0232492685317993, + "learning_rate": 0.0002, + "loss": 0.5034, + "step": 34670 + }, + { + "epoch": 5.6066607388246705, + "grad_norm": 0.9375017881393433, + "learning_rate": 0.0002, + "loss": 0.5357, + "step": 34680 + }, + { + "epoch": 5.608277423005416, + "grad_norm": 1.0796529054641724, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 34690 + }, + { + "epoch": 5.609894107186161, + "grad_norm": 1.1383336782455444, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 34700 + }, + { + "epoch": 5.611510791366906, + "grad_norm": 1.0248544216156006, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 34710 + }, + { + "epoch": 5.6131274755476515, + "grad_norm": 1.0986040830612183, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 34720 + }, + { + "epoch": 5.614744159728397, + "grad_norm": 1.2689568996429443, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 34730 + }, + { + "epoch": 5.616360843909142, + "grad_norm": 1.4044264554977417, + "learning_rate": 0.0002, + "loss": 0.5136, + "step": 34740 + }, + { + "epoch": 5.617977528089888, + "grad_norm": 1.2084474563598633, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 34750 + }, + { + "epoch": 5.619594212270633, + "grad_norm": 1.061248540878296, + "learning_rate": 0.0002, + "loss": 0.5377, + "step": 34760 + }, + { + "epoch": 5.6212108964513785, + "grad_norm": 1.0220764875411987, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 34770 + }, + { + "epoch": 5.622827580632124, + "grad_norm": 1.0859092473983765, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 34780 + }, + { + "epoch": 5.624444264812869, + "grad_norm": 0.9049732089042664, + "learning_rate": 0.0002, + "loss": 0.5308, + "step": 34790 + }, + { + "epoch": 5.626060948993614, + "grad_norm": 1.2103937864303589, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 34800 + }, + { + "epoch": 5.627677633174359, + "grad_norm": 0.9854230284690857, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 34810 + }, + { + "epoch": 5.629294317355105, + "grad_norm": 0.9316635131835938, + "learning_rate": 0.0002, + "loss": 0.5274, + "step": 34820 + }, + { + "epoch": 5.63091100153585, + "grad_norm": 1.105296015739441, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 34830 + }, + { + "epoch": 5.632527685716595, + "grad_norm": 0.993383526802063, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 34840 + }, + { + "epoch": 5.63414436989734, + "grad_norm": 1.1544116735458374, + "learning_rate": 0.0002, + "loss": 0.5375, + "step": 34850 + }, + { + "epoch": 5.635761054078086, + "grad_norm": 1.284475326538086, + "learning_rate": 0.0002, + "loss": 0.5448, + "step": 34860 + }, + { + "epoch": 5.637377738258831, + "grad_norm": 1.121997594833374, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 34870 + }, + { + "epoch": 5.638994422439576, + "grad_norm": 1.213040828704834, + "learning_rate": 0.0002, + "loss": 0.5335, + "step": 34880 + }, + { + "epoch": 5.640611106620321, + "grad_norm": 1.23222017288208, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 34890 + }, + { + "epoch": 5.642227790801067, + "grad_norm": 0.9793637990951538, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 34900 + }, + { + "epoch": 5.643844474981813, + "grad_norm": 1.38919997215271, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 34910 + }, + { + "epoch": 5.645461159162558, + "grad_norm": 0.8390951156616211, + "learning_rate": 0.0002, + "loss": 0.5007, + "step": 34920 + }, + { + "epoch": 5.647077843343303, + "grad_norm": 0.9465909004211426, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 34930 + }, + { + "epoch": 5.648694527524048, + "grad_norm": 1.066957712173462, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 34940 + }, + { + "epoch": 5.650311211704794, + "grad_norm": 0.9842154383659363, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 34950 + }, + { + "epoch": 5.651927895885539, + "grad_norm": 1.1766440868377686, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 34960 + }, + { + "epoch": 5.653544580066284, + "grad_norm": 0.9061306118965149, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 34970 + }, + { + "epoch": 5.655161264247029, + "grad_norm": 1.2941309213638306, + "learning_rate": 0.0002, + "loss": 0.5446, + "step": 34980 + }, + { + "epoch": 5.6567779484277745, + "grad_norm": 0.9741247892379761, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 34990 + }, + { + "epoch": 5.65839463260852, + "grad_norm": 1.0784187316894531, + "learning_rate": 0.0002, + "loss": 0.5152, + "step": 35000 + }, + { + "epoch": 5.660011316789265, + "grad_norm": 0.937889814376831, + "learning_rate": 0.0002, + "loss": 0.5363, + "step": 35010 + }, + { + "epoch": 5.66162800097001, + "grad_norm": 0.9667879939079285, + "learning_rate": 0.0002, + "loss": 0.5019, + "step": 35020 + }, + { + "epoch": 5.663244685150756, + "grad_norm": 1.0554876327514648, + "learning_rate": 0.0002, + "loss": 0.5209, + "step": 35030 + }, + { + "epoch": 5.664861369331501, + "grad_norm": 1.2030539512634277, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 35040 + }, + { + "epoch": 5.666478053512247, + "grad_norm": 1.0849953889846802, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 35050 + }, + { + "epoch": 5.668094737692992, + "grad_norm": 1.1598973274230957, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 35060 + }, + { + "epoch": 5.669711421873737, + "grad_norm": 1.0233359336853027, + "learning_rate": 0.0002, + "loss": 0.5488, + "step": 35070 + }, + { + "epoch": 5.6713281060544825, + "grad_norm": 1.1124799251556396, + "learning_rate": 0.0002, + "loss": 0.5409, + "step": 35080 + }, + { + "epoch": 5.672944790235228, + "grad_norm": 1.2351475954055786, + "learning_rate": 0.0002, + "loss": 0.5578, + "step": 35090 + }, + { + "epoch": 5.674561474415973, + "grad_norm": 1.0240728855133057, + "learning_rate": 0.0002, + "loss": 0.5638, + "step": 35100 + }, + { + "epoch": 5.676178158596718, + "grad_norm": 1.0223692655563354, + "learning_rate": 0.0002, + "loss": 0.5192, + "step": 35110 + }, + { + "epoch": 5.677794842777463, + "grad_norm": 1.4569132328033447, + "learning_rate": 0.0002, + "loss": 0.524, + "step": 35120 + }, + { + "epoch": 5.679411526958209, + "grad_norm": 0.8983587026596069, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 35130 + }, + { + "epoch": 5.681028211138954, + "grad_norm": 1.0775383710861206, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 35140 + }, + { + "epoch": 5.682644895319699, + "grad_norm": 0.9800270795822144, + "learning_rate": 0.0002, + "loss": 0.5289, + "step": 35150 + }, + { + "epoch": 5.684261579500444, + "grad_norm": 0.9858237504959106, + "learning_rate": 0.0002, + "loss": 0.533, + "step": 35160 + }, + { + "epoch": 5.6858782636811895, + "grad_norm": 1.031087040901184, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 35170 + }, + { + "epoch": 5.687494947861936, + "grad_norm": 1.0294365882873535, + "learning_rate": 0.0002, + "loss": 0.5528, + "step": 35180 + }, + { + "epoch": 5.68911163204268, + "grad_norm": 1.108144760131836, + "learning_rate": 0.0002, + "loss": 0.5581, + "step": 35190 + }, + { + "epoch": 5.690728316223426, + "grad_norm": 1.0813100337982178, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 35200 + }, + { + "epoch": 5.692345000404171, + "grad_norm": 1.3146867752075195, + "learning_rate": 0.0002, + "loss": 0.5429, + "step": 35210 + }, + { + "epoch": 5.693961684584917, + "grad_norm": 1.16780424118042, + "learning_rate": 0.0002, + "loss": 0.5297, + "step": 35220 + }, + { + "epoch": 5.695578368765662, + "grad_norm": 0.9929125905036926, + "learning_rate": 0.0002, + "loss": 0.577, + "step": 35230 + }, + { + "epoch": 5.697195052946407, + "grad_norm": 0.9049441814422607, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 35240 + }, + { + "epoch": 5.698811737127152, + "grad_norm": 0.9768866300582886, + "learning_rate": 0.0002, + "loss": 0.5349, + "step": 35250 + }, + { + "epoch": 5.7004284213078975, + "grad_norm": 0.8306029438972473, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 35260 + }, + { + "epoch": 5.702045105488643, + "grad_norm": 0.8417280316352844, + "learning_rate": 0.0002, + "loss": 0.4771, + "step": 35270 + }, + { + "epoch": 5.703661789669388, + "grad_norm": 0.9954485893249512, + "learning_rate": 0.0002, + "loss": 0.574, + "step": 35280 + }, + { + "epoch": 5.705278473850133, + "grad_norm": 1.2417993545532227, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 35290 + }, + { + "epoch": 5.706895158030878, + "grad_norm": 1.1696544885635376, + "learning_rate": 0.0002, + "loss": 0.5275, + "step": 35300 + }, + { + "epoch": 5.708511842211624, + "grad_norm": 1.2424817085266113, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 35310 + }, + { + "epoch": 5.710128526392369, + "grad_norm": 1.1791106462478638, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 35320 + }, + { + "epoch": 5.711745210573115, + "grad_norm": 1.202181339263916, + "learning_rate": 0.0002, + "loss": 0.5076, + "step": 35330 + }, + { + "epoch": 5.713361894753859, + "grad_norm": 1.1006861925125122, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 35340 + }, + { + "epoch": 5.7149785789346055, + "grad_norm": 1.0918344259262085, + "learning_rate": 0.0002, + "loss": 0.5627, + "step": 35350 + }, + { + "epoch": 5.716595263115351, + "grad_norm": 1.0427305698394775, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 35360 + }, + { + "epoch": 5.718211947296096, + "grad_norm": 1.0818872451782227, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 35370 + }, + { + "epoch": 5.719828631476841, + "grad_norm": 1.186006784439087, + "learning_rate": 0.0002, + "loss": 0.5296, + "step": 35380 + }, + { + "epoch": 5.721445315657586, + "grad_norm": 1.2073674201965332, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 35390 + }, + { + "epoch": 5.723061999838332, + "grad_norm": 1.065338134765625, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 35400 + }, + { + "epoch": 5.724678684019077, + "grad_norm": 0.9448973536491394, + "learning_rate": 0.0002, + "loss": 0.5195, + "step": 35410 + }, + { + "epoch": 5.726295368199822, + "grad_norm": 1.1487499475479126, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 35420 + }, + { + "epoch": 5.727912052380567, + "grad_norm": 1.1334216594696045, + "learning_rate": 0.0002, + "loss": 0.5435, + "step": 35430 + }, + { + "epoch": 5.729528736561313, + "grad_norm": 1.1932826042175293, + "learning_rate": 0.0002, + "loss": 0.5074, + "step": 35440 + }, + { + "epoch": 5.731145420742058, + "grad_norm": 1.2615786790847778, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 35450 + }, + { + "epoch": 5.732762104922803, + "grad_norm": 1.2803694009780884, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 35460 + }, + { + "epoch": 5.734378789103548, + "grad_norm": 0.9271906614303589, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 35470 + }, + { + "epoch": 5.735995473284294, + "grad_norm": 1.0958917140960693, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 35480 + }, + { + "epoch": 5.737612157465039, + "grad_norm": 1.1072784662246704, + "learning_rate": 0.0002, + "loss": 0.538, + "step": 35490 + }, + { + "epoch": 5.739228841645785, + "grad_norm": 1.1641002893447876, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 35500 + }, + { + "epoch": 5.74084552582653, + "grad_norm": 1.0246447324752808, + "learning_rate": 0.0002, + "loss": 0.5252, + "step": 35510 + }, + { + "epoch": 5.742462210007275, + "grad_norm": 1.032474398612976, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 35520 + }, + { + "epoch": 5.7440788941880205, + "grad_norm": 1.1600854396820068, + "learning_rate": 0.0002, + "loss": 0.4965, + "step": 35530 + }, + { + "epoch": 5.745695578368766, + "grad_norm": 1.0686054229736328, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 35540 + }, + { + "epoch": 5.747312262549511, + "grad_norm": 1.2314637899398804, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 35550 + }, + { + "epoch": 5.748928946730256, + "grad_norm": 0.922134280204773, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 35560 + }, + { + "epoch": 5.7505456309110015, + "grad_norm": 0.933043360710144, + "learning_rate": 0.0002, + "loss": 0.5495, + "step": 35570 + }, + { + "epoch": 5.752162315091747, + "grad_norm": 1.1911931037902832, + "learning_rate": 0.0002, + "loss": 0.5007, + "step": 35580 + }, + { + "epoch": 5.753778999272492, + "grad_norm": 0.8984857797622681, + "learning_rate": 0.0002, + "loss": 0.5244, + "step": 35590 + }, + { + "epoch": 5.755395683453237, + "grad_norm": 0.9495107531547546, + "learning_rate": 0.0002, + "loss": 0.5493, + "step": 35600 + }, + { + "epoch": 5.757012367633982, + "grad_norm": 1.2805472612380981, + "learning_rate": 0.0002, + "loss": 0.5326, + "step": 35610 + }, + { + "epoch": 5.758629051814728, + "grad_norm": 1.1236625909805298, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 35620 + }, + { + "epoch": 5.760245735995474, + "grad_norm": 1.0552798509597778, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 35630 + }, + { + "epoch": 5.761862420176218, + "grad_norm": 1.119909644126892, + "learning_rate": 0.0002, + "loss": 0.5479, + "step": 35640 + }, + { + "epoch": 5.763479104356964, + "grad_norm": 0.8786116242408752, + "learning_rate": 0.0002, + "loss": 0.5282, + "step": 35650 + }, + { + "epoch": 5.765095788537709, + "grad_norm": 1.2417117357254028, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 35660 + }, + { + "epoch": 5.766712472718455, + "grad_norm": 1.255200982093811, + "learning_rate": 0.0002, + "loss": 0.537, + "step": 35670 + }, + { + "epoch": 5.7683291568992, + "grad_norm": 1.0611358880996704, + "learning_rate": 0.0002, + "loss": 0.5308, + "step": 35680 + }, + { + "epoch": 5.769945841079945, + "grad_norm": 1.1443911790847778, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 35690 + }, + { + "epoch": 5.77156252526069, + "grad_norm": 1.1437989473342896, + "learning_rate": 0.0002, + "loss": 0.5386, + "step": 35700 + }, + { + "epoch": 5.773179209441436, + "grad_norm": 1.1375046968460083, + "learning_rate": 0.0002, + "loss": 0.537, + "step": 35710 + }, + { + "epoch": 5.774795893622181, + "grad_norm": 1.0777729749679565, + "learning_rate": 0.0002, + "loss": 0.5198, + "step": 35720 + }, + { + "epoch": 5.776412577802926, + "grad_norm": 1.1160215139389038, + "learning_rate": 0.0002, + "loss": 0.5521, + "step": 35730 + }, + { + "epoch": 5.778029261983671, + "grad_norm": 1.1268514394760132, + "learning_rate": 0.0002, + "loss": 0.5569, + "step": 35740 + }, + { + "epoch": 5.7796459461644165, + "grad_norm": 1.2752262353897095, + "learning_rate": 0.0002, + "loss": 0.5311, + "step": 35750 + }, + { + "epoch": 5.781262630345162, + "grad_norm": 1.0416184663772583, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 35760 + }, + { + "epoch": 5.782879314525907, + "grad_norm": 1.0622444152832031, + "learning_rate": 0.0002, + "loss": 0.5438, + "step": 35770 + }, + { + "epoch": 5.784495998706653, + "grad_norm": 1.1217877864837646, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 35780 + }, + { + "epoch": 5.786112682887398, + "grad_norm": 0.9363139867782593, + "learning_rate": 0.0002, + "loss": 0.5225, + "step": 35790 + }, + { + "epoch": 5.787729367068144, + "grad_norm": 0.96628737449646, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 35800 + }, + { + "epoch": 5.789346051248889, + "grad_norm": 0.9572572112083435, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 35810 + }, + { + "epoch": 5.790962735429634, + "grad_norm": 0.938724935054779, + "learning_rate": 0.0002, + "loss": 0.5615, + "step": 35820 + }, + { + "epoch": 5.792579419610379, + "grad_norm": 1.3314417600631714, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 35830 + }, + { + "epoch": 5.7941961037911245, + "grad_norm": 1.0097602605819702, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 35840 + }, + { + "epoch": 5.79581278797187, + "grad_norm": 1.1265122890472412, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 35850 + }, + { + "epoch": 5.797429472152615, + "grad_norm": 1.2191909551620483, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 35860 + }, + { + "epoch": 5.79904615633336, + "grad_norm": 0.9690808057785034, + "learning_rate": 0.0002, + "loss": 0.5274, + "step": 35870 + }, + { + "epoch": 5.800662840514105, + "grad_norm": 1.0871665477752686, + "learning_rate": 0.0002, + "loss": 0.5425, + "step": 35880 + }, + { + "epoch": 5.802279524694851, + "grad_norm": 1.1093597412109375, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 35890 + }, + { + "epoch": 5.803896208875596, + "grad_norm": 1.2434282302856445, + "learning_rate": 0.0002, + "loss": 0.5475, + "step": 35900 + }, + { + "epoch": 5.805512893056341, + "grad_norm": 1.2933623790740967, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 35910 + }, + { + "epoch": 5.807129577237086, + "grad_norm": 1.0005441904067993, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 35920 + }, + { + "epoch": 5.8087462614178325, + "grad_norm": 1.2373108863830566, + "learning_rate": 0.0002, + "loss": 0.5318, + "step": 35930 + }, + { + "epoch": 5.810362945598578, + "grad_norm": 1.2622692584991455, + "learning_rate": 0.0002, + "loss": 0.5413, + "step": 35940 + }, + { + "epoch": 5.811979629779323, + "grad_norm": 1.0112963914871216, + "learning_rate": 0.0002, + "loss": 0.5558, + "step": 35950 + }, + { + "epoch": 5.813596313960068, + "grad_norm": 1.050572395324707, + "learning_rate": 0.0002, + "loss": 0.5115, + "step": 35960 + }, + { + "epoch": 5.815212998140813, + "grad_norm": 0.9774560928344727, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 35970 + }, + { + "epoch": 5.816829682321559, + "grad_norm": 1.19438898563385, + "learning_rate": 0.0002, + "loss": 0.585, + "step": 35980 + }, + { + "epoch": 5.818446366502304, + "grad_norm": 1.0267130136489868, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 35990 + }, + { + "epoch": 5.820063050683049, + "grad_norm": 0.9813851714134216, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 36000 + }, + { + "epoch": 5.821679734863794, + "grad_norm": 0.9177457094192505, + "learning_rate": 0.0002, + "loss": 0.5138, + "step": 36010 + }, + { + "epoch": 5.8232964190445395, + "grad_norm": 1.0020731687545776, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 36020 + }, + { + "epoch": 5.824913103225285, + "grad_norm": 1.073222041130066, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 36030 + }, + { + "epoch": 5.82652978740603, + "grad_norm": 1.016337513923645, + "learning_rate": 0.0002, + "loss": 0.5539, + "step": 36040 + }, + { + "epoch": 5.828146471586775, + "grad_norm": 1.267364263534546, + "learning_rate": 0.0002, + "loss": 0.5592, + "step": 36050 + }, + { + "epoch": 5.8297631557675205, + "grad_norm": 1.2730127573013306, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 36060 + }, + { + "epoch": 5.831379839948266, + "grad_norm": 1.108442783355713, + "learning_rate": 0.0002, + "loss": 0.5247, + "step": 36070 + }, + { + "epoch": 5.832996524129012, + "grad_norm": 1.198072075843811, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 36080 + }, + { + "epoch": 5.834613208309757, + "grad_norm": 1.0458786487579346, + "learning_rate": 0.0002, + "loss": 0.5479, + "step": 36090 + }, + { + "epoch": 5.836229892490502, + "grad_norm": 0.9096664786338806, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 36100 + }, + { + "epoch": 5.8378465766712475, + "grad_norm": 0.9957793951034546, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 36110 + }, + { + "epoch": 5.839463260851993, + "grad_norm": 1.3693058490753174, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 36120 + }, + { + "epoch": 5.841079945032738, + "grad_norm": 1.268608808517456, + "learning_rate": 0.0002, + "loss": 0.5425, + "step": 36130 + }, + { + "epoch": 5.842696629213483, + "grad_norm": 0.8516020178794861, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 36140 + }, + { + "epoch": 5.844313313394228, + "grad_norm": 0.90385502576828, + "learning_rate": 0.0002, + "loss": 0.5475, + "step": 36150 + }, + { + "epoch": 5.845929997574974, + "grad_norm": 1.0910571813583374, + "learning_rate": 0.0002, + "loss": 0.5274, + "step": 36160 + }, + { + "epoch": 5.847546681755719, + "grad_norm": 0.9417795538902283, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 36170 + }, + { + "epoch": 5.849163365936464, + "grad_norm": 1.0027360916137695, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 36180 + }, + { + "epoch": 5.850780050117209, + "grad_norm": 1.1480516195297241, + "learning_rate": 0.0002, + "loss": 0.5423, + "step": 36190 + }, + { + "epoch": 5.852396734297955, + "grad_norm": 1.2431457042694092, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 36200 + }, + { + "epoch": 5.8540134184787, + "grad_norm": 1.091465950012207, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 36210 + }, + { + "epoch": 5.855630102659445, + "grad_norm": 0.9693930745124817, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 36220 + }, + { + "epoch": 5.857246786840191, + "grad_norm": 0.9937465190887451, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 36230 + }, + { + "epoch": 5.858863471020936, + "grad_norm": 1.0731011629104614, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 36240 + }, + { + "epoch": 5.860480155201682, + "grad_norm": 1.0869048833847046, + "learning_rate": 0.0002, + "loss": 0.5687, + "step": 36250 + }, + { + "epoch": 5.862096839382427, + "grad_norm": 0.9226390719413757, + "learning_rate": 0.0002, + "loss": 0.5576, + "step": 36260 + }, + { + "epoch": 5.863713523563172, + "grad_norm": 1.1755430698394775, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 36270 + }, + { + "epoch": 5.865330207743917, + "grad_norm": 0.8815974593162537, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 36280 + }, + { + "epoch": 5.866946891924663, + "grad_norm": 1.3648751974105835, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 36290 + }, + { + "epoch": 5.868563576105408, + "grad_norm": 0.8729211091995239, + "learning_rate": 0.0002, + "loss": 0.536, + "step": 36300 + }, + { + "epoch": 5.870180260286153, + "grad_norm": 1.0870907306671143, + "learning_rate": 0.0002, + "loss": 0.5192, + "step": 36310 + }, + { + "epoch": 5.871796944466898, + "grad_norm": 1.1164259910583496, + "learning_rate": 0.0002, + "loss": 0.5609, + "step": 36320 + }, + { + "epoch": 5.8734136286476435, + "grad_norm": 1.1572535037994385, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 36330 + }, + { + "epoch": 5.875030312828389, + "grad_norm": 1.0456238985061646, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 36340 + }, + { + "epoch": 5.876646997009134, + "grad_norm": 1.1310722827911377, + "learning_rate": 0.0002, + "loss": 0.5008, + "step": 36350 + }, + { + "epoch": 5.878263681189879, + "grad_norm": 1.0004712343215942, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 36360 + }, + { + "epoch": 5.879880365370624, + "grad_norm": 1.0991777181625366, + "learning_rate": 0.0002, + "loss": 0.5632, + "step": 36370 + }, + { + "epoch": 5.8814970495513705, + "grad_norm": 1.2789239883422852, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 36380 + }, + { + "epoch": 5.883113733732116, + "grad_norm": 0.9524819850921631, + "learning_rate": 0.0002, + "loss": 0.56, + "step": 36390 + }, + { + "epoch": 5.884730417912861, + "grad_norm": 1.1115771532058716, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 36400 + }, + { + "epoch": 5.886347102093606, + "grad_norm": 1.37419855594635, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 36410 + }, + { + "epoch": 5.8879637862743515, + "grad_norm": 1.1449527740478516, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 36420 + }, + { + "epoch": 5.889580470455097, + "grad_norm": 1.198046326637268, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 36430 + }, + { + "epoch": 5.891197154635842, + "grad_norm": 1.0180530548095703, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 36440 + }, + { + "epoch": 5.892813838816587, + "grad_norm": 1.0516417026519775, + "learning_rate": 0.0002, + "loss": 0.5319, + "step": 36450 + }, + { + "epoch": 5.894430522997332, + "grad_norm": 1.1658052206039429, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 36460 + }, + { + "epoch": 5.896047207178078, + "grad_norm": 1.190699577331543, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 36470 + }, + { + "epoch": 5.897663891358823, + "grad_norm": 1.1235495805740356, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 36480 + }, + { + "epoch": 5.899280575539568, + "grad_norm": 1.1926926374435425, + "learning_rate": 0.0002, + "loss": 0.5284, + "step": 36490 + }, + { + "epoch": 5.900897259720313, + "grad_norm": 1.1184662580490112, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 36500 + }, + { + "epoch": 5.9025139439010585, + "grad_norm": 1.000970721244812, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 36510 + }, + { + "epoch": 5.904130628081804, + "grad_norm": 1.0373306274414062, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 36520 + }, + { + "epoch": 5.90574731226255, + "grad_norm": 1.0840669870376587, + "learning_rate": 0.0002, + "loss": 0.535, + "step": 36530 + }, + { + "epoch": 5.907363996443295, + "grad_norm": 0.9908381104469299, + "learning_rate": 0.0002, + "loss": 0.538, + "step": 36540 + }, + { + "epoch": 5.90898068062404, + "grad_norm": 1.0456029176712036, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 36550 + }, + { + "epoch": 5.910597364804786, + "grad_norm": 1.1381454467773438, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 36560 + }, + { + "epoch": 5.912214048985531, + "grad_norm": 0.9440900087356567, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 36570 + }, + { + "epoch": 5.913830733166276, + "grad_norm": 1.1674573421478271, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 36580 + }, + { + "epoch": 5.915447417347021, + "grad_norm": 1.1226966381072998, + "learning_rate": 0.0002, + "loss": 0.526, + "step": 36590 + }, + { + "epoch": 5.9170641015277665, + "grad_norm": 0.9696915745735168, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 36600 + }, + { + "epoch": 5.918680785708512, + "grad_norm": 0.9593005180358887, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 36610 + }, + { + "epoch": 5.920297469889257, + "grad_norm": 1.122169852256775, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 36620 + }, + { + "epoch": 5.921914154070002, + "grad_norm": 0.9923415780067444, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 36630 + }, + { + "epoch": 5.923530838250747, + "grad_norm": 1.063838005065918, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 36640 + }, + { + "epoch": 5.925147522431493, + "grad_norm": 0.9083505272865295, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 36650 + }, + { + "epoch": 5.926764206612239, + "grad_norm": 0.9439437985420227, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 36660 + }, + { + "epoch": 5.928380890792983, + "grad_norm": 0.9778534173965454, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 36670 + }, + { + "epoch": 5.929997574973729, + "grad_norm": 0.9723961353302002, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 36680 + }, + { + "epoch": 5.9316142591544745, + "grad_norm": 1.162333607673645, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 36690 + }, + { + "epoch": 5.93323094333522, + "grad_norm": 1.2784897089004517, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 36700 + }, + { + "epoch": 5.934847627515965, + "grad_norm": 1.0924867391586304, + "learning_rate": 0.0002, + "loss": 0.5343, + "step": 36710 + }, + { + "epoch": 5.93646431169671, + "grad_norm": 1.046922206878662, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 36720 + }, + { + "epoch": 5.938080995877455, + "grad_norm": 0.8632535338401794, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 36730 + }, + { + "epoch": 5.939697680058201, + "grad_norm": 1.358762502670288, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 36740 + }, + { + "epoch": 5.941314364238946, + "grad_norm": 1.2058624029159546, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 36750 + }, + { + "epoch": 5.942931048419691, + "grad_norm": 1.1396408081054688, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 36760 + }, + { + "epoch": 5.944547732600436, + "grad_norm": 1.1510354280471802, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 36770 + }, + { + "epoch": 5.946164416781182, + "grad_norm": 1.1401607990264893, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 36780 + }, + { + "epoch": 5.947781100961927, + "grad_norm": 1.1871325969696045, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 36790 + }, + { + "epoch": 5.949397785142672, + "grad_norm": 0.9928333163261414, + "learning_rate": 0.0002, + "loss": 0.4945, + "step": 36800 + }, + { + "epoch": 5.951014469323418, + "grad_norm": 1.0549445152282715, + "learning_rate": 0.0002, + "loss": 0.5303, + "step": 36810 + }, + { + "epoch": 5.9526311535041625, + "grad_norm": 0.9791563749313354, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 36820 + }, + { + "epoch": 5.954247837684909, + "grad_norm": 1.1268441677093506, + "learning_rate": 0.0002, + "loss": 0.5317, + "step": 36830 + }, + { + "epoch": 5.955864521865654, + "grad_norm": 1.0533992052078247, + "learning_rate": 0.0002, + "loss": 0.5585, + "step": 36840 + }, + { + "epoch": 5.957481206046399, + "grad_norm": 1.023358941078186, + "learning_rate": 0.0002, + "loss": 0.4972, + "step": 36850 + }, + { + "epoch": 5.959097890227144, + "grad_norm": 1.2631961107254028, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 36860 + }, + { + "epoch": 5.9607145744078895, + "grad_norm": 0.9397698640823364, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 36870 + }, + { + "epoch": 5.962331258588635, + "grad_norm": 1.1678427457809448, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 36880 + }, + { + "epoch": 5.96394794276938, + "grad_norm": 1.1403759717941284, + "learning_rate": 0.0002, + "loss": 0.5435, + "step": 36890 + }, + { + "epoch": 5.965564626950125, + "grad_norm": 1.030572772026062, + "learning_rate": 0.0002, + "loss": 0.5479, + "step": 36900 + }, + { + "epoch": 5.9671813111308705, + "grad_norm": 1.0992497205734253, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 36910 + }, + { + "epoch": 5.968797995311616, + "grad_norm": 1.075466275215149, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 36920 + }, + { + "epoch": 5.970414679492361, + "grad_norm": 1.0153694152832031, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 36930 + }, + { + "epoch": 5.972031363673106, + "grad_norm": 0.973193883895874, + "learning_rate": 0.0002, + "loss": 0.5672, + "step": 36940 + }, + { + "epoch": 5.973648047853851, + "grad_norm": 0.8294678926467896, + "learning_rate": 0.0002, + "loss": 0.5585, + "step": 36950 + }, + { + "epoch": 5.9752647320345975, + "grad_norm": 1.0048716068267822, + "learning_rate": 0.0002, + "loss": 0.5631, + "step": 36960 + }, + { + "epoch": 5.976881416215342, + "grad_norm": 0.9714070558547974, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 36970 + }, + { + "epoch": 5.978498100396088, + "grad_norm": 0.8667682409286499, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 36980 + }, + { + "epoch": 5.980114784576833, + "grad_norm": 1.0461409091949463, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 36990 + }, + { + "epoch": 5.981731468757578, + "grad_norm": 0.9229754209518433, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 37000 + }, + { + "epoch": 5.983348152938324, + "grad_norm": 1.0406876802444458, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 37010 + }, + { + "epoch": 5.984964837119069, + "grad_norm": 0.8993828296661377, + "learning_rate": 0.0002, + "loss": 0.5569, + "step": 37020 + }, + { + "epoch": 5.986581521299814, + "grad_norm": 1.2260479927062988, + "learning_rate": 0.0002, + "loss": 0.5611, + "step": 37030 + }, + { + "epoch": 5.988198205480559, + "grad_norm": 1.0107380151748657, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 37040 + }, + { + "epoch": 5.989814889661305, + "grad_norm": 1.0240139961242676, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 37050 + }, + { + "epoch": 5.99143157384205, + "grad_norm": 1.0185275077819824, + "learning_rate": 0.0002, + "loss": 0.5209, + "step": 37060 + }, + { + "epoch": 5.993048258022795, + "grad_norm": 1.1361802816390991, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 37070 + }, + { + "epoch": 5.99466494220354, + "grad_norm": 1.0395532846450806, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 37080 + }, + { + "epoch": 5.9962816263842855, + "grad_norm": 0.9463558197021484, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 37090 + }, + { + "epoch": 5.997898310565031, + "grad_norm": 1.2066948413848877, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 37100 + }, + { + "epoch": 5.999514994745777, + "grad_norm": 0.9749386310577393, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 37110 + }, + { + "epoch": 6.0, + "eval_loss": 1.2270219326019287, + "eval_runtime": 122.2047, + "eval_samples_per_second": 5.998, + "eval_steps_per_second": 0.753, + "step": 37113 + }, + { + "epoch": 6.001131678926522, + "grad_norm": 0.9641092419624329, + "learning_rate": 0.0002, + "loss": 0.4855, + "step": 37120 + }, + { + "epoch": 6.002748363107267, + "grad_norm": 1.103379249572754, + "learning_rate": 0.0002, + "loss": 0.4112, + "step": 37130 + }, + { + "epoch": 6.004365047288013, + "grad_norm": 0.8381665349006653, + "learning_rate": 0.0002, + "loss": 0.4577, + "step": 37140 + }, + { + "epoch": 6.005981731468758, + "grad_norm": 1.245323896408081, + "learning_rate": 0.0002, + "loss": 0.4794, + "step": 37150 + }, + { + "epoch": 6.007598415649503, + "grad_norm": 1.3140289783477783, + "learning_rate": 0.0002, + "loss": 0.4503, + "step": 37160 + }, + { + "epoch": 6.009215099830248, + "grad_norm": 0.8479695916175842, + "learning_rate": 0.0002, + "loss": 0.4456, + "step": 37170 + }, + { + "epoch": 6.0108317840109935, + "grad_norm": 0.8841437101364136, + "learning_rate": 0.0002, + "loss": 0.4573, + "step": 37180 + }, + { + "epoch": 6.012448468191739, + "grad_norm": 0.8900154829025269, + "learning_rate": 0.0002, + "loss": 0.4565, + "step": 37190 + }, + { + "epoch": 6.014065152372484, + "grad_norm": 1.2753345966339111, + "learning_rate": 0.0002, + "loss": 0.457, + "step": 37200 + }, + { + "epoch": 6.015681836553229, + "grad_norm": 1.4625498056411743, + "learning_rate": 0.0002, + "loss": 0.4365, + "step": 37210 + }, + { + "epoch": 6.017298520733974, + "grad_norm": 0.7455034852027893, + "learning_rate": 0.0002, + "loss": 0.4252, + "step": 37220 + }, + { + "epoch": 6.01891520491472, + "grad_norm": 1.1658862829208374, + "learning_rate": 0.0002, + "loss": 0.4433, + "step": 37230 + }, + { + "epoch": 6.020531889095465, + "grad_norm": 0.9785751104354858, + "learning_rate": 0.0002, + "loss": 0.4499, + "step": 37240 + }, + { + "epoch": 6.02214857327621, + "grad_norm": 1.3193122148513794, + "learning_rate": 0.0002, + "loss": 0.4956, + "step": 37250 + }, + { + "epoch": 6.023765257456955, + "grad_norm": 1.038273572921753, + "learning_rate": 0.0002, + "loss": 0.4727, + "step": 37260 + }, + { + "epoch": 6.0253819416377015, + "grad_norm": 1.0550594329833984, + "learning_rate": 0.0002, + "loss": 0.4395, + "step": 37270 + }, + { + "epoch": 6.026998625818447, + "grad_norm": 0.9745930433273315, + "learning_rate": 0.0002, + "loss": 0.4767, + "step": 37280 + }, + { + "epoch": 6.028615309999192, + "grad_norm": 0.9273530840873718, + "learning_rate": 0.0002, + "loss": 0.4233, + "step": 37290 + }, + { + "epoch": 6.030231994179937, + "grad_norm": 1.3844057321548462, + "learning_rate": 0.0002, + "loss": 0.4195, + "step": 37300 + }, + { + "epoch": 6.031848678360682, + "grad_norm": 1.2058762311935425, + "learning_rate": 0.0002, + "loss": 0.4768, + "step": 37310 + }, + { + "epoch": 6.033465362541428, + "grad_norm": 1.242663025856018, + "learning_rate": 0.0002, + "loss": 0.4499, + "step": 37320 + }, + { + "epoch": 6.035082046722173, + "grad_norm": 1.3504270315170288, + "learning_rate": 0.0002, + "loss": 0.4597, + "step": 37330 + }, + { + "epoch": 6.036698730902918, + "grad_norm": 0.8734912276268005, + "learning_rate": 0.0002, + "loss": 0.4402, + "step": 37340 + }, + { + "epoch": 6.038315415083663, + "grad_norm": 1.0182311534881592, + "learning_rate": 0.0002, + "loss": 0.477, + "step": 37350 + }, + { + "epoch": 6.0399320992644085, + "grad_norm": 0.9898499846458435, + "learning_rate": 0.0002, + "loss": 0.4261, + "step": 37360 + }, + { + "epoch": 6.041548783445154, + "grad_norm": 1.0637860298156738, + "learning_rate": 0.0002, + "loss": 0.4459, + "step": 37370 + }, + { + "epoch": 6.043165467625899, + "grad_norm": 1.0099523067474365, + "learning_rate": 0.0002, + "loss": 0.4958, + "step": 37380 + }, + { + "epoch": 6.044782151806644, + "grad_norm": 1.1080750226974487, + "learning_rate": 0.0002, + "loss": 0.4459, + "step": 37390 + }, + { + "epoch": 6.0463988359873895, + "grad_norm": 1.2551289796829224, + "learning_rate": 0.0002, + "loss": 0.4473, + "step": 37400 + }, + { + "epoch": 6.048015520168136, + "grad_norm": 0.8959632515907288, + "learning_rate": 0.0002, + "loss": 0.468, + "step": 37410 + }, + { + "epoch": 6.049632204348881, + "grad_norm": 1.1748892068862915, + "learning_rate": 0.0002, + "loss": 0.4255, + "step": 37420 + }, + { + "epoch": 6.051248888529626, + "grad_norm": 1.3122745752334595, + "learning_rate": 0.0002, + "loss": 0.4458, + "step": 37430 + }, + { + "epoch": 6.052865572710371, + "grad_norm": 1.0227985382080078, + "learning_rate": 0.0002, + "loss": 0.4676, + "step": 37440 + }, + { + "epoch": 6.0544822568911165, + "grad_norm": 1.0380030870437622, + "learning_rate": 0.0002, + "loss": 0.4503, + "step": 37450 + }, + { + "epoch": 6.056098941071862, + "grad_norm": 0.8919622898101807, + "learning_rate": 0.0002, + "loss": 0.4686, + "step": 37460 + }, + { + "epoch": 6.057715625252607, + "grad_norm": 1.4554150104522705, + "learning_rate": 0.0002, + "loss": 0.4406, + "step": 37470 + }, + { + "epoch": 6.059332309433352, + "grad_norm": 1.2853292226791382, + "learning_rate": 0.0002, + "loss": 0.4688, + "step": 37480 + }, + { + "epoch": 6.0609489936140974, + "grad_norm": 1.2951840162277222, + "learning_rate": 0.0002, + "loss": 0.4489, + "step": 37490 + }, + { + "epoch": 6.062565677794843, + "grad_norm": 1.1750973463058472, + "learning_rate": 0.0002, + "loss": 0.4819, + "step": 37500 + }, + { + "epoch": 6.064182361975588, + "grad_norm": 0.9328424334526062, + "learning_rate": 0.0002, + "loss": 0.4574, + "step": 37510 + }, + { + "epoch": 6.065799046156333, + "grad_norm": 1.0353537797927856, + "learning_rate": 0.0002, + "loss": 0.4597, + "step": 37520 + }, + { + "epoch": 6.067415730337078, + "grad_norm": 1.1594274044036865, + "learning_rate": 0.0002, + "loss": 0.4407, + "step": 37530 + }, + { + "epoch": 6.069032414517824, + "grad_norm": 0.9034168124198914, + "learning_rate": 0.0002, + "loss": 0.4642, + "step": 37540 + }, + { + "epoch": 6.070649098698569, + "grad_norm": 1.068617820739746, + "learning_rate": 0.0002, + "loss": 0.4625, + "step": 37550 + }, + { + "epoch": 6.072265782879315, + "grad_norm": 1.0931321382522583, + "learning_rate": 0.0002, + "loss": 0.4378, + "step": 37560 + }, + { + "epoch": 6.07388246706006, + "grad_norm": 1.2542688846588135, + "learning_rate": 0.0002, + "loss": 0.4527, + "step": 37570 + }, + { + "epoch": 6.075499151240805, + "grad_norm": 1.273384928703308, + "learning_rate": 0.0002, + "loss": 0.4725, + "step": 37580 + }, + { + "epoch": 6.077115835421551, + "grad_norm": 1.4771400690078735, + "learning_rate": 0.0002, + "loss": 0.4928, + "step": 37590 + }, + { + "epoch": 6.078732519602296, + "grad_norm": 1.3751444816589355, + "learning_rate": 0.0002, + "loss": 0.461, + "step": 37600 + }, + { + "epoch": 6.080349203783041, + "grad_norm": 1.4532550573349, + "learning_rate": 0.0002, + "loss": 0.4602, + "step": 37610 + }, + { + "epoch": 6.081965887963786, + "grad_norm": 1.3175991773605347, + "learning_rate": 0.0002, + "loss": 0.4428, + "step": 37620 + }, + { + "epoch": 6.083582572144532, + "grad_norm": 1.0624970197677612, + "learning_rate": 0.0002, + "loss": 0.4746, + "step": 37630 + }, + { + "epoch": 6.085199256325277, + "grad_norm": 1.099715232849121, + "learning_rate": 0.0002, + "loss": 0.413, + "step": 37640 + }, + { + "epoch": 6.086815940506022, + "grad_norm": 1.0380114316940308, + "learning_rate": 0.0002, + "loss": 0.4528, + "step": 37650 + }, + { + "epoch": 6.088432624686767, + "grad_norm": 1.1136109828948975, + "learning_rate": 0.0002, + "loss": 0.4373, + "step": 37660 + }, + { + "epoch": 6.0900493088675125, + "grad_norm": 0.996498703956604, + "learning_rate": 0.0002, + "loss": 0.4915, + "step": 37670 + }, + { + "epoch": 6.091665993048258, + "grad_norm": 1.0552574396133423, + "learning_rate": 0.0002, + "loss": 0.4713, + "step": 37680 + }, + { + "epoch": 6.093282677229003, + "grad_norm": 1.4108527898788452, + "learning_rate": 0.0002, + "loss": 0.4414, + "step": 37690 + }, + { + "epoch": 6.094899361409748, + "grad_norm": 1.1323093175888062, + "learning_rate": 0.0002, + "loss": 0.4851, + "step": 37700 + }, + { + "epoch": 6.096516045590494, + "grad_norm": 0.9364377856254578, + "learning_rate": 0.0002, + "loss": 0.4455, + "step": 37710 + }, + { + "epoch": 6.0981327297712395, + "grad_norm": 1.1300561428070068, + "learning_rate": 0.0002, + "loss": 0.4791, + "step": 37720 + }, + { + "epoch": 6.099749413951985, + "grad_norm": 1.0616047382354736, + "learning_rate": 0.0002, + "loss": 0.4539, + "step": 37730 + }, + { + "epoch": 6.10136609813273, + "grad_norm": 1.1205905675888062, + "learning_rate": 0.0002, + "loss": 0.4516, + "step": 37740 + }, + { + "epoch": 6.102982782313475, + "grad_norm": 0.9592534303665161, + "learning_rate": 0.0002, + "loss": 0.4688, + "step": 37750 + }, + { + "epoch": 6.1045994664942205, + "grad_norm": 0.9797531962394714, + "learning_rate": 0.0002, + "loss": 0.4494, + "step": 37760 + }, + { + "epoch": 6.106216150674966, + "grad_norm": 1.093404769897461, + "learning_rate": 0.0002, + "loss": 0.4237, + "step": 37770 + }, + { + "epoch": 6.107832834855711, + "grad_norm": 1.2172642946243286, + "learning_rate": 0.0002, + "loss": 0.4691, + "step": 37780 + }, + { + "epoch": 6.109449519036456, + "grad_norm": 1.0467255115509033, + "learning_rate": 0.0002, + "loss": 0.4398, + "step": 37790 + }, + { + "epoch": 6.111066203217201, + "grad_norm": 1.159318208694458, + "learning_rate": 0.0002, + "loss": 0.4676, + "step": 37800 + }, + { + "epoch": 6.112682887397947, + "grad_norm": 1.0615603923797607, + "learning_rate": 0.0002, + "loss": 0.4539, + "step": 37810 + }, + { + "epoch": 6.114299571578692, + "grad_norm": 1.0542045831680298, + "learning_rate": 0.0002, + "loss": 0.4957, + "step": 37820 + }, + { + "epoch": 6.115916255759437, + "grad_norm": 0.8962697982788086, + "learning_rate": 0.0002, + "loss": 0.4512, + "step": 37830 + }, + { + "epoch": 6.117532939940182, + "grad_norm": 1.106352686882019, + "learning_rate": 0.0002, + "loss": 0.4519, + "step": 37840 + }, + { + "epoch": 6.1191496241209276, + "grad_norm": 1.1660276651382446, + "learning_rate": 0.0002, + "loss": 0.4421, + "step": 37850 + }, + { + "epoch": 6.120766308301674, + "grad_norm": 1.3524385690689087, + "learning_rate": 0.0002, + "loss": 0.4701, + "step": 37860 + }, + { + "epoch": 6.122382992482419, + "grad_norm": 1.1056050062179565, + "learning_rate": 0.0002, + "loss": 0.4684, + "step": 37870 + }, + { + "epoch": 6.123999676663164, + "grad_norm": 1.0772725343704224, + "learning_rate": 0.0002, + "loss": 0.4518, + "step": 37880 + }, + { + "epoch": 6.125616360843909, + "grad_norm": 1.1011115312576294, + "learning_rate": 0.0002, + "loss": 0.4356, + "step": 37890 + }, + { + "epoch": 6.127233045024655, + "grad_norm": 0.8952536582946777, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 37900 + }, + { + "epoch": 6.1288497292054, + "grad_norm": 1.244398593902588, + "learning_rate": 0.0002, + "loss": 0.4299, + "step": 37910 + }, + { + "epoch": 6.130466413386145, + "grad_norm": 0.9658283591270447, + "learning_rate": 0.0002, + "loss": 0.4764, + "step": 37920 + }, + { + "epoch": 6.13208309756689, + "grad_norm": 1.0649068355560303, + "learning_rate": 0.0002, + "loss": 0.4378, + "step": 37930 + }, + { + "epoch": 6.1336997817476355, + "grad_norm": 0.94698166847229, + "learning_rate": 0.0002, + "loss": 0.4638, + "step": 37940 + }, + { + "epoch": 6.135316465928381, + "grad_norm": 1.1450897455215454, + "learning_rate": 0.0002, + "loss": 0.488, + "step": 37950 + }, + { + "epoch": 6.136933150109126, + "grad_norm": 1.032482624053955, + "learning_rate": 0.0002, + "loss": 0.4791, + "step": 37960 + }, + { + "epoch": 6.138549834289871, + "grad_norm": 1.0993428230285645, + "learning_rate": 0.0002, + "loss": 0.4179, + "step": 37970 + }, + { + "epoch": 6.1401665184706165, + "grad_norm": 1.2907029390335083, + "learning_rate": 0.0002, + "loss": 0.4781, + "step": 37980 + }, + { + "epoch": 6.141783202651362, + "grad_norm": 1.1007903814315796, + "learning_rate": 0.0002, + "loss": 0.4671, + "step": 37990 + }, + { + "epoch": 6.143399886832107, + "grad_norm": 0.9286124110221863, + "learning_rate": 0.0002, + "loss": 0.4213, + "step": 38000 + }, + { + "epoch": 6.145016571012853, + "grad_norm": 1.1426366567611694, + "learning_rate": 0.0002, + "loss": 0.4741, + "step": 38010 + }, + { + "epoch": 6.146633255193598, + "grad_norm": 1.2608287334442139, + "learning_rate": 0.0002, + "loss": 0.4746, + "step": 38020 + }, + { + "epoch": 6.1482499393743435, + "grad_norm": 1.1346837282180786, + "learning_rate": 0.0002, + "loss": 0.454, + "step": 38030 + }, + { + "epoch": 6.149866623555089, + "grad_norm": 1.144080400466919, + "learning_rate": 0.0002, + "loss": 0.4469, + "step": 38040 + }, + { + "epoch": 6.151483307735834, + "grad_norm": 1.3456705808639526, + "learning_rate": 0.0002, + "loss": 0.4515, + "step": 38050 + }, + { + "epoch": 6.153099991916579, + "grad_norm": 1.0517960786819458, + "learning_rate": 0.0002, + "loss": 0.4775, + "step": 38060 + }, + { + "epoch": 6.154716676097324, + "grad_norm": 1.1887445449829102, + "learning_rate": 0.0002, + "loss": 0.4986, + "step": 38070 + }, + { + "epoch": 6.15633336027807, + "grad_norm": 1.0449163913726807, + "learning_rate": 0.0002, + "loss": 0.4516, + "step": 38080 + }, + { + "epoch": 6.157950044458815, + "grad_norm": 1.3218743801116943, + "learning_rate": 0.0002, + "loss": 0.4808, + "step": 38090 + }, + { + "epoch": 6.15956672863956, + "grad_norm": 1.003208875656128, + "learning_rate": 0.0002, + "loss": 0.4632, + "step": 38100 + }, + { + "epoch": 6.161183412820305, + "grad_norm": 1.008623719215393, + "learning_rate": 0.0002, + "loss": 0.4978, + "step": 38110 + }, + { + "epoch": 6.162800097001051, + "grad_norm": 1.2122787237167358, + "learning_rate": 0.0002, + "loss": 0.4608, + "step": 38120 + }, + { + "epoch": 6.164416781181796, + "grad_norm": 1.253403902053833, + "learning_rate": 0.0002, + "loss": 0.4666, + "step": 38130 + }, + { + "epoch": 6.166033465362541, + "grad_norm": 1.2289724349975586, + "learning_rate": 0.0002, + "loss": 0.4778, + "step": 38140 + }, + { + "epoch": 6.167650149543286, + "grad_norm": 1.330694556236267, + "learning_rate": 0.0002, + "loss": 0.4774, + "step": 38150 + }, + { + "epoch": 6.169266833724032, + "grad_norm": 1.0946741104125977, + "learning_rate": 0.0002, + "loss": 0.4699, + "step": 38160 + }, + { + "epoch": 6.170883517904778, + "grad_norm": 1.0719934701919556, + "learning_rate": 0.0002, + "loss": 0.4816, + "step": 38170 + }, + { + "epoch": 6.172500202085523, + "grad_norm": 1.1142133474349976, + "learning_rate": 0.0002, + "loss": 0.4678, + "step": 38180 + }, + { + "epoch": 6.174116886266268, + "grad_norm": 1.1221938133239746, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 38190 + }, + { + "epoch": 6.175733570447013, + "grad_norm": 1.1391617059707642, + "learning_rate": 0.0002, + "loss": 0.4462, + "step": 38200 + }, + { + "epoch": 6.1773502546277586, + "grad_norm": 1.2263455390930176, + "learning_rate": 0.0002, + "loss": 0.4867, + "step": 38210 + }, + { + "epoch": 6.178966938808504, + "grad_norm": 1.0930434465408325, + "learning_rate": 0.0002, + "loss": 0.4633, + "step": 38220 + }, + { + "epoch": 6.180583622989249, + "grad_norm": 1.3489030599594116, + "learning_rate": 0.0002, + "loss": 0.4406, + "step": 38230 + }, + { + "epoch": 6.182200307169994, + "grad_norm": 1.1383486986160278, + "learning_rate": 0.0002, + "loss": 0.4994, + "step": 38240 + }, + { + "epoch": 6.1838169913507395, + "grad_norm": 1.2408897876739502, + "learning_rate": 0.0002, + "loss": 0.4851, + "step": 38250 + }, + { + "epoch": 6.185433675531485, + "grad_norm": 1.1436222791671753, + "learning_rate": 0.0002, + "loss": 0.4848, + "step": 38260 + }, + { + "epoch": 6.18705035971223, + "grad_norm": 1.370117425918579, + "learning_rate": 0.0002, + "loss": 0.4594, + "step": 38270 + }, + { + "epoch": 6.188667043892975, + "grad_norm": 0.8862423300743103, + "learning_rate": 0.0002, + "loss": 0.5023, + "step": 38280 + }, + { + "epoch": 6.19028372807372, + "grad_norm": 0.9603779315948486, + "learning_rate": 0.0002, + "loss": 0.4559, + "step": 38290 + }, + { + "epoch": 6.191900412254466, + "grad_norm": 1.389291524887085, + "learning_rate": 0.0002, + "loss": 0.4835, + "step": 38300 + }, + { + "epoch": 6.193517096435212, + "grad_norm": 1.0767031908035278, + "learning_rate": 0.0002, + "loss": 0.4435, + "step": 38310 + }, + { + "epoch": 6.195133780615957, + "grad_norm": 1.1800403594970703, + "learning_rate": 0.0002, + "loss": 0.4683, + "step": 38320 + }, + { + "epoch": 6.196750464796702, + "grad_norm": 0.997891366481781, + "learning_rate": 0.0002, + "loss": 0.4608, + "step": 38330 + }, + { + "epoch": 6.1983671489774474, + "grad_norm": 1.1201492547988892, + "learning_rate": 0.0002, + "loss": 0.4575, + "step": 38340 + }, + { + "epoch": 6.199983833158193, + "grad_norm": 0.9769026637077332, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 38350 + }, + { + "epoch": 6.201600517338938, + "grad_norm": 0.9447069764137268, + "learning_rate": 0.0002, + "loss": 0.4563, + "step": 38360 + }, + { + "epoch": 6.203217201519683, + "grad_norm": 1.0959235429763794, + "learning_rate": 0.0002, + "loss": 0.516, + "step": 38370 + }, + { + "epoch": 6.204833885700428, + "grad_norm": 1.2495406866073608, + "learning_rate": 0.0002, + "loss": 0.4688, + "step": 38380 + }, + { + "epoch": 6.206450569881174, + "grad_norm": 0.8589218258857727, + "learning_rate": 0.0002, + "loss": 0.4445, + "step": 38390 + }, + { + "epoch": 6.208067254061919, + "grad_norm": 0.959155797958374, + "learning_rate": 0.0002, + "loss": 0.4808, + "step": 38400 + }, + { + "epoch": 6.209683938242664, + "grad_norm": 1.0105533599853516, + "learning_rate": 0.0002, + "loss": 0.4622, + "step": 38410 + }, + { + "epoch": 6.211300622423409, + "grad_norm": 0.9824615120887756, + "learning_rate": 0.0002, + "loss": 0.4887, + "step": 38420 + }, + { + "epoch": 6.2129173066041545, + "grad_norm": 0.8616500496864319, + "learning_rate": 0.0002, + "loss": 0.4656, + "step": 38430 + }, + { + "epoch": 6.2145339907849, + "grad_norm": 1.2917758226394653, + "learning_rate": 0.0002, + "loss": 0.449, + "step": 38440 + }, + { + "epoch": 6.216150674965646, + "grad_norm": 1.0564531087875366, + "learning_rate": 0.0002, + "loss": 0.4201, + "step": 38450 + }, + { + "epoch": 6.217767359146391, + "grad_norm": 1.152331829071045, + "learning_rate": 0.0002, + "loss": 0.4849, + "step": 38460 + }, + { + "epoch": 6.219384043327136, + "grad_norm": 0.9152206778526306, + "learning_rate": 0.0002, + "loss": 0.4887, + "step": 38470 + }, + { + "epoch": 6.221000727507882, + "grad_norm": 0.9931167960166931, + "learning_rate": 0.0002, + "loss": 0.4686, + "step": 38480 + }, + { + "epoch": 6.222617411688627, + "grad_norm": 1.3248072862625122, + "learning_rate": 0.0002, + "loss": 0.4765, + "step": 38490 + }, + { + "epoch": 6.224234095869372, + "grad_norm": 1.3916507959365845, + "learning_rate": 0.0002, + "loss": 0.4636, + "step": 38500 + }, + { + "epoch": 6.225850780050117, + "grad_norm": 1.1775140762329102, + "learning_rate": 0.0002, + "loss": 0.506, + "step": 38510 + }, + { + "epoch": 6.2274674642308625, + "grad_norm": 1.1581059694290161, + "learning_rate": 0.0002, + "loss": 0.47, + "step": 38520 + }, + { + "epoch": 6.229084148411608, + "grad_norm": 1.359320878982544, + "learning_rate": 0.0002, + "loss": 0.4679, + "step": 38530 + }, + { + "epoch": 6.230700832592353, + "grad_norm": 1.185041904449463, + "learning_rate": 0.0002, + "loss": 0.4697, + "step": 38540 + }, + { + "epoch": 6.232317516773098, + "grad_norm": 1.1861097812652588, + "learning_rate": 0.0002, + "loss": 0.4815, + "step": 38550 + }, + { + "epoch": 6.233934200953843, + "grad_norm": 1.126990556716919, + "learning_rate": 0.0002, + "loss": 0.4925, + "step": 38560 + }, + { + "epoch": 6.235550885134589, + "grad_norm": 0.9744541049003601, + "learning_rate": 0.0002, + "loss": 0.4414, + "step": 38570 + }, + { + "epoch": 6.237167569315334, + "grad_norm": 1.1260887384414673, + "learning_rate": 0.0002, + "loss": 0.4577, + "step": 38580 + }, + { + "epoch": 6.238784253496079, + "grad_norm": 1.1290327310562134, + "learning_rate": 0.0002, + "loss": 0.4852, + "step": 38590 + }, + { + "epoch": 6.240400937676825, + "grad_norm": 1.0952879190444946, + "learning_rate": 0.0002, + "loss": 0.4805, + "step": 38600 + }, + { + "epoch": 6.2420176218575705, + "grad_norm": 1.1037684679031372, + "learning_rate": 0.0002, + "loss": 0.4436, + "step": 38610 + }, + { + "epoch": 6.243634306038316, + "grad_norm": 1.1356085538864136, + "learning_rate": 0.0002, + "loss": 0.466, + "step": 38620 + }, + { + "epoch": 6.245250990219061, + "grad_norm": 1.0677106380462646, + "learning_rate": 0.0002, + "loss": 0.5129, + "step": 38630 + }, + { + "epoch": 6.246867674399806, + "grad_norm": 1.1573411226272583, + "learning_rate": 0.0002, + "loss": 0.4907, + "step": 38640 + }, + { + "epoch": 6.248484358580551, + "grad_norm": 1.2707505226135254, + "learning_rate": 0.0002, + "loss": 0.5098, + "step": 38650 + }, + { + "epoch": 6.250101042761297, + "grad_norm": 1.0480109453201294, + "learning_rate": 0.0002, + "loss": 0.4926, + "step": 38660 + }, + { + "epoch": 6.251717726942042, + "grad_norm": 1.3668724298477173, + "learning_rate": 0.0002, + "loss": 0.4654, + "step": 38670 + }, + { + "epoch": 6.253334411122787, + "grad_norm": 1.217289686203003, + "learning_rate": 0.0002, + "loss": 0.5128, + "step": 38680 + }, + { + "epoch": 6.254951095303532, + "grad_norm": 1.2950236797332764, + "learning_rate": 0.0002, + "loss": 0.4621, + "step": 38690 + }, + { + "epoch": 6.256567779484278, + "grad_norm": 1.4506934881210327, + "learning_rate": 0.0002, + "loss": 0.5076, + "step": 38700 + }, + { + "epoch": 6.258184463665023, + "grad_norm": 1.1248667240142822, + "learning_rate": 0.0002, + "loss": 0.4803, + "step": 38710 + }, + { + "epoch": 6.259801147845768, + "grad_norm": 1.3384023904800415, + "learning_rate": 0.0002, + "loss": 0.4746, + "step": 38720 + }, + { + "epoch": 6.261417832026513, + "grad_norm": 1.128074288368225, + "learning_rate": 0.0002, + "loss": 0.473, + "step": 38730 + }, + { + "epoch": 6.263034516207259, + "grad_norm": 1.1169012784957886, + "learning_rate": 0.0002, + "loss": 0.4638, + "step": 38740 + }, + { + "epoch": 6.264651200388005, + "grad_norm": 1.195198893547058, + "learning_rate": 0.0002, + "loss": 0.4747, + "step": 38750 + }, + { + "epoch": 6.26626788456875, + "grad_norm": 1.2471518516540527, + "learning_rate": 0.0002, + "loss": 0.4906, + "step": 38760 + }, + { + "epoch": 6.267884568749495, + "grad_norm": 1.2646394968032837, + "learning_rate": 0.0002, + "loss": 0.4507, + "step": 38770 + }, + { + "epoch": 6.26950125293024, + "grad_norm": 1.0286450386047363, + "learning_rate": 0.0002, + "loss": 0.4934, + "step": 38780 + }, + { + "epoch": 6.2711179371109855, + "grad_norm": 1.2440695762634277, + "learning_rate": 0.0002, + "loss": 0.4787, + "step": 38790 + }, + { + "epoch": 6.272734621291731, + "grad_norm": 0.8941256403923035, + "learning_rate": 0.0002, + "loss": 0.4806, + "step": 38800 + }, + { + "epoch": 6.274351305472476, + "grad_norm": 1.0693447589874268, + "learning_rate": 0.0002, + "loss": 0.4741, + "step": 38810 + }, + { + "epoch": 6.275967989653221, + "grad_norm": 1.0936840772628784, + "learning_rate": 0.0002, + "loss": 0.4408, + "step": 38820 + }, + { + "epoch": 6.2775846738339665, + "grad_norm": 1.0961874723434448, + "learning_rate": 0.0002, + "loss": 0.4729, + "step": 38830 + }, + { + "epoch": 6.279201358014712, + "grad_norm": 1.1465433835983276, + "learning_rate": 0.0002, + "loss": 0.4504, + "step": 38840 + }, + { + "epoch": 6.280818042195457, + "grad_norm": 1.2987004518508911, + "learning_rate": 0.0002, + "loss": 0.4771, + "step": 38850 + }, + { + "epoch": 6.282434726376202, + "grad_norm": 1.1310304403305054, + "learning_rate": 0.0002, + "loss": 0.4945, + "step": 38860 + }, + { + "epoch": 6.284051410556947, + "grad_norm": 1.306538462638855, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 38870 + }, + { + "epoch": 6.285668094737693, + "grad_norm": 1.2405401468276978, + "learning_rate": 0.0002, + "loss": 0.4873, + "step": 38880 + }, + { + "epoch": 6.287284778918439, + "grad_norm": 1.0934767723083496, + "learning_rate": 0.0002, + "loss": 0.4929, + "step": 38890 + }, + { + "epoch": 6.288901463099184, + "grad_norm": 1.3370496034622192, + "learning_rate": 0.0002, + "loss": 0.4853, + "step": 38900 + }, + { + "epoch": 6.290518147279929, + "grad_norm": 1.0319404602050781, + "learning_rate": 0.0002, + "loss": 0.4892, + "step": 38910 + }, + { + "epoch": 6.292134831460674, + "grad_norm": 0.9734271168708801, + "learning_rate": 0.0002, + "loss": 0.4685, + "step": 38920 + }, + { + "epoch": 6.29375151564142, + "grad_norm": 1.0940454006195068, + "learning_rate": 0.0002, + "loss": 0.5085, + "step": 38930 + }, + { + "epoch": 6.295368199822165, + "grad_norm": 1.036500334739685, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 38940 + }, + { + "epoch": 6.29698488400291, + "grad_norm": 1.020308256149292, + "learning_rate": 0.0002, + "loss": 0.4878, + "step": 38950 + }, + { + "epoch": 6.298601568183655, + "grad_norm": 1.1416399478912354, + "learning_rate": 0.0002, + "loss": 0.4668, + "step": 38960 + }, + { + "epoch": 6.300218252364401, + "grad_norm": 1.2497479915618896, + "learning_rate": 0.0002, + "loss": 0.4727, + "step": 38970 + }, + { + "epoch": 6.301834936545146, + "grad_norm": 1.1692523956298828, + "learning_rate": 0.0002, + "loss": 0.4721, + "step": 38980 + }, + { + "epoch": 6.303451620725891, + "grad_norm": 1.0693109035491943, + "learning_rate": 0.0002, + "loss": 0.505, + "step": 38990 + }, + { + "epoch": 6.305068304906636, + "grad_norm": 0.8883291482925415, + "learning_rate": 0.0002, + "loss": 0.4875, + "step": 39000 + }, + { + "epoch": 6.3066849890873815, + "grad_norm": 1.1445088386535645, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 39010 + }, + { + "epoch": 6.308301673268127, + "grad_norm": 1.226792335510254, + "learning_rate": 0.0002, + "loss": 0.5089, + "step": 39020 + }, + { + "epoch": 6.309918357448872, + "grad_norm": 1.0498932600021362, + "learning_rate": 0.0002, + "loss": 0.474, + "step": 39030 + }, + { + "epoch": 6.311535041629618, + "grad_norm": 1.0834535360336304, + "learning_rate": 0.0002, + "loss": 0.4964, + "step": 39040 + }, + { + "epoch": 6.313151725810363, + "grad_norm": 1.144666075706482, + "learning_rate": 0.0002, + "loss": 0.4733, + "step": 39050 + }, + { + "epoch": 6.3147684099911086, + "grad_norm": 1.1468489170074463, + "learning_rate": 0.0002, + "loss": 0.4784, + "step": 39060 + }, + { + "epoch": 6.316385094171854, + "grad_norm": 1.290949821472168, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 39070 + }, + { + "epoch": 6.318001778352599, + "grad_norm": 1.087868094444275, + "learning_rate": 0.0002, + "loss": 0.5002, + "step": 39080 + }, + { + "epoch": 6.319618462533344, + "grad_norm": 1.0156296491622925, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 39090 + }, + { + "epoch": 6.3212351467140895, + "grad_norm": 1.0805060863494873, + "learning_rate": 0.0002, + "loss": 0.5019, + "step": 39100 + }, + { + "epoch": 6.322851830894835, + "grad_norm": 0.9030579924583435, + "learning_rate": 0.0002, + "loss": 0.4598, + "step": 39110 + }, + { + "epoch": 6.32446851507558, + "grad_norm": 1.1488285064697266, + "learning_rate": 0.0002, + "loss": 0.4635, + "step": 39120 + }, + { + "epoch": 6.326085199256325, + "grad_norm": 1.2050796747207642, + "learning_rate": 0.0002, + "loss": 0.5368, + "step": 39130 + }, + { + "epoch": 6.32770188343707, + "grad_norm": 1.093451738357544, + "learning_rate": 0.0002, + "loss": 0.4854, + "step": 39140 + }, + { + "epoch": 6.329318567617816, + "grad_norm": 1.2046772241592407, + "learning_rate": 0.0002, + "loss": 0.5055, + "step": 39150 + }, + { + "epoch": 6.330935251798561, + "grad_norm": 1.045777678489685, + "learning_rate": 0.0002, + "loss": 0.4703, + "step": 39160 + }, + { + "epoch": 6.332551935979306, + "grad_norm": 1.2008492946624756, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 39170 + }, + { + "epoch": 6.334168620160051, + "grad_norm": 1.0613869428634644, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 39180 + }, + { + "epoch": 6.3357853043407975, + "grad_norm": 1.058440089225769, + "learning_rate": 0.0002, + "loss": 0.4708, + "step": 39190 + }, + { + "epoch": 6.337401988521543, + "grad_norm": 1.195658802986145, + "learning_rate": 0.0002, + "loss": 0.4719, + "step": 39200 + }, + { + "epoch": 6.339018672702288, + "grad_norm": 1.1595174074172974, + "learning_rate": 0.0002, + "loss": 0.4901, + "step": 39210 + }, + { + "epoch": 6.340635356883033, + "grad_norm": 1.0674750804901123, + "learning_rate": 0.0002, + "loss": 0.4587, + "step": 39220 + }, + { + "epoch": 6.342252041063778, + "grad_norm": 1.3306758403778076, + "learning_rate": 0.0002, + "loss": 0.4801, + "step": 39230 + }, + { + "epoch": 6.343868725244524, + "grad_norm": 1.3582593202590942, + "learning_rate": 0.0002, + "loss": 0.4839, + "step": 39240 + }, + { + "epoch": 6.345485409425269, + "grad_norm": 1.2351572513580322, + "learning_rate": 0.0002, + "loss": 0.4964, + "step": 39250 + }, + { + "epoch": 6.347102093606014, + "grad_norm": 1.3623450994491577, + "learning_rate": 0.0002, + "loss": 0.4806, + "step": 39260 + }, + { + "epoch": 6.348718777786759, + "grad_norm": 1.201270580291748, + "learning_rate": 0.0002, + "loss": 0.466, + "step": 39270 + }, + { + "epoch": 6.3503354619675045, + "grad_norm": 0.9300584197044373, + "learning_rate": 0.0002, + "loss": 0.4899, + "step": 39280 + }, + { + "epoch": 6.35195214614825, + "grad_norm": 0.944525957107544, + "learning_rate": 0.0002, + "loss": 0.4867, + "step": 39290 + }, + { + "epoch": 6.353568830328995, + "grad_norm": 1.4263732433319092, + "learning_rate": 0.0002, + "loss": 0.4954, + "step": 39300 + }, + { + "epoch": 6.35518551450974, + "grad_norm": 1.392592191696167, + "learning_rate": 0.0002, + "loss": 0.4982, + "step": 39310 + }, + { + "epoch": 6.3568021986904855, + "grad_norm": 1.0753393173217773, + "learning_rate": 0.0002, + "loss": 0.4868, + "step": 39320 + }, + { + "epoch": 6.358418882871231, + "grad_norm": 1.0088151693344116, + "learning_rate": 0.0002, + "loss": 0.4896, + "step": 39330 + }, + { + "epoch": 6.360035567051977, + "grad_norm": 1.1784582138061523, + "learning_rate": 0.0002, + "loss": 0.4684, + "step": 39340 + }, + { + "epoch": 6.361652251232722, + "grad_norm": 1.020526647567749, + "learning_rate": 0.0002, + "loss": 0.4732, + "step": 39350 + }, + { + "epoch": 6.363268935413467, + "grad_norm": 1.1400747299194336, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 39360 + }, + { + "epoch": 6.3648856195942125, + "grad_norm": 0.9960665702819824, + "learning_rate": 0.0002, + "loss": 0.4976, + "step": 39370 + }, + { + "epoch": 6.366502303774958, + "grad_norm": 1.1547569036483765, + "learning_rate": 0.0002, + "loss": 0.483, + "step": 39380 + }, + { + "epoch": 6.368118987955703, + "grad_norm": 1.2180676460266113, + "learning_rate": 0.0002, + "loss": 0.4861, + "step": 39390 + }, + { + "epoch": 6.369735672136448, + "grad_norm": 1.1391799449920654, + "learning_rate": 0.0002, + "loss": 0.4805, + "step": 39400 + }, + { + "epoch": 6.371352356317193, + "grad_norm": 1.2893574237823486, + "learning_rate": 0.0002, + "loss": 0.5004, + "step": 39410 + }, + { + "epoch": 6.372969040497939, + "grad_norm": 1.192878246307373, + "learning_rate": 0.0002, + "loss": 0.4807, + "step": 39420 + }, + { + "epoch": 6.374585724678684, + "grad_norm": 0.9771704077720642, + "learning_rate": 0.0002, + "loss": 0.4637, + "step": 39430 + }, + { + "epoch": 6.376202408859429, + "grad_norm": 1.285387635231018, + "learning_rate": 0.0002, + "loss": 0.4867, + "step": 39440 + }, + { + "epoch": 6.377819093040174, + "grad_norm": 1.019957184791565, + "learning_rate": 0.0002, + "loss": 0.4593, + "step": 39450 + }, + { + "epoch": 6.37943577722092, + "grad_norm": 1.2002915143966675, + "learning_rate": 0.0002, + "loss": 0.473, + "step": 39460 + }, + { + "epoch": 6.381052461401665, + "grad_norm": 1.3285092115402222, + "learning_rate": 0.0002, + "loss": 0.5025, + "step": 39470 + }, + { + "epoch": 6.38266914558241, + "grad_norm": 1.097846269607544, + "learning_rate": 0.0002, + "loss": 0.4626, + "step": 39480 + }, + { + "epoch": 6.384285829763156, + "grad_norm": 0.9537988305091858, + "learning_rate": 0.0002, + "loss": 0.5109, + "step": 39490 + }, + { + "epoch": 6.385902513943901, + "grad_norm": 1.0350042581558228, + "learning_rate": 0.0002, + "loss": 0.4492, + "step": 39500 + }, + { + "epoch": 6.387519198124647, + "grad_norm": 0.9559133052825928, + "learning_rate": 0.0002, + "loss": 0.4824, + "step": 39510 + }, + { + "epoch": 6.389135882305392, + "grad_norm": 0.9615123271942139, + "learning_rate": 0.0002, + "loss": 0.5189, + "step": 39520 + }, + { + "epoch": 6.390752566486137, + "grad_norm": 1.0604504346847534, + "learning_rate": 0.0002, + "loss": 0.4915, + "step": 39530 + }, + { + "epoch": 6.392369250666882, + "grad_norm": 1.2460750341415405, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 39540 + }, + { + "epoch": 6.393985934847628, + "grad_norm": 1.1496477127075195, + "learning_rate": 0.0002, + "loss": 0.4929, + "step": 39550 + }, + { + "epoch": 6.395602619028373, + "grad_norm": 1.048043966293335, + "learning_rate": 0.0002, + "loss": 0.4872, + "step": 39560 + }, + { + "epoch": 6.397219303209118, + "grad_norm": 1.333539366722107, + "learning_rate": 0.0002, + "loss": 0.5231, + "step": 39570 + }, + { + "epoch": 6.398835987389863, + "grad_norm": 1.0605626106262207, + "learning_rate": 0.0002, + "loss": 0.4877, + "step": 39580 + }, + { + "epoch": 6.4004526715706085, + "grad_norm": 1.163220763206482, + "learning_rate": 0.0002, + "loss": 0.4643, + "step": 39590 + }, + { + "epoch": 6.402069355751354, + "grad_norm": 1.1878494024276733, + "learning_rate": 0.0002, + "loss": 0.4824, + "step": 39600 + }, + { + "epoch": 6.403686039932099, + "grad_norm": 1.4630796909332275, + "learning_rate": 0.0002, + "loss": 0.5242, + "step": 39610 + }, + { + "epoch": 6.405302724112844, + "grad_norm": 1.073255181312561, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 39620 + }, + { + "epoch": 6.406919408293589, + "grad_norm": 1.0538873672485352, + "learning_rate": 0.0002, + "loss": 0.5108, + "step": 39630 + }, + { + "epoch": 6.4085360924743355, + "grad_norm": 1.015525221824646, + "learning_rate": 0.0002, + "loss": 0.4801, + "step": 39640 + }, + { + "epoch": 6.410152776655081, + "grad_norm": 1.1454379558563232, + "learning_rate": 0.0002, + "loss": 0.4781, + "step": 39650 + }, + { + "epoch": 6.411769460835826, + "grad_norm": 1.2801800966262817, + "learning_rate": 0.0002, + "loss": 0.498, + "step": 39660 + }, + { + "epoch": 6.413386145016571, + "grad_norm": 1.077579140663147, + "learning_rate": 0.0002, + "loss": 0.4804, + "step": 39670 + }, + { + "epoch": 6.4150028291973165, + "grad_norm": 1.376662015914917, + "learning_rate": 0.0002, + "loss": 0.51, + "step": 39680 + }, + { + "epoch": 6.416619513378062, + "grad_norm": 1.2064344882965088, + "learning_rate": 0.0002, + "loss": 0.4956, + "step": 39690 + }, + { + "epoch": 6.418236197558807, + "grad_norm": 1.0689115524291992, + "learning_rate": 0.0002, + "loss": 0.4762, + "step": 39700 + }, + { + "epoch": 6.419852881739552, + "grad_norm": 0.9997019171714783, + "learning_rate": 0.0002, + "loss": 0.4762, + "step": 39710 + }, + { + "epoch": 6.421469565920297, + "grad_norm": 1.2368080615997314, + "learning_rate": 0.0002, + "loss": 0.49, + "step": 39720 + }, + { + "epoch": 6.423086250101043, + "grad_norm": 1.2085820436477661, + "learning_rate": 0.0002, + "loss": 0.4774, + "step": 39730 + }, + { + "epoch": 6.424702934281788, + "grad_norm": 1.057246208190918, + "learning_rate": 0.0002, + "loss": 0.4671, + "step": 39740 + }, + { + "epoch": 6.426319618462533, + "grad_norm": 1.1311043500900269, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 39750 + }, + { + "epoch": 6.427936302643278, + "grad_norm": 1.2352231740951538, + "learning_rate": 0.0002, + "loss": 0.5171, + "step": 39760 + }, + { + "epoch": 6.4295529868240235, + "grad_norm": 0.953233540058136, + "learning_rate": 0.0002, + "loss": 0.466, + "step": 39770 + }, + { + "epoch": 6.431169671004769, + "grad_norm": 1.0632505416870117, + "learning_rate": 0.0002, + "loss": 0.4834, + "step": 39780 + }, + { + "epoch": 6.432786355185515, + "grad_norm": 1.0916751623153687, + "learning_rate": 0.0002, + "loss": 0.5053, + "step": 39790 + }, + { + "epoch": 6.43440303936626, + "grad_norm": 0.9732703566551208, + "learning_rate": 0.0002, + "loss": 0.4788, + "step": 39800 + }, + { + "epoch": 6.436019723547005, + "grad_norm": 1.1673705577850342, + "learning_rate": 0.0002, + "loss": 0.4982, + "step": 39810 + }, + { + "epoch": 6.437636407727751, + "grad_norm": 1.1049559116363525, + "learning_rate": 0.0002, + "loss": 0.4484, + "step": 39820 + }, + { + "epoch": 6.439253091908496, + "grad_norm": 1.345277190208435, + "learning_rate": 0.0002, + "loss": 0.4784, + "step": 39830 + }, + { + "epoch": 6.440869776089241, + "grad_norm": 1.1118950843811035, + "learning_rate": 0.0002, + "loss": 0.4716, + "step": 39840 + }, + { + "epoch": 6.442486460269986, + "grad_norm": 1.4872850179672241, + "learning_rate": 0.0002, + "loss": 0.5133, + "step": 39850 + }, + { + "epoch": 6.4441031444507315, + "grad_norm": 1.0763497352600098, + "learning_rate": 0.0002, + "loss": 0.4532, + "step": 39860 + }, + { + "epoch": 6.445719828631477, + "grad_norm": 0.9245555400848389, + "learning_rate": 0.0002, + "loss": 0.4572, + "step": 39870 + }, + { + "epoch": 6.447336512812222, + "grad_norm": 1.4154807329177856, + "learning_rate": 0.0002, + "loss": 0.4917, + "step": 39880 + }, + { + "epoch": 6.448953196992967, + "grad_norm": 1.0885124206542969, + "learning_rate": 0.0002, + "loss": 0.4852, + "step": 39890 + }, + { + "epoch": 6.450569881173712, + "grad_norm": 1.3989344835281372, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 39900 + }, + { + "epoch": 6.452186565354458, + "grad_norm": 0.9763124585151672, + "learning_rate": 0.0002, + "loss": 0.509, + "step": 39910 + }, + { + "epoch": 6.453803249535203, + "grad_norm": 1.135272741317749, + "learning_rate": 0.0002, + "loss": 0.5134, + "step": 39920 + }, + { + "epoch": 6.455419933715948, + "grad_norm": 1.1140081882476807, + "learning_rate": 0.0002, + "loss": 0.4941, + "step": 39930 + }, + { + "epoch": 6.457036617896694, + "grad_norm": 1.0992448329925537, + "learning_rate": 0.0002, + "loss": 0.5137, + "step": 39940 + }, + { + "epoch": 6.4586533020774395, + "grad_norm": 1.1658501625061035, + "learning_rate": 0.0002, + "loss": 0.4914, + "step": 39950 + }, + { + "epoch": 6.460269986258185, + "grad_norm": 1.1122797727584839, + "learning_rate": 0.0002, + "loss": 0.5036, + "step": 39960 + }, + { + "epoch": 6.46188667043893, + "grad_norm": 0.9664968252182007, + "learning_rate": 0.0002, + "loss": 0.5159, + "step": 39970 + }, + { + "epoch": 6.463503354619675, + "grad_norm": 1.2513965368270874, + "learning_rate": 0.0002, + "loss": 0.4989, + "step": 39980 + }, + { + "epoch": 6.46512003880042, + "grad_norm": 1.1198630332946777, + "learning_rate": 0.0002, + "loss": 0.4694, + "step": 39990 + }, + { + "epoch": 6.466736722981166, + "grad_norm": 0.8783249855041504, + "learning_rate": 0.0002, + "loss": 0.5023, + "step": 40000 + }, + { + "epoch": 6.468353407161911, + "grad_norm": 1.1313109397888184, + "learning_rate": 0.0002, + "loss": 0.4648, + "step": 40010 + }, + { + "epoch": 6.469970091342656, + "grad_norm": 1.0854487419128418, + "learning_rate": 0.0002, + "loss": 0.4965, + "step": 40020 + }, + { + "epoch": 6.471586775523401, + "grad_norm": 1.1738566160202026, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 40030 + }, + { + "epoch": 6.473203459704147, + "grad_norm": 0.9720084071159363, + "learning_rate": 0.0002, + "loss": 0.4947, + "step": 40040 + }, + { + "epoch": 6.474820143884892, + "grad_norm": 1.105618953704834, + "learning_rate": 0.0002, + "loss": 0.5218, + "step": 40050 + }, + { + "epoch": 6.476436828065637, + "grad_norm": 1.2007657289505005, + "learning_rate": 0.0002, + "loss": 0.4943, + "step": 40060 + }, + { + "epoch": 6.478053512246382, + "grad_norm": 1.088402509689331, + "learning_rate": 0.0002, + "loss": 0.4882, + "step": 40070 + }, + { + "epoch": 6.4796701964271275, + "grad_norm": 1.0775291919708252, + "learning_rate": 0.0002, + "loss": 0.504, + "step": 40080 + }, + { + "epoch": 6.481286880607874, + "grad_norm": 1.1018189191818237, + "learning_rate": 0.0002, + "loss": 0.4791, + "step": 40090 + }, + { + "epoch": 6.482903564788619, + "grad_norm": 1.1676557064056396, + "learning_rate": 0.0002, + "loss": 0.488, + "step": 40100 + }, + { + "epoch": 6.484520248969364, + "grad_norm": 0.9619805812835693, + "learning_rate": 0.0002, + "loss": 0.4818, + "step": 40110 + }, + { + "epoch": 6.486136933150109, + "grad_norm": 1.2408208847045898, + "learning_rate": 0.0002, + "loss": 0.4986, + "step": 40120 + }, + { + "epoch": 6.4877536173308545, + "grad_norm": 1.3488136529922485, + "learning_rate": 0.0002, + "loss": 0.4668, + "step": 40130 + }, + { + "epoch": 6.4893703015116, + "grad_norm": 0.9864488244056702, + "learning_rate": 0.0002, + "loss": 0.4774, + "step": 40140 + }, + { + "epoch": 6.490986985692345, + "grad_norm": 0.9437947273254395, + "learning_rate": 0.0002, + "loss": 0.4651, + "step": 40150 + }, + { + "epoch": 6.49260366987309, + "grad_norm": 1.2005455493927002, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 40160 + }, + { + "epoch": 6.4942203540538355, + "grad_norm": 1.0796732902526855, + "learning_rate": 0.0002, + "loss": 0.4704, + "step": 40170 + }, + { + "epoch": 6.495837038234581, + "grad_norm": 1.1347825527191162, + "learning_rate": 0.0002, + "loss": 0.498, + "step": 40180 + }, + { + "epoch": 6.497453722415326, + "grad_norm": 1.2311455011367798, + "learning_rate": 0.0002, + "loss": 0.5215, + "step": 40190 + }, + { + "epoch": 6.499070406596071, + "grad_norm": 1.068609356880188, + "learning_rate": 0.0002, + "loss": 0.5043, + "step": 40200 + }, + { + "epoch": 6.500687090776816, + "grad_norm": 1.196425437927246, + "learning_rate": 0.0002, + "loss": 0.4868, + "step": 40210 + }, + { + "epoch": 6.5023037749575625, + "grad_norm": 1.183927297592163, + "learning_rate": 0.0002, + "loss": 0.4881, + "step": 40220 + }, + { + "epoch": 6.503920459138307, + "grad_norm": 0.9099724292755127, + "learning_rate": 0.0002, + "loss": 0.4958, + "step": 40230 + }, + { + "epoch": 6.505537143319053, + "grad_norm": 0.9261038899421692, + "learning_rate": 0.0002, + "loss": 0.4816, + "step": 40240 + }, + { + "epoch": 6.507153827499798, + "grad_norm": 1.185491681098938, + "learning_rate": 0.0002, + "loss": 0.5151, + "step": 40250 + }, + { + "epoch": 6.508770511680543, + "grad_norm": 1.1866052150726318, + "learning_rate": 0.0002, + "loss": 0.4853, + "step": 40260 + }, + { + "epoch": 6.510387195861289, + "grad_norm": 1.1600912809371948, + "learning_rate": 0.0002, + "loss": 0.491, + "step": 40270 + }, + { + "epoch": 6.512003880042034, + "grad_norm": 0.9609426259994507, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 40280 + }, + { + "epoch": 6.513620564222779, + "grad_norm": 1.078864336013794, + "learning_rate": 0.0002, + "loss": 0.4794, + "step": 40290 + }, + { + "epoch": 6.515237248403524, + "grad_norm": 1.042761206626892, + "learning_rate": 0.0002, + "loss": 0.46, + "step": 40300 + }, + { + "epoch": 6.51685393258427, + "grad_norm": 0.9742481112480164, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 40310 + }, + { + "epoch": 6.518470616765015, + "grad_norm": 1.2544835805892944, + "learning_rate": 0.0002, + "loss": 0.5234, + "step": 40320 + }, + { + "epoch": 6.52008730094576, + "grad_norm": 1.3019760847091675, + "learning_rate": 0.0002, + "loss": 0.4815, + "step": 40330 + }, + { + "epoch": 6.521703985126505, + "grad_norm": 1.3196964263916016, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 40340 + }, + { + "epoch": 6.5233206693072505, + "grad_norm": 1.2795668840408325, + "learning_rate": 0.0002, + "loss": 0.4979, + "step": 40350 + }, + { + "epoch": 6.524937353487996, + "grad_norm": 1.1618940830230713, + "learning_rate": 0.0002, + "loss": 0.5075, + "step": 40360 + }, + { + "epoch": 6.526554037668742, + "grad_norm": 1.330543041229248, + "learning_rate": 0.0002, + "loss": 0.5081, + "step": 40370 + }, + { + "epoch": 6.528170721849486, + "grad_norm": 1.1946901082992554, + "learning_rate": 0.0002, + "loss": 0.5055, + "step": 40380 + }, + { + "epoch": 6.529787406030232, + "grad_norm": 1.1708201169967651, + "learning_rate": 0.0002, + "loss": 0.4518, + "step": 40390 + }, + { + "epoch": 6.531404090210978, + "grad_norm": 0.894036591053009, + "learning_rate": 0.0002, + "loss": 0.4556, + "step": 40400 + }, + { + "epoch": 6.533020774391723, + "grad_norm": 1.1199041604995728, + "learning_rate": 0.0002, + "loss": 0.4919, + "step": 40410 + }, + { + "epoch": 6.534637458572468, + "grad_norm": 1.180317759513855, + "learning_rate": 0.0002, + "loss": 0.471, + "step": 40420 + }, + { + "epoch": 6.536254142753213, + "grad_norm": 1.37367582321167, + "learning_rate": 0.0002, + "loss": 0.4914, + "step": 40430 + }, + { + "epoch": 6.5378708269339585, + "grad_norm": 1.134791612625122, + "learning_rate": 0.0002, + "loss": 0.4561, + "step": 40440 + }, + { + "epoch": 6.539487511114704, + "grad_norm": 1.1160204410552979, + "learning_rate": 0.0002, + "loss": 0.5337, + "step": 40450 + }, + { + "epoch": 6.541104195295449, + "grad_norm": 1.268347978591919, + "learning_rate": 0.0002, + "loss": 0.5299, + "step": 40460 + }, + { + "epoch": 6.542720879476194, + "grad_norm": 1.1424330472946167, + "learning_rate": 0.0002, + "loss": 0.5167, + "step": 40470 + }, + { + "epoch": 6.544337563656939, + "grad_norm": 1.3098465204238892, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 40480 + }, + { + "epoch": 6.545954247837685, + "grad_norm": 1.3439544439315796, + "learning_rate": 0.0002, + "loss": 0.4865, + "step": 40490 + }, + { + "epoch": 6.54757093201843, + "grad_norm": 1.2708452939987183, + "learning_rate": 0.0002, + "loss": 0.5183, + "step": 40500 + }, + { + "epoch": 6.549187616199175, + "grad_norm": 1.483680248260498, + "learning_rate": 0.0002, + "loss": 0.5099, + "step": 40510 + }, + { + "epoch": 6.550804300379921, + "grad_norm": 1.1697806119918823, + "learning_rate": 0.0002, + "loss": 0.4811, + "step": 40520 + }, + { + "epoch": 6.5524209845606665, + "grad_norm": 1.1665642261505127, + "learning_rate": 0.0002, + "loss": 0.4814, + "step": 40530 + }, + { + "epoch": 6.554037668741412, + "grad_norm": 1.1243325471878052, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 40540 + }, + { + "epoch": 6.555654352922157, + "grad_norm": 1.0277988910675049, + "learning_rate": 0.0002, + "loss": 0.4936, + "step": 40550 + }, + { + "epoch": 6.557271037102902, + "grad_norm": 1.1466810703277588, + "learning_rate": 0.0002, + "loss": 0.487, + "step": 40560 + }, + { + "epoch": 6.558887721283647, + "grad_norm": 1.1415363550186157, + "learning_rate": 0.0002, + "loss": 0.4851, + "step": 40570 + }, + { + "epoch": 6.560504405464393, + "grad_norm": 1.1923491954803467, + "learning_rate": 0.0002, + "loss": 0.4631, + "step": 40580 + }, + { + "epoch": 6.562121089645138, + "grad_norm": 0.9264549612998962, + "learning_rate": 0.0002, + "loss": 0.5071, + "step": 40590 + }, + { + "epoch": 6.563737773825883, + "grad_norm": 0.8810341954231262, + "learning_rate": 0.0002, + "loss": 0.466, + "step": 40600 + }, + { + "epoch": 6.565354458006628, + "grad_norm": 2.3296701908111572, + "learning_rate": 0.0002, + "loss": 0.5085, + "step": 40610 + }, + { + "epoch": 6.5669711421873735, + "grad_norm": 1.0865163803100586, + "learning_rate": 0.0002, + "loss": 0.5196, + "step": 40620 + }, + { + "epoch": 6.568587826368119, + "grad_norm": 0.9844607710838318, + "learning_rate": 0.0002, + "loss": 0.5132, + "step": 40630 + }, + { + "epoch": 6.570204510548864, + "grad_norm": 1.1686855554580688, + "learning_rate": 0.0002, + "loss": 0.5437, + "step": 40640 + }, + { + "epoch": 6.571821194729609, + "grad_norm": 1.016829252243042, + "learning_rate": 0.0002, + "loss": 0.5293, + "step": 40650 + }, + { + "epoch": 6.5734378789103545, + "grad_norm": 1.2789337635040283, + "learning_rate": 0.0002, + "loss": 0.5243, + "step": 40660 + }, + { + "epoch": 6.575054563091101, + "grad_norm": 1.0819072723388672, + "learning_rate": 0.0002, + "loss": 0.4867, + "step": 40670 + }, + { + "epoch": 6.576671247271846, + "grad_norm": 1.1478345394134521, + "learning_rate": 0.0002, + "loss": 0.5024, + "step": 40680 + }, + { + "epoch": 6.578287931452591, + "grad_norm": 0.7972208857536316, + "learning_rate": 0.0002, + "loss": 0.5282, + "step": 40690 + }, + { + "epoch": 6.579904615633336, + "grad_norm": 1.1481789350509644, + "learning_rate": 0.0002, + "loss": 0.4877, + "step": 40700 + }, + { + "epoch": 6.5815212998140815, + "grad_norm": 1.0921871662139893, + "learning_rate": 0.0002, + "loss": 0.5143, + "step": 40710 + }, + { + "epoch": 6.583137983994827, + "grad_norm": 1.0230315923690796, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 40720 + }, + { + "epoch": 6.584754668175572, + "grad_norm": 1.151049017906189, + "learning_rate": 0.0002, + "loss": 0.4734, + "step": 40730 + }, + { + "epoch": 6.586371352356317, + "grad_norm": 1.4016883373260498, + "learning_rate": 0.0002, + "loss": 0.4782, + "step": 40740 + }, + { + "epoch": 6.587988036537062, + "grad_norm": 1.2211825847625732, + "learning_rate": 0.0002, + "loss": 0.5195, + "step": 40750 + }, + { + "epoch": 6.589604720717808, + "grad_norm": 1.2803404331207275, + "learning_rate": 0.0002, + "loss": 0.4815, + "step": 40760 + }, + { + "epoch": 6.591221404898553, + "grad_norm": 1.1119942665100098, + "learning_rate": 0.0002, + "loss": 0.5329, + "step": 40770 + }, + { + "epoch": 6.592838089079298, + "grad_norm": 1.464650273323059, + "learning_rate": 0.0002, + "loss": 0.5135, + "step": 40780 + }, + { + "epoch": 6.594454773260043, + "grad_norm": 1.1751397848129272, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 40790 + }, + { + "epoch": 6.596071457440789, + "grad_norm": 1.0866316556930542, + "learning_rate": 0.0002, + "loss": 0.4772, + "step": 40800 + }, + { + "epoch": 6.597688141621534, + "grad_norm": 1.1733694076538086, + "learning_rate": 0.0002, + "loss": 0.5132, + "step": 40810 + }, + { + "epoch": 6.59930482580228, + "grad_norm": 1.184708833694458, + "learning_rate": 0.0002, + "loss": 0.5138, + "step": 40820 + }, + { + "epoch": 6.600921509983025, + "grad_norm": 1.406081199645996, + "learning_rate": 0.0002, + "loss": 0.4885, + "step": 40830 + }, + { + "epoch": 6.60253819416377, + "grad_norm": 0.9658212661743164, + "learning_rate": 0.0002, + "loss": 0.499, + "step": 40840 + }, + { + "epoch": 6.604154878344516, + "grad_norm": 1.1457678079605103, + "learning_rate": 0.0002, + "loss": 0.5113, + "step": 40850 + }, + { + "epoch": 6.605771562525261, + "grad_norm": 1.0487784147262573, + "learning_rate": 0.0002, + "loss": 0.4916, + "step": 40860 + }, + { + "epoch": 6.607388246706006, + "grad_norm": 0.9357177019119263, + "learning_rate": 0.0002, + "loss": 0.4682, + "step": 40870 + }, + { + "epoch": 6.609004930886751, + "grad_norm": 1.1479727029800415, + "learning_rate": 0.0002, + "loss": 0.4751, + "step": 40880 + }, + { + "epoch": 6.610621615067497, + "grad_norm": 1.3729329109191895, + "learning_rate": 0.0002, + "loss": 0.5493, + "step": 40890 + }, + { + "epoch": 6.612238299248242, + "grad_norm": 1.0085599422454834, + "learning_rate": 0.0002, + "loss": 0.4886, + "step": 40900 + }, + { + "epoch": 6.613854983428987, + "grad_norm": 1.2750911712646484, + "learning_rate": 0.0002, + "loss": 0.516, + "step": 40910 + }, + { + "epoch": 6.615471667609732, + "grad_norm": 1.1929547786712646, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 40920 + }, + { + "epoch": 6.6170883517904775, + "grad_norm": 1.0821375846862793, + "learning_rate": 0.0002, + "loss": 0.4919, + "step": 40930 + }, + { + "epoch": 6.618705035971223, + "grad_norm": 1.197347640991211, + "learning_rate": 0.0002, + "loss": 0.5057, + "step": 40940 + }, + { + "epoch": 6.620321720151968, + "grad_norm": 1.2074699401855469, + "learning_rate": 0.0002, + "loss": 0.492, + "step": 40950 + }, + { + "epoch": 6.621938404332713, + "grad_norm": 1.312009572982788, + "learning_rate": 0.0002, + "loss": 0.5089, + "step": 40960 + }, + { + "epoch": 6.623555088513459, + "grad_norm": 1.4381471872329712, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 40970 + }, + { + "epoch": 6.6251717726942045, + "grad_norm": 1.1574671268463135, + "learning_rate": 0.0002, + "loss": 0.4904, + "step": 40980 + }, + { + "epoch": 6.62678845687495, + "grad_norm": 0.885661780834198, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 40990 + }, + { + "epoch": 6.628405141055695, + "grad_norm": 1.024571180343628, + "learning_rate": 0.0002, + "loss": 0.5145, + "step": 41000 + }, + { + "epoch": 6.63002182523644, + "grad_norm": 1.103437900543213, + "learning_rate": 0.0002, + "loss": 0.4791, + "step": 41010 + }, + { + "epoch": 6.6316385094171855, + "grad_norm": 1.122450828552246, + "learning_rate": 0.0002, + "loss": 0.4671, + "step": 41020 + }, + { + "epoch": 6.633255193597931, + "grad_norm": 1.2256295680999756, + "learning_rate": 0.0002, + "loss": 0.5134, + "step": 41030 + }, + { + "epoch": 6.634871877778676, + "grad_norm": 1.364594578742981, + "learning_rate": 0.0002, + "loss": 0.4908, + "step": 41040 + }, + { + "epoch": 6.636488561959421, + "grad_norm": 0.9550056457519531, + "learning_rate": 0.0002, + "loss": 0.4964, + "step": 41050 + }, + { + "epoch": 6.638105246140166, + "grad_norm": 1.3174707889556885, + "learning_rate": 0.0002, + "loss": 0.5028, + "step": 41060 + }, + { + "epoch": 6.639721930320912, + "grad_norm": 1.0835540294647217, + "learning_rate": 0.0002, + "loss": 0.4717, + "step": 41070 + }, + { + "epoch": 6.641338614501657, + "grad_norm": 1.1432770490646362, + "learning_rate": 0.0002, + "loss": 0.497, + "step": 41080 + }, + { + "epoch": 6.642955298682402, + "grad_norm": 1.2398556470870972, + "learning_rate": 0.0002, + "loss": 0.4903, + "step": 41090 + }, + { + "epoch": 6.644571982863147, + "grad_norm": 1.1147747039794922, + "learning_rate": 0.0002, + "loss": 0.4991, + "step": 41100 + }, + { + "epoch": 6.6461886670438926, + "grad_norm": 1.0730493068695068, + "learning_rate": 0.0002, + "loss": 0.505, + "step": 41110 + }, + { + "epoch": 6.647805351224639, + "grad_norm": 1.3218451738357544, + "learning_rate": 0.0002, + "loss": 0.486, + "step": 41120 + }, + { + "epoch": 6.649422035405384, + "grad_norm": 1.3027331829071045, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 41130 + }, + { + "epoch": 6.651038719586129, + "grad_norm": 1.0280735492706299, + "learning_rate": 0.0002, + "loss": 0.5263, + "step": 41140 + }, + { + "epoch": 6.652655403766874, + "grad_norm": 1.109916090965271, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 41150 + }, + { + "epoch": 6.65427208794762, + "grad_norm": 1.078734040260315, + "learning_rate": 0.0002, + "loss": 0.5001, + "step": 41160 + }, + { + "epoch": 6.655888772128365, + "grad_norm": 1.1595654487609863, + "learning_rate": 0.0002, + "loss": 0.484, + "step": 41170 + }, + { + "epoch": 6.65750545630911, + "grad_norm": 1.1701031923294067, + "learning_rate": 0.0002, + "loss": 0.5101, + "step": 41180 + }, + { + "epoch": 6.659122140489855, + "grad_norm": 1.0424643754959106, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 41190 + }, + { + "epoch": 6.6607388246706005, + "grad_norm": 1.22880220413208, + "learning_rate": 0.0002, + "loss": 0.4863, + "step": 41200 + }, + { + "epoch": 6.662355508851346, + "grad_norm": 1.1907655000686646, + "learning_rate": 0.0002, + "loss": 0.4987, + "step": 41210 + }, + { + "epoch": 6.663972193032091, + "grad_norm": 1.0765007734298706, + "learning_rate": 0.0002, + "loss": 0.5343, + "step": 41220 + }, + { + "epoch": 6.665588877212836, + "grad_norm": 0.9994917511940002, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 41230 + }, + { + "epoch": 6.6672055613935814, + "grad_norm": 0.968578040599823, + "learning_rate": 0.0002, + "loss": 0.507, + "step": 41240 + }, + { + "epoch": 6.668822245574327, + "grad_norm": 1.0576032400131226, + "learning_rate": 0.0002, + "loss": 0.5068, + "step": 41250 + }, + { + "epoch": 6.670438929755072, + "grad_norm": 1.2183765172958374, + "learning_rate": 0.0002, + "loss": 0.486, + "step": 41260 + }, + { + "epoch": 6.672055613935818, + "grad_norm": 1.2548623085021973, + "learning_rate": 0.0002, + "loss": 0.4764, + "step": 41270 + }, + { + "epoch": 6.673672298116563, + "grad_norm": 1.0848388671875, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 41280 + }, + { + "epoch": 6.6752889822973085, + "grad_norm": 1.21421217918396, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 41290 + }, + { + "epoch": 6.676905666478054, + "grad_norm": 1.1453598737716675, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 41300 + }, + { + "epoch": 6.678522350658799, + "grad_norm": 1.2682722806930542, + "learning_rate": 0.0002, + "loss": 0.5033, + "step": 41310 + }, + { + "epoch": 6.680139034839544, + "grad_norm": 1.1659725904464722, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 41320 + }, + { + "epoch": 6.681755719020289, + "grad_norm": 1.36194908618927, + "learning_rate": 0.0002, + "loss": 0.5505, + "step": 41330 + }, + { + "epoch": 6.683372403201035, + "grad_norm": 1.1712592840194702, + "learning_rate": 0.0002, + "loss": 0.5127, + "step": 41340 + }, + { + "epoch": 6.68498908738178, + "grad_norm": 1.4168336391448975, + "learning_rate": 0.0002, + "loss": 0.5082, + "step": 41350 + }, + { + "epoch": 6.686605771562525, + "grad_norm": 1.0395328998565674, + "learning_rate": 0.0002, + "loss": 0.5124, + "step": 41360 + }, + { + "epoch": 6.68822245574327, + "grad_norm": 1.2511054277420044, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 41370 + }, + { + "epoch": 6.689839139924016, + "grad_norm": 1.0438542366027832, + "learning_rate": 0.0002, + "loss": 0.5027, + "step": 41380 + }, + { + "epoch": 6.691455824104761, + "grad_norm": 1.08684241771698, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 41390 + }, + { + "epoch": 6.693072508285506, + "grad_norm": 1.250788927078247, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 41400 + }, + { + "epoch": 6.694689192466251, + "grad_norm": 1.313890814781189, + "learning_rate": 0.0002, + "loss": 0.4921, + "step": 41410 + }, + { + "epoch": 6.696305876646997, + "grad_norm": 1.3218982219696045, + "learning_rate": 0.0002, + "loss": 0.5028, + "step": 41420 + }, + { + "epoch": 6.697922560827743, + "grad_norm": 1.0366582870483398, + "learning_rate": 0.0002, + "loss": 0.4851, + "step": 41430 + }, + { + "epoch": 6.699539245008488, + "grad_norm": 1.066121220588684, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 41440 + }, + { + "epoch": 6.701155929189233, + "grad_norm": 1.0239925384521484, + "learning_rate": 0.0002, + "loss": 0.4966, + "step": 41450 + }, + { + "epoch": 6.702772613369978, + "grad_norm": 0.9402176141738892, + "learning_rate": 0.0002, + "loss": 0.4767, + "step": 41460 + }, + { + "epoch": 6.7043892975507235, + "grad_norm": 1.391718864440918, + "learning_rate": 0.0002, + "loss": 0.5381, + "step": 41470 + }, + { + "epoch": 6.706005981731469, + "grad_norm": 1.215600609779358, + "learning_rate": 0.0002, + "loss": 0.512, + "step": 41480 + }, + { + "epoch": 6.707622665912214, + "grad_norm": 1.063722848892212, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 41490 + }, + { + "epoch": 6.709239350092959, + "grad_norm": 1.132149577140808, + "learning_rate": 0.0002, + "loss": 0.492, + "step": 41500 + }, + { + "epoch": 6.7108560342737045, + "grad_norm": 1.0302950143814087, + "learning_rate": 0.0002, + "loss": 0.4812, + "step": 41510 + }, + { + "epoch": 6.71247271845445, + "grad_norm": 1.5342752933502197, + "learning_rate": 0.0002, + "loss": 0.5141, + "step": 41520 + }, + { + "epoch": 6.714089402635195, + "grad_norm": 1.177137017250061, + "learning_rate": 0.0002, + "loss": 0.5123, + "step": 41530 + }, + { + "epoch": 6.71570608681594, + "grad_norm": 1.2335538864135742, + "learning_rate": 0.0002, + "loss": 0.5082, + "step": 41540 + }, + { + "epoch": 6.717322770996686, + "grad_norm": 1.140604853630066, + "learning_rate": 0.0002, + "loss": 0.4864, + "step": 41550 + }, + { + "epoch": 6.718939455177431, + "grad_norm": 1.3567465543746948, + "learning_rate": 0.0002, + "loss": 0.4888, + "step": 41560 + }, + { + "epoch": 6.720556139358177, + "grad_norm": 1.0693929195404053, + "learning_rate": 0.0002, + "loss": 0.5183, + "step": 41570 + }, + { + "epoch": 6.722172823538922, + "grad_norm": 1.1592605113983154, + "learning_rate": 0.0002, + "loss": 0.5131, + "step": 41580 + }, + { + "epoch": 6.723789507719667, + "grad_norm": 0.989006519317627, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 41590 + }, + { + "epoch": 6.7254061919004124, + "grad_norm": 1.04103422164917, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 41600 + }, + { + "epoch": 6.727022876081158, + "grad_norm": 1.1129004955291748, + "learning_rate": 0.0002, + "loss": 0.4823, + "step": 41610 + }, + { + "epoch": 6.728639560261903, + "grad_norm": 1.1473113298416138, + "learning_rate": 0.0002, + "loss": 0.5032, + "step": 41620 + }, + { + "epoch": 6.730256244442648, + "grad_norm": 1.348036527633667, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 41630 + }, + { + "epoch": 6.731872928623393, + "grad_norm": 1.259942650794983, + "learning_rate": 0.0002, + "loss": 0.4983, + "step": 41640 + }, + { + "epoch": 6.733489612804139, + "grad_norm": 1.0591514110565186, + "learning_rate": 0.0002, + "loss": 0.5182, + "step": 41650 + }, + { + "epoch": 6.735106296984884, + "grad_norm": 0.9737129211425781, + "learning_rate": 0.0002, + "loss": 0.4886, + "step": 41660 + }, + { + "epoch": 6.736722981165629, + "grad_norm": 1.2520451545715332, + "learning_rate": 0.0002, + "loss": 0.5051, + "step": 41670 + }, + { + "epoch": 6.738339665346374, + "grad_norm": 1.0555530786514282, + "learning_rate": 0.0002, + "loss": 0.5364, + "step": 41680 + }, + { + "epoch": 6.7399563495271195, + "grad_norm": 1.0025697946548462, + "learning_rate": 0.0002, + "loss": 0.4954, + "step": 41690 + }, + { + "epoch": 6.741573033707866, + "grad_norm": 1.1114100217819214, + "learning_rate": 0.0002, + "loss": 0.5485, + "step": 41700 + }, + { + "epoch": 6.74318971788861, + "grad_norm": 1.1537504196166992, + "learning_rate": 0.0002, + "loss": 0.4986, + "step": 41710 + }, + { + "epoch": 6.744806402069356, + "grad_norm": 1.037880539894104, + "learning_rate": 0.0002, + "loss": 0.5025, + "step": 41720 + }, + { + "epoch": 6.746423086250101, + "grad_norm": 1.0691965818405151, + "learning_rate": 0.0002, + "loss": 0.482, + "step": 41730 + }, + { + "epoch": 6.748039770430847, + "grad_norm": 1.376325011253357, + "learning_rate": 0.0002, + "loss": 0.5272, + "step": 41740 + }, + { + "epoch": 6.749656454611592, + "grad_norm": 1.4667129516601562, + "learning_rate": 0.0002, + "loss": 0.5484, + "step": 41750 + }, + { + "epoch": 6.751273138792337, + "grad_norm": 1.1517162322998047, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 41760 + }, + { + "epoch": 6.752889822973082, + "grad_norm": 1.1454511880874634, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 41770 + }, + { + "epoch": 6.7545065071538275, + "grad_norm": 1.6323128938674927, + "learning_rate": 0.0002, + "loss": 0.4664, + "step": 41780 + }, + { + "epoch": 6.756123191334573, + "grad_norm": 1.0951642990112305, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 41790 + }, + { + "epoch": 6.757739875515318, + "grad_norm": 1.0766983032226562, + "learning_rate": 0.0002, + "loss": 0.4998, + "step": 41800 + }, + { + "epoch": 6.759356559696063, + "grad_norm": 1.3472381830215454, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 41810 + }, + { + "epoch": 6.760973243876808, + "grad_norm": 1.0248444080352783, + "learning_rate": 0.0002, + "loss": 0.5172, + "step": 41820 + }, + { + "epoch": 6.762589928057554, + "grad_norm": 1.1276055574417114, + "learning_rate": 0.0002, + "loss": 0.5236, + "step": 41830 + }, + { + "epoch": 6.764206612238299, + "grad_norm": 1.5398495197296143, + "learning_rate": 0.0002, + "loss": 0.5044, + "step": 41840 + }, + { + "epoch": 6.765823296419045, + "grad_norm": 1.1886497735977173, + "learning_rate": 0.0002, + "loss": 0.5097, + "step": 41850 + }, + { + "epoch": 6.767439980599789, + "grad_norm": 1.027198076248169, + "learning_rate": 0.0002, + "loss": 0.499, + "step": 41860 + }, + { + "epoch": 6.7690566647805355, + "grad_norm": 1.4644980430603027, + "learning_rate": 0.0002, + "loss": 0.5444, + "step": 41870 + }, + { + "epoch": 6.770673348961281, + "grad_norm": 0.9633586406707764, + "learning_rate": 0.0002, + "loss": 0.5009, + "step": 41880 + }, + { + "epoch": 6.772290033142026, + "grad_norm": 1.0895354747772217, + "learning_rate": 0.0002, + "loss": 0.484, + "step": 41890 + }, + { + "epoch": 6.773906717322771, + "grad_norm": 1.1887167692184448, + "learning_rate": 0.0002, + "loss": 0.5172, + "step": 41900 + }, + { + "epoch": 6.775523401503516, + "grad_norm": 1.3699820041656494, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 41910 + }, + { + "epoch": 6.777140085684262, + "grad_norm": 1.0266352891921997, + "learning_rate": 0.0002, + "loss": 0.5504, + "step": 41920 + }, + { + "epoch": 6.778756769865007, + "grad_norm": 1.0919075012207031, + "learning_rate": 0.0002, + "loss": 0.5105, + "step": 41930 + }, + { + "epoch": 6.780373454045752, + "grad_norm": 0.9839563369750977, + "learning_rate": 0.0002, + "loss": 0.4842, + "step": 41940 + }, + { + "epoch": 6.781990138226497, + "grad_norm": 1.2605451345443726, + "learning_rate": 0.0002, + "loss": 0.5081, + "step": 41950 + }, + { + "epoch": 6.7836068224072426, + "grad_norm": 0.9268672466278076, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 41960 + }, + { + "epoch": 6.785223506587988, + "grad_norm": 1.2002313137054443, + "learning_rate": 0.0002, + "loss": 0.4916, + "step": 41970 + }, + { + "epoch": 6.786840190768733, + "grad_norm": 1.2018438577651978, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 41980 + }, + { + "epoch": 6.788456874949478, + "grad_norm": 1.17646062374115, + "learning_rate": 0.0002, + "loss": 0.5491, + "step": 41990 + }, + { + "epoch": 6.790073559130224, + "grad_norm": 1.1080009937286377, + "learning_rate": 0.0002, + "loss": 0.5354, + "step": 42000 + }, + { + "epoch": 6.791690243310969, + "grad_norm": 1.1606498956680298, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 42010 + }, + { + "epoch": 6.793306927491715, + "grad_norm": 1.2484819889068604, + "learning_rate": 0.0002, + "loss": 0.4931, + "step": 42020 + }, + { + "epoch": 6.79492361167246, + "grad_norm": 1.1363215446472168, + "learning_rate": 0.0002, + "loss": 0.498, + "step": 42030 + }, + { + "epoch": 6.796540295853205, + "grad_norm": 1.4469727277755737, + "learning_rate": 0.0002, + "loss": 0.5343, + "step": 42040 + }, + { + "epoch": 6.7981569800339505, + "grad_norm": 1.0617138147354126, + "learning_rate": 0.0002, + "loss": 0.5146, + "step": 42050 + }, + { + "epoch": 6.799773664214696, + "grad_norm": 1.1459330320358276, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 42060 + }, + { + "epoch": 6.801390348395441, + "grad_norm": 1.2095019817352295, + "learning_rate": 0.0002, + "loss": 0.5116, + "step": 42070 + }, + { + "epoch": 6.803007032576186, + "grad_norm": 1.3200831413269043, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 42080 + }, + { + "epoch": 6.8046237167569315, + "grad_norm": 1.1633318662643433, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 42090 + }, + { + "epoch": 6.806240400937677, + "grad_norm": 0.8986614942550659, + "learning_rate": 0.0002, + "loss": 0.4938, + "step": 42100 + }, + { + "epoch": 6.807857085118422, + "grad_norm": 1.3705275058746338, + "learning_rate": 0.0002, + "loss": 0.559, + "step": 42110 + }, + { + "epoch": 6.809473769299167, + "grad_norm": 1.2418090105056763, + "learning_rate": 0.0002, + "loss": 0.5022, + "step": 42120 + }, + { + "epoch": 6.811090453479912, + "grad_norm": 1.0818954706192017, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 42130 + }, + { + "epoch": 6.812707137660658, + "grad_norm": 0.9293872117996216, + "learning_rate": 0.0002, + "loss": 0.4791, + "step": 42140 + }, + { + "epoch": 6.814323821841404, + "grad_norm": 0.9791894555091858, + "learning_rate": 0.0002, + "loss": 0.5009, + "step": 42150 + }, + { + "epoch": 6.815940506022149, + "grad_norm": 1.1956568956375122, + "learning_rate": 0.0002, + "loss": 0.5142, + "step": 42160 + }, + { + "epoch": 6.817557190202894, + "grad_norm": 0.9643568992614746, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 42170 + }, + { + "epoch": 6.819173874383639, + "grad_norm": 1.2499792575836182, + "learning_rate": 0.0002, + "loss": 0.5121, + "step": 42180 + }, + { + "epoch": 6.820790558564385, + "grad_norm": 1.1779413223266602, + "learning_rate": 0.0002, + "loss": 0.4942, + "step": 42190 + }, + { + "epoch": 6.82240724274513, + "grad_norm": 1.0570595264434814, + "learning_rate": 0.0002, + "loss": 0.498, + "step": 42200 + }, + { + "epoch": 6.824023926925875, + "grad_norm": 1.1393938064575195, + "learning_rate": 0.0002, + "loss": 0.4997, + "step": 42210 + }, + { + "epoch": 6.82564061110662, + "grad_norm": 1.152463436126709, + "learning_rate": 0.0002, + "loss": 0.4842, + "step": 42220 + }, + { + "epoch": 6.827257295287366, + "grad_norm": 1.3353025913238525, + "learning_rate": 0.0002, + "loss": 0.5234, + "step": 42230 + }, + { + "epoch": 6.828873979468111, + "grad_norm": 1.1719051599502563, + "learning_rate": 0.0002, + "loss": 0.539, + "step": 42240 + }, + { + "epoch": 6.830490663648856, + "grad_norm": 1.262141227722168, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 42250 + }, + { + "epoch": 6.832107347829601, + "grad_norm": 1.240899920463562, + "learning_rate": 0.0002, + "loss": 0.5021, + "step": 42260 + }, + { + "epoch": 6.8337240320103465, + "grad_norm": 1.0505269765853882, + "learning_rate": 0.0002, + "loss": 0.4961, + "step": 42270 + }, + { + "epoch": 6.835340716191092, + "grad_norm": 1.1556071043014526, + "learning_rate": 0.0002, + "loss": 0.4932, + "step": 42280 + }, + { + "epoch": 6.836957400371837, + "grad_norm": 1.1427719593048096, + "learning_rate": 0.0002, + "loss": 0.5461, + "step": 42290 + }, + { + "epoch": 6.838574084552583, + "grad_norm": 1.1540080308914185, + "learning_rate": 0.0002, + "loss": 0.5199, + "step": 42300 + }, + { + "epoch": 6.840190768733328, + "grad_norm": 1.0521200895309448, + "learning_rate": 0.0002, + "loss": 0.5269, + "step": 42310 + }, + { + "epoch": 6.8418074529140736, + "grad_norm": 1.0205531120300293, + "learning_rate": 0.0002, + "loss": 0.541, + "step": 42320 + }, + { + "epoch": 6.843424137094819, + "grad_norm": 1.0010193586349487, + "learning_rate": 0.0002, + "loss": 0.5225, + "step": 42330 + }, + { + "epoch": 6.845040821275564, + "grad_norm": 1.2138770818710327, + "learning_rate": 0.0002, + "loss": 0.5101, + "step": 42340 + }, + { + "epoch": 6.846657505456309, + "grad_norm": 1.3028651475906372, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 42350 + }, + { + "epoch": 6.8482741896370545, + "grad_norm": 1.0326353311538696, + "learning_rate": 0.0002, + "loss": 0.4894, + "step": 42360 + }, + { + "epoch": 6.8498908738178, + "grad_norm": 1.036085605621338, + "learning_rate": 0.0002, + "loss": 0.5285, + "step": 42370 + }, + { + "epoch": 6.851507557998545, + "grad_norm": 1.0575472116470337, + "learning_rate": 0.0002, + "loss": 0.505, + "step": 42380 + }, + { + "epoch": 6.85312424217929, + "grad_norm": 1.1749629974365234, + "learning_rate": 0.0002, + "loss": 0.4997, + "step": 42390 + }, + { + "epoch": 6.854740926360035, + "grad_norm": 1.1747760772705078, + "learning_rate": 0.0002, + "loss": 0.4961, + "step": 42400 + }, + { + "epoch": 6.856357610540781, + "grad_norm": 1.1877071857452393, + "learning_rate": 0.0002, + "loss": 0.5138, + "step": 42410 + }, + { + "epoch": 6.857974294721526, + "grad_norm": 1.1209983825683594, + "learning_rate": 0.0002, + "loss": 0.4972, + "step": 42420 + }, + { + "epoch": 6.859590978902271, + "grad_norm": 1.2918205261230469, + "learning_rate": 0.0002, + "loss": 0.4939, + "step": 42430 + }, + { + "epoch": 6.861207663083016, + "grad_norm": 1.2443464994430542, + "learning_rate": 0.0002, + "loss": 0.5012, + "step": 42440 + }, + { + "epoch": 6.8628243472637624, + "grad_norm": 0.9336795210838318, + "learning_rate": 0.0002, + "loss": 0.5226, + "step": 42450 + }, + { + "epoch": 6.864441031444508, + "grad_norm": 1.2183542251586914, + "learning_rate": 0.0002, + "loss": 0.5108, + "step": 42460 + }, + { + "epoch": 6.866057715625253, + "grad_norm": 1.0071234703063965, + "learning_rate": 0.0002, + "loss": 0.5245, + "step": 42470 + }, + { + "epoch": 6.867674399805998, + "grad_norm": 1.2914012670516968, + "learning_rate": 0.0002, + "loss": 0.4753, + "step": 42480 + }, + { + "epoch": 6.869291083986743, + "grad_norm": 1.1050426959991455, + "learning_rate": 0.0002, + "loss": 0.4865, + "step": 42490 + }, + { + "epoch": 6.870907768167489, + "grad_norm": 1.1163811683654785, + "learning_rate": 0.0002, + "loss": 0.5243, + "step": 42500 + }, + { + "epoch": 6.872524452348234, + "grad_norm": 1.1575818061828613, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 42510 + }, + { + "epoch": 6.874141136528979, + "grad_norm": 1.11167311668396, + "learning_rate": 0.0002, + "loss": 0.5353, + "step": 42520 + }, + { + "epoch": 6.875757820709724, + "grad_norm": 1.0379102230072021, + "learning_rate": 0.0002, + "loss": 0.5141, + "step": 42530 + }, + { + "epoch": 6.8773745048904695, + "grad_norm": 1.2617160081863403, + "learning_rate": 0.0002, + "loss": 0.5355, + "step": 42540 + }, + { + "epoch": 6.878991189071215, + "grad_norm": 1.1749719381332397, + "learning_rate": 0.0002, + "loss": 0.4785, + "step": 42550 + }, + { + "epoch": 6.88060787325196, + "grad_norm": 1.2284821271896362, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 42560 + }, + { + "epoch": 6.882224557432705, + "grad_norm": 1.1917030811309814, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 42570 + }, + { + "epoch": 6.8838412416134505, + "grad_norm": 1.1943914890289307, + "learning_rate": 0.0002, + "loss": 0.5176, + "step": 42580 + }, + { + "epoch": 6.885457925794196, + "grad_norm": 1.2641394138336182, + "learning_rate": 0.0002, + "loss": 0.5072, + "step": 42590 + }, + { + "epoch": 6.887074609974942, + "grad_norm": 1.1280436515808105, + "learning_rate": 0.0002, + "loss": 0.5004, + "step": 42600 + }, + { + "epoch": 6.888691294155687, + "grad_norm": 0.9865449070930481, + "learning_rate": 0.0002, + "loss": 0.5328, + "step": 42610 + }, + { + "epoch": 6.890307978336432, + "grad_norm": 0.994987428188324, + "learning_rate": 0.0002, + "loss": 0.4953, + "step": 42620 + }, + { + "epoch": 6.8919246625171775, + "grad_norm": 0.9900388717651367, + "learning_rate": 0.0002, + "loss": 0.4805, + "step": 42630 + }, + { + "epoch": 6.893541346697923, + "grad_norm": 1.2992421388626099, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 42640 + }, + { + "epoch": 6.895158030878668, + "grad_norm": 1.0152487754821777, + "learning_rate": 0.0002, + "loss": 0.5017, + "step": 42650 + }, + { + "epoch": 6.896774715059413, + "grad_norm": 1.199453353881836, + "learning_rate": 0.0002, + "loss": 0.5043, + "step": 42660 + }, + { + "epoch": 6.898391399240158, + "grad_norm": 1.100630521774292, + "learning_rate": 0.0002, + "loss": 0.5106, + "step": 42670 + }, + { + "epoch": 6.900008083420904, + "grad_norm": 1.0489764213562012, + "learning_rate": 0.0002, + "loss": 0.503, + "step": 42680 + }, + { + "epoch": 6.901624767601649, + "grad_norm": 1.101407527923584, + "learning_rate": 0.0002, + "loss": 0.4634, + "step": 42690 + }, + { + "epoch": 6.903241451782394, + "grad_norm": 1.3130593299865723, + "learning_rate": 0.0002, + "loss": 0.5361, + "step": 42700 + }, + { + "epoch": 6.904858135963139, + "grad_norm": 0.9906072616577148, + "learning_rate": 0.0002, + "loss": 0.5119, + "step": 42710 + }, + { + "epoch": 6.906474820143885, + "grad_norm": 1.094502329826355, + "learning_rate": 0.0002, + "loss": 0.5146, + "step": 42720 + }, + { + "epoch": 6.90809150432463, + "grad_norm": 1.1025426387786865, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 42730 + }, + { + "epoch": 6.909708188505375, + "grad_norm": 1.0644042491912842, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 42740 + }, + { + "epoch": 6.911324872686121, + "grad_norm": 1.0709129571914673, + "learning_rate": 0.0002, + "loss": 0.5024, + "step": 42750 + }, + { + "epoch": 6.912941556866866, + "grad_norm": 1.2445871829986572, + "learning_rate": 0.0002, + "loss": 0.5093, + "step": 42760 + }, + { + "epoch": 6.914558241047612, + "grad_norm": 1.020058035850525, + "learning_rate": 0.0002, + "loss": 0.5305, + "step": 42770 + }, + { + "epoch": 6.916174925228357, + "grad_norm": 0.9795091152191162, + "learning_rate": 0.0002, + "loss": 0.5382, + "step": 42780 + }, + { + "epoch": 6.917791609409102, + "grad_norm": 0.9369977116584778, + "learning_rate": 0.0002, + "loss": 0.5429, + "step": 42790 + }, + { + "epoch": 6.919408293589847, + "grad_norm": 1.0741904973983765, + "learning_rate": 0.0002, + "loss": 0.5444, + "step": 42800 + }, + { + "epoch": 6.921024977770593, + "grad_norm": 1.0702799558639526, + "learning_rate": 0.0002, + "loss": 0.5402, + "step": 42810 + }, + { + "epoch": 6.922641661951338, + "grad_norm": 1.0383983850479126, + "learning_rate": 0.0002, + "loss": 0.5291, + "step": 42820 + }, + { + "epoch": 6.924258346132083, + "grad_norm": 1.0761083364486694, + "learning_rate": 0.0002, + "loss": 0.5106, + "step": 42830 + }, + { + "epoch": 6.925875030312828, + "grad_norm": 1.2332350015640259, + "learning_rate": 0.0002, + "loss": 0.5726, + "step": 42840 + }, + { + "epoch": 6.9274917144935735, + "grad_norm": 1.3184348344802856, + "learning_rate": 0.0002, + "loss": 0.4996, + "step": 42850 + }, + { + "epoch": 6.929108398674319, + "grad_norm": 1.0586378574371338, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 42860 + }, + { + "epoch": 6.930725082855064, + "grad_norm": 1.2294201850891113, + "learning_rate": 0.0002, + "loss": 0.511, + "step": 42870 + }, + { + "epoch": 6.932341767035809, + "grad_norm": 1.3097991943359375, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 42880 + }, + { + "epoch": 6.933958451216554, + "grad_norm": 0.9006873965263367, + "learning_rate": 0.0002, + "loss": 0.5228, + "step": 42890 + }, + { + "epoch": 6.9355751353973005, + "grad_norm": 1.265931248664856, + "learning_rate": 0.0002, + "loss": 0.4617, + "step": 42900 + }, + { + "epoch": 6.937191819578046, + "grad_norm": 1.1013522148132324, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 42910 + }, + { + "epoch": 6.938808503758791, + "grad_norm": 0.9910131692886353, + "learning_rate": 0.0002, + "loss": 0.5334, + "step": 42920 + }, + { + "epoch": 6.940425187939536, + "grad_norm": 1.102683424949646, + "learning_rate": 0.0002, + "loss": 0.5211, + "step": 42930 + }, + { + "epoch": 6.9420418721202815, + "grad_norm": 1.232961893081665, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 42940 + }, + { + "epoch": 6.943658556301027, + "grad_norm": 1.1714650392532349, + "learning_rate": 0.0002, + "loss": 0.5357, + "step": 42950 + }, + { + "epoch": 6.945275240481772, + "grad_norm": 1.1684318780899048, + "learning_rate": 0.0002, + "loss": 0.5232, + "step": 42960 + }, + { + "epoch": 6.946891924662517, + "grad_norm": 1.2074716091156006, + "learning_rate": 0.0002, + "loss": 0.5035, + "step": 42970 + }, + { + "epoch": 6.948508608843262, + "grad_norm": 1.2061275243759155, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 42980 + }, + { + "epoch": 6.950125293024008, + "grad_norm": 1.1216989755630493, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 42990 + }, + { + "epoch": 6.951741977204753, + "grad_norm": 1.304117202758789, + "learning_rate": 0.0002, + "loss": 0.4948, + "step": 43000 + }, + { + "epoch": 6.953358661385498, + "grad_norm": 1.2377972602844238, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 43010 + }, + { + "epoch": 6.954975345566243, + "grad_norm": 1.2332178354263306, + "learning_rate": 0.0002, + "loss": 0.4792, + "step": 43020 + }, + { + "epoch": 6.956592029746989, + "grad_norm": 1.1919599771499634, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 43030 + }, + { + "epoch": 6.958208713927734, + "grad_norm": 1.272700548171997, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 43040 + }, + { + "epoch": 6.95982539810848, + "grad_norm": 1.4377546310424805, + "learning_rate": 0.0002, + "loss": 0.5328, + "step": 43050 + }, + { + "epoch": 6.961442082289225, + "grad_norm": 1.2070353031158447, + "learning_rate": 0.0002, + "loss": 0.4894, + "step": 43060 + }, + { + "epoch": 6.96305876646997, + "grad_norm": 1.090205430984497, + "learning_rate": 0.0002, + "loss": 0.525, + "step": 43070 + }, + { + "epoch": 6.964675450650716, + "grad_norm": 1.1832911968231201, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 43080 + }, + { + "epoch": 6.966292134831461, + "grad_norm": 1.2921082973480225, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 43090 + }, + { + "epoch": 6.967908819012206, + "grad_norm": 1.4303096532821655, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 43100 + }, + { + "epoch": 6.969525503192951, + "grad_norm": 1.0788004398345947, + "learning_rate": 0.0002, + "loss": 0.4807, + "step": 43110 + }, + { + "epoch": 6.9711421873736965, + "grad_norm": 1.2192047834396362, + "learning_rate": 0.0002, + "loss": 0.5006, + "step": 43120 + }, + { + "epoch": 6.972758871554442, + "grad_norm": 1.0735143423080444, + "learning_rate": 0.0002, + "loss": 0.4714, + "step": 43130 + }, + { + "epoch": 6.974375555735187, + "grad_norm": 1.0317153930664062, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 43140 + }, + { + "epoch": 6.975992239915932, + "grad_norm": 1.0926798582077026, + "learning_rate": 0.0002, + "loss": 0.5154, + "step": 43150 + }, + { + "epoch": 6.977608924096677, + "grad_norm": 1.1660500764846802, + "learning_rate": 0.0002, + "loss": 0.4976, + "step": 43160 + }, + { + "epoch": 6.979225608277423, + "grad_norm": 1.3945232629776, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 43170 + }, + { + "epoch": 6.980842292458169, + "grad_norm": 1.2684587240219116, + "learning_rate": 0.0002, + "loss": 0.4979, + "step": 43180 + }, + { + "epoch": 6.982458976638913, + "grad_norm": 1.1574004888534546, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 43190 + }, + { + "epoch": 6.984075660819659, + "grad_norm": 1.2534198760986328, + "learning_rate": 0.0002, + "loss": 0.5629, + "step": 43200 + }, + { + "epoch": 6.9856923450004045, + "grad_norm": 1.135245442390442, + "learning_rate": 0.0002, + "loss": 0.5191, + "step": 43210 + }, + { + "epoch": 6.98730902918115, + "grad_norm": 1.3824104070663452, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 43220 + }, + { + "epoch": 6.988925713361895, + "grad_norm": 1.2128452062606812, + "learning_rate": 0.0002, + "loss": 0.5294, + "step": 43230 + }, + { + "epoch": 6.99054239754264, + "grad_norm": 1.0795245170593262, + "learning_rate": 0.0002, + "loss": 0.505, + "step": 43240 + }, + { + "epoch": 6.992159081723385, + "grad_norm": 1.337353229522705, + "learning_rate": 0.0002, + "loss": 0.4889, + "step": 43250 + }, + { + "epoch": 6.993775765904131, + "grad_norm": 1.1731765270233154, + "learning_rate": 0.0002, + "loss": 0.4749, + "step": 43260 + }, + { + "epoch": 6.995392450084876, + "grad_norm": 1.0203192234039307, + "learning_rate": 0.0002, + "loss": 0.4897, + "step": 43270 + }, + { + "epoch": 6.997009134265621, + "grad_norm": 0.9261201620101929, + "learning_rate": 0.0002, + "loss": 0.5324, + "step": 43280 + }, + { + "epoch": 6.998625818446366, + "grad_norm": 1.107865810394287, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 43290 + }, + { + "epoch": 6.9999191657909625, + "eval_loss": 1.2679380178451538, + "eval_runtime": 122.202, + "eval_samples_per_second": 5.998, + "eval_steps_per_second": 0.753, + "step": 43298 + } + ], + "logging_steps": 10, + "max_steps": 49480, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.0037576307318456e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..155b12fa9acbc6e71dba75c92bfa79e152397ebf --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-43298/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28694d5564a2b5c7d6881d4ba2af103356aa22489d2c22768ebbe47283c0f4a1 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f828a74fd080647f5b76b649265f019c1dee6822 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ae514d1496760bcd747d5f35daaea1ebe7589d9dd859a867aab3f6c7ee63ffb +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3afc3bb8dbb51f750dab1dac300b22f2bc060ea --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ffad8d9c012795bb87f2c48fc7e292b984d249e84ceb16f5b4353e87aaf6e92 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e6f2c4ca9aa9b93bb0507887bfdf97dc98716951 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ffb5fee38f909e20f0db9c8d5d103ccd6be5344dadb2c10329d2136c9aa207c +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9f94c302b2027faa9c2dc68a9cd9b3395df05d67 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67233313ac4b876ce7a58fbb5923f3e93ad0293bc11cc82e5d9a6d5c6521e23e +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..148526696eab1bc2c6d827bf826dde47f5f93dd3 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/trainer_state.json @@ -0,0 +1,34733 @@ +{ + "best_metric": 1.0871200561523438, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185", + "epoch": 7.999353326327702, + "eval_steps": 10, + "global_step": 49480, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016166841807452913, + "grad_norm": 0.9894065856933594, + "learning_rate": 0.0002, + "loss": 1.6636, + "step": 10 + }, + { + "epoch": 0.0032333683614905826, + "grad_norm": 1.7810699939727783, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 20 + }, + { + "epoch": 0.004850052542235874, + "grad_norm": 0.5969577431678772, + "learning_rate": 0.0002, + "loss": 0.9767, + "step": 30 + }, + { + "epoch": 0.006466736722981165, + "grad_norm": 0.6354120969772339, + "learning_rate": 0.0002, + "loss": 0.9772, + "step": 40 + }, + { + "epoch": 0.008083420903726457, + "grad_norm": 0.5604607462882996, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 50 + }, + { + "epoch": 0.009700105084471748, + "grad_norm": 0.4676193594932556, + "learning_rate": 0.0002, + "loss": 0.8841, + "step": 60 + }, + { + "epoch": 0.01131678926521704, + "grad_norm": 0.6099211573600769, + "learning_rate": 0.0002, + "loss": 0.9022, + "step": 70 + }, + { + "epoch": 0.01293347344596233, + "grad_norm": 0.48639994859695435, + "learning_rate": 0.0002, + "loss": 0.9133, + "step": 80 + }, + { + "epoch": 0.014550157626707623, + "grad_norm": 0.4904264509677887, + "learning_rate": 0.0002, + "loss": 0.8704, + "step": 90 + }, + { + "epoch": 0.016166841807452915, + "grad_norm": 2.8334362506866455, + "learning_rate": 0.0002, + "loss": 0.8855, + "step": 100 + }, + { + "epoch": 0.017783525988198205, + "grad_norm": 0.43221670389175415, + "learning_rate": 0.0002, + "loss": 0.8958, + "step": 110 + }, + { + "epoch": 0.019400210168943496, + "grad_norm": 0.42244166135787964, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 120 + }, + { + "epoch": 0.02101689434968879, + "grad_norm": 0.45363298058509827, + "learning_rate": 0.0002, + "loss": 0.8467, + "step": 130 + }, + { + "epoch": 0.02263357853043408, + "grad_norm": 0.44816508889198303, + "learning_rate": 0.0002, + "loss": 0.8641, + "step": 140 + }, + { + "epoch": 0.02425026271117937, + "grad_norm": 0.43308213353157043, + "learning_rate": 0.0002, + "loss": 0.8496, + "step": 150 + }, + { + "epoch": 0.02586694689192466, + "grad_norm": 0.4084763526916504, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 160 + }, + { + "epoch": 0.027483631072669955, + "grad_norm": 0.5363703966140747, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 170 + }, + { + "epoch": 0.029100315253415245, + "grad_norm": 0.4619699716567993, + "learning_rate": 0.0002, + "loss": 0.8558, + "step": 180 + }, + { + "epoch": 0.030716999434160536, + "grad_norm": 0.49069908261299133, + "learning_rate": 0.0002, + "loss": 0.8878, + "step": 190 + }, + { + "epoch": 0.03233368361490583, + "grad_norm": 0.4645835757255554, + "learning_rate": 0.0002, + "loss": 0.8867, + "step": 200 + }, + { + "epoch": 0.03395036779565112, + "grad_norm": 1.2411243915557861, + "learning_rate": 0.0002, + "loss": 0.8842, + "step": 210 + }, + { + "epoch": 0.03556705197639641, + "grad_norm": 0.5211851596832275, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 220 + }, + { + "epoch": 0.037183736157141704, + "grad_norm": 0.5253691673278809, + "learning_rate": 0.0002, + "loss": 0.8194, + "step": 230 + }, + { + "epoch": 0.03880042033788699, + "grad_norm": 0.4567478895187378, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 240 + }, + { + "epoch": 0.040417104518632285, + "grad_norm": 0.5472128391265869, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 250 + }, + { + "epoch": 0.04203378869937758, + "grad_norm": 0.42978546023368835, + "learning_rate": 0.0002, + "loss": 0.8201, + "step": 260 + }, + { + "epoch": 0.043650472880122866, + "grad_norm": 0.601734459400177, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 270 + }, + { + "epoch": 0.04526715706086816, + "grad_norm": 0.4286513328552246, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 280 + }, + { + "epoch": 0.046883841241613454, + "grad_norm": 0.5230861902236938, + "learning_rate": 0.0002, + "loss": 0.8758, + "step": 290 + }, + { + "epoch": 0.04850052542235874, + "grad_norm": 0.6504611968994141, + "learning_rate": 0.0002, + "loss": 0.8636, + "step": 300 + }, + { + "epoch": 0.050117209603104035, + "grad_norm": 0.43485215306282043, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 310 + }, + { + "epoch": 0.05173389378384932, + "grad_norm": 0.4717007875442505, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 320 + }, + { + "epoch": 0.053350577964594616, + "grad_norm": 0.4059787690639496, + "learning_rate": 0.0002, + "loss": 0.8469, + "step": 330 + }, + { + "epoch": 0.05496726214533991, + "grad_norm": 0.4366913437843323, + "learning_rate": 0.0002, + "loss": 0.8866, + "step": 340 + }, + { + "epoch": 0.0565839463260852, + "grad_norm": 0.4233848452568054, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 350 + }, + { + "epoch": 0.05820063050683049, + "grad_norm": 0.4209108352661133, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 360 + }, + { + "epoch": 0.059817314687575784, + "grad_norm": 0.41637396812438965, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 370 + }, + { + "epoch": 0.06143399886832107, + "grad_norm": 0.46235376596450806, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 380 + }, + { + "epoch": 0.06305068304906636, + "grad_norm": 0.4013484716415405, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 390 + }, + { + "epoch": 0.06466736722981166, + "grad_norm": 0.47443896532058716, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 400 + }, + { + "epoch": 0.06628405141055695, + "grad_norm": 0.3942156434059143, + "learning_rate": 0.0002, + "loss": 0.8666, + "step": 410 + }, + { + "epoch": 0.06790073559130223, + "grad_norm": 0.4965320825576782, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 420 + }, + { + "epoch": 0.06951741977204753, + "grad_norm": 0.4304835796356201, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 430 + }, + { + "epoch": 0.07113410395279282, + "grad_norm": 0.511726975440979, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 440 + }, + { + "epoch": 0.07275078813353811, + "grad_norm": 0.4040689170360565, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 450 + }, + { + "epoch": 0.07436747231428341, + "grad_norm": 0.5402171015739441, + "learning_rate": 0.0002, + "loss": 0.8788, + "step": 460 + }, + { + "epoch": 0.0759841564950287, + "grad_norm": 0.4174517095088959, + "learning_rate": 0.0002, + "loss": 0.8737, + "step": 470 + }, + { + "epoch": 0.07760084067577398, + "grad_norm": 0.4306182265281677, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 480 + }, + { + "epoch": 0.07921752485651928, + "grad_norm": 0.535210132598877, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 490 + }, + { + "epoch": 0.08083420903726457, + "grad_norm": 0.5339109897613525, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 500 + }, + { + "epoch": 0.08245089321800986, + "grad_norm": 0.45754891633987427, + "learning_rate": 0.0002, + "loss": 0.8985, + "step": 510 + }, + { + "epoch": 0.08406757739875516, + "grad_norm": 0.43820783495903015, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 520 + }, + { + "epoch": 0.08568426157950045, + "grad_norm": 0.4434749186038971, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 530 + }, + { + "epoch": 0.08730094576024573, + "grad_norm": 0.43111467361450195, + "learning_rate": 0.0002, + "loss": 0.7857, + "step": 540 + }, + { + "epoch": 0.08891762994099103, + "grad_norm": 0.4378940165042877, + "learning_rate": 0.0002, + "loss": 0.8418, + "step": 550 + }, + { + "epoch": 0.09053431412173632, + "grad_norm": 0.4772215187549591, + "learning_rate": 0.0002, + "loss": 0.8361, + "step": 560 + }, + { + "epoch": 0.09215099830248161, + "grad_norm": 0.6837629079818726, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 570 + }, + { + "epoch": 0.09376768248322691, + "grad_norm": 0.42241212725639343, + "learning_rate": 0.0002, + "loss": 0.8607, + "step": 580 + }, + { + "epoch": 0.0953843666639722, + "grad_norm": 0.5165936350822449, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 590 + }, + { + "epoch": 0.09700105084471748, + "grad_norm": 0.48737478256225586, + "learning_rate": 0.0002, + "loss": 0.8664, + "step": 600 + }, + { + "epoch": 0.09861773502546278, + "grad_norm": 0.47419852018356323, + "learning_rate": 0.0002, + "loss": 0.8806, + "step": 610 + }, + { + "epoch": 0.10023441920620807, + "grad_norm": 0.4975486099720001, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 620 + }, + { + "epoch": 0.10185110338695336, + "grad_norm": 0.49123844504356384, + "learning_rate": 0.0002, + "loss": 0.8548, + "step": 630 + }, + { + "epoch": 0.10346778756769864, + "grad_norm": 0.6288952827453613, + "learning_rate": 0.0002, + "loss": 0.8911, + "step": 640 + }, + { + "epoch": 0.10508447174844394, + "grad_norm": 0.4277345836162567, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 650 + }, + { + "epoch": 0.10670115592918923, + "grad_norm": 0.4021061956882477, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 660 + }, + { + "epoch": 0.10831784010993452, + "grad_norm": 0.3492237329483032, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 670 + }, + { + "epoch": 0.10993452429067982, + "grad_norm": 0.4341012239456177, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 680 + }, + { + "epoch": 0.1115512084714251, + "grad_norm": 0.7296304106712341, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 690 + }, + { + "epoch": 0.1131678926521704, + "grad_norm": 0.397494912147522, + "learning_rate": 0.0002, + "loss": 0.8433, + "step": 700 + }, + { + "epoch": 0.1147845768329157, + "grad_norm": 0.396431028842926, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 710 + }, + { + "epoch": 0.11640126101366098, + "grad_norm": 0.48842838406562805, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 720 + }, + { + "epoch": 0.11801794519440627, + "grad_norm": 0.46322616934776306, + "learning_rate": 0.0002, + "loss": 0.8238, + "step": 730 + }, + { + "epoch": 0.11963462937515157, + "grad_norm": 0.47990912199020386, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 740 + }, + { + "epoch": 0.12125131355589686, + "grad_norm": 0.4997142255306244, + "learning_rate": 0.0002, + "loss": 0.82, + "step": 750 + }, + { + "epoch": 0.12286799773664214, + "grad_norm": 0.4040526747703552, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 760 + }, + { + "epoch": 0.12448468191738744, + "grad_norm": 0.453095942735672, + "learning_rate": 0.0002, + "loss": 0.863, + "step": 770 + }, + { + "epoch": 0.12610136609813272, + "grad_norm": 0.4636971950531006, + "learning_rate": 0.0002, + "loss": 0.8792, + "step": 780 + }, + { + "epoch": 0.12771805027887803, + "grad_norm": 0.4279276132583618, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 790 + }, + { + "epoch": 0.12933473445962332, + "grad_norm": 0.46212655305862427, + "learning_rate": 0.0002, + "loss": 0.8711, + "step": 800 + }, + { + "epoch": 0.1309514186403686, + "grad_norm": 0.43127650022506714, + "learning_rate": 0.0002, + "loss": 0.8368, + "step": 810 + }, + { + "epoch": 0.1325681028211139, + "grad_norm": 0.4201301336288452, + "learning_rate": 0.0002, + "loss": 0.8476, + "step": 820 + }, + { + "epoch": 0.13418478700185918, + "grad_norm": 0.42583167552948, + "learning_rate": 0.0002, + "loss": 0.8078, + "step": 830 + }, + { + "epoch": 0.13580147118260447, + "grad_norm": 0.4535622000694275, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 840 + }, + { + "epoch": 0.13741815536334978, + "grad_norm": 0.4116036891937256, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 850 + }, + { + "epoch": 0.13903483954409507, + "grad_norm": 0.45997580885887146, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 860 + }, + { + "epoch": 0.14065152372484035, + "grad_norm": 0.4487837255001068, + "learning_rate": 0.0002, + "loss": 0.8917, + "step": 870 + }, + { + "epoch": 0.14226820790558564, + "grad_norm": 0.43650057911872864, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 880 + }, + { + "epoch": 0.14388489208633093, + "grad_norm": 0.5335358381271362, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 890 + }, + { + "epoch": 0.14550157626707622, + "grad_norm": 0.5989000201225281, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 900 + }, + { + "epoch": 0.14711826044782153, + "grad_norm": 0.517179012298584, + "learning_rate": 0.0002, + "loss": 0.8385, + "step": 910 + }, + { + "epoch": 0.14873494462856682, + "grad_norm": 0.44435232877731323, + "learning_rate": 0.0002, + "loss": 0.8255, + "step": 920 + }, + { + "epoch": 0.1503516288093121, + "grad_norm": 0.42635923624038696, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 930 + }, + { + "epoch": 0.1519683129900574, + "grad_norm": 0.49603334069252014, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 940 + }, + { + "epoch": 0.15358499717080268, + "grad_norm": 0.40639808773994446, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 950 + }, + { + "epoch": 0.15520168135154797, + "grad_norm": 0.4850759208202362, + "learning_rate": 0.0002, + "loss": 0.8529, + "step": 960 + }, + { + "epoch": 0.15681836553229328, + "grad_norm": 0.4427442252635956, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 970 + }, + { + "epoch": 0.15843504971303857, + "grad_norm": 0.3760930001735687, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 980 + }, + { + "epoch": 0.16005173389378385, + "grad_norm": 0.4794144332408905, + "learning_rate": 0.0002, + "loss": 0.8644, + "step": 990 + }, + { + "epoch": 0.16166841807452914, + "grad_norm": 0.45828768610954285, + "learning_rate": 0.0002, + "loss": 0.8002, + "step": 1000 + }, + { + "epoch": 0.16328510225527443, + "grad_norm": 0.6313053369522095, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 1010 + }, + { + "epoch": 0.16490178643601971, + "grad_norm": 0.45041006803512573, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1020 + }, + { + "epoch": 0.166518470616765, + "grad_norm": 0.441403865814209, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1030 + }, + { + "epoch": 0.16813515479751032, + "grad_norm": 0.8171296119689941, + "learning_rate": 0.0002, + "loss": 0.8475, + "step": 1040 + }, + { + "epoch": 0.1697518389782556, + "grad_norm": 0.7137420773506165, + "learning_rate": 0.0002, + "loss": 0.845, + "step": 1050 + }, + { + "epoch": 0.1713685231590009, + "grad_norm": 0.5236809849739075, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 1060 + }, + { + "epoch": 0.17298520733974618, + "grad_norm": 0.5021864175796509, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 1070 + }, + { + "epoch": 0.17460189152049146, + "grad_norm": 0.47347521781921387, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 1080 + }, + { + "epoch": 0.17621857570123675, + "grad_norm": 0.4631653428077698, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 1090 + }, + { + "epoch": 0.17783525988198207, + "grad_norm": 0.49169182777404785, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 1100 + }, + { + "epoch": 0.17945194406272735, + "grad_norm": 0.5019739270210266, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 1110 + }, + { + "epoch": 0.18106862824347264, + "grad_norm": 0.5100422501564026, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 1120 + }, + { + "epoch": 0.18268531242421793, + "grad_norm": 0.3888324499130249, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 1130 + }, + { + "epoch": 0.18430199660496321, + "grad_norm": 0.39765217900276184, + "learning_rate": 0.0002, + "loss": 0.8533, + "step": 1140 + }, + { + "epoch": 0.1859186807857085, + "grad_norm": 0.47190186381340027, + "learning_rate": 0.0002, + "loss": 0.8541, + "step": 1150 + }, + { + "epoch": 0.18753536496645382, + "grad_norm": 0.4464188814163208, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1160 + }, + { + "epoch": 0.1891520491471991, + "grad_norm": 0.5153930187225342, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 1170 + }, + { + "epoch": 0.1907687333279444, + "grad_norm": 0.4779708683490753, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 1180 + }, + { + "epoch": 0.19238541750868968, + "grad_norm": 0.4834315776824951, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 1190 + }, + { + "epoch": 0.19400210168943496, + "grad_norm": 0.402357816696167, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 1200 + }, + { + "epoch": 0.19561878587018025, + "grad_norm": 0.45899084210395813, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 1210 + }, + { + "epoch": 0.19723547005092557, + "grad_norm": 0.5106529593467712, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 1220 + }, + { + "epoch": 0.19885215423167085, + "grad_norm": 0.45261722803115845, + "learning_rate": 0.0002, + "loss": 0.7816, + "step": 1230 + }, + { + "epoch": 0.20046883841241614, + "grad_norm": 0.4647127091884613, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 1240 + }, + { + "epoch": 0.20208552259316143, + "grad_norm": 0.4849368929862976, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 1250 + }, + { + "epoch": 0.2037022067739067, + "grad_norm": 0.4518061578273773, + "learning_rate": 0.0002, + "loss": 0.8514, + "step": 1260 + }, + { + "epoch": 0.205318890954652, + "grad_norm": 0.49535325169563293, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 1270 + }, + { + "epoch": 0.2069355751353973, + "grad_norm": 0.4835205376148224, + "learning_rate": 0.0002, + "loss": 0.8348, + "step": 1280 + }, + { + "epoch": 0.2085522593161426, + "grad_norm": 0.45308539271354675, + "learning_rate": 0.0002, + "loss": 0.8428, + "step": 1290 + }, + { + "epoch": 0.2101689434968879, + "grad_norm": 0.5369905233383179, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 1300 + }, + { + "epoch": 0.21178562767763318, + "grad_norm": 0.5031622052192688, + "learning_rate": 0.0002, + "loss": 0.8676, + "step": 1310 + }, + { + "epoch": 0.21340231185837846, + "grad_norm": 0.48010334372520447, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 1320 + }, + { + "epoch": 0.21501899603912375, + "grad_norm": 0.4905701279640198, + "learning_rate": 0.0002, + "loss": 0.806, + "step": 1330 + }, + { + "epoch": 0.21663568021986904, + "grad_norm": 0.43531742691993713, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 1340 + }, + { + "epoch": 0.21825236440061435, + "grad_norm": 0.44330692291259766, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 1350 + }, + { + "epoch": 0.21986904858135964, + "grad_norm": 0.5384416580200195, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 1360 + }, + { + "epoch": 0.22148573276210493, + "grad_norm": 0.4181833863258362, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 1370 + }, + { + "epoch": 0.2231024169428502, + "grad_norm": 0.523833692073822, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 1380 + }, + { + "epoch": 0.2247191011235955, + "grad_norm": 0.5528736710548401, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 1390 + }, + { + "epoch": 0.2263357853043408, + "grad_norm": 0.43515023589134216, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 1400 + }, + { + "epoch": 0.2279524694850861, + "grad_norm": 0.48809877038002014, + "learning_rate": 0.0002, + "loss": 0.8403, + "step": 1410 + }, + { + "epoch": 0.2295691536658314, + "grad_norm": 0.43591251969337463, + "learning_rate": 0.0002, + "loss": 0.8165, + "step": 1420 + }, + { + "epoch": 0.23118583784657668, + "grad_norm": 0.44625312089920044, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 1430 + }, + { + "epoch": 0.23280252202732196, + "grad_norm": 0.4390665292739868, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 1440 + }, + { + "epoch": 0.23441920620806725, + "grad_norm": 0.48496049642562866, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 1450 + }, + { + "epoch": 0.23603589038881254, + "grad_norm": 0.45919957756996155, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 1460 + }, + { + "epoch": 0.23765257456955785, + "grad_norm": 0.5471845865249634, + "learning_rate": 0.0002, + "loss": 0.8659, + "step": 1470 + }, + { + "epoch": 0.23926925875030314, + "grad_norm": 0.47269317507743835, + "learning_rate": 0.0002, + "loss": 0.8164, + "step": 1480 + }, + { + "epoch": 0.24088594293104842, + "grad_norm": 0.4930245578289032, + "learning_rate": 0.0002, + "loss": 0.854, + "step": 1490 + }, + { + "epoch": 0.2425026271117937, + "grad_norm": 0.5605630278587341, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 1500 + }, + { + "epoch": 0.244119311292539, + "grad_norm": 0.4435870945453644, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 1510 + }, + { + "epoch": 0.24573599547328429, + "grad_norm": 0.4941999912261963, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 1520 + }, + { + "epoch": 0.24735267965402957, + "grad_norm": 0.5100624561309814, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 1530 + }, + { + "epoch": 0.2489693638347749, + "grad_norm": 0.4638267457485199, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 1540 + }, + { + "epoch": 0.25058604801552015, + "grad_norm": 0.5071570873260498, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 1550 + }, + { + "epoch": 0.25220273219626543, + "grad_norm": 0.4291319251060486, + "learning_rate": 0.0002, + "loss": 0.7724, + "step": 1560 + }, + { + "epoch": 0.2538194163770108, + "grad_norm": 0.5388049483299255, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 1570 + }, + { + "epoch": 0.25543610055775606, + "grad_norm": 0.5083683729171753, + "learning_rate": 0.0002, + "loss": 0.8176, + "step": 1580 + }, + { + "epoch": 0.25705278473850135, + "grad_norm": 0.4824463725090027, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 1590 + }, + { + "epoch": 0.25866946891924664, + "grad_norm": 0.41177722811698914, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 1600 + }, + { + "epoch": 0.2602861530999919, + "grad_norm": 0.5656219124794006, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 1610 + }, + { + "epoch": 0.2619028372807372, + "grad_norm": 0.41063204407691956, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 1620 + }, + { + "epoch": 0.2635195214614825, + "grad_norm": 0.4897061288356781, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 1630 + }, + { + "epoch": 0.2651362056422278, + "grad_norm": 0.4454376697540283, + "learning_rate": 0.0002, + "loss": 0.8198, + "step": 1640 + }, + { + "epoch": 0.26675288982297307, + "grad_norm": 0.4355238378047943, + "learning_rate": 0.0002, + "loss": 0.8684, + "step": 1650 + }, + { + "epoch": 0.26836957400371836, + "grad_norm": 0.458310067653656, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 1660 + }, + { + "epoch": 0.26998625818446365, + "grad_norm": 0.4752083718776703, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 1670 + }, + { + "epoch": 0.27160294236520893, + "grad_norm": 0.4666106402873993, + "learning_rate": 0.0002, + "loss": 0.8267, + "step": 1680 + }, + { + "epoch": 0.2732196265459543, + "grad_norm": 0.4213818609714508, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 1690 + }, + { + "epoch": 0.27483631072669956, + "grad_norm": 0.5768913626670837, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 1700 + }, + { + "epoch": 0.27645299490744485, + "grad_norm": 0.4209914803504944, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 1710 + }, + { + "epoch": 0.27806967908819014, + "grad_norm": 0.501909613609314, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 1720 + }, + { + "epoch": 0.2796863632689354, + "grad_norm": 0.5266261100769043, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 1730 + }, + { + "epoch": 0.2813030474496807, + "grad_norm": 0.43806859850883484, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 1740 + }, + { + "epoch": 0.282919731630426, + "grad_norm": 0.46048814058303833, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 1750 + }, + { + "epoch": 0.2845364158111713, + "grad_norm": 0.44972819089889526, + "learning_rate": 0.0002, + "loss": 0.8596, + "step": 1760 + }, + { + "epoch": 0.28615309999191657, + "grad_norm": 0.5114831328392029, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 1770 + }, + { + "epoch": 0.28776978417266186, + "grad_norm": 0.47931742668151855, + "learning_rate": 0.0002, + "loss": 0.8361, + "step": 1780 + }, + { + "epoch": 0.28938646835340714, + "grad_norm": 0.5092599987983704, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 1790 + }, + { + "epoch": 0.29100315253415243, + "grad_norm": 0.37581443786621094, + "learning_rate": 0.0002, + "loss": 0.8506, + "step": 1800 + }, + { + "epoch": 0.2926198367148977, + "grad_norm": 0.47097381949424744, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 1810 + }, + { + "epoch": 0.29423652089564306, + "grad_norm": 0.48300236463546753, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 1820 + }, + { + "epoch": 0.29585320507638835, + "grad_norm": 0.5600419640541077, + "learning_rate": 0.0002, + "loss": 0.8391, + "step": 1830 + }, + { + "epoch": 0.29746988925713364, + "grad_norm": 0.48555272817611694, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 1840 + }, + { + "epoch": 0.2990865734378789, + "grad_norm": 0.3752668499946594, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 1850 + }, + { + "epoch": 0.3007032576186242, + "grad_norm": 0.5328747034072876, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 1860 + }, + { + "epoch": 0.3023199417993695, + "grad_norm": 0.48716455698013306, + "learning_rate": 0.0002, + "loss": 0.8426, + "step": 1870 + }, + { + "epoch": 0.3039366259801148, + "grad_norm": 0.5011493563652039, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 1880 + }, + { + "epoch": 0.30555331016086007, + "grad_norm": 0.46461427211761475, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 1890 + }, + { + "epoch": 0.30716999434160536, + "grad_norm": 0.36630210280418396, + "learning_rate": 0.0002, + "loss": 0.8478, + "step": 1900 + }, + { + "epoch": 0.30878667852235064, + "grad_norm": 0.4217296242713928, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 1910 + }, + { + "epoch": 0.31040336270309593, + "grad_norm": 0.4394875466823578, + "learning_rate": 0.0002, + "loss": 0.8128, + "step": 1920 + }, + { + "epoch": 0.3120200468838412, + "grad_norm": 0.6587965488433838, + "learning_rate": 0.0002, + "loss": 0.8471, + "step": 1930 + }, + { + "epoch": 0.31363673106458656, + "grad_norm": 0.5469298958778381, + "learning_rate": 0.0002, + "loss": 0.8565, + "step": 1940 + }, + { + "epoch": 0.31525341524533185, + "grad_norm": 0.4371595084667206, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 1950 + }, + { + "epoch": 0.31687009942607713, + "grad_norm": 0.4809541404247284, + "learning_rate": 0.0002, + "loss": 0.887, + "step": 1960 + }, + { + "epoch": 0.3184867836068224, + "grad_norm": 0.6061086654663086, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 1970 + }, + { + "epoch": 0.3201034677875677, + "grad_norm": 0.5342657566070557, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 1980 + }, + { + "epoch": 0.321720151968313, + "grad_norm": 0.5057743787765503, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 1990 + }, + { + "epoch": 0.3233368361490583, + "grad_norm": 0.528626024723053, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 2000 + }, + { + "epoch": 0.32495352032980357, + "grad_norm": 0.46742770075798035, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2010 + }, + { + "epoch": 0.32657020451054886, + "grad_norm": 0.515101432800293, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 2020 + }, + { + "epoch": 0.32818688869129414, + "grad_norm": 0.41941216588020325, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 2030 + }, + { + "epoch": 0.32980357287203943, + "grad_norm": 0.49902522563934326, + "learning_rate": 0.0002, + "loss": 0.7637, + "step": 2040 + }, + { + "epoch": 0.3314202570527847, + "grad_norm": 0.4120897650718689, + "learning_rate": 0.0002, + "loss": 0.7822, + "step": 2050 + }, + { + "epoch": 0.33303694123353, + "grad_norm": 0.45352041721343994, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 2060 + }, + { + "epoch": 0.33465362541427535, + "grad_norm": 0.523199737071991, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 2070 + }, + { + "epoch": 0.33627030959502063, + "grad_norm": 0.4390358626842499, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 2080 + }, + { + "epoch": 0.3378869937757659, + "grad_norm": 0.6752901077270508, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 2090 + }, + { + "epoch": 0.3395036779565112, + "grad_norm": 0.547821044921875, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 2100 + }, + { + "epoch": 0.3411203621372565, + "grad_norm": 0.5161308646202087, + "learning_rate": 0.0002, + "loss": 0.8561, + "step": 2110 + }, + { + "epoch": 0.3427370463180018, + "grad_norm": 0.4565401077270508, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 2120 + }, + { + "epoch": 0.34435373049874707, + "grad_norm": 0.4666115939617157, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 2130 + }, + { + "epoch": 0.34597041467949236, + "grad_norm": 0.4090428352355957, + "learning_rate": 0.0002, + "loss": 0.8189, + "step": 2140 + }, + { + "epoch": 0.34758709886023764, + "grad_norm": 0.510845422744751, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2150 + }, + { + "epoch": 0.34920378304098293, + "grad_norm": 0.42861923575401306, + "learning_rate": 0.0002, + "loss": 0.8398, + "step": 2160 + }, + { + "epoch": 0.3508204672217282, + "grad_norm": 0.4476332664489746, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 2170 + }, + { + "epoch": 0.3524371514024735, + "grad_norm": 0.6065791249275208, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2180 + }, + { + "epoch": 0.35405383558321885, + "grad_norm": 0.42335066199302673, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 2190 + }, + { + "epoch": 0.35567051976396413, + "grad_norm": 0.5094629526138306, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 2200 + }, + { + "epoch": 0.3572872039447094, + "grad_norm": 0.5476373434066772, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 2210 + }, + { + "epoch": 0.3589038881254547, + "grad_norm": 0.3911719024181366, + "learning_rate": 0.0002, + "loss": 0.814, + "step": 2220 + }, + { + "epoch": 0.3605205723062, + "grad_norm": 0.6599636077880859, + "learning_rate": 0.0002, + "loss": 0.8599, + "step": 2230 + }, + { + "epoch": 0.3621372564869453, + "grad_norm": 0.40381914377212524, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 2240 + }, + { + "epoch": 0.36375394066769057, + "grad_norm": 0.4433908462524414, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 2250 + }, + { + "epoch": 0.36537062484843585, + "grad_norm": 0.578326940536499, + "learning_rate": 0.0002, + "loss": 0.8503, + "step": 2260 + }, + { + "epoch": 0.36698730902918114, + "grad_norm": 0.5734784007072449, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2270 + }, + { + "epoch": 0.36860399320992643, + "grad_norm": 0.45555487275123596, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 2280 + }, + { + "epoch": 0.3702206773906717, + "grad_norm": 0.5666276216506958, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 2290 + }, + { + "epoch": 0.371837361571417, + "grad_norm": 0.5461117625236511, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 2300 + }, + { + "epoch": 0.3734540457521623, + "grad_norm": 0.6318911910057068, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 2310 + }, + { + "epoch": 0.37507072993290763, + "grad_norm": 0.493263304233551, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 2320 + }, + { + "epoch": 0.3766874141136529, + "grad_norm": 0.5888760089874268, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 2330 + }, + { + "epoch": 0.3783040982943982, + "grad_norm": 0.48671841621398926, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 2340 + }, + { + "epoch": 0.3799207824751435, + "grad_norm": 0.4385145306587219, + "learning_rate": 0.0002, + "loss": 0.8367, + "step": 2350 + }, + { + "epoch": 0.3815374666558888, + "grad_norm": 0.5523318648338318, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 2360 + }, + { + "epoch": 0.38315415083663407, + "grad_norm": 0.7308220267295837, + "learning_rate": 0.0002, + "loss": 0.8351, + "step": 2370 + }, + { + "epoch": 0.38477083501737935, + "grad_norm": 0.554214358329773, + "learning_rate": 0.0002, + "loss": 0.859, + "step": 2380 + }, + { + "epoch": 0.38638751919812464, + "grad_norm": 0.5425800085067749, + "learning_rate": 0.0002, + "loss": 0.8146, + "step": 2390 + }, + { + "epoch": 0.3880042033788699, + "grad_norm": 0.48811158537864685, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 2400 + }, + { + "epoch": 0.3896208875596152, + "grad_norm": 0.49212366342544556, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 2410 + }, + { + "epoch": 0.3912375717403605, + "grad_norm": 0.5222218632698059, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 2420 + }, + { + "epoch": 0.3928542559211058, + "grad_norm": 0.4699819087982178, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 2430 + }, + { + "epoch": 0.39447094010185113, + "grad_norm": 0.46153587102890015, + "learning_rate": 0.0002, + "loss": 0.7919, + "step": 2440 + }, + { + "epoch": 0.3960876242825964, + "grad_norm": 0.4150611162185669, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 2450 + }, + { + "epoch": 0.3977043084633417, + "grad_norm": 0.5799614787101746, + "learning_rate": 0.0002, + "loss": 0.8589, + "step": 2460 + }, + { + "epoch": 0.399320992644087, + "grad_norm": 0.56536865234375, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 2470 + }, + { + "epoch": 0.4009376768248323, + "grad_norm": 0.5451247096061707, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 2480 + }, + { + "epoch": 0.40255436100557757, + "grad_norm": 0.5914521217346191, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 2490 + }, + { + "epoch": 0.40417104518632285, + "grad_norm": 0.4428117275238037, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 2500 + }, + { + "epoch": 0.40578772936706814, + "grad_norm": 0.48580947518348694, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 2510 + }, + { + "epoch": 0.4074044135478134, + "grad_norm": 0.436734676361084, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 2520 + }, + { + "epoch": 0.4090210977285587, + "grad_norm": 0.5752223134040833, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 2530 + }, + { + "epoch": 0.410637781909304, + "grad_norm": 0.4271308183670044, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2540 + }, + { + "epoch": 0.4122544660900493, + "grad_norm": 0.46294718980789185, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 2550 + }, + { + "epoch": 0.4138711502707946, + "grad_norm": 0.49407583475112915, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 2560 + }, + { + "epoch": 0.4154878344515399, + "grad_norm": 0.4729035496711731, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 2570 + }, + { + "epoch": 0.4171045186322852, + "grad_norm": 0.4129747152328491, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 2580 + }, + { + "epoch": 0.4187212028130305, + "grad_norm": 0.5684236288070679, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 2590 + }, + { + "epoch": 0.4203378869937758, + "grad_norm": 0.4862157106399536, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 2600 + }, + { + "epoch": 0.42195457117452106, + "grad_norm": 0.46567976474761963, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 2610 + }, + { + "epoch": 0.42357125535526635, + "grad_norm": 0.5710650682449341, + "learning_rate": 0.0002, + "loss": 0.7755, + "step": 2620 + }, + { + "epoch": 0.42518793953601164, + "grad_norm": 0.5660041570663452, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 2630 + }, + { + "epoch": 0.4268046237167569, + "grad_norm": 0.47944375872612, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 2640 + }, + { + "epoch": 0.4284213078975022, + "grad_norm": 0.537223756313324, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 2650 + }, + { + "epoch": 0.4300379920782475, + "grad_norm": 0.41669997572898865, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 2660 + }, + { + "epoch": 0.4316546762589928, + "grad_norm": 0.44727686047554016, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 2670 + }, + { + "epoch": 0.4332713604397381, + "grad_norm": 0.5600888729095459, + "learning_rate": 0.0002, + "loss": 0.8241, + "step": 2680 + }, + { + "epoch": 0.4348880446204834, + "grad_norm": 0.39820605516433716, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 2690 + }, + { + "epoch": 0.4365047288012287, + "grad_norm": 0.5637655854225159, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 2700 + }, + { + "epoch": 0.438121412981974, + "grad_norm": 0.6363666653633118, + "learning_rate": 0.0002, + "loss": 0.855, + "step": 2710 + }, + { + "epoch": 0.4397380971627193, + "grad_norm": 0.5656129121780396, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 2720 + }, + { + "epoch": 0.44135478134346456, + "grad_norm": 0.5600156188011169, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2730 + }, + { + "epoch": 0.44297146552420985, + "grad_norm": 0.5506579875946045, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 2740 + }, + { + "epoch": 0.44458814970495514, + "grad_norm": 0.49878305196762085, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 2750 + }, + { + "epoch": 0.4462048338857004, + "grad_norm": 0.4569213092327118, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 2760 + }, + { + "epoch": 0.4478215180664457, + "grad_norm": 0.6056680083274841, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 2770 + }, + { + "epoch": 0.449438202247191, + "grad_norm": 0.44474557042121887, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 2780 + }, + { + "epoch": 0.4510548864279363, + "grad_norm": 0.46055394411087036, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 2790 + }, + { + "epoch": 0.4526715706086816, + "grad_norm": 0.4904133379459381, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 2800 + }, + { + "epoch": 0.45428825478942686, + "grad_norm": 0.5647031664848328, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 2810 + }, + { + "epoch": 0.4559049389701722, + "grad_norm": 0.5759473443031311, + "learning_rate": 0.0002, + "loss": 0.8622, + "step": 2820 + }, + { + "epoch": 0.4575216231509175, + "grad_norm": 0.5161895751953125, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 2830 + }, + { + "epoch": 0.4591383073316628, + "grad_norm": 0.4248254597187042, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 2840 + }, + { + "epoch": 0.46075499151240806, + "grad_norm": 0.45395001769065857, + "learning_rate": 0.0002, + "loss": 0.7838, + "step": 2850 + }, + { + "epoch": 0.46237167569315335, + "grad_norm": 0.5358697772026062, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2860 + }, + { + "epoch": 0.46398835987389864, + "grad_norm": 0.5379165410995483, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 2870 + }, + { + "epoch": 0.4656050440546439, + "grad_norm": 0.4601989686489105, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 2880 + }, + { + "epoch": 0.4672217282353892, + "grad_norm": 0.671115517616272, + "learning_rate": 0.0002, + "loss": 0.8523, + "step": 2890 + }, + { + "epoch": 0.4688384124161345, + "grad_norm": 0.4425133168697357, + "learning_rate": 0.0002, + "loss": 0.8262, + "step": 2900 + }, + { + "epoch": 0.4704550965968798, + "grad_norm": 0.5446155071258545, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2910 + }, + { + "epoch": 0.47207178077762507, + "grad_norm": 0.603306233882904, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 2920 + }, + { + "epoch": 0.47368846495837036, + "grad_norm": 0.5377997159957886, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 2930 + }, + { + "epoch": 0.4753051491391157, + "grad_norm": 0.4931027591228485, + "learning_rate": 0.0002, + "loss": 0.8075, + "step": 2940 + }, + { + "epoch": 0.476921833319861, + "grad_norm": 0.4711960256099701, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 2950 + }, + { + "epoch": 0.4785385175006063, + "grad_norm": 0.5020492672920227, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 2960 + }, + { + "epoch": 0.48015520168135156, + "grad_norm": 0.5428946614265442, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 2970 + }, + { + "epoch": 0.48177188586209685, + "grad_norm": 0.5294089317321777, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 2980 + }, + { + "epoch": 0.48338857004284214, + "grad_norm": 0.648289144039154, + "learning_rate": 0.0002, + "loss": 0.8553, + "step": 2990 + }, + { + "epoch": 0.4850052542235874, + "grad_norm": 0.47916680574417114, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 3000 + }, + { + "epoch": 0.4866219384043327, + "grad_norm": 0.43849772214889526, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 3010 + }, + { + "epoch": 0.488238622585078, + "grad_norm": 0.47007861733436584, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 3020 + }, + { + "epoch": 0.4898553067658233, + "grad_norm": 0.6314331293106079, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 3030 + }, + { + "epoch": 0.49147199094656857, + "grad_norm": 0.49211493134498596, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 3040 + }, + { + "epoch": 0.49308867512731386, + "grad_norm": 0.4537973403930664, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 3050 + }, + { + "epoch": 0.49470535930805914, + "grad_norm": 0.47326919436454773, + "learning_rate": 0.0002, + "loss": 0.8095, + "step": 3060 + }, + { + "epoch": 0.4963220434888045, + "grad_norm": 0.525874137878418, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 3070 + }, + { + "epoch": 0.4979387276695498, + "grad_norm": 0.6361091732978821, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 3080 + }, + { + "epoch": 0.49955541185029506, + "grad_norm": 0.5850642919540405, + "learning_rate": 0.0002, + "loss": 0.821, + "step": 3090 + }, + { + "epoch": 0.5011720960310403, + "grad_norm": 0.47299543023109436, + "learning_rate": 0.0002, + "loss": 0.8279, + "step": 3100 + }, + { + "epoch": 0.5027887802117856, + "grad_norm": 0.473099946975708, + "learning_rate": 0.0002, + "loss": 0.8681, + "step": 3110 + }, + { + "epoch": 0.5044054643925309, + "grad_norm": 0.48186397552490234, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 3120 + }, + { + "epoch": 0.5060221485732762, + "grad_norm": 0.5015401840209961, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 3130 + }, + { + "epoch": 0.5076388327540216, + "grad_norm": 0.5617750287055969, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 3140 + }, + { + "epoch": 0.5092555169347668, + "grad_norm": 0.5169327259063721, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 3150 + }, + { + "epoch": 0.5108722011155121, + "grad_norm": 0.545657753944397, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 3160 + }, + { + "epoch": 0.5124888852962574, + "grad_norm": 0.512864351272583, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 3170 + }, + { + "epoch": 0.5141055694770027, + "grad_norm": 0.4113546311855316, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 3180 + }, + { + "epoch": 0.5157222536577479, + "grad_norm": 0.44532445073127747, + "learning_rate": 0.0002, + "loss": 0.8206, + "step": 3190 + }, + { + "epoch": 0.5173389378384933, + "grad_norm": 0.5623497366905212, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 3200 + }, + { + "epoch": 0.5189556220192385, + "grad_norm": 0.5084741115570068, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 3210 + }, + { + "epoch": 0.5205723061999838, + "grad_norm": 0.5305403470993042, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 3220 + }, + { + "epoch": 0.5221889903807291, + "grad_norm": 0.4708254337310791, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 3230 + }, + { + "epoch": 0.5238056745614744, + "grad_norm": 0.43827131390571594, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 3240 + }, + { + "epoch": 0.5254223587422197, + "grad_norm": 0.5630002617835999, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 3250 + }, + { + "epoch": 0.527039042922965, + "grad_norm": 0.5010961890220642, + "learning_rate": 0.0002, + "loss": 0.7522, + "step": 3260 + }, + { + "epoch": 0.5286557271037103, + "grad_norm": 0.6303122043609619, + "learning_rate": 0.0002, + "loss": 0.8374, + "step": 3270 + }, + { + "epoch": 0.5302724112844556, + "grad_norm": 0.5107331275939941, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 3280 + }, + { + "epoch": 0.5318890954652009, + "grad_norm": 0.5700443387031555, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 3290 + }, + { + "epoch": 0.5335057796459461, + "grad_norm": 0.46296367049217224, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 3300 + }, + { + "epoch": 0.5351224638266915, + "grad_norm": 0.531568706035614, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 3310 + }, + { + "epoch": 0.5367391480074367, + "grad_norm": 0.4686741530895233, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 3320 + }, + { + "epoch": 0.5383558321881821, + "grad_norm": 0.5404331088066101, + "learning_rate": 0.0002, + "loss": 0.8104, + "step": 3330 + }, + { + "epoch": 0.5399725163689273, + "grad_norm": 0.6368790864944458, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 3340 + }, + { + "epoch": 0.5415892005496726, + "grad_norm": 0.42300888895988464, + "learning_rate": 0.0002, + "loss": 0.8514, + "step": 3350 + }, + { + "epoch": 0.5432058847304179, + "grad_norm": 0.5362542867660522, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 3360 + }, + { + "epoch": 0.5448225689111632, + "grad_norm": 0.497128963470459, + "learning_rate": 0.0002, + "loss": 0.858, + "step": 3370 + }, + { + "epoch": 0.5464392530919085, + "grad_norm": 0.5006386041641235, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 3380 + }, + { + "epoch": 0.5480559372726538, + "grad_norm": 0.44136837124824524, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 3390 + }, + { + "epoch": 0.5496726214533991, + "grad_norm": 0.5897833108901978, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 3400 + }, + { + "epoch": 0.5512893056341444, + "grad_norm": 0.641075611114502, + "learning_rate": 0.0002, + "loss": 0.8895, + "step": 3410 + }, + { + "epoch": 0.5529059898148897, + "grad_norm": 0.7251322269439697, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 3420 + }, + { + "epoch": 0.5545226739956349, + "grad_norm": 0.47411349415779114, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 3430 + }, + { + "epoch": 0.5561393581763803, + "grad_norm": 0.4994310438632965, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 3440 + }, + { + "epoch": 0.5577560423571255, + "grad_norm": 0.5814438462257385, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 3450 + }, + { + "epoch": 0.5593727265378708, + "grad_norm": 0.6278898119926453, + "learning_rate": 0.0002, + "loss": 0.8805, + "step": 3460 + }, + { + "epoch": 0.5609894107186161, + "grad_norm": 0.46208274364471436, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 3470 + }, + { + "epoch": 0.5626060948993614, + "grad_norm": 0.5718930959701538, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 3480 + }, + { + "epoch": 0.5642227790801067, + "grad_norm": 0.48178744316101074, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 3490 + }, + { + "epoch": 0.565839463260852, + "grad_norm": 0.47336965799331665, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3500 + }, + { + "epoch": 0.5674561474415973, + "grad_norm": 0.43442684412002563, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 3510 + }, + { + "epoch": 0.5690728316223426, + "grad_norm": 0.6463358998298645, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 3520 + }, + { + "epoch": 0.5706895158030879, + "grad_norm": 0.5286486744880676, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 3530 + }, + { + "epoch": 0.5723061999838331, + "grad_norm": 0.5405499935150146, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 3540 + }, + { + "epoch": 0.5739228841645785, + "grad_norm": 0.6654391884803772, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 3550 + }, + { + "epoch": 0.5755395683453237, + "grad_norm": 0.5081980228424072, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 3560 + }, + { + "epoch": 0.5771562525260691, + "grad_norm": 0.48978179693222046, + "learning_rate": 0.0002, + "loss": 0.7753, + "step": 3570 + }, + { + "epoch": 0.5787729367068143, + "grad_norm": 0.5840612053871155, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3580 + }, + { + "epoch": 0.5803896208875596, + "grad_norm": 0.5235261917114258, + "learning_rate": 0.0002, + "loss": 0.8937, + "step": 3590 + }, + { + "epoch": 0.5820063050683049, + "grad_norm": 0.5672075748443604, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 3600 + }, + { + "epoch": 0.5836229892490502, + "grad_norm": 0.5613429546356201, + "learning_rate": 0.0002, + "loss": 0.8347, + "step": 3610 + }, + { + "epoch": 0.5852396734297954, + "grad_norm": 0.4032273590564728, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 3620 + }, + { + "epoch": 0.5868563576105408, + "grad_norm": 0.49559324979782104, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 3630 + }, + { + "epoch": 0.5884730417912861, + "grad_norm": 0.6895697712898254, + "learning_rate": 0.0002, + "loss": 0.8332, + "step": 3640 + }, + { + "epoch": 0.5900897259720314, + "grad_norm": 0.4750136435031891, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 3650 + }, + { + "epoch": 0.5917064101527767, + "grad_norm": 0.5176819562911987, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 3660 + }, + { + "epoch": 0.5933230943335219, + "grad_norm": 0.5817760229110718, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3670 + }, + { + "epoch": 0.5949397785142673, + "grad_norm": 0.6064626574516296, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 3680 + }, + { + "epoch": 0.5965564626950125, + "grad_norm": 0.6728700995445251, + "learning_rate": 0.0002, + "loss": 0.8422, + "step": 3690 + }, + { + "epoch": 0.5981731468757578, + "grad_norm": 0.609305202960968, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 3700 + }, + { + "epoch": 0.5997898310565031, + "grad_norm": 0.4615488350391388, + "learning_rate": 0.0002, + "loss": 0.8048, + "step": 3710 + }, + { + "epoch": 0.6014065152372484, + "grad_norm": 2.0531179904937744, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 3720 + }, + { + "epoch": 0.6030231994179936, + "grad_norm": 0.5091132521629333, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 3730 + }, + { + "epoch": 0.604639883598739, + "grad_norm": 0.5951124429702759, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 3740 + }, + { + "epoch": 0.6062565677794842, + "grad_norm": 0.5870208144187927, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 3750 + }, + { + "epoch": 0.6078732519602296, + "grad_norm": 0.6254619359970093, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3760 + }, + { + "epoch": 0.6094899361409749, + "grad_norm": 0.5577626824378967, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 3770 + }, + { + "epoch": 0.6111066203217201, + "grad_norm": 0.5004405379295349, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 3780 + }, + { + "epoch": 0.6127233045024655, + "grad_norm": 0.5527383685112, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 3790 + }, + { + "epoch": 0.6143399886832107, + "grad_norm": 0.49116113781929016, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 3800 + }, + { + "epoch": 0.6159566728639561, + "grad_norm": 0.5299299359321594, + "learning_rate": 0.0002, + "loss": 0.8352, + "step": 3810 + }, + { + "epoch": 0.6175733570447013, + "grad_norm": 0.464897483587265, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 3820 + }, + { + "epoch": 0.6191900412254466, + "grad_norm": 0.6505740880966187, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 3830 + }, + { + "epoch": 0.6208067254061919, + "grad_norm": 0.5512559413909912, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 3840 + }, + { + "epoch": 0.6224234095869372, + "grad_norm": 0.49427518248558044, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 3850 + }, + { + "epoch": 0.6240400937676824, + "grad_norm": 0.3839147090911865, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 3860 + }, + { + "epoch": 0.6256567779484278, + "grad_norm": 0.5760218501091003, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3870 + }, + { + "epoch": 0.6272734621291731, + "grad_norm": 0.7226507067680359, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 3880 + }, + { + "epoch": 0.6288901463099184, + "grad_norm": 0.676781415939331, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3890 + }, + { + "epoch": 0.6305068304906637, + "grad_norm": 0.4284018278121948, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 3900 + }, + { + "epoch": 0.6321235146714089, + "grad_norm": 0.5060628056526184, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 3910 + }, + { + "epoch": 0.6337401988521543, + "grad_norm": 0.5524522066116333, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 3920 + }, + { + "epoch": 0.6353568830328995, + "grad_norm": 0.6099881529808044, + "learning_rate": 0.0002, + "loss": 0.8276, + "step": 3930 + }, + { + "epoch": 0.6369735672136448, + "grad_norm": 0.43155938386917114, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 3940 + }, + { + "epoch": 0.6385902513943901, + "grad_norm": 0.6427084803581238, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 3950 + }, + { + "epoch": 0.6402069355751354, + "grad_norm": 0.541220486164093, + "learning_rate": 0.0002, + "loss": 0.8368, + "step": 3960 + }, + { + "epoch": 0.6418236197558806, + "grad_norm": 0.5414294600486755, + "learning_rate": 0.0002, + "loss": 0.8539, + "step": 3970 + }, + { + "epoch": 0.643440303936626, + "grad_norm": 0.46344003081321716, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 3980 + }, + { + "epoch": 0.6450569881173712, + "grad_norm": 0.45209285616874695, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 3990 + }, + { + "epoch": 0.6466736722981166, + "grad_norm": 0.5417284369468689, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 4000 + }, + { + "epoch": 0.6482903564788619, + "grad_norm": 0.7995685935020447, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 4010 + }, + { + "epoch": 0.6499070406596071, + "grad_norm": 0.6384002566337585, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 4020 + }, + { + "epoch": 0.6515237248403525, + "grad_norm": 0.4472815692424774, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 4030 + }, + { + "epoch": 0.6531404090210977, + "grad_norm": 0.6834294199943542, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 4040 + }, + { + "epoch": 0.654757093201843, + "grad_norm": 0.4612339735031128, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 4050 + }, + { + "epoch": 0.6563737773825883, + "grad_norm": 0.9266576170921326, + "learning_rate": 0.0002, + "loss": 0.8141, + "step": 4060 + }, + { + "epoch": 0.6579904615633336, + "grad_norm": 0.4470861852169037, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 4070 + }, + { + "epoch": 0.6596071457440789, + "grad_norm": 0.45544925332069397, + "learning_rate": 0.0002, + "loss": 0.8293, + "step": 4080 + }, + { + "epoch": 0.6612238299248242, + "grad_norm": 0.6144481301307678, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 4090 + }, + { + "epoch": 0.6628405141055694, + "grad_norm": 0.5936288237571716, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 4100 + }, + { + "epoch": 0.6644571982863148, + "grad_norm": 0.4822963774204254, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 4110 + }, + { + "epoch": 0.66607388246706, + "grad_norm": 0.48432496190071106, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 4120 + }, + { + "epoch": 0.6676905666478054, + "grad_norm": 0.4901607930660248, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 4130 + }, + { + "epoch": 0.6693072508285507, + "grad_norm": 0.5018393397331238, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 4140 + }, + { + "epoch": 0.6709239350092959, + "grad_norm": 0.6946378946304321, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 4150 + }, + { + "epoch": 0.6725406191900413, + "grad_norm": 0.5997390747070312, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 4160 + }, + { + "epoch": 0.6741573033707865, + "grad_norm": 0.6738849878311157, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 4170 + }, + { + "epoch": 0.6757739875515318, + "grad_norm": 0.6110581159591675, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 4180 + }, + { + "epoch": 0.6773906717322771, + "grad_norm": 0.5703322291374207, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 4190 + }, + { + "epoch": 0.6790073559130224, + "grad_norm": 0.4686066210269928, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 4200 + }, + { + "epoch": 0.6806240400937676, + "grad_norm": 0.6394643783569336, + "learning_rate": 0.0002, + "loss": 0.8441, + "step": 4210 + }, + { + "epoch": 0.682240724274513, + "grad_norm": 0.5454841256141663, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 4220 + }, + { + "epoch": 0.6838574084552582, + "grad_norm": 0.4859732985496521, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 4230 + }, + { + "epoch": 0.6854740926360036, + "grad_norm": 0.5544065833091736, + "learning_rate": 0.0002, + "loss": 0.8161, + "step": 4240 + }, + { + "epoch": 0.6870907768167488, + "grad_norm": 0.4902505576610565, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 4250 + }, + { + "epoch": 0.6887074609974941, + "grad_norm": 0.4768051505088806, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 4260 + }, + { + "epoch": 0.6903241451782395, + "grad_norm": 0.49982190132141113, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 4270 + }, + { + "epoch": 0.6919408293589847, + "grad_norm": 0.6351838111877441, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 4280 + }, + { + "epoch": 0.69355751353973, + "grad_norm": 0.5647561550140381, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 4290 + }, + { + "epoch": 0.6951741977204753, + "grad_norm": 0.5340486764907837, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 4300 + }, + { + "epoch": 0.6967908819012206, + "grad_norm": 0.5649092793464661, + "learning_rate": 0.0002, + "loss": 0.8526, + "step": 4310 + }, + { + "epoch": 0.6984075660819659, + "grad_norm": 0.6183916926383972, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 4320 + }, + { + "epoch": 0.7000242502627112, + "grad_norm": 0.6154509782791138, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 4330 + }, + { + "epoch": 0.7016409344434564, + "grad_norm": 0.5156264305114746, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 4340 + }, + { + "epoch": 0.7032576186242018, + "grad_norm": 0.562171459197998, + "learning_rate": 0.0002, + "loss": 0.8512, + "step": 4350 + }, + { + "epoch": 0.704874302804947, + "grad_norm": 0.4949502646923065, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 4360 + }, + { + "epoch": 0.7064909869856923, + "grad_norm": 0.5171684622764587, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 4370 + }, + { + "epoch": 0.7081076711664377, + "grad_norm": 0.6198443174362183, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 4380 + }, + { + "epoch": 0.7097243553471829, + "grad_norm": 0.5802276134490967, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 4390 + }, + { + "epoch": 0.7113410395279283, + "grad_norm": 0.41096967458724976, + "learning_rate": 0.0002, + "loss": 0.8797, + "step": 4400 + }, + { + "epoch": 0.7129577237086735, + "grad_norm": 0.4397392272949219, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 4410 + }, + { + "epoch": 0.7145744078894188, + "grad_norm": 0.45228442549705505, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 4420 + }, + { + "epoch": 0.7161910920701641, + "grad_norm": 0.4839673936367035, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 4430 + }, + { + "epoch": 0.7178077762509094, + "grad_norm": 0.6140755414962769, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4440 + }, + { + "epoch": 0.7194244604316546, + "grad_norm": 0.6841378808021545, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 4450 + }, + { + "epoch": 0.7210411446124, + "grad_norm": 0.6664239168167114, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 4460 + }, + { + "epoch": 0.7226578287931452, + "grad_norm": 0.47552719712257385, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 4470 + }, + { + "epoch": 0.7242745129738906, + "grad_norm": 0.6649776101112366, + "learning_rate": 0.0002, + "loss": 0.8982, + "step": 4480 + }, + { + "epoch": 0.7258911971546358, + "grad_norm": 0.5159541964530945, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 4490 + }, + { + "epoch": 0.7275078813353811, + "grad_norm": 0.6693112850189209, + "learning_rate": 0.0002, + "loss": 0.7786, + "step": 4500 + }, + { + "epoch": 0.7291245655161265, + "grad_norm": 0.48870977759361267, + "learning_rate": 0.0002, + "loss": 0.8655, + "step": 4510 + }, + { + "epoch": 0.7307412496968717, + "grad_norm": 0.4857887923717499, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 4520 + }, + { + "epoch": 0.732357933877617, + "grad_norm": 0.5515662431716919, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 4530 + }, + { + "epoch": 0.7339746180583623, + "grad_norm": 0.6292222738265991, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 4540 + }, + { + "epoch": 0.7355913022391076, + "grad_norm": 0.48265689611434937, + "learning_rate": 0.0002, + "loss": 0.7749, + "step": 4550 + }, + { + "epoch": 0.7372079864198529, + "grad_norm": 0.8044266104698181, + "learning_rate": 0.0002, + "loss": 0.8499, + "step": 4560 + }, + { + "epoch": 0.7388246706005982, + "grad_norm": 0.6111769676208496, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 4570 + }, + { + "epoch": 0.7404413547813434, + "grad_norm": 0.5229553580284119, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 4580 + }, + { + "epoch": 0.7420580389620888, + "grad_norm": 0.6054152250289917, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 4590 + }, + { + "epoch": 0.743674723142834, + "grad_norm": 0.5574966669082642, + "learning_rate": 0.0002, + "loss": 0.8169, + "step": 4600 + }, + { + "epoch": 0.7452914073235793, + "grad_norm": 0.5395817160606384, + "learning_rate": 0.0002, + "loss": 0.8439, + "step": 4610 + }, + { + "epoch": 0.7469080915043246, + "grad_norm": 0.7116472721099854, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 4620 + }, + { + "epoch": 0.7485247756850699, + "grad_norm": 0.5618700981140137, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 4630 + }, + { + "epoch": 0.7501414598658153, + "grad_norm": 0.5802770853042603, + "learning_rate": 0.0002, + "loss": 0.7744, + "step": 4640 + }, + { + "epoch": 0.7517581440465605, + "grad_norm": 0.5690428018569946, + "learning_rate": 0.0002, + "loss": 0.7924, + "step": 4650 + }, + { + "epoch": 0.7533748282273058, + "grad_norm": 0.4813360273838043, + "learning_rate": 0.0002, + "loss": 0.8017, + "step": 4660 + }, + { + "epoch": 0.7549915124080511, + "grad_norm": 0.5434042811393738, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 4670 + }, + { + "epoch": 0.7566081965887964, + "grad_norm": 0.5502099990844727, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 4680 + }, + { + "epoch": 0.7582248807695416, + "grad_norm": 0.6020621061325073, + "learning_rate": 0.0002, + "loss": 0.8598, + "step": 4690 + }, + { + "epoch": 0.759841564950287, + "grad_norm": 0.4922301471233368, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 4700 + }, + { + "epoch": 0.7614582491310322, + "grad_norm": 0.6492828726768494, + "learning_rate": 0.0002, + "loss": 0.788, + "step": 4710 + }, + { + "epoch": 0.7630749333117776, + "grad_norm": 0.4865580201148987, + "learning_rate": 0.0002, + "loss": 0.8313, + "step": 4720 + }, + { + "epoch": 0.7646916174925228, + "grad_norm": 0.5971422791481018, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 4730 + }, + { + "epoch": 0.7663083016732681, + "grad_norm": 0.6832674145698547, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 4740 + }, + { + "epoch": 0.7679249858540134, + "grad_norm": 0.500908613204956, + "learning_rate": 0.0002, + "loss": 0.8156, + "step": 4750 + }, + { + "epoch": 0.7695416700347587, + "grad_norm": 0.6112465858459473, + "learning_rate": 0.0002, + "loss": 0.8383, + "step": 4760 + }, + { + "epoch": 0.771158354215504, + "grad_norm": 0.5753506422042847, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 4770 + }, + { + "epoch": 0.7727750383962493, + "grad_norm": 0.6529405117034912, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 4780 + }, + { + "epoch": 0.7743917225769946, + "grad_norm": 0.5916843414306641, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 4790 + }, + { + "epoch": 0.7760084067577399, + "grad_norm": 0.4821224510669708, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 4800 + }, + { + "epoch": 0.7776250909384852, + "grad_norm": 0.5532580018043518, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 4810 + }, + { + "epoch": 0.7792417751192304, + "grad_norm": 0.4604877233505249, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 4820 + }, + { + "epoch": 0.7808584592999758, + "grad_norm": 0.5009613037109375, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 4830 + }, + { + "epoch": 0.782475143480721, + "grad_norm": 0.6448560357093811, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4840 + }, + { + "epoch": 0.7840918276614663, + "grad_norm": 0.44327953457832336, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 4850 + }, + { + "epoch": 0.7857085118422116, + "grad_norm": 0.5355411171913147, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 4860 + }, + { + "epoch": 0.7873251960229569, + "grad_norm": 0.5635677576065063, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 4870 + }, + { + "epoch": 0.7889418802037023, + "grad_norm": 0.5417491793632507, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 4880 + }, + { + "epoch": 0.7905585643844475, + "grad_norm": 0.4567430913448334, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 4890 + }, + { + "epoch": 0.7921752485651928, + "grad_norm": 0.44651296734809875, + "learning_rate": 0.0002, + "loss": 0.8454, + "step": 4900 + }, + { + "epoch": 0.7937919327459381, + "grad_norm": 0.5741217136383057, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 4910 + }, + { + "epoch": 0.7954086169266834, + "grad_norm": 0.6605045199394226, + "learning_rate": 0.0002, + "loss": 0.8093, + "step": 4920 + }, + { + "epoch": 0.7970253011074286, + "grad_norm": 0.5126531720161438, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 4930 + }, + { + "epoch": 0.798641985288174, + "grad_norm": 0.513648271560669, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 4940 + }, + { + "epoch": 0.8002586694689192, + "grad_norm": 0.5350404381752014, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 4950 + }, + { + "epoch": 0.8018753536496646, + "grad_norm": 0.5731674432754517, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 4960 + }, + { + "epoch": 0.8034920378304098, + "grad_norm": 0.5974258184432983, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 4970 + }, + { + "epoch": 0.8051087220111551, + "grad_norm": 0.8774799704551697, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 4980 + }, + { + "epoch": 0.8067254061919004, + "grad_norm": 0.5994430184364319, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 4990 + }, + { + "epoch": 0.8083420903726457, + "grad_norm": 0.4894903004169464, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 5000 + }, + { + "epoch": 0.809958774553391, + "grad_norm": 0.5218459367752075, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5010 + }, + { + "epoch": 0.8115754587341363, + "grad_norm": 0.5232468843460083, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 5020 + }, + { + "epoch": 0.8131921429148816, + "grad_norm": 0.44358372688293457, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 5030 + }, + { + "epoch": 0.8148088270956269, + "grad_norm": 0.6202037334442139, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 5040 + }, + { + "epoch": 0.8164255112763722, + "grad_norm": 0.7721474170684814, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 5050 + }, + { + "epoch": 0.8180421954571174, + "grad_norm": 0.5568501353263855, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 5060 + }, + { + "epoch": 0.8196588796378628, + "grad_norm": 0.49148809909820557, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 5070 + }, + { + "epoch": 0.821275563818608, + "grad_norm": 0.4956012964248657, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 5080 + }, + { + "epoch": 0.8228922479993533, + "grad_norm": 0.6078833937644958, + "learning_rate": 0.0002, + "loss": 0.8201, + "step": 5090 + }, + { + "epoch": 0.8245089321800986, + "grad_norm": 0.46906954050064087, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 5100 + }, + { + "epoch": 0.8261256163608439, + "grad_norm": 0.50812166929245, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 5110 + }, + { + "epoch": 0.8277423005415891, + "grad_norm": 0.5319661498069763, + "learning_rate": 0.0002, + "loss": 0.8243, + "step": 5120 + }, + { + "epoch": 0.8293589847223345, + "grad_norm": 0.4949689209461212, + "learning_rate": 0.0002, + "loss": 0.7798, + "step": 5130 + }, + { + "epoch": 0.8309756689030798, + "grad_norm": 0.5151591300964355, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 5140 + }, + { + "epoch": 0.8325923530838251, + "grad_norm": 0.5530214309692383, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 5150 + }, + { + "epoch": 0.8342090372645704, + "grad_norm": 0.6297410130500793, + "learning_rate": 0.0002, + "loss": 0.8251, + "step": 5160 + }, + { + "epoch": 0.8358257214453156, + "grad_norm": 0.5466840267181396, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 5170 + }, + { + "epoch": 0.837442405626061, + "grad_norm": 0.652913510799408, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 5180 + }, + { + "epoch": 0.8390590898068062, + "grad_norm": 0.5811293125152588, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 5190 + }, + { + "epoch": 0.8406757739875516, + "grad_norm": 0.5109550952911377, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 5200 + }, + { + "epoch": 0.8422924581682968, + "grad_norm": 0.4551706612110138, + "learning_rate": 0.0002, + "loss": 0.8077, + "step": 5210 + }, + { + "epoch": 0.8439091423490421, + "grad_norm": 0.5813754200935364, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5220 + }, + { + "epoch": 0.8455258265297874, + "grad_norm": 0.5856947898864746, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 5230 + }, + { + "epoch": 0.8471425107105327, + "grad_norm": 0.5482739210128784, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 5240 + }, + { + "epoch": 0.8487591948912779, + "grad_norm": 0.49023720622062683, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 5250 + }, + { + "epoch": 0.8503758790720233, + "grad_norm": 0.49472475051879883, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5260 + }, + { + "epoch": 0.8519925632527686, + "grad_norm": 0.5490226745605469, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 5270 + }, + { + "epoch": 0.8536092474335139, + "grad_norm": 0.5340665578842163, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 5280 + }, + { + "epoch": 0.8552259316142592, + "grad_norm": 0.5962483882904053, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 5290 + }, + { + "epoch": 0.8568426157950044, + "grad_norm": 0.586358368396759, + "learning_rate": 0.0002, + "loss": 0.8765, + "step": 5300 + }, + { + "epoch": 0.8584592999757498, + "grad_norm": 0.49120277166366577, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 5310 + }, + { + "epoch": 0.860075984156495, + "grad_norm": 0.5887332558631897, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 5320 + }, + { + "epoch": 0.8616926683372403, + "grad_norm": 0.42496153712272644, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 5330 + }, + { + "epoch": 0.8633093525179856, + "grad_norm": 0.5489874482154846, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 5340 + }, + { + "epoch": 0.8649260366987309, + "grad_norm": 0.5850813984870911, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 5350 + }, + { + "epoch": 0.8665427208794761, + "grad_norm": 0.517487108707428, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 5360 + }, + { + "epoch": 0.8681594050602215, + "grad_norm": 0.5339142680168152, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 5370 + }, + { + "epoch": 0.8697760892409668, + "grad_norm": 0.6236387491226196, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 5380 + }, + { + "epoch": 0.8713927734217121, + "grad_norm": 0.5752192735671997, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 5390 + }, + { + "epoch": 0.8730094576024574, + "grad_norm": 0.6724614500999451, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 5400 + }, + { + "epoch": 0.8746261417832026, + "grad_norm": 0.5280613303184509, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 5410 + }, + { + "epoch": 0.876242825963948, + "grad_norm": 0.44033288955688477, + "learning_rate": 0.0002, + "loss": 0.8231, + "step": 5420 + }, + { + "epoch": 0.8778595101446932, + "grad_norm": 0.5199708342552185, + "learning_rate": 0.0002, + "loss": 0.8839, + "step": 5430 + }, + { + "epoch": 0.8794761943254386, + "grad_norm": 0.46778348088264465, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 5440 + }, + { + "epoch": 0.8810928785061838, + "grad_norm": 0.4657754898071289, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 5450 + }, + { + "epoch": 0.8827095626869291, + "grad_norm": 0.5472902655601501, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 5460 + }, + { + "epoch": 0.8843262468676744, + "grad_norm": 0.4876766800880432, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 5470 + }, + { + "epoch": 0.8859429310484197, + "grad_norm": 0.5057248473167419, + "learning_rate": 0.0002, + "loss": 0.7906, + "step": 5480 + }, + { + "epoch": 0.8875596152291649, + "grad_norm": 0.4637320637702942, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 5490 + }, + { + "epoch": 0.8891762994099103, + "grad_norm": 0.471955806016922, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 5500 + }, + { + "epoch": 0.8907929835906556, + "grad_norm": 0.5209813714027405, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 5510 + }, + { + "epoch": 0.8924096677714008, + "grad_norm": 0.6213834285736084, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 5520 + }, + { + "epoch": 0.8940263519521462, + "grad_norm": 0.5215408205986023, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 5530 + }, + { + "epoch": 0.8956430361328914, + "grad_norm": 0.580478310585022, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 5540 + }, + { + "epoch": 0.8972597203136368, + "grad_norm": 0.49102169275283813, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 5550 + }, + { + "epoch": 0.898876404494382, + "grad_norm": 0.6043479442596436, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 5560 + }, + { + "epoch": 0.9004930886751273, + "grad_norm": 0.5636463165283203, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 5570 + }, + { + "epoch": 0.9021097728558726, + "grad_norm": 0.5620124340057373, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 5580 + }, + { + "epoch": 0.9037264570366179, + "grad_norm": 0.5206354856491089, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 5590 + }, + { + "epoch": 0.9053431412173631, + "grad_norm": 0.5798229575157166, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 5600 + }, + { + "epoch": 0.9069598253981085, + "grad_norm": 0.6428212523460388, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 5610 + }, + { + "epoch": 0.9085765095788537, + "grad_norm": 0.48064687848091125, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 5620 + }, + { + "epoch": 0.9101931937595991, + "grad_norm": 0.6347860097885132, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 5630 + }, + { + "epoch": 0.9118098779403444, + "grad_norm": 0.5353913307189941, + "learning_rate": 0.0002, + "loss": 0.851, + "step": 5640 + }, + { + "epoch": 0.9134265621210896, + "grad_norm": 0.5323944091796875, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 5650 + }, + { + "epoch": 0.915043246301835, + "grad_norm": 0.5261843204498291, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 5660 + }, + { + "epoch": 0.9166599304825802, + "grad_norm": 0.5451326966285706, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 5670 + }, + { + "epoch": 0.9182766146633256, + "grad_norm": 0.5183324217796326, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 5680 + }, + { + "epoch": 0.9198932988440708, + "grad_norm": 0.47229018807411194, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 5690 + }, + { + "epoch": 0.9215099830248161, + "grad_norm": 0.49180513620376587, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 5700 + }, + { + "epoch": 0.9231266672055614, + "grad_norm": 0.5419785380363464, + "learning_rate": 0.0002, + "loss": 0.8735, + "step": 5710 + }, + { + "epoch": 0.9247433513863067, + "grad_norm": 0.5408698916435242, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 5720 + }, + { + "epoch": 0.9263600355670519, + "grad_norm": 0.5286232829093933, + "learning_rate": 0.0002, + "loss": 0.7701, + "step": 5730 + }, + { + "epoch": 0.9279767197477973, + "grad_norm": 0.7539758086204529, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 5740 + }, + { + "epoch": 0.9295934039285425, + "grad_norm": 0.5166944861412048, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 5750 + }, + { + "epoch": 0.9312100881092878, + "grad_norm": 0.6601425409317017, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 5760 + }, + { + "epoch": 0.9328267722900332, + "grad_norm": 0.5029960870742798, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 5770 + }, + { + "epoch": 0.9344434564707784, + "grad_norm": 0.4926645755767822, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 5780 + }, + { + "epoch": 0.9360601406515238, + "grad_norm": 0.5739615559577942, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 5790 + }, + { + "epoch": 0.937676824832269, + "grad_norm": 0.5058279037475586, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 5800 + }, + { + "epoch": 0.9392935090130143, + "grad_norm": 0.5260962247848511, + "learning_rate": 0.0002, + "loss": 0.8537, + "step": 5810 + }, + { + "epoch": 0.9409101931937596, + "grad_norm": 0.5768588185310364, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 5820 + }, + { + "epoch": 0.9425268773745049, + "grad_norm": 0.5170126557350159, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 5830 + }, + { + "epoch": 0.9441435615552501, + "grad_norm": 0.5745864510536194, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 5840 + }, + { + "epoch": 0.9457602457359955, + "grad_norm": 0.5551357865333557, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 5850 + }, + { + "epoch": 0.9473769299167407, + "grad_norm": 0.5776078701019287, + "learning_rate": 0.0002, + "loss": 0.8529, + "step": 5860 + }, + { + "epoch": 0.9489936140974861, + "grad_norm": 0.5340062379837036, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 5870 + }, + { + "epoch": 0.9506102982782314, + "grad_norm": 0.6447290182113647, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 5880 + }, + { + "epoch": 0.9522269824589766, + "grad_norm": 0.5123815536499023, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 5890 + }, + { + "epoch": 0.953843666639722, + "grad_norm": 0.48547613620758057, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 5900 + }, + { + "epoch": 0.9554603508204672, + "grad_norm": 0.5791414976119995, + "learning_rate": 0.0002, + "loss": 0.7896, + "step": 5910 + }, + { + "epoch": 0.9570770350012126, + "grad_norm": 0.6195011734962463, + "learning_rate": 0.0002, + "loss": 0.8408, + "step": 5920 + }, + { + "epoch": 0.9586937191819578, + "grad_norm": 0.6323803067207336, + "learning_rate": 0.0002, + "loss": 0.7805, + "step": 5930 + }, + { + "epoch": 0.9603104033627031, + "grad_norm": 0.45552879571914673, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 5940 + }, + { + "epoch": 0.9619270875434484, + "grad_norm": 0.5796473622322083, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 5950 + }, + { + "epoch": 0.9635437717241937, + "grad_norm": 0.647261381149292, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 5960 + }, + { + "epoch": 0.9651604559049389, + "grad_norm": 0.5487682819366455, + "learning_rate": 0.0002, + "loss": 0.8086, + "step": 5970 + }, + { + "epoch": 0.9667771400856843, + "grad_norm": 0.5743663907051086, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 5980 + }, + { + "epoch": 0.9683938242664295, + "grad_norm": 0.5470591187477112, + "learning_rate": 0.0002, + "loss": 0.8153, + "step": 5990 + }, + { + "epoch": 0.9700105084471748, + "grad_norm": 0.5901660323143005, + "learning_rate": 0.0002, + "loss": 0.8119, + "step": 6000 + }, + { + "epoch": 0.9716271926279202, + "grad_norm": 0.6544759273529053, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 6010 + }, + { + "epoch": 0.9732438768086654, + "grad_norm": 0.6288470029830933, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 6020 + }, + { + "epoch": 0.9748605609894108, + "grad_norm": 0.673153817653656, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 6030 + }, + { + "epoch": 0.976477245170156, + "grad_norm": 0.42854753136634827, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 6040 + }, + { + "epoch": 0.9780939293509013, + "grad_norm": 0.5227066278457642, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 6050 + }, + { + "epoch": 0.9797106135316466, + "grad_norm": 0.5372416973114014, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 6060 + }, + { + "epoch": 0.9813272977123919, + "grad_norm": 0.6026402115821838, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 6070 + }, + { + "epoch": 0.9829439818931371, + "grad_norm": 0.49547791481018066, + "learning_rate": 0.0002, + "loss": 0.8347, + "step": 6080 + }, + { + "epoch": 0.9845606660738825, + "grad_norm": 0.4641951322555542, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 6090 + }, + { + "epoch": 0.9861773502546277, + "grad_norm": 0.5818535089492798, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 6100 + }, + { + "epoch": 0.9877940344353731, + "grad_norm": 0.63955157995224, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 6110 + }, + { + "epoch": 0.9894107186161183, + "grad_norm": 0.5649438500404358, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 6120 + }, + { + "epoch": 0.9910274027968636, + "grad_norm": 0.5290433168411255, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 6130 + }, + { + "epoch": 0.992644086977609, + "grad_norm": 0.6399374008178711, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 6140 + }, + { + "epoch": 0.9942607711583542, + "grad_norm": 0.6736576557159424, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 6150 + }, + { + "epoch": 0.9958774553390995, + "grad_norm": 0.515420138835907, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 6160 + }, + { + "epoch": 0.9974941395198448, + "grad_norm": 0.562677800655365, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 6170 + }, + { + "epoch": 0.9991108237005901, + "grad_norm": 0.7113858461380005, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 6180 + }, + { + "epoch": 0.9999191657909627, + "eval_loss": 1.0871200561523438, + "eval_runtime": 122.2071, + "eval_samples_per_second": 5.998, + "eval_steps_per_second": 0.753, + "step": 6185 + }, + { + "epoch": 1.0007275078813354, + "grad_norm": 0.7111801505088806, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 6190 + }, + { + "epoch": 1.0023441920620806, + "grad_norm": 0.5402125716209412, + "learning_rate": 0.0002, + "loss": 0.6865, + "step": 6200 + }, + { + "epoch": 1.003960876242826, + "grad_norm": 0.6098830103874207, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 6210 + }, + { + "epoch": 1.0055775604235713, + "grad_norm": 0.5829983353614807, + "learning_rate": 0.0002, + "loss": 0.7631, + "step": 6220 + }, + { + "epoch": 1.0071942446043165, + "grad_norm": 0.5614621043205261, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 6230 + }, + { + "epoch": 1.0088109287850617, + "grad_norm": 0.5954238772392273, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 6240 + }, + { + "epoch": 1.0104276129658072, + "grad_norm": 0.6480574607849121, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 6250 + }, + { + "epoch": 1.0120442971465524, + "grad_norm": 0.6051128506660461, + "learning_rate": 0.0002, + "loss": 0.7514, + "step": 6260 + }, + { + "epoch": 1.0136609813272976, + "grad_norm": 0.6318870782852173, + "learning_rate": 0.0002, + "loss": 0.7237, + "step": 6270 + }, + { + "epoch": 1.015277665508043, + "grad_norm": 0.5048980116844177, + "learning_rate": 0.0002, + "loss": 0.7178, + "step": 6280 + }, + { + "epoch": 1.0168943496887883, + "grad_norm": 0.6346936225891113, + "learning_rate": 0.0002, + "loss": 0.7391, + "step": 6290 + }, + { + "epoch": 1.0185110338695336, + "grad_norm": 0.5711665749549866, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 6300 + }, + { + "epoch": 1.0201277180502788, + "grad_norm": 0.5175361037254333, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 6310 + }, + { + "epoch": 1.0217444022310243, + "grad_norm": 0.5360831618309021, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 6320 + }, + { + "epoch": 1.0233610864117695, + "grad_norm": 0.614675760269165, + "learning_rate": 0.0002, + "loss": 0.7112, + "step": 6330 + }, + { + "epoch": 1.0249777705925147, + "grad_norm": 0.5626118183135986, + "learning_rate": 0.0002, + "loss": 0.7748, + "step": 6340 + }, + { + "epoch": 1.02659445477326, + "grad_norm": 0.574897289276123, + "learning_rate": 0.0002, + "loss": 0.7375, + "step": 6350 + }, + { + "epoch": 1.0282111389540054, + "grad_norm": 0.7185447812080383, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 6360 + }, + { + "epoch": 1.0298278231347506, + "grad_norm": 0.6705799698829651, + "learning_rate": 0.0002, + "loss": 0.703, + "step": 6370 + }, + { + "epoch": 1.0314445073154959, + "grad_norm": 0.6740428805351257, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 6380 + }, + { + "epoch": 1.0330611914962413, + "grad_norm": 0.663902759552002, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 6390 + }, + { + "epoch": 1.0346778756769865, + "grad_norm": 0.5029543042182922, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 6400 + }, + { + "epoch": 1.0362945598577318, + "grad_norm": 0.7813863158226013, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 6410 + }, + { + "epoch": 1.037911244038477, + "grad_norm": 0.5396282076835632, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 6420 + }, + { + "epoch": 1.0395279282192225, + "grad_norm": 0.5253293514251709, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 6430 + }, + { + "epoch": 1.0411446123999677, + "grad_norm": 0.7236770987510681, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 6440 + }, + { + "epoch": 1.042761296580713, + "grad_norm": 0.5670917630195618, + "learning_rate": 0.0002, + "loss": 0.7259, + "step": 6450 + }, + { + "epoch": 1.0443779807614582, + "grad_norm": 0.6031978726387024, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 6460 + }, + { + "epoch": 1.0459946649422036, + "grad_norm": 0.5309213399887085, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 6470 + }, + { + "epoch": 1.0476113491229488, + "grad_norm": 0.7114651799201965, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 6480 + }, + { + "epoch": 1.049228033303694, + "grad_norm": 0.5591610670089722, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 6490 + }, + { + "epoch": 1.0508447174844395, + "grad_norm": 0.5185961127281189, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 6500 + }, + { + "epoch": 1.0524614016651848, + "grad_norm": 0.6510552167892456, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 6510 + }, + { + "epoch": 1.05407808584593, + "grad_norm": 0.6557928919792175, + "learning_rate": 0.0002, + "loss": 0.7057, + "step": 6520 + }, + { + "epoch": 1.0556947700266752, + "grad_norm": 0.6973192691802979, + "learning_rate": 0.0002, + "loss": 0.8056, + "step": 6530 + }, + { + "epoch": 1.0573114542074207, + "grad_norm": 0.6226583123207092, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 6540 + }, + { + "epoch": 1.058928138388166, + "grad_norm": 0.5633195638656616, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 6550 + }, + { + "epoch": 1.0605448225689111, + "grad_norm": 0.7466658353805542, + "learning_rate": 0.0002, + "loss": 0.7082, + "step": 6560 + }, + { + "epoch": 1.0621615067496564, + "grad_norm": 0.6462772488594055, + "learning_rate": 0.0002, + "loss": 0.7059, + "step": 6570 + }, + { + "epoch": 1.0637781909304018, + "grad_norm": 0.5266856551170349, + "learning_rate": 0.0002, + "loss": 0.7046, + "step": 6580 + }, + { + "epoch": 1.065394875111147, + "grad_norm": 0.534392774105072, + "learning_rate": 0.0002, + "loss": 0.7157, + "step": 6590 + }, + { + "epoch": 1.0670115592918923, + "grad_norm": 0.7514177560806274, + "learning_rate": 0.0002, + "loss": 0.7115, + "step": 6600 + }, + { + "epoch": 1.0686282434726375, + "grad_norm": 0.7593035697937012, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 6610 + }, + { + "epoch": 1.070244927653383, + "grad_norm": 0.5277858972549438, + "learning_rate": 0.0002, + "loss": 0.6836, + "step": 6620 + }, + { + "epoch": 1.0718616118341282, + "grad_norm": 0.5573670268058777, + "learning_rate": 0.0002, + "loss": 0.7405, + "step": 6630 + }, + { + "epoch": 1.0734782960148734, + "grad_norm": 0.6802396774291992, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 6640 + }, + { + "epoch": 1.0750949801956189, + "grad_norm": 0.7367215752601624, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 6650 + }, + { + "epoch": 1.0767116643763641, + "grad_norm": 0.5961891412734985, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 6660 + }, + { + "epoch": 1.0783283485571094, + "grad_norm": 0.5736313462257385, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 6670 + }, + { + "epoch": 1.0799450327378546, + "grad_norm": 0.619219183921814, + "learning_rate": 0.0002, + "loss": 0.7178, + "step": 6680 + }, + { + "epoch": 1.0815617169186, + "grad_norm": 0.6214390993118286, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 6690 + }, + { + "epoch": 1.0831784010993453, + "grad_norm": 0.564536988735199, + "learning_rate": 0.0002, + "loss": 0.7554, + "step": 6700 + }, + { + "epoch": 1.0847950852800905, + "grad_norm": 0.5838140249252319, + "learning_rate": 0.0002, + "loss": 0.7362, + "step": 6710 + }, + { + "epoch": 1.0864117694608357, + "grad_norm": 0.7000553607940674, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 6720 + }, + { + "epoch": 1.0880284536415812, + "grad_norm": 0.7078263759613037, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 6730 + }, + { + "epoch": 1.0896451378223264, + "grad_norm": 0.8353848457336426, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 6740 + }, + { + "epoch": 1.0912618220030716, + "grad_norm": 0.5615518689155579, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 6750 + }, + { + "epoch": 1.0928785061838169, + "grad_norm": 0.5475581288337708, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 6760 + }, + { + "epoch": 1.0944951903645623, + "grad_norm": 0.5835978388786316, + "learning_rate": 0.0002, + "loss": 0.7652, + "step": 6770 + }, + { + "epoch": 1.0961118745453076, + "grad_norm": 0.5516105890274048, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 6780 + }, + { + "epoch": 1.0977285587260528, + "grad_norm": 0.5875251889228821, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 6790 + }, + { + "epoch": 1.0993452429067982, + "grad_norm": 0.7376947999000549, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 6800 + }, + { + "epoch": 1.1009619270875435, + "grad_norm": 0.5656165480613708, + "learning_rate": 0.0002, + "loss": 0.7512, + "step": 6810 + }, + { + "epoch": 1.1025786112682887, + "grad_norm": 0.6365954279899597, + "learning_rate": 0.0002, + "loss": 0.7409, + "step": 6820 + }, + { + "epoch": 1.104195295449034, + "grad_norm": 0.5033080577850342, + "learning_rate": 0.0002, + "loss": 0.7392, + "step": 6830 + }, + { + "epoch": 1.1058119796297794, + "grad_norm": 0.617396891117096, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 6840 + }, + { + "epoch": 1.1074286638105246, + "grad_norm": 0.6395374536514282, + "learning_rate": 0.0002, + "loss": 0.7006, + "step": 6850 + }, + { + "epoch": 1.1090453479912699, + "grad_norm": 0.6775295734405518, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 6860 + }, + { + "epoch": 1.1106620321720153, + "grad_norm": 0.6655223965644836, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 6870 + }, + { + "epoch": 1.1122787163527605, + "grad_norm": 0.676655113697052, + "learning_rate": 0.0002, + "loss": 0.7553, + "step": 6880 + }, + { + "epoch": 1.1138954005335058, + "grad_norm": 0.6062718629837036, + "learning_rate": 0.0002, + "loss": 0.7342, + "step": 6890 + }, + { + "epoch": 1.115512084714251, + "grad_norm": 0.590943455696106, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 6900 + }, + { + "epoch": 1.1171287688949965, + "grad_norm": 0.6315317153930664, + "learning_rate": 0.0002, + "loss": 0.6705, + "step": 6910 + }, + { + "epoch": 1.1187454530757417, + "grad_norm": 0.47979024052619934, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 6920 + }, + { + "epoch": 1.120362137256487, + "grad_norm": 0.647298276424408, + "learning_rate": 0.0002, + "loss": 0.7002, + "step": 6930 + }, + { + "epoch": 1.1219788214372322, + "grad_norm": 0.7336484789848328, + "learning_rate": 0.0002, + "loss": 0.7502, + "step": 6940 + }, + { + "epoch": 1.1235955056179776, + "grad_norm": 0.5071424245834351, + "learning_rate": 0.0002, + "loss": 0.693, + "step": 6950 + }, + { + "epoch": 1.1252121897987228, + "grad_norm": 0.6527144312858582, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 6960 + }, + { + "epoch": 1.126828873979468, + "grad_norm": 0.6935935020446777, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 6970 + }, + { + "epoch": 1.1284455581602133, + "grad_norm": 0.8026931881904602, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 6980 + }, + { + "epoch": 1.1300622423409588, + "grad_norm": 0.5210393667221069, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 6990 + }, + { + "epoch": 1.131678926521704, + "grad_norm": 0.60475093126297, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 7000 + }, + { + "epoch": 1.1332956107024492, + "grad_norm": 0.6417073607444763, + "learning_rate": 0.0002, + "loss": 0.7495, + "step": 7010 + }, + { + "epoch": 1.1349122948831947, + "grad_norm": 0.6732175946235657, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 7020 + }, + { + "epoch": 1.13652897906394, + "grad_norm": 0.6719491481781006, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 7030 + }, + { + "epoch": 1.1381456632446851, + "grad_norm": 0.5708295106887817, + "learning_rate": 0.0002, + "loss": 0.7694, + "step": 7040 + }, + { + "epoch": 1.1397623474254304, + "grad_norm": 0.7141719460487366, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 7050 + }, + { + "epoch": 1.1413790316061758, + "grad_norm": 0.6187017560005188, + "learning_rate": 0.0002, + "loss": 0.764, + "step": 7060 + }, + { + "epoch": 1.142995715786921, + "grad_norm": 0.50581294298172, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 7070 + }, + { + "epoch": 1.1446123999676663, + "grad_norm": 0.5620143413543701, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 7080 + }, + { + "epoch": 1.1462290841484115, + "grad_norm": 0.6231929659843445, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 7090 + }, + { + "epoch": 1.147845768329157, + "grad_norm": 0.5775774121284485, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 7100 + }, + { + "epoch": 1.1494624525099022, + "grad_norm": 0.6492809653282166, + "learning_rate": 0.0002, + "loss": 0.7728, + "step": 7110 + }, + { + "epoch": 1.1510791366906474, + "grad_norm": 0.6434972286224365, + "learning_rate": 0.0002, + "loss": 0.7545, + "step": 7120 + }, + { + "epoch": 1.1526958208713927, + "grad_norm": 0.6191812753677368, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 7130 + }, + { + "epoch": 1.1543125050521381, + "grad_norm": 0.6690331697463989, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 7140 + }, + { + "epoch": 1.1559291892328833, + "grad_norm": 0.5977938175201416, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 7150 + }, + { + "epoch": 1.1575458734136286, + "grad_norm": 0.6195854544639587, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 7160 + }, + { + "epoch": 1.159162557594374, + "grad_norm": 0.5752048492431641, + "learning_rate": 0.0002, + "loss": 0.7249, + "step": 7170 + }, + { + "epoch": 1.1607792417751193, + "grad_norm": 0.589081883430481, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 7180 + }, + { + "epoch": 1.1623959259558645, + "grad_norm": 0.756996750831604, + "learning_rate": 0.0002, + "loss": 0.704, + "step": 7190 + }, + { + "epoch": 1.1640126101366097, + "grad_norm": 0.7614967226982117, + "learning_rate": 0.0002, + "loss": 0.7404, + "step": 7200 + }, + { + "epoch": 1.1656292943173552, + "grad_norm": 0.6120437979698181, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 7210 + }, + { + "epoch": 1.1672459784981004, + "grad_norm": 0.6210004687309265, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 7220 + }, + { + "epoch": 1.1688626626788456, + "grad_norm": 0.6044116020202637, + "learning_rate": 0.0002, + "loss": 0.7251, + "step": 7230 + }, + { + "epoch": 1.170479346859591, + "grad_norm": 0.5418457388877869, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 7240 + }, + { + "epoch": 1.1720960310403363, + "grad_norm": 0.6413537263870239, + "learning_rate": 0.0002, + "loss": 0.6938, + "step": 7250 + }, + { + "epoch": 1.1737127152210816, + "grad_norm": 0.5777867436408997, + "learning_rate": 0.0002, + "loss": 0.6978, + "step": 7260 + }, + { + "epoch": 1.1753293994018268, + "grad_norm": 0.7092402577400208, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 7270 + }, + { + "epoch": 1.176946083582572, + "grad_norm": 0.6351709365844727, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 7280 + }, + { + "epoch": 1.1785627677633175, + "grad_norm": 0.6172189712524414, + "learning_rate": 0.0002, + "loss": 0.7527, + "step": 7290 + }, + { + "epoch": 1.1801794519440627, + "grad_norm": 0.6801714897155762, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 7300 + }, + { + "epoch": 1.181796136124808, + "grad_norm": 0.6044712066650391, + "learning_rate": 0.0002, + "loss": 0.6941, + "step": 7310 + }, + { + "epoch": 1.1834128203055534, + "grad_norm": 0.7413212060928345, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 7320 + }, + { + "epoch": 1.1850295044862986, + "grad_norm": 0.5303856134414673, + "learning_rate": 0.0002, + "loss": 0.7396, + "step": 7330 + }, + { + "epoch": 1.1866461886670439, + "grad_norm": 0.5647098422050476, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 7340 + }, + { + "epoch": 1.188262872847789, + "grad_norm": 0.7374135255813599, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 7350 + }, + { + "epoch": 1.1898795570285345, + "grad_norm": 0.5710089206695557, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 7360 + }, + { + "epoch": 1.1914962412092798, + "grad_norm": 0.6073619723320007, + "learning_rate": 0.0002, + "loss": 0.8289, + "step": 7370 + }, + { + "epoch": 1.193112925390025, + "grad_norm": 0.5899916887283325, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 7380 + }, + { + "epoch": 1.1947296095707705, + "grad_norm": 0.7762434482574463, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 7390 + }, + { + "epoch": 1.1963462937515157, + "grad_norm": 0.679949939250946, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 7400 + }, + { + "epoch": 1.197962977932261, + "grad_norm": 0.6106849312782288, + "learning_rate": 0.0002, + "loss": 0.7599, + "step": 7410 + }, + { + "epoch": 1.1995796621130062, + "grad_norm": 0.682461678981781, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 7420 + }, + { + "epoch": 1.2011963462937516, + "grad_norm": 0.6087017059326172, + "learning_rate": 0.0002, + "loss": 0.7741, + "step": 7430 + }, + { + "epoch": 1.2028130304744968, + "grad_norm": 0.63739013671875, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 7440 + }, + { + "epoch": 1.204429714655242, + "grad_norm": 0.6154777407646179, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 7450 + }, + { + "epoch": 1.2060463988359873, + "grad_norm": 0.7491534948348999, + "learning_rate": 0.0002, + "loss": 0.7565, + "step": 7460 + }, + { + "epoch": 1.2076630830167328, + "grad_norm": 0.6664797067642212, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 7470 + }, + { + "epoch": 1.209279767197478, + "grad_norm": 0.6660266518592834, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 7480 + }, + { + "epoch": 1.2108964513782232, + "grad_norm": 0.6972551345825195, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 7490 + }, + { + "epoch": 1.2125131355589684, + "grad_norm": 0.6157945990562439, + "learning_rate": 0.0002, + "loss": 0.7023, + "step": 7500 + }, + { + "epoch": 1.214129819739714, + "grad_norm": 0.5199310183525085, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 7510 + }, + { + "epoch": 1.2157465039204591, + "grad_norm": 0.577610433101654, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 7520 + }, + { + "epoch": 1.2173631881012044, + "grad_norm": 0.53652423620224, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 7530 + }, + { + "epoch": 1.2189798722819498, + "grad_norm": 0.6479050517082214, + "learning_rate": 0.0002, + "loss": 0.7393, + "step": 7540 + }, + { + "epoch": 1.220596556462695, + "grad_norm": 0.618748128414154, + "learning_rate": 0.0002, + "loss": 0.7534, + "step": 7550 + }, + { + "epoch": 1.2222132406434403, + "grad_norm": 0.6311424374580383, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 7560 + }, + { + "epoch": 1.2238299248241855, + "grad_norm": 0.6595825552940369, + "learning_rate": 0.0002, + "loss": 0.7272, + "step": 7570 + }, + { + "epoch": 1.225446609004931, + "grad_norm": 0.5198960900306702, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 7580 + }, + { + "epoch": 1.2270632931856762, + "grad_norm": 0.578650712966919, + "learning_rate": 0.0002, + "loss": 0.674, + "step": 7590 + }, + { + "epoch": 1.2286799773664214, + "grad_norm": 0.6080220937728882, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 7600 + }, + { + "epoch": 1.2302966615471669, + "grad_norm": 0.7050248384475708, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 7610 + }, + { + "epoch": 1.2319133457279121, + "grad_norm": 0.6652196049690247, + "learning_rate": 0.0002, + "loss": 0.7032, + "step": 7620 + }, + { + "epoch": 1.2335300299086573, + "grad_norm": 0.7322776317596436, + "learning_rate": 0.0002, + "loss": 0.7085, + "step": 7630 + }, + { + "epoch": 1.2351467140894026, + "grad_norm": 0.4998728036880493, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 7640 + }, + { + "epoch": 1.2367633982701478, + "grad_norm": 0.6428788900375366, + "learning_rate": 0.0002, + "loss": 0.7214, + "step": 7650 + }, + { + "epoch": 1.2383800824508933, + "grad_norm": 0.585242509841919, + "learning_rate": 0.0002, + "loss": 0.7699, + "step": 7660 + }, + { + "epoch": 1.2399967666316385, + "grad_norm": 0.5211917757987976, + "learning_rate": 0.0002, + "loss": 0.7621, + "step": 7670 + }, + { + "epoch": 1.2416134508123837, + "grad_norm": 0.6490384340286255, + "learning_rate": 0.0002, + "loss": 0.746, + "step": 7680 + }, + { + "epoch": 1.2432301349931292, + "grad_norm": 0.6249763369560242, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 7690 + }, + { + "epoch": 1.2448468191738744, + "grad_norm": 0.71870356798172, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 7700 + }, + { + "epoch": 1.2464635033546196, + "grad_norm": 0.6761967539787292, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 7710 + }, + { + "epoch": 1.2480801875353649, + "grad_norm": 0.6500617265701294, + "learning_rate": 0.0002, + "loss": 0.7501, + "step": 7720 + }, + { + "epoch": 1.2496968717161103, + "grad_norm": 0.8069869875907898, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 7730 + }, + { + "epoch": 1.2513135558968556, + "grad_norm": 0.6044608950614929, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 7740 + }, + { + "epoch": 1.2529302400776008, + "grad_norm": 0.6573283076286316, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 7750 + }, + { + "epoch": 1.2545469242583462, + "grad_norm": 0.625430166721344, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 7760 + }, + { + "epoch": 1.2561636084390915, + "grad_norm": 0.5442022681236267, + "learning_rate": 0.0002, + "loss": 0.7041, + "step": 7770 + }, + { + "epoch": 1.2577802926198367, + "grad_norm": 0.6818386912345886, + "learning_rate": 0.0002, + "loss": 0.7172, + "step": 7780 + }, + { + "epoch": 1.259396976800582, + "grad_norm": 0.6381874084472656, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 7790 + }, + { + "epoch": 1.2610136609813272, + "grad_norm": 0.6269212961196899, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 7800 + }, + { + "epoch": 1.2626303451620726, + "grad_norm": 0.600121259689331, + "learning_rate": 0.0002, + "loss": 0.7821, + "step": 7810 + }, + { + "epoch": 1.2642470293428179, + "grad_norm": 0.6337703466415405, + "learning_rate": 0.0002, + "loss": 0.7761, + "step": 7820 + }, + { + "epoch": 1.2658637135235633, + "grad_norm": 0.7234963774681091, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 7830 + }, + { + "epoch": 1.2674803977043085, + "grad_norm": 0.800184965133667, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 7840 + }, + { + "epoch": 1.2690970818850538, + "grad_norm": 0.7539464831352234, + "learning_rate": 0.0002, + "loss": 0.7426, + "step": 7850 + }, + { + "epoch": 1.270713766065799, + "grad_norm": 0.5493760704994202, + "learning_rate": 0.0002, + "loss": 0.7496, + "step": 7860 + }, + { + "epoch": 1.2723304502465442, + "grad_norm": 0.7477145791053772, + "learning_rate": 0.0002, + "loss": 0.7537, + "step": 7870 + }, + { + "epoch": 1.2739471344272897, + "grad_norm": 0.6366362571716309, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 7880 + }, + { + "epoch": 1.275563818608035, + "grad_norm": 0.7419533729553223, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 7890 + }, + { + "epoch": 1.2771805027887801, + "grad_norm": 0.6141223311424255, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 7900 + }, + { + "epoch": 1.2787971869695256, + "grad_norm": 0.7522598505020142, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 7910 + }, + { + "epoch": 1.2804138711502708, + "grad_norm": 0.6935804486274719, + "learning_rate": 0.0002, + "loss": 0.7097, + "step": 7920 + }, + { + "epoch": 1.282030555331016, + "grad_norm": 0.7239290475845337, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 7930 + }, + { + "epoch": 1.2836472395117613, + "grad_norm": 0.8800187110900879, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 7940 + }, + { + "epoch": 1.2852639236925067, + "grad_norm": 0.540458083152771, + "learning_rate": 0.0002, + "loss": 0.6991, + "step": 7950 + }, + { + "epoch": 1.286880607873252, + "grad_norm": 0.6492934226989746, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 7960 + }, + { + "epoch": 1.2884972920539972, + "grad_norm": 0.6543959379196167, + "learning_rate": 0.0002, + "loss": 0.7742, + "step": 7970 + }, + { + "epoch": 1.2901139762347427, + "grad_norm": 0.5804705619812012, + "learning_rate": 0.0002, + "loss": 0.7316, + "step": 7980 + }, + { + "epoch": 1.291730660415488, + "grad_norm": 0.7074727416038513, + "learning_rate": 0.0002, + "loss": 0.796, + "step": 7990 + }, + { + "epoch": 1.2933473445962331, + "grad_norm": 0.5347974300384521, + "learning_rate": 0.0002, + "loss": 0.7034, + "step": 8000 + }, + { + "epoch": 1.2949640287769784, + "grad_norm": 0.6457298398017883, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 8010 + }, + { + "epoch": 1.2965807129577236, + "grad_norm": 0.6407219171524048, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 8020 + }, + { + "epoch": 1.298197397138469, + "grad_norm": 0.828439474105835, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 8030 + }, + { + "epoch": 1.2998140813192143, + "grad_norm": 0.4840380549430847, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 8040 + }, + { + "epoch": 1.3014307654999595, + "grad_norm": 0.5921024680137634, + "learning_rate": 0.0002, + "loss": 0.7283, + "step": 8050 + }, + { + "epoch": 1.303047449680705, + "grad_norm": 0.6170315146446228, + "learning_rate": 0.0002, + "loss": 0.7477, + "step": 8060 + }, + { + "epoch": 1.3046641338614502, + "grad_norm": 0.5374847054481506, + "learning_rate": 0.0002, + "loss": 0.7534, + "step": 8070 + }, + { + "epoch": 1.3062808180421954, + "grad_norm": 0.545758068561554, + "learning_rate": 0.0002, + "loss": 0.7593, + "step": 8080 + }, + { + "epoch": 1.3078975022229407, + "grad_norm": 0.55641770362854, + "learning_rate": 0.0002, + "loss": 0.7463, + "step": 8090 + }, + { + "epoch": 1.309514186403686, + "grad_norm": 0.6724897027015686, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8100 + }, + { + "epoch": 1.3111308705844313, + "grad_norm": 0.6923972368240356, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 8110 + }, + { + "epoch": 1.3127475547651766, + "grad_norm": 0.5136841535568237, + "learning_rate": 0.0002, + "loss": 0.7149, + "step": 8120 + }, + { + "epoch": 1.314364238945922, + "grad_norm": 0.6766283512115479, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 8130 + }, + { + "epoch": 1.3159809231266673, + "grad_norm": 0.6283926367759705, + "learning_rate": 0.0002, + "loss": 0.7489, + "step": 8140 + }, + { + "epoch": 1.3175976073074125, + "grad_norm": 0.644216001033783, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 8150 + }, + { + "epoch": 1.3192142914881577, + "grad_norm": 0.7827503085136414, + "learning_rate": 0.0002, + "loss": 0.7125, + "step": 8160 + }, + { + "epoch": 1.320830975668903, + "grad_norm": 0.6651390790939331, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 8170 + }, + { + "epoch": 1.3224476598496484, + "grad_norm": 0.5547412633895874, + "learning_rate": 0.0002, + "loss": 0.7778, + "step": 8180 + }, + { + "epoch": 1.3240643440303936, + "grad_norm": 0.6765179634094238, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 8190 + }, + { + "epoch": 1.325681028211139, + "grad_norm": 0.6822077035903931, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 8200 + }, + { + "epoch": 1.3272977123918843, + "grad_norm": 0.5941002368927002, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 8210 + }, + { + "epoch": 1.3289143965726296, + "grad_norm": 0.4850037097930908, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 8220 + }, + { + "epoch": 1.3305310807533748, + "grad_norm": 0.6162990927696228, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 8230 + }, + { + "epoch": 1.33214776493412, + "grad_norm": 0.6665613651275635, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 8240 + }, + { + "epoch": 1.3337644491148655, + "grad_norm": 0.618192732334137, + "learning_rate": 0.0002, + "loss": 0.759, + "step": 8250 + }, + { + "epoch": 1.3353811332956107, + "grad_norm": 0.710418701171875, + "learning_rate": 0.0002, + "loss": 0.7532, + "step": 8260 + }, + { + "epoch": 1.336997817476356, + "grad_norm": 0.5109876990318298, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 8270 + }, + { + "epoch": 1.3386145016571014, + "grad_norm": 0.6791711449623108, + "learning_rate": 0.0002, + "loss": 0.7303, + "step": 8280 + }, + { + "epoch": 1.3402311858378466, + "grad_norm": 0.6836432814598083, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8290 + }, + { + "epoch": 1.3418478700185918, + "grad_norm": 0.5579386353492737, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 8300 + }, + { + "epoch": 1.343464554199337, + "grad_norm": 0.6713546514511108, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 8310 + }, + { + "epoch": 1.3450812383800825, + "grad_norm": 0.5353720188140869, + "learning_rate": 0.0002, + "loss": 0.7756, + "step": 8320 + }, + { + "epoch": 1.3466979225608278, + "grad_norm": 0.5813682675361633, + "learning_rate": 0.0002, + "loss": 0.718, + "step": 8330 + }, + { + "epoch": 1.348314606741573, + "grad_norm": 0.8158791661262512, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 8340 + }, + { + "epoch": 1.3499312909223184, + "grad_norm": 0.6193785071372986, + "learning_rate": 0.0002, + "loss": 0.6992, + "step": 8350 + }, + { + "epoch": 1.3515479751030637, + "grad_norm": 0.6353939771652222, + "learning_rate": 0.0002, + "loss": 0.7654, + "step": 8360 + }, + { + "epoch": 1.353164659283809, + "grad_norm": 0.6925048232078552, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 8370 + }, + { + "epoch": 1.3547813434645541, + "grad_norm": 0.988264799118042, + "learning_rate": 0.0002, + "loss": 0.736, + "step": 8380 + }, + { + "epoch": 1.3563980276452994, + "grad_norm": 0.6476002931594849, + "learning_rate": 0.0002, + "loss": 0.7744, + "step": 8390 + }, + { + "epoch": 1.3580147118260448, + "grad_norm": 0.7120398879051208, + "learning_rate": 0.0002, + "loss": 0.776, + "step": 8400 + }, + { + "epoch": 1.35963139600679, + "grad_norm": 0.9048416614532471, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 8410 + }, + { + "epoch": 1.3612480801875353, + "grad_norm": 0.7000672817230225, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 8420 + }, + { + "epoch": 1.3628647643682807, + "grad_norm": 0.6015632152557373, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 8430 + }, + { + "epoch": 1.364481448549026, + "grad_norm": 0.612516462802887, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 8440 + }, + { + "epoch": 1.3660981327297712, + "grad_norm": 0.5969301462173462, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 8450 + }, + { + "epoch": 1.3677148169105164, + "grad_norm": 0.6730654239654541, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 8460 + }, + { + "epoch": 1.369331501091262, + "grad_norm": 0.6386392116546631, + "learning_rate": 0.0002, + "loss": 0.7465, + "step": 8470 + }, + { + "epoch": 1.3709481852720071, + "grad_norm": 0.739544153213501, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 8480 + }, + { + "epoch": 1.3725648694527524, + "grad_norm": 0.6462782621383667, + "learning_rate": 0.0002, + "loss": 0.7892, + "step": 8490 + }, + { + "epoch": 1.3741815536334978, + "grad_norm": 0.7346843481063843, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 8500 + }, + { + "epoch": 1.375798237814243, + "grad_norm": 0.6884821057319641, + "learning_rate": 0.0002, + "loss": 0.7634, + "step": 8510 + }, + { + "epoch": 1.3774149219949883, + "grad_norm": 0.6999333500862122, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 8520 + }, + { + "epoch": 1.3790316061757335, + "grad_norm": 0.5378713011741638, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 8530 + }, + { + "epoch": 1.3806482903564787, + "grad_norm": 0.5417906641960144, + "learning_rate": 0.0002, + "loss": 0.6797, + "step": 8540 + }, + { + "epoch": 1.3822649745372242, + "grad_norm": 0.6602526307106018, + "learning_rate": 0.0002, + "loss": 0.7499, + "step": 8550 + }, + { + "epoch": 1.3838816587179694, + "grad_norm": 0.7073674201965332, + "learning_rate": 0.0002, + "loss": 0.7356, + "step": 8560 + }, + { + "epoch": 1.3854983428987149, + "grad_norm": 0.5841707587242126, + "learning_rate": 0.0002, + "loss": 0.75, + "step": 8570 + }, + { + "epoch": 1.38711502707946, + "grad_norm": 0.7031095027923584, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 8580 + }, + { + "epoch": 1.3887317112602053, + "grad_norm": 0.5198570489883423, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 8590 + }, + { + "epoch": 1.3903483954409506, + "grad_norm": 0.7261320352554321, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 8600 + }, + { + "epoch": 1.3919650796216958, + "grad_norm": 0.5616350173950195, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 8610 + }, + { + "epoch": 1.3935817638024413, + "grad_norm": 0.5185914635658264, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 8620 + }, + { + "epoch": 1.3951984479831865, + "grad_norm": 0.5814694762229919, + "learning_rate": 0.0002, + "loss": 0.7456, + "step": 8630 + }, + { + "epoch": 1.3968151321639317, + "grad_norm": 0.6977371573448181, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 8640 + }, + { + "epoch": 1.3984318163446772, + "grad_norm": 0.6855689883232117, + "learning_rate": 0.0002, + "loss": 0.7574, + "step": 8650 + }, + { + "epoch": 1.4000485005254224, + "grad_norm": 0.5414357781410217, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 8660 + }, + { + "epoch": 1.4016651847061676, + "grad_norm": 0.6970012784004211, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 8670 + }, + { + "epoch": 1.4032818688869129, + "grad_norm": 0.526079535484314, + "learning_rate": 0.0002, + "loss": 0.7421, + "step": 8680 + }, + { + "epoch": 1.404898553067658, + "grad_norm": 0.758712887763977, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 8690 + }, + { + "epoch": 1.4065152372484035, + "grad_norm": 0.7118762731552124, + "learning_rate": 0.0002, + "loss": 0.7612, + "step": 8700 + }, + { + "epoch": 1.4081319214291488, + "grad_norm": 0.5696909427642822, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 8710 + }, + { + "epoch": 1.4097486056098942, + "grad_norm": 0.7995436787605286, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 8720 + }, + { + "epoch": 1.4113652897906395, + "grad_norm": 0.7237521409988403, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 8730 + }, + { + "epoch": 1.4129819739713847, + "grad_norm": 0.744628369808197, + "learning_rate": 0.0002, + "loss": 0.7661, + "step": 8740 + }, + { + "epoch": 1.41459865815213, + "grad_norm": 0.6082926988601685, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 8750 + }, + { + "epoch": 1.4162153423328752, + "grad_norm": 0.5185243487358093, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 8760 + }, + { + "epoch": 1.4178320265136206, + "grad_norm": 0.5183082222938538, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 8770 + }, + { + "epoch": 1.4194487106943658, + "grad_norm": 0.7326041460037231, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 8780 + }, + { + "epoch": 1.421065394875111, + "grad_norm": 0.7174660563468933, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 8790 + }, + { + "epoch": 1.4226820790558565, + "grad_norm": 0.8080165982246399, + "learning_rate": 0.0002, + "loss": 0.7507, + "step": 8800 + }, + { + "epoch": 1.4242987632366018, + "grad_norm": 0.5061507821083069, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 8810 + }, + { + "epoch": 1.425915447417347, + "grad_norm": 0.801602840423584, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 8820 + }, + { + "epoch": 1.4275321315980922, + "grad_norm": 0.6150273084640503, + "learning_rate": 0.0002, + "loss": 0.7287, + "step": 8830 + }, + { + "epoch": 1.4291488157788377, + "grad_norm": 0.8786525726318359, + "learning_rate": 0.0002, + "loss": 0.7452, + "step": 8840 + }, + { + "epoch": 1.430765499959583, + "grad_norm": 0.6371538639068604, + "learning_rate": 0.0002, + "loss": 0.7257, + "step": 8850 + }, + { + "epoch": 1.4323821841403281, + "grad_norm": 0.6409295797348022, + "learning_rate": 0.0002, + "loss": 0.711, + "step": 8860 + }, + { + "epoch": 1.4339988683210736, + "grad_norm": 0.6452359557151794, + "learning_rate": 0.0002, + "loss": 0.7891, + "step": 8870 + }, + { + "epoch": 1.4356155525018188, + "grad_norm": 0.5842334628105164, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 8880 + }, + { + "epoch": 1.437232236682564, + "grad_norm": 0.696761965751648, + "learning_rate": 0.0002, + "loss": 0.7446, + "step": 8890 + }, + { + "epoch": 1.4388489208633093, + "grad_norm": 0.6384600400924683, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 8900 + }, + { + "epoch": 1.4404656050440545, + "grad_norm": 0.5981136560440063, + "learning_rate": 0.0002, + "loss": 0.7049, + "step": 8910 + }, + { + "epoch": 1.4420822892248, + "grad_norm": 0.6355637907981873, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 8920 + }, + { + "epoch": 1.4436989734055452, + "grad_norm": 0.6374830603599548, + "learning_rate": 0.0002, + "loss": 0.7653, + "step": 8930 + }, + { + "epoch": 1.4453156575862904, + "grad_norm": 0.559013307094574, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 8940 + }, + { + "epoch": 1.446932341767036, + "grad_norm": 0.7289170026779175, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 8950 + }, + { + "epoch": 1.4485490259477811, + "grad_norm": 0.8649206757545471, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 8960 + }, + { + "epoch": 1.4501657101285264, + "grad_norm": 0.7664689421653748, + "learning_rate": 0.0002, + "loss": 0.7745, + "step": 8970 + }, + { + "epoch": 1.4517823943092716, + "grad_norm": 0.7109952569007874, + "learning_rate": 0.0002, + "loss": 0.7431, + "step": 8980 + }, + { + "epoch": 1.453399078490017, + "grad_norm": 0.6312844753265381, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 8990 + }, + { + "epoch": 1.4550157626707623, + "grad_norm": 0.6616617441177368, + "learning_rate": 0.0002, + "loss": 0.7467, + "step": 9000 + }, + { + "epoch": 1.4566324468515075, + "grad_norm": 0.7384068965911865, + "learning_rate": 0.0002, + "loss": 0.7518, + "step": 9010 + }, + { + "epoch": 1.458249131032253, + "grad_norm": 0.6549670100212097, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 9020 + }, + { + "epoch": 1.4598658152129982, + "grad_norm": 0.6254119277000427, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 9030 + }, + { + "epoch": 1.4614824993937434, + "grad_norm": 0.6806328892707825, + "learning_rate": 0.0002, + "loss": 0.7645, + "step": 9040 + }, + { + "epoch": 1.4630991835744886, + "grad_norm": 0.6803115010261536, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 9050 + }, + { + "epoch": 1.4647158677552339, + "grad_norm": 0.48529282212257385, + "learning_rate": 0.0002, + "loss": 0.7264, + "step": 9060 + }, + { + "epoch": 1.4663325519359793, + "grad_norm": 0.5995030999183655, + "learning_rate": 0.0002, + "loss": 0.7542, + "step": 9070 + }, + { + "epoch": 1.4679492361167246, + "grad_norm": 0.6005427837371826, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 9080 + }, + { + "epoch": 1.46956592029747, + "grad_norm": 0.718564510345459, + "learning_rate": 0.0002, + "loss": 0.7288, + "step": 9090 + }, + { + "epoch": 1.4711826044782153, + "grad_norm": 0.7003577351570129, + "learning_rate": 0.0002, + "loss": 0.7089, + "step": 9100 + }, + { + "epoch": 1.4727992886589605, + "grad_norm": 0.5888323783874512, + "learning_rate": 0.0002, + "loss": 0.8069, + "step": 9110 + }, + { + "epoch": 1.4744159728397057, + "grad_norm": 0.6417609453201294, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 9120 + }, + { + "epoch": 1.476032657020451, + "grad_norm": 0.572294294834137, + "learning_rate": 0.0002, + "loss": 0.7441, + "step": 9130 + }, + { + "epoch": 1.4776493412011964, + "grad_norm": 0.8200714588165283, + "learning_rate": 0.0002, + "loss": 0.8053, + "step": 9140 + }, + { + "epoch": 1.4792660253819416, + "grad_norm": 0.6343288421630859, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 9150 + }, + { + "epoch": 1.4808827095626869, + "grad_norm": 0.7017961144447327, + "learning_rate": 0.0002, + "loss": 0.7641, + "step": 9160 + }, + { + "epoch": 1.4824993937434323, + "grad_norm": 0.6202912926673889, + "learning_rate": 0.0002, + "loss": 0.7619, + "step": 9170 + }, + { + "epoch": 1.4841160779241775, + "grad_norm": 0.6677869558334351, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 9180 + }, + { + "epoch": 1.4857327621049228, + "grad_norm": 0.6052267551422119, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 9190 + }, + { + "epoch": 1.487349446285668, + "grad_norm": 0.6638872027397156, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 9200 + }, + { + "epoch": 1.4889661304664135, + "grad_norm": 0.6245523691177368, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 9210 + }, + { + "epoch": 1.4905828146471587, + "grad_norm": 0.5761767625808716, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 9220 + }, + { + "epoch": 1.492199498827904, + "grad_norm": 0.8175981640815735, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 9230 + }, + { + "epoch": 1.4938161830086494, + "grad_norm": 0.9144009947776794, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 9240 + }, + { + "epoch": 1.4954328671893946, + "grad_norm": 0.5742552876472473, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 9250 + }, + { + "epoch": 1.4970495513701398, + "grad_norm": 0.534534215927124, + "learning_rate": 0.0002, + "loss": 0.7418, + "step": 9260 + }, + { + "epoch": 1.498666235550885, + "grad_norm": 0.7836225032806396, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 9270 + }, + { + "epoch": 1.5002829197316303, + "grad_norm": 0.5292993187904358, + "learning_rate": 0.0002, + "loss": 0.7453, + "step": 9280 + }, + { + "epoch": 1.5018996039123758, + "grad_norm": 0.8044071793556213, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 9290 + }, + { + "epoch": 1.503516288093121, + "grad_norm": 0.6185805201530457, + "learning_rate": 0.0002, + "loss": 0.7229, + "step": 9300 + }, + { + "epoch": 1.5051329722738664, + "grad_norm": 0.6093607544898987, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 9310 + }, + { + "epoch": 1.5067496564546117, + "grad_norm": 0.5891730189323425, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 9320 + }, + { + "epoch": 1.508366340635357, + "grad_norm": 0.6331129670143127, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 9330 + }, + { + "epoch": 1.5099830248161021, + "grad_norm": 0.7690958380699158, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 9340 + }, + { + "epoch": 1.5115997089968474, + "grad_norm": 0.6548877358436584, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 9350 + }, + { + "epoch": 1.5132163931775926, + "grad_norm": 0.6545143127441406, + "learning_rate": 0.0002, + "loss": 0.7408, + "step": 9360 + }, + { + "epoch": 1.514833077358338, + "grad_norm": 0.553247332572937, + "learning_rate": 0.0002, + "loss": 0.7432, + "step": 9370 + }, + { + "epoch": 1.5164497615390833, + "grad_norm": 0.8145074844360352, + "learning_rate": 0.0002, + "loss": 0.7265, + "step": 9380 + }, + { + "epoch": 1.5180664457198287, + "grad_norm": 0.7636994123458862, + "learning_rate": 0.0002, + "loss": 0.7379, + "step": 9390 + }, + { + "epoch": 1.519683129900574, + "grad_norm": 0.6838982701301575, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 9400 + }, + { + "epoch": 1.5212998140813192, + "grad_norm": 0.8599441647529602, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 9410 + }, + { + "epoch": 1.5229164982620644, + "grad_norm": 0.7020329833030701, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 9420 + }, + { + "epoch": 1.5245331824428097, + "grad_norm": 0.6964772343635559, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 9430 + }, + { + "epoch": 1.5261498666235551, + "grad_norm": 0.6916600465774536, + "learning_rate": 0.0002, + "loss": 0.7168, + "step": 9440 + }, + { + "epoch": 1.5277665508043003, + "grad_norm": 0.7282621264457703, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 9450 + }, + { + "epoch": 1.5293832349850458, + "grad_norm": 0.5363983511924744, + "learning_rate": 0.0002, + "loss": 0.7628, + "step": 9460 + }, + { + "epoch": 1.530999919165791, + "grad_norm": 0.6184861063957214, + "learning_rate": 0.0002, + "loss": 0.7154, + "step": 9470 + }, + { + "epoch": 1.5326166033465363, + "grad_norm": 0.5991285443305969, + "learning_rate": 0.0002, + "loss": 0.7837, + "step": 9480 + }, + { + "epoch": 1.5342332875272815, + "grad_norm": 0.8176587820053101, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 9490 + }, + { + "epoch": 1.5358499717080267, + "grad_norm": 0.6473721861839294, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 9500 + }, + { + "epoch": 1.5374666558887722, + "grad_norm": 0.7319952845573425, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 9510 + }, + { + "epoch": 1.5390833400695174, + "grad_norm": 0.702900230884552, + "learning_rate": 0.0002, + "loss": 0.7706, + "step": 9520 + }, + { + "epoch": 1.5407000242502629, + "grad_norm": 0.7971600294113159, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 9530 + }, + { + "epoch": 1.542316708431008, + "grad_norm": 0.6527525186538696, + "learning_rate": 0.0002, + "loss": 0.7352, + "step": 9540 + }, + { + "epoch": 1.5439333926117533, + "grad_norm": 0.5791676044464111, + "learning_rate": 0.0002, + "loss": 0.7425, + "step": 9550 + }, + { + "epoch": 1.5455500767924986, + "grad_norm": 0.5619390606880188, + "learning_rate": 0.0002, + "loss": 0.7585, + "step": 9560 + }, + { + "epoch": 1.5471667609732438, + "grad_norm": 0.5701689124107361, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 9570 + }, + { + "epoch": 1.548783445153989, + "grad_norm": 0.47549352049827576, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 9580 + }, + { + "epoch": 1.5504001293347345, + "grad_norm": 0.8730611205101013, + "learning_rate": 0.0002, + "loss": 0.7276, + "step": 9590 + }, + { + "epoch": 1.5520168135154797, + "grad_norm": 0.6842091083526611, + "learning_rate": 0.0002, + "loss": 0.798, + "step": 9600 + }, + { + "epoch": 1.5536334976962252, + "grad_norm": 0.6675129532814026, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 9610 + }, + { + "epoch": 1.5552501818769704, + "grad_norm": 0.8173956274986267, + "learning_rate": 0.0002, + "loss": 0.7954, + "step": 9620 + }, + { + "epoch": 1.5568668660577156, + "grad_norm": 0.724947452545166, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 9630 + }, + { + "epoch": 1.5584835502384609, + "grad_norm": 0.6154758930206299, + "learning_rate": 0.0002, + "loss": 0.7738, + "step": 9640 + }, + { + "epoch": 1.560100234419206, + "grad_norm": 0.6072008013725281, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 9650 + }, + { + "epoch": 1.5617169185999515, + "grad_norm": 0.659010648727417, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 9660 + }, + { + "epoch": 1.5633336027806968, + "grad_norm": 0.65857994556427, + "learning_rate": 0.0002, + "loss": 0.673, + "step": 9670 + }, + { + "epoch": 1.5649502869614422, + "grad_norm": 0.5914267301559448, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 9680 + }, + { + "epoch": 1.5665669711421875, + "grad_norm": 0.6248020529747009, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 9690 + }, + { + "epoch": 1.5681836553229327, + "grad_norm": 0.7147795557975769, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 9700 + }, + { + "epoch": 1.569800339503678, + "grad_norm": 0.7076232433319092, + "learning_rate": 0.0002, + "loss": 0.7335, + "step": 9710 + }, + { + "epoch": 1.5714170236844232, + "grad_norm": 0.6217400431632996, + "learning_rate": 0.0002, + "loss": 0.7413, + "step": 9720 + }, + { + "epoch": 1.5730337078651684, + "grad_norm": 0.6709911227226257, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 9730 + }, + { + "epoch": 1.5746503920459138, + "grad_norm": 0.749171257019043, + "learning_rate": 0.0002, + "loss": 0.7306, + "step": 9740 + }, + { + "epoch": 1.576267076226659, + "grad_norm": 0.6241145730018616, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 9750 + }, + { + "epoch": 1.5778837604074045, + "grad_norm": 0.4960934817790985, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 9760 + }, + { + "epoch": 1.5795004445881498, + "grad_norm": 0.6593309640884399, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 9770 + }, + { + "epoch": 1.581117128768895, + "grad_norm": 0.5814042091369629, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 9780 + }, + { + "epoch": 1.5827338129496402, + "grad_norm": 0.5936070680618286, + "learning_rate": 0.0002, + "loss": 0.7109, + "step": 9790 + }, + { + "epoch": 1.5843504971303854, + "grad_norm": 0.6454403400421143, + "learning_rate": 0.0002, + "loss": 0.7769, + "step": 9800 + }, + { + "epoch": 1.585967181311131, + "grad_norm": 0.7612107992172241, + "learning_rate": 0.0002, + "loss": 0.7677, + "step": 9810 + }, + { + "epoch": 1.5875838654918761, + "grad_norm": 0.6494482755661011, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 9820 + }, + { + "epoch": 1.5892005496726216, + "grad_norm": 0.7825694680213928, + "learning_rate": 0.0002, + "loss": 0.7569, + "step": 9830 + }, + { + "epoch": 1.5908172338533668, + "grad_norm": 0.6757757663726807, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 9840 + }, + { + "epoch": 1.592433918034112, + "grad_norm": 0.7105609178543091, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 9850 + }, + { + "epoch": 1.5940506022148573, + "grad_norm": 0.7596991062164307, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 9860 + }, + { + "epoch": 1.5956672863956025, + "grad_norm": 0.5681525468826294, + "learning_rate": 0.0002, + "loss": 0.7108, + "step": 9870 + }, + { + "epoch": 1.5972839705763477, + "grad_norm": 0.6090980768203735, + "learning_rate": 0.0002, + "loss": 0.7811, + "step": 9880 + }, + { + "epoch": 1.5989006547570932, + "grad_norm": 0.6271613240242004, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 9890 + }, + { + "epoch": 1.6005173389378387, + "grad_norm": 0.7656369805335999, + "learning_rate": 0.0002, + "loss": 0.7419, + "step": 9900 + }, + { + "epoch": 1.6021340231185839, + "grad_norm": 0.7504446506500244, + "learning_rate": 0.0002, + "loss": 0.7336, + "step": 9910 + }, + { + "epoch": 1.6037507072993291, + "grad_norm": 0.659656286239624, + "learning_rate": 0.0002, + "loss": 0.7479, + "step": 9920 + }, + { + "epoch": 1.6053673914800743, + "grad_norm": 0.6006826162338257, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 9930 + }, + { + "epoch": 1.6069840756608196, + "grad_norm": 0.7872757911682129, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 9940 + }, + { + "epoch": 1.6086007598415648, + "grad_norm": 0.5545852780342102, + "learning_rate": 0.0002, + "loss": 0.768, + "step": 9950 + }, + { + "epoch": 1.6102174440223103, + "grad_norm": 0.7429468631744385, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 9960 + }, + { + "epoch": 1.6118341282030555, + "grad_norm": 0.6873556971549988, + "learning_rate": 0.0002, + "loss": 0.714, + "step": 9970 + }, + { + "epoch": 1.613450812383801, + "grad_norm": 0.5874287486076355, + "learning_rate": 0.0002, + "loss": 0.7324, + "step": 9980 + }, + { + "epoch": 1.6150674965645462, + "grad_norm": 0.6039386987686157, + "learning_rate": 0.0002, + "loss": 0.7141, + "step": 9990 + }, + { + "epoch": 1.6166841807452914, + "grad_norm": 0.6233575940132141, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 10000 + }, + { + "epoch": 1.6183008649260366, + "grad_norm": 0.7676448225975037, + "learning_rate": 0.0002, + "loss": 0.7602, + "step": 10010 + }, + { + "epoch": 1.6199175491067819, + "grad_norm": 0.6565698385238647, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 10020 + }, + { + "epoch": 1.6215342332875273, + "grad_norm": 0.6787590384483337, + "learning_rate": 0.0002, + "loss": 0.7104, + "step": 10030 + }, + { + "epoch": 1.6231509174682726, + "grad_norm": 0.6137678027153015, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 10040 + }, + { + "epoch": 1.624767601649018, + "grad_norm": 0.5236800312995911, + "learning_rate": 0.0002, + "loss": 0.7646, + "step": 10050 + }, + { + "epoch": 1.6263842858297632, + "grad_norm": 0.7626367807388306, + "learning_rate": 0.0002, + "loss": 0.7437, + "step": 10060 + }, + { + "epoch": 1.6280009700105085, + "grad_norm": 0.5657260417938232, + "learning_rate": 0.0002, + "loss": 0.7273, + "step": 10070 + }, + { + "epoch": 1.6296176541912537, + "grad_norm": 0.4913991391658783, + "learning_rate": 0.0002, + "loss": 0.7354, + "step": 10080 + }, + { + "epoch": 1.631234338371999, + "grad_norm": 0.7715556621551514, + "learning_rate": 0.0002, + "loss": 0.7596, + "step": 10090 + }, + { + "epoch": 1.6328510225527442, + "grad_norm": 0.6509000062942505, + "learning_rate": 0.0002, + "loss": 0.7105, + "step": 10100 + }, + { + "epoch": 1.6344677067334896, + "grad_norm": 0.6215850114822388, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 10110 + }, + { + "epoch": 1.6360843909142349, + "grad_norm": 0.6956844329833984, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 10120 + }, + { + "epoch": 1.6377010750949803, + "grad_norm": 0.6111597418785095, + "learning_rate": 0.0002, + "loss": 0.7129, + "step": 10130 + }, + { + "epoch": 1.6393177592757255, + "grad_norm": 0.6518288850784302, + "learning_rate": 0.0002, + "loss": 0.6955, + "step": 10140 + }, + { + "epoch": 1.6409344434564708, + "grad_norm": 0.6914522051811218, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 10150 + }, + { + "epoch": 1.642551127637216, + "grad_norm": 0.63785719871521, + "learning_rate": 0.0002, + "loss": 0.7295, + "step": 10160 + }, + { + "epoch": 1.6441678118179612, + "grad_norm": 0.6379287838935852, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 10170 + }, + { + "epoch": 1.6457844959987067, + "grad_norm": 0.6793403029441833, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 10180 + }, + { + "epoch": 1.647401180179452, + "grad_norm": 0.6099132895469666, + "learning_rate": 0.0002, + "loss": 0.7402, + "step": 10190 + }, + { + "epoch": 1.6490178643601974, + "grad_norm": 0.5869854092597961, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 10200 + }, + { + "epoch": 1.6506345485409426, + "grad_norm": 0.7716999053955078, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 10210 + }, + { + "epoch": 1.6522512327216878, + "grad_norm": 0.6854110360145569, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 10220 + }, + { + "epoch": 1.653867916902433, + "grad_norm": 0.6957170367240906, + "learning_rate": 0.0002, + "loss": 0.782, + "step": 10230 + }, + { + "epoch": 1.6554846010831783, + "grad_norm": 0.6932903528213501, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 10240 + }, + { + "epoch": 1.6571012852639235, + "grad_norm": 0.7713165283203125, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 10250 + }, + { + "epoch": 1.658717969444669, + "grad_norm": 0.7455793619155884, + "learning_rate": 0.0002, + "loss": 0.7099, + "step": 10260 + }, + { + "epoch": 1.6603346536254144, + "grad_norm": 0.5464168190956116, + "learning_rate": 0.0002, + "loss": 0.7524, + "step": 10270 + }, + { + "epoch": 1.6619513378061597, + "grad_norm": 0.6782926321029663, + "learning_rate": 0.0002, + "loss": 0.7328, + "step": 10280 + }, + { + "epoch": 1.663568021986905, + "grad_norm": 0.7962649464607239, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 10290 + }, + { + "epoch": 1.6651847061676501, + "grad_norm": 0.6814526319503784, + "learning_rate": 0.0002, + "loss": 0.7142, + "step": 10300 + }, + { + "epoch": 1.6668013903483954, + "grad_norm": 0.656895101070404, + "learning_rate": 0.0002, + "loss": 0.7285, + "step": 10310 + }, + { + "epoch": 1.6684180745291406, + "grad_norm": 0.6085672378540039, + "learning_rate": 0.0002, + "loss": 0.7358, + "step": 10320 + }, + { + "epoch": 1.670034758709886, + "grad_norm": 0.585508406162262, + "learning_rate": 0.0002, + "loss": 0.7074, + "step": 10330 + }, + { + "epoch": 1.6716514428906313, + "grad_norm": 0.6930184364318848, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 10340 + }, + { + "epoch": 1.6732681270713767, + "grad_norm": 0.575663149356842, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 10350 + }, + { + "epoch": 1.674884811252122, + "grad_norm": 0.582502543926239, + "learning_rate": 0.0002, + "loss": 0.7198, + "step": 10360 + }, + { + "epoch": 1.6765014954328672, + "grad_norm": 0.5668916702270508, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 10370 + }, + { + "epoch": 1.6781181796136124, + "grad_norm": 0.6070065498352051, + "learning_rate": 0.0002, + "loss": 0.7478, + "step": 10380 + }, + { + "epoch": 1.6797348637943577, + "grad_norm": 0.6141316294670105, + "learning_rate": 0.0002, + "loss": 0.7939, + "step": 10390 + }, + { + "epoch": 1.6813515479751031, + "grad_norm": 0.8359124064445496, + "learning_rate": 0.0002, + "loss": 0.7573, + "step": 10400 + }, + { + "epoch": 1.6829682321558483, + "grad_norm": 0.5378185510635376, + "learning_rate": 0.0002, + "loss": 0.7488, + "step": 10410 + }, + { + "epoch": 1.6845849163365938, + "grad_norm": 0.6959536075592041, + "learning_rate": 0.0002, + "loss": 0.7588, + "step": 10420 + }, + { + "epoch": 1.686201600517339, + "grad_norm": 0.6514357328414917, + "learning_rate": 0.0002, + "loss": 0.7872, + "step": 10430 + }, + { + "epoch": 1.6878182846980843, + "grad_norm": 0.7706646919250488, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 10440 + }, + { + "epoch": 1.6894349688788295, + "grad_norm": 0.6183337569236755, + "learning_rate": 0.0002, + "loss": 0.7673, + "step": 10450 + }, + { + "epoch": 1.6910516530595747, + "grad_norm": 0.6123278141021729, + "learning_rate": 0.0002, + "loss": 0.7566, + "step": 10460 + }, + { + "epoch": 1.69266833724032, + "grad_norm": 0.6894851326942444, + "learning_rate": 0.0002, + "loss": 0.7169, + "step": 10470 + }, + { + "epoch": 1.6942850214210654, + "grad_norm": 0.7497312426567078, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 10480 + }, + { + "epoch": 1.6959017056018106, + "grad_norm": 0.5968214273452759, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 10490 + }, + { + "epoch": 1.697518389782556, + "grad_norm": 0.6747927069664001, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 10500 + }, + { + "epoch": 1.6991350739633013, + "grad_norm": 0.5708310008049011, + "learning_rate": 0.0002, + "loss": 0.7415, + "step": 10510 + }, + { + "epoch": 1.7007517581440466, + "grad_norm": 0.606526792049408, + "learning_rate": 0.0002, + "loss": 0.7385, + "step": 10520 + }, + { + "epoch": 1.7023684423247918, + "grad_norm": 0.662011981010437, + "learning_rate": 0.0002, + "loss": 0.7204, + "step": 10530 + }, + { + "epoch": 1.703985126505537, + "grad_norm": 0.7583045363426208, + "learning_rate": 0.0002, + "loss": 0.7999, + "step": 10540 + }, + { + "epoch": 1.7056018106862825, + "grad_norm": 0.721632182598114, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 10550 + }, + { + "epoch": 1.7072184948670277, + "grad_norm": 0.6107715368270874, + "learning_rate": 0.0002, + "loss": 0.7407, + "step": 10560 + }, + { + "epoch": 1.7088351790477732, + "grad_norm": 0.6652471423149109, + "learning_rate": 0.0002, + "loss": 0.7519, + "step": 10570 + }, + { + "epoch": 1.7104518632285184, + "grad_norm": 0.6308087110519409, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 10580 + }, + { + "epoch": 1.7120685474092636, + "grad_norm": 0.5464386940002441, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 10590 + }, + { + "epoch": 1.7136852315900089, + "grad_norm": 0.6558911204338074, + "learning_rate": 0.0002, + "loss": 0.7063, + "step": 10600 + }, + { + "epoch": 1.715301915770754, + "grad_norm": 0.5665024518966675, + "learning_rate": 0.0002, + "loss": 0.7126, + "step": 10610 + }, + { + "epoch": 1.7169185999514993, + "grad_norm": 0.7888094186782837, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 10620 + }, + { + "epoch": 1.7185352841322448, + "grad_norm": 0.7084909081459045, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 10630 + }, + { + "epoch": 1.7201519683129902, + "grad_norm": 0.7982324361801147, + "learning_rate": 0.0002, + "loss": 0.7557, + "step": 10640 + }, + { + "epoch": 1.7217686524937355, + "grad_norm": 0.6418732404708862, + "learning_rate": 0.0002, + "loss": 0.7345, + "step": 10650 + }, + { + "epoch": 1.7233853366744807, + "grad_norm": 0.7636681795120239, + "learning_rate": 0.0002, + "loss": 0.7734, + "step": 10660 + }, + { + "epoch": 1.725002020855226, + "grad_norm": 0.5646875500679016, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 10670 + }, + { + "epoch": 1.7266187050359711, + "grad_norm": 0.5231260657310486, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10680 + }, + { + "epoch": 1.7282353892167164, + "grad_norm": 0.7635011672973633, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 10690 + }, + { + "epoch": 1.7298520733974618, + "grad_norm": 0.7518259286880493, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 10700 + }, + { + "epoch": 1.731468757578207, + "grad_norm": 0.7295602560043335, + "learning_rate": 0.0002, + "loss": 0.751, + "step": 10710 + }, + { + "epoch": 1.7330854417589525, + "grad_norm": 0.6984632015228271, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 10720 + }, + { + "epoch": 1.7347021259396977, + "grad_norm": 0.6198219060897827, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 10730 + }, + { + "epoch": 1.736318810120443, + "grad_norm": 0.6957576274871826, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 10740 + }, + { + "epoch": 1.7379354943011882, + "grad_norm": 0.6430263519287109, + "learning_rate": 0.0002, + "loss": 0.7917, + "step": 10750 + }, + { + "epoch": 1.7395521784819334, + "grad_norm": 0.6134995222091675, + "learning_rate": 0.0002, + "loss": 0.7156, + "step": 10760 + }, + { + "epoch": 1.741168862662679, + "grad_norm": 0.7209452986717224, + "learning_rate": 0.0002, + "loss": 0.7584, + "step": 10770 + }, + { + "epoch": 1.7427855468434241, + "grad_norm": 0.6735447645187378, + "learning_rate": 0.0002, + "loss": 0.7528, + "step": 10780 + }, + { + "epoch": 1.7444022310241696, + "grad_norm": 0.5605693459510803, + "learning_rate": 0.0002, + "loss": 0.756, + "step": 10790 + }, + { + "epoch": 1.7460189152049148, + "grad_norm": 0.6882363557815552, + "learning_rate": 0.0002, + "loss": 0.7759, + "step": 10800 + }, + { + "epoch": 1.74763559938566, + "grad_norm": 0.6386259198188782, + "learning_rate": 0.0002, + "loss": 0.7544, + "step": 10810 + }, + { + "epoch": 1.7492522835664053, + "grad_norm": 0.6529015302658081, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 10820 + }, + { + "epoch": 1.7508689677471505, + "grad_norm": 0.5664082765579224, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 10830 + }, + { + "epoch": 1.7524856519278957, + "grad_norm": 0.7532684206962585, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 10840 + }, + { + "epoch": 1.7541023361086412, + "grad_norm": 0.77171391248703, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 10850 + }, + { + "epoch": 1.7557190202893864, + "grad_norm": 0.7255431413650513, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 10860 + }, + { + "epoch": 1.7573357044701319, + "grad_norm": 0.763083279132843, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 10870 + }, + { + "epoch": 1.758952388650877, + "grad_norm": 0.6042402982711792, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 10880 + }, + { + "epoch": 1.7605690728316223, + "grad_norm": 0.7642518281936646, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 10890 + }, + { + "epoch": 1.7621857570123676, + "grad_norm": 0.6347904801368713, + "learning_rate": 0.0002, + "loss": 0.746, + "step": 10900 + }, + { + "epoch": 1.7638024411931128, + "grad_norm": 0.5371627807617188, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 10910 + }, + { + "epoch": 1.7654191253738583, + "grad_norm": 0.6840225458145142, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 10920 + }, + { + "epoch": 1.7670358095546035, + "grad_norm": 0.5288469195365906, + "learning_rate": 0.0002, + "loss": 0.725, + "step": 10930 + }, + { + "epoch": 1.768652493735349, + "grad_norm": 0.69020676612854, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 10940 + }, + { + "epoch": 1.7702691779160942, + "grad_norm": 0.5943242311477661, + "learning_rate": 0.0002, + "loss": 0.7468, + "step": 10950 + }, + { + "epoch": 1.7718858620968394, + "grad_norm": 0.5616418123245239, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 10960 + }, + { + "epoch": 1.7735025462775846, + "grad_norm": 0.7209470868110657, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 10970 + }, + { + "epoch": 1.7751192304583299, + "grad_norm": 0.6657957434654236, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 10980 + }, + { + "epoch": 1.776735914639075, + "grad_norm": 0.6469064950942993, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 10990 + }, + { + "epoch": 1.7783525988198206, + "grad_norm": 0.6615678071975708, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 11000 + }, + { + "epoch": 1.779969283000566, + "grad_norm": 0.6722439527511597, + "learning_rate": 0.0002, + "loss": 0.747, + "step": 11010 + }, + { + "epoch": 1.7815859671813112, + "grad_norm": 0.634136974811554, + "learning_rate": 0.0002, + "loss": 0.7302, + "step": 11020 + }, + { + "epoch": 1.7832026513620565, + "grad_norm": 0.6024377346038818, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 11030 + }, + { + "epoch": 1.7848193355428017, + "grad_norm": 0.6909403800964355, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 11040 + }, + { + "epoch": 1.786436019723547, + "grad_norm": 0.7148767709732056, + "learning_rate": 0.0002, + "loss": 0.7471, + "step": 11050 + }, + { + "epoch": 1.7880527039042922, + "grad_norm": 0.7442979216575623, + "learning_rate": 0.0002, + "loss": 0.7145, + "step": 11060 + }, + { + "epoch": 1.7896693880850376, + "grad_norm": 0.6830431818962097, + "learning_rate": 0.0002, + "loss": 0.7215, + "step": 11070 + }, + { + "epoch": 1.7912860722657828, + "grad_norm": 0.9172667264938354, + "learning_rate": 0.0002, + "loss": 0.7625, + "step": 11080 + }, + { + "epoch": 1.7929027564465283, + "grad_norm": 0.6799490451812744, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 11090 + }, + { + "epoch": 1.7945194406272735, + "grad_norm": 0.7617024779319763, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 11100 + }, + { + "epoch": 1.7961361248080188, + "grad_norm": 0.7701810002326965, + "learning_rate": 0.0002, + "loss": 0.7586, + "step": 11110 + }, + { + "epoch": 1.797752808988764, + "grad_norm": 0.7454385757446289, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 11120 + }, + { + "epoch": 1.7993694931695092, + "grad_norm": 0.6121436953544617, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 11130 + }, + { + "epoch": 1.8009861773502547, + "grad_norm": 0.6237571835517883, + "learning_rate": 0.0002, + "loss": 0.7305, + "step": 11140 + }, + { + "epoch": 1.802602861531, + "grad_norm": 0.6818515658378601, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 11150 + }, + { + "epoch": 1.8042195457117454, + "grad_norm": 0.7768308520317078, + "learning_rate": 0.0002, + "loss": 0.6876, + "step": 11160 + }, + { + "epoch": 1.8058362298924906, + "grad_norm": 0.6875537633895874, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 11170 + }, + { + "epoch": 1.8074529140732358, + "grad_norm": 0.7950584888458252, + "learning_rate": 0.0002, + "loss": 0.761, + "step": 11180 + }, + { + "epoch": 1.809069598253981, + "grad_norm": 0.8210248351097107, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 11190 + }, + { + "epoch": 1.8106862824347263, + "grad_norm": 0.6674110889434814, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 11200 + }, + { + "epoch": 1.8123029666154715, + "grad_norm": 0.6261674761772156, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 11210 + }, + { + "epoch": 1.813919650796217, + "grad_norm": 0.6484741568565369, + "learning_rate": 0.0002, + "loss": 0.7122, + "step": 11220 + }, + { + "epoch": 1.8155363349769622, + "grad_norm": 0.6231244206428528, + "learning_rate": 0.0002, + "loss": 0.7718, + "step": 11230 + }, + { + "epoch": 1.8171530191577077, + "grad_norm": 0.7243146896362305, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 11240 + }, + { + "epoch": 1.818769703338453, + "grad_norm": 0.6776193380355835, + "learning_rate": 0.0002, + "loss": 0.7448, + "step": 11250 + }, + { + "epoch": 1.8203863875191981, + "grad_norm": 0.5973618030548096, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 11260 + }, + { + "epoch": 1.8220030716999434, + "grad_norm": 0.6451361179351807, + "learning_rate": 0.0002, + "loss": 0.7961, + "step": 11270 + }, + { + "epoch": 1.8236197558806886, + "grad_norm": 0.5963068008422852, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 11280 + }, + { + "epoch": 1.825236440061434, + "grad_norm": 0.536902129650116, + "learning_rate": 0.0002, + "loss": 0.7466, + "step": 11290 + }, + { + "epoch": 1.8268531242421793, + "grad_norm": 0.6993787288665771, + "learning_rate": 0.0002, + "loss": 0.708, + "step": 11300 + }, + { + "epoch": 1.8284698084229247, + "grad_norm": 0.6135255098342896, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 11310 + }, + { + "epoch": 1.83008649260367, + "grad_norm": 0.6057423949241638, + "learning_rate": 0.0002, + "loss": 0.7423, + "step": 11320 + }, + { + "epoch": 1.8317031767844152, + "grad_norm": 0.6598812341690063, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 11330 + }, + { + "epoch": 1.8333198609651604, + "grad_norm": 0.6075948476791382, + "learning_rate": 0.0002, + "loss": 0.7278, + "step": 11340 + }, + { + "epoch": 1.8349365451459057, + "grad_norm": 0.7065447568893433, + "learning_rate": 0.0002, + "loss": 0.7846, + "step": 11350 + }, + { + "epoch": 1.8365532293266509, + "grad_norm": 0.680526614189148, + "learning_rate": 0.0002, + "loss": 0.7365, + "step": 11360 + }, + { + "epoch": 1.8381699135073963, + "grad_norm": 0.6356695294380188, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 11370 + }, + { + "epoch": 1.8397865976881416, + "grad_norm": 0.6399052143096924, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 11380 + }, + { + "epoch": 1.841403281868887, + "grad_norm": 0.6125704050064087, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 11390 + }, + { + "epoch": 1.8430199660496323, + "grad_norm": 0.7124643325805664, + "learning_rate": 0.0002, + "loss": 0.755, + "step": 11400 + }, + { + "epoch": 1.8446366502303775, + "grad_norm": 0.6099604964256287, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 11410 + }, + { + "epoch": 1.8462533344111227, + "grad_norm": 0.7338208556175232, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 11420 + }, + { + "epoch": 1.847870018591868, + "grad_norm": 0.7534668445587158, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 11430 + }, + { + "epoch": 1.8494867027726134, + "grad_norm": 0.6135470271110535, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 11440 + }, + { + "epoch": 1.8511033869533586, + "grad_norm": 0.6229309439659119, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 11450 + }, + { + "epoch": 1.852720071134104, + "grad_norm": 0.706423282623291, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 11460 + }, + { + "epoch": 1.8543367553148493, + "grad_norm": 0.5460049510002136, + "learning_rate": 0.0002, + "loss": 0.7411, + "step": 11470 + }, + { + "epoch": 1.8559534394955945, + "grad_norm": 0.6616711020469666, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 11480 + }, + { + "epoch": 1.8575701236763398, + "grad_norm": 0.6372783184051514, + "learning_rate": 0.0002, + "loss": 0.729, + "step": 11490 + }, + { + "epoch": 1.859186807857085, + "grad_norm": 0.7162668108940125, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 11500 + }, + { + "epoch": 1.8608034920378305, + "grad_norm": 0.6605209708213806, + "learning_rate": 0.0002, + "loss": 0.7747, + "step": 11510 + }, + { + "epoch": 1.8624201762185757, + "grad_norm": 0.6933956742286682, + "learning_rate": 0.0002, + "loss": 0.7258, + "step": 11520 + }, + { + "epoch": 1.8640368603993211, + "grad_norm": 0.6582090854644775, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 11530 + }, + { + "epoch": 1.8656535445800664, + "grad_norm": 0.6416500806808472, + "learning_rate": 0.0002, + "loss": 0.7313, + "step": 11540 + }, + { + "epoch": 1.8672702287608116, + "grad_norm": 0.5434312224388123, + "learning_rate": 0.0002, + "loss": 0.7372, + "step": 11550 + }, + { + "epoch": 1.8688869129415568, + "grad_norm": 0.6827567219734192, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 11560 + }, + { + "epoch": 1.870503597122302, + "grad_norm": 0.7354370951652527, + "learning_rate": 0.0002, + "loss": 0.7137, + "step": 11570 + }, + { + "epoch": 1.8721202813030473, + "grad_norm": 0.590372622013092, + "learning_rate": 0.0002, + "loss": 0.7526, + "step": 11580 + }, + { + "epoch": 1.8737369654837928, + "grad_norm": 0.853183925151825, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 11590 + }, + { + "epoch": 1.875353649664538, + "grad_norm": 0.822678804397583, + "learning_rate": 0.0002, + "loss": 0.7487, + "step": 11600 + }, + { + "epoch": 1.8769703338452834, + "grad_norm": 0.6591550707817078, + "learning_rate": 0.0002, + "loss": 0.7427, + "step": 11610 + }, + { + "epoch": 1.8785870180260287, + "grad_norm": 0.7475301623344421, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 11620 + }, + { + "epoch": 1.880203702206774, + "grad_norm": 0.6390765309333801, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 11630 + }, + { + "epoch": 1.8818203863875191, + "grad_norm": 0.6589758992195129, + "learning_rate": 0.0002, + "loss": 0.7531, + "step": 11640 + }, + { + "epoch": 1.8834370705682644, + "grad_norm": 0.6765508651733398, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 11650 + }, + { + "epoch": 1.8850537547490098, + "grad_norm": 0.6527857780456543, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 11660 + }, + { + "epoch": 1.886670438929755, + "grad_norm": 0.6642923951148987, + "learning_rate": 0.0002, + "loss": 0.7504, + "step": 11670 + }, + { + "epoch": 1.8882871231105005, + "grad_norm": 0.6945584416389465, + "learning_rate": 0.0002, + "loss": 0.7701, + "step": 11680 + }, + { + "epoch": 1.8899038072912457, + "grad_norm": 0.694018542766571, + "learning_rate": 0.0002, + "loss": 0.7711, + "step": 11690 + }, + { + "epoch": 1.891520491471991, + "grad_norm": 0.7237417101860046, + "learning_rate": 0.0002, + "loss": 0.7195, + "step": 11700 + }, + { + "epoch": 1.8931371756527362, + "grad_norm": 0.7401309609413147, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 11710 + }, + { + "epoch": 1.8947538598334814, + "grad_norm": 0.6537784337997437, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 11720 + }, + { + "epoch": 1.8963705440142267, + "grad_norm": 0.7398539185523987, + "learning_rate": 0.0002, + "loss": 0.793, + "step": 11730 + }, + { + "epoch": 1.8979872281949721, + "grad_norm": 0.6696075797080994, + "learning_rate": 0.0002, + "loss": 0.7561, + "step": 11740 + }, + { + "epoch": 1.8996039123757174, + "grad_norm": 0.6014142036437988, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 11750 + }, + { + "epoch": 1.9012205965564628, + "grad_norm": 0.7023524641990662, + "learning_rate": 0.0002, + "loss": 0.7714, + "step": 11760 + }, + { + "epoch": 1.902837280737208, + "grad_norm": 0.739973783493042, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 11770 + }, + { + "epoch": 1.9044539649179533, + "grad_norm": 0.5576770901679993, + "learning_rate": 0.0002, + "loss": 0.7848, + "step": 11780 + }, + { + "epoch": 1.9060706490986985, + "grad_norm": 0.6907393932342529, + "learning_rate": 0.0002, + "loss": 0.7483, + "step": 11790 + }, + { + "epoch": 1.9076873332794437, + "grad_norm": 0.6934581995010376, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 11800 + }, + { + "epoch": 1.9093040174601892, + "grad_norm": 0.591774582862854, + "learning_rate": 0.0002, + "loss": 0.7199, + "step": 11810 + }, + { + "epoch": 1.9109207016409344, + "grad_norm": 0.6249791383743286, + "learning_rate": 0.0002, + "loss": 0.7333, + "step": 11820 + }, + { + "epoch": 1.9125373858216799, + "grad_norm": 0.6755744218826294, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 11830 + }, + { + "epoch": 1.914154070002425, + "grad_norm": 0.7286285161972046, + "learning_rate": 0.0002, + "loss": 0.696, + "step": 11840 + }, + { + "epoch": 1.9157707541831703, + "grad_norm": 0.7867850065231323, + "learning_rate": 0.0002, + "loss": 0.7509, + "step": 11850 + }, + { + "epoch": 1.9173874383639156, + "grad_norm": 0.6283972859382629, + "learning_rate": 0.0002, + "loss": 0.735, + "step": 11860 + }, + { + "epoch": 1.9190041225446608, + "grad_norm": 0.605823814868927, + "learning_rate": 0.0002, + "loss": 0.7296, + "step": 11870 + }, + { + "epoch": 1.920620806725406, + "grad_norm": 0.5927976965904236, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 11880 + }, + { + "epoch": 1.9222374909061515, + "grad_norm": 0.5974002480506897, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 11890 + }, + { + "epoch": 1.923854175086897, + "grad_norm": 0.7091866135597229, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 11900 + }, + { + "epoch": 1.9254708592676422, + "grad_norm": 0.72496497631073, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 11910 + }, + { + "epoch": 1.9270875434483874, + "grad_norm": 0.6131896376609802, + "learning_rate": 0.0002, + "loss": 0.7153, + "step": 11920 + }, + { + "epoch": 1.9287042276291326, + "grad_norm": 0.6556436419487, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 11930 + }, + { + "epoch": 1.9303209118098779, + "grad_norm": 0.622932493686676, + "learning_rate": 0.0002, + "loss": 0.7319, + "step": 11940 + }, + { + "epoch": 1.931937595990623, + "grad_norm": 0.6618631482124329, + "learning_rate": 0.0002, + "loss": 0.7592, + "step": 11950 + }, + { + "epoch": 1.9335542801713685, + "grad_norm": 0.630966305732727, + "learning_rate": 0.0002, + "loss": 0.8332, + "step": 11960 + }, + { + "epoch": 1.9351709643521138, + "grad_norm": 0.6336734890937805, + "learning_rate": 0.0002, + "loss": 0.6854, + "step": 11970 + }, + { + "epoch": 1.9367876485328592, + "grad_norm": 0.655403196811676, + "learning_rate": 0.0002, + "loss": 0.7433, + "step": 11980 + }, + { + "epoch": 1.9384043327136045, + "grad_norm": 0.5640574097633362, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 11990 + }, + { + "epoch": 1.9400210168943497, + "grad_norm": 0.6322951316833496, + "learning_rate": 0.0002, + "loss": 0.7289, + "step": 12000 + }, + { + "epoch": 1.941637701075095, + "grad_norm": 0.615703821182251, + "learning_rate": 0.0002, + "loss": 0.7627, + "step": 12010 + }, + { + "epoch": 1.9432543852558402, + "grad_norm": 0.6487536430358887, + "learning_rate": 0.0002, + "loss": 0.786, + "step": 12020 + }, + { + "epoch": 1.9448710694365856, + "grad_norm": 0.9209630489349365, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 12030 + }, + { + "epoch": 1.9464877536173308, + "grad_norm": 0.67485511302948, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 12040 + }, + { + "epoch": 1.9481044377980763, + "grad_norm": 0.6831230521202087, + "learning_rate": 0.0002, + "loss": 0.7551, + "step": 12050 + }, + { + "epoch": 1.9497211219788215, + "grad_norm": 0.6578302383422852, + "learning_rate": 0.0002, + "loss": 0.7546, + "step": 12060 + }, + { + "epoch": 1.9513378061595668, + "grad_norm": 0.9975938200950623, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 12070 + }, + { + "epoch": 1.952954490340312, + "grad_norm": 0.6637365221977234, + "learning_rate": 0.0002, + "loss": 0.7952, + "step": 12080 + }, + { + "epoch": 1.9545711745210572, + "grad_norm": 0.605707049369812, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 12090 + }, + { + "epoch": 1.9561878587018025, + "grad_norm": 0.6584440469741821, + "learning_rate": 0.0002, + "loss": 0.7768, + "step": 12100 + }, + { + "epoch": 1.957804542882548, + "grad_norm": 0.6070835590362549, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 12110 + }, + { + "epoch": 1.9594212270632931, + "grad_norm": 0.7862601280212402, + "learning_rate": 0.0002, + "loss": 0.7491, + "step": 12120 + }, + { + "epoch": 1.9610379112440386, + "grad_norm": 0.8175255060195923, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 12130 + }, + { + "epoch": 1.9626545954247838, + "grad_norm": 0.5648472905158997, + "learning_rate": 0.0002, + "loss": 0.7242, + "step": 12140 + }, + { + "epoch": 1.964271279605529, + "grad_norm": 0.6591973304748535, + "learning_rate": 0.0002, + "loss": 0.7321, + "step": 12150 + }, + { + "epoch": 1.9658879637862743, + "grad_norm": 0.5960676074028015, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 12160 + }, + { + "epoch": 1.9675046479670195, + "grad_norm": 0.7272544503211975, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 12170 + }, + { + "epoch": 1.969121332147765, + "grad_norm": 0.7176699042320251, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 12180 + }, + { + "epoch": 1.9707380163285102, + "grad_norm": 0.6927123665809631, + "learning_rate": 0.0002, + "loss": 0.7525, + "step": 12190 + }, + { + "epoch": 1.9723547005092557, + "grad_norm": 0.5536034107208252, + "learning_rate": 0.0002, + "loss": 0.7318, + "step": 12200 + }, + { + "epoch": 1.9739713846900009, + "grad_norm": 0.8348390460014343, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 12210 + }, + { + "epoch": 1.9755880688707461, + "grad_norm": 0.6591181755065918, + "learning_rate": 0.0002, + "loss": 0.7494, + "step": 12220 + }, + { + "epoch": 1.9772047530514913, + "grad_norm": 1.0624109506607056, + "learning_rate": 0.0002, + "loss": 0.763, + "step": 12230 + }, + { + "epoch": 1.9788214372322366, + "grad_norm": 0.9265586137771606, + "learning_rate": 0.0002, + "loss": 0.7541, + "step": 12240 + }, + { + "epoch": 1.9804381214129818, + "grad_norm": 0.5998196005821228, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 12250 + }, + { + "epoch": 1.9820548055937273, + "grad_norm": 0.6960851550102234, + "learning_rate": 0.0002, + "loss": 0.7225, + "step": 12260 + }, + { + "epoch": 1.9836714897744727, + "grad_norm": 0.7674502730369568, + "learning_rate": 0.0002, + "loss": 0.7398, + "step": 12270 + }, + { + "epoch": 1.985288173955218, + "grad_norm": 0.6407275795936584, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 12280 + }, + { + "epoch": 1.9869048581359632, + "grad_norm": 0.6673079133033752, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 12290 + }, + { + "epoch": 1.9885215423167084, + "grad_norm": 0.6989844441413879, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 12300 + }, + { + "epoch": 1.9901382264974536, + "grad_norm": 0.7564442157745361, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 12310 + }, + { + "epoch": 1.9917549106781989, + "grad_norm": 0.6385478973388672, + "learning_rate": 0.0002, + "loss": 0.7719, + "step": 12320 + }, + { + "epoch": 1.9933715948589443, + "grad_norm": 0.7193717956542969, + "learning_rate": 0.0002, + "loss": 0.7369, + "step": 12330 + }, + { + "epoch": 1.9949882790396896, + "grad_norm": 0.7987112402915955, + "learning_rate": 0.0002, + "loss": 0.7583, + "step": 12340 + }, + { + "epoch": 1.996604963220435, + "grad_norm": 0.7260826826095581, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 12350 + }, + { + "epoch": 1.9982216474011802, + "grad_norm": 0.7968255281448364, + "learning_rate": 0.0002, + "loss": 0.7505, + "step": 12360 + }, + { + "epoch": 1.9998383315819255, + "grad_norm": 0.6893062591552734, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 12370 + }, + { + "epoch": 2.0, + "eval_loss": 1.1044032573699951, + "eval_runtime": 122.1508, + "eval_samples_per_second": 6.001, + "eval_steps_per_second": 0.753, + "step": 12371 + }, + { + "epoch": 2.0014550157626707, + "grad_norm": 0.7775409817695618, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 12380 + }, + { + "epoch": 2.003071699943416, + "grad_norm": 0.76218581199646, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 12390 + }, + { + "epoch": 2.004688384124161, + "grad_norm": 0.5677764415740967, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 12400 + }, + { + "epoch": 2.006305068304907, + "grad_norm": 0.808442234992981, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 12410 + }, + { + "epoch": 2.007921752485652, + "grad_norm": 0.7144765257835388, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 12420 + }, + { + "epoch": 2.0095384366663973, + "grad_norm": 0.6914031505584717, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 12430 + }, + { + "epoch": 2.0111551208471425, + "grad_norm": 0.7581454515457153, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 12440 + }, + { + "epoch": 2.0127718050278878, + "grad_norm": 0.8388504981994629, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 12450 + }, + { + "epoch": 2.014388489208633, + "grad_norm": 0.6716406941413879, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 12460 + }, + { + "epoch": 2.0160051733893782, + "grad_norm": 0.898902416229248, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 12470 + }, + { + "epoch": 2.0176218575701235, + "grad_norm": 0.6432679891586304, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 12480 + }, + { + "epoch": 2.019238541750869, + "grad_norm": 0.8021109104156494, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 12490 + }, + { + "epoch": 2.0208552259316144, + "grad_norm": 0.7039216756820679, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 12500 + }, + { + "epoch": 2.0224719101123596, + "grad_norm": 0.646531879901886, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 12510 + }, + { + "epoch": 2.024088594293105, + "grad_norm": 0.783704400062561, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 12520 + }, + { + "epoch": 2.02570527847385, + "grad_norm": 0.8805046677589417, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 12530 + }, + { + "epoch": 2.0273219626545953, + "grad_norm": 0.7289270758628845, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 12540 + }, + { + "epoch": 2.0289386468353405, + "grad_norm": 0.71653151512146, + "learning_rate": 0.0002, + "loss": 0.6663, + "step": 12550 + }, + { + "epoch": 2.030555331016086, + "grad_norm": 0.73281329870224, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 12560 + }, + { + "epoch": 2.0321720151968314, + "grad_norm": 0.6657090187072754, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 12570 + }, + { + "epoch": 2.0337886993775767, + "grad_norm": 0.8241133093833923, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 12580 + }, + { + "epoch": 2.035405383558322, + "grad_norm": 0.5834135413169861, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 12590 + }, + { + "epoch": 2.037022067739067, + "grad_norm": 0.84502112865448, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 12600 + }, + { + "epoch": 2.0386387519198124, + "grad_norm": 0.8952481746673584, + "learning_rate": 0.0002, + "loss": 0.6349, + "step": 12610 + }, + { + "epoch": 2.0402554361005576, + "grad_norm": 0.7801461815834045, + "learning_rate": 0.0002, + "loss": 0.6923, + "step": 12620 + }, + { + "epoch": 2.041872120281303, + "grad_norm": 0.6788367033004761, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 12630 + }, + { + "epoch": 2.0434888044620485, + "grad_norm": 0.7241756319999695, + "learning_rate": 0.0002, + "loss": 0.6162, + "step": 12640 + }, + { + "epoch": 2.0451054886427937, + "grad_norm": 0.6933388113975525, + "learning_rate": 0.0002, + "loss": 0.655, + "step": 12650 + }, + { + "epoch": 2.046722172823539, + "grad_norm": 0.8029746413230896, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 12660 + }, + { + "epoch": 2.048338857004284, + "grad_norm": 0.946399986743927, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 12670 + }, + { + "epoch": 2.0499555411850294, + "grad_norm": 0.7072678804397583, + "learning_rate": 0.0002, + "loss": 0.638, + "step": 12680 + }, + { + "epoch": 2.0515722253657747, + "grad_norm": 0.6810618042945862, + "learning_rate": 0.0002, + "loss": 0.6487, + "step": 12690 + }, + { + "epoch": 2.05318890954652, + "grad_norm": 0.7661160230636597, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 12700 + }, + { + "epoch": 2.0548055937272656, + "grad_norm": 0.6350653767585754, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 12710 + }, + { + "epoch": 2.056422277908011, + "grad_norm": 0.861890971660614, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 12720 + }, + { + "epoch": 2.058038962088756, + "grad_norm": 0.6489875912666321, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 12730 + }, + { + "epoch": 2.0596556462695013, + "grad_norm": 0.8268506526947021, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 12740 + }, + { + "epoch": 2.0612723304502465, + "grad_norm": 0.607679545879364, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 12750 + }, + { + "epoch": 2.0628890146309917, + "grad_norm": 0.6754153370857239, + "learning_rate": 0.0002, + "loss": 0.6649, + "step": 12760 + }, + { + "epoch": 2.064505698811737, + "grad_norm": 0.7263124585151672, + "learning_rate": 0.0002, + "loss": 0.6549, + "step": 12770 + }, + { + "epoch": 2.0661223829924826, + "grad_norm": 0.6986154317855835, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 12780 + }, + { + "epoch": 2.067739067173228, + "grad_norm": 0.7768576741218567, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 12790 + }, + { + "epoch": 2.069355751353973, + "grad_norm": 0.7546762824058533, + "learning_rate": 0.0002, + "loss": 0.677, + "step": 12800 + }, + { + "epoch": 2.0709724355347183, + "grad_norm": 0.7588880062103271, + "learning_rate": 0.0002, + "loss": 0.6485, + "step": 12810 + }, + { + "epoch": 2.0725891197154636, + "grad_norm": 0.7457242608070374, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 12820 + }, + { + "epoch": 2.074205803896209, + "grad_norm": 0.6983516812324524, + "learning_rate": 0.0002, + "loss": 0.6489, + "step": 12830 + }, + { + "epoch": 2.075822488076954, + "grad_norm": 0.7950928807258606, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 12840 + }, + { + "epoch": 2.0774391722576993, + "grad_norm": 0.9248087406158447, + "learning_rate": 0.0002, + "loss": 0.6603, + "step": 12850 + }, + { + "epoch": 2.079055856438445, + "grad_norm": 0.7229493260383606, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 12860 + }, + { + "epoch": 2.08067254061919, + "grad_norm": 0.5710847973823547, + "learning_rate": 0.0002, + "loss": 0.6702, + "step": 12870 + }, + { + "epoch": 2.0822892247999354, + "grad_norm": 0.9580423831939697, + "learning_rate": 0.0002, + "loss": 0.6974, + "step": 12880 + }, + { + "epoch": 2.0839059089806806, + "grad_norm": 0.7399665713310242, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 12890 + }, + { + "epoch": 2.085522593161426, + "grad_norm": 0.7981410622596741, + "learning_rate": 0.0002, + "loss": 0.6993, + "step": 12900 + }, + { + "epoch": 2.087139277342171, + "grad_norm": 0.870759904384613, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 12910 + }, + { + "epoch": 2.0887559615229163, + "grad_norm": 0.7001481652259827, + "learning_rate": 0.0002, + "loss": 0.7194, + "step": 12920 + }, + { + "epoch": 2.090372645703662, + "grad_norm": 0.6745418310165405, + "learning_rate": 0.0002, + "loss": 0.6383, + "step": 12930 + }, + { + "epoch": 2.0919893298844072, + "grad_norm": 0.7739067673683167, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 12940 + }, + { + "epoch": 2.0936060140651525, + "grad_norm": 0.6742934584617615, + "learning_rate": 0.0002, + "loss": 0.6856, + "step": 12950 + }, + { + "epoch": 2.0952226982458977, + "grad_norm": 0.7270349860191345, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 12960 + }, + { + "epoch": 2.096839382426643, + "grad_norm": 0.7150624394416809, + "learning_rate": 0.0002, + "loss": 0.6783, + "step": 12970 + }, + { + "epoch": 2.098456066607388, + "grad_norm": 0.7734767198562622, + "learning_rate": 0.0002, + "loss": 0.6093, + "step": 12980 + }, + { + "epoch": 2.1000727507881334, + "grad_norm": 0.7618662118911743, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 12990 + }, + { + "epoch": 2.101689434968879, + "grad_norm": 0.6557944416999817, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 13000 + }, + { + "epoch": 2.1033061191496243, + "grad_norm": 0.8786448240280151, + "learning_rate": 0.0002, + "loss": 0.7268, + "step": 13010 + }, + { + "epoch": 2.1049228033303695, + "grad_norm": 0.6878724098205566, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 13020 + }, + { + "epoch": 2.1065394875111147, + "grad_norm": 0.822318971157074, + "learning_rate": 0.0002, + "loss": 0.6824, + "step": 13030 + }, + { + "epoch": 2.10815617169186, + "grad_norm": 0.831468939781189, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 13040 + }, + { + "epoch": 2.109772855872605, + "grad_norm": 0.7699505686759949, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 13050 + }, + { + "epoch": 2.1113895400533504, + "grad_norm": 0.7559016346931458, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 13060 + }, + { + "epoch": 2.1130062242340957, + "grad_norm": 0.6942209601402283, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 13070 + }, + { + "epoch": 2.1146229084148414, + "grad_norm": 0.6098947525024414, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 13080 + }, + { + "epoch": 2.1162395925955866, + "grad_norm": 0.6499016284942627, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 13090 + }, + { + "epoch": 2.117856276776332, + "grad_norm": 0.7719953060150146, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 13100 + }, + { + "epoch": 2.119472960957077, + "grad_norm": 0.6708134412765503, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 13110 + }, + { + "epoch": 2.1210896451378223, + "grad_norm": 0.8119585514068604, + "learning_rate": 0.0002, + "loss": 0.6056, + "step": 13120 + }, + { + "epoch": 2.1227063293185675, + "grad_norm": 0.6947157979011536, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 13130 + }, + { + "epoch": 2.1243230134993127, + "grad_norm": 0.8831837773323059, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 13140 + }, + { + "epoch": 2.1259396976800584, + "grad_norm": 0.7266910672187805, + "learning_rate": 0.0002, + "loss": 0.6997, + "step": 13150 + }, + { + "epoch": 2.1275563818608036, + "grad_norm": 0.8864351511001587, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 13160 + }, + { + "epoch": 2.129173066041549, + "grad_norm": 0.8104248046875, + "learning_rate": 0.0002, + "loss": 0.6762, + "step": 13170 + }, + { + "epoch": 2.130789750222294, + "grad_norm": 0.6077079772949219, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 13180 + }, + { + "epoch": 2.1324064344030393, + "grad_norm": 0.6874213814735413, + "learning_rate": 0.0002, + "loss": 0.6572, + "step": 13190 + }, + { + "epoch": 2.1340231185837846, + "grad_norm": 0.7134367823600769, + "learning_rate": 0.0002, + "loss": 0.642, + "step": 13200 + }, + { + "epoch": 2.13563980276453, + "grad_norm": 0.6101235151290894, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 13210 + }, + { + "epoch": 2.137256486945275, + "grad_norm": 0.6042411923408508, + "learning_rate": 0.0002, + "loss": 0.6529, + "step": 13220 + }, + { + "epoch": 2.1388731711260207, + "grad_norm": 0.914601743221283, + "learning_rate": 0.0002, + "loss": 0.7179, + "step": 13230 + }, + { + "epoch": 2.140489855306766, + "grad_norm": 0.7104284167289734, + "learning_rate": 0.0002, + "loss": 0.6513, + "step": 13240 + }, + { + "epoch": 2.142106539487511, + "grad_norm": 0.664395272731781, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 13250 + }, + { + "epoch": 2.1437232236682564, + "grad_norm": 0.6991241574287415, + "learning_rate": 0.0002, + "loss": 0.7211, + "step": 13260 + }, + { + "epoch": 2.1453399078490016, + "grad_norm": 0.5469560623168945, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 13270 + }, + { + "epoch": 2.146956592029747, + "grad_norm": 0.8454998135566711, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 13280 + }, + { + "epoch": 2.148573276210492, + "grad_norm": 0.7088868618011475, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 13290 + }, + { + "epoch": 2.1501899603912378, + "grad_norm": 0.7002687454223633, + "learning_rate": 0.0002, + "loss": 0.6835, + "step": 13300 + }, + { + "epoch": 2.151806644571983, + "grad_norm": 0.7785214781761169, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 13310 + }, + { + "epoch": 2.1534233287527282, + "grad_norm": 0.8049132227897644, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 13320 + }, + { + "epoch": 2.1550400129334735, + "grad_norm": 0.8062595129013062, + "learning_rate": 0.0002, + "loss": 0.6495, + "step": 13330 + }, + { + "epoch": 2.1566566971142187, + "grad_norm": 0.6208319067955017, + "learning_rate": 0.0002, + "loss": 0.6603, + "step": 13340 + }, + { + "epoch": 2.158273381294964, + "grad_norm": 0.7519655823707581, + "learning_rate": 0.0002, + "loss": 0.6584, + "step": 13350 + }, + { + "epoch": 2.159890065475709, + "grad_norm": 0.7645747065544128, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 13360 + }, + { + "epoch": 2.1615067496564544, + "grad_norm": 0.6847302913665771, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 13370 + }, + { + "epoch": 2.1631234338372, + "grad_norm": 0.8630441427230835, + "learning_rate": 0.0002, + "loss": 0.6903, + "step": 13380 + }, + { + "epoch": 2.1647401180179453, + "grad_norm": 0.7947702407836914, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 13390 + }, + { + "epoch": 2.1663568021986905, + "grad_norm": 0.6836977005004883, + "learning_rate": 0.0002, + "loss": 0.7206, + "step": 13400 + }, + { + "epoch": 2.1679734863794358, + "grad_norm": 0.7340566515922546, + "learning_rate": 0.0002, + "loss": 0.6304, + "step": 13410 + }, + { + "epoch": 2.169590170560181, + "grad_norm": 0.7075738906860352, + "learning_rate": 0.0002, + "loss": 0.6528, + "step": 13420 + }, + { + "epoch": 2.1712068547409262, + "grad_norm": 0.7080879807472229, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 13430 + }, + { + "epoch": 2.1728235389216715, + "grad_norm": 0.6218613386154175, + "learning_rate": 0.0002, + "loss": 0.6615, + "step": 13440 + }, + { + "epoch": 2.174440223102417, + "grad_norm": 0.8211479187011719, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 13450 + }, + { + "epoch": 2.1760569072831624, + "grad_norm": 0.864466667175293, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 13460 + }, + { + "epoch": 2.1776735914639076, + "grad_norm": 0.7943857908248901, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 13470 + }, + { + "epoch": 2.179290275644653, + "grad_norm": 0.78728187084198, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 13480 + }, + { + "epoch": 2.180906959825398, + "grad_norm": 0.697527289390564, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 13490 + }, + { + "epoch": 2.1825236440061433, + "grad_norm": 0.8205804228782654, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 13500 + }, + { + "epoch": 2.1841403281868885, + "grad_norm": 0.8709042072296143, + "learning_rate": 0.0002, + "loss": 0.7227, + "step": 13510 + }, + { + "epoch": 2.1857570123676338, + "grad_norm": 0.6228537559509277, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 13520 + }, + { + "epoch": 2.1873736965483794, + "grad_norm": 0.9566980004310608, + "learning_rate": 0.0002, + "loss": 0.7025, + "step": 13530 + }, + { + "epoch": 2.1889903807291247, + "grad_norm": 0.7128894329071045, + "learning_rate": 0.0002, + "loss": 0.6755, + "step": 13540 + }, + { + "epoch": 2.19060706490987, + "grad_norm": 0.6888654232025146, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 13550 + }, + { + "epoch": 2.192223749090615, + "grad_norm": 0.6444337368011475, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 13560 + }, + { + "epoch": 2.1938404332713604, + "grad_norm": 0.8008806705474854, + "learning_rate": 0.0002, + "loss": 0.656, + "step": 13570 + }, + { + "epoch": 2.1954571174521056, + "grad_norm": 0.8482748866081238, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 13580 + }, + { + "epoch": 2.197073801632851, + "grad_norm": 0.8584157228469849, + "learning_rate": 0.0002, + "loss": 0.7326, + "step": 13590 + }, + { + "epoch": 2.1986904858135965, + "grad_norm": 0.7513734698295593, + "learning_rate": 0.0002, + "loss": 0.7014, + "step": 13600 + }, + { + "epoch": 2.2003071699943417, + "grad_norm": 0.7864262461662292, + "learning_rate": 0.0002, + "loss": 0.6632, + "step": 13610 + }, + { + "epoch": 2.201923854175087, + "grad_norm": 0.8493645191192627, + "learning_rate": 0.0002, + "loss": 0.6879, + "step": 13620 + }, + { + "epoch": 2.203540538355832, + "grad_norm": 0.6902140974998474, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 13630 + }, + { + "epoch": 2.2051572225365774, + "grad_norm": 0.8711254596710205, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 13640 + }, + { + "epoch": 2.2067739067173227, + "grad_norm": 0.7832191586494446, + "learning_rate": 0.0002, + "loss": 0.6359, + "step": 13650 + }, + { + "epoch": 2.208390590898068, + "grad_norm": 0.5668176412582397, + "learning_rate": 0.0002, + "loss": 0.6723, + "step": 13660 + }, + { + "epoch": 2.2100072750788136, + "grad_norm": 0.8648375272750854, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 13670 + }, + { + "epoch": 2.211623959259559, + "grad_norm": 0.7643089890480042, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 13680 + }, + { + "epoch": 2.213240643440304, + "grad_norm": 0.6293777823448181, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 13690 + }, + { + "epoch": 2.2148573276210493, + "grad_norm": 0.6459372639656067, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 13700 + }, + { + "epoch": 2.2164740118017945, + "grad_norm": 0.7060744166374207, + "learning_rate": 0.0002, + "loss": 0.6526, + "step": 13710 + }, + { + "epoch": 2.2180906959825397, + "grad_norm": 0.674109160900116, + "learning_rate": 0.0002, + "loss": 0.7101, + "step": 13720 + }, + { + "epoch": 2.219707380163285, + "grad_norm": 0.830392062664032, + "learning_rate": 0.0002, + "loss": 0.6529, + "step": 13730 + }, + { + "epoch": 2.2213240643440306, + "grad_norm": 0.6474477052688599, + "learning_rate": 0.0002, + "loss": 0.6733, + "step": 13740 + }, + { + "epoch": 2.222940748524776, + "grad_norm": 0.7037909626960754, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 13750 + }, + { + "epoch": 2.224557432705521, + "grad_norm": 0.6554131507873535, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 13760 + }, + { + "epoch": 2.2261741168862663, + "grad_norm": 0.7822230458259583, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 13770 + }, + { + "epoch": 2.2277908010670116, + "grad_norm": 0.9082167744636536, + "learning_rate": 0.0002, + "loss": 0.6505, + "step": 13780 + }, + { + "epoch": 2.229407485247757, + "grad_norm": 0.7918276190757751, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 13790 + }, + { + "epoch": 2.231024169428502, + "grad_norm": 0.7354569435119629, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 13800 + }, + { + "epoch": 2.2326408536092472, + "grad_norm": 0.8265249133110046, + "learning_rate": 0.0002, + "loss": 0.6503, + "step": 13810 + }, + { + "epoch": 2.234257537789993, + "grad_norm": 0.6653847098350525, + "learning_rate": 0.0002, + "loss": 0.6871, + "step": 13820 + }, + { + "epoch": 2.235874221970738, + "grad_norm": 0.7157923579216003, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 13830 + }, + { + "epoch": 2.2374909061514834, + "grad_norm": 0.7110323309898376, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 13840 + }, + { + "epoch": 2.2391075903322286, + "grad_norm": 0.7155357599258423, + "learning_rate": 0.0002, + "loss": 0.6913, + "step": 13850 + }, + { + "epoch": 2.240724274512974, + "grad_norm": 1.0177817344665527, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 13860 + }, + { + "epoch": 2.242340958693719, + "grad_norm": 0.7601948380470276, + "learning_rate": 0.0002, + "loss": 0.635, + "step": 13870 + }, + { + "epoch": 2.2439576428744643, + "grad_norm": 0.7628820538520813, + "learning_rate": 0.0002, + "loss": 0.6679, + "step": 13880 + }, + { + "epoch": 2.24557432705521, + "grad_norm": 0.7089297771453857, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 13890 + }, + { + "epoch": 2.247191011235955, + "grad_norm": 0.695178210735321, + "learning_rate": 0.0002, + "loss": 0.7236, + "step": 13900 + }, + { + "epoch": 2.2488076954167004, + "grad_norm": 0.7631948590278625, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 13910 + }, + { + "epoch": 2.2504243795974457, + "grad_norm": 0.8203101754188538, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 13920 + }, + { + "epoch": 2.252041063778191, + "grad_norm": 0.8099079728126526, + "learning_rate": 0.0002, + "loss": 0.653, + "step": 13930 + }, + { + "epoch": 2.253657747958936, + "grad_norm": 0.6498546004295349, + "learning_rate": 0.0002, + "loss": 0.694, + "step": 13940 + }, + { + "epoch": 2.2552744321396814, + "grad_norm": 0.7797415256500244, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 13950 + }, + { + "epoch": 2.2568911163204266, + "grad_norm": 0.8254124522209167, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 13960 + }, + { + "epoch": 2.2585078005011723, + "grad_norm": 0.6327953338623047, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 13970 + }, + { + "epoch": 2.2601244846819175, + "grad_norm": 0.734194278717041, + "learning_rate": 0.0002, + "loss": 0.668, + "step": 13980 + }, + { + "epoch": 2.2617411688626627, + "grad_norm": 0.9014202952384949, + "learning_rate": 0.0002, + "loss": 0.6912, + "step": 13990 + }, + { + "epoch": 2.263357853043408, + "grad_norm": 0.7643631100654602, + "learning_rate": 0.0002, + "loss": 0.692, + "step": 14000 + }, + { + "epoch": 2.264974537224153, + "grad_norm": 0.8882834911346436, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 14010 + }, + { + "epoch": 2.2665912214048984, + "grad_norm": 0.7975873351097107, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 14020 + }, + { + "epoch": 2.2682079055856437, + "grad_norm": 0.7765783071517944, + "learning_rate": 0.0002, + "loss": 0.7193, + "step": 14030 + }, + { + "epoch": 2.2698245897663893, + "grad_norm": 0.8846288323402405, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 14040 + }, + { + "epoch": 2.2714412739471346, + "grad_norm": 0.9006744027137756, + "learning_rate": 0.0002, + "loss": 0.6494, + "step": 14050 + }, + { + "epoch": 2.27305795812788, + "grad_norm": 0.7420173287391663, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 14060 + }, + { + "epoch": 2.274674642308625, + "grad_norm": 0.7956424951553345, + "learning_rate": 0.0002, + "loss": 0.7068, + "step": 14070 + }, + { + "epoch": 2.2762913264893703, + "grad_norm": 0.7783209085464478, + "learning_rate": 0.0002, + "loss": 0.6581, + "step": 14080 + }, + { + "epoch": 2.2779080106701155, + "grad_norm": 0.7597188949584961, + "learning_rate": 0.0002, + "loss": 0.7202, + "step": 14090 + }, + { + "epoch": 2.2795246948508607, + "grad_norm": 0.6718921661376953, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 14100 + }, + { + "epoch": 2.281141379031606, + "grad_norm": 0.7528082132339478, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 14110 + }, + { + "epoch": 2.2827580632123516, + "grad_norm": 0.8379864692687988, + "learning_rate": 0.0002, + "loss": 0.7608, + "step": 14120 + }, + { + "epoch": 2.284374747393097, + "grad_norm": 0.748613715171814, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 14130 + }, + { + "epoch": 2.285991431573842, + "grad_norm": 0.7435423135757446, + "learning_rate": 0.0002, + "loss": 0.6641, + "step": 14140 + }, + { + "epoch": 2.2876081157545873, + "grad_norm": 0.7580803632736206, + "learning_rate": 0.0002, + "loss": 0.6849, + "step": 14150 + }, + { + "epoch": 2.2892247999353326, + "grad_norm": 0.6278321146965027, + "learning_rate": 0.0002, + "loss": 0.6604, + "step": 14160 + }, + { + "epoch": 2.290841484116078, + "grad_norm": 0.7663896083831787, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 14170 + }, + { + "epoch": 2.292458168296823, + "grad_norm": 0.9716812372207642, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 14180 + }, + { + "epoch": 2.2940748524775687, + "grad_norm": 0.8993458151817322, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 14190 + }, + { + "epoch": 2.295691536658314, + "grad_norm": 0.6156117916107178, + "learning_rate": 0.0002, + "loss": 0.6172, + "step": 14200 + }, + { + "epoch": 2.297308220839059, + "grad_norm": 0.8911278247833252, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 14210 + }, + { + "epoch": 2.2989249050198044, + "grad_norm": 0.6422147154808044, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 14220 + }, + { + "epoch": 2.3005415892005496, + "grad_norm": 0.6866879463195801, + "learning_rate": 0.0002, + "loss": 0.6795, + "step": 14230 + }, + { + "epoch": 2.302158273381295, + "grad_norm": 0.9297130107879639, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 14240 + }, + { + "epoch": 2.30377495756204, + "grad_norm": 0.7501356601715088, + "learning_rate": 0.0002, + "loss": 0.6823, + "step": 14250 + }, + { + "epoch": 2.3053916417427853, + "grad_norm": 0.8363515138626099, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 14260 + }, + { + "epoch": 2.307008325923531, + "grad_norm": 0.9083868265151978, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 14270 + }, + { + "epoch": 2.3086250101042762, + "grad_norm": 0.7791516780853271, + "learning_rate": 0.0002, + "loss": 0.6862, + "step": 14280 + }, + { + "epoch": 2.3102416942850215, + "grad_norm": 0.8766953349113464, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 14290 + }, + { + "epoch": 2.3118583784657667, + "grad_norm": 0.7916635274887085, + "learning_rate": 0.0002, + "loss": 0.6698, + "step": 14300 + }, + { + "epoch": 2.313475062646512, + "grad_norm": 0.627525269985199, + "learning_rate": 0.0002, + "loss": 0.6927, + "step": 14310 + }, + { + "epoch": 2.315091746827257, + "grad_norm": 0.8856783509254456, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 14320 + }, + { + "epoch": 2.316708431008003, + "grad_norm": 0.6758689284324646, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 14330 + }, + { + "epoch": 2.318325115188748, + "grad_norm": 0.6428321003913879, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 14340 + }, + { + "epoch": 2.3199417993694933, + "grad_norm": 0.9032121300697327, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 14350 + }, + { + "epoch": 2.3215584835502385, + "grad_norm": 0.8035986423492432, + "learning_rate": 0.0002, + "loss": 0.6569, + "step": 14360 + }, + { + "epoch": 2.3231751677309838, + "grad_norm": 0.7974579334259033, + "learning_rate": 0.0002, + "loss": 0.7067, + "step": 14370 + }, + { + "epoch": 2.324791851911729, + "grad_norm": 0.8356034755706787, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 14380 + }, + { + "epoch": 2.326408536092474, + "grad_norm": 0.998760998249054, + "learning_rate": 0.0002, + "loss": 0.6623, + "step": 14390 + }, + { + "epoch": 2.3280252202732195, + "grad_norm": 0.6518142223358154, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 14400 + }, + { + "epoch": 2.3296419044539647, + "grad_norm": 0.7443506717681885, + "learning_rate": 0.0002, + "loss": 0.7146, + "step": 14410 + }, + { + "epoch": 2.3312585886347104, + "grad_norm": 0.8436172604560852, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 14420 + }, + { + "epoch": 2.3328752728154556, + "grad_norm": 0.7411080598831177, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 14430 + }, + { + "epoch": 2.334491956996201, + "grad_norm": 0.8839048743247986, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 14440 + }, + { + "epoch": 2.336108641176946, + "grad_norm": 0.8360885977745056, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 14450 + }, + { + "epoch": 2.3377253253576913, + "grad_norm": 0.7608986496925354, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 14460 + }, + { + "epoch": 2.3393420095384365, + "grad_norm": 0.8179867267608643, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 14470 + }, + { + "epoch": 2.340958693719182, + "grad_norm": 0.5989999771118164, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14480 + }, + { + "epoch": 2.3425753778999274, + "grad_norm": 0.9450054168701172, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 14490 + }, + { + "epoch": 2.3441920620806727, + "grad_norm": 0.7885149717330933, + "learning_rate": 0.0002, + "loss": 0.6983, + "step": 14500 + }, + { + "epoch": 2.345808746261418, + "grad_norm": 0.8152616620063782, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 14510 + }, + { + "epoch": 2.347425430442163, + "grad_norm": 0.7193838953971863, + "learning_rate": 0.0002, + "loss": 0.6989, + "step": 14520 + }, + { + "epoch": 2.3490421146229084, + "grad_norm": 0.6701092720031738, + "learning_rate": 0.0002, + "loss": 0.6594, + "step": 14530 + }, + { + "epoch": 2.3506587988036536, + "grad_norm": 0.7529364228248596, + "learning_rate": 0.0002, + "loss": 0.6559, + "step": 14540 + }, + { + "epoch": 2.352275482984399, + "grad_norm": 0.6599733829498291, + "learning_rate": 0.0002, + "loss": 0.6306, + "step": 14550 + }, + { + "epoch": 2.353892167165144, + "grad_norm": 0.9502474069595337, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 14560 + }, + { + "epoch": 2.3555088513458897, + "grad_norm": 0.7619650959968567, + "learning_rate": 0.0002, + "loss": 0.717, + "step": 14570 + }, + { + "epoch": 2.357125535526635, + "grad_norm": 0.9854652285575867, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 14580 + }, + { + "epoch": 2.35874221970738, + "grad_norm": 0.727439284324646, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 14590 + }, + { + "epoch": 2.3603589038881254, + "grad_norm": 0.6994746327400208, + "learning_rate": 0.0002, + "loss": 0.6645, + "step": 14600 + }, + { + "epoch": 2.3619755880688706, + "grad_norm": 0.7117531299591064, + "learning_rate": 0.0002, + "loss": 0.6587, + "step": 14610 + }, + { + "epoch": 2.363592272249616, + "grad_norm": 0.6403067708015442, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 14620 + }, + { + "epoch": 2.3652089564303616, + "grad_norm": 0.8377841711044312, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 14630 + }, + { + "epoch": 2.366825640611107, + "grad_norm": 0.749171257019043, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 14640 + }, + { + "epoch": 2.368442324791852, + "grad_norm": 0.8418586254119873, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 14650 + }, + { + "epoch": 2.3700590089725972, + "grad_norm": 0.6178573369979858, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 14660 + }, + { + "epoch": 2.3716756931533425, + "grad_norm": 0.6368302702903748, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 14670 + }, + { + "epoch": 2.3732923773340877, + "grad_norm": 0.9122977256774902, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 14680 + }, + { + "epoch": 2.374909061514833, + "grad_norm": 0.7086195349693298, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 14690 + }, + { + "epoch": 2.376525745695578, + "grad_norm": 0.7500800490379333, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 14700 + }, + { + "epoch": 2.378142429876324, + "grad_norm": 0.6634900569915771, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 14710 + }, + { + "epoch": 2.379759114057069, + "grad_norm": 0.839898407459259, + "learning_rate": 0.0002, + "loss": 0.6614, + "step": 14720 + }, + { + "epoch": 2.3813757982378143, + "grad_norm": 0.7578426003456116, + "learning_rate": 0.0002, + "loss": 0.6453, + "step": 14730 + }, + { + "epoch": 2.3829924824185595, + "grad_norm": 1.0213173627853394, + "learning_rate": 0.0002, + "loss": 0.7282, + "step": 14740 + }, + { + "epoch": 2.3846091665993048, + "grad_norm": 0.7855949401855469, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 14750 + }, + { + "epoch": 2.38622585078005, + "grad_norm": 0.7224128842353821, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 14760 + }, + { + "epoch": 2.3878425349607952, + "grad_norm": 0.8040381669998169, + "learning_rate": 0.0002, + "loss": 0.7017, + "step": 14770 + }, + { + "epoch": 2.389459219141541, + "grad_norm": 0.7705281376838684, + "learning_rate": 0.0002, + "loss": 0.6799, + "step": 14780 + }, + { + "epoch": 2.391075903322286, + "grad_norm": 0.667966902256012, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 14790 + }, + { + "epoch": 2.3926925875030314, + "grad_norm": 0.6611011028289795, + "learning_rate": 0.0002, + "loss": 0.7061, + "step": 14800 + }, + { + "epoch": 2.3943092716837766, + "grad_norm": 0.6862651705741882, + "learning_rate": 0.0002, + "loss": 0.6527, + "step": 14810 + }, + { + "epoch": 2.395925955864522, + "grad_norm": 0.8086010217666626, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 14820 + }, + { + "epoch": 2.397542640045267, + "grad_norm": 0.7189689874649048, + "learning_rate": 0.0002, + "loss": 0.7189, + "step": 14830 + }, + { + "epoch": 2.3991593242260123, + "grad_norm": 0.6280009150505066, + "learning_rate": 0.0002, + "loss": 0.6709, + "step": 14840 + }, + { + "epoch": 2.4007760084067575, + "grad_norm": 0.7826612591743469, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 14850 + }, + { + "epoch": 2.402392692587503, + "grad_norm": 0.7681610584259033, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 14860 + }, + { + "epoch": 2.4040093767682484, + "grad_norm": 0.720966100692749, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 14870 + }, + { + "epoch": 2.4056260609489937, + "grad_norm": 0.8202250599861145, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 14880 + }, + { + "epoch": 2.407242745129739, + "grad_norm": 0.786212682723999, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 14890 + }, + { + "epoch": 2.408859429310484, + "grad_norm": 0.6647164821624756, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 14900 + }, + { + "epoch": 2.4104761134912294, + "grad_norm": 0.7566399574279785, + "learning_rate": 0.0002, + "loss": 0.6706, + "step": 14910 + }, + { + "epoch": 2.4120927976719746, + "grad_norm": 0.748814582824707, + "learning_rate": 0.0002, + "loss": 0.7188, + "step": 14920 + }, + { + "epoch": 2.4137094818527203, + "grad_norm": 0.7624038457870483, + "learning_rate": 0.0002, + "loss": 0.6684, + "step": 14930 + }, + { + "epoch": 2.4153261660334655, + "grad_norm": 0.8267335295677185, + "learning_rate": 0.0002, + "loss": 0.6483, + "step": 14940 + }, + { + "epoch": 2.4169428502142107, + "grad_norm": 0.8785360455513, + "learning_rate": 0.0002, + "loss": 0.6612, + "step": 14950 + }, + { + "epoch": 2.418559534394956, + "grad_norm": 0.679887592792511, + "learning_rate": 0.0002, + "loss": 0.6718, + "step": 14960 + }, + { + "epoch": 2.420176218575701, + "grad_norm": 0.7218474745750427, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 14970 + }, + { + "epoch": 2.4217929027564464, + "grad_norm": 0.6342799663543701, + "learning_rate": 0.0002, + "loss": 0.648, + "step": 14980 + }, + { + "epoch": 2.4234095869371917, + "grad_norm": 0.7098712921142578, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 14990 + }, + { + "epoch": 2.425026271117937, + "grad_norm": 0.7497431635856628, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 15000 + }, + { + "epoch": 2.4266429552986826, + "grad_norm": 0.934836208820343, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 15010 + }, + { + "epoch": 2.428259639479428, + "grad_norm": 0.8430966734886169, + "learning_rate": 0.0002, + "loss": 0.7221, + "step": 15020 + }, + { + "epoch": 2.429876323660173, + "grad_norm": 0.7032104730606079, + "learning_rate": 0.0002, + "loss": 0.6985, + "step": 15030 + }, + { + "epoch": 2.4314930078409183, + "grad_norm": 0.7746111750602722, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 15040 + }, + { + "epoch": 2.4331096920216635, + "grad_norm": 0.7661406397819519, + "learning_rate": 0.0002, + "loss": 0.7177, + "step": 15050 + }, + { + "epoch": 2.4347263762024087, + "grad_norm": 0.6941645741462708, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 15060 + }, + { + "epoch": 2.436343060383154, + "grad_norm": 0.7487249374389648, + "learning_rate": 0.0002, + "loss": 0.6421, + "step": 15070 + }, + { + "epoch": 2.4379597445638996, + "grad_norm": 0.7639912962913513, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 15080 + }, + { + "epoch": 2.439576428744645, + "grad_norm": 0.7708953619003296, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 15090 + }, + { + "epoch": 2.44119311292539, + "grad_norm": 0.9135832190513611, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 15100 + }, + { + "epoch": 2.4428097971061353, + "grad_norm": 0.8283005356788635, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 15110 + }, + { + "epoch": 2.4444264812868806, + "grad_norm": 0.925299346446991, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 15120 + }, + { + "epoch": 2.446043165467626, + "grad_norm": 0.7013528943061829, + "learning_rate": 0.0002, + "loss": 0.687, + "step": 15130 + }, + { + "epoch": 2.447659849648371, + "grad_norm": 0.622303307056427, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 15140 + }, + { + "epoch": 2.4492765338291163, + "grad_norm": 0.876569390296936, + "learning_rate": 0.0002, + "loss": 0.6676, + "step": 15150 + }, + { + "epoch": 2.450893218009862, + "grad_norm": 0.6836351752281189, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 15160 + }, + { + "epoch": 2.452509902190607, + "grad_norm": 0.7886684536933899, + "learning_rate": 0.0002, + "loss": 0.6781, + "step": 15170 + }, + { + "epoch": 2.4541265863713524, + "grad_norm": 0.6647440791130066, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 15180 + }, + { + "epoch": 2.4557432705520976, + "grad_norm": 0.7477722764015198, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 15190 + }, + { + "epoch": 2.457359954732843, + "grad_norm": 0.8192033767700195, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 15200 + }, + { + "epoch": 2.458976638913588, + "grad_norm": 0.847537100315094, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 15210 + }, + { + "epoch": 2.4605933230943338, + "grad_norm": 0.9027776122093201, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 15220 + }, + { + "epoch": 2.462210007275079, + "grad_norm": 0.7217772006988525, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 15230 + }, + { + "epoch": 2.4638266914558242, + "grad_norm": 0.7994546294212341, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 15240 + }, + { + "epoch": 2.4654433756365695, + "grad_norm": 0.939916729927063, + "learning_rate": 0.0002, + "loss": 0.6828, + "step": 15250 + }, + { + "epoch": 2.4670600598173147, + "grad_norm": 1.0009053945541382, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 15260 + }, + { + "epoch": 2.46867674399806, + "grad_norm": 0.625555694103241, + "learning_rate": 0.0002, + "loss": 0.643, + "step": 15270 + }, + { + "epoch": 2.470293428178805, + "grad_norm": 0.7924878597259521, + "learning_rate": 0.0002, + "loss": 0.688, + "step": 15280 + }, + { + "epoch": 2.4719101123595504, + "grad_norm": 0.8536689877510071, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 15290 + }, + { + "epoch": 2.4735267965402956, + "grad_norm": 0.8572589755058289, + "learning_rate": 0.0002, + "loss": 0.6924, + "step": 15300 + }, + { + "epoch": 2.4751434807210413, + "grad_norm": 0.773279070854187, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 15310 + }, + { + "epoch": 2.4767601649017865, + "grad_norm": 0.7708749771118164, + "learning_rate": 0.0002, + "loss": 0.6573, + "step": 15320 + }, + { + "epoch": 2.4783768490825318, + "grad_norm": 0.770905077457428, + "learning_rate": 0.0002, + "loss": 0.7065, + "step": 15330 + }, + { + "epoch": 2.479993533263277, + "grad_norm": 0.8238571882247925, + "learning_rate": 0.0002, + "loss": 0.6878, + "step": 15340 + }, + { + "epoch": 2.481610217444022, + "grad_norm": 0.7670477032661438, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 15350 + }, + { + "epoch": 2.4832269016247674, + "grad_norm": 0.905036985874176, + "learning_rate": 0.0002, + "loss": 0.7759, + "step": 15360 + }, + { + "epoch": 2.484843585805513, + "grad_norm": 0.6672089695930481, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 15370 + }, + { + "epoch": 2.4864602699862584, + "grad_norm": 0.625095784664154, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 15380 + }, + { + "epoch": 2.4880769541670036, + "grad_norm": 0.679772675037384, + "learning_rate": 0.0002, + "loss": 0.6396, + "step": 15390 + }, + { + "epoch": 2.489693638347749, + "grad_norm": 0.711492121219635, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 15400 + }, + { + "epoch": 2.491310322528494, + "grad_norm": 0.876189112663269, + "learning_rate": 0.0002, + "loss": 0.6966, + "step": 15410 + }, + { + "epoch": 2.4929270067092393, + "grad_norm": 0.7236915230751038, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 15420 + }, + { + "epoch": 2.4945436908899845, + "grad_norm": 0.6629832983016968, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 15430 + }, + { + "epoch": 2.4961603750707297, + "grad_norm": 0.9756859540939331, + "learning_rate": 0.0002, + "loss": 0.6669, + "step": 15440 + }, + { + "epoch": 2.4977770592514754, + "grad_norm": 0.6896940469741821, + "learning_rate": 0.0002, + "loss": 0.7559, + "step": 15450 + }, + { + "epoch": 2.4993937434322206, + "grad_norm": 0.7105149626731873, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 15460 + }, + { + "epoch": 2.501010427612966, + "grad_norm": 0.8374546766281128, + "learning_rate": 0.0002, + "loss": 0.6859, + "step": 15470 + }, + { + "epoch": 2.502627111793711, + "grad_norm": 0.7320070266723633, + "learning_rate": 0.0002, + "loss": 0.6512, + "step": 15480 + }, + { + "epoch": 2.5042437959744563, + "grad_norm": 0.8306367993354797, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 15490 + }, + { + "epoch": 2.5058604801552016, + "grad_norm": 0.7472721338272095, + "learning_rate": 0.0002, + "loss": 0.7253, + "step": 15500 + }, + { + "epoch": 2.507477164335947, + "grad_norm": 0.6147692203521729, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 15510 + }, + { + "epoch": 2.5090938485166925, + "grad_norm": 0.7788505554199219, + "learning_rate": 0.0002, + "loss": 0.7158, + "step": 15520 + }, + { + "epoch": 2.5107105326974377, + "grad_norm": 0.8807527422904968, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 15530 + }, + { + "epoch": 2.512327216878183, + "grad_norm": 0.7521643042564392, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 15540 + }, + { + "epoch": 2.513943901058928, + "grad_norm": 0.6900225281715393, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 15550 + }, + { + "epoch": 2.5155605852396734, + "grad_norm": 0.6601938605308533, + "learning_rate": 0.0002, + "loss": 0.6769, + "step": 15560 + }, + { + "epoch": 2.5171772694204186, + "grad_norm": 0.8179984092712402, + "learning_rate": 0.0002, + "loss": 0.6648, + "step": 15570 + }, + { + "epoch": 2.518793953601164, + "grad_norm": 0.792556881904602, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 15580 + }, + { + "epoch": 2.520410637781909, + "grad_norm": 0.7081938982009888, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 15590 + }, + { + "epoch": 2.5220273219626543, + "grad_norm": 0.8733121156692505, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 15600 + }, + { + "epoch": 2.5236440061434, + "grad_norm": 0.7980992794036865, + "learning_rate": 0.0002, + "loss": 0.6969, + "step": 15610 + }, + { + "epoch": 2.5252606903241452, + "grad_norm": 0.883664071559906, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 15620 + }, + { + "epoch": 2.5268773745048905, + "grad_norm": 0.6963341236114502, + "learning_rate": 0.0002, + "loss": 0.7022, + "step": 15630 + }, + { + "epoch": 2.5284940586856357, + "grad_norm": 0.6433573365211487, + "learning_rate": 0.0002, + "loss": 0.7334, + "step": 15640 + }, + { + "epoch": 2.530110742866381, + "grad_norm": 0.8538183569908142, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 15650 + }, + { + "epoch": 2.5317274270471266, + "grad_norm": 0.9748201370239258, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 15660 + }, + { + "epoch": 2.533344111227872, + "grad_norm": 0.7670575380325317, + "learning_rate": 0.0002, + "loss": 0.6765, + "step": 15670 + }, + { + "epoch": 2.534960795408617, + "grad_norm": 0.8738890290260315, + "learning_rate": 0.0002, + "loss": 0.6435, + "step": 15680 + }, + { + "epoch": 2.5365774795893623, + "grad_norm": 0.8391636610031128, + "learning_rate": 0.0002, + "loss": 0.6802, + "step": 15690 + }, + { + "epoch": 2.5381941637701075, + "grad_norm": 0.7239366769790649, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 15700 + }, + { + "epoch": 2.5398108479508528, + "grad_norm": 0.8498379588127136, + "learning_rate": 0.0002, + "loss": 0.7011, + "step": 15710 + }, + { + "epoch": 2.541427532131598, + "grad_norm": 0.8029484152793884, + "learning_rate": 0.0002, + "loss": 0.6998, + "step": 15720 + }, + { + "epoch": 2.5430442163123432, + "grad_norm": 1.0639333724975586, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 15730 + }, + { + "epoch": 2.5446609004930885, + "grad_norm": 0.6401297450065613, + "learning_rate": 0.0002, + "loss": 0.6341, + "step": 15740 + }, + { + "epoch": 2.5462775846738337, + "grad_norm": 0.7123814821243286, + "learning_rate": 0.0002, + "loss": 0.7196, + "step": 15750 + }, + { + "epoch": 2.5478942688545794, + "grad_norm": 0.7874974608421326, + "learning_rate": 0.0002, + "loss": 0.654, + "step": 15760 + }, + { + "epoch": 2.5495109530353246, + "grad_norm": 0.8046808838844299, + "learning_rate": 0.0002, + "loss": 0.6721, + "step": 15770 + }, + { + "epoch": 2.55112763721607, + "grad_norm": 0.7888661623001099, + "learning_rate": 0.0002, + "loss": 0.6665, + "step": 15780 + }, + { + "epoch": 2.552744321396815, + "grad_norm": 0.8445866107940674, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 15790 + }, + { + "epoch": 2.5543610055775603, + "grad_norm": 0.7475846409797668, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 15800 + }, + { + "epoch": 2.555977689758306, + "grad_norm": 0.7455102801322937, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 15810 + }, + { + "epoch": 2.557594373939051, + "grad_norm": 0.8226983547210693, + "learning_rate": 0.0002, + "loss": 0.6932, + "step": 15820 + }, + { + "epoch": 2.5592110581197964, + "grad_norm": 0.8920368552207947, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 15830 + }, + { + "epoch": 2.5608277423005417, + "grad_norm": 0.8413904905319214, + "learning_rate": 0.0002, + "loss": 0.6297, + "step": 15840 + }, + { + "epoch": 2.562444426481287, + "grad_norm": 0.8483649492263794, + "learning_rate": 0.0002, + "loss": 0.7106, + "step": 15850 + }, + { + "epoch": 2.564061110662032, + "grad_norm": 0.5923284292221069, + "learning_rate": 0.0002, + "loss": 0.6957, + "step": 15860 + }, + { + "epoch": 2.5656777948427774, + "grad_norm": 0.8518726229667664, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 15870 + }, + { + "epoch": 2.5672944790235226, + "grad_norm": 0.731235146522522, + "learning_rate": 0.0002, + "loss": 0.6362, + "step": 15880 + }, + { + "epoch": 2.568911163204268, + "grad_norm": 0.7517194151878357, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 15890 + }, + { + "epoch": 2.5705278473850135, + "grad_norm": 0.8378692269325256, + "learning_rate": 0.0002, + "loss": 0.6907, + "step": 15900 + }, + { + "epoch": 2.5721445315657587, + "grad_norm": 0.843701958656311, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 15910 + }, + { + "epoch": 2.573761215746504, + "grad_norm": 0.7254629731178284, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 15920 + }, + { + "epoch": 2.575377899927249, + "grad_norm": 0.8863335847854614, + "learning_rate": 0.0002, + "loss": 0.6872, + "step": 15930 + }, + { + "epoch": 2.5769945841079944, + "grad_norm": 0.7675097584724426, + "learning_rate": 0.0002, + "loss": 0.6813, + "step": 15940 + }, + { + "epoch": 2.5786112682887397, + "grad_norm": 0.82063889503479, + "learning_rate": 0.0002, + "loss": 0.7357, + "step": 15950 + }, + { + "epoch": 2.5802279524694853, + "grad_norm": 0.7729717493057251, + "learning_rate": 0.0002, + "loss": 0.662, + "step": 15960 + }, + { + "epoch": 2.5818446366502306, + "grad_norm": 0.8301846981048584, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 15970 + }, + { + "epoch": 2.583461320830976, + "grad_norm": 0.7906861305236816, + "learning_rate": 0.0002, + "loss": 0.6897, + "step": 15980 + }, + { + "epoch": 2.585078005011721, + "grad_norm": 0.6749057173728943, + "learning_rate": 0.0002, + "loss": 0.7175, + "step": 15990 + }, + { + "epoch": 2.5866946891924663, + "grad_norm": 0.9386842846870422, + "learning_rate": 0.0002, + "loss": 0.7212, + "step": 16000 + }, + { + "epoch": 2.5883113733732115, + "grad_norm": 0.7868891358375549, + "learning_rate": 0.0002, + "loss": 0.6934, + "step": 16010 + }, + { + "epoch": 2.5899280575539567, + "grad_norm": 0.8674671053886414, + "learning_rate": 0.0002, + "loss": 0.7036, + "step": 16020 + }, + { + "epoch": 2.591544741734702, + "grad_norm": 0.7043559551239014, + "learning_rate": 0.0002, + "loss": 0.7217, + "step": 16030 + }, + { + "epoch": 2.593161425915447, + "grad_norm": 0.5846083760261536, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 16040 + }, + { + "epoch": 2.594778110096193, + "grad_norm": 0.7323982119560242, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 16050 + }, + { + "epoch": 2.596394794276938, + "grad_norm": 0.9069556593894958, + "learning_rate": 0.0002, + "loss": 0.6794, + "step": 16060 + }, + { + "epoch": 2.5980114784576833, + "grad_norm": 0.7522736191749573, + "learning_rate": 0.0002, + "loss": 0.7076, + "step": 16070 + }, + { + "epoch": 2.5996281626384286, + "grad_norm": 0.8149648308753967, + "learning_rate": 0.0002, + "loss": 0.6477, + "step": 16080 + }, + { + "epoch": 2.601244846819174, + "grad_norm": 0.6214233040809631, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 16090 + }, + { + "epoch": 2.602861530999919, + "grad_norm": 0.6803743839263916, + "learning_rate": 0.0002, + "loss": 0.7307, + "step": 16100 + }, + { + "epoch": 2.6044782151806647, + "grad_norm": 0.7223997116088867, + "learning_rate": 0.0002, + "loss": 0.7244, + "step": 16110 + }, + { + "epoch": 2.60609489936141, + "grad_norm": 0.7324174642562866, + "learning_rate": 0.0002, + "loss": 0.6867, + "step": 16120 + }, + { + "epoch": 2.607711583542155, + "grad_norm": 0.9594739675521851, + "learning_rate": 0.0002, + "loss": 0.7159, + "step": 16130 + }, + { + "epoch": 2.6093282677229004, + "grad_norm": 0.9485327005386353, + "learning_rate": 0.0002, + "loss": 0.6451, + "step": 16140 + }, + { + "epoch": 2.6109449519036456, + "grad_norm": 0.8449000120162964, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 16150 + }, + { + "epoch": 2.612561636084391, + "grad_norm": 0.8520140051841736, + "learning_rate": 0.0002, + "loss": 0.7152, + "step": 16160 + }, + { + "epoch": 2.614178320265136, + "grad_norm": 0.7456524968147278, + "learning_rate": 0.0002, + "loss": 0.6759, + "step": 16170 + }, + { + "epoch": 2.6157950044458813, + "grad_norm": 0.9912857413291931, + "learning_rate": 0.0002, + "loss": 0.6893, + "step": 16180 + }, + { + "epoch": 2.6174116886266265, + "grad_norm": 0.9001946449279785, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 16190 + }, + { + "epoch": 2.619028372807372, + "grad_norm": 0.6568667888641357, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 16200 + }, + { + "epoch": 2.6206450569881174, + "grad_norm": 1.0248128175735474, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 16210 + }, + { + "epoch": 2.6222617411688627, + "grad_norm": 0.6509039998054504, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 16220 + }, + { + "epoch": 2.623878425349608, + "grad_norm": 0.7626351118087769, + "learning_rate": 0.0002, + "loss": 0.72, + "step": 16230 + }, + { + "epoch": 2.625495109530353, + "grad_norm": 0.6938552260398865, + "learning_rate": 0.0002, + "loss": 0.6556, + "step": 16240 + }, + { + "epoch": 2.6271117937110984, + "grad_norm": 0.6434680819511414, + "learning_rate": 0.0002, + "loss": 0.65, + "step": 16250 + }, + { + "epoch": 2.628728477891844, + "grad_norm": 0.7111515998840332, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 16260 + }, + { + "epoch": 2.6303451620725893, + "grad_norm": 0.7712395787239075, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 16270 + }, + { + "epoch": 2.6319618462533345, + "grad_norm": 0.792209267616272, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 16280 + }, + { + "epoch": 2.6335785304340797, + "grad_norm": 0.6801066398620605, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 16290 + }, + { + "epoch": 2.635195214614825, + "grad_norm": 0.7802573442459106, + "learning_rate": 0.0002, + "loss": 0.73, + "step": 16300 + }, + { + "epoch": 2.63681189879557, + "grad_norm": 0.7742244601249695, + "learning_rate": 0.0002, + "loss": 0.7484, + "step": 16310 + }, + { + "epoch": 2.6384285829763154, + "grad_norm": 0.664184033870697, + "learning_rate": 0.0002, + "loss": 0.6524, + "step": 16320 + }, + { + "epoch": 2.6400452671570607, + "grad_norm": 0.9242228865623474, + "learning_rate": 0.0002, + "loss": 0.6442, + "step": 16330 + }, + { + "epoch": 2.641661951337806, + "grad_norm": 0.9661325216293335, + "learning_rate": 0.0002, + "loss": 0.6792, + "step": 16340 + }, + { + "epoch": 2.6432786355185516, + "grad_norm": 0.837526798248291, + "learning_rate": 0.0002, + "loss": 0.6847, + "step": 16350 + }, + { + "epoch": 2.644895319699297, + "grad_norm": 1.1834373474121094, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 16360 + }, + { + "epoch": 2.646512003880042, + "grad_norm": 0.7467831373214722, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 16370 + }, + { + "epoch": 2.6481286880607873, + "grad_norm": 0.8627146482467651, + "learning_rate": 0.0002, + "loss": 0.6935, + "step": 16380 + }, + { + "epoch": 2.6497453722415325, + "grad_norm": 0.790447473526001, + "learning_rate": 0.0002, + "loss": 0.715, + "step": 16390 + }, + { + "epoch": 2.651362056422278, + "grad_norm": 0.8447365164756775, + "learning_rate": 0.0002, + "loss": 0.723, + "step": 16400 + }, + { + "epoch": 2.6529787406030234, + "grad_norm": 0.7831417918205261, + "learning_rate": 0.0002, + "loss": 0.6628, + "step": 16410 + }, + { + "epoch": 2.6545954247837686, + "grad_norm": 0.6837952136993408, + "learning_rate": 0.0002, + "loss": 0.6691, + "step": 16420 + }, + { + "epoch": 2.656212108964514, + "grad_norm": 0.7031801342964172, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 16430 + }, + { + "epoch": 2.657828793145259, + "grad_norm": 0.8963770866394043, + "learning_rate": 0.0002, + "loss": 0.7382, + "step": 16440 + }, + { + "epoch": 2.6594454773260043, + "grad_norm": 0.6852328181266785, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 16450 + }, + { + "epoch": 2.6610621615067496, + "grad_norm": 0.8069294095039368, + "learning_rate": 0.0002, + "loss": 0.6278, + "step": 16460 + }, + { + "epoch": 2.662678845687495, + "grad_norm": 0.7503686547279358, + "learning_rate": 0.0002, + "loss": 0.6939, + "step": 16470 + }, + { + "epoch": 2.66429552986824, + "grad_norm": 0.6430956125259399, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 16480 + }, + { + "epoch": 2.6659122140489853, + "grad_norm": 0.7894312739372253, + "learning_rate": 0.0002, + "loss": 0.6863, + "step": 16490 + }, + { + "epoch": 2.667528898229731, + "grad_norm": 0.7277431488037109, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 16500 + }, + { + "epoch": 2.669145582410476, + "grad_norm": 0.6816153526306152, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 16510 + }, + { + "epoch": 2.6707622665912214, + "grad_norm": 0.8145235776901245, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 16520 + }, + { + "epoch": 2.6723789507719666, + "grad_norm": 0.8645890355110168, + "learning_rate": 0.0002, + "loss": 0.709, + "step": 16530 + }, + { + "epoch": 2.673995634952712, + "grad_norm": 0.704393208026886, + "learning_rate": 0.0002, + "loss": 0.6946, + "step": 16540 + }, + { + "epoch": 2.6756123191334575, + "grad_norm": 1.0120846033096313, + "learning_rate": 0.0002, + "loss": 0.6378, + "step": 16550 + }, + { + "epoch": 2.6772290033142028, + "grad_norm": 0.6919328570365906, + "learning_rate": 0.0002, + "loss": 0.7241, + "step": 16560 + }, + { + "epoch": 2.678845687494948, + "grad_norm": 0.6924574971199036, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 16570 + }, + { + "epoch": 2.6804623716756932, + "grad_norm": 0.9679301381111145, + "learning_rate": 0.0002, + "loss": 0.731, + "step": 16580 + }, + { + "epoch": 2.6820790558564385, + "grad_norm": 0.6810211539268494, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 16590 + }, + { + "epoch": 2.6836957400371837, + "grad_norm": 0.9730555415153503, + "learning_rate": 0.0002, + "loss": 0.6688, + "step": 16600 + }, + { + "epoch": 2.685312424217929, + "grad_norm": 0.7852821350097656, + "learning_rate": 0.0002, + "loss": 0.7344, + "step": 16610 + }, + { + "epoch": 2.686929108398674, + "grad_norm": 0.6059057116508484, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 16620 + }, + { + "epoch": 2.6885457925794194, + "grad_norm": 0.9395958781242371, + "learning_rate": 0.0002, + "loss": 0.6796, + "step": 16630 + }, + { + "epoch": 2.690162476760165, + "grad_norm": 0.7473729848861694, + "learning_rate": 0.0002, + "loss": 0.7174, + "step": 16640 + }, + { + "epoch": 2.6917791609409103, + "grad_norm": 0.765934407711029, + "learning_rate": 0.0002, + "loss": 0.7087, + "step": 16650 + }, + { + "epoch": 2.6933958451216555, + "grad_norm": 0.8496677279472351, + "learning_rate": 0.0002, + "loss": 0.707, + "step": 16660 + }, + { + "epoch": 2.6950125293024008, + "grad_norm": 0.7641879916191101, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 16670 + }, + { + "epoch": 2.696629213483146, + "grad_norm": 0.8471952676773071, + "learning_rate": 0.0002, + "loss": 0.6566, + "step": 16680 + }, + { + "epoch": 2.6982458976638912, + "grad_norm": 0.6946060657501221, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 16690 + }, + { + "epoch": 2.699862581844637, + "grad_norm": 0.7361312508583069, + "learning_rate": 0.0002, + "loss": 0.7027, + "step": 16700 + }, + { + "epoch": 2.701479266025382, + "grad_norm": 0.6605038046836853, + "learning_rate": 0.0002, + "loss": 0.6767, + "step": 16710 + }, + { + "epoch": 2.7030959502061274, + "grad_norm": 0.7164411544799805, + "learning_rate": 0.0002, + "loss": 0.6885, + "step": 16720 + }, + { + "epoch": 2.7047126343868726, + "grad_norm": 0.6496201157569885, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 16730 + }, + { + "epoch": 2.706329318567618, + "grad_norm": 0.7826663851737976, + "learning_rate": 0.0002, + "loss": 0.6942, + "step": 16740 + }, + { + "epoch": 2.707946002748363, + "grad_norm": 0.7639131546020508, + "learning_rate": 0.0002, + "loss": 0.6773, + "step": 16750 + }, + { + "epoch": 2.7095626869291083, + "grad_norm": 0.7976210713386536, + "learning_rate": 0.0002, + "loss": 0.69, + "step": 16760 + }, + { + "epoch": 2.7111793711098535, + "grad_norm": 0.6836577653884888, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 16770 + }, + { + "epoch": 2.7127960552905988, + "grad_norm": 0.8025202751159668, + "learning_rate": 0.0002, + "loss": 0.6596, + "step": 16780 + }, + { + "epoch": 2.7144127394713444, + "grad_norm": 0.7636463642120361, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 16790 + }, + { + "epoch": 2.7160294236520897, + "grad_norm": 0.7481677532196045, + "learning_rate": 0.0002, + "loss": 0.6227, + "step": 16800 + }, + { + "epoch": 2.717646107832835, + "grad_norm": 0.7566834688186646, + "learning_rate": 0.0002, + "loss": 0.6925, + "step": 16810 + }, + { + "epoch": 2.71926279201358, + "grad_norm": 0.7931267619132996, + "learning_rate": 0.0002, + "loss": 0.6531, + "step": 16820 + }, + { + "epoch": 2.7208794761943254, + "grad_norm": 0.8811662197113037, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 16830 + }, + { + "epoch": 2.7224961603750706, + "grad_norm": 0.8561240434646606, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 16840 + }, + { + "epoch": 2.7241128445558163, + "grad_norm": 0.7121599316596985, + "learning_rate": 0.0002, + "loss": 0.7135, + "step": 16850 + }, + { + "epoch": 2.7257295287365615, + "grad_norm": 0.8066257238388062, + "learning_rate": 0.0002, + "loss": 0.6825, + "step": 16860 + }, + { + "epoch": 2.7273462129173067, + "grad_norm": 0.7699271440505981, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 16870 + }, + { + "epoch": 2.728962897098052, + "grad_norm": 1.1828432083129883, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 16880 + }, + { + "epoch": 2.730579581278797, + "grad_norm": 0.9989302754402161, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 16890 + }, + { + "epoch": 2.7321962654595424, + "grad_norm": 0.8100560307502747, + "learning_rate": 0.0002, + "loss": 0.7015, + "step": 16900 + }, + { + "epoch": 2.7338129496402876, + "grad_norm": 0.8615233898162842, + "learning_rate": 0.0002, + "loss": 0.6851, + "step": 16910 + }, + { + "epoch": 2.735429633821033, + "grad_norm": 0.8633756041526794, + "learning_rate": 0.0002, + "loss": 0.6322, + "step": 16920 + }, + { + "epoch": 2.737046318001778, + "grad_norm": 0.7769348621368408, + "learning_rate": 0.0002, + "loss": 0.6488, + "step": 16930 + }, + { + "epoch": 2.738663002182524, + "grad_norm": 0.6943058371543884, + "learning_rate": 0.0002, + "loss": 0.6582, + "step": 16940 + }, + { + "epoch": 2.740279686363269, + "grad_norm": 0.8510736227035522, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 16950 + }, + { + "epoch": 2.7418963705440142, + "grad_norm": 0.7732602953910828, + "learning_rate": 0.0002, + "loss": 0.7275, + "step": 16960 + }, + { + "epoch": 2.7435130547247595, + "grad_norm": 0.5981788635253906, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 16970 + }, + { + "epoch": 2.7451297389055047, + "grad_norm": 0.7604416012763977, + "learning_rate": 0.0002, + "loss": 0.6777, + "step": 16980 + }, + { + "epoch": 2.74674642308625, + "grad_norm": 0.7377738356590271, + "learning_rate": 0.0002, + "loss": 0.6981, + "step": 16990 + }, + { + "epoch": 2.7483631072669956, + "grad_norm": 0.9400289058685303, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 17000 + }, + { + "epoch": 2.749979791447741, + "grad_norm": 0.6340599656105042, + "learning_rate": 0.0002, + "loss": 0.6952, + "step": 17010 + }, + { + "epoch": 2.751596475628486, + "grad_norm": 0.7297601103782654, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 17020 + }, + { + "epoch": 2.7532131598092313, + "grad_norm": 0.9479979872703552, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 17030 + }, + { + "epoch": 2.7548298439899765, + "grad_norm": 0.8461511135101318, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 17040 + }, + { + "epoch": 2.7564465281707218, + "grad_norm": 0.7477551698684692, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 17050 + }, + { + "epoch": 2.758063212351467, + "grad_norm": 1.019270420074463, + "learning_rate": 0.0002, + "loss": 0.684, + "step": 17060 + }, + { + "epoch": 2.7596798965322122, + "grad_norm": 0.7730235457420349, + "learning_rate": 0.0002, + "loss": 0.7119, + "step": 17070 + }, + { + "epoch": 2.7612965807129575, + "grad_norm": 0.8216866254806519, + "learning_rate": 0.0002, + "loss": 0.6886, + "step": 17080 + }, + { + "epoch": 2.762913264893703, + "grad_norm": 0.7235931754112244, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 17090 + }, + { + "epoch": 2.7645299490744484, + "grad_norm": 0.7352296710014343, + "learning_rate": 0.0002, + "loss": 0.7031, + "step": 17100 + }, + { + "epoch": 2.7661466332551936, + "grad_norm": 0.8129373788833618, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 17110 + }, + { + "epoch": 2.767763317435939, + "grad_norm": 0.7387019991874695, + "learning_rate": 0.0002, + "loss": 0.6703, + "step": 17120 + }, + { + "epoch": 2.769380001616684, + "grad_norm": 0.9149190187454224, + "learning_rate": 0.0002, + "loss": 0.6789, + "step": 17130 + }, + { + "epoch": 2.7709966857974297, + "grad_norm": 0.7352971434593201, + "learning_rate": 0.0002, + "loss": 0.6038, + "step": 17140 + }, + { + "epoch": 2.772613369978175, + "grad_norm": 0.7903780341148376, + "learning_rate": 0.0002, + "loss": 0.6728, + "step": 17150 + }, + { + "epoch": 2.77423005415892, + "grad_norm": 0.8255927562713623, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 17160 + }, + { + "epoch": 2.7758467383396654, + "grad_norm": 0.7235927581787109, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 17170 + }, + { + "epoch": 2.7774634225204107, + "grad_norm": 0.8281434774398804, + "learning_rate": 0.0002, + "loss": 0.7161, + "step": 17180 + }, + { + "epoch": 2.779080106701156, + "grad_norm": 0.7586921453475952, + "learning_rate": 0.0002, + "loss": 0.682, + "step": 17190 + }, + { + "epoch": 2.780696790881901, + "grad_norm": 0.7161715030670166, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 17200 + }, + { + "epoch": 2.7823134750626464, + "grad_norm": 0.762868344783783, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 17210 + }, + { + "epoch": 2.7839301592433916, + "grad_norm": 0.9285483360290527, + "learning_rate": 0.0002, + "loss": 0.705, + "step": 17220 + }, + { + "epoch": 2.785546843424137, + "grad_norm": 0.6900462508201599, + "learning_rate": 0.0002, + "loss": 0.7084, + "step": 17230 + }, + { + "epoch": 2.7871635276048825, + "grad_norm": 0.780384361743927, + "learning_rate": 0.0002, + "loss": 0.6988, + "step": 17240 + }, + { + "epoch": 2.7887802117856277, + "grad_norm": 0.7580406665802002, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 17250 + }, + { + "epoch": 2.790396895966373, + "grad_norm": 0.8145199418067932, + "learning_rate": 0.0002, + "loss": 0.6833, + "step": 17260 + }, + { + "epoch": 2.792013580147118, + "grad_norm": 0.9159596562385559, + "learning_rate": 0.0002, + "loss": 0.6909, + "step": 17270 + }, + { + "epoch": 2.7936302643278634, + "grad_norm": 0.9590014219284058, + "learning_rate": 0.0002, + "loss": 0.6008, + "step": 17280 + }, + { + "epoch": 2.795246948508609, + "grad_norm": 0.7603529691696167, + "learning_rate": 0.0002, + "loss": 0.6704, + "step": 17290 + }, + { + "epoch": 2.7968636326893543, + "grad_norm": 0.8039976358413696, + "learning_rate": 0.0002, + "loss": 0.7165, + "step": 17300 + }, + { + "epoch": 2.7984803168700996, + "grad_norm": 0.8364847302436829, + "learning_rate": 0.0002, + "loss": 0.7037, + "step": 17310 + }, + { + "epoch": 2.800097001050845, + "grad_norm": 0.8763046860694885, + "learning_rate": 0.0002, + "loss": 0.6749, + "step": 17320 + }, + { + "epoch": 2.80171368523159, + "grad_norm": 0.8409647941589355, + "learning_rate": 0.0002, + "loss": 0.6844, + "step": 17330 + }, + { + "epoch": 2.8033303694123353, + "grad_norm": 0.7649006247520447, + "learning_rate": 0.0002, + "loss": 0.6936, + "step": 17340 + }, + { + "epoch": 2.8049470535930805, + "grad_norm": 0.7970262169837952, + "learning_rate": 0.0002, + "loss": 0.7051, + "step": 17350 + }, + { + "epoch": 2.8065637377738257, + "grad_norm": 0.9088607430458069, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 17360 + }, + { + "epoch": 2.808180421954571, + "grad_norm": 0.6454846858978271, + "learning_rate": 0.0002, + "loss": 0.675, + "step": 17370 + }, + { + "epoch": 2.809797106135316, + "grad_norm": 0.7744787931442261, + "learning_rate": 0.0002, + "loss": 0.7069, + "step": 17380 + }, + { + "epoch": 2.811413790316062, + "grad_norm": 0.6678640842437744, + "learning_rate": 0.0002, + "loss": 0.6772, + "step": 17390 + }, + { + "epoch": 2.813030474496807, + "grad_norm": 0.772676944732666, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 17400 + }, + { + "epoch": 2.8146471586775523, + "grad_norm": 0.7088175415992737, + "learning_rate": 0.0002, + "loss": 0.7252, + "step": 17410 + }, + { + "epoch": 2.8162638428582976, + "grad_norm": 0.8280573487281799, + "learning_rate": 0.0002, + "loss": 0.7086, + "step": 17420 + }, + { + "epoch": 2.817880527039043, + "grad_norm": 0.6665388345718384, + "learning_rate": 0.0002, + "loss": 0.6732, + "step": 17430 + }, + { + "epoch": 2.8194972112197885, + "grad_norm": 0.6427883505821228, + "learning_rate": 0.0002, + "loss": 0.6675, + "step": 17440 + }, + { + "epoch": 2.8211138954005337, + "grad_norm": 0.9697760343551636, + "learning_rate": 0.0002, + "loss": 0.6972, + "step": 17450 + }, + { + "epoch": 2.822730579581279, + "grad_norm": 0.7573966383934021, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 17460 + }, + { + "epoch": 2.824347263762024, + "grad_norm": 0.878688633441925, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 17470 + }, + { + "epoch": 2.8259639479427694, + "grad_norm": 0.7752242684364319, + "learning_rate": 0.0002, + "loss": 0.6666, + "step": 17480 + }, + { + "epoch": 2.8275806321235146, + "grad_norm": 0.6135398745536804, + "learning_rate": 0.0002, + "loss": 0.6638, + "step": 17490 + }, + { + "epoch": 2.82919731630426, + "grad_norm": 0.6924924850463867, + "learning_rate": 0.0002, + "loss": 0.6829, + "step": 17500 + }, + { + "epoch": 2.830814000485005, + "grad_norm": 0.7471627593040466, + "learning_rate": 0.0002, + "loss": 0.6731, + "step": 17510 + }, + { + "epoch": 2.8324306846657503, + "grad_norm": 0.7145499587059021, + "learning_rate": 0.0002, + "loss": 0.7016, + "step": 17520 + }, + { + "epoch": 2.834047368846496, + "grad_norm": 0.7415414452552795, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 17530 + }, + { + "epoch": 2.8356640530272412, + "grad_norm": 0.7328441739082336, + "learning_rate": 0.0002, + "loss": 0.6811, + "step": 17540 + }, + { + "epoch": 2.8372807372079865, + "grad_norm": 0.8267839550971985, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 17550 + }, + { + "epoch": 2.8388974213887317, + "grad_norm": 0.8877885341644287, + "learning_rate": 0.0002, + "loss": 0.6787, + "step": 17560 + }, + { + "epoch": 2.840514105569477, + "grad_norm": 0.857138454914093, + "learning_rate": 0.0002, + "loss": 0.7136, + "step": 17570 + }, + { + "epoch": 2.842130789750222, + "grad_norm": 0.8470779657363892, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 17580 + }, + { + "epoch": 2.843747473930968, + "grad_norm": 0.8553254008293152, + "learning_rate": 0.0002, + "loss": 0.6976, + "step": 17590 + }, + { + "epoch": 2.845364158111713, + "grad_norm": 0.8033196926116943, + "learning_rate": 0.0002, + "loss": 0.7297, + "step": 17600 + }, + { + "epoch": 2.8469808422924583, + "grad_norm": 0.7949087023735046, + "learning_rate": 0.0002, + "loss": 0.7062, + "step": 17610 + }, + { + "epoch": 2.8485975264732035, + "grad_norm": 0.9241406321525574, + "learning_rate": 0.0002, + "loss": 0.651, + "step": 17620 + }, + { + "epoch": 2.8502142106539488, + "grad_norm": 0.7721285223960876, + "learning_rate": 0.0002, + "loss": 0.6601, + "step": 17630 + }, + { + "epoch": 2.851830894834694, + "grad_norm": 1.0246692895889282, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 17640 + }, + { + "epoch": 2.853447579015439, + "grad_norm": 0.9244589805603027, + "learning_rate": 0.0002, + "loss": 0.7007, + "step": 17650 + }, + { + "epoch": 2.8550642631961844, + "grad_norm": 0.7243508696556091, + "learning_rate": 0.0002, + "loss": 0.7274, + "step": 17660 + }, + { + "epoch": 2.8566809473769297, + "grad_norm": 0.8943371176719666, + "learning_rate": 0.0002, + "loss": 0.6471, + "step": 17670 + }, + { + "epoch": 2.8582976315576754, + "grad_norm": 0.6531758904457092, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 17680 + }, + { + "epoch": 2.8599143157384206, + "grad_norm": 0.8367000818252563, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 17690 + }, + { + "epoch": 2.861530999919166, + "grad_norm": 0.7868556380271912, + "learning_rate": 0.0002, + "loss": 0.6943, + "step": 17700 + }, + { + "epoch": 2.863147684099911, + "grad_norm": 0.7213859558105469, + "learning_rate": 0.0002, + "loss": 0.6919, + "step": 17710 + }, + { + "epoch": 2.8647643682806563, + "grad_norm": 0.7383931279182434, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 17720 + }, + { + "epoch": 2.8663810524614015, + "grad_norm": 0.7566812634468079, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 17730 + }, + { + "epoch": 2.867997736642147, + "grad_norm": 0.6930373311042786, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 17740 + }, + { + "epoch": 2.8696144208228924, + "grad_norm": 0.7911090850830078, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 17750 + }, + { + "epoch": 2.8712311050036377, + "grad_norm": 0.8484548926353455, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 17760 + }, + { + "epoch": 2.872847789184383, + "grad_norm": 0.7647597193717957, + "learning_rate": 0.0002, + "loss": 0.6931, + "step": 17770 + }, + { + "epoch": 2.874464473365128, + "grad_norm": 0.8791151642799377, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 17780 + }, + { + "epoch": 2.8760811575458733, + "grad_norm": 0.7253178358078003, + "learning_rate": 0.0002, + "loss": 0.7078, + "step": 17790 + }, + { + "epoch": 2.8776978417266186, + "grad_norm": 0.7956077456474304, + "learning_rate": 0.0002, + "loss": 0.6474, + "step": 17800 + }, + { + "epoch": 2.879314525907364, + "grad_norm": 0.8657688498497009, + "learning_rate": 0.0002, + "loss": 0.6687, + "step": 17810 + }, + { + "epoch": 2.880931210088109, + "grad_norm": 0.7059141993522644, + "learning_rate": 0.0002, + "loss": 0.7171, + "step": 17820 + }, + { + "epoch": 2.8825478942688547, + "grad_norm": 0.8886896967887878, + "learning_rate": 0.0002, + "loss": 0.683, + "step": 17830 + }, + { + "epoch": 2.8841645784496, + "grad_norm": 0.821032702922821, + "learning_rate": 0.0002, + "loss": 0.669, + "step": 17840 + }, + { + "epoch": 2.885781262630345, + "grad_norm": 0.7183963656425476, + "learning_rate": 0.0002, + "loss": 0.6805, + "step": 17850 + }, + { + "epoch": 2.8873979468110904, + "grad_norm": 0.6222899556159973, + "learning_rate": 0.0002, + "loss": 0.7088, + "step": 17860 + }, + { + "epoch": 2.8890146309918356, + "grad_norm": 0.8187434077262878, + "learning_rate": 0.0002, + "loss": 0.6626, + "step": 17870 + }, + { + "epoch": 2.890631315172581, + "grad_norm": 0.9838479161262512, + "learning_rate": 0.0002, + "loss": 0.6815, + "step": 17880 + }, + { + "epoch": 2.8922479993533265, + "grad_norm": 0.7567742466926575, + "learning_rate": 0.0002, + "loss": 0.6967, + "step": 17890 + }, + { + "epoch": 2.893864683534072, + "grad_norm": 0.6875903606414795, + "learning_rate": 0.0002, + "loss": 0.7073, + "step": 17900 + }, + { + "epoch": 2.895481367714817, + "grad_norm": 0.8043789267539978, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 17910 + }, + { + "epoch": 2.8970980518955622, + "grad_norm": 0.8062626719474792, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 17920 + }, + { + "epoch": 2.8987147360763075, + "grad_norm": 1.0251191854476929, + "learning_rate": 0.0002, + "loss": 0.7151, + "step": 17930 + }, + { + "epoch": 2.9003314202570527, + "grad_norm": 0.882253110408783, + "learning_rate": 0.0002, + "loss": 0.6605, + "step": 17940 + }, + { + "epoch": 2.901948104437798, + "grad_norm": 0.8683299422264099, + "learning_rate": 0.0002, + "loss": 0.6719, + "step": 17950 + }, + { + "epoch": 2.903564788618543, + "grad_norm": 0.7167282104492188, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 17960 + }, + { + "epoch": 2.9051814727992884, + "grad_norm": 0.7093694806098938, + "learning_rate": 0.0002, + "loss": 0.663, + "step": 17970 + }, + { + "epoch": 2.906798156980034, + "grad_norm": 0.8549879193305969, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 17980 + }, + { + "epoch": 2.9084148411607793, + "grad_norm": 0.6989606618881226, + "learning_rate": 0.0002, + "loss": 0.6962, + "step": 17990 + }, + { + "epoch": 2.9100315253415245, + "grad_norm": 0.9482976794242859, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 18000 + }, + { + "epoch": 2.9116482095222698, + "grad_norm": 0.7182440161705017, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 18010 + }, + { + "epoch": 2.913264893703015, + "grad_norm": 0.7732226252555847, + "learning_rate": 0.0002, + "loss": 0.6827, + "step": 18020 + }, + { + "epoch": 2.9148815778837607, + "grad_norm": 0.7936875224113464, + "learning_rate": 0.0002, + "loss": 0.7123, + "step": 18030 + }, + { + "epoch": 2.916498262064506, + "grad_norm": 0.8825615644454956, + "learning_rate": 0.0002, + "loss": 0.6736, + "step": 18040 + }, + { + "epoch": 2.918114946245251, + "grad_norm": 0.6778587102890015, + "learning_rate": 0.0002, + "loss": 0.7139, + "step": 18050 + }, + { + "epoch": 2.9197316304259964, + "grad_norm": 0.7529265880584717, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 18060 + }, + { + "epoch": 2.9213483146067416, + "grad_norm": 0.7111883163452148, + "learning_rate": 0.0002, + "loss": 0.737, + "step": 18070 + }, + { + "epoch": 2.922964998787487, + "grad_norm": 0.7214767932891846, + "learning_rate": 0.0002, + "loss": 0.7475, + "step": 18080 + }, + { + "epoch": 2.924581682968232, + "grad_norm": 0.800417423248291, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 18090 + }, + { + "epoch": 2.9261983671489773, + "grad_norm": 1.248575210571289, + "learning_rate": 0.0002, + "loss": 0.6694, + "step": 18100 + }, + { + "epoch": 2.9278150513297225, + "grad_norm": 0.757788360118866, + "learning_rate": 0.0002, + "loss": 0.7004, + "step": 18110 + }, + { + "epoch": 2.9294317355104678, + "grad_norm": 1.0583995580673218, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 18120 + }, + { + "epoch": 2.9310484196912134, + "grad_norm": 0.8228777647018433, + "learning_rate": 0.0002, + "loss": 0.6365, + "step": 18130 + }, + { + "epoch": 2.9326651038719587, + "grad_norm": 0.8374035358428955, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 18140 + }, + { + "epoch": 2.934281788052704, + "grad_norm": 0.7976473569869995, + "learning_rate": 0.0002, + "loss": 0.6399, + "step": 18150 + }, + { + "epoch": 2.935898472233449, + "grad_norm": 0.8009907603263855, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 18160 + }, + { + "epoch": 2.9375151564141944, + "grad_norm": 0.835213303565979, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 18170 + }, + { + "epoch": 2.93913184059494, + "grad_norm": 0.7982219457626343, + "learning_rate": 0.0002, + "loss": 0.7376, + "step": 18180 + }, + { + "epoch": 2.9407485247756853, + "grad_norm": 0.7070978879928589, + "learning_rate": 0.0002, + "loss": 0.6348, + "step": 18190 + }, + { + "epoch": 2.9423652089564305, + "grad_norm": 0.8619440197944641, + "learning_rate": 0.0002, + "loss": 0.6608, + "step": 18200 + }, + { + "epoch": 2.9439818931371757, + "grad_norm": 0.6693987250328064, + "learning_rate": 0.0002, + "loss": 0.666, + "step": 18210 + }, + { + "epoch": 2.945598577317921, + "grad_norm": 0.6747021079063416, + "learning_rate": 0.0002, + "loss": 0.728, + "step": 18220 + }, + { + "epoch": 2.947215261498666, + "grad_norm": 0.860387921333313, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 18230 + }, + { + "epoch": 2.9488319456794114, + "grad_norm": 0.799976646900177, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 18240 + }, + { + "epoch": 2.9504486298601567, + "grad_norm": 0.7864769101142883, + "learning_rate": 0.0002, + "loss": 0.7243, + "step": 18250 + }, + { + "epoch": 2.952065314040902, + "grad_norm": 0.6713884472846985, + "learning_rate": 0.0002, + "loss": 0.6785, + "step": 18260 + }, + { + "epoch": 2.9536819982216476, + "grad_norm": 0.9031508564949036, + "learning_rate": 0.0002, + "loss": 0.7429, + "step": 18270 + }, + { + "epoch": 2.955298682402393, + "grad_norm": 0.7205073237419128, + "learning_rate": 0.0002, + "loss": 0.7055, + "step": 18280 + }, + { + "epoch": 2.956915366583138, + "grad_norm": 0.7746205925941467, + "learning_rate": 0.0002, + "loss": 0.7298, + "step": 18290 + }, + { + "epoch": 2.9585320507638833, + "grad_norm": 0.6533427834510803, + "learning_rate": 0.0002, + "loss": 0.6218, + "step": 18300 + }, + { + "epoch": 2.9601487349446285, + "grad_norm": 0.9083208441734314, + "learning_rate": 0.0002, + "loss": 0.6674, + "step": 18310 + }, + { + "epoch": 2.9617654191253737, + "grad_norm": 0.7446991801261902, + "learning_rate": 0.0002, + "loss": 0.7359, + "step": 18320 + }, + { + "epoch": 2.9633821033061194, + "grad_norm": 0.6514461636543274, + "learning_rate": 0.0002, + "loss": 0.6738, + "step": 18330 + }, + { + "epoch": 2.9649987874868646, + "grad_norm": 0.8580465912818909, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 18340 + }, + { + "epoch": 2.96661547166761, + "grad_norm": 0.7074266076087952, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 18350 + }, + { + "epoch": 2.968232155848355, + "grad_norm": 0.899892270565033, + "learning_rate": 0.0002, + "loss": 0.6804, + "step": 18360 + }, + { + "epoch": 2.9698488400291003, + "grad_norm": 0.8217641711235046, + "learning_rate": 0.0002, + "loss": 0.7094, + "step": 18370 + }, + { + "epoch": 2.9714655242098456, + "grad_norm": 0.8611799478530884, + "learning_rate": 0.0002, + "loss": 0.6916, + "step": 18380 + }, + { + "epoch": 2.973082208390591, + "grad_norm": 0.6909302473068237, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 18390 + }, + { + "epoch": 2.974698892571336, + "grad_norm": 0.6554358005523682, + "learning_rate": 0.0002, + "loss": 0.7247, + "step": 18400 + }, + { + "epoch": 2.9763155767520812, + "grad_norm": 0.7803071737289429, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 18410 + }, + { + "epoch": 2.977932260932827, + "grad_norm": 0.7838954925537109, + "learning_rate": 0.0002, + "loss": 0.7322, + "step": 18420 + }, + { + "epoch": 2.979548945113572, + "grad_norm": 0.7098495364189148, + "learning_rate": 0.0002, + "loss": 0.6522, + "step": 18430 + }, + { + "epoch": 2.9811656292943174, + "grad_norm": 0.8981785774230957, + "learning_rate": 0.0002, + "loss": 0.739, + "step": 18440 + }, + { + "epoch": 2.9827823134750626, + "grad_norm": 0.7197171449661255, + "learning_rate": 0.0002, + "loss": 0.6689, + "step": 18450 + }, + { + "epoch": 2.984398997655808, + "grad_norm": 0.793185293674469, + "learning_rate": 0.0002, + "loss": 0.706, + "step": 18460 + }, + { + "epoch": 2.986015681836553, + "grad_norm": 0.8531473875045776, + "learning_rate": 0.0002, + "loss": 0.7124, + "step": 18470 + }, + { + "epoch": 2.9876323660172988, + "grad_norm": 0.6627361178398132, + "learning_rate": 0.0002, + "loss": 0.6901, + "step": 18480 + }, + { + "epoch": 2.989249050198044, + "grad_norm": 0.5708155035972595, + "learning_rate": 0.0002, + "loss": 0.6591, + "step": 18490 + }, + { + "epoch": 2.990865734378789, + "grad_norm": 0.8227280378341675, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 18500 + }, + { + "epoch": 2.9924824185595345, + "grad_norm": 0.7102749943733215, + "learning_rate": 0.0002, + "loss": 0.6701, + "step": 18510 + }, + { + "epoch": 2.9940991027402797, + "grad_norm": 0.839485228061676, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 18520 + }, + { + "epoch": 2.995715786921025, + "grad_norm": 0.9038704037666321, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 18530 + }, + { + "epoch": 2.99733247110177, + "grad_norm": 0.8737510442733765, + "learning_rate": 0.0002, + "loss": 0.7186, + "step": 18540 + }, + { + "epoch": 2.9989491552825154, + "grad_norm": 0.7323142886161804, + "learning_rate": 0.0002, + "loss": 0.6819, + "step": 18550 + }, + { + "epoch": 2.9999191657909625, + "eval_loss": 1.1262480020523071, + "eval_runtime": 122.0868, + "eval_samples_per_second": 6.004, + "eval_steps_per_second": 0.754, + "step": 18556 + }, + { + "epoch": 3.000565839463261, + "grad_norm": 0.8465463519096375, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 18560 + }, + { + "epoch": 3.0021825236440063, + "grad_norm": 0.9134138822555542, + "learning_rate": 0.0002, + "loss": 0.6064, + "step": 18570 + }, + { + "epoch": 3.0037992078247515, + "grad_norm": 0.760715126991272, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 18580 + }, + { + "epoch": 3.0054158920054967, + "grad_norm": 0.9208743572235107, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 18590 + }, + { + "epoch": 3.007032576186242, + "grad_norm": 0.9232364892959595, + "learning_rate": 0.0002, + "loss": 0.5731, + "step": 18600 + }, + { + "epoch": 3.008649260366987, + "grad_norm": 1.1881544589996338, + "learning_rate": 0.0002, + "loss": 0.6299, + "step": 18610 + }, + { + "epoch": 3.0102659445477324, + "grad_norm": 0.9372987747192383, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 18620 + }, + { + "epoch": 3.0118826287284777, + "grad_norm": 0.6900241374969482, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 18630 + }, + { + "epoch": 3.0134993129092233, + "grad_norm": 0.8451071381568909, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 18640 + }, + { + "epoch": 3.0151159970899686, + "grad_norm": 0.7763112187385559, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 18650 + }, + { + "epoch": 3.016732681270714, + "grad_norm": 1.043653964996338, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 18660 + }, + { + "epoch": 3.018349365451459, + "grad_norm": 1.0170660018920898, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 18670 + }, + { + "epoch": 3.0199660496322043, + "grad_norm": 0.7534180283546448, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 18680 + }, + { + "epoch": 3.0215827338129495, + "grad_norm": 0.7507367730140686, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 18690 + }, + { + "epoch": 3.0231994179936947, + "grad_norm": 0.7861620187759399, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 18700 + }, + { + "epoch": 3.0248161021744404, + "grad_norm": 1.0580339431762695, + "learning_rate": 0.0002, + "loss": 0.5802, + "step": 18710 + }, + { + "epoch": 3.0264327863551856, + "grad_norm": 0.7542710900306702, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 18720 + }, + { + "epoch": 3.028049470535931, + "grad_norm": 0.8189544677734375, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 18730 + }, + { + "epoch": 3.029666154716676, + "grad_norm": 0.9126611351966858, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 18740 + }, + { + "epoch": 3.0312828388974213, + "grad_norm": 0.8891341686248779, + "learning_rate": 0.0002, + "loss": 0.6443, + "step": 18750 + }, + { + "epoch": 3.0328995230781666, + "grad_norm": 0.8419283032417297, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 18760 + }, + { + "epoch": 3.034516207258912, + "grad_norm": 0.8048048615455627, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 18770 + }, + { + "epoch": 3.0361328914396575, + "grad_norm": 0.7820217609405518, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 18780 + }, + { + "epoch": 3.0377495756204027, + "grad_norm": 0.854721188545227, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 18790 + }, + { + "epoch": 3.039366259801148, + "grad_norm": 0.912092924118042, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 18800 + }, + { + "epoch": 3.040982943981893, + "grad_norm": 0.6596226096153259, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 18810 + }, + { + "epoch": 3.0425996281626384, + "grad_norm": 0.6351348757743835, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 18820 + }, + { + "epoch": 3.0442163123433836, + "grad_norm": 0.778188943862915, + "learning_rate": 0.0002, + "loss": 0.5953, + "step": 18830 + }, + { + "epoch": 3.045832996524129, + "grad_norm": 0.68234783411026, + "learning_rate": 0.0002, + "loss": 0.602, + "step": 18840 + }, + { + "epoch": 3.047449680704874, + "grad_norm": 0.998628556728363, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 18850 + }, + { + "epoch": 3.0490663648856198, + "grad_norm": 0.7393841743469238, + "learning_rate": 0.0002, + "loss": 0.6231, + "step": 18860 + }, + { + "epoch": 3.050683049066365, + "grad_norm": 0.84438556432724, + "learning_rate": 0.0002, + "loss": 0.568, + "step": 18870 + }, + { + "epoch": 3.0522997332471102, + "grad_norm": 0.8857501745223999, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 18880 + }, + { + "epoch": 3.0539164174278555, + "grad_norm": 0.7208474278450012, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 18890 + }, + { + "epoch": 3.0555331016086007, + "grad_norm": 0.7135229110717773, + "learning_rate": 0.0002, + "loss": 0.5998, + "step": 18900 + }, + { + "epoch": 3.057149785789346, + "grad_norm": 0.9130001664161682, + "learning_rate": 0.0002, + "loss": 0.5575, + "step": 18910 + }, + { + "epoch": 3.058766469970091, + "grad_norm": 0.9001716375350952, + "learning_rate": 0.0002, + "loss": 0.5955, + "step": 18920 + }, + { + "epoch": 3.060383154150837, + "grad_norm": 0.8667559623718262, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 18930 + }, + { + "epoch": 3.061999838331582, + "grad_norm": 0.8943959474563599, + "learning_rate": 0.0002, + "loss": 0.5818, + "step": 18940 + }, + { + "epoch": 3.0636165225123273, + "grad_norm": 0.8298377990722656, + "learning_rate": 0.0002, + "loss": 0.5978, + "step": 18950 + }, + { + "epoch": 3.0652332066930725, + "grad_norm": 0.7935267686843872, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 18960 + }, + { + "epoch": 3.0668498908738178, + "grad_norm": 1.1506379842758179, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 18970 + }, + { + "epoch": 3.068466575054563, + "grad_norm": 0.7693049907684326, + "learning_rate": 0.0002, + "loss": 0.5571, + "step": 18980 + }, + { + "epoch": 3.0700832592353082, + "grad_norm": 0.8040135502815247, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 18990 + }, + { + "epoch": 3.0716999434160535, + "grad_norm": 0.828404426574707, + "learning_rate": 0.0002, + "loss": 0.5541, + "step": 19000 + }, + { + "epoch": 3.073316627596799, + "grad_norm": 0.8811164498329163, + "learning_rate": 0.0002, + "loss": 0.6048, + "step": 19010 + }, + { + "epoch": 3.0749333117775444, + "grad_norm": 1.036205768585205, + "learning_rate": 0.0002, + "loss": 0.5845, + "step": 19020 + }, + { + "epoch": 3.0765499959582896, + "grad_norm": 0.8857285976409912, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 19030 + }, + { + "epoch": 3.078166680139035, + "grad_norm": 0.8392079472541809, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 19040 + }, + { + "epoch": 3.07978336431978, + "grad_norm": 1.0287401676177979, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 19050 + }, + { + "epoch": 3.0814000485005253, + "grad_norm": 1.0086315870285034, + "learning_rate": 0.0002, + "loss": 0.5964, + "step": 19060 + }, + { + "epoch": 3.0830167326812705, + "grad_norm": 0.9245324730873108, + "learning_rate": 0.0002, + "loss": 0.5567, + "step": 19070 + }, + { + "epoch": 3.084633416862016, + "grad_norm": 0.8680877089500427, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 19080 + }, + { + "epoch": 3.0862501010427614, + "grad_norm": 0.8814793825149536, + "learning_rate": 0.0002, + "loss": 0.5611, + "step": 19090 + }, + { + "epoch": 3.0878667852235067, + "grad_norm": 0.9234458208084106, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 19100 + }, + { + "epoch": 3.089483469404252, + "grad_norm": 1.1291664838790894, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 19110 + }, + { + "epoch": 3.091100153584997, + "grad_norm": 0.9191402792930603, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 19120 + }, + { + "epoch": 3.0927168377657424, + "grad_norm": 0.7103154063224792, + "learning_rate": 0.0002, + "loss": 0.5856, + "step": 19130 + }, + { + "epoch": 3.0943335219464876, + "grad_norm": 0.9368883967399597, + "learning_rate": 0.0002, + "loss": 0.6479, + "step": 19140 + }, + { + "epoch": 3.095950206127233, + "grad_norm": 0.9676656723022461, + "learning_rate": 0.0002, + "loss": 0.6167, + "step": 19150 + }, + { + "epoch": 3.0975668903079785, + "grad_norm": 0.8739792704582214, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 19160 + }, + { + "epoch": 3.0991835744887237, + "grad_norm": 0.8530174493789673, + "learning_rate": 0.0002, + "loss": 0.6112, + "step": 19170 + }, + { + "epoch": 3.100800258669469, + "grad_norm": 0.794945478439331, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 19180 + }, + { + "epoch": 3.102416942850214, + "grad_norm": 0.9508888125419617, + "learning_rate": 0.0002, + "loss": 0.5928, + "step": 19190 + }, + { + "epoch": 3.1040336270309594, + "grad_norm": 1.0599955320358276, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 19200 + }, + { + "epoch": 3.1056503112117047, + "grad_norm": 1.0673625469207764, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 19210 + }, + { + "epoch": 3.10726699539245, + "grad_norm": 0.7739115953445435, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 19220 + }, + { + "epoch": 3.1088836795731956, + "grad_norm": 0.9884951114654541, + "learning_rate": 0.0002, + "loss": 0.6046, + "step": 19230 + }, + { + "epoch": 3.110500363753941, + "grad_norm": 0.862260103225708, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 19240 + }, + { + "epoch": 3.112117047934686, + "grad_norm": 0.7690284848213196, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 19250 + }, + { + "epoch": 3.1137337321154313, + "grad_norm": 0.8758958578109741, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 19260 + }, + { + "epoch": 3.1153504162961765, + "grad_norm": 1.0356395244598389, + "learning_rate": 0.0002, + "loss": 0.6136, + "step": 19270 + }, + { + "epoch": 3.1169671004769217, + "grad_norm": 0.6950937509536743, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 19280 + }, + { + "epoch": 3.118583784657667, + "grad_norm": 0.760998010635376, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 19290 + }, + { + "epoch": 3.1202004688384126, + "grad_norm": 0.9335789084434509, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 19300 + }, + { + "epoch": 3.121817153019158, + "grad_norm": 0.9636204242706299, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 19310 + }, + { + "epoch": 3.123433837199903, + "grad_norm": 1.0820997953414917, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 19320 + }, + { + "epoch": 3.1250505213806483, + "grad_norm": 0.7333487272262573, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 19330 + }, + { + "epoch": 3.1266672055613935, + "grad_norm": 1.0417509078979492, + "learning_rate": 0.0002, + "loss": 0.6178, + "step": 19340 + }, + { + "epoch": 3.128283889742139, + "grad_norm": 0.9267749190330505, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 19350 + }, + { + "epoch": 3.129900573922884, + "grad_norm": 0.777798593044281, + "learning_rate": 0.0002, + "loss": 0.6063, + "step": 19360 + }, + { + "epoch": 3.1315172581036297, + "grad_norm": 0.8425456881523132, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 19370 + }, + { + "epoch": 3.133133942284375, + "grad_norm": 0.9617102146148682, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 19380 + }, + { + "epoch": 3.13475062646512, + "grad_norm": 1.0052828788757324, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 19390 + }, + { + "epoch": 3.1363673106458654, + "grad_norm": 0.7637009024620056, + "learning_rate": 0.0002, + "loss": 0.5713, + "step": 19400 + }, + { + "epoch": 3.1379839948266106, + "grad_norm": 0.7958088517189026, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 19410 + }, + { + "epoch": 3.139600679007356, + "grad_norm": 0.9161727428436279, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 19420 + }, + { + "epoch": 3.141217363188101, + "grad_norm": 0.8402149677276611, + "learning_rate": 0.0002, + "loss": 0.5638, + "step": 19430 + }, + { + "epoch": 3.1428340473688463, + "grad_norm": 1.0056525468826294, + "learning_rate": 0.0002, + "loss": 0.5848, + "step": 19440 + }, + { + "epoch": 3.144450731549592, + "grad_norm": 1.0129190683364868, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 19450 + }, + { + "epoch": 3.146067415730337, + "grad_norm": 0.790825366973877, + "learning_rate": 0.0002, + "loss": 0.5808, + "step": 19460 + }, + { + "epoch": 3.1476840999110824, + "grad_norm": 1.441665530204773, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 19470 + }, + { + "epoch": 3.1493007840918277, + "grad_norm": 0.7846331596374512, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 19480 + }, + { + "epoch": 3.150917468272573, + "grad_norm": 0.7915332913398743, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 19490 + }, + { + "epoch": 3.152534152453318, + "grad_norm": 0.933982253074646, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 19500 + }, + { + "epoch": 3.1541508366340634, + "grad_norm": 1.038408637046814, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 19510 + }, + { + "epoch": 3.155767520814809, + "grad_norm": 1.018935203552246, + "learning_rate": 0.0002, + "loss": 0.6271, + "step": 19520 + }, + { + "epoch": 3.1573842049955543, + "grad_norm": 0.9618112444877625, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 19530 + }, + { + "epoch": 3.1590008891762995, + "grad_norm": 0.8900452852249146, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 19540 + }, + { + "epoch": 3.1606175733570447, + "grad_norm": 0.8254160284996033, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 19550 + }, + { + "epoch": 3.16223425753779, + "grad_norm": 1.004376769065857, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 19560 + }, + { + "epoch": 3.163850941718535, + "grad_norm": 1.0490446090698242, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 19570 + }, + { + "epoch": 3.1654676258992804, + "grad_norm": 0.7387403845787048, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 19580 + }, + { + "epoch": 3.1670843100800257, + "grad_norm": 0.7611538171768188, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 19590 + }, + { + "epoch": 3.1687009942607713, + "grad_norm": 0.8239886164665222, + "learning_rate": 0.0002, + "loss": 0.5937, + "step": 19600 + }, + { + "epoch": 3.1703176784415166, + "grad_norm": 0.9327243566513062, + "learning_rate": 0.0002, + "loss": 0.6068, + "step": 19610 + }, + { + "epoch": 3.171934362622262, + "grad_norm": 0.9662560224533081, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 19620 + }, + { + "epoch": 3.173551046803007, + "grad_norm": 0.9183341860771179, + "learning_rate": 0.0002, + "loss": 0.5988, + "step": 19630 + }, + { + "epoch": 3.1751677309837523, + "grad_norm": 0.875066876411438, + "learning_rate": 0.0002, + "loss": 0.5909, + "step": 19640 + }, + { + "epoch": 3.1767844151644975, + "grad_norm": 0.8567508459091187, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 19650 + }, + { + "epoch": 3.1784010993452427, + "grad_norm": 0.6805780529975891, + "learning_rate": 0.0002, + "loss": 0.5805, + "step": 19660 + }, + { + "epoch": 3.1800177835259884, + "grad_norm": 0.8776944279670715, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 19670 + }, + { + "epoch": 3.1816344677067336, + "grad_norm": 0.9036329984664917, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 19680 + }, + { + "epoch": 3.183251151887479, + "grad_norm": 0.8527372479438782, + "learning_rate": 0.0002, + "loss": 0.6238, + "step": 19690 + }, + { + "epoch": 3.184867836068224, + "grad_norm": 1.1045585870742798, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 19700 + }, + { + "epoch": 3.1864845202489693, + "grad_norm": 0.9213830828666687, + "learning_rate": 0.0002, + "loss": 0.5491, + "step": 19710 + }, + { + "epoch": 3.1881012044297146, + "grad_norm": 0.8865814805030823, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 19720 + }, + { + "epoch": 3.18971788861046, + "grad_norm": 0.7939388751983643, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 19730 + }, + { + "epoch": 3.191334572791205, + "grad_norm": 0.6966729760169983, + "learning_rate": 0.0002, + "loss": 0.5682, + "step": 19740 + }, + { + "epoch": 3.1929512569719507, + "grad_norm": 0.8023673295974731, + "learning_rate": 0.0002, + "loss": 0.5839, + "step": 19750 + }, + { + "epoch": 3.194567941152696, + "grad_norm": 0.7992037534713745, + "learning_rate": 0.0002, + "loss": 0.6267, + "step": 19760 + }, + { + "epoch": 3.196184625333441, + "grad_norm": 0.7412247657775879, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 19770 + }, + { + "epoch": 3.1978013095141864, + "grad_norm": 0.9598729014396667, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 19780 + }, + { + "epoch": 3.1994179936949316, + "grad_norm": 0.8331366777420044, + "learning_rate": 0.0002, + "loss": 0.5685, + "step": 19790 + }, + { + "epoch": 3.201034677875677, + "grad_norm": 0.8939169645309448, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 19800 + }, + { + "epoch": 3.202651362056422, + "grad_norm": 0.9219734072685242, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 19810 + }, + { + "epoch": 3.2042680462371678, + "grad_norm": 0.869490385055542, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 19820 + }, + { + "epoch": 3.205884730417913, + "grad_norm": 0.8989706635475159, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 19830 + }, + { + "epoch": 3.2075014145986582, + "grad_norm": 0.8477165102958679, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 19840 + }, + { + "epoch": 3.2091180987794035, + "grad_norm": 0.8720678687095642, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 19850 + }, + { + "epoch": 3.2107347829601487, + "grad_norm": 0.861406683921814, + "learning_rate": 0.0002, + "loss": 0.5694, + "step": 19860 + }, + { + "epoch": 3.212351467140894, + "grad_norm": 0.8228686451911926, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 19870 + }, + { + "epoch": 3.213968151321639, + "grad_norm": 0.7936596870422363, + "learning_rate": 0.0002, + "loss": 0.625, + "step": 19880 + }, + { + "epoch": 3.2155848355023844, + "grad_norm": 1.097377896308899, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 19890 + }, + { + "epoch": 3.21720151968313, + "grad_norm": 0.9544782638549805, + "learning_rate": 0.0002, + "loss": 0.6725, + "step": 19900 + }, + { + "epoch": 3.2188182038638753, + "grad_norm": 0.8240751624107361, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 19910 + }, + { + "epoch": 3.2204348880446205, + "grad_norm": 0.8332096338272095, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 19920 + }, + { + "epoch": 3.2220515722253658, + "grad_norm": 1.0954567193984985, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 19930 + }, + { + "epoch": 3.223668256406111, + "grad_norm": 0.7790525555610657, + "learning_rate": 0.0002, + "loss": 0.652, + "step": 19940 + }, + { + "epoch": 3.225284940586856, + "grad_norm": 0.7966814041137695, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 19950 + }, + { + "epoch": 3.2269016247676015, + "grad_norm": 0.9751881957054138, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 19960 + }, + { + "epoch": 3.228518308948347, + "grad_norm": 0.9856047630310059, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 19970 + }, + { + "epoch": 3.2301349931290924, + "grad_norm": 1.3062353134155273, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 19980 + }, + { + "epoch": 3.2317516773098376, + "grad_norm": 0.9510692358016968, + "learning_rate": 0.0002, + "loss": 0.6588, + "step": 19990 + }, + { + "epoch": 3.233368361490583, + "grad_norm": 0.8630342483520508, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 20000 + }, + { + "epoch": 3.234985045671328, + "grad_norm": 0.8966519236564636, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 20010 + }, + { + "epoch": 3.2366017298520733, + "grad_norm": 0.7093510627746582, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 20020 + }, + { + "epoch": 3.2382184140328185, + "grad_norm": 0.7771096229553223, + "learning_rate": 0.0002, + "loss": 0.585, + "step": 20030 + }, + { + "epoch": 3.2398350982135637, + "grad_norm": 0.841058075428009, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 20040 + }, + { + "epoch": 3.2414517823943094, + "grad_norm": 0.909712553024292, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 20050 + }, + { + "epoch": 3.2430684665750547, + "grad_norm": 0.8321019411087036, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 20060 + }, + { + "epoch": 3.2446851507558, + "grad_norm": 0.779901921749115, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 20070 + }, + { + "epoch": 3.246301834936545, + "grad_norm": 0.6249170303344727, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 20080 + }, + { + "epoch": 3.2479185191172903, + "grad_norm": 0.8000940680503845, + "learning_rate": 0.0002, + "loss": 0.603, + "step": 20090 + }, + { + "epoch": 3.2495352032980356, + "grad_norm": 0.7627735137939453, + "learning_rate": 0.0002, + "loss": 0.6273, + "step": 20100 + }, + { + "epoch": 3.2511518874787813, + "grad_norm": 0.8780747056007385, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 20110 + }, + { + "epoch": 3.2527685716595265, + "grad_norm": 0.772037148475647, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 20120 + }, + { + "epoch": 3.2543852558402717, + "grad_norm": 1.0086580514907837, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 20130 + }, + { + "epoch": 3.256001940021017, + "grad_norm": 0.9360289573669434, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 20140 + }, + { + "epoch": 3.257618624201762, + "grad_norm": 1.2099586725234985, + "learning_rate": 0.0002, + "loss": 0.5777, + "step": 20150 + }, + { + "epoch": 3.2592353083825074, + "grad_norm": 0.8368481397628784, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 20160 + }, + { + "epoch": 3.2608519925632526, + "grad_norm": 0.7391039133071899, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 20170 + }, + { + "epoch": 3.262468676743998, + "grad_norm": 0.9122273325920105, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 20180 + }, + { + "epoch": 3.264085360924743, + "grad_norm": 0.8502281904220581, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 20190 + }, + { + "epoch": 3.265702045105489, + "grad_norm": 1.0926852226257324, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 20200 + }, + { + "epoch": 3.267318729286234, + "grad_norm": 0.7902828454971313, + "learning_rate": 0.0002, + "loss": 0.6027, + "step": 20210 + }, + { + "epoch": 3.2689354134669792, + "grad_norm": 0.8724729418754578, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 20220 + }, + { + "epoch": 3.2705520976477245, + "grad_norm": 0.8469277024269104, + "learning_rate": 0.0002, + "loss": 0.6242, + "step": 20230 + }, + { + "epoch": 3.2721687818284697, + "grad_norm": 0.8865092992782593, + "learning_rate": 0.0002, + "loss": 0.644, + "step": 20240 + }, + { + "epoch": 3.273785466009215, + "grad_norm": 1.0979334115982056, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 20250 + }, + { + "epoch": 3.2754021501899606, + "grad_norm": 1.0860793590545654, + "learning_rate": 0.0002, + "loss": 0.647, + "step": 20260 + }, + { + "epoch": 3.277018834370706, + "grad_norm": 0.981745183467865, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 20270 + }, + { + "epoch": 3.278635518551451, + "grad_norm": 0.9155020713806152, + "learning_rate": 0.0002, + "loss": 0.627, + "step": 20280 + }, + { + "epoch": 3.2802522027321963, + "grad_norm": 0.8436718583106995, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 20290 + }, + { + "epoch": 3.2818688869129415, + "grad_norm": 1.0329409837722778, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 20300 + }, + { + "epoch": 3.2834855710936868, + "grad_norm": 0.9876394271850586, + "learning_rate": 0.0002, + "loss": 0.6, + "step": 20310 + }, + { + "epoch": 3.285102255274432, + "grad_norm": 0.8052917718887329, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 20320 + }, + { + "epoch": 3.2867189394551772, + "grad_norm": 0.8390680551528931, + "learning_rate": 0.0002, + "loss": 0.5949, + "step": 20330 + }, + { + "epoch": 3.288335623635923, + "grad_norm": 0.9515735507011414, + "learning_rate": 0.0002, + "loss": 0.6492, + "step": 20340 + }, + { + "epoch": 3.289952307816668, + "grad_norm": 0.8028870224952698, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 20350 + }, + { + "epoch": 3.2915689919974134, + "grad_norm": 0.862592339515686, + "learning_rate": 0.0002, + "loss": 0.634, + "step": 20360 + }, + { + "epoch": 3.2931856761781586, + "grad_norm": 0.7451621890068054, + "learning_rate": 0.0002, + "loss": 0.6345, + "step": 20370 + }, + { + "epoch": 3.294802360358904, + "grad_norm": 0.8966776728630066, + "learning_rate": 0.0002, + "loss": 0.6458, + "step": 20380 + }, + { + "epoch": 3.296419044539649, + "grad_norm": 0.9289216995239258, + "learning_rate": 0.0002, + "loss": 0.5967, + "step": 20390 + }, + { + "epoch": 3.2980357287203943, + "grad_norm": 0.9649626612663269, + "learning_rate": 0.0002, + "loss": 0.6599, + "step": 20400 + }, + { + "epoch": 3.29965241290114, + "grad_norm": 1.1953798532485962, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 20410 + }, + { + "epoch": 3.301269097081885, + "grad_norm": 0.8929083943367004, + "learning_rate": 0.0002, + "loss": 0.5997, + "step": 20420 + }, + { + "epoch": 3.3028857812626304, + "grad_norm": 0.8922014236450195, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 20430 + }, + { + "epoch": 3.3045024654433757, + "grad_norm": 0.9754860401153564, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 20440 + }, + { + "epoch": 3.306119149624121, + "grad_norm": 0.8873140215873718, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 20450 + }, + { + "epoch": 3.307735833804866, + "grad_norm": 0.857271671295166, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 20460 + }, + { + "epoch": 3.3093525179856114, + "grad_norm": 0.9022141098976135, + "learning_rate": 0.0002, + "loss": 0.633, + "step": 20470 + }, + { + "epoch": 3.3109692021663566, + "grad_norm": 0.8614798188209534, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 20480 + }, + { + "epoch": 3.3125858863471023, + "grad_norm": 0.8838164210319519, + "learning_rate": 0.0002, + "loss": 0.6254, + "step": 20490 + }, + { + "epoch": 3.3142025705278475, + "grad_norm": 0.8709736466407776, + "learning_rate": 0.0002, + "loss": 0.5849, + "step": 20500 + }, + { + "epoch": 3.3158192547085927, + "grad_norm": 0.9533300995826721, + "learning_rate": 0.0002, + "loss": 0.6146, + "step": 20510 + }, + { + "epoch": 3.317435938889338, + "grad_norm": 0.8259269595146179, + "learning_rate": 0.0002, + "loss": 0.6029, + "step": 20520 + }, + { + "epoch": 3.319052623070083, + "grad_norm": 0.8607608079910278, + "learning_rate": 0.0002, + "loss": 0.6268, + "step": 20530 + }, + { + "epoch": 3.3206693072508284, + "grad_norm": 1.0863020420074463, + "learning_rate": 0.0002, + "loss": 0.5676, + "step": 20540 + }, + { + "epoch": 3.3222859914315737, + "grad_norm": 1.011489987373352, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 20550 + }, + { + "epoch": 3.3239026756123193, + "grad_norm": 0.6952177882194519, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 20560 + }, + { + "epoch": 3.3255193597930646, + "grad_norm": 0.9638974070549011, + "learning_rate": 0.0002, + "loss": 0.6229, + "step": 20570 + }, + { + "epoch": 3.32713604397381, + "grad_norm": 1.0310138463974, + "learning_rate": 0.0002, + "loss": 0.5882, + "step": 20580 + }, + { + "epoch": 3.328752728154555, + "grad_norm": 0.9371318221092224, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 20590 + }, + { + "epoch": 3.3303694123353003, + "grad_norm": 0.8756691813468933, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 20600 + }, + { + "epoch": 3.3319860965160455, + "grad_norm": 1.054175853729248, + "learning_rate": 0.0002, + "loss": 0.5994, + "step": 20610 + }, + { + "epoch": 3.3336027806967907, + "grad_norm": 0.9074128270149231, + "learning_rate": 0.0002, + "loss": 0.6169, + "step": 20620 + }, + { + "epoch": 3.335219464877536, + "grad_norm": 0.906900942325592, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 20630 + }, + { + "epoch": 3.3368361490582816, + "grad_norm": 0.8689333200454712, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 20640 + }, + { + "epoch": 3.338452833239027, + "grad_norm": 0.9889747500419617, + "learning_rate": 0.0002, + "loss": 0.6079, + "step": 20650 + }, + { + "epoch": 3.340069517419772, + "grad_norm": 1.0685805082321167, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 20660 + }, + { + "epoch": 3.3416862016005173, + "grad_norm": 0.7495010495185852, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 20670 + }, + { + "epoch": 3.3433028857812626, + "grad_norm": 0.8747848272323608, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 20680 + }, + { + "epoch": 3.344919569962008, + "grad_norm": 0.9762673377990723, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 20690 + }, + { + "epoch": 3.346536254142753, + "grad_norm": 1.0284489393234253, + "learning_rate": 0.0002, + "loss": 0.6784, + "step": 20700 + }, + { + "epoch": 3.3481529383234987, + "grad_norm": 0.7293812036514282, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 20710 + }, + { + "epoch": 3.349769622504244, + "grad_norm": 0.8330199122428894, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 20720 + }, + { + "epoch": 3.351386306684989, + "grad_norm": 0.9808499217033386, + "learning_rate": 0.0002, + "loss": 0.5729, + "step": 20730 + }, + { + "epoch": 3.3530029908657344, + "grad_norm": 0.9508825540542603, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 20740 + }, + { + "epoch": 3.3546196750464796, + "grad_norm": 0.790483832359314, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 20750 + }, + { + "epoch": 3.356236359227225, + "grad_norm": 1.022793173789978, + "learning_rate": 0.0002, + "loss": 0.6327, + "step": 20760 + }, + { + "epoch": 3.35785304340797, + "grad_norm": 0.8318950533866882, + "learning_rate": 0.0002, + "loss": 0.6439, + "step": 20770 + }, + { + "epoch": 3.3594697275887153, + "grad_norm": 0.7980858087539673, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 20780 + }, + { + "epoch": 3.361086411769461, + "grad_norm": 0.8114802241325378, + "learning_rate": 0.0002, + "loss": 0.6746, + "step": 20790 + }, + { + "epoch": 3.3627030959502062, + "grad_norm": 0.8522519469261169, + "learning_rate": 0.0002, + "loss": 0.6017, + "step": 20800 + }, + { + "epoch": 3.3643197801309515, + "grad_norm": 0.9142431616783142, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 20810 + }, + { + "epoch": 3.3659364643116967, + "grad_norm": 0.771170437335968, + "learning_rate": 0.0002, + "loss": 0.6331, + "step": 20820 + }, + { + "epoch": 3.367553148492442, + "grad_norm": 1.0628231763839722, + "learning_rate": 0.0002, + "loss": 0.5879, + "step": 20830 + }, + { + "epoch": 3.369169832673187, + "grad_norm": 0.9384352564811707, + "learning_rate": 0.0002, + "loss": 0.6533, + "step": 20840 + }, + { + "epoch": 3.370786516853933, + "grad_norm": 1.1286591291427612, + "learning_rate": 0.0002, + "loss": 0.6292, + "step": 20850 + }, + { + "epoch": 3.372403201034678, + "grad_norm": 1.1349513530731201, + "learning_rate": 0.0002, + "loss": 0.5986, + "step": 20860 + }, + { + "epoch": 3.3740198852154233, + "grad_norm": 1.0127464532852173, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 20870 + }, + { + "epoch": 3.3756365693961685, + "grad_norm": 0.9111971855163574, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 20880 + }, + { + "epoch": 3.3772532535769137, + "grad_norm": 0.871356725692749, + "learning_rate": 0.0002, + "loss": 0.6101, + "step": 20890 + }, + { + "epoch": 3.378869937757659, + "grad_norm": 0.7774117588996887, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 20900 + }, + { + "epoch": 3.380486621938404, + "grad_norm": 1.0089964866638184, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 20910 + }, + { + "epoch": 3.3821033061191494, + "grad_norm": 0.7855867147445679, + "learning_rate": 0.0002, + "loss": 0.5908, + "step": 20920 + }, + { + "epoch": 3.3837199902998947, + "grad_norm": 1.3713710308074951, + "learning_rate": 0.0002, + "loss": 0.6373, + "step": 20930 + }, + { + "epoch": 3.3853366744806404, + "grad_norm": 0.8599116206169128, + "learning_rate": 0.0002, + "loss": 0.6627, + "step": 20940 + }, + { + "epoch": 3.3869533586613856, + "grad_norm": 0.9392673373222351, + "learning_rate": 0.0002, + "loss": 0.6224, + "step": 20950 + }, + { + "epoch": 3.388570042842131, + "grad_norm": 0.8764075040817261, + "learning_rate": 0.0002, + "loss": 0.5855, + "step": 20960 + }, + { + "epoch": 3.390186727022876, + "grad_norm": 0.8240136504173279, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 20970 + }, + { + "epoch": 3.3918034112036213, + "grad_norm": 1.0982369184494019, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 20980 + }, + { + "epoch": 3.3934200953843665, + "grad_norm": 1.0599013566970825, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 20990 + }, + { + "epoch": 3.395036779565112, + "grad_norm": 0.895438015460968, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 21000 + }, + { + "epoch": 3.3966534637458574, + "grad_norm": 0.6974841356277466, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 21010 + }, + { + "epoch": 3.3982701479266026, + "grad_norm": 0.9571719765663147, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 21020 + }, + { + "epoch": 3.399886832107348, + "grad_norm": 0.831912636756897, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 21030 + }, + { + "epoch": 3.401503516288093, + "grad_norm": 0.831936240196228, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 21040 + }, + { + "epoch": 3.4031202004688383, + "grad_norm": 0.7388373613357544, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 21050 + }, + { + "epoch": 3.4047368846495836, + "grad_norm": 0.938667356967926, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 21060 + }, + { + "epoch": 3.406353568830329, + "grad_norm": 0.9202313423156738, + "learning_rate": 0.0002, + "loss": 0.6121, + "step": 21070 + }, + { + "epoch": 3.4079702530110745, + "grad_norm": 0.9888381958007812, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 21080 + }, + { + "epoch": 3.4095869371918197, + "grad_norm": 0.8526970744132996, + "learning_rate": 0.0002, + "loss": 0.6245, + "step": 21090 + }, + { + "epoch": 3.411203621372565, + "grad_norm": 0.7939383387565613, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 21100 + }, + { + "epoch": 3.41282030555331, + "grad_norm": 0.9986352920532227, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 21110 + }, + { + "epoch": 3.4144369897340554, + "grad_norm": 0.8895300030708313, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 21120 + }, + { + "epoch": 3.4160536739148006, + "grad_norm": 0.9559482932090759, + "learning_rate": 0.0002, + "loss": 0.6264, + "step": 21130 + }, + { + "epoch": 3.417670358095546, + "grad_norm": 0.8351506590843201, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 21140 + }, + { + "epoch": 3.4192870422762915, + "grad_norm": 0.8224456906318665, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 21150 + }, + { + "epoch": 3.4209037264570368, + "grad_norm": 1.0110299587249756, + "learning_rate": 0.0002, + "loss": 0.5871, + "step": 21160 + }, + { + "epoch": 3.422520410637782, + "grad_norm": 0.82564777135849, + "learning_rate": 0.0002, + "loss": 0.6116, + "step": 21170 + }, + { + "epoch": 3.4241370948185272, + "grad_norm": 1.004738688468933, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 21180 + }, + { + "epoch": 3.4257537789992725, + "grad_norm": 0.7545676827430725, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 21190 + }, + { + "epoch": 3.4273704631800177, + "grad_norm": 0.8918704390525818, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 21200 + }, + { + "epoch": 3.428987147360763, + "grad_norm": 0.8336876034736633, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 21210 + }, + { + "epoch": 3.430603831541508, + "grad_norm": 0.8928771018981934, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 21220 + }, + { + "epoch": 3.432220515722254, + "grad_norm": 0.7663705945014954, + "learning_rate": 0.0002, + "loss": 0.6468, + "step": 21230 + }, + { + "epoch": 3.433837199902999, + "grad_norm": 0.8392598628997803, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 21240 + }, + { + "epoch": 3.4354538840837443, + "grad_norm": 0.8819600343704224, + "learning_rate": 0.0002, + "loss": 0.5971, + "step": 21250 + }, + { + "epoch": 3.4370705682644895, + "grad_norm": 0.9124642014503479, + "learning_rate": 0.0002, + "loss": 0.6791, + "step": 21260 + }, + { + "epoch": 3.4386872524452348, + "grad_norm": 0.8329763412475586, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 21270 + }, + { + "epoch": 3.44030393662598, + "grad_norm": 0.9982839822769165, + "learning_rate": 0.0002, + "loss": 0.6541, + "step": 21280 + }, + { + "epoch": 3.4419206208067252, + "grad_norm": 0.9105954766273499, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 21290 + }, + { + "epoch": 3.443537304987471, + "grad_norm": 0.8182359337806702, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 21300 + }, + { + "epoch": 3.445153989168216, + "grad_norm": 1.0568904876708984, + "learning_rate": 0.0002, + "loss": 0.5991, + "step": 21310 + }, + { + "epoch": 3.4467706733489614, + "grad_norm": 0.968539834022522, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 21320 + }, + { + "epoch": 3.4483873575297066, + "grad_norm": 0.8774511218070984, + "learning_rate": 0.0002, + "loss": 0.6219, + "step": 21330 + }, + { + "epoch": 3.450004041710452, + "grad_norm": 0.7598156332969666, + "learning_rate": 0.0002, + "loss": 0.6438, + "step": 21340 + }, + { + "epoch": 3.451620725891197, + "grad_norm": 1.1012897491455078, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 21350 + }, + { + "epoch": 3.4532374100719423, + "grad_norm": 0.8040637373924255, + "learning_rate": 0.0002, + "loss": 0.6137, + "step": 21360 + }, + { + "epoch": 3.4548540942526875, + "grad_norm": 0.8497496247291565, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 21370 + }, + { + "epoch": 3.456470778433433, + "grad_norm": 0.8429915904998779, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 21380 + }, + { + "epoch": 3.4580874626141784, + "grad_norm": 0.8107112646102905, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 21390 + }, + { + "epoch": 3.4597041467949237, + "grad_norm": 1.00872004032135, + "learning_rate": 0.0002, + "loss": 0.6109, + "step": 21400 + }, + { + "epoch": 3.461320830975669, + "grad_norm": 0.8266542553901672, + "learning_rate": 0.0002, + "loss": 0.5712, + "step": 21410 + }, + { + "epoch": 3.462937515156414, + "grad_norm": 0.8972568511962891, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 21420 + }, + { + "epoch": 3.4645541993371594, + "grad_norm": 1.0781476497650146, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 21430 + }, + { + "epoch": 3.4661708835179046, + "grad_norm": 0.9571592807769775, + "learning_rate": 0.0002, + "loss": 0.6303, + "step": 21440 + }, + { + "epoch": 3.4677875676986503, + "grad_norm": 0.881547212600708, + "learning_rate": 0.0002, + "loss": 0.6309, + "step": 21450 + }, + { + "epoch": 3.4694042518793955, + "grad_norm": 0.6955338716506958, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 21460 + }, + { + "epoch": 3.4710209360601407, + "grad_norm": 0.901187539100647, + "learning_rate": 0.0002, + "loss": 0.6205, + "step": 21470 + }, + { + "epoch": 3.472637620240886, + "grad_norm": 0.7063511610031128, + "learning_rate": 0.0002, + "loss": 0.639, + "step": 21480 + }, + { + "epoch": 3.474254304421631, + "grad_norm": 0.8462792038917542, + "learning_rate": 0.0002, + "loss": 0.6154, + "step": 21490 + }, + { + "epoch": 3.4758709886023764, + "grad_norm": 1.1861060857772827, + "learning_rate": 0.0002, + "loss": 0.61, + "step": 21500 + }, + { + "epoch": 3.4774876727831217, + "grad_norm": 0.70503169298172, + "learning_rate": 0.0002, + "loss": 0.6586, + "step": 21510 + }, + { + "epoch": 3.479104356963867, + "grad_norm": 0.9650066494941711, + "learning_rate": 0.0002, + "loss": 0.6475, + "step": 21520 + }, + { + "epoch": 3.4807210411446126, + "grad_norm": 1.0266852378845215, + "learning_rate": 0.0002, + "loss": 0.6452, + "step": 21530 + }, + { + "epoch": 3.482337725325358, + "grad_norm": 0.956372857093811, + "learning_rate": 0.0002, + "loss": 0.6553, + "step": 21540 + }, + { + "epoch": 3.483954409506103, + "grad_norm": 0.8848432898521423, + "learning_rate": 0.0002, + "loss": 0.6667, + "step": 21550 + }, + { + "epoch": 3.4855710936868483, + "grad_norm": 1.0805351734161377, + "learning_rate": 0.0002, + "loss": 0.6375, + "step": 21560 + }, + { + "epoch": 3.4871877778675935, + "grad_norm": 0.9279725551605225, + "learning_rate": 0.0002, + "loss": 0.6958, + "step": 21570 + }, + { + "epoch": 3.4888044620483387, + "grad_norm": 0.9049562215805054, + "learning_rate": 0.0002, + "loss": 0.6354, + "step": 21580 + }, + { + "epoch": 3.4904211462290844, + "grad_norm": 0.9619429111480713, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 21590 + }, + { + "epoch": 3.4920378304098296, + "grad_norm": 0.8508906960487366, + "learning_rate": 0.0002, + "loss": 0.5927, + "step": 21600 + }, + { + "epoch": 3.493654514590575, + "grad_norm": 0.8692502379417419, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 21610 + }, + { + "epoch": 3.49527119877132, + "grad_norm": 0.8187332153320312, + "learning_rate": 0.0002, + "loss": 0.5878, + "step": 21620 + }, + { + "epoch": 3.4968878829520653, + "grad_norm": 1.145400047302246, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 21630 + }, + { + "epoch": 3.4985045671328105, + "grad_norm": 0.8281388282775879, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 21640 + }, + { + "epoch": 3.500121251313556, + "grad_norm": 0.82256019115448, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 21650 + }, + { + "epoch": 3.501737935494301, + "grad_norm": 0.9315484762191772, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 21660 + }, + { + "epoch": 3.5033546196750462, + "grad_norm": 0.7626111507415771, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 21670 + }, + { + "epoch": 3.504971303855792, + "grad_norm": 0.9275059103965759, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 21680 + }, + { + "epoch": 3.506587988036537, + "grad_norm": 0.7906724810600281, + "learning_rate": 0.0002, + "loss": 0.64, + "step": 21690 + }, + { + "epoch": 3.5082046722172824, + "grad_norm": 0.8289761543273926, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 21700 + }, + { + "epoch": 3.5098213563980276, + "grad_norm": 0.8316431045532227, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 21710 + }, + { + "epoch": 3.511438040578773, + "grad_norm": 1.0451812744140625, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 21720 + }, + { + "epoch": 3.513054724759518, + "grad_norm": 0.928252637386322, + "learning_rate": 0.0002, + "loss": 0.632, + "step": 21730 + }, + { + "epoch": 3.5146714089402638, + "grad_norm": 0.7985895276069641, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 21740 + }, + { + "epoch": 3.516288093121009, + "grad_norm": 0.6740974187850952, + "learning_rate": 0.0002, + "loss": 0.6463, + "step": 21750 + }, + { + "epoch": 3.517904777301754, + "grad_norm": 0.8482223749160767, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 21760 + }, + { + "epoch": 3.5195214614824994, + "grad_norm": 0.889947772026062, + "learning_rate": 0.0002, + "loss": 0.6277, + "step": 21770 + }, + { + "epoch": 3.5211381456632447, + "grad_norm": 0.8304598927497864, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 21780 + }, + { + "epoch": 3.52275482984399, + "grad_norm": 0.8002981543540955, + "learning_rate": 0.0002, + "loss": 0.6156, + "step": 21790 + }, + { + "epoch": 3.524371514024735, + "grad_norm": 0.8115083575248718, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 21800 + }, + { + "epoch": 3.5259881982054804, + "grad_norm": 0.9715048670768738, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 21810 + }, + { + "epoch": 3.5276048823862256, + "grad_norm": 1.0910786390304565, + "learning_rate": 0.0002, + "loss": 0.6715, + "step": 21820 + }, + { + "epoch": 3.5292215665669713, + "grad_norm": 0.8438942432403564, + "learning_rate": 0.0002, + "loss": 0.6543, + "step": 21830 + }, + { + "epoch": 3.5308382507477165, + "grad_norm": 0.8813382983207703, + "learning_rate": 0.0002, + "loss": 0.6509, + "step": 21840 + }, + { + "epoch": 3.5324549349284617, + "grad_norm": 0.7092908024787903, + "learning_rate": 0.0002, + "loss": 0.6049, + "step": 21850 + }, + { + "epoch": 3.534071619109207, + "grad_norm": 0.8332187533378601, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 21860 + }, + { + "epoch": 3.535688303289952, + "grad_norm": 0.8958209156990051, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 21870 + }, + { + "epoch": 3.5373049874706974, + "grad_norm": 0.824138879776001, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 21880 + }, + { + "epoch": 3.538921671651443, + "grad_norm": 0.8375158309936523, + "learning_rate": 0.0002, + "loss": 0.6022, + "step": 21890 + }, + { + "epoch": 3.5405383558321883, + "grad_norm": 1.0274608135223389, + "learning_rate": 0.0002, + "loss": 0.6019, + "step": 21900 + }, + { + "epoch": 3.5421550400129336, + "grad_norm": 0.7088932394981384, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 21910 + }, + { + "epoch": 3.543771724193679, + "grad_norm": 0.8172445297241211, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 21920 + }, + { + "epoch": 3.545388408374424, + "grad_norm": 0.9904135465621948, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 21930 + }, + { + "epoch": 3.5470050925551693, + "grad_norm": 0.9900432229042053, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 21940 + }, + { + "epoch": 3.5486217767359145, + "grad_norm": 0.8963301181793213, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 21950 + }, + { + "epoch": 3.5502384609166597, + "grad_norm": 0.8551464676856995, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 21960 + }, + { + "epoch": 3.551855145097405, + "grad_norm": 1.0916603803634644, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 21970 + }, + { + "epoch": 3.5534718292781506, + "grad_norm": 0.841598391532898, + "learning_rate": 0.0002, + "loss": 0.6523, + "step": 21980 + }, + { + "epoch": 3.555088513458896, + "grad_norm": 0.8566757440567017, + "learning_rate": 0.0002, + "loss": 0.617, + "step": 21990 + }, + { + "epoch": 3.556705197639641, + "grad_norm": 1.0145052671432495, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 22000 + }, + { + "epoch": 3.5583218818203863, + "grad_norm": 0.9293754696846008, + "learning_rate": 0.0002, + "loss": 0.6173, + "step": 22010 + }, + { + "epoch": 3.5599385660011316, + "grad_norm": 0.9568536281585693, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 22020 + }, + { + "epoch": 3.5615552501818772, + "grad_norm": 0.8613139986991882, + "learning_rate": 0.0002, + "loss": 0.641, + "step": 22030 + }, + { + "epoch": 3.5631719343626225, + "grad_norm": 0.8179237246513367, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 22040 + }, + { + "epoch": 3.5647886185433677, + "grad_norm": 0.9059830904006958, + "learning_rate": 0.0002, + "loss": 0.574, + "step": 22050 + }, + { + "epoch": 3.566405302724113, + "grad_norm": 1.0068252086639404, + "learning_rate": 0.0002, + "loss": 0.6448, + "step": 22060 + }, + { + "epoch": 3.568021986904858, + "grad_norm": 0.9682072997093201, + "learning_rate": 0.0002, + "loss": 0.6239, + "step": 22070 + }, + { + "epoch": 3.5696386710856034, + "grad_norm": 0.8514005541801453, + "learning_rate": 0.0002, + "loss": 0.6808, + "step": 22080 + }, + { + "epoch": 3.5712553552663486, + "grad_norm": 0.8327770829200745, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 22090 + }, + { + "epoch": 3.572872039447094, + "grad_norm": 1.024976372718811, + "learning_rate": 0.0002, + "loss": 0.5976, + "step": 22100 + }, + { + "epoch": 3.574488723627839, + "grad_norm": 0.7721174955368042, + "learning_rate": 0.0002, + "loss": 0.624, + "step": 22110 + }, + { + "epoch": 3.5761054078085843, + "grad_norm": 1.0351054668426514, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 22120 + }, + { + "epoch": 3.57772209198933, + "grad_norm": 0.9680907130241394, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 22130 + }, + { + "epoch": 3.5793387761700752, + "grad_norm": 0.8016974925994873, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 22140 + }, + { + "epoch": 3.5809554603508205, + "grad_norm": 1.0109003782272339, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 22150 + }, + { + "epoch": 3.5825721445315657, + "grad_norm": 1.0473392009735107, + "learning_rate": 0.0002, + "loss": 0.6368, + "step": 22160 + }, + { + "epoch": 3.584188828712311, + "grad_norm": 0.8686613440513611, + "learning_rate": 0.0002, + "loss": 0.6353, + "step": 22170 + }, + { + "epoch": 3.5858055128930566, + "grad_norm": 0.869149923324585, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 22180 + }, + { + "epoch": 3.587422197073802, + "grad_norm": 0.9769062995910645, + "learning_rate": 0.0002, + "loss": 0.5895, + "step": 22190 + }, + { + "epoch": 3.589038881254547, + "grad_norm": 0.779636561870575, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 22200 + }, + { + "epoch": 3.5906555654352923, + "grad_norm": 0.9063841104507446, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 22210 + }, + { + "epoch": 3.5922722496160375, + "grad_norm": 0.9216037392616272, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 22220 + }, + { + "epoch": 3.5938889337967828, + "grad_norm": 1.0217336416244507, + "learning_rate": 0.0002, + "loss": 0.6484, + "step": 22230 + }, + { + "epoch": 3.595505617977528, + "grad_norm": 0.8513161540031433, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 22240 + }, + { + "epoch": 3.597122302158273, + "grad_norm": 0.8084813952445984, + "learning_rate": 0.0002, + "loss": 0.6301, + "step": 22250 + }, + { + "epoch": 3.5987389863390185, + "grad_norm": 0.8524802923202515, + "learning_rate": 0.0002, + "loss": 0.6197, + "step": 22260 + }, + { + "epoch": 3.600355670519764, + "grad_norm": 0.9356237649917603, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 22270 + }, + { + "epoch": 3.6019723547005094, + "grad_norm": 1.009600281715393, + "learning_rate": 0.0002, + "loss": 0.628, + "step": 22280 + }, + { + "epoch": 3.6035890388812546, + "grad_norm": 0.9900581240653992, + "learning_rate": 0.0002, + "loss": 0.6179, + "step": 22290 + }, + { + "epoch": 3.605205723062, + "grad_norm": 1.062495231628418, + "learning_rate": 0.0002, + "loss": 0.5725, + "step": 22300 + }, + { + "epoch": 3.606822407242745, + "grad_norm": 0.8832381367683411, + "learning_rate": 0.0002, + "loss": 0.607, + "step": 22310 + }, + { + "epoch": 3.6084390914234903, + "grad_norm": 0.9284297823905945, + "learning_rate": 0.0002, + "loss": 0.6215, + "step": 22320 + }, + { + "epoch": 3.610055775604236, + "grad_norm": 1.2381829023361206, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 22330 + }, + { + "epoch": 3.611672459784981, + "grad_norm": 0.929434597492218, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 22340 + }, + { + "epoch": 3.6132891439657264, + "grad_norm": 0.9714490175247192, + "learning_rate": 0.0002, + "loss": 0.6141, + "step": 22350 + }, + { + "epoch": 3.6149058281464717, + "grad_norm": 0.808014988899231, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 22360 + }, + { + "epoch": 3.616522512327217, + "grad_norm": 1.0364398956298828, + "learning_rate": 0.0002, + "loss": 0.6428, + "step": 22370 + }, + { + "epoch": 3.618139196507962, + "grad_norm": 0.7858489751815796, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 22380 + }, + { + "epoch": 3.6197558806887074, + "grad_norm": 0.9920870065689087, + "learning_rate": 0.0002, + "loss": 0.6214, + "step": 22390 + }, + { + "epoch": 3.6213725648694526, + "grad_norm": 0.9183220863342285, + "learning_rate": 0.0002, + "loss": 0.6659, + "step": 22400 + }, + { + "epoch": 3.622989249050198, + "grad_norm": 0.9826246500015259, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 22410 + }, + { + "epoch": 3.6246059332309435, + "grad_norm": 0.8632931113243103, + "learning_rate": 0.0002, + "loss": 0.6441, + "step": 22420 + }, + { + "epoch": 3.6262226174116887, + "grad_norm": 0.8468965291976929, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 22430 + }, + { + "epoch": 3.627839301592434, + "grad_norm": 0.8466871976852417, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 22440 + }, + { + "epoch": 3.629455985773179, + "grad_norm": 0.9501169919967651, + "learning_rate": 0.0002, + "loss": 0.5941, + "step": 22450 + }, + { + "epoch": 3.6310726699539244, + "grad_norm": 0.8906720876693726, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 22460 + }, + { + "epoch": 3.6326893541346696, + "grad_norm": 0.7400227189064026, + "learning_rate": 0.0002, + "loss": 0.6928, + "step": 22470 + }, + { + "epoch": 3.6343060383154153, + "grad_norm": 0.9756355881690979, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 22480 + }, + { + "epoch": 3.6359227224961606, + "grad_norm": 0.7504993081092834, + "learning_rate": 0.0002, + "loss": 0.6203, + "step": 22490 + }, + { + "epoch": 3.637539406676906, + "grad_norm": 0.9270039200782776, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 22500 + }, + { + "epoch": 3.639156090857651, + "grad_norm": 0.8841686844825745, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 22510 + }, + { + "epoch": 3.6407727750383962, + "grad_norm": 0.8533213138580322, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 22520 + }, + { + "epoch": 3.6423894592191415, + "grad_norm": 1.0052043199539185, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 22530 + }, + { + "epoch": 3.6440061433998867, + "grad_norm": 1.0323461294174194, + "learning_rate": 0.0002, + "loss": 0.6363, + "step": 22540 + }, + { + "epoch": 3.645622827580632, + "grad_norm": 0.8654312491416931, + "learning_rate": 0.0002, + "loss": 0.6545, + "step": 22550 + }, + { + "epoch": 3.647239511761377, + "grad_norm": 0.6400038003921509, + "learning_rate": 0.0002, + "loss": 0.6155, + "step": 22560 + }, + { + "epoch": 3.648856195942123, + "grad_norm": 0.8061298727989197, + "learning_rate": 0.0002, + "loss": 0.5829, + "step": 22570 + }, + { + "epoch": 3.650472880122868, + "grad_norm": 0.9257854223251343, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 22580 + }, + { + "epoch": 3.6520895643036133, + "grad_norm": 0.8439396619796753, + "learning_rate": 0.0002, + "loss": 0.6409, + "step": 22590 + }, + { + "epoch": 3.6537062484843585, + "grad_norm": 0.7764544486999512, + "learning_rate": 0.0002, + "loss": 0.5996, + "step": 22600 + }, + { + "epoch": 3.6553229326651038, + "grad_norm": 1.125451683998108, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 22610 + }, + { + "epoch": 3.656939616845849, + "grad_norm": 0.7523018717765808, + "learning_rate": 0.0002, + "loss": 0.6579, + "step": 22620 + }, + { + "epoch": 3.6585563010265947, + "grad_norm": 1.071026086807251, + "learning_rate": 0.0002, + "loss": 0.6476, + "step": 22630 + }, + { + "epoch": 3.66017298520734, + "grad_norm": 0.945791482925415, + "learning_rate": 0.0002, + "loss": 0.6459, + "step": 22640 + }, + { + "epoch": 3.661789669388085, + "grad_norm": 0.8001811504364014, + "learning_rate": 0.0002, + "loss": 0.659, + "step": 22650 + }, + { + "epoch": 3.6634063535688304, + "grad_norm": 0.9700816869735718, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 22660 + }, + { + "epoch": 3.6650230377495756, + "grad_norm": 0.9053242206573486, + "learning_rate": 0.0002, + "loss": 0.6337, + "step": 22670 + }, + { + "epoch": 3.666639721930321, + "grad_norm": 0.944362461566925, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 22680 + }, + { + "epoch": 3.668256406111066, + "grad_norm": 1.067489504814148, + "learning_rate": 0.0002, + "loss": 0.6235, + "step": 22690 + }, + { + "epoch": 3.6698730902918113, + "grad_norm": 1.0984995365142822, + "learning_rate": 0.0002, + "loss": 0.698, + "step": 22700 + }, + { + "epoch": 3.6714897744725565, + "grad_norm": 0.9336317777633667, + "learning_rate": 0.0002, + "loss": 0.6717, + "step": 22710 + }, + { + "epoch": 3.673106458653302, + "grad_norm": 0.9261918663978577, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 22720 + }, + { + "epoch": 3.6747231428340474, + "grad_norm": 0.8648008704185486, + "learning_rate": 0.0002, + "loss": 0.6332, + "step": 22730 + }, + { + "epoch": 3.6763398270147927, + "grad_norm": 0.7225083708763123, + "learning_rate": 0.0002, + "loss": 0.6576, + "step": 22740 + }, + { + "epoch": 3.677956511195538, + "grad_norm": 0.9258282780647278, + "learning_rate": 0.0002, + "loss": 0.6406, + "step": 22750 + }, + { + "epoch": 3.679573195376283, + "grad_norm": 0.70876145362854, + "learning_rate": 0.0002, + "loss": 0.6397, + "step": 22760 + }, + { + "epoch": 3.681189879557029, + "grad_norm": 0.8780210018157959, + "learning_rate": 0.0002, + "loss": 0.6821, + "step": 22770 + }, + { + "epoch": 3.682806563737774, + "grad_norm": 0.8075440526008606, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 22780 + }, + { + "epoch": 3.6844232479185193, + "grad_norm": 0.8503130674362183, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 22790 + }, + { + "epoch": 3.6860399320992645, + "grad_norm": 0.8413618206977844, + "learning_rate": 0.0002, + "loss": 0.6082, + "step": 22800 + }, + { + "epoch": 3.6876566162800097, + "grad_norm": 0.8675165176391602, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 22810 + }, + { + "epoch": 3.689273300460755, + "grad_norm": 0.8235884308815002, + "learning_rate": 0.0002, + "loss": 0.6157, + "step": 22820 + }, + { + "epoch": 3.6908899846415, + "grad_norm": 0.9477725625038147, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 22830 + }, + { + "epoch": 3.6925066688222454, + "grad_norm": 0.7883533835411072, + "learning_rate": 0.0002, + "loss": 0.6481, + "step": 22840 + }, + { + "epoch": 3.6941233530029907, + "grad_norm": 1.047913908958435, + "learning_rate": 0.0002, + "loss": 0.5872, + "step": 22850 + }, + { + "epoch": 3.695740037183736, + "grad_norm": 0.9171528816223145, + "learning_rate": 0.0002, + "loss": 0.6176, + "step": 22860 + }, + { + "epoch": 3.6973567213644816, + "grad_norm": 0.9338192343711853, + "learning_rate": 0.0002, + "loss": 0.6204, + "step": 22870 + }, + { + "epoch": 3.698973405545227, + "grad_norm": 0.8799443244934082, + "learning_rate": 0.0002, + "loss": 0.686, + "step": 22880 + }, + { + "epoch": 3.700590089725972, + "grad_norm": 0.8515434861183167, + "learning_rate": 0.0002, + "loss": 0.6206, + "step": 22890 + }, + { + "epoch": 3.7022067739067173, + "grad_norm": 0.7805591821670532, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 22900 + }, + { + "epoch": 3.7038234580874625, + "grad_norm": 0.8470911979675293, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 22910 + }, + { + "epoch": 3.705440142268208, + "grad_norm": 0.9452309012413025, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 22920 + }, + { + "epoch": 3.7070568264489534, + "grad_norm": 0.950243353843689, + "learning_rate": 0.0002, + "loss": 0.6529, + "step": 22930 + }, + { + "epoch": 3.7086735106296986, + "grad_norm": 0.7882499098777771, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 22940 + }, + { + "epoch": 3.710290194810444, + "grad_norm": 0.8307787775993347, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 22950 + }, + { + "epoch": 3.711906878991189, + "grad_norm": 1.0970630645751953, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 22960 + }, + { + "epoch": 3.7135235631719343, + "grad_norm": 0.8269566297531128, + "learning_rate": 0.0002, + "loss": 0.6281, + "step": 22970 + }, + { + "epoch": 3.7151402473526796, + "grad_norm": 0.8306704759597778, + "learning_rate": 0.0002, + "loss": 0.6561, + "step": 22980 + }, + { + "epoch": 3.716756931533425, + "grad_norm": 0.9710225462913513, + "learning_rate": 0.0002, + "loss": 0.6418, + "step": 22990 + }, + { + "epoch": 3.71837361571417, + "grad_norm": 0.8890530467033386, + "learning_rate": 0.0002, + "loss": 0.6639, + "step": 23000 + }, + { + "epoch": 3.7199902998949153, + "grad_norm": 0.883522629737854, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 23010 + }, + { + "epoch": 3.721606984075661, + "grad_norm": 0.8662652373313904, + "learning_rate": 0.0002, + "loss": 0.6183, + "step": 23020 + }, + { + "epoch": 3.723223668256406, + "grad_norm": 0.7228406667709351, + "learning_rate": 0.0002, + "loss": 0.6266, + "step": 23030 + }, + { + "epoch": 3.7248403524371514, + "grad_norm": 1.060792088508606, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 23040 + }, + { + "epoch": 3.7264570366178966, + "grad_norm": 1.0119613409042358, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 23050 + }, + { + "epoch": 3.728073720798642, + "grad_norm": 0.9212996959686279, + "learning_rate": 0.0002, + "loss": 0.6466, + "step": 23060 + }, + { + "epoch": 3.7296904049793875, + "grad_norm": 0.925690233707428, + "learning_rate": 0.0002, + "loss": 0.6454, + "step": 23070 + }, + { + "epoch": 3.7313070891601328, + "grad_norm": 0.8323310613632202, + "learning_rate": 0.0002, + "loss": 0.615, + "step": 23080 + }, + { + "epoch": 3.732923773340878, + "grad_norm": 0.8966048955917358, + "learning_rate": 0.0002, + "loss": 0.679, + "step": 23090 + }, + { + "epoch": 3.7345404575216232, + "grad_norm": 0.8995837569236755, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 23100 + }, + { + "epoch": 3.7361571417023685, + "grad_norm": 0.8748890161514282, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 23110 + }, + { + "epoch": 3.7377738258831137, + "grad_norm": 0.7985540628433228, + "learning_rate": 0.0002, + "loss": 0.6246, + "step": 23120 + }, + { + "epoch": 3.739390510063859, + "grad_norm": 1.0240917205810547, + "learning_rate": 0.0002, + "loss": 0.6279, + "step": 23130 + }, + { + "epoch": 3.741007194244604, + "grad_norm": 0.9181789755821228, + "learning_rate": 0.0002, + "loss": 0.6747, + "step": 23140 + }, + { + "epoch": 3.7426238784253494, + "grad_norm": 0.8896583914756775, + "learning_rate": 0.0002, + "loss": 0.6026, + "step": 23150 + }, + { + "epoch": 3.744240562606095, + "grad_norm": 0.8635515570640564, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 23160 + }, + { + "epoch": 3.7458572467868403, + "grad_norm": 0.8873575329780579, + "learning_rate": 0.0002, + "loss": 0.6683, + "step": 23170 + }, + { + "epoch": 3.7474739309675855, + "grad_norm": 0.9807148575782776, + "learning_rate": 0.0002, + "loss": 0.6143, + "step": 23180 + }, + { + "epoch": 3.7490906151483308, + "grad_norm": 0.900477945804596, + "learning_rate": 0.0002, + "loss": 0.6381, + "step": 23190 + }, + { + "epoch": 3.750707299329076, + "grad_norm": 0.9379992485046387, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 23200 + }, + { + "epoch": 3.752323983509821, + "grad_norm": 0.9649890661239624, + "learning_rate": 0.0002, + "loss": 0.6015, + "step": 23210 + }, + { + "epoch": 3.753940667690567, + "grad_norm": 0.824442446231842, + "learning_rate": 0.0002, + "loss": 0.6735, + "step": 23220 + }, + { + "epoch": 3.755557351871312, + "grad_norm": 0.8896150588989258, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 23230 + }, + { + "epoch": 3.7571740360520574, + "grad_norm": 0.751249372959137, + "learning_rate": 0.0002, + "loss": 0.6081, + "step": 23240 + }, + { + "epoch": 3.7587907202328026, + "grad_norm": 0.9392193555831909, + "learning_rate": 0.0002, + "loss": 0.629, + "step": 23250 + }, + { + "epoch": 3.760407404413548, + "grad_norm": 0.9284586310386658, + "learning_rate": 0.0002, + "loss": 0.6209, + "step": 23260 + }, + { + "epoch": 3.762024088594293, + "grad_norm": 0.7738175392150879, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 23270 + }, + { + "epoch": 3.7636407727750383, + "grad_norm": 0.9252978563308716, + "learning_rate": 0.0002, + "loss": 0.6743, + "step": 23280 + }, + { + "epoch": 3.7652574569557835, + "grad_norm": 0.9501895904541016, + "learning_rate": 0.0002, + "loss": 0.5984, + "step": 23290 + }, + { + "epoch": 3.7668741411365287, + "grad_norm": 0.9416276216506958, + "learning_rate": 0.0002, + "loss": 0.6568, + "step": 23300 + }, + { + "epoch": 3.7684908253172744, + "grad_norm": 0.7076631784439087, + "learning_rate": 0.0002, + "loss": 0.6507, + "step": 23310 + }, + { + "epoch": 3.7701075094980196, + "grad_norm": 0.9864492416381836, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 23320 + }, + { + "epoch": 3.771724193678765, + "grad_norm": 0.8450456261634827, + "learning_rate": 0.0002, + "loss": 0.6537, + "step": 23330 + }, + { + "epoch": 3.77334087785951, + "grad_norm": 1.0768941640853882, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 23340 + }, + { + "epoch": 3.7749575620402553, + "grad_norm": 0.9956819415092468, + "learning_rate": 0.0002, + "loss": 0.6408, + "step": 23350 + }, + { + "epoch": 3.7765742462210006, + "grad_norm": 0.9234658479690552, + "learning_rate": 0.0002, + "loss": 0.6464, + "step": 23360 + }, + { + "epoch": 3.7781909304017463, + "grad_norm": 1.0993858575820923, + "learning_rate": 0.0002, + "loss": 0.6542, + "step": 23370 + }, + { + "epoch": 3.7798076145824915, + "grad_norm": 0.923159658908844, + "learning_rate": 0.0002, + "loss": 0.6391, + "step": 23380 + }, + { + "epoch": 3.7814242987632367, + "grad_norm": 0.9311541318893433, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 23390 + }, + { + "epoch": 3.783040982943982, + "grad_norm": 0.919681191444397, + "learning_rate": 0.0002, + "loss": 0.6535, + "step": 23400 + }, + { + "epoch": 3.784657667124727, + "grad_norm": 1.7406195402145386, + "learning_rate": 0.0002, + "loss": 0.6138, + "step": 23410 + }, + { + "epoch": 3.7862743513054724, + "grad_norm": 0.7789074182510376, + "learning_rate": 0.0002, + "loss": 0.657, + "step": 23420 + }, + { + "epoch": 3.7878910354862176, + "grad_norm": 0.8302814960479736, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 23430 + }, + { + "epoch": 3.789507719666963, + "grad_norm": 0.8089349269866943, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 23440 + }, + { + "epoch": 3.791124403847708, + "grad_norm": 0.9006284475326538, + "learning_rate": 0.0002, + "loss": 0.6682, + "step": 23450 + }, + { + "epoch": 3.7927410880284538, + "grad_norm": 0.8426766991615295, + "learning_rate": 0.0002, + "loss": 0.6335, + "step": 23460 + }, + { + "epoch": 3.794357772209199, + "grad_norm": 1.2576252222061157, + "learning_rate": 0.0002, + "loss": 0.6364, + "step": 23470 + }, + { + "epoch": 3.7959744563899442, + "grad_norm": 1.0307610034942627, + "learning_rate": 0.0002, + "loss": 0.6324, + "step": 23480 + }, + { + "epoch": 3.7975911405706895, + "grad_norm": 0.8525972962379456, + "learning_rate": 0.0002, + "loss": 0.6262, + "step": 23490 + }, + { + "epoch": 3.7992078247514347, + "grad_norm": 1.159039855003357, + "learning_rate": 0.0002, + "loss": 0.6757, + "step": 23500 + }, + { + "epoch": 3.80082450893218, + "grad_norm": 1.4193549156188965, + "learning_rate": 0.0002, + "loss": 0.6414, + "step": 23510 + }, + { + "epoch": 3.8024411931129256, + "grad_norm": 0.8245543837547302, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 23520 + }, + { + "epoch": 3.804057877293671, + "grad_norm": 0.8847230076789856, + "learning_rate": 0.0002, + "loss": 0.6417, + "step": 23530 + }, + { + "epoch": 3.805674561474416, + "grad_norm": 0.9574624300003052, + "learning_rate": 0.0002, + "loss": 0.6415, + "step": 23540 + }, + { + "epoch": 3.8072912456551613, + "grad_norm": 1.048020601272583, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 23550 + }, + { + "epoch": 3.8089079298359065, + "grad_norm": 0.8302255868911743, + "learning_rate": 0.0002, + "loss": 0.6497, + "step": 23560 + }, + { + "epoch": 3.8105246140166518, + "grad_norm": 0.8269215822219849, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 23570 + }, + { + "epoch": 3.812141298197397, + "grad_norm": 0.9375753402709961, + "learning_rate": 0.0002, + "loss": 0.6294, + "step": 23580 + }, + { + "epoch": 3.8137579823781422, + "grad_norm": 1.0234097242355347, + "learning_rate": 0.0002, + "loss": 0.6132, + "step": 23590 + }, + { + "epoch": 3.8153746665588875, + "grad_norm": 0.8978445529937744, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 23600 + }, + { + "epoch": 3.816991350739633, + "grad_norm": 0.7929515838623047, + "learning_rate": 0.0002, + "loss": 0.6315, + "step": 23610 + }, + { + "epoch": 3.8186080349203784, + "grad_norm": 1.3255881071090698, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 23620 + }, + { + "epoch": 3.8202247191011236, + "grad_norm": 0.9188598990440369, + "learning_rate": 0.0002, + "loss": 0.5947, + "step": 23630 + }, + { + "epoch": 3.821841403281869, + "grad_norm": 0.8811675906181335, + "learning_rate": 0.0002, + "loss": 0.6152, + "step": 23640 + }, + { + "epoch": 3.823458087462614, + "grad_norm": 0.8061038255691528, + "learning_rate": 0.0002, + "loss": 0.6253, + "step": 23650 + }, + { + "epoch": 3.8250747716433597, + "grad_norm": 0.9975376129150391, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 23660 + }, + { + "epoch": 3.826691455824105, + "grad_norm": 0.8036105036735535, + "learning_rate": 0.0002, + "loss": 0.6288, + "step": 23670 + }, + { + "epoch": 3.82830814000485, + "grad_norm": 0.7401984333992004, + "learning_rate": 0.0002, + "loss": 0.6845, + "step": 23680 + }, + { + "epoch": 3.8299248241855954, + "grad_norm": 0.829753041267395, + "learning_rate": 0.0002, + "loss": 0.6423, + "step": 23690 + }, + { + "epoch": 3.8315415083663407, + "grad_norm": 0.8753240704536438, + "learning_rate": 0.0002, + "loss": 0.6611, + "step": 23700 + }, + { + "epoch": 3.833158192547086, + "grad_norm": 0.8157842755317688, + "learning_rate": 0.0002, + "loss": 0.6686, + "step": 23710 + }, + { + "epoch": 3.834774876727831, + "grad_norm": 0.6183798909187317, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 23720 + }, + { + "epoch": 3.8363915609085764, + "grad_norm": 0.9548442363739014, + "learning_rate": 0.0002, + "loss": 0.5965, + "step": 23730 + }, + { + "epoch": 3.8380082450893216, + "grad_norm": 0.8319669961929321, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 23740 + }, + { + "epoch": 3.839624929270067, + "grad_norm": 0.9718693494796753, + "learning_rate": 0.0002, + "loss": 0.6585, + "step": 23750 + }, + { + "epoch": 3.8412416134508125, + "grad_norm": 0.8672235012054443, + "learning_rate": 0.0002, + "loss": 0.6518, + "step": 23760 + }, + { + "epoch": 3.8428582976315577, + "grad_norm": 1.1210707426071167, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 23770 + }, + { + "epoch": 3.844474981812303, + "grad_norm": 0.9177767634391785, + "learning_rate": 0.0002, + "loss": 0.5923, + "step": 23780 + }, + { + "epoch": 3.846091665993048, + "grad_norm": 0.8714171648025513, + "learning_rate": 0.0002, + "loss": 0.6286, + "step": 23790 + }, + { + "epoch": 3.8477083501737934, + "grad_norm": 1.1853246688842773, + "learning_rate": 0.0002, + "loss": 0.6302, + "step": 23800 + }, + { + "epoch": 3.849325034354539, + "grad_norm": 0.8091260194778442, + "learning_rate": 0.0002, + "loss": 0.6144, + "step": 23810 + }, + { + "epoch": 3.8509417185352843, + "grad_norm": 0.9710774421691895, + "learning_rate": 0.0002, + "loss": 0.658, + "step": 23820 + }, + { + "epoch": 3.8525584027160296, + "grad_norm": 0.7648707628250122, + "learning_rate": 0.0002, + "loss": 0.6151, + "step": 23830 + }, + { + "epoch": 3.854175086896775, + "grad_norm": 0.7809253931045532, + "learning_rate": 0.0002, + "loss": 0.6013, + "step": 23840 + }, + { + "epoch": 3.85579177107752, + "grad_norm": 0.8337951898574829, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 23850 + }, + { + "epoch": 3.8574084552582653, + "grad_norm": 0.9271913170814514, + "learning_rate": 0.0002, + "loss": 0.6456, + "step": 23860 + }, + { + "epoch": 3.8590251394390105, + "grad_norm": 0.985334038734436, + "learning_rate": 0.0002, + "loss": 0.6671, + "step": 23870 + }, + { + "epoch": 3.8606418236197557, + "grad_norm": 0.8458583354949951, + "learning_rate": 0.0002, + "loss": 0.6693, + "step": 23880 + }, + { + "epoch": 3.862258507800501, + "grad_norm": 1.015348196029663, + "learning_rate": 0.0002, + "loss": 0.6207, + "step": 23890 + }, + { + "epoch": 3.8638751919812466, + "grad_norm": 1.0121688842773438, + "learning_rate": 0.0002, + "loss": 0.649, + "step": 23900 + }, + { + "epoch": 3.865491876161992, + "grad_norm": 0.8883971571922302, + "learning_rate": 0.0002, + "loss": 0.5921, + "step": 23910 + }, + { + "epoch": 3.867108560342737, + "grad_norm": 1.028086543083191, + "learning_rate": 0.0002, + "loss": 0.6597, + "step": 23920 + }, + { + "epoch": 3.8687252445234823, + "grad_norm": 0.9645734429359436, + "learning_rate": 0.0002, + "loss": 0.6654, + "step": 23930 + }, + { + "epoch": 3.8703419287042276, + "grad_norm": 0.8235350251197815, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 23940 + }, + { + "epoch": 3.871958612884973, + "grad_norm": 1.0298916101455688, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 23950 + }, + { + "epoch": 3.8735752970657185, + "grad_norm": 1.0063377618789673, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 23960 + }, + { + "epoch": 3.8751919812464637, + "grad_norm": 0.9230626821517944, + "learning_rate": 0.0002, + "loss": 0.6234, + "step": 23970 + }, + { + "epoch": 3.876808665427209, + "grad_norm": 0.9243063926696777, + "learning_rate": 0.0002, + "loss": 0.6159, + "step": 23980 + }, + { + "epoch": 3.878425349607954, + "grad_norm": 1.0211291313171387, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 23990 + }, + { + "epoch": 3.8800420337886994, + "grad_norm": 0.7800535559654236, + "learning_rate": 0.0002, + "loss": 0.6351, + "step": 24000 + }, + { + "epoch": 3.8816587179694446, + "grad_norm": 0.7904248833656311, + "learning_rate": 0.0002, + "loss": 0.7, + "step": 24010 + }, + { + "epoch": 3.88327540215019, + "grad_norm": 1.1975988149642944, + "learning_rate": 0.0002, + "loss": 0.6516, + "step": 24020 + }, + { + "epoch": 3.884892086330935, + "grad_norm": 1.0626593828201294, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 24030 + }, + { + "epoch": 3.8865087705116803, + "grad_norm": 0.9012193083763123, + "learning_rate": 0.0002, + "loss": 0.6115, + "step": 24040 + }, + { + "epoch": 3.888125454692426, + "grad_norm": 1.1159172058105469, + "learning_rate": 0.0002, + "loss": 0.6786, + "step": 24050 + }, + { + "epoch": 3.889742138873171, + "grad_norm": 1.276838779449463, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 24060 + }, + { + "epoch": 3.8913588230539164, + "grad_norm": 0.8467690348625183, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 24070 + }, + { + "epoch": 3.8929755072346617, + "grad_norm": 0.9862841963768005, + "learning_rate": 0.0002, + "loss": 0.6655, + "step": 24080 + }, + { + "epoch": 3.894592191415407, + "grad_norm": 0.7134621739387512, + "learning_rate": 0.0002, + "loss": 0.6098, + "step": 24090 + }, + { + "epoch": 3.896208875596152, + "grad_norm": 0.8178175091743469, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 24100 + }, + { + "epoch": 3.897825559776898, + "grad_norm": 0.9229172468185425, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 24110 + }, + { + "epoch": 3.899442243957643, + "grad_norm": 1.0878316164016724, + "learning_rate": 0.0002, + "loss": 0.6554, + "step": 24120 + }, + { + "epoch": 3.9010589281383883, + "grad_norm": 0.971645712852478, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 24130 + }, + { + "epoch": 3.9026756123191335, + "grad_norm": 0.8862188458442688, + "learning_rate": 0.0002, + "loss": 0.6228, + "step": 24140 + }, + { + "epoch": 3.9042922964998787, + "grad_norm": 0.9126982688903809, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 24150 + }, + { + "epoch": 3.905908980680624, + "grad_norm": 0.8833470940589905, + "learning_rate": 0.0002, + "loss": 0.6734, + "step": 24160 + }, + { + "epoch": 3.907525664861369, + "grad_norm": 0.8320947885513306, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 24170 + }, + { + "epoch": 3.9091423490421144, + "grad_norm": 0.9156602025032043, + "learning_rate": 0.0002, + "loss": 0.6247, + "step": 24180 + }, + { + "epoch": 3.9107590332228597, + "grad_norm": 1.029181957244873, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 24190 + }, + { + "epoch": 3.9123757174036053, + "grad_norm": 0.9052802324295044, + "learning_rate": 0.0002, + "loss": 0.6565, + "step": 24200 + }, + { + "epoch": 3.9139924015843506, + "grad_norm": 0.8847255110740662, + "learning_rate": 0.0002, + "loss": 0.6346, + "step": 24210 + }, + { + "epoch": 3.915609085765096, + "grad_norm": 0.9642062187194824, + "learning_rate": 0.0002, + "loss": 0.6343, + "step": 24220 + }, + { + "epoch": 3.917225769945841, + "grad_norm": 0.8629093766212463, + "learning_rate": 0.0002, + "loss": 0.6557, + "step": 24230 + }, + { + "epoch": 3.9188424541265863, + "grad_norm": 0.8674976825714111, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 24240 + }, + { + "epoch": 3.9204591383073315, + "grad_norm": 1.104846477508545, + "learning_rate": 0.0002, + "loss": 0.5874, + "step": 24250 + }, + { + "epoch": 3.922075822488077, + "grad_norm": 1.0874955654144287, + "learning_rate": 0.0002, + "loss": 0.6501, + "step": 24260 + }, + { + "epoch": 3.9236925066688224, + "grad_norm": 0.8689812421798706, + "learning_rate": 0.0002, + "loss": 0.6455, + "step": 24270 + }, + { + "epoch": 3.9253091908495676, + "grad_norm": 0.9724617004394531, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 24280 + }, + { + "epoch": 3.926925875030313, + "grad_norm": 0.9165538549423218, + "learning_rate": 0.0002, + "loss": 0.6616, + "step": 24290 + }, + { + "epoch": 3.928542559211058, + "grad_norm": 0.9307710528373718, + "learning_rate": 0.0002, + "loss": 0.645, + "step": 24300 + }, + { + "epoch": 3.9301592433918033, + "grad_norm": 0.8589295148849487, + "learning_rate": 0.0002, + "loss": 0.6071, + "step": 24310 + }, + { + "epoch": 3.9317759275725486, + "grad_norm": 0.9151099920272827, + "learning_rate": 0.0002, + "loss": 0.6662, + "step": 24320 + }, + { + "epoch": 3.933392611753294, + "grad_norm": 0.9633517265319824, + "learning_rate": 0.0002, + "loss": 0.7075, + "step": 24330 + }, + { + "epoch": 3.935009295934039, + "grad_norm": 0.9521116018295288, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 24340 + }, + { + "epoch": 3.9366259801147847, + "grad_norm": 0.8366776704788208, + "learning_rate": 0.0002, + "loss": 0.6457, + "step": 24350 + }, + { + "epoch": 3.93824266429553, + "grad_norm": 0.8972663283348083, + "learning_rate": 0.0002, + "loss": 0.6139, + "step": 24360 + }, + { + "epoch": 3.939859348476275, + "grad_norm": 0.8102919459342957, + "learning_rate": 0.0002, + "loss": 0.661, + "step": 24370 + }, + { + "epoch": 3.9414760326570204, + "grad_norm": 0.8189975023269653, + "learning_rate": 0.0002, + "loss": 0.6388, + "step": 24380 + }, + { + "epoch": 3.9430927168377656, + "grad_norm": 0.9569464921951294, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 24390 + }, + { + "epoch": 3.9447094010185113, + "grad_norm": 0.7459101676940918, + "learning_rate": 0.0002, + "loss": 0.6999, + "step": 24400 + }, + { + "epoch": 3.9463260851992565, + "grad_norm": 0.8536974787712097, + "learning_rate": 0.0002, + "loss": 0.6069, + "step": 24410 + }, + { + "epoch": 3.9479427693800018, + "grad_norm": 0.8763698935508728, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 24420 + }, + { + "epoch": 3.949559453560747, + "grad_norm": 0.9381106495857239, + "learning_rate": 0.0002, + "loss": 0.6478, + "step": 24430 + }, + { + "epoch": 3.9511761377414922, + "grad_norm": 0.934440016746521, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 24440 + }, + { + "epoch": 3.9527928219222375, + "grad_norm": 0.903918981552124, + "learning_rate": 0.0002, + "loss": 0.6393, + "step": 24450 + }, + { + "epoch": 3.9544095061029827, + "grad_norm": 0.8771953582763672, + "learning_rate": 0.0002, + "loss": 0.6175, + "step": 24460 + }, + { + "epoch": 3.956026190283728, + "grad_norm": 1.0375410318374634, + "learning_rate": 0.0002, + "loss": 0.6971, + "step": 24470 + }, + { + "epoch": 3.957642874464473, + "grad_norm": 0.9439185261726379, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 24480 + }, + { + "epoch": 3.9592595586452184, + "grad_norm": 0.935467004776001, + "learning_rate": 0.0002, + "loss": 0.6076, + "step": 24490 + }, + { + "epoch": 3.960876242825964, + "grad_norm": 0.6900772452354431, + "learning_rate": 0.0002, + "loss": 0.6437, + "step": 24500 + }, + { + "epoch": 3.9624929270067093, + "grad_norm": 1.0172916650772095, + "learning_rate": 0.0002, + "loss": 0.6445, + "step": 24510 + }, + { + "epoch": 3.9641096111874545, + "grad_norm": 0.9167046546936035, + "learning_rate": 0.0002, + "loss": 0.6308, + "step": 24520 + }, + { + "epoch": 3.9657262953681998, + "grad_norm": 0.7230527997016907, + "learning_rate": 0.0002, + "loss": 0.6519, + "step": 24530 + }, + { + "epoch": 3.967342979548945, + "grad_norm": 0.8980403542518616, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 24540 + }, + { + "epoch": 3.9689596637296907, + "grad_norm": 0.8555465936660767, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 24550 + }, + { + "epoch": 3.970576347910436, + "grad_norm": 0.7825445532798767, + "learning_rate": 0.0002, + "loss": 0.6617, + "step": 24560 + }, + { + "epoch": 3.972193032091181, + "grad_norm": 0.7273133993148804, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 24570 + }, + { + "epoch": 3.9738097162719264, + "grad_norm": 0.9612047672271729, + "learning_rate": 0.0002, + "loss": 0.6427, + "step": 24580 + }, + { + "epoch": 3.9754264004526716, + "grad_norm": 0.9865460991859436, + "learning_rate": 0.0002, + "loss": 0.6426, + "step": 24590 + }, + { + "epoch": 3.977043084633417, + "grad_norm": 0.8638762831687927, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 24600 + }, + { + "epoch": 3.978659768814162, + "grad_norm": 1.0096198320388794, + "learning_rate": 0.0002, + "loss": 0.6097, + "step": 24610 + }, + { + "epoch": 3.9802764529949073, + "grad_norm": 0.8475532531738281, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 24620 + }, + { + "epoch": 3.9818931371756525, + "grad_norm": 0.9696195721626282, + "learning_rate": 0.0002, + "loss": 0.6711, + "step": 24630 + }, + { + "epoch": 3.9835098213563978, + "grad_norm": 0.7499843239784241, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 24640 + }, + { + "epoch": 3.9851265055371434, + "grad_norm": 0.8865424990653992, + "learning_rate": 0.0002, + "loss": 0.6054, + "step": 24650 + }, + { + "epoch": 3.9867431897178887, + "grad_norm": 0.8089959025382996, + "learning_rate": 0.0002, + "loss": 0.5975, + "step": 24660 + }, + { + "epoch": 3.988359873898634, + "grad_norm": 0.6946012377738953, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 24670 + }, + { + "epoch": 3.989976558079379, + "grad_norm": 0.7991759181022644, + "learning_rate": 0.0002, + "loss": 0.6329, + "step": 24680 + }, + { + "epoch": 3.9915932422601244, + "grad_norm": 0.8803931474685669, + "learning_rate": 0.0002, + "loss": 0.6449, + "step": 24690 + }, + { + "epoch": 3.99320992644087, + "grad_norm": 0.8848299980163574, + "learning_rate": 0.0002, + "loss": 0.7091, + "step": 24700 + }, + { + "epoch": 3.9948266106216153, + "grad_norm": 0.7448889017105103, + "learning_rate": 0.0002, + "loss": 0.6551, + "step": 24710 + }, + { + "epoch": 3.9964432948023605, + "grad_norm": 0.9361620545387268, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 24720 + }, + { + "epoch": 3.9980599789831057, + "grad_norm": 0.9958081245422363, + "learning_rate": 0.0002, + "loss": 0.5917, + "step": 24730 + }, + { + "epoch": 3.999676663163851, + "grad_norm": 1.026004672050476, + "learning_rate": 0.0002, + "loss": 0.6567, + "step": 24740 + }, + { + "epoch": 4.0, + "eval_loss": 1.1524168252944946, + "eval_runtime": 122.1585, + "eval_samples_per_second": 6.0, + "eval_steps_per_second": 0.753, + "step": 24742 + }, + { + "epoch": 4.001293347344596, + "grad_norm": 1.0664808750152588, + "learning_rate": 0.0002, + "loss": 0.6057, + "step": 24750 + }, + { + "epoch": 4.002910031525341, + "grad_norm": 1.0113720893859863, + "learning_rate": 0.0002, + "loss": 0.5644, + "step": 24760 + }, + { + "epoch": 4.004526715706087, + "grad_norm": 0.991486668586731, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 24770 + }, + { + "epoch": 4.006143399886832, + "grad_norm": 0.951754629611969, + "learning_rate": 0.0002, + "loss": 0.508, + "step": 24780 + }, + { + "epoch": 4.007760084067577, + "grad_norm": 1.13059401512146, + "learning_rate": 0.0002, + "loss": 0.5314, + "step": 24790 + }, + { + "epoch": 4.009376768248322, + "grad_norm": 0.9343926310539246, + "learning_rate": 0.0002, + "loss": 0.5323, + "step": 24800 + }, + { + "epoch": 4.010993452429068, + "grad_norm": 1.0680590867996216, + "learning_rate": 0.0002, + "loss": 0.5161, + "step": 24810 + }, + { + "epoch": 4.012610136609814, + "grad_norm": 1.0022706985473633, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 24820 + }, + { + "epoch": 4.014226820790559, + "grad_norm": 1.0285297632217407, + "learning_rate": 0.0002, + "loss": 0.543, + "step": 24830 + }, + { + "epoch": 4.015843504971304, + "grad_norm": 0.8347002863883972, + "learning_rate": 0.0002, + "loss": 0.5311, + "step": 24840 + }, + { + "epoch": 4.017460189152049, + "grad_norm": 0.9675396680831909, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 24850 + }, + { + "epoch": 4.019076873332795, + "grad_norm": 0.9238511323928833, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 24860 + }, + { + "epoch": 4.02069355751354, + "grad_norm": 1.1576941013336182, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 24870 + }, + { + "epoch": 4.022310241694285, + "grad_norm": 0.8583757281303406, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 24880 + }, + { + "epoch": 4.02392692587503, + "grad_norm": 0.9816817045211792, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 24890 + }, + { + "epoch": 4.0255436100557755, + "grad_norm": 0.955073893070221, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 24900 + }, + { + "epoch": 4.027160294236521, + "grad_norm": 1.1054974794387817, + "learning_rate": 0.0002, + "loss": 0.4896, + "step": 24910 + }, + { + "epoch": 4.028776978417266, + "grad_norm": 1.1240060329437256, + "learning_rate": 0.0002, + "loss": 0.5246, + "step": 24920 + }, + { + "epoch": 4.030393662598011, + "grad_norm": 0.9512825012207031, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 24930 + }, + { + "epoch": 4.0320103467787565, + "grad_norm": 0.85965496301651, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 24940 + }, + { + "epoch": 4.033627030959502, + "grad_norm": 0.9378061294555664, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 24950 + }, + { + "epoch": 4.035243715140247, + "grad_norm": 0.9655424356460571, + "learning_rate": 0.0002, + "loss": 0.5008, + "step": 24960 + }, + { + "epoch": 4.036860399320993, + "grad_norm": 1.1393707990646362, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 24970 + }, + { + "epoch": 4.038477083501738, + "grad_norm": 1.0220451354980469, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 24980 + }, + { + "epoch": 4.0400937676824835, + "grad_norm": 0.9785808324813843, + "learning_rate": 0.0002, + "loss": 0.5813, + "step": 24990 + }, + { + "epoch": 4.041710451863229, + "grad_norm": 1.0257649421691895, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 25000 + }, + { + "epoch": 4.043327136043974, + "grad_norm": 0.9737892150878906, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 25010 + }, + { + "epoch": 4.044943820224719, + "grad_norm": 0.7416959404945374, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 25020 + }, + { + "epoch": 4.046560504405464, + "grad_norm": 0.7909596562385559, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 25030 + }, + { + "epoch": 4.04817718858621, + "grad_norm": 0.8923130631446838, + "learning_rate": 0.0002, + "loss": 0.5265, + "step": 25040 + }, + { + "epoch": 4.049793872766955, + "grad_norm": 0.9044941663742065, + "learning_rate": 0.0002, + "loss": 0.5035, + "step": 25050 + }, + { + "epoch": 4.0514105569477, + "grad_norm": 0.866352379322052, + "learning_rate": 0.0002, + "loss": 0.5135, + "step": 25060 + }, + { + "epoch": 4.053027241128445, + "grad_norm": 1.544549822807312, + "learning_rate": 0.0002, + "loss": 0.5956, + "step": 25070 + }, + { + "epoch": 4.054643925309191, + "grad_norm": 0.8426995277404785, + "learning_rate": 0.0002, + "loss": 0.5418, + "step": 25080 + }, + { + "epoch": 4.056260609489936, + "grad_norm": 0.9797548651695251, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 25090 + }, + { + "epoch": 4.057877293670681, + "grad_norm": 0.8468434810638428, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 25100 + }, + { + "epoch": 4.059493977851426, + "grad_norm": 0.9294559955596924, + "learning_rate": 0.0002, + "loss": 0.5242, + "step": 25110 + }, + { + "epoch": 4.061110662032172, + "grad_norm": 0.9686688780784607, + "learning_rate": 0.0002, + "loss": 0.5295, + "step": 25120 + }, + { + "epoch": 4.062727346212918, + "grad_norm": 0.8042728304862976, + "learning_rate": 0.0002, + "loss": 0.5642, + "step": 25130 + }, + { + "epoch": 4.064344030393663, + "grad_norm": 1.165160894393921, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 25140 + }, + { + "epoch": 4.065960714574408, + "grad_norm": 1.2161961793899536, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 25150 + }, + { + "epoch": 4.067577398755153, + "grad_norm": 1.0762810707092285, + "learning_rate": 0.0002, + "loss": 0.5217, + "step": 25160 + }, + { + "epoch": 4.069194082935899, + "grad_norm": 0.7580869793891907, + "learning_rate": 0.0002, + "loss": 0.5886, + "step": 25170 + }, + { + "epoch": 4.070810767116644, + "grad_norm": 0.9630117416381836, + "learning_rate": 0.0002, + "loss": 0.5401, + "step": 25180 + }, + { + "epoch": 4.072427451297389, + "grad_norm": 0.9049716591835022, + "learning_rate": 0.0002, + "loss": 0.5378, + "step": 25190 + }, + { + "epoch": 4.074044135478134, + "grad_norm": 1.1536930799484253, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 25200 + }, + { + "epoch": 4.0756608196588795, + "grad_norm": 0.901461124420166, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 25210 + }, + { + "epoch": 4.077277503839625, + "grad_norm": 1.3318437337875366, + "learning_rate": 0.0002, + "loss": 0.5132, + "step": 25220 + }, + { + "epoch": 4.07889418802037, + "grad_norm": 0.8811455368995667, + "learning_rate": 0.0002, + "loss": 0.5317, + "step": 25230 + }, + { + "epoch": 4.080510872201115, + "grad_norm": 1.0564165115356445, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 25240 + }, + { + "epoch": 4.08212755638186, + "grad_norm": 1.1008027791976929, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 25250 + }, + { + "epoch": 4.083744240562606, + "grad_norm": 1.150097131729126, + "learning_rate": 0.0002, + "loss": 0.5195, + "step": 25260 + }, + { + "epoch": 4.085360924743352, + "grad_norm": 0.9339924454689026, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 25270 + }, + { + "epoch": 4.086977608924097, + "grad_norm": 1.0902045965194702, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 25280 + }, + { + "epoch": 4.088594293104842, + "grad_norm": 0.8483911156654358, + "learning_rate": 0.0002, + "loss": 0.5203, + "step": 25290 + }, + { + "epoch": 4.0902109772855875, + "grad_norm": 0.9477024674415588, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 25300 + }, + { + "epoch": 4.091827661466333, + "grad_norm": 0.9500215649604797, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 25310 + }, + { + "epoch": 4.093444345647078, + "grad_norm": 1.040468454360962, + "learning_rate": 0.0002, + "loss": 0.5045, + "step": 25320 + }, + { + "epoch": 4.095061029827823, + "grad_norm": 0.7457592487335205, + "learning_rate": 0.0002, + "loss": 0.5488, + "step": 25330 + }, + { + "epoch": 4.096677714008568, + "grad_norm": 1.2092097997665405, + "learning_rate": 0.0002, + "loss": 0.609, + "step": 25340 + }, + { + "epoch": 4.098294398189314, + "grad_norm": 0.9652107954025269, + "learning_rate": 0.0002, + "loss": 0.5174, + "step": 25350 + }, + { + "epoch": 4.099911082370059, + "grad_norm": 0.8464955687522888, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 25360 + }, + { + "epoch": 4.101527766550804, + "grad_norm": 0.875026285648346, + "learning_rate": 0.0002, + "loss": 0.5635, + "step": 25370 + }, + { + "epoch": 4.103144450731549, + "grad_norm": 0.9241740107536316, + "learning_rate": 0.0002, + "loss": 0.5774, + "step": 25380 + }, + { + "epoch": 4.1047611349122946, + "grad_norm": 0.9769546389579773, + "learning_rate": 0.0002, + "loss": 0.5578, + "step": 25390 + }, + { + "epoch": 4.10637781909304, + "grad_norm": 1.1501960754394531, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 25400 + }, + { + "epoch": 4.107994503273786, + "grad_norm": 0.9135243892669678, + "learning_rate": 0.0002, + "loss": 0.5241, + "step": 25410 + }, + { + "epoch": 4.109611187454531, + "grad_norm": 0.9905396103858948, + "learning_rate": 0.0002, + "loss": 0.5152, + "step": 25420 + }, + { + "epoch": 4.111227871635276, + "grad_norm": 0.9845104217529297, + "learning_rate": 0.0002, + "loss": 0.5064, + "step": 25430 + }, + { + "epoch": 4.112844555816022, + "grad_norm": 0.8326883912086487, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 25440 + }, + { + "epoch": 4.114461239996767, + "grad_norm": 0.9264556765556335, + "learning_rate": 0.0002, + "loss": 0.5312, + "step": 25450 + }, + { + "epoch": 4.116077924177512, + "grad_norm": 1.043080449104309, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 25460 + }, + { + "epoch": 4.117694608358257, + "grad_norm": 0.8533386588096619, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 25470 + }, + { + "epoch": 4.1193112925390025, + "grad_norm": 1.0133965015411377, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 25480 + }, + { + "epoch": 4.120927976719748, + "grad_norm": 0.7476310133934021, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 25490 + }, + { + "epoch": 4.122544660900493, + "grad_norm": 1.1247259378433228, + "learning_rate": 0.0002, + "loss": 0.5189, + "step": 25500 + }, + { + "epoch": 4.124161345081238, + "grad_norm": 1.0764678716659546, + "learning_rate": 0.0002, + "loss": 0.5751, + "step": 25510 + }, + { + "epoch": 4.1257780292619834, + "grad_norm": 0.7679798007011414, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 25520 + }, + { + "epoch": 4.127394713442729, + "grad_norm": 0.8877071142196655, + "learning_rate": 0.0002, + "loss": 0.5233, + "step": 25530 + }, + { + "epoch": 4.129011397623474, + "grad_norm": 1.0440239906311035, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 25540 + }, + { + "epoch": 4.130628081804219, + "grad_norm": 0.984145998954773, + "learning_rate": 0.0002, + "loss": 0.5723, + "step": 25550 + }, + { + "epoch": 4.132244765984965, + "grad_norm": 0.8667055368423462, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 25560 + }, + { + "epoch": 4.1338614501657105, + "grad_norm": 1.1300835609436035, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 25570 + }, + { + "epoch": 4.135478134346456, + "grad_norm": 0.9314348101615906, + "learning_rate": 0.0002, + "loss": 0.524, + "step": 25580 + }, + { + "epoch": 4.137094818527201, + "grad_norm": 0.7731879949569702, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 25590 + }, + { + "epoch": 4.138711502707946, + "grad_norm": 1.0080097913742065, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 25600 + }, + { + "epoch": 4.140328186888691, + "grad_norm": 1.2475038766860962, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 25610 + }, + { + "epoch": 4.141944871069437, + "grad_norm": 0.9912930727005005, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 25620 + }, + { + "epoch": 4.143561555250182, + "grad_norm": 0.9088651537895203, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 25630 + }, + { + "epoch": 4.145178239430927, + "grad_norm": 0.8940697312355042, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 25640 + }, + { + "epoch": 4.146794923611672, + "grad_norm": 1.0798203945159912, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 25650 + }, + { + "epoch": 4.148411607792418, + "grad_norm": 0.955172061920166, + "learning_rate": 0.0002, + "loss": 0.5987, + "step": 25660 + }, + { + "epoch": 4.150028291973163, + "grad_norm": 0.9692716002464294, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 25670 + }, + { + "epoch": 4.151644976153908, + "grad_norm": 1.0813939571380615, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 25680 + }, + { + "epoch": 4.153261660334653, + "grad_norm": 1.135675072669983, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 25690 + }, + { + "epoch": 4.1548783445153985, + "grad_norm": 1.0392236709594727, + "learning_rate": 0.0002, + "loss": 0.5247, + "step": 25700 + }, + { + "epoch": 4.156495028696145, + "grad_norm": 0.9473116993904114, + "learning_rate": 0.0002, + "loss": 0.5204, + "step": 25710 + }, + { + "epoch": 4.15811171287689, + "grad_norm": 0.712493896484375, + "learning_rate": 0.0002, + "loss": 0.5339, + "step": 25720 + }, + { + "epoch": 4.159728397057635, + "grad_norm": 0.8724465370178223, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 25730 + }, + { + "epoch": 4.16134508123838, + "grad_norm": 0.9870015978813171, + "learning_rate": 0.0002, + "loss": 0.5325, + "step": 25740 + }, + { + "epoch": 4.1629617654191255, + "grad_norm": 1.025273084640503, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 25750 + }, + { + "epoch": 4.164578449599871, + "grad_norm": 0.9243090152740479, + "learning_rate": 0.0002, + "loss": 0.5223, + "step": 25760 + }, + { + "epoch": 4.166195133780616, + "grad_norm": 1.1656451225280762, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 25770 + }, + { + "epoch": 4.167811817961361, + "grad_norm": 0.936358630657196, + "learning_rate": 0.0002, + "loss": 0.5334, + "step": 25780 + }, + { + "epoch": 4.1694285021421065, + "grad_norm": 0.8618208169937134, + "learning_rate": 0.0002, + "loss": 0.5236, + "step": 25790 + }, + { + "epoch": 4.171045186322852, + "grad_norm": 0.8580600023269653, + "learning_rate": 0.0002, + "loss": 0.5186, + "step": 25800 + }, + { + "epoch": 4.172661870503597, + "grad_norm": 1.0128562450408936, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 25810 + }, + { + "epoch": 4.174278554684342, + "grad_norm": 0.854865312576294, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 25820 + }, + { + "epoch": 4.175895238865087, + "grad_norm": 1.235082745552063, + "learning_rate": 0.0002, + "loss": 0.5377, + "step": 25830 + }, + { + "epoch": 4.177511923045833, + "grad_norm": 0.9796220660209656, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 25840 + }, + { + "epoch": 4.179128607226578, + "grad_norm": 0.8922094702720642, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 25850 + }, + { + "epoch": 4.180745291407324, + "grad_norm": 0.9672530293464661, + "learning_rate": 0.0002, + "loss": 0.5806, + "step": 25860 + }, + { + "epoch": 4.182361975588069, + "grad_norm": 0.8662548661231995, + "learning_rate": 0.0002, + "loss": 0.5074, + "step": 25870 + }, + { + "epoch": 4.1839786597688144, + "grad_norm": 0.7938798069953918, + "learning_rate": 0.0002, + "loss": 0.5329, + "step": 25880 + }, + { + "epoch": 4.18559534394956, + "grad_norm": 1.0517958402633667, + "learning_rate": 0.0002, + "loss": 0.5427, + "step": 25890 + }, + { + "epoch": 4.187212028130305, + "grad_norm": 0.8939275145530701, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 25900 + }, + { + "epoch": 4.18882871231105, + "grad_norm": 1.0296672582626343, + "learning_rate": 0.0002, + "loss": 0.5199, + "step": 25910 + }, + { + "epoch": 4.190445396491795, + "grad_norm": 0.8104017972946167, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 25920 + }, + { + "epoch": 4.192062080672541, + "grad_norm": 0.9984509944915771, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 25930 + }, + { + "epoch": 4.193678764853286, + "grad_norm": 0.9844784736633301, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 25940 + }, + { + "epoch": 4.195295449034031, + "grad_norm": 0.8168622255325317, + "learning_rate": 0.0002, + "loss": 0.5198, + "step": 25950 + }, + { + "epoch": 4.196912133214776, + "grad_norm": 1.0878913402557373, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 25960 + }, + { + "epoch": 4.1985288173955215, + "grad_norm": 0.927126407623291, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 25970 + }, + { + "epoch": 4.200145501576267, + "grad_norm": 0.838586688041687, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 25980 + }, + { + "epoch": 4.201762185757012, + "grad_norm": 1.2572145462036133, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 25990 + }, + { + "epoch": 4.203378869937758, + "grad_norm": 1.0476740598678589, + "learning_rate": 0.0002, + "loss": 0.5048, + "step": 26000 + }, + { + "epoch": 4.204995554118503, + "grad_norm": 1.0873368978500366, + "learning_rate": 0.0002, + "loss": 0.5127, + "step": 26010 + }, + { + "epoch": 4.206612238299249, + "grad_norm": 1.2664896249771118, + "learning_rate": 0.0002, + "loss": 0.5679, + "step": 26020 + }, + { + "epoch": 4.208228922479994, + "grad_norm": 1.0312391519546509, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 26030 + }, + { + "epoch": 4.209845606660739, + "grad_norm": 1.0235042572021484, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 26040 + }, + { + "epoch": 4.211462290841484, + "grad_norm": 0.8882219195365906, + "learning_rate": 0.0002, + "loss": 0.5766, + "step": 26050 + }, + { + "epoch": 4.2130789750222295, + "grad_norm": 0.9115961790084839, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 26060 + }, + { + "epoch": 4.214695659202975, + "grad_norm": 1.0218228101730347, + "learning_rate": 0.0002, + "loss": 0.5455, + "step": 26070 + }, + { + "epoch": 4.21631234338372, + "grad_norm": 1.0802232027053833, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 26080 + }, + { + "epoch": 4.217929027564465, + "grad_norm": 1.1488053798675537, + "learning_rate": 0.0002, + "loss": 0.557, + "step": 26090 + }, + { + "epoch": 4.21954571174521, + "grad_norm": 1.0487725734710693, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 26100 + }, + { + "epoch": 4.221162395925956, + "grad_norm": 0.9131165742874146, + "learning_rate": 0.0002, + "loss": 0.5568, + "step": 26110 + }, + { + "epoch": 4.222779080106701, + "grad_norm": 0.9012845158576965, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 26120 + }, + { + "epoch": 4.224395764287446, + "grad_norm": 0.8389840126037598, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 26130 + }, + { + "epoch": 4.226012448468191, + "grad_norm": 0.8924660682678223, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 26140 + }, + { + "epoch": 4.2276291326489375, + "grad_norm": 0.8556463718414307, + "learning_rate": 0.0002, + "loss": 0.5715, + "step": 26150 + }, + { + "epoch": 4.229245816829683, + "grad_norm": 0.9643129110336304, + "learning_rate": 0.0002, + "loss": 0.5695, + "step": 26160 + }, + { + "epoch": 4.230862501010428, + "grad_norm": 0.9865712523460388, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 26170 + }, + { + "epoch": 4.232479185191173, + "grad_norm": 1.152641773223877, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 26180 + }, + { + "epoch": 4.234095869371918, + "grad_norm": 0.9157698154449463, + "learning_rate": 0.0002, + "loss": 0.5632, + "step": 26190 + }, + { + "epoch": 4.235712553552664, + "grad_norm": 0.8418048620223999, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 26200 + }, + { + "epoch": 4.237329237733409, + "grad_norm": 0.9430168867111206, + "learning_rate": 0.0002, + "loss": 0.5624, + "step": 26210 + }, + { + "epoch": 4.238945921914154, + "grad_norm": 1.012582778930664, + "learning_rate": 0.0002, + "loss": 0.5574, + "step": 26220 + }, + { + "epoch": 4.240562606094899, + "grad_norm": 1.112619400024414, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 26230 + }, + { + "epoch": 4.2421792902756446, + "grad_norm": 0.9243621826171875, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 26240 + }, + { + "epoch": 4.24379597445639, + "grad_norm": 0.6977595686912537, + "learning_rate": 0.0002, + "loss": 0.569, + "step": 26250 + }, + { + "epoch": 4.245412658637135, + "grad_norm": 0.9600721597671509, + "learning_rate": 0.0002, + "loss": 0.5379, + "step": 26260 + }, + { + "epoch": 4.24702934281788, + "grad_norm": 0.882641613483429, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 26270 + }, + { + "epoch": 4.2486460269986255, + "grad_norm": 1.010920763015747, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 26280 + }, + { + "epoch": 4.250262711179371, + "grad_norm": 0.9289400577545166, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 26290 + }, + { + "epoch": 4.251879395360117, + "grad_norm": 1.137397289276123, + "learning_rate": 0.0002, + "loss": 0.541, + "step": 26300 + }, + { + "epoch": 4.253496079540862, + "grad_norm": 1.0136182308197021, + "learning_rate": 0.0002, + "loss": 0.5204, + "step": 26310 + }, + { + "epoch": 4.255112763721607, + "grad_norm": 0.9387356042861938, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 26320 + }, + { + "epoch": 4.2567294479023525, + "grad_norm": 1.1833957433700562, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 26330 + }, + { + "epoch": 4.258346132083098, + "grad_norm": 0.9415934681892395, + "learning_rate": 0.0002, + "loss": 0.5905, + "step": 26340 + }, + { + "epoch": 4.259962816263843, + "grad_norm": 0.8550165891647339, + "learning_rate": 0.0002, + "loss": 0.5539, + "step": 26350 + }, + { + "epoch": 4.261579500444588, + "grad_norm": 9.924622535705566, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 26360 + }, + { + "epoch": 4.2631961846253335, + "grad_norm": 1.0104902982711792, + "learning_rate": 0.0002, + "loss": 0.5689, + "step": 26370 + }, + { + "epoch": 4.264812868806079, + "grad_norm": 0.890794038772583, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 26380 + }, + { + "epoch": 4.266429552986824, + "grad_norm": 1.0560191869735718, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 26390 + }, + { + "epoch": 4.268046237167569, + "grad_norm": 1.0135581493377686, + "learning_rate": 0.0002, + "loss": 0.5119, + "step": 26400 + }, + { + "epoch": 4.269662921348314, + "grad_norm": 1.1304140090942383, + "learning_rate": 0.0002, + "loss": 0.5359, + "step": 26410 + }, + { + "epoch": 4.27127960552906, + "grad_norm": 0.9899303913116455, + "learning_rate": 0.0002, + "loss": 0.5615, + "step": 26420 + }, + { + "epoch": 4.272896289709805, + "grad_norm": 1.0505329370498657, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 26430 + }, + { + "epoch": 4.27451297389055, + "grad_norm": 0.9389396905899048, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 26440 + }, + { + "epoch": 4.276129658071296, + "grad_norm": 0.875328779220581, + "learning_rate": 0.0002, + "loss": 0.5558, + "step": 26450 + }, + { + "epoch": 4.277746342252041, + "grad_norm": 1.0689256191253662, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 26460 + }, + { + "epoch": 4.279363026432787, + "grad_norm": 0.9988957643508911, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 26470 + }, + { + "epoch": 4.280979710613532, + "grad_norm": 0.8721813559532166, + "learning_rate": 0.0002, + "loss": 0.5478, + "step": 26480 + }, + { + "epoch": 4.282596394794277, + "grad_norm": 1.100109577178955, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 26490 + }, + { + "epoch": 4.284213078975022, + "grad_norm": 1.1607271432876587, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 26500 + }, + { + "epoch": 4.285829763155768, + "grad_norm": 0.879088819026947, + "learning_rate": 0.0002, + "loss": 0.6287, + "step": 26510 + }, + { + "epoch": 4.287446447336513, + "grad_norm": 0.9891700744628906, + "learning_rate": 0.0002, + "loss": 0.573, + "step": 26520 + }, + { + "epoch": 4.289063131517258, + "grad_norm": 1.0831127166748047, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 26530 + }, + { + "epoch": 4.290679815698003, + "grad_norm": 1.4108285903930664, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 26540 + }, + { + "epoch": 4.2922964998787485, + "grad_norm": 1.0630289316177368, + "learning_rate": 0.0002, + "loss": 0.5888, + "step": 26550 + }, + { + "epoch": 4.293913184059494, + "grad_norm": 1.0854572057724, + "learning_rate": 0.0002, + "loss": 0.5817, + "step": 26560 + }, + { + "epoch": 4.295529868240239, + "grad_norm": 0.9561646580696106, + "learning_rate": 0.0002, + "loss": 0.5586, + "step": 26570 + }, + { + "epoch": 4.297146552420984, + "grad_norm": 0.9064981937408447, + "learning_rate": 0.0002, + "loss": 0.5674, + "step": 26580 + }, + { + "epoch": 4.298763236601729, + "grad_norm": 1.0082972049713135, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 26590 + }, + { + "epoch": 4.3003799207824756, + "grad_norm": 1.1613214015960693, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 26600 + }, + { + "epoch": 4.301996604963221, + "grad_norm": 0.9847695231437683, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 26610 + }, + { + "epoch": 4.303613289143966, + "grad_norm": 1.0980697870254517, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 26620 + }, + { + "epoch": 4.305229973324711, + "grad_norm": 0.8861175179481506, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 26630 + }, + { + "epoch": 4.3068466575054565, + "grad_norm": 0.8917363286018372, + "learning_rate": 0.0002, + "loss": 0.5716, + "step": 26640 + }, + { + "epoch": 4.308463341686202, + "grad_norm": 1.0458378791809082, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 26650 + }, + { + "epoch": 4.310080025866947, + "grad_norm": 1.4859240055084229, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 26660 + }, + { + "epoch": 4.311696710047692, + "grad_norm": 1.1376359462738037, + "learning_rate": 0.0002, + "loss": 0.5296, + "step": 26670 + }, + { + "epoch": 4.313313394228437, + "grad_norm": 0.991349995136261, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 26680 + }, + { + "epoch": 4.314930078409183, + "grad_norm": 0.9995543956756592, + "learning_rate": 0.0002, + "loss": 0.5338, + "step": 26690 + }, + { + "epoch": 4.316546762589928, + "grad_norm": 1.0515851974487305, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 26700 + }, + { + "epoch": 4.318163446770673, + "grad_norm": 1.008023977279663, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 26710 + }, + { + "epoch": 4.319780130951418, + "grad_norm": 1.0184582471847534, + "learning_rate": 0.0002, + "loss": 0.5506, + "step": 26720 + }, + { + "epoch": 4.321396815132164, + "grad_norm": 1.161071538925171, + "learning_rate": 0.0002, + "loss": 0.5828, + "step": 26730 + }, + { + "epoch": 4.323013499312909, + "grad_norm": 0.9580779671669006, + "learning_rate": 0.0002, + "loss": 0.5633, + "step": 26740 + }, + { + "epoch": 4.324630183493655, + "grad_norm": 1.0189911127090454, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 26750 + }, + { + "epoch": 4.3262468676744, + "grad_norm": 0.7484358549118042, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 26760 + }, + { + "epoch": 4.327863551855145, + "grad_norm": 1.0015908479690552, + "learning_rate": 0.0002, + "loss": 0.5728, + "step": 26770 + }, + { + "epoch": 4.329480236035891, + "grad_norm": 0.8972945809364319, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 26780 + }, + { + "epoch": 4.331096920216636, + "grad_norm": 1.01099693775177, + "learning_rate": 0.0002, + "loss": 0.5857, + "step": 26790 + }, + { + "epoch": 4.332713604397381, + "grad_norm": 0.846958339214325, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 26800 + }, + { + "epoch": 4.334330288578126, + "grad_norm": 1.0792603492736816, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 26810 + }, + { + "epoch": 4.3359469727588715, + "grad_norm": 1.0373345613479614, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 26820 + }, + { + "epoch": 4.337563656939617, + "grad_norm": 0.9779167771339417, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 26830 + }, + { + "epoch": 4.339180341120362, + "grad_norm": 1.0235520601272583, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 26840 + }, + { + "epoch": 4.340797025301107, + "grad_norm": 1.04195237159729, + "learning_rate": 0.0002, + "loss": 0.5843, + "step": 26850 + }, + { + "epoch": 4.3424137094818525, + "grad_norm": 0.9479565620422363, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 26860 + }, + { + "epoch": 4.344030393662598, + "grad_norm": 0.9526172280311584, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 26870 + }, + { + "epoch": 4.345647077843343, + "grad_norm": 0.8571456074714661, + "learning_rate": 0.0002, + "loss": 0.521, + "step": 26880 + }, + { + "epoch": 4.347263762024088, + "grad_norm": 0.9475828409194946, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 26890 + }, + { + "epoch": 4.348880446204834, + "grad_norm": 1.0529576539993286, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 26900 + }, + { + "epoch": 4.3504971303855795, + "grad_norm": 0.9648140072822571, + "learning_rate": 0.0002, + "loss": 0.56, + "step": 26910 + }, + { + "epoch": 4.352113814566325, + "grad_norm": 1.0488841533660889, + "learning_rate": 0.0002, + "loss": 0.5162, + "step": 26920 + }, + { + "epoch": 4.35373049874707, + "grad_norm": 0.8771942257881165, + "learning_rate": 0.0002, + "loss": 0.5842, + "step": 26930 + }, + { + "epoch": 4.355347182927815, + "grad_norm": 0.9411202073097229, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 26940 + }, + { + "epoch": 4.35696386710856, + "grad_norm": 1.0997588634490967, + "learning_rate": 0.0002, + "loss": 0.6001, + "step": 26950 + }, + { + "epoch": 4.358580551289306, + "grad_norm": 0.968754768371582, + "learning_rate": 0.0002, + "loss": 0.5528, + "step": 26960 + }, + { + "epoch": 4.360197235470051, + "grad_norm": 0.9990773797035217, + "learning_rate": 0.0002, + "loss": 0.5881, + "step": 26970 + }, + { + "epoch": 4.361813919650796, + "grad_norm": 1.0210620164871216, + "learning_rate": 0.0002, + "loss": 0.5761, + "step": 26980 + }, + { + "epoch": 4.363430603831541, + "grad_norm": 0.855462908744812, + "learning_rate": 0.0002, + "loss": 0.5768, + "step": 26990 + }, + { + "epoch": 4.365047288012287, + "grad_norm": 0.9169660806655884, + "learning_rate": 0.0002, + "loss": 0.5493, + "step": 27000 + }, + { + "epoch": 4.366663972193032, + "grad_norm": 1.089629888534546, + "learning_rate": 0.0002, + "loss": 0.5697, + "step": 27010 + }, + { + "epoch": 4.368280656373777, + "grad_norm": 1.0932867527008057, + "learning_rate": 0.0002, + "loss": 0.5854, + "step": 27020 + }, + { + "epoch": 4.369897340554522, + "grad_norm": 0.9290956854820251, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 27030 + }, + { + "epoch": 4.3715140247352675, + "grad_norm": 1.2800624370574951, + "learning_rate": 0.0002, + "loss": 0.5727, + "step": 27040 + }, + { + "epoch": 4.373130708916014, + "grad_norm": 0.8993493318557739, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 27050 + }, + { + "epoch": 4.374747393096759, + "grad_norm": 1.1566431522369385, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 27060 + }, + { + "epoch": 4.376364077277504, + "grad_norm": 0.9479052424430847, + "learning_rate": 0.0002, + "loss": 0.5902, + "step": 27070 + }, + { + "epoch": 4.377980761458249, + "grad_norm": 1.0063648223876953, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 27080 + }, + { + "epoch": 4.379597445638995, + "grad_norm": 0.8342045545578003, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 27090 + }, + { + "epoch": 4.38121412981974, + "grad_norm": 1.1390739679336548, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 27100 + }, + { + "epoch": 4.382830814000485, + "grad_norm": 0.9547637104988098, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 27110 + }, + { + "epoch": 4.38444749818123, + "grad_norm": 1.0503804683685303, + "learning_rate": 0.0002, + "loss": 0.5728, + "step": 27120 + }, + { + "epoch": 4.3860641823619755, + "grad_norm": 0.9064017534255981, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 27130 + }, + { + "epoch": 4.387680866542721, + "grad_norm": 0.9382519125938416, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 27140 + }, + { + "epoch": 4.389297550723466, + "grad_norm": 1.0410341024398804, + "learning_rate": 0.0002, + "loss": 0.5791, + "step": 27150 + }, + { + "epoch": 4.390914234904211, + "grad_norm": 0.9218655824661255, + "learning_rate": 0.0002, + "loss": 0.6034, + "step": 27160 + }, + { + "epoch": 4.392530919084956, + "grad_norm": 0.8119737505912781, + "learning_rate": 0.0002, + "loss": 0.5204, + "step": 27170 + }, + { + "epoch": 4.394147603265702, + "grad_norm": 0.8584722876548767, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 27180 + }, + { + "epoch": 4.395764287446447, + "grad_norm": 0.9668293595314026, + "learning_rate": 0.0002, + "loss": 0.5772, + "step": 27190 + }, + { + "epoch": 4.397380971627193, + "grad_norm": 1.022334098815918, + "learning_rate": 0.0002, + "loss": 0.6009, + "step": 27200 + }, + { + "epoch": 4.398997655807938, + "grad_norm": 0.9553216099739075, + "learning_rate": 0.0002, + "loss": 0.5573, + "step": 27210 + }, + { + "epoch": 4.4006143399886835, + "grad_norm": 0.9282339215278625, + "learning_rate": 0.0002, + "loss": 0.5604, + "step": 27220 + }, + { + "epoch": 4.402231024169429, + "grad_norm": 1.0232292413711548, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 27230 + }, + { + "epoch": 4.403847708350174, + "grad_norm": 0.9915700554847717, + "learning_rate": 0.0002, + "loss": 0.6078, + "step": 27240 + }, + { + "epoch": 4.405464392530919, + "grad_norm": 1.0014961957931519, + "learning_rate": 0.0002, + "loss": 0.5778, + "step": 27250 + }, + { + "epoch": 4.407081076711664, + "grad_norm": 1.1172103881835938, + "learning_rate": 0.0002, + "loss": 0.5824, + "step": 27260 + }, + { + "epoch": 4.40869776089241, + "grad_norm": 0.8583093285560608, + "learning_rate": 0.0002, + "loss": 0.5286, + "step": 27270 + }, + { + "epoch": 4.410314445073155, + "grad_norm": 0.7609201669692993, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 27280 + }, + { + "epoch": 4.4119311292539, + "grad_norm": 1.0619351863861084, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 27290 + }, + { + "epoch": 4.413547813434645, + "grad_norm": 1.0177674293518066, + "learning_rate": 0.0002, + "loss": 0.5579, + "step": 27300 + }, + { + "epoch": 4.4151644976153905, + "grad_norm": 0.9921218156814575, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 27310 + }, + { + "epoch": 4.416781181796136, + "grad_norm": 1.126244306564331, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 27320 + }, + { + "epoch": 4.418397865976881, + "grad_norm": 1.0678540468215942, + "learning_rate": 0.0002, + "loss": 0.5743, + "step": 27330 + }, + { + "epoch": 4.420014550157627, + "grad_norm": 0.8705704212188721, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 27340 + }, + { + "epoch": 4.421631234338372, + "grad_norm": 1.272074818611145, + "learning_rate": 0.0002, + "loss": 0.5763, + "step": 27350 + }, + { + "epoch": 4.423247918519118, + "grad_norm": 0.8740444183349609, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 27360 + }, + { + "epoch": 4.424864602699863, + "grad_norm": 1.0584250688552856, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 27370 + }, + { + "epoch": 4.426481286880608, + "grad_norm": 1.059870719909668, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 27380 + }, + { + "epoch": 4.428097971061353, + "grad_norm": 1.072265863418579, + "learning_rate": 0.0002, + "loss": 0.5551, + "step": 27390 + }, + { + "epoch": 4.4297146552420985, + "grad_norm": 0.871481716632843, + "learning_rate": 0.0002, + "loss": 0.5584, + "step": 27400 + }, + { + "epoch": 4.431331339422844, + "grad_norm": 0.9555448293685913, + "learning_rate": 0.0002, + "loss": 0.5372, + "step": 27410 + }, + { + "epoch": 4.432948023603589, + "grad_norm": 1.0402292013168335, + "learning_rate": 0.0002, + "loss": 0.5593, + "step": 27420 + }, + { + "epoch": 4.434564707784334, + "grad_norm": 1.12587571144104, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 27430 + }, + { + "epoch": 4.436181391965079, + "grad_norm": 1.0783193111419678, + "learning_rate": 0.0002, + "loss": 0.5403, + "step": 27440 + }, + { + "epoch": 4.437798076145825, + "grad_norm": 1.024133563041687, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 27450 + }, + { + "epoch": 4.43941476032657, + "grad_norm": 0.9156768918037415, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 27460 + }, + { + "epoch": 4.441031444507315, + "grad_norm": 1.0215224027633667, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 27470 + }, + { + "epoch": 4.442648128688061, + "grad_norm": 1.082116961479187, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 27480 + }, + { + "epoch": 4.4442648128688065, + "grad_norm": 1.0412873029708862, + "learning_rate": 0.0002, + "loss": 0.6203, + "step": 27490 + }, + { + "epoch": 4.445881497049552, + "grad_norm": 1.0509289503097534, + "learning_rate": 0.0002, + "loss": 0.5939, + "step": 27500 + }, + { + "epoch": 4.447498181230297, + "grad_norm": 0.9291498064994812, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 27510 + }, + { + "epoch": 4.449114865411042, + "grad_norm": 0.970184326171875, + "learning_rate": 0.0002, + "loss": 0.5408, + "step": 27520 + }, + { + "epoch": 4.450731549591787, + "grad_norm": 0.8418883681297302, + "learning_rate": 0.0002, + "loss": 0.5705, + "step": 27530 + }, + { + "epoch": 4.452348233772533, + "grad_norm": 0.8823825120925903, + "learning_rate": 0.0002, + "loss": 0.5124, + "step": 27540 + }, + { + "epoch": 4.453964917953278, + "grad_norm": 1.1909019947052002, + "learning_rate": 0.0002, + "loss": 0.5867, + "step": 27550 + }, + { + "epoch": 4.455581602134023, + "grad_norm": 1.0317302942276, + "learning_rate": 0.0002, + "loss": 0.5685, + "step": 27560 + }, + { + "epoch": 4.457198286314768, + "grad_norm": 0.9977751970291138, + "learning_rate": 0.0002, + "loss": 0.5538, + "step": 27570 + }, + { + "epoch": 4.458814970495514, + "grad_norm": 0.8909519910812378, + "learning_rate": 0.0002, + "loss": 0.5628, + "step": 27580 + }, + { + "epoch": 4.460431654676259, + "grad_norm": 0.8653029799461365, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 27590 + }, + { + "epoch": 4.462048338857004, + "grad_norm": 1.0783653259277344, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 27600 + }, + { + "epoch": 4.463665023037749, + "grad_norm": 1.1235394477844238, + "learning_rate": 0.0002, + "loss": 0.579, + "step": 27610 + }, + { + "epoch": 4.4652817072184945, + "grad_norm": 0.9386643767356873, + "learning_rate": 0.0002, + "loss": 0.5545, + "step": 27620 + }, + { + "epoch": 4.466898391399241, + "grad_norm": 1.0605148077011108, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 27630 + }, + { + "epoch": 4.468515075579986, + "grad_norm": 1.1283893585205078, + "learning_rate": 0.0002, + "loss": 0.5886, + "step": 27640 + }, + { + "epoch": 4.470131759760731, + "grad_norm": 1.0583468675613403, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 27650 + }, + { + "epoch": 4.471748443941476, + "grad_norm": 0.9563992023468018, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 27660 + }, + { + "epoch": 4.4733651281222215, + "grad_norm": 1.100598931312561, + "learning_rate": 0.0002, + "loss": 0.5687, + "step": 27670 + }, + { + "epoch": 4.474981812302967, + "grad_norm": 0.9386957287788391, + "learning_rate": 0.0002, + "loss": 0.589, + "step": 27680 + }, + { + "epoch": 4.476598496483712, + "grad_norm": 1.2946288585662842, + "learning_rate": 0.0002, + "loss": 0.6241, + "step": 27690 + }, + { + "epoch": 4.478215180664457, + "grad_norm": 1.0325199365615845, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 27700 + }, + { + "epoch": 4.4798318648452025, + "grad_norm": 1.0318928956985474, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 27710 + }, + { + "epoch": 4.481448549025948, + "grad_norm": 0.8721024394035339, + "learning_rate": 0.0002, + "loss": 0.5656, + "step": 27720 + }, + { + "epoch": 4.483065233206693, + "grad_norm": 1.17376708984375, + "learning_rate": 0.0002, + "loss": 0.5421, + "step": 27730 + }, + { + "epoch": 4.484681917387438, + "grad_norm": 1.0926326513290405, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 27740 + }, + { + "epoch": 4.486298601568183, + "grad_norm": 0.9043852686882019, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 27750 + }, + { + "epoch": 4.487915285748929, + "grad_norm": 1.064600944519043, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 27760 + }, + { + "epoch": 4.489531969929674, + "grad_norm": 0.7833460569381714, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 27770 + }, + { + "epoch": 4.49114865411042, + "grad_norm": 1.1073496341705322, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 27780 + }, + { + "epoch": 4.492765338291165, + "grad_norm": 1.0799397230148315, + "learning_rate": 0.0002, + "loss": 0.5577, + "step": 27790 + }, + { + "epoch": 4.49438202247191, + "grad_norm": 1.1062238216400146, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 27800 + }, + { + "epoch": 4.495998706652656, + "grad_norm": 1.0568242073059082, + "learning_rate": 0.0002, + "loss": 0.6126, + "step": 27810 + }, + { + "epoch": 4.497615390833401, + "grad_norm": 0.8861091732978821, + "learning_rate": 0.0002, + "loss": 0.5913, + "step": 27820 + }, + { + "epoch": 4.499232075014146, + "grad_norm": 1.2297543287277222, + "learning_rate": 0.0002, + "loss": 0.5858, + "step": 27830 + }, + { + "epoch": 4.500848759194891, + "grad_norm": 0.9600302577018738, + "learning_rate": 0.0002, + "loss": 0.5859, + "step": 27840 + }, + { + "epoch": 4.502465443375637, + "grad_norm": 1.057051181793213, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 27850 + }, + { + "epoch": 4.504082127556382, + "grad_norm": 0.9839690923690796, + "learning_rate": 0.0002, + "loss": 0.5788, + "step": 27860 + }, + { + "epoch": 4.505698811737127, + "grad_norm": 1.1479853391647339, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 27870 + }, + { + "epoch": 4.507315495917872, + "grad_norm": 1.0550768375396729, + "learning_rate": 0.0002, + "loss": 0.6039, + "step": 27880 + }, + { + "epoch": 4.5089321800986175, + "grad_norm": 0.898209273815155, + "learning_rate": 0.0002, + "loss": 0.563, + "step": 27890 + }, + { + "epoch": 4.510548864279363, + "grad_norm": 0.9460315108299255, + "learning_rate": 0.0002, + "loss": 0.5734, + "step": 27900 + }, + { + "epoch": 4.512165548460108, + "grad_norm": 0.9499884247779846, + "learning_rate": 0.0002, + "loss": 0.5702, + "step": 27910 + }, + { + "epoch": 4.513782232640853, + "grad_norm": 0.7801318764686584, + "learning_rate": 0.0002, + "loss": 0.5385, + "step": 27920 + }, + { + "epoch": 4.515398916821599, + "grad_norm": 0.9286966323852539, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 27930 + }, + { + "epoch": 4.517015601002345, + "grad_norm": 0.9539980292320251, + "learning_rate": 0.0002, + "loss": 0.5717, + "step": 27940 + }, + { + "epoch": 4.51863228518309, + "grad_norm": 1.1053401231765747, + "learning_rate": 0.0002, + "loss": 0.6073, + "step": 27950 + }, + { + "epoch": 4.520248969363835, + "grad_norm": 0.7535534501075745, + "learning_rate": 0.0002, + "loss": 0.6087, + "step": 27960 + }, + { + "epoch": 4.52186565354458, + "grad_norm": 1.076926589012146, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 27970 + }, + { + "epoch": 4.5234823377253255, + "grad_norm": 1.181935429573059, + "learning_rate": 0.0002, + "loss": 0.6028, + "step": 27980 + }, + { + "epoch": 4.525099021906071, + "grad_norm": 0.9293407201766968, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 27990 + }, + { + "epoch": 4.526715706086816, + "grad_norm": 0.8953009247779846, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 28000 + }, + { + "epoch": 4.528332390267561, + "grad_norm": 1.0850225687026978, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 28010 + }, + { + "epoch": 4.529949074448306, + "grad_norm": 0.9125663042068481, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 28020 + }, + { + "epoch": 4.531565758629052, + "grad_norm": 0.8745216727256775, + "learning_rate": 0.0002, + "loss": 0.5922, + "step": 28030 + }, + { + "epoch": 4.533182442809797, + "grad_norm": 1.0783463716506958, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 28040 + }, + { + "epoch": 4.534799126990542, + "grad_norm": 0.7513844966888428, + "learning_rate": 0.0002, + "loss": 0.5754, + "step": 28050 + }, + { + "epoch": 4.536415811171287, + "grad_norm": 1.0135776996612549, + "learning_rate": 0.0002, + "loss": 0.5608, + "step": 28060 + }, + { + "epoch": 4.538032495352033, + "grad_norm": 0.8886825442314148, + "learning_rate": 0.0002, + "loss": 0.5827, + "step": 28070 + }, + { + "epoch": 4.539649179532779, + "grad_norm": 0.8153995275497437, + "learning_rate": 0.0002, + "loss": 0.5605, + "step": 28080 + }, + { + "epoch": 4.541265863713524, + "grad_norm": 0.9853341579437256, + "learning_rate": 0.0002, + "loss": 0.6377, + "step": 28090 + }, + { + "epoch": 4.542882547894269, + "grad_norm": 0.9365800023078918, + "learning_rate": 0.0002, + "loss": 0.5957, + "step": 28100 + }, + { + "epoch": 4.544499232075014, + "grad_norm": 0.9765017628669739, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 28110 + }, + { + "epoch": 4.54611591625576, + "grad_norm": 0.9811279773712158, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 28120 + }, + { + "epoch": 4.547732600436505, + "grad_norm": 1.0387924909591675, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 28130 + }, + { + "epoch": 4.54934928461725, + "grad_norm": 1.0684878826141357, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 28140 + }, + { + "epoch": 4.550965968797995, + "grad_norm": 1.0000102519989014, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 28150 + }, + { + "epoch": 4.5525826529787405, + "grad_norm": 1.0717930793762207, + "learning_rate": 0.0002, + "loss": 0.5327, + "step": 28160 + }, + { + "epoch": 4.554199337159486, + "grad_norm": 0.990074634552002, + "learning_rate": 0.0002, + "loss": 0.5594, + "step": 28170 + }, + { + "epoch": 4.555816021340231, + "grad_norm": 0.8673754930496216, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 28180 + }, + { + "epoch": 4.557432705520976, + "grad_norm": 0.864247739315033, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 28190 + }, + { + "epoch": 4.5590493897017215, + "grad_norm": 0.8280200958251953, + "learning_rate": 0.0002, + "loss": 0.5516, + "step": 28200 + }, + { + "epoch": 4.560666073882467, + "grad_norm": 1.1312172412872314, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 28210 + }, + { + "epoch": 4.562282758063212, + "grad_norm": 0.9147403240203857, + "learning_rate": 0.0002, + "loss": 0.5776, + "step": 28220 + }, + { + "epoch": 4.563899442243958, + "grad_norm": 1.0321218967437744, + "learning_rate": 0.0002, + "loss": 0.5591, + "step": 28230 + }, + { + "epoch": 4.565516126424703, + "grad_norm": 1.168332815170288, + "learning_rate": 0.0002, + "loss": 0.5508, + "step": 28240 + }, + { + "epoch": 4.5671328106054485, + "grad_norm": 1.0067222118377686, + "learning_rate": 0.0002, + "loss": 0.5649, + "step": 28250 + }, + { + "epoch": 4.568749494786194, + "grad_norm": 1.0283393859863281, + "learning_rate": 0.0002, + "loss": 0.5853, + "step": 28260 + }, + { + "epoch": 4.570366178966939, + "grad_norm": 0.9912363886833191, + "learning_rate": 0.0002, + "loss": 0.5772, + "step": 28270 + }, + { + "epoch": 4.571982863147684, + "grad_norm": 1.108032464981079, + "learning_rate": 0.0002, + "loss": 0.5757, + "step": 28280 + }, + { + "epoch": 4.573599547328429, + "grad_norm": 0.8260078430175781, + "learning_rate": 0.0002, + "loss": 0.5529, + "step": 28290 + }, + { + "epoch": 4.575216231509175, + "grad_norm": 0.8946247100830078, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 28300 + }, + { + "epoch": 4.57683291568992, + "grad_norm": 0.8273587822914124, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 28310 + }, + { + "epoch": 4.578449599870665, + "grad_norm": 0.9040093421936035, + "learning_rate": 0.0002, + "loss": 0.6058, + "step": 28320 + }, + { + "epoch": 4.58006628405141, + "grad_norm": 0.8435290455818176, + "learning_rate": 0.0002, + "loss": 0.5521, + "step": 28330 + }, + { + "epoch": 4.581682968232156, + "grad_norm": 1.164088249206543, + "learning_rate": 0.0002, + "loss": 0.6086, + "step": 28340 + }, + { + "epoch": 4.583299652412901, + "grad_norm": 0.9861085414886475, + "learning_rate": 0.0002, + "loss": 0.5603, + "step": 28350 + }, + { + "epoch": 4.584916336593646, + "grad_norm": 0.8892980813980103, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 28360 + }, + { + "epoch": 4.586533020774391, + "grad_norm": 1.240574836730957, + "learning_rate": 0.0002, + "loss": 0.598, + "step": 28370 + }, + { + "epoch": 4.588149704955137, + "grad_norm": 0.8669408559799194, + "learning_rate": 0.0002, + "loss": 0.5797, + "step": 28380 + }, + { + "epoch": 4.589766389135883, + "grad_norm": 0.9145985841751099, + "learning_rate": 0.0002, + "loss": 0.5603, + "step": 28390 + }, + { + "epoch": 4.591383073316628, + "grad_norm": 0.8584614992141724, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 28400 + }, + { + "epoch": 4.592999757497373, + "grad_norm": 1.118829369544983, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 28410 + }, + { + "epoch": 4.594616441678118, + "grad_norm": 1.1411553621292114, + "learning_rate": 0.0002, + "loss": 0.5641, + "step": 28420 + }, + { + "epoch": 4.596233125858864, + "grad_norm": 0.9433278441429138, + "learning_rate": 0.0002, + "loss": 0.549, + "step": 28430 + }, + { + "epoch": 4.597849810039609, + "grad_norm": 0.816830039024353, + "learning_rate": 0.0002, + "loss": 0.5496, + "step": 28440 + }, + { + "epoch": 4.599466494220354, + "grad_norm": 1.2124968767166138, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 28450 + }, + { + "epoch": 4.601083178401099, + "grad_norm": 0.9658762216567993, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 28460 + }, + { + "epoch": 4.6026998625818445, + "grad_norm": 0.836100161075592, + "learning_rate": 0.0002, + "loss": 0.5902, + "step": 28470 + }, + { + "epoch": 4.60431654676259, + "grad_norm": 0.9989104270935059, + "learning_rate": 0.0002, + "loss": 0.5749, + "step": 28480 + }, + { + "epoch": 4.605933230943335, + "grad_norm": 1.1298956871032715, + "learning_rate": 0.0002, + "loss": 0.5616, + "step": 28490 + }, + { + "epoch": 4.60754991512408, + "grad_norm": 1.1731704473495483, + "learning_rate": 0.0002, + "loss": 0.5846, + "step": 28500 + }, + { + "epoch": 4.609166599304825, + "grad_norm": 0.9624714255332947, + "learning_rate": 0.0002, + "loss": 0.5816, + "step": 28510 + }, + { + "epoch": 4.610783283485571, + "grad_norm": 1.364073634147644, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 28520 + }, + { + "epoch": 4.612399967666317, + "grad_norm": 1.1827356815338135, + "learning_rate": 0.0002, + "loss": 0.6237, + "step": 28530 + }, + { + "epoch": 4.614016651847062, + "grad_norm": 0.6651531457901001, + "learning_rate": 0.0002, + "loss": 0.5643, + "step": 28540 + }, + { + "epoch": 4.615633336027807, + "grad_norm": 1.1640995740890503, + "learning_rate": 0.0002, + "loss": 0.6051, + "step": 28550 + }, + { + "epoch": 4.6172500202085525, + "grad_norm": 1.028918743133545, + "learning_rate": 0.0002, + "loss": 0.5995, + "step": 28560 + }, + { + "epoch": 4.618866704389298, + "grad_norm": 0.8252120614051819, + "learning_rate": 0.0002, + "loss": 0.5607, + "step": 28570 + }, + { + "epoch": 4.620483388570043, + "grad_norm": 1.3536735773086548, + "learning_rate": 0.0002, + "loss": 0.5769, + "step": 28580 + }, + { + "epoch": 4.622100072750788, + "grad_norm": 1.2146915197372437, + "learning_rate": 0.0002, + "loss": 0.6006, + "step": 28590 + }, + { + "epoch": 4.623716756931533, + "grad_norm": 1.0122549533843994, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 28600 + }, + { + "epoch": 4.625333441112279, + "grad_norm": 0.9977872967720032, + "learning_rate": 0.0002, + "loss": 0.6072, + "step": 28610 + }, + { + "epoch": 4.626950125293024, + "grad_norm": 1.0159751176834106, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 28620 + }, + { + "epoch": 4.628566809473769, + "grad_norm": 1.0028325319290161, + "learning_rate": 0.0002, + "loss": 0.5935, + "step": 28630 + }, + { + "epoch": 4.630183493654514, + "grad_norm": 0.901638388633728, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 28640 + }, + { + "epoch": 4.6318001778352595, + "grad_norm": 0.9450507164001465, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 28650 + }, + { + "epoch": 4.633416862016006, + "grad_norm": 0.9987545013427734, + "learning_rate": 0.0002, + "loss": 0.5972, + "step": 28660 + }, + { + "epoch": 4.63503354619675, + "grad_norm": 0.9574332237243652, + "learning_rate": 0.0002, + "loss": 0.5863, + "step": 28670 + }, + { + "epoch": 4.636650230377496, + "grad_norm": 1.2215653657913208, + "learning_rate": 0.0002, + "loss": 0.5804, + "step": 28680 + }, + { + "epoch": 4.638266914558241, + "grad_norm": 0.9798858761787415, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 28690 + }, + { + "epoch": 4.639883598738987, + "grad_norm": 1.0648466348648071, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 28700 + }, + { + "epoch": 4.641500282919732, + "grad_norm": 1.0606504678726196, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 28710 + }, + { + "epoch": 4.643116967100477, + "grad_norm": 1.0892442464828491, + "learning_rate": 0.0002, + "loss": 0.5801, + "step": 28720 + }, + { + "epoch": 4.644733651281222, + "grad_norm": 0.914391040802002, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 28730 + }, + { + "epoch": 4.6463503354619675, + "grad_norm": 0.9782370328903198, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 28740 + }, + { + "epoch": 4.647967019642713, + "grad_norm": 1.0344339609146118, + "learning_rate": 0.0002, + "loss": 0.6035, + "step": 28750 + }, + { + "epoch": 4.649583703823458, + "grad_norm": 1.0513931512832642, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 28760 + }, + { + "epoch": 4.651200388004203, + "grad_norm": 0.9711475968360901, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 28770 + }, + { + "epoch": 4.652817072184948, + "grad_norm": 0.977519690990448, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 28780 + }, + { + "epoch": 4.654433756365694, + "grad_norm": 0.9150224924087524, + "learning_rate": 0.0002, + "loss": 0.5826, + "step": 28790 + }, + { + "epoch": 4.656050440546439, + "grad_norm": 1.0973542928695679, + "learning_rate": 0.0002, + "loss": 0.5382, + "step": 28800 + }, + { + "epoch": 4.657667124727185, + "grad_norm": 0.944877564907074, + "learning_rate": 0.0002, + "loss": 0.6147, + "step": 28810 + }, + { + "epoch": 4.659283808907929, + "grad_norm": 0.9508748650550842, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 28820 + }, + { + "epoch": 4.6609004930886755, + "grad_norm": 0.9681721329689026, + "learning_rate": 0.0002, + "loss": 0.5537, + "step": 28830 + }, + { + "epoch": 4.662517177269421, + "grad_norm": 1.0214351415634155, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 28840 + }, + { + "epoch": 4.664133861450166, + "grad_norm": 0.9748611450195312, + "learning_rate": 0.0002, + "loss": 0.6031, + "step": 28850 + }, + { + "epoch": 4.665750545630911, + "grad_norm": 0.8484147191047668, + "learning_rate": 0.0002, + "loss": 0.572, + "step": 28860 + }, + { + "epoch": 4.667367229811656, + "grad_norm": 1.1252986192703247, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 28870 + }, + { + "epoch": 4.668983913992402, + "grad_norm": 0.8706206679344177, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 28880 + }, + { + "epoch": 4.670600598173147, + "grad_norm": 1.1432424783706665, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 28890 + }, + { + "epoch": 4.672217282353892, + "grad_norm": 1.017029047012329, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 28900 + }, + { + "epoch": 4.673833966534637, + "grad_norm": 1.085597038269043, + "learning_rate": 0.0002, + "loss": 0.5831, + "step": 28910 + }, + { + "epoch": 4.675450650715383, + "grad_norm": 0.9275796413421631, + "learning_rate": 0.0002, + "loss": 0.5678, + "step": 28920 + }, + { + "epoch": 4.677067334896128, + "grad_norm": 0.9518964886665344, + "learning_rate": 0.0002, + "loss": 0.5603, + "step": 28930 + }, + { + "epoch": 4.678684019076873, + "grad_norm": 1.0352122783660889, + "learning_rate": 0.0002, + "loss": 0.6232, + "step": 28940 + }, + { + "epoch": 4.680300703257618, + "grad_norm": 1.090124249458313, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 28950 + }, + { + "epoch": 4.681917387438364, + "grad_norm": 0.8799563050270081, + "learning_rate": 0.0002, + "loss": 0.5728, + "step": 28960 + }, + { + "epoch": 4.683534071619109, + "grad_norm": 1.0929821729660034, + "learning_rate": 0.0002, + "loss": 0.5787, + "step": 28970 + }, + { + "epoch": 4.685150755799855, + "grad_norm": 0.903727650642395, + "learning_rate": 0.0002, + "loss": 0.6134, + "step": 28980 + }, + { + "epoch": 4.6867674399806, + "grad_norm": 0.9752424955368042, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 28990 + }, + { + "epoch": 4.688384124161345, + "grad_norm": 0.9351571202278137, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 29000 + }, + { + "epoch": 4.6900008083420905, + "grad_norm": 0.923877477645874, + "learning_rate": 0.0002, + "loss": 0.5811, + "step": 29010 + }, + { + "epoch": 4.691617492522836, + "grad_norm": 1.045389175415039, + "learning_rate": 0.0002, + "loss": 0.5682, + "step": 29020 + }, + { + "epoch": 4.693234176703581, + "grad_norm": 1.0200831890106201, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 29030 + }, + { + "epoch": 4.694850860884326, + "grad_norm": 1.1499706506729126, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 29040 + }, + { + "epoch": 4.6964675450650715, + "grad_norm": 0.860118567943573, + "learning_rate": 0.0002, + "loss": 0.5745, + "step": 29050 + }, + { + "epoch": 4.698084229245817, + "grad_norm": 0.9774864315986633, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 29060 + }, + { + "epoch": 4.699700913426562, + "grad_norm": 1.0323210954666138, + "learning_rate": 0.0002, + "loss": 0.5765, + "step": 29070 + }, + { + "epoch": 4.701317597607307, + "grad_norm": 0.8492481112480164, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 29080 + }, + { + "epoch": 4.702934281788052, + "grad_norm": 1.131951093673706, + "learning_rate": 0.0002, + "loss": 0.5985, + "step": 29090 + }, + { + "epoch": 4.704550965968798, + "grad_norm": 0.8763113021850586, + "learning_rate": 0.0002, + "loss": 0.6412, + "step": 29100 + }, + { + "epoch": 4.706167650149544, + "grad_norm": 1.045028805732727, + "learning_rate": 0.0002, + "loss": 0.575, + "step": 29110 + }, + { + "epoch": 4.707784334330288, + "grad_norm": 0.9961401224136353, + "learning_rate": 0.0002, + "loss": 0.5548, + "step": 29120 + }, + { + "epoch": 4.709401018511034, + "grad_norm": 0.9282503724098206, + "learning_rate": 0.0002, + "loss": 0.559, + "step": 29130 + }, + { + "epoch": 4.711017702691779, + "grad_norm": 1.1418932676315308, + "learning_rate": 0.0002, + "loss": 0.5744, + "step": 29140 + }, + { + "epoch": 4.712634386872525, + "grad_norm": 0.9950099587440491, + "learning_rate": 0.0002, + "loss": 0.5394, + "step": 29150 + }, + { + "epoch": 4.71425107105327, + "grad_norm": 0.8304893374443054, + "learning_rate": 0.0002, + "loss": 0.6177, + "step": 29160 + }, + { + "epoch": 4.715867755234015, + "grad_norm": 1.115626335144043, + "learning_rate": 0.0002, + "loss": 0.6074, + "step": 29170 + }, + { + "epoch": 4.71748443941476, + "grad_norm": 1.079818606376648, + "learning_rate": 0.0002, + "loss": 0.6265, + "step": 29180 + }, + { + "epoch": 4.719101123595506, + "grad_norm": 1.1929082870483398, + "learning_rate": 0.0002, + "loss": 0.561, + "step": 29190 + }, + { + "epoch": 4.720717807776251, + "grad_norm": 0.9621080756187439, + "learning_rate": 0.0002, + "loss": 0.5708, + "step": 29200 + }, + { + "epoch": 4.722334491956996, + "grad_norm": 0.8549222350120544, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 29210 + }, + { + "epoch": 4.723951176137741, + "grad_norm": 0.9341941475868225, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 29220 + }, + { + "epoch": 4.7255678603184865, + "grad_norm": 1.075406789779663, + "learning_rate": 0.0002, + "loss": 0.5436, + "step": 29230 + }, + { + "epoch": 4.727184544499232, + "grad_norm": 1.0859880447387695, + "learning_rate": 0.0002, + "loss": 0.576, + "step": 29240 + }, + { + "epoch": 4.728801228679977, + "grad_norm": 0.8475605249404907, + "learning_rate": 0.0002, + "loss": 0.5525, + "step": 29250 + }, + { + "epoch": 4.730417912860723, + "grad_norm": 0.9331845641136169, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 29260 + }, + { + "epoch": 4.7320345970414674, + "grad_norm": 0.9279314279556274, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 29270 + }, + { + "epoch": 4.733651281222214, + "grad_norm": 0.7803558707237244, + "learning_rate": 0.0002, + "loss": 0.597, + "step": 29280 + }, + { + "epoch": 4.735267965402959, + "grad_norm": 1.0159329175949097, + "learning_rate": 0.0002, + "loss": 0.5968, + "step": 29290 + }, + { + "epoch": 4.736884649583704, + "grad_norm": 0.9448670744895935, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 29300 + }, + { + "epoch": 4.738501333764449, + "grad_norm": 1.0732197761535645, + "learning_rate": 0.0002, + "loss": 0.574, + "step": 29310 + }, + { + "epoch": 4.7401180179451945, + "grad_norm": 0.901830792427063, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 29320 + }, + { + "epoch": 4.74173470212594, + "grad_norm": 0.9141789674758911, + "learning_rate": 0.0002, + "loss": 0.6105, + "step": 29330 + }, + { + "epoch": 4.743351386306685, + "grad_norm": 0.9733418226242065, + "learning_rate": 0.0002, + "loss": 0.5481, + "step": 29340 + }, + { + "epoch": 4.74496807048743, + "grad_norm": 0.909810483455658, + "learning_rate": 0.0002, + "loss": 0.612, + "step": 29350 + }, + { + "epoch": 4.746584754668175, + "grad_norm": 0.909541666507721, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 29360 + }, + { + "epoch": 4.748201438848921, + "grad_norm": 0.9383015632629395, + "learning_rate": 0.0002, + "loss": 0.5579, + "step": 29370 + }, + { + "epoch": 4.749818123029666, + "grad_norm": 0.9275668263435364, + "learning_rate": 0.0002, + "loss": 0.5529, + "step": 29380 + }, + { + "epoch": 4.751434807210411, + "grad_norm": 1.1146225929260254, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 29390 + }, + { + "epoch": 4.753051491391156, + "grad_norm": 1.0062453746795654, + "learning_rate": 0.0002, + "loss": 0.6018, + "step": 29400 + }, + { + "epoch": 4.7546681755719025, + "grad_norm": 0.9451895952224731, + "learning_rate": 0.0002, + "loss": 0.5872, + "step": 29410 + }, + { + "epoch": 4.756284859752648, + "grad_norm": 0.870457649230957, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 29420 + }, + { + "epoch": 4.757901543933393, + "grad_norm": 1.0411282777786255, + "learning_rate": 0.0002, + "loss": 0.57, + "step": 29430 + }, + { + "epoch": 4.759518228114138, + "grad_norm": 1.1648986339569092, + "learning_rate": 0.0002, + "loss": 0.5688, + "step": 29440 + }, + { + "epoch": 4.761134912294883, + "grad_norm": 0.8999572992324829, + "learning_rate": 0.0002, + "loss": 0.5432, + "step": 29450 + }, + { + "epoch": 4.762751596475629, + "grad_norm": 0.9863559007644653, + "learning_rate": 0.0002, + "loss": 0.5667, + "step": 29460 + }, + { + "epoch": 4.764368280656374, + "grad_norm": 0.9676542282104492, + "learning_rate": 0.0002, + "loss": 0.5779, + "step": 29470 + }, + { + "epoch": 4.765984964837119, + "grad_norm": 1.004775047302246, + "learning_rate": 0.0002, + "loss": 0.6075, + "step": 29480 + }, + { + "epoch": 4.767601649017864, + "grad_norm": 1.0937515497207642, + "learning_rate": 0.0002, + "loss": 0.6044, + "step": 29490 + }, + { + "epoch": 4.7692183331986095, + "grad_norm": 0.9551598429679871, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 29500 + }, + { + "epoch": 4.770835017379355, + "grad_norm": 1.0757228136062622, + "learning_rate": 0.0002, + "loss": 0.5609, + "step": 29510 + }, + { + "epoch": 4.7724517015601, + "grad_norm": 1.0588841438293457, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 29520 + }, + { + "epoch": 4.774068385740845, + "grad_norm": 1.0744032859802246, + "learning_rate": 0.0002, + "loss": 0.5814, + "step": 29530 + }, + { + "epoch": 4.7756850699215905, + "grad_norm": 1.0066277980804443, + "learning_rate": 0.0002, + "loss": 0.5681, + "step": 29540 + }, + { + "epoch": 4.777301754102336, + "grad_norm": 1.082319736480713, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 29550 + }, + { + "epoch": 4.778918438283082, + "grad_norm": 0.8252472877502441, + "learning_rate": 0.0002, + "loss": 0.5709, + "step": 29560 + }, + { + "epoch": 4.780535122463827, + "grad_norm": 0.9855340123176575, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 29570 + }, + { + "epoch": 4.782151806644572, + "grad_norm": 0.9991421699523926, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 29580 + }, + { + "epoch": 4.7837684908253175, + "grad_norm": 1.316841959953308, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 29590 + }, + { + "epoch": 4.785385175006063, + "grad_norm": 1.1513035297393799, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 29600 + }, + { + "epoch": 4.787001859186808, + "grad_norm": 0.9767683744430542, + "learning_rate": 0.0002, + "loss": 0.5785, + "step": 29610 + }, + { + "epoch": 4.788618543367553, + "grad_norm": 0.9786278605461121, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 29620 + }, + { + "epoch": 4.7902352275482984, + "grad_norm": 0.8004973530769348, + "learning_rate": 0.0002, + "loss": 0.6108, + "step": 29630 + }, + { + "epoch": 4.791851911729044, + "grad_norm": 1.0997767448425293, + "learning_rate": 0.0002, + "loss": 0.5932, + "step": 29640 + }, + { + "epoch": 4.793468595909789, + "grad_norm": 0.9752856492996216, + "learning_rate": 0.0002, + "loss": 0.5655, + "step": 29650 + }, + { + "epoch": 4.795085280090534, + "grad_norm": 1.0518392324447632, + "learning_rate": 0.0002, + "loss": 0.5916, + "step": 29660 + }, + { + "epoch": 4.796701964271279, + "grad_norm": 1.1050055027008057, + "learning_rate": 0.0002, + "loss": 0.6042, + "step": 29670 + }, + { + "epoch": 4.798318648452025, + "grad_norm": 0.9933857917785645, + "learning_rate": 0.0002, + "loss": 0.6089, + "step": 29680 + }, + { + "epoch": 4.79993533263277, + "grad_norm": 1.2804018259048462, + "learning_rate": 0.0002, + "loss": 0.6041, + "step": 29690 + }, + { + "epoch": 4.801552016813515, + "grad_norm": 1.0133371353149414, + "learning_rate": 0.0002, + "loss": 0.636, + "step": 29700 + }, + { + "epoch": 4.803168700994261, + "grad_norm": 1.080350637435913, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 29710 + }, + { + "epoch": 4.804785385175006, + "grad_norm": 0.9986529350280762, + "learning_rate": 0.0002, + "loss": 0.5603, + "step": 29720 + }, + { + "epoch": 4.806402069355752, + "grad_norm": 0.975665807723999, + "learning_rate": 0.0002, + "loss": 0.5894, + "step": 29730 + }, + { + "epoch": 4.808018753536497, + "grad_norm": 0.8458138704299927, + "learning_rate": 0.0002, + "loss": 0.6328, + "step": 29740 + }, + { + "epoch": 4.809635437717242, + "grad_norm": 0.99330073595047, + "learning_rate": 0.0002, + "loss": 0.5837, + "step": 29750 + }, + { + "epoch": 4.811252121897987, + "grad_norm": 0.898274302482605, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 29760 + }, + { + "epoch": 4.812868806078733, + "grad_norm": 1.0504480600357056, + "learning_rate": 0.0002, + "loss": 0.5842, + "step": 29770 + }, + { + "epoch": 4.814485490259478, + "grad_norm": 0.937919020652771, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 29780 + }, + { + "epoch": 4.816102174440223, + "grad_norm": 0.9593307971954346, + "learning_rate": 0.0002, + "loss": 0.5885, + "step": 29790 + }, + { + "epoch": 4.817718858620968, + "grad_norm": 0.9431198835372925, + "learning_rate": 0.0002, + "loss": 0.578, + "step": 29800 + }, + { + "epoch": 4.8193355428017135, + "grad_norm": 1.2729957103729248, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 29810 + }, + { + "epoch": 4.820952226982459, + "grad_norm": 0.8876838684082031, + "learning_rate": 0.0002, + "loss": 0.6124, + "step": 29820 + }, + { + "epoch": 4.822568911163204, + "grad_norm": 1.0185000896453857, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 29830 + }, + { + "epoch": 4.824185595343949, + "grad_norm": 1.064276099205017, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 29840 + }, + { + "epoch": 4.825802279524694, + "grad_norm": 0.9774803519248962, + "learning_rate": 0.0002, + "loss": 0.5698, + "step": 29850 + }, + { + "epoch": 4.8274189637054405, + "grad_norm": 1.131646990776062, + "learning_rate": 0.0002, + "loss": 0.5533, + "step": 29860 + }, + { + "epoch": 4.829035647886186, + "grad_norm": 1.081455945968628, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 29870 + }, + { + "epoch": 4.830652332066931, + "grad_norm": 0.990538477897644, + "learning_rate": 0.0002, + "loss": 0.5793, + "step": 29880 + }, + { + "epoch": 4.832269016247676, + "grad_norm": 0.9750600457191467, + "learning_rate": 0.0002, + "loss": 0.5833, + "step": 29890 + }, + { + "epoch": 4.8338857004284215, + "grad_norm": 1.0600621700286865, + "learning_rate": 0.0002, + "loss": 0.619, + "step": 29900 + }, + { + "epoch": 4.835502384609167, + "grad_norm": 0.9237320423126221, + "learning_rate": 0.0002, + "loss": 0.5841, + "step": 29910 + }, + { + "epoch": 4.837119068789912, + "grad_norm": 0.9739177227020264, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 29920 + }, + { + "epoch": 4.838735752970657, + "grad_norm": 1.128677248954773, + "learning_rate": 0.0002, + "loss": 0.587, + "step": 29930 + }, + { + "epoch": 4.840352437151402, + "grad_norm": 1.042604923248291, + "learning_rate": 0.0002, + "loss": 0.564, + "step": 29940 + }, + { + "epoch": 4.841969121332148, + "grad_norm": 0.849758505821228, + "learning_rate": 0.0002, + "loss": 0.5885, + "step": 29950 + }, + { + "epoch": 4.843585805512893, + "grad_norm": 1.2809888124465942, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 29960 + }, + { + "epoch": 4.845202489693638, + "grad_norm": 1.0177865028381348, + "learning_rate": 0.0002, + "loss": 0.5703, + "step": 29970 + }, + { + "epoch": 4.846819173874383, + "grad_norm": 1.0026639699935913, + "learning_rate": 0.0002, + "loss": 0.5946, + "step": 29980 + }, + { + "epoch": 4.8484358580551286, + "grad_norm": 0.9679505228996277, + "learning_rate": 0.0002, + "loss": 0.5897, + "step": 29990 + }, + { + "epoch": 4.850052542235874, + "grad_norm": 0.8939532041549683, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 30000 + }, + { + "epoch": 4.85166922641662, + "grad_norm": 0.9957457780838013, + "learning_rate": 0.0002, + "loss": 0.5852, + "step": 30010 + }, + { + "epoch": 4.853285910597365, + "grad_norm": 1.1646790504455566, + "learning_rate": 0.0002, + "loss": 0.6117, + "step": 30020 + }, + { + "epoch": 4.85490259477811, + "grad_norm": 0.8804680705070496, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 30030 + }, + { + "epoch": 4.856519278958856, + "grad_norm": 1.161970853805542, + "learning_rate": 0.0002, + "loss": 0.5397, + "step": 30040 + }, + { + "epoch": 4.858135963139601, + "grad_norm": 0.9081037640571594, + "learning_rate": 0.0002, + "loss": 0.5552, + "step": 30050 + }, + { + "epoch": 4.859752647320346, + "grad_norm": 0.9402848482131958, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 30060 + }, + { + "epoch": 4.861369331501091, + "grad_norm": 0.9023865461349487, + "learning_rate": 0.0002, + "loss": 0.6256, + "step": 30070 + }, + { + "epoch": 4.8629860156818365, + "grad_norm": 1.0173414945602417, + "learning_rate": 0.0002, + "loss": 0.5926, + "step": 30080 + }, + { + "epoch": 4.864602699862582, + "grad_norm": 1.084402322769165, + "learning_rate": 0.0002, + "loss": 0.6274, + "step": 30090 + }, + { + "epoch": 4.866219384043327, + "grad_norm": 0.9577937126159668, + "learning_rate": 0.0002, + "loss": 0.6311, + "step": 30100 + }, + { + "epoch": 4.867836068224072, + "grad_norm": 0.9807606935501099, + "learning_rate": 0.0002, + "loss": 0.5724, + "step": 30110 + }, + { + "epoch": 4.8694527524048175, + "grad_norm": 0.978784441947937, + "learning_rate": 0.0002, + "loss": 0.5786, + "step": 30120 + }, + { + "epoch": 4.871069436585563, + "grad_norm": 0.9762914776802063, + "learning_rate": 0.0002, + "loss": 0.6194, + "step": 30130 + }, + { + "epoch": 4.872686120766308, + "grad_norm": 0.9404871463775635, + "learning_rate": 0.0002, + "loss": 0.5892, + "step": 30140 + }, + { + "epoch": 4.874302804947053, + "grad_norm": 1.0069509744644165, + "learning_rate": 0.0002, + "loss": 0.6182, + "step": 30150 + }, + { + "epoch": 4.875919489127799, + "grad_norm": 1.1770923137664795, + "learning_rate": 0.0002, + "loss": 0.6225, + "step": 30160 + }, + { + "epoch": 4.8775361733085445, + "grad_norm": 1.021210789680481, + "learning_rate": 0.0002, + "loss": 0.5657, + "step": 30170 + }, + { + "epoch": 4.87915285748929, + "grad_norm": 0.8512648940086365, + "learning_rate": 0.0002, + "loss": 0.6033, + "step": 30180 + }, + { + "epoch": 4.880769541670035, + "grad_norm": 0.9345870018005371, + "learning_rate": 0.0002, + "loss": 0.5519, + "step": 30190 + }, + { + "epoch": 4.88238622585078, + "grad_norm": 1.0224418640136719, + "learning_rate": 0.0002, + "loss": 0.5682, + "step": 30200 + }, + { + "epoch": 4.884002910031525, + "grad_norm": 1.0316044092178345, + "learning_rate": 0.0002, + "loss": 0.5807, + "step": 30210 + }, + { + "epoch": 4.885619594212271, + "grad_norm": 1.102437973022461, + "learning_rate": 0.0002, + "loss": 0.6065, + "step": 30220 + }, + { + "epoch": 4.887236278393016, + "grad_norm": 1.0220023393630981, + "learning_rate": 0.0002, + "loss": 0.586, + "step": 30230 + }, + { + "epoch": 4.888852962573761, + "grad_norm": 1.0934523344039917, + "learning_rate": 0.0002, + "loss": 0.5781, + "step": 30240 + }, + { + "epoch": 4.890469646754506, + "grad_norm": 1.264630913734436, + "learning_rate": 0.0002, + "loss": 0.6313, + "step": 30250 + }, + { + "epoch": 4.892086330935252, + "grad_norm": 1.0999879837036133, + "learning_rate": 0.0002, + "loss": 0.5712, + "step": 30260 + }, + { + "epoch": 4.893703015115997, + "grad_norm": 0.9124550223350525, + "learning_rate": 0.0002, + "loss": 0.6413, + "step": 30270 + }, + { + "epoch": 4.895319699296742, + "grad_norm": 0.9853624105453491, + "learning_rate": 0.0002, + "loss": 0.596, + "step": 30280 + }, + { + "epoch": 4.896936383477488, + "grad_norm": 1.0589802265167236, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 30290 + }, + { + "epoch": 4.8985530676582325, + "grad_norm": 0.8487226366996765, + "learning_rate": 0.0002, + "loss": 0.6129, + "step": 30300 + }, + { + "epoch": 4.900169751838979, + "grad_norm": 1.0212191343307495, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 30310 + }, + { + "epoch": 4.901786436019724, + "grad_norm": 1.0187491178512573, + "learning_rate": 0.0002, + "loss": 0.5896, + "step": 30320 + }, + { + "epoch": 4.903403120200469, + "grad_norm": 1.0013091564178467, + "learning_rate": 0.0002, + "loss": 0.5809, + "step": 30330 + }, + { + "epoch": 4.905019804381214, + "grad_norm": 1.0017542839050293, + "learning_rate": 0.0002, + "loss": 0.5658, + "step": 30340 + }, + { + "epoch": 4.9066364885619596, + "grad_norm": 0.9665151238441467, + "learning_rate": 0.0002, + "loss": 0.6002, + "step": 30350 + }, + { + "epoch": 4.908253172742705, + "grad_norm": 0.8774822950363159, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 30360 + }, + { + "epoch": 4.90986985692345, + "grad_norm": 0.9449850916862488, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 30370 + }, + { + "epoch": 4.911486541104195, + "grad_norm": 0.7368341088294983, + "learning_rate": 0.0002, + "loss": 0.58, + "step": 30380 + }, + { + "epoch": 4.9131032252849405, + "grad_norm": 0.9669167995452881, + "learning_rate": 0.0002, + "loss": 0.5992, + "step": 30390 + }, + { + "epoch": 4.914719909465686, + "grad_norm": 1.1227794885635376, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 30400 + }, + { + "epoch": 4.916336593646431, + "grad_norm": 0.9884361028671265, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 30410 + }, + { + "epoch": 4.917953277827176, + "grad_norm": 0.9949551224708557, + "learning_rate": 0.0002, + "loss": 0.6185, + "step": 30420 + }, + { + "epoch": 4.919569962007921, + "grad_norm": 0.9491621851921082, + "learning_rate": 0.0002, + "loss": 0.5866, + "step": 30430 + }, + { + "epoch": 4.9211866461886675, + "grad_norm": 0.78848797082901, + "learning_rate": 0.0002, + "loss": 0.6005, + "step": 30440 + }, + { + "epoch": 4.922803330369412, + "grad_norm": 1.0693835020065308, + "learning_rate": 0.0002, + "loss": 0.5561, + "step": 30450 + }, + { + "epoch": 4.924420014550158, + "grad_norm": 0.9573729634284973, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 30460 + }, + { + "epoch": 4.926036698730903, + "grad_norm": 0.9975152611732483, + "learning_rate": 0.0002, + "loss": 0.6084, + "step": 30470 + }, + { + "epoch": 4.9276533829116484, + "grad_norm": 0.8695693016052246, + "learning_rate": 0.0002, + "loss": 0.5969, + "step": 30480 + }, + { + "epoch": 4.929270067092394, + "grad_norm": 1.145394206047058, + "learning_rate": 0.0002, + "loss": 0.6144, + "step": 30490 + }, + { + "epoch": 4.930886751273139, + "grad_norm": 0.7668989896774292, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 30500 + }, + { + "epoch": 4.932503435453884, + "grad_norm": 0.9630151391029358, + "learning_rate": 0.0002, + "loss": 0.6052, + "step": 30510 + }, + { + "epoch": 4.934120119634629, + "grad_norm": 0.940705418586731, + "learning_rate": 0.0002, + "loss": 0.6461, + "step": 30520 + }, + { + "epoch": 4.935736803815375, + "grad_norm": 1.3243348598480225, + "learning_rate": 0.0002, + "loss": 0.6326, + "step": 30530 + }, + { + "epoch": 4.93735348799612, + "grad_norm": 1.004347801208496, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 30540 + }, + { + "epoch": 4.938970172176865, + "grad_norm": 0.8711541295051575, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 30550 + }, + { + "epoch": 4.94058685635761, + "grad_norm": 0.8980631828308105, + "learning_rate": 0.0002, + "loss": 0.599, + "step": 30560 + }, + { + "epoch": 4.9422035405383555, + "grad_norm": 0.8388893604278564, + "learning_rate": 0.0002, + "loss": 0.6024, + "step": 30570 + }, + { + "epoch": 4.943820224719101, + "grad_norm": 1.0991183519363403, + "learning_rate": 0.0002, + "loss": 0.6189, + "step": 30580 + }, + { + "epoch": 4.945436908899847, + "grad_norm": 0.9731075763702393, + "learning_rate": 0.0002, + "loss": 0.5906, + "step": 30590 + }, + { + "epoch": 4.947053593080591, + "grad_norm": 1.3904452323913574, + "learning_rate": 0.0002, + "loss": 0.5883, + "step": 30600 + }, + { + "epoch": 4.948670277261337, + "grad_norm": 1.2489882707595825, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 30610 + }, + { + "epoch": 4.950286961442083, + "grad_norm": 1.240072250366211, + "learning_rate": 0.0002, + "loss": 0.5887, + "step": 30620 + }, + { + "epoch": 4.951903645622828, + "grad_norm": 0.9191411733627319, + "learning_rate": 0.0002, + "loss": 0.5762, + "step": 30630 + }, + { + "epoch": 4.953520329803573, + "grad_norm": 0.8888895511627197, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 30640 + }, + { + "epoch": 4.955137013984318, + "grad_norm": 0.9001450538635254, + "learning_rate": 0.0002, + "loss": 0.6594, + "step": 30650 + }, + { + "epoch": 4.9567536981650635, + "grad_norm": 1.053971767425537, + "learning_rate": 0.0002, + "loss": 0.6047, + "step": 30660 + }, + { + "epoch": 4.958370382345809, + "grad_norm": 1.2224042415618896, + "learning_rate": 0.0002, + "loss": 0.6107, + "step": 30670 + }, + { + "epoch": 4.959987066526554, + "grad_norm": 0.8855111598968506, + "learning_rate": 0.0002, + "loss": 0.6211, + "step": 30680 + }, + { + "epoch": 4.961603750707299, + "grad_norm": 0.9489575624465942, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 30690 + }, + { + "epoch": 4.963220434888044, + "grad_norm": 0.9635404944419861, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 30700 + }, + { + "epoch": 4.96483711906879, + "grad_norm": 1.1784121990203857, + "learning_rate": 0.0002, + "loss": 0.6043, + "step": 30710 + }, + { + "epoch": 4.966453803249535, + "grad_norm": 1.0059462785720825, + "learning_rate": 0.0002, + "loss": 0.5803, + "step": 30720 + }, + { + "epoch": 4.96807048743028, + "grad_norm": 0.9479738473892212, + "learning_rate": 0.0002, + "loss": 0.5759, + "step": 30730 + }, + { + "epoch": 4.969687171611026, + "grad_norm": 1.0624593496322632, + "learning_rate": 0.0002, + "loss": 0.584, + "step": 30740 + }, + { + "epoch": 4.971303855791771, + "grad_norm": 1.1429259777069092, + "learning_rate": 0.0002, + "loss": 0.6202, + "step": 30750 + }, + { + "epoch": 4.972920539972517, + "grad_norm": 0.9102491140365601, + "learning_rate": 0.0002, + "loss": 0.6174, + "step": 30760 + }, + { + "epoch": 4.974537224153262, + "grad_norm": 1.1262688636779785, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 30770 + }, + { + "epoch": 4.976153908334007, + "grad_norm": 1.1415393352508545, + "learning_rate": 0.0002, + "loss": 0.588, + "step": 30780 + }, + { + "epoch": 4.977770592514752, + "grad_norm": 1.083078384399414, + "learning_rate": 0.0002, + "loss": 0.5832, + "step": 30790 + }, + { + "epoch": 4.979387276695498, + "grad_norm": 0.964859127998352, + "learning_rate": 0.0002, + "loss": 0.6025, + "step": 30800 + }, + { + "epoch": 4.981003960876243, + "grad_norm": 0.8704743385314941, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 30810 + }, + { + "epoch": 4.982620645056988, + "grad_norm": 1.0714856386184692, + "learning_rate": 0.0002, + "loss": 0.5666, + "step": 30820 + }, + { + "epoch": 4.984237329237733, + "grad_norm": 0.6818771362304688, + "learning_rate": 0.0002, + "loss": 0.565, + "step": 30830 + }, + { + "epoch": 4.985854013418479, + "grad_norm": 1.0454156398773193, + "learning_rate": 0.0002, + "loss": 0.5999, + "step": 30840 + }, + { + "epoch": 4.987470697599224, + "grad_norm": 0.9410776495933533, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 30850 + }, + { + "epoch": 4.989087381779969, + "grad_norm": 1.0878902673721313, + "learning_rate": 0.0002, + "loss": 0.5899, + "step": 30860 + }, + { + "epoch": 4.990704065960714, + "grad_norm": 0.8916727304458618, + "learning_rate": 0.0002, + "loss": 0.5914, + "step": 30870 + }, + { + "epoch": 4.9923207501414595, + "grad_norm": 1.045776128768921, + "learning_rate": 0.0002, + "loss": 0.6066, + "step": 30880 + }, + { + "epoch": 4.993937434322206, + "grad_norm": 0.9861903786659241, + "learning_rate": 0.0002, + "loss": 0.5767, + "step": 30890 + }, + { + "epoch": 4.995554118502951, + "grad_norm": 0.9275050759315491, + "learning_rate": 0.0002, + "loss": 0.6192, + "step": 30900 + }, + { + "epoch": 4.997170802683696, + "grad_norm": 0.94013911485672, + "learning_rate": 0.0002, + "loss": 0.6181, + "step": 30910 + }, + { + "epoch": 4.998787486864441, + "grad_norm": 0.9771268367767334, + "learning_rate": 0.0002, + "loss": 0.614, + "step": 30920 + }, + { + "epoch": 4.9999191657909625, + "eval_loss": 1.1968598365783691, + "eval_runtime": 122.2519, + "eval_samples_per_second": 5.996, + "eval_steps_per_second": 0.753, + "step": 30927 + }, + { + "epoch": 5.0004041710451865, + "grad_norm": 0.8021580576896667, + "learning_rate": 0.0002, + "loss": 0.5238, + "step": 30930 + }, + { + "epoch": 5.002020855225932, + "grad_norm": 1.0807327032089233, + "learning_rate": 0.0002, + "loss": 0.4984, + "step": 30940 + }, + { + "epoch": 5.003637539406677, + "grad_norm": 1.1638425588607788, + "learning_rate": 0.0002, + "loss": 0.514, + "step": 30950 + }, + { + "epoch": 5.005254223587422, + "grad_norm": 1.1700230836868286, + "learning_rate": 0.0002, + "loss": 0.4621, + "step": 30960 + }, + { + "epoch": 5.0068709077681675, + "grad_norm": 0.9053420424461365, + "learning_rate": 0.0002, + "loss": 0.4657, + "step": 30970 + }, + { + "epoch": 5.008487591948913, + "grad_norm": 0.9226111769676208, + "learning_rate": 0.0002, + "loss": 0.4865, + "step": 30980 + }, + { + "epoch": 5.010104276129658, + "grad_norm": 1.238669514656067, + "learning_rate": 0.0002, + "loss": 0.5011, + "step": 30990 + }, + { + "epoch": 5.011720960310403, + "grad_norm": 1.0668327808380127, + "learning_rate": 0.0002, + "loss": 0.4754, + "step": 31000 + }, + { + "epoch": 5.013337644491148, + "grad_norm": 1.0903944969177246, + "learning_rate": 0.0002, + "loss": 0.5414, + "step": 31010 + }, + { + "epoch": 5.014954328671894, + "grad_norm": 1.0763911008834839, + "learning_rate": 0.0002, + "loss": 0.5117, + "step": 31020 + }, + { + "epoch": 5.016571012852639, + "grad_norm": 1.0108771324157715, + "learning_rate": 0.0002, + "loss": 0.4908, + "step": 31030 + }, + { + "epoch": 5.018187697033385, + "grad_norm": 0.8816103935241699, + "learning_rate": 0.0002, + "loss": 0.5052, + "step": 31040 + }, + { + "epoch": 5.01980438121413, + "grad_norm": 1.11434805393219, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 31050 + }, + { + "epoch": 5.021421065394875, + "grad_norm": 1.0727789402008057, + "learning_rate": 0.0002, + "loss": 0.5074, + "step": 31060 + }, + { + "epoch": 5.023037749575621, + "grad_norm": 1.1480379104614258, + "learning_rate": 0.0002, + "loss": 0.4938, + "step": 31070 + }, + { + "epoch": 5.024654433756366, + "grad_norm": 1.0913071632385254, + "learning_rate": 0.0002, + "loss": 0.491, + "step": 31080 + }, + { + "epoch": 5.026271117937111, + "grad_norm": 0.9891864657402039, + "learning_rate": 0.0002, + "loss": 0.4896, + "step": 31090 + }, + { + "epoch": 5.027887802117856, + "grad_norm": 0.9167473912239075, + "learning_rate": 0.0002, + "loss": 0.4965, + "step": 31100 + }, + { + "epoch": 5.029504486298602, + "grad_norm": 1.2259035110473633, + "learning_rate": 0.0002, + "loss": 0.5098, + "step": 31110 + }, + { + "epoch": 5.031121170479347, + "grad_norm": 1.1812787055969238, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 31120 + }, + { + "epoch": 5.032737854660092, + "grad_norm": 1.0890522003173828, + "learning_rate": 0.0002, + "loss": 0.4725, + "step": 31130 + }, + { + "epoch": 5.034354538840837, + "grad_norm": 1.0521091222763062, + "learning_rate": 0.0002, + "loss": 0.4768, + "step": 31140 + }, + { + "epoch": 5.0359712230215825, + "grad_norm": 1.1274569034576416, + "learning_rate": 0.0002, + "loss": 0.4718, + "step": 31150 + }, + { + "epoch": 5.037587907202328, + "grad_norm": 1.140974998474121, + "learning_rate": 0.0002, + "loss": 0.4604, + "step": 31160 + }, + { + "epoch": 5.039204591383073, + "grad_norm": 1.1215609312057495, + "learning_rate": 0.0002, + "loss": 0.5077, + "step": 31170 + }, + { + "epoch": 5.040821275563818, + "grad_norm": 1.0107218027114868, + "learning_rate": 0.0002, + "loss": 0.4746, + "step": 31180 + }, + { + "epoch": 5.042437959744564, + "grad_norm": 1.0198770761489868, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 31190 + }, + { + "epoch": 5.0440546439253096, + "grad_norm": 1.1613430976867676, + "learning_rate": 0.0002, + "loss": 0.5004, + "step": 31200 + }, + { + "epoch": 5.045671328106055, + "grad_norm": 0.8555458188056946, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 31210 + }, + { + "epoch": 5.0472880122868, + "grad_norm": 1.0235545635223389, + "learning_rate": 0.0002, + "loss": 0.4878, + "step": 31220 + }, + { + "epoch": 5.048904696467545, + "grad_norm": 1.0228750705718994, + "learning_rate": 0.0002, + "loss": 0.499, + "step": 31230 + }, + { + "epoch": 5.0505213806482905, + "grad_norm": 0.8216419816017151, + "learning_rate": 0.0002, + "loss": 0.4544, + "step": 31240 + }, + { + "epoch": 5.052138064829036, + "grad_norm": 0.925828218460083, + "learning_rate": 0.0002, + "loss": 0.4947, + "step": 31250 + }, + { + "epoch": 5.053754749009781, + "grad_norm": 0.9229369759559631, + "learning_rate": 0.0002, + "loss": 0.4835, + "step": 31260 + }, + { + "epoch": 5.055371433190526, + "grad_norm": 0.9531727433204651, + "learning_rate": 0.0002, + "loss": 0.5136, + "step": 31270 + }, + { + "epoch": 5.056988117371271, + "grad_norm": 0.7738548517227173, + "learning_rate": 0.0002, + "loss": 0.5161, + "step": 31280 + }, + { + "epoch": 5.058604801552017, + "grad_norm": 1.0551451444625854, + "learning_rate": 0.0002, + "loss": 0.5166, + "step": 31290 + }, + { + "epoch": 5.060221485732762, + "grad_norm": 0.9782299399375916, + "learning_rate": 0.0002, + "loss": 0.4953, + "step": 31300 + }, + { + "epoch": 5.061838169913507, + "grad_norm": 1.0220632553100586, + "learning_rate": 0.0002, + "loss": 0.4776, + "step": 31310 + }, + { + "epoch": 5.063454854094252, + "grad_norm": 0.9808892607688904, + "learning_rate": 0.0002, + "loss": 0.5117, + "step": 31320 + }, + { + "epoch": 5.065071538274998, + "grad_norm": 1.0662003755569458, + "learning_rate": 0.0002, + "loss": 0.501, + "step": 31330 + }, + { + "epoch": 5.066688222455744, + "grad_norm": 1.0036940574645996, + "learning_rate": 0.0002, + "loss": 0.4844, + "step": 31340 + }, + { + "epoch": 5.068304906636489, + "grad_norm": 1.1931052207946777, + "learning_rate": 0.0002, + "loss": 0.5299, + "step": 31350 + }, + { + "epoch": 5.069921590817234, + "grad_norm": 0.9370693564414978, + "learning_rate": 0.0002, + "loss": 0.4646, + "step": 31360 + }, + { + "epoch": 5.071538274997979, + "grad_norm": 0.9589039087295532, + "learning_rate": 0.0002, + "loss": 0.5274, + "step": 31370 + }, + { + "epoch": 5.073154959178725, + "grad_norm": 1.0052711963653564, + "learning_rate": 0.0002, + "loss": 0.4669, + "step": 31380 + }, + { + "epoch": 5.07477164335947, + "grad_norm": 0.9991368651390076, + "learning_rate": 0.0002, + "loss": 0.5283, + "step": 31390 + }, + { + "epoch": 5.076388327540215, + "grad_norm": 0.8539695739746094, + "learning_rate": 0.0002, + "loss": 0.4579, + "step": 31400 + }, + { + "epoch": 5.07800501172096, + "grad_norm": 1.048775553703308, + "learning_rate": 0.0002, + "loss": 0.4609, + "step": 31410 + }, + { + "epoch": 5.0796216959017055, + "grad_norm": 0.9983724355697632, + "learning_rate": 0.0002, + "loss": 0.4915, + "step": 31420 + }, + { + "epoch": 5.081238380082451, + "grad_norm": 1.0189813375473022, + "learning_rate": 0.0002, + "loss": 0.4594, + "step": 31430 + }, + { + "epoch": 5.082855064263196, + "grad_norm": 0.9781646728515625, + "learning_rate": 0.0002, + "loss": 0.5449, + "step": 31440 + }, + { + "epoch": 5.084471748443941, + "grad_norm": 0.9424566030502319, + "learning_rate": 0.0002, + "loss": 0.4698, + "step": 31450 + }, + { + "epoch": 5.0860884326246865, + "grad_norm": 1.0036484003067017, + "learning_rate": 0.0002, + "loss": 0.4768, + "step": 31460 + }, + { + "epoch": 5.087705116805432, + "grad_norm": 1.0983147621154785, + "learning_rate": 0.0002, + "loss": 0.487, + "step": 31470 + }, + { + "epoch": 5.089321800986177, + "grad_norm": 1.0856730937957764, + "learning_rate": 0.0002, + "loss": 0.5236, + "step": 31480 + }, + { + "epoch": 5.090938485166923, + "grad_norm": 1.2191699743270874, + "learning_rate": 0.0002, + "loss": 0.485, + "step": 31490 + }, + { + "epoch": 5.092555169347668, + "grad_norm": 0.939346194267273, + "learning_rate": 0.0002, + "loss": 0.4936, + "step": 31500 + }, + { + "epoch": 5.0941718535284135, + "grad_norm": 0.9730121493339539, + "learning_rate": 0.0002, + "loss": 0.5107, + "step": 31510 + }, + { + "epoch": 5.095788537709159, + "grad_norm": 0.923686146736145, + "learning_rate": 0.0002, + "loss": 0.4973, + "step": 31520 + }, + { + "epoch": 5.097405221889904, + "grad_norm": 1.1734349727630615, + "learning_rate": 0.0002, + "loss": 0.4906, + "step": 31530 + }, + { + "epoch": 5.099021906070649, + "grad_norm": 1.084509015083313, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 31540 + }, + { + "epoch": 5.100638590251394, + "grad_norm": 1.0144678354263306, + "learning_rate": 0.0002, + "loss": 0.5078, + "step": 31550 + }, + { + "epoch": 5.10225527443214, + "grad_norm": 0.9958019256591797, + "learning_rate": 0.0002, + "loss": 0.4719, + "step": 31560 + }, + { + "epoch": 5.103871958612885, + "grad_norm": 0.8900736570358276, + "learning_rate": 0.0002, + "loss": 0.4876, + "step": 31570 + }, + { + "epoch": 5.10548864279363, + "grad_norm": 1.0921649932861328, + "learning_rate": 0.0002, + "loss": 0.463, + "step": 31580 + }, + { + "epoch": 5.107105326974375, + "grad_norm": 1.1613792181015015, + "learning_rate": 0.0002, + "loss": 0.5148, + "step": 31590 + }, + { + "epoch": 5.108722011155121, + "grad_norm": 0.9211367964744568, + "learning_rate": 0.0002, + "loss": 0.5055, + "step": 31600 + }, + { + "epoch": 5.110338695335866, + "grad_norm": 1.3315813541412354, + "learning_rate": 0.0002, + "loss": 0.5364, + "step": 31610 + }, + { + "epoch": 5.111955379516611, + "grad_norm": 1.3765019178390503, + "learning_rate": 0.0002, + "loss": 0.5336, + "step": 31620 + }, + { + "epoch": 5.113572063697356, + "grad_norm": 1.070198893547058, + "learning_rate": 0.0002, + "loss": 0.4861, + "step": 31630 + }, + { + "epoch": 5.115188747878102, + "grad_norm": 0.947631299495697, + "learning_rate": 0.0002, + "loss": 0.5046, + "step": 31640 + }, + { + "epoch": 5.116805432058848, + "grad_norm": 1.0197371244430542, + "learning_rate": 0.0002, + "loss": 0.5297, + "step": 31650 + }, + { + "epoch": 5.118422116239593, + "grad_norm": 0.8647911548614502, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 31660 + }, + { + "epoch": 5.120038800420338, + "grad_norm": 0.8944075107574463, + "learning_rate": 0.0002, + "loss": 0.4705, + "step": 31670 + }, + { + "epoch": 5.121655484601083, + "grad_norm": 1.124497652053833, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 31680 + }, + { + "epoch": 5.123272168781829, + "grad_norm": 0.893131673336029, + "learning_rate": 0.0002, + "loss": 0.5109, + "step": 31690 + }, + { + "epoch": 5.124888852962574, + "grad_norm": 1.0122284889221191, + "learning_rate": 0.0002, + "loss": 0.4937, + "step": 31700 + }, + { + "epoch": 5.126505537143319, + "grad_norm": 0.9493719935417175, + "learning_rate": 0.0002, + "loss": 0.5522, + "step": 31710 + }, + { + "epoch": 5.128122221324064, + "grad_norm": 0.9700539112091064, + "learning_rate": 0.0002, + "loss": 0.5031, + "step": 31720 + }, + { + "epoch": 5.1297389055048095, + "grad_norm": 1.111677646636963, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 31730 + }, + { + "epoch": 5.131355589685555, + "grad_norm": 0.8204274773597717, + "learning_rate": 0.0002, + "loss": 0.5272, + "step": 31740 + }, + { + "epoch": 5.1329722738663, + "grad_norm": 1.1029267311096191, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 31750 + }, + { + "epoch": 5.134588958047045, + "grad_norm": 1.065575122833252, + "learning_rate": 0.0002, + "loss": 0.505, + "step": 31760 + }, + { + "epoch": 5.13620564222779, + "grad_norm": 0.8208706974983215, + "learning_rate": 0.0002, + "loss": 0.502, + "step": 31770 + }, + { + "epoch": 5.137822326408536, + "grad_norm": 1.0520979166030884, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 31780 + }, + { + "epoch": 5.139439010589282, + "grad_norm": 0.8585538268089294, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 31790 + }, + { + "epoch": 5.141055694770027, + "grad_norm": 1.1491447687149048, + "learning_rate": 0.0002, + "loss": 0.5159, + "step": 31800 + }, + { + "epoch": 5.142672378950772, + "grad_norm": 0.9441081285476685, + "learning_rate": 0.0002, + "loss": 0.5157, + "step": 31810 + }, + { + "epoch": 5.1442890631315175, + "grad_norm": 1.4146889448165894, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 31820 + }, + { + "epoch": 5.145905747312263, + "grad_norm": 1.0326547622680664, + "learning_rate": 0.0002, + "loss": 0.5159, + "step": 31830 + }, + { + "epoch": 5.147522431493008, + "grad_norm": 0.9879202842712402, + "learning_rate": 0.0002, + "loss": 0.5348, + "step": 31840 + }, + { + "epoch": 5.149139115673753, + "grad_norm": 1.0374281406402588, + "learning_rate": 0.0002, + "loss": 0.5083, + "step": 31850 + }, + { + "epoch": 5.150755799854498, + "grad_norm": 1.181229591369629, + "learning_rate": 0.0002, + "loss": 0.4827, + "step": 31860 + }, + { + "epoch": 5.152372484035244, + "grad_norm": 1.2078537940979004, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 31870 + }, + { + "epoch": 5.153989168215989, + "grad_norm": 0.9599190354347229, + "learning_rate": 0.0002, + "loss": 0.5329, + "step": 31880 + }, + { + "epoch": 5.155605852396734, + "grad_norm": 1.0378568172454834, + "learning_rate": 0.0002, + "loss": 0.4953, + "step": 31890 + }, + { + "epoch": 5.157222536577479, + "grad_norm": 0.8746536374092102, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 31900 + }, + { + "epoch": 5.1588392207582245, + "grad_norm": 1.0232136249542236, + "learning_rate": 0.0002, + "loss": 0.5272, + "step": 31910 + }, + { + "epoch": 5.16045590493897, + "grad_norm": 0.9827565550804138, + "learning_rate": 0.0002, + "loss": 0.4844, + "step": 31920 + }, + { + "epoch": 5.162072589119716, + "grad_norm": 1.342657208442688, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 31930 + }, + { + "epoch": 5.163689273300461, + "grad_norm": 1.18390691280365, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 31940 + }, + { + "epoch": 5.165305957481206, + "grad_norm": 0.996350109577179, + "learning_rate": 0.0002, + "loss": 0.5267, + "step": 31950 + }, + { + "epoch": 5.166922641661952, + "grad_norm": 0.9710391163825989, + "learning_rate": 0.0002, + "loss": 0.5063, + "step": 31960 + }, + { + "epoch": 5.168539325842697, + "grad_norm": 1.0264002084732056, + "learning_rate": 0.0002, + "loss": 0.5115, + "step": 31970 + }, + { + "epoch": 5.170156010023442, + "grad_norm": 1.0028311014175415, + "learning_rate": 0.0002, + "loss": 0.4972, + "step": 31980 + }, + { + "epoch": 5.171772694204187, + "grad_norm": 1.1078234910964966, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 31990 + }, + { + "epoch": 5.1733893783849325, + "grad_norm": 0.9659610390663147, + "learning_rate": 0.0002, + "loss": 0.495, + "step": 32000 + }, + { + "epoch": 5.175006062565678, + "grad_norm": 0.841986894607544, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 32010 + }, + { + "epoch": 5.176622746746423, + "grad_norm": 1.095332384109497, + "learning_rate": 0.0002, + "loss": 0.48, + "step": 32020 + }, + { + "epoch": 5.178239430927168, + "grad_norm": 1.1242377758026123, + "learning_rate": 0.0002, + "loss": 0.4741, + "step": 32030 + }, + { + "epoch": 5.179856115107913, + "grad_norm": 0.9872292280197144, + "learning_rate": 0.0002, + "loss": 0.5573, + "step": 32040 + }, + { + "epoch": 5.181472799288659, + "grad_norm": 0.936161994934082, + "learning_rate": 0.0002, + "loss": 0.48, + "step": 32050 + }, + { + "epoch": 5.183089483469404, + "grad_norm": 1.166100025177002, + "learning_rate": 0.0002, + "loss": 0.5093, + "step": 32060 + }, + { + "epoch": 5.184706167650149, + "grad_norm": 1.0764425992965698, + "learning_rate": 0.0002, + "loss": 0.5438, + "step": 32070 + }, + { + "epoch": 5.186322851830895, + "grad_norm": 1.0480051040649414, + "learning_rate": 0.0002, + "loss": 0.4843, + "step": 32080 + }, + { + "epoch": 5.1879395360116405, + "grad_norm": 1.0874916315078735, + "learning_rate": 0.0002, + "loss": 0.5386, + "step": 32090 + }, + { + "epoch": 5.189556220192386, + "grad_norm": 1.0817396640777588, + "learning_rate": 0.0002, + "loss": 0.4975, + "step": 32100 + }, + { + "epoch": 5.191172904373131, + "grad_norm": 1.054111361503601, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 32110 + }, + { + "epoch": 5.192789588553876, + "grad_norm": 0.9655823707580566, + "learning_rate": 0.0002, + "loss": 0.5229, + "step": 32120 + }, + { + "epoch": 5.194406272734621, + "grad_norm": 1.1384109258651733, + "learning_rate": 0.0002, + "loss": 0.5105, + "step": 32130 + }, + { + "epoch": 5.196022956915367, + "grad_norm": 1.0149348974227905, + "learning_rate": 0.0002, + "loss": 0.5073, + "step": 32140 + }, + { + "epoch": 5.197639641096112, + "grad_norm": 1.1084046363830566, + "learning_rate": 0.0002, + "loss": 0.5293, + "step": 32150 + }, + { + "epoch": 5.199256325276857, + "grad_norm": 1.1209309101104736, + "learning_rate": 0.0002, + "loss": 0.4936, + "step": 32160 + }, + { + "epoch": 5.200873009457602, + "grad_norm": 1.133089542388916, + "learning_rate": 0.0002, + "loss": 0.5101, + "step": 32170 + }, + { + "epoch": 5.202489693638348, + "grad_norm": 1.0893020629882812, + "learning_rate": 0.0002, + "loss": 0.5242, + "step": 32180 + }, + { + "epoch": 5.204106377819093, + "grad_norm": 0.90018630027771, + "learning_rate": 0.0002, + "loss": 0.4872, + "step": 32190 + }, + { + "epoch": 5.205723061999838, + "grad_norm": 0.977622926235199, + "learning_rate": 0.0002, + "loss": 0.4999, + "step": 32200 + }, + { + "epoch": 5.207339746180583, + "grad_norm": 1.2940177917480469, + "learning_rate": 0.0002, + "loss": 0.5028, + "step": 32210 + }, + { + "epoch": 5.2089564303613285, + "grad_norm": 1.2131710052490234, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 32220 + }, + { + "epoch": 5.210573114542075, + "grad_norm": 1.0234841108322144, + "learning_rate": 0.0002, + "loss": 0.5189, + "step": 32230 + }, + { + "epoch": 5.21218979872282, + "grad_norm": 1.157975435256958, + "learning_rate": 0.0002, + "loss": 0.5424, + "step": 32240 + }, + { + "epoch": 5.213806482903565, + "grad_norm": 1.0381282567977905, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 32250 + }, + { + "epoch": 5.21542316708431, + "grad_norm": 1.0125395059585571, + "learning_rate": 0.0002, + "loss": 0.5192, + "step": 32260 + }, + { + "epoch": 5.2170398512650555, + "grad_norm": 1.272691011428833, + "learning_rate": 0.0002, + "loss": 0.5216, + "step": 32270 + }, + { + "epoch": 5.218656535445801, + "grad_norm": 1.0061250925064087, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 32280 + }, + { + "epoch": 5.220273219626546, + "grad_norm": 0.9752234816551208, + "learning_rate": 0.0002, + "loss": 0.4739, + "step": 32290 + }, + { + "epoch": 5.221889903807291, + "grad_norm": 1.1193140745162964, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 32300 + }, + { + "epoch": 5.2235065879880365, + "grad_norm": 1.0126434564590454, + "learning_rate": 0.0002, + "loss": 0.4976, + "step": 32310 + }, + { + "epoch": 5.225123272168782, + "grad_norm": 1.4338394403457642, + "learning_rate": 0.0002, + "loss": 0.5257, + "step": 32320 + }, + { + "epoch": 5.226739956349527, + "grad_norm": 1.004101276397705, + "learning_rate": 0.0002, + "loss": 0.5235, + "step": 32330 + }, + { + "epoch": 5.228356640530272, + "grad_norm": 0.8744166493415833, + "learning_rate": 0.0002, + "loss": 0.5091, + "step": 32340 + }, + { + "epoch": 5.229973324711017, + "grad_norm": 1.0165376663208008, + "learning_rate": 0.0002, + "loss": 0.5388, + "step": 32350 + }, + { + "epoch": 5.231590008891763, + "grad_norm": 0.8635954260826111, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 32360 + }, + { + "epoch": 5.233206693072509, + "grad_norm": 1.1392399072647095, + "learning_rate": 0.0002, + "loss": 0.5609, + "step": 32370 + }, + { + "epoch": 5.234823377253254, + "grad_norm": 1.0202113389968872, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 32380 + }, + { + "epoch": 5.236440061433999, + "grad_norm": 1.0417983531951904, + "learning_rate": 0.0002, + "loss": 0.4983, + "step": 32390 + }, + { + "epoch": 5.238056745614744, + "grad_norm": 0.8729333877563477, + "learning_rate": 0.0002, + "loss": 0.507, + "step": 32400 + }, + { + "epoch": 5.23967342979549, + "grad_norm": 1.1626229286193848, + "learning_rate": 0.0002, + "loss": 0.5426, + "step": 32410 + }, + { + "epoch": 5.241290113976235, + "grad_norm": 0.9086161851882935, + "learning_rate": 0.0002, + "loss": 0.5355, + "step": 32420 + }, + { + "epoch": 5.24290679815698, + "grad_norm": 1.3999892473220825, + "learning_rate": 0.0002, + "loss": 0.4927, + "step": 32430 + }, + { + "epoch": 5.244523482337725, + "grad_norm": 1.0356311798095703, + "learning_rate": 0.0002, + "loss": 0.4795, + "step": 32440 + }, + { + "epoch": 5.246140166518471, + "grad_norm": 0.9655531644821167, + "learning_rate": 0.0002, + "loss": 0.5035, + "step": 32450 + }, + { + "epoch": 5.247756850699216, + "grad_norm": 1.0411828756332397, + "learning_rate": 0.0002, + "loss": 0.5166, + "step": 32460 + }, + { + "epoch": 5.249373534879961, + "grad_norm": 1.1199816465377808, + "learning_rate": 0.0002, + "loss": 0.5141, + "step": 32470 + }, + { + "epoch": 5.250990219060706, + "grad_norm": 1.260321855545044, + "learning_rate": 0.0002, + "loss": 0.4864, + "step": 32480 + }, + { + "epoch": 5.2526069032414515, + "grad_norm": 1.2950857877731323, + "learning_rate": 0.0002, + "loss": 0.4893, + "step": 32490 + }, + { + "epoch": 5.254223587422197, + "grad_norm": 0.8982820510864258, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 32500 + }, + { + "epoch": 5.255840271602942, + "grad_norm": 0.8512987494468689, + "learning_rate": 0.0002, + "loss": 0.5138, + "step": 32510 + }, + { + "epoch": 5.257456955783688, + "grad_norm": 1.067443609237671, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 32520 + }, + { + "epoch": 5.259073639964433, + "grad_norm": 1.0957417488098145, + "learning_rate": 0.0002, + "loss": 0.4928, + "step": 32530 + }, + { + "epoch": 5.260690324145179, + "grad_norm": 1.4161807298660278, + "learning_rate": 0.0002, + "loss": 0.5169, + "step": 32540 + }, + { + "epoch": 5.262307008325924, + "grad_norm": 1.2264093160629272, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 32550 + }, + { + "epoch": 5.263923692506669, + "grad_norm": 1.0015931129455566, + "learning_rate": 0.0002, + "loss": 0.5221, + "step": 32560 + }, + { + "epoch": 5.265540376687414, + "grad_norm": 1.0743094682693481, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 32570 + }, + { + "epoch": 5.2671570608681595, + "grad_norm": 1.1386840343475342, + "learning_rate": 0.0002, + "loss": 0.5289, + "step": 32580 + }, + { + "epoch": 5.268773745048905, + "grad_norm": 1.0093860626220703, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 32590 + }, + { + "epoch": 5.27039042922965, + "grad_norm": 0.9593744874000549, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 32600 + }, + { + "epoch": 5.272007113410395, + "grad_norm": 1.146021842956543, + "learning_rate": 0.0002, + "loss": 0.528, + "step": 32610 + }, + { + "epoch": 5.27362379759114, + "grad_norm": 0.9579031467437744, + "learning_rate": 0.0002, + "loss": 0.4983, + "step": 32620 + }, + { + "epoch": 5.275240481771886, + "grad_norm": 1.0548793077468872, + "learning_rate": 0.0002, + "loss": 0.5376, + "step": 32630 + }, + { + "epoch": 5.276857165952631, + "grad_norm": 1.0380561351776123, + "learning_rate": 0.0002, + "loss": 0.5267, + "step": 32640 + }, + { + "epoch": 5.278473850133376, + "grad_norm": 1.2119969129562378, + "learning_rate": 0.0002, + "loss": 0.5182, + "step": 32650 + }, + { + "epoch": 5.280090534314121, + "grad_norm": 1.0507797002792358, + "learning_rate": 0.0002, + "loss": 0.5298, + "step": 32660 + }, + { + "epoch": 5.2817072184948675, + "grad_norm": 1.0185176134109497, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 32670 + }, + { + "epoch": 5.283323902675613, + "grad_norm": 1.2358098030090332, + "learning_rate": 0.0002, + "loss": 0.4904, + "step": 32680 + }, + { + "epoch": 5.284940586856358, + "grad_norm": 0.7937114238739014, + "learning_rate": 0.0002, + "loss": 0.5169, + "step": 32690 + }, + { + "epoch": 5.286557271037103, + "grad_norm": 0.9825124740600586, + "learning_rate": 0.0002, + "loss": 0.495, + "step": 32700 + }, + { + "epoch": 5.288173955217848, + "grad_norm": 1.2059301137924194, + "learning_rate": 0.0002, + "loss": 0.5149, + "step": 32710 + }, + { + "epoch": 5.289790639398594, + "grad_norm": 1.0828571319580078, + "learning_rate": 0.0002, + "loss": 0.5272, + "step": 32720 + }, + { + "epoch": 5.291407323579339, + "grad_norm": 1.0129735469818115, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 32730 + }, + { + "epoch": 5.293024007760084, + "grad_norm": 1.0591634511947632, + "learning_rate": 0.0002, + "loss": 0.5216, + "step": 32740 + }, + { + "epoch": 5.294640691940829, + "grad_norm": 0.9256815910339355, + "learning_rate": 0.0002, + "loss": 0.522, + "step": 32750 + }, + { + "epoch": 5.2962573761215745, + "grad_norm": 1.0928633213043213, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 32760 + }, + { + "epoch": 5.29787406030232, + "grad_norm": 0.9415594935417175, + "learning_rate": 0.0002, + "loss": 0.5093, + "step": 32770 + }, + { + "epoch": 5.299490744483065, + "grad_norm": 1.141316294670105, + "learning_rate": 0.0002, + "loss": 0.5252, + "step": 32780 + }, + { + "epoch": 5.30110742866381, + "grad_norm": 1.0646510124206543, + "learning_rate": 0.0002, + "loss": 0.4837, + "step": 32790 + }, + { + "epoch": 5.3027241128445555, + "grad_norm": 1.189661979675293, + "learning_rate": 0.0002, + "loss": 0.5547, + "step": 32800 + }, + { + "epoch": 5.304340797025301, + "grad_norm": 0.9568731188774109, + "learning_rate": 0.0002, + "loss": 0.5664, + "step": 32810 + }, + { + "epoch": 5.305957481206047, + "grad_norm": 1.1556824445724487, + "learning_rate": 0.0002, + "loss": 0.5344, + "step": 32820 + }, + { + "epoch": 5.307574165386792, + "grad_norm": 0.9353463649749756, + "learning_rate": 0.0002, + "loss": 0.4894, + "step": 32830 + }, + { + "epoch": 5.309190849567537, + "grad_norm": 1.1208295822143555, + "learning_rate": 0.0002, + "loss": 0.5052, + "step": 32840 + }, + { + "epoch": 5.3108075337482825, + "grad_norm": 1.0894153118133545, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 32850 + }, + { + "epoch": 5.312424217929028, + "grad_norm": 1.090329647064209, + "learning_rate": 0.0002, + "loss": 0.5046, + "step": 32860 + }, + { + "epoch": 5.314040902109773, + "grad_norm": 1.0781712532043457, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 32870 + }, + { + "epoch": 5.315657586290518, + "grad_norm": 1.1785295009613037, + "learning_rate": 0.0002, + "loss": 0.57, + "step": 32880 + }, + { + "epoch": 5.317274270471263, + "grad_norm": 1.0406851768493652, + "learning_rate": 0.0002, + "loss": 0.4953, + "step": 32890 + }, + { + "epoch": 5.318890954652009, + "grad_norm": 1.0982953310012817, + "learning_rate": 0.0002, + "loss": 0.514, + "step": 32900 + }, + { + "epoch": 5.320507638832754, + "grad_norm": 1.2969383001327515, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 32910 + }, + { + "epoch": 5.322124323013499, + "grad_norm": 0.9687288999557495, + "learning_rate": 0.0002, + "loss": 0.4786, + "step": 32920 + }, + { + "epoch": 5.323741007194244, + "grad_norm": 1.136760950088501, + "learning_rate": 0.0002, + "loss": 0.5286, + "step": 32930 + }, + { + "epoch": 5.32535769137499, + "grad_norm": 1.3045495748519897, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 32940 + }, + { + "epoch": 5.326974375555735, + "grad_norm": 1.221675992012024, + "learning_rate": 0.0002, + "loss": 0.5413, + "step": 32950 + }, + { + "epoch": 5.32859105973648, + "grad_norm": 1.1380633115768433, + "learning_rate": 0.0002, + "loss": 0.4999, + "step": 32960 + }, + { + "epoch": 5.330207743917226, + "grad_norm": 1.1065956354141235, + "learning_rate": 0.0002, + "loss": 0.5037, + "step": 32970 + }, + { + "epoch": 5.331824428097971, + "grad_norm": 1.0187175273895264, + "learning_rate": 0.0002, + "loss": 0.4913, + "step": 32980 + }, + { + "epoch": 5.333441112278717, + "grad_norm": 0.9077118039131165, + "learning_rate": 0.0002, + "loss": 0.5234, + "step": 32990 + }, + { + "epoch": 5.335057796459462, + "grad_norm": 1.0092815160751343, + "learning_rate": 0.0002, + "loss": 0.5071, + "step": 33000 + }, + { + "epoch": 5.336674480640207, + "grad_norm": 1.0168777704238892, + "learning_rate": 0.0002, + "loss": 0.498, + "step": 33010 + }, + { + "epoch": 5.338291164820952, + "grad_norm": 0.996161937713623, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 33020 + }, + { + "epoch": 5.339907849001698, + "grad_norm": 0.794463038444519, + "learning_rate": 0.0002, + "loss": 0.5024, + "step": 33030 + }, + { + "epoch": 5.341524533182443, + "grad_norm": 0.9750674962997437, + "learning_rate": 0.0002, + "loss": 0.5112, + "step": 33040 + }, + { + "epoch": 5.343141217363188, + "grad_norm": 1.2770029306411743, + "learning_rate": 0.0002, + "loss": 0.528, + "step": 33050 + }, + { + "epoch": 5.344757901543933, + "grad_norm": 1.1500186920166016, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 33060 + }, + { + "epoch": 5.3463745857246785, + "grad_norm": 1.0726377964019775, + "learning_rate": 0.0002, + "loss": 0.4906, + "step": 33070 + }, + { + "epoch": 5.347991269905424, + "grad_norm": 0.9314153790473938, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 33080 + }, + { + "epoch": 5.349607954086169, + "grad_norm": 1.344988465309143, + "learning_rate": 0.0002, + "loss": 0.5434, + "step": 33090 + }, + { + "epoch": 5.351224638266914, + "grad_norm": 0.863196611404419, + "learning_rate": 0.0002, + "loss": 0.4874, + "step": 33100 + }, + { + "epoch": 5.352841322447659, + "grad_norm": 1.128100037574768, + "learning_rate": 0.0002, + "loss": 0.534, + "step": 33110 + }, + { + "epoch": 5.3544580066284055, + "grad_norm": 1.1673583984375, + "learning_rate": 0.0002, + "loss": 0.5293, + "step": 33120 + }, + { + "epoch": 5.356074690809151, + "grad_norm": 0.9416789412498474, + "learning_rate": 0.0002, + "loss": 0.4787, + "step": 33130 + }, + { + "epoch": 5.357691374989896, + "grad_norm": 1.1855236291885376, + "learning_rate": 0.0002, + "loss": 0.5155, + "step": 33140 + }, + { + "epoch": 5.359308059170641, + "grad_norm": 1.0415170192718506, + "learning_rate": 0.0002, + "loss": 0.515, + "step": 33150 + }, + { + "epoch": 5.3609247433513865, + "grad_norm": 0.9953004121780396, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 33160 + }, + { + "epoch": 5.362541427532132, + "grad_norm": 0.96138596534729, + "learning_rate": 0.0002, + "loss": 0.5305, + "step": 33170 + }, + { + "epoch": 5.364158111712877, + "grad_norm": 1.341979742050171, + "learning_rate": 0.0002, + "loss": 0.5064, + "step": 33180 + }, + { + "epoch": 5.365774795893622, + "grad_norm": 1.0136911869049072, + "learning_rate": 0.0002, + "loss": 0.4986, + "step": 33190 + }, + { + "epoch": 5.367391480074367, + "grad_norm": 0.8685575127601624, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 33200 + }, + { + "epoch": 5.369008164255113, + "grad_norm": 0.8833574652671814, + "learning_rate": 0.0002, + "loss": 0.5146, + "step": 33210 + }, + { + "epoch": 5.370624848435858, + "grad_norm": 0.9123612642288208, + "learning_rate": 0.0002, + "loss": 0.4982, + "step": 33220 + }, + { + "epoch": 5.372241532616603, + "grad_norm": 1.2720599174499512, + "learning_rate": 0.0002, + "loss": 0.5047, + "step": 33230 + }, + { + "epoch": 5.373858216797348, + "grad_norm": 1.0596648454666138, + "learning_rate": 0.0002, + "loss": 0.5175, + "step": 33240 + }, + { + "epoch": 5.3754749009780936, + "grad_norm": 1.119701623916626, + "learning_rate": 0.0002, + "loss": 0.5284, + "step": 33250 + }, + { + "epoch": 5.377091585158839, + "grad_norm": 1.3000061511993408, + "learning_rate": 0.0002, + "loss": 0.5217, + "step": 33260 + }, + { + "epoch": 5.378708269339585, + "grad_norm": 1.083891749382019, + "learning_rate": 0.0002, + "loss": 0.5125, + "step": 33270 + }, + { + "epoch": 5.38032495352033, + "grad_norm": 0.9402718544006348, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 33280 + }, + { + "epoch": 5.381941637701075, + "grad_norm": 1.3376892805099487, + "learning_rate": 0.0002, + "loss": 0.5559, + "step": 33290 + }, + { + "epoch": 5.383558321881821, + "grad_norm": 1.1600074768066406, + "learning_rate": 0.0002, + "loss": 0.5193, + "step": 33300 + }, + { + "epoch": 5.385175006062566, + "grad_norm": 1.1449427604675293, + "learning_rate": 0.0002, + "loss": 0.4907, + "step": 33310 + }, + { + "epoch": 5.386791690243311, + "grad_norm": 1.3118891716003418, + "learning_rate": 0.0002, + "loss": 0.5449, + "step": 33320 + }, + { + "epoch": 5.388408374424056, + "grad_norm": 0.743449866771698, + "learning_rate": 0.0002, + "loss": 0.547, + "step": 33330 + }, + { + "epoch": 5.3900250586048015, + "grad_norm": 0.9358304142951965, + "learning_rate": 0.0002, + "loss": 0.5555, + "step": 33340 + }, + { + "epoch": 5.391641742785547, + "grad_norm": 1.0447142124176025, + "learning_rate": 0.0002, + "loss": 0.5558, + "step": 33350 + }, + { + "epoch": 5.393258426966292, + "grad_norm": 1.1088626384735107, + "learning_rate": 0.0002, + "loss": 0.5106, + "step": 33360 + }, + { + "epoch": 5.394875111147037, + "grad_norm": 1.1267958879470825, + "learning_rate": 0.0002, + "loss": 0.4929, + "step": 33370 + }, + { + "epoch": 5.3964917953277824, + "grad_norm": 0.9709370136260986, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 33380 + }, + { + "epoch": 5.398108479508528, + "grad_norm": 1.0939103364944458, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 33390 + }, + { + "epoch": 5.399725163689273, + "grad_norm": 0.9559304714202881, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 33400 + }, + { + "epoch": 5.401341847870018, + "grad_norm": 1.199580430984497, + "learning_rate": 0.0002, + "loss": 0.5064, + "step": 33410 + }, + { + "epoch": 5.402958532050764, + "grad_norm": 0.9097000360488892, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 33420 + }, + { + "epoch": 5.4045752162315095, + "grad_norm": 1.1940981149673462, + "learning_rate": 0.0002, + "loss": 0.514, + "step": 33430 + }, + { + "epoch": 5.406191900412255, + "grad_norm": 1.0530916452407837, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 33440 + }, + { + "epoch": 5.407808584593, + "grad_norm": 1.0482549667358398, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 33450 + }, + { + "epoch": 5.409425268773745, + "grad_norm": 1.2524714469909668, + "learning_rate": 0.0002, + "loss": 0.501, + "step": 33460 + }, + { + "epoch": 5.41104195295449, + "grad_norm": 1.1091666221618652, + "learning_rate": 0.0002, + "loss": 0.5597, + "step": 33470 + }, + { + "epoch": 5.412658637135236, + "grad_norm": 0.9981587529182434, + "learning_rate": 0.0002, + "loss": 0.546, + "step": 33480 + }, + { + "epoch": 5.414275321315981, + "grad_norm": 1.016681432723999, + "learning_rate": 0.0002, + "loss": 0.4977, + "step": 33490 + }, + { + "epoch": 5.415892005496726, + "grad_norm": 1.1456854343414307, + "learning_rate": 0.0002, + "loss": 0.5388, + "step": 33500 + }, + { + "epoch": 5.417508689677471, + "grad_norm": 1.1454259157180786, + "learning_rate": 0.0002, + "loss": 0.5292, + "step": 33510 + }, + { + "epoch": 5.419125373858217, + "grad_norm": 0.9858416318893433, + "learning_rate": 0.0002, + "loss": 0.5061, + "step": 33520 + }, + { + "epoch": 5.420742058038962, + "grad_norm": 0.9764766693115234, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 33530 + }, + { + "epoch": 5.422358742219707, + "grad_norm": 1.199920892715454, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 33540 + }, + { + "epoch": 5.423975426400452, + "grad_norm": 1.3107370138168335, + "learning_rate": 0.0002, + "loss": 0.5182, + "step": 33550 + }, + { + "epoch": 5.4255921105811975, + "grad_norm": 0.9637970328330994, + "learning_rate": 0.0002, + "loss": 0.5149, + "step": 33560 + }, + { + "epoch": 5.427208794761944, + "grad_norm": 1.023359775543213, + "learning_rate": 0.0002, + "loss": 0.526, + "step": 33570 + }, + { + "epoch": 5.428825478942689, + "grad_norm": 1.060417652130127, + "learning_rate": 0.0002, + "loss": 0.5206, + "step": 33580 + }, + { + "epoch": 5.430442163123434, + "grad_norm": 0.9971120953559875, + "learning_rate": 0.0002, + "loss": 0.5052, + "step": 33590 + }, + { + "epoch": 5.432058847304179, + "grad_norm": 0.9213743209838867, + "learning_rate": 0.0002, + "loss": 0.5044, + "step": 33600 + }, + { + "epoch": 5.4336755314849245, + "grad_norm": 1.1512309312820435, + "learning_rate": 0.0002, + "loss": 0.5714, + "step": 33610 + }, + { + "epoch": 5.43529221566567, + "grad_norm": 1.2198847532272339, + "learning_rate": 0.0002, + "loss": 0.5317, + "step": 33620 + }, + { + "epoch": 5.436908899846415, + "grad_norm": 1.0329595804214478, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 33630 + }, + { + "epoch": 5.43852558402716, + "grad_norm": 1.1075750589370728, + "learning_rate": 0.0002, + "loss": 0.5364, + "step": 33640 + }, + { + "epoch": 5.4401422682079055, + "grad_norm": 1.006342887878418, + "learning_rate": 0.0002, + "loss": 0.5295, + "step": 33650 + }, + { + "epoch": 5.441758952388651, + "grad_norm": 0.9179885983467102, + "learning_rate": 0.0002, + "loss": 0.5394, + "step": 33660 + }, + { + "epoch": 5.443375636569396, + "grad_norm": 1.2799493074417114, + "learning_rate": 0.0002, + "loss": 0.5124, + "step": 33670 + }, + { + "epoch": 5.444992320750141, + "grad_norm": 1.1153863668441772, + "learning_rate": 0.0002, + "loss": 0.5426, + "step": 33680 + }, + { + "epoch": 5.446609004930886, + "grad_norm": 1.0681028366088867, + "learning_rate": 0.0002, + "loss": 0.5087, + "step": 33690 + }, + { + "epoch": 5.448225689111632, + "grad_norm": 0.9788817167282104, + "learning_rate": 0.0002, + "loss": 0.5272, + "step": 33700 + }, + { + "epoch": 5.449842373292377, + "grad_norm": 0.8481608629226685, + "learning_rate": 0.0002, + "loss": 0.5308, + "step": 33710 + }, + { + "epoch": 5.451459057473123, + "grad_norm": 1.113756537437439, + "learning_rate": 0.0002, + "loss": 0.5225, + "step": 33720 + }, + { + "epoch": 5.453075741653868, + "grad_norm": 0.8425475358963013, + "learning_rate": 0.0002, + "loss": 0.5213, + "step": 33730 + }, + { + "epoch": 5.4546924258346134, + "grad_norm": 1.0852208137512207, + "learning_rate": 0.0002, + "loss": 0.571, + "step": 33740 + }, + { + "epoch": 5.456309110015359, + "grad_norm": 1.1664748191833496, + "learning_rate": 0.0002, + "loss": 0.5535, + "step": 33750 + }, + { + "epoch": 5.457925794196104, + "grad_norm": 1.217241644859314, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 33760 + }, + { + "epoch": 5.459542478376849, + "grad_norm": 1.1572928428649902, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 33770 + }, + { + "epoch": 5.461159162557594, + "grad_norm": 1.0437318086624146, + "learning_rate": 0.0002, + "loss": 0.5161, + "step": 33780 + }, + { + "epoch": 5.46277584673834, + "grad_norm": 0.9807571768760681, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 33790 + }, + { + "epoch": 5.464392530919085, + "grad_norm": 1.1436342000961304, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 33800 + }, + { + "epoch": 5.46600921509983, + "grad_norm": 1.1004794836044312, + "learning_rate": 0.0002, + "loss": 0.5338, + "step": 33810 + }, + { + "epoch": 5.467625899280575, + "grad_norm": 1.2130268812179565, + "learning_rate": 0.0002, + "loss": 0.4868, + "step": 33820 + }, + { + "epoch": 5.4692425834613205, + "grad_norm": 1.3154419660568237, + "learning_rate": 0.0002, + "loss": 0.516, + "step": 33830 + }, + { + "epoch": 5.470859267642066, + "grad_norm": 0.7934383749961853, + "learning_rate": 0.0002, + "loss": 0.4934, + "step": 33840 + }, + { + "epoch": 5.472475951822812, + "grad_norm": 0.7838410139083862, + "learning_rate": 0.0002, + "loss": 0.5133, + "step": 33850 + }, + { + "epoch": 5.474092636003557, + "grad_norm": 1.0415139198303223, + "learning_rate": 0.0002, + "loss": 0.4926, + "step": 33860 + }, + { + "epoch": 5.475709320184302, + "grad_norm": 0.9213164448738098, + "learning_rate": 0.0002, + "loss": 0.5323, + "step": 33870 + }, + { + "epoch": 5.477326004365048, + "grad_norm": 1.0364776849746704, + "learning_rate": 0.0002, + "loss": 0.5125, + "step": 33880 + }, + { + "epoch": 5.478942688545793, + "grad_norm": 0.9994072318077087, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 33890 + }, + { + "epoch": 5.480559372726538, + "grad_norm": 1.196730136871338, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 33900 + }, + { + "epoch": 5.482176056907283, + "grad_norm": 0.9955780506134033, + "learning_rate": 0.0002, + "loss": 0.538, + "step": 33910 + }, + { + "epoch": 5.4837927410880285, + "grad_norm": 1.168188214302063, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 33920 + }, + { + "epoch": 5.485409425268774, + "grad_norm": 1.1816450357437134, + "learning_rate": 0.0002, + "loss": 0.5548, + "step": 33930 + }, + { + "epoch": 5.487026109449519, + "grad_norm": 1.079715609550476, + "learning_rate": 0.0002, + "loss": 0.5535, + "step": 33940 + }, + { + "epoch": 5.488642793630264, + "grad_norm": 1.153850793838501, + "learning_rate": 0.0002, + "loss": 0.5262, + "step": 33950 + }, + { + "epoch": 5.490259477811009, + "grad_norm": 1.0207297801971436, + "learning_rate": 0.0002, + "loss": 0.5248, + "step": 33960 + }, + { + "epoch": 5.491876161991755, + "grad_norm": 1.1290855407714844, + "learning_rate": 0.0002, + "loss": 0.5142, + "step": 33970 + }, + { + "epoch": 5.4934928461725, + "grad_norm": 1.068058967590332, + "learning_rate": 0.0002, + "loss": 0.5168, + "step": 33980 + }, + { + "epoch": 5.495109530353245, + "grad_norm": 0.9789979457855225, + "learning_rate": 0.0002, + "loss": 0.5317, + "step": 33990 + }, + { + "epoch": 5.496726214533991, + "grad_norm": 0.9696692824363708, + "learning_rate": 0.0002, + "loss": 0.5113, + "step": 34000 + }, + { + "epoch": 5.4983428987147365, + "grad_norm": 1.0539981126785278, + "learning_rate": 0.0002, + "loss": 0.5413, + "step": 34010 + }, + { + "epoch": 5.499959582895482, + "grad_norm": 1.0249929428100586, + "learning_rate": 0.0002, + "loss": 0.5783, + "step": 34020 + }, + { + "epoch": 5.501576267076227, + "grad_norm": 0.9577504992485046, + "learning_rate": 0.0002, + "loss": 0.4888, + "step": 34030 + }, + { + "epoch": 5.503192951256972, + "grad_norm": 1.0963513851165771, + "learning_rate": 0.0002, + "loss": 0.5291, + "step": 34040 + }, + { + "epoch": 5.504809635437717, + "grad_norm": 0.8339345455169678, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 34050 + }, + { + "epoch": 5.506426319618463, + "grad_norm": 1.0138782262802124, + "learning_rate": 0.0002, + "loss": 0.5191, + "step": 34060 + }, + { + "epoch": 5.508043003799208, + "grad_norm": 1.0180109739303589, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 34070 + }, + { + "epoch": 5.509659687979953, + "grad_norm": 1.2790818214416504, + "learning_rate": 0.0002, + "loss": 0.5083, + "step": 34080 + }, + { + "epoch": 5.511276372160698, + "grad_norm": 1.428247332572937, + "learning_rate": 0.0002, + "loss": 0.5195, + "step": 34090 + }, + { + "epoch": 5.5128930563414436, + "grad_norm": 1.0926059484481812, + "learning_rate": 0.0002, + "loss": 0.5291, + "step": 34100 + }, + { + "epoch": 5.514509740522189, + "grad_norm": 1.2353343963623047, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 34110 + }, + { + "epoch": 5.516126424702934, + "grad_norm": 0.935587465763092, + "learning_rate": 0.0002, + "loss": 0.5331, + "step": 34120 + }, + { + "epoch": 5.517743108883679, + "grad_norm": 0.9767586588859558, + "learning_rate": 0.0002, + "loss": 0.5512, + "step": 34130 + }, + { + "epoch": 5.5193597930644245, + "grad_norm": 1.1660610437393188, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 34140 + }, + { + "epoch": 5.520976477245171, + "grad_norm": 0.9828870892524719, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 34150 + }, + { + "epoch": 5.522593161425916, + "grad_norm": 1.0097278356552124, + "learning_rate": 0.0002, + "loss": 0.5198, + "step": 34160 + }, + { + "epoch": 5.524209845606661, + "grad_norm": 1.1766167879104614, + "learning_rate": 0.0002, + "loss": 0.5293, + "step": 34170 + }, + { + "epoch": 5.525826529787406, + "grad_norm": 0.982292115688324, + "learning_rate": 0.0002, + "loss": 0.5258, + "step": 34180 + }, + { + "epoch": 5.5274432139681515, + "grad_norm": 1.0744609832763672, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 34190 + }, + { + "epoch": 5.529059898148897, + "grad_norm": 1.3831160068511963, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 34200 + }, + { + "epoch": 5.530676582329642, + "grad_norm": 1.074771761894226, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 34210 + }, + { + "epoch": 5.532293266510387, + "grad_norm": 1.016652226448059, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 34220 + }, + { + "epoch": 5.5339099506911325, + "grad_norm": 1.2231552600860596, + "learning_rate": 0.0002, + "loss": 0.5158, + "step": 34230 + }, + { + "epoch": 5.535526634871878, + "grad_norm": 0.8051198720932007, + "learning_rate": 0.0002, + "loss": 0.5091, + "step": 34240 + }, + { + "epoch": 5.537143319052623, + "grad_norm": 1.1779674291610718, + "learning_rate": 0.0002, + "loss": 0.5583, + "step": 34250 + }, + { + "epoch": 5.538760003233368, + "grad_norm": 1.2468291521072388, + "learning_rate": 0.0002, + "loss": 0.5044, + "step": 34260 + }, + { + "epoch": 5.540376687414113, + "grad_norm": 1.14818274974823, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 34270 + }, + { + "epoch": 5.541993371594859, + "grad_norm": 1.2362616062164307, + "learning_rate": 0.0002, + "loss": 0.5375, + "step": 34280 + }, + { + "epoch": 5.543610055775604, + "grad_norm": 1.0206977128982544, + "learning_rate": 0.0002, + "loss": 0.4996, + "step": 34290 + }, + { + "epoch": 5.54522673995635, + "grad_norm": 1.2018457651138306, + "learning_rate": 0.0002, + "loss": 0.5212, + "step": 34300 + }, + { + "epoch": 5.546843424137095, + "grad_norm": 1.0349043607711792, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 34310 + }, + { + "epoch": 5.54846010831784, + "grad_norm": 1.2022006511688232, + "learning_rate": 0.0002, + "loss": 0.5231, + "step": 34320 + }, + { + "epoch": 5.550076792498586, + "grad_norm": 1.0810624361038208, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 34330 + }, + { + "epoch": 5.551693476679331, + "grad_norm": 1.3297529220581055, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 34340 + }, + { + "epoch": 5.553310160860076, + "grad_norm": 0.9722549915313721, + "learning_rate": 0.0002, + "loss": 0.5321, + "step": 34350 + }, + { + "epoch": 5.554926845040821, + "grad_norm": 0.9903425574302673, + "learning_rate": 0.0002, + "loss": 0.4823, + "step": 34360 + }, + { + "epoch": 5.556543529221567, + "grad_norm": 0.9568067789077759, + "learning_rate": 0.0002, + "loss": 0.5601, + "step": 34370 + }, + { + "epoch": 5.558160213402312, + "grad_norm": 1.113870620727539, + "learning_rate": 0.0002, + "loss": 0.5242, + "step": 34380 + }, + { + "epoch": 5.559776897583057, + "grad_norm": 1.0557632446289062, + "learning_rate": 0.0002, + "loss": 0.5278, + "step": 34390 + }, + { + "epoch": 5.561393581763802, + "grad_norm": 0.9615673422813416, + "learning_rate": 0.0002, + "loss": 0.5501, + "step": 34400 + }, + { + "epoch": 5.5630102659445475, + "grad_norm": 0.9536027312278748, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 34410 + }, + { + "epoch": 5.564626950125293, + "grad_norm": 0.8808749318122864, + "learning_rate": 0.0002, + "loss": 0.4949, + "step": 34420 + }, + { + "epoch": 5.566243634306038, + "grad_norm": 1.286132574081421, + "learning_rate": 0.0002, + "loss": 0.5954, + "step": 34430 + }, + { + "epoch": 5.567860318486783, + "grad_norm": 1.259644865989685, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 34440 + }, + { + "epoch": 5.569477002667529, + "grad_norm": 0.9920216798782349, + "learning_rate": 0.0002, + "loss": 0.4922, + "step": 34450 + }, + { + "epoch": 5.5710936868482746, + "grad_norm": 1.182926893234253, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 34460 + }, + { + "epoch": 5.57271037102902, + "grad_norm": 1.1434749364852905, + "learning_rate": 0.0002, + "loss": 0.5185, + "step": 34470 + }, + { + "epoch": 5.574327055209765, + "grad_norm": 1.2420979738235474, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 34480 + }, + { + "epoch": 5.57594373939051, + "grad_norm": 0.9338384866714478, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 34490 + }, + { + "epoch": 5.5775604235712555, + "grad_norm": 1.0196425914764404, + "learning_rate": 0.0002, + "loss": 0.5634, + "step": 34500 + }, + { + "epoch": 5.579177107752001, + "grad_norm": 0.9586997032165527, + "learning_rate": 0.0002, + "loss": 0.5132, + "step": 34510 + }, + { + "epoch": 5.580793791932746, + "grad_norm": 1.2409086227416992, + "learning_rate": 0.0002, + "loss": 0.5336, + "step": 34520 + }, + { + "epoch": 5.582410476113491, + "grad_norm": 1.1483757495880127, + "learning_rate": 0.0002, + "loss": 0.5364, + "step": 34530 + }, + { + "epoch": 5.584027160294236, + "grad_norm": 1.1624305248260498, + "learning_rate": 0.0002, + "loss": 0.5325, + "step": 34540 + }, + { + "epoch": 5.585643844474982, + "grad_norm": 1.2635223865509033, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 34550 + }, + { + "epoch": 5.587260528655727, + "grad_norm": 0.9824051856994629, + "learning_rate": 0.0002, + "loss": 0.4924, + "step": 34560 + }, + { + "epoch": 5.588877212836472, + "grad_norm": 1.0858620405197144, + "learning_rate": 0.0002, + "loss": 0.5395, + "step": 34570 + }, + { + "epoch": 5.590493897017217, + "grad_norm": 1.1452655792236328, + "learning_rate": 0.0002, + "loss": 0.5459, + "step": 34580 + }, + { + "epoch": 5.592110581197963, + "grad_norm": 1.110610842704773, + "learning_rate": 0.0002, + "loss": 0.5746, + "step": 34590 + }, + { + "epoch": 5.593727265378709, + "grad_norm": 0.9976194500923157, + "learning_rate": 0.0002, + "loss": 0.5285, + "step": 34600 + }, + { + "epoch": 5.595343949559454, + "grad_norm": 1.0698920488357544, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 34610 + }, + { + "epoch": 5.596960633740199, + "grad_norm": 1.1505171060562134, + "learning_rate": 0.0002, + "loss": 0.5311, + "step": 34620 + }, + { + "epoch": 5.598577317920944, + "grad_norm": 1.1014643907546997, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 34630 + }, + { + "epoch": 5.60019400210169, + "grad_norm": 0.915595293045044, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 34640 + }, + { + "epoch": 5.601810686282435, + "grad_norm": 1.1856765747070312, + "learning_rate": 0.0002, + "loss": 0.5821, + "step": 34650 + }, + { + "epoch": 5.60342737046318, + "grad_norm": 1.1357687711715698, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 34660 + }, + { + "epoch": 5.605044054643925, + "grad_norm": 1.0232492685317993, + "learning_rate": 0.0002, + "loss": 0.5034, + "step": 34670 + }, + { + "epoch": 5.6066607388246705, + "grad_norm": 0.9375017881393433, + "learning_rate": 0.0002, + "loss": 0.5357, + "step": 34680 + }, + { + "epoch": 5.608277423005416, + "grad_norm": 1.0796529054641724, + "learning_rate": 0.0002, + "loss": 0.5518, + "step": 34690 + }, + { + "epoch": 5.609894107186161, + "grad_norm": 1.1383336782455444, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 34700 + }, + { + "epoch": 5.611510791366906, + "grad_norm": 1.0248544216156006, + "learning_rate": 0.0002, + "loss": 0.5477, + "step": 34710 + }, + { + "epoch": 5.6131274755476515, + "grad_norm": 1.0986040830612183, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 34720 + }, + { + "epoch": 5.614744159728397, + "grad_norm": 1.2689568996429443, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 34730 + }, + { + "epoch": 5.616360843909142, + "grad_norm": 1.4044264554977417, + "learning_rate": 0.0002, + "loss": 0.5136, + "step": 34740 + }, + { + "epoch": 5.617977528089888, + "grad_norm": 1.2084474563598633, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 34750 + }, + { + "epoch": 5.619594212270633, + "grad_norm": 1.061248540878296, + "learning_rate": 0.0002, + "loss": 0.5377, + "step": 34760 + }, + { + "epoch": 5.6212108964513785, + "grad_norm": 1.0220764875411987, + "learning_rate": 0.0002, + "loss": 0.5669, + "step": 34770 + }, + { + "epoch": 5.622827580632124, + "grad_norm": 1.0859092473983765, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 34780 + }, + { + "epoch": 5.624444264812869, + "grad_norm": 0.9049732089042664, + "learning_rate": 0.0002, + "loss": 0.5308, + "step": 34790 + }, + { + "epoch": 5.626060948993614, + "grad_norm": 1.2103937864303589, + "learning_rate": 0.0002, + "loss": 0.5433, + "step": 34800 + }, + { + "epoch": 5.627677633174359, + "grad_norm": 0.9854230284690857, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 34810 + }, + { + "epoch": 5.629294317355105, + "grad_norm": 0.9316635131835938, + "learning_rate": 0.0002, + "loss": 0.5274, + "step": 34820 + }, + { + "epoch": 5.63091100153585, + "grad_norm": 1.105296015739441, + "learning_rate": 0.0002, + "loss": 0.5393, + "step": 34830 + }, + { + "epoch": 5.632527685716595, + "grad_norm": 0.993383526802063, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 34840 + }, + { + "epoch": 5.63414436989734, + "grad_norm": 1.1544116735458374, + "learning_rate": 0.0002, + "loss": 0.5375, + "step": 34850 + }, + { + "epoch": 5.635761054078086, + "grad_norm": 1.284475326538086, + "learning_rate": 0.0002, + "loss": 0.5448, + "step": 34860 + }, + { + "epoch": 5.637377738258831, + "grad_norm": 1.121997594833374, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 34870 + }, + { + "epoch": 5.638994422439576, + "grad_norm": 1.213040828704834, + "learning_rate": 0.0002, + "loss": 0.5335, + "step": 34880 + }, + { + "epoch": 5.640611106620321, + "grad_norm": 1.23222017288208, + "learning_rate": 0.0002, + "loss": 0.5623, + "step": 34890 + }, + { + "epoch": 5.642227790801067, + "grad_norm": 0.9793637990951538, + "learning_rate": 0.0002, + "loss": 0.5622, + "step": 34900 + }, + { + "epoch": 5.643844474981813, + "grad_norm": 1.38919997215271, + "learning_rate": 0.0002, + "loss": 0.5405, + "step": 34910 + }, + { + "epoch": 5.645461159162558, + "grad_norm": 0.8390951156616211, + "learning_rate": 0.0002, + "loss": 0.5007, + "step": 34920 + }, + { + "epoch": 5.647077843343303, + "grad_norm": 0.9465909004211426, + "learning_rate": 0.0002, + "loss": 0.5974, + "step": 34930 + }, + { + "epoch": 5.648694527524048, + "grad_norm": 1.066957712173462, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 34940 + }, + { + "epoch": 5.650311211704794, + "grad_norm": 0.9842154383659363, + "learning_rate": 0.0002, + "loss": 0.5513, + "step": 34950 + }, + { + "epoch": 5.651927895885539, + "grad_norm": 1.1766440868377686, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 34960 + }, + { + "epoch": 5.653544580066284, + "grad_norm": 0.9061306118965149, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 34970 + }, + { + "epoch": 5.655161264247029, + "grad_norm": 1.2941309213638306, + "learning_rate": 0.0002, + "loss": 0.5446, + "step": 34980 + }, + { + "epoch": 5.6567779484277745, + "grad_norm": 0.9741247892379761, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 34990 + }, + { + "epoch": 5.65839463260852, + "grad_norm": 1.0784187316894531, + "learning_rate": 0.0002, + "loss": 0.5152, + "step": 35000 + }, + { + "epoch": 5.660011316789265, + "grad_norm": 0.937889814376831, + "learning_rate": 0.0002, + "loss": 0.5363, + "step": 35010 + }, + { + "epoch": 5.66162800097001, + "grad_norm": 0.9667879939079285, + "learning_rate": 0.0002, + "loss": 0.5019, + "step": 35020 + }, + { + "epoch": 5.663244685150756, + "grad_norm": 1.0554876327514648, + "learning_rate": 0.0002, + "loss": 0.5209, + "step": 35030 + }, + { + "epoch": 5.664861369331501, + "grad_norm": 1.2030539512634277, + "learning_rate": 0.0002, + "loss": 0.523, + "step": 35040 + }, + { + "epoch": 5.666478053512247, + "grad_norm": 1.0849953889846802, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 35050 + }, + { + "epoch": 5.668094737692992, + "grad_norm": 1.1598973274230957, + "learning_rate": 0.0002, + "loss": 0.5747, + "step": 35060 + }, + { + "epoch": 5.669711421873737, + "grad_norm": 1.0233359336853027, + "learning_rate": 0.0002, + "loss": 0.5488, + "step": 35070 + }, + { + "epoch": 5.6713281060544825, + "grad_norm": 1.1124799251556396, + "learning_rate": 0.0002, + "loss": 0.5409, + "step": 35080 + }, + { + "epoch": 5.672944790235228, + "grad_norm": 1.2351475954055786, + "learning_rate": 0.0002, + "loss": 0.5578, + "step": 35090 + }, + { + "epoch": 5.674561474415973, + "grad_norm": 1.0240728855133057, + "learning_rate": 0.0002, + "loss": 0.5638, + "step": 35100 + }, + { + "epoch": 5.676178158596718, + "grad_norm": 1.0223692655563354, + "learning_rate": 0.0002, + "loss": 0.5192, + "step": 35110 + }, + { + "epoch": 5.677794842777463, + "grad_norm": 1.4569132328033447, + "learning_rate": 0.0002, + "loss": 0.524, + "step": 35120 + }, + { + "epoch": 5.679411526958209, + "grad_norm": 0.8983587026596069, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 35130 + }, + { + "epoch": 5.681028211138954, + "grad_norm": 1.0775383710861206, + "learning_rate": 0.0002, + "loss": 0.5439, + "step": 35140 + }, + { + "epoch": 5.682644895319699, + "grad_norm": 0.9800270795822144, + "learning_rate": 0.0002, + "loss": 0.5289, + "step": 35150 + }, + { + "epoch": 5.684261579500444, + "grad_norm": 0.9858237504959106, + "learning_rate": 0.0002, + "loss": 0.533, + "step": 35160 + }, + { + "epoch": 5.6858782636811895, + "grad_norm": 1.031087040901184, + "learning_rate": 0.0002, + "loss": 0.5671, + "step": 35170 + }, + { + "epoch": 5.687494947861936, + "grad_norm": 1.0294365882873535, + "learning_rate": 0.0002, + "loss": 0.5528, + "step": 35180 + }, + { + "epoch": 5.68911163204268, + "grad_norm": 1.108144760131836, + "learning_rate": 0.0002, + "loss": 0.5581, + "step": 35190 + }, + { + "epoch": 5.690728316223426, + "grad_norm": 1.0813100337982178, + "learning_rate": 0.0002, + "loss": 0.5373, + "step": 35200 + }, + { + "epoch": 5.692345000404171, + "grad_norm": 1.3146867752075195, + "learning_rate": 0.0002, + "loss": 0.5429, + "step": 35210 + }, + { + "epoch": 5.693961684584917, + "grad_norm": 1.16780424118042, + "learning_rate": 0.0002, + "loss": 0.5297, + "step": 35220 + }, + { + "epoch": 5.695578368765662, + "grad_norm": 0.9929125905036926, + "learning_rate": 0.0002, + "loss": 0.577, + "step": 35230 + }, + { + "epoch": 5.697195052946407, + "grad_norm": 0.9049441814422607, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 35240 + }, + { + "epoch": 5.698811737127152, + "grad_norm": 0.9768866300582886, + "learning_rate": 0.0002, + "loss": 0.5349, + "step": 35250 + }, + { + "epoch": 5.7004284213078975, + "grad_norm": 0.8306029438972473, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 35260 + }, + { + "epoch": 5.702045105488643, + "grad_norm": 0.8417280316352844, + "learning_rate": 0.0002, + "loss": 0.4771, + "step": 35270 + }, + { + "epoch": 5.703661789669388, + "grad_norm": 0.9954485893249512, + "learning_rate": 0.0002, + "loss": 0.574, + "step": 35280 + }, + { + "epoch": 5.705278473850133, + "grad_norm": 1.2417993545532227, + "learning_rate": 0.0002, + "loss": 0.5469, + "step": 35290 + }, + { + "epoch": 5.706895158030878, + "grad_norm": 1.1696544885635376, + "learning_rate": 0.0002, + "loss": 0.5275, + "step": 35300 + }, + { + "epoch": 5.708511842211624, + "grad_norm": 1.2424817085266113, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 35310 + }, + { + "epoch": 5.710128526392369, + "grad_norm": 1.1791106462478638, + "learning_rate": 0.0002, + "loss": 0.5595, + "step": 35320 + }, + { + "epoch": 5.711745210573115, + "grad_norm": 1.202181339263916, + "learning_rate": 0.0002, + "loss": 0.5076, + "step": 35330 + }, + { + "epoch": 5.713361894753859, + "grad_norm": 1.1006861925125122, + "learning_rate": 0.0002, + "loss": 0.5847, + "step": 35340 + }, + { + "epoch": 5.7149785789346055, + "grad_norm": 1.0918344259262085, + "learning_rate": 0.0002, + "loss": 0.5627, + "step": 35350 + }, + { + "epoch": 5.716595263115351, + "grad_norm": 1.0427305698394775, + "learning_rate": 0.0002, + "loss": 0.5677, + "step": 35360 + }, + { + "epoch": 5.718211947296096, + "grad_norm": 1.0818872451782227, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 35370 + }, + { + "epoch": 5.719828631476841, + "grad_norm": 1.186006784439087, + "learning_rate": 0.0002, + "loss": 0.5296, + "step": 35380 + }, + { + "epoch": 5.721445315657586, + "grad_norm": 1.2073674201965332, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 35390 + }, + { + "epoch": 5.723061999838332, + "grad_norm": 1.065338134765625, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 35400 + }, + { + "epoch": 5.724678684019077, + "grad_norm": 0.9448973536491394, + "learning_rate": 0.0002, + "loss": 0.5195, + "step": 35410 + }, + { + "epoch": 5.726295368199822, + "grad_norm": 1.1487499475479126, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 35420 + }, + { + "epoch": 5.727912052380567, + "grad_norm": 1.1334216594696045, + "learning_rate": 0.0002, + "loss": 0.5435, + "step": 35430 + }, + { + "epoch": 5.729528736561313, + "grad_norm": 1.1932826042175293, + "learning_rate": 0.0002, + "loss": 0.5074, + "step": 35440 + }, + { + "epoch": 5.731145420742058, + "grad_norm": 1.2615786790847778, + "learning_rate": 0.0002, + "loss": 0.5502, + "step": 35450 + }, + { + "epoch": 5.732762104922803, + "grad_norm": 1.2803694009780884, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 35460 + }, + { + "epoch": 5.734378789103548, + "grad_norm": 0.9271906614303589, + "learning_rate": 0.0002, + "loss": 0.5458, + "step": 35470 + }, + { + "epoch": 5.735995473284294, + "grad_norm": 1.0958917140960693, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 35480 + }, + { + "epoch": 5.737612157465039, + "grad_norm": 1.1072784662246704, + "learning_rate": 0.0002, + "loss": 0.538, + "step": 35490 + }, + { + "epoch": 5.739228841645785, + "grad_norm": 1.1641002893447876, + "learning_rate": 0.0002, + "loss": 0.5683, + "step": 35500 + }, + { + "epoch": 5.74084552582653, + "grad_norm": 1.0246447324752808, + "learning_rate": 0.0002, + "loss": 0.5252, + "step": 35510 + }, + { + "epoch": 5.742462210007275, + "grad_norm": 1.032474398612976, + "learning_rate": 0.0002, + "loss": 0.55, + "step": 35520 + }, + { + "epoch": 5.7440788941880205, + "grad_norm": 1.1600854396820068, + "learning_rate": 0.0002, + "loss": 0.4965, + "step": 35530 + }, + { + "epoch": 5.745695578368766, + "grad_norm": 1.0686054229736328, + "learning_rate": 0.0002, + "loss": 0.5543, + "step": 35540 + }, + { + "epoch": 5.747312262549511, + "grad_norm": 1.2314637899398804, + "learning_rate": 0.0002, + "loss": 0.5706, + "step": 35550 + }, + { + "epoch": 5.748928946730256, + "grad_norm": 0.922134280204773, + "learning_rate": 0.0002, + "loss": 0.5492, + "step": 35560 + }, + { + "epoch": 5.7505456309110015, + "grad_norm": 0.933043360710144, + "learning_rate": 0.0002, + "loss": 0.5495, + "step": 35570 + }, + { + "epoch": 5.752162315091747, + "grad_norm": 1.1911931037902832, + "learning_rate": 0.0002, + "loss": 0.5007, + "step": 35580 + }, + { + "epoch": 5.753778999272492, + "grad_norm": 0.8984857797622681, + "learning_rate": 0.0002, + "loss": 0.5244, + "step": 35590 + }, + { + "epoch": 5.755395683453237, + "grad_norm": 0.9495107531547546, + "learning_rate": 0.0002, + "loss": 0.5493, + "step": 35600 + }, + { + "epoch": 5.757012367633982, + "grad_norm": 1.2805472612380981, + "learning_rate": 0.0002, + "loss": 0.5326, + "step": 35610 + }, + { + "epoch": 5.758629051814728, + "grad_norm": 1.1236625909805298, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 35620 + }, + { + "epoch": 5.760245735995474, + "grad_norm": 1.0552798509597778, + "learning_rate": 0.0002, + "loss": 0.6102, + "step": 35630 + }, + { + "epoch": 5.761862420176218, + "grad_norm": 1.119909644126892, + "learning_rate": 0.0002, + "loss": 0.5479, + "step": 35640 + }, + { + "epoch": 5.763479104356964, + "grad_norm": 0.8786116242408752, + "learning_rate": 0.0002, + "loss": 0.5282, + "step": 35650 + }, + { + "epoch": 5.765095788537709, + "grad_norm": 1.2417117357254028, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 35660 + }, + { + "epoch": 5.766712472718455, + "grad_norm": 1.255200982093811, + "learning_rate": 0.0002, + "loss": 0.537, + "step": 35670 + }, + { + "epoch": 5.7683291568992, + "grad_norm": 1.0611358880996704, + "learning_rate": 0.0002, + "loss": 0.5308, + "step": 35680 + }, + { + "epoch": 5.769945841079945, + "grad_norm": 1.1443911790847778, + "learning_rate": 0.0002, + "loss": 0.5614, + "step": 35690 + }, + { + "epoch": 5.77156252526069, + "grad_norm": 1.1437989473342896, + "learning_rate": 0.0002, + "loss": 0.5386, + "step": 35700 + }, + { + "epoch": 5.773179209441436, + "grad_norm": 1.1375046968460083, + "learning_rate": 0.0002, + "loss": 0.537, + "step": 35710 + }, + { + "epoch": 5.774795893622181, + "grad_norm": 1.0777729749679565, + "learning_rate": 0.0002, + "loss": 0.5198, + "step": 35720 + }, + { + "epoch": 5.776412577802926, + "grad_norm": 1.1160215139389038, + "learning_rate": 0.0002, + "loss": 0.5521, + "step": 35730 + }, + { + "epoch": 5.778029261983671, + "grad_norm": 1.1268514394760132, + "learning_rate": 0.0002, + "loss": 0.5569, + "step": 35740 + }, + { + "epoch": 5.7796459461644165, + "grad_norm": 1.2752262353897095, + "learning_rate": 0.0002, + "loss": 0.5311, + "step": 35750 + }, + { + "epoch": 5.781262630345162, + "grad_norm": 1.0416184663772583, + "learning_rate": 0.0002, + "loss": 0.5625, + "step": 35760 + }, + { + "epoch": 5.782879314525907, + "grad_norm": 1.0622444152832031, + "learning_rate": 0.0002, + "loss": 0.5438, + "step": 35770 + }, + { + "epoch": 5.784495998706653, + "grad_norm": 1.1217877864837646, + "learning_rate": 0.0002, + "loss": 0.5268, + "step": 35780 + }, + { + "epoch": 5.786112682887398, + "grad_norm": 0.9363139867782593, + "learning_rate": 0.0002, + "loss": 0.5225, + "step": 35790 + }, + { + "epoch": 5.787729367068144, + "grad_norm": 0.96628737449646, + "learning_rate": 0.0002, + "loss": 0.5524, + "step": 35800 + }, + { + "epoch": 5.789346051248889, + "grad_norm": 0.9572572112083435, + "learning_rate": 0.0002, + "loss": 0.52, + "step": 35810 + }, + { + "epoch": 5.790962735429634, + "grad_norm": 0.938724935054779, + "learning_rate": 0.0002, + "loss": 0.5615, + "step": 35820 + }, + { + "epoch": 5.792579419610379, + "grad_norm": 1.3314417600631714, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 35830 + }, + { + "epoch": 5.7941961037911245, + "grad_norm": 1.0097602605819702, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 35840 + }, + { + "epoch": 5.79581278797187, + "grad_norm": 1.1265122890472412, + "learning_rate": 0.0002, + "loss": 0.591, + "step": 35850 + }, + { + "epoch": 5.797429472152615, + "grad_norm": 1.2191909551620483, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 35860 + }, + { + "epoch": 5.79904615633336, + "grad_norm": 0.9690808057785034, + "learning_rate": 0.0002, + "loss": 0.5274, + "step": 35870 + }, + { + "epoch": 5.800662840514105, + "grad_norm": 1.0871665477752686, + "learning_rate": 0.0002, + "loss": 0.5425, + "step": 35880 + }, + { + "epoch": 5.802279524694851, + "grad_norm": 1.1093597412109375, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 35890 + }, + { + "epoch": 5.803896208875596, + "grad_norm": 1.2434282302856445, + "learning_rate": 0.0002, + "loss": 0.5475, + "step": 35900 + }, + { + "epoch": 5.805512893056341, + "grad_norm": 1.2933623790740967, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 35910 + }, + { + "epoch": 5.807129577237086, + "grad_norm": 1.0005441904067993, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 35920 + }, + { + "epoch": 5.8087462614178325, + "grad_norm": 1.2373108863830566, + "learning_rate": 0.0002, + "loss": 0.5318, + "step": 35930 + }, + { + "epoch": 5.810362945598578, + "grad_norm": 1.2622692584991455, + "learning_rate": 0.0002, + "loss": 0.5413, + "step": 35940 + }, + { + "epoch": 5.811979629779323, + "grad_norm": 1.0112963914871216, + "learning_rate": 0.0002, + "loss": 0.5558, + "step": 35950 + }, + { + "epoch": 5.813596313960068, + "grad_norm": 1.050572395324707, + "learning_rate": 0.0002, + "loss": 0.5115, + "step": 35960 + }, + { + "epoch": 5.815212998140813, + "grad_norm": 0.9774560928344727, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 35970 + }, + { + "epoch": 5.816829682321559, + "grad_norm": 1.19438898563385, + "learning_rate": 0.0002, + "loss": 0.585, + "step": 35980 + }, + { + "epoch": 5.818446366502304, + "grad_norm": 1.0267130136489868, + "learning_rate": 0.0002, + "loss": 0.5798, + "step": 35990 + }, + { + "epoch": 5.820063050683049, + "grad_norm": 0.9813851714134216, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 36000 + }, + { + "epoch": 5.821679734863794, + "grad_norm": 0.9177457094192505, + "learning_rate": 0.0002, + "loss": 0.5138, + "step": 36010 + }, + { + "epoch": 5.8232964190445395, + "grad_norm": 1.0020731687545776, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 36020 + }, + { + "epoch": 5.824913103225285, + "grad_norm": 1.073222041130066, + "learning_rate": 0.0002, + "loss": 0.5646, + "step": 36030 + }, + { + "epoch": 5.82652978740603, + "grad_norm": 1.016337513923645, + "learning_rate": 0.0002, + "loss": 0.5539, + "step": 36040 + }, + { + "epoch": 5.828146471586775, + "grad_norm": 1.267364263534546, + "learning_rate": 0.0002, + "loss": 0.5592, + "step": 36050 + }, + { + "epoch": 5.8297631557675205, + "grad_norm": 1.2730127573013306, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 36060 + }, + { + "epoch": 5.831379839948266, + "grad_norm": 1.108442783355713, + "learning_rate": 0.0002, + "loss": 0.5247, + "step": 36070 + }, + { + "epoch": 5.832996524129012, + "grad_norm": 1.198072075843811, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 36080 + }, + { + "epoch": 5.834613208309757, + "grad_norm": 1.0458786487579346, + "learning_rate": 0.0002, + "loss": 0.5479, + "step": 36090 + }, + { + "epoch": 5.836229892490502, + "grad_norm": 0.9096664786338806, + "learning_rate": 0.0002, + "loss": 0.5564, + "step": 36100 + }, + { + "epoch": 5.8378465766712475, + "grad_norm": 0.9957793951034546, + "learning_rate": 0.0002, + "loss": 0.5602, + "step": 36110 + }, + { + "epoch": 5.839463260851993, + "grad_norm": 1.3693058490753174, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 36120 + }, + { + "epoch": 5.841079945032738, + "grad_norm": 1.268608808517456, + "learning_rate": 0.0002, + "loss": 0.5425, + "step": 36130 + }, + { + "epoch": 5.842696629213483, + "grad_norm": 0.8516020178794861, + "learning_rate": 0.0002, + "loss": 0.5653, + "step": 36140 + }, + { + "epoch": 5.844313313394228, + "grad_norm": 0.90385502576828, + "learning_rate": 0.0002, + "loss": 0.5475, + "step": 36150 + }, + { + "epoch": 5.845929997574974, + "grad_norm": 1.0910571813583374, + "learning_rate": 0.0002, + "loss": 0.5274, + "step": 36160 + }, + { + "epoch": 5.847546681755719, + "grad_norm": 0.9417795538902283, + "learning_rate": 0.0002, + "loss": 0.555, + "step": 36170 + }, + { + "epoch": 5.849163365936464, + "grad_norm": 1.0027360916137695, + "learning_rate": 0.0002, + "loss": 0.5784, + "step": 36180 + }, + { + "epoch": 5.850780050117209, + "grad_norm": 1.1480516195297241, + "learning_rate": 0.0002, + "loss": 0.5423, + "step": 36190 + }, + { + "epoch": 5.852396734297955, + "grad_norm": 1.2431457042694092, + "learning_rate": 0.0002, + "loss": 0.5517, + "step": 36200 + }, + { + "epoch": 5.8540134184787, + "grad_norm": 1.091465950012207, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 36210 + }, + { + "epoch": 5.855630102659445, + "grad_norm": 0.9693930745124817, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 36220 + }, + { + "epoch": 5.857246786840191, + "grad_norm": 0.9937465190887451, + "learning_rate": 0.0002, + "loss": 0.5453, + "step": 36230 + }, + { + "epoch": 5.858863471020936, + "grad_norm": 1.0731011629104614, + "learning_rate": 0.0002, + "loss": 0.5621, + "step": 36240 + }, + { + "epoch": 5.860480155201682, + "grad_norm": 1.0869048833847046, + "learning_rate": 0.0002, + "loss": 0.5687, + "step": 36250 + }, + { + "epoch": 5.862096839382427, + "grad_norm": 0.9226390719413757, + "learning_rate": 0.0002, + "loss": 0.5576, + "step": 36260 + }, + { + "epoch": 5.863713523563172, + "grad_norm": 1.1755430698394775, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 36270 + }, + { + "epoch": 5.865330207743917, + "grad_norm": 0.8815974593162537, + "learning_rate": 0.0002, + "loss": 0.558, + "step": 36280 + }, + { + "epoch": 5.866946891924663, + "grad_norm": 1.3648751974105835, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 36290 + }, + { + "epoch": 5.868563576105408, + "grad_norm": 0.8729211091995239, + "learning_rate": 0.0002, + "loss": 0.536, + "step": 36300 + }, + { + "epoch": 5.870180260286153, + "grad_norm": 1.0870907306671143, + "learning_rate": 0.0002, + "loss": 0.5192, + "step": 36310 + }, + { + "epoch": 5.871796944466898, + "grad_norm": 1.1164259910583496, + "learning_rate": 0.0002, + "loss": 0.5609, + "step": 36320 + }, + { + "epoch": 5.8734136286476435, + "grad_norm": 1.1572535037994385, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 36330 + }, + { + "epoch": 5.875030312828389, + "grad_norm": 1.0456238985061646, + "learning_rate": 0.0002, + "loss": 0.5898, + "step": 36340 + }, + { + "epoch": 5.876646997009134, + "grad_norm": 1.1310722827911377, + "learning_rate": 0.0002, + "loss": 0.5008, + "step": 36350 + }, + { + "epoch": 5.878263681189879, + "grad_norm": 1.0004712343215942, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 36360 + }, + { + "epoch": 5.879880365370624, + "grad_norm": 1.0991777181625366, + "learning_rate": 0.0002, + "loss": 0.5632, + "step": 36370 + }, + { + "epoch": 5.8814970495513705, + "grad_norm": 1.2789239883422852, + "learning_rate": 0.0002, + "loss": 0.5815, + "step": 36380 + }, + { + "epoch": 5.883113733732116, + "grad_norm": 0.9524819850921631, + "learning_rate": 0.0002, + "loss": 0.56, + "step": 36390 + }, + { + "epoch": 5.884730417912861, + "grad_norm": 1.1115771532058716, + "learning_rate": 0.0002, + "loss": 0.5701, + "step": 36400 + }, + { + "epoch": 5.886347102093606, + "grad_norm": 1.37419855594635, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 36410 + }, + { + "epoch": 5.8879637862743515, + "grad_norm": 1.1449527740478516, + "learning_rate": 0.0002, + "loss": 0.5675, + "step": 36420 + }, + { + "epoch": 5.889580470455097, + "grad_norm": 1.198046326637268, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 36430 + }, + { + "epoch": 5.891197154635842, + "grad_norm": 1.0180530548095703, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 36440 + }, + { + "epoch": 5.892813838816587, + "grad_norm": 1.0516417026519775, + "learning_rate": 0.0002, + "loss": 0.5319, + "step": 36450 + }, + { + "epoch": 5.894430522997332, + "grad_norm": 1.1658052206039429, + "learning_rate": 0.0002, + "loss": 0.5782, + "step": 36460 + }, + { + "epoch": 5.896047207178078, + "grad_norm": 1.190699577331543, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 36470 + }, + { + "epoch": 5.897663891358823, + "grad_norm": 1.1235495805740356, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 36480 + }, + { + "epoch": 5.899280575539568, + "grad_norm": 1.1926926374435425, + "learning_rate": 0.0002, + "loss": 0.5284, + "step": 36490 + }, + { + "epoch": 5.900897259720313, + "grad_norm": 1.1184662580490112, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 36500 + }, + { + "epoch": 5.9025139439010585, + "grad_norm": 1.000970721244812, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 36510 + }, + { + "epoch": 5.904130628081804, + "grad_norm": 1.0373306274414062, + "learning_rate": 0.0002, + "loss": 0.5351, + "step": 36520 + }, + { + "epoch": 5.90574731226255, + "grad_norm": 1.0840669870376587, + "learning_rate": 0.0002, + "loss": 0.535, + "step": 36530 + }, + { + "epoch": 5.907363996443295, + "grad_norm": 0.9908381104469299, + "learning_rate": 0.0002, + "loss": 0.538, + "step": 36540 + }, + { + "epoch": 5.90898068062404, + "grad_norm": 1.0456029176712036, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 36550 + }, + { + "epoch": 5.910597364804786, + "grad_norm": 1.1381454467773438, + "learning_rate": 0.0002, + "loss": 0.5693, + "step": 36560 + }, + { + "epoch": 5.912214048985531, + "grad_norm": 0.9440900087356567, + "learning_rate": 0.0002, + "loss": 0.5473, + "step": 36570 + }, + { + "epoch": 5.913830733166276, + "grad_norm": 1.1674573421478271, + "learning_rate": 0.0002, + "loss": 0.5542, + "step": 36580 + }, + { + "epoch": 5.915447417347021, + "grad_norm": 1.1226966381072998, + "learning_rate": 0.0002, + "loss": 0.526, + "step": 36590 + }, + { + "epoch": 5.9170641015277665, + "grad_norm": 0.9696915745735168, + "learning_rate": 0.0002, + "loss": 0.6091, + "step": 36600 + }, + { + "epoch": 5.918680785708512, + "grad_norm": 0.9593005180358887, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 36610 + }, + { + "epoch": 5.920297469889257, + "grad_norm": 1.122169852256775, + "learning_rate": 0.0002, + "loss": 0.5536, + "step": 36620 + }, + { + "epoch": 5.921914154070002, + "grad_norm": 0.9923415780067444, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 36630 + }, + { + "epoch": 5.923530838250747, + "grad_norm": 1.063838005065918, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 36640 + }, + { + "epoch": 5.925147522431493, + "grad_norm": 0.9083505272865295, + "learning_rate": 0.0002, + "loss": 0.5799, + "step": 36650 + }, + { + "epoch": 5.926764206612239, + "grad_norm": 0.9439437985420227, + "learning_rate": 0.0002, + "loss": 0.5264, + "step": 36660 + }, + { + "epoch": 5.928380890792983, + "grad_norm": 0.9778534173965454, + "learning_rate": 0.0002, + "loss": 0.5891, + "step": 36670 + }, + { + "epoch": 5.929997574973729, + "grad_norm": 0.9723961353302002, + "learning_rate": 0.0002, + "loss": 0.566, + "step": 36680 + }, + { + "epoch": 5.9316142591544745, + "grad_norm": 1.162333607673645, + "learning_rate": 0.0002, + "loss": 0.5741, + "step": 36690 + }, + { + "epoch": 5.93323094333522, + "grad_norm": 1.2784897089004517, + "learning_rate": 0.0002, + "loss": 0.5771, + "step": 36700 + }, + { + "epoch": 5.934847627515965, + "grad_norm": 1.0924867391586304, + "learning_rate": 0.0002, + "loss": 0.5343, + "step": 36710 + }, + { + "epoch": 5.93646431169671, + "grad_norm": 1.046922206878662, + "learning_rate": 0.0002, + "loss": 0.5554, + "step": 36720 + }, + { + "epoch": 5.938080995877455, + "grad_norm": 0.8632535338401794, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 36730 + }, + { + "epoch": 5.939697680058201, + "grad_norm": 1.358762502670288, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 36740 + }, + { + "epoch": 5.941314364238946, + "grad_norm": 1.2058624029159546, + "learning_rate": 0.0002, + "loss": 0.551, + "step": 36750 + }, + { + "epoch": 5.942931048419691, + "grad_norm": 1.1396408081054688, + "learning_rate": 0.0002, + "loss": 0.5462, + "step": 36760 + }, + { + "epoch": 5.944547732600436, + "grad_norm": 1.1510354280471802, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 36770 + }, + { + "epoch": 5.946164416781182, + "grad_norm": 1.1401607990264893, + "learning_rate": 0.0002, + "loss": 0.5659, + "step": 36780 + }, + { + "epoch": 5.947781100961927, + "grad_norm": 1.1871325969696045, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 36790 + }, + { + "epoch": 5.949397785142672, + "grad_norm": 0.9928333163261414, + "learning_rate": 0.0002, + "loss": 0.4945, + "step": 36800 + }, + { + "epoch": 5.951014469323418, + "grad_norm": 1.0549445152282715, + "learning_rate": 0.0002, + "loss": 0.5303, + "step": 36810 + }, + { + "epoch": 5.9526311535041625, + "grad_norm": 0.9791563749313354, + "learning_rate": 0.0002, + "loss": 0.5532, + "step": 36820 + }, + { + "epoch": 5.954247837684909, + "grad_norm": 1.1268441677093506, + "learning_rate": 0.0002, + "loss": 0.5317, + "step": 36830 + }, + { + "epoch": 5.955864521865654, + "grad_norm": 1.0533992052078247, + "learning_rate": 0.0002, + "loss": 0.5585, + "step": 36840 + }, + { + "epoch": 5.957481206046399, + "grad_norm": 1.023358941078186, + "learning_rate": 0.0002, + "loss": 0.4972, + "step": 36850 + }, + { + "epoch": 5.959097890227144, + "grad_norm": 1.2631961107254028, + "learning_rate": 0.0002, + "loss": 0.5557, + "step": 36860 + }, + { + "epoch": 5.9607145744078895, + "grad_norm": 0.9397698640823364, + "learning_rate": 0.0002, + "loss": 0.5662, + "step": 36870 + }, + { + "epoch": 5.962331258588635, + "grad_norm": 1.1678427457809448, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 36880 + }, + { + "epoch": 5.96394794276938, + "grad_norm": 1.1403759717941284, + "learning_rate": 0.0002, + "loss": 0.5435, + "step": 36890 + }, + { + "epoch": 5.965564626950125, + "grad_norm": 1.030572772026062, + "learning_rate": 0.0002, + "loss": 0.5479, + "step": 36900 + }, + { + "epoch": 5.9671813111308705, + "grad_norm": 1.0992497205734253, + "learning_rate": 0.0002, + "loss": 0.5838, + "step": 36910 + }, + { + "epoch": 5.968797995311616, + "grad_norm": 1.075466275215149, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 36920 + }, + { + "epoch": 5.970414679492361, + "grad_norm": 1.0153694152832031, + "learning_rate": 0.0002, + "loss": 0.5739, + "step": 36930 + }, + { + "epoch": 5.972031363673106, + "grad_norm": 0.973193883895874, + "learning_rate": 0.0002, + "loss": 0.5672, + "step": 36940 + }, + { + "epoch": 5.973648047853851, + "grad_norm": 0.8294678926467896, + "learning_rate": 0.0002, + "loss": 0.5585, + "step": 36950 + }, + { + "epoch": 5.9752647320345975, + "grad_norm": 1.0048716068267822, + "learning_rate": 0.0002, + "loss": 0.5631, + "step": 36960 + }, + { + "epoch": 5.976881416215342, + "grad_norm": 0.9714070558547974, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 36970 + }, + { + "epoch": 5.978498100396088, + "grad_norm": 0.8667682409286499, + "learning_rate": 0.0002, + "loss": 0.5419, + "step": 36980 + }, + { + "epoch": 5.980114784576833, + "grad_norm": 1.0461409091949463, + "learning_rate": 0.0002, + "loss": 0.5474, + "step": 36990 + }, + { + "epoch": 5.981731468757578, + "grad_norm": 0.9229754209518433, + "learning_rate": 0.0002, + "loss": 0.5454, + "step": 37000 + }, + { + "epoch": 5.983348152938324, + "grad_norm": 1.0406876802444458, + "learning_rate": 0.0002, + "loss": 0.5599, + "step": 37010 + }, + { + "epoch": 5.984964837119069, + "grad_norm": 0.8993828296661377, + "learning_rate": 0.0002, + "loss": 0.5569, + "step": 37020 + }, + { + "epoch": 5.986581521299814, + "grad_norm": 1.2260479927062988, + "learning_rate": 0.0002, + "loss": 0.5611, + "step": 37030 + }, + { + "epoch": 5.988198205480559, + "grad_norm": 1.0107380151748657, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 37040 + }, + { + "epoch": 5.989814889661305, + "grad_norm": 1.0240139961242676, + "learning_rate": 0.0002, + "loss": 0.5639, + "step": 37050 + }, + { + "epoch": 5.99143157384205, + "grad_norm": 1.0185275077819824, + "learning_rate": 0.0002, + "loss": 0.5209, + "step": 37060 + }, + { + "epoch": 5.993048258022795, + "grad_norm": 1.1361802816390991, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 37070 + }, + { + "epoch": 5.99466494220354, + "grad_norm": 1.0395532846450806, + "learning_rate": 0.0002, + "loss": 0.5692, + "step": 37080 + }, + { + "epoch": 5.9962816263842855, + "grad_norm": 0.9463558197021484, + "learning_rate": 0.0002, + "loss": 0.594, + "step": 37090 + }, + { + "epoch": 5.997898310565031, + "grad_norm": 1.2066948413848877, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 37100 + }, + { + "epoch": 5.999514994745777, + "grad_norm": 0.9749386310577393, + "learning_rate": 0.0002, + "loss": 0.5356, + "step": 37110 + }, + { + "epoch": 6.0, + "eval_loss": 1.2270219326019287, + "eval_runtime": 122.2047, + "eval_samples_per_second": 5.998, + "eval_steps_per_second": 0.753, + "step": 37113 + }, + { + "epoch": 6.001131678926522, + "grad_norm": 0.9641092419624329, + "learning_rate": 0.0002, + "loss": 0.4855, + "step": 37120 + }, + { + "epoch": 6.002748363107267, + "grad_norm": 1.103379249572754, + "learning_rate": 0.0002, + "loss": 0.4112, + "step": 37130 + }, + { + "epoch": 6.004365047288013, + "grad_norm": 0.8381665349006653, + "learning_rate": 0.0002, + "loss": 0.4577, + "step": 37140 + }, + { + "epoch": 6.005981731468758, + "grad_norm": 1.245323896408081, + "learning_rate": 0.0002, + "loss": 0.4794, + "step": 37150 + }, + { + "epoch": 6.007598415649503, + "grad_norm": 1.3140289783477783, + "learning_rate": 0.0002, + "loss": 0.4503, + "step": 37160 + }, + { + "epoch": 6.009215099830248, + "grad_norm": 0.8479695916175842, + "learning_rate": 0.0002, + "loss": 0.4456, + "step": 37170 + }, + { + "epoch": 6.0108317840109935, + "grad_norm": 0.8841437101364136, + "learning_rate": 0.0002, + "loss": 0.4573, + "step": 37180 + }, + { + "epoch": 6.012448468191739, + "grad_norm": 0.8900154829025269, + "learning_rate": 0.0002, + "loss": 0.4565, + "step": 37190 + }, + { + "epoch": 6.014065152372484, + "grad_norm": 1.2753345966339111, + "learning_rate": 0.0002, + "loss": 0.457, + "step": 37200 + }, + { + "epoch": 6.015681836553229, + "grad_norm": 1.4625498056411743, + "learning_rate": 0.0002, + "loss": 0.4365, + "step": 37210 + }, + { + "epoch": 6.017298520733974, + "grad_norm": 0.7455034852027893, + "learning_rate": 0.0002, + "loss": 0.4252, + "step": 37220 + }, + { + "epoch": 6.01891520491472, + "grad_norm": 1.1658862829208374, + "learning_rate": 0.0002, + "loss": 0.4433, + "step": 37230 + }, + { + "epoch": 6.020531889095465, + "grad_norm": 0.9785751104354858, + "learning_rate": 0.0002, + "loss": 0.4499, + "step": 37240 + }, + { + "epoch": 6.02214857327621, + "grad_norm": 1.3193122148513794, + "learning_rate": 0.0002, + "loss": 0.4956, + "step": 37250 + }, + { + "epoch": 6.023765257456955, + "grad_norm": 1.038273572921753, + "learning_rate": 0.0002, + "loss": 0.4727, + "step": 37260 + }, + { + "epoch": 6.0253819416377015, + "grad_norm": 1.0550594329833984, + "learning_rate": 0.0002, + "loss": 0.4395, + "step": 37270 + }, + { + "epoch": 6.026998625818447, + "grad_norm": 0.9745930433273315, + "learning_rate": 0.0002, + "loss": 0.4767, + "step": 37280 + }, + { + "epoch": 6.028615309999192, + "grad_norm": 0.9273530840873718, + "learning_rate": 0.0002, + "loss": 0.4233, + "step": 37290 + }, + { + "epoch": 6.030231994179937, + "grad_norm": 1.3844057321548462, + "learning_rate": 0.0002, + "loss": 0.4195, + "step": 37300 + }, + { + "epoch": 6.031848678360682, + "grad_norm": 1.2058762311935425, + "learning_rate": 0.0002, + "loss": 0.4768, + "step": 37310 + }, + { + "epoch": 6.033465362541428, + "grad_norm": 1.242663025856018, + "learning_rate": 0.0002, + "loss": 0.4499, + "step": 37320 + }, + { + "epoch": 6.035082046722173, + "grad_norm": 1.3504270315170288, + "learning_rate": 0.0002, + "loss": 0.4597, + "step": 37330 + }, + { + "epoch": 6.036698730902918, + "grad_norm": 0.8734912276268005, + "learning_rate": 0.0002, + "loss": 0.4402, + "step": 37340 + }, + { + "epoch": 6.038315415083663, + "grad_norm": 1.0182311534881592, + "learning_rate": 0.0002, + "loss": 0.477, + "step": 37350 + }, + { + "epoch": 6.0399320992644085, + "grad_norm": 0.9898499846458435, + "learning_rate": 0.0002, + "loss": 0.4261, + "step": 37360 + }, + { + "epoch": 6.041548783445154, + "grad_norm": 1.0637860298156738, + "learning_rate": 0.0002, + "loss": 0.4459, + "step": 37370 + }, + { + "epoch": 6.043165467625899, + "grad_norm": 1.0099523067474365, + "learning_rate": 0.0002, + "loss": 0.4958, + "step": 37380 + }, + { + "epoch": 6.044782151806644, + "grad_norm": 1.1080750226974487, + "learning_rate": 0.0002, + "loss": 0.4459, + "step": 37390 + }, + { + "epoch": 6.0463988359873895, + "grad_norm": 1.2551289796829224, + "learning_rate": 0.0002, + "loss": 0.4473, + "step": 37400 + }, + { + "epoch": 6.048015520168136, + "grad_norm": 0.8959632515907288, + "learning_rate": 0.0002, + "loss": 0.468, + "step": 37410 + }, + { + "epoch": 6.049632204348881, + "grad_norm": 1.1748892068862915, + "learning_rate": 0.0002, + "loss": 0.4255, + "step": 37420 + }, + { + "epoch": 6.051248888529626, + "grad_norm": 1.3122745752334595, + "learning_rate": 0.0002, + "loss": 0.4458, + "step": 37430 + }, + { + "epoch": 6.052865572710371, + "grad_norm": 1.0227985382080078, + "learning_rate": 0.0002, + "loss": 0.4676, + "step": 37440 + }, + { + "epoch": 6.0544822568911165, + "grad_norm": 1.0380030870437622, + "learning_rate": 0.0002, + "loss": 0.4503, + "step": 37450 + }, + { + "epoch": 6.056098941071862, + "grad_norm": 0.8919622898101807, + "learning_rate": 0.0002, + "loss": 0.4686, + "step": 37460 + }, + { + "epoch": 6.057715625252607, + "grad_norm": 1.4554150104522705, + "learning_rate": 0.0002, + "loss": 0.4406, + "step": 37470 + }, + { + "epoch": 6.059332309433352, + "grad_norm": 1.2853292226791382, + "learning_rate": 0.0002, + "loss": 0.4688, + "step": 37480 + }, + { + "epoch": 6.0609489936140974, + "grad_norm": 1.2951840162277222, + "learning_rate": 0.0002, + "loss": 0.4489, + "step": 37490 + }, + { + "epoch": 6.062565677794843, + "grad_norm": 1.1750973463058472, + "learning_rate": 0.0002, + "loss": 0.4819, + "step": 37500 + }, + { + "epoch": 6.064182361975588, + "grad_norm": 0.9328424334526062, + "learning_rate": 0.0002, + "loss": 0.4574, + "step": 37510 + }, + { + "epoch": 6.065799046156333, + "grad_norm": 1.0353537797927856, + "learning_rate": 0.0002, + "loss": 0.4597, + "step": 37520 + }, + { + "epoch": 6.067415730337078, + "grad_norm": 1.1594274044036865, + "learning_rate": 0.0002, + "loss": 0.4407, + "step": 37530 + }, + { + "epoch": 6.069032414517824, + "grad_norm": 0.9034168124198914, + "learning_rate": 0.0002, + "loss": 0.4642, + "step": 37540 + }, + { + "epoch": 6.070649098698569, + "grad_norm": 1.068617820739746, + "learning_rate": 0.0002, + "loss": 0.4625, + "step": 37550 + }, + { + "epoch": 6.072265782879315, + "grad_norm": 1.0931321382522583, + "learning_rate": 0.0002, + "loss": 0.4378, + "step": 37560 + }, + { + "epoch": 6.07388246706006, + "grad_norm": 1.2542688846588135, + "learning_rate": 0.0002, + "loss": 0.4527, + "step": 37570 + }, + { + "epoch": 6.075499151240805, + "grad_norm": 1.273384928703308, + "learning_rate": 0.0002, + "loss": 0.4725, + "step": 37580 + }, + { + "epoch": 6.077115835421551, + "grad_norm": 1.4771400690078735, + "learning_rate": 0.0002, + "loss": 0.4928, + "step": 37590 + }, + { + "epoch": 6.078732519602296, + "grad_norm": 1.3751444816589355, + "learning_rate": 0.0002, + "loss": 0.461, + "step": 37600 + }, + { + "epoch": 6.080349203783041, + "grad_norm": 1.4532550573349, + "learning_rate": 0.0002, + "loss": 0.4602, + "step": 37610 + }, + { + "epoch": 6.081965887963786, + "grad_norm": 1.3175991773605347, + "learning_rate": 0.0002, + "loss": 0.4428, + "step": 37620 + }, + { + "epoch": 6.083582572144532, + "grad_norm": 1.0624970197677612, + "learning_rate": 0.0002, + "loss": 0.4746, + "step": 37630 + }, + { + "epoch": 6.085199256325277, + "grad_norm": 1.099715232849121, + "learning_rate": 0.0002, + "loss": 0.413, + "step": 37640 + }, + { + "epoch": 6.086815940506022, + "grad_norm": 1.0380114316940308, + "learning_rate": 0.0002, + "loss": 0.4528, + "step": 37650 + }, + { + "epoch": 6.088432624686767, + "grad_norm": 1.1136109828948975, + "learning_rate": 0.0002, + "loss": 0.4373, + "step": 37660 + }, + { + "epoch": 6.0900493088675125, + "grad_norm": 0.996498703956604, + "learning_rate": 0.0002, + "loss": 0.4915, + "step": 37670 + }, + { + "epoch": 6.091665993048258, + "grad_norm": 1.0552574396133423, + "learning_rate": 0.0002, + "loss": 0.4713, + "step": 37680 + }, + { + "epoch": 6.093282677229003, + "grad_norm": 1.4108527898788452, + "learning_rate": 0.0002, + "loss": 0.4414, + "step": 37690 + }, + { + "epoch": 6.094899361409748, + "grad_norm": 1.1323093175888062, + "learning_rate": 0.0002, + "loss": 0.4851, + "step": 37700 + }, + { + "epoch": 6.096516045590494, + "grad_norm": 0.9364377856254578, + "learning_rate": 0.0002, + "loss": 0.4455, + "step": 37710 + }, + { + "epoch": 6.0981327297712395, + "grad_norm": 1.1300561428070068, + "learning_rate": 0.0002, + "loss": 0.4791, + "step": 37720 + }, + { + "epoch": 6.099749413951985, + "grad_norm": 1.0616047382354736, + "learning_rate": 0.0002, + "loss": 0.4539, + "step": 37730 + }, + { + "epoch": 6.10136609813273, + "grad_norm": 1.1205905675888062, + "learning_rate": 0.0002, + "loss": 0.4516, + "step": 37740 + }, + { + "epoch": 6.102982782313475, + "grad_norm": 0.9592534303665161, + "learning_rate": 0.0002, + "loss": 0.4688, + "step": 37750 + }, + { + "epoch": 6.1045994664942205, + "grad_norm": 0.9797531962394714, + "learning_rate": 0.0002, + "loss": 0.4494, + "step": 37760 + }, + { + "epoch": 6.106216150674966, + "grad_norm": 1.093404769897461, + "learning_rate": 0.0002, + "loss": 0.4237, + "step": 37770 + }, + { + "epoch": 6.107832834855711, + "grad_norm": 1.2172642946243286, + "learning_rate": 0.0002, + "loss": 0.4691, + "step": 37780 + }, + { + "epoch": 6.109449519036456, + "grad_norm": 1.0467255115509033, + "learning_rate": 0.0002, + "loss": 0.4398, + "step": 37790 + }, + { + "epoch": 6.111066203217201, + "grad_norm": 1.159318208694458, + "learning_rate": 0.0002, + "loss": 0.4676, + "step": 37800 + }, + { + "epoch": 6.112682887397947, + "grad_norm": 1.0615603923797607, + "learning_rate": 0.0002, + "loss": 0.4539, + "step": 37810 + }, + { + "epoch": 6.114299571578692, + "grad_norm": 1.0542045831680298, + "learning_rate": 0.0002, + "loss": 0.4957, + "step": 37820 + }, + { + "epoch": 6.115916255759437, + "grad_norm": 0.8962697982788086, + "learning_rate": 0.0002, + "loss": 0.4512, + "step": 37830 + }, + { + "epoch": 6.117532939940182, + "grad_norm": 1.106352686882019, + "learning_rate": 0.0002, + "loss": 0.4519, + "step": 37840 + }, + { + "epoch": 6.1191496241209276, + "grad_norm": 1.1660276651382446, + "learning_rate": 0.0002, + "loss": 0.4421, + "step": 37850 + }, + { + "epoch": 6.120766308301674, + "grad_norm": 1.3524385690689087, + "learning_rate": 0.0002, + "loss": 0.4701, + "step": 37860 + }, + { + "epoch": 6.122382992482419, + "grad_norm": 1.1056050062179565, + "learning_rate": 0.0002, + "loss": 0.4684, + "step": 37870 + }, + { + "epoch": 6.123999676663164, + "grad_norm": 1.0772725343704224, + "learning_rate": 0.0002, + "loss": 0.4518, + "step": 37880 + }, + { + "epoch": 6.125616360843909, + "grad_norm": 1.1011115312576294, + "learning_rate": 0.0002, + "loss": 0.4356, + "step": 37890 + }, + { + "epoch": 6.127233045024655, + "grad_norm": 0.8952536582946777, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 37900 + }, + { + "epoch": 6.1288497292054, + "grad_norm": 1.244398593902588, + "learning_rate": 0.0002, + "loss": 0.4299, + "step": 37910 + }, + { + "epoch": 6.130466413386145, + "grad_norm": 0.9658283591270447, + "learning_rate": 0.0002, + "loss": 0.4764, + "step": 37920 + }, + { + "epoch": 6.13208309756689, + "grad_norm": 1.0649068355560303, + "learning_rate": 0.0002, + "loss": 0.4378, + "step": 37930 + }, + { + "epoch": 6.1336997817476355, + "grad_norm": 0.94698166847229, + "learning_rate": 0.0002, + "loss": 0.4638, + "step": 37940 + }, + { + "epoch": 6.135316465928381, + "grad_norm": 1.1450897455215454, + "learning_rate": 0.0002, + "loss": 0.488, + "step": 37950 + }, + { + "epoch": 6.136933150109126, + "grad_norm": 1.032482624053955, + "learning_rate": 0.0002, + "loss": 0.4791, + "step": 37960 + }, + { + "epoch": 6.138549834289871, + "grad_norm": 1.0993428230285645, + "learning_rate": 0.0002, + "loss": 0.4179, + "step": 37970 + }, + { + "epoch": 6.1401665184706165, + "grad_norm": 1.2907029390335083, + "learning_rate": 0.0002, + "loss": 0.4781, + "step": 37980 + }, + { + "epoch": 6.141783202651362, + "grad_norm": 1.1007903814315796, + "learning_rate": 0.0002, + "loss": 0.4671, + "step": 37990 + }, + { + "epoch": 6.143399886832107, + "grad_norm": 0.9286124110221863, + "learning_rate": 0.0002, + "loss": 0.4213, + "step": 38000 + }, + { + "epoch": 6.145016571012853, + "grad_norm": 1.1426366567611694, + "learning_rate": 0.0002, + "loss": 0.4741, + "step": 38010 + }, + { + "epoch": 6.146633255193598, + "grad_norm": 1.2608287334442139, + "learning_rate": 0.0002, + "loss": 0.4746, + "step": 38020 + }, + { + "epoch": 6.1482499393743435, + "grad_norm": 1.1346837282180786, + "learning_rate": 0.0002, + "loss": 0.454, + "step": 38030 + }, + { + "epoch": 6.149866623555089, + "grad_norm": 1.144080400466919, + "learning_rate": 0.0002, + "loss": 0.4469, + "step": 38040 + }, + { + "epoch": 6.151483307735834, + "grad_norm": 1.3456705808639526, + "learning_rate": 0.0002, + "loss": 0.4515, + "step": 38050 + }, + { + "epoch": 6.153099991916579, + "grad_norm": 1.0517960786819458, + "learning_rate": 0.0002, + "loss": 0.4775, + "step": 38060 + }, + { + "epoch": 6.154716676097324, + "grad_norm": 1.1887445449829102, + "learning_rate": 0.0002, + "loss": 0.4986, + "step": 38070 + }, + { + "epoch": 6.15633336027807, + "grad_norm": 1.0449163913726807, + "learning_rate": 0.0002, + "loss": 0.4516, + "step": 38080 + }, + { + "epoch": 6.157950044458815, + "grad_norm": 1.3218743801116943, + "learning_rate": 0.0002, + "loss": 0.4808, + "step": 38090 + }, + { + "epoch": 6.15956672863956, + "grad_norm": 1.003208875656128, + "learning_rate": 0.0002, + "loss": 0.4632, + "step": 38100 + }, + { + "epoch": 6.161183412820305, + "grad_norm": 1.008623719215393, + "learning_rate": 0.0002, + "loss": 0.4978, + "step": 38110 + }, + { + "epoch": 6.162800097001051, + "grad_norm": 1.2122787237167358, + "learning_rate": 0.0002, + "loss": 0.4608, + "step": 38120 + }, + { + "epoch": 6.164416781181796, + "grad_norm": 1.253403902053833, + "learning_rate": 0.0002, + "loss": 0.4666, + "step": 38130 + }, + { + "epoch": 6.166033465362541, + "grad_norm": 1.2289724349975586, + "learning_rate": 0.0002, + "loss": 0.4778, + "step": 38140 + }, + { + "epoch": 6.167650149543286, + "grad_norm": 1.330694556236267, + "learning_rate": 0.0002, + "loss": 0.4774, + "step": 38150 + }, + { + "epoch": 6.169266833724032, + "grad_norm": 1.0946741104125977, + "learning_rate": 0.0002, + "loss": 0.4699, + "step": 38160 + }, + { + "epoch": 6.170883517904778, + "grad_norm": 1.0719934701919556, + "learning_rate": 0.0002, + "loss": 0.4816, + "step": 38170 + }, + { + "epoch": 6.172500202085523, + "grad_norm": 1.1142133474349976, + "learning_rate": 0.0002, + "loss": 0.4678, + "step": 38180 + }, + { + "epoch": 6.174116886266268, + "grad_norm": 1.1221938133239746, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 38190 + }, + { + "epoch": 6.175733570447013, + "grad_norm": 1.1391617059707642, + "learning_rate": 0.0002, + "loss": 0.4462, + "step": 38200 + }, + { + "epoch": 6.1773502546277586, + "grad_norm": 1.2263455390930176, + "learning_rate": 0.0002, + "loss": 0.4867, + "step": 38210 + }, + { + "epoch": 6.178966938808504, + "grad_norm": 1.0930434465408325, + "learning_rate": 0.0002, + "loss": 0.4633, + "step": 38220 + }, + { + "epoch": 6.180583622989249, + "grad_norm": 1.3489030599594116, + "learning_rate": 0.0002, + "loss": 0.4406, + "step": 38230 + }, + { + "epoch": 6.182200307169994, + "grad_norm": 1.1383486986160278, + "learning_rate": 0.0002, + "loss": 0.4994, + "step": 38240 + }, + { + "epoch": 6.1838169913507395, + "grad_norm": 1.2408897876739502, + "learning_rate": 0.0002, + "loss": 0.4851, + "step": 38250 + }, + { + "epoch": 6.185433675531485, + "grad_norm": 1.1436222791671753, + "learning_rate": 0.0002, + "loss": 0.4848, + "step": 38260 + }, + { + "epoch": 6.18705035971223, + "grad_norm": 1.370117425918579, + "learning_rate": 0.0002, + "loss": 0.4594, + "step": 38270 + }, + { + "epoch": 6.188667043892975, + "grad_norm": 0.8862423300743103, + "learning_rate": 0.0002, + "loss": 0.5023, + "step": 38280 + }, + { + "epoch": 6.19028372807372, + "grad_norm": 0.9603779315948486, + "learning_rate": 0.0002, + "loss": 0.4559, + "step": 38290 + }, + { + "epoch": 6.191900412254466, + "grad_norm": 1.389291524887085, + "learning_rate": 0.0002, + "loss": 0.4835, + "step": 38300 + }, + { + "epoch": 6.193517096435212, + "grad_norm": 1.0767031908035278, + "learning_rate": 0.0002, + "loss": 0.4435, + "step": 38310 + }, + { + "epoch": 6.195133780615957, + "grad_norm": 1.1800403594970703, + "learning_rate": 0.0002, + "loss": 0.4683, + "step": 38320 + }, + { + "epoch": 6.196750464796702, + "grad_norm": 0.997891366481781, + "learning_rate": 0.0002, + "loss": 0.4608, + "step": 38330 + }, + { + "epoch": 6.1983671489774474, + "grad_norm": 1.1201492547988892, + "learning_rate": 0.0002, + "loss": 0.4575, + "step": 38340 + }, + { + "epoch": 6.199983833158193, + "grad_norm": 0.9769026637077332, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 38350 + }, + { + "epoch": 6.201600517338938, + "grad_norm": 0.9447069764137268, + "learning_rate": 0.0002, + "loss": 0.4563, + "step": 38360 + }, + { + "epoch": 6.203217201519683, + "grad_norm": 1.0959235429763794, + "learning_rate": 0.0002, + "loss": 0.516, + "step": 38370 + }, + { + "epoch": 6.204833885700428, + "grad_norm": 1.2495406866073608, + "learning_rate": 0.0002, + "loss": 0.4688, + "step": 38380 + }, + { + "epoch": 6.206450569881174, + "grad_norm": 0.8589218258857727, + "learning_rate": 0.0002, + "loss": 0.4445, + "step": 38390 + }, + { + "epoch": 6.208067254061919, + "grad_norm": 0.959155797958374, + "learning_rate": 0.0002, + "loss": 0.4808, + "step": 38400 + }, + { + "epoch": 6.209683938242664, + "grad_norm": 1.0105533599853516, + "learning_rate": 0.0002, + "loss": 0.4622, + "step": 38410 + }, + { + "epoch": 6.211300622423409, + "grad_norm": 0.9824615120887756, + "learning_rate": 0.0002, + "loss": 0.4887, + "step": 38420 + }, + { + "epoch": 6.2129173066041545, + "grad_norm": 0.8616500496864319, + "learning_rate": 0.0002, + "loss": 0.4656, + "step": 38430 + }, + { + "epoch": 6.2145339907849, + "grad_norm": 1.2917758226394653, + "learning_rate": 0.0002, + "loss": 0.449, + "step": 38440 + }, + { + "epoch": 6.216150674965646, + "grad_norm": 1.0564531087875366, + "learning_rate": 0.0002, + "loss": 0.4201, + "step": 38450 + }, + { + "epoch": 6.217767359146391, + "grad_norm": 1.152331829071045, + "learning_rate": 0.0002, + "loss": 0.4849, + "step": 38460 + }, + { + "epoch": 6.219384043327136, + "grad_norm": 0.9152206778526306, + "learning_rate": 0.0002, + "loss": 0.4887, + "step": 38470 + }, + { + "epoch": 6.221000727507882, + "grad_norm": 0.9931167960166931, + "learning_rate": 0.0002, + "loss": 0.4686, + "step": 38480 + }, + { + "epoch": 6.222617411688627, + "grad_norm": 1.3248072862625122, + "learning_rate": 0.0002, + "loss": 0.4765, + "step": 38490 + }, + { + "epoch": 6.224234095869372, + "grad_norm": 1.3916507959365845, + "learning_rate": 0.0002, + "loss": 0.4636, + "step": 38500 + }, + { + "epoch": 6.225850780050117, + "grad_norm": 1.1775140762329102, + "learning_rate": 0.0002, + "loss": 0.506, + "step": 38510 + }, + { + "epoch": 6.2274674642308625, + "grad_norm": 1.1581059694290161, + "learning_rate": 0.0002, + "loss": 0.47, + "step": 38520 + }, + { + "epoch": 6.229084148411608, + "grad_norm": 1.359320878982544, + "learning_rate": 0.0002, + "loss": 0.4679, + "step": 38530 + }, + { + "epoch": 6.230700832592353, + "grad_norm": 1.185041904449463, + "learning_rate": 0.0002, + "loss": 0.4697, + "step": 38540 + }, + { + "epoch": 6.232317516773098, + "grad_norm": 1.1861097812652588, + "learning_rate": 0.0002, + "loss": 0.4815, + "step": 38550 + }, + { + "epoch": 6.233934200953843, + "grad_norm": 1.126990556716919, + "learning_rate": 0.0002, + "loss": 0.4925, + "step": 38560 + }, + { + "epoch": 6.235550885134589, + "grad_norm": 0.9744541049003601, + "learning_rate": 0.0002, + "loss": 0.4414, + "step": 38570 + }, + { + "epoch": 6.237167569315334, + "grad_norm": 1.1260887384414673, + "learning_rate": 0.0002, + "loss": 0.4577, + "step": 38580 + }, + { + "epoch": 6.238784253496079, + "grad_norm": 1.1290327310562134, + "learning_rate": 0.0002, + "loss": 0.4852, + "step": 38590 + }, + { + "epoch": 6.240400937676825, + "grad_norm": 1.0952879190444946, + "learning_rate": 0.0002, + "loss": 0.4805, + "step": 38600 + }, + { + "epoch": 6.2420176218575705, + "grad_norm": 1.1037684679031372, + "learning_rate": 0.0002, + "loss": 0.4436, + "step": 38610 + }, + { + "epoch": 6.243634306038316, + "grad_norm": 1.1356085538864136, + "learning_rate": 0.0002, + "loss": 0.466, + "step": 38620 + }, + { + "epoch": 6.245250990219061, + "grad_norm": 1.0677106380462646, + "learning_rate": 0.0002, + "loss": 0.5129, + "step": 38630 + }, + { + "epoch": 6.246867674399806, + "grad_norm": 1.1573411226272583, + "learning_rate": 0.0002, + "loss": 0.4907, + "step": 38640 + }, + { + "epoch": 6.248484358580551, + "grad_norm": 1.2707505226135254, + "learning_rate": 0.0002, + "loss": 0.5098, + "step": 38650 + }, + { + "epoch": 6.250101042761297, + "grad_norm": 1.0480109453201294, + "learning_rate": 0.0002, + "loss": 0.4926, + "step": 38660 + }, + { + "epoch": 6.251717726942042, + "grad_norm": 1.3668724298477173, + "learning_rate": 0.0002, + "loss": 0.4654, + "step": 38670 + }, + { + "epoch": 6.253334411122787, + "grad_norm": 1.217289686203003, + "learning_rate": 0.0002, + "loss": 0.5128, + "step": 38680 + }, + { + "epoch": 6.254951095303532, + "grad_norm": 1.2950236797332764, + "learning_rate": 0.0002, + "loss": 0.4621, + "step": 38690 + }, + { + "epoch": 6.256567779484278, + "grad_norm": 1.4506934881210327, + "learning_rate": 0.0002, + "loss": 0.5076, + "step": 38700 + }, + { + "epoch": 6.258184463665023, + "grad_norm": 1.1248667240142822, + "learning_rate": 0.0002, + "loss": 0.4803, + "step": 38710 + }, + { + "epoch": 6.259801147845768, + "grad_norm": 1.3384023904800415, + "learning_rate": 0.0002, + "loss": 0.4746, + "step": 38720 + }, + { + "epoch": 6.261417832026513, + "grad_norm": 1.128074288368225, + "learning_rate": 0.0002, + "loss": 0.473, + "step": 38730 + }, + { + "epoch": 6.263034516207259, + "grad_norm": 1.1169012784957886, + "learning_rate": 0.0002, + "loss": 0.4638, + "step": 38740 + }, + { + "epoch": 6.264651200388005, + "grad_norm": 1.195198893547058, + "learning_rate": 0.0002, + "loss": 0.4747, + "step": 38750 + }, + { + "epoch": 6.26626788456875, + "grad_norm": 1.2471518516540527, + "learning_rate": 0.0002, + "loss": 0.4906, + "step": 38760 + }, + { + "epoch": 6.267884568749495, + "grad_norm": 1.2646394968032837, + "learning_rate": 0.0002, + "loss": 0.4507, + "step": 38770 + }, + { + "epoch": 6.26950125293024, + "grad_norm": 1.0286450386047363, + "learning_rate": 0.0002, + "loss": 0.4934, + "step": 38780 + }, + { + "epoch": 6.2711179371109855, + "grad_norm": 1.2440695762634277, + "learning_rate": 0.0002, + "loss": 0.4787, + "step": 38790 + }, + { + "epoch": 6.272734621291731, + "grad_norm": 0.8941256403923035, + "learning_rate": 0.0002, + "loss": 0.4806, + "step": 38800 + }, + { + "epoch": 6.274351305472476, + "grad_norm": 1.0693447589874268, + "learning_rate": 0.0002, + "loss": 0.4741, + "step": 38810 + }, + { + "epoch": 6.275967989653221, + "grad_norm": 1.0936840772628784, + "learning_rate": 0.0002, + "loss": 0.4408, + "step": 38820 + }, + { + "epoch": 6.2775846738339665, + "grad_norm": 1.0961874723434448, + "learning_rate": 0.0002, + "loss": 0.4729, + "step": 38830 + }, + { + "epoch": 6.279201358014712, + "grad_norm": 1.1465433835983276, + "learning_rate": 0.0002, + "loss": 0.4504, + "step": 38840 + }, + { + "epoch": 6.280818042195457, + "grad_norm": 1.2987004518508911, + "learning_rate": 0.0002, + "loss": 0.4771, + "step": 38850 + }, + { + "epoch": 6.282434726376202, + "grad_norm": 1.1310304403305054, + "learning_rate": 0.0002, + "loss": 0.4945, + "step": 38860 + }, + { + "epoch": 6.284051410556947, + "grad_norm": 1.306538462638855, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 38870 + }, + { + "epoch": 6.285668094737693, + "grad_norm": 1.2405401468276978, + "learning_rate": 0.0002, + "loss": 0.4873, + "step": 38880 + }, + { + "epoch": 6.287284778918439, + "grad_norm": 1.0934767723083496, + "learning_rate": 0.0002, + "loss": 0.4929, + "step": 38890 + }, + { + "epoch": 6.288901463099184, + "grad_norm": 1.3370496034622192, + "learning_rate": 0.0002, + "loss": 0.4853, + "step": 38900 + }, + { + "epoch": 6.290518147279929, + "grad_norm": 1.0319404602050781, + "learning_rate": 0.0002, + "loss": 0.4892, + "step": 38910 + }, + { + "epoch": 6.292134831460674, + "grad_norm": 0.9734271168708801, + "learning_rate": 0.0002, + "loss": 0.4685, + "step": 38920 + }, + { + "epoch": 6.29375151564142, + "grad_norm": 1.0940454006195068, + "learning_rate": 0.0002, + "loss": 0.5085, + "step": 38930 + }, + { + "epoch": 6.295368199822165, + "grad_norm": 1.036500334739685, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 38940 + }, + { + "epoch": 6.29698488400291, + "grad_norm": 1.020308256149292, + "learning_rate": 0.0002, + "loss": 0.4878, + "step": 38950 + }, + { + "epoch": 6.298601568183655, + "grad_norm": 1.1416399478912354, + "learning_rate": 0.0002, + "loss": 0.4668, + "step": 38960 + }, + { + "epoch": 6.300218252364401, + "grad_norm": 1.2497479915618896, + "learning_rate": 0.0002, + "loss": 0.4727, + "step": 38970 + }, + { + "epoch": 6.301834936545146, + "grad_norm": 1.1692523956298828, + "learning_rate": 0.0002, + "loss": 0.4721, + "step": 38980 + }, + { + "epoch": 6.303451620725891, + "grad_norm": 1.0693109035491943, + "learning_rate": 0.0002, + "loss": 0.505, + "step": 38990 + }, + { + "epoch": 6.305068304906636, + "grad_norm": 0.8883291482925415, + "learning_rate": 0.0002, + "loss": 0.4875, + "step": 39000 + }, + { + "epoch": 6.3066849890873815, + "grad_norm": 1.1445088386535645, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 39010 + }, + { + "epoch": 6.308301673268127, + "grad_norm": 1.226792335510254, + "learning_rate": 0.0002, + "loss": 0.5089, + "step": 39020 + }, + { + "epoch": 6.309918357448872, + "grad_norm": 1.0498932600021362, + "learning_rate": 0.0002, + "loss": 0.474, + "step": 39030 + }, + { + "epoch": 6.311535041629618, + "grad_norm": 1.0834535360336304, + "learning_rate": 0.0002, + "loss": 0.4964, + "step": 39040 + }, + { + "epoch": 6.313151725810363, + "grad_norm": 1.144666075706482, + "learning_rate": 0.0002, + "loss": 0.4733, + "step": 39050 + }, + { + "epoch": 6.3147684099911086, + "grad_norm": 1.1468489170074463, + "learning_rate": 0.0002, + "loss": 0.4784, + "step": 39060 + }, + { + "epoch": 6.316385094171854, + "grad_norm": 1.290949821472168, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 39070 + }, + { + "epoch": 6.318001778352599, + "grad_norm": 1.087868094444275, + "learning_rate": 0.0002, + "loss": 0.5002, + "step": 39080 + }, + { + "epoch": 6.319618462533344, + "grad_norm": 1.0156296491622925, + "learning_rate": 0.0002, + "loss": 0.4944, + "step": 39090 + }, + { + "epoch": 6.3212351467140895, + "grad_norm": 1.0805060863494873, + "learning_rate": 0.0002, + "loss": 0.5019, + "step": 39100 + }, + { + "epoch": 6.322851830894835, + "grad_norm": 0.9030579924583435, + "learning_rate": 0.0002, + "loss": 0.4598, + "step": 39110 + }, + { + "epoch": 6.32446851507558, + "grad_norm": 1.1488285064697266, + "learning_rate": 0.0002, + "loss": 0.4635, + "step": 39120 + }, + { + "epoch": 6.326085199256325, + "grad_norm": 1.2050796747207642, + "learning_rate": 0.0002, + "loss": 0.5368, + "step": 39130 + }, + { + "epoch": 6.32770188343707, + "grad_norm": 1.093451738357544, + "learning_rate": 0.0002, + "loss": 0.4854, + "step": 39140 + }, + { + "epoch": 6.329318567617816, + "grad_norm": 1.2046772241592407, + "learning_rate": 0.0002, + "loss": 0.5055, + "step": 39150 + }, + { + "epoch": 6.330935251798561, + "grad_norm": 1.045777678489685, + "learning_rate": 0.0002, + "loss": 0.4703, + "step": 39160 + }, + { + "epoch": 6.332551935979306, + "grad_norm": 1.2008492946624756, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 39170 + }, + { + "epoch": 6.334168620160051, + "grad_norm": 1.0613869428634644, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 39180 + }, + { + "epoch": 6.3357853043407975, + "grad_norm": 1.058440089225769, + "learning_rate": 0.0002, + "loss": 0.4708, + "step": 39190 + }, + { + "epoch": 6.337401988521543, + "grad_norm": 1.195658802986145, + "learning_rate": 0.0002, + "loss": 0.4719, + "step": 39200 + }, + { + "epoch": 6.339018672702288, + "grad_norm": 1.1595174074172974, + "learning_rate": 0.0002, + "loss": 0.4901, + "step": 39210 + }, + { + "epoch": 6.340635356883033, + "grad_norm": 1.0674750804901123, + "learning_rate": 0.0002, + "loss": 0.4587, + "step": 39220 + }, + { + "epoch": 6.342252041063778, + "grad_norm": 1.3306758403778076, + "learning_rate": 0.0002, + "loss": 0.4801, + "step": 39230 + }, + { + "epoch": 6.343868725244524, + "grad_norm": 1.3582593202590942, + "learning_rate": 0.0002, + "loss": 0.4839, + "step": 39240 + }, + { + "epoch": 6.345485409425269, + "grad_norm": 1.2351572513580322, + "learning_rate": 0.0002, + "loss": 0.4964, + "step": 39250 + }, + { + "epoch": 6.347102093606014, + "grad_norm": 1.3623450994491577, + "learning_rate": 0.0002, + "loss": 0.4806, + "step": 39260 + }, + { + "epoch": 6.348718777786759, + "grad_norm": 1.201270580291748, + "learning_rate": 0.0002, + "loss": 0.466, + "step": 39270 + }, + { + "epoch": 6.3503354619675045, + "grad_norm": 0.9300584197044373, + "learning_rate": 0.0002, + "loss": 0.4899, + "step": 39280 + }, + { + "epoch": 6.35195214614825, + "grad_norm": 0.944525957107544, + "learning_rate": 0.0002, + "loss": 0.4867, + "step": 39290 + }, + { + "epoch": 6.353568830328995, + "grad_norm": 1.4263732433319092, + "learning_rate": 0.0002, + "loss": 0.4954, + "step": 39300 + }, + { + "epoch": 6.35518551450974, + "grad_norm": 1.392592191696167, + "learning_rate": 0.0002, + "loss": 0.4982, + "step": 39310 + }, + { + "epoch": 6.3568021986904855, + "grad_norm": 1.0753393173217773, + "learning_rate": 0.0002, + "loss": 0.4868, + "step": 39320 + }, + { + "epoch": 6.358418882871231, + "grad_norm": 1.0088151693344116, + "learning_rate": 0.0002, + "loss": 0.4896, + "step": 39330 + }, + { + "epoch": 6.360035567051977, + "grad_norm": 1.1784582138061523, + "learning_rate": 0.0002, + "loss": 0.4684, + "step": 39340 + }, + { + "epoch": 6.361652251232722, + "grad_norm": 1.020526647567749, + "learning_rate": 0.0002, + "loss": 0.4732, + "step": 39350 + }, + { + "epoch": 6.363268935413467, + "grad_norm": 1.1400747299194336, + "learning_rate": 0.0002, + "loss": 0.5177, + "step": 39360 + }, + { + "epoch": 6.3648856195942125, + "grad_norm": 0.9960665702819824, + "learning_rate": 0.0002, + "loss": 0.4976, + "step": 39370 + }, + { + "epoch": 6.366502303774958, + "grad_norm": 1.1547569036483765, + "learning_rate": 0.0002, + "loss": 0.483, + "step": 39380 + }, + { + "epoch": 6.368118987955703, + "grad_norm": 1.2180676460266113, + "learning_rate": 0.0002, + "loss": 0.4861, + "step": 39390 + }, + { + "epoch": 6.369735672136448, + "grad_norm": 1.1391799449920654, + "learning_rate": 0.0002, + "loss": 0.4805, + "step": 39400 + }, + { + "epoch": 6.371352356317193, + "grad_norm": 1.2893574237823486, + "learning_rate": 0.0002, + "loss": 0.5004, + "step": 39410 + }, + { + "epoch": 6.372969040497939, + "grad_norm": 1.192878246307373, + "learning_rate": 0.0002, + "loss": 0.4807, + "step": 39420 + }, + { + "epoch": 6.374585724678684, + "grad_norm": 0.9771704077720642, + "learning_rate": 0.0002, + "loss": 0.4637, + "step": 39430 + }, + { + "epoch": 6.376202408859429, + "grad_norm": 1.285387635231018, + "learning_rate": 0.0002, + "loss": 0.4867, + "step": 39440 + }, + { + "epoch": 6.377819093040174, + "grad_norm": 1.019957184791565, + "learning_rate": 0.0002, + "loss": 0.4593, + "step": 39450 + }, + { + "epoch": 6.37943577722092, + "grad_norm": 1.2002915143966675, + "learning_rate": 0.0002, + "loss": 0.473, + "step": 39460 + }, + { + "epoch": 6.381052461401665, + "grad_norm": 1.3285092115402222, + "learning_rate": 0.0002, + "loss": 0.5025, + "step": 39470 + }, + { + "epoch": 6.38266914558241, + "grad_norm": 1.097846269607544, + "learning_rate": 0.0002, + "loss": 0.4626, + "step": 39480 + }, + { + "epoch": 6.384285829763156, + "grad_norm": 0.9537988305091858, + "learning_rate": 0.0002, + "loss": 0.5109, + "step": 39490 + }, + { + "epoch": 6.385902513943901, + "grad_norm": 1.0350042581558228, + "learning_rate": 0.0002, + "loss": 0.4492, + "step": 39500 + }, + { + "epoch": 6.387519198124647, + "grad_norm": 0.9559133052825928, + "learning_rate": 0.0002, + "loss": 0.4824, + "step": 39510 + }, + { + "epoch": 6.389135882305392, + "grad_norm": 0.9615123271942139, + "learning_rate": 0.0002, + "loss": 0.5189, + "step": 39520 + }, + { + "epoch": 6.390752566486137, + "grad_norm": 1.0604504346847534, + "learning_rate": 0.0002, + "loss": 0.4915, + "step": 39530 + }, + { + "epoch": 6.392369250666882, + "grad_norm": 1.2460750341415405, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 39540 + }, + { + "epoch": 6.393985934847628, + "grad_norm": 1.1496477127075195, + "learning_rate": 0.0002, + "loss": 0.4929, + "step": 39550 + }, + { + "epoch": 6.395602619028373, + "grad_norm": 1.048043966293335, + "learning_rate": 0.0002, + "loss": 0.4872, + "step": 39560 + }, + { + "epoch": 6.397219303209118, + "grad_norm": 1.333539366722107, + "learning_rate": 0.0002, + "loss": 0.5231, + "step": 39570 + }, + { + "epoch": 6.398835987389863, + "grad_norm": 1.0605626106262207, + "learning_rate": 0.0002, + "loss": 0.4877, + "step": 39580 + }, + { + "epoch": 6.4004526715706085, + "grad_norm": 1.163220763206482, + "learning_rate": 0.0002, + "loss": 0.4643, + "step": 39590 + }, + { + "epoch": 6.402069355751354, + "grad_norm": 1.1878494024276733, + "learning_rate": 0.0002, + "loss": 0.4824, + "step": 39600 + }, + { + "epoch": 6.403686039932099, + "grad_norm": 1.4630796909332275, + "learning_rate": 0.0002, + "loss": 0.5242, + "step": 39610 + }, + { + "epoch": 6.405302724112844, + "grad_norm": 1.073255181312561, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 39620 + }, + { + "epoch": 6.406919408293589, + "grad_norm": 1.0538873672485352, + "learning_rate": 0.0002, + "loss": 0.5108, + "step": 39630 + }, + { + "epoch": 6.4085360924743355, + "grad_norm": 1.015525221824646, + "learning_rate": 0.0002, + "loss": 0.4801, + "step": 39640 + }, + { + "epoch": 6.410152776655081, + "grad_norm": 1.1454379558563232, + "learning_rate": 0.0002, + "loss": 0.4781, + "step": 39650 + }, + { + "epoch": 6.411769460835826, + "grad_norm": 1.2801800966262817, + "learning_rate": 0.0002, + "loss": 0.498, + "step": 39660 + }, + { + "epoch": 6.413386145016571, + "grad_norm": 1.077579140663147, + "learning_rate": 0.0002, + "loss": 0.4804, + "step": 39670 + }, + { + "epoch": 6.4150028291973165, + "grad_norm": 1.376662015914917, + "learning_rate": 0.0002, + "loss": 0.51, + "step": 39680 + }, + { + "epoch": 6.416619513378062, + "grad_norm": 1.2064344882965088, + "learning_rate": 0.0002, + "loss": 0.4956, + "step": 39690 + }, + { + "epoch": 6.418236197558807, + "grad_norm": 1.0689115524291992, + "learning_rate": 0.0002, + "loss": 0.4762, + "step": 39700 + }, + { + "epoch": 6.419852881739552, + "grad_norm": 0.9997019171714783, + "learning_rate": 0.0002, + "loss": 0.4762, + "step": 39710 + }, + { + "epoch": 6.421469565920297, + "grad_norm": 1.2368080615997314, + "learning_rate": 0.0002, + "loss": 0.49, + "step": 39720 + }, + { + "epoch": 6.423086250101043, + "grad_norm": 1.2085820436477661, + "learning_rate": 0.0002, + "loss": 0.4774, + "step": 39730 + }, + { + "epoch": 6.424702934281788, + "grad_norm": 1.057246208190918, + "learning_rate": 0.0002, + "loss": 0.4671, + "step": 39740 + }, + { + "epoch": 6.426319618462533, + "grad_norm": 1.1311043500900269, + "learning_rate": 0.0002, + "loss": 0.5315, + "step": 39750 + }, + { + "epoch": 6.427936302643278, + "grad_norm": 1.2352231740951538, + "learning_rate": 0.0002, + "loss": 0.5171, + "step": 39760 + }, + { + "epoch": 6.4295529868240235, + "grad_norm": 0.953233540058136, + "learning_rate": 0.0002, + "loss": 0.466, + "step": 39770 + }, + { + "epoch": 6.431169671004769, + "grad_norm": 1.0632505416870117, + "learning_rate": 0.0002, + "loss": 0.4834, + "step": 39780 + }, + { + "epoch": 6.432786355185515, + "grad_norm": 1.0916751623153687, + "learning_rate": 0.0002, + "loss": 0.5053, + "step": 39790 + }, + { + "epoch": 6.43440303936626, + "grad_norm": 0.9732703566551208, + "learning_rate": 0.0002, + "loss": 0.4788, + "step": 39800 + }, + { + "epoch": 6.436019723547005, + "grad_norm": 1.1673705577850342, + "learning_rate": 0.0002, + "loss": 0.4982, + "step": 39810 + }, + { + "epoch": 6.437636407727751, + "grad_norm": 1.1049559116363525, + "learning_rate": 0.0002, + "loss": 0.4484, + "step": 39820 + }, + { + "epoch": 6.439253091908496, + "grad_norm": 1.345277190208435, + "learning_rate": 0.0002, + "loss": 0.4784, + "step": 39830 + }, + { + "epoch": 6.440869776089241, + "grad_norm": 1.1118950843811035, + "learning_rate": 0.0002, + "loss": 0.4716, + "step": 39840 + }, + { + "epoch": 6.442486460269986, + "grad_norm": 1.4872850179672241, + "learning_rate": 0.0002, + "loss": 0.5133, + "step": 39850 + }, + { + "epoch": 6.4441031444507315, + "grad_norm": 1.0763497352600098, + "learning_rate": 0.0002, + "loss": 0.4532, + "step": 39860 + }, + { + "epoch": 6.445719828631477, + "grad_norm": 0.9245555400848389, + "learning_rate": 0.0002, + "loss": 0.4572, + "step": 39870 + }, + { + "epoch": 6.447336512812222, + "grad_norm": 1.4154807329177856, + "learning_rate": 0.0002, + "loss": 0.4917, + "step": 39880 + }, + { + "epoch": 6.448953196992967, + "grad_norm": 1.0885124206542969, + "learning_rate": 0.0002, + "loss": 0.4852, + "step": 39890 + }, + { + "epoch": 6.450569881173712, + "grad_norm": 1.3989344835281372, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 39900 + }, + { + "epoch": 6.452186565354458, + "grad_norm": 0.9763124585151672, + "learning_rate": 0.0002, + "loss": 0.509, + "step": 39910 + }, + { + "epoch": 6.453803249535203, + "grad_norm": 1.135272741317749, + "learning_rate": 0.0002, + "loss": 0.5134, + "step": 39920 + }, + { + "epoch": 6.455419933715948, + "grad_norm": 1.1140081882476807, + "learning_rate": 0.0002, + "loss": 0.4941, + "step": 39930 + }, + { + "epoch": 6.457036617896694, + "grad_norm": 1.0992448329925537, + "learning_rate": 0.0002, + "loss": 0.5137, + "step": 39940 + }, + { + "epoch": 6.4586533020774395, + "grad_norm": 1.1658501625061035, + "learning_rate": 0.0002, + "loss": 0.4914, + "step": 39950 + }, + { + "epoch": 6.460269986258185, + "grad_norm": 1.1122797727584839, + "learning_rate": 0.0002, + "loss": 0.5036, + "step": 39960 + }, + { + "epoch": 6.46188667043893, + "grad_norm": 0.9664968252182007, + "learning_rate": 0.0002, + "loss": 0.5159, + "step": 39970 + }, + { + "epoch": 6.463503354619675, + "grad_norm": 1.2513965368270874, + "learning_rate": 0.0002, + "loss": 0.4989, + "step": 39980 + }, + { + "epoch": 6.46512003880042, + "grad_norm": 1.1198630332946777, + "learning_rate": 0.0002, + "loss": 0.4694, + "step": 39990 + }, + { + "epoch": 6.466736722981166, + "grad_norm": 0.8783249855041504, + "learning_rate": 0.0002, + "loss": 0.5023, + "step": 40000 + }, + { + "epoch": 6.468353407161911, + "grad_norm": 1.1313109397888184, + "learning_rate": 0.0002, + "loss": 0.4648, + "step": 40010 + }, + { + "epoch": 6.469970091342656, + "grad_norm": 1.0854487419128418, + "learning_rate": 0.0002, + "loss": 0.4965, + "step": 40020 + }, + { + "epoch": 6.471586775523401, + "grad_norm": 1.1738566160202026, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 40030 + }, + { + "epoch": 6.473203459704147, + "grad_norm": 0.9720084071159363, + "learning_rate": 0.0002, + "loss": 0.4947, + "step": 40040 + }, + { + "epoch": 6.474820143884892, + "grad_norm": 1.105618953704834, + "learning_rate": 0.0002, + "loss": 0.5218, + "step": 40050 + }, + { + "epoch": 6.476436828065637, + "grad_norm": 1.2007657289505005, + "learning_rate": 0.0002, + "loss": 0.4943, + "step": 40060 + }, + { + "epoch": 6.478053512246382, + "grad_norm": 1.088402509689331, + "learning_rate": 0.0002, + "loss": 0.4882, + "step": 40070 + }, + { + "epoch": 6.4796701964271275, + "grad_norm": 1.0775291919708252, + "learning_rate": 0.0002, + "loss": 0.504, + "step": 40080 + }, + { + "epoch": 6.481286880607874, + "grad_norm": 1.1018189191818237, + "learning_rate": 0.0002, + "loss": 0.4791, + "step": 40090 + }, + { + "epoch": 6.482903564788619, + "grad_norm": 1.1676557064056396, + "learning_rate": 0.0002, + "loss": 0.488, + "step": 40100 + }, + { + "epoch": 6.484520248969364, + "grad_norm": 0.9619805812835693, + "learning_rate": 0.0002, + "loss": 0.4818, + "step": 40110 + }, + { + "epoch": 6.486136933150109, + "grad_norm": 1.2408208847045898, + "learning_rate": 0.0002, + "loss": 0.4986, + "step": 40120 + }, + { + "epoch": 6.4877536173308545, + "grad_norm": 1.3488136529922485, + "learning_rate": 0.0002, + "loss": 0.4668, + "step": 40130 + }, + { + "epoch": 6.4893703015116, + "grad_norm": 0.9864488244056702, + "learning_rate": 0.0002, + "loss": 0.4774, + "step": 40140 + }, + { + "epoch": 6.490986985692345, + "grad_norm": 0.9437947273254395, + "learning_rate": 0.0002, + "loss": 0.4651, + "step": 40150 + }, + { + "epoch": 6.49260366987309, + "grad_norm": 1.2005455493927002, + "learning_rate": 0.0002, + "loss": 0.542, + "step": 40160 + }, + { + "epoch": 6.4942203540538355, + "grad_norm": 1.0796732902526855, + "learning_rate": 0.0002, + "loss": 0.4704, + "step": 40170 + }, + { + "epoch": 6.495837038234581, + "grad_norm": 1.1347825527191162, + "learning_rate": 0.0002, + "loss": 0.498, + "step": 40180 + }, + { + "epoch": 6.497453722415326, + "grad_norm": 1.2311455011367798, + "learning_rate": 0.0002, + "loss": 0.5215, + "step": 40190 + }, + { + "epoch": 6.499070406596071, + "grad_norm": 1.068609356880188, + "learning_rate": 0.0002, + "loss": 0.5043, + "step": 40200 + }, + { + "epoch": 6.500687090776816, + "grad_norm": 1.196425437927246, + "learning_rate": 0.0002, + "loss": 0.4868, + "step": 40210 + }, + { + "epoch": 6.5023037749575625, + "grad_norm": 1.183927297592163, + "learning_rate": 0.0002, + "loss": 0.4881, + "step": 40220 + }, + { + "epoch": 6.503920459138307, + "grad_norm": 0.9099724292755127, + "learning_rate": 0.0002, + "loss": 0.4958, + "step": 40230 + }, + { + "epoch": 6.505537143319053, + "grad_norm": 0.9261038899421692, + "learning_rate": 0.0002, + "loss": 0.4816, + "step": 40240 + }, + { + "epoch": 6.507153827499798, + "grad_norm": 1.185491681098938, + "learning_rate": 0.0002, + "loss": 0.5151, + "step": 40250 + }, + { + "epoch": 6.508770511680543, + "grad_norm": 1.1866052150726318, + "learning_rate": 0.0002, + "loss": 0.4853, + "step": 40260 + }, + { + "epoch": 6.510387195861289, + "grad_norm": 1.1600912809371948, + "learning_rate": 0.0002, + "loss": 0.491, + "step": 40270 + }, + { + "epoch": 6.512003880042034, + "grad_norm": 0.9609426259994507, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 40280 + }, + { + "epoch": 6.513620564222779, + "grad_norm": 1.078864336013794, + "learning_rate": 0.0002, + "loss": 0.4794, + "step": 40290 + }, + { + "epoch": 6.515237248403524, + "grad_norm": 1.042761206626892, + "learning_rate": 0.0002, + "loss": 0.46, + "step": 40300 + }, + { + "epoch": 6.51685393258427, + "grad_norm": 0.9742481112480164, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 40310 + }, + { + "epoch": 6.518470616765015, + "grad_norm": 1.2544835805892944, + "learning_rate": 0.0002, + "loss": 0.5234, + "step": 40320 + }, + { + "epoch": 6.52008730094576, + "grad_norm": 1.3019760847091675, + "learning_rate": 0.0002, + "loss": 0.4815, + "step": 40330 + }, + { + "epoch": 6.521703985126505, + "grad_norm": 1.3196964263916016, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 40340 + }, + { + "epoch": 6.5233206693072505, + "grad_norm": 1.2795668840408325, + "learning_rate": 0.0002, + "loss": 0.4979, + "step": 40350 + }, + { + "epoch": 6.524937353487996, + "grad_norm": 1.1618940830230713, + "learning_rate": 0.0002, + "loss": 0.5075, + "step": 40360 + }, + { + "epoch": 6.526554037668742, + "grad_norm": 1.330543041229248, + "learning_rate": 0.0002, + "loss": 0.5081, + "step": 40370 + }, + { + "epoch": 6.528170721849486, + "grad_norm": 1.1946901082992554, + "learning_rate": 0.0002, + "loss": 0.5055, + "step": 40380 + }, + { + "epoch": 6.529787406030232, + "grad_norm": 1.1708201169967651, + "learning_rate": 0.0002, + "loss": 0.4518, + "step": 40390 + }, + { + "epoch": 6.531404090210978, + "grad_norm": 0.894036591053009, + "learning_rate": 0.0002, + "loss": 0.4556, + "step": 40400 + }, + { + "epoch": 6.533020774391723, + "grad_norm": 1.1199041604995728, + "learning_rate": 0.0002, + "loss": 0.4919, + "step": 40410 + }, + { + "epoch": 6.534637458572468, + "grad_norm": 1.180317759513855, + "learning_rate": 0.0002, + "loss": 0.471, + "step": 40420 + }, + { + "epoch": 6.536254142753213, + "grad_norm": 1.37367582321167, + "learning_rate": 0.0002, + "loss": 0.4914, + "step": 40430 + }, + { + "epoch": 6.5378708269339585, + "grad_norm": 1.134791612625122, + "learning_rate": 0.0002, + "loss": 0.4561, + "step": 40440 + }, + { + "epoch": 6.539487511114704, + "grad_norm": 1.1160204410552979, + "learning_rate": 0.0002, + "loss": 0.5337, + "step": 40450 + }, + { + "epoch": 6.541104195295449, + "grad_norm": 1.268347978591919, + "learning_rate": 0.0002, + "loss": 0.5299, + "step": 40460 + }, + { + "epoch": 6.542720879476194, + "grad_norm": 1.1424330472946167, + "learning_rate": 0.0002, + "loss": 0.5167, + "step": 40470 + }, + { + "epoch": 6.544337563656939, + "grad_norm": 1.3098465204238892, + "learning_rate": 0.0002, + "loss": 0.5114, + "step": 40480 + }, + { + "epoch": 6.545954247837685, + "grad_norm": 1.3439544439315796, + "learning_rate": 0.0002, + "loss": 0.4865, + "step": 40490 + }, + { + "epoch": 6.54757093201843, + "grad_norm": 1.2708452939987183, + "learning_rate": 0.0002, + "loss": 0.5183, + "step": 40500 + }, + { + "epoch": 6.549187616199175, + "grad_norm": 1.483680248260498, + "learning_rate": 0.0002, + "loss": 0.5099, + "step": 40510 + }, + { + "epoch": 6.550804300379921, + "grad_norm": 1.1697806119918823, + "learning_rate": 0.0002, + "loss": 0.4811, + "step": 40520 + }, + { + "epoch": 6.5524209845606665, + "grad_norm": 1.1665642261505127, + "learning_rate": 0.0002, + "loss": 0.4814, + "step": 40530 + }, + { + "epoch": 6.554037668741412, + "grad_norm": 1.1243325471878052, + "learning_rate": 0.0002, + "loss": 0.4985, + "step": 40540 + }, + { + "epoch": 6.555654352922157, + "grad_norm": 1.0277988910675049, + "learning_rate": 0.0002, + "loss": 0.4936, + "step": 40550 + }, + { + "epoch": 6.557271037102902, + "grad_norm": 1.1466810703277588, + "learning_rate": 0.0002, + "loss": 0.487, + "step": 40560 + }, + { + "epoch": 6.558887721283647, + "grad_norm": 1.1415363550186157, + "learning_rate": 0.0002, + "loss": 0.4851, + "step": 40570 + }, + { + "epoch": 6.560504405464393, + "grad_norm": 1.1923491954803467, + "learning_rate": 0.0002, + "loss": 0.4631, + "step": 40580 + }, + { + "epoch": 6.562121089645138, + "grad_norm": 0.9264549612998962, + "learning_rate": 0.0002, + "loss": 0.5071, + "step": 40590 + }, + { + "epoch": 6.563737773825883, + "grad_norm": 0.8810341954231262, + "learning_rate": 0.0002, + "loss": 0.466, + "step": 40600 + }, + { + "epoch": 6.565354458006628, + "grad_norm": 2.3296701908111572, + "learning_rate": 0.0002, + "loss": 0.5085, + "step": 40610 + }, + { + "epoch": 6.5669711421873735, + "grad_norm": 1.0865163803100586, + "learning_rate": 0.0002, + "loss": 0.5196, + "step": 40620 + }, + { + "epoch": 6.568587826368119, + "grad_norm": 0.9844607710838318, + "learning_rate": 0.0002, + "loss": 0.5132, + "step": 40630 + }, + { + "epoch": 6.570204510548864, + "grad_norm": 1.1686855554580688, + "learning_rate": 0.0002, + "loss": 0.5437, + "step": 40640 + }, + { + "epoch": 6.571821194729609, + "grad_norm": 1.016829252243042, + "learning_rate": 0.0002, + "loss": 0.5293, + "step": 40650 + }, + { + "epoch": 6.5734378789103545, + "grad_norm": 1.2789337635040283, + "learning_rate": 0.0002, + "loss": 0.5243, + "step": 40660 + }, + { + "epoch": 6.575054563091101, + "grad_norm": 1.0819072723388672, + "learning_rate": 0.0002, + "loss": 0.4867, + "step": 40670 + }, + { + "epoch": 6.576671247271846, + "grad_norm": 1.1478345394134521, + "learning_rate": 0.0002, + "loss": 0.5024, + "step": 40680 + }, + { + "epoch": 6.578287931452591, + "grad_norm": 0.7972208857536316, + "learning_rate": 0.0002, + "loss": 0.5282, + "step": 40690 + }, + { + "epoch": 6.579904615633336, + "grad_norm": 1.1481789350509644, + "learning_rate": 0.0002, + "loss": 0.4877, + "step": 40700 + }, + { + "epoch": 6.5815212998140815, + "grad_norm": 1.0921871662139893, + "learning_rate": 0.0002, + "loss": 0.5143, + "step": 40710 + }, + { + "epoch": 6.583137983994827, + "grad_norm": 1.0230315923690796, + "learning_rate": 0.0002, + "loss": 0.5441, + "step": 40720 + }, + { + "epoch": 6.584754668175572, + "grad_norm": 1.151049017906189, + "learning_rate": 0.0002, + "loss": 0.4734, + "step": 40730 + }, + { + "epoch": 6.586371352356317, + "grad_norm": 1.4016883373260498, + "learning_rate": 0.0002, + "loss": 0.4782, + "step": 40740 + }, + { + "epoch": 6.587988036537062, + "grad_norm": 1.2211825847625732, + "learning_rate": 0.0002, + "loss": 0.5195, + "step": 40750 + }, + { + "epoch": 6.589604720717808, + "grad_norm": 1.2803404331207275, + "learning_rate": 0.0002, + "loss": 0.4815, + "step": 40760 + }, + { + "epoch": 6.591221404898553, + "grad_norm": 1.1119942665100098, + "learning_rate": 0.0002, + "loss": 0.5329, + "step": 40770 + }, + { + "epoch": 6.592838089079298, + "grad_norm": 1.464650273323059, + "learning_rate": 0.0002, + "loss": 0.5135, + "step": 40780 + }, + { + "epoch": 6.594454773260043, + "grad_norm": 1.1751397848129272, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 40790 + }, + { + "epoch": 6.596071457440789, + "grad_norm": 1.0866316556930542, + "learning_rate": 0.0002, + "loss": 0.4772, + "step": 40800 + }, + { + "epoch": 6.597688141621534, + "grad_norm": 1.1733694076538086, + "learning_rate": 0.0002, + "loss": 0.5132, + "step": 40810 + }, + { + "epoch": 6.59930482580228, + "grad_norm": 1.184708833694458, + "learning_rate": 0.0002, + "loss": 0.5138, + "step": 40820 + }, + { + "epoch": 6.600921509983025, + "grad_norm": 1.406081199645996, + "learning_rate": 0.0002, + "loss": 0.4885, + "step": 40830 + }, + { + "epoch": 6.60253819416377, + "grad_norm": 0.9658212661743164, + "learning_rate": 0.0002, + "loss": 0.499, + "step": 40840 + }, + { + "epoch": 6.604154878344516, + "grad_norm": 1.1457678079605103, + "learning_rate": 0.0002, + "loss": 0.5113, + "step": 40850 + }, + { + "epoch": 6.605771562525261, + "grad_norm": 1.0487784147262573, + "learning_rate": 0.0002, + "loss": 0.4916, + "step": 40860 + }, + { + "epoch": 6.607388246706006, + "grad_norm": 0.9357177019119263, + "learning_rate": 0.0002, + "loss": 0.4682, + "step": 40870 + }, + { + "epoch": 6.609004930886751, + "grad_norm": 1.1479727029800415, + "learning_rate": 0.0002, + "loss": 0.4751, + "step": 40880 + }, + { + "epoch": 6.610621615067497, + "grad_norm": 1.3729329109191895, + "learning_rate": 0.0002, + "loss": 0.5493, + "step": 40890 + }, + { + "epoch": 6.612238299248242, + "grad_norm": 1.0085599422454834, + "learning_rate": 0.0002, + "loss": 0.4886, + "step": 40900 + }, + { + "epoch": 6.613854983428987, + "grad_norm": 1.2750911712646484, + "learning_rate": 0.0002, + "loss": 0.516, + "step": 40910 + }, + { + "epoch": 6.615471667609732, + "grad_norm": 1.1929547786712646, + "learning_rate": 0.0002, + "loss": 0.5342, + "step": 40920 + }, + { + "epoch": 6.6170883517904775, + "grad_norm": 1.0821375846862793, + "learning_rate": 0.0002, + "loss": 0.4919, + "step": 40930 + }, + { + "epoch": 6.618705035971223, + "grad_norm": 1.197347640991211, + "learning_rate": 0.0002, + "loss": 0.5057, + "step": 40940 + }, + { + "epoch": 6.620321720151968, + "grad_norm": 1.2074699401855469, + "learning_rate": 0.0002, + "loss": 0.492, + "step": 40950 + }, + { + "epoch": 6.621938404332713, + "grad_norm": 1.312009572982788, + "learning_rate": 0.0002, + "loss": 0.5089, + "step": 40960 + }, + { + "epoch": 6.623555088513459, + "grad_norm": 1.4381471872329712, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 40970 + }, + { + "epoch": 6.6251717726942045, + "grad_norm": 1.1574671268463135, + "learning_rate": 0.0002, + "loss": 0.4904, + "step": 40980 + }, + { + "epoch": 6.62678845687495, + "grad_norm": 0.885661780834198, + "learning_rate": 0.0002, + "loss": 0.531, + "step": 40990 + }, + { + "epoch": 6.628405141055695, + "grad_norm": 1.024571180343628, + "learning_rate": 0.0002, + "loss": 0.5145, + "step": 41000 + }, + { + "epoch": 6.63002182523644, + "grad_norm": 1.103437900543213, + "learning_rate": 0.0002, + "loss": 0.4791, + "step": 41010 + }, + { + "epoch": 6.6316385094171855, + "grad_norm": 1.122450828552246, + "learning_rate": 0.0002, + "loss": 0.4671, + "step": 41020 + }, + { + "epoch": 6.633255193597931, + "grad_norm": 1.2256295680999756, + "learning_rate": 0.0002, + "loss": 0.5134, + "step": 41030 + }, + { + "epoch": 6.634871877778676, + "grad_norm": 1.364594578742981, + "learning_rate": 0.0002, + "loss": 0.4908, + "step": 41040 + }, + { + "epoch": 6.636488561959421, + "grad_norm": 0.9550056457519531, + "learning_rate": 0.0002, + "loss": 0.4964, + "step": 41050 + }, + { + "epoch": 6.638105246140166, + "grad_norm": 1.3174707889556885, + "learning_rate": 0.0002, + "loss": 0.5028, + "step": 41060 + }, + { + "epoch": 6.639721930320912, + "grad_norm": 1.0835540294647217, + "learning_rate": 0.0002, + "loss": 0.4717, + "step": 41070 + }, + { + "epoch": 6.641338614501657, + "grad_norm": 1.1432770490646362, + "learning_rate": 0.0002, + "loss": 0.497, + "step": 41080 + }, + { + "epoch": 6.642955298682402, + "grad_norm": 1.2398556470870972, + "learning_rate": 0.0002, + "loss": 0.4903, + "step": 41090 + }, + { + "epoch": 6.644571982863147, + "grad_norm": 1.1147747039794922, + "learning_rate": 0.0002, + "loss": 0.4991, + "step": 41100 + }, + { + "epoch": 6.6461886670438926, + "grad_norm": 1.0730493068695068, + "learning_rate": 0.0002, + "loss": 0.505, + "step": 41110 + }, + { + "epoch": 6.647805351224639, + "grad_norm": 1.3218451738357544, + "learning_rate": 0.0002, + "loss": 0.486, + "step": 41120 + }, + { + "epoch": 6.649422035405384, + "grad_norm": 1.3027331829071045, + "learning_rate": 0.0002, + "loss": 0.5276, + "step": 41130 + }, + { + "epoch": 6.651038719586129, + "grad_norm": 1.0280735492706299, + "learning_rate": 0.0002, + "loss": 0.5263, + "step": 41140 + }, + { + "epoch": 6.652655403766874, + "grad_norm": 1.109916090965271, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 41150 + }, + { + "epoch": 6.65427208794762, + "grad_norm": 1.078734040260315, + "learning_rate": 0.0002, + "loss": 0.5001, + "step": 41160 + }, + { + "epoch": 6.655888772128365, + "grad_norm": 1.1595654487609863, + "learning_rate": 0.0002, + "loss": 0.484, + "step": 41170 + }, + { + "epoch": 6.65750545630911, + "grad_norm": 1.1701031923294067, + "learning_rate": 0.0002, + "loss": 0.5101, + "step": 41180 + }, + { + "epoch": 6.659122140489855, + "grad_norm": 1.0424643754959106, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 41190 + }, + { + "epoch": 6.6607388246706005, + "grad_norm": 1.22880220413208, + "learning_rate": 0.0002, + "loss": 0.4863, + "step": 41200 + }, + { + "epoch": 6.662355508851346, + "grad_norm": 1.1907655000686646, + "learning_rate": 0.0002, + "loss": 0.4987, + "step": 41210 + }, + { + "epoch": 6.663972193032091, + "grad_norm": 1.0765007734298706, + "learning_rate": 0.0002, + "loss": 0.5343, + "step": 41220 + }, + { + "epoch": 6.665588877212836, + "grad_norm": 0.9994917511940002, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 41230 + }, + { + "epoch": 6.6672055613935814, + "grad_norm": 0.968578040599823, + "learning_rate": 0.0002, + "loss": 0.507, + "step": 41240 + }, + { + "epoch": 6.668822245574327, + "grad_norm": 1.0576032400131226, + "learning_rate": 0.0002, + "loss": 0.5068, + "step": 41250 + }, + { + "epoch": 6.670438929755072, + "grad_norm": 1.2183765172958374, + "learning_rate": 0.0002, + "loss": 0.486, + "step": 41260 + }, + { + "epoch": 6.672055613935818, + "grad_norm": 1.2548623085021973, + "learning_rate": 0.0002, + "loss": 0.4764, + "step": 41270 + }, + { + "epoch": 6.673672298116563, + "grad_norm": 1.0848388671875, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 41280 + }, + { + "epoch": 6.6752889822973085, + "grad_norm": 1.21421217918396, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 41290 + }, + { + "epoch": 6.676905666478054, + "grad_norm": 1.1453598737716675, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 41300 + }, + { + "epoch": 6.678522350658799, + "grad_norm": 1.2682722806930542, + "learning_rate": 0.0002, + "loss": 0.5033, + "step": 41310 + }, + { + "epoch": 6.680139034839544, + "grad_norm": 1.1659725904464722, + "learning_rate": 0.0002, + "loss": 0.5313, + "step": 41320 + }, + { + "epoch": 6.681755719020289, + "grad_norm": 1.36194908618927, + "learning_rate": 0.0002, + "loss": 0.5505, + "step": 41330 + }, + { + "epoch": 6.683372403201035, + "grad_norm": 1.1712592840194702, + "learning_rate": 0.0002, + "loss": 0.5127, + "step": 41340 + }, + { + "epoch": 6.68498908738178, + "grad_norm": 1.4168336391448975, + "learning_rate": 0.0002, + "loss": 0.5082, + "step": 41350 + }, + { + "epoch": 6.686605771562525, + "grad_norm": 1.0395328998565674, + "learning_rate": 0.0002, + "loss": 0.5124, + "step": 41360 + }, + { + "epoch": 6.68822245574327, + "grad_norm": 1.2511054277420044, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 41370 + }, + { + "epoch": 6.689839139924016, + "grad_norm": 1.0438542366027832, + "learning_rate": 0.0002, + "loss": 0.5027, + "step": 41380 + }, + { + "epoch": 6.691455824104761, + "grad_norm": 1.08684241771698, + "learning_rate": 0.0002, + "loss": 0.5069, + "step": 41390 + }, + { + "epoch": 6.693072508285506, + "grad_norm": 1.250788927078247, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 41400 + }, + { + "epoch": 6.694689192466251, + "grad_norm": 1.313890814781189, + "learning_rate": 0.0002, + "loss": 0.4921, + "step": 41410 + }, + { + "epoch": 6.696305876646997, + "grad_norm": 1.3218982219696045, + "learning_rate": 0.0002, + "loss": 0.5028, + "step": 41420 + }, + { + "epoch": 6.697922560827743, + "grad_norm": 1.0366582870483398, + "learning_rate": 0.0002, + "loss": 0.4851, + "step": 41430 + }, + { + "epoch": 6.699539245008488, + "grad_norm": 1.066121220588684, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 41440 + }, + { + "epoch": 6.701155929189233, + "grad_norm": 1.0239925384521484, + "learning_rate": 0.0002, + "loss": 0.4966, + "step": 41450 + }, + { + "epoch": 6.702772613369978, + "grad_norm": 0.9402176141738892, + "learning_rate": 0.0002, + "loss": 0.4767, + "step": 41460 + }, + { + "epoch": 6.7043892975507235, + "grad_norm": 1.391718864440918, + "learning_rate": 0.0002, + "loss": 0.5381, + "step": 41470 + }, + { + "epoch": 6.706005981731469, + "grad_norm": 1.215600609779358, + "learning_rate": 0.0002, + "loss": 0.512, + "step": 41480 + }, + { + "epoch": 6.707622665912214, + "grad_norm": 1.063722848892212, + "learning_rate": 0.0002, + "loss": 0.5219, + "step": 41490 + }, + { + "epoch": 6.709239350092959, + "grad_norm": 1.132149577140808, + "learning_rate": 0.0002, + "loss": 0.492, + "step": 41500 + }, + { + "epoch": 6.7108560342737045, + "grad_norm": 1.0302950143814087, + "learning_rate": 0.0002, + "loss": 0.4812, + "step": 41510 + }, + { + "epoch": 6.71247271845445, + "grad_norm": 1.5342752933502197, + "learning_rate": 0.0002, + "loss": 0.5141, + "step": 41520 + }, + { + "epoch": 6.714089402635195, + "grad_norm": 1.177137017250061, + "learning_rate": 0.0002, + "loss": 0.5123, + "step": 41530 + }, + { + "epoch": 6.71570608681594, + "grad_norm": 1.2335538864135742, + "learning_rate": 0.0002, + "loss": 0.5082, + "step": 41540 + }, + { + "epoch": 6.717322770996686, + "grad_norm": 1.140604853630066, + "learning_rate": 0.0002, + "loss": 0.4864, + "step": 41550 + }, + { + "epoch": 6.718939455177431, + "grad_norm": 1.3567465543746948, + "learning_rate": 0.0002, + "loss": 0.4888, + "step": 41560 + }, + { + "epoch": 6.720556139358177, + "grad_norm": 1.0693929195404053, + "learning_rate": 0.0002, + "loss": 0.5183, + "step": 41570 + }, + { + "epoch": 6.722172823538922, + "grad_norm": 1.1592605113983154, + "learning_rate": 0.0002, + "loss": 0.5131, + "step": 41580 + }, + { + "epoch": 6.723789507719667, + "grad_norm": 0.989006519317627, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 41590 + }, + { + "epoch": 6.7254061919004124, + "grad_norm": 1.04103422164917, + "learning_rate": 0.0002, + "loss": 0.4952, + "step": 41600 + }, + { + "epoch": 6.727022876081158, + "grad_norm": 1.1129004955291748, + "learning_rate": 0.0002, + "loss": 0.4823, + "step": 41610 + }, + { + "epoch": 6.728639560261903, + "grad_norm": 1.1473113298416138, + "learning_rate": 0.0002, + "loss": 0.5032, + "step": 41620 + }, + { + "epoch": 6.730256244442648, + "grad_norm": 1.348036527633667, + "learning_rate": 0.0002, + "loss": 0.5253, + "step": 41630 + }, + { + "epoch": 6.731872928623393, + "grad_norm": 1.259942650794983, + "learning_rate": 0.0002, + "loss": 0.4983, + "step": 41640 + }, + { + "epoch": 6.733489612804139, + "grad_norm": 1.0591514110565186, + "learning_rate": 0.0002, + "loss": 0.5182, + "step": 41650 + }, + { + "epoch": 6.735106296984884, + "grad_norm": 0.9737129211425781, + "learning_rate": 0.0002, + "loss": 0.4886, + "step": 41660 + }, + { + "epoch": 6.736722981165629, + "grad_norm": 1.2520451545715332, + "learning_rate": 0.0002, + "loss": 0.5051, + "step": 41670 + }, + { + "epoch": 6.738339665346374, + "grad_norm": 1.0555530786514282, + "learning_rate": 0.0002, + "loss": 0.5364, + "step": 41680 + }, + { + "epoch": 6.7399563495271195, + "grad_norm": 1.0025697946548462, + "learning_rate": 0.0002, + "loss": 0.4954, + "step": 41690 + }, + { + "epoch": 6.741573033707866, + "grad_norm": 1.1114100217819214, + "learning_rate": 0.0002, + "loss": 0.5485, + "step": 41700 + }, + { + "epoch": 6.74318971788861, + "grad_norm": 1.1537504196166992, + "learning_rate": 0.0002, + "loss": 0.4986, + "step": 41710 + }, + { + "epoch": 6.744806402069356, + "grad_norm": 1.037880539894104, + "learning_rate": 0.0002, + "loss": 0.5025, + "step": 41720 + }, + { + "epoch": 6.746423086250101, + "grad_norm": 1.0691965818405151, + "learning_rate": 0.0002, + "loss": 0.482, + "step": 41730 + }, + { + "epoch": 6.748039770430847, + "grad_norm": 1.376325011253357, + "learning_rate": 0.0002, + "loss": 0.5272, + "step": 41740 + }, + { + "epoch": 6.749656454611592, + "grad_norm": 1.4667129516601562, + "learning_rate": 0.0002, + "loss": 0.5484, + "step": 41750 + }, + { + "epoch": 6.751273138792337, + "grad_norm": 1.1517162322998047, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 41760 + }, + { + "epoch": 6.752889822973082, + "grad_norm": 1.1454511880874634, + "learning_rate": 0.0002, + "loss": 0.5523, + "step": 41770 + }, + { + "epoch": 6.7545065071538275, + "grad_norm": 1.6323128938674927, + "learning_rate": 0.0002, + "loss": 0.4664, + "step": 41780 + }, + { + "epoch": 6.756123191334573, + "grad_norm": 1.0951642990112305, + "learning_rate": 0.0002, + "loss": 0.5153, + "step": 41790 + }, + { + "epoch": 6.757739875515318, + "grad_norm": 1.0766983032226562, + "learning_rate": 0.0002, + "loss": 0.4998, + "step": 41800 + }, + { + "epoch": 6.759356559696063, + "grad_norm": 1.3472381830215454, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 41810 + }, + { + "epoch": 6.760973243876808, + "grad_norm": 1.0248444080352783, + "learning_rate": 0.0002, + "loss": 0.5172, + "step": 41820 + }, + { + "epoch": 6.762589928057554, + "grad_norm": 1.1276055574417114, + "learning_rate": 0.0002, + "loss": 0.5236, + "step": 41830 + }, + { + "epoch": 6.764206612238299, + "grad_norm": 1.5398495197296143, + "learning_rate": 0.0002, + "loss": 0.5044, + "step": 41840 + }, + { + "epoch": 6.765823296419045, + "grad_norm": 1.1886497735977173, + "learning_rate": 0.0002, + "loss": 0.5097, + "step": 41850 + }, + { + "epoch": 6.767439980599789, + "grad_norm": 1.027198076248169, + "learning_rate": 0.0002, + "loss": 0.499, + "step": 41860 + }, + { + "epoch": 6.7690566647805355, + "grad_norm": 1.4644980430603027, + "learning_rate": 0.0002, + "loss": 0.5444, + "step": 41870 + }, + { + "epoch": 6.770673348961281, + "grad_norm": 0.9633586406707764, + "learning_rate": 0.0002, + "loss": 0.5009, + "step": 41880 + }, + { + "epoch": 6.772290033142026, + "grad_norm": 1.0895354747772217, + "learning_rate": 0.0002, + "loss": 0.484, + "step": 41890 + }, + { + "epoch": 6.773906717322771, + "grad_norm": 1.1887167692184448, + "learning_rate": 0.0002, + "loss": 0.5172, + "step": 41900 + }, + { + "epoch": 6.775523401503516, + "grad_norm": 1.3699820041656494, + "learning_rate": 0.0002, + "loss": 0.5399, + "step": 41910 + }, + { + "epoch": 6.777140085684262, + "grad_norm": 1.0266352891921997, + "learning_rate": 0.0002, + "loss": 0.5504, + "step": 41920 + }, + { + "epoch": 6.778756769865007, + "grad_norm": 1.0919075012207031, + "learning_rate": 0.0002, + "loss": 0.5105, + "step": 41930 + }, + { + "epoch": 6.780373454045752, + "grad_norm": 0.9839563369750977, + "learning_rate": 0.0002, + "loss": 0.4842, + "step": 41940 + }, + { + "epoch": 6.781990138226497, + "grad_norm": 1.2605451345443726, + "learning_rate": 0.0002, + "loss": 0.5081, + "step": 41950 + }, + { + "epoch": 6.7836068224072426, + "grad_norm": 0.9268672466278076, + "learning_rate": 0.0002, + "loss": 0.5391, + "step": 41960 + }, + { + "epoch": 6.785223506587988, + "grad_norm": 1.2002313137054443, + "learning_rate": 0.0002, + "loss": 0.4916, + "step": 41970 + }, + { + "epoch": 6.786840190768733, + "grad_norm": 1.2018438577651978, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 41980 + }, + { + "epoch": 6.788456874949478, + "grad_norm": 1.17646062374115, + "learning_rate": 0.0002, + "loss": 0.5491, + "step": 41990 + }, + { + "epoch": 6.790073559130224, + "grad_norm": 1.1080009937286377, + "learning_rate": 0.0002, + "loss": 0.5354, + "step": 42000 + }, + { + "epoch": 6.791690243310969, + "grad_norm": 1.1606498956680298, + "learning_rate": 0.0002, + "loss": 0.5384, + "step": 42010 + }, + { + "epoch": 6.793306927491715, + "grad_norm": 1.2484819889068604, + "learning_rate": 0.0002, + "loss": 0.4931, + "step": 42020 + }, + { + "epoch": 6.79492361167246, + "grad_norm": 1.1363215446472168, + "learning_rate": 0.0002, + "loss": 0.498, + "step": 42030 + }, + { + "epoch": 6.796540295853205, + "grad_norm": 1.4469727277755737, + "learning_rate": 0.0002, + "loss": 0.5343, + "step": 42040 + }, + { + "epoch": 6.7981569800339505, + "grad_norm": 1.0617138147354126, + "learning_rate": 0.0002, + "loss": 0.5146, + "step": 42050 + }, + { + "epoch": 6.799773664214696, + "grad_norm": 1.1459330320358276, + "learning_rate": 0.0002, + "loss": 0.5188, + "step": 42060 + }, + { + "epoch": 6.801390348395441, + "grad_norm": 1.2095019817352295, + "learning_rate": 0.0002, + "loss": 0.5116, + "step": 42070 + }, + { + "epoch": 6.803007032576186, + "grad_norm": 1.3200831413269043, + "learning_rate": 0.0002, + "loss": 0.545, + "step": 42080 + }, + { + "epoch": 6.8046237167569315, + "grad_norm": 1.1633318662643433, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 42090 + }, + { + "epoch": 6.806240400937677, + "grad_norm": 0.8986614942550659, + "learning_rate": 0.0002, + "loss": 0.4938, + "step": 42100 + }, + { + "epoch": 6.807857085118422, + "grad_norm": 1.3705275058746338, + "learning_rate": 0.0002, + "loss": 0.559, + "step": 42110 + }, + { + "epoch": 6.809473769299167, + "grad_norm": 1.2418090105056763, + "learning_rate": 0.0002, + "loss": 0.5022, + "step": 42120 + }, + { + "epoch": 6.811090453479912, + "grad_norm": 1.0818954706192017, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 42130 + }, + { + "epoch": 6.812707137660658, + "grad_norm": 0.9293872117996216, + "learning_rate": 0.0002, + "loss": 0.4791, + "step": 42140 + }, + { + "epoch": 6.814323821841404, + "grad_norm": 0.9791894555091858, + "learning_rate": 0.0002, + "loss": 0.5009, + "step": 42150 + }, + { + "epoch": 6.815940506022149, + "grad_norm": 1.1956568956375122, + "learning_rate": 0.0002, + "loss": 0.5142, + "step": 42160 + }, + { + "epoch": 6.817557190202894, + "grad_norm": 0.9643568992614746, + "learning_rate": 0.0002, + "loss": 0.5126, + "step": 42170 + }, + { + "epoch": 6.819173874383639, + "grad_norm": 1.2499792575836182, + "learning_rate": 0.0002, + "loss": 0.5121, + "step": 42180 + }, + { + "epoch": 6.820790558564385, + "grad_norm": 1.1779413223266602, + "learning_rate": 0.0002, + "loss": 0.4942, + "step": 42190 + }, + { + "epoch": 6.82240724274513, + "grad_norm": 1.0570595264434814, + "learning_rate": 0.0002, + "loss": 0.498, + "step": 42200 + }, + { + "epoch": 6.824023926925875, + "grad_norm": 1.1393938064575195, + "learning_rate": 0.0002, + "loss": 0.4997, + "step": 42210 + }, + { + "epoch": 6.82564061110662, + "grad_norm": 1.152463436126709, + "learning_rate": 0.0002, + "loss": 0.4842, + "step": 42220 + }, + { + "epoch": 6.827257295287366, + "grad_norm": 1.3353025913238525, + "learning_rate": 0.0002, + "loss": 0.5234, + "step": 42230 + }, + { + "epoch": 6.828873979468111, + "grad_norm": 1.1719051599502563, + "learning_rate": 0.0002, + "loss": 0.539, + "step": 42240 + }, + { + "epoch": 6.830490663648856, + "grad_norm": 1.262141227722168, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 42250 + }, + { + "epoch": 6.832107347829601, + "grad_norm": 1.240899920463562, + "learning_rate": 0.0002, + "loss": 0.5021, + "step": 42260 + }, + { + "epoch": 6.8337240320103465, + "grad_norm": 1.0505269765853882, + "learning_rate": 0.0002, + "loss": 0.4961, + "step": 42270 + }, + { + "epoch": 6.835340716191092, + "grad_norm": 1.1556071043014526, + "learning_rate": 0.0002, + "loss": 0.4932, + "step": 42280 + }, + { + "epoch": 6.836957400371837, + "grad_norm": 1.1427719593048096, + "learning_rate": 0.0002, + "loss": 0.5461, + "step": 42290 + }, + { + "epoch": 6.838574084552583, + "grad_norm": 1.1540080308914185, + "learning_rate": 0.0002, + "loss": 0.5199, + "step": 42300 + }, + { + "epoch": 6.840190768733328, + "grad_norm": 1.0521200895309448, + "learning_rate": 0.0002, + "loss": 0.5269, + "step": 42310 + }, + { + "epoch": 6.8418074529140736, + "grad_norm": 1.0205531120300293, + "learning_rate": 0.0002, + "loss": 0.541, + "step": 42320 + }, + { + "epoch": 6.843424137094819, + "grad_norm": 1.0010193586349487, + "learning_rate": 0.0002, + "loss": 0.5225, + "step": 42330 + }, + { + "epoch": 6.845040821275564, + "grad_norm": 1.2138770818710327, + "learning_rate": 0.0002, + "loss": 0.5101, + "step": 42340 + }, + { + "epoch": 6.846657505456309, + "grad_norm": 1.3028651475906372, + "learning_rate": 0.0002, + "loss": 0.5452, + "step": 42350 + }, + { + "epoch": 6.8482741896370545, + "grad_norm": 1.0326353311538696, + "learning_rate": 0.0002, + "loss": 0.4894, + "step": 42360 + }, + { + "epoch": 6.8498908738178, + "grad_norm": 1.036085605621338, + "learning_rate": 0.0002, + "loss": 0.5285, + "step": 42370 + }, + { + "epoch": 6.851507557998545, + "grad_norm": 1.0575472116470337, + "learning_rate": 0.0002, + "loss": 0.505, + "step": 42380 + }, + { + "epoch": 6.85312424217929, + "grad_norm": 1.1749629974365234, + "learning_rate": 0.0002, + "loss": 0.4997, + "step": 42390 + }, + { + "epoch": 6.854740926360035, + "grad_norm": 1.1747760772705078, + "learning_rate": 0.0002, + "loss": 0.4961, + "step": 42400 + }, + { + "epoch": 6.856357610540781, + "grad_norm": 1.1877071857452393, + "learning_rate": 0.0002, + "loss": 0.5138, + "step": 42410 + }, + { + "epoch": 6.857974294721526, + "grad_norm": 1.1209983825683594, + "learning_rate": 0.0002, + "loss": 0.4972, + "step": 42420 + }, + { + "epoch": 6.859590978902271, + "grad_norm": 1.2918205261230469, + "learning_rate": 0.0002, + "loss": 0.4939, + "step": 42430 + }, + { + "epoch": 6.861207663083016, + "grad_norm": 1.2443464994430542, + "learning_rate": 0.0002, + "loss": 0.5012, + "step": 42440 + }, + { + "epoch": 6.8628243472637624, + "grad_norm": 0.9336795210838318, + "learning_rate": 0.0002, + "loss": 0.5226, + "step": 42450 + }, + { + "epoch": 6.864441031444508, + "grad_norm": 1.2183542251586914, + "learning_rate": 0.0002, + "loss": 0.5108, + "step": 42460 + }, + { + "epoch": 6.866057715625253, + "grad_norm": 1.0071234703063965, + "learning_rate": 0.0002, + "loss": 0.5245, + "step": 42470 + }, + { + "epoch": 6.867674399805998, + "grad_norm": 1.2914012670516968, + "learning_rate": 0.0002, + "loss": 0.4753, + "step": 42480 + }, + { + "epoch": 6.869291083986743, + "grad_norm": 1.1050426959991455, + "learning_rate": 0.0002, + "loss": 0.4865, + "step": 42490 + }, + { + "epoch": 6.870907768167489, + "grad_norm": 1.1163811683654785, + "learning_rate": 0.0002, + "loss": 0.5243, + "step": 42500 + }, + { + "epoch": 6.872524452348234, + "grad_norm": 1.1575818061828613, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 42510 + }, + { + "epoch": 6.874141136528979, + "grad_norm": 1.11167311668396, + "learning_rate": 0.0002, + "loss": 0.5353, + "step": 42520 + }, + { + "epoch": 6.875757820709724, + "grad_norm": 1.0379102230072021, + "learning_rate": 0.0002, + "loss": 0.5141, + "step": 42530 + }, + { + "epoch": 6.8773745048904695, + "grad_norm": 1.2617160081863403, + "learning_rate": 0.0002, + "loss": 0.5355, + "step": 42540 + }, + { + "epoch": 6.878991189071215, + "grad_norm": 1.1749719381332397, + "learning_rate": 0.0002, + "loss": 0.4785, + "step": 42550 + }, + { + "epoch": 6.88060787325196, + "grad_norm": 1.2284821271896362, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 42560 + }, + { + "epoch": 6.882224557432705, + "grad_norm": 1.1917030811309814, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 42570 + }, + { + "epoch": 6.8838412416134505, + "grad_norm": 1.1943914890289307, + "learning_rate": 0.0002, + "loss": 0.5176, + "step": 42580 + }, + { + "epoch": 6.885457925794196, + "grad_norm": 1.2641394138336182, + "learning_rate": 0.0002, + "loss": 0.5072, + "step": 42590 + }, + { + "epoch": 6.887074609974942, + "grad_norm": 1.1280436515808105, + "learning_rate": 0.0002, + "loss": 0.5004, + "step": 42600 + }, + { + "epoch": 6.888691294155687, + "grad_norm": 0.9865449070930481, + "learning_rate": 0.0002, + "loss": 0.5328, + "step": 42610 + }, + { + "epoch": 6.890307978336432, + "grad_norm": 0.994987428188324, + "learning_rate": 0.0002, + "loss": 0.4953, + "step": 42620 + }, + { + "epoch": 6.8919246625171775, + "grad_norm": 0.9900388717651367, + "learning_rate": 0.0002, + "loss": 0.4805, + "step": 42630 + }, + { + "epoch": 6.893541346697923, + "grad_norm": 1.2992421388626099, + "learning_rate": 0.0002, + "loss": 0.5467, + "step": 42640 + }, + { + "epoch": 6.895158030878668, + "grad_norm": 1.0152487754821777, + "learning_rate": 0.0002, + "loss": 0.5017, + "step": 42650 + }, + { + "epoch": 6.896774715059413, + "grad_norm": 1.199453353881836, + "learning_rate": 0.0002, + "loss": 0.5043, + "step": 42660 + }, + { + "epoch": 6.898391399240158, + "grad_norm": 1.100630521774292, + "learning_rate": 0.0002, + "loss": 0.5106, + "step": 42670 + }, + { + "epoch": 6.900008083420904, + "grad_norm": 1.0489764213562012, + "learning_rate": 0.0002, + "loss": 0.503, + "step": 42680 + }, + { + "epoch": 6.901624767601649, + "grad_norm": 1.101407527923584, + "learning_rate": 0.0002, + "loss": 0.4634, + "step": 42690 + }, + { + "epoch": 6.903241451782394, + "grad_norm": 1.3130593299865723, + "learning_rate": 0.0002, + "loss": 0.5361, + "step": 42700 + }, + { + "epoch": 6.904858135963139, + "grad_norm": 0.9906072616577148, + "learning_rate": 0.0002, + "loss": 0.5119, + "step": 42710 + }, + { + "epoch": 6.906474820143885, + "grad_norm": 1.094502329826355, + "learning_rate": 0.0002, + "loss": 0.5146, + "step": 42720 + }, + { + "epoch": 6.90809150432463, + "grad_norm": 1.1025426387786865, + "learning_rate": 0.0002, + "loss": 0.5165, + "step": 42730 + }, + { + "epoch": 6.909708188505375, + "grad_norm": 1.0644042491912842, + "learning_rate": 0.0002, + "loss": 0.5463, + "step": 42740 + }, + { + "epoch": 6.911324872686121, + "grad_norm": 1.0709129571914673, + "learning_rate": 0.0002, + "loss": 0.5024, + "step": 42750 + }, + { + "epoch": 6.912941556866866, + "grad_norm": 1.2445871829986572, + "learning_rate": 0.0002, + "loss": 0.5093, + "step": 42760 + }, + { + "epoch": 6.914558241047612, + "grad_norm": 1.020058035850525, + "learning_rate": 0.0002, + "loss": 0.5305, + "step": 42770 + }, + { + "epoch": 6.916174925228357, + "grad_norm": 0.9795091152191162, + "learning_rate": 0.0002, + "loss": 0.5382, + "step": 42780 + }, + { + "epoch": 6.917791609409102, + "grad_norm": 0.9369977116584778, + "learning_rate": 0.0002, + "loss": 0.5429, + "step": 42790 + }, + { + "epoch": 6.919408293589847, + "grad_norm": 1.0741904973983765, + "learning_rate": 0.0002, + "loss": 0.5444, + "step": 42800 + }, + { + "epoch": 6.921024977770593, + "grad_norm": 1.0702799558639526, + "learning_rate": 0.0002, + "loss": 0.5402, + "step": 42810 + }, + { + "epoch": 6.922641661951338, + "grad_norm": 1.0383983850479126, + "learning_rate": 0.0002, + "loss": 0.5291, + "step": 42820 + }, + { + "epoch": 6.924258346132083, + "grad_norm": 1.0761083364486694, + "learning_rate": 0.0002, + "loss": 0.5106, + "step": 42830 + }, + { + "epoch": 6.925875030312828, + "grad_norm": 1.2332350015640259, + "learning_rate": 0.0002, + "loss": 0.5726, + "step": 42840 + }, + { + "epoch": 6.9274917144935735, + "grad_norm": 1.3184348344802856, + "learning_rate": 0.0002, + "loss": 0.4996, + "step": 42850 + }, + { + "epoch": 6.929108398674319, + "grad_norm": 1.0586378574371338, + "learning_rate": 0.0002, + "loss": 0.5503, + "step": 42860 + }, + { + "epoch": 6.930725082855064, + "grad_norm": 1.2294201850891113, + "learning_rate": 0.0002, + "loss": 0.511, + "step": 42870 + }, + { + "epoch": 6.932341767035809, + "grad_norm": 1.3097991943359375, + "learning_rate": 0.0002, + "loss": 0.54, + "step": 42880 + }, + { + "epoch": 6.933958451216554, + "grad_norm": 0.9006873965263367, + "learning_rate": 0.0002, + "loss": 0.5228, + "step": 42890 + }, + { + "epoch": 6.9355751353973005, + "grad_norm": 1.265931248664856, + "learning_rate": 0.0002, + "loss": 0.4617, + "step": 42900 + }, + { + "epoch": 6.937191819578046, + "grad_norm": 1.1013522148132324, + "learning_rate": 0.0002, + "loss": 0.5029, + "step": 42910 + }, + { + "epoch": 6.938808503758791, + "grad_norm": 0.9910131692886353, + "learning_rate": 0.0002, + "loss": 0.5334, + "step": 42920 + }, + { + "epoch": 6.940425187939536, + "grad_norm": 1.102683424949646, + "learning_rate": 0.0002, + "loss": 0.5211, + "step": 42930 + }, + { + "epoch": 6.9420418721202815, + "grad_norm": 1.232961893081665, + "learning_rate": 0.0002, + "loss": 0.5588, + "step": 42940 + }, + { + "epoch": 6.943658556301027, + "grad_norm": 1.1714650392532349, + "learning_rate": 0.0002, + "loss": 0.5357, + "step": 42950 + }, + { + "epoch": 6.945275240481772, + "grad_norm": 1.1684318780899048, + "learning_rate": 0.0002, + "loss": 0.5232, + "step": 42960 + }, + { + "epoch": 6.946891924662517, + "grad_norm": 1.2074716091156006, + "learning_rate": 0.0002, + "loss": 0.5035, + "step": 42970 + }, + { + "epoch": 6.948508608843262, + "grad_norm": 1.2061275243759155, + "learning_rate": 0.0002, + "loss": 0.5111, + "step": 42980 + }, + { + "epoch": 6.950125293024008, + "grad_norm": 1.1216989755630493, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 42990 + }, + { + "epoch": 6.951741977204753, + "grad_norm": 1.304117202758789, + "learning_rate": 0.0002, + "loss": 0.4948, + "step": 43000 + }, + { + "epoch": 6.953358661385498, + "grad_norm": 1.2377972602844238, + "learning_rate": 0.0002, + "loss": 0.5684, + "step": 43010 + }, + { + "epoch": 6.954975345566243, + "grad_norm": 1.2332178354263306, + "learning_rate": 0.0002, + "loss": 0.4792, + "step": 43020 + }, + { + "epoch": 6.956592029746989, + "grad_norm": 1.1919599771499634, + "learning_rate": 0.0002, + "loss": 0.5181, + "step": 43030 + }, + { + "epoch": 6.958208713927734, + "grad_norm": 1.272700548171997, + "learning_rate": 0.0002, + "loss": 0.5352, + "step": 43040 + }, + { + "epoch": 6.95982539810848, + "grad_norm": 1.4377546310424805, + "learning_rate": 0.0002, + "loss": 0.5328, + "step": 43050 + }, + { + "epoch": 6.961442082289225, + "grad_norm": 1.2070353031158447, + "learning_rate": 0.0002, + "loss": 0.4894, + "step": 43060 + }, + { + "epoch": 6.96305876646997, + "grad_norm": 1.090205430984497, + "learning_rate": 0.0002, + "loss": 0.525, + "step": 43070 + }, + { + "epoch": 6.964675450650716, + "grad_norm": 1.1832911968231201, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 43080 + }, + { + "epoch": 6.966292134831461, + "grad_norm": 1.2921082973480225, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 43090 + }, + { + "epoch": 6.967908819012206, + "grad_norm": 1.4303096532821655, + "learning_rate": 0.0002, + "loss": 0.5527, + "step": 43100 + }, + { + "epoch": 6.969525503192951, + "grad_norm": 1.0788004398345947, + "learning_rate": 0.0002, + "loss": 0.4807, + "step": 43110 + }, + { + "epoch": 6.9711421873736965, + "grad_norm": 1.2192047834396362, + "learning_rate": 0.0002, + "loss": 0.5006, + "step": 43120 + }, + { + "epoch": 6.972758871554442, + "grad_norm": 1.0735143423080444, + "learning_rate": 0.0002, + "loss": 0.4714, + "step": 43130 + }, + { + "epoch": 6.974375555735187, + "grad_norm": 1.0317153930664062, + "learning_rate": 0.0002, + "loss": 0.5307, + "step": 43140 + }, + { + "epoch": 6.975992239915932, + "grad_norm": 1.0926798582077026, + "learning_rate": 0.0002, + "loss": 0.5154, + "step": 43150 + }, + { + "epoch": 6.977608924096677, + "grad_norm": 1.1660500764846802, + "learning_rate": 0.0002, + "loss": 0.4976, + "step": 43160 + }, + { + "epoch": 6.979225608277423, + "grad_norm": 1.3945232629776, + "learning_rate": 0.0002, + "loss": 0.5456, + "step": 43170 + }, + { + "epoch": 6.980842292458169, + "grad_norm": 1.2684587240219116, + "learning_rate": 0.0002, + "loss": 0.4979, + "step": 43180 + }, + { + "epoch": 6.982458976638913, + "grad_norm": 1.1574004888534546, + "learning_rate": 0.0002, + "loss": 0.5406, + "step": 43190 + }, + { + "epoch": 6.984075660819659, + "grad_norm": 1.2534198760986328, + "learning_rate": 0.0002, + "loss": 0.5629, + "step": 43200 + }, + { + "epoch": 6.9856923450004045, + "grad_norm": 1.135245442390442, + "learning_rate": 0.0002, + "loss": 0.5191, + "step": 43210 + }, + { + "epoch": 6.98730902918115, + "grad_norm": 1.3824104070663452, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 43220 + }, + { + "epoch": 6.988925713361895, + "grad_norm": 1.2128452062606812, + "learning_rate": 0.0002, + "loss": 0.5294, + "step": 43230 + }, + { + "epoch": 6.99054239754264, + "grad_norm": 1.0795245170593262, + "learning_rate": 0.0002, + "loss": 0.505, + "step": 43240 + }, + { + "epoch": 6.992159081723385, + "grad_norm": 1.337353229522705, + "learning_rate": 0.0002, + "loss": 0.4889, + "step": 43250 + }, + { + "epoch": 6.993775765904131, + "grad_norm": 1.1731765270233154, + "learning_rate": 0.0002, + "loss": 0.4749, + "step": 43260 + }, + { + "epoch": 6.995392450084876, + "grad_norm": 1.0203192234039307, + "learning_rate": 0.0002, + "loss": 0.4897, + "step": 43270 + }, + { + "epoch": 6.997009134265621, + "grad_norm": 0.9261201620101929, + "learning_rate": 0.0002, + "loss": 0.5324, + "step": 43280 + }, + { + "epoch": 6.998625818446366, + "grad_norm": 1.107865810394287, + "learning_rate": 0.0002, + "loss": 0.5227, + "step": 43290 + }, + { + "epoch": 6.9999191657909625, + "eval_loss": 1.2679380178451538, + "eval_runtime": 122.202, + "eval_samples_per_second": 5.998, + "eval_steps_per_second": 0.753, + "step": 43298 + }, + { + "epoch": 7.000242502627112, + "grad_norm": 0.9555306434631348, + "learning_rate": 0.0002, + "loss": 0.4651, + "step": 43300 + }, + { + "epoch": 7.001859186807857, + "grad_norm": 1.3280415534973145, + "learning_rate": 0.0002, + "loss": 0.4301, + "step": 43310 + }, + { + "epoch": 7.003475870988602, + "grad_norm": 1.5583289861679077, + "learning_rate": 0.0002, + "loss": 0.437, + "step": 43320 + }, + { + "epoch": 7.005092555169347, + "grad_norm": 1.0714443922042847, + "learning_rate": 0.0002, + "loss": 0.4532, + "step": 43330 + }, + { + "epoch": 7.006709239350093, + "grad_norm": 1.048075795173645, + "learning_rate": 0.0002, + "loss": 0.4048, + "step": 43340 + }, + { + "epoch": 7.008325923530839, + "grad_norm": 1.1053836345672607, + "learning_rate": 0.0002, + "loss": 0.4119, + "step": 43350 + }, + { + "epoch": 7.009942607711584, + "grad_norm": 0.8911725282669067, + "learning_rate": 0.0002, + "loss": 0.4352, + "step": 43360 + }, + { + "epoch": 7.011559291892329, + "grad_norm": 0.9404396414756775, + "learning_rate": 0.0002, + "loss": 0.4236, + "step": 43370 + }, + { + "epoch": 7.013175976073074, + "grad_norm": 1.152365803718567, + "learning_rate": 0.0002, + "loss": 0.4529, + "step": 43380 + }, + { + "epoch": 7.0147926602538195, + "grad_norm": 1.2118251323699951, + "learning_rate": 0.0002, + "loss": 0.4381, + "step": 43390 + }, + { + "epoch": 7.016409344434565, + "grad_norm": 1.2046295404434204, + "learning_rate": 0.0002, + "loss": 0.4404, + "step": 43400 + }, + { + "epoch": 7.01802602861531, + "grad_norm": 0.929465115070343, + "learning_rate": 0.0002, + "loss": 0.4177, + "step": 43410 + }, + { + "epoch": 7.019642712796055, + "grad_norm": 1.3720149993896484, + "learning_rate": 0.0002, + "loss": 0.4086, + "step": 43420 + }, + { + "epoch": 7.0212593969768005, + "grad_norm": 1.1316810846328735, + "learning_rate": 0.0002, + "loss": 0.4174, + "step": 43430 + }, + { + "epoch": 7.022876081157546, + "grad_norm": 1.0342087745666504, + "learning_rate": 0.0002, + "loss": 0.451, + "step": 43440 + }, + { + "epoch": 7.024492765338291, + "grad_norm": 1.1455655097961426, + "learning_rate": 0.0002, + "loss": 0.4084, + "step": 43450 + }, + { + "epoch": 7.026109449519036, + "grad_norm": 1.1308859586715698, + "learning_rate": 0.0002, + "loss": 0.4168, + "step": 43460 + }, + { + "epoch": 7.027726133699781, + "grad_norm": 1.0796722173690796, + "learning_rate": 0.0002, + "loss": 0.4099, + "step": 43470 + }, + { + "epoch": 7.029342817880527, + "grad_norm": 1.0031877756118774, + "learning_rate": 0.0002, + "loss": 0.4047, + "step": 43480 + }, + { + "epoch": 7.030959502061273, + "grad_norm": 1.2391340732574463, + "learning_rate": 0.0002, + "loss": 0.3968, + "step": 43490 + }, + { + "epoch": 7.032576186242018, + "grad_norm": 1.0807358026504517, + "learning_rate": 0.0002, + "loss": 0.4155, + "step": 43500 + }, + { + "epoch": 7.034192870422763, + "grad_norm": 1.230995535850525, + "learning_rate": 0.0002, + "loss": 0.4322, + "step": 43510 + }, + { + "epoch": 7.035809554603508, + "grad_norm": 1.2200509309768677, + "learning_rate": 0.0002, + "loss": 0.3971, + "step": 43520 + }, + { + "epoch": 7.037426238784254, + "grad_norm": 0.9785236120223999, + "learning_rate": 0.0002, + "loss": 0.4242, + "step": 43530 + }, + { + "epoch": 7.039042922964999, + "grad_norm": 1.0009595155715942, + "learning_rate": 0.0002, + "loss": 0.4173, + "step": 43540 + }, + { + "epoch": 7.040659607145744, + "grad_norm": 0.9783103466033936, + "learning_rate": 0.0002, + "loss": 0.4175, + "step": 43550 + }, + { + "epoch": 7.042276291326489, + "grad_norm": 1.1303530931472778, + "learning_rate": 0.0002, + "loss": 0.4307, + "step": 43560 + }, + { + "epoch": 7.043892975507235, + "grad_norm": 1.1768499612808228, + "learning_rate": 0.0002, + "loss": 0.4066, + "step": 43570 + }, + { + "epoch": 7.04550965968798, + "grad_norm": 1.1040459871292114, + "learning_rate": 0.0002, + "loss": 0.4492, + "step": 43580 + }, + { + "epoch": 7.047126343868725, + "grad_norm": 1.0673959255218506, + "learning_rate": 0.0002, + "loss": 0.4314, + "step": 43590 + }, + { + "epoch": 7.04874302804947, + "grad_norm": 1.1220765113830566, + "learning_rate": 0.0002, + "loss": 0.402, + "step": 43600 + }, + { + "epoch": 7.0503597122302155, + "grad_norm": 1.1746923923492432, + "learning_rate": 0.0002, + "loss": 0.4108, + "step": 43610 + }, + { + "epoch": 7.051976396410961, + "grad_norm": 1.2764517068862915, + "learning_rate": 0.0002, + "loss": 0.4618, + "step": 43620 + }, + { + "epoch": 7.053593080591706, + "grad_norm": 1.1180157661437988, + "learning_rate": 0.0002, + "loss": 0.4243, + "step": 43630 + }, + { + "epoch": 7.055209764772452, + "grad_norm": 1.3558318614959717, + "learning_rate": 0.0002, + "loss": 0.4593, + "step": 43640 + }, + { + "epoch": 7.056826448953197, + "grad_norm": 0.9804982542991638, + "learning_rate": 0.0002, + "loss": 0.4351, + "step": 43650 + }, + { + "epoch": 7.058443133133943, + "grad_norm": 1.106404423713684, + "learning_rate": 0.0002, + "loss": 0.4309, + "step": 43660 + }, + { + "epoch": 7.060059817314688, + "grad_norm": 0.9469243884086609, + "learning_rate": 0.0002, + "loss": 0.4183, + "step": 43670 + }, + { + "epoch": 7.061676501495433, + "grad_norm": 1.272987723350525, + "learning_rate": 0.0002, + "loss": 0.4335, + "step": 43680 + }, + { + "epoch": 7.063293185676178, + "grad_norm": 1.0536233186721802, + "learning_rate": 0.0002, + "loss": 0.4017, + "step": 43690 + }, + { + "epoch": 7.0649098698569235, + "grad_norm": 1.1730698347091675, + "learning_rate": 0.0002, + "loss": 0.4597, + "step": 43700 + }, + { + "epoch": 7.066526554037669, + "grad_norm": 1.150707483291626, + "learning_rate": 0.0002, + "loss": 0.4304, + "step": 43710 + }, + { + "epoch": 7.068143238218414, + "grad_norm": 1.4583828449249268, + "learning_rate": 0.0002, + "loss": 0.4136, + "step": 43720 + }, + { + "epoch": 7.069759922399159, + "grad_norm": 1.569705843925476, + "learning_rate": 0.0002, + "loss": 0.4385, + "step": 43730 + }, + { + "epoch": 7.071376606579904, + "grad_norm": 1.156192660331726, + "learning_rate": 0.0002, + "loss": 0.4051, + "step": 43740 + }, + { + "epoch": 7.07299329076065, + "grad_norm": 1.25005304813385, + "learning_rate": 0.0002, + "loss": 0.4375, + "step": 43750 + }, + { + "epoch": 7.074609974941395, + "grad_norm": 1.0468846559524536, + "learning_rate": 0.0002, + "loss": 0.4096, + "step": 43760 + }, + { + "epoch": 7.07622665912214, + "grad_norm": 1.2045108079910278, + "learning_rate": 0.0002, + "loss": 0.4253, + "step": 43770 + }, + { + "epoch": 7.077843343302886, + "grad_norm": 1.1341021060943604, + "learning_rate": 0.0002, + "loss": 0.4248, + "step": 43780 + }, + { + "epoch": 7.0794600274836315, + "grad_norm": 1.0712201595306396, + "learning_rate": 0.0002, + "loss": 0.394, + "step": 43790 + }, + { + "epoch": 7.081076711664377, + "grad_norm": 1.0421321392059326, + "learning_rate": 0.0002, + "loss": 0.4093, + "step": 43800 + }, + { + "epoch": 7.082693395845122, + "grad_norm": 1.2241183519363403, + "learning_rate": 0.0002, + "loss": 0.4317, + "step": 43810 + }, + { + "epoch": 7.084310080025867, + "grad_norm": 1.0945624113082886, + "learning_rate": 0.0002, + "loss": 0.4064, + "step": 43820 + }, + { + "epoch": 7.085926764206612, + "grad_norm": 1.2772969007492065, + "learning_rate": 0.0002, + "loss": 0.4049, + "step": 43830 + }, + { + "epoch": 7.087543448387358, + "grad_norm": 1.1715508699417114, + "learning_rate": 0.0002, + "loss": 0.4098, + "step": 43840 + }, + { + "epoch": 7.089160132568103, + "grad_norm": 1.1975586414337158, + "learning_rate": 0.0002, + "loss": 0.4756, + "step": 43850 + }, + { + "epoch": 7.090776816748848, + "grad_norm": 1.1673274040222168, + "learning_rate": 0.0002, + "loss": 0.4272, + "step": 43860 + }, + { + "epoch": 7.092393500929593, + "grad_norm": 1.096590518951416, + "learning_rate": 0.0002, + "loss": 0.4435, + "step": 43870 + }, + { + "epoch": 7.0940101851103385, + "grad_norm": 1.0174020528793335, + "learning_rate": 0.0002, + "loss": 0.4329, + "step": 43880 + }, + { + "epoch": 7.095626869291084, + "grad_norm": 1.0147380828857422, + "learning_rate": 0.0002, + "loss": 0.4307, + "step": 43890 + }, + { + "epoch": 7.097243553471829, + "grad_norm": 1.0056098699569702, + "learning_rate": 0.0002, + "loss": 0.4115, + "step": 43900 + }, + { + "epoch": 7.098860237652574, + "grad_norm": 1.4678083658218384, + "learning_rate": 0.0002, + "loss": 0.4181, + "step": 43910 + }, + { + "epoch": 7.1004769218333195, + "grad_norm": 1.3740565776824951, + "learning_rate": 0.0002, + "loss": 0.4404, + "step": 43920 + }, + { + "epoch": 7.102093606014066, + "grad_norm": 1.0279403924942017, + "learning_rate": 0.0002, + "loss": 0.4435, + "step": 43930 + }, + { + "epoch": 7.103710290194811, + "grad_norm": 1.186720371246338, + "learning_rate": 0.0002, + "loss": 0.4247, + "step": 43940 + }, + { + "epoch": 7.105326974375556, + "grad_norm": 1.3767904043197632, + "learning_rate": 0.0002, + "loss": 0.4001, + "step": 43950 + }, + { + "epoch": 7.106943658556301, + "grad_norm": 1.1637471914291382, + "learning_rate": 0.0002, + "loss": 0.4314, + "step": 43960 + }, + { + "epoch": 7.1085603427370465, + "grad_norm": 1.1860042810440063, + "learning_rate": 0.0002, + "loss": 0.3996, + "step": 43970 + }, + { + "epoch": 7.110177026917792, + "grad_norm": 1.080944538116455, + "learning_rate": 0.0002, + "loss": 0.4014, + "step": 43980 + }, + { + "epoch": 7.111793711098537, + "grad_norm": 1.0119353532791138, + "learning_rate": 0.0002, + "loss": 0.4152, + "step": 43990 + }, + { + "epoch": 7.113410395279282, + "grad_norm": 1.179388403892517, + "learning_rate": 0.0002, + "loss": 0.4354, + "step": 44000 + }, + { + "epoch": 7.115027079460027, + "grad_norm": 0.9202800989151001, + "learning_rate": 0.0002, + "loss": 0.4494, + "step": 44010 + }, + { + "epoch": 7.116643763640773, + "grad_norm": 1.142206072807312, + "learning_rate": 0.0002, + "loss": 0.4356, + "step": 44020 + }, + { + "epoch": 7.118260447821518, + "grad_norm": 1.17897367477417, + "learning_rate": 0.0002, + "loss": 0.4197, + "step": 44030 + }, + { + "epoch": 7.119877132002263, + "grad_norm": 1.238087773323059, + "learning_rate": 0.0002, + "loss": 0.4394, + "step": 44040 + }, + { + "epoch": 7.121493816183008, + "grad_norm": 1.5113195180892944, + "learning_rate": 0.0002, + "loss": 0.4844, + "step": 44050 + }, + { + "epoch": 7.123110500363754, + "grad_norm": 1.1819349527359009, + "learning_rate": 0.0002, + "loss": 0.4526, + "step": 44060 + }, + { + "epoch": 7.124727184544499, + "grad_norm": 1.1062556505203247, + "learning_rate": 0.0002, + "loss": 0.4071, + "step": 44070 + }, + { + "epoch": 7.126343868725245, + "grad_norm": 0.986954927444458, + "learning_rate": 0.0002, + "loss": 0.4282, + "step": 44080 + }, + { + "epoch": 7.12796055290599, + "grad_norm": 0.9641291499137878, + "learning_rate": 0.0002, + "loss": 0.4497, + "step": 44090 + }, + { + "epoch": 7.129577237086735, + "grad_norm": 0.9519979953765869, + "learning_rate": 0.0002, + "loss": 0.4348, + "step": 44100 + }, + { + "epoch": 7.131193921267481, + "grad_norm": 1.0477287769317627, + "learning_rate": 0.0002, + "loss": 0.4527, + "step": 44110 + }, + { + "epoch": 7.132810605448226, + "grad_norm": 0.9185389280319214, + "learning_rate": 0.0002, + "loss": 0.4168, + "step": 44120 + }, + { + "epoch": 7.134427289628971, + "grad_norm": 1.0224069356918335, + "learning_rate": 0.0002, + "loss": 0.4255, + "step": 44130 + }, + { + "epoch": 7.136043973809716, + "grad_norm": 1.0762630701065063, + "learning_rate": 0.0002, + "loss": 0.4598, + "step": 44140 + }, + { + "epoch": 7.137660657990462, + "grad_norm": 1.330917477607727, + "learning_rate": 0.0002, + "loss": 0.4308, + "step": 44150 + }, + { + "epoch": 7.139277342171207, + "grad_norm": 1.220115303993225, + "learning_rate": 0.0002, + "loss": 0.4548, + "step": 44160 + }, + { + "epoch": 7.140894026351952, + "grad_norm": 0.9959004521369934, + "learning_rate": 0.0002, + "loss": 0.4089, + "step": 44170 + }, + { + "epoch": 7.142510710532697, + "grad_norm": 1.272449016571045, + "learning_rate": 0.0002, + "loss": 0.4475, + "step": 44180 + }, + { + "epoch": 7.1441273947134425, + "grad_norm": 1.0696483850479126, + "learning_rate": 0.0002, + "loss": 0.4268, + "step": 44190 + }, + { + "epoch": 7.145744078894188, + "grad_norm": 1.347206711769104, + "learning_rate": 0.0002, + "loss": 0.4218, + "step": 44200 + }, + { + "epoch": 7.147360763074933, + "grad_norm": 1.1455401182174683, + "learning_rate": 0.0002, + "loss": 0.4652, + "step": 44210 + }, + { + "epoch": 7.148977447255678, + "grad_norm": 1.1443370580673218, + "learning_rate": 0.0002, + "loss": 0.4186, + "step": 44220 + }, + { + "epoch": 7.150594131436424, + "grad_norm": 1.0239921808242798, + "learning_rate": 0.0002, + "loss": 0.4669, + "step": 44230 + }, + { + "epoch": 7.1522108156171695, + "grad_norm": 1.1596333980560303, + "learning_rate": 0.0002, + "loss": 0.4601, + "step": 44240 + }, + { + "epoch": 7.153827499797915, + "grad_norm": 1.2471510171890259, + "learning_rate": 0.0002, + "loss": 0.44, + "step": 44250 + }, + { + "epoch": 7.15544418397866, + "grad_norm": 1.0713822841644287, + "learning_rate": 0.0002, + "loss": 0.426, + "step": 44260 + }, + { + "epoch": 7.157060868159405, + "grad_norm": 1.3523266315460205, + "learning_rate": 0.0002, + "loss": 0.4381, + "step": 44270 + }, + { + "epoch": 7.1586775523401505, + "grad_norm": 1.1620066165924072, + "learning_rate": 0.0002, + "loss": 0.4101, + "step": 44280 + }, + { + "epoch": 7.160294236520896, + "grad_norm": 1.2988619804382324, + "learning_rate": 0.0002, + "loss": 0.4195, + "step": 44290 + }, + { + "epoch": 7.161910920701641, + "grad_norm": 1.2527822256088257, + "learning_rate": 0.0002, + "loss": 0.4405, + "step": 44300 + }, + { + "epoch": 7.163527604882386, + "grad_norm": 1.2322553396224976, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 44310 + }, + { + "epoch": 7.165144289063131, + "grad_norm": 1.0497055053710938, + "learning_rate": 0.0002, + "loss": 0.4274, + "step": 44320 + }, + { + "epoch": 7.166760973243877, + "grad_norm": 1.1928341388702393, + "learning_rate": 0.0002, + "loss": 0.4236, + "step": 44330 + }, + { + "epoch": 7.168377657424622, + "grad_norm": 1.0016584396362305, + "learning_rate": 0.0002, + "loss": 0.4511, + "step": 44340 + }, + { + "epoch": 7.169994341605367, + "grad_norm": 1.0385509729385376, + "learning_rate": 0.0002, + "loss": 0.4566, + "step": 44350 + }, + { + "epoch": 7.171611025786112, + "grad_norm": 1.3217328786849976, + "learning_rate": 0.0002, + "loss": 0.4178, + "step": 44360 + }, + { + "epoch": 7.1732277099668575, + "grad_norm": 1.240696668624878, + "learning_rate": 0.0002, + "loss": 0.425, + "step": 44370 + }, + { + "epoch": 7.174844394147604, + "grad_norm": 1.1037760972976685, + "learning_rate": 0.0002, + "loss": 0.4572, + "step": 44380 + }, + { + "epoch": 7.176461078328349, + "grad_norm": 1.062762975692749, + "learning_rate": 0.0002, + "loss": 0.4525, + "step": 44390 + }, + { + "epoch": 7.178077762509094, + "grad_norm": 1.2859047651290894, + "learning_rate": 0.0002, + "loss": 0.4766, + "step": 44400 + }, + { + "epoch": 7.179694446689839, + "grad_norm": 1.1852408647537231, + "learning_rate": 0.0002, + "loss": 0.4511, + "step": 44410 + }, + { + "epoch": 7.181311130870585, + "grad_norm": 1.315587043762207, + "learning_rate": 0.0002, + "loss": 0.4386, + "step": 44420 + }, + { + "epoch": 7.18292781505133, + "grad_norm": 0.889542281627655, + "learning_rate": 0.0002, + "loss": 0.4491, + "step": 44430 + }, + { + "epoch": 7.184544499232075, + "grad_norm": 1.0123721361160278, + "learning_rate": 0.0002, + "loss": 0.4328, + "step": 44440 + }, + { + "epoch": 7.18616118341282, + "grad_norm": 1.0503462553024292, + "learning_rate": 0.0002, + "loss": 0.4096, + "step": 44450 + }, + { + "epoch": 7.1877778675935655, + "grad_norm": 1.338188886642456, + "learning_rate": 0.0002, + "loss": 0.44, + "step": 44460 + }, + { + "epoch": 7.189394551774311, + "grad_norm": 1.206543207168579, + "learning_rate": 0.0002, + "loss": 0.4451, + "step": 44470 + }, + { + "epoch": 7.191011235955056, + "grad_norm": 1.2013356685638428, + "learning_rate": 0.0002, + "loss": 0.4415, + "step": 44480 + }, + { + "epoch": 7.192627920135801, + "grad_norm": 1.1124168634414673, + "learning_rate": 0.0002, + "loss": 0.4291, + "step": 44490 + }, + { + "epoch": 7.194244604316546, + "grad_norm": 1.199379324913025, + "learning_rate": 0.0002, + "loss": 0.4182, + "step": 44500 + }, + { + "epoch": 7.195861288497292, + "grad_norm": 1.196746587753296, + "learning_rate": 0.0002, + "loss": 0.4525, + "step": 44510 + }, + { + "epoch": 7.197477972678037, + "grad_norm": 0.9684673547744751, + "learning_rate": 0.0002, + "loss": 0.4876, + "step": 44520 + }, + { + "epoch": 7.199094656858783, + "grad_norm": 1.5727651119232178, + "learning_rate": 0.0002, + "loss": 0.4403, + "step": 44530 + }, + { + "epoch": 7.200711341039528, + "grad_norm": 0.8371674418449402, + "learning_rate": 0.0002, + "loss": 0.4424, + "step": 44540 + }, + { + "epoch": 7.2023280252202735, + "grad_norm": 1.0343716144561768, + "learning_rate": 0.0002, + "loss": 0.4366, + "step": 44550 + }, + { + "epoch": 7.203944709401019, + "grad_norm": 1.1839478015899658, + "learning_rate": 0.0002, + "loss": 0.4557, + "step": 44560 + }, + { + "epoch": 7.205561393581764, + "grad_norm": 0.9466627836227417, + "learning_rate": 0.0002, + "loss": 0.4293, + "step": 44570 + }, + { + "epoch": 7.207178077762509, + "grad_norm": 1.1452360153198242, + "learning_rate": 0.0002, + "loss": 0.4651, + "step": 44580 + }, + { + "epoch": 7.208794761943254, + "grad_norm": 1.63698410987854, + "learning_rate": 0.0002, + "loss": 0.5037, + "step": 44590 + }, + { + "epoch": 7.210411446124, + "grad_norm": 1.1124789714813232, + "learning_rate": 0.0002, + "loss": 0.4212, + "step": 44600 + }, + { + "epoch": 7.212028130304745, + "grad_norm": 1.4233685731887817, + "learning_rate": 0.0002, + "loss": 0.4323, + "step": 44610 + }, + { + "epoch": 7.21364481448549, + "grad_norm": 1.302145004272461, + "learning_rate": 0.0002, + "loss": 0.4176, + "step": 44620 + }, + { + "epoch": 7.215261498666235, + "grad_norm": 1.2115466594696045, + "learning_rate": 0.0002, + "loss": 0.457, + "step": 44630 + }, + { + "epoch": 7.216878182846981, + "grad_norm": 1.0771325826644897, + "learning_rate": 0.0002, + "loss": 0.4419, + "step": 44640 + }, + { + "epoch": 7.218494867027726, + "grad_norm": 1.1603602170944214, + "learning_rate": 0.0002, + "loss": 0.4183, + "step": 44650 + }, + { + "epoch": 7.220111551208471, + "grad_norm": 1.4013969898223877, + "learning_rate": 0.0002, + "loss": 0.468, + "step": 44660 + }, + { + "epoch": 7.221728235389216, + "grad_norm": 1.2145777940750122, + "learning_rate": 0.0002, + "loss": 0.4867, + "step": 44670 + }, + { + "epoch": 7.223344919569962, + "grad_norm": 1.2084238529205322, + "learning_rate": 0.0002, + "loss": 0.4436, + "step": 44680 + }, + { + "epoch": 7.224961603750708, + "grad_norm": 1.1801965236663818, + "learning_rate": 0.0002, + "loss": 0.4423, + "step": 44690 + }, + { + "epoch": 7.226578287931453, + "grad_norm": 0.9561195969581604, + "learning_rate": 0.0002, + "loss": 0.426, + "step": 44700 + }, + { + "epoch": 7.228194972112198, + "grad_norm": 1.1857006549835205, + "learning_rate": 0.0002, + "loss": 0.4895, + "step": 44710 + }, + { + "epoch": 7.229811656292943, + "grad_norm": 1.1576673984527588, + "learning_rate": 0.0002, + "loss": 0.4382, + "step": 44720 + }, + { + "epoch": 7.2314283404736885, + "grad_norm": 1.3517892360687256, + "learning_rate": 0.0002, + "loss": 0.4717, + "step": 44730 + }, + { + "epoch": 7.233045024654434, + "grad_norm": 1.1489306688308716, + "learning_rate": 0.0002, + "loss": 0.4495, + "step": 44740 + }, + { + "epoch": 7.234661708835179, + "grad_norm": 1.0758644342422485, + "learning_rate": 0.0002, + "loss": 0.438, + "step": 44750 + }, + { + "epoch": 7.236278393015924, + "grad_norm": 1.1679041385650635, + "learning_rate": 0.0002, + "loss": 0.4431, + "step": 44760 + }, + { + "epoch": 7.2378950771966695, + "grad_norm": 1.1404961347579956, + "learning_rate": 0.0002, + "loss": 0.504, + "step": 44770 + }, + { + "epoch": 7.239511761377415, + "grad_norm": 1.2602572441101074, + "learning_rate": 0.0002, + "loss": 0.4499, + "step": 44780 + }, + { + "epoch": 7.24112844555816, + "grad_norm": 1.2912664413452148, + "learning_rate": 0.0002, + "loss": 0.4669, + "step": 44790 + }, + { + "epoch": 7.242745129738905, + "grad_norm": 1.340198278427124, + "learning_rate": 0.0002, + "loss": 0.4336, + "step": 44800 + }, + { + "epoch": 7.24436181391965, + "grad_norm": 1.0613332986831665, + "learning_rate": 0.0002, + "loss": 0.4336, + "step": 44810 + }, + { + "epoch": 7.2459784981003965, + "grad_norm": 1.1658564805984497, + "learning_rate": 0.0002, + "loss": 0.4433, + "step": 44820 + }, + { + "epoch": 7.247595182281142, + "grad_norm": 1.046440839767456, + "learning_rate": 0.0002, + "loss": 0.4532, + "step": 44830 + }, + { + "epoch": 7.249211866461887, + "grad_norm": 1.2335407733917236, + "learning_rate": 0.0002, + "loss": 0.4332, + "step": 44840 + }, + { + "epoch": 7.250828550642632, + "grad_norm": 1.3742769956588745, + "learning_rate": 0.0002, + "loss": 0.455, + "step": 44850 + }, + { + "epoch": 7.252445234823377, + "grad_norm": 1.1744071245193481, + "learning_rate": 0.0002, + "loss": 0.4297, + "step": 44860 + }, + { + "epoch": 7.254061919004123, + "grad_norm": 1.4268226623535156, + "learning_rate": 0.0002, + "loss": 0.4348, + "step": 44870 + }, + { + "epoch": 7.255678603184868, + "grad_norm": 1.1255686283111572, + "learning_rate": 0.0002, + "loss": 0.4485, + "step": 44880 + }, + { + "epoch": 7.257295287365613, + "grad_norm": 1.255053162574768, + "learning_rate": 0.0002, + "loss": 0.4264, + "step": 44890 + }, + { + "epoch": 7.258911971546358, + "grad_norm": 1.4957616329193115, + "learning_rate": 0.0002, + "loss": 0.455, + "step": 44900 + }, + { + "epoch": 7.260528655727104, + "grad_norm": 1.0546756982803345, + "learning_rate": 0.0002, + "loss": 0.4465, + "step": 44910 + }, + { + "epoch": 7.262145339907849, + "grad_norm": 1.4683036804199219, + "learning_rate": 0.0002, + "loss": 0.4802, + "step": 44920 + }, + { + "epoch": 7.263762024088594, + "grad_norm": 1.2027722597122192, + "learning_rate": 0.0002, + "loss": 0.4175, + "step": 44930 + }, + { + "epoch": 7.265378708269339, + "grad_norm": 1.277767539024353, + "learning_rate": 0.0002, + "loss": 0.4316, + "step": 44940 + }, + { + "epoch": 7.2669953924500845, + "grad_norm": 1.4894379377365112, + "learning_rate": 0.0002, + "loss": 0.4749, + "step": 44950 + }, + { + "epoch": 7.26861207663083, + "grad_norm": 1.0998231172561646, + "learning_rate": 0.0002, + "loss": 0.46, + "step": 44960 + }, + { + "epoch": 7.270228760811576, + "grad_norm": 1.3713536262512207, + "learning_rate": 0.0002, + "loss": 0.4414, + "step": 44970 + }, + { + "epoch": 7.271845444992321, + "grad_norm": 1.473396897315979, + "learning_rate": 0.0002, + "loss": 0.4431, + "step": 44980 + }, + { + "epoch": 7.273462129173066, + "grad_norm": 1.0893826484680176, + "learning_rate": 0.0002, + "loss": 0.4582, + "step": 44990 + }, + { + "epoch": 7.275078813353812, + "grad_norm": 1.4798463582992554, + "learning_rate": 0.0002, + "loss": 0.4297, + "step": 45000 + }, + { + "epoch": 7.276695497534557, + "grad_norm": 1.0536930561065674, + "learning_rate": 0.0002, + "loss": 0.4604, + "step": 45010 + }, + { + "epoch": 7.278312181715302, + "grad_norm": 1.064450740814209, + "learning_rate": 0.0002, + "loss": 0.4396, + "step": 45020 + }, + { + "epoch": 7.279928865896047, + "grad_norm": 1.3605865240097046, + "learning_rate": 0.0002, + "loss": 0.4445, + "step": 45030 + }, + { + "epoch": 7.2815455500767925, + "grad_norm": 1.1779286861419678, + "learning_rate": 0.0002, + "loss": 0.4123, + "step": 45040 + }, + { + "epoch": 7.283162234257538, + "grad_norm": 1.1568892002105713, + "learning_rate": 0.0002, + "loss": 0.4588, + "step": 45050 + }, + { + "epoch": 7.284778918438283, + "grad_norm": 1.0677175521850586, + "learning_rate": 0.0002, + "loss": 0.4557, + "step": 45060 + }, + { + "epoch": 7.286395602619028, + "grad_norm": 1.1939430236816406, + "learning_rate": 0.0002, + "loss": 0.4765, + "step": 45070 + }, + { + "epoch": 7.288012286799773, + "grad_norm": 1.0273144245147705, + "learning_rate": 0.0002, + "loss": 0.443, + "step": 45080 + }, + { + "epoch": 7.289628970980519, + "grad_norm": 1.358487844467163, + "learning_rate": 0.0002, + "loss": 0.4472, + "step": 45090 + }, + { + "epoch": 7.291245655161264, + "grad_norm": 1.2139160633087158, + "learning_rate": 0.0002, + "loss": 0.4207, + "step": 45100 + }, + { + "epoch": 7.29286233934201, + "grad_norm": 1.2484227418899536, + "learning_rate": 0.0002, + "loss": 0.4221, + "step": 45110 + }, + { + "epoch": 7.294479023522755, + "grad_norm": 1.2373738288879395, + "learning_rate": 0.0002, + "loss": 0.4351, + "step": 45120 + }, + { + "epoch": 7.2960957077035005, + "grad_norm": 1.3877158164978027, + "learning_rate": 0.0002, + "loss": 0.472, + "step": 45130 + }, + { + "epoch": 7.297712391884246, + "grad_norm": 1.1372028589248657, + "learning_rate": 0.0002, + "loss": 0.4741, + "step": 45140 + }, + { + "epoch": 7.299329076064991, + "grad_norm": 1.259987711906433, + "learning_rate": 0.0002, + "loss": 0.4465, + "step": 45150 + }, + { + "epoch": 7.300945760245736, + "grad_norm": 1.6501492261886597, + "learning_rate": 0.0002, + "loss": 0.4795, + "step": 45160 + }, + { + "epoch": 7.302562444426481, + "grad_norm": 1.5927983522415161, + "learning_rate": 0.0002, + "loss": 0.4441, + "step": 45170 + }, + { + "epoch": 7.304179128607227, + "grad_norm": 0.957084596157074, + "learning_rate": 0.0002, + "loss": 0.4513, + "step": 45180 + }, + { + "epoch": 7.305795812787972, + "grad_norm": 1.7777647972106934, + "learning_rate": 0.0002, + "loss": 0.4367, + "step": 45190 + }, + { + "epoch": 7.307412496968717, + "grad_norm": 1.1905052661895752, + "learning_rate": 0.0002, + "loss": 0.4365, + "step": 45200 + }, + { + "epoch": 7.309029181149462, + "grad_norm": 1.0944236516952515, + "learning_rate": 0.0002, + "loss": 0.4354, + "step": 45210 + }, + { + "epoch": 7.3106458653302075, + "grad_norm": 1.171034336090088, + "learning_rate": 0.0002, + "loss": 0.4558, + "step": 45220 + }, + { + "epoch": 7.312262549510953, + "grad_norm": 1.421743392944336, + "learning_rate": 0.0002, + "loss": 0.4518, + "step": 45230 + }, + { + "epoch": 7.313879233691698, + "grad_norm": 1.1282994747161865, + "learning_rate": 0.0002, + "loss": 0.4713, + "step": 45240 + }, + { + "epoch": 7.315495917872443, + "grad_norm": 1.0742822885513306, + "learning_rate": 0.0002, + "loss": 0.471, + "step": 45250 + }, + { + "epoch": 7.317112602053189, + "grad_norm": 1.2697997093200684, + "learning_rate": 0.0002, + "loss": 0.4887, + "step": 45260 + }, + { + "epoch": 7.318729286233935, + "grad_norm": 1.2066359519958496, + "learning_rate": 0.0002, + "loss": 0.461, + "step": 45270 + }, + { + "epoch": 7.32034597041468, + "grad_norm": 1.0044163465499878, + "learning_rate": 0.0002, + "loss": 0.463, + "step": 45280 + }, + { + "epoch": 7.321962654595425, + "grad_norm": 1.2365968227386475, + "learning_rate": 0.0002, + "loss": 0.4394, + "step": 45290 + }, + { + "epoch": 7.32357933877617, + "grad_norm": 1.0731542110443115, + "learning_rate": 0.0002, + "loss": 0.4305, + "step": 45300 + }, + { + "epoch": 7.3251960229569155, + "grad_norm": 1.1595830917358398, + "learning_rate": 0.0002, + "loss": 0.4744, + "step": 45310 + }, + { + "epoch": 7.326812707137661, + "grad_norm": 1.3445849418640137, + "learning_rate": 0.0002, + "loss": 0.4393, + "step": 45320 + }, + { + "epoch": 7.328429391318406, + "grad_norm": 1.3067926168441772, + "learning_rate": 0.0002, + "loss": 0.4288, + "step": 45330 + }, + { + "epoch": 7.330046075499151, + "grad_norm": 1.200667381286621, + "learning_rate": 0.0002, + "loss": 0.4569, + "step": 45340 + }, + { + "epoch": 7.3316627596798964, + "grad_norm": 0.9936319589614868, + "learning_rate": 0.0002, + "loss": 0.4449, + "step": 45350 + }, + { + "epoch": 7.333279443860642, + "grad_norm": 1.1291998624801636, + "learning_rate": 0.0002, + "loss": 0.4481, + "step": 45360 + }, + { + "epoch": 7.334896128041387, + "grad_norm": 1.3663034439086914, + "learning_rate": 0.0002, + "loss": 0.4643, + "step": 45370 + }, + { + "epoch": 7.336512812222132, + "grad_norm": 1.0762227773666382, + "learning_rate": 0.0002, + "loss": 0.4548, + "step": 45380 + }, + { + "epoch": 7.338129496402877, + "grad_norm": 0.9525768160820007, + "learning_rate": 0.0002, + "loss": 0.4495, + "step": 45390 + }, + { + "epoch": 7.339746180583623, + "grad_norm": 1.1143709421157837, + "learning_rate": 0.0002, + "loss": 0.472, + "step": 45400 + }, + { + "epoch": 7.341362864764369, + "grad_norm": 1.0711175203323364, + "learning_rate": 0.0002, + "loss": 0.4432, + "step": 45410 + }, + { + "epoch": 7.342979548945114, + "grad_norm": 1.2650856971740723, + "learning_rate": 0.0002, + "loss": 0.4603, + "step": 45420 + }, + { + "epoch": 7.344596233125859, + "grad_norm": 1.194861888885498, + "learning_rate": 0.0002, + "loss": 0.5021, + "step": 45430 + }, + { + "epoch": 7.346212917306604, + "grad_norm": 1.4936751127243042, + "learning_rate": 0.0002, + "loss": 0.467, + "step": 45440 + }, + { + "epoch": 7.34782960148735, + "grad_norm": 1.2938975095748901, + "learning_rate": 0.0002, + "loss": 0.4798, + "step": 45450 + }, + { + "epoch": 7.349446285668095, + "grad_norm": 1.2841941118240356, + "learning_rate": 0.0002, + "loss": 0.4589, + "step": 45460 + }, + { + "epoch": 7.35106296984884, + "grad_norm": 1.5376560688018799, + "learning_rate": 0.0002, + "loss": 0.4398, + "step": 45470 + }, + { + "epoch": 7.352679654029585, + "grad_norm": 1.1307156085968018, + "learning_rate": 0.0002, + "loss": 0.4583, + "step": 45480 + }, + { + "epoch": 7.354296338210331, + "grad_norm": 1.4883167743682861, + "learning_rate": 0.0002, + "loss": 0.4678, + "step": 45490 + }, + { + "epoch": 7.355913022391076, + "grad_norm": 1.0547393560409546, + "learning_rate": 0.0002, + "loss": 0.4966, + "step": 45500 + }, + { + "epoch": 7.357529706571821, + "grad_norm": 1.5476845502853394, + "learning_rate": 0.0002, + "loss": 0.4601, + "step": 45510 + }, + { + "epoch": 7.359146390752566, + "grad_norm": 1.1916698217391968, + "learning_rate": 0.0002, + "loss": 0.4466, + "step": 45520 + }, + { + "epoch": 7.3607630749333115, + "grad_norm": 1.238319754600525, + "learning_rate": 0.0002, + "loss": 0.4791, + "step": 45530 + }, + { + "epoch": 7.362379759114057, + "grad_norm": 1.4216728210449219, + "learning_rate": 0.0002, + "loss": 0.4818, + "step": 45540 + }, + { + "epoch": 7.363996443294802, + "grad_norm": 1.303995132446289, + "learning_rate": 0.0002, + "loss": 0.4828, + "step": 45550 + }, + { + "epoch": 7.365613127475548, + "grad_norm": 1.2453089952468872, + "learning_rate": 0.0002, + "loss": 0.464, + "step": 45560 + }, + { + "epoch": 7.367229811656293, + "grad_norm": 1.1971137523651123, + "learning_rate": 0.0002, + "loss": 0.4735, + "step": 45570 + }, + { + "epoch": 7.3688464958370385, + "grad_norm": 1.0801963806152344, + "learning_rate": 0.0002, + "loss": 0.4415, + "step": 45580 + }, + { + "epoch": 7.370463180017784, + "grad_norm": 1.1602367162704468, + "learning_rate": 0.0002, + "loss": 0.4946, + "step": 45590 + }, + { + "epoch": 7.372079864198529, + "grad_norm": 1.1623423099517822, + "learning_rate": 0.0002, + "loss": 0.45, + "step": 45600 + }, + { + "epoch": 7.373696548379274, + "grad_norm": 1.108467936515808, + "learning_rate": 0.0002, + "loss": 0.4648, + "step": 45610 + }, + { + "epoch": 7.3753132325600195, + "grad_norm": 1.087322473526001, + "learning_rate": 0.0002, + "loss": 0.4566, + "step": 45620 + }, + { + "epoch": 7.376929916740765, + "grad_norm": 1.0945587158203125, + "learning_rate": 0.0002, + "loss": 0.4505, + "step": 45630 + }, + { + "epoch": 7.37854660092151, + "grad_norm": 1.6565983295440674, + "learning_rate": 0.0002, + "loss": 0.4864, + "step": 45640 + }, + { + "epoch": 7.380163285102255, + "grad_norm": 1.1279444694519043, + "learning_rate": 0.0002, + "loss": 0.4491, + "step": 45650 + }, + { + "epoch": 7.381779969283, + "grad_norm": 1.0888527631759644, + "learning_rate": 0.0002, + "loss": 0.4606, + "step": 45660 + }, + { + "epoch": 7.383396653463746, + "grad_norm": 1.1114956140518188, + "learning_rate": 0.0002, + "loss": 0.429, + "step": 45670 + }, + { + "epoch": 7.385013337644491, + "grad_norm": 1.195497751235962, + "learning_rate": 0.0002, + "loss": 0.4726, + "step": 45680 + }, + { + "epoch": 7.386630021825236, + "grad_norm": 1.3111436367034912, + "learning_rate": 0.0002, + "loss": 0.4643, + "step": 45690 + }, + { + "epoch": 7.388246706005981, + "grad_norm": 1.22647225856781, + "learning_rate": 0.0002, + "loss": 0.4777, + "step": 45700 + }, + { + "epoch": 7.389863390186727, + "grad_norm": 0.9309225678443909, + "learning_rate": 0.0002, + "loss": 0.4877, + "step": 45710 + }, + { + "epoch": 7.391480074367473, + "grad_norm": 1.198773741722107, + "learning_rate": 0.0002, + "loss": 0.4789, + "step": 45720 + }, + { + "epoch": 7.393096758548218, + "grad_norm": 1.2208130359649658, + "learning_rate": 0.0002, + "loss": 0.4496, + "step": 45730 + }, + { + "epoch": 7.394713442728963, + "grad_norm": 1.0756449699401855, + "learning_rate": 0.0002, + "loss": 0.4614, + "step": 45740 + }, + { + "epoch": 7.396330126909708, + "grad_norm": 1.0117692947387695, + "learning_rate": 0.0002, + "loss": 0.4469, + "step": 45750 + }, + { + "epoch": 7.397946811090454, + "grad_norm": 1.1144468784332275, + "learning_rate": 0.0002, + "loss": 0.4217, + "step": 45760 + }, + { + "epoch": 7.399563495271199, + "grad_norm": 1.140549898147583, + "learning_rate": 0.0002, + "loss": 0.4737, + "step": 45770 + }, + { + "epoch": 7.401180179451944, + "grad_norm": 1.2335172891616821, + "learning_rate": 0.0002, + "loss": 0.453, + "step": 45780 + }, + { + "epoch": 7.402796863632689, + "grad_norm": 1.296393632888794, + "learning_rate": 0.0002, + "loss": 0.4501, + "step": 45790 + }, + { + "epoch": 7.4044135478134345, + "grad_norm": 1.2551302909851074, + "learning_rate": 0.0002, + "loss": 0.4716, + "step": 45800 + }, + { + "epoch": 7.40603023199418, + "grad_norm": 1.1909204721450806, + "learning_rate": 0.0002, + "loss": 0.451, + "step": 45810 + }, + { + "epoch": 7.407646916174925, + "grad_norm": 1.17038893699646, + "learning_rate": 0.0002, + "loss": 0.4296, + "step": 45820 + }, + { + "epoch": 7.40926360035567, + "grad_norm": 1.0033377408981323, + "learning_rate": 0.0002, + "loss": 0.4574, + "step": 45830 + }, + { + "epoch": 7.4108802845364155, + "grad_norm": 1.2957805395126343, + "learning_rate": 0.0002, + "loss": 0.4413, + "step": 45840 + }, + { + "epoch": 7.412496968717161, + "grad_norm": 1.347462773323059, + "learning_rate": 0.0002, + "loss": 0.4597, + "step": 45850 + }, + { + "epoch": 7.414113652897907, + "grad_norm": 1.3187026977539062, + "learning_rate": 0.0002, + "loss": 0.4499, + "step": 45860 + }, + { + "epoch": 7.415730337078652, + "grad_norm": 1.092236876487732, + "learning_rate": 0.0002, + "loss": 0.4624, + "step": 45870 + }, + { + "epoch": 7.417347021259397, + "grad_norm": 1.075634241104126, + "learning_rate": 0.0002, + "loss": 0.4636, + "step": 45880 + }, + { + "epoch": 7.4189637054401425, + "grad_norm": 1.0200046300888062, + "learning_rate": 0.0002, + "loss": 0.4465, + "step": 45890 + }, + { + "epoch": 7.420580389620888, + "grad_norm": 1.1419479846954346, + "learning_rate": 0.0002, + "loss": 0.47, + "step": 45900 + }, + { + "epoch": 7.422197073801633, + "grad_norm": 1.0798102617263794, + "learning_rate": 0.0002, + "loss": 0.4409, + "step": 45910 + }, + { + "epoch": 7.423813757982378, + "grad_norm": 0.9999571442604065, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 45920 + }, + { + "epoch": 7.425430442163123, + "grad_norm": 1.2220723628997803, + "learning_rate": 0.0002, + "loss": 0.4714, + "step": 45930 + }, + { + "epoch": 7.427047126343869, + "grad_norm": 1.1209388971328735, + "learning_rate": 0.0002, + "loss": 0.4844, + "step": 45940 + }, + { + "epoch": 7.428663810524614, + "grad_norm": 1.1198307275772095, + "learning_rate": 0.0002, + "loss": 0.4534, + "step": 45950 + }, + { + "epoch": 7.430280494705359, + "grad_norm": 1.0170516967773438, + "learning_rate": 0.0002, + "loss": 0.4486, + "step": 45960 + }, + { + "epoch": 7.431897178886104, + "grad_norm": 1.2963446378707886, + "learning_rate": 0.0002, + "loss": 0.488, + "step": 45970 + }, + { + "epoch": 7.43351386306685, + "grad_norm": 1.4202494621276855, + "learning_rate": 0.0002, + "loss": 0.4346, + "step": 45980 + }, + { + "epoch": 7.435130547247595, + "grad_norm": 1.066774845123291, + "learning_rate": 0.0002, + "loss": 0.4917, + "step": 45990 + }, + { + "epoch": 7.43674723142834, + "grad_norm": 1.2760428190231323, + "learning_rate": 0.0002, + "loss": 0.4897, + "step": 46000 + }, + { + "epoch": 7.438363915609086, + "grad_norm": 1.530720829963684, + "learning_rate": 0.0002, + "loss": 0.4562, + "step": 46010 + }, + { + "epoch": 7.439980599789831, + "grad_norm": 1.1914178133010864, + "learning_rate": 0.0002, + "loss": 0.4691, + "step": 46020 + }, + { + "epoch": 7.441597283970577, + "grad_norm": 1.466650128364563, + "learning_rate": 0.0002, + "loss": 0.5038, + "step": 46030 + }, + { + "epoch": 7.443213968151322, + "grad_norm": 1.1567928791046143, + "learning_rate": 0.0002, + "loss": 0.4673, + "step": 46040 + }, + { + "epoch": 7.444830652332067, + "grad_norm": 1.252336025238037, + "learning_rate": 0.0002, + "loss": 0.4778, + "step": 46050 + }, + { + "epoch": 7.446447336512812, + "grad_norm": 1.2095589637756348, + "learning_rate": 0.0002, + "loss": 0.4493, + "step": 46060 + }, + { + "epoch": 7.4480640206935576, + "grad_norm": 1.4075263738632202, + "learning_rate": 0.0002, + "loss": 0.4407, + "step": 46070 + }, + { + "epoch": 7.449680704874303, + "grad_norm": 1.2527226209640503, + "learning_rate": 0.0002, + "loss": 0.4328, + "step": 46080 + }, + { + "epoch": 7.451297389055048, + "grad_norm": 1.3044105768203735, + "learning_rate": 0.0002, + "loss": 0.4922, + "step": 46090 + }, + { + "epoch": 7.452914073235793, + "grad_norm": 1.2888941764831543, + "learning_rate": 0.0002, + "loss": 0.4465, + "step": 46100 + }, + { + "epoch": 7.4545307574165385, + "grad_norm": 1.3148317337036133, + "learning_rate": 0.0002, + "loss": 0.4414, + "step": 46110 + }, + { + "epoch": 7.456147441597284, + "grad_norm": 0.9526162147521973, + "learning_rate": 0.0002, + "loss": 0.4431, + "step": 46120 + }, + { + "epoch": 7.457764125778029, + "grad_norm": 1.2618519067764282, + "learning_rate": 0.0002, + "loss": 0.4422, + "step": 46130 + }, + { + "epoch": 7.459380809958774, + "grad_norm": 1.0392966270446777, + "learning_rate": 0.0002, + "loss": 0.4745, + "step": 46140 + }, + { + "epoch": 7.460997494139519, + "grad_norm": 1.3286794424057007, + "learning_rate": 0.0002, + "loss": 0.4589, + "step": 46150 + }, + { + "epoch": 7.4626141783202655, + "grad_norm": 1.2377561330795288, + "learning_rate": 0.0002, + "loss": 0.4762, + "step": 46160 + }, + { + "epoch": 7.464230862501011, + "grad_norm": 1.034134030342102, + "learning_rate": 0.0002, + "loss": 0.4119, + "step": 46170 + }, + { + "epoch": 7.465847546681756, + "grad_norm": 1.1719683408737183, + "learning_rate": 0.0002, + "loss": 0.4487, + "step": 46180 + }, + { + "epoch": 7.467464230862501, + "grad_norm": 1.182691216468811, + "learning_rate": 0.0002, + "loss": 0.4423, + "step": 46190 + }, + { + "epoch": 7.4690809150432464, + "grad_norm": 1.1898412704467773, + "learning_rate": 0.0002, + "loss": 0.4341, + "step": 46200 + }, + { + "epoch": 7.470697599223992, + "grad_norm": 1.0543978214263916, + "learning_rate": 0.0002, + "loss": 0.4753, + "step": 46210 + }, + { + "epoch": 7.472314283404737, + "grad_norm": 1.176971673965454, + "learning_rate": 0.0002, + "loss": 0.4673, + "step": 46220 + }, + { + "epoch": 7.473930967585482, + "grad_norm": 1.129456639289856, + "learning_rate": 0.0002, + "loss": 0.4598, + "step": 46230 + }, + { + "epoch": 7.475547651766227, + "grad_norm": 1.1782855987548828, + "learning_rate": 0.0002, + "loss": 0.4805, + "step": 46240 + }, + { + "epoch": 7.477164335946973, + "grad_norm": 1.1678800582885742, + "learning_rate": 0.0002, + "loss": 0.4979, + "step": 46250 + }, + { + "epoch": 7.478781020127718, + "grad_norm": 0.9768722653388977, + "learning_rate": 0.0002, + "loss": 0.4374, + "step": 46260 + }, + { + "epoch": 7.480397704308463, + "grad_norm": 1.3222670555114746, + "learning_rate": 0.0002, + "loss": 0.4683, + "step": 46270 + }, + { + "epoch": 7.482014388489208, + "grad_norm": 1.0573948621749878, + "learning_rate": 0.0002, + "loss": 0.459, + "step": 46280 + }, + { + "epoch": 7.4836310726699535, + "grad_norm": 1.3233898878097534, + "learning_rate": 0.0002, + "loss": 0.5019, + "step": 46290 + }, + { + "epoch": 7.485247756850699, + "grad_norm": 0.9695420265197754, + "learning_rate": 0.0002, + "loss": 0.4689, + "step": 46300 + }, + { + "epoch": 7.486864441031445, + "grad_norm": 1.2072020769119263, + "learning_rate": 0.0002, + "loss": 0.471, + "step": 46310 + }, + { + "epoch": 7.48848112521219, + "grad_norm": 1.2161253690719604, + "learning_rate": 0.0002, + "loss": 0.486, + "step": 46320 + }, + { + "epoch": 7.490097809392935, + "grad_norm": 1.185958743095398, + "learning_rate": 0.0002, + "loss": 0.4581, + "step": 46330 + }, + { + "epoch": 7.491714493573681, + "grad_norm": 1.3741549253463745, + "learning_rate": 0.0002, + "loss": 0.4617, + "step": 46340 + }, + { + "epoch": 7.493331177754426, + "grad_norm": 1.0586212873458862, + "learning_rate": 0.0002, + "loss": 0.4772, + "step": 46350 + }, + { + "epoch": 7.494947861935171, + "grad_norm": 1.2000513076782227, + "learning_rate": 0.0002, + "loss": 0.4644, + "step": 46360 + }, + { + "epoch": 7.496564546115916, + "grad_norm": 1.3326879739761353, + "learning_rate": 0.0002, + "loss": 0.4584, + "step": 46370 + }, + { + "epoch": 7.4981812302966615, + "grad_norm": 1.3452857732772827, + "learning_rate": 0.0002, + "loss": 0.4741, + "step": 46380 + }, + { + "epoch": 7.499797914477407, + "grad_norm": 1.2885284423828125, + "learning_rate": 0.0002, + "loss": 0.4747, + "step": 46390 + }, + { + "epoch": 7.501414598658152, + "grad_norm": 1.097342610359192, + "learning_rate": 0.0002, + "loss": 0.4648, + "step": 46400 + }, + { + "epoch": 7.503031282838897, + "grad_norm": 1.2342469692230225, + "learning_rate": 0.0002, + "loss": 0.4714, + "step": 46410 + }, + { + "epoch": 7.504647967019642, + "grad_norm": 1.0151721239089966, + "learning_rate": 0.0002, + "loss": 0.4421, + "step": 46420 + }, + { + "epoch": 7.506264651200388, + "grad_norm": 1.2487123012542725, + "learning_rate": 0.0002, + "loss": 0.4347, + "step": 46430 + }, + { + "epoch": 7.507881335381134, + "grad_norm": 0.9319046139717102, + "learning_rate": 0.0002, + "loss": 0.4768, + "step": 46440 + }, + { + "epoch": 7.509498019561878, + "grad_norm": 1.1362226009368896, + "learning_rate": 0.0002, + "loss": 0.4693, + "step": 46450 + }, + { + "epoch": 7.511114703742624, + "grad_norm": 1.2883973121643066, + "learning_rate": 0.0002, + "loss": 0.5007, + "step": 46460 + }, + { + "epoch": 7.5127313879233695, + "grad_norm": 1.0892037153244019, + "learning_rate": 0.0002, + "loss": 0.4455, + "step": 46470 + }, + { + "epoch": 7.514348072104115, + "grad_norm": 1.1870533227920532, + "learning_rate": 0.0002, + "loss": 0.4721, + "step": 46480 + }, + { + "epoch": 7.51596475628486, + "grad_norm": 1.2103877067565918, + "learning_rate": 0.0002, + "loss": 0.4824, + "step": 46490 + }, + { + "epoch": 7.517581440465605, + "grad_norm": 1.0980644226074219, + "learning_rate": 0.0002, + "loss": 0.4573, + "step": 46500 + }, + { + "epoch": 7.51919812464635, + "grad_norm": 1.4729726314544678, + "learning_rate": 0.0002, + "loss": 0.4759, + "step": 46510 + }, + { + "epoch": 7.520814808827096, + "grad_norm": 1.1808913946151733, + "learning_rate": 0.0002, + "loss": 0.4413, + "step": 46520 + }, + { + "epoch": 7.522431493007841, + "grad_norm": 1.2347747087478638, + "learning_rate": 0.0002, + "loss": 0.4278, + "step": 46530 + }, + { + "epoch": 7.524048177188586, + "grad_norm": 1.5921525955200195, + "learning_rate": 0.0002, + "loss": 0.4745, + "step": 46540 + }, + { + "epoch": 7.525664861369331, + "grad_norm": 1.1328861713409424, + "learning_rate": 0.0002, + "loss": 0.4525, + "step": 46550 + }, + { + "epoch": 7.527281545550077, + "grad_norm": 1.289947748184204, + "learning_rate": 0.0002, + "loss": 0.4771, + "step": 46560 + }, + { + "epoch": 7.528898229730822, + "grad_norm": 1.0198370218276978, + "learning_rate": 0.0002, + "loss": 0.4711, + "step": 46570 + }, + { + "epoch": 7.530514913911567, + "grad_norm": 1.3007137775421143, + "learning_rate": 0.0002, + "loss": 0.504, + "step": 46580 + }, + { + "epoch": 7.532131598092313, + "grad_norm": 1.2864280939102173, + "learning_rate": 0.0002, + "loss": 0.4496, + "step": 46590 + }, + { + "epoch": 7.5337482822730575, + "grad_norm": 1.1005513668060303, + "learning_rate": 0.0002, + "loss": 0.463, + "step": 46600 + }, + { + "epoch": 7.535364966453804, + "grad_norm": 0.9998318552970886, + "learning_rate": 0.0002, + "loss": 0.4426, + "step": 46610 + }, + { + "epoch": 7.536981650634549, + "grad_norm": 1.2042466402053833, + "learning_rate": 0.0002, + "loss": 0.4762, + "step": 46620 + }, + { + "epoch": 7.538598334815294, + "grad_norm": 1.3240692615509033, + "learning_rate": 0.0002, + "loss": 0.4685, + "step": 46630 + }, + { + "epoch": 7.540215018996039, + "grad_norm": 1.2145483493804932, + "learning_rate": 0.0002, + "loss": 0.4608, + "step": 46640 + }, + { + "epoch": 7.5418317031767845, + "grad_norm": 1.169691801071167, + "learning_rate": 0.0002, + "loss": 0.4608, + "step": 46650 + }, + { + "epoch": 7.54344838735753, + "grad_norm": 1.194045901298523, + "learning_rate": 0.0002, + "loss": 0.4527, + "step": 46660 + }, + { + "epoch": 7.545065071538275, + "grad_norm": 1.0481327772140503, + "learning_rate": 0.0002, + "loss": 0.4599, + "step": 46670 + }, + { + "epoch": 7.54668175571902, + "grad_norm": 1.0714460611343384, + "learning_rate": 0.0002, + "loss": 0.4729, + "step": 46680 + }, + { + "epoch": 7.5482984398997655, + "grad_norm": 1.1811443567276, + "learning_rate": 0.0002, + "loss": 0.4703, + "step": 46690 + }, + { + "epoch": 7.549915124080511, + "grad_norm": 1.2794281244277954, + "learning_rate": 0.0002, + "loss": 0.4628, + "step": 46700 + }, + { + "epoch": 7.551531808261256, + "grad_norm": 1.001287817955017, + "learning_rate": 0.0002, + "loss": 0.4659, + "step": 46710 + }, + { + "epoch": 7.553148492442001, + "grad_norm": 1.3598867654800415, + "learning_rate": 0.0002, + "loss": 0.4938, + "step": 46720 + }, + { + "epoch": 7.554765176622746, + "grad_norm": 1.206254482269287, + "learning_rate": 0.0002, + "loss": 0.4731, + "step": 46730 + }, + { + "epoch": 7.5563818608034925, + "grad_norm": 1.1095832586288452, + "learning_rate": 0.0002, + "loss": 0.4581, + "step": 46740 + }, + { + "epoch": 7.557998544984237, + "grad_norm": 1.3912206888198853, + "learning_rate": 0.0002, + "loss": 0.4625, + "step": 46750 + }, + { + "epoch": 7.559615229164983, + "grad_norm": 0.9883413314819336, + "learning_rate": 0.0002, + "loss": 0.4464, + "step": 46760 + }, + { + "epoch": 7.561231913345728, + "grad_norm": 1.0965087413787842, + "learning_rate": 0.0002, + "loss": 0.4535, + "step": 46770 + }, + { + "epoch": 7.562848597526473, + "grad_norm": 1.092261552810669, + "learning_rate": 0.0002, + "loss": 0.469, + "step": 46780 + }, + { + "epoch": 7.564465281707219, + "grad_norm": 1.0443673133850098, + "learning_rate": 0.0002, + "loss": 0.488, + "step": 46790 + }, + { + "epoch": 7.566081965887964, + "grad_norm": 1.2420614957809448, + "learning_rate": 0.0002, + "loss": 0.4875, + "step": 46800 + }, + { + "epoch": 7.567698650068709, + "grad_norm": 1.0510783195495605, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 46810 + }, + { + "epoch": 7.569315334249454, + "grad_norm": 1.0291800498962402, + "learning_rate": 0.0002, + "loss": 0.4541, + "step": 46820 + }, + { + "epoch": 7.5709320184302, + "grad_norm": 1.1784595251083374, + "learning_rate": 0.0002, + "loss": 0.4591, + "step": 46830 + }, + { + "epoch": 7.572548702610945, + "grad_norm": 1.0424436330795288, + "learning_rate": 0.0002, + "loss": 0.5154, + "step": 46840 + }, + { + "epoch": 7.57416538679169, + "grad_norm": 1.182131290435791, + "learning_rate": 0.0002, + "loss": 0.4612, + "step": 46850 + }, + { + "epoch": 7.575782070972435, + "grad_norm": 0.9917051792144775, + "learning_rate": 0.0002, + "loss": 0.446, + "step": 46860 + }, + { + "epoch": 7.5773987551531805, + "grad_norm": 1.1616078615188599, + "learning_rate": 0.0002, + "loss": 0.4428, + "step": 46870 + }, + { + "epoch": 7.579015439333926, + "grad_norm": 1.401071548461914, + "learning_rate": 0.0002, + "loss": 0.4769, + "step": 46880 + }, + { + "epoch": 7.580632123514672, + "grad_norm": 0.874487578868866, + "learning_rate": 0.0002, + "loss": 0.4635, + "step": 46890 + }, + { + "epoch": 7.582248807695416, + "grad_norm": 1.2511193752288818, + "learning_rate": 0.0002, + "loss": 0.4641, + "step": 46900 + }, + { + "epoch": 7.583865491876162, + "grad_norm": 1.7548277378082275, + "learning_rate": 0.0002, + "loss": 0.4715, + "step": 46910 + }, + { + "epoch": 7.5854821760569076, + "grad_norm": 1.349366545677185, + "learning_rate": 0.0002, + "loss": 0.4681, + "step": 46920 + }, + { + "epoch": 7.587098860237653, + "grad_norm": 1.0609583854675293, + "learning_rate": 0.0002, + "loss": 0.4819, + "step": 46930 + }, + { + "epoch": 7.588715544418398, + "grad_norm": 1.031512975692749, + "learning_rate": 0.0002, + "loss": 0.4498, + "step": 46940 + }, + { + "epoch": 7.590332228599143, + "grad_norm": 1.1440242528915405, + "learning_rate": 0.0002, + "loss": 0.4688, + "step": 46950 + }, + { + "epoch": 7.5919489127798885, + "grad_norm": 1.2762987613677979, + "learning_rate": 0.0002, + "loss": 0.4568, + "step": 46960 + }, + { + "epoch": 7.593565596960634, + "grad_norm": 1.167269229888916, + "learning_rate": 0.0002, + "loss": 0.4569, + "step": 46970 + }, + { + "epoch": 7.595182281141379, + "grad_norm": 1.131127953529358, + "learning_rate": 0.0002, + "loss": 0.461, + "step": 46980 + }, + { + "epoch": 7.596798965322124, + "grad_norm": 1.4527075290679932, + "learning_rate": 0.0002, + "loss": 0.4666, + "step": 46990 + }, + { + "epoch": 7.598415649502869, + "grad_norm": 1.330132007598877, + "learning_rate": 0.0002, + "loss": 0.4973, + "step": 47000 + }, + { + "epoch": 7.600032333683615, + "grad_norm": 1.4223501682281494, + "learning_rate": 0.0002, + "loss": 0.4969, + "step": 47010 + }, + { + "epoch": 7.60164901786436, + "grad_norm": 1.2045072317123413, + "learning_rate": 0.0002, + "loss": 0.4572, + "step": 47020 + }, + { + "epoch": 7.603265702045105, + "grad_norm": 1.1549896001815796, + "learning_rate": 0.0002, + "loss": 0.4666, + "step": 47030 + }, + { + "epoch": 7.604882386225851, + "grad_norm": 1.2221543788909912, + "learning_rate": 0.0002, + "loss": 0.4383, + "step": 47040 + }, + { + "epoch": 7.6064990704065965, + "grad_norm": 1.1171326637268066, + "learning_rate": 0.0002, + "loss": 0.4826, + "step": 47050 + }, + { + "epoch": 7.608115754587342, + "grad_norm": 1.073671817779541, + "learning_rate": 0.0002, + "loss": 0.4465, + "step": 47060 + }, + { + "epoch": 7.609732438768087, + "grad_norm": 1.2524123191833496, + "learning_rate": 0.0002, + "loss": 0.4623, + "step": 47070 + }, + { + "epoch": 7.611349122948832, + "grad_norm": 1.2015056610107422, + "learning_rate": 0.0002, + "loss": 0.4538, + "step": 47080 + }, + { + "epoch": 7.612965807129577, + "grad_norm": 1.2454534769058228, + "learning_rate": 0.0002, + "loss": 0.4871, + "step": 47090 + }, + { + "epoch": 7.614582491310323, + "grad_norm": 0.9815779328346252, + "learning_rate": 0.0002, + "loss": 0.5064, + "step": 47100 + }, + { + "epoch": 7.616199175491068, + "grad_norm": 1.1437602043151855, + "learning_rate": 0.0002, + "loss": 0.4841, + "step": 47110 + }, + { + "epoch": 7.617815859671813, + "grad_norm": 1.1004078388214111, + "learning_rate": 0.0002, + "loss": 0.453, + "step": 47120 + }, + { + "epoch": 7.619432543852558, + "grad_norm": 1.069453477859497, + "learning_rate": 0.0002, + "loss": 0.4552, + "step": 47130 + }, + { + "epoch": 7.6210492280333035, + "grad_norm": 1.1434191465377808, + "learning_rate": 0.0002, + "loss": 0.4627, + "step": 47140 + }, + { + "epoch": 7.622665912214049, + "grad_norm": 1.216845989227295, + "learning_rate": 0.0002, + "loss": 0.4882, + "step": 47150 + }, + { + "epoch": 7.624282596394794, + "grad_norm": 1.2302134037017822, + "learning_rate": 0.0002, + "loss": 0.481, + "step": 47160 + }, + { + "epoch": 7.625899280575539, + "grad_norm": 1.4284924268722534, + "learning_rate": 0.0002, + "loss": 0.4806, + "step": 47170 + }, + { + "epoch": 7.6275159647562845, + "grad_norm": 1.3359615802764893, + "learning_rate": 0.0002, + "loss": 0.4458, + "step": 47180 + }, + { + "epoch": 7.629132648937031, + "grad_norm": 1.0242379903793335, + "learning_rate": 0.0002, + "loss": 0.4842, + "step": 47190 + }, + { + "epoch": 7.630749333117776, + "grad_norm": 1.249513030052185, + "learning_rate": 0.0002, + "loss": 0.5137, + "step": 47200 + }, + { + "epoch": 7.632366017298521, + "grad_norm": 1.0881463289260864, + "learning_rate": 0.0002, + "loss": 0.4462, + "step": 47210 + }, + { + "epoch": 7.633982701479266, + "grad_norm": 1.2903773784637451, + "learning_rate": 0.0002, + "loss": 0.4864, + "step": 47220 + }, + { + "epoch": 7.6355993856600115, + "grad_norm": 1.1671710014343262, + "learning_rate": 0.0002, + "loss": 0.4729, + "step": 47230 + }, + { + "epoch": 7.637216069840757, + "grad_norm": 1.1960735321044922, + "learning_rate": 0.0002, + "loss": 0.4936, + "step": 47240 + }, + { + "epoch": 7.638832754021502, + "grad_norm": 1.2692298889160156, + "learning_rate": 0.0002, + "loss": 0.4884, + "step": 47250 + }, + { + "epoch": 7.640449438202247, + "grad_norm": 0.9812195301055908, + "learning_rate": 0.0002, + "loss": 0.423, + "step": 47260 + }, + { + "epoch": 7.642066122382992, + "grad_norm": 1.3986053466796875, + "learning_rate": 0.0002, + "loss": 0.4737, + "step": 47270 + }, + { + "epoch": 7.643682806563738, + "grad_norm": 1.2692067623138428, + "learning_rate": 0.0002, + "loss": 0.4834, + "step": 47280 + }, + { + "epoch": 7.645299490744483, + "grad_norm": 1.1185054779052734, + "learning_rate": 0.0002, + "loss": 0.4893, + "step": 47290 + }, + { + "epoch": 7.646916174925228, + "grad_norm": 1.2837327718734741, + "learning_rate": 0.0002, + "loss": 0.4828, + "step": 47300 + }, + { + "epoch": 7.648532859105973, + "grad_norm": 1.8518418073654175, + "learning_rate": 0.0002, + "loss": 0.4891, + "step": 47310 + }, + { + "epoch": 7.650149543286719, + "grad_norm": 0.9781302213668823, + "learning_rate": 0.0002, + "loss": 0.4626, + "step": 47320 + }, + { + "epoch": 7.651766227467464, + "grad_norm": 1.0777910947799683, + "learning_rate": 0.0002, + "loss": 0.501, + "step": 47330 + }, + { + "epoch": 7.65338291164821, + "grad_norm": 1.2031499147415161, + "learning_rate": 0.0002, + "loss": 0.4927, + "step": 47340 + }, + { + "epoch": 7.654999595828955, + "grad_norm": 1.14322829246521, + "learning_rate": 0.0002, + "loss": 0.439, + "step": 47350 + }, + { + "epoch": 7.6566162800097, + "grad_norm": 1.3211992979049683, + "learning_rate": 0.0002, + "loss": 0.4481, + "step": 47360 + }, + { + "epoch": 7.658232964190446, + "grad_norm": 1.3632899522781372, + "learning_rate": 0.0002, + "loss": 0.4462, + "step": 47370 + }, + { + "epoch": 7.659849648371191, + "grad_norm": 1.2593929767608643, + "learning_rate": 0.0002, + "loss": 0.4934, + "step": 47380 + }, + { + "epoch": 7.661466332551936, + "grad_norm": 1.442670464515686, + "learning_rate": 0.0002, + "loss": 0.4645, + "step": 47390 + }, + { + "epoch": 7.663083016732681, + "grad_norm": 1.2304763793945312, + "learning_rate": 0.0002, + "loss": 0.4584, + "step": 47400 + }, + { + "epoch": 7.664699700913427, + "grad_norm": 1.0182652473449707, + "learning_rate": 0.0002, + "loss": 0.464, + "step": 47410 + }, + { + "epoch": 7.666316385094172, + "grad_norm": 1.365441083908081, + "learning_rate": 0.0002, + "loss": 0.457, + "step": 47420 + }, + { + "epoch": 7.667933069274917, + "grad_norm": 1.1578556299209595, + "learning_rate": 0.0002, + "loss": 0.4787, + "step": 47430 + }, + { + "epoch": 7.669549753455662, + "grad_norm": 1.0346194505691528, + "learning_rate": 0.0002, + "loss": 0.486, + "step": 47440 + }, + { + "epoch": 7.6711664376364075, + "grad_norm": 1.2567378282546997, + "learning_rate": 0.0002, + "loss": 0.4703, + "step": 47450 + }, + { + "epoch": 7.672783121817153, + "grad_norm": 1.1669118404388428, + "learning_rate": 0.0002, + "loss": 0.4853, + "step": 47460 + }, + { + "epoch": 7.674399805997898, + "grad_norm": 1.0174756050109863, + "learning_rate": 0.0002, + "loss": 0.4869, + "step": 47470 + }, + { + "epoch": 7.676016490178643, + "grad_norm": 1.0962231159210205, + "learning_rate": 0.0002, + "loss": 0.4601, + "step": 47480 + }, + { + "epoch": 7.677633174359389, + "grad_norm": 1.1098674535751343, + "learning_rate": 0.0002, + "loss": 0.4866, + "step": 47490 + }, + { + "epoch": 7.6792498585401345, + "grad_norm": 1.1441160440444946, + "learning_rate": 0.0002, + "loss": 0.4682, + "step": 47500 + }, + { + "epoch": 7.68086654272088, + "grad_norm": 1.0473432540893555, + "learning_rate": 0.0002, + "loss": 0.4552, + "step": 47510 + }, + { + "epoch": 7.682483226901625, + "grad_norm": 1.2954738140106201, + "learning_rate": 0.0002, + "loss": 0.4771, + "step": 47520 + }, + { + "epoch": 7.68409991108237, + "grad_norm": 1.2931294441223145, + "learning_rate": 0.0002, + "loss": 0.5012, + "step": 47530 + }, + { + "epoch": 7.6857165952631155, + "grad_norm": 1.4005156755447388, + "learning_rate": 0.0002, + "loss": 0.4808, + "step": 47540 + }, + { + "epoch": 7.687333279443861, + "grad_norm": 1.0998929738998413, + "learning_rate": 0.0002, + "loss": 0.4847, + "step": 47550 + }, + { + "epoch": 7.688949963624606, + "grad_norm": 1.3478347063064575, + "learning_rate": 0.0002, + "loss": 0.4839, + "step": 47560 + }, + { + "epoch": 7.690566647805351, + "grad_norm": 1.2991969585418701, + "learning_rate": 0.0002, + "loss": 0.4918, + "step": 47570 + }, + { + "epoch": 7.692183331986096, + "grad_norm": 1.0892608165740967, + "learning_rate": 0.0002, + "loss": 0.4673, + "step": 47580 + }, + { + "epoch": 7.693800016166842, + "grad_norm": 1.2230998277664185, + "learning_rate": 0.0002, + "loss": 0.4937, + "step": 47590 + }, + { + "epoch": 7.695416700347587, + "grad_norm": 1.2635555267333984, + "learning_rate": 0.0002, + "loss": 0.5222, + "step": 47600 + }, + { + "epoch": 7.697033384528332, + "grad_norm": 1.1720705032348633, + "learning_rate": 0.0002, + "loss": 0.4883, + "step": 47610 + }, + { + "epoch": 7.698650068709077, + "grad_norm": 1.1134333610534668, + "learning_rate": 0.0002, + "loss": 0.452, + "step": 47620 + }, + { + "epoch": 7.7002667528898225, + "grad_norm": 1.2643009424209595, + "learning_rate": 0.0002, + "loss": 0.4859, + "step": 47630 + }, + { + "epoch": 7.701883437070569, + "grad_norm": 1.1145045757293701, + "learning_rate": 0.0002, + "loss": 0.4825, + "step": 47640 + }, + { + "epoch": 7.703500121251314, + "grad_norm": 1.1808549165725708, + "learning_rate": 0.0002, + "loss": 0.4735, + "step": 47650 + }, + { + "epoch": 7.705116805432059, + "grad_norm": 1.2996630668640137, + "learning_rate": 0.0002, + "loss": 0.4841, + "step": 47660 + }, + { + "epoch": 7.706733489612804, + "grad_norm": 1.2786413431167603, + "learning_rate": 0.0002, + "loss": 0.4712, + "step": 47670 + }, + { + "epoch": 7.70835017379355, + "grad_norm": 1.3245121240615845, + "learning_rate": 0.0002, + "loss": 0.4694, + "step": 47680 + }, + { + "epoch": 7.709966857974295, + "grad_norm": 1.4168202877044678, + "learning_rate": 0.0002, + "loss": 0.4467, + "step": 47690 + }, + { + "epoch": 7.71158354215504, + "grad_norm": 1.0354000329971313, + "learning_rate": 0.0002, + "loss": 0.5143, + "step": 47700 + }, + { + "epoch": 7.713200226335785, + "grad_norm": 0.9630362391471863, + "learning_rate": 0.0002, + "loss": 0.4703, + "step": 47710 + }, + { + "epoch": 7.7148169105165305, + "grad_norm": 1.1045806407928467, + "learning_rate": 0.0002, + "loss": 0.4996, + "step": 47720 + }, + { + "epoch": 7.716433594697276, + "grad_norm": 1.2403767108917236, + "learning_rate": 0.0002, + "loss": 0.4756, + "step": 47730 + }, + { + "epoch": 7.718050278878021, + "grad_norm": 0.9893410801887512, + "learning_rate": 0.0002, + "loss": 0.4658, + "step": 47740 + }, + { + "epoch": 7.719666963058766, + "grad_norm": 1.0749315023422241, + "learning_rate": 0.0002, + "loss": 0.4463, + "step": 47750 + }, + { + "epoch": 7.721283647239511, + "grad_norm": 1.2851510047912598, + "learning_rate": 0.0002, + "loss": 0.467, + "step": 47760 + }, + { + "epoch": 7.722900331420257, + "grad_norm": 1.2964261770248413, + "learning_rate": 0.0002, + "loss": 0.489, + "step": 47770 + }, + { + "epoch": 7.724517015601002, + "grad_norm": 1.0603861808776855, + "learning_rate": 0.0002, + "loss": 0.4702, + "step": 47780 + }, + { + "epoch": 7.726133699781748, + "grad_norm": 1.2728440761566162, + "learning_rate": 0.0002, + "loss": 0.478, + "step": 47790 + }, + { + "epoch": 7.727750383962493, + "grad_norm": 1.20509934425354, + "learning_rate": 0.0002, + "loss": 0.4746, + "step": 47800 + }, + { + "epoch": 7.7293670681432385, + "grad_norm": 1.397595763206482, + "learning_rate": 0.0002, + "loss": 0.4556, + "step": 47810 + }, + { + "epoch": 7.730983752323984, + "grad_norm": 1.2595560550689697, + "learning_rate": 0.0002, + "loss": 0.4736, + "step": 47820 + }, + { + "epoch": 7.732600436504729, + "grad_norm": 1.166074514389038, + "learning_rate": 0.0002, + "loss": 0.5061, + "step": 47830 + }, + { + "epoch": 7.734217120685474, + "grad_norm": 1.258192777633667, + "learning_rate": 0.0002, + "loss": 0.4907, + "step": 47840 + }, + { + "epoch": 7.735833804866219, + "grad_norm": 1.0394890308380127, + "learning_rate": 0.0002, + "loss": 0.5256, + "step": 47850 + }, + { + "epoch": 7.737450489046965, + "grad_norm": 1.2017768621444702, + "learning_rate": 0.0002, + "loss": 0.4863, + "step": 47860 + }, + { + "epoch": 7.73906717322771, + "grad_norm": 1.1070265769958496, + "learning_rate": 0.0002, + "loss": 0.4784, + "step": 47870 + }, + { + "epoch": 7.740683857408455, + "grad_norm": 1.0544345378875732, + "learning_rate": 0.0002, + "loss": 0.4616, + "step": 47880 + }, + { + "epoch": 7.7423005415892, + "grad_norm": 1.0194088220596313, + "learning_rate": 0.0002, + "loss": 0.4519, + "step": 47890 + }, + { + "epoch": 7.743917225769946, + "grad_norm": 1.3095234632492065, + "learning_rate": 0.0002, + "loss": 0.4758, + "step": 47900 + }, + { + "epoch": 7.745533909950691, + "grad_norm": 1.0579626560211182, + "learning_rate": 0.0002, + "loss": 0.4646, + "step": 47910 + }, + { + "epoch": 7.747150594131437, + "grad_norm": 1.012990951538086, + "learning_rate": 0.0002, + "loss": 0.4532, + "step": 47920 + }, + { + "epoch": 7.748767278312181, + "grad_norm": 1.485148549079895, + "learning_rate": 0.0002, + "loss": 0.4775, + "step": 47930 + }, + { + "epoch": 7.750383962492927, + "grad_norm": 1.3595696687698364, + "learning_rate": 0.0002, + "loss": 0.4892, + "step": 47940 + }, + { + "epoch": 7.752000646673673, + "grad_norm": 0.9945753216743469, + "learning_rate": 0.0002, + "loss": 0.4609, + "step": 47950 + }, + { + "epoch": 7.753617330854418, + "grad_norm": 1.2098956108093262, + "learning_rate": 0.0002, + "loss": 0.5138, + "step": 47960 + }, + { + "epoch": 7.755234015035163, + "grad_norm": 1.3056198358535767, + "learning_rate": 0.0002, + "loss": 0.4815, + "step": 47970 + }, + { + "epoch": 7.756850699215908, + "grad_norm": 1.2247772216796875, + "learning_rate": 0.0002, + "loss": 0.4761, + "step": 47980 + }, + { + "epoch": 7.7584673833966535, + "grad_norm": 1.397642970085144, + "learning_rate": 0.0002, + "loss": 0.5023, + "step": 47990 + }, + { + "epoch": 7.760084067577399, + "grad_norm": 1.2565888166427612, + "learning_rate": 0.0002, + "loss": 0.4901, + "step": 48000 + }, + { + "epoch": 7.761700751758144, + "grad_norm": 1.0065099000930786, + "learning_rate": 0.0002, + "loss": 0.469, + "step": 48010 + }, + { + "epoch": 7.763317435938889, + "grad_norm": 1.1466305255889893, + "learning_rate": 0.0002, + "loss": 0.4886, + "step": 48020 + }, + { + "epoch": 7.7649341201196345, + "grad_norm": 1.4492419958114624, + "learning_rate": 0.0002, + "loss": 0.4898, + "step": 48030 + }, + { + "epoch": 7.76655080430038, + "grad_norm": 1.0945932865142822, + "learning_rate": 0.0002, + "loss": 0.489, + "step": 48040 + }, + { + "epoch": 7.768167488481125, + "grad_norm": 1.1938602924346924, + "learning_rate": 0.0002, + "loss": 0.4968, + "step": 48050 + }, + { + "epoch": 7.76978417266187, + "grad_norm": 1.168890357017517, + "learning_rate": 0.0002, + "loss": 0.4497, + "step": 48060 + }, + { + "epoch": 7.771400856842616, + "grad_norm": 1.3134305477142334, + "learning_rate": 0.0002, + "loss": 0.4881, + "step": 48070 + }, + { + "epoch": 7.773017541023361, + "grad_norm": 1.044438123703003, + "learning_rate": 0.0002, + "loss": 0.5226, + "step": 48080 + }, + { + "epoch": 7.774634225204107, + "grad_norm": 1.1275628805160522, + "learning_rate": 0.0002, + "loss": 0.4497, + "step": 48090 + }, + { + "epoch": 7.776250909384852, + "grad_norm": 1.0877318382263184, + "learning_rate": 0.0002, + "loss": 0.5063, + "step": 48100 + }, + { + "epoch": 7.777867593565597, + "grad_norm": 1.4800893068313599, + "learning_rate": 0.0002, + "loss": 0.4795, + "step": 48110 + }, + { + "epoch": 7.779484277746342, + "grad_norm": 1.1495977640151978, + "learning_rate": 0.0002, + "loss": 0.4984, + "step": 48120 + }, + { + "epoch": 7.781100961927088, + "grad_norm": 1.2175556421279907, + "learning_rate": 0.0002, + "loss": 0.4662, + "step": 48130 + }, + { + "epoch": 7.782717646107833, + "grad_norm": 1.150556206703186, + "learning_rate": 0.0002, + "loss": 0.4935, + "step": 48140 + }, + { + "epoch": 7.784334330288578, + "grad_norm": 1.051145315170288, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 48150 + }, + { + "epoch": 7.785951014469323, + "grad_norm": 1.2842742204666138, + "learning_rate": 0.0002, + "loss": 0.4611, + "step": 48160 + }, + { + "epoch": 7.787567698650069, + "grad_norm": 1.2251030206680298, + "learning_rate": 0.0002, + "loss": 0.4589, + "step": 48170 + }, + { + "epoch": 7.789184382830814, + "grad_norm": 1.2809321880340576, + "learning_rate": 0.0002, + "loss": 0.4905, + "step": 48180 + }, + { + "epoch": 7.790801067011559, + "grad_norm": 1.005690336227417, + "learning_rate": 0.0002, + "loss": 0.4569, + "step": 48190 + }, + { + "epoch": 7.792417751192304, + "grad_norm": 1.325501561164856, + "learning_rate": 0.0002, + "loss": 0.4862, + "step": 48200 + }, + { + "epoch": 7.7940344353730495, + "grad_norm": 1.4551857709884644, + "learning_rate": 0.0002, + "loss": 0.4384, + "step": 48210 + }, + { + "epoch": 7.795651119553796, + "grad_norm": 1.3399626016616821, + "learning_rate": 0.0002, + "loss": 0.4696, + "step": 48220 + }, + { + "epoch": 7.79726780373454, + "grad_norm": 1.0379714965820312, + "learning_rate": 0.0002, + "loss": 0.4654, + "step": 48230 + }, + { + "epoch": 7.798884487915286, + "grad_norm": 0.9725802540779114, + "learning_rate": 0.0002, + "loss": 0.4915, + "step": 48240 + }, + { + "epoch": 7.800501172096031, + "grad_norm": 1.0202224254608154, + "learning_rate": 0.0002, + "loss": 0.4583, + "step": 48250 + }, + { + "epoch": 7.802117856276777, + "grad_norm": 0.9477742910385132, + "learning_rate": 0.0002, + "loss": 0.4792, + "step": 48260 + }, + { + "epoch": 7.803734540457522, + "grad_norm": 1.2726924419403076, + "learning_rate": 0.0002, + "loss": 0.4836, + "step": 48270 + }, + { + "epoch": 7.805351224638267, + "grad_norm": 1.453190565109253, + "learning_rate": 0.0002, + "loss": 0.494, + "step": 48280 + }, + { + "epoch": 7.806967908819012, + "grad_norm": 1.2806978225708008, + "learning_rate": 0.0002, + "loss": 0.4559, + "step": 48290 + }, + { + "epoch": 7.8085845929997575, + "grad_norm": 1.0897129774093628, + "learning_rate": 0.0002, + "loss": 0.4867, + "step": 48300 + }, + { + "epoch": 7.810201277180503, + "grad_norm": 1.381636381149292, + "learning_rate": 0.0002, + "loss": 0.4939, + "step": 48310 + }, + { + "epoch": 7.811817961361248, + "grad_norm": 0.9954851269721985, + "learning_rate": 0.0002, + "loss": 0.4797, + "step": 48320 + }, + { + "epoch": 7.813434645541993, + "grad_norm": 1.1756198406219482, + "learning_rate": 0.0002, + "loss": 0.4995, + "step": 48330 + }, + { + "epoch": 7.815051329722738, + "grad_norm": 1.2087817192077637, + "learning_rate": 0.0002, + "loss": 0.4904, + "step": 48340 + }, + { + "epoch": 7.816668013903484, + "grad_norm": 1.3075505495071411, + "learning_rate": 0.0002, + "loss": 0.4935, + "step": 48350 + }, + { + "epoch": 7.818284698084229, + "grad_norm": 1.1872076988220215, + "learning_rate": 0.0002, + "loss": 0.486, + "step": 48360 + }, + { + "epoch": 7.819901382264975, + "grad_norm": 1.2134783267974854, + "learning_rate": 0.0002, + "loss": 0.538, + "step": 48370 + }, + { + "epoch": 7.821518066445719, + "grad_norm": 1.28566312789917, + "learning_rate": 0.0002, + "loss": 0.4759, + "step": 48380 + }, + { + "epoch": 7.8231347506264655, + "grad_norm": 1.0578798055648804, + "learning_rate": 0.0002, + "loss": 0.4962, + "step": 48390 + }, + { + "epoch": 7.824751434807211, + "grad_norm": 1.1225441694259644, + "learning_rate": 0.0002, + "loss": 0.4924, + "step": 48400 + }, + { + "epoch": 7.826368118987956, + "grad_norm": 1.2029428482055664, + "learning_rate": 0.0002, + "loss": 0.5081, + "step": 48410 + }, + { + "epoch": 7.827984803168701, + "grad_norm": 1.252485990524292, + "learning_rate": 0.0002, + "loss": 0.4669, + "step": 48420 + }, + { + "epoch": 7.829601487349446, + "grad_norm": 1.1822574138641357, + "learning_rate": 0.0002, + "loss": 0.4932, + "step": 48430 + }, + { + "epoch": 7.831218171530192, + "grad_norm": 1.2428245544433594, + "learning_rate": 0.0002, + "loss": 0.4692, + "step": 48440 + }, + { + "epoch": 7.832834855710937, + "grad_norm": 1.0565894842147827, + "learning_rate": 0.0002, + "loss": 0.4796, + "step": 48450 + }, + { + "epoch": 7.834451539891682, + "grad_norm": 1.363452672958374, + "learning_rate": 0.0002, + "loss": 0.5016, + "step": 48460 + }, + { + "epoch": 7.836068224072427, + "grad_norm": 1.2436026334762573, + "learning_rate": 0.0002, + "loss": 0.463, + "step": 48470 + }, + { + "epoch": 7.8376849082531725, + "grad_norm": 1.2623029947280884, + "learning_rate": 0.0002, + "loss": 0.4794, + "step": 48480 + }, + { + "epoch": 7.839301592433918, + "grad_norm": 1.0942288637161255, + "learning_rate": 0.0002, + "loss": 0.4863, + "step": 48490 + }, + { + "epoch": 7.840918276614663, + "grad_norm": 1.1791462898254395, + "learning_rate": 0.0002, + "loss": 0.4723, + "step": 48500 + }, + { + "epoch": 7.842534960795408, + "grad_norm": 1.3342814445495605, + "learning_rate": 0.0002, + "loss": 0.501, + "step": 48510 + }, + { + "epoch": 7.844151644976154, + "grad_norm": 1.0511828660964966, + "learning_rate": 0.0002, + "loss": 0.4654, + "step": 48520 + }, + { + "epoch": 7.845768329156899, + "grad_norm": 1.48568594455719, + "learning_rate": 0.0002, + "loss": 0.5051, + "step": 48530 + }, + { + "epoch": 7.847385013337645, + "grad_norm": 1.296844720840454, + "learning_rate": 0.0002, + "loss": 0.4572, + "step": 48540 + }, + { + "epoch": 7.84900169751839, + "grad_norm": 1.3032835721969604, + "learning_rate": 0.0002, + "loss": 0.5216, + "step": 48550 + }, + { + "epoch": 7.850618381699135, + "grad_norm": 1.260769009590149, + "learning_rate": 0.0002, + "loss": 0.472, + "step": 48560 + }, + { + "epoch": 7.8522350658798805, + "grad_norm": 1.3309531211853027, + "learning_rate": 0.0002, + "loss": 0.5094, + "step": 48570 + }, + { + "epoch": 7.853851750060626, + "grad_norm": 1.1907469034194946, + "learning_rate": 0.0002, + "loss": 0.4759, + "step": 48580 + }, + { + "epoch": 7.855468434241371, + "grad_norm": 0.9690865874290466, + "learning_rate": 0.0002, + "loss": 0.485, + "step": 48590 + }, + { + "epoch": 7.857085118422116, + "grad_norm": 1.2417343854904175, + "learning_rate": 0.0002, + "loss": 0.4899, + "step": 48600 + }, + { + "epoch": 7.858701802602861, + "grad_norm": 1.1366082429885864, + "learning_rate": 0.0002, + "loss": 0.4888, + "step": 48610 + }, + { + "epoch": 7.860318486783607, + "grad_norm": 1.4737876653671265, + "learning_rate": 0.0002, + "loss": 0.4909, + "step": 48620 + }, + { + "epoch": 7.861935170964352, + "grad_norm": 1.3934144973754883, + "learning_rate": 0.0002, + "loss": 0.4476, + "step": 48630 + }, + { + "epoch": 7.863551855145097, + "grad_norm": 0.9997506737709045, + "learning_rate": 0.0002, + "loss": 0.5141, + "step": 48640 + }, + { + "epoch": 7.865168539325842, + "grad_norm": 1.3827011585235596, + "learning_rate": 0.0002, + "loss": 0.4766, + "step": 48650 + }, + { + "epoch": 7.866785223506588, + "grad_norm": 1.2811808586120605, + "learning_rate": 0.0002, + "loss": 0.4926, + "step": 48660 + }, + { + "epoch": 7.868401907687334, + "grad_norm": 1.394400715827942, + "learning_rate": 0.0002, + "loss": 0.4898, + "step": 48670 + }, + { + "epoch": 7.870018591868079, + "grad_norm": 1.5635628700256348, + "learning_rate": 0.0002, + "loss": 0.4889, + "step": 48680 + }, + { + "epoch": 7.871635276048824, + "grad_norm": 1.147349238395691, + "learning_rate": 0.0002, + "loss": 0.4822, + "step": 48690 + }, + { + "epoch": 7.873251960229569, + "grad_norm": 1.2417502403259277, + "learning_rate": 0.0002, + "loss": 0.485, + "step": 48700 + }, + { + "epoch": 7.874868644410315, + "grad_norm": 1.0380291938781738, + "learning_rate": 0.0002, + "loss": 0.4976, + "step": 48710 + }, + { + "epoch": 7.87648532859106, + "grad_norm": 1.2139482498168945, + "learning_rate": 0.0002, + "loss": 0.5119, + "step": 48720 + }, + { + "epoch": 7.878102012771805, + "grad_norm": 1.2833739519119263, + "learning_rate": 0.0002, + "loss": 0.497, + "step": 48730 + }, + { + "epoch": 7.87971869695255, + "grad_norm": 1.2405574321746826, + "learning_rate": 0.0002, + "loss": 0.4776, + "step": 48740 + }, + { + "epoch": 7.881335381133296, + "grad_norm": 1.1267465353012085, + "learning_rate": 0.0002, + "loss": 0.5085, + "step": 48750 + }, + { + "epoch": 7.882952065314041, + "grad_norm": 1.3052713871002197, + "learning_rate": 0.0002, + "loss": 0.4894, + "step": 48760 + }, + { + "epoch": 7.884568749494786, + "grad_norm": 1.0581550598144531, + "learning_rate": 0.0002, + "loss": 0.5077, + "step": 48770 + }, + { + "epoch": 7.886185433675531, + "grad_norm": 1.1074683666229248, + "learning_rate": 0.0002, + "loss": 0.4598, + "step": 48780 + }, + { + "epoch": 7.8878021178562765, + "grad_norm": 1.0812418460845947, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 48790 + }, + { + "epoch": 7.889418802037022, + "grad_norm": 1.3083902597427368, + "learning_rate": 0.0002, + "loss": 0.477, + "step": 48800 + }, + { + "epoch": 7.891035486217767, + "grad_norm": 1.457373023033142, + "learning_rate": 0.0002, + "loss": 0.4717, + "step": 48810 + }, + { + "epoch": 7.892652170398513, + "grad_norm": 1.048091173171997, + "learning_rate": 0.0002, + "loss": 0.4607, + "step": 48820 + }, + { + "epoch": 7.894268854579258, + "grad_norm": 1.1420985460281372, + "learning_rate": 0.0002, + "loss": 0.4848, + "step": 48830 + }, + { + "epoch": 7.8958855387600035, + "grad_norm": 1.0286061763763428, + "learning_rate": 0.0002, + "loss": 0.5019, + "step": 48840 + }, + { + "epoch": 7.897502222940749, + "grad_norm": 1.0361840724945068, + "learning_rate": 0.0002, + "loss": 0.5054, + "step": 48850 + }, + { + "epoch": 7.899118907121494, + "grad_norm": 1.1862726211547852, + "learning_rate": 0.0002, + "loss": 0.4908, + "step": 48860 + }, + { + "epoch": 7.900735591302239, + "grad_norm": 1.2256416082382202, + "learning_rate": 0.0002, + "loss": 0.4491, + "step": 48870 + }, + { + "epoch": 7.9023522754829845, + "grad_norm": 1.0664557218551636, + "learning_rate": 0.0002, + "loss": 0.4732, + "step": 48880 + }, + { + "epoch": 7.90396895966373, + "grad_norm": 1.3960802555084229, + "learning_rate": 0.0002, + "loss": 0.4741, + "step": 48890 + }, + { + "epoch": 7.905585643844475, + "grad_norm": 1.230430245399475, + "learning_rate": 0.0002, + "loss": 0.5061, + "step": 48900 + }, + { + "epoch": 7.90720232802522, + "grad_norm": 1.0949305295944214, + "learning_rate": 0.0002, + "loss": 0.4698, + "step": 48910 + }, + { + "epoch": 7.908819012205965, + "grad_norm": 1.4402074813842773, + "learning_rate": 0.0002, + "loss": 0.4964, + "step": 48920 + }, + { + "epoch": 7.910435696386711, + "grad_norm": 1.1064879894256592, + "learning_rate": 0.0002, + "loss": 0.5057, + "step": 48930 + }, + { + "epoch": 7.912052380567456, + "grad_norm": 0.9874461889266968, + "learning_rate": 0.0002, + "loss": 0.482, + "step": 48940 + }, + { + "epoch": 7.913669064748201, + "grad_norm": 1.2584952116012573, + "learning_rate": 0.0002, + "loss": 0.4851, + "step": 48950 + }, + { + "epoch": 7.915285748928946, + "grad_norm": 1.3016353845596313, + "learning_rate": 0.0002, + "loss": 0.4744, + "step": 48960 + }, + { + "epoch": 7.916902433109692, + "grad_norm": 1.104179859161377, + "learning_rate": 0.0002, + "loss": 0.4734, + "step": 48970 + }, + { + "epoch": 7.918519117290438, + "grad_norm": 1.26803457736969, + "learning_rate": 0.0002, + "loss": 0.5143, + "step": 48980 + }, + { + "epoch": 7.920135801471183, + "grad_norm": 1.0336869955062866, + "learning_rate": 0.0002, + "loss": 0.4808, + "step": 48990 + }, + { + "epoch": 7.921752485651928, + "grad_norm": 1.0630918741226196, + "learning_rate": 0.0002, + "loss": 0.4938, + "step": 49000 + }, + { + "epoch": 7.923369169832673, + "grad_norm": 1.2257622480392456, + "learning_rate": 0.0002, + "loss": 0.4988, + "step": 49010 + }, + { + "epoch": 7.924985854013419, + "grad_norm": 1.1722705364227295, + "learning_rate": 0.0002, + "loss": 0.5116, + "step": 49020 + }, + { + "epoch": 7.926602538194164, + "grad_norm": 1.4473323822021484, + "learning_rate": 0.0002, + "loss": 0.4737, + "step": 49030 + }, + { + "epoch": 7.928219222374909, + "grad_norm": 1.3780192136764526, + "learning_rate": 0.0002, + "loss": 0.5412, + "step": 49040 + }, + { + "epoch": 7.929835906555654, + "grad_norm": 1.253423810005188, + "learning_rate": 0.0002, + "loss": 0.4849, + "step": 49050 + }, + { + "epoch": 7.9314525907363995, + "grad_norm": 1.1733828783035278, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 49060 + }, + { + "epoch": 7.933069274917145, + "grad_norm": 1.249990701675415, + "learning_rate": 0.0002, + "loss": 0.4967, + "step": 49070 + }, + { + "epoch": 7.93468595909789, + "grad_norm": 1.4012458324432373, + "learning_rate": 0.0002, + "loss": 0.51, + "step": 49080 + }, + { + "epoch": 7.936302643278635, + "grad_norm": 1.268652319908142, + "learning_rate": 0.0002, + "loss": 0.4785, + "step": 49090 + }, + { + "epoch": 7.9379193274593804, + "grad_norm": 1.0469073057174683, + "learning_rate": 0.0002, + "loss": 0.5319, + "step": 49100 + }, + { + "epoch": 7.939536011640126, + "grad_norm": 1.3028813600540161, + "learning_rate": 0.0002, + "loss": 0.4836, + "step": 49110 + }, + { + "epoch": 7.941152695820872, + "grad_norm": 1.0998128652572632, + "learning_rate": 0.0002, + "loss": 0.4791, + "step": 49120 + }, + { + "epoch": 7.942769380001617, + "grad_norm": 1.300884485244751, + "learning_rate": 0.0002, + "loss": 0.5022, + "step": 49130 + }, + { + "epoch": 7.944386064182362, + "grad_norm": 1.257865071296692, + "learning_rate": 0.0002, + "loss": 0.5193, + "step": 49140 + }, + { + "epoch": 7.9460027483631075, + "grad_norm": 1.074731707572937, + "learning_rate": 0.0002, + "loss": 0.4755, + "step": 49150 + }, + { + "epoch": 7.947619432543853, + "grad_norm": 1.1055876016616821, + "learning_rate": 0.0002, + "loss": 0.4675, + "step": 49160 + }, + { + "epoch": 7.949236116724598, + "grad_norm": 1.1986541748046875, + "learning_rate": 0.0002, + "loss": 0.4801, + "step": 49170 + }, + { + "epoch": 7.950852800905343, + "grad_norm": 1.094555139541626, + "learning_rate": 0.0002, + "loss": 0.488, + "step": 49180 + }, + { + "epoch": 7.952469485086088, + "grad_norm": 1.2922005653381348, + "learning_rate": 0.0002, + "loss": 0.4974, + "step": 49190 + }, + { + "epoch": 7.954086169266834, + "grad_norm": 1.1557104587554932, + "learning_rate": 0.0002, + "loss": 0.4973, + "step": 49200 + }, + { + "epoch": 7.955702853447579, + "grad_norm": 1.2414908409118652, + "learning_rate": 0.0002, + "loss": 0.4806, + "step": 49210 + }, + { + "epoch": 7.957319537628324, + "grad_norm": 1.3606830835342407, + "learning_rate": 0.0002, + "loss": 0.4848, + "step": 49220 + }, + { + "epoch": 7.958936221809069, + "grad_norm": 0.9592481851577759, + "learning_rate": 0.0002, + "loss": 0.4981, + "step": 49230 + }, + { + "epoch": 7.960552905989815, + "grad_norm": 1.2130779027938843, + "learning_rate": 0.0002, + "loss": 0.4731, + "step": 49240 + }, + { + "epoch": 7.96216959017056, + "grad_norm": 1.1078767776489258, + "learning_rate": 0.0002, + "loss": 0.4529, + "step": 49250 + }, + { + "epoch": 7.963786274351305, + "grad_norm": 1.0684230327606201, + "learning_rate": 0.0002, + "loss": 0.4983, + "step": 49260 + }, + { + "epoch": 7.965402958532051, + "grad_norm": 1.1368396282196045, + "learning_rate": 0.0002, + "loss": 0.4832, + "step": 49270 + }, + { + "epoch": 7.967019642712796, + "grad_norm": 1.2161095142364502, + "learning_rate": 0.0002, + "loss": 0.5226, + "step": 49280 + }, + { + "epoch": 7.968636326893542, + "grad_norm": 1.2087634801864624, + "learning_rate": 0.0002, + "loss": 0.4938, + "step": 49290 + }, + { + "epoch": 7.970253011074287, + "grad_norm": 1.1078447103500366, + "learning_rate": 0.0002, + "loss": 0.4969, + "step": 49300 + }, + { + "epoch": 7.971869695255032, + "grad_norm": 1.3378221988677979, + "learning_rate": 0.0002, + "loss": 0.5333, + "step": 49310 + }, + { + "epoch": 7.973486379435777, + "grad_norm": 1.0475801229476929, + "learning_rate": 0.0002, + "loss": 0.4736, + "step": 49320 + }, + { + "epoch": 7.9751030636165225, + "grad_norm": 0.9948194622993469, + "learning_rate": 0.0002, + "loss": 0.4515, + "step": 49330 + }, + { + "epoch": 7.976719747797268, + "grad_norm": 1.06312894821167, + "learning_rate": 0.0002, + "loss": 0.4685, + "step": 49340 + }, + { + "epoch": 7.978336431978013, + "grad_norm": 1.4047085046768188, + "learning_rate": 0.0002, + "loss": 0.453, + "step": 49350 + }, + { + "epoch": 7.979953116158758, + "grad_norm": 1.086578130722046, + "learning_rate": 0.0002, + "loss": 0.5054, + "step": 49360 + }, + { + "epoch": 7.9815698003395035, + "grad_norm": 1.2896746397018433, + "learning_rate": 0.0002, + "loss": 0.5024, + "step": 49370 + }, + { + "epoch": 7.983186484520249, + "grad_norm": 1.260717511177063, + "learning_rate": 0.0002, + "loss": 0.5102, + "step": 49380 + }, + { + "epoch": 7.984803168700994, + "grad_norm": 1.4238426685333252, + "learning_rate": 0.0002, + "loss": 0.4836, + "step": 49390 + }, + { + "epoch": 7.986419852881739, + "grad_norm": 1.1800259351730347, + "learning_rate": 0.0002, + "loss": 0.4797, + "step": 49400 + }, + { + "epoch": 7.988036537062484, + "grad_norm": 1.128868579864502, + "learning_rate": 0.0002, + "loss": 0.4911, + "step": 49410 + }, + { + "epoch": 7.9896532212432305, + "grad_norm": 1.1832106113433838, + "learning_rate": 0.0002, + "loss": 0.4674, + "step": 49420 + }, + { + "epoch": 7.991269905423976, + "grad_norm": 1.1728334426879883, + "learning_rate": 0.0002, + "loss": 0.5016, + "step": 49430 + }, + { + "epoch": 7.992886589604721, + "grad_norm": 1.2403929233551025, + "learning_rate": 0.0002, + "loss": 0.4706, + "step": 49440 + }, + { + "epoch": 7.994503273785466, + "grad_norm": 1.245354175567627, + "learning_rate": 0.0002, + "loss": 0.4989, + "step": 49450 + }, + { + "epoch": 7.9961199579662114, + "grad_norm": 1.3526462316513062, + "learning_rate": 0.0002, + "loss": 0.5088, + "step": 49460 + }, + { + "epoch": 7.997736642146957, + "grad_norm": 1.2117315530776978, + "learning_rate": 0.0002, + "loss": 0.4703, + "step": 49470 + }, + { + "epoch": 7.999353326327702, + "grad_norm": 1.0393620729446411, + "learning_rate": 0.0002, + "loss": 0.4802, + "step": 49480 + }, + { + "epoch": 7.999353326327702, + "eval_loss": 1.292362093925476, + "eval_runtime": 122.1709, + "eval_samples_per_second": 6.0, + "eval_steps_per_second": 0.753, + "step": 49480 + } + ], + "logging_steps": 10, + "max_steps": 49480, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.2898236097927578e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..155b12fa9acbc6e71dba75c92bfa79e152397ebf --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-49480/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28694d5564a2b5c7d6881d4ba2af103356aa22489d2c22768ebbe47283c0f4a1 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/README.md b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/README.md new file mode 100644 index 0000000000000000000000000000000000000000..835e31ab8469ee39ddc8b2b6b2143a8c66dad510 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Meta-Llama-3-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/adapter_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..406c5a08dc4a2a33b52c62a482f98c217c417215 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/adapter_model.safetensors b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..894306c21591f0f9fc1ad6edef9d7f95864e6e88 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6e6524f0f0d768c7962609d66799cf291b435521c1246c8b6c63ea99e47dce1 +size 109069176 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/optimizer.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b063d727e44d7b9a7a80ed77f4aacaf940f61605 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3898ae941d376d381d8f74c20011586558919a6034d4d4f1516c378d88476c35 +size 55532666 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/rng_state.pth b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4499ab3321a9b904d4df9aa714313a7423559597 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63e86de25b60f1f051d0aad81f1d4dae37811756184027f20eeb1d49d57860e4 +size 14244 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/scheduler.pt b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ee6ca37eba996eef47c9431c9823aaa6ddcc0c6 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43eacc2532dd42a4d4cd9c082df2820848486282537958f00be686f4de55e6ba +size 1064 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/trainer_state.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e5c06890b605df5b7d9a602659fcf0147712e156 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/trainer_state.json @@ -0,0 +1,4367 @@ +{ + "best_metric": 1.0871200561523438, + "best_model_checkpoint": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185", + "epoch": 0.9999191657909627, + "eval_steps": 10, + "global_step": 6185, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016166841807452913, + "grad_norm": 0.9894065856933594, + "learning_rate": 0.0002, + "loss": 1.6636, + "step": 10 + }, + { + "epoch": 0.0032333683614905826, + "grad_norm": 1.7810699939727783, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 20 + }, + { + "epoch": 0.004850052542235874, + "grad_norm": 0.5969577431678772, + "learning_rate": 0.0002, + "loss": 0.9767, + "step": 30 + }, + { + "epoch": 0.006466736722981165, + "grad_norm": 0.6354120969772339, + "learning_rate": 0.0002, + "loss": 0.9772, + "step": 40 + }, + { + "epoch": 0.008083420903726457, + "grad_norm": 0.5604607462882996, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 50 + }, + { + "epoch": 0.009700105084471748, + "grad_norm": 0.4676193594932556, + "learning_rate": 0.0002, + "loss": 0.8841, + "step": 60 + }, + { + "epoch": 0.01131678926521704, + "grad_norm": 0.6099211573600769, + "learning_rate": 0.0002, + "loss": 0.9022, + "step": 70 + }, + { + "epoch": 0.01293347344596233, + "grad_norm": 0.48639994859695435, + "learning_rate": 0.0002, + "loss": 0.9133, + "step": 80 + }, + { + "epoch": 0.014550157626707623, + "grad_norm": 0.4904264509677887, + "learning_rate": 0.0002, + "loss": 0.8704, + "step": 90 + }, + { + "epoch": 0.016166841807452915, + "grad_norm": 2.8334362506866455, + "learning_rate": 0.0002, + "loss": 0.8855, + "step": 100 + }, + { + "epoch": 0.017783525988198205, + "grad_norm": 0.43221670389175415, + "learning_rate": 0.0002, + "loss": 0.8958, + "step": 110 + }, + { + "epoch": 0.019400210168943496, + "grad_norm": 0.42244166135787964, + "learning_rate": 0.0002, + "loss": 0.8412, + "step": 120 + }, + { + "epoch": 0.02101689434968879, + "grad_norm": 0.45363298058509827, + "learning_rate": 0.0002, + "loss": 0.8467, + "step": 130 + }, + { + "epoch": 0.02263357853043408, + "grad_norm": 0.44816508889198303, + "learning_rate": 0.0002, + "loss": 0.8641, + "step": 140 + }, + { + "epoch": 0.02425026271117937, + "grad_norm": 0.43308213353157043, + "learning_rate": 0.0002, + "loss": 0.8496, + "step": 150 + }, + { + "epoch": 0.02586694689192466, + "grad_norm": 0.4084763526916504, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 160 + }, + { + "epoch": 0.027483631072669955, + "grad_norm": 0.5363703966140747, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 170 + }, + { + "epoch": 0.029100315253415245, + "grad_norm": 0.4619699716567993, + "learning_rate": 0.0002, + "loss": 0.8558, + "step": 180 + }, + { + "epoch": 0.030716999434160536, + "grad_norm": 0.49069908261299133, + "learning_rate": 0.0002, + "loss": 0.8878, + "step": 190 + }, + { + "epoch": 0.03233368361490583, + "grad_norm": 0.4645835757255554, + "learning_rate": 0.0002, + "loss": 0.8867, + "step": 200 + }, + { + "epoch": 0.03395036779565112, + "grad_norm": 1.2411243915557861, + "learning_rate": 0.0002, + "loss": 0.8842, + "step": 210 + }, + { + "epoch": 0.03556705197639641, + "grad_norm": 0.5211851596832275, + "learning_rate": 0.0002, + "loss": 0.8245, + "step": 220 + }, + { + "epoch": 0.037183736157141704, + "grad_norm": 0.5253691673278809, + "learning_rate": 0.0002, + "loss": 0.8194, + "step": 230 + }, + { + "epoch": 0.03880042033788699, + "grad_norm": 0.4567478895187378, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 240 + }, + { + "epoch": 0.040417104518632285, + "grad_norm": 0.5472128391265869, + "learning_rate": 0.0002, + "loss": 0.838, + "step": 250 + }, + { + "epoch": 0.04203378869937758, + "grad_norm": 0.42978546023368835, + "learning_rate": 0.0002, + "loss": 0.8201, + "step": 260 + }, + { + "epoch": 0.043650472880122866, + "grad_norm": 0.601734459400177, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 270 + }, + { + "epoch": 0.04526715706086816, + "grad_norm": 0.4286513328552246, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 280 + }, + { + "epoch": 0.046883841241613454, + "grad_norm": 0.5230861902236938, + "learning_rate": 0.0002, + "loss": 0.8758, + "step": 290 + }, + { + "epoch": 0.04850052542235874, + "grad_norm": 0.6504611968994141, + "learning_rate": 0.0002, + "loss": 0.8636, + "step": 300 + }, + { + "epoch": 0.050117209603104035, + "grad_norm": 0.43485215306282043, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 310 + }, + { + "epoch": 0.05173389378384932, + "grad_norm": 0.4717007875442505, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 320 + }, + { + "epoch": 0.053350577964594616, + "grad_norm": 0.4059787690639496, + "learning_rate": 0.0002, + "loss": 0.8469, + "step": 330 + }, + { + "epoch": 0.05496726214533991, + "grad_norm": 0.4366913437843323, + "learning_rate": 0.0002, + "loss": 0.8866, + "step": 340 + }, + { + "epoch": 0.0565839463260852, + "grad_norm": 0.4233848452568054, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 350 + }, + { + "epoch": 0.05820063050683049, + "grad_norm": 0.4209108352661133, + "learning_rate": 0.0002, + "loss": 0.8456, + "step": 360 + }, + { + "epoch": 0.059817314687575784, + "grad_norm": 0.41637396812438965, + "learning_rate": 0.0002, + "loss": 0.816, + "step": 370 + }, + { + "epoch": 0.06143399886832107, + "grad_norm": 0.46235376596450806, + "learning_rate": 0.0002, + "loss": 0.7976, + "step": 380 + }, + { + "epoch": 0.06305068304906636, + "grad_norm": 0.4013484716415405, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 390 + }, + { + "epoch": 0.06466736722981166, + "grad_norm": 0.47443896532058716, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 400 + }, + { + "epoch": 0.06628405141055695, + "grad_norm": 0.3942156434059143, + "learning_rate": 0.0002, + "loss": 0.8666, + "step": 410 + }, + { + "epoch": 0.06790073559130223, + "grad_norm": 0.4965320825576782, + "learning_rate": 0.0002, + "loss": 0.8402, + "step": 420 + }, + { + "epoch": 0.06951741977204753, + "grad_norm": 0.4304835796356201, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 430 + }, + { + "epoch": 0.07113410395279282, + "grad_norm": 0.511726975440979, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 440 + }, + { + "epoch": 0.07275078813353811, + "grad_norm": 0.4040689170360565, + "learning_rate": 0.0002, + "loss": 0.8675, + "step": 450 + }, + { + "epoch": 0.07436747231428341, + "grad_norm": 0.5402171015739441, + "learning_rate": 0.0002, + "loss": 0.8788, + "step": 460 + }, + { + "epoch": 0.0759841564950287, + "grad_norm": 0.4174517095088959, + "learning_rate": 0.0002, + "loss": 0.8737, + "step": 470 + }, + { + "epoch": 0.07760084067577398, + "grad_norm": 0.4306182265281677, + "learning_rate": 0.0002, + "loss": 0.7605, + "step": 480 + }, + { + "epoch": 0.07921752485651928, + "grad_norm": 0.535210132598877, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 490 + }, + { + "epoch": 0.08083420903726457, + "grad_norm": 0.5339109897613525, + "learning_rate": 0.0002, + "loss": 0.7825, + "step": 500 + }, + { + "epoch": 0.08245089321800986, + "grad_norm": 0.45754891633987427, + "learning_rate": 0.0002, + "loss": 0.8985, + "step": 510 + }, + { + "epoch": 0.08406757739875516, + "grad_norm": 0.43820783495903015, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 520 + }, + { + "epoch": 0.08568426157950045, + "grad_norm": 0.4434749186038971, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 530 + }, + { + "epoch": 0.08730094576024573, + "grad_norm": 0.43111467361450195, + "learning_rate": 0.0002, + "loss": 0.7857, + "step": 540 + }, + { + "epoch": 0.08891762994099103, + "grad_norm": 0.4378940165042877, + "learning_rate": 0.0002, + "loss": 0.8418, + "step": 550 + }, + { + "epoch": 0.09053431412173632, + "grad_norm": 0.4772215187549591, + "learning_rate": 0.0002, + "loss": 0.8361, + "step": 560 + }, + { + "epoch": 0.09215099830248161, + "grad_norm": 0.6837629079818726, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 570 + }, + { + "epoch": 0.09376768248322691, + "grad_norm": 0.42241212725639343, + "learning_rate": 0.0002, + "loss": 0.8607, + "step": 580 + }, + { + "epoch": 0.0953843666639722, + "grad_norm": 0.5165936350822449, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 590 + }, + { + "epoch": 0.09700105084471748, + "grad_norm": 0.48737478256225586, + "learning_rate": 0.0002, + "loss": 0.8664, + "step": 600 + }, + { + "epoch": 0.09861773502546278, + "grad_norm": 0.47419852018356323, + "learning_rate": 0.0002, + "loss": 0.8806, + "step": 610 + }, + { + "epoch": 0.10023441920620807, + "grad_norm": 0.4975486099720001, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 620 + }, + { + "epoch": 0.10185110338695336, + "grad_norm": 0.49123844504356384, + "learning_rate": 0.0002, + "loss": 0.8548, + "step": 630 + }, + { + "epoch": 0.10346778756769864, + "grad_norm": 0.6288952827453613, + "learning_rate": 0.0002, + "loss": 0.8911, + "step": 640 + }, + { + "epoch": 0.10508447174844394, + "grad_norm": 0.4277345836162567, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 650 + }, + { + "epoch": 0.10670115592918923, + "grad_norm": 0.4021061956882477, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 660 + }, + { + "epoch": 0.10831784010993452, + "grad_norm": 0.3492237329483032, + "learning_rate": 0.0002, + "loss": 0.87, + "step": 670 + }, + { + "epoch": 0.10993452429067982, + "grad_norm": 0.4341012239456177, + "learning_rate": 0.0002, + "loss": 0.8698, + "step": 680 + }, + { + "epoch": 0.1115512084714251, + "grad_norm": 0.7296304106712341, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 690 + }, + { + "epoch": 0.1131678926521704, + "grad_norm": 0.397494912147522, + "learning_rate": 0.0002, + "loss": 0.8433, + "step": 700 + }, + { + "epoch": 0.1147845768329157, + "grad_norm": 0.396431028842926, + "learning_rate": 0.0002, + "loss": 0.827, + "step": 710 + }, + { + "epoch": 0.11640126101366098, + "grad_norm": 0.48842838406562805, + "learning_rate": 0.0002, + "loss": 0.8379, + "step": 720 + }, + { + "epoch": 0.11801794519440627, + "grad_norm": 0.46322616934776306, + "learning_rate": 0.0002, + "loss": 0.8238, + "step": 730 + }, + { + "epoch": 0.11963462937515157, + "grad_norm": 0.47990912199020386, + "learning_rate": 0.0002, + "loss": 0.8041, + "step": 740 + }, + { + "epoch": 0.12125131355589686, + "grad_norm": 0.4997142255306244, + "learning_rate": 0.0002, + "loss": 0.82, + "step": 750 + }, + { + "epoch": 0.12286799773664214, + "grad_norm": 0.4040526747703552, + "learning_rate": 0.0002, + "loss": 0.7702, + "step": 760 + }, + { + "epoch": 0.12448468191738744, + "grad_norm": 0.453095942735672, + "learning_rate": 0.0002, + "loss": 0.863, + "step": 770 + }, + { + "epoch": 0.12610136609813272, + "grad_norm": 0.4636971950531006, + "learning_rate": 0.0002, + "loss": 0.8792, + "step": 780 + }, + { + "epoch": 0.12771805027887803, + "grad_norm": 0.4279276132583618, + "learning_rate": 0.0002, + "loss": 0.8112, + "step": 790 + }, + { + "epoch": 0.12933473445962332, + "grad_norm": 0.46212655305862427, + "learning_rate": 0.0002, + "loss": 0.8711, + "step": 800 + }, + { + "epoch": 0.1309514186403686, + "grad_norm": 0.43127650022506714, + "learning_rate": 0.0002, + "loss": 0.8368, + "step": 810 + }, + { + "epoch": 0.1325681028211139, + "grad_norm": 0.4201301336288452, + "learning_rate": 0.0002, + "loss": 0.8476, + "step": 820 + }, + { + "epoch": 0.13418478700185918, + "grad_norm": 0.42583167552948, + "learning_rate": 0.0002, + "loss": 0.8078, + "step": 830 + }, + { + "epoch": 0.13580147118260447, + "grad_norm": 0.4535622000694275, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 840 + }, + { + "epoch": 0.13741815536334978, + "grad_norm": 0.4116036891937256, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 850 + }, + { + "epoch": 0.13903483954409507, + "grad_norm": 0.45997580885887146, + "learning_rate": 0.0002, + "loss": 0.8466, + "step": 860 + }, + { + "epoch": 0.14065152372484035, + "grad_norm": 0.4487837255001068, + "learning_rate": 0.0002, + "loss": 0.8917, + "step": 870 + }, + { + "epoch": 0.14226820790558564, + "grad_norm": 0.43650057911872864, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 880 + }, + { + "epoch": 0.14388489208633093, + "grad_norm": 0.5335358381271362, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 890 + }, + { + "epoch": 0.14550157626707622, + "grad_norm": 0.5989000201225281, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 900 + }, + { + "epoch": 0.14711826044782153, + "grad_norm": 0.517179012298584, + "learning_rate": 0.0002, + "loss": 0.8385, + "step": 910 + }, + { + "epoch": 0.14873494462856682, + "grad_norm": 0.44435232877731323, + "learning_rate": 0.0002, + "loss": 0.8255, + "step": 920 + }, + { + "epoch": 0.1503516288093121, + "grad_norm": 0.42635923624038696, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 930 + }, + { + "epoch": 0.1519683129900574, + "grad_norm": 0.49603334069252014, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 940 + }, + { + "epoch": 0.15358499717080268, + "grad_norm": 0.40639808773994446, + "learning_rate": 0.0002, + "loss": 0.8377, + "step": 950 + }, + { + "epoch": 0.15520168135154797, + "grad_norm": 0.4850759208202362, + "learning_rate": 0.0002, + "loss": 0.8529, + "step": 960 + }, + { + "epoch": 0.15681836553229328, + "grad_norm": 0.4427442252635956, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 970 + }, + { + "epoch": 0.15843504971303857, + "grad_norm": 0.3760930001735687, + "learning_rate": 0.0002, + "loss": 0.8705, + "step": 980 + }, + { + "epoch": 0.16005173389378385, + "grad_norm": 0.4794144332408905, + "learning_rate": 0.0002, + "loss": 0.8644, + "step": 990 + }, + { + "epoch": 0.16166841807452914, + "grad_norm": 0.45828768610954285, + "learning_rate": 0.0002, + "loss": 0.8002, + "step": 1000 + }, + { + "epoch": 0.16328510225527443, + "grad_norm": 0.6313053369522095, + "learning_rate": 0.0002, + "loss": 0.7658, + "step": 1010 + }, + { + "epoch": 0.16490178643601971, + "grad_norm": 0.45041006803512573, + "learning_rate": 0.0002, + "loss": 0.8047, + "step": 1020 + }, + { + "epoch": 0.166518470616765, + "grad_norm": 0.441403865814209, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 1030 + }, + { + "epoch": 0.16813515479751032, + "grad_norm": 0.8171296119689941, + "learning_rate": 0.0002, + "loss": 0.8475, + "step": 1040 + }, + { + "epoch": 0.1697518389782556, + "grad_norm": 0.7137420773506165, + "learning_rate": 0.0002, + "loss": 0.845, + "step": 1050 + }, + { + "epoch": 0.1713685231590009, + "grad_norm": 0.5236809849739075, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 1060 + }, + { + "epoch": 0.17298520733974618, + "grad_norm": 0.5021864175796509, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 1070 + }, + { + "epoch": 0.17460189152049146, + "grad_norm": 0.47347521781921387, + "learning_rate": 0.0002, + "loss": 0.8305, + "step": 1080 + }, + { + "epoch": 0.17621857570123675, + "grad_norm": 0.4631653428077698, + "learning_rate": 0.0002, + "loss": 0.8105, + "step": 1090 + }, + { + "epoch": 0.17783525988198207, + "grad_norm": 0.49169182777404785, + "learning_rate": 0.0002, + "loss": 0.8166, + "step": 1100 + }, + { + "epoch": 0.17945194406272735, + "grad_norm": 0.5019739270210266, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 1110 + }, + { + "epoch": 0.18106862824347264, + "grad_norm": 0.5100422501564026, + "learning_rate": 0.0002, + "loss": 0.8247, + "step": 1120 + }, + { + "epoch": 0.18268531242421793, + "grad_norm": 0.3888324499130249, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 1130 + }, + { + "epoch": 0.18430199660496321, + "grad_norm": 0.39765217900276184, + "learning_rate": 0.0002, + "loss": 0.8533, + "step": 1140 + }, + { + "epoch": 0.1859186807857085, + "grad_norm": 0.47190186381340027, + "learning_rate": 0.0002, + "loss": 0.8541, + "step": 1150 + }, + { + "epoch": 0.18753536496645382, + "grad_norm": 0.4464188814163208, + "learning_rate": 0.0002, + "loss": 0.8301, + "step": 1160 + }, + { + "epoch": 0.1891520491471991, + "grad_norm": 0.5153930187225342, + "learning_rate": 0.0002, + "loss": 0.8341, + "step": 1170 + }, + { + "epoch": 0.1907687333279444, + "grad_norm": 0.4779708683490753, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 1180 + }, + { + "epoch": 0.19238541750868968, + "grad_norm": 0.4834315776824951, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 1190 + }, + { + "epoch": 0.19400210168943496, + "grad_norm": 0.402357816696167, + "learning_rate": 0.0002, + "loss": 0.7721, + "step": 1200 + }, + { + "epoch": 0.19561878587018025, + "grad_norm": 0.45899084210395813, + "learning_rate": 0.0002, + "loss": 0.7941, + "step": 1210 + }, + { + "epoch": 0.19723547005092557, + "grad_norm": 0.5106529593467712, + "learning_rate": 0.0002, + "loss": 0.8353, + "step": 1220 + }, + { + "epoch": 0.19885215423167085, + "grad_norm": 0.45261722803115845, + "learning_rate": 0.0002, + "loss": 0.7816, + "step": 1230 + }, + { + "epoch": 0.20046883841241614, + "grad_norm": 0.4647127091884613, + "learning_rate": 0.0002, + "loss": 0.8068, + "step": 1240 + }, + { + "epoch": 0.20208552259316143, + "grad_norm": 0.4849368929862976, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 1250 + }, + { + "epoch": 0.2037022067739067, + "grad_norm": 0.4518061578273773, + "learning_rate": 0.0002, + "loss": 0.8514, + "step": 1260 + }, + { + "epoch": 0.205318890954652, + "grad_norm": 0.49535325169563293, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 1270 + }, + { + "epoch": 0.2069355751353973, + "grad_norm": 0.4835205376148224, + "learning_rate": 0.0002, + "loss": 0.8348, + "step": 1280 + }, + { + "epoch": 0.2085522593161426, + "grad_norm": 0.45308539271354675, + "learning_rate": 0.0002, + "loss": 0.8428, + "step": 1290 + }, + { + "epoch": 0.2101689434968879, + "grad_norm": 0.5369905233383179, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 1300 + }, + { + "epoch": 0.21178562767763318, + "grad_norm": 0.5031622052192688, + "learning_rate": 0.0002, + "loss": 0.8676, + "step": 1310 + }, + { + "epoch": 0.21340231185837846, + "grad_norm": 0.48010334372520447, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 1320 + }, + { + "epoch": 0.21501899603912375, + "grad_norm": 0.4905701279640198, + "learning_rate": 0.0002, + "loss": 0.806, + "step": 1330 + }, + { + "epoch": 0.21663568021986904, + "grad_norm": 0.43531742691993713, + "learning_rate": 0.0002, + "loss": 0.7885, + "step": 1340 + }, + { + "epoch": 0.21825236440061435, + "grad_norm": 0.44330692291259766, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 1350 + }, + { + "epoch": 0.21986904858135964, + "grad_norm": 0.5384416580200195, + "learning_rate": 0.0002, + "loss": 0.8205, + "step": 1360 + }, + { + "epoch": 0.22148573276210493, + "grad_norm": 0.4181833863258362, + "learning_rate": 0.0002, + "loss": 0.7726, + "step": 1370 + }, + { + "epoch": 0.2231024169428502, + "grad_norm": 0.523833692073822, + "learning_rate": 0.0002, + "loss": 0.8311, + "step": 1380 + }, + { + "epoch": 0.2247191011235955, + "grad_norm": 0.5528736710548401, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 1390 + }, + { + "epoch": 0.2263357853043408, + "grad_norm": 0.43515023589134216, + "learning_rate": 0.0002, + "loss": 0.8079, + "step": 1400 + }, + { + "epoch": 0.2279524694850861, + "grad_norm": 0.48809877038002014, + "learning_rate": 0.0002, + "loss": 0.8403, + "step": 1410 + }, + { + "epoch": 0.2295691536658314, + "grad_norm": 0.43591251969337463, + "learning_rate": 0.0002, + "loss": 0.8165, + "step": 1420 + }, + { + "epoch": 0.23118583784657668, + "grad_norm": 0.44625312089920044, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 1430 + }, + { + "epoch": 0.23280252202732196, + "grad_norm": 0.4390665292739868, + "learning_rate": 0.0002, + "loss": 0.8134, + "step": 1440 + }, + { + "epoch": 0.23441920620806725, + "grad_norm": 0.48496049642562866, + "learning_rate": 0.0002, + "loss": 0.8465, + "step": 1450 + }, + { + "epoch": 0.23603589038881254, + "grad_norm": 0.45919957756996155, + "learning_rate": 0.0002, + "loss": 0.775, + "step": 1460 + }, + { + "epoch": 0.23765257456955785, + "grad_norm": 0.5471845865249634, + "learning_rate": 0.0002, + "loss": 0.8659, + "step": 1470 + }, + { + "epoch": 0.23926925875030314, + "grad_norm": 0.47269317507743835, + "learning_rate": 0.0002, + "loss": 0.8164, + "step": 1480 + }, + { + "epoch": 0.24088594293104842, + "grad_norm": 0.4930245578289032, + "learning_rate": 0.0002, + "loss": 0.854, + "step": 1490 + }, + { + "epoch": 0.2425026271117937, + "grad_norm": 0.5605630278587341, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 1500 + }, + { + "epoch": 0.244119311292539, + "grad_norm": 0.4435870945453644, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 1510 + }, + { + "epoch": 0.24573599547328429, + "grad_norm": 0.4941999912261963, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 1520 + }, + { + "epoch": 0.24735267965402957, + "grad_norm": 0.5100624561309814, + "learning_rate": 0.0002, + "loss": 0.8427, + "step": 1530 + }, + { + "epoch": 0.2489693638347749, + "grad_norm": 0.4638267457485199, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 1540 + }, + { + "epoch": 0.25058604801552015, + "grad_norm": 0.5071570873260498, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 1550 + }, + { + "epoch": 0.25220273219626543, + "grad_norm": 0.4291319251060486, + "learning_rate": 0.0002, + "loss": 0.7724, + "step": 1560 + }, + { + "epoch": 0.2538194163770108, + "grad_norm": 0.5388049483299255, + "learning_rate": 0.0002, + "loss": 0.7984, + "step": 1570 + }, + { + "epoch": 0.25543610055775606, + "grad_norm": 0.5083683729171753, + "learning_rate": 0.0002, + "loss": 0.8176, + "step": 1580 + }, + { + "epoch": 0.25705278473850135, + "grad_norm": 0.4824463725090027, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 1590 + }, + { + "epoch": 0.25866946891924664, + "grad_norm": 0.41177722811698914, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 1600 + }, + { + "epoch": 0.2602861530999919, + "grad_norm": 0.5656219124794006, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 1610 + }, + { + "epoch": 0.2619028372807372, + "grad_norm": 0.41063204407691956, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 1620 + }, + { + "epoch": 0.2635195214614825, + "grad_norm": 0.4897061288356781, + "learning_rate": 0.0002, + "loss": 0.7998, + "step": 1630 + }, + { + "epoch": 0.2651362056422278, + "grad_norm": 0.4454376697540283, + "learning_rate": 0.0002, + "loss": 0.8198, + "step": 1640 + }, + { + "epoch": 0.26675288982297307, + "grad_norm": 0.4355238378047943, + "learning_rate": 0.0002, + "loss": 0.8684, + "step": 1650 + }, + { + "epoch": 0.26836957400371836, + "grad_norm": 0.458310067653656, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 1660 + }, + { + "epoch": 0.26998625818446365, + "grad_norm": 0.4752083718776703, + "learning_rate": 0.0002, + "loss": 0.7935, + "step": 1670 + }, + { + "epoch": 0.27160294236520893, + "grad_norm": 0.4666106402873993, + "learning_rate": 0.0002, + "loss": 0.8267, + "step": 1680 + }, + { + "epoch": 0.2732196265459543, + "grad_norm": 0.4213818609714508, + "learning_rate": 0.0002, + "loss": 0.8252, + "step": 1690 + }, + { + "epoch": 0.27483631072669956, + "grad_norm": 0.5768913626670837, + "learning_rate": 0.0002, + "loss": 0.8559, + "step": 1700 + }, + { + "epoch": 0.27645299490744485, + "grad_norm": 0.4209914803504944, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 1710 + }, + { + "epoch": 0.27806967908819014, + "grad_norm": 0.501909613609314, + "learning_rate": 0.0002, + "loss": 0.8167, + "step": 1720 + }, + { + "epoch": 0.2796863632689354, + "grad_norm": 0.5266261100769043, + "learning_rate": 0.0002, + "loss": 0.7832, + "step": 1730 + }, + { + "epoch": 0.2813030474496807, + "grad_norm": 0.43806859850883484, + "learning_rate": 0.0002, + "loss": 0.8102, + "step": 1740 + }, + { + "epoch": 0.282919731630426, + "grad_norm": 0.46048814058303833, + "learning_rate": 0.0002, + "loss": 0.8157, + "step": 1750 + }, + { + "epoch": 0.2845364158111713, + "grad_norm": 0.44972819089889526, + "learning_rate": 0.0002, + "loss": 0.8596, + "step": 1760 + }, + { + "epoch": 0.28615309999191657, + "grad_norm": 0.5114831328392029, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 1770 + }, + { + "epoch": 0.28776978417266186, + "grad_norm": 0.47931742668151855, + "learning_rate": 0.0002, + "loss": 0.8361, + "step": 1780 + }, + { + "epoch": 0.28938646835340714, + "grad_norm": 0.5092599987983704, + "learning_rate": 0.0002, + "loss": 0.8265, + "step": 1790 + }, + { + "epoch": 0.29100315253415243, + "grad_norm": 0.37581443786621094, + "learning_rate": 0.0002, + "loss": 0.8506, + "step": 1800 + }, + { + "epoch": 0.2926198367148977, + "grad_norm": 0.47097381949424744, + "learning_rate": 0.0002, + "loss": 0.7932, + "step": 1810 + }, + { + "epoch": 0.29423652089564306, + "grad_norm": 0.48300236463546753, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 1820 + }, + { + "epoch": 0.29585320507638835, + "grad_norm": 0.5600419640541077, + "learning_rate": 0.0002, + "loss": 0.8391, + "step": 1830 + }, + { + "epoch": 0.29746988925713364, + "grad_norm": 0.48555272817611694, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 1840 + }, + { + "epoch": 0.2990865734378789, + "grad_norm": 0.3752668499946594, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 1850 + }, + { + "epoch": 0.3007032576186242, + "grad_norm": 0.5328747034072876, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 1860 + }, + { + "epoch": 0.3023199417993695, + "grad_norm": 0.48716455698013306, + "learning_rate": 0.0002, + "loss": 0.8426, + "step": 1870 + }, + { + "epoch": 0.3039366259801148, + "grad_norm": 0.5011493563652039, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 1880 + }, + { + "epoch": 0.30555331016086007, + "grad_norm": 0.46461427211761475, + "learning_rate": 0.0002, + "loss": 0.852, + "step": 1890 + }, + { + "epoch": 0.30716999434160536, + "grad_norm": 0.36630210280418396, + "learning_rate": 0.0002, + "loss": 0.8478, + "step": 1900 + }, + { + "epoch": 0.30878667852235064, + "grad_norm": 0.4217296242713928, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 1910 + }, + { + "epoch": 0.31040336270309593, + "grad_norm": 0.4394875466823578, + "learning_rate": 0.0002, + "loss": 0.8128, + "step": 1920 + }, + { + "epoch": 0.3120200468838412, + "grad_norm": 0.6587965488433838, + "learning_rate": 0.0002, + "loss": 0.8471, + "step": 1930 + }, + { + "epoch": 0.31363673106458656, + "grad_norm": 0.5469298958778381, + "learning_rate": 0.0002, + "loss": 0.8565, + "step": 1940 + }, + { + "epoch": 0.31525341524533185, + "grad_norm": 0.4371595084667206, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 1950 + }, + { + "epoch": 0.31687009942607713, + "grad_norm": 0.4809541404247284, + "learning_rate": 0.0002, + "loss": 0.887, + "step": 1960 + }, + { + "epoch": 0.3184867836068224, + "grad_norm": 0.6061086654663086, + "learning_rate": 0.0002, + "loss": 0.7855, + "step": 1970 + }, + { + "epoch": 0.3201034677875677, + "grad_norm": 0.5342657566070557, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 1980 + }, + { + "epoch": 0.321720151968313, + "grad_norm": 0.5057743787765503, + "learning_rate": 0.0002, + "loss": 0.7955, + "step": 1990 + }, + { + "epoch": 0.3233368361490583, + "grad_norm": 0.528626024723053, + "learning_rate": 0.0002, + "loss": 0.7774, + "step": 2000 + }, + { + "epoch": 0.32495352032980357, + "grad_norm": 0.46742770075798035, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2010 + }, + { + "epoch": 0.32657020451054886, + "grad_norm": 0.515101432800293, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 2020 + }, + { + "epoch": 0.32818688869129414, + "grad_norm": 0.41941216588020325, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 2030 + }, + { + "epoch": 0.32980357287203943, + "grad_norm": 0.49902522563934326, + "learning_rate": 0.0002, + "loss": 0.7637, + "step": 2040 + }, + { + "epoch": 0.3314202570527847, + "grad_norm": 0.4120897650718689, + "learning_rate": 0.0002, + "loss": 0.7822, + "step": 2050 + }, + { + "epoch": 0.33303694123353, + "grad_norm": 0.45352041721343994, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 2060 + }, + { + "epoch": 0.33465362541427535, + "grad_norm": 0.523199737071991, + "learning_rate": 0.0002, + "loss": 0.7913, + "step": 2070 + }, + { + "epoch": 0.33627030959502063, + "grad_norm": 0.4390358626842499, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 2080 + }, + { + "epoch": 0.3378869937757659, + "grad_norm": 0.6752901077270508, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 2090 + }, + { + "epoch": 0.3395036779565112, + "grad_norm": 0.547821044921875, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 2100 + }, + { + "epoch": 0.3411203621372565, + "grad_norm": 0.5161308646202087, + "learning_rate": 0.0002, + "loss": 0.8561, + "step": 2110 + }, + { + "epoch": 0.3427370463180018, + "grad_norm": 0.4565401077270508, + "learning_rate": 0.0002, + "loss": 0.7697, + "step": 2120 + }, + { + "epoch": 0.34435373049874707, + "grad_norm": 0.4666115939617157, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 2130 + }, + { + "epoch": 0.34597041467949236, + "grad_norm": 0.4090428352355957, + "learning_rate": 0.0002, + "loss": 0.8189, + "step": 2140 + }, + { + "epoch": 0.34758709886023764, + "grad_norm": 0.510845422744751, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2150 + }, + { + "epoch": 0.34920378304098293, + "grad_norm": 0.42861923575401306, + "learning_rate": 0.0002, + "loss": 0.8398, + "step": 2160 + }, + { + "epoch": 0.3508204672217282, + "grad_norm": 0.4476332664489746, + "learning_rate": 0.0002, + "loss": 0.7716, + "step": 2170 + }, + { + "epoch": 0.3524371514024735, + "grad_norm": 0.6065791249275208, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2180 + }, + { + "epoch": 0.35405383558321885, + "grad_norm": 0.42335066199302673, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 2190 + }, + { + "epoch": 0.35567051976396413, + "grad_norm": 0.5094629526138306, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 2200 + }, + { + "epoch": 0.3572872039447094, + "grad_norm": 0.5476373434066772, + "learning_rate": 0.0002, + "loss": 0.7807, + "step": 2210 + }, + { + "epoch": 0.3589038881254547, + "grad_norm": 0.3911719024181366, + "learning_rate": 0.0002, + "loss": 0.814, + "step": 2220 + }, + { + "epoch": 0.3605205723062, + "grad_norm": 0.6599636077880859, + "learning_rate": 0.0002, + "loss": 0.8599, + "step": 2230 + }, + { + "epoch": 0.3621372564869453, + "grad_norm": 0.40381914377212524, + "learning_rate": 0.0002, + "loss": 0.7482, + "step": 2240 + }, + { + "epoch": 0.36375394066769057, + "grad_norm": 0.4433908462524414, + "learning_rate": 0.0002, + "loss": 0.7772, + "step": 2250 + }, + { + "epoch": 0.36537062484843585, + "grad_norm": 0.578326940536499, + "learning_rate": 0.0002, + "loss": 0.8503, + "step": 2260 + }, + { + "epoch": 0.36698730902918114, + "grad_norm": 0.5734784007072449, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2270 + }, + { + "epoch": 0.36860399320992643, + "grad_norm": 0.45555487275123596, + "learning_rate": 0.0002, + "loss": 0.8193, + "step": 2280 + }, + { + "epoch": 0.3702206773906717, + "grad_norm": 0.5666276216506958, + "learning_rate": 0.0002, + "loss": 0.7929, + "step": 2290 + }, + { + "epoch": 0.371837361571417, + "grad_norm": 0.5461117625236511, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 2300 + }, + { + "epoch": 0.3734540457521623, + "grad_norm": 0.6318911910057068, + "learning_rate": 0.0002, + "loss": 0.8204, + "step": 2310 + }, + { + "epoch": 0.37507072993290763, + "grad_norm": 0.493263304233551, + "learning_rate": 0.0002, + "loss": 0.7964, + "step": 2320 + }, + { + "epoch": 0.3766874141136529, + "grad_norm": 0.5888760089874268, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 2330 + }, + { + "epoch": 0.3783040982943982, + "grad_norm": 0.48671841621398926, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 2340 + }, + { + "epoch": 0.3799207824751435, + "grad_norm": 0.4385145306587219, + "learning_rate": 0.0002, + "loss": 0.8367, + "step": 2350 + }, + { + "epoch": 0.3815374666558888, + "grad_norm": 0.5523318648338318, + "learning_rate": 0.0002, + "loss": 0.812, + "step": 2360 + }, + { + "epoch": 0.38315415083663407, + "grad_norm": 0.7308220267295837, + "learning_rate": 0.0002, + "loss": 0.8351, + "step": 2370 + }, + { + "epoch": 0.38477083501737935, + "grad_norm": 0.554214358329773, + "learning_rate": 0.0002, + "loss": 0.859, + "step": 2380 + }, + { + "epoch": 0.38638751919812464, + "grad_norm": 0.5425800085067749, + "learning_rate": 0.0002, + "loss": 0.8146, + "step": 2390 + }, + { + "epoch": 0.3880042033788699, + "grad_norm": 0.48811158537864685, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 2400 + }, + { + "epoch": 0.3896208875596152, + "grad_norm": 0.49212366342544556, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 2410 + }, + { + "epoch": 0.3912375717403605, + "grad_norm": 0.5222218632698059, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 2420 + }, + { + "epoch": 0.3928542559211058, + "grad_norm": 0.4699819087982178, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 2430 + }, + { + "epoch": 0.39447094010185113, + "grad_norm": 0.46153587102890015, + "learning_rate": 0.0002, + "loss": 0.7919, + "step": 2440 + }, + { + "epoch": 0.3960876242825964, + "grad_norm": 0.4150611162185669, + "learning_rate": 0.0002, + "loss": 0.8111, + "step": 2450 + }, + { + "epoch": 0.3977043084633417, + "grad_norm": 0.5799614787101746, + "learning_rate": 0.0002, + "loss": 0.8589, + "step": 2460 + }, + { + "epoch": 0.399320992644087, + "grad_norm": 0.56536865234375, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 2470 + }, + { + "epoch": 0.4009376768248323, + "grad_norm": 0.5451247096061707, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 2480 + }, + { + "epoch": 0.40255436100557757, + "grad_norm": 0.5914521217346191, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 2490 + }, + { + "epoch": 0.40417104518632285, + "grad_norm": 0.4428117275238037, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 2500 + }, + { + "epoch": 0.40578772936706814, + "grad_norm": 0.48580947518348694, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 2510 + }, + { + "epoch": 0.4074044135478134, + "grad_norm": 0.436734676361084, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 2520 + }, + { + "epoch": 0.4090210977285587, + "grad_norm": 0.5752223134040833, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 2530 + }, + { + "epoch": 0.410637781909304, + "grad_norm": 0.4271308183670044, + "learning_rate": 0.0002, + "loss": 0.8181, + "step": 2540 + }, + { + "epoch": 0.4122544660900493, + "grad_norm": 0.46294718980789185, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 2550 + }, + { + "epoch": 0.4138711502707946, + "grad_norm": 0.49407583475112915, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 2560 + }, + { + "epoch": 0.4154878344515399, + "grad_norm": 0.4729035496711731, + "learning_rate": 0.0002, + "loss": 0.7881, + "step": 2570 + }, + { + "epoch": 0.4171045186322852, + "grad_norm": 0.4129747152328491, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 2580 + }, + { + "epoch": 0.4187212028130305, + "grad_norm": 0.5684236288070679, + "learning_rate": 0.0002, + "loss": 0.7859, + "step": 2590 + }, + { + "epoch": 0.4203378869937758, + "grad_norm": 0.4862157106399536, + "learning_rate": 0.0002, + "loss": 0.811, + "step": 2600 + }, + { + "epoch": 0.42195457117452106, + "grad_norm": 0.46567976474761963, + "learning_rate": 0.0002, + "loss": 0.7582, + "step": 2610 + }, + { + "epoch": 0.42357125535526635, + "grad_norm": 0.5710650682449341, + "learning_rate": 0.0002, + "loss": 0.7755, + "step": 2620 + }, + { + "epoch": 0.42518793953601164, + "grad_norm": 0.5660041570663452, + "learning_rate": 0.0002, + "loss": 0.8573, + "step": 2630 + }, + { + "epoch": 0.4268046237167569, + "grad_norm": 0.47944375872612, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 2640 + }, + { + "epoch": 0.4284213078975022, + "grad_norm": 0.537223756313324, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 2650 + }, + { + "epoch": 0.4300379920782475, + "grad_norm": 0.41669997572898865, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 2660 + }, + { + "epoch": 0.4316546762589928, + "grad_norm": 0.44727686047554016, + "learning_rate": 0.0002, + "loss": 0.7785, + "step": 2670 + }, + { + "epoch": 0.4332713604397381, + "grad_norm": 0.5600888729095459, + "learning_rate": 0.0002, + "loss": 0.8241, + "step": 2680 + }, + { + "epoch": 0.4348880446204834, + "grad_norm": 0.39820605516433716, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 2690 + }, + { + "epoch": 0.4365047288012287, + "grad_norm": 0.5637655854225159, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 2700 + }, + { + "epoch": 0.438121412981974, + "grad_norm": 0.6363666653633118, + "learning_rate": 0.0002, + "loss": 0.855, + "step": 2710 + }, + { + "epoch": 0.4397380971627193, + "grad_norm": 0.5656129121780396, + "learning_rate": 0.0002, + "loss": 0.8468, + "step": 2720 + }, + { + "epoch": 0.44135478134346456, + "grad_norm": 0.5600156188011169, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 2730 + }, + { + "epoch": 0.44297146552420985, + "grad_norm": 0.5506579875946045, + "learning_rate": 0.0002, + "loss": 0.8405, + "step": 2740 + }, + { + "epoch": 0.44458814970495514, + "grad_norm": 0.49878305196762085, + "learning_rate": 0.0002, + "loss": 0.7725, + "step": 2750 + }, + { + "epoch": 0.4462048338857004, + "grad_norm": 0.4569213092327118, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 2760 + }, + { + "epoch": 0.4478215180664457, + "grad_norm": 0.6056680083274841, + "learning_rate": 0.0002, + "loss": 0.8028, + "step": 2770 + }, + { + "epoch": 0.449438202247191, + "grad_norm": 0.44474557042121887, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 2780 + }, + { + "epoch": 0.4510548864279363, + "grad_norm": 0.46055394411087036, + "learning_rate": 0.0002, + "loss": 0.801, + "step": 2790 + }, + { + "epoch": 0.4526715706086816, + "grad_norm": 0.4904133379459381, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 2800 + }, + { + "epoch": 0.45428825478942686, + "grad_norm": 0.5647031664848328, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 2810 + }, + { + "epoch": 0.4559049389701722, + "grad_norm": 0.5759473443031311, + "learning_rate": 0.0002, + "loss": 0.8622, + "step": 2820 + }, + { + "epoch": 0.4575216231509175, + "grad_norm": 0.5161895751953125, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 2830 + }, + { + "epoch": 0.4591383073316628, + "grad_norm": 0.4248254597187042, + "learning_rate": 0.0002, + "loss": 0.8045, + "step": 2840 + }, + { + "epoch": 0.46075499151240806, + "grad_norm": 0.45395001769065857, + "learning_rate": 0.0002, + "loss": 0.7838, + "step": 2850 + }, + { + "epoch": 0.46237167569315335, + "grad_norm": 0.5358697772026062, + "learning_rate": 0.0002, + "loss": 0.8208, + "step": 2860 + }, + { + "epoch": 0.46398835987389864, + "grad_norm": 0.5379165410995483, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 2870 + }, + { + "epoch": 0.4656050440546439, + "grad_norm": 0.4601989686489105, + "learning_rate": 0.0002, + "loss": 0.7403, + "step": 2880 + }, + { + "epoch": 0.4672217282353892, + "grad_norm": 0.671115517616272, + "learning_rate": 0.0002, + "loss": 0.8523, + "step": 2890 + }, + { + "epoch": 0.4688384124161345, + "grad_norm": 0.4425133168697357, + "learning_rate": 0.0002, + "loss": 0.8262, + "step": 2900 + }, + { + "epoch": 0.4704550965968798, + "grad_norm": 0.5446155071258545, + "learning_rate": 0.0002, + "loss": 0.8178, + "step": 2910 + }, + { + "epoch": 0.47207178077762507, + "grad_norm": 0.603306233882904, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 2920 + }, + { + "epoch": 0.47368846495837036, + "grad_norm": 0.5377997159957886, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 2930 + }, + { + "epoch": 0.4753051491391157, + "grad_norm": 0.4931027591228485, + "learning_rate": 0.0002, + "loss": 0.8075, + "step": 2940 + }, + { + "epoch": 0.476921833319861, + "grad_norm": 0.4711960256099701, + "learning_rate": 0.0002, + "loss": 0.8004, + "step": 2950 + }, + { + "epoch": 0.4785385175006063, + "grad_norm": 0.5020492672920227, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 2960 + }, + { + "epoch": 0.48015520168135156, + "grad_norm": 0.5428946614265442, + "learning_rate": 0.0002, + "loss": 0.8221, + "step": 2970 + }, + { + "epoch": 0.48177188586209685, + "grad_norm": 0.5294089317321777, + "learning_rate": 0.0002, + "loss": 0.7849, + "step": 2980 + }, + { + "epoch": 0.48338857004284214, + "grad_norm": 0.648289144039154, + "learning_rate": 0.0002, + "loss": 0.8553, + "step": 2990 + }, + { + "epoch": 0.4850052542235874, + "grad_norm": 0.47916680574417114, + "learning_rate": 0.0002, + "loss": 0.7874, + "step": 3000 + }, + { + "epoch": 0.4866219384043327, + "grad_norm": 0.43849772214889526, + "learning_rate": 0.0002, + "loss": 0.8087, + "step": 3010 + }, + { + "epoch": 0.488238622585078, + "grad_norm": 0.47007861733436584, + "learning_rate": 0.0002, + "loss": 0.7662, + "step": 3020 + }, + { + "epoch": 0.4898553067658233, + "grad_norm": 0.6314331293106079, + "learning_rate": 0.0002, + "loss": 0.757, + "step": 3030 + }, + { + "epoch": 0.49147199094656857, + "grad_norm": 0.49211493134498596, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 3040 + }, + { + "epoch": 0.49308867512731386, + "grad_norm": 0.4537973403930664, + "learning_rate": 0.0002, + "loss": 0.8335, + "step": 3050 + }, + { + "epoch": 0.49470535930805914, + "grad_norm": 0.47326919436454773, + "learning_rate": 0.0002, + "loss": 0.8095, + "step": 3060 + }, + { + "epoch": 0.4963220434888045, + "grad_norm": 0.525874137878418, + "learning_rate": 0.0002, + "loss": 0.8447, + "step": 3070 + }, + { + "epoch": 0.4979387276695498, + "grad_norm": 0.6361091732978821, + "learning_rate": 0.0002, + "loss": 0.8339, + "step": 3080 + }, + { + "epoch": 0.49955541185029506, + "grad_norm": 0.5850642919540405, + "learning_rate": 0.0002, + "loss": 0.821, + "step": 3090 + }, + { + "epoch": 0.5011720960310403, + "grad_norm": 0.47299543023109436, + "learning_rate": 0.0002, + "loss": 0.8279, + "step": 3100 + }, + { + "epoch": 0.5027887802117856, + "grad_norm": 0.473099946975708, + "learning_rate": 0.0002, + "loss": 0.8681, + "step": 3110 + }, + { + "epoch": 0.5044054643925309, + "grad_norm": 0.48186397552490234, + "learning_rate": 0.0002, + "loss": 0.8223, + "step": 3120 + }, + { + "epoch": 0.5060221485732762, + "grad_norm": 0.5015401840209961, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 3130 + }, + { + "epoch": 0.5076388327540216, + "grad_norm": 0.5617750287055969, + "learning_rate": 0.0002, + "loss": 0.7692, + "step": 3140 + }, + { + "epoch": 0.5092555169347668, + "grad_norm": 0.5169327259063721, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 3150 + }, + { + "epoch": 0.5108722011155121, + "grad_norm": 0.545657753944397, + "learning_rate": 0.0002, + "loss": 0.7845, + "step": 3160 + }, + { + "epoch": 0.5124888852962574, + "grad_norm": 0.512864351272583, + "learning_rate": 0.0002, + "loss": 0.799, + "step": 3170 + }, + { + "epoch": 0.5141055694770027, + "grad_norm": 0.4113546311855316, + "learning_rate": 0.0002, + "loss": 0.7794, + "step": 3180 + }, + { + "epoch": 0.5157222536577479, + "grad_norm": 0.44532445073127747, + "learning_rate": 0.0002, + "loss": 0.8206, + "step": 3190 + }, + { + "epoch": 0.5173389378384933, + "grad_norm": 0.5623497366905212, + "learning_rate": 0.0002, + "loss": 0.8213, + "step": 3200 + }, + { + "epoch": 0.5189556220192385, + "grad_norm": 0.5084741115570068, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 3210 + }, + { + "epoch": 0.5205723061999838, + "grad_norm": 0.5305403470993042, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 3220 + }, + { + "epoch": 0.5221889903807291, + "grad_norm": 0.4708254337310791, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 3230 + }, + { + "epoch": 0.5238056745614744, + "grad_norm": 0.43827131390571594, + "learning_rate": 0.0002, + "loss": 0.7639, + "step": 3240 + }, + { + "epoch": 0.5254223587422197, + "grad_norm": 0.5630002617835999, + "learning_rate": 0.0002, + "loss": 0.7993, + "step": 3250 + }, + { + "epoch": 0.527039042922965, + "grad_norm": 0.5010961890220642, + "learning_rate": 0.0002, + "loss": 0.7522, + "step": 3260 + }, + { + "epoch": 0.5286557271037103, + "grad_norm": 0.6303122043609619, + "learning_rate": 0.0002, + "loss": 0.8374, + "step": 3270 + }, + { + "epoch": 0.5302724112844556, + "grad_norm": 0.5107331275939941, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 3280 + }, + { + "epoch": 0.5318890954652009, + "grad_norm": 0.5700443387031555, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 3290 + }, + { + "epoch": 0.5335057796459461, + "grad_norm": 0.46296367049217224, + "learning_rate": 0.0002, + "loss": 0.7776, + "step": 3300 + }, + { + "epoch": 0.5351224638266915, + "grad_norm": 0.531568706035614, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 3310 + }, + { + "epoch": 0.5367391480074367, + "grad_norm": 0.4686741530895233, + "learning_rate": 0.0002, + "loss": 0.843, + "step": 3320 + }, + { + "epoch": 0.5383558321881821, + "grad_norm": 0.5404331088066101, + "learning_rate": 0.0002, + "loss": 0.8104, + "step": 3330 + }, + { + "epoch": 0.5399725163689273, + "grad_norm": 0.6368790864944458, + "learning_rate": 0.0002, + "loss": 0.7686, + "step": 3340 + }, + { + "epoch": 0.5415892005496726, + "grad_norm": 0.42300888895988464, + "learning_rate": 0.0002, + "loss": 0.8514, + "step": 3350 + }, + { + "epoch": 0.5432058847304179, + "grad_norm": 0.5362542867660522, + "learning_rate": 0.0002, + "loss": 0.8236, + "step": 3360 + }, + { + "epoch": 0.5448225689111632, + "grad_norm": 0.497128963470459, + "learning_rate": 0.0002, + "loss": 0.858, + "step": 3370 + }, + { + "epoch": 0.5464392530919085, + "grad_norm": 0.5006386041641235, + "learning_rate": 0.0002, + "loss": 0.8519, + "step": 3380 + }, + { + "epoch": 0.5480559372726538, + "grad_norm": 0.44136837124824524, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 3390 + }, + { + "epoch": 0.5496726214533991, + "grad_norm": 0.5897833108901978, + "learning_rate": 0.0002, + "loss": 0.773, + "step": 3400 + }, + { + "epoch": 0.5512893056341444, + "grad_norm": 0.641075611114502, + "learning_rate": 0.0002, + "loss": 0.8895, + "step": 3410 + }, + { + "epoch": 0.5529059898148897, + "grad_norm": 0.7251322269439697, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 3420 + }, + { + "epoch": 0.5545226739956349, + "grad_norm": 0.47411349415779114, + "learning_rate": 0.0002, + "loss": 0.7626, + "step": 3430 + }, + { + "epoch": 0.5561393581763803, + "grad_norm": 0.4994310438632965, + "learning_rate": 0.0002, + "loss": 0.8196, + "step": 3440 + }, + { + "epoch": 0.5577560423571255, + "grad_norm": 0.5814438462257385, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 3450 + }, + { + "epoch": 0.5593727265378708, + "grad_norm": 0.6278898119926453, + "learning_rate": 0.0002, + "loss": 0.8805, + "step": 3460 + }, + { + "epoch": 0.5609894107186161, + "grad_norm": 0.46208274364471436, + "learning_rate": 0.0002, + "loss": 0.813, + "step": 3470 + }, + { + "epoch": 0.5626060948993614, + "grad_norm": 0.5718930959701538, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 3480 + }, + { + "epoch": 0.5642227790801067, + "grad_norm": 0.48178744316101074, + "learning_rate": 0.0002, + "loss": 0.8152, + "step": 3490 + }, + { + "epoch": 0.565839463260852, + "grad_norm": 0.47336965799331665, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3500 + }, + { + "epoch": 0.5674561474415973, + "grad_norm": 0.43442684412002563, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 3510 + }, + { + "epoch": 0.5690728316223426, + "grad_norm": 0.6463358998298645, + "learning_rate": 0.0002, + "loss": 0.7564, + "step": 3520 + }, + { + "epoch": 0.5706895158030879, + "grad_norm": 0.5286486744880676, + "learning_rate": 0.0002, + "loss": 0.836, + "step": 3530 + }, + { + "epoch": 0.5723061999838331, + "grad_norm": 0.5405499935150146, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 3540 + }, + { + "epoch": 0.5739228841645785, + "grad_norm": 0.6654391884803772, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 3550 + }, + { + "epoch": 0.5755395683453237, + "grad_norm": 0.5081980228424072, + "learning_rate": 0.0002, + "loss": 0.7803, + "step": 3560 + }, + { + "epoch": 0.5771562525260691, + "grad_norm": 0.48978179693222046, + "learning_rate": 0.0002, + "loss": 0.7753, + "step": 3570 + }, + { + "epoch": 0.5787729367068143, + "grad_norm": 0.5840612053871155, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3580 + }, + { + "epoch": 0.5803896208875596, + "grad_norm": 0.5235261917114258, + "learning_rate": 0.0002, + "loss": 0.8937, + "step": 3590 + }, + { + "epoch": 0.5820063050683049, + "grad_norm": 0.5672075748443604, + "learning_rate": 0.0002, + "loss": 0.7894, + "step": 3600 + }, + { + "epoch": 0.5836229892490502, + "grad_norm": 0.5613429546356201, + "learning_rate": 0.0002, + "loss": 0.8347, + "step": 3610 + }, + { + "epoch": 0.5852396734297954, + "grad_norm": 0.4032273590564728, + "learning_rate": 0.0002, + "loss": 0.8274, + "step": 3620 + }, + { + "epoch": 0.5868563576105408, + "grad_norm": 0.49559324979782104, + "learning_rate": 0.0002, + "loss": 0.8421, + "step": 3630 + }, + { + "epoch": 0.5884730417912861, + "grad_norm": 0.6895697712898254, + "learning_rate": 0.0002, + "loss": 0.8332, + "step": 3640 + }, + { + "epoch": 0.5900897259720314, + "grad_norm": 0.4750136435031891, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 3650 + }, + { + "epoch": 0.5917064101527767, + "grad_norm": 0.5176819562911987, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 3660 + }, + { + "epoch": 0.5933230943335219, + "grad_norm": 0.5817760229110718, + "learning_rate": 0.0002, + "loss": 0.8151, + "step": 3670 + }, + { + "epoch": 0.5949397785142673, + "grad_norm": 0.6064626574516296, + "learning_rate": 0.0002, + "loss": 0.7823, + "step": 3680 + }, + { + "epoch": 0.5965564626950125, + "grad_norm": 0.6728700995445251, + "learning_rate": 0.0002, + "loss": 0.8422, + "step": 3690 + }, + { + "epoch": 0.5981731468757578, + "grad_norm": 0.609305202960968, + "learning_rate": 0.0002, + "loss": 0.7679, + "step": 3700 + }, + { + "epoch": 0.5997898310565031, + "grad_norm": 0.4615488350391388, + "learning_rate": 0.0002, + "loss": 0.8048, + "step": 3710 + }, + { + "epoch": 0.6014065152372484, + "grad_norm": 2.0531179904937744, + "learning_rate": 0.0002, + "loss": 0.8214, + "step": 3720 + }, + { + "epoch": 0.6030231994179936, + "grad_norm": 0.5091132521629333, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 3730 + }, + { + "epoch": 0.604639883598739, + "grad_norm": 0.5951124429702759, + "learning_rate": 0.0002, + "loss": 0.7833, + "step": 3740 + }, + { + "epoch": 0.6062565677794842, + "grad_norm": 0.5870208144187927, + "learning_rate": 0.0002, + "loss": 0.7784, + "step": 3750 + }, + { + "epoch": 0.6078732519602296, + "grad_norm": 0.6254619359970093, + "learning_rate": 0.0002, + "loss": 0.8044, + "step": 3760 + }, + { + "epoch": 0.6094899361409749, + "grad_norm": 0.5577626824378967, + "learning_rate": 0.0002, + "loss": 0.7868, + "step": 3770 + }, + { + "epoch": 0.6111066203217201, + "grad_norm": 0.5004405379295349, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 3780 + }, + { + "epoch": 0.6127233045024655, + "grad_norm": 0.5527383685112, + "learning_rate": 0.0002, + "loss": 0.8092, + "step": 3790 + }, + { + "epoch": 0.6143399886832107, + "grad_norm": 0.49116113781929016, + "learning_rate": 0.0002, + "loss": 0.8036, + "step": 3800 + }, + { + "epoch": 0.6159566728639561, + "grad_norm": 0.5299299359321594, + "learning_rate": 0.0002, + "loss": 0.8352, + "step": 3810 + }, + { + "epoch": 0.6175733570447013, + "grad_norm": 0.464897483587265, + "learning_rate": 0.0002, + "loss": 0.7737, + "step": 3820 + }, + { + "epoch": 0.6191900412254466, + "grad_norm": 0.6505740880966187, + "learning_rate": 0.0002, + "loss": 0.7923, + "step": 3830 + }, + { + "epoch": 0.6208067254061919, + "grad_norm": 0.5512559413909912, + "learning_rate": 0.0002, + "loss": 0.8123, + "step": 3840 + }, + { + "epoch": 0.6224234095869372, + "grad_norm": 0.49427518248558044, + "learning_rate": 0.0002, + "loss": 0.8856, + "step": 3850 + }, + { + "epoch": 0.6240400937676824, + "grad_norm": 0.3839147090911865, + "learning_rate": 0.0002, + "loss": 0.7751, + "step": 3860 + }, + { + "epoch": 0.6256567779484278, + "grad_norm": 0.5760218501091003, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 3870 + }, + { + "epoch": 0.6272734621291731, + "grad_norm": 0.7226507067680359, + "learning_rate": 0.0002, + "loss": 0.7836, + "step": 3880 + }, + { + "epoch": 0.6288901463099184, + "grad_norm": 0.676781415939331, + "learning_rate": 0.0002, + "loss": 0.8244, + "step": 3890 + }, + { + "epoch": 0.6305068304906637, + "grad_norm": 0.4284018278121948, + "learning_rate": 0.0002, + "loss": 0.8239, + "step": 3900 + }, + { + "epoch": 0.6321235146714089, + "grad_norm": 0.5060628056526184, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 3910 + }, + { + "epoch": 0.6337401988521543, + "grad_norm": 0.5524522066116333, + "learning_rate": 0.0002, + "loss": 0.8089, + "step": 3920 + }, + { + "epoch": 0.6353568830328995, + "grad_norm": 0.6099881529808044, + "learning_rate": 0.0002, + "loss": 0.8276, + "step": 3930 + }, + { + "epoch": 0.6369735672136448, + "grad_norm": 0.43155938386917114, + "learning_rate": 0.0002, + "loss": 0.809, + "step": 3940 + }, + { + "epoch": 0.6385902513943901, + "grad_norm": 0.6427084803581238, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 3950 + }, + { + "epoch": 0.6402069355751354, + "grad_norm": 0.541220486164093, + "learning_rate": 0.0002, + "loss": 0.8368, + "step": 3960 + }, + { + "epoch": 0.6418236197558806, + "grad_norm": 0.5414294600486755, + "learning_rate": 0.0002, + "loss": 0.8539, + "step": 3970 + }, + { + "epoch": 0.643440303936626, + "grad_norm": 0.46344003081321716, + "learning_rate": 0.0002, + "loss": 0.7996, + "step": 3980 + }, + { + "epoch": 0.6450569881173712, + "grad_norm": 0.45209285616874695, + "learning_rate": 0.0002, + "loss": 0.7474, + "step": 3990 + }, + { + "epoch": 0.6466736722981166, + "grad_norm": 0.5417284369468689, + "learning_rate": 0.0002, + "loss": 0.8202, + "step": 4000 + }, + { + "epoch": 0.6482903564788619, + "grad_norm": 0.7995685935020447, + "learning_rate": 0.0002, + "loss": 0.7563, + "step": 4010 + }, + { + "epoch": 0.6499070406596071, + "grad_norm": 0.6384002566337585, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 4020 + }, + { + "epoch": 0.6515237248403525, + "grad_norm": 0.4472815692424774, + "learning_rate": 0.0002, + "loss": 0.732, + "step": 4030 + }, + { + "epoch": 0.6531404090210977, + "grad_norm": 0.6834294199943542, + "learning_rate": 0.0002, + "loss": 0.8071, + "step": 4040 + }, + { + "epoch": 0.654757093201843, + "grad_norm": 0.4612339735031128, + "learning_rate": 0.0002, + "loss": 0.7812, + "step": 4050 + }, + { + "epoch": 0.6563737773825883, + "grad_norm": 0.9266576170921326, + "learning_rate": 0.0002, + "loss": 0.8141, + "step": 4060 + }, + { + "epoch": 0.6579904615633336, + "grad_norm": 0.4470861852169037, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 4070 + }, + { + "epoch": 0.6596071457440789, + "grad_norm": 0.45544925332069397, + "learning_rate": 0.0002, + "loss": 0.8293, + "step": 4080 + }, + { + "epoch": 0.6612238299248242, + "grad_norm": 0.6144481301307678, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 4090 + }, + { + "epoch": 0.6628405141055694, + "grad_norm": 0.5936288237571716, + "learning_rate": 0.0002, + "loss": 0.7877, + "step": 4100 + }, + { + "epoch": 0.6644571982863148, + "grad_norm": 0.4822963774204254, + "learning_rate": 0.0002, + "loss": 0.7617, + "step": 4110 + }, + { + "epoch": 0.66607388246706, + "grad_norm": 0.48432496190071106, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 4120 + }, + { + "epoch": 0.6676905666478054, + "grad_norm": 0.4901607930660248, + "learning_rate": 0.0002, + "loss": 0.8404, + "step": 4130 + }, + { + "epoch": 0.6693072508285507, + "grad_norm": 0.5018393397331238, + "learning_rate": 0.0002, + "loss": 0.8085, + "step": 4140 + }, + { + "epoch": 0.6709239350092959, + "grad_norm": 0.6946378946304321, + "learning_rate": 0.0002, + "loss": 0.8065, + "step": 4150 + }, + { + "epoch": 0.6725406191900413, + "grad_norm": 0.5997390747070312, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 4160 + }, + { + "epoch": 0.6741573033707865, + "grad_norm": 0.6738849878311157, + "learning_rate": 0.0002, + "loss": 0.8268, + "step": 4170 + }, + { + "epoch": 0.6757739875515318, + "grad_norm": 0.6110581159591675, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 4180 + }, + { + "epoch": 0.6773906717322771, + "grad_norm": 0.5703322291374207, + "learning_rate": 0.0002, + "loss": 0.8043, + "step": 4190 + }, + { + "epoch": 0.6790073559130224, + "grad_norm": 0.4686066210269928, + "learning_rate": 0.0002, + "loss": 0.8099, + "step": 4200 + }, + { + "epoch": 0.6806240400937676, + "grad_norm": 0.6394643783569336, + "learning_rate": 0.0002, + "loss": 0.8441, + "step": 4210 + }, + { + "epoch": 0.682240724274513, + "grad_norm": 0.5454841256141663, + "learning_rate": 0.0002, + "loss": 0.8011, + "step": 4220 + }, + { + "epoch": 0.6838574084552582, + "grad_norm": 0.4859732985496521, + "learning_rate": 0.0002, + "loss": 0.8307, + "step": 4230 + }, + { + "epoch": 0.6854740926360036, + "grad_norm": 0.5544065833091736, + "learning_rate": 0.0002, + "loss": 0.8161, + "step": 4240 + }, + { + "epoch": 0.6870907768167488, + "grad_norm": 0.4902505576610565, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 4250 + }, + { + "epoch": 0.6887074609974941, + "grad_norm": 0.4768051505088806, + "learning_rate": 0.0002, + "loss": 0.7977, + "step": 4260 + }, + { + "epoch": 0.6903241451782395, + "grad_norm": 0.49982190132141113, + "learning_rate": 0.0002, + "loss": 0.7539, + "step": 4270 + }, + { + "epoch": 0.6919408293589847, + "grad_norm": 0.6351838111877441, + "learning_rate": 0.0002, + "loss": 0.7353, + "step": 4280 + }, + { + "epoch": 0.69355751353973, + "grad_norm": 0.5647561550140381, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 4290 + }, + { + "epoch": 0.6951741977204753, + "grad_norm": 0.5340486764907837, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 4300 + }, + { + "epoch": 0.6967908819012206, + "grad_norm": 0.5649092793464661, + "learning_rate": 0.0002, + "loss": 0.8526, + "step": 4310 + }, + { + "epoch": 0.6984075660819659, + "grad_norm": 0.6183916926383972, + "learning_rate": 0.0002, + "loss": 0.8246, + "step": 4320 + }, + { + "epoch": 0.7000242502627112, + "grad_norm": 0.6154509782791138, + "learning_rate": 0.0002, + "loss": 0.792, + "step": 4330 + }, + { + "epoch": 0.7016409344434564, + "grad_norm": 0.5156264305114746, + "learning_rate": 0.0002, + "loss": 0.8397, + "step": 4340 + }, + { + "epoch": 0.7032576186242018, + "grad_norm": 0.562171459197998, + "learning_rate": 0.0002, + "loss": 0.8512, + "step": 4350 + }, + { + "epoch": 0.704874302804947, + "grad_norm": 0.4949502646923065, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 4360 + }, + { + "epoch": 0.7064909869856923, + "grad_norm": 0.5171684622764587, + "learning_rate": 0.0002, + "loss": 0.738, + "step": 4370 + }, + { + "epoch": 0.7081076711664377, + "grad_norm": 0.6198443174362183, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 4380 + }, + { + "epoch": 0.7097243553471829, + "grad_norm": 0.5802276134490967, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 4390 + }, + { + "epoch": 0.7113410395279283, + "grad_norm": 0.41096967458724976, + "learning_rate": 0.0002, + "loss": 0.8797, + "step": 4400 + }, + { + "epoch": 0.7129577237086735, + "grad_norm": 0.4397392272949219, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 4410 + }, + { + "epoch": 0.7145744078894188, + "grad_norm": 0.45228442549705505, + "learning_rate": 0.0002, + "loss": 0.7651, + "step": 4420 + }, + { + "epoch": 0.7161910920701641, + "grad_norm": 0.4839673936367035, + "learning_rate": 0.0002, + "loss": 0.7938, + "step": 4430 + }, + { + "epoch": 0.7178077762509094, + "grad_norm": 0.6140755414962769, + "learning_rate": 0.0002, + "loss": 0.8362, + "step": 4440 + }, + { + "epoch": 0.7194244604316546, + "grad_norm": 0.6841378808021545, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 4450 + }, + { + "epoch": 0.7210411446124, + "grad_norm": 0.6664239168167114, + "learning_rate": 0.0002, + "loss": 0.8177, + "step": 4460 + }, + { + "epoch": 0.7226578287931452, + "grad_norm": 0.47552719712257385, + "learning_rate": 0.0002, + "loss": 0.7983, + "step": 4470 + }, + { + "epoch": 0.7242745129738906, + "grad_norm": 0.6649776101112366, + "learning_rate": 0.0002, + "loss": 0.8982, + "step": 4480 + }, + { + "epoch": 0.7258911971546358, + "grad_norm": 0.5159541964530945, + "learning_rate": 0.0002, + "loss": 0.8074, + "step": 4490 + }, + { + "epoch": 0.7275078813353811, + "grad_norm": 0.6693112850189209, + "learning_rate": 0.0002, + "loss": 0.7786, + "step": 4500 + }, + { + "epoch": 0.7291245655161265, + "grad_norm": 0.48870977759361267, + "learning_rate": 0.0002, + "loss": 0.8655, + "step": 4510 + }, + { + "epoch": 0.7307412496968717, + "grad_norm": 0.4857887923717499, + "learning_rate": 0.0002, + "loss": 0.7337, + "step": 4520 + }, + { + "epoch": 0.732357933877617, + "grad_norm": 0.5515662431716919, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 4530 + }, + { + "epoch": 0.7339746180583623, + "grad_norm": 0.6292222738265991, + "learning_rate": 0.0002, + "loss": 0.8031, + "step": 4540 + }, + { + "epoch": 0.7355913022391076, + "grad_norm": 0.48265689611434937, + "learning_rate": 0.0002, + "loss": 0.7749, + "step": 4550 + }, + { + "epoch": 0.7372079864198529, + "grad_norm": 0.8044266104698181, + "learning_rate": 0.0002, + "loss": 0.8499, + "step": 4560 + }, + { + "epoch": 0.7388246706005982, + "grad_norm": 0.6111769676208496, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 4570 + }, + { + "epoch": 0.7404413547813434, + "grad_norm": 0.5229553580284119, + "learning_rate": 0.0002, + "loss": 0.7291, + "step": 4580 + }, + { + "epoch": 0.7420580389620888, + "grad_norm": 0.6054152250289917, + "learning_rate": 0.0002, + "loss": 0.8038, + "step": 4590 + }, + { + "epoch": 0.743674723142834, + "grad_norm": 0.5574966669082642, + "learning_rate": 0.0002, + "loss": 0.8169, + "step": 4600 + }, + { + "epoch": 0.7452914073235793, + "grad_norm": 0.5395817160606384, + "learning_rate": 0.0002, + "loss": 0.8439, + "step": 4610 + }, + { + "epoch": 0.7469080915043246, + "grad_norm": 0.7116472721099854, + "learning_rate": 0.0002, + "loss": 0.8495, + "step": 4620 + }, + { + "epoch": 0.7485247756850699, + "grad_norm": 0.5618700981140137, + "learning_rate": 0.0002, + "loss": 0.7743, + "step": 4630 + }, + { + "epoch": 0.7501414598658153, + "grad_norm": 0.5802770853042603, + "learning_rate": 0.0002, + "loss": 0.7744, + "step": 4640 + }, + { + "epoch": 0.7517581440465605, + "grad_norm": 0.5690428018569946, + "learning_rate": 0.0002, + "loss": 0.7924, + "step": 4650 + }, + { + "epoch": 0.7533748282273058, + "grad_norm": 0.4813360273838043, + "learning_rate": 0.0002, + "loss": 0.8017, + "step": 4660 + }, + { + "epoch": 0.7549915124080511, + "grad_norm": 0.5434042811393738, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 4670 + }, + { + "epoch": 0.7566081965887964, + "grad_norm": 0.5502099990844727, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 4680 + }, + { + "epoch": 0.7582248807695416, + "grad_norm": 0.6020621061325073, + "learning_rate": 0.0002, + "loss": 0.8598, + "step": 4690 + }, + { + "epoch": 0.759841564950287, + "grad_norm": 0.4922301471233368, + "learning_rate": 0.0002, + "loss": 0.7937, + "step": 4700 + }, + { + "epoch": 0.7614582491310322, + "grad_norm": 0.6492828726768494, + "learning_rate": 0.0002, + "loss": 0.788, + "step": 4710 + }, + { + "epoch": 0.7630749333117776, + "grad_norm": 0.4865580201148987, + "learning_rate": 0.0002, + "loss": 0.8313, + "step": 4720 + }, + { + "epoch": 0.7646916174925228, + "grad_norm": 0.5971422791481018, + "learning_rate": 0.0002, + "loss": 0.7966, + "step": 4730 + }, + { + "epoch": 0.7663083016732681, + "grad_norm": 0.6832674145698547, + "learning_rate": 0.0002, + "loss": 0.8298, + "step": 4740 + }, + { + "epoch": 0.7679249858540134, + "grad_norm": 0.500908613204956, + "learning_rate": 0.0002, + "loss": 0.8156, + "step": 4750 + }, + { + "epoch": 0.7695416700347587, + "grad_norm": 0.6112465858459473, + "learning_rate": 0.0002, + "loss": 0.8383, + "step": 4760 + }, + { + "epoch": 0.771158354215504, + "grad_norm": 0.5753506422042847, + "learning_rate": 0.0002, + "loss": 0.76, + "step": 4770 + }, + { + "epoch": 0.7727750383962493, + "grad_norm": 0.6529405117034912, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 4780 + }, + { + "epoch": 0.7743917225769946, + "grad_norm": 0.5916843414306641, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 4790 + }, + { + "epoch": 0.7760084067577399, + "grad_norm": 0.4821224510669708, + "learning_rate": 0.0002, + "loss": 0.83, + "step": 4800 + }, + { + "epoch": 0.7776250909384852, + "grad_norm": 0.5532580018043518, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 4810 + }, + { + "epoch": 0.7792417751192304, + "grad_norm": 0.4604877233505249, + "learning_rate": 0.0002, + "loss": 0.7363, + "step": 4820 + }, + { + "epoch": 0.7808584592999758, + "grad_norm": 0.5009613037109375, + "learning_rate": 0.0002, + "loss": 0.7506, + "step": 4830 + }, + { + "epoch": 0.782475143480721, + "grad_norm": 0.6448560357093811, + "learning_rate": 0.0002, + "loss": 0.7863, + "step": 4840 + }, + { + "epoch": 0.7840918276614663, + "grad_norm": 0.44327953457832336, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 4850 + }, + { + "epoch": 0.7857085118422116, + "grad_norm": 0.5355411171913147, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 4860 + }, + { + "epoch": 0.7873251960229569, + "grad_norm": 0.5635677576065063, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 4870 + }, + { + "epoch": 0.7889418802037023, + "grad_norm": 0.5417491793632507, + "learning_rate": 0.0002, + "loss": 0.7931, + "step": 4880 + }, + { + "epoch": 0.7905585643844475, + "grad_norm": 0.4567430913448334, + "learning_rate": 0.0002, + "loss": 0.7819, + "step": 4890 + }, + { + "epoch": 0.7921752485651928, + "grad_norm": 0.44651296734809875, + "learning_rate": 0.0002, + "loss": 0.8454, + "step": 4900 + }, + { + "epoch": 0.7937919327459381, + "grad_norm": 0.5741217136383057, + "learning_rate": 0.0002, + "loss": 0.7959, + "step": 4910 + }, + { + "epoch": 0.7954086169266834, + "grad_norm": 0.6605045199394226, + "learning_rate": 0.0002, + "loss": 0.8093, + "step": 4920 + }, + { + "epoch": 0.7970253011074286, + "grad_norm": 0.5126531720161438, + "learning_rate": 0.0002, + "loss": 0.77, + "step": 4930 + }, + { + "epoch": 0.798641985288174, + "grad_norm": 0.513648271560669, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 4940 + }, + { + "epoch": 0.8002586694689192, + "grad_norm": 0.5350404381752014, + "learning_rate": 0.0002, + "loss": 0.8314, + "step": 4950 + }, + { + "epoch": 0.8018753536496646, + "grad_norm": 0.5731674432754517, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 4960 + }, + { + "epoch": 0.8034920378304098, + "grad_norm": 0.5974258184432983, + "learning_rate": 0.0002, + "loss": 0.8572, + "step": 4970 + }, + { + "epoch": 0.8051087220111551, + "grad_norm": 0.8774799704551697, + "learning_rate": 0.0002, + "loss": 0.7972, + "step": 4980 + }, + { + "epoch": 0.8067254061919004, + "grad_norm": 0.5994430184364319, + "learning_rate": 0.0002, + "loss": 0.7899, + "step": 4990 + }, + { + "epoch": 0.8083420903726457, + "grad_norm": 0.4894903004169464, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 5000 + }, + { + "epoch": 0.809958774553391, + "grad_norm": 0.5218459367752075, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5010 + }, + { + "epoch": 0.8115754587341363, + "grad_norm": 0.5232468843460083, + "learning_rate": 0.0002, + "loss": 0.817, + "step": 5020 + }, + { + "epoch": 0.8131921429148816, + "grad_norm": 0.44358372688293457, + "learning_rate": 0.0002, + "loss": 0.7704, + "step": 5030 + }, + { + "epoch": 0.8148088270956269, + "grad_norm": 0.6202037334442139, + "learning_rate": 0.0002, + "loss": 0.785, + "step": 5040 + }, + { + "epoch": 0.8164255112763722, + "grad_norm": 0.7721474170684814, + "learning_rate": 0.0002, + "loss": 0.7351, + "step": 5050 + }, + { + "epoch": 0.8180421954571174, + "grad_norm": 0.5568501353263855, + "learning_rate": 0.0002, + "loss": 0.8297, + "step": 5060 + }, + { + "epoch": 0.8196588796378628, + "grad_norm": 0.49148809909820557, + "learning_rate": 0.0002, + "loss": 0.7733, + "step": 5070 + }, + { + "epoch": 0.821275563818608, + "grad_norm": 0.4956012964248657, + "learning_rate": 0.0002, + "loss": 0.8054, + "step": 5080 + }, + { + "epoch": 0.8228922479993533, + "grad_norm": 0.6078833937644958, + "learning_rate": 0.0002, + "loss": 0.8201, + "step": 5090 + }, + { + "epoch": 0.8245089321800986, + "grad_norm": 0.46906954050064087, + "learning_rate": 0.0002, + "loss": 0.828, + "step": 5100 + }, + { + "epoch": 0.8261256163608439, + "grad_norm": 0.50812166929245, + "learning_rate": 0.0002, + "loss": 0.7703, + "step": 5110 + }, + { + "epoch": 0.8277423005415891, + "grad_norm": 0.5319661498069763, + "learning_rate": 0.0002, + "loss": 0.8243, + "step": 5120 + }, + { + "epoch": 0.8293589847223345, + "grad_norm": 0.4949689209461212, + "learning_rate": 0.0002, + "loss": 0.7798, + "step": 5130 + }, + { + "epoch": 0.8309756689030798, + "grad_norm": 0.5151591300964355, + "learning_rate": 0.0002, + "loss": 0.7428, + "step": 5140 + }, + { + "epoch": 0.8325923530838251, + "grad_norm": 0.5530214309692383, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 5150 + }, + { + "epoch": 0.8342090372645704, + "grad_norm": 0.6297410130500793, + "learning_rate": 0.0002, + "loss": 0.8251, + "step": 5160 + }, + { + "epoch": 0.8358257214453156, + "grad_norm": 0.5466840267181396, + "learning_rate": 0.0002, + "loss": 0.8067, + "step": 5170 + }, + { + "epoch": 0.837442405626061, + "grad_norm": 0.652913510799408, + "learning_rate": 0.0002, + "loss": 0.7875, + "step": 5180 + }, + { + "epoch": 0.8390590898068062, + "grad_norm": 0.5811293125152588, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 5190 + }, + { + "epoch": 0.8406757739875516, + "grad_norm": 0.5109550952911377, + "learning_rate": 0.0002, + "loss": 0.7412, + "step": 5200 + }, + { + "epoch": 0.8422924581682968, + "grad_norm": 0.4551706612110138, + "learning_rate": 0.0002, + "loss": 0.8077, + "step": 5210 + }, + { + "epoch": 0.8439091423490421, + "grad_norm": 0.5813754200935364, + "learning_rate": 0.0002, + "loss": 0.7827, + "step": 5220 + }, + { + "epoch": 0.8455258265297874, + "grad_norm": 0.5856947898864746, + "learning_rate": 0.0002, + "loss": 0.802, + "step": 5230 + }, + { + "epoch": 0.8471425107105327, + "grad_norm": 0.5482739210128784, + "learning_rate": 0.0002, + "loss": 0.7957, + "step": 5240 + }, + { + "epoch": 0.8487591948912779, + "grad_norm": 0.49023720622062683, + "learning_rate": 0.0002, + "loss": 0.8295, + "step": 5250 + }, + { + "epoch": 0.8503758790720233, + "grad_norm": 0.49472475051879883, + "learning_rate": 0.0002, + "loss": 0.8022, + "step": 5260 + }, + { + "epoch": 0.8519925632527686, + "grad_norm": 0.5490226745605469, + "learning_rate": 0.0002, + "loss": 0.8001, + "step": 5270 + }, + { + "epoch": 0.8536092474335139, + "grad_norm": 0.5340665578842163, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 5280 + }, + { + "epoch": 0.8552259316142592, + "grad_norm": 0.5962483882904053, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 5290 + }, + { + "epoch": 0.8568426157950044, + "grad_norm": 0.586358368396759, + "learning_rate": 0.0002, + "loss": 0.8765, + "step": 5300 + }, + { + "epoch": 0.8584592999757498, + "grad_norm": 0.49120277166366577, + "learning_rate": 0.0002, + "loss": 0.7831, + "step": 5310 + }, + { + "epoch": 0.860075984156495, + "grad_norm": 0.5887332558631897, + "learning_rate": 0.0002, + "loss": 0.8162, + "step": 5320 + }, + { + "epoch": 0.8616926683372403, + "grad_norm": 0.42496153712272644, + "learning_rate": 0.0002, + "loss": 0.7464, + "step": 5330 + }, + { + "epoch": 0.8633093525179856, + "grad_norm": 0.5489874482154846, + "learning_rate": 0.0002, + "loss": 0.7905, + "step": 5340 + }, + { + "epoch": 0.8649260366987309, + "grad_norm": 0.5850813984870911, + "learning_rate": 0.0002, + "loss": 0.7958, + "step": 5350 + }, + { + "epoch": 0.8665427208794761, + "grad_norm": 0.517487108707428, + "learning_rate": 0.0002, + "loss": 0.7642, + "step": 5360 + }, + { + "epoch": 0.8681594050602215, + "grad_norm": 0.5339142680168152, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 5370 + }, + { + "epoch": 0.8697760892409668, + "grad_norm": 0.6236387491226196, + "learning_rate": 0.0002, + "loss": 0.818, + "step": 5380 + }, + { + "epoch": 0.8713927734217121, + "grad_norm": 0.5752192735671997, + "learning_rate": 0.0002, + "loss": 0.7708, + "step": 5390 + }, + { + "epoch": 0.8730094576024574, + "grad_norm": 0.6724614500999451, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 5400 + }, + { + "epoch": 0.8746261417832026, + "grad_norm": 0.5280613303184509, + "learning_rate": 0.0002, + "loss": 0.7581, + "step": 5410 + }, + { + "epoch": 0.876242825963948, + "grad_norm": 0.44033288955688477, + "learning_rate": 0.0002, + "loss": 0.8231, + "step": 5420 + }, + { + "epoch": 0.8778595101446932, + "grad_norm": 0.5199708342552185, + "learning_rate": 0.0002, + "loss": 0.8839, + "step": 5430 + }, + { + "epoch": 0.8794761943254386, + "grad_norm": 0.46778348088264465, + "learning_rate": 0.0002, + "loss": 0.7852, + "step": 5440 + }, + { + "epoch": 0.8810928785061838, + "grad_norm": 0.4657754898071289, + "learning_rate": 0.0002, + "loss": 0.7834, + "step": 5450 + }, + { + "epoch": 0.8827095626869291, + "grad_norm": 0.5472902655601501, + "learning_rate": 0.0002, + "loss": 0.7799, + "step": 5460 + }, + { + "epoch": 0.8843262468676744, + "grad_norm": 0.4876766800880432, + "learning_rate": 0.0002, + "loss": 0.8253, + "step": 5470 + }, + { + "epoch": 0.8859429310484197, + "grad_norm": 0.5057248473167419, + "learning_rate": 0.0002, + "loss": 0.7906, + "step": 5480 + }, + { + "epoch": 0.8875596152291649, + "grad_norm": 0.4637320637702942, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 5490 + }, + { + "epoch": 0.8891762994099103, + "grad_norm": 0.471955806016922, + "learning_rate": 0.0002, + "loss": 0.781, + "step": 5500 + }, + { + "epoch": 0.8907929835906556, + "grad_norm": 0.5209813714027405, + "learning_rate": 0.0002, + "loss": 0.8057, + "step": 5510 + }, + { + "epoch": 0.8924096677714008, + "grad_norm": 0.6213834285736084, + "learning_rate": 0.0002, + "loss": 0.8106, + "step": 5520 + }, + { + "epoch": 0.8940263519521462, + "grad_norm": 0.5215408205986023, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 5530 + }, + { + "epoch": 0.8956430361328914, + "grad_norm": 0.580478310585022, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 5540 + }, + { + "epoch": 0.8972597203136368, + "grad_norm": 0.49102169275283813, + "learning_rate": 0.0002, + "loss": 0.8371, + "step": 5550 + }, + { + "epoch": 0.898876404494382, + "grad_norm": 0.6043479442596436, + "learning_rate": 0.0002, + "loss": 0.7806, + "step": 5560 + }, + { + "epoch": 0.9004930886751273, + "grad_norm": 0.5636463165283203, + "learning_rate": 0.0002, + "loss": 0.7754, + "step": 5570 + }, + { + "epoch": 0.9021097728558726, + "grad_norm": 0.5620124340057373, + "learning_rate": 0.0002, + "loss": 0.8145, + "step": 5580 + }, + { + "epoch": 0.9037264570366179, + "grad_norm": 0.5206354856491089, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 5590 + }, + { + "epoch": 0.9053431412173631, + "grad_norm": 0.5798229575157166, + "learning_rate": 0.0002, + "loss": 0.8557, + "step": 5600 + }, + { + "epoch": 0.9069598253981085, + "grad_norm": 0.6428212523460388, + "learning_rate": 0.0002, + "loss": 0.8097, + "step": 5610 + }, + { + "epoch": 0.9085765095788537, + "grad_norm": 0.48064687848091125, + "learning_rate": 0.0002, + "loss": 0.7839, + "step": 5620 + }, + { + "epoch": 0.9101931937595991, + "grad_norm": 0.6347860097885132, + "learning_rate": 0.0002, + "loss": 0.8343, + "step": 5630 + }, + { + "epoch": 0.9118098779403444, + "grad_norm": 0.5353913307189941, + "learning_rate": 0.0002, + "loss": 0.851, + "step": 5640 + }, + { + "epoch": 0.9134265621210896, + "grad_norm": 0.5323944091796875, + "learning_rate": 0.0002, + "loss": 0.7736, + "step": 5650 + }, + { + "epoch": 0.915043246301835, + "grad_norm": 0.5261843204498291, + "learning_rate": 0.0002, + "loss": 0.8393, + "step": 5660 + }, + { + "epoch": 0.9166599304825802, + "grad_norm": 0.5451326966285706, + "learning_rate": 0.0002, + "loss": 0.7355, + "step": 5670 + }, + { + "epoch": 0.9182766146633256, + "grad_norm": 0.5183324217796326, + "learning_rate": 0.0002, + "loss": 0.8012, + "step": 5680 + }, + { + "epoch": 0.9198932988440708, + "grad_norm": 0.47229018807411194, + "learning_rate": 0.0002, + "loss": 0.7659, + "step": 5690 + }, + { + "epoch": 0.9215099830248161, + "grad_norm": 0.49180513620376587, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 5700 + }, + { + "epoch": 0.9231266672055614, + "grad_norm": 0.5419785380363464, + "learning_rate": 0.0002, + "loss": 0.8735, + "step": 5710 + }, + { + "epoch": 0.9247433513863067, + "grad_norm": 0.5408698916435242, + "learning_rate": 0.0002, + "loss": 0.7378, + "step": 5720 + }, + { + "epoch": 0.9263600355670519, + "grad_norm": 0.5286232829093933, + "learning_rate": 0.0002, + "loss": 0.7701, + "step": 5730 + }, + { + "epoch": 0.9279767197477973, + "grad_norm": 0.7539758086204529, + "learning_rate": 0.0002, + "loss": 0.8242, + "step": 5740 + }, + { + "epoch": 0.9295934039285425, + "grad_norm": 0.5166944861412048, + "learning_rate": 0.0002, + "loss": 0.8118, + "step": 5750 + }, + { + "epoch": 0.9312100881092878, + "grad_norm": 0.6601425409317017, + "learning_rate": 0.0002, + "loss": 0.783, + "step": 5760 + }, + { + "epoch": 0.9328267722900332, + "grad_norm": 0.5029960870742798, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 5770 + }, + { + "epoch": 0.9344434564707784, + "grad_norm": 0.4926645755767822, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 5780 + }, + { + "epoch": 0.9360601406515238, + "grad_norm": 0.5739615559577942, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 5790 + }, + { + "epoch": 0.937676824832269, + "grad_norm": 0.5058279037475586, + "learning_rate": 0.0002, + "loss": 0.8037, + "step": 5800 + }, + { + "epoch": 0.9392935090130143, + "grad_norm": 0.5260962247848511, + "learning_rate": 0.0002, + "loss": 0.8537, + "step": 5810 + }, + { + "epoch": 0.9409101931937596, + "grad_norm": 0.5768588185310364, + "learning_rate": 0.0002, + "loss": 0.7486, + "step": 5820 + }, + { + "epoch": 0.9425268773745049, + "grad_norm": 0.5170126557350159, + "learning_rate": 0.0002, + "loss": 0.8215, + "step": 5830 + }, + { + "epoch": 0.9441435615552501, + "grad_norm": 0.5745864510536194, + "learning_rate": 0.0002, + "loss": 0.7422, + "step": 5840 + }, + { + "epoch": 0.9457602457359955, + "grad_norm": 0.5551357865333557, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 5850 + }, + { + "epoch": 0.9473769299167407, + "grad_norm": 0.5776078701019287, + "learning_rate": 0.0002, + "loss": 0.8529, + "step": 5860 + }, + { + "epoch": 0.9489936140974861, + "grad_norm": 0.5340062379837036, + "learning_rate": 0.0002, + "loss": 0.8527, + "step": 5870 + }, + { + "epoch": 0.9506102982782314, + "grad_norm": 0.6447290182113647, + "learning_rate": 0.0002, + "loss": 0.8217, + "step": 5880 + }, + { + "epoch": 0.9522269824589766, + "grad_norm": 0.5123815536499023, + "learning_rate": 0.0002, + "loss": 0.7945, + "step": 5890 + }, + { + "epoch": 0.953843666639722, + "grad_norm": 0.48547613620758057, + "learning_rate": 0.0002, + "loss": 0.8209, + "step": 5900 + }, + { + "epoch": 0.9554603508204672, + "grad_norm": 0.5791414976119995, + "learning_rate": 0.0002, + "loss": 0.7896, + "step": 5910 + }, + { + "epoch": 0.9570770350012126, + "grad_norm": 0.6195011734962463, + "learning_rate": 0.0002, + "loss": 0.8408, + "step": 5920 + }, + { + "epoch": 0.9586937191819578, + "grad_norm": 0.6323803067207336, + "learning_rate": 0.0002, + "loss": 0.7805, + "step": 5930 + }, + { + "epoch": 0.9603104033627031, + "grad_norm": 0.45552879571914673, + "learning_rate": 0.0002, + "loss": 0.8484, + "step": 5940 + }, + { + "epoch": 0.9619270875434484, + "grad_norm": 0.5796473622322083, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 5950 + }, + { + "epoch": 0.9635437717241937, + "grad_norm": 0.647261381149292, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 5960 + }, + { + "epoch": 0.9651604559049389, + "grad_norm": 0.5487682819366455, + "learning_rate": 0.0002, + "loss": 0.8086, + "step": 5970 + }, + { + "epoch": 0.9667771400856843, + "grad_norm": 0.5743663907051086, + "learning_rate": 0.0002, + "loss": 0.7973, + "step": 5980 + }, + { + "epoch": 0.9683938242664295, + "grad_norm": 0.5470591187477112, + "learning_rate": 0.0002, + "loss": 0.8153, + "step": 5990 + }, + { + "epoch": 0.9700105084471748, + "grad_norm": 0.5901660323143005, + "learning_rate": 0.0002, + "loss": 0.8119, + "step": 6000 + }, + { + "epoch": 0.9716271926279202, + "grad_norm": 0.6544759273529053, + "learning_rate": 0.0002, + "loss": 0.8147, + "step": 6010 + }, + { + "epoch": 0.9732438768086654, + "grad_norm": 0.6288470029830933, + "learning_rate": 0.0002, + "loss": 0.7536, + "step": 6020 + }, + { + "epoch": 0.9748605609894108, + "grad_norm": 0.673153817653656, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 6030 + }, + { + "epoch": 0.976477245170156, + "grad_norm": 0.42854753136634827, + "learning_rate": 0.0002, + "loss": 0.7556, + "step": 6040 + }, + { + "epoch": 0.9780939293509013, + "grad_norm": 0.5227066278457642, + "learning_rate": 0.0002, + "loss": 0.8006, + "step": 6050 + }, + { + "epoch": 0.9797106135316466, + "grad_norm": 0.5372416973114014, + "learning_rate": 0.0002, + "loss": 0.795, + "step": 6060 + }, + { + "epoch": 0.9813272977123919, + "grad_norm": 0.6026402115821838, + "learning_rate": 0.0002, + "loss": 0.7591, + "step": 6070 + }, + { + "epoch": 0.9829439818931371, + "grad_norm": 0.49547791481018066, + "learning_rate": 0.0002, + "loss": 0.8347, + "step": 6080 + }, + { + "epoch": 0.9845606660738825, + "grad_norm": 0.4641951322555542, + "learning_rate": 0.0002, + "loss": 0.7722, + "step": 6090 + }, + { + "epoch": 0.9861773502546277, + "grad_norm": 0.5818535089492798, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 6100 + }, + { + "epoch": 0.9877940344353731, + "grad_norm": 0.63955157995224, + "learning_rate": 0.0002, + "loss": 0.81, + "step": 6110 + }, + { + "epoch": 0.9894107186161183, + "grad_norm": 0.5649438500404358, + "learning_rate": 0.0002, + "loss": 0.7547, + "step": 6120 + }, + { + "epoch": 0.9910274027968636, + "grad_norm": 0.5290433168411255, + "learning_rate": 0.0002, + "loss": 0.7861, + "step": 6130 + }, + { + "epoch": 0.992644086977609, + "grad_norm": 0.6399374008178711, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 6140 + }, + { + "epoch": 0.9942607711583542, + "grad_norm": 0.6736576557159424, + "learning_rate": 0.0002, + "loss": 0.8373, + "step": 6150 + }, + { + "epoch": 0.9958774553390995, + "grad_norm": 0.515420138835907, + "learning_rate": 0.0002, + "loss": 0.7915, + "step": 6160 + }, + { + "epoch": 0.9974941395198448, + "grad_norm": 0.562677800655365, + "learning_rate": 0.0002, + "loss": 0.8032, + "step": 6170 + }, + { + "epoch": 0.9991108237005901, + "grad_norm": 0.7113858461380005, + "learning_rate": 0.0002, + "loss": 0.8187, + "step": 6180 + }, + { + "epoch": 0.9999191657909627, + "eval_loss": 1.0871200561523438, + "eval_runtime": 122.2071, + "eval_samples_per_second": 5.998, + "eval_steps_per_second": 0.753, + "step": 6185 + } + ], + "logging_steps": 10, + "max_steps": 49480, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.8625109010454938e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..155b12fa9acbc6e71dba75c92bfa79e152397ebf --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28694d5564a2b5c7d6881d4ba2af103356aa22489d2c22768ebbe47283c0f4a1 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/special_tokens_map.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/tokenizer.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..86a33946b0c77216d2cce91bb28c8fada4a5e80b --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c5cf44023714fb39b05e71e425f8d7b92805ff73f7988b083b8c87f0bf87393 +size 17209961 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/tokenizer_config.json b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..061e40d9db3253624f86e8e364c15ef546527c9d --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_248|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_249|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_250|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/training_args.bin b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..155b12fa9acbc6e71dba75c92bfa79e152397ebf --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28694d5564a2b5c7d6881d4ba2af103356aa22489d2c22768ebbe47283c0f4a1 +size 5560 diff --git a/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/training_log.jsonl b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bf2bb281c11c4ec2a30943e797dbce81ac45baed --- /dev/null +++ b/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 0.9999191657909627, "step": 6185, "epoch_duration": 16870.920749664307, "total_accumulated_duration": 16870.920749664307, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 9688.99365234375}, "avg_memory_reserved": {"GPU_0": 10406.0}, "peak_memory_reserved": {"GPU_0": 10406.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.6636, "grad_norm": 0.9894065856933594, "learning_rate": 0.0002, "epoch": 0.0016166841807452913, "step": 10}, {"loss": 1.1528, "grad_norm": 1.7810699939727783, "learning_rate": 0.0002, "epoch": 0.0032333683614905826, "step": 20}, {"loss": 0.9767, "grad_norm": 0.5969577431678772, "learning_rate": 0.0002, "epoch": 0.004850052542235874, "step": 30}, {"loss": 0.9772, "grad_norm": 0.6354120969772339, "learning_rate": 0.0002, "epoch": 0.006466736722981165, "step": 40}, {"loss": 0.8643, "grad_norm": 0.5604607462882996, "learning_rate": 0.0002, "epoch": 0.008083420903726457, "step": 50}, {"loss": 0.8841, "grad_norm": 0.4676193594932556, "learning_rate": 0.0002, "epoch": 0.009700105084471748, "step": 60}, {"loss": 0.9022, "grad_norm": 0.6099211573600769, "learning_rate": 0.0002, "epoch": 0.01131678926521704, "step": 70}, {"loss": 0.9133, "grad_norm": 0.48639994859695435, "learning_rate": 0.0002, "epoch": 0.01293347344596233, "step": 80}, {"loss": 0.8704, "grad_norm": 0.4904264509677887, "learning_rate": 0.0002, "epoch": 0.014550157626707623, "step": 90}, {"loss": 0.8855, "grad_norm": 2.8334362506866455, "learning_rate": 0.0002, "epoch": 0.016166841807452915, "step": 100}, {"loss": 0.8958, "grad_norm": 0.43221670389175415, "learning_rate": 0.0002, "epoch": 0.017783525988198205, "step": 110}, {"loss": 0.8412, "grad_norm": 0.42244166135787964, "learning_rate": 0.0002, "epoch": 0.019400210168943496, "step": 120}, {"loss": 0.8467, "grad_norm": 0.45363298058509827, "learning_rate": 0.0002, "epoch": 0.02101689434968879, "step": 130}, {"loss": 0.8641, "grad_norm": 0.44816508889198303, "learning_rate": 0.0002, "epoch": 0.02263357853043408, "step": 140}, {"loss": 0.8496, "grad_norm": 0.43308213353157043, "learning_rate": 0.0002, "epoch": 0.02425026271117937, "step": 150}, {"loss": 0.8213, "grad_norm": 0.4084763526916504, "learning_rate": 0.0002, "epoch": 0.02586694689192466, "step": 160}, {"loss": 0.8343, "grad_norm": 0.5363703966140747, "learning_rate": 0.0002, "epoch": 0.027483631072669955, "step": 170}, {"loss": 0.8558, "grad_norm": 0.4619699716567993, "learning_rate": 0.0002, "epoch": 0.029100315253415245, "step": 180}, {"loss": 0.8878, "grad_norm": 0.49069908261299133, "learning_rate": 0.0002, "epoch": 0.030716999434160536, "step": 190}, {"loss": 0.8867, "grad_norm": 0.4645835757255554, "learning_rate": 0.0002, "epoch": 0.03233368361490583, "step": 200}, {"loss": 0.8842, "grad_norm": 1.2411243915557861, "learning_rate": 0.0002, "epoch": 0.03395036779565112, "step": 210}, {"loss": 0.8245, "grad_norm": 0.5211851596832275, "learning_rate": 0.0002, "epoch": 0.03556705197639641, "step": 220}, {"loss": 0.8194, "grad_norm": 0.5253691673278809, "learning_rate": 0.0002, "epoch": 0.037183736157141704, "step": 230}, {"loss": 0.8856, "grad_norm": 0.4567478895187378, "learning_rate": 0.0002, "epoch": 0.03880042033788699, "step": 240}, {"loss": 0.838, "grad_norm": 0.5472128391265869, "learning_rate": 0.0002, "epoch": 0.040417104518632285, "step": 250}, {"loss": 0.8201, "grad_norm": 0.42978546023368835, "learning_rate": 0.0002, "epoch": 0.04203378869937758, "step": 260}, {"loss": 0.8334, "grad_norm": 0.601734459400177, "learning_rate": 0.0002, "epoch": 0.043650472880122866, "step": 270}, {"loss": 0.815, "grad_norm": 0.4286513328552246, "learning_rate": 0.0002, "epoch": 0.04526715706086816, "step": 280}, {"loss": 0.8758, "grad_norm": 0.5230861902236938, "learning_rate": 0.0002, "epoch": 0.046883841241613454, "step": 290}, {"loss": 0.8636, "grad_norm": 0.6504611968994141, "learning_rate": 0.0002, "epoch": 0.04850052542235874, "step": 300}, {"loss": 0.8102, "grad_norm": 0.43485215306282043, "learning_rate": 0.0002, "epoch": 0.050117209603104035, "step": 310}, {"loss": 0.8221, "grad_norm": 0.4717007875442505, "learning_rate": 0.0002, "epoch": 0.05173389378384932, "step": 320}, {"loss": 0.8469, "grad_norm": 0.4059787690639496, "learning_rate": 0.0002, "epoch": 0.053350577964594616, "step": 330}, {"loss": 0.8866, "grad_norm": 0.4366913437843323, "learning_rate": 0.0002, "epoch": 0.05496726214533991, "step": 340}, {"loss": 0.7976, "grad_norm": 0.4233848452568054, "learning_rate": 0.0002, "epoch": 0.0565839463260852, "step": 350}, {"loss": 0.8456, "grad_norm": 0.4209108352661133, "learning_rate": 0.0002, "epoch": 0.05820063050683049, "step": 360}, {"loss": 0.816, "grad_norm": 0.41637396812438965, "learning_rate": 0.0002, "epoch": 0.059817314687575784, "step": 370}, {"loss": 0.7976, "grad_norm": 0.46235376596450806, "learning_rate": 0.0002, "epoch": 0.06143399886832107, "step": 380}, {"loss": 0.7966, "grad_norm": 0.4013484716415405, "learning_rate": 0.0002, "epoch": 0.06305068304906636, "step": 390}, {"loss": 0.8253, "grad_norm": 0.47443896532058716, "learning_rate": 0.0002, "epoch": 0.06466736722981166, "step": 400}, {"loss": 0.8666, "grad_norm": 0.3942156434059143, "learning_rate": 0.0002, "epoch": 0.06628405141055695, "step": 410}, {"loss": 0.8402, "grad_norm": 0.4965320825576782, "learning_rate": 0.0002, "epoch": 0.06790073559130223, "step": 420}, {"loss": 0.8317, "grad_norm": 0.4304835796356201, "learning_rate": 0.0002, "epoch": 0.06951741977204753, "step": 430}, {"loss": 0.8528, "grad_norm": 0.511726975440979, "learning_rate": 0.0002, "epoch": 0.07113410395279282, "step": 440}, {"loss": 0.8675, "grad_norm": 0.4040689170360565, "learning_rate": 0.0002, "epoch": 0.07275078813353811, "step": 450}, {"loss": 0.8788, "grad_norm": 0.5402171015739441, "learning_rate": 0.0002, "epoch": 0.07436747231428341, "step": 460}, {"loss": 0.8737, "grad_norm": 0.4174517095088959, "learning_rate": 0.0002, "epoch": 0.0759841564950287, "step": 470}, {"loss": 0.7605, "grad_norm": 0.4306182265281677, "learning_rate": 0.0002, "epoch": 0.07760084067577398, "step": 480}, {"loss": 0.799, "grad_norm": 0.535210132598877, "learning_rate": 0.0002, "epoch": 0.07921752485651928, "step": 490}, {"loss": 0.7825, "grad_norm": 0.5339109897613525, "learning_rate": 0.0002, "epoch": 0.08083420903726457, "step": 500}, {"loss": 0.8985, "grad_norm": 0.45754891633987427, "learning_rate": 0.0002, "epoch": 0.08245089321800986, "step": 510}, {"loss": 0.8144, "grad_norm": 0.43820783495903015, "learning_rate": 0.0002, "epoch": 0.08406757739875516, "step": 520}, {"loss": 0.8001, "grad_norm": 0.4434749186038971, "learning_rate": 0.0002, "epoch": 0.08568426157950045, "step": 530}, {"loss": 0.7857, "grad_norm": 0.43111467361450195, "learning_rate": 0.0002, "epoch": 0.08730094576024573, "step": 540}, {"loss": 0.8418, "grad_norm": 0.4378940165042877, "learning_rate": 0.0002, "epoch": 0.08891762994099103, "step": 550}, {"loss": 0.8361, "grad_norm": 0.4772215187549591, "learning_rate": 0.0002, "epoch": 0.09053431412173632, "step": 560}, {"loss": 0.8268, "grad_norm": 0.6837629079818726, "learning_rate": 0.0002, "epoch": 0.09215099830248161, "step": 570}, {"loss": 0.8607, "grad_norm": 0.42241212725639343, "learning_rate": 0.0002, "epoch": 0.09376768248322691, "step": 580}, {"loss": 0.852, "grad_norm": 0.5165936350822449, "learning_rate": 0.0002, "epoch": 0.0953843666639722, "step": 590}, {"loss": 0.8664, "grad_norm": 0.48737478256225586, "learning_rate": 0.0002, "epoch": 0.09700105084471748, "step": 600}, {"loss": 0.8806, "grad_norm": 0.47419852018356323, "learning_rate": 0.0002, "epoch": 0.09861773502546278, "step": 610}, {"loss": 0.8254, "grad_norm": 0.4975486099720001, "learning_rate": 0.0002, "epoch": 0.10023441920620807, "step": 620}, {"loss": 0.8548, "grad_norm": 0.49123844504356384, "learning_rate": 0.0002, "epoch": 0.10185110338695336, "step": 630}, {"loss": 0.8911, "grad_norm": 0.6288952827453613, "learning_rate": 0.0002, "epoch": 0.10346778756769864, "step": 640}, {"loss": 0.827, "grad_norm": 0.4277345836162567, "learning_rate": 0.0002, "epoch": 0.10508447174844394, "step": 650}, {"loss": 0.7996, "grad_norm": 0.4021061956882477, "learning_rate": 0.0002, "epoch": 0.10670115592918923, "step": 660}, {"loss": 0.87, "grad_norm": 0.3492237329483032, "learning_rate": 0.0002, "epoch": 0.10831784010993452, "step": 670}, {"loss": 0.8698, "grad_norm": 0.4341012239456177, "learning_rate": 0.0002, "epoch": 0.10993452429067982, "step": 680}, {"loss": 0.781, "grad_norm": 0.7296304106712341, "learning_rate": 0.0002, "epoch": 0.1115512084714251, "step": 690}, {"loss": 0.8433, "grad_norm": 0.397494912147522, "learning_rate": 0.0002, "epoch": 0.1131678926521704, "step": 700}, {"loss": 0.827, "grad_norm": 0.396431028842926, "learning_rate": 0.0002, "epoch": 0.1147845768329157, "step": 710}, {"loss": 0.8379, "grad_norm": 0.48842838406562805, "learning_rate": 0.0002, "epoch": 0.11640126101366098, "step": 720}, {"loss": 0.8238, "grad_norm": 0.46322616934776306, "learning_rate": 0.0002, "epoch": 0.11801794519440627, "step": 730}, {"loss": 0.8041, "grad_norm": 0.47990912199020386, "learning_rate": 0.0002, "epoch": 0.11963462937515157, "step": 740}, {"loss": 0.82, "grad_norm": 0.4997142255306244, "learning_rate": 0.0002, "epoch": 0.12125131355589686, "step": 750}, {"loss": 0.7702, "grad_norm": 0.4040526747703552, "learning_rate": 0.0002, "epoch": 0.12286799773664214, "step": 760}, {"loss": 0.863, "grad_norm": 0.453095942735672, "learning_rate": 0.0002, "epoch": 0.12448468191738744, "step": 770}, {"loss": 0.8792, "grad_norm": 0.4636971950531006, "learning_rate": 0.0002, "epoch": 0.12610136609813272, "step": 780}, {"loss": 0.8112, "grad_norm": 0.4279276132583618, "learning_rate": 0.0002, "epoch": 0.12771805027887803, "step": 790}, {"loss": 0.8711, "grad_norm": 0.46212655305862427, "learning_rate": 0.0002, "epoch": 0.12933473445962332, "step": 800}, {"loss": 0.8368, "grad_norm": 0.43127650022506714, "learning_rate": 0.0002, "epoch": 0.1309514186403686, "step": 810}, {"loss": 0.8476, "grad_norm": 0.4201301336288452, "learning_rate": 0.0002, "epoch": 0.1325681028211139, "step": 820}, {"loss": 0.8078, "grad_norm": 0.42583167552948, "learning_rate": 0.0002, "epoch": 0.13418478700185918, "step": 830}, {"loss": 0.8219, "grad_norm": 0.4535622000694275, "learning_rate": 0.0002, "epoch": 0.13580147118260447, "step": 840}, {"loss": 0.8423, "grad_norm": 0.4116036891937256, "learning_rate": 0.0002, "epoch": 0.13741815536334978, "step": 850}, {"loss": 0.8466, "grad_norm": 0.45997580885887146, "learning_rate": 0.0002, "epoch": 0.13903483954409507, "step": 860}, {"loss": 0.8917, "grad_norm": 0.4487837255001068, "learning_rate": 0.0002, "epoch": 0.14065152372484035, "step": 870}, {"loss": 0.8217, "grad_norm": 0.43650057911872864, "learning_rate": 0.0002, "epoch": 0.14226820790558564, "step": 880}, {"loss": 0.8178, "grad_norm": 0.5335358381271362, "learning_rate": 0.0002, "epoch": 0.14388489208633093, "step": 890}, {"loss": 0.7957, "grad_norm": 0.5989000201225281, "learning_rate": 0.0002, "epoch": 0.14550157626707622, "step": 900}, {"loss": 0.8385, "grad_norm": 0.517179012298584, "learning_rate": 0.0002, "epoch": 0.14711826044782153, "step": 910}, {"loss": 0.8255, "grad_norm": 0.44435232877731323, "learning_rate": 0.0002, "epoch": 0.14873494462856682, "step": 920}, {"loss": 0.8305, "grad_norm": 0.42635923624038696, "learning_rate": 0.0002, "epoch": 0.1503516288093121, "step": 930}, {"loss": 0.8043, "grad_norm": 0.49603334069252014, "learning_rate": 0.0002, "epoch": 0.1519683129900574, "step": 940}, {"loss": 0.8377, "grad_norm": 0.40639808773994446, "learning_rate": 0.0002, "epoch": 0.15358499717080268, "step": 950}, {"loss": 0.8529, "grad_norm": 0.4850759208202362, "learning_rate": 0.0002, "epoch": 0.15520168135154797, "step": 960}, {"loss": 0.846, "grad_norm": 0.4427442252635956, "learning_rate": 0.0002, "epoch": 0.15681836553229328, "step": 970}, {"loss": 0.8705, "grad_norm": 0.3760930001735687, "learning_rate": 0.0002, "epoch": 0.15843504971303857, "step": 980}, {"loss": 0.8644, "grad_norm": 0.4794144332408905, "learning_rate": 0.0002, "epoch": 0.16005173389378385, "step": 990}, {"loss": 0.8002, "grad_norm": 0.45828768610954285, "learning_rate": 0.0002, "epoch": 0.16166841807452914, "step": 1000}, {"loss": 0.7658, "grad_norm": 0.6313053369522095, "learning_rate": 0.0002, "epoch": 0.16328510225527443, "step": 1010}, {"loss": 0.8047, "grad_norm": 0.45041006803512573, "learning_rate": 0.0002, "epoch": 0.16490178643601971, "step": 1020}, {"loss": 0.8423, "grad_norm": 0.441403865814209, "learning_rate": 0.0002, "epoch": 0.166518470616765, "step": 1030}, {"loss": 0.8475, "grad_norm": 0.8171296119689941, "learning_rate": 0.0002, "epoch": 0.16813515479751032, "step": 1040}, {"loss": 0.845, "grad_norm": 0.7137420773506165, "learning_rate": 0.0002, "epoch": 0.1697518389782556, "step": 1050}, {"loss": 0.8213, "grad_norm": 0.5236809849739075, "learning_rate": 0.0002, "epoch": 0.1713685231590009, "step": 1060}, {"loss": 0.8265, "grad_norm": 0.5021864175796509, "learning_rate": 0.0002, "epoch": 0.17298520733974618, "step": 1070}, {"loss": 0.8305, "grad_norm": 0.47347521781921387, "learning_rate": 0.0002, "epoch": 0.17460189152049146, "step": 1080}, {"loss": 0.8105, "grad_norm": 0.4631653428077698, "learning_rate": 0.0002, "epoch": 0.17621857570123675, "step": 1090}, {"loss": 0.8166, "grad_norm": 0.49169182777404785, "learning_rate": 0.0002, "epoch": 0.17783525988198207, "step": 1100}, {"loss": 0.8012, "grad_norm": 0.5019739270210266, "learning_rate": 0.0002, "epoch": 0.17945194406272735, "step": 1110}, {"loss": 0.8247, "grad_norm": 0.5100422501564026, "learning_rate": 0.0002, "epoch": 0.18106862824347264, "step": 1120}, {"loss": 0.8142, "grad_norm": 0.3888324499130249, "learning_rate": 0.0002, "epoch": 0.18268531242421793, "step": 1130}, {"loss": 0.8533, "grad_norm": 0.39765217900276184, "learning_rate": 0.0002, "epoch": 0.18430199660496321, "step": 1140}, {"loss": 0.8541, "grad_norm": 0.47190186381340027, "learning_rate": 0.0002, "epoch": 0.1859186807857085, "step": 1150}, {"loss": 0.8301, "grad_norm": 0.4464188814163208, "learning_rate": 0.0002, "epoch": 0.18753536496645382, "step": 1160}, {"loss": 0.8341, "grad_norm": 0.5153930187225342, "learning_rate": 0.0002, "epoch": 0.1891520491471991, "step": 1170}, {"loss": 0.8033, "grad_norm": 0.4779708683490753, "learning_rate": 0.0002, "epoch": 0.1907687333279444, "step": 1180}, {"loss": 0.8187, "grad_norm": 0.4834315776824951, "learning_rate": 0.0002, "epoch": 0.19238541750868968, "step": 1190}, {"loss": 0.7721, "grad_norm": 0.402357816696167, "learning_rate": 0.0002, "epoch": 0.19400210168943496, "step": 1200}, {"loss": 0.7941, "grad_norm": 0.45899084210395813, "learning_rate": 0.0002, "epoch": 0.19561878587018025, "step": 1210}, {"loss": 0.8353, "grad_norm": 0.5106529593467712, "learning_rate": 0.0002, "epoch": 0.19723547005092557, "step": 1220}, {"loss": 0.7816, "grad_norm": 0.45261722803115845, "learning_rate": 0.0002, "epoch": 0.19885215423167085, "step": 1230}, {"loss": 0.8068, "grad_norm": 0.4647127091884613, "learning_rate": 0.0002, "epoch": 0.20046883841241614, "step": 1240}, {"loss": 0.8239, "grad_norm": 0.4849368929862976, "learning_rate": 0.0002, "epoch": 0.20208552259316143, "step": 1250}, {"loss": 0.8514, "grad_norm": 0.4518061578273773, "learning_rate": 0.0002, "epoch": 0.2037022067739067, "step": 1260}, {"loss": 0.8158, "grad_norm": 0.49535325169563293, "learning_rate": 0.0002, "epoch": 0.205318890954652, "step": 1270}, {"loss": 0.8348, "grad_norm": 0.4835205376148224, "learning_rate": 0.0002, "epoch": 0.2069355751353973, "step": 1280}, {"loss": 0.8428, "grad_norm": 0.45308539271354675, "learning_rate": 0.0002, "epoch": 0.2085522593161426, "step": 1290}, {"loss": 0.7993, "grad_norm": 0.5369905233383179, "learning_rate": 0.0002, "epoch": 0.2101689434968879, "step": 1300}, {"loss": 0.8676, "grad_norm": 0.5031622052192688, "learning_rate": 0.0002, "epoch": 0.21178562767763318, "step": 1310}, {"loss": 0.7686, "grad_norm": 0.48010334372520447, "learning_rate": 0.0002, "epoch": 0.21340231185837846, "step": 1320}, {"loss": 0.806, "grad_norm": 0.4905701279640198, "learning_rate": 0.0002, "epoch": 0.21501899603912375, "step": 1330}, {"loss": 0.7885, "grad_norm": 0.43531742691993713, "learning_rate": 0.0002, "epoch": 0.21663568021986904, "step": 1340}, {"loss": 0.8191, "grad_norm": 0.44330692291259766, "learning_rate": 0.0002, "epoch": 0.21825236440061435, "step": 1350}, {"loss": 0.8205, "grad_norm": 0.5384416580200195, "learning_rate": 0.0002, "epoch": 0.21986904858135964, "step": 1360}, {"loss": 0.7726, "grad_norm": 0.4181833863258362, "learning_rate": 0.0002, "epoch": 0.22148573276210493, "step": 1370}, {"loss": 0.8311, "grad_norm": 0.523833692073822, "learning_rate": 0.0002, "epoch": 0.2231024169428502, "step": 1380}, {"loss": 0.7913, "grad_norm": 0.5528736710548401, "learning_rate": 0.0002, "epoch": 0.2247191011235955, "step": 1390}, {"loss": 0.8079, "grad_norm": 0.43515023589134216, "learning_rate": 0.0002, "epoch": 0.2263357853043408, "step": 1400}, {"loss": 0.8403, "grad_norm": 0.48809877038002014, "learning_rate": 0.0002, "epoch": 0.2279524694850861, "step": 1410}, {"loss": 0.8165, "grad_norm": 0.43591251969337463, "learning_rate": 0.0002, "epoch": 0.2295691536658314, "step": 1420}, {"loss": 0.8147, "grad_norm": 0.44625312089920044, "learning_rate": 0.0002, "epoch": 0.23118583784657668, "step": 1430}, {"loss": 0.8134, "grad_norm": 0.4390665292739868, "learning_rate": 0.0002, "epoch": 0.23280252202732196, "step": 1440}, {"loss": 0.8465, "grad_norm": 0.48496049642562866, "learning_rate": 0.0002, "epoch": 0.23441920620806725, "step": 1450}, {"loss": 0.775, "grad_norm": 0.45919957756996155, "learning_rate": 0.0002, "epoch": 0.23603589038881254, "step": 1460}, {"loss": 0.8659, "grad_norm": 0.5471845865249634, "learning_rate": 0.0002, "epoch": 0.23765257456955785, "step": 1470}, {"loss": 0.8164, "grad_norm": 0.47269317507743835, "learning_rate": 0.0002, "epoch": 0.23926925875030314, "step": 1480}, {"loss": 0.854, "grad_norm": 0.4930245578289032, "learning_rate": 0.0002, "epoch": 0.24088594293104842, "step": 1490}, {"loss": 0.8139, "grad_norm": 0.5605630278587341, "learning_rate": 0.0002, "epoch": 0.2425026271117937, "step": 1500}, {"loss": 0.8125, "grad_norm": 0.4435870945453644, "learning_rate": 0.0002, "epoch": 0.244119311292539, "step": 1510}, {"loss": 0.8123, "grad_norm": 0.4941999912261963, "learning_rate": 0.0002, "epoch": 0.24573599547328429, "step": 1520}, {"loss": 0.8427, "grad_norm": 0.5100624561309814, "learning_rate": 0.0002, "epoch": 0.24735267965402957, "step": 1530}, {"loss": 0.8405, "grad_norm": 0.4638267457485199, "learning_rate": 0.0002, "epoch": 0.2489693638347749, "step": 1540}, {"loss": 0.81, "grad_norm": 0.5071570873260498, "learning_rate": 0.0002, "epoch": 0.25058604801552015, "step": 1550}, {"loss": 0.7724, "grad_norm": 0.4291319251060486, "learning_rate": 0.0002, "epoch": 0.25220273219626543, "step": 1560}, {"loss": 0.7984, "grad_norm": 0.5388049483299255, "learning_rate": 0.0002, "epoch": 0.2538194163770108, "step": 1570}, {"loss": 0.8176, "grad_norm": 0.5083683729171753, "learning_rate": 0.0002, "epoch": 0.25543610055775606, "step": 1580}, {"loss": 0.843, "grad_norm": 0.4824463725090027, "learning_rate": 0.0002, "epoch": 0.25705278473850135, "step": 1590}, {"loss": 0.7996, "grad_norm": 0.41177722811698914, "learning_rate": 0.0002, "epoch": 0.25866946891924664, "step": 1600}, {"loss": 0.7772, "grad_norm": 0.5656219124794006, "learning_rate": 0.0002, "epoch": 0.2602861530999919, "step": 1610}, {"loss": 0.7955, "grad_norm": 0.41063204407691956, "learning_rate": 0.0002, "epoch": 0.2619028372807372, "step": 1620}, {"loss": 0.7998, "grad_norm": 0.4897061288356781, "learning_rate": 0.0002, "epoch": 0.2635195214614825, "step": 1630}, {"loss": 0.8198, "grad_norm": 0.4454376697540283, "learning_rate": 0.0002, "epoch": 0.2651362056422278, "step": 1640}, {"loss": 0.8684, "grad_norm": 0.4355238378047943, "learning_rate": 0.0002, "epoch": 0.26675288982297307, "step": 1650}, {"loss": 0.7801, "grad_norm": 0.458310067653656, "learning_rate": 0.0002, "epoch": 0.26836957400371836, "step": 1660}, {"loss": 0.7935, "grad_norm": 0.4752083718776703, "learning_rate": 0.0002, "epoch": 0.26998625818446365, "step": 1670}, {"loss": 0.8267, "grad_norm": 0.4666106402873993, "learning_rate": 0.0002, "epoch": 0.27160294236520893, "step": 1680}, {"loss": 0.8252, "grad_norm": 0.4213818609714508, "learning_rate": 0.0002, "epoch": 0.2732196265459543, "step": 1690}, {"loss": 0.8559, "grad_norm": 0.5768913626670837, "learning_rate": 0.0002, "epoch": 0.27483631072669956, "step": 1700}, {"loss": 0.7931, "grad_norm": 0.4209914803504944, "learning_rate": 0.0002, "epoch": 0.27645299490744485, "step": 1710}, {"loss": 0.8167, "grad_norm": 0.501909613609314, "learning_rate": 0.0002, "epoch": 0.27806967908819014, "step": 1720}, {"loss": 0.7832, "grad_norm": 0.5266261100769043, "learning_rate": 0.0002, "epoch": 0.2796863632689354, "step": 1730}, {"loss": 0.8102, "grad_norm": 0.43806859850883484, "learning_rate": 0.0002, "epoch": 0.2813030474496807, "step": 1740}, {"loss": 0.8157, "grad_norm": 0.46048814058303833, "learning_rate": 0.0002, "epoch": 0.282919731630426, "step": 1750}, {"loss": 0.8596, "grad_norm": 0.44972819089889526, "learning_rate": 0.0002, "epoch": 0.2845364158111713, "step": 1760}, {"loss": 0.8421, "grad_norm": 0.5114831328392029, "learning_rate": 0.0002, "epoch": 0.28615309999191657, "step": 1770}, {"loss": 0.8361, "grad_norm": 0.47931742668151855, "learning_rate": 0.0002, "epoch": 0.28776978417266186, "step": 1780}, {"loss": 0.8265, "grad_norm": 0.5092599987983704, "learning_rate": 0.0002, "epoch": 0.28938646835340714, "step": 1790}, {"loss": 0.8506, "grad_norm": 0.37581443786621094, "learning_rate": 0.0002, "epoch": 0.29100315253415243, "step": 1800}, {"loss": 0.7932, "grad_norm": 0.47097381949424744, "learning_rate": 0.0002, "epoch": 0.2926198367148977, "step": 1810}, {"loss": 0.7787, "grad_norm": 0.48300236463546753, "learning_rate": 0.0002, "epoch": 0.29423652089564306, "step": 1820}, {"loss": 0.8391, "grad_norm": 0.5600419640541077, "learning_rate": 0.0002, "epoch": 0.29585320507638835, "step": 1830}, {"loss": 0.8507, "grad_norm": 0.48555272817611694, "learning_rate": 0.0002, "epoch": 0.29746988925713364, "step": 1840}, {"loss": 0.7657, "grad_norm": 0.3752668499946594, "learning_rate": 0.0002, "epoch": 0.2990865734378789, "step": 1850}, {"loss": 0.7915, "grad_norm": 0.5328747034072876, "learning_rate": 0.0002, "epoch": 0.3007032576186242, "step": 1860}, {"loss": 0.8426, "grad_norm": 0.48716455698013306, "learning_rate": 0.0002, "epoch": 0.3023199417993695, "step": 1870}, {"loss": 0.8335, "grad_norm": 0.5011493563652039, "learning_rate": 0.0002, "epoch": 0.3039366259801148, "step": 1880}, {"loss": 0.852, "grad_norm": 0.46461427211761475, "learning_rate": 0.0002, "epoch": 0.30555331016086007, "step": 1890}, {"loss": 0.8478, "grad_norm": 0.36630210280418396, "learning_rate": 0.0002, "epoch": 0.30716999434160536, "step": 1900}, {"loss": 0.8162, "grad_norm": 0.4217296242713928, "learning_rate": 0.0002, "epoch": 0.30878667852235064, "step": 1910}, {"loss": 0.8128, "grad_norm": 0.4394875466823578, "learning_rate": 0.0002, "epoch": 0.31040336270309593, "step": 1920}, {"loss": 0.8471, "grad_norm": 0.6587965488433838, "learning_rate": 0.0002, "epoch": 0.3120200468838412, "step": 1930}, {"loss": 0.8565, "grad_norm": 0.5469298958778381, "learning_rate": 0.0002, "epoch": 0.31363673106458656, "step": 1940}, {"loss": 0.8236, "grad_norm": 0.4371595084667206, "learning_rate": 0.0002, "epoch": 0.31525341524533185, "step": 1950}, {"loss": 0.887, "grad_norm": 0.4809541404247284, "learning_rate": 0.0002, "epoch": 0.31687009942607713, "step": 1960}, {"loss": 0.7855, "grad_norm": 0.6061086654663086, "learning_rate": 0.0002, "epoch": 0.3184867836068224, "step": 1970}, {"loss": 0.7679, "grad_norm": 0.5342657566070557, "learning_rate": 0.0002, "epoch": 0.3201034677875677, "step": 1980}, {"loss": 0.7955, "grad_norm": 0.5057743787765503, "learning_rate": 0.0002, "epoch": 0.321720151968313, "step": 1990}, {"loss": 0.7774, "grad_norm": 0.528626024723053, "learning_rate": 0.0002, "epoch": 0.3233368361490583, "step": 2000}, {"loss": 0.8845, "grad_norm": 0.46742770075798035, "learning_rate": 0.0002, "epoch": 0.32495352032980357, "step": 2010}, {"loss": 0.8484, "grad_norm": 0.515101432800293, "learning_rate": 0.0002, "epoch": 0.32657020451054886, "step": 2020}, {"loss": 0.8139, "grad_norm": 0.41941216588020325, "learning_rate": 0.0002, "epoch": 0.32818688869129414, "step": 2030}, {"loss": 0.7637, "grad_norm": 0.49902522563934326, "learning_rate": 0.0002, "epoch": 0.32980357287203943, "step": 2040}, {"loss": 0.7822, "grad_norm": 0.4120897650718689, "learning_rate": 0.0002, "epoch": 0.3314202570527847, "step": 2050}, {"loss": 0.8057, "grad_norm": 0.45352041721343994, "learning_rate": 0.0002, "epoch": 0.33303694123353, "step": 2060}, {"loss": 0.7913, "grad_norm": 0.523199737071991, "learning_rate": 0.0002, "epoch": 0.33465362541427535, "step": 2070}, {"loss": 0.8036, "grad_norm": 0.4390358626842499, "learning_rate": 0.0002, "epoch": 0.33627030959502063, "step": 2080}, {"loss": 0.8145, "grad_norm": 0.6752901077270508, "learning_rate": 0.0002, "epoch": 0.3378869937757659, "step": 2090}, {"loss": 0.7807, "grad_norm": 0.547821044921875, "learning_rate": 0.0002, "epoch": 0.3395036779565112, "step": 2100}, {"loss": 0.8561, "grad_norm": 0.5161308646202087, "learning_rate": 0.0002, "epoch": 0.3411203621372565, "step": 2110}, {"loss": 0.7697, "grad_norm": 0.4565401077270508, "learning_rate": 0.0002, "epoch": 0.3427370463180018, "step": 2120}, {"loss": 0.7964, "grad_norm": 0.4666115939617157, "learning_rate": 0.0002, "epoch": 0.34435373049874707, "step": 2130}, {"loss": 0.8189, "grad_norm": 0.4090428352355957, "learning_rate": 0.0002, "epoch": 0.34597041467949236, "step": 2140}, {"loss": 0.8817, "grad_norm": 0.510845422744751, "learning_rate": 0.0002, "epoch": 0.34758709886023764, "step": 2150}, {"loss": 0.8398, "grad_norm": 0.42861923575401306, "learning_rate": 0.0002, "epoch": 0.34920378304098293, "step": 2160}, {"loss": 0.7716, "grad_norm": 0.4476332664489746, "learning_rate": 0.0002, "epoch": 0.3508204672217282, "step": 2170}, {"loss": 0.7845, "grad_norm": 0.6065791249275208, "learning_rate": 0.0002, "epoch": 0.3524371514024735, "step": 2180}, {"loss": 0.8187, "grad_norm": 0.42335066199302673, "learning_rate": 0.0002, "epoch": 0.35405383558321885, "step": 2190}, {"loss": 0.8239, "grad_norm": 0.5094629526138306, "learning_rate": 0.0002, "epoch": 0.35567051976396413, "step": 2200}, {"loss": 0.7807, "grad_norm": 0.5476373434066772, "learning_rate": 0.0002, "epoch": 0.3572872039447094, "step": 2210}, {"loss": 0.814, "grad_norm": 0.3911719024181366, "learning_rate": 0.0002, "epoch": 0.3589038881254547, "step": 2220}, {"loss": 0.8599, "grad_norm": 0.6599636077880859, "learning_rate": 0.0002, "epoch": 0.3605205723062, "step": 2230}, {"loss": 0.7482, "grad_norm": 0.40381914377212524, "learning_rate": 0.0002, "epoch": 0.3621372564869453, "step": 2240}, {"loss": 0.7772, "grad_norm": 0.4433908462524414, "learning_rate": 0.0002, "epoch": 0.36375394066769057, "step": 2250}, {"loss": 0.8503, "grad_norm": 0.578326940536499, "learning_rate": 0.0002, "epoch": 0.36537062484843585, "step": 2260}, {"loss": 0.8178, "grad_norm": 0.5734784007072449, "learning_rate": 0.0002, "epoch": 0.36698730902918114, "step": 2270}, {"loss": 0.8193, "grad_norm": 0.45555487275123596, "learning_rate": 0.0002, "epoch": 0.36860399320992643, "step": 2280}, {"loss": 0.7929, "grad_norm": 0.5666276216506958, "learning_rate": 0.0002, "epoch": 0.3702206773906717, "step": 2290}, {"loss": 0.8292, "grad_norm": 0.5461117625236511, "learning_rate": 0.0002, "epoch": 0.371837361571417, "step": 2300}, {"loss": 0.8204, "grad_norm": 0.6318911910057068, "learning_rate": 0.0002, "epoch": 0.3734540457521623, "step": 2310}, {"loss": 0.7964, "grad_norm": 0.493263304233551, "learning_rate": 0.0002, "epoch": 0.37507072993290763, "step": 2320}, {"loss": 0.8339, "grad_norm": 0.5888760089874268, "learning_rate": 0.0002, "epoch": 0.3766874141136529, "step": 2330}, {"loss": 0.7737, "grad_norm": 0.48671841621398926, "learning_rate": 0.0002, "epoch": 0.3783040982943982, "step": 2340}, {"loss": 0.8367, "grad_norm": 0.4385145306587219, "learning_rate": 0.0002, "epoch": 0.3799207824751435, "step": 2350}, {"loss": 0.812, "grad_norm": 0.5523318648338318, "learning_rate": 0.0002, "epoch": 0.3815374666558888, "step": 2360}, {"loss": 0.8351, "grad_norm": 0.7308220267295837, "learning_rate": 0.0002, "epoch": 0.38315415083663407, "step": 2370}, {"loss": 0.859, "grad_norm": 0.554214358329773, "learning_rate": 0.0002, "epoch": 0.38477083501737935, "step": 2380}, {"loss": 0.8146, "grad_norm": 0.5425800085067749, "learning_rate": 0.0002, "epoch": 0.38638751919812464, "step": 2390}, {"loss": 0.8282, "grad_norm": 0.48811158537864685, "learning_rate": 0.0002, "epoch": 0.3880042033788699, "step": 2400}, {"loss": 0.8074, "grad_norm": 0.49212366342544556, "learning_rate": 0.0002, "epoch": 0.3896208875596152, "step": 2410}, {"loss": 0.7991, "grad_norm": 0.5222218632698059, "learning_rate": 0.0002, "epoch": 0.3912375717403605, "step": 2420}, {"loss": 0.8182, "grad_norm": 0.4699819087982178, "learning_rate": 0.0002, "epoch": 0.3928542559211058, "step": 2430}, {"loss": 0.7919, "grad_norm": 0.46153587102890015, "learning_rate": 0.0002, "epoch": 0.39447094010185113, "step": 2440}, {"loss": 0.8111, "grad_norm": 0.4150611162185669, "learning_rate": 0.0002, "epoch": 0.3960876242825964, "step": 2450}, {"loss": 0.8589, "grad_norm": 0.5799614787101746, "learning_rate": 0.0002, "epoch": 0.3977043084633417, "step": 2460}, {"loss": 0.8085, "grad_norm": 0.56536865234375, "learning_rate": 0.0002, "epoch": 0.399320992644087, "step": 2470}, {"loss": 0.8022, "grad_norm": 0.5451247096061707, "learning_rate": 0.0002, "epoch": 0.4009376768248323, "step": 2480}, {"loss": 0.8217, "grad_norm": 0.5914521217346191, "learning_rate": 0.0002, "epoch": 0.40255436100557757, "step": 2490}, {"loss": 0.7859, "grad_norm": 0.4428117275238037, "learning_rate": 0.0002, "epoch": 0.40417104518632285, "step": 2500}, {"loss": 0.8054, "grad_norm": 0.48580947518348694, "learning_rate": 0.0002, "epoch": 0.40578772936706814, "step": 2510}, {"loss": 0.8405, "grad_norm": 0.436734676361084, "learning_rate": 0.0002, "epoch": 0.4074044135478134, "step": 2520}, {"loss": 0.8209, "grad_norm": 0.5752223134040833, "learning_rate": 0.0002, "epoch": 0.4090210977285587, "step": 2530}, {"loss": 0.8181, "grad_norm": 0.4271308183670044, "learning_rate": 0.0002, "epoch": 0.410637781909304, "step": 2540}, {"loss": 0.8058, "grad_norm": 0.46294718980789185, "learning_rate": 0.0002, "epoch": 0.4122544660900493, "step": 2550}, {"loss": 0.8473, "grad_norm": 0.49407583475112915, "learning_rate": 0.0002, "epoch": 0.4138711502707946, "step": 2560}, {"loss": 0.7881, "grad_norm": 0.4729035496711731, "learning_rate": 0.0002, "epoch": 0.4154878344515399, "step": 2570}, {"loss": 0.7834, "grad_norm": 0.4129747152328491, "learning_rate": 0.0002, "epoch": 0.4171045186322852, "step": 2580}, {"loss": 0.7859, "grad_norm": 0.5684236288070679, "learning_rate": 0.0002, "epoch": 0.4187212028130305, "step": 2590}, {"loss": 0.811, "grad_norm": 0.4862157106399536, "learning_rate": 0.0002, "epoch": 0.4203378869937758, "step": 2600}, {"loss": 0.7582, "grad_norm": 0.46567976474761963, "learning_rate": 0.0002, "epoch": 0.42195457117452106, "step": 2610}, {"loss": 0.7755, "grad_norm": 0.5710650682449341, "learning_rate": 0.0002, "epoch": 0.42357125535526635, "step": 2620}, {"loss": 0.8573, "grad_norm": 0.5660041570663452, "learning_rate": 0.0002, "epoch": 0.42518793953601164, "step": 2630}, {"loss": 0.7812, "grad_norm": 0.47944375872612, "learning_rate": 0.0002, "epoch": 0.4268046237167569, "step": 2640}, {"loss": 0.7459, "grad_norm": 0.537223756313324, "learning_rate": 0.0002, "epoch": 0.4284213078975022, "step": 2650}, {"loss": 0.8246, "grad_norm": 0.41669997572898865, "learning_rate": 0.0002, "epoch": 0.4300379920782475, "step": 2660}, {"loss": 0.7785, "grad_norm": 0.44727686047554016, "learning_rate": 0.0002, "epoch": 0.4316546762589928, "step": 2670}, {"loss": 0.8241, "grad_norm": 0.5600888729095459, "learning_rate": 0.0002, "epoch": 0.4332713604397381, "step": 2680}, {"loss": 0.7708, "grad_norm": 0.39820605516433716, "learning_rate": 0.0002, "epoch": 0.4348880446204834, "step": 2690}, {"loss": 0.8202, "grad_norm": 0.5637655854225159, "learning_rate": 0.0002, "epoch": 0.4365047288012287, "step": 2700}, {"loss": 0.855, "grad_norm": 0.6363666653633118, "learning_rate": 0.0002, "epoch": 0.438121412981974, "step": 2710}, {"loss": 0.8468, "grad_norm": 0.5656129121780396, "learning_rate": 0.0002, "epoch": 0.4397380971627193, "step": 2720}, {"loss": 0.7845, "grad_norm": 0.5600156188011169, "learning_rate": 0.0002, "epoch": 0.44135478134346456, "step": 2730}, {"loss": 0.8405, "grad_norm": 0.5506579875946045, "learning_rate": 0.0002, "epoch": 0.44297146552420985, "step": 2740}, {"loss": 0.7725, "grad_norm": 0.49878305196762085, "learning_rate": 0.0002, "epoch": 0.44458814970495514, "step": 2750}, {"loss": 0.8292, "grad_norm": 0.4569213092327118, "learning_rate": 0.0002, "epoch": 0.4462048338857004, "step": 2760}, {"loss": 0.8028, "grad_norm": 0.6056680083274841, "learning_rate": 0.0002, "epoch": 0.4478215180664457, "step": 2770}, {"loss": 0.8242, "grad_norm": 0.44474557042121887, "learning_rate": 0.0002, "epoch": 0.449438202247191, "step": 2780}, {"loss": 0.801, "grad_norm": 0.46055394411087036, "learning_rate": 0.0002, "epoch": 0.4510548864279363, "step": 2790}, {"loss": 0.7521, "grad_norm": 0.4904133379459381, "learning_rate": 0.0002, "epoch": 0.4526715706086816, "step": 2800}, {"loss": 0.8829, "grad_norm": 0.5647031664848328, "learning_rate": 0.0002, "epoch": 0.45428825478942686, "step": 2810}, {"loss": 0.8622, "grad_norm": 0.5759473443031311, "learning_rate": 0.0002, "epoch": 0.4559049389701722, "step": 2820}, {"loss": 0.7812, "grad_norm": 0.5161895751953125, "learning_rate": 0.0002, "epoch": 0.4575216231509175, "step": 2830}, {"loss": 0.8045, "grad_norm": 0.4248254597187042, "learning_rate": 0.0002, "epoch": 0.4591383073316628, "step": 2840}, {"loss": 0.7838, "grad_norm": 0.45395001769065857, "learning_rate": 0.0002, "epoch": 0.46075499151240806, "step": 2850}, {"loss": 0.8208, "grad_norm": 0.5358697772026062, "learning_rate": 0.0002, "epoch": 0.46237167569315335, "step": 2860}, {"loss": 0.8147, "grad_norm": 0.5379165410995483, "learning_rate": 0.0002, "epoch": 0.46398835987389864, "step": 2870}, {"loss": 0.7403, "grad_norm": 0.4601989686489105, "learning_rate": 0.0002, "epoch": 0.4656050440546439, "step": 2880}, {"loss": 0.8523, "grad_norm": 0.671115517616272, "learning_rate": 0.0002, "epoch": 0.4672217282353892, "step": 2890}, {"loss": 0.8262, "grad_norm": 0.4425133168697357, "learning_rate": 0.0002, "epoch": 0.4688384124161345, "step": 2900}, {"loss": 0.8178, "grad_norm": 0.5446155071258545, "learning_rate": 0.0002, "epoch": 0.4704550965968798, "step": 2910}, {"loss": 0.8106, "grad_norm": 0.603306233882904, "learning_rate": 0.0002, "epoch": 0.47207178077762507, "step": 2920}, {"loss": 0.8044, "grad_norm": 0.5377997159957886, "learning_rate": 0.0002, "epoch": 0.47368846495837036, "step": 2930}, {"loss": 0.8075, "grad_norm": 0.4931027591228485, "learning_rate": 0.0002, "epoch": 0.4753051491391157, "step": 2940}, {"loss": 0.8004, "grad_norm": 0.4711960256099701, "learning_rate": 0.0002, "epoch": 0.476921833319861, "step": 2950}, {"loss": 0.8121, "grad_norm": 0.5020492672920227, "learning_rate": 0.0002, "epoch": 0.4785385175006063, "step": 2960}, {"loss": 0.8221, "grad_norm": 0.5428946614265442, "learning_rate": 0.0002, "epoch": 0.48015520168135156, "step": 2970}, {"loss": 0.7849, "grad_norm": 0.5294089317321777, "learning_rate": 0.0002, "epoch": 0.48177188586209685, "step": 2980}, {"loss": 0.8553, "grad_norm": 0.648289144039154, "learning_rate": 0.0002, "epoch": 0.48338857004284214, "step": 2990}, {"loss": 0.7874, "grad_norm": 0.47916680574417114, "learning_rate": 0.0002, "epoch": 0.4850052542235874, "step": 3000}, {"loss": 0.8087, "grad_norm": 0.43849772214889526, "learning_rate": 0.0002, "epoch": 0.4866219384043327, "step": 3010}, {"loss": 0.7662, "grad_norm": 0.47007861733436584, "learning_rate": 0.0002, "epoch": 0.488238622585078, "step": 3020}, {"loss": 0.757, "grad_norm": 0.6314331293106079, "learning_rate": 0.0002, "epoch": 0.4898553067658233, "step": 3030}, {"loss": 0.7863, "grad_norm": 0.49211493134498596, "learning_rate": 0.0002, "epoch": 0.49147199094656857, "step": 3040}, {"loss": 0.8335, "grad_norm": 0.4537973403930664, "learning_rate": 0.0002, "epoch": 0.49308867512731386, "step": 3050}, {"loss": 0.8095, "grad_norm": 0.47326919436454773, "learning_rate": 0.0002, "epoch": 0.49470535930805914, "step": 3060}, {"loss": 0.8447, "grad_norm": 0.525874137878418, "learning_rate": 0.0002, "epoch": 0.4963220434888045, "step": 3070}, {"loss": 0.8339, "grad_norm": 0.6361091732978821, "learning_rate": 0.0002, "epoch": 0.4979387276695498, "step": 3080}, {"loss": 0.821, "grad_norm": 0.5850642919540405, "learning_rate": 0.0002, "epoch": 0.49955541185029506, "step": 3090}, {"loss": 0.8279, "grad_norm": 0.47299543023109436, "learning_rate": 0.0002, "epoch": 0.5011720960310403, "step": 3100}, {"loss": 0.8681, "grad_norm": 0.473099946975708, "learning_rate": 0.0002, "epoch": 0.5027887802117856, "step": 3110}, {"loss": 0.8223, "grad_norm": 0.48186397552490234, "learning_rate": 0.0002, "epoch": 0.5044054643925309, "step": 3120}, {"loss": 0.8292, "grad_norm": 0.5015401840209961, "learning_rate": 0.0002, "epoch": 0.5060221485732762, "step": 3130}, {"loss": 0.7692, "grad_norm": 0.5617750287055969, "learning_rate": 0.0002, "epoch": 0.5076388327540216, "step": 3140}, {"loss": 0.8708, "grad_norm": 0.5169327259063721, "learning_rate": 0.0002, "epoch": 0.5092555169347668, "step": 3150}, {"loss": 0.7845, "grad_norm": 0.545657753944397, "learning_rate": 0.0002, "epoch": 0.5108722011155121, "step": 3160}, {"loss": 0.799, "grad_norm": 0.512864351272583, "learning_rate": 0.0002, "epoch": 0.5124888852962574, "step": 3170}, {"loss": 0.7794, "grad_norm": 0.4113546311855316, "learning_rate": 0.0002, "epoch": 0.5141055694770027, "step": 3180}, {"loss": 0.8206, "grad_norm": 0.44532445073127747, "learning_rate": 0.0002, "epoch": 0.5157222536577479, "step": 3190}, {"loss": 0.8213, "grad_norm": 0.5623497366905212, "learning_rate": 0.0002, "epoch": 0.5173389378384933, "step": 3200}, {"loss": 0.7928, "grad_norm": 0.5084741115570068, "learning_rate": 0.0002, "epoch": 0.5189556220192385, "step": 3210}, {"loss": 0.8174, "grad_norm": 0.5305403470993042, "learning_rate": 0.0002, "epoch": 0.5205723061999838, "step": 3220}, {"loss": 0.8139, "grad_norm": 0.4708254337310791, "learning_rate": 0.0002, "epoch": 0.5221889903807291, "step": 3230}, {"loss": 0.7639, "grad_norm": 0.43827131390571594, "learning_rate": 0.0002, "epoch": 0.5238056745614744, "step": 3240}, {"loss": 0.7993, "grad_norm": 0.5630002617835999, "learning_rate": 0.0002, "epoch": 0.5254223587422197, "step": 3250}, {"loss": 0.7522, "grad_norm": 0.5010961890220642, "learning_rate": 0.0002, "epoch": 0.527039042922965, "step": 3260}, {"loss": 0.8374, "grad_norm": 0.6303122043609619, "learning_rate": 0.0002, "epoch": 0.5286557271037103, "step": 3270}, {"loss": 0.7727, "grad_norm": 0.5107331275939941, "learning_rate": 0.0002, "epoch": 0.5302724112844556, "step": 3280}, {"loss": 0.8495, "grad_norm": 0.5700443387031555, "learning_rate": 0.0002, "epoch": 0.5318890954652009, "step": 3290}, {"loss": 0.7776, "grad_norm": 0.46296367049217224, "learning_rate": 0.0002, "epoch": 0.5335057796459461, "step": 3300}, {"loss": 0.7931, "grad_norm": 0.531568706035614, "learning_rate": 0.0002, "epoch": 0.5351224638266915, "step": 3310}, {"loss": 0.843, "grad_norm": 0.4686741530895233, "learning_rate": 0.0002, "epoch": 0.5367391480074367, "step": 3320}, {"loss": 0.8104, "grad_norm": 0.5404331088066101, "learning_rate": 0.0002, "epoch": 0.5383558321881821, "step": 3330}, {"loss": 0.7686, "grad_norm": 0.6368790864944458, "learning_rate": 0.0002, "epoch": 0.5399725163689273, "step": 3340}, {"loss": 0.8514, "grad_norm": 0.42300888895988464, "learning_rate": 0.0002, "epoch": 0.5415892005496726, "step": 3350}, {"loss": 0.8236, "grad_norm": 0.5362542867660522, "learning_rate": 0.0002, "epoch": 0.5432058847304179, "step": 3360}, {"loss": 0.858, "grad_norm": 0.497128963470459, "learning_rate": 0.0002, "epoch": 0.5448225689111632, "step": 3370}, {"loss": 0.8519, "grad_norm": 0.5006386041641235, "learning_rate": 0.0002, "epoch": 0.5464392530919085, "step": 3380}, {"loss": 0.7867, "grad_norm": 0.44136837124824524, "learning_rate": 0.0002, "epoch": 0.5480559372726538, "step": 3390}, {"loss": 0.773, "grad_norm": 0.5897833108901978, "learning_rate": 0.0002, "epoch": 0.5496726214533991, "step": 3400}, {"loss": 0.8895, "grad_norm": 0.641075611114502, "learning_rate": 0.0002, "epoch": 0.5512893056341444, "step": 3410}, {"loss": 0.7827, "grad_norm": 0.7251322269439697, "learning_rate": 0.0002, "epoch": 0.5529059898148897, "step": 3420}, {"loss": 0.7626, "grad_norm": 0.47411349415779114, "learning_rate": 0.0002, "epoch": 0.5545226739956349, "step": 3430}, {"loss": 0.8196, "grad_norm": 0.4994310438632965, "learning_rate": 0.0002, "epoch": 0.5561393581763803, "step": 3440}, {"loss": 0.7812, "grad_norm": 0.5814438462257385, "learning_rate": 0.0002, "epoch": 0.5577560423571255, "step": 3450}, {"loss": 0.8805, "grad_norm": 0.6278898119926453, "learning_rate": 0.0002, "epoch": 0.5593727265378708, "step": 3460}, {"loss": 0.813, "grad_norm": 0.46208274364471436, "learning_rate": 0.0002, "epoch": 0.5609894107186161, "step": 3470}, {"loss": 0.8295, "grad_norm": 0.5718930959701538, "learning_rate": 0.0002, "epoch": 0.5626060948993614, "step": 3480}, {"loss": 0.8152, "grad_norm": 0.48178744316101074, "learning_rate": 0.0002, "epoch": 0.5642227790801067, "step": 3490}, {"loss": 0.8244, "grad_norm": 0.47336965799331665, "learning_rate": 0.0002, "epoch": 0.565839463260852, "step": 3500}, {"loss": 0.8099, "grad_norm": 0.43442684412002563, "learning_rate": 0.0002, "epoch": 0.5674561474415973, "step": 3510}, {"loss": 0.7564, "grad_norm": 0.6463358998298645, "learning_rate": 0.0002, "epoch": 0.5690728316223426, "step": 3520}, {"loss": 0.836, "grad_norm": 0.5286486744880676, "learning_rate": 0.0002, "epoch": 0.5706895158030879, "step": 3530}, {"loss": 0.8421, "grad_norm": 0.5405499935150146, "learning_rate": 0.0002, "epoch": 0.5723061999838331, "step": 3540}, {"loss": 0.7614, "grad_norm": 0.6654391884803772, "learning_rate": 0.0002, "epoch": 0.5739228841645785, "step": 3550}, {"loss": 0.7803, "grad_norm": 0.5081980228424072, "learning_rate": 0.0002, "epoch": 0.5755395683453237, "step": 3560}, {"loss": 0.7753, "grad_norm": 0.48978179693222046, "learning_rate": 0.0002, "epoch": 0.5771562525260691, "step": 3570}, {"loss": 0.8151, "grad_norm": 0.5840612053871155, "learning_rate": 0.0002, "epoch": 0.5787729367068143, "step": 3580}, {"loss": 0.8937, "grad_norm": 0.5235261917114258, "learning_rate": 0.0002, "epoch": 0.5803896208875596, "step": 3590}, {"loss": 0.7894, "grad_norm": 0.5672075748443604, "learning_rate": 0.0002, "epoch": 0.5820063050683049, "step": 3600}, {"loss": 0.8347, "grad_norm": 0.5613429546356201, "learning_rate": 0.0002, "epoch": 0.5836229892490502, "step": 3610}, {"loss": 0.8274, "grad_norm": 0.4032273590564728, "learning_rate": 0.0002, "epoch": 0.5852396734297954, "step": 3620}, {"loss": 0.8421, "grad_norm": 0.49559324979782104, "learning_rate": 0.0002, "epoch": 0.5868563576105408, "step": 3630}, {"loss": 0.8332, "grad_norm": 0.6895697712898254, "learning_rate": 0.0002, "epoch": 0.5884730417912861, "step": 3640}, {"loss": 0.7877, "grad_norm": 0.4750136435031891, "learning_rate": 0.0002, "epoch": 0.5900897259720314, "step": 3650}, {"loss": 0.8219, "grad_norm": 0.5176819562911987, "learning_rate": 0.0002, "epoch": 0.5917064101527767, "step": 3660}, {"loss": 0.8151, "grad_norm": 0.5817760229110718, "learning_rate": 0.0002, "epoch": 0.5933230943335219, "step": 3670}, {"loss": 0.7823, "grad_norm": 0.6064626574516296, "learning_rate": 0.0002, "epoch": 0.5949397785142673, "step": 3680}, {"loss": 0.8422, "grad_norm": 0.6728700995445251, "learning_rate": 0.0002, "epoch": 0.5965564626950125, "step": 3690}, {"loss": 0.7679, "grad_norm": 0.609305202960968, "learning_rate": 0.0002, "epoch": 0.5981731468757578, "step": 3700}, {"loss": 0.8048, "grad_norm": 0.4615488350391388, "learning_rate": 0.0002, "epoch": 0.5997898310565031, "step": 3710}, {"loss": 0.8214, "grad_norm": 2.0531179904937744, "learning_rate": 0.0002, "epoch": 0.6014065152372484, "step": 3720}, {"loss": 0.8158, "grad_norm": 0.5091132521629333, "learning_rate": 0.0002, "epoch": 0.6030231994179936, "step": 3730}, {"loss": 0.7833, "grad_norm": 0.5951124429702759, "learning_rate": 0.0002, "epoch": 0.604639883598739, "step": 3740}, {"loss": 0.7784, "grad_norm": 0.5870208144187927, "learning_rate": 0.0002, "epoch": 0.6062565677794842, "step": 3750}, {"loss": 0.8044, "grad_norm": 0.6254619359970093, "learning_rate": 0.0002, "epoch": 0.6078732519602296, "step": 3760}, {"loss": 0.7868, "grad_norm": 0.5577626824378967, "learning_rate": 0.0002, "epoch": 0.6094899361409749, "step": 3770}, {"loss": 0.8108, "grad_norm": 0.5004405379295349, "learning_rate": 0.0002, "epoch": 0.6111066203217201, "step": 3780}, {"loss": 0.8092, "grad_norm": 0.5527383685112, "learning_rate": 0.0002, "epoch": 0.6127233045024655, "step": 3790}, {"loss": 0.8036, "grad_norm": 0.49116113781929016, "learning_rate": 0.0002, "epoch": 0.6143399886832107, "step": 3800}, {"loss": 0.8352, "grad_norm": 0.5299299359321594, "learning_rate": 0.0002, "epoch": 0.6159566728639561, "step": 3810}, {"loss": 0.7737, "grad_norm": 0.464897483587265, "learning_rate": 0.0002, "epoch": 0.6175733570447013, "step": 3820}, {"loss": 0.7923, "grad_norm": 0.6505740880966187, "learning_rate": 0.0002, "epoch": 0.6191900412254466, "step": 3830}, {"loss": 0.8123, "grad_norm": 0.5512559413909912, "learning_rate": 0.0002, "epoch": 0.6208067254061919, "step": 3840}, {"loss": 0.8856, "grad_norm": 0.49427518248558044, "learning_rate": 0.0002, "epoch": 0.6224234095869372, "step": 3850}, {"loss": 0.7751, "grad_norm": 0.3839147090911865, "learning_rate": 0.0002, "epoch": 0.6240400937676824, "step": 3860}, {"loss": 0.8006, "grad_norm": 0.5760218501091003, "learning_rate": 0.0002, "epoch": 0.6256567779484278, "step": 3870}, {"loss": 0.7836, "grad_norm": 0.7226507067680359, "learning_rate": 0.0002, "epoch": 0.6272734621291731, "step": 3880}, {"loss": 0.8244, "grad_norm": 0.676781415939331, "learning_rate": 0.0002, "epoch": 0.6288901463099184, "step": 3890}, {"loss": 0.8239, "grad_norm": 0.4284018278121948, "learning_rate": 0.0002, "epoch": 0.6305068304906637, "step": 3900}, {"loss": 0.7996, "grad_norm": 0.5060628056526184, "learning_rate": 0.0002, "epoch": 0.6321235146714089, "step": 3910}, {"loss": 0.8089, "grad_norm": 0.5524522066116333, "learning_rate": 0.0002, "epoch": 0.6337401988521543, "step": 3920}, {"loss": 0.8276, "grad_norm": 0.6099881529808044, "learning_rate": 0.0002, "epoch": 0.6353568830328995, "step": 3930}, {"loss": 0.809, "grad_norm": 0.43155938386917114, "learning_rate": 0.0002, "epoch": 0.6369735672136448, "step": 3940}, {"loss": 0.8404, "grad_norm": 0.6427084803581238, "learning_rate": 0.0002, "epoch": 0.6385902513943901, "step": 3950}, {"loss": 0.8368, "grad_norm": 0.541220486164093, "learning_rate": 0.0002, "epoch": 0.6402069355751354, "step": 3960}, {"loss": 0.8539, "grad_norm": 0.5414294600486755, "learning_rate": 0.0002, "epoch": 0.6418236197558806, "step": 3970}, {"loss": 0.7996, "grad_norm": 0.46344003081321716, "learning_rate": 0.0002, "epoch": 0.643440303936626, "step": 3980}, {"loss": 0.7474, "grad_norm": 0.45209285616874695, "learning_rate": 0.0002, "epoch": 0.6450569881173712, "step": 3990}, {"loss": 0.8202, "grad_norm": 0.5417284369468689, "learning_rate": 0.0002, "epoch": 0.6466736722981166, "step": 4000}, {"loss": 0.7563, "grad_norm": 0.7995685935020447, "learning_rate": 0.0002, "epoch": 0.6482903564788619, "step": 4010}, {"loss": 0.7812, "grad_norm": 0.6384002566337585, "learning_rate": 0.0002, "epoch": 0.6499070406596071, "step": 4020}, {"loss": 0.732, "grad_norm": 0.4472815692424774, "learning_rate": 0.0002, "epoch": 0.6515237248403525, "step": 4030}, {"loss": 0.8071, "grad_norm": 0.6834294199943542, "learning_rate": 0.0002, "epoch": 0.6531404090210977, "step": 4040}, {"loss": 0.7812, "grad_norm": 0.4612339735031128, "learning_rate": 0.0002, "epoch": 0.654757093201843, "step": 4050}, {"loss": 0.8141, "grad_norm": 0.9266576170921326, "learning_rate": 0.0002, "epoch": 0.6563737773825883, "step": 4060}, {"loss": 0.7991, "grad_norm": 0.4470861852169037, "learning_rate": 0.0002, "epoch": 0.6579904615633336, "step": 4070}, {"loss": 0.8293, "grad_norm": 0.45544925332069397, "learning_rate": 0.0002, "epoch": 0.6596071457440789, "step": 4080}, {"loss": 0.8455, "grad_norm": 0.6144481301307678, "learning_rate": 0.0002, "epoch": 0.6612238299248242, "step": 4090}, {"loss": 0.7877, "grad_norm": 0.5936288237571716, "learning_rate": 0.0002, "epoch": 0.6628405141055694, "step": 4100}, {"loss": 0.7617, "grad_norm": 0.4822963774204254, "learning_rate": 0.0002, "epoch": 0.6644571982863148, "step": 4110}, {"loss": 0.7997, "grad_norm": 0.48432496190071106, "learning_rate": 0.0002, "epoch": 0.66607388246706, "step": 4120}, {"loss": 0.8404, "grad_norm": 0.4901607930660248, "learning_rate": 0.0002, "epoch": 0.6676905666478054, "step": 4130}, {"loss": 0.8085, "grad_norm": 0.5018393397331238, "learning_rate": 0.0002, "epoch": 0.6693072508285507, "step": 4140}, {"loss": 0.8065, "grad_norm": 0.6946378946304321, "learning_rate": 0.0002, "epoch": 0.6709239350092959, "step": 4150}, {"loss": 0.8147, "grad_norm": 0.5997390747070312, "learning_rate": 0.0002, "epoch": 0.6725406191900413, "step": 4160}, {"loss": 0.8268, "grad_norm": 0.6738849878311157, "learning_rate": 0.0002, "epoch": 0.6741573033707865, "step": 4170}, {"loss": 0.7704, "grad_norm": 0.6110581159591675, "learning_rate": 0.0002, "epoch": 0.6757739875515318, "step": 4180}, {"loss": 0.8043, "grad_norm": 0.5703322291374207, "learning_rate": 0.0002, "epoch": 0.6773906717322771, "step": 4190}, {"loss": 0.8099, "grad_norm": 0.4686066210269928, "learning_rate": 0.0002, "epoch": 0.6790073559130224, "step": 4200}, {"loss": 0.8441, "grad_norm": 0.6394643783569336, "learning_rate": 0.0002, "epoch": 0.6806240400937676, "step": 4210}, {"loss": 0.8011, "grad_norm": 0.5454841256141663, "learning_rate": 0.0002, "epoch": 0.682240724274513, "step": 4220}, {"loss": 0.8307, "grad_norm": 0.4859732985496521, "learning_rate": 0.0002, "epoch": 0.6838574084552582, "step": 4230}, {"loss": 0.8161, "grad_norm": 0.5544065833091736, "learning_rate": 0.0002, "epoch": 0.6854740926360036, "step": 4240}, {"loss": 0.7839, "grad_norm": 0.4902505576610565, "learning_rate": 0.0002, "epoch": 0.6870907768167488, "step": 4250}, {"loss": 0.7977, "grad_norm": 0.4768051505088806, "learning_rate": 0.0002, "epoch": 0.6887074609974941, "step": 4260}, {"loss": 0.7539, "grad_norm": 0.49982190132141113, "learning_rate": 0.0002, "epoch": 0.6903241451782395, "step": 4270}, {"loss": 0.7353, "grad_norm": 0.6351838111877441, "learning_rate": 0.0002, "epoch": 0.6919408293589847, "step": 4280}, {"loss": 0.7664, "grad_norm": 0.5647561550140381, "learning_rate": 0.0002, "epoch": 0.69355751353973, "step": 4290}, {"loss": 0.7618, "grad_norm": 0.5340486764907837, "learning_rate": 0.0002, "epoch": 0.6951741977204753, "step": 4300}, {"loss": 0.8526, "grad_norm": 0.5649092793464661, "learning_rate": 0.0002, "epoch": 0.6967908819012206, "step": 4310}, {"loss": 0.8246, "grad_norm": 0.6183916926383972, "learning_rate": 0.0002, "epoch": 0.6984075660819659, "step": 4320}, {"loss": 0.792, "grad_norm": 0.6154509782791138, "learning_rate": 0.0002, "epoch": 0.7000242502627112, "step": 4330}, {"loss": 0.8397, "grad_norm": 0.5156264305114746, "learning_rate": 0.0002, "epoch": 0.7016409344434564, "step": 4340}, {"loss": 0.8512, "grad_norm": 0.562171459197998, "learning_rate": 0.0002, "epoch": 0.7032576186242018, "step": 4350}, {"loss": 0.7882, "grad_norm": 0.4949502646923065, "learning_rate": 0.0002, "epoch": 0.704874302804947, "step": 4360}, {"loss": 0.738, "grad_norm": 0.5171684622764587, "learning_rate": 0.0002, "epoch": 0.7064909869856923, "step": 4370}, {"loss": 0.8001, "grad_norm": 0.6198443174362183, "learning_rate": 0.0002, "epoch": 0.7081076711664377, "step": 4380}, {"loss": 0.7606, "grad_norm": 0.5802276134490967, "learning_rate": 0.0002, "epoch": 0.7097243553471829, "step": 4390}, {"loss": 0.8797, "grad_norm": 0.41096967458724976, "learning_rate": 0.0002, "epoch": 0.7113410395279283, "step": 4400}, {"loss": 0.805, "grad_norm": 0.4397392272949219, "learning_rate": 0.0002, "epoch": 0.7129577237086735, "step": 4410}, {"loss": 0.7651, "grad_norm": 0.45228442549705505, "learning_rate": 0.0002, "epoch": 0.7145744078894188, "step": 4420}, {"loss": 0.7938, "grad_norm": 0.4839673936367035, "learning_rate": 0.0002, "epoch": 0.7161910920701641, "step": 4430}, {"loss": 0.8362, "grad_norm": 0.6140755414962769, "learning_rate": 0.0002, "epoch": 0.7178077762509094, "step": 4440}, {"loss": 0.7722, "grad_norm": 0.6841378808021545, "learning_rate": 0.0002, "epoch": 0.7194244604316546, "step": 4450}, {"loss": 0.8177, "grad_norm": 0.6664239168167114, "learning_rate": 0.0002, "epoch": 0.7210411446124, "step": 4460}, {"loss": 0.7983, "grad_norm": 0.47552719712257385, "learning_rate": 0.0002, "epoch": 0.7226578287931452, "step": 4470}, {"loss": 0.8982, "grad_norm": 0.6649776101112366, "learning_rate": 0.0002, "epoch": 0.7242745129738906, "step": 4480}, {"loss": 0.8074, "grad_norm": 0.5159541964530945, "learning_rate": 0.0002, "epoch": 0.7258911971546358, "step": 4490}, {"loss": 0.7786, "grad_norm": 0.6693112850189209, "learning_rate": 0.0002, "epoch": 0.7275078813353811, "step": 4500}, {"loss": 0.8655, "grad_norm": 0.48870977759361267, "learning_rate": 0.0002, "epoch": 0.7291245655161265, "step": 4510}, {"loss": 0.7337, "grad_norm": 0.4857887923717499, "learning_rate": 0.0002, "epoch": 0.7307412496968717, "step": 4520}, {"loss": 0.8026, "grad_norm": 0.5515662431716919, "learning_rate": 0.0002, "epoch": 0.732357933877617, "step": 4530}, {"loss": 0.8031, "grad_norm": 0.6292222738265991, "learning_rate": 0.0002, "epoch": 0.7339746180583623, "step": 4540}, {"loss": 0.7749, "grad_norm": 0.48265689611434937, "learning_rate": 0.0002, "epoch": 0.7355913022391076, "step": 4550}, {"loss": 0.8499, "grad_norm": 0.8044266104698181, "learning_rate": 0.0002, "epoch": 0.7372079864198529, "step": 4560}, {"loss": 0.8162, "grad_norm": 0.6111769676208496, "learning_rate": 0.0002, "epoch": 0.7388246706005982, "step": 4570}, {"loss": 0.7291, "grad_norm": 0.5229553580284119, "learning_rate": 0.0002, "epoch": 0.7404413547813434, "step": 4580}, {"loss": 0.8038, "grad_norm": 0.6054152250289917, "learning_rate": 0.0002, "epoch": 0.7420580389620888, "step": 4590}, {"loss": 0.8169, "grad_norm": 0.5574966669082642, "learning_rate": 0.0002, "epoch": 0.743674723142834, "step": 4600}, {"loss": 0.8439, "grad_norm": 0.5395817160606384, "learning_rate": 0.0002, "epoch": 0.7452914073235793, "step": 4610}, {"loss": 0.8495, "grad_norm": 0.7116472721099854, "learning_rate": 0.0002, "epoch": 0.7469080915043246, "step": 4620}, {"loss": 0.7743, "grad_norm": 0.5618700981140137, "learning_rate": 0.0002, "epoch": 0.7485247756850699, "step": 4630}, {"loss": 0.7744, "grad_norm": 0.5802770853042603, "learning_rate": 0.0002, "epoch": 0.7501414598658153, "step": 4640}, {"loss": 0.7924, "grad_norm": 0.5690428018569946, "learning_rate": 0.0002, "epoch": 0.7517581440465605, "step": 4650}, {"loss": 0.8017, "grad_norm": 0.4813360273838043, "learning_rate": 0.0002, "epoch": 0.7533748282273058, "step": 4660}, {"loss": 0.8108, "grad_norm": 0.5434042811393738, "learning_rate": 0.0002, "epoch": 0.7549915124080511, "step": 4670}, {"loss": 0.7824, "grad_norm": 0.5502099990844727, "learning_rate": 0.0002, "epoch": 0.7566081965887964, "step": 4680}, {"loss": 0.8598, "grad_norm": 0.6020621061325073, "learning_rate": 0.0002, "epoch": 0.7582248807695416, "step": 4690}, {"loss": 0.7937, "grad_norm": 0.4922301471233368, "learning_rate": 0.0002, "epoch": 0.759841564950287, "step": 4700}, {"loss": 0.788, "grad_norm": 0.6492828726768494, "learning_rate": 0.0002, "epoch": 0.7614582491310322, "step": 4710}, {"loss": 0.8313, "grad_norm": 0.4865580201148987, "learning_rate": 0.0002, "epoch": 0.7630749333117776, "step": 4720}, {"loss": 0.7966, "grad_norm": 0.5971422791481018, "learning_rate": 0.0002, "epoch": 0.7646916174925228, "step": 4730}, {"loss": 0.8298, "grad_norm": 0.6832674145698547, "learning_rate": 0.0002, "epoch": 0.7663083016732681, "step": 4740}, {"loss": 0.8156, "grad_norm": 0.500908613204956, "learning_rate": 0.0002, "epoch": 0.7679249858540134, "step": 4750}, {"loss": 0.8383, "grad_norm": 0.6112465858459473, "learning_rate": 0.0002, "epoch": 0.7695416700347587, "step": 4760}, {"loss": 0.76, "grad_norm": 0.5753506422042847, "learning_rate": 0.0002, "epoch": 0.771158354215504, "step": 4770}, {"loss": 0.8297, "grad_norm": 0.6529405117034912, "learning_rate": 0.0002, "epoch": 0.7727750383962493, "step": 4780}, {"loss": 0.8171, "grad_norm": 0.5916843414306641, "learning_rate": 0.0002, "epoch": 0.7743917225769946, "step": 4790}, {"loss": 0.83, "grad_norm": 0.4821224510669708, "learning_rate": 0.0002, "epoch": 0.7760084067577399, "step": 4800}, {"loss": 0.7703, "grad_norm": 0.5532580018043518, "learning_rate": 0.0002, "epoch": 0.7776250909384852, "step": 4810}, {"loss": 0.7363, "grad_norm": 0.4604877233505249, "learning_rate": 0.0002, "epoch": 0.7792417751192304, "step": 4820}, {"loss": 0.7506, "grad_norm": 0.5009613037109375, "learning_rate": 0.0002, "epoch": 0.7808584592999758, "step": 4830}, {"loss": 0.7863, "grad_norm": 0.6448560357093811, "learning_rate": 0.0002, "epoch": 0.782475143480721, "step": 4840}, {"loss": 0.7957, "grad_norm": 0.44327953457832336, "learning_rate": 0.0002, "epoch": 0.7840918276614663, "step": 4850}, {"loss": 0.7925, "grad_norm": 0.5355411171913147, "learning_rate": 0.0002, "epoch": 0.7857085118422116, "step": 4860}, {"loss": 0.7754, "grad_norm": 0.5635677576065063, "learning_rate": 0.0002, "epoch": 0.7873251960229569, "step": 4870}, {"loss": 0.7931, "grad_norm": 0.5417491793632507, "learning_rate": 0.0002, "epoch": 0.7889418802037023, "step": 4880}, {"loss": 0.7819, "grad_norm": 0.4567430913448334, "learning_rate": 0.0002, "epoch": 0.7905585643844475, "step": 4890}, {"loss": 0.8454, "grad_norm": 0.44651296734809875, "learning_rate": 0.0002, "epoch": 0.7921752485651928, "step": 4900}, {"loss": 0.7959, "grad_norm": 0.5741217136383057, "learning_rate": 0.0002, "epoch": 0.7937919327459381, "step": 4910}, {"loss": 0.8093, "grad_norm": 0.6605045199394226, "learning_rate": 0.0002, "epoch": 0.7954086169266834, "step": 4920}, {"loss": 0.77, "grad_norm": 0.5126531720161438, "learning_rate": 0.0002, "epoch": 0.7970253011074286, "step": 4930}, {"loss": 0.7793, "grad_norm": 0.513648271560669, "learning_rate": 0.0002, "epoch": 0.798641985288174, "step": 4940}, {"loss": 0.8314, "grad_norm": 0.5350404381752014, "learning_rate": 0.0002, "epoch": 0.8002586694689192, "step": 4950}, {"loss": 0.7649, "grad_norm": 0.5731674432754517, "learning_rate": 0.0002, "epoch": 0.8018753536496646, "step": 4960}, {"loss": 0.8572, "grad_norm": 0.5974258184432983, "learning_rate": 0.0002, "epoch": 0.8034920378304098, "step": 4970}, {"loss": 0.7972, "grad_norm": 0.8774799704551697, "learning_rate": 0.0002, "epoch": 0.8051087220111551, "step": 4980}, {"loss": 0.7899, "grad_norm": 0.5994430184364319, "learning_rate": 0.0002, "epoch": 0.8067254061919004, "step": 4990}, {"loss": 0.7736, "grad_norm": 0.4894903004169464, "learning_rate": 0.0002, "epoch": 0.8083420903726457, "step": 5000}, {"loss": 0.78, "grad_norm": 0.5218459367752075, "learning_rate": 0.0002, "epoch": 0.809958774553391, "step": 5010}, {"loss": 0.817, "grad_norm": 0.5232468843460083, "learning_rate": 0.0002, "epoch": 0.8115754587341363, "step": 5020}, {"loss": 0.7704, "grad_norm": 0.44358372688293457, "learning_rate": 0.0002, "epoch": 0.8131921429148816, "step": 5030}, {"loss": 0.785, "grad_norm": 0.6202037334442139, "learning_rate": 0.0002, "epoch": 0.8148088270956269, "step": 5040}, {"loss": 0.7351, "grad_norm": 0.7721474170684814, "learning_rate": 0.0002, "epoch": 0.8164255112763722, "step": 5050}, {"loss": 0.8297, "grad_norm": 0.5568501353263855, "learning_rate": 0.0002, "epoch": 0.8180421954571174, "step": 5060}, {"loss": 0.7733, "grad_norm": 0.49148809909820557, "learning_rate": 0.0002, "epoch": 0.8196588796378628, "step": 5070}, {"loss": 0.8054, "grad_norm": 0.4956012964248657, "learning_rate": 0.0002, "epoch": 0.821275563818608, "step": 5080}, {"loss": 0.8201, "grad_norm": 0.6078833937644958, "learning_rate": 0.0002, "epoch": 0.8228922479993533, "step": 5090}, {"loss": 0.828, "grad_norm": 0.46906954050064087, "learning_rate": 0.0002, "epoch": 0.8245089321800986, "step": 5100}, {"loss": 0.7703, "grad_norm": 0.50812166929245, "learning_rate": 0.0002, "epoch": 0.8261256163608439, "step": 5110}, {"loss": 0.8243, "grad_norm": 0.5319661498069763, "learning_rate": 0.0002, "epoch": 0.8277423005415891, "step": 5120}, {"loss": 0.7798, "grad_norm": 0.4949689209461212, "learning_rate": 0.0002, "epoch": 0.8293589847223345, "step": 5130}, {"loss": 0.7428, "grad_norm": 0.5151591300964355, "learning_rate": 0.0002, "epoch": 0.8309756689030798, "step": 5140}, {"loss": 0.8147, "grad_norm": 0.5530214309692383, "learning_rate": 0.0002, "epoch": 0.8325923530838251, "step": 5150}, {"loss": 0.8251, "grad_norm": 0.6297410130500793, "learning_rate": 0.0002, "epoch": 0.8342090372645704, "step": 5160}, {"loss": 0.8067, "grad_norm": 0.5466840267181396, "learning_rate": 0.0002, "epoch": 0.8358257214453156, "step": 5170}, {"loss": 0.7875, "grad_norm": 0.652913510799408, "learning_rate": 0.0002, "epoch": 0.837442405626061, "step": 5180}, {"loss": 0.8295, "grad_norm": 0.5811293125152588, "learning_rate": 0.0002, "epoch": 0.8390590898068062, "step": 5190}, {"loss": 0.7412, "grad_norm": 0.5109550952911377, "learning_rate": 0.0002, "epoch": 0.8406757739875516, "step": 5200}, {"loss": 0.8077, "grad_norm": 0.4551706612110138, "learning_rate": 0.0002, "epoch": 0.8422924581682968, "step": 5210}, {"loss": 0.7827, "grad_norm": 0.5813754200935364, "learning_rate": 0.0002, "epoch": 0.8439091423490421, "step": 5220}, {"loss": 0.802, "grad_norm": 0.5856947898864746, "learning_rate": 0.0002, "epoch": 0.8455258265297874, "step": 5230}, {"loss": 0.7957, "grad_norm": 0.5482739210128784, "learning_rate": 0.0002, "epoch": 0.8471425107105327, "step": 5240}, {"loss": 0.8295, "grad_norm": 0.49023720622062683, "learning_rate": 0.0002, "epoch": 0.8487591948912779, "step": 5250}, {"loss": 0.8022, "grad_norm": 0.49472475051879883, "learning_rate": 0.0002, "epoch": 0.8503758790720233, "step": 5260}, {"loss": 0.8001, "grad_norm": 0.5490226745605469, "learning_rate": 0.0002, "epoch": 0.8519925632527686, "step": 5270}, {"loss": 0.8333, "grad_norm": 0.5340665578842163, "learning_rate": 0.0002, "epoch": 0.8536092474335139, "step": 5280}, {"loss": 0.8277, "grad_norm": 0.5962483882904053, "learning_rate": 0.0002, "epoch": 0.8552259316142592, "step": 5290}, {"loss": 0.8765, "grad_norm": 0.586358368396759, "learning_rate": 0.0002, "epoch": 0.8568426157950044, "step": 5300}, {"loss": 0.7831, "grad_norm": 0.49120277166366577, "learning_rate": 0.0002, "epoch": 0.8584592999757498, "step": 5310}, {"loss": 0.8162, "grad_norm": 0.5887332558631897, "learning_rate": 0.0002, "epoch": 0.860075984156495, "step": 5320}, {"loss": 0.7464, "grad_norm": 0.42496153712272644, "learning_rate": 0.0002, "epoch": 0.8616926683372403, "step": 5330}, {"loss": 0.7905, "grad_norm": 0.5489874482154846, "learning_rate": 0.0002, "epoch": 0.8633093525179856, "step": 5340}, {"loss": 0.7958, "grad_norm": 0.5850813984870911, "learning_rate": 0.0002, "epoch": 0.8649260366987309, "step": 5350}, {"loss": 0.7642, "grad_norm": 0.517487108707428, "learning_rate": 0.0002, "epoch": 0.8665427208794761, "step": 5360}, {"loss": 0.7801, "grad_norm": 0.5339142680168152, "learning_rate": 0.0002, "epoch": 0.8681594050602215, "step": 5370}, {"loss": 0.818, "grad_norm": 0.6236387491226196, "learning_rate": 0.0002, "epoch": 0.8697760892409668, "step": 5380}, {"loss": 0.7708, "grad_norm": 0.5752192735671997, "learning_rate": 0.0002, "epoch": 0.8713927734217121, "step": 5390}, {"loss": 0.8542, "grad_norm": 0.6724614500999451, "learning_rate": 0.0002, "epoch": 0.8730094576024574, "step": 5400}, {"loss": 0.7581, "grad_norm": 0.5280613303184509, "learning_rate": 0.0002, "epoch": 0.8746261417832026, "step": 5410}, {"loss": 0.8231, "grad_norm": 0.44033288955688477, "learning_rate": 0.0002, "epoch": 0.876242825963948, "step": 5420}, {"loss": 0.8839, "grad_norm": 0.5199708342552185, "learning_rate": 0.0002, "epoch": 0.8778595101446932, "step": 5430}, {"loss": 0.7852, "grad_norm": 0.46778348088264465, "learning_rate": 0.0002, "epoch": 0.8794761943254386, "step": 5440}, {"loss": 0.7834, "grad_norm": 0.4657754898071289, "learning_rate": 0.0002, "epoch": 0.8810928785061838, "step": 5450}, {"loss": 0.7799, "grad_norm": 0.5472902655601501, "learning_rate": 0.0002, "epoch": 0.8827095626869291, "step": 5460}, {"loss": 0.8253, "grad_norm": 0.4876766800880432, "learning_rate": 0.0002, "epoch": 0.8843262468676744, "step": 5470}, {"loss": 0.7906, "grad_norm": 0.5057248473167419, "learning_rate": 0.0002, "epoch": 0.8859429310484197, "step": 5480}, {"loss": 0.8124, "grad_norm": 0.4637320637702942, "learning_rate": 0.0002, "epoch": 0.8875596152291649, "step": 5490}, {"loss": 0.781, "grad_norm": 0.471955806016922, "learning_rate": 0.0002, "epoch": 0.8891762994099103, "step": 5500}, {"loss": 0.8057, "grad_norm": 0.5209813714027405, "learning_rate": 0.0002, "epoch": 0.8907929835906556, "step": 5510}, {"loss": 0.8106, "grad_norm": 0.6213834285736084, "learning_rate": 0.0002, "epoch": 0.8924096677714008, "step": 5520}, {"loss": 0.7787, "grad_norm": 0.5215408205986023, "learning_rate": 0.0002, "epoch": 0.8940263519521462, "step": 5530}, {"loss": 0.8174, "grad_norm": 0.580478310585022, "learning_rate": 0.0002, "epoch": 0.8956430361328914, "step": 5540}, {"loss": 0.8371, "grad_norm": 0.49102169275283813, "learning_rate": 0.0002, "epoch": 0.8972597203136368, "step": 5550}, {"loss": 0.7806, "grad_norm": 0.6043479442596436, "learning_rate": 0.0002, "epoch": 0.898876404494382, "step": 5560}, {"loss": 0.7754, "grad_norm": 0.5636463165283203, "learning_rate": 0.0002, "epoch": 0.9004930886751273, "step": 5570}, {"loss": 0.8145, "grad_norm": 0.5620124340057373, "learning_rate": 0.0002, "epoch": 0.9021097728558726, "step": 5580}, {"loss": 0.8083, "grad_norm": 0.5206354856491089, "learning_rate": 0.0002, "epoch": 0.9037264570366179, "step": 5590}, {"loss": 0.8557, "grad_norm": 0.5798229575157166, "learning_rate": 0.0002, "epoch": 0.9053431412173631, "step": 5600}, {"loss": 0.8097, "grad_norm": 0.6428212523460388, "learning_rate": 0.0002, "epoch": 0.9069598253981085, "step": 5610}, {"loss": 0.7839, "grad_norm": 0.48064687848091125, "learning_rate": 0.0002, "epoch": 0.9085765095788537, "step": 5620}, {"loss": 0.8343, "grad_norm": 0.6347860097885132, "learning_rate": 0.0002, "epoch": 0.9101931937595991, "step": 5630}, {"loss": 0.851, "grad_norm": 0.5353913307189941, "learning_rate": 0.0002, "epoch": 0.9118098779403444, "step": 5640}, {"loss": 0.7736, "grad_norm": 0.5323944091796875, "learning_rate": 0.0002, "epoch": 0.9134265621210896, "step": 5650}, {"loss": 0.8393, "grad_norm": 0.5261843204498291, "learning_rate": 0.0002, "epoch": 0.915043246301835, "step": 5660}, {"loss": 0.7355, "grad_norm": 0.5451326966285706, "learning_rate": 0.0002, "epoch": 0.9166599304825802, "step": 5670}, {"loss": 0.8012, "grad_norm": 0.5183324217796326, "learning_rate": 0.0002, "epoch": 0.9182766146633256, "step": 5680}, {"loss": 0.7659, "grad_norm": 0.47229018807411194, "learning_rate": 0.0002, "epoch": 0.9198932988440708, "step": 5690}, {"loss": 0.7757, "grad_norm": 0.49180513620376587, "learning_rate": 0.0002, "epoch": 0.9215099830248161, "step": 5700}, {"loss": 0.8735, "grad_norm": 0.5419785380363464, "learning_rate": 0.0002, "epoch": 0.9231266672055614, "step": 5710}, {"loss": 0.7378, "grad_norm": 0.5408698916435242, "learning_rate": 0.0002, "epoch": 0.9247433513863067, "step": 5720}, {"loss": 0.7701, "grad_norm": 0.5286232829093933, "learning_rate": 0.0002, "epoch": 0.9263600355670519, "step": 5730}, {"loss": 0.8242, "grad_norm": 0.7539758086204529, "learning_rate": 0.0002, "epoch": 0.9279767197477973, "step": 5740}, {"loss": 0.8118, "grad_norm": 0.5166944861412048, "learning_rate": 0.0002, "epoch": 0.9295934039285425, "step": 5750}, {"loss": 0.783, "grad_norm": 0.6601425409317017, "learning_rate": 0.0002, "epoch": 0.9312100881092878, "step": 5760}, {"loss": 0.7873, "grad_norm": 0.5029960870742798, "learning_rate": 0.0002, "epoch": 0.9328267722900332, "step": 5770}, {"loss": 0.7989, "grad_norm": 0.4926645755767822, "learning_rate": 0.0002, "epoch": 0.9344434564707784, "step": 5780}, {"loss": 0.8174, "grad_norm": 0.5739615559577942, "learning_rate": 0.0002, "epoch": 0.9360601406515238, "step": 5790}, {"loss": 0.8037, "grad_norm": 0.5058279037475586, "learning_rate": 0.0002, "epoch": 0.937676824832269, "step": 5800}, {"loss": 0.8537, "grad_norm": 0.5260962247848511, "learning_rate": 0.0002, "epoch": 0.9392935090130143, "step": 5810}, {"loss": 0.7486, "grad_norm": 0.5768588185310364, "learning_rate": 0.0002, "epoch": 0.9409101931937596, "step": 5820}, {"loss": 0.8215, "grad_norm": 0.5170126557350159, "learning_rate": 0.0002, "epoch": 0.9425268773745049, "step": 5830}, {"loss": 0.7422, "grad_norm": 0.5745864510536194, "learning_rate": 0.0002, "epoch": 0.9441435615552501, "step": 5840}, {"loss": 0.7824, "grad_norm": 0.5551357865333557, "learning_rate": 0.0002, "epoch": 0.9457602457359955, "step": 5850}, {"loss": 0.8529, "grad_norm": 0.5776078701019287, "learning_rate": 0.0002, "epoch": 0.9473769299167407, "step": 5860}, {"loss": 0.8527, "grad_norm": 0.5340062379837036, "learning_rate": 0.0002, "epoch": 0.9489936140974861, "step": 5870}, {"loss": 0.8217, "grad_norm": 0.6447290182113647, "learning_rate": 0.0002, "epoch": 0.9506102982782314, "step": 5880}, {"loss": 0.7945, "grad_norm": 0.5123815536499023, "learning_rate": 0.0002, "epoch": 0.9522269824589766, "step": 5890}, {"loss": 0.8209, "grad_norm": 0.48547613620758057, "learning_rate": 0.0002, "epoch": 0.953843666639722, "step": 5900}, {"loss": 0.7896, "grad_norm": 0.5791414976119995, "learning_rate": 0.0002, "epoch": 0.9554603508204672, "step": 5910}, {"loss": 0.8408, "grad_norm": 0.6195011734962463, "learning_rate": 0.0002, "epoch": 0.9570770350012126, "step": 5920}, {"loss": 0.7805, "grad_norm": 0.6323803067207336, "learning_rate": 0.0002, "epoch": 0.9586937191819578, "step": 5930}, {"loss": 0.8484, "grad_norm": 0.45552879571914673, "learning_rate": 0.0002, "epoch": 0.9603104033627031, "step": 5940}, {"loss": 0.7367, "grad_norm": 0.5796473622322083, "learning_rate": 0.0002, "epoch": 0.9619270875434484, "step": 5950}, {"loss": 0.7672, "grad_norm": 0.647261381149292, "learning_rate": 0.0002, "epoch": 0.9635437717241937, "step": 5960}, {"loss": 0.8086, "grad_norm": 0.5487682819366455, "learning_rate": 0.0002, "epoch": 0.9651604559049389, "step": 5970}, {"loss": 0.7973, "grad_norm": 0.5743663907051086, "learning_rate": 0.0002, "epoch": 0.9667771400856843, "step": 5980}, {"loss": 0.8153, "grad_norm": 0.5470591187477112, "learning_rate": 0.0002, "epoch": 0.9683938242664295, "step": 5990}, {"loss": 0.8119, "grad_norm": 0.5901660323143005, "learning_rate": 0.0002, "epoch": 0.9700105084471748, "step": 6000}, {"loss": 0.8147, "grad_norm": 0.6544759273529053, "learning_rate": 0.0002, "epoch": 0.9716271926279202, "step": 6010}, {"loss": 0.7536, "grad_norm": 0.6288470029830933, "learning_rate": 0.0002, "epoch": 0.9732438768086654, "step": 6020}, {"loss": 0.7989, "grad_norm": 0.673153817653656, "learning_rate": 0.0002, "epoch": 0.9748605609894108, "step": 6030}, {"loss": 0.7556, "grad_norm": 0.42854753136634827, "learning_rate": 0.0002, "epoch": 0.976477245170156, "step": 6040}, {"loss": 0.8006, "grad_norm": 0.5227066278457642, "learning_rate": 0.0002, "epoch": 0.9780939293509013, "step": 6050}, {"loss": 0.795, "grad_norm": 0.5372416973114014, "learning_rate": 0.0002, "epoch": 0.9797106135316466, "step": 6060}, {"loss": 0.7591, "grad_norm": 0.6026402115821838, "learning_rate": 0.0002, "epoch": 0.9813272977123919, "step": 6070}, {"loss": 0.8347, "grad_norm": 0.49547791481018066, "learning_rate": 0.0002, "epoch": 0.9829439818931371, "step": 6080}, {"loss": 0.7722, "grad_norm": 0.4641951322555542, "learning_rate": 0.0002, "epoch": 0.9845606660738825, "step": 6090}, {"loss": 0.8125, "grad_norm": 0.5818535089492798, "learning_rate": 0.0002, "epoch": 0.9861773502546277, "step": 6100}, {"loss": 0.81, "grad_norm": 0.63955157995224, "learning_rate": 0.0002, "epoch": 0.9877940344353731, "step": 6110}, {"loss": 0.7547, "grad_norm": 0.5649438500404358, "learning_rate": 0.0002, "epoch": 0.9894107186161183, "step": 6120}, {"loss": 0.7861, "grad_norm": 0.5290433168411255, "learning_rate": 0.0002, "epoch": 0.9910274027968636, "step": 6130}, {"loss": 0.8109, "grad_norm": 0.6399374008178711, "learning_rate": 0.0002, "epoch": 0.992644086977609, "step": 6140}, {"loss": 0.8373, "grad_norm": 0.6736576557159424, "learning_rate": 0.0002, "epoch": 0.9942607711583542, "step": 6150}, {"loss": 0.7915, "grad_norm": 0.515420138835907, "learning_rate": 0.0002, "epoch": 0.9958774553390995, "step": 6160}, {"loss": 0.8032, "grad_norm": 0.562677800655365, "learning_rate": 0.0002, "epoch": 0.9974941395198448, "step": 6170}, {"loss": 0.8187, "grad_norm": 0.7113858461380005, "learning_rate": 0.0002, "epoch": 0.9991108237005901, "step": 6180}]} +{"epoch": 2.0, "step": 12371, "epoch_duration": 16880.00914287567, "total_accumulated_duration": 33750.92989253998, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.6636, "grad_norm": 0.9894065856933594, "learning_rate": 0.0002, "epoch": 0.0016166841807452913, "step": 10}, {"loss": 1.1528, "grad_norm": 1.7810699939727783, "learning_rate": 0.0002, "epoch": 0.0032333683614905826, "step": 20}, {"loss": 0.9767, "grad_norm": 0.5969577431678772, "learning_rate": 0.0002, "epoch": 0.004850052542235874, "step": 30}, {"loss": 0.9772, "grad_norm": 0.6354120969772339, "learning_rate": 0.0002, "epoch": 0.006466736722981165, "step": 40}, {"loss": 0.8643, "grad_norm": 0.5604607462882996, "learning_rate": 0.0002, "epoch": 0.008083420903726457, "step": 50}, {"loss": 0.8841, "grad_norm": 0.4676193594932556, "learning_rate": 0.0002, "epoch": 0.009700105084471748, "step": 60}, {"loss": 0.9022, "grad_norm": 0.6099211573600769, "learning_rate": 0.0002, "epoch": 0.01131678926521704, "step": 70}, {"loss": 0.9133, "grad_norm": 0.48639994859695435, "learning_rate": 0.0002, "epoch": 0.01293347344596233, "step": 80}, {"loss": 0.8704, "grad_norm": 0.4904264509677887, "learning_rate": 0.0002, "epoch": 0.014550157626707623, "step": 90}, {"loss": 0.8855, "grad_norm": 2.8334362506866455, "learning_rate": 0.0002, "epoch": 0.016166841807452915, "step": 100}, {"loss": 0.8958, "grad_norm": 0.43221670389175415, "learning_rate": 0.0002, "epoch": 0.017783525988198205, "step": 110}, {"loss": 0.8412, "grad_norm": 0.42244166135787964, "learning_rate": 0.0002, "epoch": 0.019400210168943496, "step": 120}, {"loss": 0.8467, "grad_norm": 0.45363298058509827, "learning_rate": 0.0002, "epoch": 0.02101689434968879, "step": 130}, {"loss": 0.8641, "grad_norm": 0.44816508889198303, "learning_rate": 0.0002, "epoch": 0.02263357853043408, "step": 140}, {"loss": 0.8496, "grad_norm": 0.43308213353157043, "learning_rate": 0.0002, "epoch": 0.02425026271117937, "step": 150}, {"loss": 0.8213, "grad_norm": 0.4084763526916504, "learning_rate": 0.0002, "epoch": 0.02586694689192466, "step": 160}, {"loss": 0.8343, "grad_norm": 0.5363703966140747, "learning_rate": 0.0002, "epoch": 0.027483631072669955, "step": 170}, {"loss": 0.8558, "grad_norm": 0.4619699716567993, "learning_rate": 0.0002, "epoch": 0.029100315253415245, "step": 180}, {"loss": 0.8878, "grad_norm": 0.49069908261299133, "learning_rate": 0.0002, "epoch": 0.030716999434160536, "step": 190}, {"loss": 0.8867, "grad_norm": 0.4645835757255554, "learning_rate": 0.0002, "epoch": 0.03233368361490583, "step": 200}, {"loss": 0.8842, "grad_norm": 1.2411243915557861, "learning_rate": 0.0002, "epoch": 0.03395036779565112, "step": 210}, {"loss": 0.8245, "grad_norm": 0.5211851596832275, "learning_rate": 0.0002, "epoch": 0.03556705197639641, "step": 220}, {"loss": 0.8194, "grad_norm": 0.5253691673278809, "learning_rate": 0.0002, "epoch": 0.037183736157141704, "step": 230}, {"loss": 0.8856, "grad_norm": 0.4567478895187378, "learning_rate": 0.0002, "epoch": 0.03880042033788699, "step": 240}, {"loss": 0.838, "grad_norm": 0.5472128391265869, "learning_rate": 0.0002, "epoch": 0.040417104518632285, "step": 250}, {"loss": 0.8201, "grad_norm": 0.42978546023368835, "learning_rate": 0.0002, "epoch": 0.04203378869937758, "step": 260}, {"loss": 0.8334, "grad_norm": 0.601734459400177, "learning_rate": 0.0002, "epoch": 0.043650472880122866, "step": 270}, {"loss": 0.815, "grad_norm": 0.4286513328552246, "learning_rate": 0.0002, "epoch": 0.04526715706086816, "step": 280}, {"loss": 0.8758, "grad_norm": 0.5230861902236938, "learning_rate": 0.0002, "epoch": 0.046883841241613454, "step": 290}, {"loss": 0.8636, "grad_norm": 0.6504611968994141, "learning_rate": 0.0002, "epoch": 0.04850052542235874, "step": 300}, {"loss": 0.8102, "grad_norm": 0.43485215306282043, "learning_rate": 0.0002, "epoch": 0.050117209603104035, "step": 310}, {"loss": 0.8221, "grad_norm": 0.4717007875442505, "learning_rate": 0.0002, "epoch": 0.05173389378384932, "step": 320}, {"loss": 0.8469, "grad_norm": 0.4059787690639496, "learning_rate": 0.0002, "epoch": 0.053350577964594616, "step": 330}, {"loss": 0.8866, "grad_norm": 0.4366913437843323, "learning_rate": 0.0002, "epoch": 0.05496726214533991, "step": 340}, {"loss": 0.7976, "grad_norm": 0.4233848452568054, "learning_rate": 0.0002, "epoch": 0.0565839463260852, "step": 350}, {"loss": 0.8456, "grad_norm": 0.4209108352661133, "learning_rate": 0.0002, "epoch": 0.05820063050683049, "step": 360}, {"loss": 0.816, "grad_norm": 0.41637396812438965, "learning_rate": 0.0002, "epoch": 0.059817314687575784, "step": 370}, {"loss": 0.7976, "grad_norm": 0.46235376596450806, "learning_rate": 0.0002, "epoch": 0.06143399886832107, "step": 380}, {"loss": 0.7966, "grad_norm": 0.4013484716415405, "learning_rate": 0.0002, "epoch": 0.06305068304906636, "step": 390}, {"loss": 0.8253, "grad_norm": 0.47443896532058716, "learning_rate": 0.0002, "epoch": 0.06466736722981166, "step": 400}, {"loss": 0.8666, "grad_norm": 0.3942156434059143, "learning_rate": 0.0002, "epoch": 0.06628405141055695, "step": 410}, {"loss": 0.8402, "grad_norm": 0.4965320825576782, "learning_rate": 0.0002, "epoch": 0.06790073559130223, "step": 420}, {"loss": 0.8317, "grad_norm": 0.4304835796356201, "learning_rate": 0.0002, "epoch": 0.06951741977204753, "step": 430}, {"loss": 0.8528, "grad_norm": 0.511726975440979, "learning_rate": 0.0002, "epoch": 0.07113410395279282, "step": 440}, {"loss": 0.8675, "grad_norm": 0.4040689170360565, "learning_rate": 0.0002, "epoch": 0.07275078813353811, "step": 450}, {"loss": 0.8788, "grad_norm": 0.5402171015739441, "learning_rate": 0.0002, "epoch": 0.07436747231428341, "step": 460}, {"loss": 0.8737, "grad_norm": 0.4174517095088959, "learning_rate": 0.0002, "epoch": 0.0759841564950287, "step": 470}, {"loss": 0.7605, "grad_norm": 0.4306182265281677, "learning_rate": 0.0002, "epoch": 0.07760084067577398, "step": 480}, {"loss": 0.799, "grad_norm": 0.535210132598877, "learning_rate": 0.0002, "epoch": 0.07921752485651928, "step": 490}, {"loss": 0.7825, "grad_norm": 0.5339109897613525, "learning_rate": 0.0002, "epoch": 0.08083420903726457, "step": 500}, {"loss": 0.8985, "grad_norm": 0.45754891633987427, "learning_rate": 0.0002, "epoch": 0.08245089321800986, "step": 510}, {"loss": 0.8144, "grad_norm": 0.43820783495903015, "learning_rate": 0.0002, "epoch": 0.08406757739875516, "step": 520}, {"loss": 0.8001, "grad_norm": 0.4434749186038971, "learning_rate": 0.0002, "epoch": 0.08568426157950045, "step": 530}, {"loss": 0.7857, "grad_norm": 0.43111467361450195, "learning_rate": 0.0002, "epoch": 0.08730094576024573, "step": 540}, {"loss": 0.8418, "grad_norm": 0.4378940165042877, "learning_rate": 0.0002, "epoch": 0.08891762994099103, "step": 550}, {"loss": 0.8361, "grad_norm": 0.4772215187549591, "learning_rate": 0.0002, "epoch": 0.09053431412173632, "step": 560}, {"loss": 0.8268, "grad_norm": 0.6837629079818726, "learning_rate": 0.0002, "epoch": 0.09215099830248161, "step": 570}, {"loss": 0.8607, "grad_norm": 0.42241212725639343, "learning_rate": 0.0002, "epoch": 0.09376768248322691, "step": 580}, {"loss": 0.852, "grad_norm": 0.5165936350822449, "learning_rate": 0.0002, "epoch": 0.0953843666639722, "step": 590}, {"loss": 0.8664, "grad_norm": 0.48737478256225586, "learning_rate": 0.0002, "epoch": 0.09700105084471748, "step": 600}, {"loss": 0.8806, "grad_norm": 0.47419852018356323, "learning_rate": 0.0002, "epoch": 0.09861773502546278, "step": 610}, {"loss": 0.8254, "grad_norm": 0.4975486099720001, "learning_rate": 0.0002, "epoch": 0.10023441920620807, "step": 620}, {"loss": 0.8548, "grad_norm": 0.49123844504356384, "learning_rate": 0.0002, "epoch": 0.10185110338695336, "step": 630}, {"loss": 0.8911, "grad_norm": 0.6288952827453613, "learning_rate": 0.0002, "epoch": 0.10346778756769864, "step": 640}, {"loss": 0.827, "grad_norm": 0.4277345836162567, "learning_rate": 0.0002, "epoch": 0.10508447174844394, "step": 650}, {"loss": 0.7996, "grad_norm": 0.4021061956882477, "learning_rate": 0.0002, "epoch": 0.10670115592918923, "step": 660}, {"loss": 0.87, "grad_norm": 0.3492237329483032, "learning_rate": 0.0002, "epoch": 0.10831784010993452, "step": 670}, {"loss": 0.8698, "grad_norm": 0.4341012239456177, "learning_rate": 0.0002, "epoch": 0.10993452429067982, "step": 680}, {"loss": 0.781, "grad_norm": 0.7296304106712341, "learning_rate": 0.0002, "epoch": 0.1115512084714251, "step": 690}, {"loss": 0.8433, "grad_norm": 0.397494912147522, "learning_rate": 0.0002, "epoch": 0.1131678926521704, "step": 700}, {"loss": 0.827, "grad_norm": 0.396431028842926, "learning_rate": 0.0002, "epoch": 0.1147845768329157, "step": 710}, {"loss": 0.8379, "grad_norm": 0.48842838406562805, "learning_rate": 0.0002, "epoch": 0.11640126101366098, "step": 720}, {"loss": 0.8238, "grad_norm": 0.46322616934776306, "learning_rate": 0.0002, "epoch": 0.11801794519440627, "step": 730}, {"loss": 0.8041, "grad_norm": 0.47990912199020386, "learning_rate": 0.0002, "epoch": 0.11963462937515157, "step": 740}, {"loss": 0.82, "grad_norm": 0.4997142255306244, "learning_rate": 0.0002, "epoch": 0.12125131355589686, "step": 750}, {"loss": 0.7702, "grad_norm": 0.4040526747703552, "learning_rate": 0.0002, "epoch": 0.12286799773664214, "step": 760}, {"loss": 0.863, "grad_norm": 0.453095942735672, "learning_rate": 0.0002, "epoch": 0.12448468191738744, "step": 770}, {"loss": 0.8792, "grad_norm": 0.4636971950531006, "learning_rate": 0.0002, "epoch": 0.12610136609813272, "step": 780}, {"loss": 0.8112, "grad_norm": 0.4279276132583618, "learning_rate": 0.0002, "epoch": 0.12771805027887803, "step": 790}, {"loss": 0.8711, "grad_norm": 0.46212655305862427, "learning_rate": 0.0002, "epoch": 0.12933473445962332, "step": 800}, {"loss": 0.8368, "grad_norm": 0.43127650022506714, "learning_rate": 0.0002, "epoch": 0.1309514186403686, "step": 810}, {"loss": 0.8476, "grad_norm": 0.4201301336288452, "learning_rate": 0.0002, "epoch": 0.1325681028211139, "step": 820}, {"loss": 0.8078, "grad_norm": 0.42583167552948, "learning_rate": 0.0002, "epoch": 0.13418478700185918, "step": 830}, {"loss": 0.8219, "grad_norm": 0.4535622000694275, "learning_rate": 0.0002, "epoch": 0.13580147118260447, "step": 840}, {"loss": 0.8423, "grad_norm": 0.4116036891937256, "learning_rate": 0.0002, "epoch": 0.13741815536334978, "step": 850}, {"loss": 0.8466, "grad_norm": 0.45997580885887146, "learning_rate": 0.0002, "epoch": 0.13903483954409507, "step": 860}, {"loss": 0.8917, "grad_norm": 0.4487837255001068, "learning_rate": 0.0002, "epoch": 0.14065152372484035, "step": 870}, {"loss": 0.8217, "grad_norm": 0.43650057911872864, "learning_rate": 0.0002, "epoch": 0.14226820790558564, "step": 880}, {"loss": 0.8178, "grad_norm": 0.5335358381271362, "learning_rate": 0.0002, "epoch": 0.14388489208633093, "step": 890}, {"loss": 0.7957, "grad_norm": 0.5989000201225281, "learning_rate": 0.0002, "epoch": 0.14550157626707622, "step": 900}, {"loss": 0.8385, "grad_norm": 0.517179012298584, "learning_rate": 0.0002, "epoch": 0.14711826044782153, "step": 910}, {"loss": 0.8255, "grad_norm": 0.44435232877731323, "learning_rate": 0.0002, "epoch": 0.14873494462856682, "step": 920}, {"loss": 0.8305, "grad_norm": 0.42635923624038696, "learning_rate": 0.0002, "epoch": 0.1503516288093121, "step": 930}, {"loss": 0.8043, "grad_norm": 0.49603334069252014, "learning_rate": 0.0002, "epoch": 0.1519683129900574, "step": 940}, {"loss": 0.8377, "grad_norm": 0.40639808773994446, "learning_rate": 0.0002, "epoch": 0.15358499717080268, "step": 950}, {"loss": 0.8529, "grad_norm": 0.4850759208202362, "learning_rate": 0.0002, "epoch": 0.15520168135154797, "step": 960}, {"loss": 0.846, "grad_norm": 0.4427442252635956, "learning_rate": 0.0002, "epoch": 0.15681836553229328, "step": 970}, {"loss": 0.8705, "grad_norm": 0.3760930001735687, "learning_rate": 0.0002, "epoch": 0.15843504971303857, "step": 980}, {"loss": 0.8644, "grad_norm": 0.4794144332408905, "learning_rate": 0.0002, "epoch": 0.16005173389378385, "step": 990}, {"loss": 0.8002, "grad_norm": 0.45828768610954285, "learning_rate": 0.0002, "epoch": 0.16166841807452914, "step": 1000}, {"loss": 0.7658, "grad_norm": 0.6313053369522095, "learning_rate": 0.0002, "epoch": 0.16328510225527443, "step": 1010}, {"loss": 0.8047, "grad_norm": 0.45041006803512573, "learning_rate": 0.0002, "epoch": 0.16490178643601971, "step": 1020}, {"loss": 0.8423, "grad_norm": 0.441403865814209, "learning_rate": 0.0002, "epoch": 0.166518470616765, "step": 1030}, {"loss": 0.8475, "grad_norm": 0.8171296119689941, "learning_rate": 0.0002, "epoch": 0.16813515479751032, "step": 1040}, {"loss": 0.845, "grad_norm": 0.7137420773506165, "learning_rate": 0.0002, "epoch": 0.1697518389782556, "step": 1050}, {"loss": 0.8213, "grad_norm": 0.5236809849739075, "learning_rate": 0.0002, "epoch": 0.1713685231590009, "step": 1060}, {"loss": 0.8265, "grad_norm": 0.5021864175796509, "learning_rate": 0.0002, "epoch": 0.17298520733974618, "step": 1070}, {"loss": 0.8305, "grad_norm": 0.47347521781921387, "learning_rate": 0.0002, "epoch": 0.17460189152049146, "step": 1080}, {"loss": 0.8105, "grad_norm": 0.4631653428077698, "learning_rate": 0.0002, "epoch": 0.17621857570123675, "step": 1090}, {"loss": 0.8166, "grad_norm": 0.49169182777404785, "learning_rate": 0.0002, "epoch": 0.17783525988198207, "step": 1100}, {"loss": 0.8012, "grad_norm": 0.5019739270210266, "learning_rate": 0.0002, "epoch": 0.17945194406272735, "step": 1110}, {"loss": 0.8247, "grad_norm": 0.5100422501564026, "learning_rate": 0.0002, "epoch": 0.18106862824347264, "step": 1120}, {"loss": 0.8142, "grad_norm": 0.3888324499130249, "learning_rate": 0.0002, "epoch": 0.18268531242421793, "step": 1130}, {"loss": 0.8533, "grad_norm": 0.39765217900276184, "learning_rate": 0.0002, "epoch": 0.18430199660496321, "step": 1140}, {"loss": 0.8541, "grad_norm": 0.47190186381340027, "learning_rate": 0.0002, "epoch": 0.1859186807857085, "step": 1150}, {"loss": 0.8301, "grad_norm": 0.4464188814163208, "learning_rate": 0.0002, "epoch": 0.18753536496645382, "step": 1160}, {"loss": 0.8341, "grad_norm": 0.5153930187225342, "learning_rate": 0.0002, "epoch": 0.1891520491471991, "step": 1170}, {"loss": 0.8033, "grad_norm": 0.4779708683490753, "learning_rate": 0.0002, "epoch": 0.1907687333279444, "step": 1180}, {"loss": 0.8187, "grad_norm": 0.4834315776824951, "learning_rate": 0.0002, "epoch": 0.19238541750868968, "step": 1190}, {"loss": 0.7721, "grad_norm": 0.402357816696167, "learning_rate": 0.0002, "epoch": 0.19400210168943496, "step": 1200}, {"loss": 0.7941, "grad_norm": 0.45899084210395813, "learning_rate": 0.0002, "epoch": 0.19561878587018025, "step": 1210}, {"loss": 0.8353, "grad_norm": 0.5106529593467712, "learning_rate": 0.0002, "epoch": 0.19723547005092557, "step": 1220}, {"loss": 0.7816, "grad_norm": 0.45261722803115845, "learning_rate": 0.0002, "epoch": 0.19885215423167085, "step": 1230}, {"loss": 0.8068, "grad_norm": 0.4647127091884613, "learning_rate": 0.0002, "epoch": 0.20046883841241614, "step": 1240}, {"loss": 0.8239, "grad_norm": 0.4849368929862976, "learning_rate": 0.0002, "epoch": 0.20208552259316143, "step": 1250}, {"loss": 0.8514, "grad_norm": 0.4518061578273773, "learning_rate": 0.0002, "epoch": 0.2037022067739067, "step": 1260}, {"loss": 0.8158, "grad_norm": 0.49535325169563293, "learning_rate": 0.0002, "epoch": 0.205318890954652, "step": 1270}, {"loss": 0.8348, "grad_norm": 0.4835205376148224, "learning_rate": 0.0002, "epoch": 0.2069355751353973, "step": 1280}, {"loss": 0.8428, "grad_norm": 0.45308539271354675, "learning_rate": 0.0002, "epoch": 0.2085522593161426, "step": 1290}, {"loss": 0.7993, "grad_norm": 0.5369905233383179, "learning_rate": 0.0002, "epoch": 0.2101689434968879, "step": 1300}, {"loss": 0.8676, "grad_norm": 0.5031622052192688, "learning_rate": 0.0002, "epoch": 0.21178562767763318, "step": 1310}, {"loss": 0.7686, "grad_norm": 0.48010334372520447, "learning_rate": 0.0002, "epoch": 0.21340231185837846, "step": 1320}, {"loss": 0.806, "grad_norm": 0.4905701279640198, "learning_rate": 0.0002, "epoch": 0.21501899603912375, "step": 1330}, {"loss": 0.7885, "grad_norm": 0.43531742691993713, "learning_rate": 0.0002, "epoch": 0.21663568021986904, "step": 1340}, {"loss": 0.8191, "grad_norm": 0.44330692291259766, "learning_rate": 0.0002, "epoch": 0.21825236440061435, "step": 1350}, {"loss": 0.8205, "grad_norm": 0.5384416580200195, "learning_rate": 0.0002, "epoch": 0.21986904858135964, "step": 1360}, {"loss": 0.7726, "grad_norm": 0.4181833863258362, "learning_rate": 0.0002, "epoch": 0.22148573276210493, "step": 1370}, {"loss": 0.8311, "grad_norm": 0.523833692073822, "learning_rate": 0.0002, "epoch": 0.2231024169428502, "step": 1380}, {"loss": 0.7913, "grad_norm": 0.5528736710548401, "learning_rate": 0.0002, "epoch": 0.2247191011235955, "step": 1390}, {"loss": 0.8079, "grad_norm": 0.43515023589134216, "learning_rate": 0.0002, "epoch": 0.2263357853043408, "step": 1400}, {"loss": 0.8403, "grad_norm": 0.48809877038002014, "learning_rate": 0.0002, "epoch": 0.2279524694850861, "step": 1410}, {"loss": 0.8165, "grad_norm": 0.43591251969337463, "learning_rate": 0.0002, "epoch": 0.2295691536658314, "step": 1420}, {"loss": 0.8147, "grad_norm": 0.44625312089920044, "learning_rate": 0.0002, "epoch": 0.23118583784657668, "step": 1430}, {"loss": 0.8134, "grad_norm": 0.4390665292739868, "learning_rate": 0.0002, "epoch": 0.23280252202732196, "step": 1440}, {"loss": 0.8465, "grad_norm": 0.48496049642562866, "learning_rate": 0.0002, "epoch": 0.23441920620806725, "step": 1450}, {"loss": 0.775, "grad_norm": 0.45919957756996155, "learning_rate": 0.0002, "epoch": 0.23603589038881254, "step": 1460}, {"loss": 0.8659, "grad_norm": 0.5471845865249634, "learning_rate": 0.0002, "epoch": 0.23765257456955785, "step": 1470}, {"loss": 0.8164, "grad_norm": 0.47269317507743835, "learning_rate": 0.0002, "epoch": 0.23926925875030314, "step": 1480}, {"loss": 0.854, "grad_norm": 0.4930245578289032, "learning_rate": 0.0002, "epoch": 0.24088594293104842, "step": 1490}, {"loss": 0.8139, "grad_norm": 0.5605630278587341, "learning_rate": 0.0002, "epoch": 0.2425026271117937, "step": 1500}, {"loss": 0.8125, "grad_norm": 0.4435870945453644, "learning_rate": 0.0002, "epoch": 0.244119311292539, "step": 1510}, {"loss": 0.8123, "grad_norm": 0.4941999912261963, "learning_rate": 0.0002, "epoch": 0.24573599547328429, "step": 1520}, {"loss": 0.8427, "grad_norm": 0.5100624561309814, "learning_rate": 0.0002, "epoch": 0.24735267965402957, "step": 1530}, {"loss": 0.8405, "grad_norm": 0.4638267457485199, "learning_rate": 0.0002, "epoch": 0.2489693638347749, "step": 1540}, {"loss": 0.81, "grad_norm": 0.5071570873260498, "learning_rate": 0.0002, "epoch": 0.25058604801552015, "step": 1550}, {"loss": 0.7724, "grad_norm": 0.4291319251060486, "learning_rate": 0.0002, "epoch": 0.25220273219626543, "step": 1560}, {"loss": 0.7984, "grad_norm": 0.5388049483299255, "learning_rate": 0.0002, "epoch": 0.2538194163770108, "step": 1570}, {"loss": 0.8176, "grad_norm": 0.5083683729171753, "learning_rate": 0.0002, "epoch": 0.25543610055775606, "step": 1580}, {"loss": 0.843, "grad_norm": 0.4824463725090027, "learning_rate": 0.0002, "epoch": 0.25705278473850135, "step": 1590}, {"loss": 0.7996, "grad_norm": 0.41177722811698914, "learning_rate": 0.0002, "epoch": 0.25866946891924664, "step": 1600}, {"loss": 0.7772, "grad_norm": 0.5656219124794006, "learning_rate": 0.0002, "epoch": 0.2602861530999919, "step": 1610}, {"loss": 0.7955, "grad_norm": 0.41063204407691956, "learning_rate": 0.0002, "epoch": 0.2619028372807372, "step": 1620}, {"loss": 0.7998, "grad_norm": 0.4897061288356781, "learning_rate": 0.0002, "epoch": 0.2635195214614825, "step": 1630}, {"loss": 0.8198, "grad_norm": 0.4454376697540283, "learning_rate": 0.0002, "epoch": 0.2651362056422278, "step": 1640}, {"loss": 0.8684, "grad_norm": 0.4355238378047943, "learning_rate": 0.0002, "epoch": 0.26675288982297307, "step": 1650}, {"loss": 0.7801, "grad_norm": 0.458310067653656, "learning_rate": 0.0002, "epoch": 0.26836957400371836, "step": 1660}, {"loss": 0.7935, "grad_norm": 0.4752083718776703, "learning_rate": 0.0002, "epoch": 0.26998625818446365, "step": 1670}, {"loss": 0.8267, "grad_norm": 0.4666106402873993, "learning_rate": 0.0002, "epoch": 0.27160294236520893, "step": 1680}, {"loss": 0.8252, "grad_norm": 0.4213818609714508, "learning_rate": 0.0002, "epoch": 0.2732196265459543, "step": 1690}, {"loss": 0.8559, "grad_norm": 0.5768913626670837, "learning_rate": 0.0002, "epoch": 0.27483631072669956, "step": 1700}, {"loss": 0.7931, "grad_norm": 0.4209914803504944, "learning_rate": 0.0002, "epoch": 0.27645299490744485, "step": 1710}, {"loss": 0.8167, "grad_norm": 0.501909613609314, "learning_rate": 0.0002, "epoch": 0.27806967908819014, "step": 1720}, {"loss": 0.7832, "grad_norm": 0.5266261100769043, "learning_rate": 0.0002, "epoch": 0.2796863632689354, "step": 1730}, {"loss": 0.8102, "grad_norm": 0.43806859850883484, "learning_rate": 0.0002, "epoch": 0.2813030474496807, "step": 1740}, {"loss": 0.8157, "grad_norm": 0.46048814058303833, "learning_rate": 0.0002, "epoch": 0.282919731630426, "step": 1750}, {"loss": 0.8596, "grad_norm": 0.44972819089889526, "learning_rate": 0.0002, "epoch": 0.2845364158111713, "step": 1760}, {"loss": 0.8421, "grad_norm": 0.5114831328392029, "learning_rate": 0.0002, "epoch": 0.28615309999191657, "step": 1770}, {"loss": 0.8361, "grad_norm": 0.47931742668151855, "learning_rate": 0.0002, "epoch": 0.28776978417266186, "step": 1780}, {"loss": 0.8265, "grad_norm": 0.5092599987983704, "learning_rate": 0.0002, "epoch": 0.28938646835340714, "step": 1790}, {"loss": 0.8506, "grad_norm": 0.37581443786621094, "learning_rate": 0.0002, "epoch": 0.29100315253415243, "step": 1800}, {"loss": 0.7932, "grad_norm": 0.47097381949424744, "learning_rate": 0.0002, "epoch": 0.2926198367148977, "step": 1810}, {"loss": 0.7787, "grad_norm": 0.48300236463546753, "learning_rate": 0.0002, "epoch": 0.29423652089564306, "step": 1820}, {"loss": 0.8391, "grad_norm": 0.5600419640541077, "learning_rate": 0.0002, "epoch": 0.29585320507638835, "step": 1830}, {"loss": 0.8507, "grad_norm": 0.48555272817611694, "learning_rate": 0.0002, "epoch": 0.29746988925713364, "step": 1840}, {"loss": 0.7657, "grad_norm": 0.3752668499946594, "learning_rate": 0.0002, "epoch": 0.2990865734378789, "step": 1850}, {"loss": 0.7915, "grad_norm": 0.5328747034072876, "learning_rate": 0.0002, "epoch": 0.3007032576186242, "step": 1860}, {"loss": 0.8426, "grad_norm": 0.48716455698013306, "learning_rate": 0.0002, "epoch": 0.3023199417993695, "step": 1870}, {"loss": 0.8335, "grad_norm": 0.5011493563652039, "learning_rate": 0.0002, "epoch": 0.3039366259801148, "step": 1880}, {"loss": 0.852, "grad_norm": 0.46461427211761475, "learning_rate": 0.0002, "epoch": 0.30555331016086007, "step": 1890}, {"loss": 0.8478, "grad_norm": 0.36630210280418396, "learning_rate": 0.0002, "epoch": 0.30716999434160536, "step": 1900}, {"loss": 0.8162, "grad_norm": 0.4217296242713928, "learning_rate": 0.0002, "epoch": 0.30878667852235064, "step": 1910}, {"loss": 0.8128, "grad_norm": 0.4394875466823578, "learning_rate": 0.0002, "epoch": 0.31040336270309593, "step": 1920}, {"loss": 0.8471, "grad_norm": 0.6587965488433838, "learning_rate": 0.0002, "epoch": 0.3120200468838412, "step": 1930}, {"loss": 0.8565, "grad_norm": 0.5469298958778381, "learning_rate": 0.0002, "epoch": 0.31363673106458656, "step": 1940}, {"loss": 0.8236, "grad_norm": 0.4371595084667206, "learning_rate": 0.0002, "epoch": 0.31525341524533185, "step": 1950}, {"loss": 0.887, "grad_norm": 0.4809541404247284, "learning_rate": 0.0002, "epoch": 0.31687009942607713, "step": 1960}, {"loss": 0.7855, "grad_norm": 0.6061086654663086, "learning_rate": 0.0002, "epoch": 0.3184867836068224, "step": 1970}, {"loss": 0.7679, "grad_norm": 0.5342657566070557, "learning_rate": 0.0002, "epoch": 0.3201034677875677, "step": 1980}, {"loss": 0.7955, "grad_norm": 0.5057743787765503, "learning_rate": 0.0002, "epoch": 0.321720151968313, "step": 1990}, {"loss": 0.7774, "grad_norm": 0.528626024723053, "learning_rate": 0.0002, "epoch": 0.3233368361490583, "step": 2000}, {"loss": 0.8845, "grad_norm": 0.46742770075798035, "learning_rate": 0.0002, "epoch": 0.32495352032980357, "step": 2010}, {"loss": 0.8484, "grad_norm": 0.515101432800293, "learning_rate": 0.0002, "epoch": 0.32657020451054886, "step": 2020}, {"loss": 0.8139, "grad_norm": 0.41941216588020325, "learning_rate": 0.0002, "epoch": 0.32818688869129414, "step": 2030}, {"loss": 0.7637, "grad_norm": 0.49902522563934326, "learning_rate": 0.0002, "epoch": 0.32980357287203943, "step": 2040}, {"loss": 0.7822, "grad_norm": 0.4120897650718689, "learning_rate": 0.0002, "epoch": 0.3314202570527847, "step": 2050}, {"loss": 0.8057, "grad_norm": 0.45352041721343994, "learning_rate": 0.0002, "epoch": 0.33303694123353, "step": 2060}, {"loss": 0.7913, "grad_norm": 0.523199737071991, "learning_rate": 0.0002, "epoch": 0.33465362541427535, "step": 2070}, {"loss": 0.8036, "grad_norm": 0.4390358626842499, "learning_rate": 0.0002, "epoch": 0.33627030959502063, "step": 2080}, {"loss": 0.8145, "grad_norm": 0.6752901077270508, "learning_rate": 0.0002, "epoch": 0.3378869937757659, "step": 2090}, {"loss": 0.7807, "grad_norm": 0.547821044921875, "learning_rate": 0.0002, "epoch": 0.3395036779565112, "step": 2100}, {"loss": 0.8561, "grad_norm": 0.5161308646202087, "learning_rate": 0.0002, "epoch": 0.3411203621372565, "step": 2110}, {"loss": 0.7697, "grad_norm": 0.4565401077270508, "learning_rate": 0.0002, "epoch": 0.3427370463180018, "step": 2120}, {"loss": 0.7964, "grad_norm": 0.4666115939617157, "learning_rate": 0.0002, "epoch": 0.34435373049874707, "step": 2130}, {"loss": 0.8189, "grad_norm": 0.4090428352355957, "learning_rate": 0.0002, "epoch": 0.34597041467949236, "step": 2140}, {"loss": 0.8817, "grad_norm": 0.510845422744751, "learning_rate": 0.0002, "epoch": 0.34758709886023764, "step": 2150}, {"loss": 0.8398, "grad_norm": 0.42861923575401306, "learning_rate": 0.0002, "epoch": 0.34920378304098293, "step": 2160}, {"loss": 0.7716, "grad_norm": 0.4476332664489746, "learning_rate": 0.0002, "epoch": 0.3508204672217282, "step": 2170}, {"loss": 0.7845, "grad_norm": 0.6065791249275208, "learning_rate": 0.0002, "epoch": 0.3524371514024735, "step": 2180}, {"loss": 0.8187, "grad_norm": 0.42335066199302673, "learning_rate": 0.0002, "epoch": 0.35405383558321885, "step": 2190}, {"loss": 0.8239, "grad_norm": 0.5094629526138306, "learning_rate": 0.0002, "epoch": 0.35567051976396413, "step": 2200}, {"loss": 0.7807, "grad_norm": 0.5476373434066772, "learning_rate": 0.0002, "epoch": 0.3572872039447094, "step": 2210}, {"loss": 0.814, "grad_norm": 0.3911719024181366, "learning_rate": 0.0002, "epoch": 0.3589038881254547, "step": 2220}, {"loss": 0.8599, "grad_norm": 0.6599636077880859, "learning_rate": 0.0002, "epoch": 0.3605205723062, "step": 2230}, {"loss": 0.7482, "grad_norm": 0.40381914377212524, "learning_rate": 0.0002, "epoch": 0.3621372564869453, "step": 2240}, {"loss": 0.7772, "grad_norm": 0.4433908462524414, "learning_rate": 0.0002, "epoch": 0.36375394066769057, "step": 2250}, {"loss": 0.8503, "grad_norm": 0.578326940536499, "learning_rate": 0.0002, "epoch": 0.36537062484843585, "step": 2260}, {"loss": 0.8178, "grad_norm": 0.5734784007072449, "learning_rate": 0.0002, "epoch": 0.36698730902918114, "step": 2270}, {"loss": 0.8193, "grad_norm": 0.45555487275123596, "learning_rate": 0.0002, "epoch": 0.36860399320992643, "step": 2280}, {"loss": 0.7929, "grad_norm": 0.5666276216506958, "learning_rate": 0.0002, "epoch": 0.3702206773906717, "step": 2290}, {"loss": 0.8292, "grad_norm": 0.5461117625236511, "learning_rate": 0.0002, "epoch": 0.371837361571417, "step": 2300}, {"loss": 0.8204, "grad_norm": 0.6318911910057068, "learning_rate": 0.0002, "epoch": 0.3734540457521623, "step": 2310}, {"loss": 0.7964, "grad_norm": 0.493263304233551, "learning_rate": 0.0002, "epoch": 0.37507072993290763, "step": 2320}, {"loss": 0.8339, "grad_norm": 0.5888760089874268, "learning_rate": 0.0002, "epoch": 0.3766874141136529, "step": 2330}, {"loss": 0.7737, "grad_norm": 0.48671841621398926, "learning_rate": 0.0002, "epoch": 0.3783040982943982, "step": 2340}, {"loss": 0.8367, "grad_norm": 0.4385145306587219, "learning_rate": 0.0002, "epoch": 0.3799207824751435, "step": 2350}, {"loss": 0.812, "grad_norm": 0.5523318648338318, "learning_rate": 0.0002, "epoch": 0.3815374666558888, "step": 2360}, {"loss": 0.8351, "grad_norm": 0.7308220267295837, "learning_rate": 0.0002, "epoch": 0.38315415083663407, "step": 2370}, {"loss": 0.859, "grad_norm": 0.554214358329773, "learning_rate": 0.0002, "epoch": 0.38477083501737935, "step": 2380}, {"loss": 0.8146, "grad_norm": 0.5425800085067749, "learning_rate": 0.0002, "epoch": 0.38638751919812464, "step": 2390}, {"loss": 0.8282, "grad_norm": 0.48811158537864685, "learning_rate": 0.0002, "epoch": 0.3880042033788699, "step": 2400}, {"loss": 0.8074, "grad_norm": 0.49212366342544556, "learning_rate": 0.0002, "epoch": 0.3896208875596152, "step": 2410}, {"loss": 0.7991, "grad_norm": 0.5222218632698059, "learning_rate": 0.0002, "epoch": 0.3912375717403605, "step": 2420}, {"loss": 0.8182, "grad_norm": 0.4699819087982178, "learning_rate": 0.0002, "epoch": 0.3928542559211058, "step": 2430}, {"loss": 0.7919, "grad_norm": 0.46153587102890015, "learning_rate": 0.0002, "epoch": 0.39447094010185113, "step": 2440}, {"loss": 0.8111, "grad_norm": 0.4150611162185669, "learning_rate": 0.0002, "epoch": 0.3960876242825964, "step": 2450}, {"loss": 0.8589, "grad_norm": 0.5799614787101746, "learning_rate": 0.0002, "epoch": 0.3977043084633417, "step": 2460}, {"loss": 0.8085, "grad_norm": 0.56536865234375, "learning_rate": 0.0002, "epoch": 0.399320992644087, "step": 2470}, {"loss": 0.8022, "grad_norm": 0.5451247096061707, "learning_rate": 0.0002, "epoch": 0.4009376768248323, "step": 2480}, {"loss": 0.8217, "grad_norm": 0.5914521217346191, "learning_rate": 0.0002, "epoch": 0.40255436100557757, "step": 2490}, {"loss": 0.7859, "grad_norm": 0.4428117275238037, "learning_rate": 0.0002, "epoch": 0.40417104518632285, "step": 2500}, {"loss": 0.8054, "grad_norm": 0.48580947518348694, "learning_rate": 0.0002, "epoch": 0.40578772936706814, "step": 2510}, {"loss": 0.8405, "grad_norm": 0.436734676361084, "learning_rate": 0.0002, "epoch": 0.4074044135478134, "step": 2520}, {"loss": 0.8209, "grad_norm": 0.5752223134040833, "learning_rate": 0.0002, "epoch": 0.4090210977285587, "step": 2530}, {"loss": 0.8181, "grad_norm": 0.4271308183670044, "learning_rate": 0.0002, "epoch": 0.410637781909304, "step": 2540}, {"loss": 0.8058, "grad_norm": 0.46294718980789185, "learning_rate": 0.0002, "epoch": 0.4122544660900493, "step": 2550}, {"loss": 0.8473, "grad_norm": 0.49407583475112915, "learning_rate": 0.0002, "epoch": 0.4138711502707946, "step": 2560}, {"loss": 0.7881, "grad_norm": 0.4729035496711731, "learning_rate": 0.0002, "epoch": 0.4154878344515399, "step": 2570}, {"loss": 0.7834, "grad_norm": 0.4129747152328491, "learning_rate": 0.0002, "epoch": 0.4171045186322852, "step": 2580}, {"loss": 0.7859, "grad_norm": 0.5684236288070679, "learning_rate": 0.0002, "epoch": 0.4187212028130305, "step": 2590}, {"loss": 0.811, "grad_norm": 0.4862157106399536, "learning_rate": 0.0002, "epoch": 0.4203378869937758, "step": 2600}, {"loss": 0.7582, "grad_norm": 0.46567976474761963, "learning_rate": 0.0002, "epoch": 0.42195457117452106, "step": 2610}, {"loss": 0.7755, "grad_norm": 0.5710650682449341, "learning_rate": 0.0002, "epoch": 0.42357125535526635, "step": 2620}, {"loss": 0.8573, "grad_norm": 0.5660041570663452, "learning_rate": 0.0002, "epoch": 0.42518793953601164, "step": 2630}, {"loss": 0.7812, "grad_norm": 0.47944375872612, "learning_rate": 0.0002, "epoch": 0.4268046237167569, "step": 2640}, {"loss": 0.7459, "grad_norm": 0.537223756313324, "learning_rate": 0.0002, "epoch": 0.4284213078975022, "step": 2650}, {"loss": 0.8246, "grad_norm": 0.41669997572898865, "learning_rate": 0.0002, "epoch": 0.4300379920782475, "step": 2660}, {"loss": 0.7785, "grad_norm": 0.44727686047554016, "learning_rate": 0.0002, "epoch": 0.4316546762589928, "step": 2670}, {"loss": 0.8241, "grad_norm": 0.5600888729095459, "learning_rate": 0.0002, "epoch": 0.4332713604397381, "step": 2680}, {"loss": 0.7708, "grad_norm": 0.39820605516433716, "learning_rate": 0.0002, "epoch": 0.4348880446204834, "step": 2690}, {"loss": 0.8202, "grad_norm": 0.5637655854225159, "learning_rate": 0.0002, "epoch": 0.4365047288012287, "step": 2700}, {"loss": 0.855, "grad_norm": 0.6363666653633118, "learning_rate": 0.0002, "epoch": 0.438121412981974, "step": 2710}, {"loss": 0.8468, "grad_norm": 0.5656129121780396, "learning_rate": 0.0002, "epoch": 0.4397380971627193, "step": 2720}, {"loss": 0.7845, "grad_norm": 0.5600156188011169, "learning_rate": 0.0002, "epoch": 0.44135478134346456, "step": 2730}, {"loss": 0.8405, "grad_norm": 0.5506579875946045, "learning_rate": 0.0002, "epoch": 0.44297146552420985, "step": 2740}, {"loss": 0.7725, "grad_norm": 0.49878305196762085, "learning_rate": 0.0002, "epoch": 0.44458814970495514, "step": 2750}, {"loss": 0.8292, "grad_norm": 0.4569213092327118, "learning_rate": 0.0002, "epoch": 0.4462048338857004, "step": 2760}, {"loss": 0.8028, "grad_norm": 0.6056680083274841, "learning_rate": 0.0002, "epoch": 0.4478215180664457, "step": 2770}, {"loss": 0.8242, "grad_norm": 0.44474557042121887, "learning_rate": 0.0002, "epoch": 0.449438202247191, "step": 2780}, {"loss": 0.801, "grad_norm": 0.46055394411087036, "learning_rate": 0.0002, "epoch": 0.4510548864279363, "step": 2790}, {"loss": 0.7521, "grad_norm": 0.4904133379459381, "learning_rate": 0.0002, "epoch": 0.4526715706086816, "step": 2800}, {"loss": 0.8829, "grad_norm": 0.5647031664848328, "learning_rate": 0.0002, "epoch": 0.45428825478942686, "step": 2810}, {"loss": 0.8622, "grad_norm": 0.5759473443031311, "learning_rate": 0.0002, "epoch": 0.4559049389701722, "step": 2820}, {"loss": 0.7812, "grad_norm": 0.5161895751953125, "learning_rate": 0.0002, "epoch": 0.4575216231509175, "step": 2830}, {"loss": 0.8045, "grad_norm": 0.4248254597187042, "learning_rate": 0.0002, "epoch": 0.4591383073316628, "step": 2840}, {"loss": 0.7838, "grad_norm": 0.45395001769065857, "learning_rate": 0.0002, "epoch": 0.46075499151240806, "step": 2850}, {"loss": 0.8208, "grad_norm": 0.5358697772026062, "learning_rate": 0.0002, "epoch": 0.46237167569315335, "step": 2860}, {"loss": 0.8147, "grad_norm": 0.5379165410995483, "learning_rate": 0.0002, "epoch": 0.46398835987389864, "step": 2870}, {"loss": 0.7403, "grad_norm": 0.4601989686489105, "learning_rate": 0.0002, "epoch": 0.4656050440546439, "step": 2880}, {"loss": 0.8523, "grad_norm": 0.671115517616272, "learning_rate": 0.0002, "epoch": 0.4672217282353892, "step": 2890}, {"loss": 0.8262, "grad_norm": 0.4425133168697357, "learning_rate": 0.0002, "epoch": 0.4688384124161345, "step": 2900}, {"loss": 0.8178, "grad_norm": 0.5446155071258545, "learning_rate": 0.0002, "epoch": 0.4704550965968798, "step": 2910}, {"loss": 0.8106, "grad_norm": 0.603306233882904, "learning_rate": 0.0002, "epoch": 0.47207178077762507, "step": 2920}, {"loss": 0.8044, "grad_norm": 0.5377997159957886, "learning_rate": 0.0002, "epoch": 0.47368846495837036, "step": 2930}, {"loss": 0.8075, "grad_norm": 0.4931027591228485, "learning_rate": 0.0002, "epoch": 0.4753051491391157, "step": 2940}, {"loss": 0.8004, "grad_norm": 0.4711960256099701, "learning_rate": 0.0002, "epoch": 0.476921833319861, "step": 2950}, {"loss": 0.8121, "grad_norm": 0.5020492672920227, "learning_rate": 0.0002, "epoch": 0.4785385175006063, "step": 2960}, {"loss": 0.8221, "grad_norm": 0.5428946614265442, "learning_rate": 0.0002, "epoch": 0.48015520168135156, "step": 2970}, {"loss": 0.7849, "grad_norm": 0.5294089317321777, "learning_rate": 0.0002, "epoch": 0.48177188586209685, "step": 2980}, {"loss": 0.8553, "grad_norm": 0.648289144039154, "learning_rate": 0.0002, "epoch": 0.48338857004284214, "step": 2990}, {"loss": 0.7874, "grad_norm": 0.47916680574417114, "learning_rate": 0.0002, "epoch": 0.4850052542235874, "step": 3000}, {"loss": 0.8087, "grad_norm": 0.43849772214889526, "learning_rate": 0.0002, "epoch": 0.4866219384043327, "step": 3010}, {"loss": 0.7662, "grad_norm": 0.47007861733436584, "learning_rate": 0.0002, "epoch": 0.488238622585078, "step": 3020}, {"loss": 0.757, "grad_norm": 0.6314331293106079, "learning_rate": 0.0002, "epoch": 0.4898553067658233, "step": 3030}, {"loss": 0.7863, "grad_norm": 0.49211493134498596, "learning_rate": 0.0002, "epoch": 0.49147199094656857, "step": 3040}, {"loss": 0.8335, "grad_norm": 0.4537973403930664, "learning_rate": 0.0002, "epoch": 0.49308867512731386, "step": 3050}, {"loss": 0.8095, "grad_norm": 0.47326919436454773, "learning_rate": 0.0002, "epoch": 0.49470535930805914, "step": 3060}, {"loss": 0.8447, "grad_norm": 0.525874137878418, "learning_rate": 0.0002, "epoch": 0.4963220434888045, "step": 3070}, {"loss": 0.8339, "grad_norm": 0.6361091732978821, "learning_rate": 0.0002, "epoch": 0.4979387276695498, "step": 3080}, {"loss": 0.821, "grad_norm": 0.5850642919540405, "learning_rate": 0.0002, "epoch": 0.49955541185029506, "step": 3090}, {"loss": 0.8279, "grad_norm": 0.47299543023109436, "learning_rate": 0.0002, "epoch": 0.5011720960310403, "step": 3100}, {"loss": 0.8681, "grad_norm": 0.473099946975708, "learning_rate": 0.0002, "epoch": 0.5027887802117856, "step": 3110}, {"loss": 0.8223, "grad_norm": 0.48186397552490234, "learning_rate": 0.0002, "epoch": 0.5044054643925309, "step": 3120}, {"loss": 0.8292, "grad_norm": 0.5015401840209961, "learning_rate": 0.0002, "epoch": 0.5060221485732762, "step": 3130}, {"loss": 0.7692, "grad_norm": 0.5617750287055969, "learning_rate": 0.0002, "epoch": 0.5076388327540216, "step": 3140}, {"loss": 0.8708, "grad_norm": 0.5169327259063721, "learning_rate": 0.0002, "epoch": 0.5092555169347668, "step": 3150}, {"loss": 0.7845, "grad_norm": 0.545657753944397, "learning_rate": 0.0002, "epoch": 0.5108722011155121, "step": 3160}, {"loss": 0.799, "grad_norm": 0.512864351272583, "learning_rate": 0.0002, "epoch": 0.5124888852962574, "step": 3170}, {"loss": 0.7794, "grad_norm": 0.4113546311855316, "learning_rate": 0.0002, "epoch": 0.5141055694770027, "step": 3180}, {"loss": 0.8206, "grad_norm": 0.44532445073127747, "learning_rate": 0.0002, "epoch": 0.5157222536577479, "step": 3190}, {"loss": 0.8213, "grad_norm": 0.5623497366905212, "learning_rate": 0.0002, "epoch": 0.5173389378384933, "step": 3200}, {"loss": 0.7928, "grad_norm": 0.5084741115570068, "learning_rate": 0.0002, "epoch": 0.5189556220192385, "step": 3210}, {"loss": 0.8174, "grad_norm": 0.5305403470993042, "learning_rate": 0.0002, "epoch": 0.5205723061999838, "step": 3220}, {"loss": 0.8139, "grad_norm": 0.4708254337310791, "learning_rate": 0.0002, "epoch": 0.5221889903807291, "step": 3230}, {"loss": 0.7639, "grad_norm": 0.43827131390571594, "learning_rate": 0.0002, "epoch": 0.5238056745614744, "step": 3240}, {"loss": 0.7993, "grad_norm": 0.5630002617835999, "learning_rate": 0.0002, "epoch": 0.5254223587422197, "step": 3250}, {"loss": 0.7522, "grad_norm": 0.5010961890220642, "learning_rate": 0.0002, "epoch": 0.527039042922965, "step": 3260}, {"loss": 0.8374, "grad_norm": 0.6303122043609619, "learning_rate": 0.0002, "epoch": 0.5286557271037103, "step": 3270}, {"loss": 0.7727, "grad_norm": 0.5107331275939941, "learning_rate": 0.0002, "epoch": 0.5302724112844556, "step": 3280}, {"loss": 0.8495, "grad_norm": 0.5700443387031555, "learning_rate": 0.0002, "epoch": 0.5318890954652009, "step": 3290}, {"loss": 0.7776, "grad_norm": 0.46296367049217224, "learning_rate": 0.0002, "epoch": 0.5335057796459461, "step": 3300}, {"loss": 0.7931, "grad_norm": 0.531568706035614, "learning_rate": 0.0002, "epoch": 0.5351224638266915, "step": 3310}, {"loss": 0.843, "grad_norm": 0.4686741530895233, "learning_rate": 0.0002, "epoch": 0.5367391480074367, "step": 3320}, {"loss": 0.8104, "grad_norm": 0.5404331088066101, "learning_rate": 0.0002, "epoch": 0.5383558321881821, "step": 3330}, {"loss": 0.7686, "grad_norm": 0.6368790864944458, "learning_rate": 0.0002, "epoch": 0.5399725163689273, "step": 3340}, {"loss": 0.8514, "grad_norm": 0.42300888895988464, "learning_rate": 0.0002, "epoch": 0.5415892005496726, "step": 3350}, {"loss": 0.8236, "grad_norm": 0.5362542867660522, "learning_rate": 0.0002, "epoch": 0.5432058847304179, "step": 3360}, {"loss": 0.858, "grad_norm": 0.497128963470459, "learning_rate": 0.0002, "epoch": 0.5448225689111632, "step": 3370}, {"loss": 0.8519, "grad_norm": 0.5006386041641235, "learning_rate": 0.0002, "epoch": 0.5464392530919085, "step": 3380}, {"loss": 0.7867, "grad_norm": 0.44136837124824524, "learning_rate": 0.0002, "epoch": 0.5480559372726538, "step": 3390}, {"loss": 0.773, "grad_norm": 0.5897833108901978, "learning_rate": 0.0002, "epoch": 0.5496726214533991, "step": 3400}, {"loss": 0.8895, "grad_norm": 0.641075611114502, "learning_rate": 0.0002, "epoch": 0.5512893056341444, "step": 3410}, {"loss": 0.7827, "grad_norm": 0.7251322269439697, "learning_rate": 0.0002, "epoch": 0.5529059898148897, "step": 3420}, {"loss": 0.7626, "grad_norm": 0.47411349415779114, "learning_rate": 0.0002, "epoch": 0.5545226739956349, "step": 3430}, {"loss": 0.8196, "grad_norm": 0.4994310438632965, "learning_rate": 0.0002, "epoch": 0.5561393581763803, "step": 3440}, {"loss": 0.7812, "grad_norm": 0.5814438462257385, "learning_rate": 0.0002, "epoch": 0.5577560423571255, "step": 3450}, {"loss": 0.8805, "grad_norm": 0.6278898119926453, "learning_rate": 0.0002, "epoch": 0.5593727265378708, "step": 3460}, {"loss": 0.813, "grad_norm": 0.46208274364471436, "learning_rate": 0.0002, "epoch": 0.5609894107186161, "step": 3470}, {"loss": 0.8295, "grad_norm": 0.5718930959701538, "learning_rate": 0.0002, "epoch": 0.5626060948993614, "step": 3480}, {"loss": 0.8152, "grad_norm": 0.48178744316101074, "learning_rate": 0.0002, "epoch": 0.5642227790801067, "step": 3490}, {"loss": 0.8244, "grad_norm": 0.47336965799331665, "learning_rate": 0.0002, "epoch": 0.565839463260852, "step": 3500}, {"loss": 0.8099, "grad_norm": 0.43442684412002563, "learning_rate": 0.0002, "epoch": 0.5674561474415973, "step": 3510}, {"loss": 0.7564, "grad_norm": 0.6463358998298645, "learning_rate": 0.0002, "epoch": 0.5690728316223426, "step": 3520}, {"loss": 0.836, "grad_norm": 0.5286486744880676, "learning_rate": 0.0002, "epoch": 0.5706895158030879, "step": 3530}, {"loss": 0.8421, "grad_norm": 0.5405499935150146, "learning_rate": 0.0002, "epoch": 0.5723061999838331, "step": 3540}, {"loss": 0.7614, "grad_norm": 0.6654391884803772, "learning_rate": 0.0002, "epoch": 0.5739228841645785, "step": 3550}, {"loss": 0.7803, "grad_norm": 0.5081980228424072, "learning_rate": 0.0002, "epoch": 0.5755395683453237, "step": 3560}, {"loss": 0.7753, "grad_norm": 0.48978179693222046, "learning_rate": 0.0002, "epoch": 0.5771562525260691, "step": 3570}, {"loss": 0.8151, "grad_norm": 0.5840612053871155, "learning_rate": 0.0002, "epoch": 0.5787729367068143, "step": 3580}, {"loss": 0.8937, "grad_norm": 0.5235261917114258, "learning_rate": 0.0002, "epoch": 0.5803896208875596, "step": 3590}, {"loss": 0.7894, "grad_norm": 0.5672075748443604, "learning_rate": 0.0002, "epoch": 0.5820063050683049, "step": 3600}, {"loss": 0.8347, "grad_norm": 0.5613429546356201, "learning_rate": 0.0002, "epoch": 0.5836229892490502, "step": 3610}, {"loss": 0.8274, "grad_norm": 0.4032273590564728, "learning_rate": 0.0002, "epoch": 0.5852396734297954, "step": 3620}, {"loss": 0.8421, "grad_norm": 0.49559324979782104, "learning_rate": 0.0002, "epoch": 0.5868563576105408, "step": 3630}, {"loss": 0.8332, "grad_norm": 0.6895697712898254, "learning_rate": 0.0002, "epoch": 0.5884730417912861, "step": 3640}, {"loss": 0.7877, "grad_norm": 0.4750136435031891, "learning_rate": 0.0002, "epoch": 0.5900897259720314, "step": 3650}, {"loss": 0.8219, "grad_norm": 0.5176819562911987, "learning_rate": 0.0002, "epoch": 0.5917064101527767, "step": 3660}, {"loss": 0.8151, "grad_norm": 0.5817760229110718, "learning_rate": 0.0002, "epoch": 0.5933230943335219, "step": 3670}, {"loss": 0.7823, "grad_norm": 0.6064626574516296, "learning_rate": 0.0002, "epoch": 0.5949397785142673, "step": 3680}, {"loss": 0.8422, "grad_norm": 0.6728700995445251, "learning_rate": 0.0002, "epoch": 0.5965564626950125, "step": 3690}, {"loss": 0.7679, "grad_norm": 0.609305202960968, "learning_rate": 0.0002, "epoch": 0.5981731468757578, "step": 3700}, {"loss": 0.8048, "grad_norm": 0.4615488350391388, "learning_rate": 0.0002, "epoch": 0.5997898310565031, "step": 3710}, {"loss": 0.8214, "grad_norm": 2.0531179904937744, "learning_rate": 0.0002, "epoch": 0.6014065152372484, "step": 3720}, {"loss": 0.8158, "grad_norm": 0.5091132521629333, "learning_rate": 0.0002, "epoch": 0.6030231994179936, "step": 3730}, {"loss": 0.7833, "grad_norm": 0.5951124429702759, "learning_rate": 0.0002, "epoch": 0.604639883598739, "step": 3740}, {"loss": 0.7784, "grad_norm": 0.5870208144187927, "learning_rate": 0.0002, "epoch": 0.6062565677794842, "step": 3750}, {"loss": 0.8044, "grad_norm": 0.6254619359970093, "learning_rate": 0.0002, "epoch": 0.6078732519602296, "step": 3760}, {"loss": 0.7868, "grad_norm": 0.5577626824378967, "learning_rate": 0.0002, "epoch": 0.6094899361409749, "step": 3770}, {"loss": 0.8108, "grad_norm": 0.5004405379295349, "learning_rate": 0.0002, "epoch": 0.6111066203217201, "step": 3780}, {"loss": 0.8092, "grad_norm": 0.5527383685112, "learning_rate": 0.0002, "epoch": 0.6127233045024655, "step": 3790}, {"loss": 0.8036, "grad_norm": 0.49116113781929016, "learning_rate": 0.0002, "epoch": 0.6143399886832107, "step": 3800}, {"loss": 0.8352, "grad_norm": 0.5299299359321594, "learning_rate": 0.0002, "epoch": 0.6159566728639561, "step": 3810}, {"loss": 0.7737, "grad_norm": 0.464897483587265, "learning_rate": 0.0002, "epoch": 0.6175733570447013, "step": 3820}, {"loss": 0.7923, "grad_norm": 0.6505740880966187, "learning_rate": 0.0002, "epoch": 0.6191900412254466, "step": 3830}, {"loss": 0.8123, "grad_norm": 0.5512559413909912, "learning_rate": 0.0002, "epoch": 0.6208067254061919, "step": 3840}, {"loss": 0.8856, "grad_norm": 0.49427518248558044, "learning_rate": 0.0002, "epoch": 0.6224234095869372, "step": 3850}, {"loss": 0.7751, "grad_norm": 0.3839147090911865, "learning_rate": 0.0002, "epoch": 0.6240400937676824, "step": 3860}, {"loss": 0.8006, "grad_norm": 0.5760218501091003, "learning_rate": 0.0002, "epoch": 0.6256567779484278, "step": 3870}, {"loss": 0.7836, "grad_norm": 0.7226507067680359, "learning_rate": 0.0002, "epoch": 0.6272734621291731, "step": 3880}, {"loss": 0.8244, "grad_norm": 0.676781415939331, "learning_rate": 0.0002, "epoch": 0.6288901463099184, "step": 3890}, {"loss": 0.8239, "grad_norm": 0.4284018278121948, "learning_rate": 0.0002, "epoch": 0.6305068304906637, "step": 3900}, {"loss": 0.7996, "grad_norm": 0.5060628056526184, "learning_rate": 0.0002, "epoch": 0.6321235146714089, "step": 3910}, {"loss": 0.8089, "grad_norm": 0.5524522066116333, "learning_rate": 0.0002, "epoch": 0.6337401988521543, "step": 3920}, {"loss": 0.8276, "grad_norm": 0.6099881529808044, "learning_rate": 0.0002, "epoch": 0.6353568830328995, "step": 3930}, {"loss": 0.809, "grad_norm": 0.43155938386917114, "learning_rate": 0.0002, "epoch": 0.6369735672136448, "step": 3940}, {"loss": 0.8404, "grad_norm": 0.6427084803581238, "learning_rate": 0.0002, "epoch": 0.6385902513943901, "step": 3950}, {"loss": 0.8368, "grad_norm": 0.541220486164093, "learning_rate": 0.0002, "epoch": 0.6402069355751354, "step": 3960}, {"loss": 0.8539, "grad_norm": 0.5414294600486755, "learning_rate": 0.0002, "epoch": 0.6418236197558806, "step": 3970}, {"loss": 0.7996, "grad_norm": 0.46344003081321716, "learning_rate": 0.0002, "epoch": 0.643440303936626, "step": 3980}, {"loss": 0.7474, "grad_norm": 0.45209285616874695, "learning_rate": 0.0002, "epoch": 0.6450569881173712, "step": 3990}, {"loss": 0.8202, "grad_norm": 0.5417284369468689, "learning_rate": 0.0002, "epoch": 0.6466736722981166, "step": 4000}, {"loss": 0.7563, "grad_norm": 0.7995685935020447, "learning_rate": 0.0002, "epoch": 0.6482903564788619, "step": 4010}, {"loss": 0.7812, "grad_norm": 0.6384002566337585, "learning_rate": 0.0002, "epoch": 0.6499070406596071, "step": 4020}, {"loss": 0.732, "grad_norm": 0.4472815692424774, "learning_rate": 0.0002, "epoch": 0.6515237248403525, "step": 4030}, {"loss": 0.8071, "grad_norm": 0.6834294199943542, "learning_rate": 0.0002, "epoch": 0.6531404090210977, "step": 4040}, {"loss": 0.7812, "grad_norm": 0.4612339735031128, "learning_rate": 0.0002, "epoch": 0.654757093201843, "step": 4050}, {"loss": 0.8141, "grad_norm": 0.9266576170921326, "learning_rate": 0.0002, "epoch": 0.6563737773825883, "step": 4060}, {"loss": 0.7991, "grad_norm": 0.4470861852169037, "learning_rate": 0.0002, "epoch": 0.6579904615633336, "step": 4070}, {"loss": 0.8293, "grad_norm": 0.45544925332069397, "learning_rate": 0.0002, "epoch": 0.6596071457440789, "step": 4080}, {"loss": 0.8455, "grad_norm": 0.6144481301307678, "learning_rate": 0.0002, "epoch": 0.6612238299248242, "step": 4090}, {"loss": 0.7877, "grad_norm": 0.5936288237571716, "learning_rate": 0.0002, "epoch": 0.6628405141055694, "step": 4100}, {"loss": 0.7617, "grad_norm": 0.4822963774204254, "learning_rate": 0.0002, "epoch": 0.6644571982863148, "step": 4110}, {"loss": 0.7997, "grad_norm": 0.48432496190071106, "learning_rate": 0.0002, "epoch": 0.66607388246706, "step": 4120}, {"loss": 0.8404, "grad_norm": 0.4901607930660248, "learning_rate": 0.0002, "epoch": 0.6676905666478054, "step": 4130}, {"loss": 0.8085, "grad_norm": 0.5018393397331238, "learning_rate": 0.0002, "epoch": 0.6693072508285507, "step": 4140}, {"loss": 0.8065, "grad_norm": 0.6946378946304321, "learning_rate": 0.0002, "epoch": 0.6709239350092959, "step": 4150}, {"loss": 0.8147, "grad_norm": 0.5997390747070312, "learning_rate": 0.0002, "epoch": 0.6725406191900413, "step": 4160}, {"loss": 0.8268, "grad_norm": 0.6738849878311157, "learning_rate": 0.0002, "epoch": 0.6741573033707865, "step": 4170}, {"loss": 0.7704, "grad_norm": 0.6110581159591675, "learning_rate": 0.0002, "epoch": 0.6757739875515318, "step": 4180}, {"loss": 0.8043, "grad_norm": 0.5703322291374207, "learning_rate": 0.0002, "epoch": 0.6773906717322771, "step": 4190}, {"loss": 0.8099, "grad_norm": 0.4686066210269928, "learning_rate": 0.0002, "epoch": 0.6790073559130224, "step": 4200}, {"loss": 0.8441, "grad_norm": 0.6394643783569336, "learning_rate": 0.0002, "epoch": 0.6806240400937676, "step": 4210}, {"loss": 0.8011, "grad_norm": 0.5454841256141663, "learning_rate": 0.0002, "epoch": 0.682240724274513, "step": 4220}, {"loss": 0.8307, "grad_norm": 0.4859732985496521, "learning_rate": 0.0002, "epoch": 0.6838574084552582, "step": 4230}, {"loss": 0.8161, "grad_norm": 0.5544065833091736, "learning_rate": 0.0002, "epoch": 0.6854740926360036, "step": 4240}, {"loss": 0.7839, "grad_norm": 0.4902505576610565, "learning_rate": 0.0002, "epoch": 0.6870907768167488, "step": 4250}, {"loss": 0.7977, "grad_norm": 0.4768051505088806, "learning_rate": 0.0002, "epoch": 0.6887074609974941, "step": 4260}, {"loss": 0.7539, "grad_norm": 0.49982190132141113, "learning_rate": 0.0002, "epoch": 0.6903241451782395, "step": 4270}, {"loss": 0.7353, "grad_norm": 0.6351838111877441, "learning_rate": 0.0002, "epoch": 0.6919408293589847, "step": 4280}, {"loss": 0.7664, "grad_norm": 0.5647561550140381, "learning_rate": 0.0002, "epoch": 0.69355751353973, "step": 4290}, {"loss": 0.7618, "grad_norm": 0.5340486764907837, "learning_rate": 0.0002, "epoch": 0.6951741977204753, "step": 4300}, {"loss": 0.8526, "grad_norm": 0.5649092793464661, "learning_rate": 0.0002, "epoch": 0.6967908819012206, "step": 4310}, {"loss": 0.8246, "grad_norm": 0.6183916926383972, "learning_rate": 0.0002, "epoch": 0.6984075660819659, "step": 4320}, {"loss": 0.792, "grad_norm": 0.6154509782791138, "learning_rate": 0.0002, "epoch": 0.7000242502627112, "step": 4330}, {"loss": 0.8397, "grad_norm": 0.5156264305114746, "learning_rate": 0.0002, "epoch": 0.7016409344434564, "step": 4340}, {"loss": 0.8512, "grad_norm": 0.562171459197998, "learning_rate": 0.0002, "epoch": 0.7032576186242018, "step": 4350}, {"loss": 0.7882, "grad_norm": 0.4949502646923065, "learning_rate": 0.0002, "epoch": 0.704874302804947, "step": 4360}, {"loss": 0.738, "grad_norm": 0.5171684622764587, "learning_rate": 0.0002, "epoch": 0.7064909869856923, "step": 4370}, {"loss": 0.8001, "grad_norm": 0.6198443174362183, "learning_rate": 0.0002, "epoch": 0.7081076711664377, "step": 4380}, {"loss": 0.7606, "grad_norm": 0.5802276134490967, "learning_rate": 0.0002, "epoch": 0.7097243553471829, "step": 4390}, {"loss": 0.8797, "grad_norm": 0.41096967458724976, "learning_rate": 0.0002, "epoch": 0.7113410395279283, "step": 4400}, {"loss": 0.805, "grad_norm": 0.4397392272949219, "learning_rate": 0.0002, "epoch": 0.7129577237086735, "step": 4410}, {"loss": 0.7651, "grad_norm": 0.45228442549705505, "learning_rate": 0.0002, "epoch": 0.7145744078894188, "step": 4420}, {"loss": 0.7938, "grad_norm": 0.4839673936367035, "learning_rate": 0.0002, "epoch": 0.7161910920701641, "step": 4430}, {"loss": 0.8362, "grad_norm": 0.6140755414962769, "learning_rate": 0.0002, "epoch": 0.7178077762509094, "step": 4440}, {"loss": 0.7722, "grad_norm": 0.6841378808021545, "learning_rate": 0.0002, "epoch": 0.7194244604316546, "step": 4450}, {"loss": 0.8177, "grad_norm": 0.6664239168167114, "learning_rate": 0.0002, "epoch": 0.7210411446124, "step": 4460}, {"loss": 0.7983, "grad_norm": 0.47552719712257385, "learning_rate": 0.0002, "epoch": 0.7226578287931452, "step": 4470}, {"loss": 0.8982, "grad_norm": 0.6649776101112366, "learning_rate": 0.0002, "epoch": 0.7242745129738906, "step": 4480}, {"loss": 0.8074, "grad_norm": 0.5159541964530945, "learning_rate": 0.0002, "epoch": 0.7258911971546358, "step": 4490}, {"loss": 0.7786, "grad_norm": 0.6693112850189209, "learning_rate": 0.0002, "epoch": 0.7275078813353811, "step": 4500}, {"loss": 0.8655, "grad_norm": 0.48870977759361267, "learning_rate": 0.0002, "epoch": 0.7291245655161265, "step": 4510}, {"loss": 0.7337, "grad_norm": 0.4857887923717499, "learning_rate": 0.0002, "epoch": 0.7307412496968717, "step": 4520}, {"loss": 0.8026, "grad_norm": 0.5515662431716919, "learning_rate": 0.0002, "epoch": 0.732357933877617, "step": 4530}, {"loss": 0.8031, "grad_norm": 0.6292222738265991, "learning_rate": 0.0002, "epoch": 0.7339746180583623, "step": 4540}, {"loss": 0.7749, "grad_norm": 0.48265689611434937, "learning_rate": 0.0002, "epoch": 0.7355913022391076, "step": 4550}, {"loss": 0.8499, "grad_norm": 0.8044266104698181, "learning_rate": 0.0002, "epoch": 0.7372079864198529, "step": 4560}, {"loss": 0.8162, "grad_norm": 0.6111769676208496, "learning_rate": 0.0002, "epoch": 0.7388246706005982, "step": 4570}, {"loss": 0.7291, "grad_norm": 0.5229553580284119, "learning_rate": 0.0002, "epoch": 0.7404413547813434, "step": 4580}, {"loss": 0.8038, "grad_norm": 0.6054152250289917, "learning_rate": 0.0002, "epoch": 0.7420580389620888, "step": 4590}, {"loss": 0.8169, "grad_norm": 0.5574966669082642, "learning_rate": 0.0002, "epoch": 0.743674723142834, "step": 4600}, {"loss": 0.8439, "grad_norm": 0.5395817160606384, "learning_rate": 0.0002, "epoch": 0.7452914073235793, "step": 4610}, {"loss": 0.8495, "grad_norm": 0.7116472721099854, "learning_rate": 0.0002, "epoch": 0.7469080915043246, "step": 4620}, {"loss": 0.7743, "grad_norm": 0.5618700981140137, "learning_rate": 0.0002, "epoch": 0.7485247756850699, "step": 4630}, {"loss": 0.7744, "grad_norm": 0.5802770853042603, "learning_rate": 0.0002, "epoch": 0.7501414598658153, "step": 4640}, {"loss": 0.7924, "grad_norm": 0.5690428018569946, "learning_rate": 0.0002, "epoch": 0.7517581440465605, "step": 4650}, {"loss": 0.8017, "grad_norm": 0.4813360273838043, "learning_rate": 0.0002, "epoch": 0.7533748282273058, "step": 4660}, {"loss": 0.8108, "grad_norm": 0.5434042811393738, "learning_rate": 0.0002, "epoch": 0.7549915124080511, "step": 4670}, {"loss": 0.7824, "grad_norm": 0.5502099990844727, "learning_rate": 0.0002, "epoch": 0.7566081965887964, "step": 4680}, {"loss": 0.8598, "grad_norm": 0.6020621061325073, "learning_rate": 0.0002, "epoch": 0.7582248807695416, "step": 4690}, {"loss": 0.7937, "grad_norm": 0.4922301471233368, "learning_rate": 0.0002, "epoch": 0.759841564950287, "step": 4700}, {"loss": 0.788, "grad_norm": 0.6492828726768494, "learning_rate": 0.0002, "epoch": 0.7614582491310322, "step": 4710}, {"loss": 0.8313, "grad_norm": 0.4865580201148987, "learning_rate": 0.0002, "epoch": 0.7630749333117776, "step": 4720}, {"loss": 0.7966, "grad_norm": 0.5971422791481018, "learning_rate": 0.0002, "epoch": 0.7646916174925228, "step": 4730}, {"loss": 0.8298, "grad_norm": 0.6832674145698547, "learning_rate": 0.0002, "epoch": 0.7663083016732681, "step": 4740}, {"loss": 0.8156, "grad_norm": 0.500908613204956, "learning_rate": 0.0002, "epoch": 0.7679249858540134, "step": 4750}, {"loss": 0.8383, "grad_norm": 0.6112465858459473, "learning_rate": 0.0002, "epoch": 0.7695416700347587, "step": 4760}, {"loss": 0.76, "grad_norm": 0.5753506422042847, "learning_rate": 0.0002, "epoch": 0.771158354215504, "step": 4770}, {"loss": 0.8297, "grad_norm": 0.6529405117034912, "learning_rate": 0.0002, "epoch": 0.7727750383962493, "step": 4780}, {"loss": 0.8171, "grad_norm": 0.5916843414306641, "learning_rate": 0.0002, "epoch": 0.7743917225769946, "step": 4790}, {"loss": 0.83, "grad_norm": 0.4821224510669708, "learning_rate": 0.0002, "epoch": 0.7760084067577399, "step": 4800}, {"loss": 0.7703, "grad_norm": 0.5532580018043518, "learning_rate": 0.0002, "epoch": 0.7776250909384852, "step": 4810}, {"loss": 0.7363, "grad_norm": 0.4604877233505249, "learning_rate": 0.0002, "epoch": 0.7792417751192304, "step": 4820}, {"loss": 0.7506, "grad_norm": 0.5009613037109375, "learning_rate": 0.0002, "epoch": 0.7808584592999758, "step": 4830}, {"loss": 0.7863, "grad_norm": 0.6448560357093811, "learning_rate": 0.0002, "epoch": 0.782475143480721, "step": 4840}, {"loss": 0.7957, "grad_norm": 0.44327953457832336, "learning_rate": 0.0002, "epoch": 0.7840918276614663, "step": 4850}, {"loss": 0.7925, "grad_norm": 0.5355411171913147, "learning_rate": 0.0002, "epoch": 0.7857085118422116, "step": 4860}, {"loss": 0.7754, "grad_norm": 0.5635677576065063, "learning_rate": 0.0002, "epoch": 0.7873251960229569, "step": 4870}, {"loss": 0.7931, "grad_norm": 0.5417491793632507, "learning_rate": 0.0002, "epoch": 0.7889418802037023, "step": 4880}, {"loss": 0.7819, "grad_norm": 0.4567430913448334, "learning_rate": 0.0002, "epoch": 0.7905585643844475, "step": 4890}, {"loss": 0.8454, "grad_norm": 0.44651296734809875, "learning_rate": 0.0002, "epoch": 0.7921752485651928, "step": 4900}, {"loss": 0.7959, "grad_norm": 0.5741217136383057, "learning_rate": 0.0002, "epoch": 0.7937919327459381, "step": 4910}, {"loss": 0.8093, "grad_norm": 0.6605045199394226, "learning_rate": 0.0002, "epoch": 0.7954086169266834, "step": 4920}, {"loss": 0.77, "grad_norm": 0.5126531720161438, "learning_rate": 0.0002, "epoch": 0.7970253011074286, "step": 4930}, {"loss": 0.7793, "grad_norm": 0.513648271560669, "learning_rate": 0.0002, "epoch": 0.798641985288174, "step": 4940}, {"loss": 0.8314, "grad_norm": 0.5350404381752014, "learning_rate": 0.0002, "epoch": 0.8002586694689192, "step": 4950}, {"loss": 0.7649, "grad_norm": 0.5731674432754517, "learning_rate": 0.0002, "epoch": 0.8018753536496646, "step": 4960}, {"loss": 0.8572, "grad_norm": 0.5974258184432983, "learning_rate": 0.0002, "epoch": 0.8034920378304098, "step": 4970}, {"loss": 0.7972, "grad_norm": 0.8774799704551697, "learning_rate": 0.0002, "epoch": 0.8051087220111551, "step": 4980}, {"loss": 0.7899, "grad_norm": 0.5994430184364319, "learning_rate": 0.0002, "epoch": 0.8067254061919004, "step": 4990}, {"loss": 0.7736, "grad_norm": 0.4894903004169464, "learning_rate": 0.0002, "epoch": 0.8083420903726457, "step": 5000}, {"loss": 0.78, "grad_norm": 0.5218459367752075, "learning_rate": 0.0002, "epoch": 0.809958774553391, "step": 5010}, {"loss": 0.817, "grad_norm": 0.5232468843460083, "learning_rate": 0.0002, "epoch": 0.8115754587341363, "step": 5020}, {"loss": 0.7704, "grad_norm": 0.44358372688293457, "learning_rate": 0.0002, "epoch": 0.8131921429148816, "step": 5030}, {"loss": 0.785, "grad_norm": 0.6202037334442139, "learning_rate": 0.0002, "epoch": 0.8148088270956269, "step": 5040}, {"loss": 0.7351, "grad_norm": 0.7721474170684814, "learning_rate": 0.0002, "epoch": 0.8164255112763722, "step": 5050}, {"loss": 0.8297, "grad_norm": 0.5568501353263855, "learning_rate": 0.0002, "epoch": 0.8180421954571174, "step": 5060}, {"loss": 0.7733, "grad_norm": 0.49148809909820557, "learning_rate": 0.0002, "epoch": 0.8196588796378628, "step": 5070}, {"loss": 0.8054, "grad_norm": 0.4956012964248657, "learning_rate": 0.0002, "epoch": 0.821275563818608, "step": 5080}, {"loss": 0.8201, "grad_norm": 0.6078833937644958, "learning_rate": 0.0002, "epoch": 0.8228922479993533, "step": 5090}, {"loss": 0.828, "grad_norm": 0.46906954050064087, "learning_rate": 0.0002, "epoch": 0.8245089321800986, "step": 5100}, {"loss": 0.7703, "grad_norm": 0.50812166929245, "learning_rate": 0.0002, "epoch": 0.8261256163608439, "step": 5110}, {"loss": 0.8243, "grad_norm": 0.5319661498069763, "learning_rate": 0.0002, "epoch": 0.8277423005415891, "step": 5120}, {"loss": 0.7798, "grad_norm": 0.4949689209461212, "learning_rate": 0.0002, "epoch": 0.8293589847223345, "step": 5130}, {"loss": 0.7428, "grad_norm": 0.5151591300964355, "learning_rate": 0.0002, "epoch": 0.8309756689030798, "step": 5140}, {"loss": 0.8147, "grad_norm": 0.5530214309692383, "learning_rate": 0.0002, "epoch": 0.8325923530838251, "step": 5150}, {"loss": 0.8251, "grad_norm": 0.6297410130500793, "learning_rate": 0.0002, "epoch": 0.8342090372645704, "step": 5160}, {"loss": 0.8067, "grad_norm": 0.5466840267181396, "learning_rate": 0.0002, "epoch": 0.8358257214453156, "step": 5170}, {"loss": 0.7875, "grad_norm": 0.652913510799408, "learning_rate": 0.0002, "epoch": 0.837442405626061, "step": 5180}, {"loss": 0.8295, "grad_norm": 0.5811293125152588, "learning_rate": 0.0002, "epoch": 0.8390590898068062, "step": 5190}, {"loss": 0.7412, "grad_norm": 0.5109550952911377, "learning_rate": 0.0002, "epoch": 0.8406757739875516, "step": 5200}, {"loss": 0.8077, "grad_norm": 0.4551706612110138, "learning_rate": 0.0002, "epoch": 0.8422924581682968, "step": 5210}, {"loss": 0.7827, "grad_norm": 0.5813754200935364, "learning_rate": 0.0002, "epoch": 0.8439091423490421, "step": 5220}, {"loss": 0.802, "grad_norm": 0.5856947898864746, "learning_rate": 0.0002, "epoch": 0.8455258265297874, "step": 5230}, {"loss": 0.7957, "grad_norm": 0.5482739210128784, "learning_rate": 0.0002, "epoch": 0.8471425107105327, "step": 5240}, {"loss": 0.8295, "grad_norm": 0.49023720622062683, "learning_rate": 0.0002, "epoch": 0.8487591948912779, "step": 5250}, {"loss": 0.8022, "grad_norm": 0.49472475051879883, "learning_rate": 0.0002, "epoch": 0.8503758790720233, "step": 5260}, {"loss": 0.8001, "grad_norm": 0.5490226745605469, "learning_rate": 0.0002, "epoch": 0.8519925632527686, "step": 5270}, {"loss": 0.8333, "grad_norm": 0.5340665578842163, "learning_rate": 0.0002, "epoch": 0.8536092474335139, "step": 5280}, {"loss": 0.8277, "grad_norm": 0.5962483882904053, "learning_rate": 0.0002, "epoch": 0.8552259316142592, "step": 5290}, {"loss": 0.8765, "grad_norm": 0.586358368396759, "learning_rate": 0.0002, "epoch": 0.8568426157950044, "step": 5300}, {"loss": 0.7831, "grad_norm": 0.49120277166366577, "learning_rate": 0.0002, "epoch": 0.8584592999757498, "step": 5310}, {"loss": 0.8162, "grad_norm": 0.5887332558631897, "learning_rate": 0.0002, "epoch": 0.860075984156495, "step": 5320}, {"loss": 0.7464, "grad_norm": 0.42496153712272644, "learning_rate": 0.0002, "epoch": 0.8616926683372403, "step": 5330}, {"loss": 0.7905, "grad_norm": 0.5489874482154846, "learning_rate": 0.0002, "epoch": 0.8633093525179856, "step": 5340}, {"loss": 0.7958, "grad_norm": 0.5850813984870911, "learning_rate": 0.0002, "epoch": 0.8649260366987309, "step": 5350}, {"loss": 0.7642, "grad_norm": 0.517487108707428, "learning_rate": 0.0002, "epoch": 0.8665427208794761, "step": 5360}, {"loss": 0.7801, "grad_norm": 0.5339142680168152, "learning_rate": 0.0002, "epoch": 0.8681594050602215, "step": 5370}, {"loss": 0.818, "grad_norm": 0.6236387491226196, "learning_rate": 0.0002, "epoch": 0.8697760892409668, "step": 5380}, {"loss": 0.7708, "grad_norm": 0.5752192735671997, "learning_rate": 0.0002, "epoch": 0.8713927734217121, "step": 5390}, {"loss": 0.8542, "grad_norm": 0.6724614500999451, "learning_rate": 0.0002, "epoch": 0.8730094576024574, "step": 5400}, {"loss": 0.7581, "grad_norm": 0.5280613303184509, "learning_rate": 0.0002, "epoch": 0.8746261417832026, "step": 5410}, {"loss": 0.8231, "grad_norm": 0.44033288955688477, "learning_rate": 0.0002, "epoch": 0.876242825963948, "step": 5420}, {"loss": 0.8839, "grad_norm": 0.5199708342552185, "learning_rate": 0.0002, "epoch": 0.8778595101446932, "step": 5430}, {"loss": 0.7852, "grad_norm": 0.46778348088264465, "learning_rate": 0.0002, "epoch": 0.8794761943254386, "step": 5440}, {"loss": 0.7834, "grad_norm": 0.4657754898071289, "learning_rate": 0.0002, "epoch": 0.8810928785061838, "step": 5450}, {"loss": 0.7799, "grad_norm": 0.5472902655601501, "learning_rate": 0.0002, "epoch": 0.8827095626869291, "step": 5460}, {"loss": 0.8253, "grad_norm": 0.4876766800880432, "learning_rate": 0.0002, "epoch": 0.8843262468676744, "step": 5470}, {"loss": 0.7906, "grad_norm": 0.5057248473167419, "learning_rate": 0.0002, "epoch": 0.8859429310484197, "step": 5480}, {"loss": 0.8124, "grad_norm": 0.4637320637702942, "learning_rate": 0.0002, "epoch": 0.8875596152291649, "step": 5490}, {"loss": 0.781, "grad_norm": 0.471955806016922, "learning_rate": 0.0002, "epoch": 0.8891762994099103, "step": 5500}, {"loss": 0.8057, "grad_norm": 0.5209813714027405, "learning_rate": 0.0002, "epoch": 0.8907929835906556, "step": 5510}, {"loss": 0.8106, "grad_norm": 0.6213834285736084, "learning_rate": 0.0002, "epoch": 0.8924096677714008, "step": 5520}, {"loss": 0.7787, "grad_norm": 0.5215408205986023, "learning_rate": 0.0002, "epoch": 0.8940263519521462, "step": 5530}, {"loss": 0.8174, "grad_norm": 0.580478310585022, "learning_rate": 0.0002, "epoch": 0.8956430361328914, "step": 5540}, {"loss": 0.8371, "grad_norm": 0.49102169275283813, "learning_rate": 0.0002, "epoch": 0.8972597203136368, "step": 5550}, {"loss": 0.7806, "grad_norm": 0.6043479442596436, "learning_rate": 0.0002, "epoch": 0.898876404494382, "step": 5560}, {"loss": 0.7754, "grad_norm": 0.5636463165283203, "learning_rate": 0.0002, "epoch": 0.9004930886751273, "step": 5570}, {"loss": 0.8145, "grad_norm": 0.5620124340057373, "learning_rate": 0.0002, "epoch": 0.9021097728558726, "step": 5580}, {"loss": 0.8083, "grad_norm": 0.5206354856491089, "learning_rate": 0.0002, "epoch": 0.9037264570366179, "step": 5590}, {"loss": 0.8557, "grad_norm": 0.5798229575157166, "learning_rate": 0.0002, "epoch": 0.9053431412173631, "step": 5600}, {"loss": 0.8097, "grad_norm": 0.6428212523460388, "learning_rate": 0.0002, "epoch": 0.9069598253981085, "step": 5610}, {"loss": 0.7839, "grad_norm": 0.48064687848091125, "learning_rate": 0.0002, "epoch": 0.9085765095788537, "step": 5620}, {"loss": 0.8343, "grad_norm": 0.6347860097885132, "learning_rate": 0.0002, "epoch": 0.9101931937595991, "step": 5630}, {"loss": 0.851, "grad_norm": 0.5353913307189941, "learning_rate": 0.0002, "epoch": 0.9118098779403444, "step": 5640}, {"loss": 0.7736, "grad_norm": 0.5323944091796875, "learning_rate": 0.0002, "epoch": 0.9134265621210896, "step": 5650}, {"loss": 0.8393, "grad_norm": 0.5261843204498291, "learning_rate": 0.0002, "epoch": 0.915043246301835, "step": 5660}, {"loss": 0.7355, "grad_norm": 0.5451326966285706, "learning_rate": 0.0002, "epoch": 0.9166599304825802, "step": 5670}, {"loss": 0.8012, "grad_norm": 0.5183324217796326, "learning_rate": 0.0002, "epoch": 0.9182766146633256, "step": 5680}, {"loss": 0.7659, "grad_norm": 0.47229018807411194, "learning_rate": 0.0002, "epoch": 0.9198932988440708, "step": 5690}, {"loss": 0.7757, "grad_norm": 0.49180513620376587, "learning_rate": 0.0002, "epoch": 0.9215099830248161, "step": 5700}, {"loss": 0.8735, "grad_norm": 0.5419785380363464, "learning_rate": 0.0002, "epoch": 0.9231266672055614, "step": 5710}, {"loss": 0.7378, "grad_norm": 0.5408698916435242, "learning_rate": 0.0002, "epoch": 0.9247433513863067, "step": 5720}, {"loss": 0.7701, "grad_norm": 0.5286232829093933, "learning_rate": 0.0002, "epoch": 0.9263600355670519, "step": 5730}, {"loss": 0.8242, "grad_norm": 0.7539758086204529, "learning_rate": 0.0002, "epoch": 0.9279767197477973, "step": 5740}, {"loss": 0.8118, "grad_norm": 0.5166944861412048, "learning_rate": 0.0002, "epoch": 0.9295934039285425, "step": 5750}, {"loss": 0.783, "grad_norm": 0.6601425409317017, "learning_rate": 0.0002, "epoch": 0.9312100881092878, "step": 5760}, {"loss": 0.7873, "grad_norm": 0.5029960870742798, "learning_rate": 0.0002, "epoch": 0.9328267722900332, "step": 5770}, {"loss": 0.7989, "grad_norm": 0.4926645755767822, "learning_rate": 0.0002, "epoch": 0.9344434564707784, "step": 5780}, {"loss": 0.8174, "grad_norm": 0.5739615559577942, "learning_rate": 0.0002, "epoch": 0.9360601406515238, "step": 5790}, {"loss": 0.8037, "grad_norm": 0.5058279037475586, "learning_rate": 0.0002, "epoch": 0.937676824832269, "step": 5800}, {"loss": 0.8537, "grad_norm": 0.5260962247848511, "learning_rate": 0.0002, "epoch": 0.9392935090130143, "step": 5810}, {"loss": 0.7486, "grad_norm": 0.5768588185310364, "learning_rate": 0.0002, "epoch": 0.9409101931937596, "step": 5820}, {"loss": 0.8215, "grad_norm": 0.5170126557350159, "learning_rate": 0.0002, "epoch": 0.9425268773745049, "step": 5830}, {"loss": 0.7422, "grad_norm": 0.5745864510536194, "learning_rate": 0.0002, "epoch": 0.9441435615552501, "step": 5840}, {"loss": 0.7824, "grad_norm": 0.5551357865333557, "learning_rate": 0.0002, "epoch": 0.9457602457359955, "step": 5850}, {"loss": 0.8529, "grad_norm": 0.5776078701019287, "learning_rate": 0.0002, "epoch": 0.9473769299167407, "step": 5860}, {"loss": 0.8527, "grad_norm": 0.5340062379837036, "learning_rate": 0.0002, "epoch": 0.9489936140974861, "step": 5870}, {"loss": 0.8217, "grad_norm": 0.6447290182113647, "learning_rate": 0.0002, "epoch": 0.9506102982782314, "step": 5880}, {"loss": 0.7945, "grad_norm": 0.5123815536499023, "learning_rate": 0.0002, "epoch": 0.9522269824589766, "step": 5890}, {"loss": 0.8209, "grad_norm": 0.48547613620758057, "learning_rate": 0.0002, "epoch": 0.953843666639722, "step": 5900}, {"loss": 0.7896, "grad_norm": 0.5791414976119995, "learning_rate": 0.0002, "epoch": 0.9554603508204672, "step": 5910}, {"loss": 0.8408, "grad_norm": 0.6195011734962463, "learning_rate": 0.0002, "epoch": 0.9570770350012126, "step": 5920}, {"loss": 0.7805, "grad_norm": 0.6323803067207336, "learning_rate": 0.0002, "epoch": 0.9586937191819578, "step": 5930}, {"loss": 0.8484, "grad_norm": 0.45552879571914673, "learning_rate": 0.0002, "epoch": 0.9603104033627031, "step": 5940}, {"loss": 0.7367, "grad_norm": 0.5796473622322083, "learning_rate": 0.0002, "epoch": 0.9619270875434484, "step": 5950}, {"loss": 0.7672, "grad_norm": 0.647261381149292, "learning_rate": 0.0002, "epoch": 0.9635437717241937, "step": 5960}, {"loss": 0.8086, "grad_norm": 0.5487682819366455, "learning_rate": 0.0002, "epoch": 0.9651604559049389, "step": 5970}, {"loss": 0.7973, "grad_norm": 0.5743663907051086, "learning_rate": 0.0002, "epoch": 0.9667771400856843, "step": 5980}, {"loss": 0.8153, "grad_norm": 0.5470591187477112, "learning_rate": 0.0002, "epoch": 0.9683938242664295, "step": 5990}, {"loss": 0.8119, "grad_norm": 0.5901660323143005, "learning_rate": 0.0002, "epoch": 0.9700105084471748, "step": 6000}, {"loss": 0.8147, "grad_norm": 0.6544759273529053, "learning_rate": 0.0002, "epoch": 0.9716271926279202, "step": 6010}, {"loss": 0.7536, "grad_norm": 0.6288470029830933, "learning_rate": 0.0002, "epoch": 0.9732438768086654, "step": 6020}, {"loss": 0.7989, "grad_norm": 0.673153817653656, "learning_rate": 0.0002, "epoch": 0.9748605609894108, "step": 6030}, {"loss": 0.7556, "grad_norm": 0.42854753136634827, "learning_rate": 0.0002, "epoch": 0.976477245170156, "step": 6040}, {"loss": 0.8006, "grad_norm": 0.5227066278457642, "learning_rate": 0.0002, "epoch": 0.9780939293509013, "step": 6050}, {"loss": 0.795, "grad_norm": 0.5372416973114014, "learning_rate": 0.0002, "epoch": 0.9797106135316466, "step": 6060}, {"loss": 0.7591, "grad_norm": 0.6026402115821838, "learning_rate": 0.0002, "epoch": 0.9813272977123919, "step": 6070}, {"loss": 0.8347, "grad_norm": 0.49547791481018066, "learning_rate": 0.0002, "epoch": 0.9829439818931371, "step": 6080}, {"loss": 0.7722, "grad_norm": 0.4641951322555542, "learning_rate": 0.0002, "epoch": 0.9845606660738825, "step": 6090}, {"loss": 0.8125, "grad_norm": 0.5818535089492798, "learning_rate": 0.0002, "epoch": 0.9861773502546277, "step": 6100}, {"loss": 0.81, "grad_norm": 0.63955157995224, "learning_rate": 0.0002, "epoch": 0.9877940344353731, "step": 6110}, {"loss": 0.7547, "grad_norm": 0.5649438500404358, "learning_rate": 0.0002, "epoch": 0.9894107186161183, "step": 6120}, {"loss": 0.7861, "grad_norm": 0.5290433168411255, "learning_rate": 0.0002, "epoch": 0.9910274027968636, "step": 6130}, {"loss": 0.8109, "grad_norm": 0.6399374008178711, "learning_rate": 0.0002, "epoch": 0.992644086977609, "step": 6140}, {"loss": 0.8373, "grad_norm": 0.6736576557159424, "learning_rate": 0.0002, "epoch": 0.9942607711583542, "step": 6150}, {"loss": 0.7915, "grad_norm": 0.515420138835907, "learning_rate": 0.0002, "epoch": 0.9958774553390995, "step": 6160}, {"loss": 0.8032, "grad_norm": 0.562677800655365, "learning_rate": 0.0002, "epoch": 0.9974941395198448, "step": 6170}, {"loss": 0.8187, "grad_norm": 0.7113858461380005, "learning_rate": 0.0002, "epoch": 0.9991108237005901, "step": 6180}, {"eval_loss": 1.0871200561523438, "eval_runtime": 122.2071, "eval_samples_per_second": 5.998, "eval_steps_per_second": 0.753, "epoch": 0.9999191657909627, "step": 6185}, {"loss": 0.7507, "grad_norm": 0.7111801505088806, "learning_rate": 0.0002, "epoch": 1.0007275078813354, "step": 6190}, {"loss": 0.6865, "grad_norm": 0.5402125716209412, "learning_rate": 0.0002, "epoch": 1.0023441920620806, "step": 6200}, {"loss": 0.7625, "grad_norm": 0.6098830103874207, "learning_rate": 0.0002, "epoch": 1.003960876242826, "step": 6210}, {"loss": 0.7631, "grad_norm": 0.5829983353614807, "learning_rate": 0.0002, "epoch": 1.0055775604235713, "step": 6220}, {"loss": 0.7188, "grad_norm": 0.5614621043205261, "learning_rate": 0.0002, "epoch": 1.0071942446043165, "step": 6230}, {"loss": 0.7505, "grad_norm": 0.5954238772392273, "learning_rate": 0.0002, "epoch": 1.0088109287850617, "step": 6240}, {"loss": 0.7448, "grad_norm": 0.6480574607849121, "learning_rate": 0.0002, "epoch": 1.0104276129658072, "step": 6250}, {"loss": 0.7514, "grad_norm": 0.6051128506660461, "learning_rate": 0.0002, "epoch": 1.0120442971465524, "step": 6260}, {"loss": 0.7237, "grad_norm": 0.6318870782852173, "learning_rate": 0.0002, "epoch": 1.0136609813272976, "step": 6270}, {"loss": 0.7178, "grad_norm": 0.5048980116844177, "learning_rate": 0.0002, "epoch": 1.015277665508043, "step": 6280}, {"loss": 0.7391, "grad_norm": 0.6346936225891113, "learning_rate": 0.0002, "epoch": 1.0168943496887883, "step": 6290}, {"loss": 0.7486, "grad_norm": 0.5711665749549866, "learning_rate": 0.0002, "epoch": 1.0185110338695336, "step": 6300}, {"loss": 0.6808, "grad_norm": 0.5175361037254333, "learning_rate": 0.0002, "epoch": 1.0201277180502788, "step": 6310}, {"loss": 0.7539, "grad_norm": 0.5360831618309021, "learning_rate": 0.0002, "epoch": 1.0217444022310243, "step": 6320}, {"loss": 0.7112, "grad_norm": 0.614675760269165, "learning_rate": 0.0002, "epoch": 1.0233610864117695, "step": 6330}, {"loss": 0.7748, "grad_norm": 0.5626118183135986, "learning_rate": 0.0002, "epoch": 1.0249777705925147, "step": 6340}, {"loss": 0.7375, "grad_norm": 0.574897289276123, "learning_rate": 0.0002, "epoch": 1.02659445477326, "step": 6350}, {"loss": 0.759, "grad_norm": 0.7185447812080383, "learning_rate": 0.0002, "epoch": 1.0282111389540054, "step": 6360}, {"loss": 0.703, "grad_norm": 0.6705799698829651, "learning_rate": 0.0002, "epoch": 1.0298278231347506, "step": 6370}, {"loss": 0.7139, "grad_norm": 0.6740428805351257, "learning_rate": 0.0002, "epoch": 1.0314445073154959, "step": 6380}, {"loss": 0.7252, "grad_norm": 0.663902759552002, "learning_rate": 0.0002, "epoch": 1.0330611914962413, "step": 6390}, {"loss": 0.7065, "grad_norm": 0.5029543042182922, "learning_rate": 0.0002, "epoch": 1.0346778756769865, "step": 6400}, {"loss": 0.711, "grad_norm": 0.7813863158226013, "learning_rate": 0.0002, "epoch": 1.0362945598577318, "step": 6410}, {"loss": 0.7433, "grad_norm": 0.5396282076835632, "learning_rate": 0.0002, "epoch": 1.037911244038477, "step": 6420}, {"loss": 0.7222, "grad_norm": 0.5253293514251709, "learning_rate": 0.0002, "epoch": 1.0395279282192225, "step": 6430}, {"loss": 0.715, "grad_norm": 0.7236770987510681, "learning_rate": 0.0002, "epoch": 1.0411446123999677, "step": 6440}, {"loss": 0.7259, "grad_norm": 0.5670917630195618, "learning_rate": 0.0002, "epoch": 1.042761296580713, "step": 6450}, {"loss": 0.7195, "grad_norm": 0.6031978726387024, "learning_rate": 0.0002, "epoch": 1.0443779807614582, "step": 6460}, {"loss": 0.7648, "grad_norm": 0.5309213399887085, "learning_rate": 0.0002, "epoch": 1.0459946649422036, "step": 6470}, {"loss": 0.7161, "grad_norm": 0.7114651799201965, "learning_rate": 0.0002, "epoch": 1.0476113491229488, "step": 6480}, {"loss": 0.7583, "grad_norm": 0.5591610670089722, "learning_rate": 0.0002, "epoch": 1.049228033303694, "step": 6490}, {"loss": 0.6645, "grad_norm": 0.5185961127281189, "learning_rate": 0.0002, "epoch": 1.0508447174844395, "step": 6500}, {"loss": 0.7654, "grad_norm": 0.6510552167892456, "learning_rate": 0.0002, "epoch": 1.0524614016651848, "step": 6510}, {"loss": 0.7057, "grad_norm": 0.6557928919792175, "learning_rate": 0.0002, "epoch": 1.05407808584593, "step": 6520}, {"loss": 0.8056, "grad_norm": 0.6973192691802979, "learning_rate": 0.0002, "epoch": 1.0556947700266752, "step": 6530}, {"loss": 0.6793, "grad_norm": 0.6226583123207092, "learning_rate": 0.0002, "epoch": 1.0573114542074207, "step": 6540}, {"loss": 0.7151, "grad_norm": 0.5633195638656616, "learning_rate": 0.0002, "epoch": 1.058928138388166, "step": 6550}, {"loss": 0.7082, "grad_norm": 0.7466658353805542, "learning_rate": 0.0002, "epoch": 1.0605448225689111, "step": 6560}, {"loss": 0.7059, "grad_norm": 0.6462772488594055, "learning_rate": 0.0002, "epoch": 1.0621615067496564, "step": 6570}, {"loss": 0.7046, "grad_norm": 0.5266856551170349, "learning_rate": 0.0002, "epoch": 1.0637781909304018, "step": 6580}, {"loss": 0.7157, "grad_norm": 0.534392774105072, "learning_rate": 0.0002, "epoch": 1.065394875111147, "step": 6590}, {"loss": 0.7115, "grad_norm": 0.7514177560806274, "learning_rate": 0.0002, "epoch": 1.0670115592918923, "step": 6600}, {"loss": 0.7545, "grad_norm": 0.7593035697937012, "learning_rate": 0.0002, "epoch": 1.0686282434726375, "step": 6610}, {"loss": 0.6836, "grad_norm": 0.5277858972549438, "learning_rate": 0.0002, "epoch": 1.070244927653383, "step": 6620}, {"loss": 0.7405, "grad_norm": 0.5573670268058777, "learning_rate": 0.0002, "epoch": 1.0718616118341282, "step": 6630}, {"loss": 0.6774, "grad_norm": 0.6802396774291992, "learning_rate": 0.0002, "epoch": 1.0734782960148734, "step": 6640}, {"loss": 0.723, "grad_norm": 0.7367215752601624, "learning_rate": 0.0002, "epoch": 1.0750949801956189, "step": 6650}, {"loss": 0.7429, "grad_norm": 0.5961891412734985, "learning_rate": 0.0002, "epoch": 1.0767116643763641, "step": 6660}, {"loss": 0.6791, "grad_norm": 0.5736313462257385, "learning_rate": 0.0002, "epoch": 1.0783283485571094, "step": 6670}, {"loss": 0.7178, "grad_norm": 0.619219183921814, "learning_rate": 0.0002, "epoch": 1.0799450327378546, "step": 6680}, {"loss": 0.7318, "grad_norm": 0.6214390993118286, "learning_rate": 0.0002, "epoch": 1.0815617169186, "step": 6690}, {"loss": 0.7554, "grad_norm": 0.564536988735199, "learning_rate": 0.0002, "epoch": 1.0831784010993453, "step": 6700}, {"loss": 0.7362, "grad_norm": 0.5838140249252319, "learning_rate": 0.0002, "epoch": 1.0847950852800905, "step": 6710}, {"loss": 0.739, "grad_norm": 0.7000553607940674, "learning_rate": 0.0002, "epoch": 1.0864117694608357, "step": 6720}, {"loss": 0.7369, "grad_norm": 0.7078263759613037, "learning_rate": 0.0002, "epoch": 1.0880284536415812, "step": 6730}, {"loss": 0.7654, "grad_norm": 0.8353848457336426, "learning_rate": 0.0002, "epoch": 1.0896451378223264, "step": 6740}, {"loss": 0.7015, "grad_norm": 0.5615518689155579, "learning_rate": 0.0002, "epoch": 1.0912618220030716, "step": 6750}, {"loss": 0.7396, "grad_norm": 0.5475581288337708, "learning_rate": 0.0002, "epoch": 1.0928785061838169, "step": 6760}, {"loss": 0.7652, "grad_norm": 0.5835978388786316, "learning_rate": 0.0002, "epoch": 1.0944951903645623, "step": 6770}, {"loss": 0.7541, "grad_norm": 0.5516105890274048, "learning_rate": 0.0002, "epoch": 1.0961118745453076, "step": 6780}, {"loss": 0.6842, "grad_norm": 0.5875251889228821, "learning_rate": 0.0002, "epoch": 1.0977285587260528, "step": 6790}, {"loss": 0.6903, "grad_norm": 0.7376947999000549, "learning_rate": 0.0002, "epoch": 1.0993452429067982, "step": 6800}, {"loss": 0.7512, "grad_norm": 0.5656165480613708, "learning_rate": 0.0002, "epoch": 1.1009619270875435, "step": 6810}, {"loss": 0.7409, "grad_norm": 0.6365954279899597, "learning_rate": 0.0002, "epoch": 1.1025786112682887, "step": 6820}, {"loss": 0.7392, "grad_norm": 0.5033080577850342, "learning_rate": 0.0002, "epoch": 1.104195295449034, "step": 6830}, {"loss": 0.6909, "grad_norm": 0.617396891117096, "learning_rate": 0.0002, "epoch": 1.1058119796297794, "step": 6840}, {"loss": 0.7006, "grad_norm": 0.6395374536514282, "learning_rate": 0.0002, "epoch": 1.1074286638105246, "step": 6850}, {"loss": 0.7335, "grad_norm": 0.6775295734405518, "learning_rate": 0.0002, "epoch": 1.1090453479912699, "step": 6860}, {"loss": 0.764, "grad_norm": 0.6655223965644836, "learning_rate": 0.0002, "epoch": 1.1106620321720153, "step": 6870}, {"loss": 0.7553, "grad_norm": 0.676655113697052, "learning_rate": 0.0002, "epoch": 1.1122787163527605, "step": 6880}, {"loss": 0.7342, "grad_norm": 0.6062718629837036, "learning_rate": 0.0002, "epoch": 1.1138954005335058, "step": 6890}, {"loss": 0.7446, "grad_norm": 0.590943455696106, "learning_rate": 0.0002, "epoch": 1.115512084714251, "step": 6900}, {"loss": 0.6705, "grad_norm": 0.6315317153930664, "learning_rate": 0.0002, "epoch": 1.1171287688949965, "step": 6910}, {"loss": 0.6912, "grad_norm": 0.47979024052619934, "learning_rate": 0.0002, "epoch": 1.1187454530757417, "step": 6920}, {"loss": 0.7002, "grad_norm": 0.647298276424408, "learning_rate": 0.0002, "epoch": 1.120362137256487, "step": 6930}, {"loss": 0.7502, "grad_norm": 0.7336484789848328, "learning_rate": 0.0002, "epoch": 1.1219788214372322, "step": 6940}, {"loss": 0.693, "grad_norm": 0.5071424245834351, "learning_rate": 0.0002, "epoch": 1.1235955056179776, "step": 6950}, {"loss": 0.7378, "grad_norm": 0.6527144312858582, "learning_rate": 0.0002, "epoch": 1.1252121897987228, "step": 6960}, {"loss": 0.7228, "grad_norm": 0.6935935020446777, "learning_rate": 0.0002, "epoch": 1.126828873979468, "step": 6970}, {"loss": 0.699, "grad_norm": 0.8026931881904602, "learning_rate": 0.0002, "epoch": 1.1284455581602133, "step": 6980}, {"loss": 0.7361, "grad_norm": 0.5210393667221069, "learning_rate": 0.0002, "epoch": 1.1300622423409588, "step": 6990}, {"loss": 0.7456, "grad_norm": 0.60475093126297, "learning_rate": 0.0002, "epoch": 1.131678926521704, "step": 7000}, {"loss": 0.7495, "grad_norm": 0.6417073607444763, "learning_rate": 0.0002, "epoch": 1.1332956107024492, "step": 7010}, {"loss": 0.7459, "grad_norm": 0.6732175946235657, "learning_rate": 0.0002, "epoch": 1.1349122948831947, "step": 7020}, {"loss": 0.7278, "grad_norm": 0.6719491481781006, "learning_rate": 0.0002, "epoch": 1.13652897906394, "step": 7030}, {"loss": 0.7694, "grad_norm": 0.5708295106887817, "learning_rate": 0.0002, "epoch": 1.1381456632446851, "step": 7040}, {"loss": 0.7823, "grad_norm": 0.7141719460487366, "learning_rate": 0.0002, "epoch": 1.1397623474254304, "step": 7050}, {"loss": 0.764, "grad_norm": 0.6187017560005188, "learning_rate": 0.0002, "epoch": 1.1413790316061758, "step": 7060}, {"loss": 0.7657, "grad_norm": 0.50581294298172, "learning_rate": 0.0002, "epoch": 1.142995715786921, "step": 7070}, {"loss": 0.7357, "grad_norm": 0.5620143413543701, "learning_rate": 0.0002, "epoch": 1.1446123999676663, "step": 7080}, {"loss": 0.7287, "grad_norm": 0.6231929659843445, "learning_rate": 0.0002, "epoch": 1.1462290841484115, "step": 7090}, {"loss": 0.7328, "grad_norm": 0.5775774121284485, "learning_rate": 0.0002, "epoch": 1.147845768329157, "step": 7100}, {"loss": 0.7728, "grad_norm": 0.6492809653282166, "learning_rate": 0.0002, "epoch": 1.1494624525099022, "step": 7110}, {"loss": 0.7545, "grad_norm": 0.6434972286224365, "learning_rate": 0.0002, "epoch": 1.1510791366906474, "step": 7120}, {"loss": 0.7374, "grad_norm": 0.6191812753677368, "learning_rate": 0.0002, "epoch": 1.1526958208713927, "step": 7130}, {"loss": 0.7276, "grad_norm": 0.6690331697463989, "learning_rate": 0.0002, "epoch": 1.1543125050521381, "step": 7140}, {"loss": 0.7704, "grad_norm": 0.5977938175201416, "learning_rate": 0.0002, "epoch": 1.1559291892328833, "step": 7150}, {"loss": 0.7251, "grad_norm": 0.6195854544639587, "learning_rate": 0.0002, "epoch": 1.1575458734136286, "step": 7160}, {"loss": 0.7249, "grad_norm": 0.5752048492431641, "learning_rate": 0.0002, "epoch": 1.159162557594374, "step": 7170}, {"loss": 0.7593, "grad_norm": 0.589081883430481, "learning_rate": 0.0002, "epoch": 1.1607792417751193, "step": 7180}, {"loss": 0.704, "grad_norm": 0.756996750831604, "learning_rate": 0.0002, "epoch": 1.1623959259558645, "step": 7190}, {"loss": 0.7404, "grad_norm": 0.7614967226982117, "learning_rate": 0.0002, "epoch": 1.1640126101366097, "step": 7200}, {"loss": 0.7867, "grad_norm": 0.6120437979698181, "learning_rate": 0.0002, "epoch": 1.1656292943173552, "step": 7210}, {"loss": 0.7384, "grad_norm": 0.6210004687309265, "learning_rate": 0.0002, "epoch": 1.1672459784981004, "step": 7220}, {"loss": 0.7251, "grad_norm": 0.6044116020202637, "learning_rate": 0.0002, "epoch": 1.1688626626788456, "step": 7230}, {"loss": 0.7361, "grad_norm": 0.5418457388877869, "learning_rate": 0.0002, "epoch": 1.170479346859591, "step": 7240}, {"loss": 0.6938, "grad_norm": 0.6413537263870239, "learning_rate": 0.0002, "epoch": 1.1720960310403363, "step": 7250}, {"loss": 0.6978, "grad_norm": 0.5777867436408997, "learning_rate": 0.0002, "epoch": 1.1737127152210816, "step": 7260}, {"loss": 0.7503, "grad_norm": 0.7092402577400208, "learning_rate": 0.0002, "epoch": 1.1753293994018268, "step": 7270}, {"loss": 0.7487, "grad_norm": 0.6351709365844727, "learning_rate": 0.0002, "epoch": 1.176946083582572, "step": 7280}, {"loss": 0.7527, "grad_norm": 0.6172189712524414, "learning_rate": 0.0002, "epoch": 1.1785627677633175, "step": 7290}, {"loss": 0.7319, "grad_norm": 0.6801714897155762, "learning_rate": 0.0002, "epoch": 1.1801794519440627, "step": 7300}, {"loss": 0.6941, "grad_norm": 0.6044712066650391, "learning_rate": 0.0002, "epoch": 1.181796136124808, "step": 7310}, {"loss": 0.6951, "grad_norm": 0.7413212060928345, "learning_rate": 0.0002, "epoch": 1.1834128203055534, "step": 7320}, {"loss": 0.7396, "grad_norm": 0.5303856134414673, "learning_rate": 0.0002, "epoch": 1.1850295044862986, "step": 7330}, {"loss": 0.6915, "grad_norm": 0.5647098422050476, "learning_rate": 0.0002, "epoch": 1.1866461886670439, "step": 7340}, {"loss": 0.7506, "grad_norm": 0.7374135255813599, "learning_rate": 0.0002, "epoch": 1.188262872847789, "step": 7350}, {"loss": 0.7041, "grad_norm": 0.5710089206695557, "learning_rate": 0.0002, "epoch": 1.1898795570285345, "step": 7360}, {"loss": 0.8289, "grad_norm": 0.6073619723320007, "learning_rate": 0.0002, "epoch": 1.1914962412092798, "step": 7370}, {"loss": 0.7722, "grad_norm": 0.5899916887283325, "learning_rate": 0.0002, "epoch": 1.193112925390025, "step": 7380}, {"loss": 0.756, "grad_norm": 0.7762434482574463, "learning_rate": 0.0002, "epoch": 1.1947296095707705, "step": 7390}, {"loss": 0.7319, "grad_norm": 0.679949939250946, "learning_rate": 0.0002, "epoch": 1.1963462937515157, "step": 7400}, {"loss": 0.7599, "grad_norm": 0.6106849312782288, "learning_rate": 0.0002, "epoch": 1.197962977932261, "step": 7410}, {"loss": 0.7648, "grad_norm": 0.682461678981781, "learning_rate": 0.0002, "epoch": 1.1995796621130062, "step": 7420}, {"loss": 0.7741, "grad_norm": 0.6087017059326172, "learning_rate": 0.0002, "epoch": 1.2011963462937516, "step": 7430}, {"loss": 0.7642, "grad_norm": 0.63739013671875, "learning_rate": 0.0002, "epoch": 1.2028130304744968, "step": 7440}, {"loss": 0.7611, "grad_norm": 0.6154777407646179, "learning_rate": 0.0002, "epoch": 1.204429714655242, "step": 7450}, {"loss": 0.7565, "grad_norm": 0.7491534948348999, "learning_rate": 0.0002, "epoch": 1.2060463988359873, "step": 7460}, {"loss": 0.698, "grad_norm": 0.6664797067642212, "learning_rate": 0.0002, "epoch": 1.2076630830167328, "step": 7470}, {"loss": 0.7456, "grad_norm": 0.6660266518592834, "learning_rate": 0.0002, "epoch": 1.209279767197478, "step": 7480}, {"loss": 0.714, "grad_norm": 0.6972551345825195, "learning_rate": 0.0002, "epoch": 1.2108964513782232, "step": 7490}, {"loss": 0.7023, "grad_norm": 0.6157945990562439, "learning_rate": 0.0002, "epoch": 1.2125131355589684, "step": 7500}, {"loss": 0.7326, "grad_norm": 0.5199310183525085, "learning_rate": 0.0002, "epoch": 1.214129819739714, "step": 7510}, {"loss": 0.7586, "grad_norm": 0.577610433101654, "learning_rate": 0.0002, "epoch": 1.2157465039204591, "step": 7520}, {"loss": 0.7179, "grad_norm": 0.53652423620224, "learning_rate": 0.0002, "epoch": 1.2173631881012044, "step": 7530}, {"loss": 0.7393, "grad_norm": 0.6479050517082214, "learning_rate": 0.0002, "epoch": 1.2189798722819498, "step": 7540}, {"loss": 0.7534, "grad_norm": 0.618748128414154, "learning_rate": 0.0002, "epoch": 1.220596556462695, "step": 7550}, {"loss": 0.6886, "grad_norm": 0.6311424374580383, "learning_rate": 0.0002, "epoch": 1.2222132406434403, "step": 7560}, {"loss": 0.7272, "grad_norm": 0.6595825552940369, "learning_rate": 0.0002, "epoch": 1.2238299248241855, "step": 7570}, {"loss": 0.7353, "grad_norm": 0.5198960900306702, "learning_rate": 0.0002, "epoch": 1.225446609004931, "step": 7580}, {"loss": 0.674, "grad_norm": 0.578650712966919, "learning_rate": 0.0002, "epoch": 1.2270632931856762, "step": 7590}, {"loss": 0.7507, "grad_norm": 0.6080220937728882, "learning_rate": 0.0002, "epoch": 1.2286799773664214, "step": 7600}, {"loss": 0.7733, "grad_norm": 0.7050248384475708, "learning_rate": 0.0002, "epoch": 1.2302966615471669, "step": 7610}, {"loss": 0.7032, "grad_norm": 0.6652196049690247, "learning_rate": 0.0002, "epoch": 1.2319133457279121, "step": 7620}, {"loss": 0.7085, "grad_norm": 0.7322776317596436, "learning_rate": 0.0002, "epoch": 1.2335300299086573, "step": 7630}, {"loss": 0.7402, "grad_norm": 0.4998728036880493, "learning_rate": 0.0002, "epoch": 1.2351467140894026, "step": 7640}, {"loss": 0.7214, "grad_norm": 0.6428788900375366, "learning_rate": 0.0002, "epoch": 1.2367633982701478, "step": 7650}, {"loss": 0.7699, "grad_norm": 0.585242509841919, "learning_rate": 0.0002, "epoch": 1.2383800824508933, "step": 7660}, {"loss": 0.7621, "grad_norm": 0.5211917757987976, "learning_rate": 0.0002, "epoch": 1.2399967666316385, "step": 7670}, {"loss": 0.746, "grad_norm": 0.6490384340286255, "learning_rate": 0.0002, "epoch": 1.2416134508123837, "step": 7680}, {"loss": 0.7186, "grad_norm": 0.6249763369560242, "learning_rate": 0.0002, "epoch": 1.2432301349931292, "step": 7690}, {"loss": 0.7761, "grad_norm": 0.71870356798172, "learning_rate": 0.0002, "epoch": 1.2448468191738744, "step": 7700}, {"loss": 0.7525, "grad_norm": 0.6761967539787292, "learning_rate": 0.0002, "epoch": 1.2464635033546196, "step": 7710}, {"loss": 0.7501, "grad_norm": 0.6500617265701294, "learning_rate": 0.0002, "epoch": 1.2480801875353649, "step": 7720}, {"loss": 0.7903, "grad_norm": 0.8069869875907898, "learning_rate": 0.0002, "epoch": 1.2496968717161103, "step": 7730}, {"loss": 0.6747, "grad_norm": 0.6044608950614929, "learning_rate": 0.0002, "epoch": 1.2513135558968556, "step": 7740}, {"loss": 0.6825, "grad_norm": 0.6573283076286316, "learning_rate": 0.0002, "epoch": 1.2529302400776008, "step": 7750}, {"loss": 0.7617, "grad_norm": 0.625430166721344, "learning_rate": 0.0002, "epoch": 1.2545469242583462, "step": 7760}, {"loss": 0.7041, "grad_norm": 0.5442022681236267, "learning_rate": 0.0002, "epoch": 1.2561636084390915, "step": 7770}, {"loss": 0.7172, "grad_norm": 0.6818386912345886, "learning_rate": 0.0002, "epoch": 1.2577802926198367, "step": 7780}, {"loss": 0.696, "grad_norm": 0.6381874084472656, "learning_rate": 0.0002, "epoch": 1.259396976800582, "step": 7790}, {"loss": 0.6834, "grad_norm": 0.6269212961196899, "learning_rate": 0.0002, "epoch": 1.2610136609813272, "step": 7800}, {"loss": 0.7821, "grad_norm": 0.600121259689331, "learning_rate": 0.0002, "epoch": 1.2626303451620726, "step": 7810}, {"loss": 0.7761, "grad_norm": 0.6337703466415405, "learning_rate": 0.0002, "epoch": 1.2642470293428179, "step": 7820}, {"loss": 0.732, "grad_norm": 0.7234963774681091, "learning_rate": 0.0002, "epoch": 1.2658637135235633, "step": 7830}, {"loss": 0.785, "grad_norm": 0.800184965133667, "learning_rate": 0.0002, "epoch": 1.2674803977043085, "step": 7840}, {"loss": 0.7426, "grad_norm": 0.7539464831352234, "learning_rate": 0.0002, "epoch": 1.2690970818850538, "step": 7850}, {"loss": 0.7496, "grad_norm": 0.5493760704994202, "learning_rate": 0.0002, "epoch": 1.270713766065799, "step": 7860}, {"loss": 0.7537, "grad_norm": 0.7477145791053772, "learning_rate": 0.0002, "epoch": 1.2723304502465442, "step": 7870}, {"loss": 0.7573, "grad_norm": 0.6366362571716309, "learning_rate": 0.0002, "epoch": 1.2739471344272897, "step": 7880}, {"loss": 0.7608, "grad_norm": 0.7419533729553223, "learning_rate": 0.0002, "epoch": 1.275563818608035, "step": 7890}, {"loss": 0.7873, "grad_norm": 0.6141223311424255, "learning_rate": 0.0002, "epoch": 1.2771805027887801, "step": 7900}, {"loss": 0.6916, "grad_norm": 0.7522598505020142, "learning_rate": 0.0002, "epoch": 1.2787971869695256, "step": 7910}, {"loss": 0.7097, "grad_norm": 0.6935804486274719, "learning_rate": 0.0002, "epoch": 1.2804138711502708, "step": 7920}, {"loss": 0.7185, "grad_norm": 0.7239290475845337, "learning_rate": 0.0002, "epoch": 1.282030555331016, "step": 7930}, {"loss": 0.7145, "grad_norm": 0.8800187110900879, "learning_rate": 0.0002, "epoch": 1.2836472395117613, "step": 7940}, {"loss": 0.6991, "grad_norm": 0.540458083152771, "learning_rate": 0.0002, "epoch": 1.2852639236925067, "step": 7950}, {"loss": 0.7139, "grad_norm": 0.6492934226989746, "learning_rate": 0.0002, "epoch": 1.286880607873252, "step": 7960}, {"loss": 0.7742, "grad_norm": 0.6543959379196167, "learning_rate": 0.0002, "epoch": 1.2884972920539972, "step": 7970}, {"loss": 0.7316, "grad_norm": 0.5804705619812012, "learning_rate": 0.0002, "epoch": 1.2901139762347427, "step": 7980}, {"loss": 0.796, "grad_norm": 0.7074727416038513, "learning_rate": 0.0002, "epoch": 1.291730660415488, "step": 7990}, {"loss": 0.7034, "grad_norm": 0.5347974300384521, "learning_rate": 0.0002, "epoch": 1.2933473445962331, "step": 8000}, {"loss": 0.738, "grad_norm": 0.6457298398017883, "learning_rate": 0.0002, "epoch": 1.2949640287769784, "step": 8010}, {"loss": 0.7634, "grad_norm": 0.6407219171524048, "learning_rate": 0.0002, "epoch": 1.2965807129577236, "step": 8020}, {"loss": 0.7506, "grad_norm": 0.828439474105835, "learning_rate": 0.0002, "epoch": 1.298197397138469, "step": 8030}, {"loss": 0.735, "grad_norm": 0.4840380549430847, "learning_rate": 0.0002, "epoch": 1.2998140813192143, "step": 8040}, {"loss": 0.7283, "grad_norm": 0.5921024680137634, "learning_rate": 0.0002, "epoch": 1.3014307654999595, "step": 8050}, {"loss": 0.7477, "grad_norm": 0.6170315146446228, "learning_rate": 0.0002, "epoch": 1.303047449680705, "step": 8060}, {"loss": 0.7534, "grad_norm": 0.5374847054481506, "learning_rate": 0.0002, "epoch": 1.3046641338614502, "step": 8070}, {"loss": 0.7593, "grad_norm": 0.545758068561554, "learning_rate": 0.0002, "epoch": 1.3062808180421954, "step": 8080}, {"loss": 0.7463, "grad_norm": 0.55641770362854, "learning_rate": 0.0002, "epoch": 1.3078975022229407, "step": 8090}, {"loss": 0.7594, "grad_norm": 0.6724897027015686, "learning_rate": 0.0002, "epoch": 1.309514186403686, "step": 8100}, {"loss": 0.7105, "grad_norm": 0.6923972368240356, "learning_rate": 0.0002, "epoch": 1.3111308705844313, "step": 8110}, {"loss": 0.7149, "grad_norm": 0.5136841535568237, "learning_rate": 0.0002, "epoch": 1.3127475547651766, "step": 8120}, {"loss": 0.7504, "grad_norm": 0.6766283512115479, "learning_rate": 0.0002, "epoch": 1.314364238945922, "step": 8130}, {"loss": 0.7489, "grad_norm": 0.6283926367759705, "learning_rate": 0.0002, "epoch": 1.3159809231266673, "step": 8140}, {"loss": 0.7459, "grad_norm": 0.644216001033783, "learning_rate": 0.0002, "epoch": 1.3175976073074125, "step": 8150}, {"loss": 0.7125, "grad_norm": 0.7827503085136414, "learning_rate": 0.0002, "epoch": 1.3192142914881577, "step": 8160}, {"loss": 0.7271, "grad_norm": 0.6651390790939331, "learning_rate": 0.0002, "epoch": 1.320830975668903, "step": 8170}, {"loss": 0.7778, "grad_norm": 0.5547412633895874, "learning_rate": 0.0002, "epoch": 1.3224476598496484, "step": 8180}, {"loss": 0.7402, "grad_norm": 0.6765179634094238, "learning_rate": 0.0002, "epoch": 1.3240643440303936, "step": 8190}, {"loss": 0.7106, "grad_norm": 0.6822077035903931, "learning_rate": 0.0002, "epoch": 1.325681028211139, "step": 8200}, {"loss": 0.7288, "grad_norm": 0.5941002368927002, "learning_rate": 0.0002, "epoch": 1.3272977123918843, "step": 8210}, {"loss": 0.7494, "grad_norm": 0.4850037097930908, "learning_rate": 0.0002, "epoch": 1.3289143965726296, "step": 8220}, {"loss": 0.7474, "grad_norm": 0.6162990927696228, "learning_rate": 0.0002, "epoch": 1.3305310807533748, "step": 8230}, {"loss": 0.7751, "grad_norm": 0.6665613651275635, "learning_rate": 0.0002, "epoch": 1.33214776493412, "step": 8240}, {"loss": 0.759, "grad_norm": 0.618192732334137, "learning_rate": 0.0002, "epoch": 1.3337644491148655, "step": 8250}, {"loss": 0.7532, "grad_norm": 0.710418701171875, "learning_rate": 0.0002, "epoch": 1.3353811332956107, "step": 8260}, {"loss": 0.7306, "grad_norm": 0.5109876990318298, "learning_rate": 0.0002, "epoch": 1.336997817476356, "step": 8270}, {"loss": 0.7303, "grad_norm": 0.6791711449623108, "learning_rate": 0.0002, "epoch": 1.3386145016571014, "step": 8280}, {"loss": 0.7594, "grad_norm": 0.6836432814598083, "learning_rate": 0.0002, "epoch": 1.3402311858378466, "step": 8290}, {"loss": 0.7594, "grad_norm": 0.5579386353492737, "learning_rate": 0.0002, "epoch": 1.3418478700185918, "step": 8300}, {"loss": 0.7377, "grad_norm": 0.6713546514511108, "learning_rate": 0.0002, "epoch": 1.343464554199337, "step": 8310}, {"loss": 0.7756, "grad_norm": 0.5353720188140869, "learning_rate": 0.0002, "epoch": 1.3450812383800825, "step": 8320}, {"loss": 0.718, "grad_norm": 0.5813682675361633, "learning_rate": 0.0002, "epoch": 1.3466979225608278, "step": 8330}, {"loss": 0.7294, "grad_norm": 0.8158791661262512, "learning_rate": 0.0002, "epoch": 1.348314606741573, "step": 8340}, {"loss": 0.6992, "grad_norm": 0.6193785071372986, "learning_rate": 0.0002, "epoch": 1.3499312909223184, "step": 8350}, {"loss": 0.7654, "grad_norm": 0.6353939771652222, "learning_rate": 0.0002, "epoch": 1.3515479751030637, "step": 8360}, {"loss": 0.7519, "grad_norm": 0.6925048232078552, "learning_rate": 0.0002, "epoch": 1.353164659283809, "step": 8370}, {"loss": 0.736, "grad_norm": 0.988264799118042, "learning_rate": 0.0002, "epoch": 1.3547813434645541, "step": 8380}, {"loss": 0.7744, "grad_norm": 0.6476002931594849, "learning_rate": 0.0002, "epoch": 1.3563980276452994, "step": 8390}, {"loss": 0.776, "grad_norm": 0.7120398879051208, "learning_rate": 0.0002, "epoch": 1.3580147118260448, "step": 8400}, {"loss": 0.7368, "grad_norm": 0.9048416614532471, "learning_rate": 0.0002, "epoch": 1.35963139600679, "step": 8410}, {"loss": 0.7544, "grad_norm": 0.7000672817230225, "learning_rate": 0.0002, "epoch": 1.3612480801875353, "step": 8420}, {"loss": 0.7358, "grad_norm": 0.6015632152557373, "learning_rate": 0.0002, "epoch": 1.3628647643682807, "step": 8430}, {"loss": 0.7298, "grad_norm": 0.612516462802887, "learning_rate": 0.0002, "epoch": 1.364481448549026, "step": 8440}, {"loss": 0.7055, "grad_norm": 0.5969301462173462, "learning_rate": 0.0002, "epoch": 1.3660981327297712, "step": 8450}, {"loss": 0.7754, "grad_norm": 0.6730654239654541, "learning_rate": 0.0002, "epoch": 1.3677148169105164, "step": 8460}, {"loss": 0.7465, "grad_norm": 0.6386392116546631, "learning_rate": 0.0002, "epoch": 1.369331501091262, "step": 8470}, {"loss": 0.7433, "grad_norm": 0.739544153213501, "learning_rate": 0.0002, "epoch": 1.3709481852720071, "step": 8480}, {"loss": 0.7892, "grad_norm": 0.6462782621383667, "learning_rate": 0.0002, "epoch": 1.3725648694527524, "step": 8490}, {"loss": 0.7302, "grad_norm": 0.7346843481063843, "learning_rate": 0.0002, "epoch": 1.3741815536334978, "step": 8500}, {"loss": 0.7634, "grad_norm": 0.6884821057319641, "learning_rate": 0.0002, "epoch": 1.375798237814243, "step": 8510}, {"loss": 0.7614, "grad_norm": 0.6999333500862122, "learning_rate": 0.0002, "epoch": 1.3774149219949883, "step": 8520}, {"loss": 0.729, "grad_norm": 0.5378713011741638, "learning_rate": 0.0002, "epoch": 1.3790316061757335, "step": 8530}, {"loss": 0.6797, "grad_norm": 0.5417906641960144, "learning_rate": 0.0002, "epoch": 1.3806482903564787, "step": 8540}, {"loss": 0.7499, "grad_norm": 0.6602526307106018, "learning_rate": 0.0002, "epoch": 1.3822649745372242, "step": 8550}, {"loss": 0.7356, "grad_norm": 0.7073674201965332, "learning_rate": 0.0002, "epoch": 1.3838816587179694, "step": 8560}, {"loss": 0.75, "grad_norm": 0.5841707587242126, "learning_rate": 0.0002, "epoch": 1.3854983428987149, "step": 8570}, {"loss": 0.732, "grad_norm": 0.7031095027923584, "learning_rate": 0.0002, "epoch": 1.38711502707946, "step": 8580}, {"loss": 0.7464, "grad_norm": 0.5198570489883423, "learning_rate": 0.0002, "epoch": 1.3887317112602053, "step": 8590}, {"loss": 0.7354, "grad_norm": 0.7261320352554321, "learning_rate": 0.0002, "epoch": 1.3903483954409506, "step": 8600}, {"loss": 0.7339, "grad_norm": 0.5616350173950195, "learning_rate": 0.0002, "epoch": 1.3919650796216958, "step": 8610}, {"loss": 0.7382, "grad_norm": 0.5185914635658264, "learning_rate": 0.0002, "epoch": 1.3935817638024413, "step": 8620}, {"loss": 0.7456, "grad_norm": 0.5814694762229919, "learning_rate": 0.0002, "epoch": 1.3951984479831865, "step": 8630}, {"loss": 0.7413, "grad_norm": 0.6977371573448181, "learning_rate": 0.0002, "epoch": 1.3968151321639317, "step": 8640}, {"loss": 0.7574, "grad_norm": 0.6855689883232117, "learning_rate": 0.0002, "epoch": 1.3984318163446772, "step": 8650}, {"loss": 0.7802, "grad_norm": 0.5414357781410217, "learning_rate": 0.0002, "epoch": 1.4000485005254224, "step": 8660}, {"loss": 0.7487, "grad_norm": 0.6970012784004211, "learning_rate": 0.0002, "epoch": 1.4016651847061676, "step": 8670}, {"loss": 0.7421, "grad_norm": 0.526079535484314, "learning_rate": 0.0002, "epoch": 1.4032818688869129, "step": 8680}, {"loss": 0.737, "grad_norm": 0.758712887763977, "learning_rate": 0.0002, "epoch": 1.404898553067658, "step": 8690}, {"loss": 0.7612, "grad_norm": 0.7118762731552124, "learning_rate": 0.0002, "epoch": 1.4065152372484035, "step": 8700}, {"loss": 0.7628, "grad_norm": 0.5696909427642822, "learning_rate": 0.0002, "epoch": 1.4081319214291488, "step": 8710}, {"loss": 0.7156, "grad_norm": 0.7995436787605286, "learning_rate": 0.0002, "epoch": 1.4097486056098942, "step": 8720}, {"loss": 0.7521, "grad_norm": 0.7237521409988403, "learning_rate": 0.0002, "epoch": 1.4113652897906395, "step": 8730}, {"loss": 0.7661, "grad_norm": 0.744628369808197, "learning_rate": 0.0002, "epoch": 1.4129819739713847, "step": 8740}, {"loss": 0.7073, "grad_norm": 0.6082926988601685, "learning_rate": 0.0002, "epoch": 1.41459865815213, "step": 8750}, {"loss": 0.7282, "grad_norm": 0.5185243487358093, "learning_rate": 0.0002, "epoch": 1.4162153423328752, "step": 8760}, {"loss": 0.7592, "grad_norm": 0.5183082222938538, "learning_rate": 0.0002, "epoch": 1.4178320265136206, "step": 8770}, {"loss": 0.7509, "grad_norm": 0.7326041460037231, "learning_rate": 0.0002, "epoch": 1.4194487106943658, "step": 8780}, {"loss": 0.7398, "grad_norm": 0.7174660563468933, "learning_rate": 0.0002, "epoch": 1.421065394875111, "step": 8790}, {"loss": 0.7507, "grad_norm": 0.8080165982246399, "learning_rate": 0.0002, "epoch": 1.4226820790558565, "step": 8800}, {"loss": 0.72, "grad_norm": 0.5061507821083069, "learning_rate": 0.0002, "epoch": 1.4242987632366018, "step": 8810}, {"loss": 0.7563, "grad_norm": 0.801602840423584, "learning_rate": 0.0002, "epoch": 1.425915447417347, "step": 8820}, {"loss": 0.7287, "grad_norm": 0.6150273084640503, "learning_rate": 0.0002, "epoch": 1.4275321315980922, "step": 8830}, {"loss": 0.7452, "grad_norm": 0.8786525726318359, "learning_rate": 0.0002, "epoch": 1.4291488157788377, "step": 8840}, {"loss": 0.7257, "grad_norm": 0.6371538639068604, "learning_rate": 0.0002, "epoch": 1.430765499959583, "step": 8850}, {"loss": 0.711, "grad_norm": 0.6409295797348022, "learning_rate": 0.0002, "epoch": 1.4323821841403281, "step": 8860}, {"loss": 0.7891, "grad_norm": 0.6452359557151794, "learning_rate": 0.0002, "epoch": 1.4339988683210736, "step": 8870}, {"loss": 0.7588, "grad_norm": 0.5842334628105164, "learning_rate": 0.0002, "epoch": 1.4356155525018188, "step": 8880}, {"loss": 0.7446, "grad_norm": 0.696761965751648, "learning_rate": 0.0002, "epoch": 1.437232236682564, "step": 8890}, {"loss": 0.7541, "grad_norm": 0.6384600400924683, "learning_rate": 0.0002, "epoch": 1.4388489208633093, "step": 8900}, {"loss": 0.7049, "grad_norm": 0.5981136560440063, "learning_rate": 0.0002, "epoch": 1.4404656050440545, "step": 8910}, {"loss": 0.795, "grad_norm": 0.6355637907981873, "learning_rate": 0.0002, "epoch": 1.4420822892248, "step": 8920}, {"loss": 0.7653, "grad_norm": 0.6374830603599548, "learning_rate": 0.0002, "epoch": 1.4436989734055452, "step": 8930}, {"loss": 0.8108, "grad_norm": 0.559013307094574, "learning_rate": 0.0002, "epoch": 1.4453156575862904, "step": 8940}, {"loss": 0.7045, "grad_norm": 0.7289170026779175, "learning_rate": 0.0002, "epoch": 1.446932341767036, "step": 8950}, {"loss": 0.7484, "grad_norm": 0.8649206757545471, "learning_rate": 0.0002, "epoch": 1.4485490259477811, "step": 8960}, {"loss": 0.7745, "grad_norm": 0.7664689421653748, "learning_rate": 0.0002, "epoch": 1.4501657101285264, "step": 8970}, {"loss": 0.7431, "grad_norm": 0.7109952569007874, "learning_rate": 0.0002, "epoch": 1.4517823943092716, "step": 8980}, {"loss": 0.7997, "grad_norm": 0.6312844753265381, "learning_rate": 0.0002, "epoch": 1.453399078490017, "step": 8990}, {"loss": 0.7467, "grad_norm": 0.6616617441177368, "learning_rate": 0.0002, "epoch": 1.4550157626707623, "step": 9000}, {"loss": 0.7518, "grad_norm": 0.7384068965911865, "learning_rate": 0.0002, "epoch": 1.4566324468515075, "step": 9010}, {"loss": 0.7483, "grad_norm": 0.6549670100212097, "learning_rate": 0.0002, "epoch": 1.458249131032253, "step": 9020}, {"loss": 0.7423, "grad_norm": 0.6254119277000427, "learning_rate": 0.0002, "epoch": 1.4598658152129982, "step": 9030}, {"loss": 0.7645, "grad_norm": 0.6806328892707825, "learning_rate": 0.0002, "epoch": 1.4614824993937434, "step": 9040}, {"loss": 0.7221, "grad_norm": 0.6803115010261536, "learning_rate": 0.0002, "epoch": 1.4630991835744886, "step": 9050}, {"loss": 0.7264, "grad_norm": 0.48529282212257385, "learning_rate": 0.0002, "epoch": 1.4647158677552339, "step": 9060}, {"loss": 0.7542, "grad_norm": 0.5995030999183655, "learning_rate": 0.0002, "epoch": 1.4663325519359793, "step": 9070}, {"loss": 0.7894, "grad_norm": 0.6005427837371826, "learning_rate": 0.0002, "epoch": 1.4679492361167246, "step": 9080}, {"loss": 0.7288, "grad_norm": 0.718564510345459, "learning_rate": 0.0002, "epoch": 1.46956592029747, "step": 9090}, {"loss": 0.7089, "grad_norm": 0.7003577351570129, "learning_rate": 0.0002, "epoch": 1.4711826044782153, "step": 9100}, {"loss": 0.8069, "grad_norm": 0.5888323783874512, "learning_rate": 0.0002, "epoch": 1.4727992886589605, "step": 9110}, {"loss": 0.7275, "grad_norm": 0.6417609453201294, "learning_rate": 0.0002, "epoch": 1.4744159728397057, "step": 9120}, {"loss": 0.7441, "grad_norm": 0.572294294834137, "learning_rate": 0.0002, "epoch": 1.476032657020451, "step": 9130}, {"loss": 0.8053, "grad_norm": 0.8200714588165283, "learning_rate": 0.0002, "epoch": 1.4776493412011964, "step": 9140}, {"loss": 0.7382, "grad_norm": 0.6343288421630859, "learning_rate": 0.0002, "epoch": 1.4792660253819416, "step": 9150}, {"loss": 0.7641, "grad_norm": 0.7017961144447327, "learning_rate": 0.0002, "epoch": 1.4808827095626869, "step": 9160}, {"loss": 0.7619, "grad_norm": 0.6202912926673889, "learning_rate": 0.0002, "epoch": 1.4824993937434323, "step": 9170}, {"loss": 0.7428, "grad_norm": 0.6677869558334351, "learning_rate": 0.0002, "epoch": 1.4841160779241775, "step": 9180}, {"loss": 0.7648, "grad_norm": 0.6052267551422119, "learning_rate": 0.0002, "epoch": 1.4857327621049228, "step": 9190}, {"loss": 0.7152, "grad_norm": 0.6638872027397156, "learning_rate": 0.0002, "epoch": 1.487349446285668, "step": 9200}, {"loss": 0.7448, "grad_norm": 0.6245523691177368, "learning_rate": 0.0002, "epoch": 1.4889661304664135, "step": 9210}, {"loss": 0.6958, "grad_norm": 0.5761767625808716, "learning_rate": 0.0002, "epoch": 1.4905828146471587, "step": 9220}, {"loss": 0.8012, "grad_norm": 0.8175981640815735, "learning_rate": 0.0002, "epoch": 1.492199498827904, "step": 9230}, {"loss": 0.683, "grad_norm": 0.9144009947776794, "learning_rate": 0.0002, "epoch": 1.4938161830086494, "step": 9240}, {"loss": 0.7623, "grad_norm": 0.5742552876472473, "learning_rate": 0.0002, "epoch": 1.4954328671893946, "step": 9250}, {"loss": 0.7418, "grad_norm": 0.534534215927124, "learning_rate": 0.0002, "epoch": 1.4970495513701398, "step": 9260}, {"loss": 0.7194, "grad_norm": 0.7836225032806396, "learning_rate": 0.0002, "epoch": 1.498666235550885, "step": 9270}, {"loss": 0.7453, "grad_norm": 0.5292993187904358, "learning_rate": 0.0002, "epoch": 1.5002829197316303, "step": 9280}, {"loss": 0.7168, "grad_norm": 0.8044071793556213, "learning_rate": 0.0002, "epoch": 1.5018996039123758, "step": 9290}, {"loss": 0.7229, "grad_norm": 0.6185805201530457, "learning_rate": 0.0002, "epoch": 1.503516288093121, "step": 9300}, {"loss": 0.684, "grad_norm": 0.6093607544898987, "learning_rate": 0.0002, "epoch": 1.5051329722738664, "step": 9310}, {"loss": 0.7973, "grad_norm": 0.5891730189323425, "learning_rate": 0.0002, "epoch": 1.5067496564546117, "step": 9320}, {"loss": 0.7474, "grad_norm": 0.6331129670143127, "learning_rate": 0.0002, "epoch": 1.508366340635357, "step": 9330}, {"loss": 0.7074, "grad_norm": 0.7690958380699158, "learning_rate": 0.0002, "epoch": 1.5099830248161021, "step": 9340}, {"loss": 0.672, "grad_norm": 0.6548877358436584, "learning_rate": 0.0002, "epoch": 1.5115997089968474, "step": 9350}, {"loss": 0.7408, "grad_norm": 0.6545143127441406, "learning_rate": 0.0002, "epoch": 1.5132163931775926, "step": 9360}, {"loss": 0.7432, "grad_norm": 0.553247332572937, "learning_rate": 0.0002, "epoch": 1.514833077358338, "step": 9370}, {"loss": 0.7265, "grad_norm": 0.8145074844360352, "learning_rate": 0.0002, "epoch": 1.5164497615390833, "step": 9380}, {"loss": 0.7379, "grad_norm": 0.7636994123458862, "learning_rate": 0.0002, "epoch": 1.5180664457198287, "step": 9390}, {"loss": 0.7413, "grad_norm": 0.6838982701301575, "learning_rate": 0.0002, "epoch": 1.519683129900574, "step": 9400}, {"loss": 0.7367, "grad_norm": 0.8599441647529602, "learning_rate": 0.0002, "epoch": 1.5212998140813192, "step": 9410}, {"loss": 0.7663, "grad_norm": 0.7020329833030701, "learning_rate": 0.0002, "epoch": 1.5229164982620644, "step": 9420}, {"loss": 0.7928, "grad_norm": 0.6964772343635559, "learning_rate": 0.0002, "epoch": 1.5245331824428097, "step": 9430}, {"loss": 0.7168, "grad_norm": 0.6916600465774536, "learning_rate": 0.0002, "epoch": 1.5261498666235551, "step": 9440}, {"loss": 0.7519, "grad_norm": 0.7282621264457703, "learning_rate": 0.0002, "epoch": 1.5277665508043003, "step": 9450}, {"loss": 0.7628, "grad_norm": 0.5363983511924744, "learning_rate": 0.0002, "epoch": 1.5293832349850458, "step": 9460}, {"loss": 0.7154, "grad_norm": 0.6184861063957214, "learning_rate": 0.0002, "epoch": 1.530999919165791, "step": 9470}, {"loss": 0.7837, "grad_norm": 0.5991285443305969, "learning_rate": 0.0002, "epoch": 1.5326166033465363, "step": 9480}, {"loss": 0.7827, "grad_norm": 0.8176587820053101, "learning_rate": 0.0002, "epoch": 1.5342332875272815, "step": 9490}, {"loss": 0.7415, "grad_norm": 0.6473721861839294, "learning_rate": 0.0002, "epoch": 1.5358499717080267, "step": 9500}, {"loss": 0.7632, "grad_norm": 0.7319952845573425, "learning_rate": 0.0002, "epoch": 1.5374666558887722, "step": 9510}, {"loss": 0.7706, "grad_norm": 0.702900230884552, "learning_rate": 0.0002, "epoch": 1.5390833400695174, "step": 9520}, {"loss": 0.7754, "grad_norm": 0.7971600294113159, "learning_rate": 0.0002, "epoch": 1.5407000242502629, "step": 9530}, {"loss": 0.7352, "grad_norm": 0.6527525186538696, "learning_rate": 0.0002, "epoch": 1.542316708431008, "step": 9540}, {"loss": 0.7425, "grad_norm": 0.5791676044464111, "learning_rate": 0.0002, "epoch": 1.5439333926117533, "step": 9550}, {"loss": 0.7585, "grad_norm": 0.5619390606880188, "learning_rate": 0.0002, "epoch": 1.5455500767924986, "step": 9560}, {"loss": 0.7894, "grad_norm": 0.5701689124107361, "learning_rate": 0.0002, "epoch": 1.5471667609732438, "step": 9570}, {"loss": 0.793, "grad_norm": 0.47549352049827576, "learning_rate": 0.0002, "epoch": 1.548783445153989, "step": 9580}, {"loss": 0.7276, "grad_norm": 0.8730611205101013, "learning_rate": 0.0002, "epoch": 1.5504001293347345, "step": 9590}, {"loss": 0.798, "grad_norm": 0.6842091083526611, "learning_rate": 0.0002, "epoch": 1.5520168135154797, "step": 9600}, {"loss": 0.7528, "grad_norm": 0.6675129532814026, "learning_rate": 0.0002, "epoch": 1.5536334976962252, "step": 9610}, {"loss": 0.7954, "grad_norm": 0.8173956274986267, "learning_rate": 0.0002, "epoch": 1.5552501818769704, "step": 9620}, {"loss": 0.7535, "grad_norm": 0.724947452545166, "learning_rate": 0.0002, "epoch": 1.5568668660577156, "step": 9630}, {"loss": 0.7738, "grad_norm": 0.6154758930206299, "learning_rate": 0.0002, "epoch": 1.5584835502384609, "step": 9640}, {"loss": 0.7568, "grad_norm": 0.6072008013725281, "learning_rate": 0.0002, "epoch": 1.560100234419206, "step": 9650}, {"loss": 0.7219, "grad_norm": 0.659010648727417, "learning_rate": 0.0002, "epoch": 1.5617169185999515, "step": 9660}, {"loss": 0.673, "grad_norm": 0.65857994556427, "learning_rate": 0.0002, "epoch": 1.5633336027806968, "step": 9670}, {"loss": 0.7156, "grad_norm": 0.5914267301559448, "learning_rate": 0.0002, "epoch": 1.5649502869614422, "step": 9680}, {"loss": 0.7414, "grad_norm": 0.6248020529747009, "learning_rate": 0.0002, "epoch": 1.5665669711421875, "step": 9690}, {"loss": 0.694, "grad_norm": 0.7147795557975769, "learning_rate": 0.0002, "epoch": 1.5681836553229327, "step": 9700}, {"loss": 0.7335, "grad_norm": 0.7076232433319092, "learning_rate": 0.0002, "epoch": 1.569800339503678, "step": 9710}, {"loss": 0.7413, "grad_norm": 0.6217400431632996, "learning_rate": 0.0002, "epoch": 1.5714170236844232, "step": 9720}, {"loss": 0.7296, "grad_norm": 0.6709911227226257, "learning_rate": 0.0002, "epoch": 1.5730337078651684, "step": 9730}, {"loss": 0.7306, "grad_norm": 0.749171257019043, "learning_rate": 0.0002, "epoch": 1.5746503920459138, "step": 9740}, {"loss": 0.7242, "grad_norm": 0.6241145730018616, "learning_rate": 0.0002, "epoch": 1.576267076226659, "step": 9750}, {"loss": 0.7384, "grad_norm": 0.4960934817790985, "learning_rate": 0.0002, "epoch": 1.5778837604074045, "step": 9760}, {"loss": 0.725, "grad_norm": 0.6593309640884399, "learning_rate": 0.0002, "epoch": 1.5795004445881498, "step": 9770}, {"loss": 0.7531, "grad_norm": 0.5814042091369629, "learning_rate": 0.0002, "epoch": 1.581117128768895, "step": 9780}, {"loss": 0.7109, "grad_norm": 0.5936070680618286, "learning_rate": 0.0002, "epoch": 1.5827338129496402, "step": 9790}, {"loss": 0.7769, "grad_norm": 0.6454403400421143, "learning_rate": 0.0002, "epoch": 1.5843504971303854, "step": 9800}, {"loss": 0.7677, "grad_norm": 0.7612107992172241, "learning_rate": 0.0002, "epoch": 1.585967181311131, "step": 9810}, {"loss": 0.7649, "grad_norm": 0.6494482755661011, "learning_rate": 0.0002, "epoch": 1.5875838654918761, "step": 9820}, {"loss": 0.7569, "grad_norm": 0.7825694680213928, "learning_rate": 0.0002, "epoch": 1.5892005496726216, "step": 9830}, {"loss": 0.706, "grad_norm": 0.6757757663726807, "learning_rate": 0.0002, "epoch": 1.5908172338533668, "step": 9840}, {"loss": 0.7803, "grad_norm": 0.7105609178543091, "learning_rate": 0.0002, "epoch": 1.592433918034112, "step": 9850}, {"loss": 0.7925, "grad_norm": 0.7596991062164307, "learning_rate": 0.0002, "epoch": 1.5940506022148573, "step": 9860}, {"loss": 0.7108, "grad_norm": 0.5681525468826294, "learning_rate": 0.0002, "epoch": 1.5956672863956025, "step": 9870}, {"loss": 0.7811, "grad_norm": 0.6090980768203735, "learning_rate": 0.0002, "epoch": 1.5972839705763477, "step": 9880}, {"loss": 0.7339, "grad_norm": 0.6271613240242004, "learning_rate": 0.0002, "epoch": 1.5989006547570932, "step": 9890}, {"loss": 0.7419, "grad_norm": 0.7656369805335999, "learning_rate": 0.0002, "epoch": 1.6005173389378387, "step": 9900}, {"loss": 0.7336, "grad_norm": 0.7504446506500244, "learning_rate": 0.0002, "epoch": 1.6021340231185839, "step": 9910}, {"loss": 0.7479, "grad_norm": 0.659656286239624, "learning_rate": 0.0002, "epoch": 1.6037507072993291, "step": 9920}, {"loss": 0.7483, "grad_norm": 0.6006826162338257, "learning_rate": 0.0002, "epoch": 1.6053673914800743, "step": 9930}, {"loss": 0.732, "grad_norm": 0.7872757911682129, "learning_rate": 0.0002, "epoch": 1.6069840756608196, "step": 9940}, {"loss": 0.768, "grad_norm": 0.5545852780342102, "learning_rate": 0.0002, "epoch": 1.6086007598415648, "step": 9950}, {"loss": 0.8064, "grad_norm": 0.7429468631744385, "learning_rate": 0.0002, "epoch": 1.6102174440223103, "step": 9960}, {"loss": 0.714, "grad_norm": 0.6873556971549988, "learning_rate": 0.0002, "epoch": 1.6118341282030555, "step": 9970}, {"loss": 0.7324, "grad_norm": 0.5874287486076355, "learning_rate": 0.0002, "epoch": 1.613450812383801, "step": 9980}, {"loss": 0.7141, "grad_norm": 0.6039386987686157, "learning_rate": 0.0002, "epoch": 1.6150674965645462, "step": 9990}, {"loss": 0.6674, "grad_norm": 0.6233575940132141, "learning_rate": 0.0002, "epoch": 1.6166841807452914, "step": 10000}, {"loss": 0.7602, "grad_norm": 0.7676448225975037, "learning_rate": 0.0002, "epoch": 1.6183008649260366, "step": 10010}, {"loss": 0.7784, "grad_norm": 0.6565698385238647, "learning_rate": 0.0002, "epoch": 1.6199175491067819, "step": 10020}, {"loss": 0.7104, "grad_norm": 0.6787590384483337, "learning_rate": 0.0002, "epoch": 1.6215342332875273, "step": 10030}, {"loss": 0.7464, "grad_norm": 0.6137678027153015, "learning_rate": 0.0002, "epoch": 1.6231509174682726, "step": 10040}, {"loss": 0.7646, "grad_norm": 0.5236800312995911, "learning_rate": 0.0002, "epoch": 1.624767601649018, "step": 10050}, {"loss": 0.7437, "grad_norm": 0.7626367807388306, "learning_rate": 0.0002, "epoch": 1.6263842858297632, "step": 10060}, {"loss": 0.7273, "grad_norm": 0.5657260417938232, "learning_rate": 0.0002, "epoch": 1.6280009700105085, "step": 10070}, {"loss": 0.7354, "grad_norm": 0.4913991391658783, "learning_rate": 0.0002, "epoch": 1.6296176541912537, "step": 10080}, {"loss": 0.7596, "grad_norm": 0.7715556621551514, "learning_rate": 0.0002, "epoch": 1.631234338371999, "step": 10090}, {"loss": 0.7105, "grad_norm": 0.6509000062942505, "learning_rate": 0.0002, "epoch": 1.6328510225527442, "step": 10100}, {"loss": 0.7274, "grad_norm": 0.6215850114822388, "learning_rate": 0.0002, "epoch": 1.6344677067334896, "step": 10110}, {"loss": 0.7705, "grad_norm": 0.6956844329833984, "learning_rate": 0.0002, "epoch": 1.6360843909142349, "step": 10120}, {"loss": 0.7129, "grad_norm": 0.6111597418785095, "learning_rate": 0.0002, "epoch": 1.6377010750949803, "step": 10130}, {"loss": 0.6955, "grad_norm": 0.6518288850784302, "learning_rate": 0.0002, "epoch": 1.6393177592757255, "step": 10140}, {"loss": 0.731, "grad_norm": 0.6914522051811218, "learning_rate": 0.0002, "epoch": 1.6409344434564708, "step": 10150}, {"loss": 0.7295, "grad_norm": 0.63785719871521, "learning_rate": 0.0002, "epoch": 1.642551127637216, "step": 10160}, {"loss": 0.7355, "grad_norm": 0.6379287838935852, "learning_rate": 0.0002, "epoch": 1.6441678118179612, "step": 10170}, {"loss": 0.7359, "grad_norm": 0.6793403029441833, "learning_rate": 0.0002, "epoch": 1.6457844959987067, "step": 10180}, {"loss": 0.7402, "grad_norm": 0.6099132895469666, "learning_rate": 0.0002, "epoch": 1.647401180179452, "step": 10190}, {"loss": 0.7353, "grad_norm": 0.5869854092597961, "learning_rate": 0.0002, "epoch": 1.6490178643601974, "step": 10200}, {"loss": 0.8308, "grad_norm": 0.7716999053955078, "learning_rate": 0.0002, "epoch": 1.6506345485409426, "step": 10210}, {"loss": 0.7215, "grad_norm": 0.6854110360145569, "learning_rate": 0.0002, "epoch": 1.6522512327216878, "step": 10220}, {"loss": 0.782, "grad_norm": 0.6957170367240906, "learning_rate": 0.0002, "epoch": 1.653867916902433, "step": 10230}, {"loss": 0.7282, "grad_norm": 0.6932903528213501, "learning_rate": 0.0002, "epoch": 1.6554846010831783, "step": 10240}, {"loss": 0.7478, "grad_norm": 0.7713165283203125, "learning_rate": 0.0002, "epoch": 1.6571012852639235, "step": 10250}, {"loss": 0.7099, "grad_norm": 0.7455793619155884, "learning_rate": 0.0002, "epoch": 1.658717969444669, "step": 10260}, {"loss": 0.7524, "grad_norm": 0.5464168190956116, "learning_rate": 0.0002, "epoch": 1.6603346536254144, "step": 10270}, {"loss": 0.7328, "grad_norm": 0.6782926321029663, "learning_rate": 0.0002, "epoch": 1.6619513378061597, "step": 10280}, {"loss": 0.7801, "grad_norm": 0.7962649464607239, "learning_rate": 0.0002, "epoch": 1.663568021986905, "step": 10290}, {"loss": 0.7142, "grad_norm": 0.6814526319503784, "learning_rate": 0.0002, "epoch": 1.6651847061676501, "step": 10300}, {"loss": 0.7285, "grad_norm": 0.656895101070404, "learning_rate": 0.0002, "epoch": 1.6668013903483954, "step": 10310}, {"loss": 0.7358, "grad_norm": 0.6085672378540039, "learning_rate": 0.0002, "epoch": 1.6684180745291406, "step": 10320}, {"loss": 0.7074, "grad_norm": 0.585508406162262, "learning_rate": 0.0002, "epoch": 1.670034758709886, "step": 10330}, {"loss": 0.7604, "grad_norm": 0.6930184364318848, "learning_rate": 0.0002, "epoch": 1.6716514428906313, "step": 10340}, {"loss": 0.7169, "grad_norm": 0.575663149356842, "learning_rate": 0.0002, "epoch": 1.6732681270713767, "step": 10350}, {"loss": 0.7198, "grad_norm": 0.582502543926239, "learning_rate": 0.0002, "epoch": 1.674884811252122, "step": 10360}, {"loss": 0.7793, "grad_norm": 0.5668916702270508, "learning_rate": 0.0002, "epoch": 1.6765014954328672, "step": 10370}, {"loss": 0.7478, "grad_norm": 0.6070065498352051, "learning_rate": 0.0002, "epoch": 1.6781181796136124, "step": 10380}, {"loss": 0.7939, "grad_norm": 0.6141316294670105, "learning_rate": 0.0002, "epoch": 1.6797348637943577, "step": 10390}, {"loss": 0.7573, "grad_norm": 0.8359124064445496, "learning_rate": 0.0002, "epoch": 1.6813515479751031, "step": 10400}, {"loss": 0.7488, "grad_norm": 0.5378185510635376, "learning_rate": 0.0002, "epoch": 1.6829682321558483, "step": 10410}, {"loss": 0.7588, "grad_norm": 0.6959536075592041, "learning_rate": 0.0002, "epoch": 1.6845849163365938, "step": 10420}, {"loss": 0.7872, "grad_norm": 0.6514357328414917, "learning_rate": 0.0002, "epoch": 1.686201600517339, "step": 10430}, {"loss": 0.725, "grad_norm": 0.7706646919250488, "learning_rate": 0.0002, "epoch": 1.6878182846980843, "step": 10440}, {"loss": 0.7673, "grad_norm": 0.6183337569236755, "learning_rate": 0.0002, "epoch": 1.6894349688788295, "step": 10450}, {"loss": 0.7566, "grad_norm": 0.6123278141021729, "learning_rate": 0.0002, "epoch": 1.6910516530595747, "step": 10460}, {"loss": 0.7169, "grad_norm": 0.6894851326942444, "learning_rate": 0.0002, "epoch": 1.69266833724032, "step": 10470}, {"loss": 0.7435, "grad_norm": 0.7497312426567078, "learning_rate": 0.0002, "epoch": 1.6942850214210654, "step": 10480}, {"loss": 0.7544, "grad_norm": 0.5968214273452759, "learning_rate": 0.0002, "epoch": 1.6959017056018106, "step": 10490}, {"loss": 0.6793, "grad_norm": 0.6747927069664001, "learning_rate": 0.0002, "epoch": 1.697518389782556, "step": 10500}, {"loss": 0.7415, "grad_norm": 0.5708310008049011, "learning_rate": 0.0002, "epoch": 1.6991350739633013, "step": 10510}, {"loss": 0.7385, "grad_norm": 0.606526792049408, "learning_rate": 0.0002, "epoch": 1.7007517581440466, "step": 10520}, {"loss": 0.7204, "grad_norm": 0.662011981010437, "learning_rate": 0.0002, "epoch": 1.7023684423247918, "step": 10530}, {"loss": 0.7999, "grad_norm": 0.7583045363426208, "learning_rate": 0.0002, "epoch": 1.703985126505537, "step": 10540}, {"loss": 0.7563, "grad_norm": 0.721632182598114, "learning_rate": 0.0002, "epoch": 1.7056018106862825, "step": 10550}, {"loss": 0.7407, "grad_norm": 0.6107715368270874, "learning_rate": 0.0002, "epoch": 1.7072184948670277, "step": 10560}, {"loss": 0.7519, "grad_norm": 0.6652471423149109, "learning_rate": 0.0002, "epoch": 1.7088351790477732, "step": 10570}, {"loss": 0.7767, "grad_norm": 0.6308087110519409, "learning_rate": 0.0002, "epoch": 1.7104518632285184, "step": 10580}, {"loss": 0.7659, "grad_norm": 0.5464386940002441, "learning_rate": 0.0002, "epoch": 1.7120685474092636, "step": 10590}, {"loss": 0.7063, "grad_norm": 0.6558911204338074, "learning_rate": 0.0002, "epoch": 1.7136852315900089, "step": 10600}, {"loss": 0.7126, "grad_norm": 0.5665024518966675, "learning_rate": 0.0002, "epoch": 1.715301915770754, "step": 10610}, {"loss": 0.6958, "grad_norm": 0.7888094186782837, "learning_rate": 0.0002, "epoch": 1.7169185999514993, "step": 10620}, {"loss": 0.7785, "grad_norm": 0.7084909081459045, "learning_rate": 0.0002, "epoch": 1.7185352841322448, "step": 10630}, {"loss": 0.7557, "grad_norm": 0.7982324361801147, "learning_rate": 0.0002, "epoch": 1.7201519683129902, "step": 10640}, {"loss": 0.7345, "grad_norm": 0.6418732404708862, "learning_rate": 0.0002, "epoch": 1.7217686524937355, "step": 10650}, {"loss": 0.7734, "grad_norm": 0.7636681795120239, "learning_rate": 0.0002, "epoch": 1.7233853366744807, "step": 10660}, {"loss": 0.7541, "grad_norm": 0.5646875500679016, "learning_rate": 0.0002, "epoch": 1.725002020855226, "step": 10670}, {"loss": 0.7642, "grad_norm": 0.5231260657310486, "learning_rate": 0.0002, "epoch": 1.7266187050359711, "step": 10680}, {"loss": 0.7846, "grad_norm": 0.7635011672973633, "learning_rate": 0.0002, "epoch": 1.7282353892167164, "step": 10690}, {"loss": 0.7471, "grad_norm": 0.7518259286880493, "learning_rate": 0.0002, "epoch": 1.7298520733974618, "step": 10700}, {"loss": 0.751, "grad_norm": 0.7295602560043335, "learning_rate": 0.0002, "epoch": 1.731468757578207, "step": 10710}, {"loss": 0.731, "grad_norm": 0.6984632015228271, "learning_rate": 0.0002, "epoch": 1.7330854417589525, "step": 10720}, {"loss": 0.7921, "grad_norm": 0.6198219060897827, "learning_rate": 0.0002, "epoch": 1.7347021259396977, "step": 10730}, {"loss": 0.7642, "grad_norm": 0.6957576274871826, "learning_rate": 0.0002, "epoch": 1.736318810120443, "step": 10740}, {"loss": 0.7917, "grad_norm": 0.6430263519287109, "learning_rate": 0.0002, "epoch": 1.7379354943011882, "step": 10750}, {"loss": 0.7156, "grad_norm": 0.6134995222091675, "learning_rate": 0.0002, "epoch": 1.7395521784819334, "step": 10760}, {"loss": 0.7584, "grad_norm": 0.7209452986717224, "learning_rate": 0.0002, "epoch": 1.741168862662679, "step": 10770}, {"loss": 0.7528, "grad_norm": 0.6735447645187378, "learning_rate": 0.0002, "epoch": 1.7427855468434241, "step": 10780}, {"loss": 0.756, "grad_norm": 0.5605693459510803, "learning_rate": 0.0002, "epoch": 1.7444022310241696, "step": 10790}, {"loss": 0.7759, "grad_norm": 0.6882363557815552, "learning_rate": 0.0002, "epoch": 1.7460189152049148, "step": 10800}, {"loss": 0.7544, "grad_norm": 0.6386259198188782, "learning_rate": 0.0002, "epoch": 1.74763559938566, "step": 10810}, {"loss": 0.7697, "grad_norm": 0.6529015302658081, "learning_rate": 0.0002, "epoch": 1.7492522835664053, "step": 10820}, {"loss": 0.7219, "grad_norm": 0.5664082765579224, "learning_rate": 0.0002, "epoch": 1.7508689677471505, "step": 10830}, {"loss": 0.7586, "grad_norm": 0.7532684206962585, "learning_rate": 0.0002, "epoch": 1.7524856519278957, "step": 10840}, {"loss": 0.6919, "grad_norm": 0.77171391248703, "learning_rate": 0.0002, "epoch": 1.7541023361086412, "step": 10850}, {"loss": 0.785, "grad_norm": 0.7255431413650513, "learning_rate": 0.0002, "epoch": 1.7557190202893864, "step": 10860}, {"loss": 0.7458, "grad_norm": 0.763083279132843, "learning_rate": 0.0002, "epoch": 1.7573357044701319, "step": 10870}, {"loss": 0.7846, "grad_norm": 0.6042402982711792, "learning_rate": 0.0002, "epoch": 1.758952388650877, "step": 10880}, {"loss": 0.7027, "grad_norm": 0.7642518281936646, "learning_rate": 0.0002, "epoch": 1.7605690728316223, "step": 10890}, {"loss": 0.746, "grad_norm": 0.6347904801368713, "learning_rate": 0.0002, "epoch": 1.7621857570123676, "step": 10900}, {"loss": 0.7458, "grad_norm": 0.5371627807617188, "learning_rate": 0.0002, "epoch": 1.7638024411931128, "step": 10910}, {"loss": 0.7466, "grad_norm": 0.6840225458145142, "learning_rate": 0.0002, "epoch": 1.7654191253738583, "step": 10920}, {"loss": 0.725, "grad_norm": 0.5288469195365906, "learning_rate": 0.0002, "epoch": 1.7670358095546035, "step": 10930}, {"loss": 0.7863, "grad_norm": 0.69020676612854, "learning_rate": 0.0002, "epoch": 1.768652493735349, "step": 10940}, {"loss": 0.7468, "grad_norm": 0.5943242311477661, "learning_rate": 0.0002, "epoch": 1.7702691779160942, "step": 10950}, {"loss": 0.7244, "grad_norm": 0.5616418123245239, "learning_rate": 0.0002, "epoch": 1.7718858620968394, "step": 10960}, {"loss": 0.7137, "grad_norm": 0.7209470868110657, "learning_rate": 0.0002, "epoch": 1.7735025462775846, "step": 10970}, {"loss": 0.7459, "grad_norm": 0.6657957434654236, "learning_rate": 0.0002, "epoch": 1.7751192304583299, "step": 10980}, {"loss": 0.7076, "grad_norm": 0.6469064950942993, "learning_rate": 0.0002, "epoch": 1.776735914639075, "step": 10990}, {"loss": 0.7321, "grad_norm": 0.6615678071975708, "learning_rate": 0.0002, "epoch": 1.7783525988198206, "step": 11000}, {"loss": 0.747, "grad_norm": 0.6722439527511597, "learning_rate": 0.0002, "epoch": 1.779969283000566, "step": 11010}, {"loss": 0.7302, "grad_norm": 0.634136974811554, "learning_rate": 0.0002, "epoch": 1.7815859671813112, "step": 11020}, {"loss": 0.8105, "grad_norm": 0.6024377346038818, "learning_rate": 0.0002, "epoch": 1.7832026513620565, "step": 11030}, {"loss": 0.7855, "grad_norm": 0.6909403800964355, "learning_rate": 0.0002, "epoch": 1.7848193355428017, "step": 11040}, {"loss": 0.7471, "grad_norm": 0.7148767709732056, "learning_rate": 0.0002, "epoch": 1.786436019723547, "step": 11050}, {"loss": 0.7145, "grad_norm": 0.7442979216575623, "learning_rate": 0.0002, "epoch": 1.7880527039042922, "step": 11060}, {"loss": 0.7215, "grad_norm": 0.6830431818962097, "learning_rate": 0.0002, "epoch": 1.7896693880850376, "step": 11070}, {"loss": 0.7625, "grad_norm": 0.9172667264938354, "learning_rate": 0.0002, "epoch": 1.7912860722657828, "step": 11080}, {"loss": 0.76, "grad_norm": 0.6799490451812744, "learning_rate": 0.0002, "epoch": 1.7929027564465283, "step": 11090}, {"loss": 0.7716, "grad_norm": 0.7617024779319763, "learning_rate": 0.0002, "epoch": 1.7945194406272735, "step": 11100}, {"loss": 0.7586, "grad_norm": 0.7701810002326965, "learning_rate": 0.0002, "epoch": 1.7961361248080188, "step": 11110}, {"loss": 0.7843, "grad_norm": 0.7454385757446289, "learning_rate": 0.0002, "epoch": 1.797752808988764, "step": 11120}, {"loss": 0.7873, "grad_norm": 0.6121436953544617, "learning_rate": 0.0002, "epoch": 1.7993694931695092, "step": 11130}, {"loss": 0.7305, "grad_norm": 0.6237571835517883, "learning_rate": 0.0002, "epoch": 1.8009861773502547, "step": 11140}, {"loss": 0.6827, "grad_norm": 0.6818515658378601, "learning_rate": 0.0002, "epoch": 1.802602861531, "step": 11150}, {"loss": 0.6876, "grad_norm": 0.7768308520317078, "learning_rate": 0.0002, "epoch": 1.8042195457117454, "step": 11160}, {"loss": 0.7533, "grad_norm": 0.6875537633895874, "learning_rate": 0.0002, "epoch": 1.8058362298924906, "step": 11170}, {"loss": 0.761, "grad_norm": 0.7950584888458252, "learning_rate": 0.0002, "epoch": 1.8074529140732358, "step": 11180}, {"loss": 0.7623, "grad_norm": 0.8210248351097107, "learning_rate": 0.0002, "epoch": 1.809069598253981, "step": 11190}, {"loss": 0.7556, "grad_norm": 0.6674110889434814, "learning_rate": 0.0002, "epoch": 1.8106862824347263, "step": 11200}, {"loss": 0.7663, "grad_norm": 0.6261674761772156, "learning_rate": 0.0002, "epoch": 1.8123029666154715, "step": 11210}, {"loss": 0.7122, "grad_norm": 0.6484741568565369, "learning_rate": 0.0002, "epoch": 1.813919650796217, "step": 11220}, {"loss": 0.7718, "grad_norm": 0.6231244206428528, "learning_rate": 0.0002, "epoch": 1.8155363349769622, "step": 11230}, {"loss": 0.7152, "grad_norm": 0.7243146896362305, "learning_rate": 0.0002, "epoch": 1.8171530191577077, "step": 11240}, {"loss": 0.7448, "grad_norm": 0.6776193380355835, "learning_rate": 0.0002, "epoch": 1.818769703338453, "step": 11250}, {"loss": 0.7317, "grad_norm": 0.5973618030548096, "learning_rate": 0.0002, "epoch": 1.8203863875191981, "step": 11260}, {"loss": 0.7961, "grad_norm": 0.6451361179351807, "learning_rate": 0.0002, "epoch": 1.8220030716999434, "step": 11270}, {"loss": 0.7611, "grad_norm": 0.5963068008422852, "learning_rate": 0.0002, "epoch": 1.8236197558806886, "step": 11280}, {"loss": 0.7466, "grad_norm": 0.536902129650116, "learning_rate": 0.0002, "epoch": 1.825236440061434, "step": 11290}, {"loss": 0.708, "grad_norm": 0.6993787288665771, "learning_rate": 0.0002, "epoch": 1.8268531242421793, "step": 11300}, {"loss": 0.7153, "grad_norm": 0.6135255098342896, "learning_rate": 0.0002, "epoch": 1.8284698084229247, "step": 11310}, {"loss": 0.7423, "grad_norm": 0.6057423949241638, "learning_rate": 0.0002, "epoch": 1.83008649260367, "step": 11320}, {"loss": 0.735, "grad_norm": 0.6598812341690063, "learning_rate": 0.0002, "epoch": 1.8317031767844152, "step": 11330}, {"loss": 0.7278, "grad_norm": 0.6075948476791382, "learning_rate": 0.0002, "epoch": 1.8333198609651604, "step": 11340}, {"loss": 0.7846, "grad_norm": 0.7065447568893433, "learning_rate": 0.0002, "epoch": 1.8349365451459057, "step": 11350}, {"loss": 0.7365, "grad_norm": 0.680526614189148, "learning_rate": 0.0002, "epoch": 1.8365532293266509, "step": 11360}, {"loss": 0.7152, "grad_norm": 0.6356695294380188, "learning_rate": 0.0002, "epoch": 1.8381699135073963, "step": 11370}, {"loss": 0.721, "grad_norm": 0.6399052143096924, "learning_rate": 0.0002, "epoch": 1.8397865976881416, "step": 11380}, {"loss": 0.7618, "grad_norm": 0.6125704050064087, "learning_rate": 0.0002, "epoch": 1.841403281868887, "step": 11390}, {"loss": 0.755, "grad_norm": 0.7124643325805664, "learning_rate": 0.0002, "epoch": 1.8430199660496323, "step": 11400}, {"loss": 0.7972, "grad_norm": 0.6099604964256287, "learning_rate": 0.0002, "epoch": 1.8446366502303775, "step": 11410}, {"loss": 0.7187, "grad_norm": 0.7338208556175232, "learning_rate": 0.0002, "epoch": 1.8462533344111227, "step": 11420}, {"loss": 0.7007, "grad_norm": 0.7534668445587158, "learning_rate": 0.0002, "epoch": 1.847870018591868, "step": 11430}, {"loss": 0.7464, "grad_norm": 0.6135470271110535, "learning_rate": 0.0002, "epoch": 1.8494867027726134, "step": 11440}, {"loss": 0.7955, "grad_norm": 0.6229309439659119, "learning_rate": 0.0002, "epoch": 1.8511033869533586, "step": 11450}, {"loss": 0.7594, "grad_norm": 0.706423282623291, "learning_rate": 0.0002, "epoch": 1.852720071134104, "step": 11460}, {"loss": 0.7411, "grad_norm": 0.5460049510002136, "learning_rate": 0.0002, "epoch": 1.8543367553148493, "step": 11470}, {"loss": 0.7416, "grad_norm": 0.6616711020469666, "learning_rate": 0.0002, "epoch": 1.8559534394955945, "step": 11480}, {"loss": 0.729, "grad_norm": 0.6372783184051514, "learning_rate": 0.0002, "epoch": 1.8575701236763398, "step": 11490}, {"loss": 0.7333, "grad_norm": 0.7162668108940125, "learning_rate": 0.0002, "epoch": 1.859186807857085, "step": 11500}, {"loss": 0.7747, "grad_norm": 0.6605209708213806, "learning_rate": 0.0002, "epoch": 1.8608034920378305, "step": 11510}, {"loss": 0.7258, "grad_norm": 0.6933956742286682, "learning_rate": 0.0002, "epoch": 1.8624201762185757, "step": 11520}, {"loss": 0.7243, "grad_norm": 0.6582090854644775, "learning_rate": 0.0002, "epoch": 1.8640368603993211, "step": 11530}, {"loss": 0.7313, "grad_norm": 0.6416500806808472, "learning_rate": 0.0002, "epoch": 1.8656535445800664, "step": 11540}, {"loss": 0.7372, "grad_norm": 0.5434312224388123, "learning_rate": 0.0002, "epoch": 1.8672702287608116, "step": 11550}, {"loss": 0.7635, "grad_norm": 0.6827567219734192, "learning_rate": 0.0002, "epoch": 1.8688869129415568, "step": 11560}, {"loss": 0.7137, "grad_norm": 0.7354370951652527, "learning_rate": 0.0002, "epoch": 1.870503597122302, "step": 11570}, {"loss": 0.7526, "grad_norm": 0.590372622013092, "learning_rate": 0.0002, "epoch": 1.8721202813030473, "step": 11580}, {"loss": 0.731, "grad_norm": 0.853183925151825, "learning_rate": 0.0002, "epoch": 1.8737369654837928, "step": 11590}, {"loss": 0.7487, "grad_norm": 0.822678804397583, "learning_rate": 0.0002, "epoch": 1.875353649664538, "step": 11600}, {"loss": 0.7427, "grad_norm": 0.6591550707817078, "learning_rate": 0.0002, "epoch": 1.8769703338452834, "step": 11610}, {"loss": 0.7054, "grad_norm": 0.7475301623344421, "learning_rate": 0.0002, "epoch": 1.8785870180260287, "step": 11620}, {"loss": 0.811, "grad_norm": 0.6390765309333801, "learning_rate": 0.0002, "epoch": 1.880203702206774, "step": 11630}, {"loss": 0.7531, "grad_norm": 0.6589758992195129, "learning_rate": 0.0002, "epoch": 1.8818203863875191, "step": 11640}, {"loss": 0.7475, "grad_norm": 0.6765508651733398, "learning_rate": 0.0002, "epoch": 1.8834370705682644, "step": 11650}, {"loss": 0.738, "grad_norm": 0.6527857780456543, "learning_rate": 0.0002, "epoch": 1.8850537547490098, "step": 11660}, {"loss": 0.7504, "grad_norm": 0.6642923951148987, "learning_rate": 0.0002, "epoch": 1.886670438929755, "step": 11670}, {"loss": 0.7701, "grad_norm": 0.6945584416389465, "learning_rate": 0.0002, "epoch": 1.8882871231105005, "step": 11680}, {"loss": 0.7711, "grad_norm": 0.694018542766571, "learning_rate": 0.0002, "epoch": 1.8899038072912457, "step": 11690}, {"loss": 0.7195, "grad_norm": 0.7237417101860046, "learning_rate": 0.0002, "epoch": 1.891520491471991, "step": 11700}, {"loss": 0.7491, "grad_norm": 0.7401309609413147, "learning_rate": 0.0002, "epoch": 1.8931371756527362, "step": 11710}, {"loss": 0.805, "grad_norm": 0.6537784337997437, "learning_rate": 0.0002, "epoch": 1.8947538598334814, "step": 11720}, {"loss": 0.793, "grad_norm": 0.7398539185523987, "learning_rate": 0.0002, "epoch": 1.8963705440142267, "step": 11730}, {"loss": 0.7561, "grad_norm": 0.6696075797080994, "learning_rate": 0.0002, "epoch": 1.8979872281949721, "step": 11740}, {"loss": 0.7353, "grad_norm": 0.6014142036437988, "learning_rate": 0.0002, "epoch": 1.8996039123757174, "step": 11750}, {"loss": 0.7714, "grad_norm": 0.7023524641990662, "learning_rate": 0.0002, "epoch": 1.9012205965564628, "step": 11760}, {"loss": 0.7088, "grad_norm": 0.739973783493042, "learning_rate": 0.0002, "epoch": 1.902837280737208, "step": 11770}, {"loss": 0.7848, "grad_norm": 0.5576770901679993, "learning_rate": 0.0002, "epoch": 1.9044539649179533, "step": 11780}, {"loss": 0.7483, "grad_norm": 0.6907393932342529, "learning_rate": 0.0002, "epoch": 1.9060706490986985, "step": 11790}, {"loss": 0.7827, "grad_norm": 0.6934581995010376, "learning_rate": 0.0002, "epoch": 1.9076873332794437, "step": 11800}, {"loss": 0.7199, "grad_norm": 0.591774582862854, "learning_rate": 0.0002, "epoch": 1.9093040174601892, "step": 11810}, {"loss": 0.7333, "grad_norm": 0.6249791383743286, "learning_rate": 0.0002, "epoch": 1.9109207016409344, "step": 11820}, {"loss": 0.7581, "grad_norm": 0.6755744218826294, "learning_rate": 0.0002, "epoch": 1.9125373858216799, "step": 11830}, {"loss": 0.696, "grad_norm": 0.7286285161972046, "learning_rate": 0.0002, "epoch": 1.914154070002425, "step": 11840}, {"loss": 0.7509, "grad_norm": 0.7867850065231323, "learning_rate": 0.0002, "epoch": 1.9157707541831703, "step": 11850}, {"loss": 0.735, "grad_norm": 0.6283972859382629, "learning_rate": 0.0002, "epoch": 1.9173874383639156, "step": 11860}, {"loss": 0.7296, "grad_norm": 0.605823814868927, "learning_rate": 0.0002, "epoch": 1.9190041225446608, "step": 11870}, {"loss": 0.6598, "grad_norm": 0.5927976965904236, "learning_rate": 0.0002, "epoch": 1.920620806725406, "step": 11880}, {"loss": 0.7649, "grad_norm": 0.5974002480506897, "learning_rate": 0.0002, "epoch": 1.9222374909061515, "step": 11890}, {"loss": 0.7843, "grad_norm": 0.7091866135597229, "learning_rate": 0.0002, "epoch": 1.923854175086897, "step": 11900}, {"loss": 0.775, "grad_norm": 0.72496497631073, "learning_rate": 0.0002, "epoch": 1.9254708592676422, "step": 11910}, {"loss": 0.7153, "grad_norm": 0.6131896376609802, "learning_rate": 0.0002, "epoch": 1.9270875434483874, "step": 11920}, {"loss": 0.7228, "grad_norm": 0.6556436419487, "learning_rate": 0.0002, "epoch": 1.9287042276291326, "step": 11930}, {"loss": 0.7319, "grad_norm": 0.622932493686676, "learning_rate": 0.0002, "epoch": 1.9303209118098779, "step": 11940}, {"loss": 0.7592, "grad_norm": 0.6618631482124329, "learning_rate": 0.0002, "epoch": 1.931937595990623, "step": 11950}, {"loss": 0.8332, "grad_norm": 0.630966305732727, "learning_rate": 0.0002, "epoch": 1.9335542801713685, "step": 11960}, {"loss": 0.6854, "grad_norm": 0.6336734890937805, "learning_rate": 0.0002, "epoch": 1.9351709643521138, "step": 11970}, {"loss": 0.7433, "grad_norm": 0.655403196811676, "learning_rate": 0.0002, "epoch": 1.9367876485328592, "step": 11980}, {"loss": 0.7282, "grad_norm": 0.5640574097633362, "learning_rate": 0.0002, "epoch": 1.9384043327136045, "step": 11990}, {"loss": 0.7289, "grad_norm": 0.6322951316833496, "learning_rate": 0.0002, "epoch": 1.9400210168943497, "step": 12000}, {"loss": 0.7627, "grad_norm": 0.615703821182251, "learning_rate": 0.0002, "epoch": 1.941637701075095, "step": 12010}, {"loss": 0.786, "grad_norm": 0.6487536430358887, "learning_rate": 0.0002, "epoch": 1.9432543852558402, "step": 12020}, {"loss": 0.7435, "grad_norm": 0.9209630489349365, "learning_rate": 0.0002, "epoch": 1.9448710694365856, "step": 12030}, {"loss": 0.7274, "grad_norm": 0.67485511302948, "learning_rate": 0.0002, "epoch": 1.9464877536173308, "step": 12040}, {"loss": 0.7551, "grad_norm": 0.6831230521202087, "learning_rate": 0.0002, "epoch": 1.9481044377980763, "step": 12050}, {"loss": 0.7546, "grad_norm": 0.6578302383422852, "learning_rate": 0.0002, "epoch": 1.9497211219788215, "step": 12060}, {"loss": 0.6989, "grad_norm": 0.9975938200950623, "learning_rate": 0.0002, "epoch": 1.9513378061595668, "step": 12070}, {"loss": 0.7952, "grad_norm": 0.6637365221977234, "learning_rate": 0.0002, "epoch": 1.952954490340312, "step": 12080}, {"loss": 0.7482, "grad_norm": 0.605707049369812, "learning_rate": 0.0002, "epoch": 1.9545711745210572, "step": 12090}, {"loss": 0.7768, "grad_norm": 0.6584440469741821, "learning_rate": 0.0002, "epoch": 1.9561878587018025, "step": 12100}, {"loss": 0.7187, "grad_norm": 0.6070835590362549, "learning_rate": 0.0002, "epoch": 1.957804542882548, "step": 12110}, {"loss": 0.7491, "grad_norm": 0.7862601280212402, "learning_rate": 0.0002, "epoch": 1.9594212270632931, "step": 12120}, {"loss": 0.7972, "grad_norm": 0.8175255060195923, "learning_rate": 0.0002, "epoch": 1.9610379112440386, "step": 12130}, {"loss": 0.7242, "grad_norm": 0.5648472905158997, "learning_rate": 0.0002, "epoch": 1.9626545954247838, "step": 12140}, {"loss": 0.7321, "grad_norm": 0.6591973304748535, "learning_rate": 0.0002, "epoch": 1.964271279605529, "step": 12150}, {"loss": 0.739, "grad_norm": 0.5960676074028015, "learning_rate": 0.0002, "epoch": 1.9658879637862743, "step": 12160}, {"loss": 0.7254, "grad_norm": 0.7272544503211975, "learning_rate": 0.0002, "epoch": 1.9675046479670195, "step": 12170}, {"loss": 0.7376, "grad_norm": 0.7176699042320251, "learning_rate": 0.0002, "epoch": 1.969121332147765, "step": 12180}, {"loss": 0.7525, "grad_norm": 0.6927123665809631, "learning_rate": 0.0002, "epoch": 1.9707380163285102, "step": 12190}, {"loss": 0.7318, "grad_norm": 0.5536034107208252, "learning_rate": 0.0002, "epoch": 1.9723547005092557, "step": 12200}, {"loss": 0.7737, "grad_norm": 0.8348390460014343, "learning_rate": 0.0002, "epoch": 1.9739713846900009, "step": 12210}, {"loss": 0.7494, "grad_norm": 0.6591181755065918, "learning_rate": 0.0002, "epoch": 1.9755880688707461, "step": 12220}, {"loss": 0.763, "grad_norm": 1.0624109506607056, "learning_rate": 0.0002, "epoch": 1.9772047530514913, "step": 12230}, {"loss": 0.7541, "grad_norm": 0.9265586137771606, "learning_rate": 0.0002, "epoch": 1.9788214372322366, "step": 12240}, {"loss": 0.7533, "grad_norm": 0.5998196005821228, "learning_rate": 0.0002, "epoch": 1.9804381214129818, "step": 12250}, {"loss": 0.7225, "grad_norm": 0.6960851550102234, "learning_rate": 0.0002, "epoch": 1.9820548055937273, "step": 12260}, {"loss": 0.7398, "grad_norm": 0.7674502730369568, "learning_rate": 0.0002, "epoch": 1.9836714897744727, "step": 12270}, {"loss": 0.7185, "grad_norm": 0.6407275795936584, "learning_rate": 0.0002, "epoch": 1.985288173955218, "step": 12280}, {"loss": 0.7382, "grad_norm": 0.6673079133033752, "learning_rate": 0.0002, "epoch": 1.9869048581359632, "step": 12290}, {"loss": 0.7326, "grad_norm": 0.6989844441413879, "learning_rate": 0.0002, "epoch": 1.9885215423167084, "step": 12300}, {"loss": 0.7559, "grad_norm": 0.7564442157745361, "learning_rate": 0.0002, "epoch": 1.9901382264974536, "step": 12310}, {"loss": 0.7719, "grad_norm": 0.6385478973388672, "learning_rate": 0.0002, "epoch": 1.9917549106781989, "step": 12320}, {"loss": 0.7369, "grad_norm": 0.7193717956542969, "learning_rate": 0.0002, "epoch": 1.9933715948589443, "step": 12330}, {"loss": 0.7583, "grad_norm": 0.7987112402915955, "learning_rate": 0.0002, "epoch": 1.9949882790396896, "step": 12340}, {"loss": 0.7793, "grad_norm": 0.7260826826095581, "learning_rate": 0.0002, "epoch": 1.996604963220435, "step": 12350}, {"loss": 0.7505, "grad_norm": 0.7968255281448364, "learning_rate": 0.0002, "epoch": 1.9982216474011802, "step": 12360}, {"loss": 0.717, "grad_norm": 0.6893062591552734, "learning_rate": 0.0002, "epoch": 1.9998383315819255, "step": 12370}]} +{"epoch": 2.9999191657909625, "step": 18556, "epoch_duration": 16900.518332481384, "total_accumulated_duration": 50651.44822502136, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.6636, "grad_norm": 0.9894065856933594, "learning_rate": 0.0002, "epoch": 0.0016166841807452913, "step": 10}, {"loss": 1.1528, "grad_norm": 1.7810699939727783, "learning_rate": 0.0002, "epoch": 0.0032333683614905826, "step": 20}, {"loss": 0.9767, "grad_norm": 0.5969577431678772, "learning_rate": 0.0002, "epoch": 0.004850052542235874, "step": 30}, {"loss": 0.9772, "grad_norm": 0.6354120969772339, "learning_rate": 0.0002, "epoch": 0.006466736722981165, "step": 40}, {"loss": 0.8643, "grad_norm": 0.5604607462882996, "learning_rate": 0.0002, "epoch": 0.008083420903726457, "step": 50}, {"loss": 0.8841, "grad_norm": 0.4676193594932556, "learning_rate": 0.0002, "epoch": 0.009700105084471748, "step": 60}, {"loss": 0.9022, "grad_norm": 0.6099211573600769, "learning_rate": 0.0002, "epoch": 0.01131678926521704, "step": 70}, {"loss": 0.9133, "grad_norm": 0.48639994859695435, "learning_rate": 0.0002, "epoch": 0.01293347344596233, "step": 80}, {"loss": 0.8704, "grad_norm": 0.4904264509677887, "learning_rate": 0.0002, "epoch": 0.014550157626707623, "step": 90}, {"loss": 0.8855, "grad_norm": 2.8334362506866455, "learning_rate": 0.0002, "epoch": 0.016166841807452915, "step": 100}, {"loss": 0.8958, "grad_norm": 0.43221670389175415, "learning_rate": 0.0002, "epoch": 0.017783525988198205, "step": 110}, {"loss": 0.8412, "grad_norm": 0.42244166135787964, "learning_rate": 0.0002, "epoch": 0.019400210168943496, "step": 120}, {"loss": 0.8467, "grad_norm": 0.45363298058509827, "learning_rate": 0.0002, "epoch": 0.02101689434968879, "step": 130}, {"loss": 0.8641, "grad_norm": 0.44816508889198303, "learning_rate": 0.0002, "epoch": 0.02263357853043408, "step": 140}, {"loss": 0.8496, "grad_norm": 0.43308213353157043, "learning_rate": 0.0002, "epoch": 0.02425026271117937, "step": 150}, {"loss": 0.8213, "grad_norm": 0.4084763526916504, "learning_rate": 0.0002, "epoch": 0.02586694689192466, "step": 160}, {"loss": 0.8343, "grad_norm": 0.5363703966140747, "learning_rate": 0.0002, "epoch": 0.027483631072669955, "step": 170}, {"loss": 0.8558, "grad_norm": 0.4619699716567993, "learning_rate": 0.0002, "epoch": 0.029100315253415245, "step": 180}, {"loss": 0.8878, "grad_norm": 0.49069908261299133, "learning_rate": 0.0002, "epoch": 0.030716999434160536, "step": 190}, {"loss": 0.8867, "grad_norm": 0.4645835757255554, "learning_rate": 0.0002, "epoch": 0.03233368361490583, "step": 200}, {"loss": 0.8842, "grad_norm": 1.2411243915557861, "learning_rate": 0.0002, "epoch": 0.03395036779565112, "step": 210}, {"loss": 0.8245, "grad_norm": 0.5211851596832275, "learning_rate": 0.0002, "epoch": 0.03556705197639641, "step": 220}, {"loss": 0.8194, "grad_norm": 0.5253691673278809, "learning_rate": 0.0002, "epoch": 0.037183736157141704, "step": 230}, {"loss": 0.8856, "grad_norm": 0.4567478895187378, "learning_rate": 0.0002, "epoch": 0.03880042033788699, "step": 240}, {"loss": 0.838, "grad_norm": 0.5472128391265869, "learning_rate": 0.0002, "epoch": 0.040417104518632285, "step": 250}, {"loss": 0.8201, "grad_norm": 0.42978546023368835, "learning_rate": 0.0002, "epoch": 0.04203378869937758, "step": 260}, {"loss": 0.8334, "grad_norm": 0.601734459400177, "learning_rate": 0.0002, "epoch": 0.043650472880122866, "step": 270}, {"loss": 0.815, "grad_norm": 0.4286513328552246, "learning_rate": 0.0002, "epoch": 0.04526715706086816, "step": 280}, {"loss": 0.8758, "grad_norm": 0.5230861902236938, "learning_rate": 0.0002, "epoch": 0.046883841241613454, "step": 290}, {"loss": 0.8636, "grad_norm": 0.6504611968994141, "learning_rate": 0.0002, "epoch": 0.04850052542235874, "step": 300}, {"loss": 0.8102, "grad_norm": 0.43485215306282043, "learning_rate": 0.0002, "epoch": 0.050117209603104035, "step": 310}, {"loss": 0.8221, "grad_norm": 0.4717007875442505, "learning_rate": 0.0002, "epoch": 0.05173389378384932, "step": 320}, {"loss": 0.8469, "grad_norm": 0.4059787690639496, "learning_rate": 0.0002, "epoch": 0.053350577964594616, "step": 330}, {"loss": 0.8866, "grad_norm": 0.4366913437843323, "learning_rate": 0.0002, "epoch": 0.05496726214533991, "step": 340}, {"loss": 0.7976, "grad_norm": 0.4233848452568054, "learning_rate": 0.0002, "epoch": 0.0565839463260852, "step": 350}, {"loss": 0.8456, "grad_norm": 0.4209108352661133, "learning_rate": 0.0002, "epoch": 0.05820063050683049, "step": 360}, {"loss": 0.816, "grad_norm": 0.41637396812438965, "learning_rate": 0.0002, "epoch": 0.059817314687575784, "step": 370}, {"loss": 0.7976, "grad_norm": 0.46235376596450806, "learning_rate": 0.0002, "epoch": 0.06143399886832107, "step": 380}, {"loss": 0.7966, "grad_norm": 0.4013484716415405, "learning_rate": 0.0002, "epoch": 0.06305068304906636, "step": 390}, {"loss": 0.8253, "grad_norm": 0.47443896532058716, "learning_rate": 0.0002, "epoch": 0.06466736722981166, "step": 400}, {"loss": 0.8666, "grad_norm": 0.3942156434059143, "learning_rate": 0.0002, "epoch": 0.06628405141055695, "step": 410}, {"loss": 0.8402, "grad_norm": 0.4965320825576782, "learning_rate": 0.0002, "epoch": 0.06790073559130223, "step": 420}, {"loss": 0.8317, "grad_norm": 0.4304835796356201, "learning_rate": 0.0002, "epoch": 0.06951741977204753, "step": 430}, {"loss": 0.8528, "grad_norm": 0.511726975440979, "learning_rate": 0.0002, "epoch": 0.07113410395279282, "step": 440}, {"loss": 0.8675, "grad_norm": 0.4040689170360565, "learning_rate": 0.0002, "epoch": 0.07275078813353811, "step": 450}, {"loss": 0.8788, "grad_norm": 0.5402171015739441, "learning_rate": 0.0002, "epoch": 0.07436747231428341, "step": 460}, {"loss": 0.8737, "grad_norm": 0.4174517095088959, "learning_rate": 0.0002, "epoch": 0.0759841564950287, "step": 470}, {"loss": 0.7605, "grad_norm": 0.4306182265281677, "learning_rate": 0.0002, "epoch": 0.07760084067577398, "step": 480}, {"loss": 0.799, "grad_norm": 0.535210132598877, "learning_rate": 0.0002, "epoch": 0.07921752485651928, "step": 490}, {"loss": 0.7825, "grad_norm": 0.5339109897613525, "learning_rate": 0.0002, "epoch": 0.08083420903726457, "step": 500}, {"loss": 0.8985, "grad_norm": 0.45754891633987427, "learning_rate": 0.0002, "epoch": 0.08245089321800986, "step": 510}, {"loss": 0.8144, "grad_norm": 0.43820783495903015, "learning_rate": 0.0002, "epoch": 0.08406757739875516, "step": 520}, {"loss": 0.8001, "grad_norm": 0.4434749186038971, "learning_rate": 0.0002, "epoch": 0.08568426157950045, "step": 530}, {"loss": 0.7857, "grad_norm": 0.43111467361450195, "learning_rate": 0.0002, "epoch": 0.08730094576024573, "step": 540}, {"loss": 0.8418, "grad_norm": 0.4378940165042877, "learning_rate": 0.0002, "epoch": 0.08891762994099103, "step": 550}, {"loss": 0.8361, "grad_norm": 0.4772215187549591, "learning_rate": 0.0002, "epoch": 0.09053431412173632, "step": 560}, {"loss": 0.8268, "grad_norm": 0.6837629079818726, "learning_rate": 0.0002, "epoch": 0.09215099830248161, "step": 570}, {"loss": 0.8607, "grad_norm": 0.42241212725639343, "learning_rate": 0.0002, "epoch": 0.09376768248322691, "step": 580}, {"loss": 0.852, "grad_norm": 0.5165936350822449, "learning_rate": 0.0002, "epoch": 0.0953843666639722, "step": 590}, {"loss": 0.8664, "grad_norm": 0.48737478256225586, "learning_rate": 0.0002, "epoch": 0.09700105084471748, "step": 600}, {"loss": 0.8806, "grad_norm": 0.47419852018356323, "learning_rate": 0.0002, "epoch": 0.09861773502546278, "step": 610}, {"loss": 0.8254, "grad_norm": 0.4975486099720001, "learning_rate": 0.0002, "epoch": 0.10023441920620807, "step": 620}, {"loss": 0.8548, "grad_norm": 0.49123844504356384, "learning_rate": 0.0002, "epoch": 0.10185110338695336, "step": 630}, {"loss": 0.8911, "grad_norm": 0.6288952827453613, "learning_rate": 0.0002, "epoch": 0.10346778756769864, "step": 640}, {"loss": 0.827, "grad_norm": 0.4277345836162567, "learning_rate": 0.0002, "epoch": 0.10508447174844394, "step": 650}, {"loss": 0.7996, "grad_norm": 0.4021061956882477, "learning_rate": 0.0002, "epoch": 0.10670115592918923, "step": 660}, {"loss": 0.87, "grad_norm": 0.3492237329483032, "learning_rate": 0.0002, "epoch": 0.10831784010993452, "step": 670}, {"loss": 0.8698, "grad_norm": 0.4341012239456177, "learning_rate": 0.0002, "epoch": 0.10993452429067982, "step": 680}, {"loss": 0.781, "grad_norm": 0.7296304106712341, "learning_rate": 0.0002, "epoch": 0.1115512084714251, "step": 690}, {"loss": 0.8433, "grad_norm": 0.397494912147522, "learning_rate": 0.0002, "epoch": 0.1131678926521704, "step": 700}, {"loss": 0.827, "grad_norm": 0.396431028842926, "learning_rate": 0.0002, "epoch": 0.1147845768329157, "step": 710}, {"loss": 0.8379, "grad_norm": 0.48842838406562805, "learning_rate": 0.0002, "epoch": 0.11640126101366098, "step": 720}, {"loss": 0.8238, "grad_norm": 0.46322616934776306, "learning_rate": 0.0002, "epoch": 0.11801794519440627, "step": 730}, {"loss": 0.8041, "grad_norm": 0.47990912199020386, "learning_rate": 0.0002, "epoch": 0.11963462937515157, "step": 740}, {"loss": 0.82, "grad_norm": 0.4997142255306244, "learning_rate": 0.0002, "epoch": 0.12125131355589686, "step": 750}, {"loss": 0.7702, "grad_norm": 0.4040526747703552, "learning_rate": 0.0002, "epoch": 0.12286799773664214, "step": 760}, {"loss": 0.863, "grad_norm": 0.453095942735672, "learning_rate": 0.0002, "epoch": 0.12448468191738744, "step": 770}, {"loss": 0.8792, "grad_norm": 0.4636971950531006, "learning_rate": 0.0002, "epoch": 0.12610136609813272, "step": 780}, {"loss": 0.8112, "grad_norm": 0.4279276132583618, "learning_rate": 0.0002, "epoch": 0.12771805027887803, "step": 790}, {"loss": 0.8711, "grad_norm": 0.46212655305862427, "learning_rate": 0.0002, "epoch": 0.12933473445962332, "step": 800}, {"loss": 0.8368, "grad_norm": 0.43127650022506714, "learning_rate": 0.0002, "epoch": 0.1309514186403686, "step": 810}, {"loss": 0.8476, "grad_norm": 0.4201301336288452, "learning_rate": 0.0002, "epoch": 0.1325681028211139, "step": 820}, {"loss": 0.8078, "grad_norm": 0.42583167552948, "learning_rate": 0.0002, "epoch": 0.13418478700185918, "step": 830}, {"loss": 0.8219, "grad_norm": 0.4535622000694275, "learning_rate": 0.0002, "epoch": 0.13580147118260447, "step": 840}, {"loss": 0.8423, "grad_norm": 0.4116036891937256, "learning_rate": 0.0002, "epoch": 0.13741815536334978, "step": 850}, {"loss": 0.8466, "grad_norm": 0.45997580885887146, "learning_rate": 0.0002, "epoch": 0.13903483954409507, "step": 860}, {"loss": 0.8917, "grad_norm": 0.4487837255001068, "learning_rate": 0.0002, "epoch": 0.14065152372484035, "step": 870}, {"loss": 0.8217, "grad_norm": 0.43650057911872864, "learning_rate": 0.0002, "epoch": 0.14226820790558564, "step": 880}, {"loss": 0.8178, "grad_norm": 0.5335358381271362, "learning_rate": 0.0002, "epoch": 0.14388489208633093, "step": 890}, {"loss": 0.7957, "grad_norm": 0.5989000201225281, "learning_rate": 0.0002, "epoch": 0.14550157626707622, "step": 900}, {"loss": 0.8385, "grad_norm": 0.517179012298584, "learning_rate": 0.0002, "epoch": 0.14711826044782153, "step": 910}, {"loss": 0.8255, "grad_norm": 0.44435232877731323, "learning_rate": 0.0002, "epoch": 0.14873494462856682, "step": 920}, {"loss": 0.8305, "grad_norm": 0.42635923624038696, "learning_rate": 0.0002, "epoch": 0.1503516288093121, "step": 930}, {"loss": 0.8043, "grad_norm": 0.49603334069252014, "learning_rate": 0.0002, "epoch": 0.1519683129900574, "step": 940}, {"loss": 0.8377, "grad_norm": 0.40639808773994446, "learning_rate": 0.0002, "epoch": 0.15358499717080268, "step": 950}, {"loss": 0.8529, "grad_norm": 0.4850759208202362, "learning_rate": 0.0002, "epoch": 0.15520168135154797, "step": 960}, {"loss": 0.846, "grad_norm": 0.4427442252635956, "learning_rate": 0.0002, "epoch": 0.15681836553229328, "step": 970}, {"loss": 0.8705, "grad_norm": 0.3760930001735687, "learning_rate": 0.0002, "epoch": 0.15843504971303857, "step": 980}, {"loss": 0.8644, "grad_norm": 0.4794144332408905, "learning_rate": 0.0002, "epoch": 0.16005173389378385, "step": 990}, {"loss": 0.8002, "grad_norm": 0.45828768610954285, "learning_rate": 0.0002, "epoch": 0.16166841807452914, "step": 1000}, {"loss": 0.7658, "grad_norm": 0.6313053369522095, "learning_rate": 0.0002, "epoch": 0.16328510225527443, "step": 1010}, {"loss": 0.8047, "grad_norm": 0.45041006803512573, "learning_rate": 0.0002, "epoch": 0.16490178643601971, "step": 1020}, {"loss": 0.8423, "grad_norm": 0.441403865814209, "learning_rate": 0.0002, "epoch": 0.166518470616765, "step": 1030}, {"loss": 0.8475, "grad_norm": 0.8171296119689941, "learning_rate": 0.0002, "epoch": 0.16813515479751032, "step": 1040}, {"loss": 0.845, "grad_norm": 0.7137420773506165, "learning_rate": 0.0002, "epoch": 0.1697518389782556, "step": 1050}, {"loss": 0.8213, "grad_norm": 0.5236809849739075, "learning_rate": 0.0002, "epoch": 0.1713685231590009, "step": 1060}, {"loss": 0.8265, "grad_norm": 0.5021864175796509, "learning_rate": 0.0002, "epoch": 0.17298520733974618, "step": 1070}, {"loss": 0.8305, "grad_norm": 0.47347521781921387, "learning_rate": 0.0002, "epoch": 0.17460189152049146, "step": 1080}, {"loss": 0.8105, "grad_norm": 0.4631653428077698, "learning_rate": 0.0002, "epoch": 0.17621857570123675, "step": 1090}, {"loss": 0.8166, "grad_norm": 0.49169182777404785, "learning_rate": 0.0002, "epoch": 0.17783525988198207, "step": 1100}, {"loss": 0.8012, "grad_norm": 0.5019739270210266, "learning_rate": 0.0002, "epoch": 0.17945194406272735, "step": 1110}, {"loss": 0.8247, "grad_norm": 0.5100422501564026, "learning_rate": 0.0002, "epoch": 0.18106862824347264, "step": 1120}, {"loss": 0.8142, "grad_norm": 0.3888324499130249, "learning_rate": 0.0002, "epoch": 0.18268531242421793, "step": 1130}, {"loss": 0.8533, "grad_norm": 0.39765217900276184, "learning_rate": 0.0002, "epoch": 0.18430199660496321, "step": 1140}, {"loss": 0.8541, "grad_norm": 0.47190186381340027, "learning_rate": 0.0002, "epoch": 0.1859186807857085, "step": 1150}, {"loss": 0.8301, "grad_norm": 0.4464188814163208, "learning_rate": 0.0002, "epoch": 0.18753536496645382, "step": 1160}, {"loss": 0.8341, "grad_norm": 0.5153930187225342, "learning_rate": 0.0002, "epoch": 0.1891520491471991, "step": 1170}, {"loss": 0.8033, "grad_norm": 0.4779708683490753, "learning_rate": 0.0002, "epoch": 0.1907687333279444, "step": 1180}, {"loss": 0.8187, "grad_norm": 0.4834315776824951, "learning_rate": 0.0002, "epoch": 0.19238541750868968, "step": 1190}, {"loss": 0.7721, "grad_norm": 0.402357816696167, "learning_rate": 0.0002, "epoch": 0.19400210168943496, "step": 1200}, {"loss": 0.7941, "grad_norm": 0.45899084210395813, "learning_rate": 0.0002, "epoch": 0.19561878587018025, "step": 1210}, {"loss": 0.8353, "grad_norm": 0.5106529593467712, "learning_rate": 0.0002, "epoch": 0.19723547005092557, "step": 1220}, {"loss": 0.7816, "grad_norm": 0.45261722803115845, "learning_rate": 0.0002, "epoch": 0.19885215423167085, "step": 1230}, {"loss": 0.8068, "grad_norm": 0.4647127091884613, "learning_rate": 0.0002, "epoch": 0.20046883841241614, "step": 1240}, {"loss": 0.8239, "grad_norm": 0.4849368929862976, "learning_rate": 0.0002, "epoch": 0.20208552259316143, "step": 1250}, {"loss": 0.8514, "grad_norm": 0.4518061578273773, "learning_rate": 0.0002, "epoch": 0.2037022067739067, "step": 1260}, {"loss": 0.8158, "grad_norm": 0.49535325169563293, "learning_rate": 0.0002, "epoch": 0.205318890954652, "step": 1270}, {"loss": 0.8348, "grad_norm": 0.4835205376148224, "learning_rate": 0.0002, "epoch": 0.2069355751353973, "step": 1280}, {"loss": 0.8428, "grad_norm": 0.45308539271354675, "learning_rate": 0.0002, "epoch": 0.2085522593161426, "step": 1290}, {"loss": 0.7993, "grad_norm": 0.5369905233383179, "learning_rate": 0.0002, "epoch": 0.2101689434968879, "step": 1300}, {"loss": 0.8676, "grad_norm": 0.5031622052192688, "learning_rate": 0.0002, "epoch": 0.21178562767763318, "step": 1310}, {"loss": 0.7686, "grad_norm": 0.48010334372520447, "learning_rate": 0.0002, "epoch": 0.21340231185837846, "step": 1320}, {"loss": 0.806, "grad_norm": 0.4905701279640198, "learning_rate": 0.0002, "epoch": 0.21501899603912375, "step": 1330}, {"loss": 0.7885, "grad_norm": 0.43531742691993713, "learning_rate": 0.0002, "epoch": 0.21663568021986904, "step": 1340}, {"loss": 0.8191, "grad_norm": 0.44330692291259766, "learning_rate": 0.0002, "epoch": 0.21825236440061435, "step": 1350}, {"loss": 0.8205, "grad_norm": 0.5384416580200195, "learning_rate": 0.0002, "epoch": 0.21986904858135964, "step": 1360}, {"loss": 0.7726, "grad_norm": 0.4181833863258362, "learning_rate": 0.0002, "epoch": 0.22148573276210493, "step": 1370}, {"loss": 0.8311, "grad_norm": 0.523833692073822, "learning_rate": 0.0002, "epoch": 0.2231024169428502, "step": 1380}, {"loss": 0.7913, "grad_norm": 0.5528736710548401, "learning_rate": 0.0002, "epoch": 0.2247191011235955, "step": 1390}, {"loss": 0.8079, "grad_norm": 0.43515023589134216, "learning_rate": 0.0002, "epoch": 0.2263357853043408, "step": 1400}, {"loss": 0.8403, "grad_norm": 0.48809877038002014, "learning_rate": 0.0002, "epoch": 0.2279524694850861, "step": 1410}, {"loss": 0.8165, "grad_norm": 0.43591251969337463, "learning_rate": 0.0002, "epoch": 0.2295691536658314, "step": 1420}, {"loss": 0.8147, "grad_norm": 0.44625312089920044, "learning_rate": 0.0002, "epoch": 0.23118583784657668, "step": 1430}, {"loss": 0.8134, "grad_norm": 0.4390665292739868, "learning_rate": 0.0002, "epoch": 0.23280252202732196, "step": 1440}, {"loss": 0.8465, "grad_norm": 0.48496049642562866, "learning_rate": 0.0002, "epoch": 0.23441920620806725, "step": 1450}, {"loss": 0.775, "grad_norm": 0.45919957756996155, "learning_rate": 0.0002, "epoch": 0.23603589038881254, "step": 1460}, {"loss": 0.8659, "grad_norm": 0.5471845865249634, "learning_rate": 0.0002, "epoch": 0.23765257456955785, "step": 1470}, {"loss": 0.8164, "grad_norm": 0.47269317507743835, "learning_rate": 0.0002, "epoch": 0.23926925875030314, "step": 1480}, {"loss": 0.854, "grad_norm": 0.4930245578289032, "learning_rate": 0.0002, "epoch": 0.24088594293104842, "step": 1490}, {"loss": 0.8139, "grad_norm": 0.5605630278587341, "learning_rate": 0.0002, "epoch": 0.2425026271117937, "step": 1500}, {"loss": 0.8125, "grad_norm": 0.4435870945453644, "learning_rate": 0.0002, "epoch": 0.244119311292539, "step": 1510}, {"loss": 0.8123, "grad_norm": 0.4941999912261963, "learning_rate": 0.0002, "epoch": 0.24573599547328429, "step": 1520}, {"loss": 0.8427, "grad_norm": 0.5100624561309814, "learning_rate": 0.0002, "epoch": 0.24735267965402957, "step": 1530}, {"loss": 0.8405, "grad_norm": 0.4638267457485199, "learning_rate": 0.0002, "epoch": 0.2489693638347749, "step": 1540}, {"loss": 0.81, "grad_norm": 0.5071570873260498, "learning_rate": 0.0002, "epoch": 0.25058604801552015, "step": 1550}, {"loss": 0.7724, "grad_norm": 0.4291319251060486, "learning_rate": 0.0002, "epoch": 0.25220273219626543, "step": 1560}, {"loss": 0.7984, "grad_norm": 0.5388049483299255, "learning_rate": 0.0002, "epoch": 0.2538194163770108, "step": 1570}, {"loss": 0.8176, "grad_norm": 0.5083683729171753, "learning_rate": 0.0002, "epoch": 0.25543610055775606, "step": 1580}, {"loss": 0.843, "grad_norm": 0.4824463725090027, "learning_rate": 0.0002, "epoch": 0.25705278473850135, "step": 1590}, {"loss": 0.7996, "grad_norm": 0.41177722811698914, "learning_rate": 0.0002, "epoch": 0.25866946891924664, "step": 1600}, {"loss": 0.7772, "grad_norm": 0.5656219124794006, "learning_rate": 0.0002, "epoch": 0.2602861530999919, "step": 1610}, {"loss": 0.7955, "grad_norm": 0.41063204407691956, "learning_rate": 0.0002, "epoch": 0.2619028372807372, "step": 1620}, {"loss": 0.7998, "grad_norm": 0.4897061288356781, "learning_rate": 0.0002, "epoch": 0.2635195214614825, "step": 1630}, {"loss": 0.8198, "grad_norm": 0.4454376697540283, "learning_rate": 0.0002, "epoch": 0.2651362056422278, "step": 1640}, {"loss": 0.8684, "grad_norm": 0.4355238378047943, "learning_rate": 0.0002, "epoch": 0.26675288982297307, "step": 1650}, {"loss": 0.7801, "grad_norm": 0.458310067653656, "learning_rate": 0.0002, "epoch": 0.26836957400371836, "step": 1660}, {"loss": 0.7935, "grad_norm": 0.4752083718776703, "learning_rate": 0.0002, "epoch": 0.26998625818446365, "step": 1670}, {"loss": 0.8267, "grad_norm": 0.4666106402873993, "learning_rate": 0.0002, "epoch": 0.27160294236520893, "step": 1680}, {"loss": 0.8252, "grad_norm": 0.4213818609714508, "learning_rate": 0.0002, "epoch": 0.2732196265459543, "step": 1690}, {"loss": 0.8559, "grad_norm": 0.5768913626670837, "learning_rate": 0.0002, "epoch": 0.27483631072669956, "step": 1700}, {"loss": 0.7931, "grad_norm": 0.4209914803504944, "learning_rate": 0.0002, "epoch": 0.27645299490744485, "step": 1710}, {"loss": 0.8167, "grad_norm": 0.501909613609314, "learning_rate": 0.0002, "epoch": 0.27806967908819014, "step": 1720}, {"loss": 0.7832, "grad_norm": 0.5266261100769043, "learning_rate": 0.0002, "epoch": 0.2796863632689354, "step": 1730}, {"loss": 0.8102, "grad_norm": 0.43806859850883484, "learning_rate": 0.0002, "epoch": 0.2813030474496807, "step": 1740}, {"loss": 0.8157, "grad_norm": 0.46048814058303833, "learning_rate": 0.0002, "epoch": 0.282919731630426, "step": 1750}, {"loss": 0.8596, "grad_norm": 0.44972819089889526, "learning_rate": 0.0002, "epoch": 0.2845364158111713, "step": 1760}, {"loss": 0.8421, "grad_norm": 0.5114831328392029, "learning_rate": 0.0002, "epoch": 0.28615309999191657, "step": 1770}, {"loss": 0.8361, "grad_norm": 0.47931742668151855, "learning_rate": 0.0002, "epoch": 0.28776978417266186, "step": 1780}, {"loss": 0.8265, "grad_norm": 0.5092599987983704, "learning_rate": 0.0002, "epoch": 0.28938646835340714, "step": 1790}, {"loss": 0.8506, "grad_norm": 0.37581443786621094, "learning_rate": 0.0002, "epoch": 0.29100315253415243, "step": 1800}, {"loss": 0.7932, "grad_norm": 0.47097381949424744, "learning_rate": 0.0002, "epoch": 0.2926198367148977, "step": 1810}, {"loss": 0.7787, "grad_norm": 0.48300236463546753, "learning_rate": 0.0002, "epoch": 0.29423652089564306, "step": 1820}, {"loss": 0.8391, "grad_norm": 0.5600419640541077, "learning_rate": 0.0002, "epoch": 0.29585320507638835, "step": 1830}, {"loss": 0.8507, "grad_norm": 0.48555272817611694, "learning_rate": 0.0002, "epoch": 0.29746988925713364, "step": 1840}, {"loss": 0.7657, "grad_norm": 0.3752668499946594, "learning_rate": 0.0002, "epoch": 0.2990865734378789, "step": 1850}, {"loss": 0.7915, "grad_norm": 0.5328747034072876, "learning_rate": 0.0002, "epoch": 0.3007032576186242, "step": 1860}, {"loss": 0.8426, "grad_norm": 0.48716455698013306, "learning_rate": 0.0002, "epoch": 0.3023199417993695, "step": 1870}, {"loss": 0.8335, "grad_norm": 0.5011493563652039, "learning_rate": 0.0002, "epoch": 0.3039366259801148, "step": 1880}, {"loss": 0.852, "grad_norm": 0.46461427211761475, "learning_rate": 0.0002, "epoch": 0.30555331016086007, "step": 1890}, {"loss": 0.8478, "grad_norm": 0.36630210280418396, "learning_rate": 0.0002, "epoch": 0.30716999434160536, "step": 1900}, {"loss": 0.8162, "grad_norm": 0.4217296242713928, "learning_rate": 0.0002, "epoch": 0.30878667852235064, "step": 1910}, {"loss": 0.8128, "grad_norm": 0.4394875466823578, "learning_rate": 0.0002, "epoch": 0.31040336270309593, "step": 1920}, {"loss": 0.8471, "grad_norm": 0.6587965488433838, "learning_rate": 0.0002, "epoch": 0.3120200468838412, "step": 1930}, {"loss": 0.8565, "grad_norm": 0.5469298958778381, "learning_rate": 0.0002, "epoch": 0.31363673106458656, "step": 1940}, {"loss": 0.8236, "grad_norm": 0.4371595084667206, "learning_rate": 0.0002, "epoch": 0.31525341524533185, "step": 1950}, {"loss": 0.887, "grad_norm": 0.4809541404247284, "learning_rate": 0.0002, "epoch": 0.31687009942607713, "step": 1960}, {"loss": 0.7855, "grad_norm": 0.6061086654663086, "learning_rate": 0.0002, "epoch": 0.3184867836068224, "step": 1970}, {"loss": 0.7679, "grad_norm": 0.5342657566070557, "learning_rate": 0.0002, "epoch": 0.3201034677875677, "step": 1980}, {"loss": 0.7955, "grad_norm": 0.5057743787765503, "learning_rate": 0.0002, "epoch": 0.321720151968313, "step": 1990}, {"loss": 0.7774, "grad_norm": 0.528626024723053, "learning_rate": 0.0002, "epoch": 0.3233368361490583, "step": 2000}, {"loss": 0.8845, "grad_norm": 0.46742770075798035, "learning_rate": 0.0002, "epoch": 0.32495352032980357, "step": 2010}, {"loss": 0.8484, "grad_norm": 0.515101432800293, "learning_rate": 0.0002, "epoch": 0.32657020451054886, "step": 2020}, {"loss": 0.8139, "grad_norm": 0.41941216588020325, "learning_rate": 0.0002, "epoch": 0.32818688869129414, "step": 2030}, {"loss": 0.7637, "grad_norm": 0.49902522563934326, "learning_rate": 0.0002, "epoch": 0.32980357287203943, "step": 2040}, {"loss": 0.7822, "grad_norm": 0.4120897650718689, "learning_rate": 0.0002, "epoch": 0.3314202570527847, "step": 2050}, {"loss": 0.8057, "grad_norm": 0.45352041721343994, "learning_rate": 0.0002, "epoch": 0.33303694123353, "step": 2060}, {"loss": 0.7913, "grad_norm": 0.523199737071991, "learning_rate": 0.0002, "epoch": 0.33465362541427535, "step": 2070}, {"loss": 0.8036, "grad_norm": 0.4390358626842499, "learning_rate": 0.0002, "epoch": 0.33627030959502063, "step": 2080}, {"loss": 0.8145, "grad_norm": 0.6752901077270508, "learning_rate": 0.0002, "epoch": 0.3378869937757659, "step": 2090}, {"loss": 0.7807, "grad_norm": 0.547821044921875, "learning_rate": 0.0002, "epoch": 0.3395036779565112, "step": 2100}, {"loss": 0.8561, "grad_norm": 0.5161308646202087, "learning_rate": 0.0002, "epoch": 0.3411203621372565, "step": 2110}, {"loss": 0.7697, "grad_norm": 0.4565401077270508, "learning_rate": 0.0002, "epoch": 0.3427370463180018, "step": 2120}, {"loss": 0.7964, "grad_norm": 0.4666115939617157, "learning_rate": 0.0002, "epoch": 0.34435373049874707, "step": 2130}, {"loss": 0.8189, "grad_norm": 0.4090428352355957, "learning_rate": 0.0002, "epoch": 0.34597041467949236, "step": 2140}, {"loss": 0.8817, "grad_norm": 0.510845422744751, "learning_rate": 0.0002, "epoch": 0.34758709886023764, "step": 2150}, {"loss": 0.8398, "grad_norm": 0.42861923575401306, "learning_rate": 0.0002, "epoch": 0.34920378304098293, "step": 2160}, {"loss": 0.7716, "grad_norm": 0.4476332664489746, "learning_rate": 0.0002, "epoch": 0.3508204672217282, "step": 2170}, {"loss": 0.7845, "grad_norm": 0.6065791249275208, "learning_rate": 0.0002, "epoch": 0.3524371514024735, "step": 2180}, {"loss": 0.8187, "grad_norm": 0.42335066199302673, "learning_rate": 0.0002, "epoch": 0.35405383558321885, "step": 2190}, {"loss": 0.8239, "grad_norm": 0.5094629526138306, "learning_rate": 0.0002, "epoch": 0.35567051976396413, "step": 2200}, {"loss": 0.7807, "grad_norm": 0.5476373434066772, "learning_rate": 0.0002, "epoch": 0.3572872039447094, "step": 2210}, {"loss": 0.814, "grad_norm": 0.3911719024181366, "learning_rate": 0.0002, "epoch": 0.3589038881254547, "step": 2220}, {"loss": 0.8599, "grad_norm": 0.6599636077880859, "learning_rate": 0.0002, "epoch": 0.3605205723062, "step": 2230}, {"loss": 0.7482, "grad_norm": 0.40381914377212524, "learning_rate": 0.0002, "epoch": 0.3621372564869453, "step": 2240}, {"loss": 0.7772, "grad_norm": 0.4433908462524414, "learning_rate": 0.0002, "epoch": 0.36375394066769057, "step": 2250}, {"loss": 0.8503, "grad_norm": 0.578326940536499, "learning_rate": 0.0002, "epoch": 0.36537062484843585, "step": 2260}, {"loss": 0.8178, "grad_norm": 0.5734784007072449, "learning_rate": 0.0002, "epoch": 0.36698730902918114, "step": 2270}, {"loss": 0.8193, "grad_norm": 0.45555487275123596, "learning_rate": 0.0002, "epoch": 0.36860399320992643, "step": 2280}, {"loss": 0.7929, "grad_norm": 0.5666276216506958, "learning_rate": 0.0002, "epoch": 0.3702206773906717, "step": 2290}, {"loss": 0.8292, "grad_norm": 0.5461117625236511, "learning_rate": 0.0002, "epoch": 0.371837361571417, "step": 2300}, {"loss": 0.8204, "grad_norm": 0.6318911910057068, "learning_rate": 0.0002, "epoch": 0.3734540457521623, "step": 2310}, {"loss": 0.7964, "grad_norm": 0.493263304233551, "learning_rate": 0.0002, "epoch": 0.37507072993290763, "step": 2320}, {"loss": 0.8339, "grad_norm": 0.5888760089874268, "learning_rate": 0.0002, "epoch": 0.3766874141136529, "step": 2330}, {"loss": 0.7737, "grad_norm": 0.48671841621398926, "learning_rate": 0.0002, "epoch": 0.3783040982943982, "step": 2340}, {"loss": 0.8367, "grad_norm": 0.4385145306587219, "learning_rate": 0.0002, "epoch": 0.3799207824751435, "step": 2350}, {"loss": 0.812, "grad_norm": 0.5523318648338318, "learning_rate": 0.0002, "epoch": 0.3815374666558888, "step": 2360}, {"loss": 0.8351, "grad_norm": 0.7308220267295837, "learning_rate": 0.0002, "epoch": 0.38315415083663407, "step": 2370}, {"loss": 0.859, "grad_norm": 0.554214358329773, "learning_rate": 0.0002, "epoch": 0.38477083501737935, "step": 2380}, {"loss": 0.8146, "grad_norm": 0.5425800085067749, "learning_rate": 0.0002, "epoch": 0.38638751919812464, "step": 2390}, {"loss": 0.8282, "grad_norm": 0.48811158537864685, "learning_rate": 0.0002, "epoch": 0.3880042033788699, "step": 2400}, {"loss": 0.8074, "grad_norm": 0.49212366342544556, "learning_rate": 0.0002, "epoch": 0.3896208875596152, "step": 2410}, {"loss": 0.7991, "grad_norm": 0.5222218632698059, "learning_rate": 0.0002, "epoch": 0.3912375717403605, "step": 2420}, {"loss": 0.8182, "grad_norm": 0.4699819087982178, "learning_rate": 0.0002, "epoch": 0.3928542559211058, "step": 2430}, {"loss": 0.7919, "grad_norm": 0.46153587102890015, "learning_rate": 0.0002, "epoch": 0.39447094010185113, "step": 2440}, {"loss": 0.8111, "grad_norm": 0.4150611162185669, "learning_rate": 0.0002, "epoch": 0.3960876242825964, "step": 2450}, {"loss": 0.8589, "grad_norm": 0.5799614787101746, "learning_rate": 0.0002, "epoch": 0.3977043084633417, "step": 2460}, {"loss": 0.8085, "grad_norm": 0.56536865234375, "learning_rate": 0.0002, "epoch": 0.399320992644087, "step": 2470}, {"loss": 0.8022, "grad_norm": 0.5451247096061707, "learning_rate": 0.0002, "epoch": 0.4009376768248323, "step": 2480}, {"loss": 0.8217, "grad_norm": 0.5914521217346191, "learning_rate": 0.0002, "epoch": 0.40255436100557757, "step": 2490}, {"loss": 0.7859, "grad_norm": 0.4428117275238037, "learning_rate": 0.0002, "epoch": 0.40417104518632285, "step": 2500}, {"loss": 0.8054, "grad_norm": 0.48580947518348694, "learning_rate": 0.0002, "epoch": 0.40578772936706814, "step": 2510}, {"loss": 0.8405, "grad_norm": 0.436734676361084, "learning_rate": 0.0002, "epoch": 0.4074044135478134, "step": 2520}, {"loss": 0.8209, "grad_norm": 0.5752223134040833, "learning_rate": 0.0002, "epoch": 0.4090210977285587, "step": 2530}, {"loss": 0.8181, "grad_norm": 0.4271308183670044, "learning_rate": 0.0002, "epoch": 0.410637781909304, "step": 2540}, {"loss": 0.8058, "grad_norm": 0.46294718980789185, "learning_rate": 0.0002, "epoch": 0.4122544660900493, "step": 2550}, {"loss": 0.8473, "grad_norm": 0.49407583475112915, "learning_rate": 0.0002, "epoch": 0.4138711502707946, "step": 2560}, {"loss": 0.7881, "grad_norm": 0.4729035496711731, "learning_rate": 0.0002, "epoch": 0.4154878344515399, "step": 2570}, {"loss": 0.7834, "grad_norm": 0.4129747152328491, "learning_rate": 0.0002, "epoch": 0.4171045186322852, "step": 2580}, {"loss": 0.7859, "grad_norm": 0.5684236288070679, "learning_rate": 0.0002, "epoch": 0.4187212028130305, "step": 2590}, {"loss": 0.811, "grad_norm": 0.4862157106399536, "learning_rate": 0.0002, "epoch": 0.4203378869937758, "step": 2600}, {"loss": 0.7582, "grad_norm": 0.46567976474761963, "learning_rate": 0.0002, "epoch": 0.42195457117452106, "step": 2610}, {"loss": 0.7755, "grad_norm": 0.5710650682449341, "learning_rate": 0.0002, "epoch": 0.42357125535526635, "step": 2620}, {"loss": 0.8573, "grad_norm": 0.5660041570663452, "learning_rate": 0.0002, "epoch": 0.42518793953601164, "step": 2630}, {"loss": 0.7812, "grad_norm": 0.47944375872612, "learning_rate": 0.0002, "epoch": 0.4268046237167569, "step": 2640}, {"loss": 0.7459, "grad_norm": 0.537223756313324, "learning_rate": 0.0002, "epoch": 0.4284213078975022, "step": 2650}, {"loss": 0.8246, "grad_norm": 0.41669997572898865, "learning_rate": 0.0002, "epoch": 0.4300379920782475, "step": 2660}, {"loss": 0.7785, "grad_norm": 0.44727686047554016, "learning_rate": 0.0002, "epoch": 0.4316546762589928, "step": 2670}, {"loss": 0.8241, "grad_norm": 0.5600888729095459, "learning_rate": 0.0002, "epoch": 0.4332713604397381, "step": 2680}, {"loss": 0.7708, "grad_norm": 0.39820605516433716, "learning_rate": 0.0002, "epoch": 0.4348880446204834, "step": 2690}, {"loss": 0.8202, "grad_norm": 0.5637655854225159, "learning_rate": 0.0002, "epoch": 0.4365047288012287, "step": 2700}, {"loss": 0.855, "grad_norm": 0.6363666653633118, "learning_rate": 0.0002, "epoch": 0.438121412981974, "step": 2710}, {"loss": 0.8468, "grad_norm": 0.5656129121780396, "learning_rate": 0.0002, "epoch": 0.4397380971627193, "step": 2720}, {"loss": 0.7845, "grad_norm": 0.5600156188011169, "learning_rate": 0.0002, "epoch": 0.44135478134346456, "step": 2730}, {"loss": 0.8405, "grad_norm": 0.5506579875946045, "learning_rate": 0.0002, "epoch": 0.44297146552420985, "step": 2740}, {"loss": 0.7725, "grad_norm": 0.49878305196762085, "learning_rate": 0.0002, "epoch": 0.44458814970495514, "step": 2750}, {"loss": 0.8292, "grad_norm": 0.4569213092327118, "learning_rate": 0.0002, "epoch": 0.4462048338857004, "step": 2760}, {"loss": 0.8028, "grad_norm": 0.6056680083274841, "learning_rate": 0.0002, "epoch": 0.4478215180664457, "step": 2770}, {"loss": 0.8242, "grad_norm": 0.44474557042121887, "learning_rate": 0.0002, "epoch": 0.449438202247191, "step": 2780}, {"loss": 0.801, "grad_norm": 0.46055394411087036, "learning_rate": 0.0002, "epoch": 0.4510548864279363, "step": 2790}, {"loss": 0.7521, "grad_norm": 0.4904133379459381, "learning_rate": 0.0002, "epoch": 0.4526715706086816, "step": 2800}, {"loss": 0.8829, "grad_norm": 0.5647031664848328, "learning_rate": 0.0002, "epoch": 0.45428825478942686, "step": 2810}, {"loss": 0.8622, "grad_norm": 0.5759473443031311, "learning_rate": 0.0002, "epoch": 0.4559049389701722, "step": 2820}, {"loss": 0.7812, "grad_norm": 0.5161895751953125, "learning_rate": 0.0002, "epoch": 0.4575216231509175, "step": 2830}, {"loss": 0.8045, "grad_norm": 0.4248254597187042, "learning_rate": 0.0002, "epoch": 0.4591383073316628, "step": 2840}, {"loss": 0.7838, "grad_norm": 0.45395001769065857, "learning_rate": 0.0002, "epoch": 0.46075499151240806, "step": 2850}, {"loss": 0.8208, "grad_norm": 0.5358697772026062, "learning_rate": 0.0002, "epoch": 0.46237167569315335, "step": 2860}, {"loss": 0.8147, "grad_norm": 0.5379165410995483, "learning_rate": 0.0002, "epoch": 0.46398835987389864, "step": 2870}, {"loss": 0.7403, "grad_norm": 0.4601989686489105, "learning_rate": 0.0002, "epoch": 0.4656050440546439, "step": 2880}, {"loss": 0.8523, "grad_norm": 0.671115517616272, "learning_rate": 0.0002, "epoch": 0.4672217282353892, "step": 2890}, {"loss": 0.8262, "grad_norm": 0.4425133168697357, "learning_rate": 0.0002, "epoch": 0.4688384124161345, "step": 2900}, {"loss": 0.8178, "grad_norm": 0.5446155071258545, "learning_rate": 0.0002, "epoch": 0.4704550965968798, "step": 2910}, {"loss": 0.8106, "grad_norm": 0.603306233882904, "learning_rate": 0.0002, "epoch": 0.47207178077762507, "step": 2920}, {"loss": 0.8044, "grad_norm": 0.5377997159957886, "learning_rate": 0.0002, "epoch": 0.47368846495837036, "step": 2930}, {"loss": 0.8075, "grad_norm": 0.4931027591228485, "learning_rate": 0.0002, "epoch": 0.4753051491391157, "step": 2940}, {"loss": 0.8004, "grad_norm": 0.4711960256099701, "learning_rate": 0.0002, "epoch": 0.476921833319861, "step": 2950}, {"loss": 0.8121, "grad_norm": 0.5020492672920227, "learning_rate": 0.0002, "epoch": 0.4785385175006063, "step": 2960}, {"loss": 0.8221, "grad_norm": 0.5428946614265442, "learning_rate": 0.0002, "epoch": 0.48015520168135156, "step": 2970}, {"loss": 0.7849, "grad_norm": 0.5294089317321777, "learning_rate": 0.0002, "epoch": 0.48177188586209685, "step": 2980}, {"loss": 0.8553, "grad_norm": 0.648289144039154, "learning_rate": 0.0002, "epoch": 0.48338857004284214, "step": 2990}, {"loss": 0.7874, "grad_norm": 0.47916680574417114, "learning_rate": 0.0002, "epoch": 0.4850052542235874, "step": 3000}, {"loss": 0.8087, "grad_norm": 0.43849772214889526, "learning_rate": 0.0002, "epoch": 0.4866219384043327, "step": 3010}, {"loss": 0.7662, "grad_norm": 0.47007861733436584, "learning_rate": 0.0002, "epoch": 0.488238622585078, "step": 3020}, {"loss": 0.757, "grad_norm": 0.6314331293106079, "learning_rate": 0.0002, "epoch": 0.4898553067658233, "step": 3030}, {"loss": 0.7863, "grad_norm": 0.49211493134498596, "learning_rate": 0.0002, "epoch": 0.49147199094656857, "step": 3040}, {"loss": 0.8335, "grad_norm": 0.4537973403930664, "learning_rate": 0.0002, "epoch": 0.49308867512731386, "step": 3050}, {"loss": 0.8095, "grad_norm": 0.47326919436454773, "learning_rate": 0.0002, "epoch": 0.49470535930805914, "step": 3060}, {"loss": 0.8447, "grad_norm": 0.525874137878418, "learning_rate": 0.0002, "epoch": 0.4963220434888045, "step": 3070}, {"loss": 0.8339, "grad_norm": 0.6361091732978821, "learning_rate": 0.0002, "epoch": 0.4979387276695498, "step": 3080}, {"loss": 0.821, "grad_norm": 0.5850642919540405, "learning_rate": 0.0002, "epoch": 0.49955541185029506, "step": 3090}, {"loss": 0.8279, "grad_norm": 0.47299543023109436, "learning_rate": 0.0002, "epoch": 0.5011720960310403, "step": 3100}, {"loss": 0.8681, "grad_norm": 0.473099946975708, "learning_rate": 0.0002, "epoch": 0.5027887802117856, "step": 3110}, {"loss": 0.8223, "grad_norm": 0.48186397552490234, "learning_rate": 0.0002, "epoch": 0.5044054643925309, "step": 3120}, {"loss": 0.8292, "grad_norm": 0.5015401840209961, "learning_rate": 0.0002, "epoch": 0.5060221485732762, "step": 3130}, {"loss": 0.7692, "grad_norm": 0.5617750287055969, "learning_rate": 0.0002, "epoch": 0.5076388327540216, "step": 3140}, {"loss": 0.8708, "grad_norm": 0.5169327259063721, "learning_rate": 0.0002, "epoch": 0.5092555169347668, "step": 3150}, {"loss": 0.7845, "grad_norm": 0.545657753944397, "learning_rate": 0.0002, "epoch": 0.5108722011155121, "step": 3160}, {"loss": 0.799, "grad_norm": 0.512864351272583, "learning_rate": 0.0002, "epoch": 0.5124888852962574, "step": 3170}, {"loss": 0.7794, "grad_norm": 0.4113546311855316, "learning_rate": 0.0002, "epoch": 0.5141055694770027, "step": 3180}, {"loss": 0.8206, "grad_norm": 0.44532445073127747, "learning_rate": 0.0002, "epoch": 0.5157222536577479, "step": 3190}, {"loss": 0.8213, "grad_norm": 0.5623497366905212, "learning_rate": 0.0002, "epoch": 0.5173389378384933, "step": 3200}, {"loss": 0.7928, "grad_norm": 0.5084741115570068, "learning_rate": 0.0002, "epoch": 0.5189556220192385, "step": 3210}, {"loss": 0.8174, "grad_norm": 0.5305403470993042, "learning_rate": 0.0002, "epoch": 0.5205723061999838, "step": 3220}, {"loss": 0.8139, "grad_norm": 0.4708254337310791, "learning_rate": 0.0002, "epoch": 0.5221889903807291, "step": 3230}, {"loss": 0.7639, "grad_norm": 0.43827131390571594, "learning_rate": 0.0002, "epoch": 0.5238056745614744, "step": 3240}, {"loss": 0.7993, "grad_norm": 0.5630002617835999, "learning_rate": 0.0002, "epoch": 0.5254223587422197, "step": 3250}, {"loss": 0.7522, "grad_norm": 0.5010961890220642, "learning_rate": 0.0002, "epoch": 0.527039042922965, "step": 3260}, {"loss": 0.8374, "grad_norm": 0.6303122043609619, "learning_rate": 0.0002, "epoch": 0.5286557271037103, "step": 3270}, {"loss": 0.7727, "grad_norm": 0.5107331275939941, "learning_rate": 0.0002, "epoch": 0.5302724112844556, "step": 3280}, {"loss": 0.8495, "grad_norm": 0.5700443387031555, "learning_rate": 0.0002, "epoch": 0.5318890954652009, "step": 3290}, {"loss": 0.7776, "grad_norm": 0.46296367049217224, "learning_rate": 0.0002, "epoch": 0.5335057796459461, "step": 3300}, {"loss": 0.7931, "grad_norm": 0.531568706035614, "learning_rate": 0.0002, "epoch": 0.5351224638266915, "step": 3310}, {"loss": 0.843, "grad_norm": 0.4686741530895233, "learning_rate": 0.0002, "epoch": 0.5367391480074367, "step": 3320}, {"loss": 0.8104, "grad_norm": 0.5404331088066101, "learning_rate": 0.0002, "epoch": 0.5383558321881821, "step": 3330}, {"loss": 0.7686, "grad_norm": 0.6368790864944458, "learning_rate": 0.0002, "epoch": 0.5399725163689273, "step": 3340}, {"loss": 0.8514, "grad_norm": 0.42300888895988464, "learning_rate": 0.0002, "epoch": 0.5415892005496726, "step": 3350}, {"loss": 0.8236, "grad_norm": 0.5362542867660522, "learning_rate": 0.0002, "epoch": 0.5432058847304179, "step": 3360}, {"loss": 0.858, "grad_norm": 0.497128963470459, "learning_rate": 0.0002, "epoch": 0.5448225689111632, "step": 3370}, {"loss": 0.8519, "grad_norm": 0.5006386041641235, "learning_rate": 0.0002, "epoch": 0.5464392530919085, "step": 3380}, {"loss": 0.7867, "grad_norm": 0.44136837124824524, "learning_rate": 0.0002, "epoch": 0.5480559372726538, "step": 3390}, {"loss": 0.773, "grad_norm": 0.5897833108901978, "learning_rate": 0.0002, "epoch": 0.5496726214533991, "step": 3400}, {"loss": 0.8895, "grad_norm": 0.641075611114502, "learning_rate": 0.0002, "epoch": 0.5512893056341444, "step": 3410}, {"loss": 0.7827, "grad_norm": 0.7251322269439697, "learning_rate": 0.0002, "epoch": 0.5529059898148897, "step": 3420}, {"loss": 0.7626, "grad_norm": 0.47411349415779114, "learning_rate": 0.0002, "epoch": 0.5545226739956349, "step": 3430}, {"loss": 0.8196, "grad_norm": 0.4994310438632965, "learning_rate": 0.0002, "epoch": 0.5561393581763803, "step": 3440}, {"loss": 0.7812, "grad_norm": 0.5814438462257385, "learning_rate": 0.0002, "epoch": 0.5577560423571255, "step": 3450}, {"loss": 0.8805, "grad_norm": 0.6278898119926453, "learning_rate": 0.0002, "epoch": 0.5593727265378708, "step": 3460}, {"loss": 0.813, "grad_norm": 0.46208274364471436, "learning_rate": 0.0002, "epoch": 0.5609894107186161, "step": 3470}, {"loss": 0.8295, "grad_norm": 0.5718930959701538, "learning_rate": 0.0002, "epoch": 0.5626060948993614, "step": 3480}, {"loss": 0.8152, "grad_norm": 0.48178744316101074, "learning_rate": 0.0002, "epoch": 0.5642227790801067, "step": 3490}, {"loss": 0.8244, "grad_norm": 0.47336965799331665, "learning_rate": 0.0002, "epoch": 0.565839463260852, "step": 3500}, {"loss": 0.8099, "grad_norm": 0.43442684412002563, "learning_rate": 0.0002, "epoch": 0.5674561474415973, "step": 3510}, {"loss": 0.7564, "grad_norm": 0.6463358998298645, "learning_rate": 0.0002, "epoch": 0.5690728316223426, "step": 3520}, {"loss": 0.836, "grad_norm": 0.5286486744880676, "learning_rate": 0.0002, "epoch": 0.5706895158030879, "step": 3530}, {"loss": 0.8421, "grad_norm": 0.5405499935150146, "learning_rate": 0.0002, "epoch": 0.5723061999838331, "step": 3540}, {"loss": 0.7614, "grad_norm": 0.6654391884803772, "learning_rate": 0.0002, "epoch": 0.5739228841645785, "step": 3550}, {"loss": 0.7803, "grad_norm": 0.5081980228424072, "learning_rate": 0.0002, "epoch": 0.5755395683453237, "step": 3560}, {"loss": 0.7753, "grad_norm": 0.48978179693222046, "learning_rate": 0.0002, "epoch": 0.5771562525260691, "step": 3570}, {"loss": 0.8151, "grad_norm": 0.5840612053871155, "learning_rate": 0.0002, "epoch": 0.5787729367068143, "step": 3580}, {"loss": 0.8937, "grad_norm": 0.5235261917114258, "learning_rate": 0.0002, "epoch": 0.5803896208875596, "step": 3590}, {"loss": 0.7894, "grad_norm": 0.5672075748443604, "learning_rate": 0.0002, "epoch": 0.5820063050683049, "step": 3600}, {"loss": 0.8347, "grad_norm": 0.5613429546356201, "learning_rate": 0.0002, "epoch": 0.5836229892490502, "step": 3610}, {"loss": 0.8274, "grad_norm": 0.4032273590564728, "learning_rate": 0.0002, "epoch": 0.5852396734297954, "step": 3620}, {"loss": 0.8421, "grad_norm": 0.49559324979782104, "learning_rate": 0.0002, "epoch": 0.5868563576105408, "step": 3630}, {"loss": 0.8332, "grad_norm": 0.6895697712898254, "learning_rate": 0.0002, "epoch": 0.5884730417912861, "step": 3640}, {"loss": 0.7877, "grad_norm": 0.4750136435031891, "learning_rate": 0.0002, "epoch": 0.5900897259720314, "step": 3650}, {"loss": 0.8219, "grad_norm": 0.5176819562911987, "learning_rate": 0.0002, "epoch": 0.5917064101527767, "step": 3660}, {"loss": 0.8151, "grad_norm": 0.5817760229110718, "learning_rate": 0.0002, "epoch": 0.5933230943335219, "step": 3670}, {"loss": 0.7823, "grad_norm": 0.6064626574516296, "learning_rate": 0.0002, "epoch": 0.5949397785142673, "step": 3680}, {"loss": 0.8422, "grad_norm": 0.6728700995445251, "learning_rate": 0.0002, "epoch": 0.5965564626950125, "step": 3690}, {"loss": 0.7679, "grad_norm": 0.609305202960968, "learning_rate": 0.0002, "epoch": 0.5981731468757578, "step": 3700}, {"loss": 0.8048, "grad_norm": 0.4615488350391388, "learning_rate": 0.0002, "epoch": 0.5997898310565031, "step": 3710}, {"loss": 0.8214, "grad_norm": 2.0531179904937744, "learning_rate": 0.0002, "epoch": 0.6014065152372484, "step": 3720}, {"loss": 0.8158, "grad_norm": 0.5091132521629333, "learning_rate": 0.0002, "epoch": 0.6030231994179936, "step": 3730}, {"loss": 0.7833, "grad_norm": 0.5951124429702759, "learning_rate": 0.0002, "epoch": 0.604639883598739, "step": 3740}, {"loss": 0.7784, "grad_norm": 0.5870208144187927, "learning_rate": 0.0002, "epoch": 0.6062565677794842, "step": 3750}, {"loss": 0.8044, "grad_norm": 0.6254619359970093, "learning_rate": 0.0002, "epoch": 0.6078732519602296, "step": 3760}, {"loss": 0.7868, "grad_norm": 0.5577626824378967, "learning_rate": 0.0002, "epoch": 0.6094899361409749, "step": 3770}, {"loss": 0.8108, "grad_norm": 0.5004405379295349, "learning_rate": 0.0002, "epoch": 0.6111066203217201, "step": 3780}, {"loss": 0.8092, "grad_norm": 0.5527383685112, "learning_rate": 0.0002, "epoch": 0.6127233045024655, "step": 3790}, {"loss": 0.8036, "grad_norm": 0.49116113781929016, "learning_rate": 0.0002, "epoch": 0.6143399886832107, "step": 3800}, {"loss": 0.8352, "grad_norm": 0.5299299359321594, "learning_rate": 0.0002, "epoch": 0.6159566728639561, "step": 3810}, {"loss": 0.7737, "grad_norm": 0.464897483587265, "learning_rate": 0.0002, "epoch": 0.6175733570447013, "step": 3820}, {"loss": 0.7923, "grad_norm": 0.6505740880966187, "learning_rate": 0.0002, "epoch": 0.6191900412254466, "step": 3830}, {"loss": 0.8123, "grad_norm": 0.5512559413909912, "learning_rate": 0.0002, "epoch": 0.6208067254061919, "step": 3840}, {"loss": 0.8856, "grad_norm": 0.49427518248558044, "learning_rate": 0.0002, "epoch": 0.6224234095869372, "step": 3850}, {"loss": 0.7751, "grad_norm": 0.3839147090911865, "learning_rate": 0.0002, "epoch": 0.6240400937676824, "step": 3860}, {"loss": 0.8006, "grad_norm": 0.5760218501091003, "learning_rate": 0.0002, "epoch": 0.6256567779484278, "step": 3870}, {"loss": 0.7836, "grad_norm": 0.7226507067680359, "learning_rate": 0.0002, "epoch": 0.6272734621291731, "step": 3880}, {"loss": 0.8244, "grad_norm": 0.676781415939331, "learning_rate": 0.0002, "epoch": 0.6288901463099184, "step": 3890}, {"loss": 0.8239, "grad_norm": 0.4284018278121948, "learning_rate": 0.0002, "epoch": 0.6305068304906637, "step": 3900}, {"loss": 0.7996, "grad_norm": 0.5060628056526184, "learning_rate": 0.0002, "epoch": 0.6321235146714089, "step": 3910}, {"loss": 0.8089, "grad_norm": 0.5524522066116333, "learning_rate": 0.0002, "epoch": 0.6337401988521543, "step": 3920}, {"loss": 0.8276, "grad_norm": 0.6099881529808044, "learning_rate": 0.0002, "epoch": 0.6353568830328995, "step": 3930}, {"loss": 0.809, "grad_norm": 0.43155938386917114, "learning_rate": 0.0002, "epoch": 0.6369735672136448, "step": 3940}, {"loss": 0.8404, "grad_norm": 0.6427084803581238, "learning_rate": 0.0002, "epoch": 0.6385902513943901, "step": 3950}, {"loss": 0.8368, "grad_norm": 0.541220486164093, "learning_rate": 0.0002, "epoch": 0.6402069355751354, "step": 3960}, {"loss": 0.8539, "grad_norm": 0.5414294600486755, "learning_rate": 0.0002, "epoch": 0.6418236197558806, "step": 3970}, {"loss": 0.7996, "grad_norm": 0.46344003081321716, "learning_rate": 0.0002, "epoch": 0.643440303936626, "step": 3980}, {"loss": 0.7474, "grad_norm": 0.45209285616874695, "learning_rate": 0.0002, "epoch": 0.6450569881173712, "step": 3990}, {"loss": 0.8202, "grad_norm": 0.5417284369468689, "learning_rate": 0.0002, "epoch": 0.6466736722981166, "step": 4000}, {"loss": 0.7563, "grad_norm": 0.7995685935020447, "learning_rate": 0.0002, "epoch": 0.6482903564788619, "step": 4010}, {"loss": 0.7812, "grad_norm": 0.6384002566337585, "learning_rate": 0.0002, "epoch": 0.6499070406596071, "step": 4020}, {"loss": 0.732, "grad_norm": 0.4472815692424774, "learning_rate": 0.0002, "epoch": 0.6515237248403525, "step": 4030}, {"loss": 0.8071, "grad_norm": 0.6834294199943542, "learning_rate": 0.0002, "epoch": 0.6531404090210977, "step": 4040}, {"loss": 0.7812, "grad_norm": 0.4612339735031128, "learning_rate": 0.0002, "epoch": 0.654757093201843, "step": 4050}, {"loss": 0.8141, "grad_norm": 0.9266576170921326, "learning_rate": 0.0002, "epoch": 0.6563737773825883, "step": 4060}, {"loss": 0.7991, "grad_norm": 0.4470861852169037, "learning_rate": 0.0002, "epoch": 0.6579904615633336, "step": 4070}, {"loss": 0.8293, "grad_norm": 0.45544925332069397, "learning_rate": 0.0002, "epoch": 0.6596071457440789, "step": 4080}, {"loss": 0.8455, "grad_norm": 0.6144481301307678, "learning_rate": 0.0002, "epoch": 0.6612238299248242, "step": 4090}, {"loss": 0.7877, "grad_norm": 0.5936288237571716, "learning_rate": 0.0002, "epoch": 0.6628405141055694, "step": 4100}, {"loss": 0.7617, "grad_norm": 0.4822963774204254, "learning_rate": 0.0002, "epoch": 0.6644571982863148, "step": 4110}, {"loss": 0.7997, "grad_norm": 0.48432496190071106, "learning_rate": 0.0002, "epoch": 0.66607388246706, "step": 4120}, {"loss": 0.8404, "grad_norm": 0.4901607930660248, "learning_rate": 0.0002, "epoch": 0.6676905666478054, "step": 4130}, {"loss": 0.8085, "grad_norm": 0.5018393397331238, "learning_rate": 0.0002, "epoch": 0.6693072508285507, "step": 4140}, {"loss": 0.8065, "grad_norm": 0.6946378946304321, "learning_rate": 0.0002, "epoch": 0.6709239350092959, "step": 4150}, {"loss": 0.8147, "grad_norm": 0.5997390747070312, "learning_rate": 0.0002, "epoch": 0.6725406191900413, "step": 4160}, {"loss": 0.8268, "grad_norm": 0.6738849878311157, "learning_rate": 0.0002, "epoch": 0.6741573033707865, "step": 4170}, {"loss": 0.7704, "grad_norm": 0.6110581159591675, "learning_rate": 0.0002, "epoch": 0.6757739875515318, "step": 4180}, {"loss": 0.8043, "grad_norm": 0.5703322291374207, "learning_rate": 0.0002, "epoch": 0.6773906717322771, "step": 4190}, {"loss": 0.8099, "grad_norm": 0.4686066210269928, "learning_rate": 0.0002, "epoch": 0.6790073559130224, "step": 4200}, {"loss": 0.8441, "grad_norm": 0.6394643783569336, "learning_rate": 0.0002, "epoch": 0.6806240400937676, "step": 4210}, {"loss": 0.8011, "grad_norm": 0.5454841256141663, "learning_rate": 0.0002, "epoch": 0.682240724274513, "step": 4220}, {"loss": 0.8307, "grad_norm": 0.4859732985496521, "learning_rate": 0.0002, "epoch": 0.6838574084552582, "step": 4230}, {"loss": 0.8161, "grad_norm": 0.5544065833091736, "learning_rate": 0.0002, "epoch": 0.6854740926360036, "step": 4240}, {"loss": 0.7839, "grad_norm": 0.4902505576610565, "learning_rate": 0.0002, "epoch": 0.6870907768167488, "step": 4250}, {"loss": 0.7977, "grad_norm": 0.4768051505088806, "learning_rate": 0.0002, "epoch": 0.6887074609974941, "step": 4260}, {"loss": 0.7539, "grad_norm": 0.49982190132141113, "learning_rate": 0.0002, "epoch": 0.6903241451782395, "step": 4270}, {"loss": 0.7353, "grad_norm": 0.6351838111877441, "learning_rate": 0.0002, "epoch": 0.6919408293589847, "step": 4280}, {"loss": 0.7664, "grad_norm": 0.5647561550140381, "learning_rate": 0.0002, "epoch": 0.69355751353973, "step": 4290}, {"loss": 0.7618, "grad_norm": 0.5340486764907837, "learning_rate": 0.0002, "epoch": 0.6951741977204753, "step": 4300}, {"loss": 0.8526, "grad_norm": 0.5649092793464661, "learning_rate": 0.0002, "epoch": 0.6967908819012206, "step": 4310}, {"loss": 0.8246, "grad_norm": 0.6183916926383972, "learning_rate": 0.0002, "epoch": 0.6984075660819659, "step": 4320}, {"loss": 0.792, "grad_norm": 0.6154509782791138, "learning_rate": 0.0002, "epoch": 0.7000242502627112, "step": 4330}, {"loss": 0.8397, "grad_norm": 0.5156264305114746, "learning_rate": 0.0002, "epoch": 0.7016409344434564, "step": 4340}, {"loss": 0.8512, "grad_norm": 0.562171459197998, "learning_rate": 0.0002, "epoch": 0.7032576186242018, "step": 4350}, {"loss": 0.7882, "grad_norm": 0.4949502646923065, "learning_rate": 0.0002, "epoch": 0.704874302804947, "step": 4360}, {"loss": 0.738, "grad_norm": 0.5171684622764587, "learning_rate": 0.0002, "epoch": 0.7064909869856923, "step": 4370}, {"loss": 0.8001, "grad_norm": 0.6198443174362183, "learning_rate": 0.0002, "epoch": 0.7081076711664377, "step": 4380}, {"loss": 0.7606, "grad_norm": 0.5802276134490967, "learning_rate": 0.0002, "epoch": 0.7097243553471829, "step": 4390}, {"loss": 0.8797, "grad_norm": 0.41096967458724976, "learning_rate": 0.0002, "epoch": 0.7113410395279283, "step": 4400}, {"loss": 0.805, "grad_norm": 0.4397392272949219, "learning_rate": 0.0002, "epoch": 0.7129577237086735, "step": 4410}, {"loss": 0.7651, "grad_norm": 0.45228442549705505, "learning_rate": 0.0002, "epoch": 0.7145744078894188, "step": 4420}, {"loss": 0.7938, "grad_norm": 0.4839673936367035, "learning_rate": 0.0002, "epoch": 0.7161910920701641, "step": 4430}, {"loss": 0.8362, "grad_norm": 0.6140755414962769, "learning_rate": 0.0002, "epoch": 0.7178077762509094, "step": 4440}, {"loss": 0.7722, "grad_norm": 0.6841378808021545, "learning_rate": 0.0002, "epoch": 0.7194244604316546, "step": 4450}, {"loss": 0.8177, "grad_norm": 0.6664239168167114, "learning_rate": 0.0002, "epoch": 0.7210411446124, "step": 4460}, {"loss": 0.7983, "grad_norm": 0.47552719712257385, "learning_rate": 0.0002, "epoch": 0.7226578287931452, "step": 4470}, {"loss": 0.8982, "grad_norm": 0.6649776101112366, "learning_rate": 0.0002, "epoch": 0.7242745129738906, "step": 4480}, {"loss": 0.8074, "grad_norm": 0.5159541964530945, "learning_rate": 0.0002, "epoch": 0.7258911971546358, "step": 4490}, {"loss": 0.7786, "grad_norm": 0.6693112850189209, "learning_rate": 0.0002, "epoch": 0.7275078813353811, "step": 4500}, {"loss": 0.8655, "grad_norm": 0.48870977759361267, "learning_rate": 0.0002, "epoch": 0.7291245655161265, "step": 4510}, {"loss": 0.7337, "grad_norm": 0.4857887923717499, "learning_rate": 0.0002, "epoch": 0.7307412496968717, "step": 4520}, {"loss": 0.8026, "grad_norm": 0.5515662431716919, "learning_rate": 0.0002, "epoch": 0.732357933877617, "step": 4530}, {"loss": 0.8031, "grad_norm": 0.6292222738265991, "learning_rate": 0.0002, "epoch": 0.7339746180583623, "step": 4540}, {"loss": 0.7749, "grad_norm": 0.48265689611434937, "learning_rate": 0.0002, "epoch": 0.7355913022391076, "step": 4550}, {"loss": 0.8499, "grad_norm": 0.8044266104698181, "learning_rate": 0.0002, "epoch": 0.7372079864198529, "step": 4560}, {"loss": 0.8162, "grad_norm": 0.6111769676208496, "learning_rate": 0.0002, "epoch": 0.7388246706005982, "step": 4570}, {"loss": 0.7291, "grad_norm": 0.5229553580284119, "learning_rate": 0.0002, "epoch": 0.7404413547813434, "step": 4580}, {"loss": 0.8038, "grad_norm": 0.6054152250289917, "learning_rate": 0.0002, "epoch": 0.7420580389620888, "step": 4590}, {"loss": 0.8169, "grad_norm": 0.5574966669082642, "learning_rate": 0.0002, "epoch": 0.743674723142834, "step": 4600}, {"loss": 0.8439, "grad_norm": 0.5395817160606384, "learning_rate": 0.0002, "epoch": 0.7452914073235793, "step": 4610}, {"loss": 0.8495, "grad_norm": 0.7116472721099854, "learning_rate": 0.0002, "epoch": 0.7469080915043246, "step": 4620}, {"loss": 0.7743, "grad_norm": 0.5618700981140137, "learning_rate": 0.0002, "epoch": 0.7485247756850699, "step": 4630}, {"loss": 0.7744, "grad_norm": 0.5802770853042603, "learning_rate": 0.0002, "epoch": 0.7501414598658153, "step": 4640}, {"loss": 0.7924, "grad_norm": 0.5690428018569946, "learning_rate": 0.0002, "epoch": 0.7517581440465605, "step": 4650}, {"loss": 0.8017, "grad_norm": 0.4813360273838043, "learning_rate": 0.0002, "epoch": 0.7533748282273058, "step": 4660}, {"loss": 0.8108, "grad_norm": 0.5434042811393738, "learning_rate": 0.0002, "epoch": 0.7549915124080511, "step": 4670}, {"loss": 0.7824, "grad_norm": 0.5502099990844727, "learning_rate": 0.0002, "epoch": 0.7566081965887964, "step": 4680}, {"loss": 0.8598, "grad_norm": 0.6020621061325073, "learning_rate": 0.0002, "epoch": 0.7582248807695416, "step": 4690}, {"loss": 0.7937, "grad_norm": 0.4922301471233368, "learning_rate": 0.0002, "epoch": 0.759841564950287, "step": 4700}, {"loss": 0.788, "grad_norm": 0.6492828726768494, "learning_rate": 0.0002, "epoch": 0.7614582491310322, "step": 4710}, {"loss": 0.8313, "grad_norm": 0.4865580201148987, "learning_rate": 0.0002, "epoch": 0.7630749333117776, "step": 4720}, {"loss": 0.7966, "grad_norm": 0.5971422791481018, "learning_rate": 0.0002, "epoch": 0.7646916174925228, "step": 4730}, {"loss": 0.8298, "grad_norm": 0.6832674145698547, "learning_rate": 0.0002, "epoch": 0.7663083016732681, "step": 4740}, {"loss": 0.8156, "grad_norm": 0.500908613204956, "learning_rate": 0.0002, "epoch": 0.7679249858540134, "step": 4750}, {"loss": 0.8383, "grad_norm": 0.6112465858459473, "learning_rate": 0.0002, "epoch": 0.7695416700347587, "step": 4760}, {"loss": 0.76, "grad_norm": 0.5753506422042847, "learning_rate": 0.0002, "epoch": 0.771158354215504, "step": 4770}, {"loss": 0.8297, "grad_norm": 0.6529405117034912, "learning_rate": 0.0002, "epoch": 0.7727750383962493, "step": 4780}, {"loss": 0.8171, "grad_norm": 0.5916843414306641, "learning_rate": 0.0002, "epoch": 0.7743917225769946, "step": 4790}, {"loss": 0.83, "grad_norm": 0.4821224510669708, "learning_rate": 0.0002, "epoch": 0.7760084067577399, "step": 4800}, {"loss": 0.7703, "grad_norm": 0.5532580018043518, "learning_rate": 0.0002, "epoch": 0.7776250909384852, "step": 4810}, {"loss": 0.7363, "grad_norm": 0.4604877233505249, "learning_rate": 0.0002, "epoch": 0.7792417751192304, "step": 4820}, {"loss": 0.7506, "grad_norm": 0.5009613037109375, "learning_rate": 0.0002, "epoch": 0.7808584592999758, "step": 4830}, {"loss": 0.7863, "grad_norm": 0.6448560357093811, "learning_rate": 0.0002, "epoch": 0.782475143480721, "step": 4840}, {"loss": 0.7957, "grad_norm": 0.44327953457832336, "learning_rate": 0.0002, "epoch": 0.7840918276614663, "step": 4850}, {"loss": 0.7925, "grad_norm": 0.5355411171913147, "learning_rate": 0.0002, "epoch": 0.7857085118422116, "step": 4860}, {"loss": 0.7754, "grad_norm": 0.5635677576065063, "learning_rate": 0.0002, "epoch": 0.7873251960229569, "step": 4870}, {"loss": 0.7931, "grad_norm": 0.5417491793632507, "learning_rate": 0.0002, "epoch": 0.7889418802037023, "step": 4880}, {"loss": 0.7819, "grad_norm": 0.4567430913448334, "learning_rate": 0.0002, "epoch": 0.7905585643844475, "step": 4890}, {"loss": 0.8454, "grad_norm": 0.44651296734809875, "learning_rate": 0.0002, "epoch": 0.7921752485651928, "step": 4900}, {"loss": 0.7959, "grad_norm": 0.5741217136383057, "learning_rate": 0.0002, "epoch": 0.7937919327459381, "step": 4910}, {"loss": 0.8093, "grad_norm": 0.6605045199394226, "learning_rate": 0.0002, "epoch": 0.7954086169266834, "step": 4920}, {"loss": 0.77, "grad_norm": 0.5126531720161438, "learning_rate": 0.0002, "epoch": 0.7970253011074286, "step": 4930}, {"loss": 0.7793, "grad_norm": 0.513648271560669, "learning_rate": 0.0002, "epoch": 0.798641985288174, "step": 4940}, {"loss": 0.8314, "grad_norm": 0.5350404381752014, "learning_rate": 0.0002, "epoch": 0.8002586694689192, "step": 4950}, {"loss": 0.7649, "grad_norm": 0.5731674432754517, "learning_rate": 0.0002, "epoch": 0.8018753536496646, "step": 4960}, {"loss": 0.8572, "grad_norm": 0.5974258184432983, "learning_rate": 0.0002, "epoch": 0.8034920378304098, "step": 4970}, {"loss": 0.7972, "grad_norm": 0.8774799704551697, "learning_rate": 0.0002, "epoch": 0.8051087220111551, "step": 4980}, {"loss": 0.7899, "grad_norm": 0.5994430184364319, "learning_rate": 0.0002, "epoch": 0.8067254061919004, "step": 4990}, {"loss": 0.7736, "grad_norm": 0.4894903004169464, "learning_rate": 0.0002, "epoch": 0.8083420903726457, "step": 5000}, {"loss": 0.78, "grad_norm": 0.5218459367752075, "learning_rate": 0.0002, "epoch": 0.809958774553391, "step": 5010}, {"loss": 0.817, "grad_norm": 0.5232468843460083, "learning_rate": 0.0002, "epoch": 0.8115754587341363, "step": 5020}, {"loss": 0.7704, "grad_norm": 0.44358372688293457, "learning_rate": 0.0002, "epoch": 0.8131921429148816, "step": 5030}, {"loss": 0.785, "grad_norm": 0.6202037334442139, "learning_rate": 0.0002, "epoch": 0.8148088270956269, "step": 5040}, {"loss": 0.7351, "grad_norm": 0.7721474170684814, "learning_rate": 0.0002, "epoch": 0.8164255112763722, "step": 5050}, {"loss": 0.8297, "grad_norm": 0.5568501353263855, "learning_rate": 0.0002, "epoch": 0.8180421954571174, "step": 5060}, {"loss": 0.7733, "grad_norm": 0.49148809909820557, "learning_rate": 0.0002, "epoch": 0.8196588796378628, "step": 5070}, {"loss": 0.8054, "grad_norm": 0.4956012964248657, "learning_rate": 0.0002, "epoch": 0.821275563818608, "step": 5080}, {"loss": 0.8201, "grad_norm": 0.6078833937644958, "learning_rate": 0.0002, "epoch": 0.8228922479993533, "step": 5090}, {"loss": 0.828, "grad_norm": 0.46906954050064087, "learning_rate": 0.0002, "epoch": 0.8245089321800986, "step": 5100}, {"loss": 0.7703, "grad_norm": 0.50812166929245, "learning_rate": 0.0002, "epoch": 0.8261256163608439, "step": 5110}, {"loss": 0.8243, "grad_norm": 0.5319661498069763, "learning_rate": 0.0002, "epoch": 0.8277423005415891, "step": 5120}, {"loss": 0.7798, "grad_norm": 0.4949689209461212, "learning_rate": 0.0002, "epoch": 0.8293589847223345, "step": 5130}, {"loss": 0.7428, "grad_norm": 0.5151591300964355, "learning_rate": 0.0002, "epoch": 0.8309756689030798, "step": 5140}, {"loss": 0.8147, "grad_norm": 0.5530214309692383, "learning_rate": 0.0002, "epoch": 0.8325923530838251, "step": 5150}, {"loss": 0.8251, "grad_norm": 0.6297410130500793, "learning_rate": 0.0002, "epoch": 0.8342090372645704, "step": 5160}, {"loss": 0.8067, "grad_norm": 0.5466840267181396, "learning_rate": 0.0002, "epoch": 0.8358257214453156, "step": 5170}, {"loss": 0.7875, "grad_norm": 0.652913510799408, "learning_rate": 0.0002, "epoch": 0.837442405626061, "step": 5180}, {"loss": 0.8295, "grad_norm": 0.5811293125152588, "learning_rate": 0.0002, "epoch": 0.8390590898068062, "step": 5190}, {"loss": 0.7412, "grad_norm": 0.5109550952911377, "learning_rate": 0.0002, "epoch": 0.8406757739875516, "step": 5200}, {"loss": 0.8077, "grad_norm": 0.4551706612110138, "learning_rate": 0.0002, "epoch": 0.8422924581682968, "step": 5210}, {"loss": 0.7827, "grad_norm": 0.5813754200935364, "learning_rate": 0.0002, "epoch": 0.8439091423490421, "step": 5220}, {"loss": 0.802, "grad_norm": 0.5856947898864746, "learning_rate": 0.0002, "epoch": 0.8455258265297874, "step": 5230}, {"loss": 0.7957, "grad_norm": 0.5482739210128784, "learning_rate": 0.0002, "epoch": 0.8471425107105327, "step": 5240}, {"loss": 0.8295, "grad_norm": 0.49023720622062683, "learning_rate": 0.0002, "epoch": 0.8487591948912779, "step": 5250}, {"loss": 0.8022, "grad_norm": 0.49472475051879883, "learning_rate": 0.0002, "epoch": 0.8503758790720233, "step": 5260}, {"loss": 0.8001, "grad_norm": 0.5490226745605469, "learning_rate": 0.0002, "epoch": 0.8519925632527686, "step": 5270}, {"loss": 0.8333, "grad_norm": 0.5340665578842163, "learning_rate": 0.0002, "epoch": 0.8536092474335139, "step": 5280}, {"loss": 0.8277, "grad_norm": 0.5962483882904053, "learning_rate": 0.0002, "epoch": 0.8552259316142592, "step": 5290}, {"loss": 0.8765, "grad_norm": 0.586358368396759, "learning_rate": 0.0002, "epoch": 0.8568426157950044, "step": 5300}, {"loss": 0.7831, "grad_norm": 0.49120277166366577, "learning_rate": 0.0002, "epoch": 0.8584592999757498, "step": 5310}, {"loss": 0.8162, "grad_norm": 0.5887332558631897, "learning_rate": 0.0002, "epoch": 0.860075984156495, "step": 5320}, {"loss": 0.7464, "grad_norm": 0.42496153712272644, "learning_rate": 0.0002, "epoch": 0.8616926683372403, "step": 5330}, {"loss": 0.7905, "grad_norm": 0.5489874482154846, "learning_rate": 0.0002, "epoch": 0.8633093525179856, "step": 5340}, {"loss": 0.7958, "grad_norm": 0.5850813984870911, "learning_rate": 0.0002, "epoch": 0.8649260366987309, "step": 5350}, {"loss": 0.7642, "grad_norm": 0.517487108707428, "learning_rate": 0.0002, "epoch": 0.8665427208794761, "step": 5360}, {"loss": 0.7801, "grad_norm": 0.5339142680168152, "learning_rate": 0.0002, "epoch": 0.8681594050602215, "step": 5370}, {"loss": 0.818, "grad_norm": 0.6236387491226196, "learning_rate": 0.0002, "epoch": 0.8697760892409668, "step": 5380}, {"loss": 0.7708, "grad_norm": 0.5752192735671997, "learning_rate": 0.0002, "epoch": 0.8713927734217121, "step": 5390}, {"loss": 0.8542, "grad_norm": 0.6724614500999451, "learning_rate": 0.0002, "epoch": 0.8730094576024574, "step": 5400}, {"loss": 0.7581, "grad_norm": 0.5280613303184509, "learning_rate": 0.0002, "epoch": 0.8746261417832026, "step": 5410}, {"loss": 0.8231, "grad_norm": 0.44033288955688477, "learning_rate": 0.0002, "epoch": 0.876242825963948, "step": 5420}, {"loss": 0.8839, "grad_norm": 0.5199708342552185, "learning_rate": 0.0002, "epoch": 0.8778595101446932, "step": 5430}, {"loss": 0.7852, "grad_norm": 0.46778348088264465, "learning_rate": 0.0002, "epoch": 0.8794761943254386, "step": 5440}, {"loss": 0.7834, "grad_norm": 0.4657754898071289, "learning_rate": 0.0002, "epoch": 0.8810928785061838, "step": 5450}, {"loss": 0.7799, "grad_norm": 0.5472902655601501, "learning_rate": 0.0002, "epoch": 0.8827095626869291, "step": 5460}, {"loss": 0.8253, "grad_norm": 0.4876766800880432, "learning_rate": 0.0002, "epoch": 0.8843262468676744, "step": 5470}, {"loss": 0.7906, "grad_norm": 0.5057248473167419, "learning_rate": 0.0002, "epoch": 0.8859429310484197, "step": 5480}, {"loss": 0.8124, "grad_norm": 0.4637320637702942, "learning_rate": 0.0002, "epoch": 0.8875596152291649, "step": 5490}, {"loss": 0.781, "grad_norm": 0.471955806016922, "learning_rate": 0.0002, "epoch": 0.8891762994099103, "step": 5500}, {"loss": 0.8057, "grad_norm": 0.5209813714027405, "learning_rate": 0.0002, "epoch": 0.8907929835906556, "step": 5510}, {"loss": 0.8106, "grad_norm": 0.6213834285736084, "learning_rate": 0.0002, "epoch": 0.8924096677714008, "step": 5520}, {"loss": 0.7787, "grad_norm": 0.5215408205986023, "learning_rate": 0.0002, "epoch": 0.8940263519521462, "step": 5530}, {"loss": 0.8174, "grad_norm": 0.580478310585022, "learning_rate": 0.0002, "epoch": 0.8956430361328914, "step": 5540}, {"loss": 0.8371, "grad_norm": 0.49102169275283813, "learning_rate": 0.0002, "epoch": 0.8972597203136368, "step": 5550}, {"loss": 0.7806, "grad_norm": 0.6043479442596436, "learning_rate": 0.0002, "epoch": 0.898876404494382, "step": 5560}, {"loss": 0.7754, "grad_norm": 0.5636463165283203, "learning_rate": 0.0002, "epoch": 0.9004930886751273, "step": 5570}, {"loss": 0.8145, "grad_norm": 0.5620124340057373, "learning_rate": 0.0002, "epoch": 0.9021097728558726, "step": 5580}, {"loss": 0.8083, "grad_norm": 0.5206354856491089, "learning_rate": 0.0002, "epoch": 0.9037264570366179, "step": 5590}, {"loss": 0.8557, "grad_norm": 0.5798229575157166, "learning_rate": 0.0002, "epoch": 0.9053431412173631, "step": 5600}, {"loss": 0.8097, "grad_norm": 0.6428212523460388, "learning_rate": 0.0002, "epoch": 0.9069598253981085, "step": 5610}, {"loss": 0.7839, "grad_norm": 0.48064687848091125, "learning_rate": 0.0002, "epoch": 0.9085765095788537, "step": 5620}, {"loss": 0.8343, "grad_norm": 0.6347860097885132, "learning_rate": 0.0002, "epoch": 0.9101931937595991, "step": 5630}, {"loss": 0.851, "grad_norm": 0.5353913307189941, "learning_rate": 0.0002, "epoch": 0.9118098779403444, "step": 5640}, {"loss": 0.7736, "grad_norm": 0.5323944091796875, "learning_rate": 0.0002, "epoch": 0.9134265621210896, "step": 5650}, {"loss": 0.8393, "grad_norm": 0.5261843204498291, "learning_rate": 0.0002, "epoch": 0.915043246301835, "step": 5660}, {"loss": 0.7355, "grad_norm": 0.5451326966285706, "learning_rate": 0.0002, "epoch": 0.9166599304825802, "step": 5670}, {"loss": 0.8012, "grad_norm": 0.5183324217796326, "learning_rate": 0.0002, "epoch": 0.9182766146633256, "step": 5680}, {"loss": 0.7659, "grad_norm": 0.47229018807411194, "learning_rate": 0.0002, "epoch": 0.9198932988440708, "step": 5690}, {"loss": 0.7757, "grad_norm": 0.49180513620376587, "learning_rate": 0.0002, "epoch": 0.9215099830248161, "step": 5700}, {"loss": 0.8735, "grad_norm": 0.5419785380363464, "learning_rate": 0.0002, "epoch": 0.9231266672055614, "step": 5710}, {"loss": 0.7378, "grad_norm": 0.5408698916435242, "learning_rate": 0.0002, "epoch": 0.9247433513863067, "step": 5720}, {"loss": 0.7701, "grad_norm": 0.5286232829093933, "learning_rate": 0.0002, "epoch": 0.9263600355670519, "step": 5730}, {"loss": 0.8242, "grad_norm": 0.7539758086204529, "learning_rate": 0.0002, "epoch": 0.9279767197477973, "step": 5740}, {"loss": 0.8118, "grad_norm": 0.5166944861412048, "learning_rate": 0.0002, "epoch": 0.9295934039285425, "step": 5750}, {"loss": 0.783, "grad_norm": 0.6601425409317017, "learning_rate": 0.0002, "epoch": 0.9312100881092878, "step": 5760}, {"loss": 0.7873, "grad_norm": 0.5029960870742798, "learning_rate": 0.0002, "epoch": 0.9328267722900332, "step": 5770}, {"loss": 0.7989, "grad_norm": 0.4926645755767822, "learning_rate": 0.0002, "epoch": 0.9344434564707784, "step": 5780}, {"loss": 0.8174, "grad_norm": 0.5739615559577942, "learning_rate": 0.0002, "epoch": 0.9360601406515238, "step": 5790}, {"loss": 0.8037, "grad_norm": 0.5058279037475586, "learning_rate": 0.0002, "epoch": 0.937676824832269, "step": 5800}, {"loss": 0.8537, "grad_norm": 0.5260962247848511, "learning_rate": 0.0002, "epoch": 0.9392935090130143, "step": 5810}, {"loss": 0.7486, "grad_norm": 0.5768588185310364, "learning_rate": 0.0002, "epoch": 0.9409101931937596, "step": 5820}, {"loss": 0.8215, "grad_norm": 0.5170126557350159, "learning_rate": 0.0002, "epoch": 0.9425268773745049, "step": 5830}, {"loss": 0.7422, "grad_norm": 0.5745864510536194, "learning_rate": 0.0002, "epoch": 0.9441435615552501, "step": 5840}, {"loss": 0.7824, "grad_norm": 0.5551357865333557, "learning_rate": 0.0002, "epoch": 0.9457602457359955, "step": 5850}, {"loss": 0.8529, "grad_norm": 0.5776078701019287, "learning_rate": 0.0002, "epoch": 0.9473769299167407, "step": 5860}, {"loss": 0.8527, "grad_norm": 0.5340062379837036, "learning_rate": 0.0002, "epoch": 0.9489936140974861, "step": 5870}, {"loss": 0.8217, "grad_norm": 0.6447290182113647, "learning_rate": 0.0002, "epoch": 0.9506102982782314, "step": 5880}, {"loss": 0.7945, "grad_norm": 0.5123815536499023, "learning_rate": 0.0002, "epoch": 0.9522269824589766, "step": 5890}, {"loss": 0.8209, "grad_norm": 0.48547613620758057, "learning_rate": 0.0002, "epoch": 0.953843666639722, "step": 5900}, {"loss": 0.7896, "grad_norm": 0.5791414976119995, "learning_rate": 0.0002, "epoch": 0.9554603508204672, "step": 5910}, {"loss": 0.8408, "grad_norm": 0.6195011734962463, "learning_rate": 0.0002, "epoch": 0.9570770350012126, "step": 5920}, {"loss": 0.7805, "grad_norm": 0.6323803067207336, "learning_rate": 0.0002, "epoch": 0.9586937191819578, "step": 5930}, {"loss": 0.8484, "grad_norm": 0.45552879571914673, "learning_rate": 0.0002, "epoch": 0.9603104033627031, "step": 5940}, {"loss": 0.7367, "grad_norm": 0.5796473622322083, "learning_rate": 0.0002, "epoch": 0.9619270875434484, "step": 5950}, {"loss": 0.7672, "grad_norm": 0.647261381149292, "learning_rate": 0.0002, "epoch": 0.9635437717241937, "step": 5960}, {"loss": 0.8086, "grad_norm": 0.5487682819366455, "learning_rate": 0.0002, "epoch": 0.9651604559049389, "step": 5970}, {"loss": 0.7973, "grad_norm": 0.5743663907051086, "learning_rate": 0.0002, "epoch": 0.9667771400856843, "step": 5980}, {"loss": 0.8153, "grad_norm": 0.5470591187477112, "learning_rate": 0.0002, "epoch": 0.9683938242664295, "step": 5990}, {"loss": 0.8119, "grad_norm": 0.5901660323143005, "learning_rate": 0.0002, "epoch": 0.9700105084471748, "step": 6000}, {"loss": 0.8147, "grad_norm": 0.6544759273529053, "learning_rate": 0.0002, "epoch": 0.9716271926279202, "step": 6010}, {"loss": 0.7536, "grad_norm": 0.6288470029830933, "learning_rate": 0.0002, "epoch": 0.9732438768086654, "step": 6020}, {"loss": 0.7989, "grad_norm": 0.673153817653656, "learning_rate": 0.0002, "epoch": 0.9748605609894108, "step": 6030}, {"loss": 0.7556, "grad_norm": 0.42854753136634827, "learning_rate": 0.0002, "epoch": 0.976477245170156, "step": 6040}, {"loss": 0.8006, "grad_norm": 0.5227066278457642, "learning_rate": 0.0002, "epoch": 0.9780939293509013, "step": 6050}, {"loss": 0.795, "grad_norm": 0.5372416973114014, "learning_rate": 0.0002, "epoch": 0.9797106135316466, "step": 6060}, {"loss": 0.7591, "grad_norm": 0.6026402115821838, "learning_rate": 0.0002, "epoch": 0.9813272977123919, "step": 6070}, {"loss": 0.8347, "grad_norm": 0.49547791481018066, "learning_rate": 0.0002, "epoch": 0.9829439818931371, "step": 6080}, {"loss": 0.7722, "grad_norm": 0.4641951322555542, "learning_rate": 0.0002, "epoch": 0.9845606660738825, "step": 6090}, {"loss": 0.8125, "grad_norm": 0.5818535089492798, "learning_rate": 0.0002, "epoch": 0.9861773502546277, "step": 6100}, {"loss": 0.81, "grad_norm": 0.63955157995224, "learning_rate": 0.0002, "epoch": 0.9877940344353731, "step": 6110}, {"loss": 0.7547, "grad_norm": 0.5649438500404358, "learning_rate": 0.0002, "epoch": 0.9894107186161183, "step": 6120}, {"loss": 0.7861, "grad_norm": 0.5290433168411255, "learning_rate": 0.0002, "epoch": 0.9910274027968636, "step": 6130}, {"loss": 0.8109, "grad_norm": 0.6399374008178711, "learning_rate": 0.0002, "epoch": 0.992644086977609, "step": 6140}, {"loss": 0.8373, "grad_norm": 0.6736576557159424, "learning_rate": 0.0002, "epoch": 0.9942607711583542, "step": 6150}, {"loss": 0.7915, "grad_norm": 0.515420138835907, "learning_rate": 0.0002, "epoch": 0.9958774553390995, "step": 6160}, {"loss": 0.8032, "grad_norm": 0.562677800655365, "learning_rate": 0.0002, "epoch": 0.9974941395198448, "step": 6170}, {"loss": 0.8187, "grad_norm": 0.7113858461380005, "learning_rate": 0.0002, "epoch": 0.9991108237005901, "step": 6180}, {"eval_loss": 1.0871200561523438, "eval_runtime": 122.2071, "eval_samples_per_second": 5.998, "eval_steps_per_second": 0.753, "epoch": 0.9999191657909627, "step": 6185}, {"loss": 0.7507, "grad_norm": 0.7111801505088806, "learning_rate": 0.0002, "epoch": 1.0007275078813354, "step": 6190}, {"loss": 0.6865, "grad_norm": 0.5402125716209412, "learning_rate": 0.0002, "epoch": 1.0023441920620806, "step": 6200}, {"loss": 0.7625, "grad_norm": 0.6098830103874207, "learning_rate": 0.0002, "epoch": 1.003960876242826, "step": 6210}, {"loss": 0.7631, "grad_norm": 0.5829983353614807, "learning_rate": 0.0002, "epoch": 1.0055775604235713, "step": 6220}, {"loss": 0.7188, "grad_norm": 0.5614621043205261, "learning_rate": 0.0002, "epoch": 1.0071942446043165, "step": 6230}, {"loss": 0.7505, "grad_norm": 0.5954238772392273, "learning_rate": 0.0002, "epoch": 1.0088109287850617, "step": 6240}, {"loss": 0.7448, "grad_norm": 0.6480574607849121, "learning_rate": 0.0002, "epoch": 1.0104276129658072, "step": 6250}, {"loss": 0.7514, "grad_norm": 0.6051128506660461, "learning_rate": 0.0002, "epoch": 1.0120442971465524, "step": 6260}, {"loss": 0.7237, "grad_norm": 0.6318870782852173, "learning_rate": 0.0002, "epoch": 1.0136609813272976, "step": 6270}, {"loss": 0.7178, "grad_norm": 0.5048980116844177, "learning_rate": 0.0002, "epoch": 1.015277665508043, "step": 6280}, {"loss": 0.7391, "grad_norm": 0.6346936225891113, "learning_rate": 0.0002, "epoch": 1.0168943496887883, "step": 6290}, {"loss": 0.7486, "grad_norm": 0.5711665749549866, "learning_rate": 0.0002, "epoch": 1.0185110338695336, "step": 6300}, {"loss": 0.6808, "grad_norm": 0.5175361037254333, "learning_rate": 0.0002, "epoch": 1.0201277180502788, "step": 6310}, {"loss": 0.7539, "grad_norm": 0.5360831618309021, "learning_rate": 0.0002, "epoch": 1.0217444022310243, "step": 6320}, {"loss": 0.7112, "grad_norm": 0.614675760269165, "learning_rate": 0.0002, "epoch": 1.0233610864117695, "step": 6330}, {"loss": 0.7748, "grad_norm": 0.5626118183135986, "learning_rate": 0.0002, "epoch": 1.0249777705925147, "step": 6340}, {"loss": 0.7375, "grad_norm": 0.574897289276123, "learning_rate": 0.0002, "epoch": 1.02659445477326, "step": 6350}, {"loss": 0.759, "grad_norm": 0.7185447812080383, "learning_rate": 0.0002, "epoch": 1.0282111389540054, "step": 6360}, {"loss": 0.703, "grad_norm": 0.6705799698829651, "learning_rate": 0.0002, "epoch": 1.0298278231347506, "step": 6370}, {"loss": 0.7139, "grad_norm": 0.6740428805351257, "learning_rate": 0.0002, "epoch": 1.0314445073154959, "step": 6380}, {"loss": 0.7252, "grad_norm": 0.663902759552002, "learning_rate": 0.0002, "epoch": 1.0330611914962413, "step": 6390}, {"loss": 0.7065, "grad_norm": 0.5029543042182922, "learning_rate": 0.0002, "epoch": 1.0346778756769865, "step": 6400}, {"loss": 0.711, "grad_norm": 0.7813863158226013, "learning_rate": 0.0002, "epoch": 1.0362945598577318, "step": 6410}, {"loss": 0.7433, "grad_norm": 0.5396282076835632, "learning_rate": 0.0002, "epoch": 1.037911244038477, "step": 6420}, {"loss": 0.7222, "grad_norm": 0.5253293514251709, "learning_rate": 0.0002, "epoch": 1.0395279282192225, "step": 6430}, {"loss": 0.715, "grad_norm": 0.7236770987510681, "learning_rate": 0.0002, "epoch": 1.0411446123999677, "step": 6440}, {"loss": 0.7259, "grad_norm": 0.5670917630195618, "learning_rate": 0.0002, "epoch": 1.042761296580713, "step": 6450}, {"loss": 0.7195, "grad_norm": 0.6031978726387024, "learning_rate": 0.0002, "epoch": 1.0443779807614582, "step": 6460}, {"loss": 0.7648, "grad_norm": 0.5309213399887085, "learning_rate": 0.0002, "epoch": 1.0459946649422036, "step": 6470}, {"loss": 0.7161, "grad_norm": 0.7114651799201965, "learning_rate": 0.0002, "epoch": 1.0476113491229488, "step": 6480}, {"loss": 0.7583, "grad_norm": 0.5591610670089722, "learning_rate": 0.0002, "epoch": 1.049228033303694, "step": 6490}, {"loss": 0.6645, "grad_norm": 0.5185961127281189, "learning_rate": 0.0002, "epoch": 1.0508447174844395, "step": 6500}, {"loss": 0.7654, "grad_norm": 0.6510552167892456, "learning_rate": 0.0002, "epoch": 1.0524614016651848, "step": 6510}, {"loss": 0.7057, "grad_norm": 0.6557928919792175, "learning_rate": 0.0002, "epoch": 1.05407808584593, "step": 6520}, {"loss": 0.8056, "grad_norm": 0.6973192691802979, "learning_rate": 0.0002, "epoch": 1.0556947700266752, "step": 6530}, {"loss": 0.6793, "grad_norm": 0.6226583123207092, "learning_rate": 0.0002, "epoch": 1.0573114542074207, "step": 6540}, {"loss": 0.7151, "grad_norm": 0.5633195638656616, "learning_rate": 0.0002, "epoch": 1.058928138388166, "step": 6550}, {"loss": 0.7082, "grad_norm": 0.7466658353805542, "learning_rate": 0.0002, "epoch": 1.0605448225689111, "step": 6560}, {"loss": 0.7059, "grad_norm": 0.6462772488594055, "learning_rate": 0.0002, "epoch": 1.0621615067496564, "step": 6570}, {"loss": 0.7046, "grad_norm": 0.5266856551170349, "learning_rate": 0.0002, "epoch": 1.0637781909304018, "step": 6580}, {"loss": 0.7157, "grad_norm": 0.534392774105072, "learning_rate": 0.0002, "epoch": 1.065394875111147, "step": 6590}, {"loss": 0.7115, "grad_norm": 0.7514177560806274, "learning_rate": 0.0002, "epoch": 1.0670115592918923, "step": 6600}, {"loss": 0.7545, "grad_norm": 0.7593035697937012, "learning_rate": 0.0002, "epoch": 1.0686282434726375, "step": 6610}, {"loss": 0.6836, "grad_norm": 0.5277858972549438, "learning_rate": 0.0002, "epoch": 1.070244927653383, "step": 6620}, {"loss": 0.7405, "grad_norm": 0.5573670268058777, "learning_rate": 0.0002, "epoch": 1.0718616118341282, "step": 6630}, {"loss": 0.6774, "grad_norm": 0.6802396774291992, "learning_rate": 0.0002, "epoch": 1.0734782960148734, "step": 6640}, {"loss": 0.723, "grad_norm": 0.7367215752601624, "learning_rate": 0.0002, "epoch": 1.0750949801956189, "step": 6650}, {"loss": 0.7429, "grad_norm": 0.5961891412734985, "learning_rate": 0.0002, "epoch": 1.0767116643763641, "step": 6660}, {"loss": 0.6791, "grad_norm": 0.5736313462257385, "learning_rate": 0.0002, "epoch": 1.0783283485571094, "step": 6670}, {"loss": 0.7178, "grad_norm": 0.619219183921814, "learning_rate": 0.0002, "epoch": 1.0799450327378546, "step": 6680}, {"loss": 0.7318, "grad_norm": 0.6214390993118286, "learning_rate": 0.0002, "epoch": 1.0815617169186, "step": 6690}, {"loss": 0.7554, "grad_norm": 0.564536988735199, "learning_rate": 0.0002, "epoch": 1.0831784010993453, "step": 6700}, {"loss": 0.7362, "grad_norm": 0.5838140249252319, "learning_rate": 0.0002, "epoch": 1.0847950852800905, "step": 6710}, {"loss": 0.739, "grad_norm": 0.7000553607940674, "learning_rate": 0.0002, "epoch": 1.0864117694608357, "step": 6720}, {"loss": 0.7369, "grad_norm": 0.7078263759613037, "learning_rate": 0.0002, "epoch": 1.0880284536415812, "step": 6730}, {"loss": 0.7654, "grad_norm": 0.8353848457336426, "learning_rate": 0.0002, "epoch": 1.0896451378223264, "step": 6740}, {"loss": 0.7015, "grad_norm": 0.5615518689155579, "learning_rate": 0.0002, "epoch": 1.0912618220030716, "step": 6750}, {"loss": 0.7396, "grad_norm": 0.5475581288337708, "learning_rate": 0.0002, "epoch": 1.0928785061838169, "step": 6760}, {"loss": 0.7652, "grad_norm": 0.5835978388786316, "learning_rate": 0.0002, "epoch": 1.0944951903645623, "step": 6770}, {"loss": 0.7541, "grad_norm": 0.5516105890274048, "learning_rate": 0.0002, "epoch": 1.0961118745453076, "step": 6780}, {"loss": 0.6842, "grad_norm": 0.5875251889228821, "learning_rate": 0.0002, "epoch": 1.0977285587260528, "step": 6790}, {"loss": 0.6903, "grad_norm": 0.7376947999000549, "learning_rate": 0.0002, "epoch": 1.0993452429067982, "step": 6800}, {"loss": 0.7512, "grad_norm": 0.5656165480613708, "learning_rate": 0.0002, "epoch": 1.1009619270875435, "step": 6810}, {"loss": 0.7409, "grad_norm": 0.6365954279899597, "learning_rate": 0.0002, "epoch": 1.1025786112682887, "step": 6820}, {"loss": 0.7392, "grad_norm": 0.5033080577850342, "learning_rate": 0.0002, "epoch": 1.104195295449034, "step": 6830}, {"loss": 0.6909, "grad_norm": 0.617396891117096, "learning_rate": 0.0002, "epoch": 1.1058119796297794, "step": 6840}, {"loss": 0.7006, "grad_norm": 0.6395374536514282, "learning_rate": 0.0002, "epoch": 1.1074286638105246, "step": 6850}, {"loss": 0.7335, "grad_norm": 0.6775295734405518, "learning_rate": 0.0002, "epoch": 1.1090453479912699, "step": 6860}, {"loss": 0.764, "grad_norm": 0.6655223965644836, "learning_rate": 0.0002, "epoch": 1.1106620321720153, "step": 6870}, {"loss": 0.7553, "grad_norm": 0.676655113697052, "learning_rate": 0.0002, "epoch": 1.1122787163527605, "step": 6880}, {"loss": 0.7342, "grad_norm": 0.6062718629837036, "learning_rate": 0.0002, "epoch": 1.1138954005335058, "step": 6890}, {"loss": 0.7446, "grad_norm": 0.590943455696106, "learning_rate": 0.0002, "epoch": 1.115512084714251, "step": 6900}, {"loss": 0.6705, "grad_norm": 0.6315317153930664, "learning_rate": 0.0002, "epoch": 1.1171287688949965, "step": 6910}, {"loss": 0.6912, "grad_norm": 0.47979024052619934, "learning_rate": 0.0002, "epoch": 1.1187454530757417, "step": 6920}, {"loss": 0.7002, "grad_norm": 0.647298276424408, "learning_rate": 0.0002, "epoch": 1.120362137256487, "step": 6930}, {"loss": 0.7502, "grad_norm": 0.7336484789848328, "learning_rate": 0.0002, "epoch": 1.1219788214372322, "step": 6940}, {"loss": 0.693, "grad_norm": 0.5071424245834351, "learning_rate": 0.0002, "epoch": 1.1235955056179776, "step": 6950}, {"loss": 0.7378, "grad_norm": 0.6527144312858582, "learning_rate": 0.0002, "epoch": 1.1252121897987228, "step": 6960}, {"loss": 0.7228, "grad_norm": 0.6935935020446777, "learning_rate": 0.0002, "epoch": 1.126828873979468, "step": 6970}, {"loss": 0.699, "grad_norm": 0.8026931881904602, "learning_rate": 0.0002, "epoch": 1.1284455581602133, "step": 6980}, {"loss": 0.7361, "grad_norm": 0.5210393667221069, "learning_rate": 0.0002, "epoch": 1.1300622423409588, "step": 6990}, {"loss": 0.7456, "grad_norm": 0.60475093126297, "learning_rate": 0.0002, "epoch": 1.131678926521704, "step": 7000}, {"loss": 0.7495, "grad_norm": 0.6417073607444763, "learning_rate": 0.0002, "epoch": 1.1332956107024492, "step": 7010}, {"loss": 0.7459, "grad_norm": 0.6732175946235657, "learning_rate": 0.0002, "epoch": 1.1349122948831947, "step": 7020}, {"loss": 0.7278, "grad_norm": 0.6719491481781006, "learning_rate": 0.0002, "epoch": 1.13652897906394, "step": 7030}, {"loss": 0.7694, "grad_norm": 0.5708295106887817, "learning_rate": 0.0002, "epoch": 1.1381456632446851, "step": 7040}, {"loss": 0.7823, "grad_norm": 0.7141719460487366, "learning_rate": 0.0002, "epoch": 1.1397623474254304, "step": 7050}, {"loss": 0.764, "grad_norm": 0.6187017560005188, "learning_rate": 0.0002, "epoch": 1.1413790316061758, "step": 7060}, {"loss": 0.7657, "grad_norm": 0.50581294298172, "learning_rate": 0.0002, "epoch": 1.142995715786921, "step": 7070}, {"loss": 0.7357, "grad_norm": 0.5620143413543701, "learning_rate": 0.0002, "epoch": 1.1446123999676663, "step": 7080}, {"loss": 0.7287, "grad_norm": 0.6231929659843445, "learning_rate": 0.0002, "epoch": 1.1462290841484115, "step": 7090}, {"loss": 0.7328, "grad_norm": 0.5775774121284485, "learning_rate": 0.0002, "epoch": 1.147845768329157, "step": 7100}, {"loss": 0.7728, "grad_norm": 0.6492809653282166, "learning_rate": 0.0002, "epoch": 1.1494624525099022, "step": 7110}, {"loss": 0.7545, "grad_norm": 0.6434972286224365, "learning_rate": 0.0002, "epoch": 1.1510791366906474, "step": 7120}, {"loss": 0.7374, "grad_norm": 0.6191812753677368, "learning_rate": 0.0002, "epoch": 1.1526958208713927, "step": 7130}, {"loss": 0.7276, "grad_norm": 0.6690331697463989, "learning_rate": 0.0002, "epoch": 1.1543125050521381, "step": 7140}, {"loss": 0.7704, "grad_norm": 0.5977938175201416, "learning_rate": 0.0002, "epoch": 1.1559291892328833, "step": 7150}, {"loss": 0.7251, "grad_norm": 0.6195854544639587, "learning_rate": 0.0002, "epoch": 1.1575458734136286, "step": 7160}, {"loss": 0.7249, "grad_norm": 0.5752048492431641, "learning_rate": 0.0002, "epoch": 1.159162557594374, "step": 7170}, {"loss": 0.7593, "grad_norm": 0.589081883430481, "learning_rate": 0.0002, "epoch": 1.1607792417751193, "step": 7180}, {"loss": 0.704, "grad_norm": 0.756996750831604, "learning_rate": 0.0002, "epoch": 1.1623959259558645, "step": 7190}, {"loss": 0.7404, "grad_norm": 0.7614967226982117, "learning_rate": 0.0002, "epoch": 1.1640126101366097, "step": 7200}, {"loss": 0.7867, "grad_norm": 0.6120437979698181, "learning_rate": 0.0002, "epoch": 1.1656292943173552, "step": 7210}, {"loss": 0.7384, "grad_norm": 0.6210004687309265, "learning_rate": 0.0002, "epoch": 1.1672459784981004, "step": 7220}, {"loss": 0.7251, "grad_norm": 0.6044116020202637, "learning_rate": 0.0002, "epoch": 1.1688626626788456, "step": 7230}, {"loss": 0.7361, "grad_norm": 0.5418457388877869, "learning_rate": 0.0002, "epoch": 1.170479346859591, "step": 7240}, {"loss": 0.6938, "grad_norm": 0.6413537263870239, "learning_rate": 0.0002, "epoch": 1.1720960310403363, "step": 7250}, {"loss": 0.6978, "grad_norm": 0.5777867436408997, "learning_rate": 0.0002, "epoch": 1.1737127152210816, "step": 7260}, {"loss": 0.7503, "grad_norm": 0.7092402577400208, "learning_rate": 0.0002, "epoch": 1.1753293994018268, "step": 7270}, {"loss": 0.7487, "grad_norm": 0.6351709365844727, "learning_rate": 0.0002, "epoch": 1.176946083582572, "step": 7280}, {"loss": 0.7527, "grad_norm": 0.6172189712524414, "learning_rate": 0.0002, "epoch": 1.1785627677633175, "step": 7290}, {"loss": 0.7319, "grad_norm": 0.6801714897155762, "learning_rate": 0.0002, "epoch": 1.1801794519440627, "step": 7300}, {"loss": 0.6941, "grad_norm": 0.6044712066650391, "learning_rate": 0.0002, "epoch": 1.181796136124808, "step": 7310}, {"loss": 0.6951, "grad_norm": 0.7413212060928345, "learning_rate": 0.0002, "epoch": 1.1834128203055534, "step": 7320}, {"loss": 0.7396, "grad_norm": 0.5303856134414673, "learning_rate": 0.0002, "epoch": 1.1850295044862986, "step": 7330}, {"loss": 0.6915, "grad_norm": 0.5647098422050476, "learning_rate": 0.0002, "epoch": 1.1866461886670439, "step": 7340}, {"loss": 0.7506, "grad_norm": 0.7374135255813599, "learning_rate": 0.0002, "epoch": 1.188262872847789, "step": 7350}, {"loss": 0.7041, "grad_norm": 0.5710089206695557, "learning_rate": 0.0002, "epoch": 1.1898795570285345, "step": 7360}, {"loss": 0.8289, "grad_norm": 0.6073619723320007, "learning_rate": 0.0002, "epoch": 1.1914962412092798, "step": 7370}, {"loss": 0.7722, "grad_norm": 0.5899916887283325, "learning_rate": 0.0002, "epoch": 1.193112925390025, "step": 7380}, {"loss": 0.756, "grad_norm": 0.7762434482574463, "learning_rate": 0.0002, "epoch": 1.1947296095707705, "step": 7390}, {"loss": 0.7319, "grad_norm": 0.679949939250946, "learning_rate": 0.0002, "epoch": 1.1963462937515157, "step": 7400}, {"loss": 0.7599, "grad_norm": 0.6106849312782288, "learning_rate": 0.0002, "epoch": 1.197962977932261, "step": 7410}, {"loss": 0.7648, "grad_norm": 0.682461678981781, "learning_rate": 0.0002, "epoch": 1.1995796621130062, "step": 7420}, {"loss": 0.7741, "grad_norm": 0.6087017059326172, "learning_rate": 0.0002, "epoch": 1.2011963462937516, "step": 7430}, {"loss": 0.7642, "grad_norm": 0.63739013671875, "learning_rate": 0.0002, "epoch": 1.2028130304744968, "step": 7440}, {"loss": 0.7611, "grad_norm": 0.6154777407646179, "learning_rate": 0.0002, "epoch": 1.204429714655242, "step": 7450}, {"loss": 0.7565, "grad_norm": 0.7491534948348999, "learning_rate": 0.0002, "epoch": 1.2060463988359873, "step": 7460}, {"loss": 0.698, "grad_norm": 0.6664797067642212, "learning_rate": 0.0002, "epoch": 1.2076630830167328, "step": 7470}, {"loss": 0.7456, "grad_norm": 0.6660266518592834, "learning_rate": 0.0002, "epoch": 1.209279767197478, "step": 7480}, {"loss": 0.714, "grad_norm": 0.6972551345825195, "learning_rate": 0.0002, "epoch": 1.2108964513782232, "step": 7490}, {"loss": 0.7023, "grad_norm": 0.6157945990562439, "learning_rate": 0.0002, "epoch": 1.2125131355589684, "step": 7500}, {"loss": 0.7326, "grad_norm": 0.5199310183525085, "learning_rate": 0.0002, "epoch": 1.214129819739714, "step": 7510}, {"loss": 0.7586, "grad_norm": 0.577610433101654, "learning_rate": 0.0002, "epoch": 1.2157465039204591, "step": 7520}, {"loss": 0.7179, "grad_norm": 0.53652423620224, "learning_rate": 0.0002, "epoch": 1.2173631881012044, "step": 7530}, {"loss": 0.7393, "grad_norm": 0.6479050517082214, "learning_rate": 0.0002, "epoch": 1.2189798722819498, "step": 7540}, {"loss": 0.7534, "grad_norm": 0.618748128414154, "learning_rate": 0.0002, "epoch": 1.220596556462695, "step": 7550}, {"loss": 0.6886, "grad_norm": 0.6311424374580383, "learning_rate": 0.0002, "epoch": 1.2222132406434403, "step": 7560}, {"loss": 0.7272, "grad_norm": 0.6595825552940369, "learning_rate": 0.0002, "epoch": 1.2238299248241855, "step": 7570}, {"loss": 0.7353, "grad_norm": 0.5198960900306702, "learning_rate": 0.0002, "epoch": 1.225446609004931, "step": 7580}, {"loss": 0.674, "grad_norm": 0.578650712966919, "learning_rate": 0.0002, "epoch": 1.2270632931856762, "step": 7590}, {"loss": 0.7507, "grad_norm": 0.6080220937728882, "learning_rate": 0.0002, "epoch": 1.2286799773664214, "step": 7600}, {"loss": 0.7733, "grad_norm": 0.7050248384475708, "learning_rate": 0.0002, "epoch": 1.2302966615471669, "step": 7610}, {"loss": 0.7032, "grad_norm": 0.6652196049690247, "learning_rate": 0.0002, "epoch": 1.2319133457279121, "step": 7620}, {"loss": 0.7085, "grad_norm": 0.7322776317596436, "learning_rate": 0.0002, "epoch": 1.2335300299086573, "step": 7630}, {"loss": 0.7402, "grad_norm": 0.4998728036880493, "learning_rate": 0.0002, "epoch": 1.2351467140894026, "step": 7640}, {"loss": 0.7214, "grad_norm": 0.6428788900375366, "learning_rate": 0.0002, "epoch": 1.2367633982701478, "step": 7650}, {"loss": 0.7699, "grad_norm": 0.585242509841919, "learning_rate": 0.0002, "epoch": 1.2383800824508933, "step": 7660}, {"loss": 0.7621, "grad_norm": 0.5211917757987976, "learning_rate": 0.0002, "epoch": 1.2399967666316385, "step": 7670}, {"loss": 0.746, "grad_norm": 0.6490384340286255, "learning_rate": 0.0002, "epoch": 1.2416134508123837, "step": 7680}, {"loss": 0.7186, "grad_norm": 0.6249763369560242, "learning_rate": 0.0002, "epoch": 1.2432301349931292, "step": 7690}, {"loss": 0.7761, "grad_norm": 0.71870356798172, "learning_rate": 0.0002, "epoch": 1.2448468191738744, "step": 7700}, {"loss": 0.7525, "grad_norm": 0.6761967539787292, "learning_rate": 0.0002, "epoch": 1.2464635033546196, "step": 7710}, {"loss": 0.7501, "grad_norm": 0.6500617265701294, "learning_rate": 0.0002, "epoch": 1.2480801875353649, "step": 7720}, {"loss": 0.7903, "grad_norm": 0.8069869875907898, "learning_rate": 0.0002, "epoch": 1.2496968717161103, "step": 7730}, {"loss": 0.6747, "grad_norm": 0.6044608950614929, "learning_rate": 0.0002, "epoch": 1.2513135558968556, "step": 7740}, {"loss": 0.6825, "grad_norm": 0.6573283076286316, "learning_rate": 0.0002, "epoch": 1.2529302400776008, "step": 7750}, {"loss": 0.7617, "grad_norm": 0.625430166721344, "learning_rate": 0.0002, "epoch": 1.2545469242583462, "step": 7760}, {"loss": 0.7041, "grad_norm": 0.5442022681236267, "learning_rate": 0.0002, "epoch": 1.2561636084390915, "step": 7770}, {"loss": 0.7172, "grad_norm": 0.6818386912345886, "learning_rate": 0.0002, "epoch": 1.2577802926198367, "step": 7780}, {"loss": 0.696, "grad_norm": 0.6381874084472656, "learning_rate": 0.0002, "epoch": 1.259396976800582, "step": 7790}, {"loss": 0.6834, "grad_norm": 0.6269212961196899, "learning_rate": 0.0002, "epoch": 1.2610136609813272, "step": 7800}, {"loss": 0.7821, "grad_norm": 0.600121259689331, "learning_rate": 0.0002, "epoch": 1.2626303451620726, "step": 7810}, {"loss": 0.7761, "grad_norm": 0.6337703466415405, "learning_rate": 0.0002, "epoch": 1.2642470293428179, "step": 7820}, {"loss": 0.732, "grad_norm": 0.7234963774681091, "learning_rate": 0.0002, "epoch": 1.2658637135235633, "step": 7830}, {"loss": 0.785, "grad_norm": 0.800184965133667, "learning_rate": 0.0002, "epoch": 1.2674803977043085, "step": 7840}, {"loss": 0.7426, "grad_norm": 0.7539464831352234, "learning_rate": 0.0002, "epoch": 1.2690970818850538, "step": 7850}, {"loss": 0.7496, "grad_norm": 0.5493760704994202, "learning_rate": 0.0002, "epoch": 1.270713766065799, "step": 7860}, {"loss": 0.7537, "grad_norm": 0.7477145791053772, "learning_rate": 0.0002, "epoch": 1.2723304502465442, "step": 7870}, {"loss": 0.7573, "grad_norm": 0.6366362571716309, "learning_rate": 0.0002, "epoch": 1.2739471344272897, "step": 7880}, {"loss": 0.7608, "grad_norm": 0.7419533729553223, "learning_rate": 0.0002, "epoch": 1.275563818608035, "step": 7890}, {"loss": 0.7873, "grad_norm": 0.6141223311424255, "learning_rate": 0.0002, "epoch": 1.2771805027887801, "step": 7900}, {"loss": 0.6916, "grad_norm": 0.7522598505020142, "learning_rate": 0.0002, "epoch": 1.2787971869695256, "step": 7910}, {"loss": 0.7097, "grad_norm": 0.6935804486274719, "learning_rate": 0.0002, "epoch": 1.2804138711502708, "step": 7920}, {"loss": 0.7185, "grad_norm": 0.7239290475845337, "learning_rate": 0.0002, "epoch": 1.282030555331016, "step": 7930}, {"loss": 0.7145, "grad_norm": 0.8800187110900879, "learning_rate": 0.0002, "epoch": 1.2836472395117613, "step": 7940}, {"loss": 0.6991, "grad_norm": 0.540458083152771, "learning_rate": 0.0002, "epoch": 1.2852639236925067, "step": 7950}, {"loss": 0.7139, "grad_norm": 0.6492934226989746, "learning_rate": 0.0002, "epoch": 1.286880607873252, "step": 7960}, {"loss": 0.7742, "grad_norm": 0.6543959379196167, "learning_rate": 0.0002, "epoch": 1.2884972920539972, "step": 7970}, {"loss": 0.7316, "grad_norm": 0.5804705619812012, "learning_rate": 0.0002, "epoch": 1.2901139762347427, "step": 7980}, {"loss": 0.796, "grad_norm": 0.7074727416038513, "learning_rate": 0.0002, "epoch": 1.291730660415488, "step": 7990}, {"loss": 0.7034, "grad_norm": 0.5347974300384521, "learning_rate": 0.0002, "epoch": 1.2933473445962331, "step": 8000}, {"loss": 0.738, "grad_norm": 0.6457298398017883, "learning_rate": 0.0002, "epoch": 1.2949640287769784, "step": 8010}, {"loss": 0.7634, "grad_norm": 0.6407219171524048, "learning_rate": 0.0002, "epoch": 1.2965807129577236, "step": 8020}, {"loss": 0.7506, "grad_norm": 0.828439474105835, "learning_rate": 0.0002, "epoch": 1.298197397138469, "step": 8030}, {"loss": 0.735, "grad_norm": 0.4840380549430847, "learning_rate": 0.0002, "epoch": 1.2998140813192143, "step": 8040}, {"loss": 0.7283, "grad_norm": 0.5921024680137634, "learning_rate": 0.0002, "epoch": 1.3014307654999595, "step": 8050}, {"loss": 0.7477, "grad_norm": 0.6170315146446228, "learning_rate": 0.0002, "epoch": 1.303047449680705, "step": 8060}, {"loss": 0.7534, "grad_norm": 0.5374847054481506, "learning_rate": 0.0002, "epoch": 1.3046641338614502, "step": 8070}, {"loss": 0.7593, "grad_norm": 0.545758068561554, "learning_rate": 0.0002, "epoch": 1.3062808180421954, "step": 8080}, {"loss": 0.7463, "grad_norm": 0.55641770362854, "learning_rate": 0.0002, "epoch": 1.3078975022229407, "step": 8090}, {"loss": 0.7594, "grad_norm": 0.6724897027015686, "learning_rate": 0.0002, "epoch": 1.309514186403686, "step": 8100}, {"loss": 0.7105, "grad_norm": 0.6923972368240356, "learning_rate": 0.0002, "epoch": 1.3111308705844313, "step": 8110}, {"loss": 0.7149, "grad_norm": 0.5136841535568237, "learning_rate": 0.0002, "epoch": 1.3127475547651766, "step": 8120}, {"loss": 0.7504, "grad_norm": 0.6766283512115479, "learning_rate": 0.0002, "epoch": 1.314364238945922, "step": 8130}, {"loss": 0.7489, "grad_norm": 0.6283926367759705, "learning_rate": 0.0002, "epoch": 1.3159809231266673, "step": 8140}, {"loss": 0.7459, "grad_norm": 0.644216001033783, "learning_rate": 0.0002, "epoch": 1.3175976073074125, "step": 8150}, {"loss": 0.7125, "grad_norm": 0.7827503085136414, "learning_rate": 0.0002, "epoch": 1.3192142914881577, "step": 8160}, {"loss": 0.7271, "grad_norm": 0.6651390790939331, "learning_rate": 0.0002, "epoch": 1.320830975668903, "step": 8170}, {"loss": 0.7778, "grad_norm": 0.5547412633895874, "learning_rate": 0.0002, "epoch": 1.3224476598496484, "step": 8180}, {"loss": 0.7402, "grad_norm": 0.6765179634094238, "learning_rate": 0.0002, "epoch": 1.3240643440303936, "step": 8190}, {"loss": 0.7106, "grad_norm": 0.6822077035903931, "learning_rate": 0.0002, "epoch": 1.325681028211139, "step": 8200}, {"loss": 0.7288, "grad_norm": 0.5941002368927002, "learning_rate": 0.0002, "epoch": 1.3272977123918843, "step": 8210}, {"loss": 0.7494, "grad_norm": 0.4850037097930908, "learning_rate": 0.0002, "epoch": 1.3289143965726296, "step": 8220}, {"loss": 0.7474, "grad_norm": 0.6162990927696228, "learning_rate": 0.0002, "epoch": 1.3305310807533748, "step": 8230}, {"loss": 0.7751, "grad_norm": 0.6665613651275635, "learning_rate": 0.0002, "epoch": 1.33214776493412, "step": 8240}, {"loss": 0.759, "grad_norm": 0.618192732334137, "learning_rate": 0.0002, "epoch": 1.3337644491148655, "step": 8250}, {"loss": 0.7532, "grad_norm": 0.710418701171875, "learning_rate": 0.0002, "epoch": 1.3353811332956107, "step": 8260}, {"loss": 0.7306, "grad_norm": 0.5109876990318298, "learning_rate": 0.0002, "epoch": 1.336997817476356, "step": 8270}, {"loss": 0.7303, "grad_norm": 0.6791711449623108, "learning_rate": 0.0002, "epoch": 1.3386145016571014, "step": 8280}, {"loss": 0.7594, "grad_norm": 0.6836432814598083, "learning_rate": 0.0002, "epoch": 1.3402311858378466, "step": 8290}, {"loss": 0.7594, "grad_norm": 0.5579386353492737, "learning_rate": 0.0002, "epoch": 1.3418478700185918, "step": 8300}, {"loss": 0.7377, "grad_norm": 0.6713546514511108, "learning_rate": 0.0002, "epoch": 1.343464554199337, "step": 8310}, {"loss": 0.7756, "grad_norm": 0.5353720188140869, "learning_rate": 0.0002, "epoch": 1.3450812383800825, "step": 8320}, {"loss": 0.718, "grad_norm": 0.5813682675361633, "learning_rate": 0.0002, "epoch": 1.3466979225608278, "step": 8330}, {"loss": 0.7294, "grad_norm": 0.8158791661262512, "learning_rate": 0.0002, "epoch": 1.348314606741573, "step": 8340}, {"loss": 0.6992, "grad_norm": 0.6193785071372986, "learning_rate": 0.0002, "epoch": 1.3499312909223184, "step": 8350}, {"loss": 0.7654, "grad_norm": 0.6353939771652222, "learning_rate": 0.0002, "epoch": 1.3515479751030637, "step": 8360}, {"loss": 0.7519, "grad_norm": 0.6925048232078552, "learning_rate": 0.0002, "epoch": 1.353164659283809, "step": 8370}, {"loss": 0.736, "grad_norm": 0.988264799118042, "learning_rate": 0.0002, "epoch": 1.3547813434645541, "step": 8380}, {"loss": 0.7744, "grad_norm": 0.6476002931594849, "learning_rate": 0.0002, "epoch": 1.3563980276452994, "step": 8390}, {"loss": 0.776, "grad_norm": 0.7120398879051208, "learning_rate": 0.0002, "epoch": 1.3580147118260448, "step": 8400}, {"loss": 0.7368, "grad_norm": 0.9048416614532471, "learning_rate": 0.0002, "epoch": 1.35963139600679, "step": 8410}, {"loss": 0.7544, "grad_norm": 0.7000672817230225, "learning_rate": 0.0002, "epoch": 1.3612480801875353, "step": 8420}, {"loss": 0.7358, "grad_norm": 0.6015632152557373, "learning_rate": 0.0002, "epoch": 1.3628647643682807, "step": 8430}, {"loss": 0.7298, "grad_norm": 0.612516462802887, "learning_rate": 0.0002, "epoch": 1.364481448549026, "step": 8440}, {"loss": 0.7055, "grad_norm": 0.5969301462173462, "learning_rate": 0.0002, "epoch": 1.3660981327297712, "step": 8450}, {"loss": 0.7754, "grad_norm": 0.6730654239654541, "learning_rate": 0.0002, "epoch": 1.3677148169105164, "step": 8460}, {"loss": 0.7465, "grad_norm": 0.6386392116546631, "learning_rate": 0.0002, "epoch": 1.369331501091262, "step": 8470}, {"loss": 0.7433, "grad_norm": 0.739544153213501, "learning_rate": 0.0002, "epoch": 1.3709481852720071, "step": 8480}, {"loss": 0.7892, "grad_norm": 0.6462782621383667, "learning_rate": 0.0002, "epoch": 1.3725648694527524, "step": 8490}, {"loss": 0.7302, "grad_norm": 0.7346843481063843, "learning_rate": 0.0002, "epoch": 1.3741815536334978, "step": 8500}, {"loss": 0.7634, "grad_norm": 0.6884821057319641, "learning_rate": 0.0002, "epoch": 1.375798237814243, "step": 8510}, {"loss": 0.7614, "grad_norm": 0.6999333500862122, "learning_rate": 0.0002, "epoch": 1.3774149219949883, "step": 8520}, {"loss": 0.729, "grad_norm": 0.5378713011741638, "learning_rate": 0.0002, "epoch": 1.3790316061757335, "step": 8530}, {"loss": 0.6797, "grad_norm": 0.5417906641960144, "learning_rate": 0.0002, "epoch": 1.3806482903564787, "step": 8540}, {"loss": 0.7499, "grad_norm": 0.6602526307106018, "learning_rate": 0.0002, "epoch": 1.3822649745372242, "step": 8550}, {"loss": 0.7356, "grad_norm": 0.7073674201965332, "learning_rate": 0.0002, "epoch": 1.3838816587179694, "step": 8560}, {"loss": 0.75, "grad_norm": 0.5841707587242126, "learning_rate": 0.0002, "epoch": 1.3854983428987149, "step": 8570}, {"loss": 0.732, "grad_norm": 0.7031095027923584, "learning_rate": 0.0002, "epoch": 1.38711502707946, "step": 8580}, {"loss": 0.7464, "grad_norm": 0.5198570489883423, "learning_rate": 0.0002, "epoch": 1.3887317112602053, "step": 8590}, {"loss": 0.7354, "grad_norm": 0.7261320352554321, "learning_rate": 0.0002, "epoch": 1.3903483954409506, "step": 8600}, {"loss": 0.7339, "grad_norm": 0.5616350173950195, "learning_rate": 0.0002, "epoch": 1.3919650796216958, "step": 8610}, {"loss": 0.7382, "grad_norm": 0.5185914635658264, "learning_rate": 0.0002, "epoch": 1.3935817638024413, "step": 8620}, {"loss": 0.7456, "grad_norm": 0.5814694762229919, "learning_rate": 0.0002, "epoch": 1.3951984479831865, "step": 8630}, {"loss": 0.7413, "grad_norm": 0.6977371573448181, "learning_rate": 0.0002, "epoch": 1.3968151321639317, "step": 8640}, {"loss": 0.7574, "grad_norm": 0.6855689883232117, "learning_rate": 0.0002, "epoch": 1.3984318163446772, "step": 8650}, {"loss": 0.7802, "grad_norm": 0.5414357781410217, "learning_rate": 0.0002, "epoch": 1.4000485005254224, "step": 8660}, {"loss": 0.7487, "grad_norm": 0.6970012784004211, "learning_rate": 0.0002, "epoch": 1.4016651847061676, "step": 8670}, {"loss": 0.7421, "grad_norm": 0.526079535484314, "learning_rate": 0.0002, "epoch": 1.4032818688869129, "step": 8680}, {"loss": 0.737, "grad_norm": 0.758712887763977, "learning_rate": 0.0002, "epoch": 1.404898553067658, "step": 8690}, {"loss": 0.7612, "grad_norm": 0.7118762731552124, "learning_rate": 0.0002, "epoch": 1.4065152372484035, "step": 8700}, {"loss": 0.7628, "grad_norm": 0.5696909427642822, "learning_rate": 0.0002, "epoch": 1.4081319214291488, "step": 8710}, {"loss": 0.7156, "grad_norm": 0.7995436787605286, "learning_rate": 0.0002, "epoch": 1.4097486056098942, "step": 8720}, {"loss": 0.7521, "grad_norm": 0.7237521409988403, "learning_rate": 0.0002, "epoch": 1.4113652897906395, "step": 8730}, {"loss": 0.7661, "grad_norm": 0.744628369808197, "learning_rate": 0.0002, "epoch": 1.4129819739713847, "step": 8740}, {"loss": 0.7073, "grad_norm": 0.6082926988601685, "learning_rate": 0.0002, "epoch": 1.41459865815213, "step": 8750}, {"loss": 0.7282, "grad_norm": 0.5185243487358093, "learning_rate": 0.0002, "epoch": 1.4162153423328752, "step": 8760}, {"loss": 0.7592, "grad_norm": 0.5183082222938538, "learning_rate": 0.0002, "epoch": 1.4178320265136206, "step": 8770}, {"loss": 0.7509, "grad_norm": 0.7326041460037231, "learning_rate": 0.0002, "epoch": 1.4194487106943658, "step": 8780}, {"loss": 0.7398, "grad_norm": 0.7174660563468933, "learning_rate": 0.0002, "epoch": 1.421065394875111, "step": 8790}, {"loss": 0.7507, "grad_norm": 0.8080165982246399, "learning_rate": 0.0002, "epoch": 1.4226820790558565, "step": 8800}, {"loss": 0.72, "grad_norm": 0.5061507821083069, "learning_rate": 0.0002, "epoch": 1.4242987632366018, "step": 8810}, {"loss": 0.7563, "grad_norm": 0.801602840423584, "learning_rate": 0.0002, "epoch": 1.425915447417347, "step": 8820}, {"loss": 0.7287, "grad_norm": 0.6150273084640503, "learning_rate": 0.0002, "epoch": 1.4275321315980922, "step": 8830}, {"loss": 0.7452, "grad_norm": 0.8786525726318359, "learning_rate": 0.0002, "epoch": 1.4291488157788377, "step": 8840}, {"loss": 0.7257, "grad_norm": 0.6371538639068604, "learning_rate": 0.0002, "epoch": 1.430765499959583, "step": 8850}, {"loss": 0.711, "grad_norm": 0.6409295797348022, "learning_rate": 0.0002, "epoch": 1.4323821841403281, "step": 8860}, {"loss": 0.7891, "grad_norm": 0.6452359557151794, "learning_rate": 0.0002, "epoch": 1.4339988683210736, "step": 8870}, {"loss": 0.7588, "grad_norm": 0.5842334628105164, "learning_rate": 0.0002, "epoch": 1.4356155525018188, "step": 8880}, {"loss": 0.7446, "grad_norm": 0.696761965751648, "learning_rate": 0.0002, "epoch": 1.437232236682564, "step": 8890}, {"loss": 0.7541, "grad_norm": 0.6384600400924683, "learning_rate": 0.0002, "epoch": 1.4388489208633093, "step": 8900}, {"loss": 0.7049, "grad_norm": 0.5981136560440063, "learning_rate": 0.0002, "epoch": 1.4404656050440545, "step": 8910}, {"loss": 0.795, "grad_norm": 0.6355637907981873, "learning_rate": 0.0002, "epoch": 1.4420822892248, "step": 8920}, {"loss": 0.7653, "grad_norm": 0.6374830603599548, "learning_rate": 0.0002, "epoch": 1.4436989734055452, "step": 8930}, {"loss": 0.8108, "grad_norm": 0.559013307094574, "learning_rate": 0.0002, "epoch": 1.4453156575862904, "step": 8940}, {"loss": 0.7045, "grad_norm": 0.7289170026779175, "learning_rate": 0.0002, "epoch": 1.446932341767036, "step": 8950}, {"loss": 0.7484, "grad_norm": 0.8649206757545471, "learning_rate": 0.0002, "epoch": 1.4485490259477811, "step": 8960}, {"loss": 0.7745, "grad_norm": 0.7664689421653748, "learning_rate": 0.0002, "epoch": 1.4501657101285264, "step": 8970}, {"loss": 0.7431, "grad_norm": 0.7109952569007874, "learning_rate": 0.0002, "epoch": 1.4517823943092716, "step": 8980}, {"loss": 0.7997, "grad_norm": 0.6312844753265381, "learning_rate": 0.0002, "epoch": 1.453399078490017, "step": 8990}, {"loss": 0.7467, "grad_norm": 0.6616617441177368, "learning_rate": 0.0002, "epoch": 1.4550157626707623, "step": 9000}, {"loss": 0.7518, "grad_norm": 0.7384068965911865, "learning_rate": 0.0002, "epoch": 1.4566324468515075, "step": 9010}, {"loss": 0.7483, "grad_norm": 0.6549670100212097, "learning_rate": 0.0002, "epoch": 1.458249131032253, "step": 9020}, {"loss": 0.7423, "grad_norm": 0.6254119277000427, "learning_rate": 0.0002, "epoch": 1.4598658152129982, "step": 9030}, {"loss": 0.7645, "grad_norm": 0.6806328892707825, "learning_rate": 0.0002, "epoch": 1.4614824993937434, "step": 9040}, {"loss": 0.7221, "grad_norm": 0.6803115010261536, "learning_rate": 0.0002, "epoch": 1.4630991835744886, "step": 9050}, {"loss": 0.7264, "grad_norm": 0.48529282212257385, "learning_rate": 0.0002, "epoch": 1.4647158677552339, "step": 9060}, {"loss": 0.7542, "grad_norm": 0.5995030999183655, "learning_rate": 0.0002, "epoch": 1.4663325519359793, "step": 9070}, {"loss": 0.7894, "grad_norm": 0.6005427837371826, "learning_rate": 0.0002, "epoch": 1.4679492361167246, "step": 9080}, {"loss": 0.7288, "grad_norm": 0.718564510345459, "learning_rate": 0.0002, "epoch": 1.46956592029747, "step": 9090}, {"loss": 0.7089, "grad_norm": 0.7003577351570129, "learning_rate": 0.0002, "epoch": 1.4711826044782153, "step": 9100}, {"loss": 0.8069, "grad_norm": 0.5888323783874512, "learning_rate": 0.0002, "epoch": 1.4727992886589605, "step": 9110}, {"loss": 0.7275, "grad_norm": 0.6417609453201294, "learning_rate": 0.0002, "epoch": 1.4744159728397057, "step": 9120}, {"loss": 0.7441, "grad_norm": 0.572294294834137, "learning_rate": 0.0002, "epoch": 1.476032657020451, "step": 9130}, {"loss": 0.8053, "grad_norm": 0.8200714588165283, "learning_rate": 0.0002, "epoch": 1.4776493412011964, "step": 9140}, {"loss": 0.7382, "grad_norm": 0.6343288421630859, "learning_rate": 0.0002, "epoch": 1.4792660253819416, "step": 9150}, {"loss": 0.7641, "grad_norm": 0.7017961144447327, "learning_rate": 0.0002, "epoch": 1.4808827095626869, "step": 9160}, {"loss": 0.7619, "grad_norm": 0.6202912926673889, "learning_rate": 0.0002, "epoch": 1.4824993937434323, "step": 9170}, {"loss": 0.7428, "grad_norm": 0.6677869558334351, "learning_rate": 0.0002, "epoch": 1.4841160779241775, "step": 9180}, {"loss": 0.7648, "grad_norm": 0.6052267551422119, "learning_rate": 0.0002, "epoch": 1.4857327621049228, "step": 9190}, {"loss": 0.7152, "grad_norm": 0.6638872027397156, "learning_rate": 0.0002, "epoch": 1.487349446285668, "step": 9200}, {"loss": 0.7448, "grad_norm": 0.6245523691177368, "learning_rate": 0.0002, "epoch": 1.4889661304664135, "step": 9210}, {"loss": 0.6958, "grad_norm": 0.5761767625808716, "learning_rate": 0.0002, "epoch": 1.4905828146471587, "step": 9220}, {"loss": 0.8012, "grad_norm": 0.8175981640815735, "learning_rate": 0.0002, "epoch": 1.492199498827904, "step": 9230}, {"loss": 0.683, "grad_norm": 0.9144009947776794, "learning_rate": 0.0002, "epoch": 1.4938161830086494, "step": 9240}, {"loss": 0.7623, "grad_norm": 0.5742552876472473, "learning_rate": 0.0002, "epoch": 1.4954328671893946, "step": 9250}, {"loss": 0.7418, "grad_norm": 0.534534215927124, "learning_rate": 0.0002, "epoch": 1.4970495513701398, "step": 9260}, {"loss": 0.7194, "grad_norm": 0.7836225032806396, "learning_rate": 0.0002, "epoch": 1.498666235550885, "step": 9270}, {"loss": 0.7453, "grad_norm": 0.5292993187904358, "learning_rate": 0.0002, "epoch": 1.5002829197316303, "step": 9280}, {"loss": 0.7168, "grad_norm": 0.8044071793556213, "learning_rate": 0.0002, "epoch": 1.5018996039123758, "step": 9290}, {"loss": 0.7229, "grad_norm": 0.6185805201530457, "learning_rate": 0.0002, "epoch": 1.503516288093121, "step": 9300}, {"loss": 0.684, "grad_norm": 0.6093607544898987, "learning_rate": 0.0002, "epoch": 1.5051329722738664, "step": 9310}, {"loss": 0.7973, "grad_norm": 0.5891730189323425, "learning_rate": 0.0002, "epoch": 1.5067496564546117, "step": 9320}, {"loss": 0.7474, "grad_norm": 0.6331129670143127, "learning_rate": 0.0002, "epoch": 1.508366340635357, "step": 9330}, {"loss": 0.7074, "grad_norm": 0.7690958380699158, "learning_rate": 0.0002, "epoch": 1.5099830248161021, "step": 9340}, {"loss": 0.672, "grad_norm": 0.6548877358436584, "learning_rate": 0.0002, "epoch": 1.5115997089968474, "step": 9350}, {"loss": 0.7408, "grad_norm": 0.6545143127441406, "learning_rate": 0.0002, "epoch": 1.5132163931775926, "step": 9360}, {"loss": 0.7432, "grad_norm": 0.553247332572937, "learning_rate": 0.0002, "epoch": 1.514833077358338, "step": 9370}, {"loss": 0.7265, "grad_norm": 0.8145074844360352, "learning_rate": 0.0002, "epoch": 1.5164497615390833, "step": 9380}, {"loss": 0.7379, "grad_norm": 0.7636994123458862, "learning_rate": 0.0002, "epoch": 1.5180664457198287, "step": 9390}, {"loss": 0.7413, "grad_norm": 0.6838982701301575, "learning_rate": 0.0002, "epoch": 1.519683129900574, "step": 9400}, {"loss": 0.7367, "grad_norm": 0.8599441647529602, "learning_rate": 0.0002, "epoch": 1.5212998140813192, "step": 9410}, {"loss": 0.7663, "grad_norm": 0.7020329833030701, "learning_rate": 0.0002, "epoch": 1.5229164982620644, "step": 9420}, {"loss": 0.7928, "grad_norm": 0.6964772343635559, "learning_rate": 0.0002, "epoch": 1.5245331824428097, "step": 9430}, {"loss": 0.7168, "grad_norm": 0.6916600465774536, "learning_rate": 0.0002, "epoch": 1.5261498666235551, "step": 9440}, {"loss": 0.7519, "grad_norm": 0.7282621264457703, "learning_rate": 0.0002, "epoch": 1.5277665508043003, "step": 9450}, {"loss": 0.7628, "grad_norm": 0.5363983511924744, "learning_rate": 0.0002, "epoch": 1.5293832349850458, "step": 9460}, {"loss": 0.7154, "grad_norm": 0.6184861063957214, "learning_rate": 0.0002, "epoch": 1.530999919165791, "step": 9470}, {"loss": 0.7837, "grad_norm": 0.5991285443305969, "learning_rate": 0.0002, "epoch": 1.5326166033465363, "step": 9480}, {"loss": 0.7827, "grad_norm": 0.8176587820053101, "learning_rate": 0.0002, "epoch": 1.5342332875272815, "step": 9490}, {"loss": 0.7415, "grad_norm": 0.6473721861839294, "learning_rate": 0.0002, "epoch": 1.5358499717080267, "step": 9500}, {"loss": 0.7632, "grad_norm": 0.7319952845573425, "learning_rate": 0.0002, "epoch": 1.5374666558887722, "step": 9510}, {"loss": 0.7706, "grad_norm": 0.702900230884552, "learning_rate": 0.0002, "epoch": 1.5390833400695174, "step": 9520}, {"loss": 0.7754, "grad_norm": 0.7971600294113159, "learning_rate": 0.0002, "epoch": 1.5407000242502629, "step": 9530}, {"loss": 0.7352, "grad_norm": 0.6527525186538696, "learning_rate": 0.0002, "epoch": 1.542316708431008, "step": 9540}, {"loss": 0.7425, "grad_norm": 0.5791676044464111, "learning_rate": 0.0002, "epoch": 1.5439333926117533, "step": 9550}, {"loss": 0.7585, "grad_norm": 0.5619390606880188, "learning_rate": 0.0002, "epoch": 1.5455500767924986, "step": 9560}, {"loss": 0.7894, "grad_norm": 0.5701689124107361, "learning_rate": 0.0002, "epoch": 1.5471667609732438, "step": 9570}, {"loss": 0.793, "grad_norm": 0.47549352049827576, "learning_rate": 0.0002, "epoch": 1.548783445153989, "step": 9580}, {"loss": 0.7276, "grad_norm": 0.8730611205101013, "learning_rate": 0.0002, "epoch": 1.5504001293347345, "step": 9590}, {"loss": 0.798, "grad_norm": 0.6842091083526611, "learning_rate": 0.0002, "epoch": 1.5520168135154797, "step": 9600}, {"loss": 0.7528, "grad_norm": 0.6675129532814026, "learning_rate": 0.0002, "epoch": 1.5536334976962252, "step": 9610}, {"loss": 0.7954, "grad_norm": 0.8173956274986267, "learning_rate": 0.0002, "epoch": 1.5552501818769704, "step": 9620}, {"loss": 0.7535, "grad_norm": 0.724947452545166, "learning_rate": 0.0002, "epoch": 1.5568668660577156, "step": 9630}, {"loss": 0.7738, "grad_norm": 0.6154758930206299, "learning_rate": 0.0002, "epoch": 1.5584835502384609, "step": 9640}, {"loss": 0.7568, "grad_norm": 0.6072008013725281, "learning_rate": 0.0002, "epoch": 1.560100234419206, "step": 9650}, {"loss": 0.7219, "grad_norm": 0.659010648727417, "learning_rate": 0.0002, "epoch": 1.5617169185999515, "step": 9660}, {"loss": 0.673, "grad_norm": 0.65857994556427, "learning_rate": 0.0002, "epoch": 1.5633336027806968, "step": 9670}, {"loss": 0.7156, "grad_norm": 0.5914267301559448, "learning_rate": 0.0002, "epoch": 1.5649502869614422, "step": 9680}, {"loss": 0.7414, "grad_norm": 0.6248020529747009, "learning_rate": 0.0002, "epoch": 1.5665669711421875, "step": 9690}, {"loss": 0.694, "grad_norm": 0.7147795557975769, "learning_rate": 0.0002, "epoch": 1.5681836553229327, "step": 9700}, {"loss": 0.7335, "grad_norm": 0.7076232433319092, "learning_rate": 0.0002, "epoch": 1.569800339503678, "step": 9710}, {"loss": 0.7413, "grad_norm": 0.6217400431632996, "learning_rate": 0.0002, "epoch": 1.5714170236844232, "step": 9720}, {"loss": 0.7296, "grad_norm": 0.6709911227226257, "learning_rate": 0.0002, "epoch": 1.5730337078651684, "step": 9730}, {"loss": 0.7306, "grad_norm": 0.749171257019043, "learning_rate": 0.0002, "epoch": 1.5746503920459138, "step": 9740}, {"loss": 0.7242, "grad_norm": 0.6241145730018616, "learning_rate": 0.0002, "epoch": 1.576267076226659, "step": 9750}, {"loss": 0.7384, "grad_norm": 0.4960934817790985, "learning_rate": 0.0002, "epoch": 1.5778837604074045, "step": 9760}, {"loss": 0.725, "grad_norm": 0.6593309640884399, "learning_rate": 0.0002, "epoch": 1.5795004445881498, "step": 9770}, {"loss": 0.7531, "grad_norm": 0.5814042091369629, "learning_rate": 0.0002, "epoch": 1.581117128768895, "step": 9780}, {"loss": 0.7109, "grad_norm": 0.5936070680618286, "learning_rate": 0.0002, "epoch": 1.5827338129496402, "step": 9790}, {"loss": 0.7769, "grad_norm": 0.6454403400421143, "learning_rate": 0.0002, "epoch": 1.5843504971303854, "step": 9800}, {"loss": 0.7677, "grad_norm": 0.7612107992172241, "learning_rate": 0.0002, "epoch": 1.585967181311131, "step": 9810}, {"loss": 0.7649, "grad_norm": 0.6494482755661011, "learning_rate": 0.0002, "epoch": 1.5875838654918761, "step": 9820}, {"loss": 0.7569, "grad_norm": 0.7825694680213928, "learning_rate": 0.0002, "epoch": 1.5892005496726216, "step": 9830}, {"loss": 0.706, "grad_norm": 0.6757757663726807, "learning_rate": 0.0002, "epoch": 1.5908172338533668, "step": 9840}, {"loss": 0.7803, "grad_norm": 0.7105609178543091, "learning_rate": 0.0002, "epoch": 1.592433918034112, "step": 9850}, {"loss": 0.7925, "grad_norm": 0.7596991062164307, "learning_rate": 0.0002, "epoch": 1.5940506022148573, "step": 9860}, {"loss": 0.7108, "grad_norm": 0.5681525468826294, "learning_rate": 0.0002, "epoch": 1.5956672863956025, "step": 9870}, {"loss": 0.7811, "grad_norm": 0.6090980768203735, "learning_rate": 0.0002, "epoch": 1.5972839705763477, "step": 9880}, {"loss": 0.7339, "grad_norm": 0.6271613240242004, "learning_rate": 0.0002, "epoch": 1.5989006547570932, "step": 9890}, {"loss": 0.7419, "grad_norm": 0.7656369805335999, "learning_rate": 0.0002, "epoch": 1.6005173389378387, "step": 9900}, {"loss": 0.7336, "grad_norm": 0.7504446506500244, "learning_rate": 0.0002, "epoch": 1.6021340231185839, "step": 9910}, {"loss": 0.7479, "grad_norm": 0.659656286239624, "learning_rate": 0.0002, "epoch": 1.6037507072993291, "step": 9920}, {"loss": 0.7483, "grad_norm": 0.6006826162338257, "learning_rate": 0.0002, "epoch": 1.6053673914800743, "step": 9930}, {"loss": 0.732, "grad_norm": 0.7872757911682129, "learning_rate": 0.0002, "epoch": 1.6069840756608196, "step": 9940}, {"loss": 0.768, "grad_norm": 0.5545852780342102, "learning_rate": 0.0002, "epoch": 1.6086007598415648, "step": 9950}, {"loss": 0.8064, "grad_norm": 0.7429468631744385, "learning_rate": 0.0002, "epoch": 1.6102174440223103, "step": 9960}, {"loss": 0.714, "grad_norm": 0.6873556971549988, "learning_rate": 0.0002, "epoch": 1.6118341282030555, "step": 9970}, {"loss": 0.7324, "grad_norm": 0.5874287486076355, "learning_rate": 0.0002, "epoch": 1.613450812383801, "step": 9980}, {"loss": 0.7141, "grad_norm": 0.6039386987686157, "learning_rate": 0.0002, "epoch": 1.6150674965645462, "step": 9990}, {"loss": 0.6674, "grad_norm": 0.6233575940132141, "learning_rate": 0.0002, "epoch": 1.6166841807452914, "step": 10000}, {"loss": 0.7602, "grad_norm": 0.7676448225975037, "learning_rate": 0.0002, "epoch": 1.6183008649260366, "step": 10010}, {"loss": 0.7784, "grad_norm": 0.6565698385238647, "learning_rate": 0.0002, "epoch": 1.6199175491067819, "step": 10020}, {"loss": 0.7104, "grad_norm": 0.6787590384483337, "learning_rate": 0.0002, "epoch": 1.6215342332875273, "step": 10030}, {"loss": 0.7464, "grad_norm": 0.6137678027153015, "learning_rate": 0.0002, "epoch": 1.6231509174682726, "step": 10040}, {"loss": 0.7646, "grad_norm": 0.5236800312995911, "learning_rate": 0.0002, "epoch": 1.624767601649018, "step": 10050}, {"loss": 0.7437, "grad_norm": 0.7626367807388306, "learning_rate": 0.0002, "epoch": 1.6263842858297632, "step": 10060}, {"loss": 0.7273, "grad_norm": 0.5657260417938232, "learning_rate": 0.0002, "epoch": 1.6280009700105085, "step": 10070}, {"loss": 0.7354, "grad_norm": 0.4913991391658783, "learning_rate": 0.0002, "epoch": 1.6296176541912537, "step": 10080}, {"loss": 0.7596, "grad_norm": 0.7715556621551514, "learning_rate": 0.0002, "epoch": 1.631234338371999, "step": 10090}, {"loss": 0.7105, "grad_norm": 0.6509000062942505, "learning_rate": 0.0002, "epoch": 1.6328510225527442, "step": 10100}, {"loss": 0.7274, "grad_norm": 0.6215850114822388, "learning_rate": 0.0002, "epoch": 1.6344677067334896, "step": 10110}, {"loss": 0.7705, "grad_norm": 0.6956844329833984, "learning_rate": 0.0002, "epoch": 1.6360843909142349, "step": 10120}, {"loss": 0.7129, "grad_norm": 0.6111597418785095, "learning_rate": 0.0002, "epoch": 1.6377010750949803, "step": 10130}, {"loss": 0.6955, "grad_norm": 0.6518288850784302, "learning_rate": 0.0002, "epoch": 1.6393177592757255, "step": 10140}, {"loss": 0.731, "grad_norm": 0.6914522051811218, "learning_rate": 0.0002, "epoch": 1.6409344434564708, "step": 10150}, {"loss": 0.7295, "grad_norm": 0.63785719871521, "learning_rate": 0.0002, "epoch": 1.642551127637216, "step": 10160}, {"loss": 0.7355, "grad_norm": 0.6379287838935852, "learning_rate": 0.0002, "epoch": 1.6441678118179612, "step": 10170}, {"loss": 0.7359, "grad_norm": 0.6793403029441833, "learning_rate": 0.0002, "epoch": 1.6457844959987067, "step": 10180}, {"loss": 0.7402, "grad_norm": 0.6099132895469666, "learning_rate": 0.0002, "epoch": 1.647401180179452, "step": 10190}, {"loss": 0.7353, "grad_norm": 0.5869854092597961, "learning_rate": 0.0002, "epoch": 1.6490178643601974, "step": 10200}, {"loss": 0.8308, "grad_norm": 0.7716999053955078, "learning_rate": 0.0002, "epoch": 1.6506345485409426, "step": 10210}, {"loss": 0.7215, "grad_norm": 0.6854110360145569, "learning_rate": 0.0002, "epoch": 1.6522512327216878, "step": 10220}, {"loss": 0.782, "grad_norm": 0.6957170367240906, "learning_rate": 0.0002, "epoch": 1.653867916902433, "step": 10230}, {"loss": 0.7282, "grad_norm": 0.6932903528213501, "learning_rate": 0.0002, "epoch": 1.6554846010831783, "step": 10240}, {"loss": 0.7478, "grad_norm": 0.7713165283203125, "learning_rate": 0.0002, "epoch": 1.6571012852639235, "step": 10250}, {"loss": 0.7099, "grad_norm": 0.7455793619155884, "learning_rate": 0.0002, "epoch": 1.658717969444669, "step": 10260}, {"loss": 0.7524, "grad_norm": 0.5464168190956116, "learning_rate": 0.0002, "epoch": 1.6603346536254144, "step": 10270}, {"loss": 0.7328, "grad_norm": 0.6782926321029663, "learning_rate": 0.0002, "epoch": 1.6619513378061597, "step": 10280}, {"loss": 0.7801, "grad_norm": 0.7962649464607239, "learning_rate": 0.0002, "epoch": 1.663568021986905, "step": 10290}, {"loss": 0.7142, "grad_norm": 0.6814526319503784, "learning_rate": 0.0002, "epoch": 1.6651847061676501, "step": 10300}, {"loss": 0.7285, "grad_norm": 0.656895101070404, "learning_rate": 0.0002, "epoch": 1.6668013903483954, "step": 10310}, {"loss": 0.7358, "grad_norm": 0.6085672378540039, "learning_rate": 0.0002, "epoch": 1.6684180745291406, "step": 10320}, {"loss": 0.7074, "grad_norm": 0.585508406162262, "learning_rate": 0.0002, "epoch": 1.670034758709886, "step": 10330}, {"loss": 0.7604, "grad_norm": 0.6930184364318848, "learning_rate": 0.0002, "epoch": 1.6716514428906313, "step": 10340}, {"loss": 0.7169, "grad_norm": 0.575663149356842, "learning_rate": 0.0002, "epoch": 1.6732681270713767, "step": 10350}, {"loss": 0.7198, "grad_norm": 0.582502543926239, "learning_rate": 0.0002, "epoch": 1.674884811252122, "step": 10360}, {"loss": 0.7793, "grad_norm": 0.5668916702270508, "learning_rate": 0.0002, "epoch": 1.6765014954328672, "step": 10370}, {"loss": 0.7478, "grad_norm": 0.6070065498352051, "learning_rate": 0.0002, "epoch": 1.6781181796136124, "step": 10380}, {"loss": 0.7939, "grad_norm": 0.6141316294670105, "learning_rate": 0.0002, "epoch": 1.6797348637943577, "step": 10390}, {"loss": 0.7573, "grad_norm": 0.8359124064445496, "learning_rate": 0.0002, "epoch": 1.6813515479751031, "step": 10400}, {"loss": 0.7488, "grad_norm": 0.5378185510635376, "learning_rate": 0.0002, "epoch": 1.6829682321558483, "step": 10410}, {"loss": 0.7588, "grad_norm": 0.6959536075592041, "learning_rate": 0.0002, "epoch": 1.6845849163365938, "step": 10420}, {"loss": 0.7872, "grad_norm": 0.6514357328414917, "learning_rate": 0.0002, "epoch": 1.686201600517339, "step": 10430}, {"loss": 0.725, "grad_norm": 0.7706646919250488, "learning_rate": 0.0002, "epoch": 1.6878182846980843, "step": 10440}, {"loss": 0.7673, "grad_norm": 0.6183337569236755, "learning_rate": 0.0002, "epoch": 1.6894349688788295, "step": 10450}, {"loss": 0.7566, "grad_norm": 0.6123278141021729, "learning_rate": 0.0002, "epoch": 1.6910516530595747, "step": 10460}, {"loss": 0.7169, "grad_norm": 0.6894851326942444, "learning_rate": 0.0002, "epoch": 1.69266833724032, "step": 10470}, {"loss": 0.7435, "grad_norm": 0.7497312426567078, "learning_rate": 0.0002, "epoch": 1.6942850214210654, "step": 10480}, {"loss": 0.7544, "grad_norm": 0.5968214273452759, "learning_rate": 0.0002, "epoch": 1.6959017056018106, "step": 10490}, {"loss": 0.6793, "grad_norm": 0.6747927069664001, "learning_rate": 0.0002, "epoch": 1.697518389782556, "step": 10500}, {"loss": 0.7415, "grad_norm": 0.5708310008049011, "learning_rate": 0.0002, "epoch": 1.6991350739633013, "step": 10510}, {"loss": 0.7385, "grad_norm": 0.606526792049408, "learning_rate": 0.0002, "epoch": 1.7007517581440466, "step": 10520}, {"loss": 0.7204, "grad_norm": 0.662011981010437, "learning_rate": 0.0002, "epoch": 1.7023684423247918, "step": 10530}, {"loss": 0.7999, "grad_norm": 0.7583045363426208, "learning_rate": 0.0002, "epoch": 1.703985126505537, "step": 10540}, {"loss": 0.7563, "grad_norm": 0.721632182598114, "learning_rate": 0.0002, "epoch": 1.7056018106862825, "step": 10550}, {"loss": 0.7407, "grad_norm": 0.6107715368270874, "learning_rate": 0.0002, "epoch": 1.7072184948670277, "step": 10560}, {"loss": 0.7519, "grad_norm": 0.6652471423149109, "learning_rate": 0.0002, "epoch": 1.7088351790477732, "step": 10570}, {"loss": 0.7767, "grad_norm": 0.6308087110519409, "learning_rate": 0.0002, "epoch": 1.7104518632285184, "step": 10580}, {"loss": 0.7659, "grad_norm": 0.5464386940002441, "learning_rate": 0.0002, "epoch": 1.7120685474092636, "step": 10590}, {"loss": 0.7063, "grad_norm": 0.6558911204338074, "learning_rate": 0.0002, "epoch": 1.7136852315900089, "step": 10600}, {"loss": 0.7126, "grad_norm": 0.5665024518966675, "learning_rate": 0.0002, "epoch": 1.715301915770754, "step": 10610}, {"loss": 0.6958, "grad_norm": 0.7888094186782837, "learning_rate": 0.0002, "epoch": 1.7169185999514993, "step": 10620}, {"loss": 0.7785, "grad_norm": 0.7084909081459045, "learning_rate": 0.0002, "epoch": 1.7185352841322448, "step": 10630}, {"loss": 0.7557, "grad_norm": 0.7982324361801147, "learning_rate": 0.0002, "epoch": 1.7201519683129902, "step": 10640}, {"loss": 0.7345, "grad_norm": 0.6418732404708862, "learning_rate": 0.0002, "epoch": 1.7217686524937355, "step": 10650}, {"loss": 0.7734, "grad_norm": 0.7636681795120239, "learning_rate": 0.0002, "epoch": 1.7233853366744807, "step": 10660}, {"loss": 0.7541, "grad_norm": 0.5646875500679016, "learning_rate": 0.0002, "epoch": 1.725002020855226, "step": 10670}, {"loss": 0.7642, "grad_norm": 0.5231260657310486, "learning_rate": 0.0002, "epoch": 1.7266187050359711, "step": 10680}, {"loss": 0.7846, "grad_norm": 0.7635011672973633, "learning_rate": 0.0002, "epoch": 1.7282353892167164, "step": 10690}, {"loss": 0.7471, "grad_norm": 0.7518259286880493, "learning_rate": 0.0002, "epoch": 1.7298520733974618, "step": 10700}, {"loss": 0.751, "grad_norm": 0.7295602560043335, "learning_rate": 0.0002, "epoch": 1.731468757578207, "step": 10710}, {"loss": 0.731, "grad_norm": 0.6984632015228271, "learning_rate": 0.0002, "epoch": 1.7330854417589525, "step": 10720}, {"loss": 0.7921, "grad_norm": 0.6198219060897827, "learning_rate": 0.0002, "epoch": 1.7347021259396977, "step": 10730}, {"loss": 0.7642, "grad_norm": 0.6957576274871826, "learning_rate": 0.0002, "epoch": 1.736318810120443, "step": 10740}, {"loss": 0.7917, "grad_norm": 0.6430263519287109, "learning_rate": 0.0002, "epoch": 1.7379354943011882, "step": 10750}, {"loss": 0.7156, "grad_norm": 0.6134995222091675, "learning_rate": 0.0002, "epoch": 1.7395521784819334, "step": 10760}, {"loss": 0.7584, "grad_norm": 0.7209452986717224, "learning_rate": 0.0002, "epoch": 1.741168862662679, "step": 10770}, {"loss": 0.7528, "grad_norm": 0.6735447645187378, "learning_rate": 0.0002, "epoch": 1.7427855468434241, "step": 10780}, {"loss": 0.756, "grad_norm": 0.5605693459510803, "learning_rate": 0.0002, "epoch": 1.7444022310241696, "step": 10790}, {"loss": 0.7759, "grad_norm": 0.6882363557815552, "learning_rate": 0.0002, "epoch": 1.7460189152049148, "step": 10800}, {"loss": 0.7544, "grad_norm": 0.6386259198188782, "learning_rate": 0.0002, "epoch": 1.74763559938566, "step": 10810}, {"loss": 0.7697, "grad_norm": 0.6529015302658081, "learning_rate": 0.0002, "epoch": 1.7492522835664053, "step": 10820}, {"loss": 0.7219, "grad_norm": 0.5664082765579224, "learning_rate": 0.0002, "epoch": 1.7508689677471505, "step": 10830}, {"loss": 0.7586, "grad_norm": 0.7532684206962585, "learning_rate": 0.0002, "epoch": 1.7524856519278957, "step": 10840}, {"loss": 0.6919, "grad_norm": 0.77171391248703, "learning_rate": 0.0002, "epoch": 1.7541023361086412, "step": 10850}, {"loss": 0.785, "grad_norm": 0.7255431413650513, "learning_rate": 0.0002, "epoch": 1.7557190202893864, "step": 10860}, {"loss": 0.7458, "grad_norm": 0.763083279132843, "learning_rate": 0.0002, "epoch": 1.7573357044701319, "step": 10870}, {"loss": 0.7846, "grad_norm": 0.6042402982711792, "learning_rate": 0.0002, "epoch": 1.758952388650877, "step": 10880}, {"loss": 0.7027, "grad_norm": 0.7642518281936646, "learning_rate": 0.0002, "epoch": 1.7605690728316223, "step": 10890}, {"loss": 0.746, "grad_norm": 0.6347904801368713, "learning_rate": 0.0002, "epoch": 1.7621857570123676, "step": 10900}, {"loss": 0.7458, "grad_norm": 0.5371627807617188, "learning_rate": 0.0002, "epoch": 1.7638024411931128, "step": 10910}, {"loss": 0.7466, "grad_norm": 0.6840225458145142, "learning_rate": 0.0002, "epoch": 1.7654191253738583, "step": 10920}, {"loss": 0.725, "grad_norm": 0.5288469195365906, "learning_rate": 0.0002, "epoch": 1.7670358095546035, "step": 10930}, {"loss": 0.7863, "grad_norm": 0.69020676612854, "learning_rate": 0.0002, "epoch": 1.768652493735349, "step": 10940}, {"loss": 0.7468, "grad_norm": 0.5943242311477661, "learning_rate": 0.0002, "epoch": 1.7702691779160942, "step": 10950}, {"loss": 0.7244, "grad_norm": 0.5616418123245239, "learning_rate": 0.0002, "epoch": 1.7718858620968394, "step": 10960}, {"loss": 0.7137, "grad_norm": 0.7209470868110657, "learning_rate": 0.0002, "epoch": 1.7735025462775846, "step": 10970}, {"loss": 0.7459, "grad_norm": 0.6657957434654236, "learning_rate": 0.0002, "epoch": 1.7751192304583299, "step": 10980}, {"loss": 0.7076, "grad_norm": 0.6469064950942993, "learning_rate": 0.0002, "epoch": 1.776735914639075, "step": 10990}, {"loss": 0.7321, "grad_norm": 0.6615678071975708, "learning_rate": 0.0002, "epoch": 1.7783525988198206, "step": 11000}, {"loss": 0.747, "grad_norm": 0.6722439527511597, "learning_rate": 0.0002, "epoch": 1.779969283000566, "step": 11010}, {"loss": 0.7302, "grad_norm": 0.634136974811554, "learning_rate": 0.0002, "epoch": 1.7815859671813112, "step": 11020}, {"loss": 0.8105, "grad_norm": 0.6024377346038818, "learning_rate": 0.0002, "epoch": 1.7832026513620565, "step": 11030}, {"loss": 0.7855, "grad_norm": 0.6909403800964355, "learning_rate": 0.0002, "epoch": 1.7848193355428017, "step": 11040}, {"loss": 0.7471, "grad_norm": 0.7148767709732056, "learning_rate": 0.0002, "epoch": 1.786436019723547, "step": 11050}, {"loss": 0.7145, "grad_norm": 0.7442979216575623, "learning_rate": 0.0002, "epoch": 1.7880527039042922, "step": 11060}, {"loss": 0.7215, "grad_norm": 0.6830431818962097, "learning_rate": 0.0002, "epoch": 1.7896693880850376, "step": 11070}, {"loss": 0.7625, "grad_norm": 0.9172667264938354, "learning_rate": 0.0002, "epoch": 1.7912860722657828, "step": 11080}, {"loss": 0.76, "grad_norm": 0.6799490451812744, "learning_rate": 0.0002, "epoch": 1.7929027564465283, "step": 11090}, {"loss": 0.7716, "grad_norm": 0.7617024779319763, "learning_rate": 0.0002, "epoch": 1.7945194406272735, "step": 11100}, {"loss": 0.7586, "grad_norm": 0.7701810002326965, "learning_rate": 0.0002, "epoch": 1.7961361248080188, "step": 11110}, {"loss": 0.7843, "grad_norm": 0.7454385757446289, "learning_rate": 0.0002, "epoch": 1.797752808988764, "step": 11120}, {"loss": 0.7873, "grad_norm": 0.6121436953544617, "learning_rate": 0.0002, "epoch": 1.7993694931695092, "step": 11130}, {"loss": 0.7305, "grad_norm": 0.6237571835517883, "learning_rate": 0.0002, "epoch": 1.8009861773502547, "step": 11140}, {"loss": 0.6827, "grad_norm": 0.6818515658378601, "learning_rate": 0.0002, "epoch": 1.802602861531, "step": 11150}, {"loss": 0.6876, "grad_norm": 0.7768308520317078, "learning_rate": 0.0002, "epoch": 1.8042195457117454, "step": 11160}, {"loss": 0.7533, "grad_norm": 0.6875537633895874, "learning_rate": 0.0002, "epoch": 1.8058362298924906, "step": 11170}, {"loss": 0.761, "grad_norm": 0.7950584888458252, "learning_rate": 0.0002, "epoch": 1.8074529140732358, "step": 11180}, {"loss": 0.7623, "grad_norm": 0.8210248351097107, "learning_rate": 0.0002, "epoch": 1.809069598253981, "step": 11190}, {"loss": 0.7556, "grad_norm": 0.6674110889434814, "learning_rate": 0.0002, "epoch": 1.8106862824347263, "step": 11200}, {"loss": 0.7663, "grad_norm": 0.6261674761772156, "learning_rate": 0.0002, "epoch": 1.8123029666154715, "step": 11210}, {"loss": 0.7122, "grad_norm": 0.6484741568565369, "learning_rate": 0.0002, "epoch": 1.813919650796217, "step": 11220}, {"loss": 0.7718, "grad_norm": 0.6231244206428528, "learning_rate": 0.0002, "epoch": 1.8155363349769622, "step": 11230}, {"loss": 0.7152, "grad_norm": 0.7243146896362305, "learning_rate": 0.0002, "epoch": 1.8171530191577077, "step": 11240}, {"loss": 0.7448, "grad_norm": 0.6776193380355835, "learning_rate": 0.0002, "epoch": 1.818769703338453, "step": 11250}, {"loss": 0.7317, "grad_norm": 0.5973618030548096, "learning_rate": 0.0002, "epoch": 1.8203863875191981, "step": 11260}, {"loss": 0.7961, "grad_norm": 0.6451361179351807, "learning_rate": 0.0002, "epoch": 1.8220030716999434, "step": 11270}, {"loss": 0.7611, "grad_norm": 0.5963068008422852, "learning_rate": 0.0002, "epoch": 1.8236197558806886, "step": 11280}, {"loss": 0.7466, "grad_norm": 0.536902129650116, "learning_rate": 0.0002, "epoch": 1.825236440061434, "step": 11290}, {"loss": 0.708, "grad_norm": 0.6993787288665771, "learning_rate": 0.0002, "epoch": 1.8268531242421793, "step": 11300}, {"loss": 0.7153, "grad_norm": 0.6135255098342896, "learning_rate": 0.0002, "epoch": 1.8284698084229247, "step": 11310}, {"loss": 0.7423, "grad_norm": 0.6057423949241638, "learning_rate": 0.0002, "epoch": 1.83008649260367, "step": 11320}, {"loss": 0.735, "grad_norm": 0.6598812341690063, "learning_rate": 0.0002, "epoch": 1.8317031767844152, "step": 11330}, {"loss": 0.7278, "grad_norm": 0.6075948476791382, "learning_rate": 0.0002, "epoch": 1.8333198609651604, "step": 11340}, {"loss": 0.7846, "grad_norm": 0.7065447568893433, "learning_rate": 0.0002, "epoch": 1.8349365451459057, "step": 11350}, {"loss": 0.7365, "grad_norm": 0.680526614189148, "learning_rate": 0.0002, "epoch": 1.8365532293266509, "step": 11360}, {"loss": 0.7152, "grad_norm": 0.6356695294380188, "learning_rate": 0.0002, "epoch": 1.8381699135073963, "step": 11370}, {"loss": 0.721, "grad_norm": 0.6399052143096924, "learning_rate": 0.0002, "epoch": 1.8397865976881416, "step": 11380}, {"loss": 0.7618, "grad_norm": 0.6125704050064087, "learning_rate": 0.0002, "epoch": 1.841403281868887, "step": 11390}, {"loss": 0.755, "grad_norm": 0.7124643325805664, "learning_rate": 0.0002, "epoch": 1.8430199660496323, "step": 11400}, {"loss": 0.7972, "grad_norm": 0.6099604964256287, "learning_rate": 0.0002, "epoch": 1.8446366502303775, "step": 11410}, {"loss": 0.7187, "grad_norm": 0.7338208556175232, "learning_rate": 0.0002, "epoch": 1.8462533344111227, "step": 11420}, {"loss": 0.7007, "grad_norm": 0.7534668445587158, "learning_rate": 0.0002, "epoch": 1.847870018591868, "step": 11430}, {"loss": 0.7464, "grad_norm": 0.6135470271110535, "learning_rate": 0.0002, "epoch": 1.8494867027726134, "step": 11440}, {"loss": 0.7955, "grad_norm": 0.6229309439659119, "learning_rate": 0.0002, "epoch": 1.8511033869533586, "step": 11450}, {"loss": 0.7594, "grad_norm": 0.706423282623291, "learning_rate": 0.0002, "epoch": 1.852720071134104, "step": 11460}, {"loss": 0.7411, "grad_norm": 0.5460049510002136, "learning_rate": 0.0002, "epoch": 1.8543367553148493, "step": 11470}, {"loss": 0.7416, "grad_norm": 0.6616711020469666, "learning_rate": 0.0002, "epoch": 1.8559534394955945, "step": 11480}, {"loss": 0.729, "grad_norm": 0.6372783184051514, "learning_rate": 0.0002, "epoch": 1.8575701236763398, "step": 11490}, {"loss": 0.7333, "grad_norm": 0.7162668108940125, "learning_rate": 0.0002, "epoch": 1.859186807857085, "step": 11500}, {"loss": 0.7747, "grad_norm": 0.6605209708213806, "learning_rate": 0.0002, "epoch": 1.8608034920378305, "step": 11510}, {"loss": 0.7258, "grad_norm": 0.6933956742286682, "learning_rate": 0.0002, "epoch": 1.8624201762185757, "step": 11520}, {"loss": 0.7243, "grad_norm": 0.6582090854644775, "learning_rate": 0.0002, "epoch": 1.8640368603993211, "step": 11530}, {"loss": 0.7313, "grad_norm": 0.6416500806808472, "learning_rate": 0.0002, "epoch": 1.8656535445800664, "step": 11540}, {"loss": 0.7372, "grad_norm": 0.5434312224388123, "learning_rate": 0.0002, "epoch": 1.8672702287608116, "step": 11550}, {"loss": 0.7635, "grad_norm": 0.6827567219734192, "learning_rate": 0.0002, "epoch": 1.8688869129415568, "step": 11560}, {"loss": 0.7137, "grad_norm": 0.7354370951652527, "learning_rate": 0.0002, "epoch": 1.870503597122302, "step": 11570}, {"loss": 0.7526, "grad_norm": 0.590372622013092, "learning_rate": 0.0002, "epoch": 1.8721202813030473, "step": 11580}, {"loss": 0.731, "grad_norm": 0.853183925151825, "learning_rate": 0.0002, "epoch": 1.8737369654837928, "step": 11590}, {"loss": 0.7487, "grad_norm": 0.822678804397583, "learning_rate": 0.0002, "epoch": 1.875353649664538, "step": 11600}, {"loss": 0.7427, "grad_norm": 0.6591550707817078, "learning_rate": 0.0002, "epoch": 1.8769703338452834, "step": 11610}, {"loss": 0.7054, "grad_norm": 0.7475301623344421, "learning_rate": 0.0002, "epoch": 1.8785870180260287, "step": 11620}, {"loss": 0.811, "grad_norm": 0.6390765309333801, "learning_rate": 0.0002, "epoch": 1.880203702206774, "step": 11630}, {"loss": 0.7531, "grad_norm": 0.6589758992195129, "learning_rate": 0.0002, "epoch": 1.8818203863875191, "step": 11640}, {"loss": 0.7475, "grad_norm": 0.6765508651733398, "learning_rate": 0.0002, "epoch": 1.8834370705682644, "step": 11650}, {"loss": 0.738, "grad_norm": 0.6527857780456543, "learning_rate": 0.0002, "epoch": 1.8850537547490098, "step": 11660}, {"loss": 0.7504, "grad_norm": 0.6642923951148987, "learning_rate": 0.0002, "epoch": 1.886670438929755, "step": 11670}, {"loss": 0.7701, "grad_norm": 0.6945584416389465, "learning_rate": 0.0002, "epoch": 1.8882871231105005, "step": 11680}, {"loss": 0.7711, "grad_norm": 0.694018542766571, "learning_rate": 0.0002, "epoch": 1.8899038072912457, "step": 11690}, {"loss": 0.7195, "grad_norm": 0.7237417101860046, "learning_rate": 0.0002, "epoch": 1.891520491471991, "step": 11700}, {"loss": 0.7491, "grad_norm": 0.7401309609413147, "learning_rate": 0.0002, "epoch": 1.8931371756527362, "step": 11710}, {"loss": 0.805, "grad_norm": 0.6537784337997437, "learning_rate": 0.0002, "epoch": 1.8947538598334814, "step": 11720}, {"loss": 0.793, "grad_norm": 0.7398539185523987, "learning_rate": 0.0002, "epoch": 1.8963705440142267, "step": 11730}, {"loss": 0.7561, "grad_norm": 0.6696075797080994, "learning_rate": 0.0002, "epoch": 1.8979872281949721, "step": 11740}, {"loss": 0.7353, "grad_norm": 0.6014142036437988, "learning_rate": 0.0002, "epoch": 1.8996039123757174, "step": 11750}, {"loss": 0.7714, "grad_norm": 0.7023524641990662, "learning_rate": 0.0002, "epoch": 1.9012205965564628, "step": 11760}, {"loss": 0.7088, "grad_norm": 0.739973783493042, "learning_rate": 0.0002, "epoch": 1.902837280737208, "step": 11770}, {"loss": 0.7848, "grad_norm": 0.5576770901679993, "learning_rate": 0.0002, "epoch": 1.9044539649179533, "step": 11780}, {"loss": 0.7483, "grad_norm": 0.6907393932342529, "learning_rate": 0.0002, "epoch": 1.9060706490986985, "step": 11790}, {"loss": 0.7827, "grad_norm": 0.6934581995010376, "learning_rate": 0.0002, "epoch": 1.9076873332794437, "step": 11800}, {"loss": 0.7199, "grad_norm": 0.591774582862854, "learning_rate": 0.0002, "epoch": 1.9093040174601892, "step": 11810}, {"loss": 0.7333, "grad_norm": 0.6249791383743286, "learning_rate": 0.0002, "epoch": 1.9109207016409344, "step": 11820}, {"loss": 0.7581, "grad_norm": 0.6755744218826294, "learning_rate": 0.0002, "epoch": 1.9125373858216799, "step": 11830}, {"loss": 0.696, "grad_norm": 0.7286285161972046, "learning_rate": 0.0002, "epoch": 1.914154070002425, "step": 11840}, {"loss": 0.7509, "grad_norm": 0.7867850065231323, "learning_rate": 0.0002, "epoch": 1.9157707541831703, "step": 11850}, {"loss": 0.735, "grad_norm": 0.6283972859382629, "learning_rate": 0.0002, "epoch": 1.9173874383639156, "step": 11860}, {"loss": 0.7296, "grad_norm": 0.605823814868927, "learning_rate": 0.0002, "epoch": 1.9190041225446608, "step": 11870}, {"loss": 0.6598, "grad_norm": 0.5927976965904236, "learning_rate": 0.0002, "epoch": 1.920620806725406, "step": 11880}, {"loss": 0.7649, "grad_norm": 0.5974002480506897, "learning_rate": 0.0002, "epoch": 1.9222374909061515, "step": 11890}, {"loss": 0.7843, "grad_norm": 0.7091866135597229, "learning_rate": 0.0002, "epoch": 1.923854175086897, "step": 11900}, {"loss": 0.775, "grad_norm": 0.72496497631073, "learning_rate": 0.0002, "epoch": 1.9254708592676422, "step": 11910}, {"loss": 0.7153, "grad_norm": 0.6131896376609802, "learning_rate": 0.0002, "epoch": 1.9270875434483874, "step": 11920}, {"loss": 0.7228, "grad_norm": 0.6556436419487, "learning_rate": 0.0002, "epoch": 1.9287042276291326, "step": 11930}, {"loss": 0.7319, "grad_norm": 0.622932493686676, "learning_rate": 0.0002, "epoch": 1.9303209118098779, "step": 11940}, {"loss": 0.7592, "grad_norm": 0.6618631482124329, "learning_rate": 0.0002, "epoch": 1.931937595990623, "step": 11950}, {"loss": 0.8332, "grad_norm": 0.630966305732727, "learning_rate": 0.0002, "epoch": 1.9335542801713685, "step": 11960}, {"loss": 0.6854, "grad_norm": 0.6336734890937805, "learning_rate": 0.0002, "epoch": 1.9351709643521138, "step": 11970}, {"loss": 0.7433, "grad_norm": 0.655403196811676, "learning_rate": 0.0002, "epoch": 1.9367876485328592, "step": 11980}, {"loss": 0.7282, "grad_norm": 0.5640574097633362, "learning_rate": 0.0002, "epoch": 1.9384043327136045, "step": 11990}, {"loss": 0.7289, "grad_norm": 0.6322951316833496, "learning_rate": 0.0002, "epoch": 1.9400210168943497, "step": 12000}, {"loss": 0.7627, "grad_norm": 0.615703821182251, "learning_rate": 0.0002, "epoch": 1.941637701075095, "step": 12010}, {"loss": 0.786, "grad_norm": 0.6487536430358887, "learning_rate": 0.0002, "epoch": 1.9432543852558402, "step": 12020}, {"loss": 0.7435, "grad_norm": 0.9209630489349365, "learning_rate": 0.0002, "epoch": 1.9448710694365856, "step": 12030}, {"loss": 0.7274, "grad_norm": 0.67485511302948, "learning_rate": 0.0002, "epoch": 1.9464877536173308, "step": 12040}, {"loss": 0.7551, "grad_norm": 0.6831230521202087, "learning_rate": 0.0002, "epoch": 1.9481044377980763, "step": 12050}, {"loss": 0.7546, "grad_norm": 0.6578302383422852, "learning_rate": 0.0002, "epoch": 1.9497211219788215, "step": 12060}, {"loss": 0.6989, "grad_norm": 0.9975938200950623, "learning_rate": 0.0002, "epoch": 1.9513378061595668, "step": 12070}, {"loss": 0.7952, "grad_norm": 0.6637365221977234, "learning_rate": 0.0002, "epoch": 1.952954490340312, "step": 12080}, {"loss": 0.7482, "grad_norm": 0.605707049369812, "learning_rate": 0.0002, "epoch": 1.9545711745210572, "step": 12090}, {"loss": 0.7768, "grad_norm": 0.6584440469741821, "learning_rate": 0.0002, "epoch": 1.9561878587018025, "step": 12100}, {"loss": 0.7187, "grad_norm": 0.6070835590362549, "learning_rate": 0.0002, "epoch": 1.957804542882548, "step": 12110}, {"loss": 0.7491, "grad_norm": 0.7862601280212402, "learning_rate": 0.0002, "epoch": 1.9594212270632931, "step": 12120}, {"loss": 0.7972, "grad_norm": 0.8175255060195923, "learning_rate": 0.0002, "epoch": 1.9610379112440386, "step": 12130}, {"loss": 0.7242, "grad_norm": 0.5648472905158997, "learning_rate": 0.0002, "epoch": 1.9626545954247838, "step": 12140}, {"loss": 0.7321, "grad_norm": 0.6591973304748535, "learning_rate": 0.0002, "epoch": 1.964271279605529, "step": 12150}, {"loss": 0.739, "grad_norm": 0.5960676074028015, "learning_rate": 0.0002, "epoch": 1.9658879637862743, "step": 12160}, {"loss": 0.7254, "grad_norm": 0.7272544503211975, "learning_rate": 0.0002, "epoch": 1.9675046479670195, "step": 12170}, {"loss": 0.7376, "grad_norm": 0.7176699042320251, "learning_rate": 0.0002, "epoch": 1.969121332147765, "step": 12180}, {"loss": 0.7525, "grad_norm": 0.6927123665809631, "learning_rate": 0.0002, "epoch": 1.9707380163285102, "step": 12190}, {"loss": 0.7318, "grad_norm": 0.5536034107208252, "learning_rate": 0.0002, "epoch": 1.9723547005092557, "step": 12200}, {"loss": 0.7737, "grad_norm": 0.8348390460014343, "learning_rate": 0.0002, "epoch": 1.9739713846900009, "step": 12210}, {"loss": 0.7494, "grad_norm": 0.6591181755065918, "learning_rate": 0.0002, "epoch": 1.9755880688707461, "step": 12220}, {"loss": 0.763, "grad_norm": 1.0624109506607056, "learning_rate": 0.0002, "epoch": 1.9772047530514913, "step": 12230}, {"loss": 0.7541, "grad_norm": 0.9265586137771606, "learning_rate": 0.0002, "epoch": 1.9788214372322366, "step": 12240}, {"loss": 0.7533, "grad_norm": 0.5998196005821228, "learning_rate": 0.0002, "epoch": 1.9804381214129818, "step": 12250}, {"loss": 0.7225, "grad_norm": 0.6960851550102234, "learning_rate": 0.0002, "epoch": 1.9820548055937273, "step": 12260}, {"loss": 0.7398, "grad_norm": 0.7674502730369568, "learning_rate": 0.0002, "epoch": 1.9836714897744727, "step": 12270}, {"loss": 0.7185, "grad_norm": 0.6407275795936584, "learning_rate": 0.0002, "epoch": 1.985288173955218, "step": 12280}, {"loss": 0.7382, "grad_norm": 0.6673079133033752, "learning_rate": 0.0002, "epoch": 1.9869048581359632, "step": 12290}, {"loss": 0.7326, "grad_norm": 0.6989844441413879, "learning_rate": 0.0002, "epoch": 1.9885215423167084, "step": 12300}, {"loss": 0.7559, "grad_norm": 0.7564442157745361, "learning_rate": 0.0002, "epoch": 1.9901382264974536, "step": 12310}, {"loss": 0.7719, "grad_norm": 0.6385478973388672, "learning_rate": 0.0002, "epoch": 1.9917549106781989, "step": 12320}, {"loss": 0.7369, "grad_norm": 0.7193717956542969, "learning_rate": 0.0002, "epoch": 1.9933715948589443, "step": 12330}, {"loss": 0.7583, "grad_norm": 0.7987112402915955, "learning_rate": 0.0002, "epoch": 1.9949882790396896, "step": 12340}, {"loss": 0.7793, "grad_norm": 0.7260826826095581, "learning_rate": 0.0002, "epoch": 1.996604963220435, "step": 12350}, {"loss": 0.7505, "grad_norm": 0.7968255281448364, "learning_rate": 0.0002, "epoch": 1.9982216474011802, "step": 12360}, {"loss": 0.717, "grad_norm": 0.6893062591552734, "learning_rate": 0.0002, "epoch": 1.9998383315819255, "step": 12370}, {"eval_loss": 1.1044032573699951, "eval_runtime": 122.1508, "eval_samples_per_second": 6.001, "eval_steps_per_second": 0.753, "epoch": 2.0, "step": 12371}, {"loss": 0.6604, "grad_norm": 0.7775409817695618, "learning_rate": 0.0002, "epoch": 2.0014550157626707, "step": 12380}, {"loss": 0.6845, "grad_norm": 0.76218581199646, "learning_rate": 0.0002, "epoch": 2.003071699943416, "step": 12390}, {"loss": 0.6909, "grad_norm": 0.5677764415740967, "learning_rate": 0.0002, "epoch": 2.004688384124161, "step": 12400}, {"loss": 0.6584, "grad_norm": 0.808442234992981, "learning_rate": 0.0002, "epoch": 2.006305068304907, "step": 12410}, {"loss": 0.659, "grad_norm": 0.7144765257835388, "learning_rate": 0.0002, "epoch": 2.007921752485652, "step": 12420}, {"loss": 0.6666, "grad_norm": 0.6914031505584717, "learning_rate": 0.0002, "epoch": 2.0095384366663973, "step": 12430}, {"loss": 0.6596, "grad_norm": 0.7581454515457153, "learning_rate": 0.0002, "epoch": 2.0111551208471425, "step": 12440}, {"loss": 0.6785, "grad_norm": 0.8388504981994629, "learning_rate": 0.0002, "epoch": 2.0127718050278878, "step": 12450}, {"loss": 0.6942, "grad_norm": 0.6716406941413879, "learning_rate": 0.0002, "epoch": 2.014388489208633, "step": 12460}, {"loss": 0.6441, "grad_norm": 0.898902416229248, "learning_rate": 0.0002, "epoch": 2.0160051733893782, "step": 12470}, {"loss": 0.6655, "grad_norm": 0.6432679891586304, "learning_rate": 0.0002, "epoch": 2.0176218575701235, "step": 12480}, {"loss": 0.6521, "grad_norm": 0.8021109104156494, "learning_rate": 0.0002, "epoch": 2.019238541750869, "step": 12490}, {"loss": 0.6581, "grad_norm": 0.7039216756820679, "learning_rate": 0.0002, "epoch": 2.0208552259316144, "step": 12500}, {"loss": 0.6521, "grad_norm": 0.646531879901886, "learning_rate": 0.0002, "epoch": 2.0224719101123596, "step": 12510}, {"loss": 0.6302, "grad_norm": 0.783704400062561, "learning_rate": 0.0002, "epoch": 2.024088594293105, "step": 12520}, {"loss": 0.6288, "grad_norm": 0.8805046677589417, "learning_rate": 0.0002, "epoch": 2.02570527847385, "step": 12530}, {"loss": 0.6288, "grad_norm": 0.7289270758628845, "learning_rate": 0.0002, "epoch": 2.0273219626545953, "step": 12540}, {"loss": 0.6663, "grad_norm": 0.71653151512146, "learning_rate": 0.0002, "epoch": 2.0289386468353405, "step": 12550}, {"loss": 0.625, "grad_norm": 0.73281329870224, "learning_rate": 0.0002, "epoch": 2.030555331016086, "step": 12560}, {"loss": 0.6448, "grad_norm": 0.6657090187072754, "learning_rate": 0.0002, "epoch": 2.0321720151968314, "step": 12570}, {"loss": 0.6983, "grad_norm": 0.8241133093833923, "learning_rate": 0.0002, "epoch": 2.0337886993775767, "step": 12580}, {"loss": 0.6488, "grad_norm": 0.5834135413169861, "learning_rate": 0.0002, "epoch": 2.035405383558322, "step": 12590}, {"loss": 0.6188, "grad_norm": 0.84502112865448, "learning_rate": 0.0002, "epoch": 2.037022067739067, "step": 12600}, {"loss": 0.6349, "grad_norm": 0.8952481746673584, "learning_rate": 0.0002, "epoch": 2.0386387519198124, "step": 12610}, {"loss": 0.6923, "grad_norm": 0.7801461815834045, "learning_rate": 0.0002, "epoch": 2.0402554361005576, "step": 12620}, {"loss": 0.6176, "grad_norm": 0.6788367033004761, "learning_rate": 0.0002, "epoch": 2.041872120281303, "step": 12630}, {"loss": 0.6162, "grad_norm": 0.7241756319999695, "learning_rate": 0.0002, "epoch": 2.0434888044620485, "step": 12640}, {"loss": 0.655, "grad_norm": 0.6933388113975525, "learning_rate": 0.0002, "epoch": 2.0451054886427937, "step": 12650}, {"loss": 0.6431, "grad_norm": 0.8029746413230896, "learning_rate": 0.0002, "epoch": 2.046722172823539, "step": 12660}, {"loss": 0.7164, "grad_norm": 0.946399986743927, "learning_rate": 0.0002, "epoch": 2.048338857004284, "step": 12670}, {"loss": 0.638, "grad_norm": 0.7072678804397583, "learning_rate": 0.0002, "epoch": 2.0499555411850294, "step": 12680}, {"loss": 0.6487, "grad_norm": 0.6810618042945862, "learning_rate": 0.0002, "epoch": 2.0515722253657747, "step": 12690}, {"loss": 0.6554, "grad_norm": 0.7661160230636597, "learning_rate": 0.0002, "epoch": 2.05318890954652, "step": 12700}, {"loss": 0.6799, "grad_norm": 0.6350653767585754, "learning_rate": 0.0002, "epoch": 2.0548055937272656, "step": 12710}, {"loss": 0.6654, "grad_norm": 0.861890971660614, "learning_rate": 0.0002, "epoch": 2.056422277908011, "step": 12720}, {"loss": 0.6286, "grad_norm": 0.6489875912666321, "learning_rate": 0.0002, "epoch": 2.058038962088756, "step": 12730}, {"loss": 0.6811, "grad_norm": 0.8268506526947021, "learning_rate": 0.0002, "epoch": 2.0596556462695013, "step": 12740}, {"loss": 0.6524, "grad_norm": 0.607679545879364, "learning_rate": 0.0002, "epoch": 2.0612723304502465, "step": 12750}, {"loss": 0.6649, "grad_norm": 0.6754153370857239, "learning_rate": 0.0002, "epoch": 2.0628890146309917, "step": 12760}, {"loss": 0.6549, "grad_norm": 0.7263124585151672, "learning_rate": 0.0002, "epoch": 2.064505698811737, "step": 12770}, {"loss": 0.6189, "grad_norm": 0.6986154317855835, "learning_rate": 0.0002, "epoch": 2.0661223829924826, "step": 12780}, {"loss": 0.6723, "grad_norm": 0.7768576741218567, "learning_rate": 0.0002, "epoch": 2.067739067173228, "step": 12790}, {"loss": 0.677, "grad_norm": 0.7546762824058533, "learning_rate": 0.0002, "epoch": 2.069355751353973, "step": 12800}, {"loss": 0.6485, "grad_norm": 0.7588880062103271, "learning_rate": 0.0002, "epoch": 2.0709724355347183, "step": 12810}, {"loss": 0.6989, "grad_norm": 0.7457242608070374, "learning_rate": 0.0002, "epoch": 2.0725891197154636, "step": 12820}, {"loss": 0.6489, "grad_norm": 0.6983516812324524, "learning_rate": 0.0002, "epoch": 2.074205803896209, "step": 12830}, {"loss": 0.651, "grad_norm": 0.7950928807258606, "learning_rate": 0.0002, "epoch": 2.075822488076954, "step": 12840}, {"loss": 0.6603, "grad_norm": 0.9248087406158447, "learning_rate": 0.0002, "epoch": 2.0774391722576993, "step": 12850}, {"loss": 0.6847, "grad_norm": 0.7229493260383606, "learning_rate": 0.0002, "epoch": 2.079055856438445, "step": 12860}, {"loss": 0.6702, "grad_norm": 0.5710847973823547, "learning_rate": 0.0002, "epoch": 2.08067254061919, "step": 12870}, {"loss": 0.6974, "grad_norm": 0.9580423831939697, "learning_rate": 0.0002, "epoch": 2.0822892247999354, "step": 12880}, {"loss": 0.6341, "grad_norm": 0.7399665713310242, "learning_rate": 0.0002, "epoch": 2.0839059089806806, "step": 12890}, {"loss": 0.6993, "grad_norm": 0.7981410622596741, "learning_rate": 0.0002, "epoch": 2.085522593161426, "step": 12900}, {"loss": 0.6976, "grad_norm": 0.870759904384613, "learning_rate": 0.0002, "epoch": 2.087139277342171, "step": 12910}, {"loss": 0.7194, "grad_norm": 0.7001481652259827, "learning_rate": 0.0002, "epoch": 2.0887559615229163, "step": 12920}, {"loss": 0.6383, "grad_norm": 0.6745418310165405, "learning_rate": 0.0002, "epoch": 2.090372645703662, "step": 12930}, {"loss": 0.6519, "grad_norm": 0.7739067673683167, "learning_rate": 0.0002, "epoch": 2.0919893298844072, "step": 12940}, {"loss": 0.6856, "grad_norm": 0.6742934584617615, "learning_rate": 0.0002, "epoch": 2.0936060140651525, "step": 12950}, {"loss": 0.6279, "grad_norm": 0.7270349860191345, "learning_rate": 0.0002, "epoch": 2.0952226982458977, "step": 12960}, {"loss": 0.6783, "grad_norm": 0.7150624394416809, "learning_rate": 0.0002, "epoch": 2.096839382426643, "step": 12970}, {"loss": 0.6093, "grad_norm": 0.7734767198562622, "learning_rate": 0.0002, "epoch": 2.098456066607388, "step": 12980}, {"loss": 0.6534, "grad_norm": 0.7618662118911743, "learning_rate": 0.0002, "epoch": 2.1000727507881334, "step": 12990}, {"loss": 0.6707, "grad_norm": 0.6557944416999817, "learning_rate": 0.0002, "epoch": 2.101689434968879, "step": 13000}, {"loss": 0.7268, "grad_norm": 0.8786448240280151, "learning_rate": 0.0002, "epoch": 2.1033061191496243, "step": 13010}, {"loss": 0.6677, "grad_norm": 0.6878724098205566, "learning_rate": 0.0002, "epoch": 2.1049228033303695, "step": 13020}, {"loss": 0.6824, "grad_norm": 0.822318971157074, "learning_rate": 0.0002, "epoch": 2.1065394875111147, "step": 13030}, {"loss": 0.6228, "grad_norm": 0.831468939781189, "learning_rate": 0.0002, "epoch": 2.10815617169186, "step": 13040}, {"loss": 0.6511, "grad_norm": 0.7699505686759949, "learning_rate": 0.0002, "epoch": 2.109772855872605, "step": 13050}, {"loss": 0.6671, "grad_norm": 0.7559016346931458, "learning_rate": 0.0002, "epoch": 2.1113895400533504, "step": 13060}, {"loss": 0.6215, "grad_norm": 0.6942209601402283, "learning_rate": 0.0002, "epoch": 2.1130062242340957, "step": 13070}, {"loss": 0.6449, "grad_norm": 0.6098947525024414, "learning_rate": 0.0002, "epoch": 2.1146229084148414, "step": 13080}, {"loss": 0.7091, "grad_norm": 0.6499016284942627, "learning_rate": 0.0002, "epoch": 2.1162395925955866, "step": 13090}, {"loss": 0.6247, "grad_norm": 0.7719953060150146, "learning_rate": 0.0002, "epoch": 2.117856276776332, "step": 13100}, {"loss": 0.6064, "grad_norm": 0.6708134412765503, "learning_rate": 0.0002, "epoch": 2.119472960957077, "step": 13110}, {"loss": 0.6056, "grad_norm": 0.8119585514068604, "learning_rate": 0.0002, "epoch": 2.1210896451378223, "step": 13120}, {"loss": 0.6628, "grad_norm": 0.6947157979011536, "learning_rate": 0.0002, "epoch": 2.1227063293185675, "step": 13130}, {"loss": 0.6375, "grad_norm": 0.8831837773323059, "learning_rate": 0.0002, "epoch": 2.1243230134993127, "step": 13140}, {"loss": 0.6997, "grad_norm": 0.7266910672187805, "learning_rate": 0.0002, "epoch": 2.1259396976800584, "step": 13150}, {"loss": 0.6446, "grad_norm": 0.8864351511001587, "learning_rate": 0.0002, "epoch": 2.1275563818608036, "step": 13160}, {"loss": 0.6762, "grad_norm": 0.8104248046875, "learning_rate": 0.0002, "epoch": 2.129173066041549, "step": 13170}, {"loss": 0.6581, "grad_norm": 0.6077079772949219, "learning_rate": 0.0002, "epoch": 2.130789750222294, "step": 13180}, {"loss": 0.6572, "grad_norm": 0.6874213814735413, "learning_rate": 0.0002, "epoch": 2.1324064344030393, "step": 13190}, {"loss": 0.642, "grad_norm": 0.7134367823600769, "learning_rate": 0.0002, "epoch": 2.1340231185837846, "step": 13200}, {"loss": 0.7016, "grad_norm": 0.6101235151290894, "learning_rate": 0.0002, "epoch": 2.13563980276453, "step": 13210}, {"loss": 0.6529, "grad_norm": 0.6042411923408508, "learning_rate": 0.0002, "epoch": 2.137256486945275, "step": 13220}, {"loss": 0.7179, "grad_norm": 0.914601743221283, "learning_rate": 0.0002, "epoch": 2.1388731711260207, "step": 13230}, {"loss": 0.6513, "grad_norm": 0.7104284167289734, "learning_rate": 0.0002, "epoch": 2.140489855306766, "step": 13240}, {"loss": 0.6607, "grad_norm": 0.664395272731781, "learning_rate": 0.0002, "epoch": 2.142106539487511, "step": 13250}, {"loss": 0.7211, "grad_norm": 0.6991241574287415, "learning_rate": 0.0002, "epoch": 2.1437232236682564, "step": 13260}, {"loss": 0.6484, "grad_norm": 0.5469560623168945, "learning_rate": 0.0002, "epoch": 2.1453399078490016, "step": 13270}, {"loss": 0.6765, "grad_norm": 0.8454998135566711, "learning_rate": 0.0002, "epoch": 2.146956592029747, "step": 13280}, {"loss": 0.6683, "grad_norm": 0.7088868618011475, "learning_rate": 0.0002, "epoch": 2.148573276210492, "step": 13290}, {"loss": 0.6835, "grad_norm": 0.7002687454223633, "learning_rate": 0.0002, "epoch": 2.1501899603912378, "step": 13300}, {"loss": 0.6399, "grad_norm": 0.7785214781761169, "learning_rate": 0.0002, "epoch": 2.151806644571983, "step": 13310}, {"loss": 0.67, "grad_norm": 0.8049132227897644, "learning_rate": 0.0002, "epoch": 2.1534233287527282, "step": 13320}, {"loss": 0.6495, "grad_norm": 0.8062595129013062, "learning_rate": 0.0002, "epoch": 2.1550400129334735, "step": 13330}, {"loss": 0.6603, "grad_norm": 0.6208319067955017, "learning_rate": 0.0002, "epoch": 2.1566566971142187, "step": 13340}, {"loss": 0.6584, "grad_norm": 0.7519655823707581, "learning_rate": 0.0002, "epoch": 2.158273381294964, "step": 13350}, {"loss": 0.6457, "grad_norm": 0.7645747065544128, "learning_rate": 0.0002, "epoch": 2.159890065475709, "step": 13360}, {"loss": 0.645, "grad_norm": 0.6847302913665771, "learning_rate": 0.0002, "epoch": 2.1615067496564544, "step": 13370}, {"loss": 0.6903, "grad_norm": 0.8630441427230835, "learning_rate": 0.0002, "epoch": 2.1631234338372, "step": 13380}, {"loss": 0.6742, "grad_norm": 0.7947702407836914, "learning_rate": 0.0002, "epoch": 2.1647401180179453, "step": 13390}, {"loss": 0.7206, "grad_norm": 0.6836977005004883, "learning_rate": 0.0002, "epoch": 2.1663568021986905, "step": 13400}, {"loss": 0.6304, "grad_norm": 0.7340566515922546, "learning_rate": 0.0002, "epoch": 2.1679734863794358, "step": 13410}, {"loss": 0.6528, "grad_norm": 0.7075738906860352, "learning_rate": 0.0002, "epoch": 2.169590170560181, "step": 13420}, {"loss": 0.6585, "grad_norm": 0.7080879807472229, "learning_rate": 0.0002, "epoch": 2.1712068547409262, "step": 13430}, {"loss": 0.6615, "grad_norm": 0.6218613386154175, "learning_rate": 0.0002, "epoch": 2.1728235389216715, "step": 13440}, {"loss": 0.6488, "grad_norm": 0.8211479187011719, "learning_rate": 0.0002, "epoch": 2.174440223102417, "step": 13450}, {"loss": 0.6738, "grad_norm": 0.864466667175293, "learning_rate": 0.0002, "epoch": 2.1760569072831624, "step": 13460}, {"loss": 0.679, "grad_norm": 0.7943857908248901, "learning_rate": 0.0002, "epoch": 2.1776735914639076, "step": 13470}, {"loss": 0.6838, "grad_norm": 0.78728187084198, "learning_rate": 0.0002, "epoch": 2.179290275644653, "step": 13480}, {"loss": 0.6397, "grad_norm": 0.697527289390564, "learning_rate": 0.0002, "epoch": 2.180906959825398, "step": 13490}, {"loss": 0.669, "grad_norm": 0.8205804228782654, "learning_rate": 0.0002, "epoch": 2.1825236440061433, "step": 13500}, {"loss": 0.7227, "grad_norm": 0.8709042072296143, "learning_rate": 0.0002, "epoch": 2.1841403281868885, "step": 13510}, {"loss": 0.6313, "grad_norm": 0.6228537559509277, "learning_rate": 0.0002, "epoch": 2.1857570123676338, "step": 13520}, {"loss": 0.7025, "grad_norm": 0.9566980004310608, "learning_rate": 0.0002, "epoch": 2.1873736965483794, "step": 13530}, {"loss": 0.6755, "grad_norm": 0.7128894329071045, "learning_rate": 0.0002, "epoch": 2.1889903807291247, "step": 13540}, {"loss": 0.6827, "grad_norm": 0.6888654232025146, "learning_rate": 0.0002, "epoch": 2.19060706490987, "step": 13550}, {"loss": 0.6961, "grad_norm": 0.6444337368011475, "learning_rate": 0.0002, "epoch": 2.192223749090615, "step": 13560}, {"loss": 0.656, "grad_norm": 0.8008806705474854, "learning_rate": 0.0002, "epoch": 2.1938404332713604, "step": 13570}, {"loss": 0.7, "grad_norm": 0.8482748866081238, "learning_rate": 0.0002, "epoch": 2.1954571174521056, "step": 13580}, {"loss": 0.7326, "grad_norm": 0.8584157228469849, "learning_rate": 0.0002, "epoch": 2.197073801632851, "step": 13590}, {"loss": 0.7014, "grad_norm": 0.7513734698295593, "learning_rate": 0.0002, "epoch": 2.1986904858135965, "step": 13600}, {"loss": 0.6632, "grad_norm": 0.7864262461662292, "learning_rate": 0.0002, "epoch": 2.2003071699943417, "step": 13610}, {"loss": 0.6879, "grad_norm": 0.8493645191192627, "learning_rate": 0.0002, "epoch": 2.201923854175087, "step": 13620}, {"loss": 0.6617, "grad_norm": 0.6902140974998474, "learning_rate": 0.0002, "epoch": 2.203540538355832, "step": 13630}, {"loss": 0.6655, "grad_norm": 0.8711254596710205, "learning_rate": 0.0002, "epoch": 2.2051572225365774, "step": 13640}, {"loss": 0.6359, "grad_norm": 0.7832191586494446, "learning_rate": 0.0002, "epoch": 2.2067739067173227, "step": 13650}, {"loss": 0.6723, "grad_norm": 0.5668176412582397, "learning_rate": 0.0002, "epoch": 2.208390590898068, "step": 13660}, {"loss": 0.635, "grad_norm": 0.8648375272750854, "learning_rate": 0.0002, "epoch": 2.2100072750788136, "step": 13670}, {"loss": 0.653, "grad_norm": 0.7643089890480042, "learning_rate": 0.0002, "epoch": 2.211623959259559, "step": 13680}, {"loss": 0.6765, "grad_norm": 0.6293777823448181, "learning_rate": 0.0002, "epoch": 2.213240643440304, "step": 13690}, {"loss": 0.6842, "grad_norm": 0.6459372639656067, "learning_rate": 0.0002, "epoch": 2.2148573276210493, "step": 13700}, {"loss": 0.6526, "grad_norm": 0.7060744166374207, "learning_rate": 0.0002, "epoch": 2.2164740118017945, "step": 13710}, {"loss": 0.7101, "grad_norm": 0.674109160900116, "learning_rate": 0.0002, "epoch": 2.2180906959825397, "step": 13720}, {"loss": 0.6529, "grad_norm": 0.830392062664032, "learning_rate": 0.0002, "epoch": 2.219707380163285, "step": 13730}, {"loss": 0.6733, "grad_norm": 0.6474477052688599, "learning_rate": 0.0002, "epoch": 2.2213240643440306, "step": 13740}, {"loss": 0.6413, "grad_norm": 0.7037909626960754, "learning_rate": 0.0002, "epoch": 2.222940748524776, "step": 13750}, {"loss": 0.6417, "grad_norm": 0.6554131507873535, "learning_rate": 0.0002, "epoch": 2.224557432705521, "step": 13760}, {"loss": 0.6907, "grad_norm": 0.7822230458259583, "learning_rate": 0.0002, "epoch": 2.2261741168862663, "step": 13770}, {"loss": 0.6505, "grad_norm": 0.9082167744636536, "learning_rate": 0.0002, "epoch": 2.2277908010670116, "step": 13780}, {"loss": 0.6878, "grad_norm": 0.7918276190757751, "learning_rate": 0.0002, "epoch": 2.229407485247757, "step": 13790}, {"loss": 0.6669, "grad_norm": 0.7354569435119629, "learning_rate": 0.0002, "epoch": 2.231024169428502, "step": 13800}, {"loss": 0.6503, "grad_norm": 0.8265249133110046, "learning_rate": 0.0002, "epoch": 2.2326408536092472, "step": 13810}, {"loss": 0.6871, "grad_norm": 0.6653847098350525, "learning_rate": 0.0002, "epoch": 2.234257537789993, "step": 13820}, {"loss": 0.6413, "grad_norm": 0.7157923579216003, "learning_rate": 0.0002, "epoch": 2.235874221970738, "step": 13830}, {"loss": 0.6306, "grad_norm": 0.7110323309898376, "learning_rate": 0.0002, "epoch": 2.2374909061514834, "step": 13840}, {"loss": 0.6913, "grad_norm": 0.7155357599258423, "learning_rate": 0.0002, "epoch": 2.2391075903322286, "step": 13850}, {"loss": 0.6579, "grad_norm": 1.0177817344665527, "learning_rate": 0.0002, "epoch": 2.240724274512974, "step": 13860}, {"loss": 0.635, "grad_norm": 0.7601948380470276, "learning_rate": 0.0002, "epoch": 2.242340958693719, "step": 13870}, {"loss": 0.6679, "grad_norm": 0.7628820538520813, "learning_rate": 0.0002, "epoch": 2.2439576428744643, "step": 13880}, {"loss": 0.6805, "grad_norm": 0.7089297771453857, "learning_rate": 0.0002, "epoch": 2.24557432705521, "step": 13890}, {"loss": 0.7236, "grad_norm": 0.695178210735321, "learning_rate": 0.0002, "epoch": 2.247191011235955, "step": 13900}, {"loss": 0.7084, "grad_norm": 0.7631948590278625, "learning_rate": 0.0002, "epoch": 2.2488076954167004, "step": 13910}, {"loss": 0.685, "grad_norm": 0.8203101754188538, "learning_rate": 0.0002, "epoch": 2.2504243795974457, "step": 13920}, {"loss": 0.653, "grad_norm": 0.8099079728126526, "learning_rate": 0.0002, "epoch": 2.252041063778191, "step": 13930}, {"loss": 0.694, "grad_norm": 0.6498546004295349, "learning_rate": 0.0002, "epoch": 2.253657747958936, "step": 13940}, {"loss": 0.6684, "grad_norm": 0.7797415256500244, "learning_rate": 0.0002, "epoch": 2.2552744321396814, "step": 13950}, {"loss": 0.683, "grad_norm": 0.8254124522209167, "learning_rate": 0.0002, "epoch": 2.2568911163204266, "step": 13960}, {"loss": 0.6806, "grad_norm": 0.6327953338623047, "learning_rate": 0.0002, "epoch": 2.2585078005011723, "step": 13970}, {"loss": 0.668, "grad_norm": 0.734194278717041, "learning_rate": 0.0002, "epoch": 2.2601244846819175, "step": 13980}, {"loss": 0.6912, "grad_norm": 0.9014202952384949, "learning_rate": 0.0002, "epoch": 2.2617411688626627, "step": 13990}, {"loss": 0.692, "grad_norm": 0.7643631100654602, "learning_rate": 0.0002, "epoch": 2.263357853043408, "step": 14000}, {"loss": 0.6657, "grad_norm": 0.8882834911346436, "learning_rate": 0.0002, "epoch": 2.264974537224153, "step": 14010}, {"loss": 0.6453, "grad_norm": 0.7975873351097107, "learning_rate": 0.0002, "epoch": 2.2665912214048984, "step": 14020}, {"loss": 0.7193, "grad_norm": 0.7765783071517944, "learning_rate": 0.0002, "epoch": 2.2682079055856437, "step": 14030}, {"loss": 0.662, "grad_norm": 0.8846288323402405, "learning_rate": 0.0002, "epoch": 2.2698245897663893, "step": 14040}, {"loss": 0.6494, "grad_norm": 0.9006744027137756, "learning_rate": 0.0002, "epoch": 2.2714412739471346, "step": 14050}, {"loss": 0.6423, "grad_norm": 0.7420173287391663, "learning_rate": 0.0002, "epoch": 2.27305795812788, "step": 14060}, {"loss": 0.7068, "grad_norm": 0.7956424951553345, "learning_rate": 0.0002, "epoch": 2.274674642308625, "step": 14070}, {"loss": 0.6581, "grad_norm": 0.7783209085464478, "learning_rate": 0.0002, "epoch": 2.2762913264893703, "step": 14080}, {"loss": 0.7202, "grad_norm": 0.7597188949584961, "learning_rate": 0.0002, "epoch": 2.2779080106701155, "step": 14090}, {"loss": 0.6778, "grad_norm": 0.6718921661376953, "learning_rate": 0.0002, "epoch": 2.2795246948508607, "step": 14100}, {"loss": 0.632, "grad_norm": 0.7528082132339478, "learning_rate": 0.0002, "epoch": 2.281141379031606, "step": 14110}, {"loss": 0.7608, "grad_norm": 0.8379864692687988, "learning_rate": 0.0002, "epoch": 2.2827580632123516, "step": 14120}, {"loss": 0.6767, "grad_norm": 0.748613715171814, "learning_rate": 0.0002, "epoch": 2.284374747393097, "step": 14130}, {"loss": 0.6641, "grad_norm": 0.7435423135757446, "learning_rate": 0.0002, "epoch": 2.285991431573842, "step": 14140}, {"loss": 0.6849, "grad_norm": 0.7580803632736206, "learning_rate": 0.0002, "epoch": 2.2876081157545873, "step": 14150}, {"loss": 0.6604, "grad_norm": 0.6278321146965027, "learning_rate": 0.0002, "epoch": 2.2892247999353326, "step": 14160}, {"loss": 0.6573, "grad_norm": 0.7663896083831787, "learning_rate": 0.0002, "epoch": 2.290841484116078, "step": 14170}, {"loss": 0.6655, "grad_norm": 0.9716812372207642, "learning_rate": 0.0002, "epoch": 2.292458168296823, "step": 14180}, {"loss": 0.7067, "grad_norm": 0.8993458151817322, "learning_rate": 0.0002, "epoch": 2.2940748524775687, "step": 14190}, {"loss": 0.6172, "grad_norm": 0.6156117916107178, "learning_rate": 0.0002, "epoch": 2.295691536658314, "step": 14200}, {"loss": 0.6318, "grad_norm": 0.8911278247833252, "learning_rate": 0.0002, "epoch": 2.297308220839059, "step": 14210}, {"loss": 0.6364, "grad_norm": 0.6422147154808044, "learning_rate": 0.0002, "epoch": 2.2989249050198044, "step": 14220}, {"loss": 0.6795, "grad_norm": 0.6866879463195801, "learning_rate": 0.0002, "epoch": 2.3005415892005496, "step": 14230}, {"loss": 0.6907, "grad_norm": 0.9297130107879639, "learning_rate": 0.0002, "epoch": 2.302158273381295, "step": 14240}, {"loss": 0.6823, "grad_norm": 0.7501356601715088, "learning_rate": 0.0002, "epoch": 2.30377495756204, "step": 14250}, {"loss": 0.6414, "grad_norm": 0.8363515138626099, "learning_rate": 0.0002, "epoch": 2.3053916417427853, "step": 14260}, {"loss": 0.6362, "grad_norm": 0.9083868265151978, "learning_rate": 0.0002, "epoch": 2.307008325923531, "step": 14270}, {"loss": 0.6862, "grad_norm": 0.7791516780853271, "learning_rate": 0.0002, "epoch": 2.3086250101042762, "step": 14280}, {"loss": 0.6569, "grad_norm": 0.8766953349113464, "learning_rate": 0.0002, "epoch": 2.3102416942850215, "step": 14290}, {"loss": 0.6698, "grad_norm": 0.7916635274887085, "learning_rate": 0.0002, "epoch": 2.3118583784657667, "step": 14300}, {"loss": 0.6927, "grad_norm": 0.627525269985199, "learning_rate": 0.0002, "epoch": 2.313475062646512, "step": 14310}, {"loss": 0.6541, "grad_norm": 0.8856783509254456, "learning_rate": 0.0002, "epoch": 2.315091746827257, "step": 14320}, {"loss": 0.6806, "grad_norm": 0.6758689284324646, "learning_rate": 0.0002, "epoch": 2.316708431008003, "step": 14330}, {"loss": 0.6794, "grad_norm": 0.6428321003913879, "learning_rate": 0.0002, "epoch": 2.318325115188748, "step": 14340}, {"loss": 0.682, "grad_norm": 0.9032121300697327, "learning_rate": 0.0002, "epoch": 2.3199417993694933, "step": 14350}, {"loss": 0.6569, "grad_norm": 0.8035986423492432, "learning_rate": 0.0002, "epoch": 2.3215584835502385, "step": 14360}, {"loss": 0.7067, "grad_norm": 0.7974579334259033, "learning_rate": 0.0002, "epoch": 2.3231751677309838, "step": 14370}, {"loss": 0.6451, "grad_norm": 0.8356034755706787, "learning_rate": 0.0002, "epoch": 2.324791851911729, "step": 14380}, {"loss": 0.6623, "grad_norm": 0.998760998249054, "learning_rate": 0.0002, "epoch": 2.326408536092474, "step": 14390}, {"loss": 0.649, "grad_norm": 0.6518142223358154, "learning_rate": 0.0002, "epoch": 2.3280252202732195, "step": 14400}, {"loss": 0.7146, "grad_norm": 0.7443506717681885, "learning_rate": 0.0002, "epoch": 2.3296419044539647, "step": 14410}, {"loss": 0.648, "grad_norm": 0.8436172604560852, "learning_rate": 0.0002, "epoch": 2.3312585886347104, "step": 14420}, {"loss": 0.6585, "grad_norm": 0.7411080598831177, "learning_rate": 0.0002, "epoch": 2.3328752728154556, "step": 14430}, {"loss": 0.6781, "grad_norm": 0.8839048743247986, "learning_rate": 0.0002, "epoch": 2.334491956996201, "step": 14440}, {"loss": 0.6565, "grad_norm": 0.8360885977745056, "learning_rate": 0.0002, "epoch": 2.336108641176946, "step": 14450}, {"loss": 0.6662, "grad_norm": 0.7608986496925354, "learning_rate": 0.0002, "epoch": 2.3377253253576913, "step": 14460}, {"loss": 0.6685, "grad_norm": 0.8179867267608643, "learning_rate": 0.0002, "epoch": 2.3393420095384365, "step": 14470}, {"loss": 0.7055, "grad_norm": 0.5989999771118164, "learning_rate": 0.0002, "epoch": 2.340958693719182, "step": 14480}, {"loss": 0.644, "grad_norm": 0.9450054168701172, "learning_rate": 0.0002, "epoch": 2.3425753778999274, "step": 14490}, {"loss": 0.6983, "grad_norm": 0.7885149717330933, "learning_rate": 0.0002, "epoch": 2.3441920620806727, "step": 14500}, {"loss": 0.6819, "grad_norm": 0.8152616620063782, "learning_rate": 0.0002, "epoch": 2.345808746261418, "step": 14510}, {"loss": 0.6989, "grad_norm": 0.7193838953971863, "learning_rate": 0.0002, "epoch": 2.347425430442163, "step": 14520}, {"loss": 0.6594, "grad_norm": 0.6701092720031738, "learning_rate": 0.0002, "epoch": 2.3490421146229084, "step": 14530}, {"loss": 0.6559, "grad_norm": 0.7529364228248596, "learning_rate": 0.0002, "epoch": 2.3506587988036536, "step": 14540}, {"loss": 0.6306, "grad_norm": 0.6599733829498291, "learning_rate": 0.0002, "epoch": 2.352275482984399, "step": 14550}, {"loss": 0.706, "grad_norm": 0.9502474069595337, "learning_rate": 0.0002, "epoch": 2.353892167165144, "step": 14560}, {"loss": 0.717, "grad_norm": 0.7619650959968567, "learning_rate": 0.0002, "epoch": 2.3555088513458897, "step": 14570}, {"loss": 0.6684, "grad_norm": 0.9854652285575867, "learning_rate": 0.0002, "epoch": 2.357125535526635, "step": 14580}, {"loss": 0.6455, "grad_norm": 0.727439284324646, "learning_rate": 0.0002, "epoch": 2.35874221970738, "step": 14590}, {"loss": 0.6645, "grad_norm": 0.6994746327400208, "learning_rate": 0.0002, "epoch": 2.3603589038881254, "step": 14600}, {"loss": 0.6587, "grad_norm": 0.7117531299591064, "learning_rate": 0.0002, "epoch": 2.3619755880688706, "step": 14610}, {"loss": 0.6804, "grad_norm": 0.6403067708015442, "learning_rate": 0.0002, "epoch": 2.363592272249616, "step": 14620}, {"loss": 0.7055, "grad_norm": 0.8377841711044312, "learning_rate": 0.0002, "epoch": 2.3652089564303616, "step": 14630}, {"loss": 0.6778, "grad_norm": 0.749171257019043, "learning_rate": 0.0002, "epoch": 2.366825640611107, "step": 14640}, {"loss": 0.6552, "grad_norm": 0.8418586254119873, "learning_rate": 0.0002, "epoch": 2.368442324791852, "step": 14650}, {"loss": 0.6685, "grad_norm": 0.6178573369979858, "learning_rate": 0.0002, "epoch": 2.3700590089725972, "step": 14660}, {"loss": 0.6774, "grad_norm": 0.6368302702903748, "learning_rate": 0.0002, "epoch": 2.3716756931533425, "step": 14670}, {"loss": 0.6136, "grad_norm": 0.9122977256774902, "learning_rate": 0.0002, "epoch": 2.3732923773340877, "step": 14680}, {"loss": 0.6675, "grad_norm": 0.7086195349693298, "learning_rate": 0.0002, "epoch": 2.374909061514833, "step": 14690}, {"loss": 0.6582, "grad_norm": 0.7500800490379333, "learning_rate": 0.0002, "epoch": 2.376525745695578, "step": 14700}, {"loss": 0.6792, "grad_norm": 0.6634900569915771, "learning_rate": 0.0002, "epoch": 2.378142429876324, "step": 14710}, {"loss": 0.6614, "grad_norm": 0.839898407459259, "learning_rate": 0.0002, "epoch": 2.379759114057069, "step": 14720}, {"loss": 0.6453, "grad_norm": 0.7578426003456116, "learning_rate": 0.0002, "epoch": 2.3813757982378143, "step": 14730}, {"loss": 0.7282, "grad_norm": 1.0213173627853394, "learning_rate": 0.0002, "epoch": 2.3829924824185595, "step": 14740}, {"loss": 0.6704, "grad_norm": 0.7855949401855469, "learning_rate": 0.0002, "epoch": 2.3846091665993048, "step": 14750}, {"loss": 0.6694, "grad_norm": 0.7224128842353821, "learning_rate": 0.0002, "epoch": 2.38622585078005, "step": 14760}, {"loss": 0.7017, "grad_norm": 0.8040381669998169, "learning_rate": 0.0002, "epoch": 2.3878425349607952, "step": 14770}, {"loss": 0.6799, "grad_norm": 0.7705281376838684, "learning_rate": 0.0002, "epoch": 2.389459219141541, "step": 14780}, {"loss": 0.6326, "grad_norm": 0.667966902256012, "learning_rate": 0.0002, "epoch": 2.391075903322286, "step": 14790}, {"loss": 0.7061, "grad_norm": 0.6611011028289795, "learning_rate": 0.0002, "epoch": 2.3926925875030314, "step": 14800}, {"loss": 0.6527, "grad_norm": 0.6862651705741882, "learning_rate": 0.0002, "epoch": 2.3943092716837766, "step": 14810}, {"loss": 0.6537, "grad_norm": 0.8086010217666626, "learning_rate": 0.0002, "epoch": 2.395925955864522, "step": 14820}, {"loss": 0.7189, "grad_norm": 0.7189689874649048, "learning_rate": 0.0002, "epoch": 2.397542640045267, "step": 14830}, {"loss": 0.6709, "grad_norm": 0.6280009150505066, "learning_rate": 0.0002, "epoch": 2.3991593242260123, "step": 14840}, {"loss": 0.706, "grad_norm": 0.7826612591743469, "learning_rate": 0.0002, "epoch": 2.4007760084067575, "step": 14850}, {"loss": 0.6738, "grad_norm": 0.7681610584259033, "learning_rate": 0.0002, "epoch": 2.402392692587503, "step": 14860}, {"loss": 0.636, "grad_norm": 0.720966100692749, "learning_rate": 0.0002, "epoch": 2.4040093767682484, "step": 14870}, {"loss": 0.6667, "grad_norm": 0.8202250599861145, "learning_rate": 0.0002, "epoch": 2.4056260609489937, "step": 14880}, {"loss": 0.6935, "grad_norm": 0.786212682723999, "learning_rate": 0.0002, "epoch": 2.407242745129739, "step": 14890}, {"loss": 0.6628, "grad_norm": 0.6647164821624756, "learning_rate": 0.0002, "epoch": 2.408859429310484, "step": 14900}, {"loss": 0.6706, "grad_norm": 0.7566399574279785, "learning_rate": 0.0002, "epoch": 2.4104761134912294, "step": 14910}, {"loss": 0.7188, "grad_norm": 0.748814582824707, "learning_rate": 0.0002, "epoch": 2.4120927976719746, "step": 14920}, {"loss": 0.6684, "grad_norm": 0.7624038457870483, "learning_rate": 0.0002, "epoch": 2.4137094818527203, "step": 14930}, {"loss": 0.6483, "grad_norm": 0.8267335295677185, "learning_rate": 0.0002, "epoch": 2.4153261660334655, "step": 14940}, {"loss": 0.6612, "grad_norm": 0.8785360455513, "learning_rate": 0.0002, "epoch": 2.4169428502142107, "step": 14950}, {"loss": 0.6718, "grad_norm": 0.679887592792511, "learning_rate": 0.0002, "epoch": 2.418559534394956, "step": 14960}, {"loss": 0.6136, "grad_norm": 0.7218474745750427, "learning_rate": 0.0002, "epoch": 2.420176218575701, "step": 14970}, {"loss": 0.648, "grad_norm": 0.6342799663543701, "learning_rate": 0.0002, "epoch": 2.4217929027564464, "step": 14980}, {"loss": 0.6617, "grad_norm": 0.7098712921142578, "learning_rate": 0.0002, "epoch": 2.4234095869371917, "step": 14990}, {"loss": 0.6942, "grad_norm": 0.7497431635856628, "learning_rate": 0.0002, "epoch": 2.425026271117937, "step": 15000}, {"loss": 0.6772, "grad_norm": 0.934836208820343, "learning_rate": 0.0002, "epoch": 2.4266429552986826, "step": 15010}, {"loss": 0.7221, "grad_norm": 0.8430966734886169, "learning_rate": 0.0002, "epoch": 2.428259639479428, "step": 15020}, {"loss": 0.6985, "grad_norm": 0.7032104730606079, "learning_rate": 0.0002, "epoch": 2.429876323660173, "step": 15030}, {"loss": 0.6715, "grad_norm": 0.7746111750602722, "learning_rate": 0.0002, "epoch": 2.4314930078409183, "step": 15040}, {"loss": 0.7177, "grad_norm": 0.7661406397819519, "learning_rate": 0.0002, "epoch": 2.4331096920216635, "step": 15050}, {"loss": 0.6517, "grad_norm": 0.6941645741462708, "learning_rate": 0.0002, "epoch": 2.4347263762024087, "step": 15060}, {"loss": 0.6421, "grad_norm": 0.7487249374389648, "learning_rate": 0.0002, "epoch": 2.436343060383154, "step": 15070}, {"loss": 0.6796, "grad_norm": 0.7639912962913513, "learning_rate": 0.0002, "epoch": 2.4379597445638996, "step": 15080}, {"loss": 0.7087, "grad_norm": 0.7708953619003296, "learning_rate": 0.0002, "epoch": 2.439576428744645, "step": 15090}, {"loss": 0.7065, "grad_norm": 0.9135832190513611, "learning_rate": 0.0002, "epoch": 2.44119311292539, "step": 15100}, {"loss": 0.672, "grad_norm": 0.8283005356788635, "learning_rate": 0.0002, "epoch": 2.4428097971061353, "step": 15110}, {"loss": 0.6551, "grad_norm": 0.925299346446991, "learning_rate": 0.0002, "epoch": 2.4444264812868806, "step": 15120}, {"loss": 0.687, "grad_norm": 0.7013528943061829, "learning_rate": 0.0002, "epoch": 2.446043165467626, "step": 15130}, {"loss": 0.6842, "grad_norm": 0.622303307056427, "learning_rate": 0.0002, "epoch": 2.447659849648371, "step": 15140}, {"loss": 0.6676, "grad_norm": 0.876569390296936, "learning_rate": 0.0002, "epoch": 2.4492765338291163, "step": 15150}, {"loss": 0.6463, "grad_norm": 0.6836351752281189, "learning_rate": 0.0002, "epoch": 2.450893218009862, "step": 15160}, {"loss": 0.6781, "grad_norm": 0.7886684536933899, "learning_rate": 0.0002, "epoch": 2.452509902190607, "step": 15170}, {"loss": 0.6794, "grad_norm": 0.6647440791130066, "learning_rate": 0.0002, "epoch": 2.4541265863713524, "step": 15180}, {"loss": 0.6353, "grad_norm": 0.7477722764015198, "learning_rate": 0.0002, "epoch": 2.4557432705520976, "step": 15190}, {"loss": 0.698, "grad_norm": 0.8192033767700195, "learning_rate": 0.0002, "epoch": 2.457359954732843, "step": 15200}, {"loss": 0.6735, "grad_norm": 0.847537100315094, "learning_rate": 0.0002, "epoch": 2.458976638913588, "step": 15210}, {"loss": 0.6962, "grad_norm": 0.9027776122093201, "learning_rate": 0.0002, "epoch": 2.4605933230943338, "step": 15220}, {"loss": 0.7084, "grad_norm": 0.7217772006988525, "learning_rate": 0.0002, "epoch": 2.462210007275079, "step": 15230}, {"loss": 0.691, "grad_norm": 0.7994546294212341, "learning_rate": 0.0002, "epoch": 2.4638266914558242, "step": 15240}, {"loss": 0.6828, "grad_norm": 0.939916729927063, "learning_rate": 0.0002, "epoch": 2.4654433756365695, "step": 15250}, {"loss": 0.6893, "grad_norm": 1.0009053945541382, "learning_rate": 0.0002, "epoch": 2.4670600598173147, "step": 15260}, {"loss": 0.643, "grad_norm": 0.625555694103241, "learning_rate": 0.0002, "epoch": 2.46867674399806, "step": 15270}, {"loss": 0.688, "grad_norm": 0.7924878597259521, "learning_rate": 0.0002, "epoch": 2.470293428178805, "step": 15280}, {"loss": 0.6789, "grad_norm": 0.8536689877510071, "learning_rate": 0.0002, "epoch": 2.4719101123595504, "step": 15290}, {"loss": 0.6924, "grad_norm": 0.8572589755058289, "learning_rate": 0.0002, "epoch": 2.4735267965402956, "step": 15300}, {"loss": 0.604, "grad_norm": 0.773279070854187, "learning_rate": 0.0002, "epoch": 2.4751434807210413, "step": 15310}, {"loss": 0.6573, "grad_norm": 0.7708749771118164, "learning_rate": 0.0002, "epoch": 2.4767601649017865, "step": 15320}, {"loss": 0.7065, "grad_norm": 0.770905077457428, "learning_rate": 0.0002, "epoch": 2.4783768490825318, "step": 15330}, {"loss": 0.6878, "grad_norm": 0.8238571882247925, "learning_rate": 0.0002, "epoch": 2.479993533263277, "step": 15340}, {"loss": 0.6772, "grad_norm": 0.7670477032661438, "learning_rate": 0.0002, "epoch": 2.481610217444022, "step": 15350}, {"loss": 0.7759, "grad_norm": 0.905036985874176, "learning_rate": 0.0002, "epoch": 2.4832269016247674, "step": 15360}, {"loss": 0.706, "grad_norm": 0.6672089695930481, "learning_rate": 0.0002, "epoch": 2.484843585805513, "step": 15370}, {"loss": 0.6722, "grad_norm": 0.625095784664154, "learning_rate": 0.0002, "epoch": 2.4864602699862584, "step": 15380}, {"loss": 0.6396, "grad_norm": 0.679772675037384, "learning_rate": 0.0002, "epoch": 2.4880769541670036, "step": 15390}, {"loss": 0.6778, "grad_norm": 0.711492121219635, "learning_rate": 0.0002, "epoch": 2.489693638347749, "step": 15400}, {"loss": 0.6966, "grad_norm": 0.876189112663269, "learning_rate": 0.0002, "epoch": 2.491310322528494, "step": 15410}, {"loss": 0.7307, "grad_norm": 0.7236915230751038, "learning_rate": 0.0002, "epoch": 2.4929270067092393, "step": 15420}, {"loss": 0.647, "grad_norm": 0.6629832983016968, "learning_rate": 0.0002, "epoch": 2.4945436908899845, "step": 15430}, {"loss": 0.6669, "grad_norm": 0.9756859540939331, "learning_rate": 0.0002, "epoch": 2.4961603750707297, "step": 15440}, {"loss": 0.7559, "grad_norm": 0.6896940469741821, "learning_rate": 0.0002, "epoch": 2.4977770592514754, "step": 15450}, {"loss": 0.6818, "grad_norm": 0.7105149626731873, "learning_rate": 0.0002, "epoch": 2.4993937434322206, "step": 15460}, {"loss": 0.6859, "grad_norm": 0.8374546766281128, "learning_rate": 0.0002, "epoch": 2.501010427612966, "step": 15470}, {"loss": 0.6512, "grad_norm": 0.7320070266723633, "learning_rate": 0.0002, "epoch": 2.502627111793711, "step": 15480}, {"loss": 0.685, "grad_norm": 0.8306367993354797, "learning_rate": 0.0002, "epoch": 2.5042437959744563, "step": 15490}, {"loss": 0.7253, "grad_norm": 0.7472721338272095, "learning_rate": 0.0002, "epoch": 2.5058604801552016, "step": 15500}, {"loss": 0.6699, "grad_norm": 0.6147692203521729, "learning_rate": 0.0002, "epoch": 2.507477164335947, "step": 15510}, {"loss": 0.7158, "grad_norm": 0.7788505554199219, "learning_rate": 0.0002, "epoch": 2.5090938485166925, "step": 15520}, {"loss": 0.6521, "grad_norm": 0.8807527422904968, "learning_rate": 0.0002, "epoch": 2.5107105326974377, "step": 15530}, {"loss": 0.6792, "grad_norm": 0.7521643042564392, "learning_rate": 0.0002, "epoch": 2.512327216878183, "step": 15540}, {"loss": 0.6772, "grad_norm": 0.6900225281715393, "learning_rate": 0.0002, "epoch": 2.513943901058928, "step": 15550}, {"loss": 0.6769, "grad_norm": 0.6601938605308533, "learning_rate": 0.0002, "epoch": 2.5155605852396734, "step": 15560}, {"loss": 0.6648, "grad_norm": 0.8179984092712402, "learning_rate": 0.0002, "epoch": 2.5171772694204186, "step": 15570}, {"loss": 0.7028, "grad_norm": 0.792556881904602, "learning_rate": 0.0002, "epoch": 2.518793953601164, "step": 15580}, {"loss": 0.6464, "grad_norm": 0.7081938982009888, "learning_rate": 0.0002, "epoch": 2.520410637781909, "step": 15590}, {"loss": 0.6691, "grad_norm": 0.8733121156692505, "learning_rate": 0.0002, "epoch": 2.5220273219626543, "step": 15600}, {"loss": 0.6969, "grad_norm": 0.7980992794036865, "learning_rate": 0.0002, "epoch": 2.5236440061434, "step": 15610}, {"loss": 0.7124, "grad_norm": 0.883664071559906, "learning_rate": 0.0002, "epoch": 2.5252606903241452, "step": 15620}, {"loss": 0.7022, "grad_norm": 0.6963341236114502, "learning_rate": 0.0002, "epoch": 2.5268773745048905, "step": 15630}, {"loss": 0.7334, "grad_norm": 0.6433573365211487, "learning_rate": 0.0002, "epoch": 2.5284940586856357, "step": 15640}, {"loss": 0.6889, "grad_norm": 0.8538183569908142, "learning_rate": 0.0002, "epoch": 2.530110742866381, "step": 15650}, {"loss": 0.6841, "grad_norm": 0.9748201370239258, "learning_rate": 0.0002, "epoch": 2.5317274270471266, "step": 15660}, {"loss": 0.6765, "grad_norm": 0.7670575380325317, "learning_rate": 0.0002, "epoch": 2.533344111227872, "step": 15670}, {"loss": 0.6435, "grad_norm": 0.8738890290260315, "learning_rate": 0.0002, "epoch": 2.534960795408617, "step": 15680}, {"loss": 0.6802, "grad_norm": 0.8391636610031128, "learning_rate": 0.0002, "epoch": 2.5365774795893623, "step": 15690}, {"loss": 0.6901, "grad_norm": 0.7239366769790649, "learning_rate": 0.0002, "epoch": 2.5381941637701075, "step": 15700}, {"loss": 0.7011, "grad_norm": 0.8498379588127136, "learning_rate": 0.0002, "epoch": 2.5398108479508528, "step": 15710}, {"loss": 0.6998, "grad_norm": 0.8029484152793884, "learning_rate": 0.0002, "epoch": 2.541427532131598, "step": 15720}, {"loss": 0.6678, "grad_norm": 1.0639333724975586, "learning_rate": 0.0002, "epoch": 2.5430442163123432, "step": 15730}, {"loss": 0.6341, "grad_norm": 0.6401297450065613, "learning_rate": 0.0002, "epoch": 2.5446609004930885, "step": 15740}, {"loss": 0.7196, "grad_norm": 0.7123814821243286, "learning_rate": 0.0002, "epoch": 2.5462775846738337, "step": 15750}, {"loss": 0.654, "grad_norm": 0.7874974608421326, "learning_rate": 0.0002, "epoch": 2.5478942688545794, "step": 15760}, {"loss": 0.6721, "grad_norm": 0.8046808838844299, "learning_rate": 0.0002, "epoch": 2.5495109530353246, "step": 15770}, {"loss": 0.6665, "grad_norm": 0.7888661623001099, "learning_rate": 0.0002, "epoch": 2.55112763721607, "step": 15780}, {"loss": 0.6893, "grad_norm": 0.8445866107940674, "learning_rate": 0.0002, "epoch": 2.552744321396815, "step": 15790}, {"loss": 0.6815, "grad_norm": 0.7475846409797668, "learning_rate": 0.0002, "epoch": 2.5543610055775603, "step": 15800}, {"loss": 0.6711, "grad_norm": 0.7455102801322937, "learning_rate": 0.0002, "epoch": 2.555977689758306, "step": 15810}, {"loss": 0.6932, "grad_norm": 0.8226983547210693, "learning_rate": 0.0002, "epoch": 2.557594373939051, "step": 15820}, {"loss": 0.651, "grad_norm": 0.8920368552207947, "learning_rate": 0.0002, "epoch": 2.5592110581197964, "step": 15830}, {"loss": 0.6297, "grad_norm": 0.8413904905319214, "learning_rate": 0.0002, "epoch": 2.5608277423005417, "step": 15840}, {"loss": 0.7106, "grad_norm": 0.8483649492263794, "learning_rate": 0.0002, "epoch": 2.562444426481287, "step": 15850}, {"loss": 0.6957, "grad_norm": 0.5923284292221069, "learning_rate": 0.0002, "epoch": 2.564061110662032, "step": 15860}, {"loss": 0.6847, "grad_norm": 0.8518726229667664, "learning_rate": 0.0002, "epoch": 2.5656777948427774, "step": 15870}, {"loss": 0.6362, "grad_norm": 0.731235146522522, "learning_rate": 0.0002, "epoch": 2.5672944790235226, "step": 15880}, {"loss": 0.7611, "grad_norm": 0.7517194151878357, "learning_rate": 0.0002, "epoch": 2.568911163204268, "step": 15890}, {"loss": 0.6907, "grad_norm": 0.8378692269325256, "learning_rate": 0.0002, "epoch": 2.5705278473850135, "step": 15900}, {"loss": 0.7055, "grad_norm": 0.843701958656311, "learning_rate": 0.0002, "epoch": 2.5721445315657587, "step": 15910}, {"loss": 0.6882, "grad_norm": 0.7254629731178284, "learning_rate": 0.0002, "epoch": 2.573761215746504, "step": 15920}, {"loss": 0.6872, "grad_norm": 0.8863335847854614, "learning_rate": 0.0002, "epoch": 2.575377899927249, "step": 15930}, {"loss": 0.6813, "grad_norm": 0.7675097584724426, "learning_rate": 0.0002, "epoch": 2.5769945841079944, "step": 15940}, {"loss": 0.7357, "grad_norm": 0.82063889503479, "learning_rate": 0.0002, "epoch": 2.5786112682887397, "step": 15950}, {"loss": 0.662, "grad_norm": 0.7729717493057251, "learning_rate": 0.0002, "epoch": 2.5802279524694853, "step": 15960}, {"loss": 0.633, "grad_norm": 0.8301846981048584, "learning_rate": 0.0002, "epoch": 2.5818446366502306, "step": 15970}, {"loss": 0.6897, "grad_norm": 0.7906861305236816, "learning_rate": 0.0002, "epoch": 2.583461320830976, "step": 15980}, {"loss": 0.7175, "grad_norm": 0.6749057173728943, "learning_rate": 0.0002, "epoch": 2.585078005011721, "step": 15990}, {"loss": 0.7212, "grad_norm": 0.9386842846870422, "learning_rate": 0.0002, "epoch": 2.5866946891924663, "step": 16000}, {"loss": 0.6934, "grad_norm": 0.7868891358375549, "learning_rate": 0.0002, "epoch": 2.5883113733732115, "step": 16010}, {"loss": 0.7036, "grad_norm": 0.8674671053886414, "learning_rate": 0.0002, "epoch": 2.5899280575539567, "step": 16020}, {"loss": 0.7217, "grad_norm": 0.7043559551239014, "learning_rate": 0.0002, "epoch": 2.591544741734702, "step": 16030}, {"loss": 0.6967, "grad_norm": 0.5846083760261536, "learning_rate": 0.0002, "epoch": 2.593161425915447, "step": 16040}, {"loss": 0.7322, "grad_norm": 0.7323982119560242, "learning_rate": 0.0002, "epoch": 2.594778110096193, "step": 16050}, {"loss": 0.6794, "grad_norm": 0.9069556593894958, "learning_rate": 0.0002, "epoch": 2.596394794276938, "step": 16060}, {"loss": 0.7076, "grad_norm": 0.7522736191749573, "learning_rate": 0.0002, "epoch": 2.5980114784576833, "step": 16070}, {"loss": 0.6477, "grad_norm": 0.8149648308753967, "learning_rate": 0.0002, "epoch": 2.5996281626384286, "step": 16080}, {"loss": 0.6664, "grad_norm": 0.6214233040809631, "learning_rate": 0.0002, "epoch": 2.601244846819174, "step": 16090}, {"loss": 0.7307, "grad_norm": 0.6803743839263916, "learning_rate": 0.0002, "epoch": 2.602861530999919, "step": 16100}, {"loss": 0.7244, "grad_norm": 0.7223997116088867, "learning_rate": 0.0002, "epoch": 2.6044782151806647, "step": 16110}, {"loss": 0.6867, "grad_norm": 0.7324174642562866, "learning_rate": 0.0002, "epoch": 2.60609489936141, "step": 16120}, {"loss": 0.7159, "grad_norm": 0.9594739675521851, "learning_rate": 0.0002, "epoch": 2.607711583542155, "step": 16130}, {"loss": 0.6451, "grad_norm": 0.9485327005386353, "learning_rate": 0.0002, "epoch": 2.6093282677229004, "step": 16140}, {"loss": 0.6815, "grad_norm": 0.8449000120162964, "learning_rate": 0.0002, "epoch": 2.6109449519036456, "step": 16150}, {"loss": 0.7152, "grad_norm": 0.8520140051841736, "learning_rate": 0.0002, "epoch": 2.612561636084391, "step": 16160}, {"loss": 0.6759, "grad_norm": 0.7456524968147278, "learning_rate": 0.0002, "epoch": 2.614178320265136, "step": 16170}, {"loss": 0.6893, "grad_norm": 0.9912857413291931, "learning_rate": 0.0002, "epoch": 2.6157950044458813, "step": 16180}, {"loss": 0.7243, "grad_norm": 0.9001946449279785, "learning_rate": 0.0002, "epoch": 2.6174116886266265, "step": 16190}, {"loss": 0.6825, "grad_norm": 0.6568667888641357, "learning_rate": 0.0002, "epoch": 2.619028372807372, "step": 16200}, {"loss": 0.7013, "grad_norm": 1.0248128175735474, "learning_rate": 0.0002, "epoch": 2.6206450569881174, "step": 16210}, {"loss": 0.7045, "grad_norm": 0.6509039998054504, "learning_rate": 0.0002, "epoch": 2.6222617411688627, "step": 16220}, {"loss": 0.72, "grad_norm": 0.7626351118087769, "learning_rate": 0.0002, "epoch": 2.623878425349608, "step": 16230}, {"loss": 0.6556, "grad_norm": 0.6938552260398865, "learning_rate": 0.0002, "epoch": 2.625495109530353, "step": 16240}, {"loss": 0.65, "grad_norm": 0.6434680819511414, "learning_rate": 0.0002, "epoch": 2.6271117937110984, "step": 16250}, {"loss": 0.6943, "grad_norm": 0.7111515998840332, "learning_rate": 0.0002, "epoch": 2.628728477891844, "step": 16260}, {"loss": 0.679, "grad_norm": 0.7712395787239075, "learning_rate": 0.0002, "epoch": 2.6303451620725893, "step": 16270}, {"loss": 0.6886, "grad_norm": 0.792209267616272, "learning_rate": 0.0002, "epoch": 2.6319618462533345, "step": 16280}, {"loss": 0.6554, "grad_norm": 0.6801066398620605, "learning_rate": 0.0002, "epoch": 2.6335785304340797, "step": 16290}, {"loss": 0.73, "grad_norm": 0.7802573442459106, "learning_rate": 0.0002, "epoch": 2.635195214614825, "step": 16300}, {"loss": 0.7484, "grad_norm": 0.7742244601249695, "learning_rate": 0.0002, "epoch": 2.63681189879557, "step": 16310}, {"loss": 0.6524, "grad_norm": 0.664184033870697, "learning_rate": 0.0002, "epoch": 2.6384285829763154, "step": 16320}, {"loss": 0.6442, "grad_norm": 0.9242228865623474, "learning_rate": 0.0002, "epoch": 2.6400452671570607, "step": 16330}, {"loss": 0.6792, "grad_norm": 0.9661325216293335, "learning_rate": 0.0002, "epoch": 2.641661951337806, "step": 16340}, {"loss": 0.6847, "grad_norm": 0.837526798248291, "learning_rate": 0.0002, "epoch": 2.6432786355185516, "step": 16350}, {"loss": 0.7686, "grad_norm": 1.1834373474121094, "learning_rate": 0.0002, "epoch": 2.644895319699297, "step": 16360}, {"loss": 0.6746, "grad_norm": 0.7467831373214722, "learning_rate": 0.0002, "epoch": 2.646512003880042, "step": 16370}, {"loss": 0.6935, "grad_norm": 0.8627146482467651, "learning_rate": 0.0002, "epoch": 2.6481286880607873, "step": 16380}, {"loss": 0.715, "grad_norm": 0.790447473526001, "learning_rate": 0.0002, "epoch": 2.6497453722415325, "step": 16390}, {"loss": 0.723, "grad_norm": 0.8447365164756775, "learning_rate": 0.0002, "epoch": 2.651362056422278, "step": 16400}, {"loss": 0.6628, "grad_norm": 0.7831417918205261, "learning_rate": 0.0002, "epoch": 2.6529787406030234, "step": 16410}, {"loss": 0.6691, "grad_norm": 0.6837952136993408, "learning_rate": 0.0002, "epoch": 2.6545954247837686, "step": 16420}, {"loss": 0.6139, "grad_norm": 0.7031801342964172, "learning_rate": 0.0002, "epoch": 2.656212108964514, "step": 16430}, {"loss": 0.7382, "grad_norm": 0.8963770866394043, "learning_rate": 0.0002, "epoch": 2.657828793145259, "step": 16440}, {"loss": 0.6439, "grad_norm": 0.6852328181266785, "learning_rate": 0.0002, "epoch": 2.6594454773260043, "step": 16450}, {"loss": 0.6278, "grad_norm": 0.8069294095039368, "learning_rate": 0.0002, "epoch": 2.6610621615067496, "step": 16460}, {"loss": 0.6939, "grad_norm": 0.7503686547279358, "learning_rate": 0.0002, "epoch": 2.662678845687495, "step": 16470}, {"loss": 0.6777, "grad_norm": 0.6430956125259399, "learning_rate": 0.0002, "epoch": 2.66429552986824, "step": 16480}, {"loss": 0.6863, "grad_norm": 0.7894312739372253, "learning_rate": 0.0002, "epoch": 2.6659122140489853, "step": 16490}, {"loss": 0.7165, "grad_norm": 0.7277431488037109, "learning_rate": 0.0002, "epoch": 2.667528898229731, "step": 16500}, {"loss": 0.6772, "grad_norm": 0.6816153526306152, "learning_rate": 0.0002, "epoch": 2.669145582410476, "step": 16510}, {"loss": 0.691, "grad_norm": 0.8145235776901245, "learning_rate": 0.0002, "epoch": 2.6707622665912214, "step": 16520}, {"loss": 0.709, "grad_norm": 0.8645890355110168, "learning_rate": 0.0002, "epoch": 2.6723789507719666, "step": 16530}, {"loss": 0.6946, "grad_norm": 0.704393208026886, "learning_rate": 0.0002, "epoch": 2.673995634952712, "step": 16540}, {"loss": 0.6378, "grad_norm": 1.0120846033096313, "learning_rate": 0.0002, "epoch": 2.6756123191334575, "step": 16550}, {"loss": 0.7241, "grad_norm": 0.6919328570365906, "learning_rate": 0.0002, "epoch": 2.6772290033142028, "step": 16560}, {"loss": 0.7098, "grad_norm": 0.6924574971199036, "learning_rate": 0.0002, "epoch": 2.678845687494948, "step": 16570}, {"loss": 0.731, "grad_norm": 0.9679301381111145, "learning_rate": 0.0002, "epoch": 2.6804623716756932, "step": 16580}, {"loss": 0.7124, "grad_norm": 0.6810211539268494, "learning_rate": 0.0002, "epoch": 2.6820790558564385, "step": 16590}, {"loss": 0.6688, "grad_norm": 0.9730555415153503, "learning_rate": 0.0002, "epoch": 2.6836957400371837, "step": 16600}, {"loss": 0.7344, "grad_norm": 0.7852821350097656, "learning_rate": 0.0002, "epoch": 2.685312424217929, "step": 16610}, {"loss": 0.6401, "grad_norm": 0.6059057116508484, "learning_rate": 0.0002, "epoch": 2.686929108398674, "step": 16620}, {"loss": 0.6796, "grad_norm": 0.9395958781242371, "learning_rate": 0.0002, "epoch": 2.6885457925794194, "step": 16630}, {"loss": 0.7174, "grad_norm": 0.7473729848861694, "learning_rate": 0.0002, "epoch": 2.690162476760165, "step": 16640}, {"loss": 0.7087, "grad_norm": 0.765934407711029, "learning_rate": 0.0002, "epoch": 2.6917791609409103, "step": 16650}, {"loss": 0.707, "grad_norm": 0.8496677279472351, "learning_rate": 0.0002, "epoch": 2.6933958451216555, "step": 16660}, {"loss": 0.7084, "grad_norm": 0.7641879916191101, "learning_rate": 0.0002, "epoch": 2.6950125293024008, "step": 16670}, {"loss": 0.6566, "grad_norm": 0.8471952676773071, "learning_rate": 0.0002, "epoch": 2.696629213483146, "step": 16680}, {"loss": 0.6635, "grad_norm": 0.6946060657501221, "learning_rate": 0.0002, "epoch": 2.6982458976638912, "step": 16690}, {"loss": 0.7027, "grad_norm": 0.7361312508583069, "learning_rate": 0.0002, "epoch": 2.699862581844637, "step": 16700}, {"loss": 0.6767, "grad_norm": 0.6605038046836853, "learning_rate": 0.0002, "epoch": 2.701479266025382, "step": 16710}, {"loss": 0.6885, "grad_norm": 0.7164411544799805, "learning_rate": 0.0002, "epoch": 2.7030959502061274, "step": 16720}, {"loss": 0.6736, "grad_norm": 0.6496201157569885, "learning_rate": 0.0002, "epoch": 2.7047126343868726, "step": 16730}, {"loss": 0.6942, "grad_norm": 0.7826663851737976, "learning_rate": 0.0002, "epoch": 2.706329318567618, "step": 16740}, {"loss": 0.6773, "grad_norm": 0.7639131546020508, "learning_rate": 0.0002, "epoch": 2.707946002748363, "step": 16750}, {"loss": 0.69, "grad_norm": 0.7976210713386536, "learning_rate": 0.0002, "epoch": 2.7095626869291083, "step": 16760}, {"loss": 0.6735, "grad_norm": 0.6836577653884888, "learning_rate": 0.0002, "epoch": 2.7111793711098535, "step": 16770}, {"loss": 0.6596, "grad_norm": 0.8025202751159668, "learning_rate": 0.0002, "epoch": 2.7127960552905988, "step": 16780}, {"loss": 0.6324, "grad_norm": 0.7636463642120361, "learning_rate": 0.0002, "epoch": 2.7144127394713444, "step": 16790}, {"loss": 0.6227, "grad_norm": 0.7481677532196045, "learning_rate": 0.0002, "epoch": 2.7160294236520897, "step": 16800}, {"loss": 0.6925, "grad_norm": 0.7566834688186646, "learning_rate": 0.0002, "epoch": 2.717646107832835, "step": 16810}, {"loss": 0.6531, "grad_norm": 0.7931267619132996, "learning_rate": 0.0002, "epoch": 2.71926279201358, "step": 16820}, {"loss": 0.6672, "grad_norm": 0.8811662197113037, "learning_rate": 0.0002, "epoch": 2.7208794761943254, "step": 16830}, {"loss": 0.6675, "grad_norm": 0.8561240434646606, "learning_rate": 0.0002, "epoch": 2.7224961603750706, "step": 16840}, {"loss": 0.7135, "grad_norm": 0.7121599316596985, "learning_rate": 0.0002, "epoch": 2.7241128445558163, "step": 16850}, {"loss": 0.6825, "grad_norm": 0.8066257238388062, "learning_rate": 0.0002, "epoch": 2.7257295287365615, "step": 16860}, {"loss": 0.6839, "grad_norm": 0.7699271440505981, "learning_rate": 0.0002, "epoch": 2.7273462129173067, "step": 16870}, {"loss": 0.699, "grad_norm": 1.1828432083129883, "learning_rate": 0.0002, "epoch": 2.728962897098052, "step": 16880}, {"loss": 0.6518, "grad_norm": 0.9989302754402161, "learning_rate": 0.0002, "epoch": 2.730579581278797, "step": 16890}, {"loss": 0.7015, "grad_norm": 0.8100560307502747, "learning_rate": 0.0002, "epoch": 2.7321962654595424, "step": 16900}, {"loss": 0.6851, "grad_norm": 0.8615233898162842, "learning_rate": 0.0002, "epoch": 2.7338129496402876, "step": 16910}, {"loss": 0.6322, "grad_norm": 0.8633756041526794, "learning_rate": 0.0002, "epoch": 2.735429633821033, "step": 16920}, {"loss": 0.6488, "grad_norm": 0.7769348621368408, "learning_rate": 0.0002, "epoch": 2.737046318001778, "step": 16930}, {"loss": 0.6582, "grad_norm": 0.6943058371543884, "learning_rate": 0.0002, "epoch": 2.738663002182524, "step": 16940}, {"loss": 0.6516, "grad_norm": 0.8510736227035522, "learning_rate": 0.0002, "epoch": 2.740279686363269, "step": 16950}, {"loss": 0.7275, "grad_norm": 0.7732602953910828, "learning_rate": 0.0002, "epoch": 2.7418963705440142, "step": 16960}, {"loss": 0.6553, "grad_norm": 0.5981788635253906, "learning_rate": 0.0002, "epoch": 2.7435130547247595, "step": 16970}, {"loss": 0.6777, "grad_norm": 0.7604416012763977, "learning_rate": 0.0002, "epoch": 2.7451297389055047, "step": 16980}, {"loss": 0.6981, "grad_norm": 0.7377738356590271, "learning_rate": 0.0002, "epoch": 2.74674642308625, "step": 16990}, {"loss": 0.6294, "grad_norm": 0.9400289058685303, "learning_rate": 0.0002, "epoch": 2.7483631072669956, "step": 17000}, {"loss": 0.6952, "grad_norm": 0.6340599656105042, "learning_rate": 0.0002, "epoch": 2.749979791447741, "step": 17010}, {"loss": 0.7222, "grad_norm": 0.7297601103782654, "learning_rate": 0.0002, "epoch": 2.751596475628486, "step": 17020}, {"loss": 0.6659, "grad_norm": 0.9479979872703552, "learning_rate": 0.0002, "epoch": 2.7532131598092313, "step": 17030}, {"loss": 0.691, "grad_norm": 0.8461511135101318, "learning_rate": 0.0002, "epoch": 2.7548298439899765, "step": 17040}, {"loss": 0.6764, "grad_norm": 0.7477551698684692, "learning_rate": 0.0002, "epoch": 2.7564465281707218, "step": 17050}, {"loss": 0.684, "grad_norm": 1.019270420074463, "learning_rate": 0.0002, "epoch": 2.758063212351467, "step": 17060}, {"loss": 0.7119, "grad_norm": 0.7730235457420349, "learning_rate": 0.0002, "epoch": 2.7596798965322122, "step": 17070}, {"loss": 0.6886, "grad_norm": 0.8216866254806519, "learning_rate": 0.0002, "epoch": 2.7612965807129575, "step": 17080}, {"loss": 0.6811, "grad_norm": 0.7235931754112244, "learning_rate": 0.0002, "epoch": 2.762913264893703, "step": 17090}, {"loss": 0.7031, "grad_norm": 0.7352296710014343, "learning_rate": 0.0002, "epoch": 2.7645299490744484, "step": 17100}, {"loss": 0.6951, "grad_norm": 0.8129373788833618, "learning_rate": 0.0002, "epoch": 2.7661466332551936, "step": 17110}, {"loss": 0.6703, "grad_norm": 0.7387019991874695, "learning_rate": 0.0002, "epoch": 2.767763317435939, "step": 17120}, {"loss": 0.6789, "grad_norm": 0.9149190187454224, "learning_rate": 0.0002, "epoch": 2.769380001616684, "step": 17130}, {"loss": 0.6038, "grad_norm": 0.7352971434593201, "learning_rate": 0.0002, "epoch": 2.7709966857974297, "step": 17140}, {"loss": 0.6728, "grad_norm": 0.7903780341148376, "learning_rate": 0.0002, "epoch": 2.772613369978175, "step": 17150}, {"loss": 0.6988, "grad_norm": 0.8255927562713623, "learning_rate": 0.0002, "epoch": 2.77423005415892, "step": 17160}, {"loss": 0.6694, "grad_norm": 0.7235927581787109, "learning_rate": 0.0002, "epoch": 2.7758467383396654, "step": 17170}, {"loss": 0.7161, "grad_norm": 0.8281434774398804, "learning_rate": 0.0002, "epoch": 2.7774634225204107, "step": 17180}, {"loss": 0.682, "grad_norm": 0.7586921453475952, "learning_rate": 0.0002, "epoch": 2.779080106701156, "step": 17190}, {"loss": 0.6427, "grad_norm": 0.7161715030670166, "learning_rate": 0.0002, "epoch": 2.780696790881901, "step": 17200}, {"loss": 0.6426, "grad_norm": 0.762868344783783, "learning_rate": 0.0002, "epoch": 2.7823134750626464, "step": 17210}, {"loss": 0.705, "grad_norm": 0.9285483360290527, "learning_rate": 0.0002, "epoch": 2.7839301592433916, "step": 17220}, {"loss": 0.7084, "grad_norm": 0.6900462508201599, "learning_rate": 0.0002, "epoch": 2.785546843424137, "step": 17230}, {"loss": 0.6988, "grad_norm": 0.780384361743927, "learning_rate": 0.0002, "epoch": 2.7871635276048825, "step": 17240}, {"loss": 0.7073, "grad_norm": 0.7580406665802002, "learning_rate": 0.0002, "epoch": 2.7887802117856277, "step": 17250}, {"loss": 0.6833, "grad_norm": 0.8145199418067932, "learning_rate": 0.0002, "epoch": 2.790396895966373, "step": 17260}, {"loss": 0.6909, "grad_norm": 0.9159596562385559, "learning_rate": 0.0002, "epoch": 2.792013580147118, "step": 17270}, {"loss": 0.6008, "grad_norm": 0.9590014219284058, "learning_rate": 0.0002, "epoch": 2.7936302643278634, "step": 17280}, {"loss": 0.6704, "grad_norm": 0.7603529691696167, "learning_rate": 0.0002, "epoch": 2.795246948508609, "step": 17290}, {"loss": 0.7165, "grad_norm": 0.8039976358413696, "learning_rate": 0.0002, "epoch": 2.7968636326893543, "step": 17300}, {"loss": 0.7037, "grad_norm": 0.8364847302436829, "learning_rate": 0.0002, "epoch": 2.7984803168700996, "step": 17310}, {"loss": 0.6749, "grad_norm": 0.8763046860694885, "learning_rate": 0.0002, "epoch": 2.800097001050845, "step": 17320}, {"loss": 0.6844, "grad_norm": 0.8409647941589355, "learning_rate": 0.0002, "epoch": 2.80171368523159, "step": 17330}, {"loss": 0.6936, "grad_norm": 0.7649006247520447, "learning_rate": 0.0002, "epoch": 2.8033303694123353, "step": 17340}, {"loss": 0.7051, "grad_norm": 0.7970262169837952, "learning_rate": 0.0002, "epoch": 2.8049470535930805, "step": 17350}, {"loss": 0.6533, "grad_norm": 0.9088607430458069, "learning_rate": 0.0002, "epoch": 2.8065637377738257, "step": 17360}, {"loss": 0.675, "grad_norm": 0.6454846858978271, "learning_rate": 0.0002, "epoch": 2.808180421954571, "step": 17370}, {"loss": 0.7069, "grad_norm": 0.7744787931442261, "learning_rate": 0.0002, "epoch": 2.809797106135316, "step": 17380}, {"loss": 0.6772, "grad_norm": 0.6678640842437744, "learning_rate": 0.0002, "epoch": 2.811413790316062, "step": 17390}, {"loss": 0.6784, "grad_norm": 0.772676944732666, "learning_rate": 0.0002, "epoch": 2.813030474496807, "step": 17400}, {"loss": 0.7252, "grad_norm": 0.7088175415992737, "learning_rate": 0.0002, "epoch": 2.8146471586775523, "step": 17410}, {"loss": 0.7086, "grad_norm": 0.8280573487281799, "learning_rate": 0.0002, "epoch": 2.8162638428582976, "step": 17420}, {"loss": 0.6732, "grad_norm": 0.6665388345718384, "learning_rate": 0.0002, "epoch": 2.817880527039043, "step": 17430}, {"loss": 0.6675, "grad_norm": 0.6427883505821228, "learning_rate": 0.0002, "epoch": 2.8194972112197885, "step": 17440}, {"loss": 0.6972, "grad_norm": 0.9697760343551636, "learning_rate": 0.0002, "epoch": 2.8211138954005337, "step": 17450}, {"loss": 0.6838, "grad_norm": 0.7573966383934021, "learning_rate": 0.0002, "epoch": 2.822730579581279, "step": 17460}, {"loss": 0.7243, "grad_norm": 0.878688633441925, "learning_rate": 0.0002, "epoch": 2.824347263762024, "step": 17470}, {"loss": 0.6666, "grad_norm": 0.7752242684364319, "learning_rate": 0.0002, "epoch": 2.8259639479427694, "step": 17480}, {"loss": 0.6638, "grad_norm": 0.6135398745536804, "learning_rate": 0.0002, "epoch": 2.8275806321235146, "step": 17490}, {"loss": 0.6829, "grad_norm": 0.6924924850463867, "learning_rate": 0.0002, "epoch": 2.82919731630426, "step": 17500}, {"loss": 0.6731, "grad_norm": 0.7471627593040466, "learning_rate": 0.0002, "epoch": 2.830814000485005, "step": 17510}, {"loss": 0.7016, "grad_norm": 0.7145499587059021, "learning_rate": 0.0002, "epoch": 2.8324306846657503, "step": 17520}, {"loss": 0.6787, "grad_norm": 0.7415414452552795, "learning_rate": 0.0002, "epoch": 2.834047368846496, "step": 17530}, {"loss": 0.6811, "grad_norm": 0.7328441739082336, "learning_rate": 0.0002, "epoch": 2.8356640530272412, "step": 17540}, {"loss": 0.6866, "grad_norm": 0.8267839550971985, "learning_rate": 0.0002, "epoch": 2.8372807372079865, "step": 17550}, {"loss": 0.6787, "grad_norm": 0.8877885341644287, "learning_rate": 0.0002, "epoch": 2.8388974213887317, "step": 17560}, {"loss": 0.7136, "grad_norm": 0.857138454914093, "learning_rate": 0.0002, "epoch": 2.840514105569477, "step": 17570}, {"loss": 0.6454, "grad_norm": 0.8470779657363892, "learning_rate": 0.0002, "epoch": 2.842130789750222, "step": 17580}, {"loss": 0.6976, "grad_norm": 0.8553254008293152, "learning_rate": 0.0002, "epoch": 2.843747473930968, "step": 17590}, {"loss": 0.7297, "grad_norm": 0.8033196926116943, "learning_rate": 0.0002, "epoch": 2.845364158111713, "step": 17600}, {"loss": 0.7062, "grad_norm": 0.7949087023735046, "learning_rate": 0.0002, "epoch": 2.8469808422924583, "step": 17610}, {"loss": 0.651, "grad_norm": 0.9241406321525574, "learning_rate": 0.0002, "epoch": 2.8485975264732035, "step": 17620}, {"loss": 0.6601, "grad_norm": 0.7721285223960876, "learning_rate": 0.0002, "epoch": 2.8502142106539488, "step": 17630}, {"loss": 0.6183, "grad_norm": 1.0246692895889282, "learning_rate": 0.0002, "epoch": 2.851830894834694, "step": 17640}, {"loss": 0.7007, "grad_norm": 0.9244589805603027, "learning_rate": 0.0002, "epoch": 2.853447579015439, "step": 17650}, {"loss": 0.7274, "grad_norm": 0.7243508696556091, "learning_rate": 0.0002, "epoch": 2.8550642631961844, "step": 17660}, {"loss": 0.6471, "grad_norm": 0.8943371176719666, "learning_rate": 0.0002, "epoch": 2.8566809473769297, "step": 17670}, {"loss": 0.686, "grad_norm": 0.6531758904457092, "learning_rate": 0.0002, "epoch": 2.8582976315576754, "step": 17680}, {"loss": 0.6253, "grad_norm": 0.8367000818252563, "learning_rate": 0.0002, "epoch": 2.8599143157384206, "step": 17690}, {"loss": 0.6943, "grad_norm": 0.7868556380271912, "learning_rate": 0.0002, "epoch": 2.861530999919166, "step": 17700}, {"loss": 0.6919, "grad_norm": 0.7213859558105469, "learning_rate": 0.0002, "epoch": 2.863147684099911, "step": 17710}, {"loss": 0.6657, "grad_norm": 0.7383931279182434, "learning_rate": 0.0002, "epoch": 2.8647643682806563, "step": 17720}, {"loss": 0.6841, "grad_norm": 0.7566812634468079, "learning_rate": 0.0002, "epoch": 2.8663810524614015, "step": 17730}, {"loss": 0.6449, "grad_norm": 0.6930373311042786, "learning_rate": 0.0002, "epoch": 2.867997736642147, "step": 17740}, {"loss": 0.6764, "grad_norm": 0.7911090850830078, "learning_rate": 0.0002, "epoch": 2.8696144208228924, "step": 17750}, {"loss": 0.6554, "grad_norm": 0.8484548926353455, "learning_rate": 0.0002, "epoch": 2.8712311050036377, "step": 17760}, {"loss": 0.6931, "grad_norm": 0.7647597193717957, "learning_rate": 0.0002, "epoch": 2.872847789184383, "step": 17770}, {"loss": 0.6945, "grad_norm": 0.8791151642799377, "learning_rate": 0.0002, "epoch": 2.874464473365128, "step": 17780}, {"loss": 0.7078, "grad_norm": 0.7253178358078003, "learning_rate": 0.0002, "epoch": 2.8760811575458733, "step": 17790}, {"loss": 0.6474, "grad_norm": 0.7956077456474304, "learning_rate": 0.0002, "epoch": 2.8776978417266186, "step": 17800}, {"loss": 0.6687, "grad_norm": 0.8657688498497009, "learning_rate": 0.0002, "epoch": 2.879314525907364, "step": 17810}, {"loss": 0.7171, "grad_norm": 0.7059141993522644, "learning_rate": 0.0002, "epoch": 2.880931210088109, "step": 17820}, {"loss": 0.683, "grad_norm": 0.8886896967887878, "learning_rate": 0.0002, "epoch": 2.8825478942688547, "step": 17830}, {"loss": 0.669, "grad_norm": 0.821032702922821, "learning_rate": 0.0002, "epoch": 2.8841645784496, "step": 17840}, {"loss": 0.6805, "grad_norm": 0.7183963656425476, "learning_rate": 0.0002, "epoch": 2.885781262630345, "step": 17850}, {"loss": 0.7088, "grad_norm": 0.6222899556159973, "learning_rate": 0.0002, "epoch": 2.8873979468110904, "step": 17860}, {"loss": 0.6626, "grad_norm": 0.8187434077262878, "learning_rate": 0.0002, "epoch": 2.8890146309918356, "step": 17870}, {"loss": 0.6815, "grad_norm": 0.9838479161262512, "learning_rate": 0.0002, "epoch": 2.890631315172581, "step": 17880}, {"loss": 0.6967, "grad_norm": 0.7567742466926575, "learning_rate": 0.0002, "epoch": 2.8922479993533265, "step": 17890}, {"loss": 0.7073, "grad_norm": 0.6875903606414795, "learning_rate": 0.0002, "epoch": 2.893864683534072, "step": 17900}, {"loss": 0.6415, "grad_norm": 0.8043789267539978, "learning_rate": 0.0002, "epoch": 2.895481367714817, "step": 17910}, {"loss": 0.6588, "grad_norm": 0.8062626719474792, "learning_rate": 0.0002, "epoch": 2.8970980518955622, "step": 17920}, {"loss": 0.7151, "grad_norm": 1.0251191854476929, "learning_rate": 0.0002, "epoch": 2.8987147360763075, "step": 17930}, {"loss": 0.6605, "grad_norm": 0.882253110408783, "learning_rate": 0.0002, "epoch": 2.9003314202570527, "step": 17940}, {"loss": 0.6719, "grad_norm": 0.8683299422264099, "learning_rate": 0.0002, "epoch": 2.901948104437798, "step": 17950}, {"loss": 0.6896, "grad_norm": 0.7167282104492188, "learning_rate": 0.0002, "epoch": 2.903564788618543, "step": 17960}, {"loss": 0.663, "grad_norm": 0.7093694806098938, "learning_rate": 0.0002, "epoch": 2.9051814727992884, "step": 17970}, {"loss": 0.6591, "grad_norm": 0.8549879193305969, "learning_rate": 0.0002, "epoch": 2.906798156980034, "step": 17980}, {"loss": 0.6962, "grad_norm": 0.6989606618881226, "learning_rate": 0.0002, "epoch": 2.9084148411607793, "step": 17990}, {"loss": 0.6635, "grad_norm": 0.9482976794242859, "learning_rate": 0.0002, "epoch": 2.9100315253415245, "step": 18000}, {"loss": 0.6586, "grad_norm": 0.7182440161705017, "learning_rate": 0.0002, "epoch": 2.9116482095222698, "step": 18010}, {"loss": 0.6827, "grad_norm": 0.7732226252555847, "learning_rate": 0.0002, "epoch": 2.913264893703015, "step": 18020}, {"loss": 0.7123, "grad_norm": 0.7936875224113464, "learning_rate": 0.0002, "epoch": 2.9148815778837607, "step": 18030}, {"loss": 0.6736, "grad_norm": 0.8825615644454956, "learning_rate": 0.0002, "epoch": 2.916498262064506, "step": 18040}, {"loss": 0.7139, "grad_norm": 0.6778587102890015, "learning_rate": 0.0002, "epoch": 2.918114946245251, "step": 18050}, {"loss": 0.6588, "grad_norm": 0.7529265880584717, "learning_rate": 0.0002, "epoch": 2.9197316304259964, "step": 18060}, {"loss": 0.737, "grad_norm": 0.7111883163452148, "learning_rate": 0.0002, "epoch": 2.9213483146067416, "step": 18070}, {"loss": 0.7475, "grad_norm": 0.7214767932891846, "learning_rate": 0.0002, "epoch": 2.922964998787487, "step": 18080}, {"loss": 0.6672, "grad_norm": 0.800417423248291, "learning_rate": 0.0002, "epoch": 2.924581682968232, "step": 18090}, {"loss": 0.6694, "grad_norm": 1.248575210571289, "learning_rate": 0.0002, "epoch": 2.9261983671489773, "step": 18100}, {"loss": 0.7004, "grad_norm": 0.757788360118866, "learning_rate": 0.0002, "epoch": 2.9278150513297225, "step": 18110}, {"loss": 0.6999, "grad_norm": 1.0583995580673218, "learning_rate": 0.0002, "epoch": 2.9294317355104678, "step": 18120}, {"loss": 0.6365, "grad_norm": 0.8228777647018433, "learning_rate": 0.0002, "epoch": 2.9310484196912134, "step": 18130}, {"loss": 0.6791, "grad_norm": 0.8374035358428955, "learning_rate": 0.0002, "epoch": 2.9326651038719587, "step": 18140}, {"loss": 0.6399, "grad_norm": 0.7976473569869995, "learning_rate": 0.0002, "epoch": 2.934281788052704, "step": 18150}, {"loss": 0.6585, "grad_norm": 0.8009907603263855, "learning_rate": 0.0002, "epoch": 2.935898472233449, "step": 18160}, {"loss": 0.7485, "grad_norm": 0.835213303565979, "learning_rate": 0.0002, "epoch": 2.9375151564141944, "step": 18170}, {"loss": 0.7376, "grad_norm": 0.7982219457626343, "learning_rate": 0.0002, "epoch": 2.93913184059494, "step": 18180}, {"loss": 0.6348, "grad_norm": 0.7070978879928589, "learning_rate": 0.0002, "epoch": 2.9407485247756853, "step": 18190}, {"loss": 0.6608, "grad_norm": 0.8619440197944641, "learning_rate": 0.0002, "epoch": 2.9423652089564305, "step": 18200}, {"loss": 0.666, "grad_norm": 0.6693987250328064, "learning_rate": 0.0002, "epoch": 2.9439818931371757, "step": 18210}, {"loss": 0.728, "grad_norm": 0.6747021079063416, "learning_rate": 0.0002, "epoch": 2.945598577317921, "step": 18220}, {"loss": 0.6686, "grad_norm": 0.860387921333313, "learning_rate": 0.0002, "epoch": 2.947215261498666, "step": 18230}, {"loss": 0.6945, "grad_norm": 0.799976646900177, "learning_rate": 0.0002, "epoch": 2.9488319456794114, "step": 18240}, {"loss": 0.7243, "grad_norm": 0.7864769101142883, "learning_rate": 0.0002, "epoch": 2.9504486298601567, "step": 18250}, {"loss": 0.6785, "grad_norm": 0.6713884472846985, "learning_rate": 0.0002, "epoch": 2.952065314040902, "step": 18260}, {"loss": 0.7429, "grad_norm": 0.9031508564949036, "learning_rate": 0.0002, "epoch": 2.9536819982216476, "step": 18270}, {"loss": 0.7055, "grad_norm": 0.7205073237419128, "learning_rate": 0.0002, "epoch": 2.955298682402393, "step": 18280}, {"loss": 0.7298, "grad_norm": 0.7746205925941467, "learning_rate": 0.0002, "epoch": 2.956915366583138, "step": 18290}, {"loss": 0.6218, "grad_norm": 0.6533427834510803, "learning_rate": 0.0002, "epoch": 2.9585320507638833, "step": 18300}, {"loss": 0.6674, "grad_norm": 0.9083208441734314, "learning_rate": 0.0002, "epoch": 2.9601487349446285, "step": 18310}, {"loss": 0.7359, "grad_norm": 0.7446991801261902, "learning_rate": 0.0002, "epoch": 2.9617654191253737, "step": 18320}, {"loss": 0.6738, "grad_norm": 0.6514461636543274, "learning_rate": 0.0002, "epoch": 2.9633821033061194, "step": 18330}, {"loss": 0.6677, "grad_norm": 0.8580465912818909, "learning_rate": 0.0002, "epoch": 2.9649987874868646, "step": 18340}, {"loss": 0.6971, "grad_norm": 0.7074266076087952, "learning_rate": 0.0002, "epoch": 2.96661547166761, "step": 18350}, {"loss": 0.6804, "grad_norm": 0.899892270565033, "learning_rate": 0.0002, "epoch": 2.968232155848355, "step": 18360}, {"loss": 0.7094, "grad_norm": 0.8217641711235046, "learning_rate": 0.0002, "epoch": 2.9698488400291003, "step": 18370}, {"loss": 0.6916, "grad_norm": 0.8611799478530884, "learning_rate": 0.0002, "epoch": 2.9714655242098456, "step": 18380}, {"loss": 0.6677, "grad_norm": 0.6909302473068237, "learning_rate": 0.0002, "epoch": 2.973082208390591, "step": 18390}, {"loss": 0.7247, "grad_norm": 0.6554358005523682, "learning_rate": 0.0002, "epoch": 2.974698892571336, "step": 18400}, {"loss": 0.6516, "grad_norm": 0.7803071737289429, "learning_rate": 0.0002, "epoch": 2.9763155767520812, "step": 18410}, {"loss": 0.7322, "grad_norm": 0.7838954925537109, "learning_rate": 0.0002, "epoch": 2.977932260932827, "step": 18420}, {"loss": 0.6522, "grad_norm": 0.7098495364189148, "learning_rate": 0.0002, "epoch": 2.979548945113572, "step": 18430}, {"loss": 0.739, "grad_norm": 0.8981785774230957, "learning_rate": 0.0002, "epoch": 2.9811656292943174, "step": 18440}, {"loss": 0.6689, "grad_norm": 0.7197171449661255, "learning_rate": 0.0002, "epoch": 2.9827823134750626, "step": 18450}, {"loss": 0.706, "grad_norm": 0.793185293674469, "learning_rate": 0.0002, "epoch": 2.984398997655808, "step": 18460}, {"loss": 0.7124, "grad_norm": 0.8531473875045776, "learning_rate": 0.0002, "epoch": 2.986015681836553, "step": 18470}, {"loss": 0.6901, "grad_norm": 0.6627361178398132, "learning_rate": 0.0002, "epoch": 2.9876323660172988, "step": 18480}, {"loss": 0.6591, "grad_norm": 0.5708155035972595, "learning_rate": 0.0002, "epoch": 2.989249050198044, "step": 18490}, {"loss": 0.6725, "grad_norm": 0.8227280378341675, "learning_rate": 0.0002, "epoch": 2.990865734378789, "step": 18500}, {"loss": 0.6701, "grad_norm": 0.7102749943733215, "learning_rate": 0.0002, "epoch": 2.9924824185595345, "step": 18510}, {"loss": 0.7091, "grad_norm": 0.839485228061676, "learning_rate": 0.0002, "epoch": 2.9940991027402797, "step": 18520}, {"loss": 0.6521, "grad_norm": 0.9038704037666321, "learning_rate": 0.0002, "epoch": 2.995715786921025, "step": 18530}, {"loss": 0.7186, "grad_norm": 0.8737510442733765, "learning_rate": 0.0002, "epoch": 2.99733247110177, "step": 18540}, {"loss": 0.6819, "grad_norm": 0.7323142886161804, "learning_rate": 0.0002, "epoch": 2.9989491552825154, "step": 18550}]} +{"epoch": 4.0, "step": 24742, "epoch_duration": 16883.95777654648, "total_accumulated_duration": 67535.40600156784, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.6636, "grad_norm": 0.9894065856933594, "learning_rate": 0.0002, "epoch": 0.0016166841807452913, "step": 10}, {"loss": 1.1528, "grad_norm": 1.7810699939727783, "learning_rate": 0.0002, "epoch": 0.0032333683614905826, "step": 20}, {"loss": 0.9767, "grad_norm": 0.5969577431678772, "learning_rate": 0.0002, "epoch": 0.004850052542235874, "step": 30}, {"loss": 0.9772, "grad_norm": 0.6354120969772339, "learning_rate": 0.0002, "epoch": 0.006466736722981165, "step": 40}, {"loss": 0.8643, "grad_norm": 0.5604607462882996, "learning_rate": 0.0002, "epoch": 0.008083420903726457, "step": 50}, {"loss": 0.8841, "grad_norm": 0.4676193594932556, "learning_rate": 0.0002, "epoch": 0.009700105084471748, "step": 60}, {"loss": 0.9022, "grad_norm": 0.6099211573600769, "learning_rate": 0.0002, "epoch": 0.01131678926521704, "step": 70}, {"loss": 0.9133, "grad_norm": 0.48639994859695435, "learning_rate": 0.0002, "epoch": 0.01293347344596233, "step": 80}, {"loss": 0.8704, "grad_norm": 0.4904264509677887, "learning_rate": 0.0002, "epoch": 0.014550157626707623, "step": 90}, {"loss": 0.8855, "grad_norm": 2.8334362506866455, "learning_rate": 0.0002, "epoch": 0.016166841807452915, "step": 100}, {"loss": 0.8958, "grad_norm": 0.43221670389175415, "learning_rate": 0.0002, "epoch": 0.017783525988198205, "step": 110}, {"loss": 0.8412, "grad_norm": 0.42244166135787964, "learning_rate": 0.0002, "epoch": 0.019400210168943496, "step": 120}, {"loss": 0.8467, "grad_norm": 0.45363298058509827, "learning_rate": 0.0002, "epoch": 0.02101689434968879, "step": 130}, {"loss": 0.8641, "grad_norm": 0.44816508889198303, "learning_rate": 0.0002, "epoch": 0.02263357853043408, "step": 140}, {"loss": 0.8496, "grad_norm": 0.43308213353157043, "learning_rate": 0.0002, "epoch": 0.02425026271117937, "step": 150}, {"loss": 0.8213, "grad_norm": 0.4084763526916504, "learning_rate": 0.0002, "epoch": 0.02586694689192466, "step": 160}, {"loss": 0.8343, "grad_norm": 0.5363703966140747, "learning_rate": 0.0002, "epoch": 0.027483631072669955, "step": 170}, {"loss": 0.8558, "grad_norm": 0.4619699716567993, "learning_rate": 0.0002, "epoch": 0.029100315253415245, "step": 180}, {"loss": 0.8878, "grad_norm": 0.49069908261299133, "learning_rate": 0.0002, "epoch": 0.030716999434160536, "step": 190}, {"loss": 0.8867, "grad_norm": 0.4645835757255554, "learning_rate": 0.0002, "epoch": 0.03233368361490583, "step": 200}, {"loss": 0.8842, "grad_norm": 1.2411243915557861, "learning_rate": 0.0002, "epoch": 0.03395036779565112, "step": 210}, {"loss": 0.8245, "grad_norm": 0.5211851596832275, "learning_rate": 0.0002, "epoch": 0.03556705197639641, "step": 220}, {"loss": 0.8194, "grad_norm": 0.5253691673278809, "learning_rate": 0.0002, "epoch": 0.037183736157141704, "step": 230}, {"loss": 0.8856, "grad_norm": 0.4567478895187378, "learning_rate": 0.0002, "epoch": 0.03880042033788699, "step": 240}, {"loss": 0.838, "grad_norm": 0.5472128391265869, "learning_rate": 0.0002, "epoch": 0.040417104518632285, "step": 250}, {"loss": 0.8201, "grad_norm": 0.42978546023368835, "learning_rate": 0.0002, "epoch": 0.04203378869937758, "step": 260}, {"loss": 0.8334, "grad_norm": 0.601734459400177, "learning_rate": 0.0002, "epoch": 0.043650472880122866, "step": 270}, {"loss": 0.815, "grad_norm": 0.4286513328552246, "learning_rate": 0.0002, "epoch": 0.04526715706086816, "step": 280}, {"loss": 0.8758, "grad_norm": 0.5230861902236938, "learning_rate": 0.0002, "epoch": 0.046883841241613454, "step": 290}, {"loss": 0.8636, "grad_norm": 0.6504611968994141, "learning_rate": 0.0002, "epoch": 0.04850052542235874, "step": 300}, {"loss": 0.8102, "grad_norm": 0.43485215306282043, "learning_rate": 0.0002, "epoch": 0.050117209603104035, "step": 310}, {"loss": 0.8221, "grad_norm": 0.4717007875442505, "learning_rate": 0.0002, "epoch": 0.05173389378384932, "step": 320}, {"loss": 0.8469, "grad_norm": 0.4059787690639496, "learning_rate": 0.0002, "epoch": 0.053350577964594616, "step": 330}, {"loss": 0.8866, "grad_norm": 0.4366913437843323, "learning_rate": 0.0002, "epoch": 0.05496726214533991, "step": 340}, {"loss": 0.7976, "grad_norm": 0.4233848452568054, "learning_rate": 0.0002, "epoch": 0.0565839463260852, "step": 350}, {"loss": 0.8456, "grad_norm": 0.4209108352661133, "learning_rate": 0.0002, "epoch": 0.05820063050683049, "step": 360}, {"loss": 0.816, "grad_norm": 0.41637396812438965, "learning_rate": 0.0002, "epoch": 0.059817314687575784, "step": 370}, {"loss": 0.7976, "grad_norm": 0.46235376596450806, "learning_rate": 0.0002, "epoch": 0.06143399886832107, "step": 380}, {"loss": 0.7966, "grad_norm": 0.4013484716415405, "learning_rate": 0.0002, "epoch": 0.06305068304906636, "step": 390}, {"loss": 0.8253, "grad_norm": 0.47443896532058716, "learning_rate": 0.0002, "epoch": 0.06466736722981166, "step": 400}, {"loss": 0.8666, "grad_norm": 0.3942156434059143, "learning_rate": 0.0002, "epoch": 0.06628405141055695, "step": 410}, {"loss": 0.8402, "grad_norm": 0.4965320825576782, "learning_rate": 0.0002, "epoch": 0.06790073559130223, "step": 420}, {"loss": 0.8317, "grad_norm": 0.4304835796356201, "learning_rate": 0.0002, "epoch": 0.06951741977204753, "step": 430}, {"loss": 0.8528, "grad_norm": 0.511726975440979, "learning_rate": 0.0002, "epoch": 0.07113410395279282, "step": 440}, {"loss": 0.8675, "grad_norm": 0.4040689170360565, "learning_rate": 0.0002, "epoch": 0.07275078813353811, "step": 450}, {"loss": 0.8788, "grad_norm": 0.5402171015739441, "learning_rate": 0.0002, "epoch": 0.07436747231428341, "step": 460}, {"loss": 0.8737, "grad_norm": 0.4174517095088959, "learning_rate": 0.0002, "epoch": 0.0759841564950287, "step": 470}, {"loss": 0.7605, "grad_norm": 0.4306182265281677, "learning_rate": 0.0002, "epoch": 0.07760084067577398, "step": 480}, {"loss": 0.799, "grad_norm": 0.535210132598877, "learning_rate": 0.0002, "epoch": 0.07921752485651928, "step": 490}, {"loss": 0.7825, "grad_norm": 0.5339109897613525, "learning_rate": 0.0002, "epoch": 0.08083420903726457, "step": 500}, {"loss": 0.8985, "grad_norm": 0.45754891633987427, "learning_rate": 0.0002, "epoch": 0.08245089321800986, "step": 510}, {"loss": 0.8144, "grad_norm": 0.43820783495903015, "learning_rate": 0.0002, "epoch": 0.08406757739875516, "step": 520}, {"loss": 0.8001, "grad_norm": 0.4434749186038971, "learning_rate": 0.0002, "epoch": 0.08568426157950045, "step": 530}, {"loss": 0.7857, "grad_norm": 0.43111467361450195, "learning_rate": 0.0002, "epoch": 0.08730094576024573, "step": 540}, {"loss": 0.8418, "grad_norm": 0.4378940165042877, "learning_rate": 0.0002, "epoch": 0.08891762994099103, "step": 550}, {"loss": 0.8361, "grad_norm": 0.4772215187549591, "learning_rate": 0.0002, "epoch": 0.09053431412173632, "step": 560}, {"loss": 0.8268, "grad_norm": 0.6837629079818726, "learning_rate": 0.0002, "epoch": 0.09215099830248161, "step": 570}, {"loss": 0.8607, "grad_norm": 0.42241212725639343, "learning_rate": 0.0002, "epoch": 0.09376768248322691, "step": 580}, {"loss": 0.852, "grad_norm": 0.5165936350822449, "learning_rate": 0.0002, "epoch": 0.0953843666639722, "step": 590}, {"loss": 0.8664, "grad_norm": 0.48737478256225586, "learning_rate": 0.0002, "epoch": 0.09700105084471748, "step": 600}, {"loss": 0.8806, "grad_norm": 0.47419852018356323, "learning_rate": 0.0002, "epoch": 0.09861773502546278, "step": 610}, {"loss": 0.8254, "grad_norm": 0.4975486099720001, "learning_rate": 0.0002, "epoch": 0.10023441920620807, "step": 620}, {"loss": 0.8548, "grad_norm": 0.49123844504356384, "learning_rate": 0.0002, "epoch": 0.10185110338695336, "step": 630}, {"loss": 0.8911, "grad_norm": 0.6288952827453613, "learning_rate": 0.0002, "epoch": 0.10346778756769864, "step": 640}, {"loss": 0.827, "grad_norm": 0.4277345836162567, "learning_rate": 0.0002, "epoch": 0.10508447174844394, "step": 650}, {"loss": 0.7996, "grad_norm": 0.4021061956882477, "learning_rate": 0.0002, "epoch": 0.10670115592918923, "step": 660}, {"loss": 0.87, "grad_norm": 0.3492237329483032, "learning_rate": 0.0002, "epoch": 0.10831784010993452, "step": 670}, {"loss": 0.8698, "grad_norm": 0.4341012239456177, "learning_rate": 0.0002, "epoch": 0.10993452429067982, "step": 680}, {"loss": 0.781, "grad_norm": 0.7296304106712341, "learning_rate": 0.0002, "epoch": 0.1115512084714251, "step": 690}, {"loss": 0.8433, "grad_norm": 0.397494912147522, "learning_rate": 0.0002, "epoch": 0.1131678926521704, "step": 700}, {"loss": 0.827, "grad_norm": 0.396431028842926, "learning_rate": 0.0002, "epoch": 0.1147845768329157, "step": 710}, {"loss": 0.8379, "grad_norm": 0.48842838406562805, "learning_rate": 0.0002, "epoch": 0.11640126101366098, "step": 720}, {"loss": 0.8238, "grad_norm": 0.46322616934776306, "learning_rate": 0.0002, "epoch": 0.11801794519440627, "step": 730}, {"loss": 0.8041, "grad_norm": 0.47990912199020386, "learning_rate": 0.0002, "epoch": 0.11963462937515157, "step": 740}, {"loss": 0.82, "grad_norm": 0.4997142255306244, "learning_rate": 0.0002, "epoch": 0.12125131355589686, "step": 750}, {"loss": 0.7702, "grad_norm": 0.4040526747703552, "learning_rate": 0.0002, "epoch": 0.12286799773664214, "step": 760}, {"loss": 0.863, "grad_norm": 0.453095942735672, "learning_rate": 0.0002, "epoch": 0.12448468191738744, "step": 770}, {"loss": 0.8792, "grad_norm": 0.4636971950531006, "learning_rate": 0.0002, "epoch": 0.12610136609813272, "step": 780}, {"loss": 0.8112, "grad_norm": 0.4279276132583618, "learning_rate": 0.0002, "epoch": 0.12771805027887803, "step": 790}, {"loss": 0.8711, "grad_norm": 0.46212655305862427, "learning_rate": 0.0002, "epoch": 0.12933473445962332, "step": 800}, {"loss": 0.8368, "grad_norm": 0.43127650022506714, "learning_rate": 0.0002, "epoch": 0.1309514186403686, "step": 810}, {"loss": 0.8476, "grad_norm": 0.4201301336288452, "learning_rate": 0.0002, "epoch": 0.1325681028211139, "step": 820}, {"loss": 0.8078, "grad_norm": 0.42583167552948, "learning_rate": 0.0002, "epoch": 0.13418478700185918, "step": 830}, {"loss": 0.8219, "grad_norm": 0.4535622000694275, "learning_rate": 0.0002, "epoch": 0.13580147118260447, "step": 840}, {"loss": 0.8423, "grad_norm": 0.4116036891937256, "learning_rate": 0.0002, "epoch": 0.13741815536334978, "step": 850}, {"loss": 0.8466, "grad_norm": 0.45997580885887146, "learning_rate": 0.0002, "epoch": 0.13903483954409507, "step": 860}, {"loss": 0.8917, "grad_norm": 0.4487837255001068, "learning_rate": 0.0002, "epoch": 0.14065152372484035, "step": 870}, {"loss": 0.8217, "grad_norm": 0.43650057911872864, "learning_rate": 0.0002, "epoch": 0.14226820790558564, "step": 880}, {"loss": 0.8178, "grad_norm": 0.5335358381271362, "learning_rate": 0.0002, "epoch": 0.14388489208633093, "step": 890}, {"loss": 0.7957, "grad_norm": 0.5989000201225281, "learning_rate": 0.0002, "epoch": 0.14550157626707622, "step": 900}, {"loss": 0.8385, "grad_norm": 0.517179012298584, "learning_rate": 0.0002, "epoch": 0.14711826044782153, "step": 910}, {"loss": 0.8255, "grad_norm": 0.44435232877731323, "learning_rate": 0.0002, "epoch": 0.14873494462856682, "step": 920}, {"loss": 0.8305, "grad_norm": 0.42635923624038696, "learning_rate": 0.0002, "epoch": 0.1503516288093121, "step": 930}, {"loss": 0.8043, "grad_norm": 0.49603334069252014, "learning_rate": 0.0002, "epoch": 0.1519683129900574, "step": 940}, {"loss": 0.8377, "grad_norm": 0.40639808773994446, "learning_rate": 0.0002, "epoch": 0.15358499717080268, "step": 950}, {"loss": 0.8529, "grad_norm": 0.4850759208202362, "learning_rate": 0.0002, "epoch": 0.15520168135154797, "step": 960}, {"loss": 0.846, "grad_norm": 0.4427442252635956, "learning_rate": 0.0002, "epoch": 0.15681836553229328, "step": 970}, {"loss": 0.8705, "grad_norm": 0.3760930001735687, "learning_rate": 0.0002, "epoch": 0.15843504971303857, "step": 980}, {"loss": 0.8644, "grad_norm": 0.4794144332408905, "learning_rate": 0.0002, "epoch": 0.16005173389378385, "step": 990}, {"loss": 0.8002, "grad_norm": 0.45828768610954285, "learning_rate": 0.0002, "epoch": 0.16166841807452914, "step": 1000}, {"loss": 0.7658, "grad_norm": 0.6313053369522095, "learning_rate": 0.0002, "epoch": 0.16328510225527443, "step": 1010}, {"loss": 0.8047, "grad_norm": 0.45041006803512573, "learning_rate": 0.0002, "epoch": 0.16490178643601971, "step": 1020}, {"loss": 0.8423, "grad_norm": 0.441403865814209, "learning_rate": 0.0002, "epoch": 0.166518470616765, "step": 1030}, {"loss": 0.8475, "grad_norm": 0.8171296119689941, "learning_rate": 0.0002, "epoch": 0.16813515479751032, "step": 1040}, {"loss": 0.845, "grad_norm": 0.7137420773506165, "learning_rate": 0.0002, "epoch": 0.1697518389782556, "step": 1050}, {"loss": 0.8213, "grad_norm": 0.5236809849739075, "learning_rate": 0.0002, "epoch": 0.1713685231590009, "step": 1060}, {"loss": 0.8265, "grad_norm": 0.5021864175796509, "learning_rate": 0.0002, "epoch": 0.17298520733974618, "step": 1070}, {"loss": 0.8305, "grad_norm": 0.47347521781921387, "learning_rate": 0.0002, "epoch": 0.17460189152049146, "step": 1080}, {"loss": 0.8105, "grad_norm": 0.4631653428077698, "learning_rate": 0.0002, "epoch": 0.17621857570123675, "step": 1090}, {"loss": 0.8166, "grad_norm": 0.49169182777404785, "learning_rate": 0.0002, "epoch": 0.17783525988198207, "step": 1100}, {"loss": 0.8012, "grad_norm": 0.5019739270210266, "learning_rate": 0.0002, "epoch": 0.17945194406272735, "step": 1110}, {"loss": 0.8247, "grad_norm": 0.5100422501564026, "learning_rate": 0.0002, "epoch": 0.18106862824347264, "step": 1120}, {"loss": 0.8142, "grad_norm": 0.3888324499130249, "learning_rate": 0.0002, "epoch": 0.18268531242421793, "step": 1130}, {"loss": 0.8533, "grad_norm": 0.39765217900276184, "learning_rate": 0.0002, "epoch": 0.18430199660496321, "step": 1140}, {"loss": 0.8541, "grad_norm": 0.47190186381340027, "learning_rate": 0.0002, "epoch": 0.1859186807857085, "step": 1150}, {"loss": 0.8301, "grad_norm": 0.4464188814163208, "learning_rate": 0.0002, "epoch": 0.18753536496645382, "step": 1160}, {"loss": 0.8341, "grad_norm": 0.5153930187225342, "learning_rate": 0.0002, "epoch": 0.1891520491471991, "step": 1170}, {"loss": 0.8033, "grad_norm": 0.4779708683490753, "learning_rate": 0.0002, "epoch": 0.1907687333279444, "step": 1180}, {"loss": 0.8187, "grad_norm": 0.4834315776824951, "learning_rate": 0.0002, "epoch": 0.19238541750868968, "step": 1190}, {"loss": 0.7721, "grad_norm": 0.402357816696167, "learning_rate": 0.0002, "epoch": 0.19400210168943496, "step": 1200}, {"loss": 0.7941, "grad_norm": 0.45899084210395813, "learning_rate": 0.0002, "epoch": 0.19561878587018025, "step": 1210}, {"loss": 0.8353, "grad_norm": 0.5106529593467712, "learning_rate": 0.0002, "epoch": 0.19723547005092557, "step": 1220}, {"loss": 0.7816, "grad_norm": 0.45261722803115845, "learning_rate": 0.0002, "epoch": 0.19885215423167085, "step": 1230}, {"loss": 0.8068, "grad_norm": 0.4647127091884613, "learning_rate": 0.0002, "epoch": 0.20046883841241614, "step": 1240}, {"loss": 0.8239, "grad_norm": 0.4849368929862976, "learning_rate": 0.0002, "epoch": 0.20208552259316143, "step": 1250}, {"loss": 0.8514, "grad_norm": 0.4518061578273773, "learning_rate": 0.0002, "epoch": 0.2037022067739067, "step": 1260}, {"loss": 0.8158, "grad_norm": 0.49535325169563293, "learning_rate": 0.0002, "epoch": 0.205318890954652, "step": 1270}, {"loss": 0.8348, "grad_norm": 0.4835205376148224, "learning_rate": 0.0002, "epoch": 0.2069355751353973, "step": 1280}, {"loss": 0.8428, "grad_norm": 0.45308539271354675, "learning_rate": 0.0002, "epoch": 0.2085522593161426, "step": 1290}, {"loss": 0.7993, "grad_norm": 0.5369905233383179, "learning_rate": 0.0002, "epoch": 0.2101689434968879, "step": 1300}, {"loss": 0.8676, "grad_norm": 0.5031622052192688, "learning_rate": 0.0002, "epoch": 0.21178562767763318, "step": 1310}, {"loss": 0.7686, "grad_norm": 0.48010334372520447, "learning_rate": 0.0002, "epoch": 0.21340231185837846, "step": 1320}, {"loss": 0.806, "grad_norm": 0.4905701279640198, "learning_rate": 0.0002, "epoch": 0.21501899603912375, "step": 1330}, {"loss": 0.7885, "grad_norm": 0.43531742691993713, "learning_rate": 0.0002, "epoch": 0.21663568021986904, "step": 1340}, {"loss": 0.8191, "grad_norm": 0.44330692291259766, "learning_rate": 0.0002, "epoch": 0.21825236440061435, "step": 1350}, {"loss": 0.8205, "grad_norm": 0.5384416580200195, "learning_rate": 0.0002, "epoch": 0.21986904858135964, "step": 1360}, {"loss": 0.7726, "grad_norm": 0.4181833863258362, "learning_rate": 0.0002, "epoch": 0.22148573276210493, "step": 1370}, {"loss": 0.8311, "grad_norm": 0.523833692073822, "learning_rate": 0.0002, "epoch": 0.2231024169428502, "step": 1380}, {"loss": 0.7913, "grad_norm": 0.5528736710548401, "learning_rate": 0.0002, "epoch": 0.2247191011235955, "step": 1390}, {"loss": 0.8079, "grad_norm": 0.43515023589134216, "learning_rate": 0.0002, "epoch": 0.2263357853043408, "step": 1400}, {"loss": 0.8403, "grad_norm": 0.48809877038002014, "learning_rate": 0.0002, "epoch": 0.2279524694850861, "step": 1410}, {"loss": 0.8165, "grad_norm": 0.43591251969337463, "learning_rate": 0.0002, "epoch": 0.2295691536658314, "step": 1420}, {"loss": 0.8147, "grad_norm": 0.44625312089920044, "learning_rate": 0.0002, "epoch": 0.23118583784657668, "step": 1430}, {"loss": 0.8134, "grad_norm": 0.4390665292739868, "learning_rate": 0.0002, "epoch": 0.23280252202732196, "step": 1440}, {"loss": 0.8465, "grad_norm": 0.48496049642562866, "learning_rate": 0.0002, "epoch": 0.23441920620806725, "step": 1450}, {"loss": 0.775, "grad_norm": 0.45919957756996155, "learning_rate": 0.0002, "epoch": 0.23603589038881254, "step": 1460}, {"loss": 0.8659, "grad_norm": 0.5471845865249634, "learning_rate": 0.0002, "epoch": 0.23765257456955785, "step": 1470}, {"loss": 0.8164, "grad_norm": 0.47269317507743835, "learning_rate": 0.0002, "epoch": 0.23926925875030314, "step": 1480}, {"loss": 0.854, "grad_norm": 0.4930245578289032, "learning_rate": 0.0002, "epoch": 0.24088594293104842, "step": 1490}, {"loss": 0.8139, "grad_norm": 0.5605630278587341, "learning_rate": 0.0002, "epoch": 0.2425026271117937, "step": 1500}, {"loss": 0.8125, "grad_norm": 0.4435870945453644, "learning_rate": 0.0002, "epoch": 0.244119311292539, "step": 1510}, {"loss": 0.8123, "grad_norm": 0.4941999912261963, "learning_rate": 0.0002, "epoch": 0.24573599547328429, "step": 1520}, {"loss": 0.8427, "grad_norm": 0.5100624561309814, "learning_rate": 0.0002, "epoch": 0.24735267965402957, "step": 1530}, {"loss": 0.8405, "grad_norm": 0.4638267457485199, "learning_rate": 0.0002, "epoch": 0.2489693638347749, "step": 1540}, {"loss": 0.81, "grad_norm": 0.5071570873260498, "learning_rate": 0.0002, "epoch": 0.25058604801552015, "step": 1550}, {"loss": 0.7724, "grad_norm": 0.4291319251060486, "learning_rate": 0.0002, "epoch": 0.25220273219626543, "step": 1560}, {"loss": 0.7984, "grad_norm": 0.5388049483299255, "learning_rate": 0.0002, "epoch": 0.2538194163770108, "step": 1570}, {"loss": 0.8176, "grad_norm": 0.5083683729171753, "learning_rate": 0.0002, "epoch": 0.25543610055775606, "step": 1580}, {"loss": 0.843, "grad_norm": 0.4824463725090027, "learning_rate": 0.0002, "epoch": 0.25705278473850135, "step": 1590}, {"loss": 0.7996, "grad_norm": 0.41177722811698914, "learning_rate": 0.0002, "epoch": 0.25866946891924664, "step": 1600}, {"loss": 0.7772, "grad_norm": 0.5656219124794006, "learning_rate": 0.0002, "epoch": 0.2602861530999919, "step": 1610}, {"loss": 0.7955, "grad_norm": 0.41063204407691956, "learning_rate": 0.0002, "epoch": 0.2619028372807372, "step": 1620}, {"loss": 0.7998, "grad_norm": 0.4897061288356781, "learning_rate": 0.0002, "epoch": 0.2635195214614825, "step": 1630}, {"loss": 0.8198, "grad_norm": 0.4454376697540283, "learning_rate": 0.0002, "epoch": 0.2651362056422278, "step": 1640}, {"loss": 0.8684, "grad_norm": 0.4355238378047943, "learning_rate": 0.0002, "epoch": 0.26675288982297307, "step": 1650}, {"loss": 0.7801, "grad_norm": 0.458310067653656, "learning_rate": 0.0002, "epoch": 0.26836957400371836, "step": 1660}, {"loss": 0.7935, "grad_norm": 0.4752083718776703, "learning_rate": 0.0002, "epoch": 0.26998625818446365, "step": 1670}, {"loss": 0.8267, "grad_norm": 0.4666106402873993, "learning_rate": 0.0002, "epoch": 0.27160294236520893, "step": 1680}, {"loss": 0.8252, "grad_norm": 0.4213818609714508, "learning_rate": 0.0002, "epoch": 0.2732196265459543, "step": 1690}, {"loss": 0.8559, "grad_norm": 0.5768913626670837, "learning_rate": 0.0002, "epoch": 0.27483631072669956, "step": 1700}, {"loss": 0.7931, "grad_norm": 0.4209914803504944, "learning_rate": 0.0002, "epoch": 0.27645299490744485, "step": 1710}, {"loss": 0.8167, "grad_norm": 0.501909613609314, "learning_rate": 0.0002, "epoch": 0.27806967908819014, "step": 1720}, {"loss": 0.7832, "grad_norm": 0.5266261100769043, "learning_rate": 0.0002, "epoch": 0.2796863632689354, "step": 1730}, {"loss": 0.8102, "grad_norm": 0.43806859850883484, "learning_rate": 0.0002, "epoch": 0.2813030474496807, "step": 1740}, {"loss": 0.8157, "grad_norm": 0.46048814058303833, "learning_rate": 0.0002, "epoch": 0.282919731630426, "step": 1750}, {"loss": 0.8596, "grad_norm": 0.44972819089889526, "learning_rate": 0.0002, "epoch": 0.2845364158111713, "step": 1760}, {"loss": 0.8421, "grad_norm": 0.5114831328392029, "learning_rate": 0.0002, "epoch": 0.28615309999191657, "step": 1770}, {"loss": 0.8361, "grad_norm": 0.47931742668151855, "learning_rate": 0.0002, "epoch": 0.28776978417266186, "step": 1780}, {"loss": 0.8265, "grad_norm": 0.5092599987983704, "learning_rate": 0.0002, "epoch": 0.28938646835340714, "step": 1790}, {"loss": 0.8506, "grad_norm": 0.37581443786621094, "learning_rate": 0.0002, "epoch": 0.29100315253415243, "step": 1800}, {"loss": 0.7932, "grad_norm": 0.47097381949424744, "learning_rate": 0.0002, "epoch": 0.2926198367148977, "step": 1810}, {"loss": 0.7787, "grad_norm": 0.48300236463546753, "learning_rate": 0.0002, "epoch": 0.29423652089564306, "step": 1820}, {"loss": 0.8391, "grad_norm": 0.5600419640541077, "learning_rate": 0.0002, "epoch": 0.29585320507638835, "step": 1830}, {"loss": 0.8507, "grad_norm": 0.48555272817611694, "learning_rate": 0.0002, "epoch": 0.29746988925713364, "step": 1840}, {"loss": 0.7657, "grad_norm": 0.3752668499946594, "learning_rate": 0.0002, "epoch": 0.2990865734378789, "step": 1850}, {"loss": 0.7915, "grad_norm": 0.5328747034072876, "learning_rate": 0.0002, "epoch": 0.3007032576186242, "step": 1860}, {"loss": 0.8426, "grad_norm": 0.48716455698013306, "learning_rate": 0.0002, "epoch": 0.3023199417993695, "step": 1870}, {"loss": 0.8335, "grad_norm": 0.5011493563652039, "learning_rate": 0.0002, "epoch": 0.3039366259801148, "step": 1880}, {"loss": 0.852, "grad_norm": 0.46461427211761475, "learning_rate": 0.0002, "epoch": 0.30555331016086007, "step": 1890}, {"loss": 0.8478, "grad_norm": 0.36630210280418396, "learning_rate": 0.0002, "epoch": 0.30716999434160536, "step": 1900}, {"loss": 0.8162, "grad_norm": 0.4217296242713928, "learning_rate": 0.0002, "epoch": 0.30878667852235064, "step": 1910}, {"loss": 0.8128, "grad_norm": 0.4394875466823578, "learning_rate": 0.0002, "epoch": 0.31040336270309593, "step": 1920}, {"loss": 0.8471, "grad_norm": 0.6587965488433838, "learning_rate": 0.0002, "epoch": 0.3120200468838412, "step": 1930}, {"loss": 0.8565, "grad_norm": 0.5469298958778381, "learning_rate": 0.0002, "epoch": 0.31363673106458656, "step": 1940}, {"loss": 0.8236, "grad_norm": 0.4371595084667206, "learning_rate": 0.0002, "epoch": 0.31525341524533185, "step": 1950}, {"loss": 0.887, "grad_norm": 0.4809541404247284, "learning_rate": 0.0002, "epoch": 0.31687009942607713, "step": 1960}, {"loss": 0.7855, "grad_norm": 0.6061086654663086, "learning_rate": 0.0002, "epoch": 0.3184867836068224, "step": 1970}, {"loss": 0.7679, "grad_norm": 0.5342657566070557, "learning_rate": 0.0002, "epoch": 0.3201034677875677, "step": 1980}, {"loss": 0.7955, "grad_norm": 0.5057743787765503, "learning_rate": 0.0002, "epoch": 0.321720151968313, "step": 1990}, {"loss": 0.7774, "grad_norm": 0.528626024723053, "learning_rate": 0.0002, "epoch": 0.3233368361490583, "step": 2000}, {"loss": 0.8845, "grad_norm": 0.46742770075798035, "learning_rate": 0.0002, "epoch": 0.32495352032980357, "step": 2010}, {"loss": 0.8484, "grad_norm": 0.515101432800293, "learning_rate": 0.0002, "epoch": 0.32657020451054886, "step": 2020}, {"loss": 0.8139, "grad_norm": 0.41941216588020325, "learning_rate": 0.0002, "epoch": 0.32818688869129414, "step": 2030}, {"loss": 0.7637, "grad_norm": 0.49902522563934326, "learning_rate": 0.0002, "epoch": 0.32980357287203943, "step": 2040}, {"loss": 0.7822, "grad_norm": 0.4120897650718689, "learning_rate": 0.0002, "epoch": 0.3314202570527847, "step": 2050}, {"loss": 0.8057, "grad_norm": 0.45352041721343994, "learning_rate": 0.0002, "epoch": 0.33303694123353, "step": 2060}, {"loss": 0.7913, "grad_norm": 0.523199737071991, "learning_rate": 0.0002, "epoch": 0.33465362541427535, "step": 2070}, {"loss": 0.8036, "grad_norm": 0.4390358626842499, "learning_rate": 0.0002, "epoch": 0.33627030959502063, "step": 2080}, {"loss": 0.8145, "grad_norm": 0.6752901077270508, "learning_rate": 0.0002, "epoch": 0.3378869937757659, "step": 2090}, {"loss": 0.7807, "grad_norm": 0.547821044921875, "learning_rate": 0.0002, "epoch": 0.3395036779565112, "step": 2100}, {"loss": 0.8561, "grad_norm": 0.5161308646202087, "learning_rate": 0.0002, "epoch": 0.3411203621372565, "step": 2110}, {"loss": 0.7697, "grad_norm": 0.4565401077270508, "learning_rate": 0.0002, "epoch": 0.3427370463180018, "step": 2120}, {"loss": 0.7964, "grad_norm": 0.4666115939617157, "learning_rate": 0.0002, "epoch": 0.34435373049874707, "step": 2130}, {"loss": 0.8189, "grad_norm": 0.4090428352355957, "learning_rate": 0.0002, "epoch": 0.34597041467949236, "step": 2140}, {"loss": 0.8817, "grad_norm": 0.510845422744751, "learning_rate": 0.0002, "epoch": 0.34758709886023764, "step": 2150}, {"loss": 0.8398, "grad_norm": 0.42861923575401306, "learning_rate": 0.0002, "epoch": 0.34920378304098293, "step": 2160}, {"loss": 0.7716, "grad_norm": 0.4476332664489746, "learning_rate": 0.0002, "epoch": 0.3508204672217282, "step": 2170}, {"loss": 0.7845, "grad_norm": 0.6065791249275208, "learning_rate": 0.0002, "epoch": 0.3524371514024735, "step": 2180}, {"loss": 0.8187, "grad_norm": 0.42335066199302673, "learning_rate": 0.0002, "epoch": 0.35405383558321885, "step": 2190}, {"loss": 0.8239, "grad_norm": 0.5094629526138306, "learning_rate": 0.0002, "epoch": 0.35567051976396413, "step": 2200}, {"loss": 0.7807, "grad_norm": 0.5476373434066772, "learning_rate": 0.0002, "epoch": 0.3572872039447094, "step": 2210}, {"loss": 0.814, "grad_norm": 0.3911719024181366, "learning_rate": 0.0002, "epoch": 0.3589038881254547, "step": 2220}, {"loss": 0.8599, "grad_norm": 0.6599636077880859, "learning_rate": 0.0002, "epoch": 0.3605205723062, "step": 2230}, {"loss": 0.7482, "grad_norm": 0.40381914377212524, "learning_rate": 0.0002, "epoch": 0.3621372564869453, "step": 2240}, {"loss": 0.7772, "grad_norm": 0.4433908462524414, "learning_rate": 0.0002, "epoch": 0.36375394066769057, "step": 2250}, {"loss": 0.8503, "grad_norm": 0.578326940536499, "learning_rate": 0.0002, "epoch": 0.36537062484843585, "step": 2260}, {"loss": 0.8178, "grad_norm": 0.5734784007072449, "learning_rate": 0.0002, "epoch": 0.36698730902918114, "step": 2270}, {"loss": 0.8193, "grad_norm": 0.45555487275123596, "learning_rate": 0.0002, "epoch": 0.36860399320992643, "step": 2280}, {"loss": 0.7929, "grad_norm": 0.5666276216506958, "learning_rate": 0.0002, "epoch": 0.3702206773906717, "step": 2290}, {"loss": 0.8292, "grad_norm": 0.5461117625236511, "learning_rate": 0.0002, "epoch": 0.371837361571417, "step": 2300}, {"loss": 0.8204, "grad_norm": 0.6318911910057068, "learning_rate": 0.0002, "epoch": 0.3734540457521623, "step": 2310}, {"loss": 0.7964, "grad_norm": 0.493263304233551, "learning_rate": 0.0002, "epoch": 0.37507072993290763, "step": 2320}, {"loss": 0.8339, "grad_norm": 0.5888760089874268, "learning_rate": 0.0002, "epoch": 0.3766874141136529, "step": 2330}, {"loss": 0.7737, "grad_norm": 0.48671841621398926, "learning_rate": 0.0002, "epoch": 0.3783040982943982, "step": 2340}, {"loss": 0.8367, "grad_norm": 0.4385145306587219, "learning_rate": 0.0002, "epoch": 0.3799207824751435, "step": 2350}, {"loss": 0.812, "grad_norm": 0.5523318648338318, "learning_rate": 0.0002, "epoch": 0.3815374666558888, "step": 2360}, {"loss": 0.8351, "grad_norm": 0.7308220267295837, "learning_rate": 0.0002, "epoch": 0.38315415083663407, "step": 2370}, {"loss": 0.859, "grad_norm": 0.554214358329773, "learning_rate": 0.0002, "epoch": 0.38477083501737935, "step": 2380}, {"loss": 0.8146, "grad_norm": 0.5425800085067749, "learning_rate": 0.0002, "epoch": 0.38638751919812464, "step": 2390}, {"loss": 0.8282, "grad_norm": 0.48811158537864685, "learning_rate": 0.0002, "epoch": 0.3880042033788699, "step": 2400}, {"loss": 0.8074, "grad_norm": 0.49212366342544556, "learning_rate": 0.0002, "epoch": 0.3896208875596152, "step": 2410}, {"loss": 0.7991, "grad_norm": 0.5222218632698059, "learning_rate": 0.0002, "epoch": 0.3912375717403605, "step": 2420}, {"loss": 0.8182, "grad_norm": 0.4699819087982178, "learning_rate": 0.0002, "epoch": 0.3928542559211058, "step": 2430}, {"loss": 0.7919, "grad_norm": 0.46153587102890015, "learning_rate": 0.0002, "epoch": 0.39447094010185113, "step": 2440}, {"loss": 0.8111, "grad_norm": 0.4150611162185669, "learning_rate": 0.0002, "epoch": 0.3960876242825964, "step": 2450}, {"loss": 0.8589, "grad_norm": 0.5799614787101746, "learning_rate": 0.0002, "epoch": 0.3977043084633417, "step": 2460}, {"loss": 0.8085, "grad_norm": 0.56536865234375, "learning_rate": 0.0002, "epoch": 0.399320992644087, "step": 2470}, {"loss": 0.8022, "grad_norm": 0.5451247096061707, "learning_rate": 0.0002, "epoch": 0.4009376768248323, "step": 2480}, {"loss": 0.8217, "grad_norm": 0.5914521217346191, "learning_rate": 0.0002, "epoch": 0.40255436100557757, "step": 2490}, {"loss": 0.7859, "grad_norm": 0.4428117275238037, "learning_rate": 0.0002, "epoch": 0.40417104518632285, "step": 2500}, {"loss": 0.8054, "grad_norm": 0.48580947518348694, "learning_rate": 0.0002, "epoch": 0.40578772936706814, "step": 2510}, {"loss": 0.8405, "grad_norm": 0.436734676361084, "learning_rate": 0.0002, "epoch": 0.4074044135478134, "step": 2520}, {"loss": 0.8209, "grad_norm": 0.5752223134040833, "learning_rate": 0.0002, "epoch": 0.4090210977285587, "step": 2530}, {"loss": 0.8181, "grad_norm": 0.4271308183670044, "learning_rate": 0.0002, "epoch": 0.410637781909304, "step": 2540}, {"loss": 0.8058, "grad_norm": 0.46294718980789185, "learning_rate": 0.0002, "epoch": 0.4122544660900493, "step": 2550}, {"loss": 0.8473, "grad_norm": 0.49407583475112915, "learning_rate": 0.0002, "epoch": 0.4138711502707946, "step": 2560}, {"loss": 0.7881, "grad_norm": 0.4729035496711731, "learning_rate": 0.0002, "epoch": 0.4154878344515399, "step": 2570}, {"loss": 0.7834, "grad_norm": 0.4129747152328491, "learning_rate": 0.0002, "epoch": 0.4171045186322852, "step": 2580}, {"loss": 0.7859, "grad_norm": 0.5684236288070679, "learning_rate": 0.0002, "epoch": 0.4187212028130305, "step": 2590}, {"loss": 0.811, "grad_norm": 0.4862157106399536, "learning_rate": 0.0002, "epoch": 0.4203378869937758, "step": 2600}, {"loss": 0.7582, "grad_norm": 0.46567976474761963, "learning_rate": 0.0002, "epoch": 0.42195457117452106, "step": 2610}, {"loss": 0.7755, "grad_norm": 0.5710650682449341, "learning_rate": 0.0002, "epoch": 0.42357125535526635, "step": 2620}, {"loss": 0.8573, "grad_norm": 0.5660041570663452, "learning_rate": 0.0002, "epoch": 0.42518793953601164, "step": 2630}, {"loss": 0.7812, "grad_norm": 0.47944375872612, "learning_rate": 0.0002, "epoch": 0.4268046237167569, "step": 2640}, {"loss": 0.7459, "grad_norm": 0.537223756313324, "learning_rate": 0.0002, "epoch": 0.4284213078975022, "step": 2650}, {"loss": 0.8246, "grad_norm": 0.41669997572898865, "learning_rate": 0.0002, "epoch": 0.4300379920782475, "step": 2660}, {"loss": 0.7785, "grad_norm": 0.44727686047554016, "learning_rate": 0.0002, "epoch": 0.4316546762589928, "step": 2670}, {"loss": 0.8241, "grad_norm": 0.5600888729095459, "learning_rate": 0.0002, "epoch": 0.4332713604397381, "step": 2680}, {"loss": 0.7708, "grad_norm": 0.39820605516433716, "learning_rate": 0.0002, "epoch": 0.4348880446204834, "step": 2690}, {"loss": 0.8202, "grad_norm": 0.5637655854225159, "learning_rate": 0.0002, "epoch": 0.4365047288012287, "step": 2700}, {"loss": 0.855, "grad_norm": 0.6363666653633118, "learning_rate": 0.0002, "epoch": 0.438121412981974, "step": 2710}, {"loss": 0.8468, "grad_norm": 0.5656129121780396, "learning_rate": 0.0002, "epoch": 0.4397380971627193, "step": 2720}, {"loss": 0.7845, "grad_norm": 0.5600156188011169, "learning_rate": 0.0002, "epoch": 0.44135478134346456, "step": 2730}, {"loss": 0.8405, "grad_norm": 0.5506579875946045, "learning_rate": 0.0002, "epoch": 0.44297146552420985, "step": 2740}, {"loss": 0.7725, "grad_norm": 0.49878305196762085, "learning_rate": 0.0002, "epoch": 0.44458814970495514, "step": 2750}, {"loss": 0.8292, "grad_norm": 0.4569213092327118, "learning_rate": 0.0002, "epoch": 0.4462048338857004, "step": 2760}, {"loss": 0.8028, "grad_norm": 0.6056680083274841, "learning_rate": 0.0002, "epoch": 0.4478215180664457, "step": 2770}, {"loss": 0.8242, "grad_norm": 0.44474557042121887, "learning_rate": 0.0002, "epoch": 0.449438202247191, "step": 2780}, {"loss": 0.801, "grad_norm": 0.46055394411087036, "learning_rate": 0.0002, "epoch": 0.4510548864279363, "step": 2790}, {"loss": 0.7521, "grad_norm": 0.4904133379459381, "learning_rate": 0.0002, "epoch": 0.4526715706086816, "step": 2800}, {"loss": 0.8829, "grad_norm": 0.5647031664848328, "learning_rate": 0.0002, "epoch": 0.45428825478942686, "step": 2810}, {"loss": 0.8622, "grad_norm": 0.5759473443031311, "learning_rate": 0.0002, "epoch": 0.4559049389701722, "step": 2820}, {"loss": 0.7812, "grad_norm": 0.5161895751953125, "learning_rate": 0.0002, "epoch": 0.4575216231509175, "step": 2830}, {"loss": 0.8045, "grad_norm": 0.4248254597187042, "learning_rate": 0.0002, "epoch": 0.4591383073316628, "step": 2840}, {"loss": 0.7838, "grad_norm": 0.45395001769065857, "learning_rate": 0.0002, "epoch": 0.46075499151240806, "step": 2850}, {"loss": 0.8208, "grad_norm": 0.5358697772026062, "learning_rate": 0.0002, "epoch": 0.46237167569315335, "step": 2860}, {"loss": 0.8147, "grad_norm": 0.5379165410995483, "learning_rate": 0.0002, "epoch": 0.46398835987389864, "step": 2870}, {"loss": 0.7403, "grad_norm": 0.4601989686489105, "learning_rate": 0.0002, "epoch": 0.4656050440546439, "step": 2880}, {"loss": 0.8523, "grad_norm": 0.671115517616272, "learning_rate": 0.0002, "epoch": 0.4672217282353892, "step": 2890}, {"loss": 0.8262, "grad_norm": 0.4425133168697357, "learning_rate": 0.0002, "epoch": 0.4688384124161345, "step": 2900}, {"loss": 0.8178, "grad_norm": 0.5446155071258545, "learning_rate": 0.0002, "epoch": 0.4704550965968798, "step": 2910}, {"loss": 0.8106, "grad_norm": 0.603306233882904, "learning_rate": 0.0002, "epoch": 0.47207178077762507, "step": 2920}, {"loss": 0.8044, "grad_norm": 0.5377997159957886, "learning_rate": 0.0002, "epoch": 0.47368846495837036, "step": 2930}, {"loss": 0.8075, "grad_norm": 0.4931027591228485, "learning_rate": 0.0002, "epoch": 0.4753051491391157, "step": 2940}, {"loss": 0.8004, "grad_norm": 0.4711960256099701, "learning_rate": 0.0002, "epoch": 0.476921833319861, "step": 2950}, {"loss": 0.8121, "grad_norm": 0.5020492672920227, "learning_rate": 0.0002, "epoch": 0.4785385175006063, "step": 2960}, {"loss": 0.8221, "grad_norm": 0.5428946614265442, "learning_rate": 0.0002, "epoch": 0.48015520168135156, "step": 2970}, {"loss": 0.7849, "grad_norm": 0.5294089317321777, "learning_rate": 0.0002, "epoch": 0.48177188586209685, "step": 2980}, {"loss": 0.8553, "grad_norm": 0.648289144039154, "learning_rate": 0.0002, "epoch": 0.48338857004284214, "step": 2990}, {"loss": 0.7874, "grad_norm": 0.47916680574417114, "learning_rate": 0.0002, "epoch": 0.4850052542235874, "step": 3000}, {"loss": 0.8087, "grad_norm": 0.43849772214889526, "learning_rate": 0.0002, "epoch": 0.4866219384043327, "step": 3010}, {"loss": 0.7662, "grad_norm": 0.47007861733436584, "learning_rate": 0.0002, "epoch": 0.488238622585078, "step": 3020}, {"loss": 0.757, "grad_norm": 0.6314331293106079, "learning_rate": 0.0002, "epoch": 0.4898553067658233, "step": 3030}, {"loss": 0.7863, "grad_norm": 0.49211493134498596, "learning_rate": 0.0002, "epoch": 0.49147199094656857, "step": 3040}, {"loss": 0.8335, "grad_norm": 0.4537973403930664, "learning_rate": 0.0002, "epoch": 0.49308867512731386, "step": 3050}, {"loss": 0.8095, "grad_norm": 0.47326919436454773, "learning_rate": 0.0002, "epoch": 0.49470535930805914, "step": 3060}, {"loss": 0.8447, "grad_norm": 0.525874137878418, "learning_rate": 0.0002, "epoch": 0.4963220434888045, "step": 3070}, {"loss": 0.8339, "grad_norm": 0.6361091732978821, "learning_rate": 0.0002, "epoch": 0.4979387276695498, "step": 3080}, {"loss": 0.821, "grad_norm": 0.5850642919540405, "learning_rate": 0.0002, "epoch": 0.49955541185029506, "step": 3090}, {"loss": 0.8279, "grad_norm": 0.47299543023109436, "learning_rate": 0.0002, "epoch": 0.5011720960310403, "step": 3100}, {"loss": 0.8681, "grad_norm": 0.473099946975708, "learning_rate": 0.0002, "epoch": 0.5027887802117856, "step": 3110}, {"loss": 0.8223, "grad_norm": 0.48186397552490234, "learning_rate": 0.0002, "epoch": 0.5044054643925309, "step": 3120}, {"loss": 0.8292, "grad_norm": 0.5015401840209961, "learning_rate": 0.0002, "epoch": 0.5060221485732762, "step": 3130}, {"loss": 0.7692, "grad_norm": 0.5617750287055969, "learning_rate": 0.0002, "epoch": 0.5076388327540216, "step": 3140}, {"loss": 0.8708, "grad_norm": 0.5169327259063721, "learning_rate": 0.0002, "epoch": 0.5092555169347668, "step": 3150}, {"loss": 0.7845, "grad_norm": 0.545657753944397, "learning_rate": 0.0002, "epoch": 0.5108722011155121, "step": 3160}, {"loss": 0.799, "grad_norm": 0.512864351272583, "learning_rate": 0.0002, "epoch": 0.5124888852962574, "step": 3170}, {"loss": 0.7794, "grad_norm": 0.4113546311855316, "learning_rate": 0.0002, "epoch": 0.5141055694770027, "step": 3180}, {"loss": 0.8206, "grad_norm": 0.44532445073127747, "learning_rate": 0.0002, "epoch": 0.5157222536577479, "step": 3190}, {"loss": 0.8213, "grad_norm": 0.5623497366905212, "learning_rate": 0.0002, "epoch": 0.5173389378384933, "step": 3200}, {"loss": 0.7928, "grad_norm": 0.5084741115570068, "learning_rate": 0.0002, "epoch": 0.5189556220192385, "step": 3210}, {"loss": 0.8174, "grad_norm": 0.5305403470993042, "learning_rate": 0.0002, "epoch": 0.5205723061999838, "step": 3220}, {"loss": 0.8139, "grad_norm": 0.4708254337310791, "learning_rate": 0.0002, "epoch": 0.5221889903807291, "step": 3230}, {"loss": 0.7639, "grad_norm": 0.43827131390571594, "learning_rate": 0.0002, "epoch": 0.5238056745614744, "step": 3240}, {"loss": 0.7993, "grad_norm": 0.5630002617835999, "learning_rate": 0.0002, "epoch": 0.5254223587422197, "step": 3250}, {"loss": 0.7522, "grad_norm": 0.5010961890220642, "learning_rate": 0.0002, "epoch": 0.527039042922965, "step": 3260}, {"loss": 0.8374, "grad_norm": 0.6303122043609619, "learning_rate": 0.0002, "epoch": 0.5286557271037103, "step": 3270}, {"loss": 0.7727, "grad_norm": 0.5107331275939941, "learning_rate": 0.0002, "epoch": 0.5302724112844556, "step": 3280}, {"loss": 0.8495, "grad_norm": 0.5700443387031555, "learning_rate": 0.0002, "epoch": 0.5318890954652009, "step": 3290}, {"loss": 0.7776, "grad_norm": 0.46296367049217224, "learning_rate": 0.0002, "epoch": 0.5335057796459461, "step": 3300}, {"loss": 0.7931, "grad_norm": 0.531568706035614, "learning_rate": 0.0002, "epoch": 0.5351224638266915, "step": 3310}, {"loss": 0.843, "grad_norm": 0.4686741530895233, "learning_rate": 0.0002, "epoch": 0.5367391480074367, "step": 3320}, {"loss": 0.8104, "grad_norm": 0.5404331088066101, "learning_rate": 0.0002, "epoch": 0.5383558321881821, "step": 3330}, {"loss": 0.7686, "grad_norm": 0.6368790864944458, "learning_rate": 0.0002, "epoch": 0.5399725163689273, "step": 3340}, {"loss": 0.8514, "grad_norm": 0.42300888895988464, "learning_rate": 0.0002, "epoch": 0.5415892005496726, "step": 3350}, {"loss": 0.8236, "grad_norm": 0.5362542867660522, "learning_rate": 0.0002, "epoch": 0.5432058847304179, "step": 3360}, {"loss": 0.858, "grad_norm": 0.497128963470459, "learning_rate": 0.0002, "epoch": 0.5448225689111632, "step": 3370}, {"loss": 0.8519, "grad_norm": 0.5006386041641235, "learning_rate": 0.0002, "epoch": 0.5464392530919085, "step": 3380}, {"loss": 0.7867, "grad_norm": 0.44136837124824524, "learning_rate": 0.0002, "epoch": 0.5480559372726538, "step": 3390}, {"loss": 0.773, "grad_norm": 0.5897833108901978, "learning_rate": 0.0002, "epoch": 0.5496726214533991, "step": 3400}, {"loss": 0.8895, "grad_norm": 0.641075611114502, "learning_rate": 0.0002, "epoch": 0.5512893056341444, "step": 3410}, {"loss": 0.7827, "grad_norm": 0.7251322269439697, "learning_rate": 0.0002, "epoch": 0.5529059898148897, "step": 3420}, {"loss": 0.7626, "grad_norm": 0.47411349415779114, "learning_rate": 0.0002, "epoch": 0.5545226739956349, "step": 3430}, {"loss": 0.8196, "grad_norm": 0.4994310438632965, "learning_rate": 0.0002, "epoch": 0.5561393581763803, "step": 3440}, {"loss": 0.7812, "grad_norm": 0.5814438462257385, "learning_rate": 0.0002, "epoch": 0.5577560423571255, "step": 3450}, {"loss": 0.8805, "grad_norm": 0.6278898119926453, "learning_rate": 0.0002, "epoch": 0.5593727265378708, "step": 3460}, {"loss": 0.813, "grad_norm": 0.46208274364471436, "learning_rate": 0.0002, "epoch": 0.5609894107186161, "step": 3470}, {"loss": 0.8295, "grad_norm": 0.5718930959701538, "learning_rate": 0.0002, "epoch": 0.5626060948993614, "step": 3480}, {"loss": 0.8152, "grad_norm": 0.48178744316101074, "learning_rate": 0.0002, "epoch": 0.5642227790801067, "step": 3490}, {"loss": 0.8244, "grad_norm": 0.47336965799331665, "learning_rate": 0.0002, "epoch": 0.565839463260852, "step": 3500}, {"loss": 0.8099, "grad_norm": 0.43442684412002563, "learning_rate": 0.0002, "epoch": 0.5674561474415973, "step": 3510}, {"loss": 0.7564, "grad_norm": 0.6463358998298645, "learning_rate": 0.0002, "epoch": 0.5690728316223426, "step": 3520}, {"loss": 0.836, "grad_norm": 0.5286486744880676, "learning_rate": 0.0002, "epoch": 0.5706895158030879, "step": 3530}, {"loss": 0.8421, "grad_norm": 0.5405499935150146, "learning_rate": 0.0002, "epoch": 0.5723061999838331, "step": 3540}, {"loss": 0.7614, "grad_norm": 0.6654391884803772, "learning_rate": 0.0002, "epoch": 0.5739228841645785, "step": 3550}, {"loss": 0.7803, "grad_norm": 0.5081980228424072, "learning_rate": 0.0002, "epoch": 0.5755395683453237, "step": 3560}, {"loss": 0.7753, "grad_norm": 0.48978179693222046, "learning_rate": 0.0002, "epoch": 0.5771562525260691, "step": 3570}, {"loss": 0.8151, "grad_norm": 0.5840612053871155, "learning_rate": 0.0002, "epoch": 0.5787729367068143, "step": 3580}, {"loss": 0.8937, "grad_norm": 0.5235261917114258, "learning_rate": 0.0002, "epoch": 0.5803896208875596, "step": 3590}, {"loss": 0.7894, "grad_norm": 0.5672075748443604, "learning_rate": 0.0002, "epoch": 0.5820063050683049, "step": 3600}, {"loss": 0.8347, "grad_norm": 0.5613429546356201, "learning_rate": 0.0002, "epoch": 0.5836229892490502, "step": 3610}, {"loss": 0.8274, "grad_norm": 0.4032273590564728, "learning_rate": 0.0002, "epoch": 0.5852396734297954, "step": 3620}, {"loss": 0.8421, "grad_norm": 0.49559324979782104, "learning_rate": 0.0002, "epoch": 0.5868563576105408, "step": 3630}, {"loss": 0.8332, "grad_norm": 0.6895697712898254, "learning_rate": 0.0002, "epoch": 0.5884730417912861, "step": 3640}, {"loss": 0.7877, "grad_norm": 0.4750136435031891, "learning_rate": 0.0002, "epoch": 0.5900897259720314, "step": 3650}, {"loss": 0.8219, "grad_norm": 0.5176819562911987, "learning_rate": 0.0002, "epoch": 0.5917064101527767, "step": 3660}, {"loss": 0.8151, "grad_norm": 0.5817760229110718, "learning_rate": 0.0002, "epoch": 0.5933230943335219, "step": 3670}, {"loss": 0.7823, "grad_norm": 0.6064626574516296, "learning_rate": 0.0002, "epoch": 0.5949397785142673, "step": 3680}, {"loss": 0.8422, "grad_norm": 0.6728700995445251, "learning_rate": 0.0002, "epoch": 0.5965564626950125, "step": 3690}, {"loss": 0.7679, "grad_norm": 0.609305202960968, "learning_rate": 0.0002, "epoch": 0.5981731468757578, "step": 3700}, {"loss": 0.8048, "grad_norm": 0.4615488350391388, "learning_rate": 0.0002, "epoch": 0.5997898310565031, "step": 3710}, {"loss": 0.8214, "grad_norm": 2.0531179904937744, "learning_rate": 0.0002, "epoch": 0.6014065152372484, "step": 3720}, {"loss": 0.8158, "grad_norm": 0.5091132521629333, "learning_rate": 0.0002, "epoch": 0.6030231994179936, "step": 3730}, {"loss": 0.7833, "grad_norm": 0.5951124429702759, "learning_rate": 0.0002, "epoch": 0.604639883598739, "step": 3740}, {"loss": 0.7784, "grad_norm": 0.5870208144187927, "learning_rate": 0.0002, "epoch": 0.6062565677794842, "step": 3750}, {"loss": 0.8044, "grad_norm": 0.6254619359970093, "learning_rate": 0.0002, "epoch": 0.6078732519602296, "step": 3760}, {"loss": 0.7868, "grad_norm": 0.5577626824378967, "learning_rate": 0.0002, "epoch": 0.6094899361409749, "step": 3770}, {"loss": 0.8108, "grad_norm": 0.5004405379295349, "learning_rate": 0.0002, "epoch": 0.6111066203217201, "step": 3780}, {"loss": 0.8092, "grad_norm": 0.5527383685112, "learning_rate": 0.0002, "epoch": 0.6127233045024655, "step": 3790}, {"loss": 0.8036, "grad_norm": 0.49116113781929016, "learning_rate": 0.0002, "epoch": 0.6143399886832107, "step": 3800}, {"loss": 0.8352, "grad_norm": 0.5299299359321594, "learning_rate": 0.0002, "epoch": 0.6159566728639561, "step": 3810}, {"loss": 0.7737, "grad_norm": 0.464897483587265, "learning_rate": 0.0002, "epoch": 0.6175733570447013, "step": 3820}, {"loss": 0.7923, "grad_norm": 0.6505740880966187, "learning_rate": 0.0002, "epoch": 0.6191900412254466, "step": 3830}, {"loss": 0.8123, "grad_norm": 0.5512559413909912, "learning_rate": 0.0002, "epoch": 0.6208067254061919, "step": 3840}, {"loss": 0.8856, "grad_norm": 0.49427518248558044, "learning_rate": 0.0002, "epoch": 0.6224234095869372, "step": 3850}, {"loss": 0.7751, "grad_norm": 0.3839147090911865, "learning_rate": 0.0002, "epoch": 0.6240400937676824, "step": 3860}, {"loss": 0.8006, "grad_norm": 0.5760218501091003, "learning_rate": 0.0002, "epoch": 0.6256567779484278, "step": 3870}, {"loss": 0.7836, "grad_norm": 0.7226507067680359, "learning_rate": 0.0002, "epoch": 0.6272734621291731, "step": 3880}, {"loss": 0.8244, "grad_norm": 0.676781415939331, "learning_rate": 0.0002, "epoch": 0.6288901463099184, "step": 3890}, {"loss": 0.8239, "grad_norm": 0.4284018278121948, "learning_rate": 0.0002, "epoch": 0.6305068304906637, "step": 3900}, {"loss": 0.7996, "grad_norm": 0.5060628056526184, "learning_rate": 0.0002, "epoch": 0.6321235146714089, "step": 3910}, {"loss": 0.8089, "grad_norm": 0.5524522066116333, "learning_rate": 0.0002, "epoch": 0.6337401988521543, "step": 3920}, {"loss": 0.8276, "grad_norm": 0.6099881529808044, "learning_rate": 0.0002, "epoch": 0.6353568830328995, "step": 3930}, {"loss": 0.809, "grad_norm": 0.43155938386917114, "learning_rate": 0.0002, "epoch": 0.6369735672136448, "step": 3940}, {"loss": 0.8404, "grad_norm": 0.6427084803581238, "learning_rate": 0.0002, "epoch": 0.6385902513943901, "step": 3950}, {"loss": 0.8368, "grad_norm": 0.541220486164093, "learning_rate": 0.0002, "epoch": 0.6402069355751354, "step": 3960}, {"loss": 0.8539, "grad_norm": 0.5414294600486755, "learning_rate": 0.0002, "epoch": 0.6418236197558806, "step": 3970}, {"loss": 0.7996, "grad_norm": 0.46344003081321716, "learning_rate": 0.0002, "epoch": 0.643440303936626, "step": 3980}, {"loss": 0.7474, "grad_norm": 0.45209285616874695, "learning_rate": 0.0002, "epoch": 0.6450569881173712, "step": 3990}, {"loss": 0.8202, "grad_norm": 0.5417284369468689, "learning_rate": 0.0002, "epoch": 0.6466736722981166, "step": 4000}, {"loss": 0.7563, "grad_norm": 0.7995685935020447, "learning_rate": 0.0002, "epoch": 0.6482903564788619, "step": 4010}, {"loss": 0.7812, "grad_norm": 0.6384002566337585, "learning_rate": 0.0002, "epoch": 0.6499070406596071, "step": 4020}, {"loss": 0.732, "grad_norm": 0.4472815692424774, "learning_rate": 0.0002, "epoch": 0.6515237248403525, "step": 4030}, {"loss": 0.8071, "grad_norm": 0.6834294199943542, "learning_rate": 0.0002, "epoch": 0.6531404090210977, "step": 4040}, {"loss": 0.7812, "grad_norm": 0.4612339735031128, "learning_rate": 0.0002, "epoch": 0.654757093201843, "step": 4050}, {"loss": 0.8141, "grad_norm": 0.9266576170921326, "learning_rate": 0.0002, "epoch": 0.6563737773825883, "step": 4060}, {"loss": 0.7991, "grad_norm": 0.4470861852169037, "learning_rate": 0.0002, "epoch": 0.6579904615633336, "step": 4070}, {"loss": 0.8293, "grad_norm": 0.45544925332069397, "learning_rate": 0.0002, "epoch": 0.6596071457440789, "step": 4080}, {"loss": 0.8455, "grad_norm": 0.6144481301307678, "learning_rate": 0.0002, "epoch": 0.6612238299248242, "step": 4090}, {"loss": 0.7877, "grad_norm": 0.5936288237571716, "learning_rate": 0.0002, "epoch": 0.6628405141055694, "step": 4100}, {"loss": 0.7617, "grad_norm": 0.4822963774204254, "learning_rate": 0.0002, "epoch": 0.6644571982863148, "step": 4110}, {"loss": 0.7997, "grad_norm": 0.48432496190071106, "learning_rate": 0.0002, "epoch": 0.66607388246706, "step": 4120}, {"loss": 0.8404, "grad_norm": 0.4901607930660248, "learning_rate": 0.0002, "epoch": 0.6676905666478054, "step": 4130}, {"loss": 0.8085, "grad_norm": 0.5018393397331238, "learning_rate": 0.0002, "epoch": 0.6693072508285507, "step": 4140}, {"loss": 0.8065, "grad_norm": 0.6946378946304321, "learning_rate": 0.0002, "epoch": 0.6709239350092959, "step": 4150}, {"loss": 0.8147, "grad_norm": 0.5997390747070312, "learning_rate": 0.0002, "epoch": 0.6725406191900413, "step": 4160}, {"loss": 0.8268, "grad_norm": 0.6738849878311157, "learning_rate": 0.0002, "epoch": 0.6741573033707865, "step": 4170}, {"loss": 0.7704, "grad_norm": 0.6110581159591675, "learning_rate": 0.0002, "epoch": 0.6757739875515318, "step": 4180}, {"loss": 0.8043, "grad_norm": 0.5703322291374207, "learning_rate": 0.0002, "epoch": 0.6773906717322771, "step": 4190}, {"loss": 0.8099, "grad_norm": 0.4686066210269928, "learning_rate": 0.0002, "epoch": 0.6790073559130224, "step": 4200}, {"loss": 0.8441, "grad_norm": 0.6394643783569336, "learning_rate": 0.0002, "epoch": 0.6806240400937676, "step": 4210}, {"loss": 0.8011, "grad_norm": 0.5454841256141663, "learning_rate": 0.0002, "epoch": 0.682240724274513, "step": 4220}, {"loss": 0.8307, "grad_norm": 0.4859732985496521, "learning_rate": 0.0002, "epoch": 0.6838574084552582, "step": 4230}, {"loss": 0.8161, "grad_norm": 0.5544065833091736, "learning_rate": 0.0002, "epoch": 0.6854740926360036, "step": 4240}, {"loss": 0.7839, "grad_norm": 0.4902505576610565, "learning_rate": 0.0002, "epoch": 0.6870907768167488, "step": 4250}, {"loss": 0.7977, "grad_norm": 0.4768051505088806, "learning_rate": 0.0002, "epoch": 0.6887074609974941, "step": 4260}, {"loss": 0.7539, "grad_norm": 0.49982190132141113, "learning_rate": 0.0002, "epoch": 0.6903241451782395, "step": 4270}, {"loss": 0.7353, "grad_norm": 0.6351838111877441, "learning_rate": 0.0002, "epoch": 0.6919408293589847, "step": 4280}, {"loss": 0.7664, "grad_norm": 0.5647561550140381, "learning_rate": 0.0002, "epoch": 0.69355751353973, "step": 4290}, {"loss": 0.7618, "grad_norm": 0.5340486764907837, "learning_rate": 0.0002, "epoch": 0.6951741977204753, "step": 4300}, {"loss": 0.8526, "grad_norm": 0.5649092793464661, "learning_rate": 0.0002, "epoch": 0.6967908819012206, "step": 4310}, {"loss": 0.8246, "grad_norm": 0.6183916926383972, "learning_rate": 0.0002, "epoch": 0.6984075660819659, "step": 4320}, {"loss": 0.792, "grad_norm": 0.6154509782791138, "learning_rate": 0.0002, "epoch": 0.7000242502627112, "step": 4330}, {"loss": 0.8397, "grad_norm": 0.5156264305114746, "learning_rate": 0.0002, "epoch": 0.7016409344434564, "step": 4340}, {"loss": 0.8512, "grad_norm": 0.562171459197998, "learning_rate": 0.0002, "epoch": 0.7032576186242018, "step": 4350}, {"loss": 0.7882, "grad_norm": 0.4949502646923065, "learning_rate": 0.0002, "epoch": 0.704874302804947, "step": 4360}, {"loss": 0.738, "grad_norm": 0.5171684622764587, "learning_rate": 0.0002, "epoch": 0.7064909869856923, "step": 4370}, {"loss": 0.8001, "grad_norm": 0.6198443174362183, "learning_rate": 0.0002, "epoch": 0.7081076711664377, "step": 4380}, {"loss": 0.7606, "grad_norm": 0.5802276134490967, "learning_rate": 0.0002, "epoch": 0.7097243553471829, "step": 4390}, {"loss": 0.8797, "grad_norm": 0.41096967458724976, "learning_rate": 0.0002, "epoch": 0.7113410395279283, "step": 4400}, {"loss": 0.805, "grad_norm": 0.4397392272949219, "learning_rate": 0.0002, "epoch": 0.7129577237086735, "step": 4410}, {"loss": 0.7651, "grad_norm": 0.45228442549705505, "learning_rate": 0.0002, "epoch": 0.7145744078894188, "step": 4420}, {"loss": 0.7938, "grad_norm": 0.4839673936367035, "learning_rate": 0.0002, "epoch": 0.7161910920701641, "step": 4430}, {"loss": 0.8362, "grad_norm": 0.6140755414962769, "learning_rate": 0.0002, "epoch": 0.7178077762509094, "step": 4440}, {"loss": 0.7722, "grad_norm": 0.6841378808021545, "learning_rate": 0.0002, "epoch": 0.7194244604316546, "step": 4450}, {"loss": 0.8177, "grad_norm": 0.6664239168167114, "learning_rate": 0.0002, "epoch": 0.7210411446124, "step": 4460}, {"loss": 0.7983, "grad_norm": 0.47552719712257385, "learning_rate": 0.0002, "epoch": 0.7226578287931452, "step": 4470}, {"loss": 0.8982, "grad_norm": 0.6649776101112366, "learning_rate": 0.0002, "epoch": 0.7242745129738906, "step": 4480}, {"loss": 0.8074, "grad_norm": 0.5159541964530945, "learning_rate": 0.0002, "epoch": 0.7258911971546358, "step": 4490}, {"loss": 0.7786, "grad_norm": 0.6693112850189209, "learning_rate": 0.0002, "epoch": 0.7275078813353811, "step": 4500}, {"loss": 0.8655, "grad_norm": 0.48870977759361267, "learning_rate": 0.0002, "epoch": 0.7291245655161265, "step": 4510}, {"loss": 0.7337, "grad_norm": 0.4857887923717499, "learning_rate": 0.0002, "epoch": 0.7307412496968717, "step": 4520}, {"loss": 0.8026, "grad_norm": 0.5515662431716919, "learning_rate": 0.0002, "epoch": 0.732357933877617, "step": 4530}, {"loss": 0.8031, "grad_norm": 0.6292222738265991, "learning_rate": 0.0002, "epoch": 0.7339746180583623, "step": 4540}, {"loss": 0.7749, "grad_norm": 0.48265689611434937, "learning_rate": 0.0002, "epoch": 0.7355913022391076, "step": 4550}, {"loss": 0.8499, "grad_norm": 0.8044266104698181, "learning_rate": 0.0002, "epoch": 0.7372079864198529, "step": 4560}, {"loss": 0.8162, "grad_norm": 0.6111769676208496, "learning_rate": 0.0002, "epoch": 0.7388246706005982, "step": 4570}, {"loss": 0.7291, "grad_norm": 0.5229553580284119, "learning_rate": 0.0002, "epoch": 0.7404413547813434, "step": 4580}, {"loss": 0.8038, "grad_norm": 0.6054152250289917, "learning_rate": 0.0002, "epoch": 0.7420580389620888, "step": 4590}, {"loss": 0.8169, "grad_norm": 0.5574966669082642, "learning_rate": 0.0002, "epoch": 0.743674723142834, "step": 4600}, {"loss": 0.8439, "grad_norm": 0.5395817160606384, "learning_rate": 0.0002, "epoch": 0.7452914073235793, "step": 4610}, {"loss": 0.8495, "grad_norm": 0.7116472721099854, "learning_rate": 0.0002, "epoch": 0.7469080915043246, "step": 4620}, {"loss": 0.7743, "grad_norm": 0.5618700981140137, "learning_rate": 0.0002, "epoch": 0.7485247756850699, "step": 4630}, {"loss": 0.7744, "grad_norm": 0.5802770853042603, "learning_rate": 0.0002, "epoch": 0.7501414598658153, "step": 4640}, {"loss": 0.7924, "grad_norm": 0.5690428018569946, "learning_rate": 0.0002, "epoch": 0.7517581440465605, "step": 4650}, {"loss": 0.8017, "grad_norm": 0.4813360273838043, "learning_rate": 0.0002, "epoch": 0.7533748282273058, "step": 4660}, {"loss": 0.8108, "grad_norm": 0.5434042811393738, "learning_rate": 0.0002, "epoch": 0.7549915124080511, "step": 4670}, {"loss": 0.7824, "grad_norm": 0.5502099990844727, "learning_rate": 0.0002, "epoch": 0.7566081965887964, "step": 4680}, {"loss": 0.8598, "grad_norm": 0.6020621061325073, "learning_rate": 0.0002, "epoch": 0.7582248807695416, "step": 4690}, {"loss": 0.7937, "grad_norm": 0.4922301471233368, "learning_rate": 0.0002, "epoch": 0.759841564950287, "step": 4700}, {"loss": 0.788, "grad_norm": 0.6492828726768494, "learning_rate": 0.0002, "epoch": 0.7614582491310322, "step": 4710}, {"loss": 0.8313, "grad_norm": 0.4865580201148987, "learning_rate": 0.0002, "epoch": 0.7630749333117776, "step": 4720}, {"loss": 0.7966, "grad_norm": 0.5971422791481018, "learning_rate": 0.0002, "epoch": 0.7646916174925228, "step": 4730}, {"loss": 0.8298, "grad_norm": 0.6832674145698547, "learning_rate": 0.0002, "epoch": 0.7663083016732681, "step": 4740}, {"loss": 0.8156, "grad_norm": 0.500908613204956, "learning_rate": 0.0002, "epoch": 0.7679249858540134, "step": 4750}, {"loss": 0.8383, "grad_norm": 0.6112465858459473, "learning_rate": 0.0002, "epoch": 0.7695416700347587, "step": 4760}, {"loss": 0.76, "grad_norm": 0.5753506422042847, "learning_rate": 0.0002, "epoch": 0.771158354215504, "step": 4770}, {"loss": 0.8297, "grad_norm": 0.6529405117034912, "learning_rate": 0.0002, "epoch": 0.7727750383962493, "step": 4780}, {"loss": 0.8171, "grad_norm": 0.5916843414306641, "learning_rate": 0.0002, "epoch": 0.7743917225769946, "step": 4790}, {"loss": 0.83, "grad_norm": 0.4821224510669708, "learning_rate": 0.0002, "epoch": 0.7760084067577399, "step": 4800}, {"loss": 0.7703, "grad_norm": 0.5532580018043518, "learning_rate": 0.0002, "epoch": 0.7776250909384852, "step": 4810}, {"loss": 0.7363, "grad_norm": 0.4604877233505249, "learning_rate": 0.0002, "epoch": 0.7792417751192304, "step": 4820}, {"loss": 0.7506, "grad_norm": 0.5009613037109375, "learning_rate": 0.0002, "epoch": 0.7808584592999758, "step": 4830}, {"loss": 0.7863, "grad_norm": 0.6448560357093811, "learning_rate": 0.0002, "epoch": 0.782475143480721, "step": 4840}, {"loss": 0.7957, "grad_norm": 0.44327953457832336, "learning_rate": 0.0002, "epoch": 0.7840918276614663, "step": 4850}, {"loss": 0.7925, "grad_norm": 0.5355411171913147, "learning_rate": 0.0002, "epoch": 0.7857085118422116, "step": 4860}, {"loss": 0.7754, "grad_norm": 0.5635677576065063, "learning_rate": 0.0002, "epoch": 0.7873251960229569, "step": 4870}, {"loss": 0.7931, "grad_norm": 0.5417491793632507, "learning_rate": 0.0002, "epoch": 0.7889418802037023, "step": 4880}, {"loss": 0.7819, "grad_norm": 0.4567430913448334, "learning_rate": 0.0002, "epoch": 0.7905585643844475, "step": 4890}, {"loss": 0.8454, "grad_norm": 0.44651296734809875, "learning_rate": 0.0002, "epoch": 0.7921752485651928, "step": 4900}, {"loss": 0.7959, "grad_norm": 0.5741217136383057, "learning_rate": 0.0002, "epoch": 0.7937919327459381, "step": 4910}, {"loss": 0.8093, "grad_norm": 0.6605045199394226, "learning_rate": 0.0002, "epoch": 0.7954086169266834, "step": 4920}, {"loss": 0.77, "grad_norm": 0.5126531720161438, "learning_rate": 0.0002, "epoch": 0.7970253011074286, "step": 4930}, {"loss": 0.7793, "grad_norm": 0.513648271560669, "learning_rate": 0.0002, "epoch": 0.798641985288174, "step": 4940}, {"loss": 0.8314, "grad_norm": 0.5350404381752014, "learning_rate": 0.0002, "epoch": 0.8002586694689192, "step": 4950}, {"loss": 0.7649, "grad_norm": 0.5731674432754517, "learning_rate": 0.0002, "epoch": 0.8018753536496646, "step": 4960}, {"loss": 0.8572, "grad_norm": 0.5974258184432983, "learning_rate": 0.0002, "epoch": 0.8034920378304098, "step": 4970}, {"loss": 0.7972, "grad_norm": 0.8774799704551697, "learning_rate": 0.0002, "epoch": 0.8051087220111551, "step": 4980}, {"loss": 0.7899, "grad_norm": 0.5994430184364319, "learning_rate": 0.0002, "epoch": 0.8067254061919004, "step": 4990}, {"loss": 0.7736, "grad_norm": 0.4894903004169464, "learning_rate": 0.0002, "epoch": 0.8083420903726457, "step": 5000}, {"loss": 0.78, "grad_norm": 0.5218459367752075, "learning_rate": 0.0002, "epoch": 0.809958774553391, "step": 5010}, {"loss": 0.817, "grad_norm": 0.5232468843460083, "learning_rate": 0.0002, "epoch": 0.8115754587341363, "step": 5020}, {"loss": 0.7704, "grad_norm": 0.44358372688293457, "learning_rate": 0.0002, "epoch": 0.8131921429148816, "step": 5030}, {"loss": 0.785, "grad_norm": 0.6202037334442139, "learning_rate": 0.0002, "epoch": 0.8148088270956269, "step": 5040}, {"loss": 0.7351, "grad_norm": 0.7721474170684814, "learning_rate": 0.0002, "epoch": 0.8164255112763722, "step": 5050}, {"loss": 0.8297, "grad_norm": 0.5568501353263855, "learning_rate": 0.0002, "epoch": 0.8180421954571174, "step": 5060}, {"loss": 0.7733, "grad_norm": 0.49148809909820557, "learning_rate": 0.0002, "epoch": 0.8196588796378628, "step": 5070}, {"loss": 0.8054, "grad_norm": 0.4956012964248657, "learning_rate": 0.0002, "epoch": 0.821275563818608, "step": 5080}, {"loss": 0.8201, "grad_norm": 0.6078833937644958, "learning_rate": 0.0002, "epoch": 0.8228922479993533, "step": 5090}, {"loss": 0.828, "grad_norm": 0.46906954050064087, "learning_rate": 0.0002, "epoch": 0.8245089321800986, "step": 5100}, {"loss": 0.7703, "grad_norm": 0.50812166929245, "learning_rate": 0.0002, "epoch": 0.8261256163608439, "step": 5110}, {"loss": 0.8243, "grad_norm": 0.5319661498069763, "learning_rate": 0.0002, "epoch": 0.8277423005415891, "step": 5120}, {"loss": 0.7798, "grad_norm": 0.4949689209461212, "learning_rate": 0.0002, "epoch": 0.8293589847223345, "step": 5130}, {"loss": 0.7428, "grad_norm": 0.5151591300964355, "learning_rate": 0.0002, "epoch": 0.8309756689030798, "step": 5140}, {"loss": 0.8147, "grad_norm": 0.5530214309692383, "learning_rate": 0.0002, "epoch": 0.8325923530838251, "step": 5150}, {"loss": 0.8251, "grad_norm": 0.6297410130500793, "learning_rate": 0.0002, "epoch": 0.8342090372645704, "step": 5160}, {"loss": 0.8067, "grad_norm": 0.5466840267181396, "learning_rate": 0.0002, "epoch": 0.8358257214453156, "step": 5170}, {"loss": 0.7875, "grad_norm": 0.652913510799408, "learning_rate": 0.0002, "epoch": 0.837442405626061, "step": 5180}, {"loss": 0.8295, "grad_norm": 0.5811293125152588, "learning_rate": 0.0002, "epoch": 0.8390590898068062, "step": 5190}, {"loss": 0.7412, "grad_norm": 0.5109550952911377, "learning_rate": 0.0002, "epoch": 0.8406757739875516, "step": 5200}, {"loss": 0.8077, "grad_norm": 0.4551706612110138, "learning_rate": 0.0002, "epoch": 0.8422924581682968, "step": 5210}, {"loss": 0.7827, "grad_norm": 0.5813754200935364, "learning_rate": 0.0002, "epoch": 0.8439091423490421, "step": 5220}, {"loss": 0.802, "grad_norm": 0.5856947898864746, "learning_rate": 0.0002, "epoch": 0.8455258265297874, "step": 5230}, {"loss": 0.7957, "grad_norm": 0.5482739210128784, "learning_rate": 0.0002, "epoch": 0.8471425107105327, "step": 5240}, {"loss": 0.8295, "grad_norm": 0.49023720622062683, "learning_rate": 0.0002, "epoch": 0.8487591948912779, "step": 5250}, {"loss": 0.8022, "grad_norm": 0.49472475051879883, "learning_rate": 0.0002, "epoch": 0.8503758790720233, "step": 5260}, {"loss": 0.8001, "grad_norm": 0.5490226745605469, "learning_rate": 0.0002, "epoch": 0.8519925632527686, "step": 5270}, {"loss": 0.8333, "grad_norm": 0.5340665578842163, "learning_rate": 0.0002, "epoch": 0.8536092474335139, "step": 5280}, {"loss": 0.8277, "grad_norm": 0.5962483882904053, "learning_rate": 0.0002, "epoch": 0.8552259316142592, "step": 5290}, {"loss": 0.8765, "grad_norm": 0.586358368396759, "learning_rate": 0.0002, "epoch": 0.8568426157950044, "step": 5300}, {"loss": 0.7831, "grad_norm": 0.49120277166366577, "learning_rate": 0.0002, "epoch": 0.8584592999757498, "step": 5310}, {"loss": 0.8162, "grad_norm": 0.5887332558631897, "learning_rate": 0.0002, "epoch": 0.860075984156495, "step": 5320}, {"loss": 0.7464, "grad_norm": 0.42496153712272644, "learning_rate": 0.0002, "epoch": 0.8616926683372403, "step": 5330}, {"loss": 0.7905, "grad_norm": 0.5489874482154846, "learning_rate": 0.0002, "epoch": 0.8633093525179856, "step": 5340}, {"loss": 0.7958, "grad_norm": 0.5850813984870911, "learning_rate": 0.0002, "epoch": 0.8649260366987309, "step": 5350}, {"loss": 0.7642, "grad_norm": 0.517487108707428, "learning_rate": 0.0002, "epoch": 0.8665427208794761, "step": 5360}, {"loss": 0.7801, "grad_norm": 0.5339142680168152, "learning_rate": 0.0002, "epoch": 0.8681594050602215, "step": 5370}, {"loss": 0.818, "grad_norm": 0.6236387491226196, "learning_rate": 0.0002, "epoch": 0.8697760892409668, "step": 5380}, {"loss": 0.7708, "grad_norm": 0.5752192735671997, "learning_rate": 0.0002, "epoch": 0.8713927734217121, "step": 5390}, {"loss": 0.8542, "grad_norm": 0.6724614500999451, "learning_rate": 0.0002, "epoch": 0.8730094576024574, "step": 5400}, {"loss": 0.7581, "grad_norm": 0.5280613303184509, "learning_rate": 0.0002, "epoch": 0.8746261417832026, "step": 5410}, {"loss": 0.8231, "grad_norm": 0.44033288955688477, "learning_rate": 0.0002, "epoch": 0.876242825963948, "step": 5420}, {"loss": 0.8839, "grad_norm": 0.5199708342552185, "learning_rate": 0.0002, "epoch": 0.8778595101446932, "step": 5430}, {"loss": 0.7852, "grad_norm": 0.46778348088264465, "learning_rate": 0.0002, "epoch": 0.8794761943254386, "step": 5440}, {"loss": 0.7834, "grad_norm": 0.4657754898071289, "learning_rate": 0.0002, "epoch": 0.8810928785061838, "step": 5450}, {"loss": 0.7799, "grad_norm": 0.5472902655601501, "learning_rate": 0.0002, "epoch": 0.8827095626869291, "step": 5460}, {"loss": 0.8253, "grad_norm": 0.4876766800880432, "learning_rate": 0.0002, "epoch": 0.8843262468676744, "step": 5470}, {"loss": 0.7906, "grad_norm": 0.5057248473167419, "learning_rate": 0.0002, "epoch": 0.8859429310484197, "step": 5480}, {"loss": 0.8124, "grad_norm": 0.4637320637702942, "learning_rate": 0.0002, "epoch": 0.8875596152291649, "step": 5490}, {"loss": 0.781, "grad_norm": 0.471955806016922, "learning_rate": 0.0002, "epoch": 0.8891762994099103, "step": 5500}, {"loss": 0.8057, "grad_norm": 0.5209813714027405, "learning_rate": 0.0002, "epoch": 0.8907929835906556, "step": 5510}, {"loss": 0.8106, "grad_norm": 0.6213834285736084, "learning_rate": 0.0002, "epoch": 0.8924096677714008, "step": 5520}, {"loss": 0.7787, "grad_norm": 0.5215408205986023, "learning_rate": 0.0002, "epoch": 0.8940263519521462, "step": 5530}, {"loss": 0.8174, "grad_norm": 0.580478310585022, "learning_rate": 0.0002, "epoch": 0.8956430361328914, "step": 5540}, {"loss": 0.8371, "grad_norm": 0.49102169275283813, "learning_rate": 0.0002, "epoch": 0.8972597203136368, "step": 5550}, {"loss": 0.7806, "grad_norm": 0.6043479442596436, "learning_rate": 0.0002, "epoch": 0.898876404494382, "step": 5560}, {"loss": 0.7754, "grad_norm": 0.5636463165283203, "learning_rate": 0.0002, "epoch": 0.9004930886751273, "step": 5570}, {"loss": 0.8145, "grad_norm": 0.5620124340057373, "learning_rate": 0.0002, "epoch": 0.9021097728558726, "step": 5580}, {"loss": 0.8083, "grad_norm": 0.5206354856491089, "learning_rate": 0.0002, "epoch": 0.9037264570366179, "step": 5590}, {"loss": 0.8557, "grad_norm": 0.5798229575157166, "learning_rate": 0.0002, "epoch": 0.9053431412173631, "step": 5600}, {"loss": 0.8097, "grad_norm": 0.6428212523460388, "learning_rate": 0.0002, "epoch": 0.9069598253981085, "step": 5610}, {"loss": 0.7839, "grad_norm": 0.48064687848091125, "learning_rate": 0.0002, "epoch": 0.9085765095788537, "step": 5620}, {"loss": 0.8343, "grad_norm": 0.6347860097885132, "learning_rate": 0.0002, "epoch": 0.9101931937595991, "step": 5630}, {"loss": 0.851, "grad_norm": 0.5353913307189941, "learning_rate": 0.0002, "epoch": 0.9118098779403444, "step": 5640}, {"loss": 0.7736, "grad_norm": 0.5323944091796875, "learning_rate": 0.0002, "epoch": 0.9134265621210896, "step": 5650}, {"loss": 0.8393, "grad_norm": 0.5261843204498291, "learning_rate": 0.0002, "epoch": 0.915043246301835, "step": 5660}, {"loss": 0.7355, "grad_norm": 0.5451326966285706, "learning_rate": 0.0002, "epoch": 0.9166599304825802, "step": 5670}, {"loss": 0.8012, "grad_norm": 0.5183324217796326, "learning_rate": 0.0002, "epoch": 0.9182766146633256, "step": 5680}, {"loss": 0.7659, "grad_norm": 0.47229018807411194, "learning_rate": 0.0002, "epoch": 0.9198932988440708, "step": 5690}, {"loss": 0.7757, "grad_norm": 0.49180513620376587, "learning_rate": 0.0002, "epoch": 0.9215099830248161, "step": 5700}, {"loss": 0.8735, "grad_norm": 0.5419785380363464, "learning_rate": 0.0002, "epoch": 0.9231266672055614, "step": 5710}, {"loss": 0.7378, "grad_norm": 0.5408698916435242, "learning_rate": 0.0002, "epoch": 0.9247433513863067, "step": 5720}, {"loss": 0.7701, "grad_norm": 0.5286232829093933, "learning_rate": 0.0002, "epoch": 0.9263600355670519, "step": 5730}, {"loss": 0.8242, "grad_norm": 0.7539758086204529, "learning_rate": 0.0002, "epoch": 0.9279767197477973, "step": 5740}, {"loss": 0.8118, "grad_norm": 0.5166944861412048, "learning_rate": 0.0002, "epoch": 0.9295934039285425, "step": 5750}, {"loss": 0.783, "grad_norm": 0.6601425409317017, "learning_rate": 0.0002, "epoch": 0.9312100881092878, "step": 5760}, {"loss": 0.7873, "grad_norm": 0.5029960870742798, "learning_rate": 0.0002, "epoch": 0.9328267722900332, "step": 5770}, {"loss": 0.7989, "grad_norm": 0.4926645755767822, "learning_rate": 0.0002, "epoch": 0.9344434564707784, "step": 5780}, {"loss": 0.8174, "grad_norm": 0.5739615559577942, "learning_rate": 0.0002, "epoch": 0.9360601406515238, "step": 5790}, {"loss": 0.8037, "grad_norm": 0.5058279037475586, "learning_rate": 0.0002, "epoch": 0.937676824832269, "step": 5800}, {"loss": 0.8537, "grad_norm": 0.5260962247848511, "learning_rate": 0.0002, "epoch": 0.9392935090130143, "step": 5810}, {"loss": 0.7486, "grad_norm": 0.5768588185310364, "learning_rate": 0.0002, "epoch": 0.9409101931937596, "step": 5820}, {"loss": 0.8215, "grad_norm": 0.5170126557350159, "learning_rate": 0.0002, "epoch": 0.9425268773745049, "step": 5830}, {"loss": 0.7422, "grad_norm": 0.5745864510536194, "learning_rate": 0.0002, "epoch": 0.9441435615552501, "step": 5840}, {"loss": 0.7824, "grad_norm": 0.5551357865333557, "learning_rate": 0.0002, "epoch": 0.9457602457359955, "step": 5850}, {"loss": 0.8529, "grad_norm": 0.5776078701019287, "learning_rate": 0.0002, "epoch": 0.9473769299167407, "step": 5860}, {"loss": 0.8527, "grad_norm": 0.5340062379837036, "learning_rate": 0.0002, "epoch": 0.9489936140974861, "step": 5870}, {"loss": 0.8217, "grad_norm": 0.6447290182113647, "learning_rate": 0.0002, "epoch": 0.9506102982782314, "step": 5880}, {"loss": 0.7945, "grad_norm": 0.5123815536499023, "learning_rate": 0.0002, "epoch": 0.9522269824589766, "step": 5890}, {"loss": 0.8209, "grad_norm": 0.48547613620758057, "learning_rate": 0.0002, "epoch": 0.953843666639722, "step": 5900}, {"loss": 0.7896, "grad_norm": 0.5791414976119995, "learning_rate": 0.0002, "epoch": 0.9554603508204672, "step": 5910}, {"loss": 0.8408, "grad_norm": 0.6195011734962463, "learning_rate": 0.0002, "epoch": 0.9570770350012126, "step": 5920}, {"loss": 0.7805, "grad_norm": 0.6323803067207336, "learning_rate": 0.0002, "epoch": 0.9586937191819578, "step": 5930}, {"loss": 0.8484, "grad_norm": 0.45552879571914673, "learning_rate": 0.0002, "epoch": 0.9603104033627031, "step": 5940}, {"loss": 0.7367, "grad_norm": 0.5796473622322083, "learning_rate": 0.0002, "epoch": 0.9619270875434484, "step": 5950}, {"loss": 0.7672, "grad_norm": 0.647261381149292, "learning_rate": 0.0002, "epoch": 0.9635437717241937, "step": 5960}, {"loss": 0.8086, "grad_norm": 0.5487682819366455, "learning_rate": 0.0002, "epoch": 0.9651604559049389, "step": 5970}, {"loss": 0.7973, "grad_norm": 0.5743663907051086, "learning_rate": 0.0002, "epoch": 0.9667771400856843, "step": 5980}, {"loss": 0.8153, "grad_norm": 0.5470591187477112, "learning_rate": 0.0002, "epoch": 0.9683938242664295, "step": 5990}, {"loss": 0.8119, "grad_norm": 0.5901660323143005, "learning_rate": 0.0002, "epoch": 0.9700105084471748, "step": 6000}, {"loss": 0.8147, "grad_norm": 0.6544759273529053, "learning_rate": 0.0002, "epoch": 0.9716271926279202, "step": 6010}, {"loss": 0.7536, "grad_norm": 0.6288470029830933, "learning_rate": 0.0002, "epoch": 0.9732438768086654, "step": 6020}, {"loss": 0.7989, "grad_norm": 0.673153817653656, "learning_rate": 0.0002, "epoch": 0.9748605609894108, "step": 6030}, {"loss": 0.7556, "grad_norm": 0.42854753136634827, "learning_rate": 0.0002, "epoch": 0.976477245170156, "step": 6040}, {"loss": 0.8006, "grad_norm": 0.5227066278457642, "learning_rate": 0.0002, "epoch": 0.9780939293509013, "step": 6050}, {"loss": 0.795, "grad_norm": 0.5372416973114014, "learning_rate": 0.0002, "epoch": 0.9797106135316466, "step": 6060}, {"loss": 0.7591, "grad_norm": 0.6026402115821838, "learning_rate": 0.0002, "epoch": 0.9813272977123919, "step": 6070}, {"loss": 0.8347, "grad_norm": 0.49547791481018066, "learning_rate": 0.0002, "epoch": 0.9829439818931371, "step": 6080}, {"loss": 0.7722, "grad_norm": 0.4641951322555542, "learning_rate": 0.0002, "epoch": 0.9845606660738825, "step": 6090}, {"loss": 0.8125, "grad_norm": 0.5818535089492798, "learning_rate": 0.0002, "epoch": 0.9861773502546277, "step": 6100}, {"loss": 0.81, "grad_norm": 0.63955157995224, "learning_rate": 0.0002, "epoch": 0.9877940344353731, "step": 6110}, {"loss": 0.7547, "grad_norm": 0.5649438500404358, "learning_rate": 0.0002, "epoch": 0.9894107186161183, "step": 6120}, {"loss": 0.7861, "grad_norm": 0.5290433168411255, "learning_rate": 0.0002, "epoch": 0.9910274027968636, "step": 6130}, {"loss": 0.8109, "grad_norm": 0.6399374008178711, "learning_rate": 0.0002, "epoch": 0.992644086977609, "step": 6140}, {"loss": 0.8373, "grad_norm": 0.6736576557159424, "learning_rate": 0.0002, "epoch": 0.9942607711583542, "step": 6150}, {"loss": 0.7915, "grad_norm": 0.515420138835907, "learning_rate": 0.0002, "epoch": 0.9958774553390995, "step": 6160}, {"loss": 0.8032, "grad_norm": 0.562677800655365, "learning_rate": 0.0002, "epoch": 0.9974941395198448, "step": 6170}, {"loss": 0.8187, "grad_norm": 0.7113858461380005, "learning_rate": 0.0002, "epoch": 0.9991108237005901, "step": 6180}, {"eval_loss": 1.0871200561523438, "eval_runtime": 122.2071, "eval_samples_per_second": 5.998, "eval_steps_per_second": 0.753, "epoch": 0.9999191657909627, "step": 6185}, {"loss": 0.7507, "grad_norm": 0.7111801505088806, "learning_rate": 0.0002, "epoch": 1.0007275078813354, "step": 6190}, {"loss": 0.6865, "grad_norm": 0.5402125716209412, "learning_rate": 0.0002, "epoch": 1.0023441920620806, "step": 6200}, {"loss": 0.7625, "grad_norm": 0.6098830103874207, "learning_rate": 0.0002, "epoch": 1.003960876242826, "step": 6210}, {"loss": 0.7631, "grad_norm": 0.5829983353614807, "learning_rate": 0.0002, "epoch": 1.0055775604235713, "step": 6220}, {"loss": 0.7188, "grad_norm": 0.5614621043205261, "learning_rate": 0.0002, "epoch": 1.0071942446043165, "step": 6230}, {"loss": 0.7505, "grad_norm": 0.5954238772392273, "learning_rate": 0.0002, "epoch": 1.0088109287850617, "step": 6240}, {"loss": 0.7448, "grad_norm": 0.6480574607849121, "learning_rate": 0.0002, "epoch": 1.0104276129658072, "step": 6250}, {"loss": 0.7514, "grad_norm": 0.6051128506660461, "learning_rate": 0.0002, "epoch": 1.0120442971465524, "step": 6260}, {"loss": 0.7237, "grad_norm": 0.6318870782852173, "learning_rate": 0.0002, "epoch": 1.0136609813272976, "step": 6270}, {"loss": 0.7178, "grad_norm": 0.5048980116844177, "learning_rate": 0.0002, "epoch": 1.015277665508043, "step": 6280}, {"loss": 0.7391, "grad_norm": 0.6346936225891113, "learning_rate": 0.0002, "epoch": 1.0168943496887883, "step": 6290}, {"loss": 0.7486, "grad_norm": 0.5711665749549866, "learning_rate": 0.0002, "epoch": 1.0185110338695336, "step": 6300}, {"loss": 0.6808, "grad_norm": 0.5175361037254333, "learning_rate": 0.0002, "epoch": 1.0201277180502788, "step": 6310}, {"loss": 0.7539, "grad_norm": 0.5360831618309021, "learning_rate": 0.0002, "epoch": 1.0217444022310243, "step": 6320}, {"loss": 0.7112, "grad_norm": 0.614675760269165, "learning_rate": 0.0002, "epoch": 1.0233610864117695, "step": 6330}, {"loss": 0.7748, "grad_norm": 0.5626118183135986, "learning_rate": 0.0002, "epoch": 1.0249777705925147, "step": 6340}, {"loss": 0.7375, "grad_norm": 0.574897289276123, "learning_rate": 0.0002, "epoch": 1.02659445477326, "step": 6350}, {"loss": 0.759, "grad_norm": 0.7185447812080383, "learning_rate": 0.0002, "epoch": 1.0282111389540054, "step": 6360}, {"loss": 0.703, "grad_norm": 0.6705799698829651, "learning_rate": 0.0002, "epoch": 1.0298278231347506, "step": 6370}, {"loss": 0.7139, "grad_norm": 0.6740428805351257, "learning_rate": 0.0002, "epoch": 1.0314445073154959, "step": 6380}, {"loss": 0.7252, "grad_norm": 0.663902759552002, "learning_rate": 0.0002, "epoch": 1.0330611914962413, "step": 6390}, {"loss": 0.7065, "grad_norm": 0.5029543042182922, "learning_rate": 0.0002, "epoch": 1.0346778756769865, "step": 6400}, {"loss": 0.711, "grad_norm": 0.7813863158226013, "learning_rate": 0.0002, "epoch": 1.0362945598577318, "step": 6410}, {"loss": 0.7433, "grad_norm": 0.5396282076835632, "learning_rate": 0.0002, "epoch": 1.037911244038477, "step": 6420}, {"loss": 0.7222, "grad_norm": 0.5253293514251709, "learning_rate": 0.0002, "epoch": 1.0395279282192225, "step": 6430}, {"loss": 0.715, "grad_norm": 0.7236770987510681, "learning_rate": 0.0002, "epoch": 1.0411446123999677, "step": 6440}, {"loss": 0.7259, "grad_norm": 0.5670917630195618, "learning_rate": 0.0002, "epoch": 1.042761296580713, "step": 6450}, {"loss": 0.7195, "grad_norm": 0.6031978726387024, "learning_rate": 0.0002, "epoch": 1.0443779807614582, "step": 6460}, {"loss": 0.7648, "grad_norm": 0.5309213399887085, "learning_rate": 0.0002, "epoch": 1.0459946649422036, "step": 6470}, {"loss": 0.7161, "grad_norm": 0.7114651799201965, "learning_rate": 0.0002, "epoch": 1.0476113491229488, "step": 6480}, {"loss": 0.7583, "grad_norm": 0.5591610670089722, "learning_rate": 0.0002, "epoch": 1.049228033303694, "step": 6490}, {"loss": 0.6645, "grad_norm": 0.5185961127281189, "learning_rate": 0.0002, "epoch": 1.0508447174844395, "step": 6500}, {"loss": 0.7654, "grad_norm": 0.6510552167892456, "learning_rate": 0.0002, "epoch": 1.0524614016651848, "step": 6510}, {"loss": 0.7057, "grad_norm": 0.6557928919792175, "learning_rate": 0.0002, "epoch": 1.05407808584593, "step": 6520}, {"loss": 0.8056, "grad_norm": 0.6973192691802979, "learning_rate": 0.0002, "epoch": 1.0556947700266752, "step": 6530}, {"loss": 0.6793, "grad_norm": 0.6226583123207092, "learning_rate": 0.0002, "epoch": 1.0573114542074207, "step": 6540}, {"loss": 0.7151, "grad_norm": 0.5633195638656616, "learning_rate": 0.0002, "epoch": 1.058928138388166, "step": 6550}, {"loss": 0.7082, "grad_norm": 0.7466658353805542, "learning_rate": 0.0002, "epoch": 1.0605448225689111, "step": 6560}, {"loss": 0.7059, "grad_norm": 0.6462772488594055, "learning_rate": 0.0002, "epoch": 1.0621615067496564, "step": 6570}, {"loss": 0.7046, "grad_norm": 0.5266856551170349, "learning_rate": 0.0002, "epoch": 1.0637781909304018, "step": 6580}, {"loss": 0.7157, "grad_norm": 0.534392774105072, "learning_rate": 0.0002, "epoch": 1.065394875111147, "step": 6590}, {"loss": 0.7115, "grad_norm": 0.7514177560806274, "learning_rate": 0.0002, "epoch": 1.0670115592918923, "step": 6600}, {"loss": 0.7545, "grad_norm": 0.7593035697937012, "learning_rate": 0.0002, "epoch": 1.0686282434726375, "step": 6610}, {"loss": 0.6836, "grad_norm": 0.5277858972549438, "learning_rate": 0.0002, "epoch": 1.070244927653383, "step": 6620}, {"loss": 0.7405, "grad_norm": 0.5573670268058777, "learning_rate": 0.0002, "epoch": 1.0718616118341282, "step": 6630}, {"loss": 0.6774, "grad_norm": 0.6802396774291992, "learning_rate": 0.0002, "epoch": 1.0734782960148734, "step": 6640}, {"loss": 0.723, "grad_norm": 0.7367215752601624, "learning_rate": 0.0002, "epoch": 1.0750949801956189, "step": 6650}, {"loss": 0.7429, "grad_norm": 0.5961891412734985, "learning_rate": 0.0002, "epoch": 1.0767116643763641, "step": 6660}, {"loss": 0.6791, "grad_norm": 0.5736313462257385, "learning_rate": 0.0002, "epoch": 1.0783283485571094, "step": 6670}, {"loss": 0.7178, "grad_norm": 0.619219183921814, "learning_rate": 0.0002, "epoch": 1.0799450327378546, "step": 6680}, {"loss": 0.7318, "grad_norm": 0.6214390993118286, "learning_rate": 0.0002, "epoch": 1.0815617169186, "step": 6690}, {"loss": 0.7554, "grad_norm": 0.564536988735199, "learning_rate": 0.0002, "epoch": 1.0831784010993453, "step": 6700}, {"loss": 0.7362, "grad_norm": 0.5838140249252319, "learning_rate": 0.0002, "epoch": 1.0847950852800905, "step": 6710}, {"loss": 0.739, "grad_norm": 0.7000553607940674, "learning_rate": 0.0002, "epoch": 1.0864117694608357, "step": 6720}, {"loss": 0.7369, "grad_norm": 0.7078263759613037, "learning_rate": 0.0002, "epoch": 1.0880284536415812, "step": 6730}, {"loss": 0.7654, "grad_norm": 0.8353848457336426, "learning_rate": 0.0002, "epoch": 1.0896451378223264, "step": 6740}, {"loss": 0.7015, "grad_norm": 0.5615518689155579, "learning_rate": 0.0002, "epoch": 1.0912618220030716, "step": 6750}, {"loss": 0.7396, "grad_norm": 0.5475581288337708, "learning_rate": 0.0002, "epoch": 1.0928785061838169, "step": 6760}, {"loss": 0.7652, "grad_norm": 0.5835978388786316, "learning_rate": 0.0002, "epoch": 1.0944951903645623, "step": 6770}, {"loss": 0.7541, "grad_norm": 0.5516105890274048, "learning_rate": 0.0002, "epoch": 1.0961118745453076, "step": 6780}, {"loss": 0.6842, "grad_norm": 0.5875251889228821, "learning_rate": 0.0002, "epoch": 1.0977285587260528, "step": 6790}, {"loss": 0.6903, "grad_norm": 0.7376947999000549, "learning_rate": 0.0002, "epoch": 1.0993452429067982, "step": 6800}, {"loss": 0.7512, "grad_norm": 0.5656165480613708, "learning_rate": 0.0002, "epoch": 1.1009619270875435, "step": 6810}, {"loss": 0.7409, "grad_norm": 0.6365954279899597, "learning_rate": 0.0002, "epoch": 1.1025786112682887, "step": 6820}, {"loss": 0.7392, "grad_norm": 0.5033080577850342, "learning_rate": 0.0002, "epoch": 1.104195295449034, "step": 6830}, {"loss": 0.6909, "grad_norm": 0.617396891117096, "learning_rate": 0.0002, "epoch": 1.1058119796297794, "step": 6840}, {"loss": 0.7006, "grad_norm": 0.6395374536514282, "learning_rate": 0.0002, "epoch": 1.1074286638105246, "step": 6850}, {"loss": 0.7335, "grad_norm": 0.6775295734405518, "learning_rate": 0.0002, "epoch": 1.1090453479912699, "step": 6860}, {"loss": 0.764, "grad_norm": 0.6655223965644836, "learning_rate": 0.0002, "epoch": 1.1106620321720153, "step": 6870}, {"loss": 0.7553, "grad_norm": 0.676655113697052, "learning_rate": 0.0002, "epoch": 1.1122787163527605, "step": 6880}, {"loss": 0.7342, "grad_norm": 0.6062718629837036, "learning_rate": 0.0002, "epoch": 1.1138954005335058, "step": 6890}, {"loss": 0.7446, "grad_norm": 0.590943455696106, "learning_rate": 0.0002, "epoch": 1.115512084714251, "step": 6900}, {"loss": 0.6705, "grad_norm": 0.6315317153930664, "learning_rate": 0.0002, "epoch": 1.1171287688949965, "step": 6910}, {"loss": 0.6912, "grad_norm": 0.47979024052619934, "learning_rate": 0.0002, "epoch": 1.1187454530757417, "step": 6920}, {"loss": 0.7002, "grad_norm": 0.647298276424408, "learning_rate": 0.0002, "epoch": 1.120362137256487, "step": 6930}, {"loss": 0.7502, "grad_norm": 0.7336484789848328, "learning_rate": 0.0002, "epoch": 1.1219788214372322, "step": 6940}, {"loss": 0.693, "grad_norm": 0.5071424245834351, "learning_rate": 0.0002, "epoch": 1.1235955056179776, "step": 6950}, {"loss": 0.7378, "grad_norm": 0.6527144312858582, "learning_rate": 0.0002, "epoch": 1.1252121897987228, "step": 6960}, {"loss": 0.7228, "grad_norm": 0.6935935020446777, "learning_rate": 0.0002, "epoch": 1.126828873979468, "step": 6970}, {"loss": 0.699, "grad_norm": 0.8026931881904602, "learning_rate": 0.0002, "epoch": 1.1284455581602133, "step": 6980}, {"loss": 0.7361, "grad_norm": 0.5210393667221069, "learning_rate": 0.0002, "epoch": 1.1300622423409588, "step": 6990}, {"loss": 0.7456, "grad_norm": 0.60475093126297, "learning_rate": 0.0002, "epoch": 1.131678926521704, "step": 7000}, {"loss": 0.7495, "grad_norm": 0.6417073607444763, "learning_rate": 0.0002, "epoch": 1.1332956107024492, "step": 7010}, {"loss": 0.7459, "grad_norm": 0.6732175946235657, "learning_rate": 0.0002, "epoch": 1.1349122948831947, "step": 7020}, {"loss": 0.7278, "grad_norm": 0.6719491481781006, "learning_rate": 0.0002, "epoch": 1.13652897906394, "step": 7030}, {"loss": 0.7694, "grad_norm": 0.5708295106887817, "learning_rate": 0.0002, "epoch": 1.1381456632446851, "step": 7040}, {"loss": 0.7823, "grad_norm": 0.7141719460487366, "learning_rate": 0.0002, "epoch": 1.1397623474254304, "step": 7050}, {"loss": 0.764, "grad_norm": 0.6187017560005188, "learning_rate": 0.0002, "epoch": 1.1413790316061758, "step": 7060}, {"loss": 0.7657, "grad_norm": 0.50581294298172, "learning_rate": 0.0002, "epoch": 1.142995715786921, "step": 7070}, {"loss": 0.7357, "grad_norm": 0.5620143413543701, "learning_rate": 0.0002, "epoch": 1.1446123999676663, "step": 7080}, {"loss": 0.7287, "grad_norm": 0.6231929659843445, "learning_rate": 0.0002, "epoch": 1.1462290841484115, "step": 7090}, {"loss": 0.7328, "grad_norm": 0.5775774121284485, "learning_rate": 0.0002, "epoch": 1.147845768329157, "step": 7100}, {"loss": 0.7728, "grad_norm": 0.6492809653282166, "learning_rate": 0.0002, "epoch": 1.1494624525099022, "step": 7110}, {"loss": 0.7545, "grad_norm": 0.6434972286224365, "learning_rate": 0.0002, "epoch": 1.1510791366906474, "step": 7120}, {"loss": 0.7374, "grad_norm": 0.6191812753677368, "learning_rate": 0.0002, "epoch": 1.1526958208713927, "step": 7130}, {"loss": 0.7276, "grad_norm": 0.6690331697463989, "learning_rate": 0.0002, "epoch": 1.1543125050521381, "step": 7140}, {"loss": 0.7704, "grad_norm": 0.5977938175201416, "learning_rate": 0.0002, "epoch": 1.1559291892328833, "step": 7150}, {"loss": 0.7251, "grad_norm": 0.6195854544639587, "learning_rate": 0.0002, "epoch": 1.1575458734136286, "step": 7160}, {"loss": 0.7249, "grad_norm": 0.5752048492431641, "learning_rate": 0.0002, "epoch": 1.159162557594374, "step": 7170}, {"loss": 0.7593, "grad_norm": 0.589081883430481, "learning_rate": 0.0002, "epoch": 1.1607792417751193, "step": 7180}, {"loss": 0.704, "grad_norm": 0.756996750831604, "learning_rate": 0.0002, "epoch": 1.1623959259558645, "step": 7190}, {"loss": 0.7404, "grad_norm": 0.7614967226982117, "learning_rate": 0.0002, "epoch": 1.1640126101366097, "step": 7200}, {"loss": 0.7867, "grad_norm": 0.6120437979698181, "learning_rate": 0.0002, "epoch": 1.1656292943173552, "step": 7210}, {"loss": 0.7384, "grad_norm": 0.6210004687309265, "learning_rate": 0.0002, "epoch": 1.1672459784981004, "step": 7220}, {"loss": 0.7251, "grad_norm": 0.6044116020202637, "learning_rate": 0.0002, "epoch": 1.1688626626788456, "step": 7230}, {"loss": 0.7361, "grad_norm": 0.5418457388877869, "learning_rate": 0.0002, "epoch": 1.170479346859591, "step": 7240}, {"loss": 0.6938, "grad_norm": 0.6413537263870239, "learning_rate": 0.0002, "epoch": 1.1720960310403363, "step": 7250}, {"loss": 0.6978, "grad_norm": 0.5777867436408997, "learning_rate": 0.0002, "epoch": 1.1737127152210816, "step": 7260}, {"loss": 0.7503, "grad_norm": 0.7092402577400208, "learning_rate": 0.0002, "epoch": 1.1753293994018268, "step": 7270}, {"loss": 0.7487, "grad_norm": 0.6351709365844727, "learning_rate": 0.0002, "epoch": 1.176946083582572, "step": 7280}, {"loss": 0.7527, "grad_norm": 0.6172189712524414, "learning_rate": 0.0002, "epoch": 1.1785627677633175, "step": 7290}, {"loss": 0.7319, "grad_norm": 0.6801714897155762, "learning_rate": 0.0002, "epoch": 1.1801794519440627, "step": 7300}, {"loss": 0.6941, "grad_norm": 0.6044712066650391, "learning_rate": 0.0002, "epoch": 1.181796136124808, "step": 7310}, {"loss": 0.6951, "grad_norm": 0.7413212060928345, "learning_rate": 0.0002, "epoch": 1.1834128203055534, "step": 7320}, {"loss": 0.7396, "grad_norm": 0.5303856134414673, "learning_rate": 0.0002, "epoch": 1.1850295044862986, "step": 7330}, {"loss": 0.6915, "grad_norm": 0.5647098422050476, "learning_rate": 0.0002, "epoch": 1.1866461886670439, "step": 7340}, {"loss": 0.7506, "grad_norm": 0.7374135255813599, "learning_rate": 0.0002, "epoch": 1.188262872847789, "step": 7350}, {"loss": 0.7041, "grad_norm": 0.5710089206695557, "learning_rate": 0.0002, "epoch": 1.1898795570285345, "step": 7360}, {"loss": 0.8289, "grad_norm": 0.6073619723320007, "learning_rate": 0.0002, "epoch": 1.1914962412092798, "step": 7370}, {"loss": 0.7722, "grad_norm": 0.5899916887283325, "learning_rate": 0.0002, "epoch": 1.193112925390025, "step": 7380}, {"loss": 0.756, "grad_norm": 0.7762434482574463, "learning_rate": 0.0002, "epoch": 1.1947296095707705, "step": 7390}, {"loss": 0.7319, "grad_norm": 0.679949939250946, "learning_rate": 0.0002, "epoch": 1.1963462937515157, "step": 7400}, {"loss": 0.7599, "grad_norm": 0.6106849312782288, "learning_rate": 0.0002, "epoch": 1.197962977932261, "step": 7410}, {"loss": 0.7648, "grad_norm": 0.682461678981781, "learning_rate": 0.0002, "epoch": 1.1995796621130062, "step": 7420}, {"loss": 0.7741, "grad_norm": 0.6087017059326172, "learning_rate": 0.0002, "epoch": 1.2011963462937516, "step": 7430}, {"loss": 0.7642, "grad_norm": 0.63739013671875, "learning_rate": 0.0002, "epoch": 1.2028130304744968, "step": 7440}, {"loss": 0.7611, "grad_norm": 0.6154777407646179, "learning_rate": 0.0002, "epoch": 1.204429714655242, "step": 7450}, {"loss": 0.7565, "grad_norm": 0.7491534948348999, "learning_rate": 0.0002, "epoch": 1.2060463988359873, "step": 7460}, {"loss": 0.698, "grad_norm": 0.6664797067642212, "learning_rate": 0.0002, "epoch": 1.2076630830167328, "step": 7470}, {"loss": 0.7456, "grad_norm": 0.6660266518592834, "learning_rate": 0.0002, "epoch": 1.209279767197478, "step": 7480}, {"loss": 0.714, "grad_norm": 0.6972551345825195, "learning_rate": 0.0002, "epoch": 1.2108964513782232, "step": 7490}, {"loss": 0.7023, "grad_norm": 0.6157945990562439, "learning_rate": 0.0002, "epoch": 1.2125131355589684, "step": 7500}, {"loss": 0.7326, "grad_norm": 0.5199310183525085, "learning_rate": 0.0002, "epoch": 1.214129819739714, "step": 7510}, {"loss": 0.7586, "grad_norm": 0.577610433101654, "learning_rate": 0.0002, "epoch": 1.2157465039204591, "step": 7520}, {"loss": 0.7179, "grad_norm": 0.53652423620224, "learning_rate": 0.0002, "epoch": 1.2173631881012044, "step": 7530}, {"loss": 0.7393, "grad_norm": 0.6479050517082214, "learning_rate": 0.0002, "epoch": 1.2189798722819498, "step": 7540}, {"loss": 0.7534, "grad_norm": 0.618748128414154, "learning_rate": 0.0002, "epoch": 1.220596556462695, "step": 7550}, {"loss": 0.6886, "grad_norm": 0.6311424374580383, "learning_rate": 0.0002, "epoch": 1.2222132406434403, "step": 7560}, {"loss": 0.7272, "grad_norm": 0.6595825552940369, "learning_rate": 0.0002, "epoch": 1.2238299248241855, "step": 7570}, {"loss": 0.7353, "grad_norm": 0.5198960900306702, "learning_rate": 0.0002, "epoch": 1.225446609004931, "step": 7580}, {"loss": 0.674, "grad_norm": 0.578650712966919, "learning_rate": 0.0002, "epoch": 1.2270632931856762, "step": 7590}, {"loss": 0.7507, "grad_norm": 0.6080220937728882, "learning_rate": 0.0002, "epoch": 1.2286799773664214, "step": 7600}, {"loss": 0.7733, "grad_norm": 0.7050248384475708, "learning_rate": 0.0002, "epoch": 1.2302966615471669, "step": 7610}, {"loss": 0.7032, "grad_norm": 0.6652196049690247, "learning_rate": 0.0002, "epoch": 1.2319133457279121, "step": 7620}, {"loss": 0.7085, "grad_norm": 0.7322776317596436, "learning_rate": 0.0002, "epoch": 1.2335300299086573, "step": 7630}, {"loss": 0.7402, "grad_norm": 0.4998728036880493, "learning_rate": 0.0002, "epoch": 1.2351467140894026, "step": 7640}, {"loss": 0.7214, "grad_norm": 0.6428788900375366, "learning_rate": 0.0002, "epoch": 1.2367633982701478, "step": 7650}, {"loss": 0.7699, "grad_norm": 0.585242509841919, "learning_rate": 0.0002, "epoch": 1.2383800824508933, "step": 7660}, {"loss": 0.7621, "grad_norm": 0.5211917757987976, "learning_rate": 0.0002, "epoch": 1.2399967666316385, "step": 7670}, {"loss": 0.746, "grad_norm": 0.6490384340286255, "learning_rate": 0.0002, "epoch": 1.2416134508123837, "step": 7680}, {"loss": 0.7186, "grad_norm": 0.6249763369560242, "learning_rate": 0.0002, "epoch": 1.2432301349931292, "step": 7690}, {"loss": 0.7761, "grad_norm": 0.71870356798172, "learning_rate": 0.0002, "epoch": 1.2448468191738744, "step": 7700}, {"loss": 0.7525, "grad_norm": 0.6761967539787292, "learning_rate": 0.0002, "epoch": 1.2464635033546196, "step": 7710}, {"loss": 0.7501, "grad_norm": 0.6500617265701294, "learning_rate": 0.0002, "epoch": 1.2480801875353649, "step": 7720}, {"loss": 0.7903, "grad_norm": 0.8069869875907898, "learning_rate": 0.0002, "epoch": 1.2496968717161103, "step": 7730}, {"loss": 0.6747, "grad_norm": 0.6044608950614929, "learning_rate": 0.0002, "epoch": 1.2513135558968556, "step": 7740}, {"loss": 0.6825, "grad_norm": 0.6573283076286316, "learning_rate": 0.0002, "epoch": 1.2529302400776008, "step": 7750}, {"loss": 0.7617, "grad_norm": 0.625430166721344, "learning_rate": 0.0002, "epoch": 1.2545469242583462, "step": 7760}, {"loss": 0.7041, "grad_norm": 0.5442022681236267, "learning_rate": 0.0002, "epoch": 1.2561636084390915, "step": 7770}, {"loss": 0.7172, "grad_norm": 0.6818386912345886, "learning_rate": 0.0002, "epoch": 1.2577802926198367, "step": 7780}, {"loss": 0.696, "grad_norm": 0.6381874084472656, "learning_rate": 0.0002, "epoch": 1.259396976800582, "step": 7790}, {"loss": 0.6834, "grad_norm": 0.6269212961196899, "learning_rate": 0.0002, "epoch": 1.2610136609813272, "step": 7800}, {"loss": 0.7821, "grad_norm": 0.600121259689331, "learning_rate": 0.0002, "epoch": 1.2626303451620726, "step": 7810}, {"loss": 0.7761, "grad_norm": 0.6337703466415405, "learning_rate": 0.0002, "epoch": 1.2642470293428179, "step": 7820}, {"loss": 0.732, "grad_norm": 0.7234963774681091, "learning_rate": 0.0002, "epoch": 1.2658637135235633, "step": 7830}, {"loss": 0.785, "grad_norm": 0.800184965133667, "learning_rate": 0.0002, "epoch": 1.2674803977043085, "step": 7840}, {"loss": 0.7426, "grad_norm": 0.7539464831352234, "learning_rate": 0.0002, "epoch": 1.2690970818850538, "step": 7850}, {"loss": 0.7496, "grad_norm": 0.5493760704994202, "learning_rate": 0.0002, "epoch": 1.270713766065799, "step": 7860}, {"loss": 0.7537, "grad_norm": 0.7477145791053772, "learning_rate": 0.0002, "epoch": 1.2723304502465442, "step": 7870}, {"loss": 0.7573, "grad_norm": 0.6366362571716309, "learning_rate": 0.0002, "epoch": 1.2739471344272897, "step": 7880}, {"loss": 0.7608, "grad_norm": 0.7419533729553223, "learning_rate": 0.0002, "epoch": 1.275563818608035, "step": 7890}, {"loss": 0.7873, "grad_norm": 0.6141223311424255, "learning_rate": 0.0002, "epoch": 1.2771805027887801, "step": 7900}, {"loss": 0.6916, "grad_norm": 0.7522598505020142, "learning_rate": 0.0002, "epoch": 1.2787971869695256, "step": 7910}, {"loss": 0.7097, "grad_norm": 0.6935804486274719, "learning_rate": 0.0002, "epoch": 1.2804138711502708, "step": 7920}, {"loss": 0.7185, "grad_norm": 0.7239290475845337, "learning_rate": 0.0002, "epoch": 1.282030555331016, "step": 7930}, {"loss": 0.7145, "grad_norm": 0.8800187110900879, "learning_rate": 0.0002, "epoch": 1.2836472395117613, "step": 7940}, {"loss": 0.6991, "grad_norm": 0.540458083152771, "learning_rate": 0.0002, "epoch": 1.2852639236925067, "step": 7950}, {"loss": 0.7139, "grad_norm": 0.6492934226989746, "learning_rate": 0.0002, "epoch": 1.286880607873252, "step": 7960}, {"loss": 0.7742, "grad_norm": 0.6543959379196167, "learning_rate": 0.0002, "epoch": 1.2884972920539972, "step": 7970}, {"loss": 0.7316, "grad_norm": 0.5804705619812012, "learning_rate": 0.0002, "epoch": 1.2901139762347427, "step": 7980}, {"loss": 0.796, "grad_norm": 0.7074727416038513, "learning_rate": 0.0002, "epoch": 1.291730660415488, "step": 7990}, {"loss": 0.7034, "grad_norm": 0.5347974300384521, "learning_rate": 0.0002, "epoch": 1.2933473445962331, "step": 8000}, {"loss": 0.738, "grad_norm": 0.6457298398017883, "learning_rate": 0.0002, "epoch": 1.2949640287769784, "step": 8010}, {"loss": 0.7634, "grad_norm": 0.6407219171524048, "learning_rate": 0.0002, "epoch": 1.2965807129577236, "step": 8020}, {"loss": 0.7506, "grad_norm": 0.828439474105835, "learning_rate": 0.0002, "epoch": 1.298197397138469, "step": 8030}, {"loss": 0.735, "grad_norm": 0.4840380549430847, "learning_rate": 0.0002, "epoch": 1.2998140813192143, "step": 8040}, {"loss": 0.7283, "grad_norm": 0.5921024680137634, "learning_rate": 0.0002, "epoch": 1.3014307654999595, "step": 8050}, {"loss": 0.7477, "grad_norm": 0.6170315146446228, "learning_rate": 0.0002, "epoch": 1.303047449680705, "step": 8060}, {"loss": 0.7534, "grad_norm": 0.5374847054481506, "learning_rate": 0.0002, "epoch": 1.3046641338614502, "step": 8070}, {"loss": 0.7593, "grad_norm": 0.545758068561554, "learning_rate": 0.0002, "epoch": 1.3062808180421954, "step": 8080}, {"loss": 0.7463, "grad_norm": 0.55641770362854, "learning_rate": 0.0002, "epoch": 1.3078975022229407, "step": 8090}, {"loss": 0.7594, "grad_norm": 0.6724897027015686, "learning_rate": 0.0002, "epoch": 1.309514186403686, "step": 8100}, {"loss": 0.7105, "grad_norm": 0.6923972368240356, "learning_rate": 0.0002, "epoch": 1.3111308705844313, "step": 8110}, {"loss": 0.7149, "grad_norm": 0.5136841535568237, "learning_rate": 0.0002, "epoch": 1.3127475547651766, "step": 8120}, {"loss": 0.7504, "grad_norm": 0.6766283512115479, "learning_rate": 0.0002, "epoch": 1.314364238945922, "step": 8130}, {"loss": 0.7489, "grad_norm": 0.6283926367759705, "learning_rate": 0.0002, "epoch": 1.3159809231266673, "step": 8140}, {"loss": 0.7459, "grad_norm": 0.644216001033783, "learning_rate": 0.0002, "epoch": 1.3175976073074125, "step": 8150}, {"loss": 0.7125, "grad_norm": 0.7827503085136414, "learning_rate": 0.0002, "epoch": 1.3192142914881577, "step": 8160}, {"loss": 0.7271, "grad_norm": 0.6651390790939331, "learning_rate": 0.0002, "epoch": 1.320830975668903, "step": 8170}, {"loss": 0.7778, "grad_norm": 0.5547412633895874, "learning_rate": 0.0002, "epoch": 1.3224476598496484, "step": 8180}, {"loss": 0.7402, "grad_norm": 0.6765179634094238, "learning_rate": 0.0002, "epoch": 1.3240643440303936, "step": 8190}, {"loss": 0.7106, "grad_norm": 0.6822077035903931, "learning_rate": 0.0002, "epoch": 1.325681028211139, "step": 8200}, {"loss": 0.7288, "grad_norm": 0.5941002368927002, "learning_rate": 0.0002, "epoch": 1.3272977123918843, "step": 8210}, {"loss": 0.7494, "grad_norm": 0.4850037097930908, "learning_rate": 0.0002, "epoch": 1.3289143965726296, "step": 8220}, {"loss": 0.7474, "grad_norm": 0.6162990927696228, "learning_rate": 0.0002, "epoch": 1.3305310807533748, "step": 8230}, {"loss": 0.7751, "grad_norm": 0.6665613651275635, "learning_rate": 0.0002, "epoch": 1.33214776493412, "step": 8240}, {"loss": 0.759, "grad_norm": 0.618192732334137, "learning_rate": 0.0002, "epoch": 1.3337644491148655, "step": 8250}, {"loss": 0.7532, "grad_norm": 0.710418701171875, "learning_rate": 0.0002, "epoch": 1.3353811332956107, "step": 8260}, {"loss": 0.7306, "grad_norm": 0.5109876990318298, "learning_rate": 0.0002, "epoch": 1.336997817476356, "step": 8270}, {"loss": 0.7303, "grad_norm": 0.6791711449623108, "learning_rate": 0.0002, "epoch": 1.3386145016571014, "step": 8280}, {"loss": 0.7594, "grad_norm": 0.6836432814598083, "learning_rate": 0.0002, "epoch": 1.3402311858378466, "step": 8290}, {"loss": 0.7594, "grad_norm": 0.5579386353492737, "learning_rate": 0.0002, "epoch": 1.3418478700185918, "step": 8300}, {"loss": 0.7377, "grad_norm": 0.6713546514511108, "learning_rate": 0.0002, "epoch": 1.343464554199337, "step": 8310}, {"loss": 0.7756, "grad_norm": 0.5353720188140869, "learning_rate": 0.0002, "epoch": 1.3450812383800825, "step": 8320}, {"loss": 0.718, "grad_norm": 0.5813682675361633, "learning_rate": 0.0002, "epoch": 1.3466979225608278, "step": 8330}, {"loss": 0.7294, "grad_norm": 0.8158791661262512, "learning_rate": 0.0002, "epoch": 1.348314606741573, "step": 8340}, {"loss": 0.6992, "grad_norm": 0.6193785071372986, "learning_rate": 0.0002, "epoch": 1.3499312909223184, "step": 8350}, {"loss": 0.7654, "grad_norm": 0.6353939771652222, "learning_rate": 0.0002, "epoch": 1.3515479751030637, "step": 8360}, {"loss": 0.7519, "grad_norm": 0.6925048232078552, "learning_rate": 0.0002, "epoch": 1.353164659283809, "step": 8370}, {"loss": 0.736, "grad_norm": 0.988264799118042, "learning_rate": 0.0002, "epoch": 1.3547813434645541, "step": 8380}, {"loss": 0.7744, "grad_norm": 0.6476002931594849, "learning_rate": 0.0002, "epoch": 1.3563980276452994, "step": 8390}, {"loss": 0.776, "grad_norm": 0.7120398879051208, "learning_rate": 0.0002, "epoch": 1.3580147118260448, "step": 8400}, {"loss": 0.7368, "grad_norm": 0.9048416614532471, "learning_rate": 0.0002, "epoch": 1.35963139600679, "step": 8410}, {"loss": 0.7544, "grad_norm": 0.7000672817230225, "learning_rate": 0.0002, "epoch": 1.3612480801875353, "step": 8420}, {"loss": 0.7358, "grad_norm": 0.6015632152557373, "learning_rate": 0.0002, "epoch": 1.3628647643682807, "step": 8430}, {"loss": 0.7298, "grad_norm": 0.612516462802887, "learning_rate": 0.0002, "epoch": 1.364481448549026, "step": 8440}, {"loss": 0.7055, "grad_norm": 0.5969301462173462, "learning_rate": 0.0002, "epoch": 1.3660981327297712, "step": 8450}, {"loss": 0.7754, "grad_norm": 0.6730654239654541, "learning_rate": 0.0002, "epoch": 1.3677148169105164, "step": 8460}, {"loss": 0.7465, "grad_norm": 0.6386392116546631, "learning_rate": 0.0002, "epoch": 1.369331501091262, "step": 8470}, {"loss": 0.7433, "grad_norm": 0.739544153213501, "learning_rate": 0.0002, "epoch": 1.3709481852720071, "step": 8480}, {"loss": 0.7892, "grad_norm": 0.6462782621383667, "learning_rate": 0.0002, "epoch": 1.3725648694527524, "step": 8490}, {"loss": 0.7302, "grad_norm": 0.7346843481063843, "learning_rate": 0.0002, "epoch": 1.3741815536334978, "step": 8500}, {"loss": 0.7634, "grad_norm": 0.6884821057319641, "learning_rate": 0.0002, "epoch": 1.375798237814243, "step": 8510}, {"loss": 0.7614, "grad_norm": 0.6999333500862122, "learning_rate": 0.0002, "epoch": 1.3774149219949883, "step": 8520}, {"loss": 0.729, "grad_norm": 0.5378713011741638, "learning_rate": 0.0002, "epoch": 1.3790316061757335, "step": 8530}, {"loss": 0.6797, "grad_norm": 0.5417906641960144, "learning_rate": 0.0002, "epoch": 1.3806482903564787, "step": 8540}, {"loss": 0.7499, "grad_norm": 0.6602526307106018, "learning_rate": 0.0002, "epoch": 1.3822649745372242, "step": 8550}, {"loss": 0.7356, "grad_norm": 0.7073674201965332, "learning_rate": 0.0002, "epoch": 1.3838816587179694, "step": 8560}, {"loss": 0.75, "grad_norm": 0.5841707587242126, "learning_rate": 0.0002, "epoch": 1.3854983428987149, "step": 8570}, {"loss": 0.732, "grad_norm": 0.7031095027923584, "learning_rate": 0.0002, "epoch": 1.38711502707946, "step": 8580}, {"loss": 0.7464, "grad_norm": 0.5198570489883423, "learning_rate": 0.0002, "epoch": 1.3887317112602053, "step": 8590}, {"loss": 0.7354, "grad_norm": 0.7261320352554321, "learning_rate": 0.0002, "epoch": 1.3903483954409506, "step": 8600}, {"loss": 0.7339, "grad_norm": 0.5616350173950195, "learning_rate": 0.0002, "epoch": 1.3919650796216958, "step": 8610}, {"loss": 0.7382, "grad_norm": 0.5185914635658264, "learning_rate": 0.0002, "epoch": 1.3935817638024413, "step": 8620}, {"loss": 0.7456, "grad_norm": 0.5814694762229919, "learning_rate": 0.0002, "epoch": 1.3951984479831865, "step": 8630}, {"loss": 0.7413, "grad_norm": 0.6977371573448181, "learning_rate": 0.0002, "epoch": 1.3968151321639317, "step": 8640}, {"loss": 0.7574, "grad_norm": 0.6855689883232117, "learning_rate": 0.0002, "epoch": 1.3984318163446772, "step": 8650}, {"loss": 0.7802, "grad_norm": 0.5414357781410217, "learning_rate": 0.0002, "epoch": 1.4000485005254224, "step": 8660}, {"loss": 0.7487, "grad_norm": 0.6970012784004211, "learning_rate": 0.0002, "epoch": 1.4016651847061676, "step": 8670}, {"loss": 0.7421, "grad_norm": 0.526079535484314, "learning_rate": 0.0002, "epoch": 1.4032818688869129, "step": 8680}, {"loss": 0.737, "grad_norm": 0.758712887763977, "learning_rate": 0.0002, "epoch": 1.404898553067658, "step": 8690}, {"loss": 0.7612, "grad_norm": 0.7118762731552124, "learning_rate": 0.0002, "epoch": 1.4065152372484035, "step": 8700}, {"loss": 0.7628, "grad_norm": 0.5696909427642822, "learning_rate": 0.0002, "epoch": 1.4081319214291488, "step": 8710}, {"loss": 0.7156, "grad_norm": 0.7995436787605286, "learning_rate": 0.0002, "epoch": 1.4097486056098942, "step": 8720}, {"loss": 0.7521, "grad_norm": 0.7237521409988403, "learning_rate": 0.0002, "epoch": 1.4113652897906395, "step": 8730}, {"loss": 0.7661, "grad_norm": 0.744628369808197, "learning_rate": 0.0002, "epoch": 1.4129819739713847, "step": 8740}, {"loss": 0.7073, "grad_norm": 0.6082926988601685, "learning_rate": 0.0002, "epoch": 1.41459865815213, "step": 8750}, {"loss": 0.7282, "grad_norm": 0.5185243487358093, "learning_rate": 0.0002, "epoch": 1.4162153423328752, "step": 8760}, {"loss": 0.7592, "grad_norm": 0.5183082222938538, "learning_rate": 0.0002, "epoch": 1.4178320265136206, "step": 8770}, {"loss": 0.7509, "grad_norm": 0.7326041460037231, "learning_rate": 0.0002, "epoch": 1.4194487106943658, "step": 8780}, {"loss": 0.7398, "grad_norm": 0.7174660563468933, "learning_rate": 0.0002, "epoch": 1.421065394875111, "step": 8790}, {"loss": 0.7507, "grad_norm": 0.8080165982246399, "learning_rate": 0.0002, "epoch": 1.4226820790558565, "step": 8800}, {"loss": 0.72, "grad_norm": 0.5061507821083069, "learning_rate": 0.0002, "epoch": 1.4242987632366018, "step": 8810}, {"loss": 0.7563, "grad_norm": 0.801602840423584, "learning_rate": 0.0002, "epoch": 1.425915447417347, "step": 8820}, {"loss": 0.7287, "grad_norm": 0.6150273084640503, "learning_rate": 0.0002, "epoch": 1.4275321315980922, "step": 8830}, {"loss": 0.7452, "grad_norm": 0.8786525726318359, "learning_rate": 0.0002, "epoch": 1.4291488157788377, "step": 8840}, {"loss": 0.7257, "grad_norm": 0.6371538639068604, "learning_rate": 0.0002, "epoch": 1.430765499959583, "step": 8850}, {"loss": 0.711, "grad_norm": 0.6409295797348022, "learning_rate": 0.0002, "epoch": 1.4323821841403281, "step": 8860}, {"loss": 0.7891, "grad_norm": 0.6452359557151794, "learning_rate": 0.0002, "epoch": 1.4339988683210736, "step": 8870}, {"loss": 0.7588, "grad_norm": 0.5842334628105164, "learning_rate": 0.0002, "epoch": 1.4356155525018188, "step": 8880}, {"loss": 0.7446, "grad_norm": 0.696761965751648, "learning_rate": 0.0002, "epoch": 1.437232236682564, "step": 8890}, {"loss": 0.7541, "grad_norm": 0.6384600400924683, "learning_rate": 0.0002, "epoch": 1.4388489208633093, "step": 8900}, {"loss": 0.7049, "grad_norm": 0.5981136560440063, "learning_rate": 0.0002, "epoch": 1.4404656050440545, "step": 8910}, {"loss": 0.795, "grad_norm": 0.6355637907981873, "learning_rate": 0.0002, "epoch": 1.4420822892248, "step": 8920}, {"loss": 0.7653, "grad_norm": 0.6374830603599548, "learning_rate": 0.0002, "epoch": 1.4436989734055452, "step": 8930}, {"loss": 0.8108, "grad_norm": 0.559013307094574, "learning_rate": 0.0002, "epoch": 1.4453156575862904, "step": 8940}, {"loss": 0.7045, "grad_norm": 0.7289170026779175, "learning_rate": 0.0002, "epoch": 1.446932341767036, "step": 8950}, {"loss": 0.7484, "grad_norm": 0.8649206757545471, "learning_rate": 0.0002, "epoch": 1.4485490259477811, "step": 8960}, {"loss": 0.7745, "grad_norm": 0.7664689421653748, "learning_rate": 0.0002, "epoch": 1.4501657101285264, "step": 8970}, {"loss": 0.7431, "grad_norm": 0.7109952569007874, "learning_rate": 0.0002, "epoch": 1.4517823943092716, "step": 8980}, {"loss": 0.7997, "grad_norm": 0.6312844753265381, "learning_rate": 0.0002, "epoch": 1.453399078490017, "step": 8990}, {"loss": 0.7467, "grad_norm": 0.6616617441177368, "learning_rate": 0.0002, "epoch": 1.4550157626707623, "step": 9000}, {"loss": 0.7518, "grad_norm": 0.7384068965911865, "learning_rate": 0.0002, "epoch": 1.4566324468515075, "step": 9010}, {"loss": 0.7483, "grad_norm": 0.6549670100212097, "learning_rate": 0.0002, "epoch": 1.458249131032253, "step": 9020}, {"loss": 0.7423, "grad_norm": 0.6254119277000427, "learning_rate": 0.0002, "epoch": 1.4598658152129982, "step": 9030}, {"loss": 0.7645, "grad_norm": 0.6806328892707825, "learning_rate": 0.0002, "epoch": 1.4614824993937434, "step": 9040}, {"loss": 0.7221, "grad_norm": 0.6803115010261536, "learning_rate": 0.0002, "epoch": 1.4630991835744886, "step": 9050}, {"loss": 0.7264, "grad_norm": 0.48529282212257385, "learning_rate": 0.0002, "epoch": 1.4647158677552339, "step": 9060}, {"loss": 0.7542, "grad_norm": 0.5995030999183655, "learning_rate": 0.0002, "epoch": 1.4663325519359793, "step": 9070}, {"loss": 0.7894, "grad_norm": 0.6005427837371826, "learning_rate": 0.0002, "epoch": 1.4679492361167246, "step": 9080}, {"loss": 0.7288, "grad_norm": 0.718564510345459, "learning_rate": 0.0002, "epoch": 1.46956592029747, "step": 9090}, {"loss": 0.7089, "grad_norm": 0.7003577351570129, "learning_rate": 0.0002, "epoch": 1.4711826044782153, "step": 9100}, {"loss": 0.8069, "grad_norm": 0.5888323783874512, "learning_rate": 0.0002, "epoch": 1.4727992886589605, "step": 9110}, {"loss": 0.7275, "grad_norm": 0.6417609453201294, "learning_rate": 0.0002, "epoch": 1.4744159728397057, "step": 9120}, {"loss": 0.7441, "grad_norm": 0.572294294834137, "learning_rate": 0.0002, "epoch": 1.476032657020451, "step": 9130}, {"loss": 0.8053, "grad_norm": 0.8200714588165283, "learning_rate": 0.0002, "epoch": 1.4776493412011964, "step": 9140}, {"loss": 0.7382, "grad_norm": 0.6343288421630859, "learning_rate": 0.0002, "epoch": 1.4792660253819416, "step": 9150}, {"loss": 0.7641, "grad_norm": 0.7017961144447327, "learning_rate": 0.0002, "epoch": 1.4808827095626869, "step": 9160}, {"loss": 0.7619, "grad_norm": 0.6202912926673889, "learning_rate": 0.0002, "epoch": 1.4824993937434323, "step": 9170}, {"loss": 0.7428, "grad_norm": 0.6677869558334351, "learning_rate": 0.0002, "epoch": 1.4841160779241775, "step": 9180}, {"loss": 0.7648, "grad_norm": 0.6052267551422119, "learning_rate": 0.0002, "epoch": 1.4857327621049228, "step": 9190}, {"loss": 0.7152, "grad_norm": 0.6638872027397156, "learning_rate": 0.0002, "epoch": 1.487349446285668, "step": 9200}, {"loss": 0.7448, "grad_norm": 0.6245523691177368, "learning_rate": 0.0002, "epoch": 1.4889661304664135, "step": 9210}, {"loss": 0.6958, "grad_norm": 0.5761767625808716, "learning_rate": 0.0002, "epoch": 1.4905828146471587, "step": 9220}, {"loss": 0.8012, "grad_norm": 0.8175981640815735, "learning_rate": 0.0002, "epoch": 1.492199498827904, "step": 9230}, {"loss": 0.683, "grad_norm": 0.9144009947776794, "learning_rate": 0.0002, "epoch": 1.4938161830086494, "step": 9240}, {"loss": 0.7623, "grad_norm": 0.5742552876472473, "learning_rate": 0.0002, "epoch": 1.4954328671893946, "step": 9250}, {"loss": 0.7418, "grad_norm": 0.534534215927124, "learning_rate": 0.0002, "epoch": 1.4970495513701398, "step": 9260}, {"loss": 0.7194, "grad_norm": 0.7836225032806396, "learning_rate": 0.0002, "epoch": 1.498666235550885, "step": 9270}, {"loss": 0.7453, "grad_norm": 0.5292993187904358, "learning_rate": 0.0002, "epoch": 1.5002829197316303, "step": 9280}, {"loss": 0.7168, "grad_norm": 0.8044071793556213, "learning_rate": 0.0002, "epoch": 1.5018996039123758, "step": 9290}, {"loss": 0.7229, "grad_norm": 0.6185805201530457, "learning_rate": 0.0002, "epoch": 1.503516288093121, "step": 9300}, {"loss": 0.684, "grad_norm": 0.6093607544898987, "learning_rate": 0.0002, "epoch": 1.5051329722738664, "step": 9310}, {"loss": 0.7973, "grad_norm": 0.5891730189323425, "learning_rate": 0.0002, "epoch": 1.5067496564546117, "step": 9320}, {"loss": 0.7474, "grad_norm": 0.6331129670143127, "learning_rate": 0.0002, "epoch": 1.508366340635357, "step": 9330}, {"loss": 0.7074, "grad_norm": 0.7690958380699158, "learning_rate": 0.0002, "epoch": 1.5099830248161021, "step": 9340}, {"loss": 0.672, "grad_norm": 0.6548877358436584, "learning_rate": 0.0002, "epoch": 1.5115997089968474, "step": 9350}, {"loss": 0.7408, "grad_norm": 0.6545143127441406, "learning_rate": 0.0002, "epoch": 1.5132163931775926, "step": 9360}, {"loss": 0.7432, "grad_norm": 0.553247332572937, "learning_rate": 0.0002, "epoch": 1.514833077358338, "step": 9370}, {"loss": 0.7265, "grad_norm": 0.8145074844360352, "learning_rate": 0.0002, "epoch": 1.5164497615390833, "step": 9380}, {"loss": 0.7379, "grad_norm": 0.7636994123458862, "learning_rate": 0.0002, "epoch": 1.5180664457198287, "step": 9390}, {"loss": 0.7413, "grad_norm": 0.6838982701301575, "learning_rate": 0.0002, "epoch": 1.519683129900574, "step": 9400}, {"loss": 0.7367, "grad_norm": 0.8599441647529602, "learning_rate": 0.0002, "epoch": 1.5212998140813192, "step": 9410}, {"loss": 0.7663, "grad_norm": 0.7020329833030701, "learning_rate": 0.0002, "epoch": 1.5229164982620644, "step": 9420}, {"loss": 0.7928, "grad_norm": 0.6964772343635559, "learning_rate": 0.0002, "epoch": 1.5245331824428097, "step": 9430}, {"loss": 0.7168, "grad_norm": 0.6916600465774536, "learning_rate": 0.0002, "epoch": 1.5261498666235551, "step": 9440}, {"loss": 0.7519, "grad_norm": 0.7282621264457703, "learning_rate": 0.0002, "epoch": 1.5277665508043003, "step": 9450}, {"loss": 0.7628, "grad_norm": 0.5363983511924744, "learning_rate": 0.0002, "epoch": 1.5293832349850458, "step": 9460}, {"loss": 0.7154, "grad_norm": 0.6184861063957214, "learning_rate": 0.0002, "epoch": 1.530999919165791, "step": 9470}, {"loss": 0.7837, "grad_norm": 0.5991285443305969, "learning_rate": 0.0002, "epoch": 1.5326166033465363, "step": 9480}, {"loss": 0.7827, "grad_norm": 0.8176587820053101, "learning_rate": 0.0002, "epoch": 1.5342332875272815, "step": 9490}, {"loss": 0.7415, "grad_norm": 0.6473721861839294, "learning_rate": 0.0002, "epoch": 1.5358499717080267, "step": 9500}, {"loss": 0.7632, "grad_norm": 0.7319952845573425, "learning_rate": 0.0002, "epoch": 1.5374666558887722, "step": 9510}, {"loss": 0.7706, "grad_norm": 0.702900230884552, "learning_rate": 0.0002, "epoch": 1.5390833400695174, "step": 9520}, {"loss": 0.7754, "grad_norm": 0.7971600294113159, "learning_rate": 0.0002, "epoch": 1.5407000242502629, "step": 9530}, {"loss": 0.7352, "grad_norm": 0.6527525186538696, "learning_rate": 0.0002, "epoch": 1.542316708431008, "step": 9540}, {"loss": 0.7425, "grad_norm": 0.5791676044464111, "learning_rate": 0.0002, "epoch": 1.5439333926117533, "step": 9550}, {"loss": 0.7585, "grad_norm": 0.5619390606880188, "learning_rate": 0.0002, "epoch": 1.5455500767924986, "step": 9560}, {"loss": 0.7894, "grad_norm": 0.5701689124107361, "learning_rate": 0.0002, "epoch": 1.5471667609732438, "step": 9570}, {"loss": 0.793, "grad_norm": 0.47549352049827576, "learning_rate": 0.0002, "epoch": 1.548783445153989, "step": 9580}, {"loss": 0.7276, "grad_norm": 0.8730611205101013, "learning_rate": 0.0002, "epoch": 1.5504001293347345, "step": 9590}, {"loss": 0.798, "grad_norm": 0.6842091083526611, "learning_rate": 0.0002, "epoch": 1.5520168135154797, "step": 9600}, {"loss": 0.7528, "grad_norm": 0.6675129532814026, "learning_rate": 0.0002, "epoch": 1.5536334976962252, "step": 9610}, {"loss": 0.7954, "grad_norm": 0.8173956274986267, "learning_rate": 0.0002, "epoch": 1.5552501818769704, "step": 9620}, {"loss": 0.7535, "grad_norm": 0.724947452545166, "learning_rate": 0.0002, "epoch": 1.5568668660577156, "step": 9630}, {"loss": 0.7738, "grad_norm": 0.6154758930206299, "learning_rate": 0.0002, "epoch": 1.5584835502384609, "step": 9640}, {"loss": 0.7568, "grad_norm": 0.6072008013725281, "learning_rate": 0.0002, "epoch": 1.560100234419206, "step": 9650}, {"loss": 0.7219, "grad_norm": 0.659010648727417, "learning_rate": 0.0002, "epoch": 1.5617169185999515, "step": 9660}, {"loss": 0.673, "grad_norm": 0.65857994556427, "learning_rate": 0.0002, "epoch": 1.5633336027806968, "step": 9670}, {"loss": 0.7156, "grad_norm": 0.5914267301559448, "learning_rate": 0.0002, "epoch": 1.5649502869614422, "step": 9680}, {"loss": 0.7414, "grad_norm": 0.6248020529747009, "learning_rate": 0.0002, "epoch": 1.5665669711421875, "step": 9690}, {"loss": 0.694, "grad_norm": 0.7147795557975769, "learning_rate": 0.0002, "epoch": 1.5681836553229327, "step": 9700}, {"loss": 0.7335, "grad_norm": 0.7076232433319092, "learning_rate": 0.0002, "epoch": 1.569800339503678, "step": 9710}, {"loss": 0.7413, "grad_norm": 0.6217400431632996, "learning_rate": 0.0002, "epoch": 1.5714170236844232, "step": 9720}, {"loss": 0.7296, "grad_norm": 0.6709911227226257, "learning_rate": 0.0002, "epoch": 1.5730337078651684, "step": 9730}, {"loss": 0.7306, "grad_norm": 0.749171257019043, "learning_rate": 0.0002, "epoch": 1.5746503920459138, "step": 9740}, {"loss": 0.7242, "grad_norm": 0.6241145730018616, "learning_rate": 0.0002, "epoch": 1.576267076226659, "step": 9750}, {"loss": 0.7384, "grad_norm": 0.4960934817790985, "learning_rate": 0.0002, "epoch": 1.5778837604074045, "step": 9760}, {"loss": 0.725, "grad_norm": 0.6593309640884399, "learning_rate": 0.0002, "epoch": 1.5795004445881498, "step": 9770}, {"loss": 0.7531, "grad_norm": 0.5814042091369629, "learning_rate": 0.0002, "epoch": 1.581117128768895, "step": 9780}, {"loss": 0.7109, "grad_norm": 0.5936070680618286, "learning_rate": 0.0002, "epoch": 1.5827338129496402, "step": 9790}, {"loss": 0.7769, "grad_norm": 0.6454403400421143, "learning_rate": 0.0002, "epoch": 1.5843504971303854, "step": 9800}, {"loss": 0.7677, "grad_norm": 0.7612107992172241, "learning_rate": 0.0002, "epoch": 1.585967181311131, "step": 9810}, {"loss": 0.7649, "grad_norm": 0.6494482755661011, "learning_rate": 0.0002, "epoch": 1.5875838654918761, "step": 9820}, {"loss": 0.7569, "grad_norm": 0.7825694680213928, "learning_rate": 0.0002, "epoch": 1.5892005496726216, "step": 9830}, {"loss": 0.706, "grad_norm": 0.6757757663726807, "learning_rate": 0.0002, "epoch": 1.5908172338533668, "step": 9840}, {"loss": 0.7803, "grad_norm": 0.7105609178543091, "learning_rate": 0.0002, "epoch": 1.592433918034112, "step": 9850}, {"loss": 0.7925, "grad_norm": 0.7596991062164307, "learning_rate": 0.0002, "epoch": 1.5940506022148573, "step": 9860}, {"loss": 0.7108, "grad_norm": 0.5681525468826294, "learning_rate": 0.0002, "epoch": 1.5956672863956025, "step": 9870}, {"loss": 0.7811, "grad_norm": 0.6090980768203735, "learning_rate": 0.0002, "epoch": 1.5972839705763477, "step": 9880}, {"loss": 0.7339, "grad_norm": 0.6271613240242004, "learning_rate": 0.0002, "epoch": 1.5989006547570932, "step": 9890}, {"loss": 0.7419, "grad_norm": 0.7656369805335999, "learning_rate": 0.0002, "epoch": 1.6005173389378387, "step": 9900}, {"loss": 0.7336, "grad_norm": 0.7504446506500244, "learning_rate": 0.0002, "epoch": 1.6021340231185839, "step": 9910}, {"loss": 0.7479, "grad_norm": 0.659656286239624, "learning_rate": 0.0002, "epoch": 1.6037507072993291, "step": 9920}, {"loss": 0.7483, "grad_norm": 0.6006826162338257, "learning_rate": 0.0002, "epoch": 1.6053673914800743, "step": 9930}, {"loss": 0.732, "grad_norm": 0.7872757911682129, "learning_rate": 0.0002, "epoch": 1.6069840756608196, "step": 9940}, {"loss": 0.768, "grad_norm": 0.5545852780342102, "learning_rate": 0.0002, "epoch": 1.6086007598415648, "step": 9950}, {"loss": 0.8064, "grad_norm": 0.7429468631744385, "learning_rate": 0.0002, "epoch": 1.6102174440223103, "step": 9960}, {"loss": 0.714, "grad_norm": 0.6873556971549988, "learning_rate": 0.0002, "epoch": 1.6118341282030555, "step": 9970}, {"loss": 0.7324, "grad_norm": 0.5874287486076355, "learning_rate": 0.0002, "epoch": 1.613450812383801, "step": 9980}, {"loss": 0.7141, "grad_norm": 0.6039386987686157, "learning_rate": 0.0002, "epoch": 1.6150674965645462, "step": 9990}, {"loss": 0.6674, "grad_norm": 0.6233575940132141, "learning_rate": 0.0002, "epoch": 1.6166841807452914, "step": 10000}, {"loss": 0.7602, "grad_norm": 0.7676448225975037, "learning_rate": 0.0002, "epoch": 1.6183008649260366, "step": 10010}, {"loss": 0.7784, "grad_norm": 0.6565698385238647, "learning_rate": 0.0002, "epoch": 1.6199175491067819, "step": 10020}, {"loss": 0.7104, "grad_norm": 0.6787590384483337, "learning_rate": 0.0002, "epoch": 1.6215342332875273, "step": 10030}, {"loss": 0.7464, "grad_norm": 0.6137678027153015, "learning_rate": 0.0002, "epoch": 1.6231509174682726, "step": 10040}, {"loss": 0.7646, "grad_norm": 0.5236800312995911, "learning_rate": 0.0002, "epoch": 1.624767601649018, "step": 10050}, {"loss": 0.7437, "grad_norm": 0.7626367807388306, "learning_rate": 0.0002, "epoch": 1.6263842858297632, "step": 10060}, {"loss": 0.7273, "grad_norm": 0.5657260417938232, "learning_rate": 0.0002, "epoch": 1.6280009700105085, "step": 10070}, {"loss": 0.7354, "grad_norm": 0.4913991391658783, "learning_rate": 0.0002, "epoch": 1.6296176541912537, "step": 10080}, {"loss": 0.7596, "grad_norm": 0.7715556621551514, "learning_rate": 0.0002, "epoch": 1.631234338371999, "step": 10090}, {"loss": 0.7105, "grad_norm": 0.6509000062942505, "learning_rate": 0.0002, "epoch": 1.6328510225527442, "step": 10100}, {"loss": 0.7274, "grad_norm": 0.6215850114822388, "learning_rate": 0.0002, "epoch": 1.6344677067334896, "step": 10110}, {"loss": 0.7705, "grad_norm": 0.6956844329833984, "learning_rate": 0.0002, "epoch": 1.6360843909142349, "step": 10120}, {"loss": 0.7129, "grad_norm": 0.6111597418785095, "learning_rate": 0.0002, "epoch": 1.6377010750949803, "step": 10130}, {"loss": 0.6955, "grad_norm": 0.6518288850784302, "learning_rate": 0.0002, "epoch": 1.6393177592757255, "step": 10140}, {"loss": 0.731, "grad_norm": 0.6914522051811218, "learning_rate": 0.0002, "epoch": 1.6409344434564708, "step": 10150}, {"loss": 0.7295, "grad_norm": 0.63785719871521, "learning_rate": 0.0002, "epoch": 1.642551127637216, "step": 10160}, {"loss": 0.7355, "grad_norm": 0.6379287838935852, "learning_rate": 0.0002, "epoch": 1.6441678118179612, "step": 10170}, {"loss": 0.7359, "grad_norm": 0.6793403029441833, "learning_rate": 0.0002, "epoch": 1.6457844959987067, "step": 10180}, {"loss": 0.7402, "grad_norm": 0.6099132895469666, "learning_rate": 0.0002, "epoch": 1.647401180179452, "step": 10190}, {"loss": 0.7353, "grad_norm": 0.5869854092597961, "learning_rate": 0.0002, "epoch": 1.6490178643601974, "step": 10200}, {"loss": 0.8308, "grad_norm": 0.7716999053955078, "learning_rate": 0.0002, "epoch": 1.6506345485409426, "step": 10210}, {"loss": 0.7215, "grad_norm": 0.6854110360145569, "learning_rate": 0.0002, "epoch": 1.6522512327216878, "step": 10220}, {"loss": 0.782, "grad_norm": 0.6957170367240906, "learning_rate": 0.0002, "epoch": 1.653867916902433, "step": 10230}, {"loss": 0.7282, "grad_norm": 0.6932903528213501, "learning_rate": 0.0002, "epoch": 1.6554846010831783, "step": 10240}, {"loss": 0.7478, "grad_norm": 0.7713165283203125, "learning_rate": 0.0002, "epoch": 1.6571012852639235, "step": 10250}, {"loss": 0.7099, "grad_norm": 0.7455793619155884, "learning_rate": 0.0002, "epoch": 1.658717969444669, "step": 10260}, {"loss": 0.7524, "grad_norm": 0.5464168190956116, "learning_rate": 0.0002, "epoch": 1.6603346536254144, "step": 10270}, {"loss": 0.7328, "grad_norm": 0.6782926321029663, "learning_rate": 0.0002, "epoch": 1.6619513378061597, "step": 10280}, {"loss": 0.7801, "grad_norm": 0.7962649464607239, "learning_rate": 0.0002, "epoch": 1.663568021986905, "step": 10290}, {"loss": 0.7142, "grad_norm": 0.6814526319503784, "learning_rate": 0.0002, "epoch": 1.6651847061676501, "step": 10300}, {"loss": 0.7285, "grad_norm": 0.656895101070404, "learning_rate": 0.0002, "epoch": 1.6668013903483954, "step": 10310}, {"loss": 0.7358, "grad_norm": 0.6085672378540039, "learning_rate": 0.0002, "epoch": 1.6684180745291406, "step": 10320}, {"loss": 0.7074, "grad_norm": 0.585508406162262, "learning_rate": 0.0002, "epoch": 1.670034758709886, "step": 10330}, {"loss": 0.7604, "grad_norm": 0.6930184364318848, "learning_rate": 0.0002, "epoch": 1.6716514428906313, "step": 10340}, {"loss": 0.7169, "grad_norm": 0.575663149356842, "learning_rate": 0.0002, "epoch": 1.6732681270713767, "step": 10350}, {"loss": 0.7198, "grad_norm": 0.582502543926239, "learning_rate": 0.0002, "epoch": 1.674884811252122, "step": 10360}, {"loss": 0.7793, "grad_norm": 0.5668916702270508, "learning_rate": 0.0002, "epoch": 1.6765014954328672, "step": 10370}, {"loss": 0.7478, "grad_norm": 0.6070065498352051, "learning_rate": 0.0002, "epoch": 1.6781181796136124, "step": 10380}, {"loss": 0.7939, "grad_norm": 0.6141316294670105, "learning_rate": 0.0002, "epoch": 1.6797348637943577, "step": 10390}, {"loss": 0.7573, "grad_norm": 0.8359124064445496, "learning_rate": 0.0002, "epoch": 1.6813515479751031, "step": 10400}, {"loss": 0.7488, "grad_norm": 0.5378185510635376, "learning_rate": 0.0002, "epoch": 1.6829682321558483, "step": 10410}, {"loss": 0.7588, "grad_norm": 0.6959536075592041, "learning_rate": 0.0002, "epoch": 1.6845849163365938, "step": 10420}, {"loss": 0.7872, "grad_norm": 0.6514357328414917, "learning_rate": 0.0002, "epoch": 1.686201600517339, "step": 10430}, {"loss": 0.725, "grad_norm": 0.7706646919250488, "learning_rate": 0.0002, "epoch": 1.6878182846980843, "step": 10440}, {"loss": 0.7673, "grad_norm": 0.6183337569236755, "learning_rate": 0.0002, "epoch": 1.6894349688788295, "step": 10450}, {"loss": 0.7566, "grad_norm": 0.6123278141021729, "learning_rate": 0.0002, "epoch": 1.6910516530595747, "step": 10460}, {"loss": 0.7169, "grad_norm": 0.6894851326942444, "learning_rate": 0.0002, "epoch": 1.69266833724032, "step": 10470}, {"loss": 0.7435, "grad_norm": 0.7497312426567078, "learning_rate": 0.0002, "epoch": 1.6942850214210654, "step": 10480}, {"loss": 0.7544, "grad_norm": 0.5968214273452759, "learning_rate": 0.0002, "epoch": 1.6959017056018106, "step": 10490}, {"loss": 0.6793, "grad_norm": 0.6747927069664001, "learning_rate": 0.0002, "epoch": 1.697518389782556, "step": 10500}, {"loss": 0.7415, "grad_norm": 0.5708310008049011, "learning_rate": 0.0002, "epoch": 1.6991350739633013, "step": 10510}, {"loss": 0.7385, "grad_norm": 0.606526792049408, "learning_rate": 0.0002, "epoch": 1.7007517581440466, "step": 10520}, {"loss": 0.7204, "grad_norm": 0.662011981010437, "learning_rate": 0.0002, "epoch": 1.7023684423247918, "step": 10530}, {"loss": 0.7999, "grad_norm": 0.7583045363426208, "learning_rate": 0.0002, "epoch": 1.703985126505537, "step": 10540}, {"loss": 0.7563, "grad_norm": 0.721632182598114, "learning_rate": 0.0002, "epoch": 1.7056018106862825, "step": 10550}, {"loss": 0.7407, "grad_norm": 0.6107715368270874, "learning_rate": 0.0002, "epoch": 1.7072184948670277, "step": 10560}, {"loss": 0.7519, "grad_norm": 0.6652471423149109, "learning_rate": 0.0002, "epoch": 1.7088351790477732, "step": 10570}, {"loss": 0.7767, "grad_norm": 0.6308087110519409, "learning_rate": 0.0002, "epoch": 1.7104518632285184, "step": 10580}, {"loss": 0.7659, "grad_norm": 0.5464386940002441, "learning_rate": 0.0002, "epoch": 1.7120685474092636, "step": 10590}, {"loss": 0.7063, "grad_norm": 0.6558911204338074, "learning_rate": 0.0002, "epoch": 1.7136852315900089, "step": 10600}, {"loss": 0.7126, "grad_norm": 0.5665024518966675, "learning_rate": 0.0002, "epoch": 1.715301915770754, "step": 10610}, {"loss": 0.6958, "grad_norm": 0.7888094186782837, "learning_rate": 0.0002, "epoch": 1.7169185999514993, "step": 10620}, {"loss": 0.7785, "grad_norm": 0.7084909081459045, "learning_rate": 0.0002, "epoch": 1.7185352841322448, "step": 10630}, {"loss": 0.7557, "grad_norm": 0.7982324361801147, "learning_rate": 0.0002, "epoch": 1.7201519683129902, "step": 10640}, {"loss": 0.7345, "grad_norm": 0.6418732404708862, "learning_rate": 0.0002, "epoch": 1.7217686524937355, "step": 10650}, {"loss": 0.7734, "grad_norm": 0.7636681795120239, "learning_rate": 0.0002, "epoch": 1.7233853366744807, "step": 10660}, {"loss": 0.7541, "grad_norm": 0.5646875500679016, "learning_rate": 0.0002, "epoch": 1.725002020855226, "step": 10670}, {"loss": 0.7642, "grad_norm": 0.5231260657310486, "learning_rate": 0.0002, "epoch": 1.7266187050359711, "step": 10680}, {"loss": 0.7846, "grad_norm": 0.7635011672973633, "learning_rate": 0.0002, "epoch": 1.7282353892167164, "step": 10690}, {"loss": 0.7471, "grad_norm": 0.7518259286880493, "learning_rate": 0.0002, "epoch": 1.7298520733974618, "step": 10700}, {"loss": 0.751, "grad_norm": 0.7295602560043335, "learning_rate": 0.0002, "epoch": 1.731468757578207, "step": 10710}, {"loss": 0.731, "grad_norm": 0.6984632015228271, "learning_rate": 0.0002, "epoch": 1.7330854417589525, "step": 10720}, {"loss": 0.7921, "grad_norm": 0.6198219060897827, "learning_rate": 0.0002, "epoch": 1.7347021259396977, "step": 10730}, {"loss": 0.7642, "grad_norm": 0.6957576274871826, "learning_rate": 0.0002, "epoch": 1.736318810120443, "step": 10740}, {"loss": 0.7917, "grad_norm": 0.6430263519287109, "learning_rate": 0.0002, "epoch": 1.7379354943011882, "step": 10750}, {"loss": 0.7156, "grad_norm": 0.6134995222091675, "learning_rate": 0.0002, "epoch": 1.7395521784819334, "step": 10760}, {"loss": 0.7584, "grad_norm": 0.7209452986717224, "learning_rate": 0.0002, "epoch": 1.741168862662679, "step": 10770}, {"loss": 0.7528, "grad_norm": 0.6735447645187378, "learning_rate": 0.0002, "epoch": 1.7427855468434241, "step": 10780}, {"loss": 0.756, "grad_norm": 0.5605693459510803, "learning_rate": 0.0002, "epoch": 1.7444022310241696, "step": 10790}, {"loss": 0.7759, "grad_norm": 0.6882363557815552, "learning_rate": 0.0002, "epoch": 1.7460189152049148, "step": 10800}, {"loss": 0.7544, "grad_norm": 0.6386259198188782, "learning_rate": 0.0002, "epoch": 1.74763559938566, "step": 10810}, {"loss": 0.7697, "grad_norm": 0.6529015302658081, "learning_rate": 0.0002, "epoch": 1.7492522835664053, "step": 10820}, {"loss": 0.7219, "grad_norm": 0.5664082765579224, "learning_rate": 0.0002, "epoch": 1.7508689677471505, "step": 10830}, {"loss": 0.7586, "grad_norm": 0.7532684206962585, "learning_rate": 0.0002, "epoch": 1.7524856519278957, "step": 10840}, {"loss": 0.6919, "grad_norm": 0.77171391248703, "learning_rate": 0.0002, "epoch": 1.7541023361086412, "step": 10850}, {"loss": 0.785, "grad_norm": 0.7255431413650513, "learning_rate": 0.0002, "epoch": 1.7557190202893864, "step": 10860}, {"loss": 0.7458, "grad_norm": 0.763083279132843, "learning_rate": 0.0002, "epoch": 1.7573357044701319, "step": 10870}, {"loss": 0.7846, "grad_norm": 0.6042402982711792, "learning_rate": 0.0002, "epoch": 1.758952388650877, "step": 10880}, {"loss": 0.7027, "grad_norm": 0.7642518281936646, "learning_rate": 0.0002, "epoch": 1.7605690728316223, "step": 10890}, {"loss": 0.746, "grad_norm": 0.6347904801368713, "learning_rate": 0.0002, "epoch": 1.7621857570123676, "step": 10900}, {"loss": 0.7458, "grad_norm": 0.5371627807617188, "learning_rate": 0.0002, "epoch": 1.7638024411931128, "step": 10910}, {"loss": 0.7466, "grad_norm": 0.6840225458145142, "learning_rate": 0.0002, "epoch": 1.7654191253738583, "step": 10920}, {"loss": 0.725, "grad_norm": 0.5288469195365906, "learning_rate": 0.0002, "epoch": 1.7670358095546035, "step": 10930}, {"loss": 0.7863, "grad_norm": 0.69020676612854, "learning_rate": 0.0002, "epoch": 1.768652493735349, "step": 10940}, {"loss": 0.7468, "grad_norm": 0.5943242311477661, "learning_rate": 0.0002, "epoch": 1.7702691779160942, "step": 10950}, {"loss": 0.7244, "grad_norm": 0.5616418123245239, "learning_rate": 0.0002, "epoch": 1.7718858620968394, "step": 10960}, {"loss": 0.7137, "grad_norm": 0.7209470868110657, "learning_rate": 0.0002, "epoch": 1.7735025462775846, "step": 10970}, {"loss": 0.7459, "grad_norm": 0.6657957434654236, "learning_rate": 0.0002, "epoch": 1.7751192304583299, "step": 10980}, {"loss": 0.7076, "grad_norm": 0.6469064950942993, "learning_rate": 0.0002, "epoch": 1.776735914639075, "step": 10990}, {"loss": 0.7321, "grad_norm": 0.6615678071975708, "learning_rate": 0.0002, "epoch": 1.7783525988198206, "step": 11000}, {"loss": 0.747, "grad_norm": 0.6722439527511597, "learning_rate": 0.0002, "epoch": 1.779969283000566, "step": 11010}, {"loss": 0.7302, "grad_norm": 0.634136974811554, "learning_rate": 0.0002, "epoch": 1.7815859671813112, "step": 11020}, {"loss": 0.8105, "grad_norm": 0.6024377346038818, "learning_rate": 0.0002, "epoch": 1.7832026513620565, "step": 11030}, {"loss": 0.7855, "grad_norm": 0.6909403800964355, "learning_rate": 0.0002, "epoch": 1.7848193355428017, "step": 11040}, {"loss": 0.7471, "grad_norm": 0.7148767709732056, "learning_rate": 0.0002, "epoch": 1.786436019723547, "step": 11050}, {"loss": 0.7145, "grad_norm": 0.7442979216575623, "learning_rate": 0.0002, "epoch": 1.7880527039042922, "step": 11060}, {"loss": 0.7215, "grad_norm": 0.6830431818962097, "learning_rate": 0.0002, "epoch": 1.7896693880850376, "step": 11070}, {"loss": 0.7625, "grad_norm": 0.9172667264938354, "learning_rate": 0.0002, "epoch": 1.7912860722657828, "step": 11080}, {"loss": 0.76, "grad_norm": 0.6799490451812744, "learning_rate": 0.0002, "epoch": 1.7929027564465283, "step": 11090}, {"loss": 0.7716, "grad_norm": 0.7617024779319763, "learning_rate": 0.0002, "epoch": 1.7945194406272735, "step": 11100}, {"loss": 0.7586, "grad_norm": 0.7701810002326965, "learning_rate": 0.0002, "epoch": 1.7961361248080188, "step": 11110}, {"loss": 0.7843, "grad_norm": 0.7454385757446289, "learning_rate": 0.0002, "epoch": 1.797752808988764, "step": 11120}, {"loss": 0.7873, "grad_norm": 0.6121436953544617, "learning_rate": 0.0002, "epoch": 1.7993694931695092, "step": 11130}, {"loss": 0.7305, "grad_norm": 0.6237571835517883, "learning_rate": 0.0002, "epoch": 1.8009861773502547, "step": 11140}, {"loss": 0.6827, "grad_norm": 0.6818515658378601, "learning_rate": 0.0002, "epoch": 1.802602861531, "step": 11150}, {"loss": 0.6876, "grad_norm": 0.7768308520317078, "learning_rate": 0.0002, "epoch": 1.8042195457117454, "step": 11160}, {"loss": 0.7533, "grad_norm": 0.6875537633895874, "learning_rate": 0.0002, "epoch": 1.8058362298924906, "step": 11170}, {"loss": 0.761, "grad_norm": 0.7950584888458252, "learning_rate": 0.0002, "epoch": 1.8074529140732358, "step": 11180}, {"loss": 0.7623, "grad_norm": 0.8210248351097107, "learning_rate": 0.0002, "epoch": 1.809069598253981, "step": 11190}, {"loss": 0.7556, "grad_norm": 0.6674110889434814, "learning_rate": 0.0002, "epoch": 1.8106862824347263, "step": 11200}, {"loss": 0.7663, "grad_norm": 0.6261674761772156, "learning_rate": 0.0002, "epoch": 1.8123029666154715, "step": 11210}, {"loss": 0.7122, "grad_norm": 0.6484741568565369, "learning_rate": 0.0002, "epoch": 1.813919650796217, "step": 11220}, {"loss": 0.7718, "grad_norm": 0.6231244206428528, "learning_rate": 0.0002, "epoch": 1.8155363349769622, "step": 11230}, {"loss": 0.7152, "grad_norm": 0.7243146896362305, "learning_rate": 0.0002, "epoch": 1.8171530191577077, "step": 11240}, {"loss": 0.7448, "grad_norm": 0.6776193380355835, "learning_rate": 0.0002, "epoch": 1.818769703338453, "step": 11250}, {"loss": 0.7317, "grad_norm": 0.5973618030548096, "learning_rate": 0.0002, "epoch": 1.8203863875191981, "step": 11260}, {"loss": 0.7961, "grad_norm": 0.6451361179351807, "learning_rate": 0.0002, "epoch": 1.8220030716999434, "step": 11270}, {"loss": 0.7611, "grad_norm": 0.5963068008422852, "learning_rate": 0.0002, "epoch": 1.8236197558806886, "step": 11280}, {"loss": 0.7466, "grad_norm": 0.536902129650116, "learning_rate": 0.0002, "epoch": 1.825236440061434, "step": 11290}, {"loss": 0.708, "grad_norm": 0.6993787288665771, "learning_rate": 0.0002, "epoch": 1.8268531242421793, "step": 11300}, {"loss": 0.7153, "grad_norm": 0.6135255098342896, "learning_rate": 0.0002, "epoch": 1.8284698084229247, "step": 11310}, {"loss": 0.7423, "grad_norm": 0.6057423949241638, "learning_rate": 0.0002, "epoch": 1.83008649260367, "step": 11320}, {"loss": 0.735, "grad_norm": 0.6598812341690063, "learning_rate": 0.0002, "epoch": 1.8317031767844152, "step": 11330}, {"loss": 0.7278, "grad_norm": 0.6075948476791382, "learning_rate": 0.0002, "epoch": 1.8333198609651604, "step": 11340}, {"loss": 0.7846, "grad_norm": 0.7065447568893433, "learning_rate": 0.0002, "epoch": 1.8349365451459057, "step": 11350}, {"loss": 0.7365, "grad_norm": 0.680526614189148, "learning_rate": 0.0002, "epoch": 1.8365532293266509, "step": 11360}, {"loss": 0.7152, "grad_norm": 0.6356695294380188, "learning_rate": 0.0002, "epoch": 1.8381699135073963, "step": 11370}, {"loss": 0.721, "grad_norm": 0.6399052143096924, "learning_rate": 0.0002, "epoch": 1.8397865976881416, "step": 11380}, {"loss": 0.7618, "grad_norm": 0.6125704050064087, "learning_rate": 0.0002, "epoch": 1.841403281868887, "step": 11390}, {"loss": 0.755, "grad_norm": 0.7124643325805664, "learning_rate": 0.0002, "epoch": 1.8430199660496323, "step": 11400}, {"loss": 0.7972, "grad_norm": 0.6099604964256287, "learning_rate": 0.0002, "epoch": 1.8446366502303775, "step": 11410}, {"loss": 0.7187, "grad_norm": 0.7338208556175232, "learning_rate": 0.0002, "epoch": 1.8462533344111227, "step": 11420}, {"loss": 0.7007, "grad_norm": 0.7534668445587158, "learning_rate": 0.0002, "epoch": 1.847870018591868, "step": 11430}, {"loss": 0.7464, "grad_norm": 0.6135470271110535, "learning_rate": 0.0002, "epoch": 1.8494867027726134, "step": 11440}, {"loss": 0.7955, "grad_norm": 0.6229309439659119, "learning_rate": 0.0002, "epoch": 1.8511033869533586, "step": 11450}, {"loss": 0.7594, "grad_norm": 0.706423282623291, "learning_rate": 0.0002, "epoch": 1.852720071134104, "step": 11460}, {"loss": 0.7411, "grad_norm": 0.5460049510002136, "learning_rate": 0.0002, "epoch": 1.8543367553148493, "step": 11470}, {"loss": 0.7416, "grad_norm": 0.6616711020469666, "learning_rate": 0.0002, "epoch": 1.8559534394955945, "step": 11480}, {"loss": 0.729, "grad_norm": 0.6372783184051514, "learning_rate": 0.0002, "epoch": 1.8575701236763398, "step": 11490}, {"loss": 0.7333, "grad_norm": 0.7162668108940125, "learning_rate": 0.0002, "epoch": 1.859186807857085, "step": 11500}, {"loss": 0.7747, "grad_norm": 0.6605209708213806, "learning_rate": 0.0002, "epoch": 1.8608034920378305, "step": 11510}, {"loss": 0.7258, "grad_norm": 0.6933956742286682, "learning_rate": 0.0002, "epoch": 1.8624201762185757, "step": 11520}, {"loss": 0.7243, "grad_norm": 0.6582090854644775, "learning_rate": 0.0002, "epoch": 1.8640368603993211, "step": 11530}, {"loss": 0.7313, "grad_norm": 0.6416500806808472, "learning_rate": 0.0002, "epoch": 1.8656535445800664, "step": 11540}, {"loss": 0.7372, "grad_norm": 0.5434312224388123, "learning_rate": 0.0002, "epoch": 1.8672702287608116, "step": 11550}, {"loss": 0.7635, "grad_norm": 0.6827567219734192, "learning_rate": 0.0002, "epoch": 1.8688869129415568, "step": 11560}, {"loss": 0.7137, "grad_norm": 0.7354370951652527, "learning_rate": 0.0002, "epoch": 1.870503597122302, "step": 11570}, {"loss": 0.7526, "grad_norm": 0.590372622013092, "learning_rate": 0.0002, "epoch": 1.8721202813030473, "step": 11580}, {"loss": 0.731, "grad_norm": 0.853183925151825, "learning_rate": 0.0002, "epoch": 1.8737369654837928, "step": 11590}, {"loss": 0.7487, "grad_norm": 0.822678804397583, "learning_rate": 0.0002, "epoch": 1.875353649664538, "step": 11600}, {"loss": 0.7427, "grad_norm": 0.6591550707817078, "learning_rate": 0.0002, "epoch": 1.8769703338452834, "step": 11610}, {"loss": 0.7054, "grad_norm": 0.7475301623344421, "learning_rate": 0.0002, "epoch": 1.8785870180260287, "step": 11620}, {"loss": 0.811, "grad_norm": 0.6390765309333801, "learning_rate": 0.0002, "epoch": 1.880203702206774, "step": 11630}, {"loss": 0.7531, "grad_norm": 0.6589758992195129, "learning_rate": 0.0002, "epoch": 1.8818203863875191, "step": 11640}, {"loss": 0.7475, "grad_norm": 0.6765508651733398, "learning_rate": 0.0002, "epoch": 1.8834370705682644, "step": 11650}, {"loss": 0.738, "grad_norm": 0.6527857780456543, "learning_rate": 0.0002, "epoch": 1.8850537547490098, "step": 11660}, {"loss": 0.7504, "grad_norm": 0.6642923951148987, "learning_rate": 0.0002, "epoch": 1.886670438929755, "step": 11670}, {"loss": 0.7701, "grad_norm": 0.6945584416389465, "learning_rate": 0.0002, "epoch": 1.8882871231105005, "step": 11680}, {"loss": 0.7711, "grad_norm": 0.694018542766571, "learning_rate": 0.0002, "epoch": 1.8899038072912457, "step": 11690}, {"loss": 0.7195, "grad_norm": 0.7237417101860046, "learning_rate": 0.0002, "epoch": 1.891520491471991, "step": 11700}, {"loss": 0.7491, "grad_norm": 0.7401309609413147, "learning_rate": 0.0002, "epoch": 1.8931371756527362, "step": 11710}, {"loss": 0.805, "grad_norm": 0.6537784337997437, "learning_rate": 0.0002, "epoch": 1.8947538598334814, "step": 11720}, {"loss": 0.793, "grad_norm": 0.7398539185523987, "learning_rate": 0.0002, "epoch": 1.8963705440142267, "step": 11730}, {"loss": 0.7561, "grad_norm": 0.6696075797080994, "learning_rate": 0.0002, "epoch": 1.8979872281949721, "step": 11740}, {"loss": 0.7353, "grad_norm": 0.6014142036437988, "learning_rate": 0.0002, "epoch": 1.8996039123757174, "step": 11750}, {"loss": 0.7714, "grad_norm": 0.7023524641990662, "learning_rate": 0.0002, "epoch": 1.9012205965564628, "step": 11760}, {"loss": 0.7088, "grad_norm": 0.739973783493042, "learning_rate": 0.0002, "epoch": 1.902837280737208, "step": 11770}, {"loss": 0.7848, "grad_norm": 0.5576770901679993, "learning_rate": 0.0002, "epoch": 1.9044539649179533, "step": 11780}, {"loss": 0.7483, "grad_norm": 0.6907393932342529, "learning_rate": 0.0002, "epoch": 1.9060706490986985, "step": 11790}, {"loss": 0.7827, "grad_norm": 0.6934581995010376, "learning_rate": 0.0002, "epoch": 1.9076873332794437, "step": 11800}, {"loss": 0.7199, "grad_norm": 0.591774582862854, "learning_rate": 0.0002, "epoch": 1.9093040174601892, "step": 11810}, {"loss": 0.7333, "grad_norm": 0.6249791383743286, "learning_rate": 0.0002, "epoch": 1.9109207016409344, "step": 11820}, {"loss": 0.7581, "grad_norm": 0.6755744218826294, "learning_rate": 0.0002, "epoch": 1.9125373858216799, "step": 11830}, {"loss": 0.696, "grad_norm": 0.7286285161972046, "learning_rate": 0.0002, "epoch": 1.914154070002425, "step": 11840}, {"loss": 0.7509, "grad_norm": 0.7867850065231323, "learning_rate": 0.0002, "epoch": 1.9157707541831703, "step": 11850}, {"loss": 0.735, "grad_norm": 0.6283972859382629, "learning_rate": 0.0002, "epoch": 1.9173874383639156, "step": 11860}, {"loss": 0.7296, "grad_norm": 0.605823814868927, "learning_rate": 0.0002, "epoch": 1.9190041225446608, "step": 11870}, {"loss": 0.6598, "grad_norm": 0.5927976965904236, "learning_rate": 0.0002, "epoch": 1.920620806725406, "step": 11880}, {"loss": 0.7649, "grad_norm": 0.5974002480506897, "learning_rate": 0.0002, "epoch": 1.9222374909061515, "step": 11890}, {"loss": 0.7843, "grad_norm": 0.7091866135597229, "learning_rate": 0.0002, "epoch": 1.923854175086897, "step": 11900}, {"loss": 0.775, "grad_norm": 0.72496497631073, "learning_rate": 0.0002, "epoch": 1.9254708592676422, "step": 11910}, {"loss": 0.7153, "grad_norm": 0.6131896376609802, "learning_rate": 0.0002, "epoch": 1.9270875434483874, "step": 11920}, {"loss": 0.7228, "grad_norm": 0.6556436419487, "learning_rate": 0.0002, "epoch": 1.9287042276291326, "step": 11930}, {"loss": 0.7319, "grad_norm": 0.622932493686676, "learning_rate": 0.0002, "epoch": 1.9303209118098779, "step": 11940}, {"loss": 0.7592, "grad_norm": 0.6618631482124329, "learning_rate": 0.0002, "epoch": 1.931937595990623, "step": 11950}, {"loss": 0.8332, "grad_norm": 0.630966305732727, "learning_rate": 0.0002, "epoch": 1.9335542801713685, "step": 11960}, {"loss": 0.6854, "grad_norm": 0.6336734890937805, "learning_rate": 0.0002, "epoch": 1.9351709643521138, "step": 11970}, {"loss": 0.7433, "grad_norm": 0.655403196811676, "learning_rate": 0.0002, "epoch": 1.9367876485328592, "step": 11980}, {"loss": 0.7282, "grad_norm": 0.5640574097633362, "learning_rate": 0.0002, "epoch": 1.9384043327136045, "step": 11990}, {"loss": 0.7289, "grad_norm": 0.6322951316833496, "learning_rate": 0.0002, "epoch": 1.9400210168943497, "step": 12000}, {"loss": 0.7627, "grad_norm": 0.615703821182251, "learning_rate": 0.0002, "epoch": 1.941637701075095, "step": 12010}, {"loss": 0.786, "grad_norm": 0.6487536430358887, "learning_rate": 0.0002, "epoch": 1.9432543852558402, "step": 12020}, {"loss": 0.7435, "grad_norm": 0.9209630489349365, "learning_rate": 0.0002, "epoch": 1.9448710694365856, "step": 12030}, {"loss": 0.7274, "grad_norm": 0.67485511302948, "learning_rate": 0.0002, "epoch": 1.9464877536173308, "step": 12040}, {"loss": 0.7551, "grad_norm": 0.6831230521202087, "learning_rate": 0.0002, "epoch": 1.9481044377980763, "step": 12050}, {"loss": 0.7546, "grad_norm": 0.6578302383422852, "learning_rate": 0.0002, "epoch": 1.9497211219788215, "step": 12060}, {"loss": 0.6989, "grad_norm": 0.9975938200950623, "learning_rate": 0.0002, "epoch": 1.9513378061595668, "step": 12070}, {"loss": 0.7952, "grad_norm": 0.6637365221977234, "learning_rate": 0.0002, "epoch": 1.952954490340312, "step": 12080}, {"loss": 0.7482, "grad_norm": 0.605707049369812, "learning_rate": 0.0002, "epoch": 1.9545711745210572, "step": 12090}, {"loss": 0.7768, "grad_norm": 0.6584440469741821, "learning_rate": 0.0002, "epoch": 1.9561878587018025, "step": 12100}, {"loss": 0.7187, "grad_norm": 0.6070835590362549, "learning_rate": 0.0002, "epoch": 1.957804542882548, "step": 12110}, {"loss": 0.7491, "grad_norm": 0.7862601280212402, "learning_rate": 0.0002, "epoch": 1.9594212270632931, "step": 12120}, {"loss": 0.7972, "grad_norm": 0.8175255060195923, "learning_rate": 0.0002, "epoch": 1.9610379112440386, "step": 12130}, {"loss": 0.7242, "grad_norm": 0.5648472905158997, "learning_rate": 0.0002, "epoch": 1.9626545954247838, "step": 12140}, {"loss": 0.7321, "grad_norm": 0.6591973304748535, "learning_rate": 0.0002, "epoch": 1.964271279605529, "step": 12150}, {"loss": 0.739, "grad_norm": 0.5960676074028015, "learning_rate": 0.0002, "epoch": 1.9658879637862743, "step": 12160}, {"loss": 0.7254, "grad_norm": 0.7272544503211975, "learning_rate": 0.0002, "epoch": 1.9675046479670195, "step": 12170}, {"loss": 0.7376, "grad_norm": 0.7176699042320251, "learning_rate": 0.0002, "epoch": 1.969121332147765, "step": 12180}, {"loss": 0.7525, "grad_norm": 0.6927123665809631, "learning_rate": 0.0002, "epoch": 1.9707380163285102, "step": 12190}, {"loss": 0.7318, "grad_norm": 0.5536034107208252, "learning_rate": 0.0002, "epoch": 1.9723547005092557, "step": 12200}, {"loss": 0.7737, "grad_norm": 0.8348390460014343, "learning_rate": 0.0002, "epoch": 1.9739713846900009, "step": 12210}, {"loss": 0.7494, "grad_norm": 0.6591181755065918, "learning_rate": 0.0002, "epoch": 1.9755880688707461, "step": 12220}, {"loss": 0.763, "grad_norm": 1.0624109506607056, "learning_rate": 0.0002, "epoch": 1.9772047530514913, "step": 12230}, {"loss": 0.7541, "grad_norm": 0.9265586137771606, "learning_rate": 0.0002, "epoch": 1.9788214372322366, "step": 12240}, {"loss": 0.7533, "grad_norm": 0.5998196005821228, "learning_rate": 0.0002, "epoch": 1.9804381214129818, "step": 12250}, {"loss": 0.7225, "grad_norm": 0.6960851550102234, "learning_rate": 0.0002, "epoch": 1.9820548055937273, "step": 12260}, {"loss": 0.7398, "grad_norm": 0.7674502730369568, "learning_rate": 0.0002, "epoch": 1.9836714897744727, "step": 12270}, {"loss": 0.7185, "grad_norm": 0.6407275795936584, "learning_rate": 0.0002, "epoch": 1.985288173955218, "step": 12280}, {"loss": 0.7382, "grad_norm": 0.6673079133033752, "learning_rate": 0.0002, "epoch": 1.9869048581359632, "step": 12290}, {"loss": 0.7326, "grad_norm": 0.6989844441413879, "learning_rate": 0.0002, "epoch": 1.9885215423167084, "step": 12300}, {"loss": 0.7559, "grad_norm": 0.7564442157745361, "learning_rate": 0.0002, "epoch": 1.9901382264974536, "step": 12310}, {"loss": 0.7719, "grad_norm": 0.6385478973388672, "learning_rate": 0.0002, "epoch": 1.9917549106781989, "step": 12320}, {"loss": 0.7369, "grad_norm": 0.7193717956542969, "learning_rate": 0.0002, "epoch": 1.9933715948589443, "step": 12330}, {"loss": 0.7583, "grad_norm": 0.7987112402915955, "learning_rate": 0.0002, "epoch": 1.9949882790396896, "step": 12340}, {"loss": 0.7793, "grad_norm": 0.7260826826095581, "learning_rate": 0.0002, "epoch": 1.996604963220435, "step": 12350}, {"loss": 0.7505, "grad_norm": 0.7968255281448364, "learning_rate": 0.0002, "epoch": 1.9982216474011802, "step": 12360}, {"loss": 0.717, "grad_norm": 0.6893062591552734, "learning_rate": 0.0002, "epoch": 1.9998383315819255, "step": 12370}, {"eval_loss": 1.1044032573699951, "eval_runtime": 122.1508, "eval_samples_per_second": 6.001, "eval_steps_per_second": 0.753, "epoch": 2.0, "step": 12371}, {"loss": 0.6604, "grad_norm": 0.7775409817695618, "learning_rate": 0.0002, "epoch": 2.0014550157626707, "step": 12380}, {"loss": 0.6845, "grad_norm": 0.76218581199646, "learning_rate": 0.0002, "epoch": 2.003071699943416, "step": 12390}, {"loss": 0.6909, "grad_norm": 0.5677764415740967, "learning_rate": 0.0002, "epoch": 2.004688384124161, "step": 12400}, {"loss": 0.6584, "grad_norm": 0.808442234992981, "learning_rate": 0.0002, "epoch": 2.006305068304907, "step": 12410}, {"loss": 0.659, "grad_norm": 0.7144765257835388, "learning_rate": 0.0002, "epoch": 2.007921752485652, "step": 12420}, {"loss": 0.6666, "grad_norm": 0.6914031505584717, "learning_rate": 0.0002, "epoch": 2.0095384366663973, "step": 12430}, {"loss": 0.6596, "grad_norm": 0.7581454515457153, "learning_rate": 0.0002, "epoch": 2.0111551208471425, "step": 12440}, {"loss": 0.6785, "grad_norm": 0.8388504981994629, "learning_rate": 0.0002, "epoch": 2.0127718050278878, "step": 12450}, {"loss": 0.6942, "grad_norm": 0.6716406941413879, "learning_rate": 0.0002, "epoch": 2.014388489208633, "step": 12460}, {"loss": 0.6441, "grad_norm": 0.898902416229248, "learning_rate": 0.0002, "epoch": 2.0160051733893782, "step": 12470}, {"loss": 0.6655, "grad_norm": 0.6432679891586304, "learning_rate": 0.0002, "epoch": 2.0176218575701235, "step": 12480}, {"loss": 0.6521, "grad_norm": 0.8021109104156494, "learning_rate": 0.0002, "epoch": 2.019238541750869, "step": 12490}, {"loss": 0.6581, "grad_norm": 0.7039216756820679, "learning_rate": 0.0002, "epoch": 2.0208552259316144, "step": 12500}, {"loss": 0.6521, "grad_norm": 0.646531879901886, "learning_rate": 0.0002, "epoch": 2.0224719101123596, "step": 12510}, {"loss": 0.6302, "grad_norm": 0.783704400062561, "learning_rate": 0.0002, "epoch": 2.024088594293105, "step": 12520}, {"loss": 0.6288, "grad_norm": 0.8805046677589417, "learning_rate": 0.0002, "epoch": 2.02570527847385, "step": 12530}, {"loss": 0.6288, "grad_norm": 0.7289270758628845, "learning_rate": 0.0002, "epoch": 2.0273219626545953, "step": 12540}, {"loss": 0.6663, "grad_norm": 0.71653151512146, "learning_rate": 0.0002, "epoch": 2.0289386468353405, "step": 12550}, {"loss": 0.625, "grad_norm": 0.73281329870224, "learning_rate": 0.0002, "epoch": 2.030555331016086, "step": 12560}, {"loss": 0.6448, "grad_norm": 0.6657090187072754, "learning_rate": 0.0002, "epoch": 2.0321720151968314, "step": 12570}, {"loss": 0.6983, "grad_norm": 0.8241133093833923, "learning_rate": 0.0002, "epoch": 2.0337886993775767, "step": 12580}, {"loss": 0.6488, "grad_norm": 0.5834135413169861, "learning_rate": 0.0002, "epoch": 2.035405383558322, "step": 12590}, {"loss": 0.6188, "grad_norm": 0.84502112865448, "learning_rate": 0.0002, "epoch": 2.037022067739067, "step": 12600}, {"loss": 0.6349, "grad_norm": 0.8952481746673584, "learning_rate": 0.0002, "epoch": 2.0386387519198124, "step": 12610}, {"loss": 0.6923, "grad_norm": 0.7801461815834045, "learning_rate": 0.0002, "epoch": 2.0402554361005576, "step": 12620}, {"loss": 0.6176, "grad_norm": 0.6788367033004761, "learning_rate": 0.0002, "epoch": 2.041872120281303, "step": 12630}, {"loss": 0.6162, "grad_norm": 0.7241756319999695, "learning_rate": 0.0002, "epoch": 2.0434888044620485, "step": 12640}, {"loss": 0.655, "grad_norm": 0.6933388113975525, "learning_rate": 0.0002, "epoch": 2.0451054886427937, "step": 12650}, {"loss": 0.6431, "grad_norm": 0.8029746413230896, "learning_rate": 0.0002, "epoch": 2.046722172823539, "step": 12660}, {"loss": 0.7164, "grad_norm": 0.946399986743927, "learning_rate": 0.0002, "epoch": 2.048338857004284, "step": 12670}, {"loss": 0.638, "grad_norm": 0.7072678804397583, "learning_rate": 0.0002, "epoch": 2.0499555411850294, "step": 12680}, {"loss": 0.6487, "grad_norm": 0.6810618042945862, "learning_rate": 0.0002, "epoch": 2.0515722253657747, "step": 12690}, {"loss": 0.6554, "grad_norm": 0.7661160230636597, "learning_rate": 0.0002, "epoch": 2.05318890954652, "step": 12700}, {"loss": 0.6799, "grad_norm": 0.6350653767585754, "learning_rate": 0.0002, "epoch": 2.0548055937272656, "step": 12710}, {"loss": 0.6654, "grad_norm": 0.861890971660614, "learning_rate": 0.0002, "epoch": 2.056422277908011, "step": 12720}, {"loss": 0.6286, "grad_norm": 0.6489875912666321, "learning_rate": 0.0002, "epoch": 2.058038962088756, "step": 12730}, {"loss": 0.6811, "grad_norm": 0.8268506526947021, "learning_rate": 0.0002, "epoch": 2.0596556462695013, "step": 12740}, {"loss": 0.6524, "grad_norm": 0.607679545879364, "learning_rate": 0.0002, "epoch": 2.0612723304502465, "step": 12750}, {"loss": 0.6649, "grad_norm": 0.6754153370857239, "learning_rate": 0.0002, "epoch": 2.0628890146309917, "step": 12760}, {"loss": 0.6549, "grad_norm": 0.7263124585151672, "learning_rate": 0.0002, "epoch": 2.064505698811737, "step": 12770}, {"loss": 0.6189, "grad_norm": 0.6986154317855835, "learning_rate": 0.0002, "epoch": 2.0661223829924826, "step": 12780}, {"loss": 0.6723, "grad_norm": 0.7768576741218567, "learning_rate": 0.0002, "epoch": 2.067739067173228, "step": 12790}, {"loss": 0.677, "grad_norm": 0.7546762824058533, "learning_rate": 0.0002, "epoch": 2.069355751353973, "step": 12800}, {"loss": 0.6485, "grad_norm": 0.7588880062103271, "learning_rate": 0.0002, "epoch": 2.0709724355347183, "step": 12810}, {"loss": 0.6989, "grad_norm": 0.7457242608070374, "learning_rate": 0.0002, "epoch": 2.0725891197154636, "step": 12820}, {"loss": 0.6489, "grad_norm": 0.6983516812324524, "learning_rate": 0.0002, "epoch": 2.074205803896209, "step": 12830}, {"loss": 0.651, "grad_norm": 0.7950928807258606, "learning_rate": 0.0002, "epoch": 2.075822488076954, "step": 12840}, {"loss": 0.6603, "grad_norm": 0.9248087406158447, "learning_rate": 0.0002, "epoch": 2.0774391722576993, "step": 12850}, {"loss": 0.6847, "grad_norm": 0.7229493260383606, "learning_rate": 0.0002, "epoch": 2.079055856438445, "step": 12860}, {"loss": 0.6702, "grad_norm": 0.5710847973823547, "learning_rate": 0.0002, "epoch": 2.08067254061919, "step": 12870}, {"loss": 0.6974, "grad_norm": 0.9580423831939697, "learning_rate": 0.0002, "epoch": 2.0822892247999354, "step": 12880}, {"loss": 0.6341, "grad_norm": 0.7399665713310242, "learning_rate": 0.0002, "epoch": 2.0839059089806806, "step": 12890}, {"loss": 0.6993, "grad_norm": 0.7981410622596741, "learning_rate": 0.0002, "epoch": 2.085522593161426, "step": 12900}, {"loss": 0.6976, "grad_norm": 0.870759904384613, "learning_rate": 0.0002, "epoch": 2.087139277342171, "step": 12910}, {"loss": 0.7194, "grad_norm": 0.7001481652259827, "learning_rate": 0.0002, "epoch": 2.0887559615229163, "step": 12920}, {"loss": 0.6383, "grad_norm": 0.6745418310165405, "learning_rate": 0.0002, "epoch": 2.090372645703662, "step": 12930}, {"loss": 0.6519, "grad_norm": 0.7739067673683167, "learning_rate": 0.0002, "epoch": 2.0919893298844072, "step": 12940}, {"loss": 0.6856, "grad_norm": 0.6742934584617615, "learning_rate": 0.0002, "epoch": 2.0936060140651525, "step": 12950}, {"loss": 0.6279, "grad_norm": 0.7270349860191345, "learning_rate": 0.0002, "epoch": 2.0952226982458977, "step": 12960}, {"loss": 0.6783, "grad_norm": 0.7150624394416809, "learning_rate": 0.0002, "epoch": 2.096839382426643, "step": 12970}, {"loss": 0.6093, "grad_norm": 0.7734767198562622, "learning_rate": 0.0002, "epoch": 2.098456066607388, "step": 12980}, {"loss": 0.6534, "grad_norm": 0.7618662118911743, "learning_rate": 0.0002, "epoch": 2.1000727507881334, "step": 12990}, {"loss": 0.6707, "grad_norm": 0.6557944416999817, "learning_rate": 0.0002, "epoch": 2.101689434968879, "step": 13000}, {"loss": 0.7268, "grad_norm": 0.8786448240280151, "learning_rate": 0.0002, "epoch": 2.1033061191496243, "step": 13010}, {"loss": 0.6677, "grad_norm": 0.6878724098205566, "learning_rate": 0.0002, "epoch": 2.1049228033303695, "step": 13020}, {"loss": 0.6824, "grad_norm": 0.822318971157074, "learning_rate": 0.0002, "epoch": 2.1065394875111147, "step": 13030}, {"loss": 0.6228, "grad_norm": 0.831468939781189, "learning_rate": 0.0002, "epoch": 2.10815617169186, "step": 13040}, {"loss": 0.6511, "grad_norm": 0.7699505686759949, "learning_rate": 0.0002, "epoch": 2.109772855872605, "step": 13050}, {"loss": 0.6671, "grad_norm": 0.7559016346931458, "learning_rate": 0.0002, "epoch": 2.1113895400533504, "step": 13060}, {"loss": 0.6215, "grad_norm": 0.6942209601402283, "learning_rate": 0.0002, "epoch": 2.1130062242340957, "step": 13070}, {"loss": 0.6449, "grad_norm": 0.6098947525024414, "learning_rate": 0.0002, "epoch": 2.1146229084148414, "step": 13080}, {"loss": 0.7091, "grad_norm": 0.6499016284942627, "learning_rate": 0.0002, "epoch": 2.1162395925955866, "step": 13090}, {"loss": 0.6247, "grad_norm": 0.7719953060150146, "learning_rate": 0.0002, "epoch": 2.117856276776332, "step": 13100}, {"loss": 0.6064, "grad_norm": 0.6708134412765503, "learning_rate": 0.0002, "epoch": 2.119472960957077, "step": 13110}, {"loss": 0.6056, "grad_norm": 0.8119585514068604, "learning_rate": 0.0002, "epoch": 2.1210896451378223, "step": 13120}, {"loss": 0.6628, "grad_norm": 0.6947157979011536, "learning_rate": 0.0002, "epoch": 2.1227063293185675, "step": 13130}, {"loss": 0.6375, "grad_norm": 0.8831837773323059, "learning_rate": 0.0002, "epoch": 2.1243230134993127, "step": 13140}, {"loss": 0.6997, "grad_norm": 0.7266910672187805, "learning_rate": 0.0002, "epoch": 2.1259396976800584, "step": 13150}, {"loss": 0.6446, "grad_norm": 0.8864351511001587, "learning_rate": 0.0002, "epoch": 2.1275563818608036, "step": 13160}, {"loss": 0.6762, "grad_norm": 0.8104248046875, "learning_rate": 0.0002, "epoch": 2.129173066041549, "step": 13170}, {"loss": 0.6581, "grad_norm": 0.6077079772949219, "learning_rate": 0.0002, "epoch": 2.130789750222294, "step": 13180}, {"loss": 0.6572, "grad_norm": 0.6874213814735413, "learning_rate": 0.0002, "epoch": 2.1324064344030393, "step": 13190}, {"loss": 0.642, "grad_norm": 0.7134367823600769, "learning_rate": 0.0002, "epoch": 2.1340231185837846, "step": 13200}, {"loss": 0.7016, "grad_norm": 0.6101235151290894, "learning_rate": 0.0002, "epoch": 2.13563980276453, "step": 13210}, {"loss": 0.6529, "grad_norm": 0.6042411923408508, "learning_rate": 0.0002, "epoch": 2.137256486945275, "step": 13220}, {"loss": 0.7179, "grad_norm": 0.914601743221283, "learning_rate": 0.0002, "epoch": 2.1388731711260207, "step": 13230}, {"loss": 0.6513, "grad_norm": 0.7104284167289734, "learning_rate": 0.0002, "epoch": 2.140489855306766, "step": 13240}, {"loss": 0.6607, "grad_norm": 0.664395272731781, "learning_rate": 0.0002, "epoch": 2.142106539487511, "step": 13250}, {"loss": 0.7211, "grad_norm": 0.6991241574287415, "learning_rate": 0.0002, "epoch": 2.1437232236682564, "step": 13260}, {"loss": 0.6484, "grad_norm": 0.5469560623168945, "learning_rate": 0.0002, "epoch": 2.1453399078490016, "step": 13270}, {"loss": 0.6765, "grad_norm": 0.8454998135566711, "learning_rate": 0.0002, "epoch": 2.146956592029747, "step": 13280}, {"loss": 0.6683, "grad_norm": 0.7088868618011475, "learning_rate": 0.0002, "epoch": 2.148573276210492, "step": 13290}, {"loss": 0.6835, "grad_norm": 0.7002687454223633, "learning_rate": 0.0002, "epoch": 2.1501899603912378, "step": 13300}, {"loss": 0.6399, "grad_norm": 0.7785214781761169, "learning_rate": 0.0002, "epoch": 2.151806644571983, "step": 13310}, {"loss": 0.67, "grad_norm": 0.8049132227897644, "learning_rate": 0.0002, "epoch": 2.1534233287527282, "step": 13320}, {"loss": 0.6495, "grad_norm": 0.8062595129013062, "learning_rate": 0.0002, "epoch": 2.1550400129334735, "step": 13330}, {"loss": 0.6603, "grad_norm": 0.6208319067955017, "learning_rate": 0.0002, "epoch": 2.1566566971142187, "step": 13340}, {"loss": 0.6584, "grad_norm": 0.7519655823707581, "learning_rate": 0.0002, "epoch": 2.158273381294964, "step": 13350}, {"loss": 0.6457, "grad_norm": 0.7645747065544128, "learning_rate": 0.0002, "epoch": 2.159890065475709, "step": 13360}, {"loss": 0.645, "grad_norm": 0.6847302913665771, "learning_rate": 0.0002, "epoch": 2.1615067496564544, "step": 13370}, {"loss": 0.6903, "grad_norm": 0.8630441427230835, "learning_rate": 0.0002, "epoch": 2.1631234338372, "step": 13380}, {"loss": 0.6742, "grad_norm": 0.7947702407836914, "learning_rate": 0.0002, "epoch": 2.1647401180179453, "step": 13390}, {"loss": 0.7206, "grad_norm": 0.6836977005004883, "learning_rate": 0.0002, "epoch": 2.1663568021986905, "step": 13400}, {"loss": 0.6304, "grad_norm": 0.7340566515922546, "learning_rate": 0.0002, "epoch": 2.1679734863794358, "step": 13410}, {"loss": 0.6528, "grad_norm": 0.7075738906860352, "learning_rate": 0.0002, "epoch": 2.169590170560181, "step": 13420}, {"loss": 0.6585, "grad_norm": 0.7080879807472229, "learning_rate": 0.0002, "epoch": 2.1712068547409262, "step": 13430}, {"loss": 0.6615, "grad_norm": 0.6218613386154175, "learning_rate": 0.0002, "epoch": 2.1728235389216715, "step": 13440}, {"loss": 0.6488, "grad_norm": 0.8211479187011719, "learning_rate": 0.0002, "epoch": 2.174440223102417, "step": 13450}, {"loss": 0.6738, "grad_norm": 0.864466667175293, "learning_rate": 0.0002, "epoch": 2.1760569072831624, "step": 13460}, {"loss": 0.679, "grad_norm": 0.7943857908248901, "learning_rate": 0.0002, "epoch": 2.1776735914639076, "step": 13470}, {"loss": 0.6838, "grad_norm": 0.78728187084198, "learning_rate": 0.0002, "epoch": 2.179290275644653, "step": 13480}, {"loss": 0.6397, "grad_norm": 0.697527289390564, "learning_rate": 0.0002, "epoch": 2.180906959825398, "step": 13490}, {"loss": 0.669, "grad_norm": 0.8205804228782654, "learning_rate": 0.0002, "epoch": 2.1825236440061433, "step": 13500}, {"loss": 0.7227, "grad_norm": 0.8709042072296143, "learning_rate": 0.0002, "epoch": 2.1841403281868885, "step": 13510}, {"loss": 0.6313, "grad_norm": 0.6228537559509277, "learning_rate": 0.0002, "epoch": 2.1857570123676338, "step": 13520}, {"loss": 0.7025, "grad_norm": 0.9566980004310608, "learning_rate": 0.0002, "epoch": 2.1873736965483794, "step": 13530}, {"loss": 0.6755, "grad_norm": 0.7128894329071045, "learning_rate": 0.0002, "epoch": 2.1889903807291247, "step": 13540}, {"loss": 0.6827, "grad_norm": 0.6888654232025146, "learning_rate": 0.0002, "epoch": 2.19060706490987, "step": 13550}, {"loss": 0.6961, "grad_norm": 0.6444337368011475, "learning_rate": 0.0002, "epoch": 2.192223749090615, "step": 13560}, {"loss": 0.656, "grad_norm": 0.8008806705474854, "learning_rate": 0.0002, "epoch": 2.1938404332713604, "step": 13570}, {"loss": 0.7, "grad_norm": 0.8482748866081238, "learning_rate": 0.0002, "epoch": 2.1954571174521056, "step": 13580}, {"loss": 0.7326, "grad_norm": 0.8584157228469849, "learning_rate": 0.0002, "epoch": 2.197073801632851, "step": 13590}, {"loss": 0.7014, "grad_norm": 0.7513734698295593, "learning_rate": 0.0002, "epoch": 2.1986904858135965, "step": 13600}, {"loss": 0.6632, "grad_norm": 0.7864262461662292, "learning_rate": 0.0002, "epoch": 2.2003071699943417, "step": 13610}, {"loss": 0.6879, "grad_norm": 0.8493645191192627, "learning_rate": 0.0002, "epoch": 2.201923854175087, "step": 13620}, {"loss": 0.6617, "grad_norm": 0.6902140974998474, "learning_rate": 0.0002, "epoch": 2.203540538355832, "step": 13630}, {"loss": 0.6655, "grad_norm": 0.8711254596710205, "learning_rate": 0.0002, "epoch": 2.2051572225365774, "step": 13640}, {"loss": 0.6359, "grad_norm": 0.7832191586494446, "learning_rate": 0.0002, "epoch": 2.2067739067173227, "step": 13650}, {"loss": 0.6723, "grad_norm": 0.5668176412582397, "learning_rate": 0.0002, "epoch": 2.208390590898068, "step": 13660}, {"loss": 0.635, "grad_norm": 0.8648375272750854, "learning_rate": 0.0002, "epoch": 2.2100072750788136, "step": 13670}, {"loss": 0.653, "grad_norm": 0.7643089890480042, "learning_rate": 0.0002, "epoch": 2.211623959259559, "step": 13680}, {"loss": 0.6765, "grad_norm": 0.6293777823448181, "learning_rate": 0.0002, "epoch": 2.213240643440304, "step": 13690}, {"loss": 0.6842, "grad_norm": 0.6459372639656067, "learning_rate": 0.0002, "epoch": 2.2148573276210493, "step": 13700}, {"loss": 0.6526, "grad_norm": 0.7060744166374207, "learning_rate": 0.0002, "epoch": 2.2164740118017945, "step": 13710}, {"loss": 0.7101, "grad_norm": 0.674109160900116, "learning_rate": 0.0002, "epoch": 2.2180906959825397, "step": 13720}, {"loss": 0.6529, "grad_norm": 0.830392062664032, "learning_rate": 0.0002, "epoch": 2.219707380163285, "step": 13730}, {"loss": 0.6733, "grad_norm": 0.6474477052688599, "learning_rate": 0.0002, "epoch": 2.2213240643440306, "step": 13740}, {"loss": 0.6413, "grad_norm": 0.7037909626960754, "learning_rate": 0.0002, "epoch": 2.222940748524776, "step": 13750}, {"loss": 0.6417, "grad_norm": 0.6554131507873535, "learning_rate": 0.0002, "epoch": 2.224557432705521, "step": 13760}, {"loss": 0.6907, "grad_norm": 0.7822230458259583, "learning_rate": 0.0002, "epoch": 2.2261741168862663, "step": 13770}, {"loss": 0.6505, "grad_norm": 0.9082167744636536, "learning_rate": 0.0002, "epoch": 2.2277908010670116, "step": 13780}, {"loss": 0.6878, "grad_norm": 0.7918276190757751, "learning_rate": 0.0002, "epoch": 2.229407485247757, "step": 13790}, {"loss": 0.6669, "grad_norm": 0.7354569435119629, "learning_rate": 0.0002, "epoch": 2.231024169428502, "step": 13800}, {"loss": 0.6503, "grad_norm": 0.8265249133110046, "learning_rate": 0.0002, "epoch": 2.2326408536092472, "step": 13810}, {"loss": 0.6871, "grad_norm": 0.6653847098350525, "learning_rate": 0.0002, "epoch": 2.234257537789993, "step": 13820}, {"loss": 0.6413, "grad_norm": 0.7157923579216003, "learning_rate": 0.0002, "epoch": 2.235874221970738, "step": 13830}, {"loss": 0.6306, "grad_norm": 0.7110323309898376, "learning_rate": 0.0002, "epoch": 2.2374909061514834, "step": 13840}, {"loss": 0.6913, "grad_norm": 0.7155357599258423, "learning_rate": 0.0002, "epoch": 2.2391075903322286, "step": 13850}, {"loss": 0.6579, "grad_norm": 1.0177817344665527, "learning_rate": 0.0002, "epoch": 2.240724274512974, "step": 13860}, {"loss": 0.635, "grad_norm": 0.7601948380470276, "learning_rate": 0.0002, "epoch": 2.242340958693719, "step": 13870}, {"loss": 0.6679, "grad_norm": 0.7628820538520813, "learning_rate": 0.0002, "epoch": 2.2439576428744643, "step": 13880}, {"loss": 0.6805, "grad_norm": 0.7089297771453857, "learning_rate": 0.0002, "epoch": 2.24557432705521, "step": 13890}, {"loss": 0.7236, "grad_norm": 0.695178210735321, "learning_rate": 0.0002, "epoch": 2.247191011235955, "step": 13900}, {"loss": 0.7084, "grad_norm": 0.7631948590278625, "learning_rate": 0.0002, "epoch": 2.2488076954167004, "step": 13910}, {"loss": 0.685, "grad_norm": 0.8203101754188538, "learning_rate": 0.0002, "epoch": 2.2504243795974457, "step": 13920}, {"loss": 0.653, "grad_norm": 0.8099079728126526, "learning_rate": 0.0002, "epoch": 2.252041063778191, "step": 13930}, {"loss": 0.694, "grad_norm": 0.6498546004295349, "learning_rate": 0.0002, "epoch": 2.253657747958936, "step": 13940}, {"loss": 0.6684, "grad_norm": 0.7797415256500244, "learning_rate": 0.0002, "epoch": 2.2552744321396814, "step": 13950}, {"loss": 0.683, "grad_norm": 0.8254124522209167, "learning_rate": 0.0002, "epoch": 2.2568911163204266, "step": 13960}, {"loss": 0.6806, "grad_norm": 0.6327953338623047, "learning_rate": 0.0002, "epoch": 2.2585078005011723, "step": 13970}, {"loss": 0.668, "grad_norm": 0.734194278717041, "learning_rate": 0.0002, "epoch": 2.2601244846819175, "step": 13980}, {"loss": 0.6912, "grad_norm": 0.9014202952384949, "learning_rate": 0.0002, "epoch": 2.2617411688626627, "step": 13990}, {"loss": 0.692, "grad_norm": 0.7643631100654602, "learning_rate": 0.0002, "epoch": 2.263357853043408, "step": 14000}, {"loss": 0.6657, "grad_norm": 0.8882834911346436, "learning_rate": 0.0002, "epoch": 2.264974537224153, "step": 14010}, {"loss": 0.6453, "grad_norm": 0.7975873351097107, "learning_rate": 0.0002, "epoch": 2.2665912214048984, "step": 14020}, {"loss": 0.7193, "grad_norm": 0.7765783071517944, "learning_rate": 0.0002, "epoch": 2.2682079055856437, "step": 14030}, {"loss": 0.662, "grad_norm": 0.8846288323402405, "learning_rate": 0.0002, "epoch": 2.2698245897663893, "step": 14040}, {"loss": 0.6494, "grad_norm": 0.9006744027137756, "learning_rate": 0.0002, "epoch": 2.2714412739471346, "step": 14050}, {"loss": 0.6423, "grad_norm": 0.7420173287391663, "learning_rate": 0.0002, "epoch": 2.27305795812788, "step": 14060}, {"loss": 0.7068, "grad_norm": 0.7956424951553345, "learning_rate": 0.0002, "epoch": 2.274674642308625, "step": 14070}, {"loss": 0.6581, "grad_norm": 0.7783209085464478, "learning_rate": 0.0002, "epoch": 2.2762913264893703, "step": 14080}, {"loss": 0.7202, "grad_norm": 0.7597188949584961, "learning_rate": 0.0002, "epoch": 2.2779080106701155, "step": 14090}, {"loss": 0.6778, "grad_norm": 0.6718921661376953, "learning_rate": 0.0002, "epoch": 2.2795246948508607, "step": 14100}, {"loss": 0.632, "grad_norm": 0.7528082132339478, "learning_rate": 0.0002, "epoch": 2.281141379031606, "step": 14110}, {"loss": 0.7608, "grad_norm": 0.8379864692687988, "learning_rate": 0.0002, "epoch": 2.2827580632123516, "step": 14120}, {"loss": 0.6767, "grad_norm": 0.748613715171814, "learning_rate": 0.0002, "epoch": 2.284374747393097, "step": 14130}, {"loss": 0.6641, "grad_norm": 0.7435423135757446, "learning_rate": 0.0002, "epoch": 2.285991431573842, "step": 14140}, {"loss": 0.6849, "grad_norm": 0.7580803632736206, "learning_rate": 0.0002, "epoch": 2.2876081157545873, "step": 14150}, {"loss": 0.6604, "grad_norm": 0.6278321146965027, "learning_rate": 0.0002, "epoch": 2.2892247999353326, "step": 14160}, {"loss": 0.6573, "grad_norm": 0.7663896083831787, "learning_rate": 0.0002, "epoch": 2.290841484116078, "step": 14170}, {"loss": 0.6655, "grad_norm": 0.9716812372207642, "learning_rate": 0.0002, "epoch": 2.292458168296823, "step": 14180}, {"loss": 0.7067, "grad_norm": 0.8993458151817322, "learning_rate": 0.0002, "epoch": 2.2940748524775687, "step": 14190}, {"loss": 0.6172, "grad_norm": 0.6156117916107178, "learning_rate": 0.0002, "epoch": 2.295691536658314, "step": 14200}, {"loss": 0.6318, "grad_norm": 0.8911278247833252, "learning_rate": 0.0002, "epoch": 2.297308220839059, "step": 14210}, {"loss": 0.6364, "grad_norm": 0.6422147154808044, "learning_rate": 0.0002, "epoch": 2.2989249050198044, "step": 14220}, {"loss": 0.6795, "grad_norm": 0.6866879463195801, "learning_rate": 0.0002, "epoch": 2.3005415892005496, "step": 14230}, {"loss": 0.6907, "grad_norm": 0.9297130107879639, "learning_rate": 0.0002, "epoch": 2.302158273381295, "step": 14240}, {"loss": 0.6823, "grad_norm": 0.7501356601715088, "learning_rate": 0.0002, "epoch": 2.30377495756204, "step": 14250}, {"loss": 0.6414, "grad_norm": 0.8363515138626099, "learning_rate": 0.0002, "epoch": 2.3053916417427853, "step": 14260}, {"loss": 0.6362, "grad_norm": 0.9083868265151978, "learning_rate": 0.0002, "epoch": 2.307008325923531, "step": 14270}, {"loss": 0.6862, "grad_norm": 0.7791516780853271, "learning_rate": 0.0002, "epoch": 2.3086250101042762, "step": 14280}, {"loss": 0.6569, "grad_norm": 0.8766953349113464, "learning_rate": 0.0002, "epoch": 2.3102416942850215, "step": 14290}, {"loss": 0.6698, "grad_norm": 0.7916635274887085, "learning_rate": 0.0002, "epoch": 2.3118583784657667, "step": 14300}, {"loss": 0.6927, "grad_norm": 0.627525269985199, "learning_rate": 0.0002, "epoch": 2.313475062646512, "step": 14310}, {"loss": 0.6541, "grad_norm": 0.8856783509254456, "learning_rate": 0.0002, "epoch": 2.315091746827257, "step": 14320}, {"loss": 0.6806, "grad_norm": 0.6758689284324646, "learning_rate": 0.0002, "epoch": 2.316708431008003, "step": 14330}, {"loss": 0.6794, "grad_norm": 0.6428321003913879, "learning_rate": 0.0002, "epoch": 2.318325115188748, "step": 14340}, {"loss": 0.682, "grad_norm": 0.9032121300697327, "learning_rate": 0.0002, "epoch": 2.3199417993694933, "step": 14350}, {"loss": 0.6569, "grad_norm": 0.8035986423492432, "learning_rate": 0.0002, "epoch": 2.3215584835502385, "step": 14360}, {"loss": 0.7067, "grad_norm": 0.7974579334259033, "learning_rate": 0.0002, "epoch": 2.3231751677309838, "step": 14370}, {"loss": 0.6451, "grad_norm": 0.8356034755706787, "learning_rate": 0.0002, "epoch": 2.324791851911729, "step": 14380}, {"loss": 0.6623, "grad_norm": 0.998760998249054, "learning_rate": 0.0002, "epoch": 2.326408536092474, "step": 14390}, {"loss": 0.649, "grad_norm": 0.6518142223358154, "learning_rate": 0.0002, "epoch": 2.3280252202732195, "step": 14400}, {"loss": 0.7146, "grad_norm": 0.7443506717681885, "learning_rate": 0.0002, "epoch": 2.3296419044539647, "step": 14410}, {"loss": 0.648, "grad_norm": 0.8436172604560852, "learning_rate": 0.0002, "epoch": 2.3312585886347104, "step": 14420}, {"loss": 0.6585, "grad_norm": 0.7411080598831177, "learning_rate": 0.0002, "epoch": 2.3328752728154556, "step": 14430}, {"loss": 0.6781, "grad_norm": 0.8839048743247986, "learning_rate": 0.0002, "epoch": 2.334491956996201, "step": 14440}, {"loss": 0.6565, "grad_norm": 0.8360885977745056, "learning_rate": 0.0002, "epoch": 2.336108641176946, "step": 14450}, {"loss": 0.6662, "grad_norm": 0.7608986496925354, "learning_rate": 0.0002, "epoch": 2.3377253253576913, "step": 14460}, {"loss": 0.6685, "grad_norm": 0.8179867267608643, "learning_rate": 0.0002, "epoch": 2.3393420095384365, "step": 14470}, {"loss": 0.7055, "grad_norm": 0.5989999771118164, "learning_rate": 0.0002, "epoch": 2.340958693719182, "step": 14480}, {"loss": 0.644, "grad_norm": 0.9450054168701172, "learning_rate": 0.0002, "epoch": 2.3425753778999274, "step": 14490}, {"loss": 0.6983, "grad_norm": 0.7885149717330933, "learning_rate": 0.0002, "epoch": 2.3441920620806727, "step": 14500}, {"loss": 0.6819, "grad_norm": 0.8152616620063782, "learning_rate": 0.0002, "epoch": 2.345808746261418, "step": 14510}, {"loss": 0.6989, "grad_norm": 0.7193838953971863, "learning_rate": 0.0002, "epoch": 2.347425430442163, "step": 14520}, {"loss": 0.6594, "grad_norm": 0.6701092720031738, "learning_rate": 0.0002, "epoch": 2.3490421146229084, "step": 14530}, {"loss": 0.6559, "grad_norm": 0.7529364228248596, "learning_rate": 0.0002, "epoch": 2.3506587988036536, "step": 14540}, {"loss": 0.6306, "grad_norm": 0.6599733829498291, "learning_rate": 0.0002, "epoch": 2.352275482984399, "step": 14550}, {"loss": 0.706, "grad_norm": 0.9502474069595337, "learning_rate": 0.0002, "epoch": 2.353892167165144, "step": 14560}, {"loss": 0.717, "grad_norm": 0.7619650959968567, "learning_rate": 0.0002, "epoch": 2.3555088513458897, "step": 14570}, {"loss": 0.6684, "grad_norm": 0.9854652285575867, "learning_rate": 0.0002, "epoch": 2.357125535526635, "step": 14580}, {"loss": 0.6455, "grad_norm": 0.727439284324646, "learning_rate": 0.0002, "epoch": 2.35874221970738, "step": 14590}, {"loss": 0.6645, "grad_norm": 0.6994746327400208, "learning_rate": 0.0002, "epoch": 2.3603589038881254, "step": 14600}, {"loss": 0.6587, "grad_norm": 0.7117531299591064, "learning_rate": 0.0002, "epoch": 2.3619755880688706, "step": 14610}, {"loss": 0.6804, "grad_norm": 0.6403067708015442, "learning_rate": 0.0002, "epoch": 2.363592272249616, "step": 14620}, {"loss": 0.7055, "grad_norm": 0.8377841711044312, "learning_rate": 0.0002, "epoch": 2.3652089564303616, "step": 14630}, {"loss": 0.6778, "grad_norm": 0.749171257019043, "learning_rate": 0.0002, "epoch": 2.366825640611107, "step": 14640}, {"loss": 0.6552, "grad_norm": 0.8418586254119873, "learning_rate": 0.0002, "epoch": 2.368442324791852, "step": 14650}, {"loss": 0.6685, "grad_norm": 0.6178573369979858, "learning_rate": 0.0002, "epoch": 2.3700590089725972, "step": 14660}, {"loss": 0.6774, "grad_norm": 0.6368302702903748, "learning_rate": 0.0002, "epoch": 2.3716756931533425, "step": 14670}, {"loss": 0.6136, "grad_norm": 0.9122977256774902, "learning_rate": 0.0002, "epoch": 2.3732923773340877, "step": 14680}, {"loss": 0.6675, "grad_norm": 0.7086195349693298, "learning_rate": 0.0002, "epoch": 2.374909061514833, "step": 14690}, {"loss": 0.6582, "grad_norm": 0.7500800490379333, "learning_rate": 0.0002, "epoch": 2.376525745695578, "step": 14700}, {"loss": 0.6792, "grad_norm": 0.6634900569915771, "learning_rate": 0.0002, "epoch": 2.378142429876324, "step": 14710}, {"loss": 0.6614, "grad_norm": 0.839898407459259, "learning_rate": 0.0002, "epoch": 2.379759114057069, "step": 14720}, {"loss": 0.6453, "grad_norm": 0.7578426003456116, "learning_rate": 0.0002, "epoch": 2.3813757982378143, "step": 14730}, {"loss": 0.7282, "grad_norm": 1.0213173627853394, "learning_rate": 0.0002, "epoch": 2.3829924824185595, "step": 14740}, {"loss": 0.6704, "grad_norm": 0.7855949401855469, "learning_rate": 0.0002, "epoch": 2.3846091665993048, "step": 14750}, {"loss": 0.6694, "grad_norm": 0.7224128842353821, "learning_rate": 0.0002, "epoch": 2.38622585078005, "step": 14760}, {"loss": 0.7017, "grad_norm": 0.8040381669998169, "learning_rate": 0.0002, "epoch": 2.3878425349607952, "step": 14770}, {"loss": 0.6799, "grad_norm": 0.7705281376838684, "learning_rate": 0.0002, "epoch": 2.389459219141541, "step": 14780}, {"loss": 0.6326, "grad_norm": 0.667966902256012, "learning_rate": 0.0002, "epoch": 2.391075903322286, "step": 14790}, {"loss": 0.7061, "grad_norm": 0.6611011028289795, "learning_rate": 0.0002, "epoch": 2.3926925875030314, "step": 14800}, {"loss": 0.6527, "grad_norm": 0.6862651705741882, "learning_rate": 0.0002, "epoch": 2.3943092716837766, "step": 14810}, {"loss": 0.6537, "grad_norm": 0.8086010217666626, "learning_rate": 0.0002, "epoch": 2.395925955864522, "step": 14820}, {"loss": 0.7189, "grad_norm": 0.7189689874649048, "learning_rate": 0.0002, "epoch": 2.397542640045267, "step": 14830}, {"loss": 0.6709, "grad_norm": 0.6280009150505066, "learning_rate": 0.0002, "epoch": 2.3991593242260123, "step": 14840}, {"loss": 0.706, "grad_norm": 0.7826612591743469, "learning_rate": 0.0002, "epoch": 2.4007760084067575, "step": 14850}, {"loss": 0.6738, "grad_norm": 0.7681610584259033, "learning_rate": 0.0002, "epoch": 2.402392692587503, "step": 14860}, {"loss": 0.636, "grad_norm": 0.720966100692749, "learning_rate": 0.0002, "epoch": 2.4040093767682484, "step": 14870}, {"loss": 0.6667, "grad_norm": 0.8202250599861145, "learning_rate": 0.0002, "epoch": 2.4056260609489937, "step": 14880}, {"loss": 0.6935, "grad_norm": 0.786212682723999, "learning_rate": 0.0002, "epoch": 2.407242745129739, "step": 14890}, {"loss": 0.6628, "grad_norm": 0.6647164821624756, "learning_rate": 0.0002, "epoch": 2.408859429310484, "step": 14900}, {"loss": 0.6706, "grad_norm": 0.7566399574279785, "learning_rate": 0.0002, "epoch": 2.4104761134912294, "step": 14910}, {"loss": 0.7188, "grad_norm": 0.748814582824707, "learning_rate": 0.0002, "epoch": 2.4120927976719746, "step": 14920}, {"loss": 0.6684, "grad_norm": 0.7624038457870483, "learning_rate": 0.0002, "epoch": 2.4137094818527203, "step": 14930}, {"loss": 0.6483, "grad_norm": 0.8267335295677185, "learning_rate": 0.0002, "epoch": 2.4153261660334655, "step": 14940}, {"loss": 0.6612, "grad_norm": 0.8785360455513, "learning_rate": 0.0002, "epoch": 2.4169428502142107, "step": 14950}, {"loss": 0.6718, "grad_norm": 0.679887592792511, "learning_rate": 0.0002, "epoch": 2.418559534394956, "step": 14960}, {"loss": 0.6136, "grad_norm": 0.7218474745750427, "learning_rate": 0.0002, "epoch": 2.420176218575701, "step": 14970}, {"loss": 0.648, "grad_norm": 0.6342799663543701, "learning_rate": 0.0002, "epoch": 2.4217929027564464, "step": 14980}, {"loss": 0.6617, "grad_norm": 0.7098712921142578, "learning_rate": 0.0002, "epoch": 2.4234095869371917, "step": 14990}, {"loss": 0.6942, "grad_norm": 0.7497431635856628, "learning_rate": 0.0002, "epoch": 2.425026271117937, "step": 15000}, {"loss": 0.6772, "grad_norm": 0.934836208820343, "learning_rate": 0.0002, "epoch": 2.4266429552986826, "step": 15010}, {"loss": 0.7221, "grad_norm": 0.8430966734886169, "learning_rate": 0.0002, "epoch": 2.428259639479428, "step": 15020}, {"loss": 0.6985, "grad_norm": 0.7032104730606079, "learning_rate": 0.0002, "epoch": 2.429876323660173, "step": 15030}, {"loss": 0.6715, "grad_norm": 0.7746111750602722, "learning_rate": 0.0002, "epoch": 2.4314930078409183, "step": 15040}, {"loss": 0.7177, "grad_norm": 0.7661406397819519, "learning_rate": 0.0002, "epoch": 2.4331096920216635, "step": 15050}, {"loss": 0.6517, "grad_norm": 0.6941645741462708, "learning_rate": 0.0002, "epoch": 2.4347263762024087, "step": 15060}, {"loss": 0.6421, "grad_norm": 0.7487249374389648, "learning_rate": 0.0002, "epoch": 2.436343060383154, "step": 15070}, {"loss": 0.6796, "grad_norm": 0.7639912962913513, "learning_rate": 0.0002, "epoch": 2.4379597445638996, "step": 15080}, {"loss": 0.7087, "grad_norm": 0.7708953619003296, "learning_rate": 0.0002, "epoch": 2.439576428744645, "step": 15090}, {"loss": 0.7065, "grad_norm": 0.9135832190513611, "learning_rate": 0.0002, "epoch": 2.44119311292539, "step": 15100}, {"loss": 0.672, "grad_norm": 0.8283005356788635, "learning_rate": 0.0002, "epoch": 2.4428097971061353, "step": 15110}, {"loss": 0.6551, "grad_norm": 0.925299346446991, "learning_rate": 0.0002, "epoch": 2.4444264812868806, "step": 15120}, {"loss": 0.687, "grad_norm": 0.7013528943061829, "learning_rate": 0.0002, "epoch": 2.446043165467626, "step": 15130}, {"loss": 0.6842, "grad_norm": 0.622303307056427, "learning_rate": 0.0002, "epoch": 2.447659849648371, "step": 15140}, {"loss": 0.6676, "grad_norm": 0.876569390296936, "learning_rate": 0.0002, "epoch": 2.4492765338291163, "step": 15150}, {"loss": 0.6463, "grad_norm": 0.6836351752281189, "learning_rate": 0.0002, "epoch": 2.450893218009862, "step": 15160}, {"loss": 0.6781, "grad_norm": 0.7886684536933899, "learning_rate": 0.0002, "epoch": 2.452509902190607, "step": 15170}, {"loss": 0.6794, "grad_norm": 0.6647440791130066, "learning_rate": 0.0002, "epoch": 2.4541265863713524, "step": 15180}, {"loss": 0.6353, "grad_norm": 0.7477722764015198, "learning_rate": 0.0002, "epoch": 2.4557432705520976, "step": 15190}, {"loss": 0.698, "grad_norm": 0.8192033767700195, "learning_rate": 0.0002, "epoch": 2.457359954732843, "step": 15200}, {"loss": 0.6735, "grad_norm": 0.847537100315094, "learning_rate": 0.0002, "epoch": 2.458976638913588, "step": 15210}, {"loss": 0.6962, "grad_norm": 0.9027776122093201, "learning_rate": 0.0002, "epoch": 2.4605933230943338, "step": 15220}, {"loss": 0.7084, "grad_norm": 0.7217772006988525, "learning_rate": 0.0002, "epoch": 2.462210007275079, "step": 15230}, {"loss": 0.691, "grad_norm": 0.7994546294212341, "learning_rate": 0.0002, "epoch": 2.4638266914558242, "step": 15240}, {"loss": 0.6828, "grad_norm": 0.939916729927063, "learning_rate": 0.0002, "epoch": 2.4654433756365695, "step": 15250}, {"loss": 0.6893, "grad_norm": 1.0009053945541382, "learning_rate": 0.0002, "epoch": 2.4670600598173147, "step": 15260}, {"loss": 0.643, "grad_norm": 0.625555694103241, "learning_rate": 0.0002, "epoch": 2.46867674399806, "step": 15270}, {"loss": 0.688, "grad_norm": 0.7924878597259521, "learning_rate": 0.0002, "epoch": 2.470293428178805, "step": 15280}, {"loss": 0.6789, "grad_norm": 0.8536689877510071, "learning_rate": 0.0002, "epoch": 2.4719101123595504, "step": 15290}, {"loss": 0.6924, "grad_norm": 0.8572589755058289, "learning_rate": 0.0002, "epoch": 2.4735267965402956, "step": 15300}, {"loss": 0.604, "grad_norm": 0.773279070854187, "learning_rate": 0.0002, "epoch": 2.4751434807210413, "step": 15310}, {"loss": 0.6573, "grad_norm": 0.7708749771118164, "learning_rate": 0.0002, "epoch": 2.4767601649017865, "step": 15320}, {"loss": 0.7065, "grad_norm": 0.770905077457428, "learning_rate": 0.0002, "epoch": 2.4783768490825318, "step": 15330}, {"loss": 0.6878, "grad_norm": 0.8238571882247925, "learning_rate": 0.0002, "epoch": 2.479993533263277, "step": 15340}, {"loss": 0.6772, "grad_norm": 0.7670477032661438, "learning_rate": 0.0002, "epoch": 2.481610217444022, "step": 15350}, {"loss": 0.7759, "grad_norm": 0.905036985874176, "learning_rate": 0.0002, "epoch": 2.4832269016247674, "step": 15360}, {"loss": 0.706, "grad_norm": 0.6672089695930481, "learning_rate": 0.0002, "epoch": 2.484843585805513, "step": 15370}, {"loss": 0.6722, "grad_norm": 0.625095784664154, "learning_rate": 0.0002, "epoch": 2.4864602699862584, "step": 15380}, {"loss": 0.6396, "grad_norm": 0.679772675037384, "learning_rate": 0.0002, "epoch": 2.4880769541670036, "step": 15390}, {"loss": 0.6778, "grad_norm": 0.711492121219635, "learning_rate": 0.0002, "epoch": 2.489693638347749, "step": 15400}, {"loss": 0.6966, "grad_norm": 0.876189112663269, "learning_rate": 0.0002, "epoch": 2.491310322528494, "step": 15410}, {"loss": 0.7307, "grad_norm": 0.7236915230751038, "learning_rate": 0.0002, "epoch": 2.4929270067092393, "step": 15420}, {"loss": 0.647, "grad_norm": 0.6629832983016968, "learning_rate": 0.0002, "epoch": 2.4945436908899845, "step": 15430}, {"loss": 0.6669, "grad_norm": 0.9756859540939331, "learning_rate": 0.0002, "epoch": 2.4961603750707297, "step": 15440}, {"loss": 0.7559, "grad_norm": 0.6896940469741821, "learning_rate": 0.0002, "epoch": 2.4977770592514754, "step": 15450}, {"loss": 0.6818, "grad_norm": 0.7105149626731873, "learning_rate": 0.0002, "epoch": 2.4993937434322206, "step": 15460}, {"loss": 0.6859, "grad_norm": 0.8374546766281128, "learning_rate": 0.0002, "epoch": 2.501010427612966, "step": 15470}, {"loss": 0.6512, "grad_norm": 0.7320070266723633, "learning_rate": 0.0002, "epoch": 2.502627111793711, "step": 15480}, {"loss": 0.685, "grad_norm": 0.8306367993354797, "learning_rate": 0.0002, "epoch": 2.5042437959744563, "step": 15490}, {"loss": 0.7253, "grad_norm": 0.7472721338272095, "learning_rate": 0.0002, "epoch": 2.5058604801552016, "step": 15500}, {"loss": 0.6699, "grad_norm": 0.6147692203521729, "learning_rate": 0.0002, "epoch": 2.507477164335947, "step": 15510}, {"loss": 0.7158, "grad_norm": 0.7788505554199219, "learning_rate": 0.0002, "epoch": 2.5090938485166925, "step": 15520}, {"loss": 0.6521, "grad_norm": 0.8807527422904968, "learning_rate": 0.0002, "epoch": 2.5107105326974377, "step": 15530}, {"loss": 0.6792, "grad_norm": 0.7521643042564392, "learning_rate": 0.0002, "epoch": 2.512327216878183, "step": 15540}, {"loss": 0.6772, "grad_norm": 0.6900225281715393, "learning_rate": 0.0002, "epoch": 2.513943901058928, "step": 15550}, {"loss": 0.6769, "grad_norm": 0.6601938605308533, "learning_rate": 0.0002, "epoch": 2.5155605852396734, "step": 15560}, {"loss": 0.6648, "grad_norm": 0.8179984092712402, "learning_rate": 0.0002, "epoch": 2.5171772694204186, "step": 15570}, {"loss": 0.7028, "grad_norm": 0.792556881904602, "learning_rate": 0.0002, "epoch": 2.518793953601164, "step": 15580}, {"loss": 0.6464, "grad_norm": 0.7081938982009888, "learning_rate": 0.0002, "epoch": 2.520410637781909, "step": 15590}, {"loss": 0.6691, "grad_norm": 0.8733121156692505, "learning_rate": 0.0002, "epoch": 2.5220273219626543, "step": 15600}, {"loss": 0.6969, "grad_norm": 0.7980992794036865, "learning_rate": 0.0002, "epoch": 2.5236440061434, "step": 15610}, {"loss": 0.7124, "grad_norm": 0.883664071559906, "learning_rate": 0.0002, "epoch": 2.5252606903241452, "step": 15620}, {"loss": 0.7022, "grad_norm": 0.6963341236114502, "learning_rate": 0.0002, "epoch": 2.5268773745048905, "step": 15630}, {"loss": 0.7334, "grad_norm": 0.6433573365211487, "learning_rate": 0.0002, "epoch": 2.5284940586856357, "step": 15640}, {"loss": 0.6889, "grad_norm": 0.8538183569908142, "learning_rate": 0.0002, "epoch": 2.530110742866381, "step": 15650}, {"loss": 0.6841, "grad_norm": 0.9748201370239258, "learning_rate": 0.0002, "epoch": 2.5317274270471266, "step": 15660}, {"loss": 0.6765, "grad_norm": 0.7670575380325317, "learning_rate": 0.0002, "epoch": 2.533344111227872, "step": 15670}, {"loss": 0.6435, "grad_norm": 0.8738890290260315, "learning_rate": 0.0002, "epoch": 2.534960795408617, "step": 15680}, {"loss": 0.6802, "grad_norm": 0.8391636610031128, "learning_rate": 0.0002, "epoch": 2.5365774795893623, "step": 15690}, {"loss": 0.6901, "grad_norm": 0.7239366769790649, "learning_rate": 0.0002, "epoch": 2.5381941637701075, "step": 15700}, {"loss": 0.7011, "grad_norm": 0.8498379588127136, "learning_rate": 0.0002, "epoch": 2.5398108479508528, "step": 15710}, {"loss": 0.6998, "grad_norm": 0.8029484152793884, "learning_rate": 0.0002, "epoch": 2.541427532131598, "step": 15720}, {"loss": 0.6678, "grad_norm": 1.0639333724975586, "learning_rate": 0.0002, "epoch": 2.5430442163123432, "step": 15730}, {"loss": 0.6341, "grad_norm": 0.6401297450065613, "learning_rate": 0.0002, "epoch": 2.5446609004930885, "step": 15740}, {"loss": 0.7196, "grad_norm": 0.7123814821243286, "learning_rate": 0.0002, "epoch": 2.5462775846738337, "step": 15750}, {"loss": 0.654, "grad_norm": 0.7874974608421326, "learning_rate": 0.0002, "epoch": 2.5478942688545794, "step": 15760}, {"loss": 0.6721, "grad_norm": 0.8046808838844299, "learning_rate": 0.0002, "epoch": 2.5495109530353246, "step": 15770}, {"loss": 0.6665, "grad_norm": 0.7888661623001099, "learning_rate": 0.0002, "epoch": 2.55112763721607, "step": 15780}, {"loss": 0.6893, "grad_norm": 0.8445866107940674, "learning_rate": 0.0002, "epoch": 2.552744321396815, "step": 15790}, {"loss": 0.6815, "grad_norm": 0.7475846409797668, "learning_rate": 0.0002, "epoch": 2.5543610055775603, "step": 15800}, {"loss": 0.6711, "grad_norm": 0.7455102801322937, "learning_rate": 0.0002, "epoch": 2.555977689758306, "step": 15810}, {"loss": 0.6932, "grad_norm": 0.8226983547210693, "learning_rate": 0.0002, "epoch": 2.557594373939051, "step": 15820}, {"loss": 0.651, "grad_norm": 0.8920368552207947, "learning_rate": 0.0002, "epoch": 2.5592110581197964, "step": 15830}, {"loss": 0.6297, "grad_norm": 0.8413904905319214, "learning_rate": 0.0002, "epoch": 2.5608277423005417, "step": 15840}, {"loss": 0.7106, "grad_norm": 0.8483649492263794, "learning_rate": 0.0002, "epoch": 2.562444426481287, "step": 15850}, {"loss": 0.6957, "grad_norm": 0.5923284292221069, "learning_rate": 0.0002, "epoch": 2.564061110662032, "step": 15860}, {"loss": 0.6847, "grad_norm": 0.8518726229667664, "learning_rate": 0.0002, "epoch": 2.5656777948427774, "step": 15870}, {"loss": 0.6362, "grad_norm": 0.731235146522522, "learning_rate": 0.0002, "epoch": 2.5672944790235226, "step": 15880}, {"loss": 0.7611, "grad_norm": 0.7517194151878357, "learning_rate": 0.0002, "epoch": 2.568911163204268, "step": 15890}, {"loss": 0.6907, "grad_norm": 0.8378692269325256, "learning_rate": 0.0002, "epoch": 2.5705278473850135, "step": 15900}, {"loss": 0.7055, "grad_norm": 0.843701958656311, "learning_rate": 0.0002, "epoch": 2.5721445315657587, "step": 15910}, {"loss": 0.6882, "grad_norm": 0.7254629731178284, "learning_rate": 0.0002, "epoch": 2.573761215746504, "step": 15920}, {"loss": 0.6872, "grad_norm": 0.8863335847854614, "learning_rate": 0.0002, "epoch": 2.575377899927249, "step": 15930}, {"loss": 0.6813, "grad_norm": 0.7675097584724426, "learning_rate": 0.0002, "epoch": 2.5769945841079944, "step": 15940}, {"loss": 0.7357, "grad_norm": 0.82063889503479, "learning_rate": 0.0002, "epoch": 2.5786112682887397, "step": 15950}, {"loss": 0.662, "grad_norm": 0.7729717493057251, "learning_rate": 0.0002, "epoch": 2.5802279524694853, "step": 15960}, {"loss": 0.633, "grad_norm": 0.8301846981048584, "learning_rate": 0.0002, "epoch": 2.5818446366502306, "step": 15970}, {"loss": 0.6897, "grad_norm": 0.7906861305236816, "learning_rate": 0.0002, "epoch": 2.583461320830976, "step": 15980}, {"loss": 0.7175, "grad_norm": 0.6749057173728943, "learning_rate": 0.0002, "epoch": 2.585078005011721, "step": 15990}, {"loss": 0.7212, "grad_norm": 0.9386842846870422, "learning_rate": 0.0002, "epoch": 2.5866946891924663, "step": 16000}, {"loss": 0.6934, "grad_norm": 0.7868891358375549, "learning_rate": 0.0002, "epoch": 2.5883113733732115, "step": 16010}, {"loss": 0.7036, "grad_norm": 0.8674671053886414, "learning_rate": 0.0002, "epoch": 2.5899280575539567, "step": 16020}, {"loss": 0.7217, "grad_norm": 0.7043559551239014, "learning_rate": 0.0002, "epoch": 2.591544741734702, "step": 16030}, {"loss": 0.6967, "grad_norm": 0.5846083760261536, "learning_rate": 0.0002, "epoch": 2.593161425915447, "step": 16040}, {"loss": 0.7322, "grad_norm": 0.7323982119560242, "learning_rate": 0.0002, "epoch": 2.594778110096193, "step": 16050}, {"loss": 0.6794, "grad_norm": 0.9069556593894958, "learning_rate": 0.0002, "epoch": 2.596394794276938, "step": 16060}, {"loss": 0.7076, "grad_norm": 0.7522736191749573, "learning_rate": 0.0002, "epoch": 2.5980114784576833, "step": 16070}, {"loss": 0.6477, "grad_norm": 0.8149648308753967, "learning_rate": 0.0002, "epoch": 2.5996281626384286, "step": 16080}, {"loss": 0.6664, "grad_norm": 0.6214233040809631, "learning_rate": 0.0002, "epoch": 2.601244846819174, "step": 16090}, {"loss": 0.7307, "grad_norm": 0.6803743839263916, "learning_rate": 0.0002, "epoch": 2.602861530999919, "step": 16100}, {"loss": 0.7244, "grad_norm": 0.7223997116088867, "learning_rate": 0.0002, "epoch": 2.6044782151806647, "step": 16110}, {"loss": 0.6867, "grad_norm": 0.7324174642562866, "learning_rate": 0.0002, "epoch": 2.60609489936141, "step": 16120}, {"loss": 0.7159, "grad_norm": 0.9594739675521851, "learning_rate": 0.0002, "epoch": 2.607711583542155, "step": 16130}, {"loss": 0.6451, "grad_norm": 0.9485327005386353, "learning_rate": 0.0002, "epoch": 2.6093282677229004, "step": 16140}, {"loss": 0.6815, "grad_norm": 0.8449000120162964, "learning_rate": 0.0002, "epoch": 2.6109449519036456, "step": 16150}, {"loss": 0.7152, "grad_norm": 0.8520140051841736, "learning_rate": 0.0002, "epoch": 2.612561636084391, "step": 16160}, {"loss": 0.6759, "grad_norm": 0.7456524968147278, "learning_rate": 0.0002, "epoch": 2.614178320265136, "step": 16170}, {"loss": 0.6893, "grad_norm": 0.9912857413291931, "learning_rate": 0.0002, "epoch": 2.6157950044458813, "step": 16180}, {"loss": 0.7243, "grad_norm": 0.9001946449279785, "learning_rate": 0.0002, "epoch": 2.6174116886266265, "step": 16190}, {"loss": 0.6825, "grad_norm": 0.6568667888641357, "learning_rate": 0.0002, "epoch": 2.619028372807372, "step": 16200}, {"loss": 0.7013, "grad_norm": 1.0248128175735474, "learning_rate": 0.0002, "epoch": 2.6206450569881174, "step": 16210}, {"loss": 0.7045, "grad_norm": 0.6509039998054504, "learning_rate": 0.0002, "epoch": 2.6222617411688627, "step": 16220}, {"loss": 0.72, "grad_norm": 0.7626351118087769, "learning_rate": 0.0002, "epoch": 2.623878425349608, "step": 16230}, {"loss": 0.6556, "grad_norm": 0.6938552260398865, "learning_rate": 0.0002, "epoch": 2.625495109530353, "step": 16240}, {"loss": 0.65, "grad_norm": 0.6434680819511414, "learning_rate": 0.0002, "epoch": 2.6271117937110984, "step": 16250}, {"loss": 0.6943, "grad_norm": 0.7111515998840332, "learning_rate": 0.0002, "epoch": 2.628728477891844, "step": 16260}, {"loss": 0.679, "grad_norm": 0.7712395787239075, "learning_rate": 0.0002, "epoch": 2.6303451620725893, "step": 16270}, {"loss": 0.6886, "grad_norm": 0.792209267616272, "learning_rate": 0.0002, "epoch": 2.6319618462533345, "step": 16280}, {"loss": 0.6554, "grad_norm": 0.6801066398620605, "learning_rate": 0.0002, "epoch": 2.6335785304340797, "step": 16290}, {"loss": 0.73, "grad_norm": 0.7802573442459106, "learning_rate": 0.0002, "epoch": 2.635195214614825, "step": 16300}, {"loss": 0.7484, "grad_norm": 0.7742244601249695, "learning_rate": 0.0002, "epoch": 2.63681189879557, "step": 16310}, {"loss": 0.6524, "grad_norm": 0.664184033870697, "learning_rate": 0.0002, "epoch": 2.6384285829763154, "step": 16320}, {"loss": 0.6442, "grad_norm": 0.9242228865623474, "learning_rate": 0.0002, "epoch": 2.6400452671570607, "step": 16330}, {"loss": 0.6792, "grad_norm": 0.9661325216293335, "learning_rate": 0.0002, "epoch": 2.641661951337806, "step": 16340}, {"loss": 0.6847, "grad_norm": 0.837526798248291, "learning_rate": 0.0002, "epoch": 2.6432786355185516, "step": 16350}, {"loss": 0.7686, "grad_norm": 1.1834373474121094, "learning_rate": 0.0002, "epoch": 2.644895319699297, "step": 16360}, {"loss": 0.6746, "grad_norm": 0.7467831373214722, "learning_rate": 0.0002, "epoch": 2.646512003880042, "step": 16370}, {"loss": 0.6935, "grad_norm": 0.8627146482467651, "learning_rate": 0.0002, "epoch": 2.6481286880607873, "step": 16380}, {"loss": 0.715, "grad_norm": 0.790447473526001, "learning_rate": 0.0002, "epoch": 2.6497453722415325, "step": 16390}, {"loss": 0.723, "grad_norm": 0.8447365164756775, "learning_rate": 0.0002, "epoch": 2.651362056422278, "step": 16400}, {"loss": 0.6628, "grad_norm": 0.7831417918205261, "learning_rate": 0.0002, "epoch": 2.6529787406030234, "step": 16410}, {"loss": 0.6691, "grad_norm": 0.6837952136993408, "learning_rate": 0.0002, "epoch": 2.6545954247837686, "step": 16420}, {"loss": 0.6139, "grad_norm": 0.7031801342964172, "learning_rate": 0.0002, "epoch": 2.656212108964514, "step": 16430}, {"loss": 0.7382, "grad_norm": 0.8963770866394043, "learning_rate": 0.0002, "epoch": 2.657828793145259, "step": 16440}, {"loss": 0.6439, "grad_norm": 0.6852328181266785, "learning_rate": 0.0002, "epoch": 2.6594454773260043, "step": 16450}, {"loss": 0.6278, "grad_norm": 0.8069294095039368, "learning_rate": 0.0002, "epoch": 2.6610621615067496, "step": 16460}, {"loss": 0.6939, "grad_norm": 0.7503686547279358, "learning_rate": 0.0002, "epoch": 2.662678845687495, "step": 16470}, {"loss": 0.6777, "grad_norm": 0.6430956125259399, "learning_rate": 0.0002, "epoch": 2.66429552986824, "step": 16480}, {"loss": 0.6863, "grad_norm": 0.7894312739372253, "learning_rate": 0.0002, "epoch": 2.6659122140489853, "step": 16490}, {"loss": 0.7165, "grad_norm": 0.7277431488037109, "learning_rate": 0.0002, "epoch": 2.667528898229731, "step": 16500}, {"loss": 0.6772, "grad_norm": 0.6816153526306152, "learning_rate": 0.0002, "epoch": 2.669145582410476, "step": 16510}, {"loss": 0.691, "grad_norm": 0.8145235776901245, "learning_rate": 0.0002, "epoch": 2.6707622665912214, "step": 16520}, {"loss": 0.709, "grad_norm": 0.8645890355110168, "learning_rate": 0.0002, "epoch": 2.6723789507719666, "step": 16530}, {"loss": 0.6946, "grad_norm": 0.704393208026886, "learning_rate": 0.0002, "epoch": 2.673995634952712, "step": 16540}, {"loss": 0.6378, "grad_norm": 1.0120846033096313, "learning_rate": 0.0002, "epoch": 2.6756123191334575, "step": 16550}, {"loss": 0.7241, "grad_norm": 0.6919328570365906, "learning_rate": 0.0002, "epoch": 2.6772290033142028, "step": 16560}, {"loss": 0.7098, "grad_norm": 0.6924574971199036, "learning_rate": 0.0002, "epoch": 2.678845687494948, "step": 16570}, {"loss": 0.731, "grad_norm": 0.9679301381111145, "learning_rate": 0.0002, "epoch": 2.6804623716756932, "step": 16580}, {"loss": 0.7124, "grad_norm": 0.6810211539268494, "learning_rate": 0.0002, "epoch": 2.6820790558564385, "step": 16590}, {"loss": 0.6688, "grad_norm": 0.9730555415153503, "learning_rate": 0.0002, "epoch": 2.6836957400371837, "step": 16600}, {"loss": 0.7344, "grad_norm": 0.7852821350097656, "learning_rate": 0.0002, "epoch": 2.685312424217929, "step": 16610}, {"loss": 0.6401, "grad_norm": 0.6059057116508484, "learning_rate": 0.0002, "epoch": 2.686929108398674, "step": 16620}, {"loss": 0.6796, "grad_norm": 0.9395958781242371, "learning_rate": 0.0002, "epoch": 2.6885457925794194, "step": 16630}, {"loss": 0.7174, "grad_norm": 0.7473729848861694, "learning_rate": 0.0002, "epoch": 2.690162476760165, "step": 16640}, {"loss": 0.7087, "grad_norm": 0.765934407711029, "learning_rate": 0.0002, "epoch": 2.6917791609409103, "step": 16650}, {"loss": 0.707, "grad_norm": 0.8496677279472351, "learning_rate": 0.0002, "epoch": 2.6933958451216555, "step": 16660}, {"loss": 0.7084, "grad_norm": 0.7641879916191101, "learning_rate": 0.0002, "epoch": 2.6950125293024008, "step": 16670}, {"loss": 0.6566, "grad_norm": 0.8471952676773071, "learning_rate": 0.0002, "epoch": 2.696629213483146, "step": 16680}, {"loss": 0.6635, "grad_norm": 0.6946060657501221, "learning_rate": 0.0002, "epoch": 2.6982458976638912, "step": 16690}, {"loss": 0.7027, "grad_norm": 0.7361312508583069, "learning_rate": 0.0002, "epoch": 2.699862581844637, "step": 16700}, {"loss": 0.6767, "grad_norm": 0.6605038046836853, "learning_rate": 0.0002, "epoch": 2.701479266025382, "step": 16710}, {"loss": 0.6885, "grad_norm": 0.7164411544799805, "learning_rate": 0.0002, "epoch": 2.7030959502061274, "step": 16720}, {"loss": 0.6736, "grad_norm": 0.6496201157569885, "learning_rate": 0.0002, "epoch": 2.7047126343868726, "step": 16730}, {"loss": 0.6942, "grad_norm": 0.7826663851737976, "learning_rate": 0.0002, "epoch": 2.706329318567618, "step": 16740}, {"loss": 0.6773, "grad_norm": 0.7639131546020508, "learning_rate": 0.0002, "epoch": 2.707946002748363, "step": 16750}, {"loss": 0.69, "grad_norm": 0.7976210713386536, "learning_rate": 0.0002, "epoch": 2.7095626869291083, "step": 16760}, {"loss": 0.6735, "grad_norm": 0.6836577653884888, "learning_rate": 0.0002, "epoch": 2.7111793711098535, "step": 16770}, {"loss": 0.6596, "grad_norm": 0.8025202751159668, "learning_rate": 0.0002, "epoch": 2.7127960552905988, "step": 16780}, {"loss": 0.6324, "grad_norm": 0.7636463642120361, "learning_rate": 0.0002, "epoch": 2.7144127394713444, "step": 16790}, {"loss": 0.6227, "grad_norm": 0.7481677532196045, "learning_rate": 0.0002, "epoch": 2.7160294236520897, "step": 16800}, {"loss": 0.6925, "grad_norm": 0.7566834688186646, "learning_rate": 0.0002, "epoch": 2.717646107832835, "step": 16810}, {"loss": 0.6531, "grad_norm": 0.7931267619132996, "learning_rate": 0.0002, "epoch": 2.71926279201358, "step": 16820}, {"loss": 0.6672, "grad_norm": 0.8811662197113037, "learning_rate": 0.0002, "epoch": 2.7208794761943254, "step": 16830}, {"loss": 0.6675, "grad_norm": 0.8561240434646606, "learning_rate": 0.0002, "epoch": 2.7224961603750706, "step": 16840}, {"loss": 0.7135, "grad_norm": 0.7121599316596985, "learning_rate": 0.0002, "epoch": 2.7241128445558163, "step": 16850}, {"loss": 0.6825, "grad_norm": 0.8066257238388062, "learning_rate": 0.0002, "epoch": 2.7257295287365615, "step": 16860}, {"loss": 0.6839, "grad_norm": 0.7699271440505981, "learning_rate": 0.0002, "epoch": 2.7273462129173067, "step": 16870}, {"loss": 0.699, "grad_norm": 1.1828432083129883, "learning_rate": 0.0002, "epoch": 2.728962897098052, "step": 16880}, {"loss": 0.6518, "grad_norm": 0.9989302754402161, "learning_rate": 0.0002, "epoch": 2.730579581278797, "step": 16890}, {"loss": 0.7015, "grad_norm": 0.8100560307502747, "learning_rate": 0.0002, "epoch": 2.7321962654595424, "step": 16900}, {"loss": 0.6851, "grad_norm": 0.8615233898162842, "learning_rate": 0.0002, "epoch": 2.7338129496402876, "step": 16910}, {"loss": 0.6322, "grad_norm": 0.8633756041526794, "learning_rate": 0.0002, "epoch": 2.735429633821033, "step": 16920}, {"loss": 0.6488, "grad_norm": 0.7769348621368408, "learning_rate": 0.0002, "epoch": 2.737046318001778, "step": 16930}, {"loss": 0.6582, "grad_norm": 0.6943058371543884, "learning_rate": 0.0002, "epoch": 2.738663002182524, "step": 16940}, {"loss": 0.6516, "grad_norm": 0.8510736227035522, "learning_rate": 0.0002, "epoch": 2.740279686363269, "step": 16950}, {"loss": 0.7275, "grad_norm": 0.7732602953910828, "learning_rate": 0.0002, "epoch": 2.7418963705440142, "step": 16960}, {"loss": 0.6553, "grad_norm": 0.5981788635253906, "learning_rate": 0.0002, "epoch": 2.7435130547247595, "step": 16970}, {"loss": 0.6777, "grad_norm": 0.7604416012763977, "learning_rate": 0.0002, "epoch": 2.7451297389055047, "step": 16980}, {"loss": 0.6981, "grad_norm": 0.7377738356590271, "learning_rate": 0.0002, "epoch": 2.74674642308625, "step": 16990}, {"loss": 0.6294, "grad_norm": 0.9400289058685303, "learning_rate": 0.0002, "epoch": 2.7483631072669956, "step": 17000}, {"loss": 0.6952, "grad_norm": 0.6340599656105042, "learning_rate": 0.0002, "epoch": 2.749979791447741, "step": 17010}, {"loss": 0.7222, "grad_norm": 0.7297601103782654, "learning_rate": 0.0002, "epoch": 2.751596475628486, "step": 17020}, {"loss": 0.6659, "grad_norm": 0.9479979872703552, "learning_rate": 0.0002, "epoch": 2.7532131598092313, "step": 17030}, {"loss": 0.691, "grad_norm": 0.8461511135101318, "learning_rate": 0.0002, "epoch": 2.7548298439899765, "step": 17040}, {"loss": 0.6764, "grad_norm": 0.7477551698684692, "learning_rate": 0.0002, "epoch": 2.7564465281707218, "step": 17050}, {"loss": 0.684, "grad_norm": 1.019270420074463, "learning_rate": 0.0002, "epoch": 2.758063212351467, "step": 17060}, {"loss": 0.7119, "grad_norm": 0.7730235457420349, "learning_rate": 0.0002, "epoch": 2.7596798965322122, "step": 17070}, {"loss": 0.6886, "grad_norm": 0.8216866254806519, "learning_rate": 0.0002, "epoch": 2.7612965807129575, "step": 17080}, {"loss": 0.6811, "grad_norm": 0.7235931754112244, "learning_rate": 0.0002, "epoch": 2.762913264893703, "step": 17090}, {"loss": 0.7031, "grad_norm": 0.7352296710014343, "learning_rate": 0.0002, "epoch": 2.7645299490744484, "step": 17100}, {"loss": 0.6951, "grad_norm": 0.8129373788833618, "learning_rate": 0.0002, "epoch": 2.7661466332551936, "step": 17110}, {"loss": 0.6703, "grad_norm": 0.7387019991874695, "learning_rate": 0.0002, "epoch": 2.767763317435939, "step": 17120}, {"loss": 0.6789, "grad_norm": 0.9149190187454224, "learning_rate": 0.0002, "epoch": 2.769380001616684, "step": 17130}, {"loss": 0.6038, "grad_norm": 0.7352971434593201, "learning_rate": 0.0002, "epoch": 2.7709966857974297, "step": 17140}, {"loss": 0.6728, "grad_norm": 0.7903780341148376, "learning_rate": 0.0002, "epoch": 2.772613369978175, "step": 17150}, {"loss": 0.6988, "grad_norm": 0.8255927562713623, "learning_rate": 0.0002, "epoch": 2.77423005415892, "step": 17160}, {"loss": 0.6694, "grad_norm": 0.7235927581787109, "learning_rate": 0.0002, "epoch": 2.7758467383396654, "step": 17170}, {"loss": 0.7161, "grad_norm": 0.8281434774398804, "learning_rate": 0.0002, "epoch": 2.7774634225204107, "step": 17180}, {"loss": 0.682, "grad_norm": 0.7586921453475952, "learning_rate": 0.0002, "epoch": 2.779080106701156, "step": 17190}, {"loss": 0.6427, "grad_norm": 0.7161715030670166, "learning_rate": 0.0002, "epoch": 2.780696790881901, "step": 17200}, {"loss": 0.6426, "grad_norm": 0.762868344783783, "learning_rate": 0.0002, "epoch": 2.7823134750626464, "step": 17210}, {"loss": 0.705, "grad_norm": 0.9285483360290527, "learning_rate": 0.0002, "epoch": 2.7839301592433916, "step": 17220}, {"loss": 0.7084, "grad_norm": 0.6900462508201599, "learning_rate": 0.0002, "epoch": 2.785546843424137, "step": 17230}, {"loss": 0.6988, "grad_norm": 0.780384361743927, "learning_rate": 0.0002, "epoch": 2.7871635276048825, "step": 17240}, {"loss": 0.7073, "grad_norm": 0.7580406665802002, "learning_rate": 0.0002, "epoch": 2.7887802117856277, "step": 17250}, {"loss": 0.6833, "grad_norm": 0.8145199418067932, "learning_rate": 0.0002, "epoch": 2.790396895966373, "step": 17260}, {"loss": 0.6909, "grad_norm": 0.9159596562385559, "learning_rate": 0.0002, "epoch": 2.792013580147118, "step": 17270}, {"loss": 0.6008, "grad_norm": 0.9590014219284058, "learning_rate": 0.0002, "epoch": 2.7936302643278634, "step": 17280}, {"loss": 0.6704, "grad_norm": 0.7603529691696167, "learning_rate": 0.0002, "epoch": 2.795246948508609, "step": 17290}, {"loss": 0.7165, "grad_norm": 0.8039976358413696, "learning_rate": 0.0002, "epoch": 2.7968636326893543, "step": 17300}, {"loss": 0.7037, "grad_norm": 0.8364847302436829, "learning_rate": 0.0002, "epoch": 2.7984803168700996, "step": 17310}, {"loss": 0.6749, "grad_norm": 0.8763046860694885, "learning_rate": 0.0002, "epoch": 2.800097001050845, "step": 17320}, {"loss": 0.6844, "grad_norm": 0.8409647941589355, "learning_rate": 0.0002, "epoch": 2.80171368523159, "step": 17330}, {"loss": 0.6936, "grad_norm": 0.7649006247520447, "learning_rate": 0.0002, "epoch": 2.8033303694123353, "step": 17340}, {"loss": 0.7051, "grad_norm": 0.7970262169837952, "learning_rate": 0.0002, "epoch": 2.8049470535930805, "step": 17350}, {"loss": 0.6533, "grad_norm": 0.9088607430458069, "learning_rate": 0.0002, "epoch": 2.8065637377738257, "step": 17360}, {"loss": 0.675, "grad_norm": 0.6454846858978271, "learning_rate": 0.0002, "epoch": 2.808180421954571, "step": 17370}, {"loss": 0.7069, "grad_norm": 0.7744787931442261, "learning_rate": 0.0002, "epoch": 2.809797106135316, "step": 17380}, {"loss": 0.6772, "grad_norm": 0.6678640842437744, "learning_rate": 0.0002, "epoch": 2.811413790316062, "step": 17390}, {"loss": 0.6784, "grad_norm": 0.772676944732666, "learning_rate": 0.0002, "epoch": 2.813030474496807, "step": 17400}, {"loss": 0.7252, "grad_norm": 0.7088175415992737, "learning_rate": 0.0002, "epoch": 2.8146471586775523, "step": 17410}, {"loss": 0.7086, "grad_norm": 0.8280573487281799, "learning_rate": 0.0002, "epoch": 2.8162638428582976, "step": 17420}, {"loss": 0.6732, "grad_norm": 0.6665388345718384, "learning_rate": 0.0002, "epoch": 2.817880527039043, "step": 17430}, {"loss": 0.6675, "grad_norm": 0.6427883505821228, "learning_rate": 0.0002, "epoch": 2.8194972112197885, "step": 17440}, {"loss": 0.6972, "grad_norm": 0.9697760343551636, "learning_rate": 0.0002, "epoch": 2.8211138954005337, "step": 17450}, {"loss": 0.6838, "grad_norm": 0.7573966383934021, "learning_rate": 0.0002, "epoch": 2.822730579581279, "step": 17460}, {"loss": 0.7243, "grad_norm": 0.878688633441925, "learning_rate": 0.0002, "epoch": 2.824347263762024, "step": 17470}, {"loss": 0.6666, "grad_norm": 0.7752242684364319, "learning_rate": 0.0002, "epoch": 2.8259639479427694, "step": 17480}, {"loss": 0.6638, "grad_norm": 0.6135398745536804, "learning_rate": 0.0002, "epoch": 2.8275806321235146, "step": 17490}, {"loss": 0.6829, "grad_norm": 0.6924924850463867, "learning_rate": 0.0002, "epoch": 2.82919731630426, "step": 17500}, {"loss": 0.6731, "grad_norm": 0.7471627593040466, "learning_rate": 0.0002, "epoch": 2.830814000485005, "step": 17510}, {"loss": 0.7016, "grad_norm": 0.7145499587059021, "learning_rate": 0.0002, "epoch": 2.8324306846657503, "step": 17520}, {"loss": 0.6787, "grad_norm": 0.7415414452552795, "learning_rate": 0.0002, "epoch": 2.834047368846496, "step": 17530}, {"loss": 0.6811, "grad_norm": 0.7328441739082336, "learning_rate": 0.0002, "epoch": 2.8356640530272412, "step": 17540}, {"loss": 0.6866, "grad_norm": 0.8267839550971985, "learning_rate": 0.0002, "epoch": 2.8372807372079865, "step": 17550}, {"loss": 0.6787, "grad_norm": 0.8877885341644287, "learning_rate": 0.0002, "epoch": 2.8388974213887317, "step": 17560}, {"loss": 0.7136, "grad_norm": 0.857138454914093, "learning_rate": 0.0002, "epoch": 2.840514105569477, "step": 17570}, {"loss": 0.6454, "grad_norm": 0.8470779657363892, "learning_rate": 0.0002, "epoch": 2.842130789750222, "step": 17580}, {"loss": 0.6976, "grad_norm": 0.8553254008293152, "learning_rate": 0.0002, "epoch": 2.843747473930968, "step": 17590}, {"loss": 0.7297, "grad_norm": 0.8033196926116943, "learning_rate": 0.0002, "epoch": 2.845364158111713, "step": 17600}, {"loss": 0.7062, "grad_norm": 0.7949087023735046, "learning_rate": 0.0002, "epoch": 2.8469808422924583, "step": 17610}, {"loss": 0.651, "grad_norm": 0.9241406321525574, "learning_rate": 0.0002, "epoch": 2.8485975264732035, "step": 17620}, {"loss": 0.6601, "grad_norm": 0.7721285223960876, "learning_rate": 0.0002, "epoch": 2.8502142106539488, "step": 17630}, {"loss": 0.6183, "grad_norm": 1.0246692895889282, "learning_rate": 0.0002, "epoch": 2.851830894834694, "step": 17640}, {"loss": 0.7007, "grad_norm": 0.9244589805603027, "learning_rate": 0.0002, "epoch": 2.853447579015439, "step": 17650}, {"loss": 0.7274, "grad_norm": 0.7243508696556091, "learning_rate": 0.0002, "epoch": 2.8550642631961844, "step": 17660}, {"loss": 0.6471, "grad_norm": 0.8943371176719666, "learning_rate": 0.0002, "epoch": 2.8566809473769297, "step": 17670}, {"loss": 0.686, "grad_norm": 0.6531758904457092, "learning_rate": 0.0002, "epoch": 2.8582976315576754, "step": 17680}, {"loss": 0.6253, "grad_norm": 0.8367000818252563, "learning_rate": 0.0002, "epoch": 2.8599143157384206, "step": 17690}, {"loss": 0.6943, "grad_norm": 0.7868556380271912, "learning_rate": 0.0002, "epoch": 2.861530999919166, "step": 17700}, {"loss": 0.6919, "grad_norm": 0.7213859558105469, "learning_rate": 0.0002, "epoch": 2.863147684099911, "step": 17710}, {"loss": 0.6657, "grad_norm": 0.7383931279182434, "learning_rate": 0.0002, "epoch": 2.8647643682806563, "step": 17720}, {"loss": 0.6841, "grad_norm": 0.7566812634468079, "learning_rate": 0.0002, "epoch": 2.8663810524614015, "step": 17730}, {"loss": 0.6449, "grad_norm": 0.6930373311042786, "learning_rate": 0.0002, "epoch": 2.867997736642147, "step": 17740}, {"loss": 0.6764, "grad_norm": 0.7911090850830078, "learning_rate": 0.0002, "epoch": 2.8696144208228924, "step": 17750}, {"loss": 0.6554, "grad_norm": 0.8484548926353455, "learning_rate": 0.0002, "epoch": 2.8712311050036377, "step": 17760}, {"loss": 0.6931, "grad_norm": 0.7647597193717957, "learning_rate": 0.0002, "epoch": 2.872847789184383, "step": 17770}, {"loss": 0.6945, "grad_norm": 0.8791151642799377, "learning_rate": 0.0002, "epoch": 2.874464473365128, "step": 17780}, {"loss": 0.7078, "grad_norm": 0.7253178358078003, "learning_rate": 0.0002, "epoch": 2.8760811575458733, "step": 17790}, {"loss": 0.6474, "grad_norm": 0.7956077456474304, "learning_rate": 0.0002, "epoch": 2.8776978417266186, "step": 17800}, {"loss": 0.6687, "grad_norm": 0.8657688498497009, "learning_rate": 0.0002, "epoch": 2.879314525907364, "step": 17810}, {"loss": 0.7171, "grad_norm": 0.7059141993522644, "learning_rate": 0.0002, "epoch": 2.880931210088109, "step": 17820}, {"loss": 0.683, "grad_norm": 0.8886896967887878, "learning_rate": 0.0002, "epoch": 2.8825478942688547, "step": 17830}, {"loss": 0.669, "grad_norm": 0.821032702922821, "learning_rate": 0.0002, "epoch": 2.8841645784496, "step": 17840}, {"loss": 0.6805, "grad_norm": 0.7183963656425476, "learning_rate": 0.0002, "epoch": 2.885781262630345, "step": 17850}, {"loss": 0.7088, "grad_norm": 0.6222899556159973, "learning_rate": 0.0002, "epoch": 2.8873979468110904, "step": 17860}, {"loss": 0.6626, "grad_norm": 0.8187434077262878, "learning_rate": 0.0002, "epoch": 2.8890146309918356, "step": 17870}, {"loss": 0.6815, "grad_norm": 0.9838479161262512, "learning_rate": 0.0002, "epoch": 2.890631315172581, "step": 17880}, {"loss": 0.6967, "grad_norm": 0.7567742466926575, "learning_rate": 0.0002, "epoch": 2.8922479993533265, "step": 17890}, {"loss": 0.7073, "grad_norm": 0.6875903606414795, "learning_rate": 0.0002, "epoch": 2.893864683534072, "step": 17900}, {"loss": 0.6415, "grad_norm": 0.8043789267539978, "learning_rate": 0.0002, "epoch": 2.895481367714817, "step": 17910}, {"loss": 0.6588, "grad_norm": 0.8062626719474792, "learning_rate": 0.0002, "epoch": 2.8970980518955622, "step": 17920}, {"loss": 0.7151, "grad_norm": 1.0251191854476929, "learning_rate": 0.0002, "epoch": 2.8987147360763075, "step": 17930}, {"loss": 0.6605, "grad_norm": 0.882253110408783, "learning_rate": 0.0002, "epoch": 2.9003314202570527, "step": 17940}, {"loss": 0.6719, "grad_norm": 0.8683299422264099, "learning_rate": 0.0002, "epoch": 2.901948104437798, "step": 17950}, {"loss": 0.6896, "grad_norm": 0.7167282104492188, "learning_rate": 0.0002, "epoch": 2.903564788618543, "step": 17960}, {"loss": 0.663, "grad_norm": 0.7093694806098938, "learning_rate": 0.0002, "epoch": 2.9051814727992884, "step": 17970}, {"loss": 0.6591, "grad_norm": 0.8549879193305969, "learning_rate": 0.0002, "epoch": 2.906798156980034, "step": 17980}, {"loss": 0.6962, "grad_norm": 0.6989606618881226, "learning_rate": 0.0002, "epoch": 2.9084148411607793, "step": 17990}, {"loss": 0.6635, "grad_norm": 0.9482976794242859, "learning_rate": 0.0002, "epoch": 2.9100315253415245, "step": 18000}, {"loss": 0.6586, "grad_norm": 0.7182440161705017, "learning_rate": 0.0002, "epoch": 2.9116482095222698, "step": 18010}, {"loss": 0.6827, "grad_norm": 0.7732226252555847, "learning_rate": 0.0002, "epoch": 2.913264893703015, "step": 18020}, {"loss": 0.7123, "grad_norm": 0.7936875224113464, "learning_rate": 0.0002, "epoch": 2.9148815778837607, "step": 18030}, {"loss": 0.6736, "grad_norm": 0.8825615644454956, "learning_rate": 0.0002, "epoch": 2.916498262064506, "step": 18040}, {"loss": 0.7139, "grad_norm": 0.6778587102890015, "learning_rate": 0.0002, "epoch": 2.918114946245251, "step": 18050}, {"loss": 0.6588, "grad_norm": 0.7529265880584717, "learning_rate": 0.0002, "epoch": 2.9197316304259964, "step": 18060}, {"loss": 0.737, "grad_norm": 0.7111883163452148, "learning_rate": 0.0002, "epoch": 2.9213483146067416, "step": 18070}, {"loss": 0.7475, "grad_norm": 0.7214767932891846, "learning_rate": 0.0002, "epoch": 2.922964998787487, "step": 18080}, {"loss": 0.6672, "grad_norm": 0.800417423248291, "learning_rate": 0.0002, "epoch": 2.924581682968232, "step": 18090}, {"loss": 0.6694, "grad_norm": 1.248575210571289, "learning_rate": 0.0002, "epoch": 2.9261983671489773, "step": 18100}, {"loss": 0.7004, "grad_norm": 0.757788360118866, "learning_rate": 0.0002, "epoch": 2.9278150513297225, "step": 18110}, {"loss": 0.6999, "grad_norm": 1.0583995580673218, "learning_rate": 0.0002, "epoch": 2.9294317355104678, "step": 18120}, {"loss": 0.6365, "grad_norm": 0.8228777647018433, "learning_rate": 0.0002, "epoch": 2.9310484196912134, "step": 18130}, {"loss": 0.6791, "grad_norm": 0.8374035358428955, "learning_rate": 0.0002, "epoch": 2.9326651038719587, "step": 18140}, {"loss": 0.6399, "grad_norm": 0.7976473569869995, "learning_rate": 0.0002, "epoch": 2.934281788052704, "step": 18150}, {"loss": 0.6585, "grad_norm": 0.8009907603263855, "learning_rate": 0.0002, "epoch": 2.935898472233449, "step": 18160}, {"loss": 0.7485, "grad_norm": 0.835213303565979, "learning_rate": 0.0002, "epoch": 2.9375151564141944, "step": 18170}, {"loss": 0.7376, "grad_norm": 0.7982219457626343, "learning_rate": 0.0002, "epoch": 2.93913184059494, "step": 18180}, {"loss": 0.6348, "grad_norm": 0.7070978879928589, "learning_rate": 0.0002, "epoch": 2.9407485247756853, "step": 18190}, {"loss": 0.6608, "grad_norm": 0.8619440197944641, "learning_rate": 0.0002, "epoch": 2.9423652089564305, "step": 18200}, {"loss": 0.666, "grad_norm": 0.6693987250328064, "learning_rate": 0.0002, "epoch": 2.9439818931371757, "step": 18210}, {"loss": 0.728, "grad_norm": 0.6747021079063416, "learning_rate": 0.0002, "epoch": 2.945598577317921, "step": 18220}, {"loss": 0.6686, "grad_norm": 0.860387921333313, "learning_rate": 0.0002, "epoch": 2.947215261498666, "step": 18230}, {"loss": 0.6945, "grad_norm": 0.799976646900177, "learning_rate": 0.0002, "epoch": 2.9488319456794114, "step": 18240}, {"loss": 0.7243, "grad_norm": 0.7864769101142883, "learning_rate": 0.0002, "epoch": 2.9504486298601567, "step": 18250}, {"loss": 0.6785, "grad_norm": 0.6713884472846985, "learning_rate": 0.0002, "epoch": 2.952065314040902, "step": 18260}, {"loss": 0.7429, "grad_norm": 0.9031508564949036, "learning_rate": 0.0002, "epoch": 2.9536819982216476, "step": 18270}, {"loss": 0.7055, "grad_norm": 0.7205073237419128, "learning_rate": 0.0002, "epoch": 2.955298682402393, "step": 18280}, {"loss": 0.7298, "grad_norm": 0.7746205925941467, "learning_rate": 0.0002, "epoch": 2.956915366583138, "step": 18290}, {"loss": 0.6218, "grad_norm": 0.6533427834510803, "learning_rate": 0.0002, "epoch": 2.9585320507638833, "step": 18300}, {"loss": 0.6674, "grad_norm": 0.9083208441734314, "learning_rate": 0.0002, "epoch": 2.9601487349446285, "step": 18310}, {"loss": 0.7359, "grad_norm": 0.7446991801261902, "learning_rate": 0.0002, "epoch": 2.9617654191253737, "step": 18320}, {"loss": 0.6738, "grad_norm": 0.6514461636543274, "learning_rate": 0.0002, "epoch": 2.9633821033061194, "step": 18330}, {"loss": 0.6677, "grad_norm": 0.8580465912818909, "learning_rate": 0.0002, "epoch": 2.9649987874868646, "step": 18340}, {"loss": 0.6971, "grad_norm": 0.7074266076087952, "learning_rate": 0.0002, "epoch": 2.96661547166761, "step": 18350}, {"loss": 0.6804, "grad_norm": 0.899892270565033, "learning_rate": 0.0002, "epoch": 2.968232155848355, "step": 18360}, {"loss": 0.7094, "grad_norm": 0.8217641711235046, "learning_rate": 0.0002, "epoch": 2.9698488400291003, "step": 18370}, {"loss": 0.6916, "grad_norm": 0.8611799478530884, "learning_rate": 0.0002, "epoch": 2.9714655242098456, "step": 18380}, {"loss": 0.6677, "grad_norm": 0.6909302473068237, "learning_rate": 0.0002, "epoch": 2.973082208390591, "step": 18390}, {"loss": 0.7247, "grad_norm": 0.6554358005523682, "learning_rate": 0.0002, "epoch": 2.974698892571336, "step": 18400}, {"loss": 0.6516, "grad_norm": 0.7803071737289429, "learning_rate": 0.0002, "epoch": 2.9763155767520812, "step": 18410}, {"loss": 0.7322, "grad_norm": 0.7838954925537109, "learning_rate": 0.0002, "epoch": 2.977932260932827, "step": 18420}, {"loss": 0.6522, "grad_norm": 0.7098495364189148, "learning_rate": 0.0002, "epoch": 2.979548945113572, "step": 18430}, {"loss": 0.739, "grad_norm": 0.8981785774230957, "learning_rate": 0.0002, "epoch": 2.9811656292943174, "step": 18440}, {"loss": 0.6689, "grad_norm": 0.7197171449661255, "learning_rate": 0.0002, "epoch": 2.9827823134750626, "step": 18450}, {"loss": 0.706, "grad_norm": 0.793185293674469, "learning_rate": 0.0002, "epoch": 2.984398997655808, "step": 18460}, {"loss": 0.7124, "grad_norm": 0.8531473875045776, "learning_rate": 0.0002, "epoch": 2.986015681836553, "step": 18470}, {"loss": 0.6901, "grad_norm": 0.6627361178398132, "learning_rate": 0.0002, "epoch": 2.9876323660172988, "step": 18480}, {"loss": 0.6591, "grad_norm": 0.5708155035972595, "learning_rate": 0.0002, "epoch": 2.989249050198044, "step": 18490}, {"loss": 0.6725, "grad_norm": 0.8227280378341675, "learning_rate": 0.0002, "epoch": 2.990865734378789, "step": 18500}, {"loss": 0.6701, "grad_norm": 0.7102749943733215, "learning_rate": 0.0002, "epoch": 2.9924824185595345, "step": 18510}, {"loss": 0.7091, "grad_norm": 0.839485228061676, "learning_rate": 0.0002, "epoch": 2.9940991027402797, "step": 18520}, {"loss": 0.6521, "grad_norm": 0.9038704037666321, "learning_rate": 0.0002, "epoch": 2.995715786921025, "step": 18530}, {"loss": 0.7186, "grad_norm": 0.8737510442733765, "learning_rate": 0.0002, "epoch": 2.99733247110177, "step": 18540}, {"loss": 0.6819, "grad_norm": 0.7323142886161804, "learning_rate": 0.0002, "epoch": 2.9989491552825154, "step": 18550}, {"eval_loss": 1.1262480020523071, "eval_runtime": 122.0868, "eval_samples_per_second": 6.004, "eval_steps_per_second": 0.754, "epoch": 2.9999191657909625, "step": 18556}, {"loss": 0.6337, "grad_norm": 0.8465463519096375, "learning_rate": 0.0002, "epoch": 3.000565839463261, "step": 18560}, {"loss": 0.6064, "grad_norm": 0.9134138822555542, "learning_rate": 0.0002, "epoch": 3.0021825236440063, "step": 18570}, {"loss": 0.5804, "grad_norm": 0.760715126991272, "learning_rate": 0.0002, "epoch": 3.0037992078247515, "step": 18580}, {"loss": 0.5571, "grad_norm": 0.9208743572235107, "learning_rate": 0.0002, "epoch": 3.0054158920054967, "step": 18590}, {"loss": 0.5731, "grad_norm": 0.9232364892959595, "learning_rate": 0.0002, "epoch": 3.007032576186242, "step": 18600}, {"loss": 0.6299, "grad_norm": 1.1881544589996338, "learning_rate": 0.0002, "epoch": 3.008649260366987, "step": 18610}, {"loss": 0.5482, "grad_norm": 0.9372987747192383, "learning_rate": 0.0002, "epoch": 3.0102659445477324, "step": 18620}, {"loss": 0.5709, "grad_norm": 0.6900241374969482, "learning_rate": 0.0002, "epoch": 3.0118826287284777, "step": 18630}, {"loss": 0.5256, "grad_norm": 0.8451071381568909, "learning_rate": 0.0002, "epoch": 3.0134993129092233, "step": 18640}, {"loss": 0.5916, "grad_norm": 0.7763112187385559, "learning_rate": 0.0002, "epoch": 3.0151159970899686, "step": 18650}, {"loss": 0.6095, "grad_norm": 1.043653964996338, "learning_rate": 0.0002, "epoch": 3.016732681270714, "step": 18660}, {"loss": 0.6228, "grad_norm": 1.0170660018920898, "learning_rate": 0.0002, "epoch": 3.018349365451459, "step": 18670}, {"loss": 0.5671, "grad_norm": 0.7534180283546448, "learning_rate": 0.0002, "epoch": 3.0199660496322043, "step": 18680}, {"loss": 0.6015, "grad_norm": 0.7507367730140686, "learning_rate": 0.0002, "epoch": 3.0215827338129495, "step": 18690}, {"loss": 0.6201, "grad_norm": 0.7861620187759399, "learning_rate": 0.0002, "epoch": 3.0231994179936947, "step": 18700}, {"loss": 0.5802, "grad_norm": 1.0580339431762695, "learning_rate": 0.0002, "epoch": 3.0248161021744404, "step": 18710}, {"loss": 0.5975, "grad_norm": 0.7542710900306702, "learning_rate": 0.0002, "epoch": 3.0264327863551856, "step": 18720}, {"loss": 0.5695, "grad_norm": 0.8189544677734375, "learning_rate": 0.0002, "epoch": 3.028049470535931, "step": 18730}, {"loss": 0.6109, "grad_norm": 0.9126611351966858, "learning_rate": 0.0002, "epoch": 3.029666154716676, "step": 18740}, {"loss": 0.6443, "grad_norm": 0.8891341686248779, "learning_rate": 0.0002, "epoch": 3.0312828388974213, "step": 18750}, {"loss": 0.6207, "grad_norm": 0.8419283032417297, "learning_rate": 0.0002, "epoch": 3.0328995230781666, "step": 18760}, {"loss": 0.5818, "grad_norm": 0.8048048615455627, "learning_rate": 0.0002, "epoch": 3.034516207258912, "step": 18770}, {"loss": 0.6381, "grad_norm": 0.7820217609405518, "learning_rate": 0.0002, "epoch": 3.0361328914396575, "step": 18780}, {"loss": 0.5843, "grad_norm": 0.854721188545227, "learning_rate": 0.0002, "epoch": 3.0377495756204027, "step": 18790}, {"loss": 0.5784, "grad_norm": 0.912092924118042, "learning_rate": 0.0002, "epoch": 3.039366259801148, "step": 18800}, {"loss": 0.5734, "grad_norm": 0.6596226096153259, "learning_rate": 0.0002, "epoch": 3.040982943981893, "step": 18810}, {"loss": 0.5969, "grad_norm": 0.6351348757743835, "learning_rate": 0.0002, "epoch": 3.0425996281626384, "step": 18820}, {"loss": 0.5953, "grad_norm": 0.778188943862915, "learning_rate": 0.0002, "epoch": 3.0442163123433836, "step": 18830}, {"loss": 0.602, "grad_norm": 0.68234783411026, "learning_rate": 0.0002, "epoch": 3.045832996524129, "step": 18840}, {"loss": 0.5785, "grad_norm": 0.998628556728363, "learning_rate": 0.0002, "epoch": 3.047449680704874, "step": 18850}, {"loss": 0.6231, "grad_norm": 0.7393841743469238, "learning_rate": 0.0002, "epoch": 3.0490663648856198, "step": 18860}, {"loss": 0.568, "grad_norm": 0.84438556432724, "learning_rate": 0.0002, "epoch": 3.050683049066365, "step": 18870}, {"loss": 0.6205, "grad_norm": 0.8857501745223999, "learning_rate": 0.0002, "epoch": 3.0522997332471102, "step": 18880}, {"loss": 0.6335, "grad_norm": 0.7208474278450012, "learning_rate": 0.0002, "epoch": 3.0539164174278555, "step": 18890}, {"loss": 0.5998, "grad_norm": 0.7135229110717773, "learning_rate": 0.0002, "epoch": 3.0555331016086007, "step": 18900}, {"loss": 0.5575, "grad_norm": 0.9130001664161682, "learning_rate": 0.0002, "epoch": 3.057149785789346, "step": 18910}, {"loss": 0.5955, "grad_norm": 0.9001716375350952, "learning_rate": 0.0002, "epoch": 3.058766469970091, "step": 18920}, {"loss": 0.6052, "grad_norm": 0.8667559623718262, "learning_rate": 0.0002, "epoch": 3.060383154150837, "step": 18930}, {"loss": 0.5818, "grad_norm": 0.8943959474563599, "learning_rate": 0.0002, "epoch": 3.061999838331582, "step": 18940}, {"loss": 0.5978, "grad_norm": 0.8298377990722656, "learning_rate": 0.0002, "epoch": 3.0636165225123273, "step": 18950}, {"loss": 0.5782, "grad_norm": 0.7935267686843872, "learning_rate": 0.0002, "epoch": 3.0652332066930725, "step": 18960}, {"loss": 0.6434, "grad_norm": 1.1506379842758179, "learning_rate": 0.0002, "epoch": 3.0668498908738178, "step": 18970}, {"loss": 0.5571, "grad_norm": 0.7693049907684326, "learning_rate": 0.0002, "epoch": 3.068466575054563, "step": 18980}, {"loss": 0.5971, "grad_norm": 0.8040135502815247, "learning_rate": 0.0002, "epoch": 3.0700832592353082, "step": 18990}, {"loss": 0.5541, "grad_norm": 0.828404426574707, "learning_rate": 0.0002, "epoch": 3.0716999434160535, "step": 19000}, {"loss": 0.6048, "grad_norm": 0.8811164498329163, "learning_rate": 0.0002, "epoch": 3.073316627596799, "step": 19010}, {"loss": 0.5845, "grad_norm": 1.036205768585205, "learning_rate": 0.0002, "epoch": 3.0749333117775444, "step": 19020}, {"loss": 0.5838, "grad_norm": 0.8857285976409912, "learning_rate": 0.0002, "epoch": 3.0765499959582896, "step": 19030}, {"loss": 0.592, "grad_norm": 0.8392079472541809, "learning_rate": 0.0002, "epoch": 3.078166680139035, "step": 19040}, {"loss": 0.5927, "grad_norm": 1.0287401676177979, "learning_rate": 0.0002, "epoch": 3.07978336431978, "step": 19050}, {"loss": 0.5964, "grad_norm": 1.0086315870285034, "learning_rate": 0.0002, "epoch": 3.0814000485005253, "step": 19060}, {"loss": 0.5567, "grad_norm": 0.9245324730873108, "learning_rate": 0.0002, "epoch": 3.0830167326812705, "step": 19070}, {"loss": 0.5797, "grad_norm": 0.8680877089500427, "learning_rate": 0.0002, "epoch": 3.084633416862016, "step": 19080}, {"loss": 0.5611, "grad_norm": 0.8814793825149536, "learning_rate": 0.0002, "epoch": 3.0862501010427614, "step": 19090}, {"loss": 0.6051, "grad_norm": 0.9234458208084106, "learning_rate": 0.0002, "epoch": 3.0878667852235067, "step": 19100}, {"loss": 0.6209, "grad_norm": 1.1291664838790894, "learning_rate": 0.0002, "epoch": 3.089483469404252, "step": 19110}, {"loss": 0.5695, "grad_norm": 0.9191402792930603, "learning_rate": 0.0002, "epoch": 3.091100153584997, "step": 19120}, {"loss": 0.5856, "grad_norm": 0.7103154063224792, "learning_rate": 0.0002, "epoch": 3.0927168377657424, "step": 19130}, {"loss": 0.6479, "grad_norm": 0.9368883967399597, "learning_rate": 0.0002, "epoch": 3.0943335219464876, "step": 19140}, {"loss": 0.6167, "grad_norm": 0.9676656723022461, "learning_rate": 0.0002, "epoch": 3.095950206127233, "step": 19150}, {"loss": 0.5794, "grad_norm": 0.8739792704582214, "learning_rate": 0.0002, "epoch": 3.0975668903079785, "step": 19160}, {"loss": 0.6112, "grad_norm": 0.8530174493789673, "learning_rate": 0.0002, "epoch": 3.0991835744887237, "step": 19170}, {"loss": 0.6568, "grad_norm": 0.794945478439331, "learning_rate": 0.0002, "epoch": 3.100800258669469, "step": 19180}, {"loss": 0.5928, "grad_norm": 0.9508888125419617, "learning_rate": 0.0002, "epoch": 3.102416942850214, "step": 19190}, {"loss": 0.5757, "grad_norm": 1.0599955320358276, "learning_rate": 0.0002, "epoch": 3.1040336270309594, "step": 19200}, {"loss": 0.6151, "grad_norm": 1.0673625469207764, "learning_rate": 0.0002, "epoch": 3.1056503112117047, "step": 19210}, {"loss": 0.6043, "grad_norm": 0.7739115953445435, "learning_rate": 0.0002, "epoch": 3.10726699539245, "step": 19220}, {"loss": 0.6046, "grad_norm": 0.9884951114654541, "learning_rate": 0.0002, "epoch": 3.1088836795731956, "step": 19230}, {"loss": 0.5932, "grad_norm": 0.862260103225708, "learning_rate": 0.0002, "epoch": 3.110500363753941, "step": 19240}, {"loss": 0.6098, "grad_norm": 0.7690284848213196, "learning_rate": 0.0002, "epoch": 3.112117047934686, "step": 19250}, {"loss": 0.5791, "grad_norm": 0.8758958578109741, "learning_rate": 0.0002, "epoch": 3.1137337321154313, "step": 19260}, {"loss": 0.6136, "grad_norm": 1.0356395244598389, "learning_rate": 0.0002, "epoch": 3.1153504162961765, "step": 19270}, {"loss": 0.6159, "grad_norm": 0.6950937509536743, "learning_rate": 0.0002, "epoch": 3.1169671004769217, "step": 19280}, {"loss": 0.592, "grad_norm": 0.760998010635376, "learning_rate": 0.0002, "epoch": 3.118583784657667, "step": 19290}, {"loss": 0.575, "grad_norm": 0.9335789084434509, "learning_rate": 0.0002, "epoch": 3.1202004688384126, "step": 19300}, {"loss": 0.6139, "grad_norm": 0.9636204242706299, "learning_rate": 0.0002, "epoch": 3.121817153019158, "step": 19310}, {"loss": 0.6001, "grad_norm": 1.0820997953414917, "learning_rate": 0.0002, "epoch": 3.123433837199903, "step": 19320}, {"loss": 0.6542, "grad_norm": 0.7333487272262573, "learning_rate": 0.0002, "epoch": 3.1250505213806483, "step": 19330}, {"loss": 0.6178, "grad_norm": 1.0417509078979492, "learning_rate": 0.0002, "epoch": 3.1266672055613935, "step": 19340}, {"loss": 0.603, "grad_norm": 0.9267749190330505, "learning_rate": 0.0002, "epoch": 3.128283889742139, "step": 19350}, {"loss": 0.6063, "grad_norm": 0.777798593044281, "learning_rate": 0.0002, "epoch": 3.129900573922884, "step": 19360}, {"loss": 0.5913, "grad_norm": 0.8425456881523132, "learning_rate": 0.0002, "epoch": 3.1315172581036297, "step": 19370}, {"loss": 0.6042, "grad_norm": 0.9617102146148682, "learning_rate": 0.0002, "epoch": 3.133133942284375, "step": 19380}, {"loss": 0.633, "grad_norm": 1.0052828788757324, "learning_rate": 0.0002, "epoch": 3.13475062646512, "step": 19390}, {"loss": 0.5713, "grad_norm": 0.7637009024620056, "learning_rate": 0.0002, "epoch": 3.1363673106458654, "step": 19400}, {"loss": 0.5497, "grad_norm": 0.7958088517189026, "learning_rate": 0.0002, "epoch": 3.1379839948266106, "step": 19410}, {"loss": 0.6283, "grad_norm": 0.9161727428436279, "learning_rate": 0.0002, "epoch": 3.139600679007356, "step": 19420}, {"loss": 0.5638, "grad_norm": 0.8402149677276611, "learning_rate": 0.0002, "epoch": 3.141217363188101, "step": 19430}, {"loss": 0.5848, "grad_norm": 1.0056525468826294, "learning_rate": 0.0002, "epoch": 3.1428340473688463, "step": 19440}, {"loss": 0.5954, "grad_norm": 1.0129190683364868, "learning_rate": 0.0002, "epoch": 3.144450731549592, "step": 19450}, {"loss": 0.5808, "grad_norm": 0.790825366973877, "learning_rate": 0.0002, "epoch": 3.146067415730337, "step": 19460}, {"loss": 0.5607, "grad_norm": 1.441665530204773, "learning_rate": 0.0002, "epoch": 3.1476840999110824, "step": 19470}, {"loss": 0.5785, "grad_norm": 0.7846331596374512, "learning_rate": 0.0002, "epoch": 3.1493007840918277, "step": 19480}, {"loss": 0.5892, "grad_norm": 0.7915332913398743, "learning_rate": 0.0002, "epoch": 3.150917468272573, "step": 19490}, {"loss": 0.5759, "grad_norm": 0.933982253074646, "learning_rate": 0.0002, "epoch": 3.152534152453318, "step": 19500}, {"loss": 0.6206, "grad_norm": 1.038408637046814, "learning_rate": 0.0002, "epoch": 3.1541508366340634, "step": 19510}, {"loss": 0.6271, "grad_norm": 1.018935203552246, "learning_rate": 0.0002, "epoch": 3.155767520814809, "step": 19520}, {"loss": 0.6173, "grad_norm": 0.9618112444877625, "learning_rate": 0.0002, "epoch": 3.1573842049955543, "step": 19530}, {"loss": 0.5972, "grad_norm": 0.8900452852249146, "learning_rate": 0.0002, "epoch": 3.1590008891762995, "step": 19540}, {"loss": 0.5925, "grad_norm": 0.8254160284996033, "learning_rate": 0.0002, "epoch": 3.1606175733570447, "step": 19550}, {"loss": 0.625, "grad_norm": 1.004376769065857, "learning_rate": 0.0002, "epoch": 3.16223425753779, "step": 19560}, {"loss": 0.5775, "grad_norm": 1.0490446090698242, "learning_rate": 0.0002, "epoch": 3.163850941718535, "step": 19570}, {"loss": 0.5986, "grad_norm": 0.7387403845787048, "learning_rate": 0.0002, "epoch": 3.1654676258992804, "step": 19580}, {"loss": 0.5898, "grad_norm": 0.7611538171768188, "learning_rate": 0.0002, "epoch": 3.1670843100800257, "step": 19590}, {"loss": 0.5937, "grad_norm": 0.8239886164665222, "learning_rate": 0.0002, "epoch": 3.1687009942607713, "step": 19600}, {"loss": 0.6068, "grad_norm": 0.9327243566513062, "learning_rate": 0.0002, "epoch": 3.1703176784415166, "step": 19610}, {"loss": 0.572, "grad_norm": 0.9662560224533081, "learning_rate": 0.0002, "epoch": 3.171934362622262, "step": 19620}, {"loss": 0.5988, "grad_norm": 0.9183341860771179, "learning_rate": 0.0002, "epoch": 3.173551046803007, "step": 19630}, {"loss": 0.5909, "grad_norm": 0.875066876411438, "learning_rate": 0.0002, "epoch": 3.1751677309837523, "step": 19640}, {"loss": 0.5956, "grad_norm": 0.8567508459091187, "learning_rate": 0.0002, "epoch": 3.1767844151644975, "step": 19650}, {"loss": 0.5805, "grad_norm": 0.6805780529975891, "learning_rate": 0.0002, "epoch": 3.1784010993452427, "step": 19660}, {"loss": 0.6204, "grad_norm": 0.8776944279670715, "learning_rate": 0.0002, "epoch": 3.1800177835259884, "step": 19670}, {"loss": 0.6108, "grad_norm": 0.9036329984664917, "learning_rate": 0.0002, "epoch": 3.1816344677067336, "step": 19680}, {"loss": 0.6238, "grad_norm": 0.8527372479438782, "learning_rate": 0.0002, "epoch": 3.183251151887479, "step": 19690}, {"loss": 0.6089, "grad_norm": 1.1045585870742798, "learning_rate": 0.0002, "epoch": 3.184867836068224, "step": 19700}, {"loss": 0.5491, "grad_norm": 0.9213830828666687, "learning_rate": 0.0002, "epoch": 3.1864845202489693, "step": 19710}, {"loss": 0.618, "grad_norm": 0.8865814805030823, "learning_rate": 0.0002, "epoch": 3.1881012044297146, "step": 19720}, {"loss": 0.5785, "grad_norm": 0.7939388751983643, "learning_rate": 0.0002, "epoch": 3.18971788861046, "step": 19730}, {"loss": 0.5682, "grad_norm": 0.6966729760169983, "learning_rate": 0.0002, "epoch": 3.191334572791205, "step": 19740}, {"loss": 0.5839, "grad_norm": 0.8023673295974731, "learning_rate": 0.0002, "epoch": 3.1929512569719507, "step": 19750}, {"loss": 0.6267, "grad_norm": 0.7992037534713745, "learning_rate": 0.0002, "epoch": 3.194567941152696, "step": 19760}, {"loss": 0.6141, "grad_norm": 0.7412247657775879, "learning_rate": 0.0002, "epoch": 3.196184625333441, "step": 19770}, {"loss": 0.6179, "grad_norm": 0.9598729014396667, "learning_rate": 0.0002, "epoch": 3.1978013095141864, "step": 19780}, {"loss": 0.5685, "grad_norm": 0.8331366777420044, "learning_rate": 0.0002, "epoch": 3.1994179936949316, "step": 19790}, {"loss": 0.6104, "grad_norm": 0.8939169645309448, "learning_rate": 0.0002, "epoch": 3.201034677875677, "step": 19800}, {"loss": 0.6147, "grad_norm": 0.9219734072685242, "learning_rate": 0.0002, "epoch": 3.202651362056422, "step": 19810}, {"loss": 0.6051, "grad_norm": 0.869490385055542, "learning_rate": 0.0002, "epoch": 3.2042680462371678, "step": 19820}, {"loss": 0.5946, "grad_norm": 0.8989706635475159, "learning_rate": 0.0002, "epoch": 3.205884730417913, "step": 19830}, {"loss": 0.5866, "grad_norm": 0.8477165102958679, "learning_rate": 0.0002, "epoch": 3.2075014145986582, "step": 19840}, {"loss": 0.6176, "grad_norm": 0.8720678687095642, "learning_rate": 0.0002, "epoch": 3.2091180987794035, "step": 19850}, {"loss": 0.5694, "grad_norm": 0.861406683921814, "learning_rate": 0.0002, "epoch": 3.2107347829601487, "step": 19860}, {"loss": 0.6264, "grad_norm": 0.8228686451911926, "learning_rate": 0.0002, "epoch": 3.212351467140894, "step": 19870}, {"loss": 0.625, "grad_norm": 0.7936596870422363, "learning_rate": 0.0002, "epoch": 3.213968151321639, "step": 19880}, {"loss": 0.5698, "grad_norm": 1.097377896308899, "learning_rate": 0.0002, "epoch": 3.2155848355023844, "step": 19890}, {"loss": 0.6725, "grad_norm": 0.9544782638549805, "learning_rate": 0.0002, "epoch": 3.21720151968313, "step": 19900}, {"loss": 0.6022, "grad_norm": 0.8240751624107361, "learning_rate": 0.0002, "epoch": 3.2188182038638753, "step": 19910}, {"loss": 0.5659, "grad_norm": 0.8332096338272095, "learning_rate": 0.0002, "epoch": 3.2204348880446205, "step": 19920}, {"loss": 0.6274, "grad_norm": 1.0954567193984985, "learning_rate": 0.0002, "epoch": 3.2220515722253658, "step": 19930}, {"loss": 0.652, "grad_norm": 0.7790525555610657, "learning_rate": 0.0002, "epoch": 3.223668256406111, "step": 19940}, {"loss": 0.5986, "grad_norm": 0.7966814041137695, "learning_rate": 0.0002, "epoch": 3.225284940586856, "step": 19950}, {"loss": 0.5911, "grad_norm": 0.9751881957054138, "learning_rate": 0.0002, "epoch": 3.2269016247676015, "step": 19960}, {"loss": 0.6071, "grad_norm": 0.9856047630310059, "learning_rate": 0.0002, "epoch": 3.228518308948347, "step": 19970}, {"loss": 0.5837, "grad_norm": 1.3062353134155273, "learning_rate": 0.0002, "epoch": 3.2301349931290924, "step": 19980}, {"loss": 0.6588, "grad_norm": 0.9510692358016968, "learning_rate": 0.0002, "epoch": 3.2317516773098376, "step": 19990}, {"loss": 0.6264, "grad_norm": 0.8630342483520508, "learning_rate": 0.0002, "epoch": 3.233368361490583, "step": 20000}, {"loss": 0.6073, "grad_norm": 0.8966519236564636, "learning_rate": 0.0002, "epoch": 3.234985045671328, "step": 20010}, {"loss": 0.612, "grad_norm": 0.7093510627746582, "learning_rate": 0.0002, "epoch": 3.2366017298520733, "step": 20020}, {"loss": 0.585, "grad_norm": 0.7771096229553223, "learning_rate": 0.0002, "epoch": 3.2382184140328185, "step": 20030}, {"loss": 0.5821, "grad_norm": 0.841058075428009, "learning_rate": 0.0002, "epoch": 3.2398350982135637, "step": 20040}, {"loss": 0.6519, "grad_norm": 0.909712553024292, "learning_rate": 0.0002, "epoch": 3.2414517823943094, "step": 20050}, {"loss": 0.6089, "grad_norm": 0.8321019411087036, "learning_rate": 0.0002, "epoch": 3.2430684665750547, "step": 20060}, {"loss": 0.6115, "grad_norm": 0.779901921749115, "learning_rate": 0.0002, "epoch": 3.2446851507558, "step": 20070}, {"loss": 0.6107, "grad_norm": 0.6249170303344727, "learning_rate": 0.0002, "epoch": 3.246301834936545, "step": 20080}, {"loss": 0.603, "grad_norm": 0.8000940680503845, "learning_rate": 0.0002, "epoch": 3.2479185191172903, "step": 20090}, {"loss": 0.6273, "grad_norm": 0.7627735137939453, "learning_rate": 0.0002, "epoch": 3.2495352032980356, "step": 20100}, {"loss": 0.6223, "grad_norm": 0.8780747056007385, "learning_rate": 0.0002, "epoch": 3.2511518874787813, "step": 20110}, {"loss": 0.5969, "grad_norm": 0.772037148475647, "learning_rate": 0.0002, "epoch": 3.2527685716595265, "step": 20120}, {"loss": 0.5843, "grad_norm": 1.0086580514907837, "learning_rate": 0.0002, "epoch": 3.2543852558402717, "step": 20130}, {"loss": 0.5777, "grad_norm": 0.9360289573669434, "learning_rate": 0.0002, "epoch": 3.256001940021017, "step": 20140}, {"loss": 0.5777, "grad_norm": 1.2099586725234985, "learning_rate": 0.0002, "epoch": 3.257618624201762, "step": 20150}, {"loss": 0.624, "grad_norm": 0.8368481397628784, "learning_rate": 0.0002, "epoch": 3.2592353083825074, "step": 20160}, {"loss": 0.5626, "grad_norm": 0.7391039133071899, "learning_rate": 0.0002, "epoch": 3.2608519925632526, "step": 20170}, {"loss": 0.6041, "grad_norm": 0.9122273325920105, "learning_rate": 0.0002, "epoch": 3.262468676743998, "step": 20180}, {"loss": 0.5868, "grad_norm": 0.8502281904220581, "learning_rate": 0.0002, "epoch": 3.264085360924743, "step": 20190}, {"loss": 0.5841, "grad_norm": 1.0926852226257324, "learning_rate": 0.0002, "epoch": 3.265702045105489, "step": 20200}, {"loss": 0.6027, "grad_norm": 0.7902828454971313, "learning_rate": 0.0002, "epoch": 3.267318729286234, "step": 20210}, {"loss": 0.6089, "grad_norm": 0.8724729418754578, "learning_rate": 0.0002, "epoch": 3.2689354134669792, "step": 20220}, {"loss": 0.6242, "grad_norm": 0.8469277024269104, "learning_rate": 0.0002, "epoch": 3.2705520976477245, "step": 20230}, {"loss": 0.644, "grad_norm": 0.8865092992782593, "learning_rate": 0.0002, "epoch": 3.2721687818284697, "step": 20240}, {"loss": 0.6464, "grad_norm": 1.0979334115982056, "learning_rate": 0.0002, "epoch": 3.273785466009215, "step": 20250}, {"loss": 0.647, "grad_norm": 1.0860793590545654, "learning_rate": 0.0002, "epoch": 3.2754021501899606, "step": 20260}, {"loss": 0.6105, "grad_norm": 0.981745183467865, "learning_rate": 0.0002, "epoch": 3.277018834370706, "step": 20270}, {"loss": 0.627, "grad_norm": 0.9155020713806152, "learning_rate": 0.0002, "epoch": 3.278635518551451, "step": 20280}, {"loss": 0.5899, "grad_norm": 0.8436718583106995, "learning_rate": 0.0002, "epoch": 3.2802522027321963, "step": 20290}, {"loss": 0.6371, "grad_norm": 1.0329409837722778, "learning_rate": 0.0002, "epoch": 3.2818688869129415, "step": 20300}, {"loss": 0.6, "grad_norm": 0.9876394271850586, "learning_rate": 0.0002, "epoch": 3.2834855710936868, "step": 20310}, {"loss": 0.5463, "grad_norm": 0.8052917718887329, "learning_rate": 0.0002, "epoch": 3.285102255274432, "step": 20320}, {"loss": 0.5949, "grad_norm": 0.8390680551528931, "learning_rate": 0.0002, "epoch": 3.2867189394551772, "step": 20330}, {"loss": 0.6492, "grad_norm": 0.9515735507011414, "learning_rate": 0.0002, "epoch": 3.288335623635923, "step": 20340}, {"loss": 0.596, "grad_norm": 0.8028870224952698, "learning_rate": 0.0002, "epoch": 3.289952307816668, "step": 20350}, {"loss": 0.634, "grad_norm": 0.862592339515686, "learning_rate": 0.0002, "epoch": 3.2915689919974134, "step": 20360}, {"loss": 0.6345, "grad_norm": 0.7451621890068054, "learning_rate": 0.0002, "epoch": 3.2931856761781586, "step": 20370}, {"loss": 0.6458, "grad_norm": 0.8966776728630066, "learning_rate": 0.0002, "epoch": 3.294802360358904, "step": 20380}, {"loss": 0.5967, "grad_norm": 0.9289216995239258, "learning_rate": 0.0002, "epoch": 3.296419044539649, "step": 20390}, {"loss": 0.6599, "grad_norm": 0.9649626612663269, "learning_rate": 0.0002, "epoch": 3.2980357287203943, "step": 20400}, {"loss": 0.5781, "grad_norm": 1.1953798532485962, "learning_rate": 0.0002, "epoch": 3.29965241290114, "step": 20410}, {"loss": 0.5997, "grad_norm": 0.8929083943367004, "learning_rate": 0.0002, "epoch": 3.301269097081885, "step": 20420}, {"loss": 0.597, "grad_norm": 0.8922014236450195, "learning_rate": 0.0002, "epoch": 3.3028857812626304, "step": 20430}, {"loss": 0.5766, "grad_norm": 0.9754860401153564, "learning_rate": 0.0002, "epoch": 3.3045024654433757, "step": 20440}, {"loss": 0.5653, "grad_norm": 0.8873140215873718, "learning_rate": 0.0002, "epoch": 3.306119149624121, "step": 20450}, {"loss": 0.6138, "grad_norm": 0.857271671295166, "learning_rate": 0.0002, "epoch": 3.307735833804866, "step": 20460}, {"loss": 0.633, "grad_norm": 0.9022141098976135, "learning_rate": 0.0002, "epoch": 3.3093525179856114, "step": 20470}, {"loss": 0.6654, "grad_norm": 0.8614798188209534, "learning_rate": 0.0002, "epoch": 3.3109692021663566, "step": 20480}, {"loss": 0.6254, "grad_norm": 0.8838164210319519, "learning_rate": 0.0002, "epoch": 3.3125858863471023, "step": 20490}, {"loss": 0.5849, "grad_norm": 0.8709736466407776, "learning_rate": 0.0002, "epoch": 3.3142025705278475, "step": 20500}, {"loss": 0.6146, "grad_norm": 0.9533300995826721, "learning_rate": 0.0002, "epoch": 3.3158192547085927, "step": 20510}, {"loss": 0.6029, "grad_norm": 0.8259269595146179, "learning_rate": 0.0002, "epoch": 3.317435938889338, "step": 20520}, {"loss": 0.6268, "grad_norm": 0.8607608079910278, "learning_rate": 0.0002, "epoch": 3.319052623070083, "step": 20530}, {"loss": 0.5676, "grad_norm": 1.0863020420074463, "learning_rate": 0.0002, "epoch": 3.3206693072508284, "step": 20540}, {"loss": 0.6412, "grad_norm": 1.011489987373352, "learning_rate": 0.0002, "epoch": 3.3222859914315737, "step": 20550}, {"loss": 0.6247, "grad_norm": 0.6952177882194519, "learning_rate": 0.0002, "epoch": 3.3239026756123193, "step": 20560}, {"loss": 0.6229, "grad_norm": 0.9638974070549011, "learning_rate": 0.0002, "epoch": 3.3255193597930646, "step": 20570}, {"loss": 0.5882, "grad_norm": 1.0310138463974, "learning_rate": 0.0002, "epoch": 3.32713604397381, "step": 20580}, {"loss": 0.594, "grad_norm": 0.9371318221092224, "learning_rate": 0.0002, "epoch": 3.328752728154555, "step": 20590}, {"loss": 0.6137, "grad_norm": 0.8756691813468933, "learning_rate": 0.0002, "epoch": 3.3303694123353003, "step": 20600}, {"loss": 0.5994, "grad_norm": 1.054175853729248, "learning_rate": 0.0002, "epoch": 3.3319860965160455, "step": 20610}, {"loss": 0.6169, "grad_norm": 0.9074128270149231, "learning_rate": 0.0002, "epoch": 3.3336027806967907, "step": 20620}, {"loss": 0.6138, "grad_norm": 0.906900942325592, "learning_rate": 0.0002, "epoch": 3.335219464877536, "step": 20630}, {"loss": 0.571, "grad_norm": 0.8689333200454712, "learning_rate": 0.0002, "epoch": 3.3368361490582816, "step": 20640}, {"loss": 0.6079, "grad_norm": 0.9889747500419617, "learning_rate": 0.0002, "epoch": 3.338452833239027, "step": 20650}, {"loss": 0.6073, "grad_norm": 1.0685805082321167, "learning_rate": 0.0002, "epoch": 3.340069517419772, "step": 20660}, {"loss": 0.6091, "grad_norm": 0.7495010495185852, "learning_rate": 0.0002, "epoch": 3.3416862016005173, "step": 20670}, {"loss": 0.5883, "grad_norm": 0.8747848272323608, "learning_rate": 0.0002, "epoch": 3.3433028857812626, "step": 20680}, {"loss": 0.604, "grad_norm": 0.9762673377990723, "learning_rate": 0.0002, "epoch": 3.344919569962008, "step": 20690}, {"loss": 0.6784, "grad_norm": 1.0284489393234253, "learning_rate": 0.0002, "epoch": 3.346536254142753, "step": 20700}, {"loss": 0.6464, "grad_norm": 0.7293812036514282, "learning_rate": 0.0002, "epoch": 3.3481529383234987, "step": 20710}, {"loss": 0.609, "grad_norm": 0.8330199122428894, "learning_rate": 0.0002, "epoch": 3.349769622504244, "step": 20720}, {"loss": 0.5729, "grad_norm": 0.9808499217033386, "learning_rate": 0.0002, "epoch": 3.351386306684989, "step": 20730}, {"loss": 0.6315, "grad_norm": 0.9508825540542603, "learning_rate": 0.0002, "epoch": 3.3530029908657344, "step": 20740}, {"loss": 0.5965, "grad_norm": 0.790483832359314, "learning_rate": 0.0002, "epoch": 3.3546196750464796, "step": 20750}, {"loss": 0.6327, "grad_norm": 1.022793173789978, "learning_rate": 0.0002, "epoch": 3.356236359227225, "step": 20760}, {"loss": 0.6439, "grad_norm": 0.8318950533866882, "learning_rate": 0.0002, "epoch": 3.35785304340797, "step": 20770}, {"loss": 0.6037, "grad_norm": 0.7980858087539673, "learning_rate": 0.0002, "epoch": 3.3594697275887153, "step": 20780}, {"loss": 0.6746, "grad_norm": 0.8114802241325378, "learning_rate": 0.0002, "epoch": 3.361086411769461, "step": 20790}, {"loss": 0.6017, "grad_norm": 0.8522519469261169, "learning_rate": 0.0002, "epoch": 3.3627030959502062, "step": 20800}, {"loss": 0.5864, "grad_norm": 0.9142431616783142, "learning_rate": 0.0002, "epoch": 3.3643197801309515, "step": 20810}, {"loss": 0.6331, "grad_norm": 0.771170437335968, "learning_rate": 0.0002, "epoch": 3.3659364643116967, "step": 20820}, {"loss": 0.5879, "grad_norm": 1.0628231763839722, "learning_rate": 0.0002, "epoch": 3.367553148492442, "step": 20830}, {"loss": 0.6533, "grad_norm": 0.9384352564811707, "learning_rate": 0.0002, "epoch": 3.369169832673187, "step": 20840}, {"loss": 0.6292, "grad_norm": 1.1286591291427612, "learning_rate": 0.0002, "epoch": 3.370786516853933, "step": 20850}, {"loss": 0.5986, "grad_norm": 1.1349513530731201, "learning_rate": 0.0002, "epoch": 3.372403201034678, "step": 20860}, {"loss": 0.6413, "grad_norm": 1.0127464532852173, "learning_rate": 0.0002, "epoch": 3.3740198852154233, "step": 20870}, {"loss": 0.6414, "grad_norm": 0.9111971855163574, "learning_rate": 0.0002, "epoch": 3.3756365693961685, "step": 20880}, {"loss": 0.6101, "grad_norm": 0.871356725692749, "learning_rate": 0.0002, "epoch": 3.3772532535769137, "step": 20890}, {"loss": 0.5995, "grad_norm": 0.7774117588996887, "learning_rate": 0.0002, "epoch": 3.378869937757659, "step": 20900}, {"loss": 0.6062, "grad_norm": 1.0089964866638184, "learning_rate": 0.0002, "epoch": 3.380486621938404, "step": 20910}, {"loss": 0.5908, "grad_norm": 0.7855867147445679, "learning_rate": 0.0002, "epoch": 3.3821033061191494, "step": 20920}, {"loss": 0.6373, "grad_norm": 1.3713710308074951, "learning_rate": 0.0002, "epoch": 3.3837199902998947, "step": 20930}, {"loss": 0.6627, "grad_norm": 0.8599116206169128, "learning_rate": 0.0002, "epoch": 3.3853366744806404, "step": 20940}, {"loss": 0.6224, "grad_norm": 0.9392673373222351, "learning_rate": 0.0002, "epoch": 3.3869533586613856, "step": 20950}, {"loss": 0.5855, "grad_norm": 0.8764075040817261, "learning_rate": 0.0002, "epoch": 3.388570042842131, "step": 20960}, {"loss": 0.5734, "grad_norm": 0.8240136504173279, "learning_rate": 0.0002, "epoch": 3.390186727022876, "step": 20970}, {"loss": 0.5783, "grad_norm": 1.0982369184494019, "learning_rate": 0.0002, "epoch": 3.3918034112036213, "step": 20980}, {"loss": 0.5451, "grad_norm": 1.0599013566970825, "learning_rate": 0.0002, "epoch": 3.3934200953843665, "step": 20990}, {"loss": 0.6356, "grad_norm": 0.895438015460968, "learning_rate": 0.0002, "epoch": 3.395036779565112, "step": 21000}, {"loss": 0.6065, "grad_norm": 0.6974841356277466, "learning_rate": 0.0002, "epoch": 3.3966534637458574, "step": 21010}, {"loss": 0.5704, "grad_norm": 0.9571719765663147, "learning_rate": 0.0002, "epoch": 3.3982701479266026, "step": 21020}, {"loss": 0.679, "grad_norm": 0.831912636756897, "learning_rate": 0.0002, "epoch": 3.399886832107348, "step": 21030}, {"loss": 0.6051, "grad_norm": 0.831936240196228, "learning_rate": 0.0002, "epoch": 3.401503516288093, "step": 21040}, {"loss": 0.5857, "grad_norm": 0.7388373613357544, "learning_rate": 0.0002, "epoch": 3.4031202004688383, "step": 21050}, {"loss": 0.6245, "grad_norm": 0.938667356967926, "learning_rate": 0.0002, "epoch": 3.4047368846495836, "step": 21060}, {"loss": 0.6121, "grad_norm": 0.9202313423156738, "learning_rate": 0.0002, "epoch": 3.406353568830329, "step": 21070}, {"loss": 0.6388, "grad_norm": 0.9888381958007812, "learning_rate": 0.0002, "epoch": 3.4079702530110745, "step": 21080}, {"loss": 0.6245, "grad_norm": 0.8526970744132996, "learning_rate": 0.0002, "epoch": 3.4095869371918197, "step": 21090}, {"loss": 0.5914, "grad_norm": 0.7939383387565613, "learning_rate": 0.0002, "epoch": 3.411203621372565, "step": 21100}, {"loss": 0.6066, "grad_norm": 0.9986352920532227, "learning_rate": 0.0002, "epoch": 3.41282030555331, "step": 21110}, {"loss": 0.5947, "grad_norm": 0.8895300030708313, "learning_rate": 0.0002, "epoch": 3.4144369897340554, "step": 21120}, {"loss": 0.6264, "grad_norm": 0.9559482932090759, "learning_rate": 0.0002, "epoch": 3.4160536739148006, "step": 21130}, {"loss": 0.6491, "grad_norm": 0.8351506590843201, "learning_rate": 0.0002, "epoch": 3.417670358095546, "step": 21140}, {"loss": 0.567, "grad_norm": 0.8224456906318665, "learning_rate": 0.0002, "epoch": 3.4192870422762915, "step": 21150}, {"loss": 0.5871, "grad_norm": 1.0110299587249756, "learning_rate": 0.0002, "epoch": 3.4209037264570368, "step": 21160}, {"loss": 0.6116, "grad_norm": 0.82564777135849, "learning_rate": 0.0002, "epoch": 3.422520410637782, "step": 21170}, {"loss": 0.595, "grad_norm": 1.004738688468933, "learning_rate": 0.0002, "epoch": 3.4241370948185272, "step": 21180}, {"loss": 0.6286, "grad_norm": 0.7545676827430725, "learning_rate": 0.0002, "epoch": 3.4257537789992725, "step": 21190}, {"loss": 0.5868, "grad_norm": 0.8918704390525818, "learning_rate": 0.0002, "epoch": 3.4273704631800177, "step": 21200}, {"loss": 0.6542, "grad_norm": 0.8336876034736633, "learning_rate": 0.0002, "epoch": 3.428987147360763, "step": 21210}, {"loss": 0.5824, "grad_norm": 0.8928771018981934, "learning_rate": 0.0002, "epoch": 3.430603831541508, "step": 21220}, {"loss": 0.6468, "grad_norm": 0.7663705945014954, "learning_rate": 0.0002, "epoch": 3.432220515722254, "step": 21230}, {"loss": 0.6693, "grad_norm": 0.8392598628997803, "learning_rate": 0.0002, "epoch": 3.433837199902999, "step": 21240}, {"loss": 0.5971, "grad_norm": 0.8819600343704224, "learning_rate": 0.0002, "epoch": 3.4354538840837443, "step": 21250}, {"loss": 0.6791, "grad_norm": 0.9124642014503479, "learning_rate": 0.0002, "epoch": 3.4370705682644895, "step": 21260}, {"loss": 0.5925, "grad_norm": 0.8329763412475586, "learning_rate": 0.0002, "epoch": 3.4386872524452348, "step": 21270}, {"loss": 0.6541, "grad_norm": 0.9982839822769165, "learning_rate": 0.0002, "epoch": 3.44030393662598, "step": 21280}, {"loss": 0.6441, "grad_norm": 0.9105954766273499, "learning_rate": 0.0002, "epoch": 3.4419206208067252, "step": 21290}, {"loss": 0.6028, "grad_norm": 0.8182359337806702, "learning_rate": 0.0002, "epoch": 3.443537304987471, "step": 21300}, {"loss": 0.5991, "grad_norm": 1.0568904876708984, "learning_rate": 0.0002, "epoch": 3.445153989168216, "step": 21310}, {"loss": 0.6117, "grad_norm": 0.968539834022522, "learning_rate": 0.0002, "epoch": 3.4467706733489614, "step": 21320}, {"loss": 0.6219, "grad_norm": 0.8774511218070984, "learning_rate": 0.0002, "epoch": 3.4483873575297066, "step": 21330}, {"loss": 0.6438, "grad_norm": 0.7598156332969666, "learning_rate": 0.0002, "epoch": 3.450004041710452, "step": 21340}, {"loss": 0.6033, "grad_norm": 1.1012897491455078, "learning_rate": 0.0002, "epoch": 3.451620725891197, "step": 21350}, {"loss": 0.6137, "grad_norm": 0.8040637373924255, "learning_rate": 0.0002, "epoch": 3.4532374100719423, "step": 21360}, {"loss": 0.6173, "grad_norm": 0.8497496247291565, "learning_rate": 0.0002, "epoch": 3.4548540942526875, "step": 21370}, {"loss": 0.6005, "grad_norm": 0.8429915904998779, "learning_rate": 0.0002, "epoch": 3.456470778433433, "step": 21380}, {"loss": 0.6182, "grad_norm": 0.8107112646102905, "learning_rate": 0.0002, "epoch": 3.4580874626141784, "step": 21390}, {"loss": 0.6109, "grad_norm": 1.00872004032135, "learning_rate": 0.0002, "epoch": 3.4597041467949237, "step": 21400}, {"loss": 0.5712, "grad_norm": 0.8266542553901672, "learning_rate": 0.0002, "epoch": 3.461320830975669, "step": 21410}, {"loss": 0.6457, "grad_norm": 0.8972568511962891, "learning_rate": 0.0002, "epoch": 3.462937515156414, "step": 21420}, {"loss": 0.6081, "grad_norm": 1.0781476497650146, "learning_rate": 0.0002, "epoch": 3.4645541993371594, "step": 21430}, {"loss": 0.6303, "grad_norm": 0.9571592807769775, "learning_rate": 0.0002, "epoch": 3.4661708835179046, "step": 21440}, {"loss": 0.6309, "grad_norm": 0.881547212600708, "learning_rate": 0.0002, "epoch": 3.4677875676986503, "step": 21450}, {"loss": 0.6076, "grad_norm": 0.6955338716506958, "learning_rate": 0.0002, "epoch": 3.4694042518793955, "step": 21460}, {"loss": 0.6205, "grad_norm": 0.901187539100647, "learning_rate": 0.0002, "epoch": 3.4710209360601407, "step": 21470}, {"loss": 0.639, "grad_norm": 0.7063511610031128, "learning_rate": 0.0002, "epoch": 3.472637620240886, "step": 21480}, {"loss": 0.6154, "grad_norm": 0.8462792038917542, "learning_rate": 0.0002, "epoch": 3.474254304421631, "step": 21490}, {"loss": 0.61, "grad_norm": 1.1861060857772827, "learning_rate": 0.0002, "epoch": 3.4758709886023764, "step": 21500}, {"loss": 0.6586, "grad_norm": 0.70503169298172, "learning_rate": 0.0002, "epoch": 3.4774876727831217, "step": 21510}, {"loss": 0.6475, "grad_norm": 0.9650066494941711, "learning_rate": 0.0002, "epoch": 3.479104356963867, "step": 21520}, {"loss": 0.6452, "grad_norm": 1.0266852378845215, "learning_rate": 0.0002, "epoch": 3.4807210411446126, "step": 21530}, {"loss": 0.6553, "grad_norm": 0.956372857093811, "learning_rate": 0.0002, "epoch": 3.482337725325358, "step": 21540}, {"loss": 0.6667, "grad_norm": 0.8848432898521423, "learning_rate": 0.0002, "epoch": 3.483954409506103, "step": 21550}, {"loss": 0.6375, "grad_norm": 1.0805351734161377, "learning_rate": 0.0002, "epoch": 3.4855710936868483, "step": 21560}, {"loss": 0.6958, "grad_norm": 0.9279725551605225, "learning_rate": 0.0002, "epoch": 3.4871877778675935, "step": 21570}, {"loss": 0.6354, "grad_norm": 0.9049562215805054, "learning_rate": 0.0002, "epoch": 3.4888044620483387, "step": 21580}, {"loss": 0.6071, "grad_norm": 0.9619429111480713, "learning_rate": 0.0002, "epoch": 3.4904211462290844, "step": 21590}, {"loss": 0.5927, "grad_norm": 0.8508906960487366, "learning_rate": 0.0002, "epoch": 3.4920378304098296, "step": 21600}, {"loss": 0.6115, "grad_norm": 0.8692502379417419, "learning_rate": 0.0002, "epoch": 3.493654514590575, "step": 21610}, {"loss": 0.5878, "grad_norm": 0.8187332153320312, "learning_rate": 0.0002, "epoch": 3.49527119877132, "step": 21620}, {"loss": 0.5874, "grad_norm": 1.145400047302246, "learning_rate": 0.0002, "epoch": 3.4968878829520653, "step": 21630}, {"loss": 0.6313, "grad_norm": 0.8281388282775879, "learning_rate": 0.0002, "epoch": 3.4985045671328105, "step": 21640}, {"loss": 0.6624, "grad_norm": 0.82256019115448, "learning_rate": 0.0002, "epoch": 3.500121251313556, "step": 21650}, {"loss": 0.6346, "grad_norm": 0.9315484762191772, "learning_rate": 0.0002, "epoch": 3.501737935494301, "step": 21660}, {"loss": 0.6086, "grad_norm": 0.7626111507415771, "learning_rate": 0.0002, "epoch": 3.5033546196750462, "step": 21670}, {"loss": 0.6177, "grad_norm": 0.9275059103965759, "learning_rate": 0.0002, "epoch": 3.504971303855792, "step": 21680}, {"loss": 0.64, "grad_norm": 0.7906724810600281, "learning_rate": 0.0002, "epoch": 3.506587988036537, "step": 21690}, {"loss": 0.6015, "grad_norm": 0.8289761543273926, "learning_rate": 0.0002, "epoch": 3.5082046722172824, "step": 21700}, {"loss": 0.6246, "grad_norm": 0.8316431045532227, "learning_rate": 0.0002, "epoch": 3.5098213563980276, "step": 21710}, {"loss": 0.619, "grad_norm": 1.0451812744140625, "learning_rate": 0.0002, "epoch": 3.511438040578773, "step": 21720}, {"loss": 0.632, "grad_norm": 0.928252637386322, "learning_rate": 0.0002, "epoch": 3.513054724759518, "step": 21730}, {"loss": 0.6062, "grad_norm": 0.7985895276069641, "learning_rate": 0.0002, "epoch": 3.5146714089402638, "step": 21740}, {"loss": 0.6463, "grad_norm": 0.6740974187850952, "learning_rate": 0.0002, "epoch": 3.516288093121009, "step": 21750}, {"loss": 0.6138, "grad_norm": 0.8482223749160767, "learning_rate": 0.0002, "epoch": 3.517904777301754, "step": 21760}, {"loss": 0.6277, "grad_norm": 0.889947772026062, "learning_rate": 0.0002, "epoch": 3.5195214614824994, "step": 21770}, {"loss": 0.6174, "grad_norm": 0.8304598927497864, "learning_rate": 0.0002, "epoch": 3.5211381456632447, "step": 21780}, {"loss": 0.6156, "grad_norm": 0.8002981543540955, "learning_rate": 0.0002, "epoch": 3.52275482984399, "step": 21790}, {"loss": 0.5896, "grad_norm": 0.8115083575248718, "learning_rate": 0.0002, "epoch": 3.524371514024735, "step": 21800}, {"loss": 0.6041, "grad_norm": 0.9715048670768738, "learning_rate": 0.0002, "epoch": 3.5259881982054804, "step": 21810}, {"loss": 0.6715, "grad_norm": 1.0910786390304565, "learning_rate": 0.0002, "epoch": 3.5276048823862256, "step": 21820}, {"loss": 0.6543, "grad_norm": 0.8438942432403564, "learning_rate": 0.0002, "epoch": 3.5292215665669713, "step": 21830}, {"loss": 0.6509, "grad_norm": 0.8813382983207703, "learning_rate": 0.0002, "epoch": 3.5308382507477165, "step": 21840}, {"loss": 0.6049, "grad_norm": 0.7092908024787903, "learning_rate": 0.0002, "epoch": 3.5324549349284617, "step": 21850}, {"loss": 0.5678, "grad_norm": 0.8332187533378601, "learning_rate": 0.0002, "epoch": 3.534071619109207, "step": 21860}, {"loss": 0.5896, "grad_norm": 0.8958209156990051, "learning_rate": 0.0002, "epoch": 3.535688303289952, "step": 21870}, {"loss": 0.6476, "grad_norm": 0.824138879776001, "learning_rate": 0.0002, "epoch": 3.5373049874706974, "step": 21880}, {"loss": 0.6022, "grad_norm": 0.8375158309936523, "learning_rate": 0.0002, "epoch": 3.538921671651443, "step": 21890}, {"loss": 0.6019, "grad_norm": 1.0274608135223389, "learning_rate": 0.0002, "epoch": 3.5405383558321883, "step": 21900}, {"loss": 0.6194, "grad_norm": 0.7088932394981384, "learning_rate": 0.0002, "epoch": 3.5421550400129336, "step": 21910}, {"loss": 0.6554, "grad_norm": 0.8172445297241211, "learning_rate": 0.0002, "epoch": 3.543771724193679, "step": 21920}, {"loss": 0.6711, "grad_norm": 0.9904135465621948, "learning_rate": 0.0002, "epoch": 3.545388408374424, "step": 21930}, {"loss": 0.6001, "grad_norm": 0.9900432229042053, "learning_rate": 0.0002, "epoch": 3.5470050925551693, "step": 21940}, {"loss": 0.6195, "grad_norm": 0.8963301181793213, "learning_rate": 0.0002, "epoch": 3.5486217767359145, "step": 21950}, {"loss": 0.5972, "grad_norm": 0.8551464676856995, "learning_rate": 0.0002, "epoch": 3.5502384609166597, "step": 21960}, {"loss": 0.6206, "grad_norm": 1.0916603803634644, "learning_rate": 0.0002, "epoch": 3.551855145097405, "step": 21970}, {"loss": 0.6523, "grad_norm": 0.841598391532898, "learning_rate": 0.0002, "epoch": 3.5534718292781506, "step": 21980}, {"loss": 0.617, "grad_norm": 0.8566757440567017, "learning_rate": 0.0002, "epoch": 3.555088513458896, "step": 21990}, {"loss": 0.6192, "grad_norm": 1.0145052671432495, "learning_rate": 0.0002, "epoch": 3.556705197639641, "step": 22000}, {"loss": 0.6173, "grad_norm": 0.9293754696846008, "learning_rate": 0.0002, "epoch": 3.5583218818203863, "step": 22010}, {"loss": 0.612, "grad_norm": 0.9568536281585693, "learning_rate": 0.0002, "epoch": 3.5599385660011316, "step": 22020}, {"loss": 0.641, "grad_norm": 0.8613139986991882, "learning_rate": 0.0002, "epoch": 3.5615552501818772, "step": 22030}, {"loss": 0.6496, "grad_norm": 0.8179237246513367, "learning_rate": 0.0002, "epoch": 3.5631719343626225, "step": 22040}, {"loss": 0.574, "grad_norm": 0.9059830904006958, "learning_rate": 0.0002, "epoch": 3.5647886185433677, "step": 22050}, {"loss": 0.6448, "grad_norm": 1.0068252086639404, "learning_rate": 0.0002, "epoch": 3.566405302724113, "step": 22060}, {"loss": 0.6239, "grad_norm": 0.9682072997093201, "learning_rate": 0.0002, "epoch": 3.568021986904858, "step": 22070}, {"loss": 0.6808, "grad_norm": 0.8514005541801453, "learning_rate": 0.0002, "epoch": 3.5696386710856034, "step": 22080}, {"loss": 0.5956, "grad_norm": 0.8327770829200745, "learning_rate": 0.0002, "epoch": 3.5712553552663486, "step": 22090}, {"loss": 0.5976, "grad_norm": 1.024976372718811, "learning_rate": 0.0002, "epoch": 3.572872039447094, "step": 22100}, {"loss": 0.624, "grad_norm": 0.7721174955368042, "learning_rate": 0.0002, "epoch": 3.574488723627839, "step": 22110}, {"loss": 0.5896, "grad_norm": 1.0351054668426514, "learning_rate": 0.0002, "epoch": 3.5761054078085843, "step": 22120}, {"loss": 0.6379, "grad_norm": 0.9680907130241394, "learning_rate": 0.0002, "epoch": 3.57772209198933, "step": 22130}, {"loss": 0.6194, "grad_norm": 0.8016974925994873, "learning_rate": 0.0002, "epoch": 3.5793387761700752, "step": 22140}, {"loss": 0.6387, "grad_norm": 1.0109003782272339, "learning_rate": 0.0002, "epoch": 3.5809554603508205, "step": 22150}, {"loss": 0.6368, "grad_norm": 1.0473392009735107, "learning_rate": 0.0002, "epoch": 3.5825721445315657, "step": 22160}, {"loss": 0.6353, "grad_norm": 0.8686613440513611, "learning_rate": 0.0002, "epoch": 3.584188828712311, "step": 22170}, {"loss": 0.5791, "grad_norm": 0.869149923324585, "learning_rate": 0.0002, "epoch": 3.5858055128930566, "step": 22180}, {"loss": 0.5895, "grad_norm": 0.9769062995910645, "learning_rate": 0.0002, "epoch": 3.587422197073802, "step": 22190}, {"loss": 0.5939, "grad_norm": 0.779636561870575, "learning_rate": 0.0002, "epoch": 3.589038881254547, "step": 22200}, {"loss": 0.5875, "grad_norm": 0.9063841104507446, "learning_rate": 0.0002, "epoch": 3.5906555654352923, "step": 22210}, {"loss": 0.5671, "grad_norm": 0.9216037392616272, "learning_rate": 0.0002, "epoch": 3.5922722496160375, "step": 22220}, {"loss": 0.6484, "grad_norm": 1.0217336416244507, "learning_rate": 0.0002, "epoch": 3.5938889337967828, "step": 22230}, {"loss": 0.6511, "grad_norm": 0.8513161540031433, "learning_rate": 0.0002, "epoch": 3.595505617977528, "step": 22240}, {"loss": 0.6301, "grad_norm": 0.8084813952445984, "learning_rate": 0.0002, "epoch": 3.597122302158273, "step": 22250}, {"loss": 0.6197, "grad_norm": 0.8524802923202515, "learning_rate": 0.0002, "epoch": 3.5987389863390185, "step": 22260}, {"loss": 0.5599, "grad_norm": 0.9356237649917603, "learning_rate": 0.0002, "epoch": 3.600355670519764, "step": 22270}, {"loss": 0.628, "grad_norm": 1.009600281715393, "learning_rate": 0.0002, "epoch": 3.6019723547005094, "step": 22280}, {"loss": 0.6179, "grad_norm": 0.9900581240653992, "learning_rate": 0.0002, "epoch": 3.6035890388812546, "step": 22290}, {"loss": 0.5725, "grad_norm": 1.062495231628418, "learning_rate": 0.0002, "epoch": 3.605205723062, "step": 22300}, {"loss": 0.607, "grad_norm": 0.8832381367683411, "learning_rate": 0.0002, "epoch": 3.606822407242745, "step": 22310}, {"loss": 0.6215, "grad_norm": 0.9284297823905945, "learning_rate": 0.0002, "epoch": 3.6084390914234903, "step": 22320}, {"loss": 0.685, "grad_norm": 1.2381829023361206, "learning_rate": 0.0002, "epoch": 3.610055775604236, "step": 22330}, {"loss": 0.6181, "grad_norm": 0.929434597492218, "learning_rate": 0.0002, "epoch": 3.611672459784981, "step": 22340}, {"loss": 0.6141, "grad_norm": 0.9714490175247192, "learning_rate": 0.0002, "epoch": 3.6132891439657264, "step": 22350}, {"loss": 0.6861, "grad_norm": 0.808014988899231, "learning_rate": 0.0002, "epoch": 3.6149058281464717, "step": 22360}, {"loss": 0.6428, "grad_norm": 1.0364398956298828, "learning_rate": 0.0002, "epoch": 3.616522512327217, "step": 22370}, {"loss": 0.6337, "grad_norm": 0.7858489751815796, "learning_rate": 0.0002, "epoch": 3.618139196507962, "step": 22380}, {"loss": 0.6214, "grad_norm": 0.9920870065689087, "learning_rate": 0.0002, "epoch": 3.6197558806887074, "step": 22390}, {"loss": 0.6659, "grad_norm": 0.9183220863342285, "learning_rate": 0.0002, "epoch": 3.6213725648694526, "step": 22400}, {"loss": 0.6036, "grad_norm": 0.9826246500015259, "learning_rate": 0.0002, "epoch": 3.622989249050198, "step": 22410}, {"loss": 0.6441, "grad_norm": 0.8632931113243103, "learning_rate": 0.0002, "epoch": 3.6246059332309435, "step": 22420}, {"loss": 0.6124, "grad_norm": 0.8468965291976929, "learning_rate": 0.0002, "epoch": 3.6262226174116887, "step": 22430}, {"loss": 0.6328, "grad_norm": 0.8466871976852417, "learning_rate": 0.0002, "epoch": 3.627839301592434, "step": 22440}, {"loss": 0.5941, "grad_norm": 0.9501169919967651, "learning_rate": 0.0002, "epoch": 3.629455985773179, "step": 22450}, {"loss": 0.6069, "grad_norm": 0.8906720876693726, "learning_rate": 0.0002, "epoch": 3.6310726699539244, "step": 22460}, {"loss": 0.6928, "grad_norm": 0.7400227189064026, "learning_rate": 0.0002, "epoch": 3.6326893541346696, "step": 22470}, {"loss": 0.6337, "grad_norm": 0.9756355881690979, "learning_rate": 0.0002, "epoch": 3.6343060383154153, "step": 22480}, {"loss": 0.6203, "grad_norm": 0.7504993081092834, "learning_rate": 0.0002, "epoch": 3.6359227224961606, "step": 22490}, {"loss": 0.6302, "grad_norm": 0.9270039200782776, "learning_rate": 0.0002, "epoch": 3.637539406676906, "step": 22500}, {"loss": 0.6026, "grad_norm": 0.8841686844825745, "learning_rate": 0.0002, "epoch": 3.639156090857651, "step": 22510}, {"loss": 0.6098, "grad_norm": 0.8533213138580322, "learning_rate": 0.0002, "epoch": 3.6407727750383962, "step": 22520}, {"loss": 0.6412, "grad_norm": 1.0052043199539185, "learning_rate": 0.0002, "epoch": 3.6423894592191415, "step": 22530}, {"loss": 0.6363, "grad_norm": 1.0323461294174194, "learning_rate": 0.0002, "epoch": 3.6440061433998867, "step": 22540}, {"loss": 0.6545, "grad_norm": 0.8654312491416931, "learning_rate": 0.0002, "epoch": 3.645622827580632, "step": 22550}, {"loss": 0.6155, "grad_norm": 0.6400038003921509, "learning_rate": 0.0002, "epoch": 3.647239511761377, "step": 22560}, {"loss": 0.5829, "grad_norm": 0.8061298727989197, "learning_rate": 0.0002, "epoch": 3.648856195942123, "step": 22570}, {"loss": 0.6388, "grad_norm": 0.9257854223251343, "learning_rate": 0.0002, "epoch": 3.650472880122868, "step": 22580}, {"loss": 0.6409, "grad_norm": 0.8439396619796753, "learning_rate": 0.0002, "epoch": 3.6520895643036133, "step": 22590}, {"loss": 0.5996, "grad_norm": 0.7764544486999512, "learning_rate": 0.0002, "epoch": 3.6537062484843585, "step": 22600}, {"loss": 0.6434, "grad_norm": 1.125451683998108, "learning_rate": 0.0002, "epoch": 3.6553229326651038, "step": 22610}, {"loss": 0.6579, "grad_norm": 0.7523018717765808, "learning_rate": 0.0002, "epoch": 3.656939616845849, "step": 22620}, {"loss": 0.6476, "grad_norm": 1.071026086807251, "learning_rate": 0.0002, "epoch": 3.6585563010265947, "step": 22630}, {"loss": 0.6459, "grad_norm": 0.945791482925415, "learning_rate": 0.0002, "epoch": 3.66017298520734, "step": 22640}, {"loss": 0.659, "grad_norm": 0.8001811504364014, "learning_rate": 0.0002, "epoch": 3.661789669388085, "step": 22650}, {"loss": 0.6385, "grad_norm": 0.9700816869735718, "learning_rate": 0.0002, "epoch": 3.6634063535688304, "step": 22660}, {"loss": 0.6337, "grad_norm": 0.9053242206573486, "learning_rate": 0.0002, "epoch": 3.6650230377495756, "step": 22670}, {"loss": 0.6335, "grad_norm": 0.944362461566925, "learning_rate": 0.0002, "epoch": 3.666639721930321, "step": 22680}, {"loss": 0.6235, "grad_norm": 1.067489504814148, "learning_rate": 0.0002, "epoch": 3.668256406111066, "step": 22690}, {"loss": 0.698, "grad_norm": 1.0984995365142822, "learning_rate": 0.0002, "epoch": 3.6698730902918113, "step": 22700}, {"loss": 0.6717, "grad_norm": 0.9336317777633667, "learning_rate": 0.0002, "epoch": 3.6714897744725565, "step": 22710}, {"loss": 0.6195, "grad_norm": 0.9261918663978577, "learning_rate": 0.0002, "epoch": 3.673106458653302, "step": 22720}, {"loss": 0.6332, "grad_norm": 0.8648008704185486, "learning_rate": 0.0002, "epoch": 3.6747231428340474, "step": 22730}, {"loss": 0.6576, "grad_norm": 0.7225083708763123, "learning_rate": 0.0002, "epoch": 3.6763398270147927, "step": 22740}, {"loss": 0.6406, "grad_norm": 0.9258282780647278, "learning_rate": 0.0002, "epoch": 3.677956511195538, "step": 22750}, {"loss": 0.6397, "grad_norm": 0.70876145362854, "learning_rate": 0.0002, "epoch": 3.679573195376283, "step": 22760}, {"loss": 0.6821, "grad_norm": 0.8780210018157959, "learning_rate": 0.0002, "epoch": 3.681189879557029, "step": 22770}, {"loss": 0.6036, "grad_norm": 0.8075440526008606, "learning_rate": 0.0002, "epoch": 3.682806563737774, "step": 22780}, {"loss": 0.6561, "grad_norm": 0.8503130674362183, "learning_rate": 0.0002, "epoch": 3.6844232479185193, "step": 22790}, {"loss": 0.6082, "grad_norm": 0.8413618206977844, "learning_rate": 0.0002, "epoch": 3.6860399320992645, "step": 22800}, {"loss": 0.614, "grad_norm": 0.8675165176391602, "learning_rate": 0.0002, "epoch": 3.6876566162800097, "step": 22810}, {"loss": 0.6157, "grad_norm": 0.8235884308815002, "learning_rate": 0.0002, "epoch": 3.689273300460755, "step": 22820}, {"loss": 0.5708, "grad_norm": 0.9477725625038147, "learning_rate": 0.0002, "epoch": 3.6908899846415, "step": 22830}, {"loss": 0.6481, "grad_norm": 0.7883533835411072, "learning_rate": 0.0002, "epoch": 3.6925066688222454, "step": 22840}, {"loss": 0.5872, "grad_norm": 1.047913908958435, "learning_rate": 0.0002, "epoch": 3.6941233530029907, "step": 22850}, {"loss": 0.6176, "grad_norm": 0.9171528816223145, "learning_rate": 0.0002, "epoch": 3.695740037183736, "step": 22860}, {"loss": 0.6204, "grad_norm": 0.9338192343711853, "learning_rate": 0.0002, "epoch": 3.6973567213644816, "step": 22870}, {"loss": 0.686, "grad_norm": 0.8799443244934082, "learning_rate": 0.0002, "epoch": 3.698973405545227, "step": 22880}, {"loss": 0.6206, "grad_norm": 0.8515434861183167, "learning_rate": 0.0002, "epoch": 3.700590089725972, "step": 22890}, {"loss": 0.5954, "grad_norm": 0.7805591821670532, "learning_rate": 0.0002, "epoch": 3.7022067739067173, "step": 22900}, {"loss": 0.6108, "grad_norm": 0.8470911979675293, "learning_rate": 0.0002, "epoch": 3.7038234580874625, "step": 22910}, {"loss": 0.6557, "grad_norm": 0.9452309012413025, "learning_rate": 0.0002, "epoch": 3.705440142268208, "step": 22920}, {"loss": 0.6529, "grad_norm": 0.950243353843689, "learning_rate": 0.0002, "epoch": 3.7070568264489534, "step": 22930}, {"loss": 0.6364, "grad_norm": 0.7882499098777771, "learning_rate": 0.0002, "epoch": 3.7086735106296986, "step": 22940}, {"loss": 0.6462, "grad_norm": 0.8307787775993347, "learning_rate": 0.0002, "epoch": 3.710290194810444, "step": 22950}, {"loss": 0.6371, "grad_norm": 1.0970630645751953, "learning_rate": 0.0002, "epoch": 3.711906878991189, "step": 22960}, {"loss": 0.6281, "grad_norm": 0.8269566297531128, "learning_rate": 0.0002, "epoch": 3.7135235631719343, "step": 22970}, {"loss": 0.6561, "grad_norm": 0.8306704759597778, "learning_rate": 0.0002, "epoch": 3.7151402473526796, "step": 22980}, {"loss": 0.6418, "grad_norm": 0.9710225462913513, "learning_rate": 0.0002, "epoch": 3.716756931533425, "step": 22990}, {"loss": 0.6639, "grad_norm": 0.8890530467033386, "learning_rate": 0.0002, "epoch": 3.71837361571417, "step": 23000}, {"loss": 0.6084, "grad_norm": 0.883522629737854, "learning_rate": 0.0002, "epoch": 3.7199902998949153, "step": 23010}, {"loss": 0.6183, "grad_norm": 0.8662652373313904, "learning_rate": 0.0002, "epoch": 3.721606984075661, "step": 23020}, {"loss": 0.6266, "grad_norm": 0.7228406667709351, "learning_rate": 0.0002, "epoch": 3.723223668256406, "step": 23030}, {"loss": 0.6417, "grad_norm": 1.060792088508606, "learning_rate": 0.0002, "epoch": 3.7248403524371514, "step": 23040}, {"loss": 0.6346, "grad_norm": 1.0119613409042358, "learning_rate": 0.0002, "epoch": 3.7264570366178966, "step": 23050}, {"loss": 0.6466, "grad_norm": 0.9212996959686279, "learning_rate": 0.0002, "epoch": 3.728073720798642, "step": 23060}, {"loss": 0.6454, "grad_norm": 0.925690233707428, "learning_rate": 0.0002, "epoch": 3.7296904049793875, "step": 23070}, {"loss": 0.615, "grad_norm": 0.8323310613632202, "learning_rate": 0.0002, "epoch": 3.7313070891601328, "step": 23080}, {"loss": 0.679, "grad_norm": 0.8966048955917358, "learning_rate": 0.0002, "epoch": 3.732923773340878, "step": 23090}, {"loss": 0.6151, "grad_norm": 0.8995837569236755, "learning_rate": 0.0002, "epoch": 3.7345404575216232, "step": 23100}, {"loss": 0.6143, "grad_norm": 0.8748890161514282, "learning_rate": 0.0002, "epoch": 3.7361571417023685, "step": 23110}, {"loss": 0.6246, "grad_norm": 0.7985540628433228, "learning_rate": 0.0002, "epoch": 3.7377738258831137, "step": 23120}, {"loss": 0.6279, "grad_norm": 1.0240917205810547, "learning_rate": 0.0002, "epoch": 3.739390510063859, "step": 23130}, {"loss": 0.6747, "grad_norm": 0.9181789755821228, "learning_rate": 0.0002, "epoch": 3.741007194244604, "step": 23140}, {"loss": 0.6026, "grad_norm": 0.8896583914756775, "learning_rate": 0.0002, "epoch": 3.7426238784253494, "step": 23150}, {"loss": 0.5972, "grad_norm": 0.8635515570640564, "learning_rate": 0.0002, "epoch": 3.744240562606095, "step": 23160}, {"loss": 0.6683, "grad_norm": 0.8873575329780579, "learning_rate": 0.0002, "epoch": 3.7458572467868403, "step": 23170}, {"loss": 0.6143, "grad_norm": 0.9807148575782776, "learning_rate": 0.0002, "epoch": 3.7474739309675855, "step": 23180}, {"loss": 0.6381, "grad_norm": 0.900477945804596, "learning_rate": 0.0002, "epoch": 3.7490906151483308, "step": 23190}, {"loss": 0.6542, "grad_norm": 0.9379992485046387, "learning_rate": 0.0002, "epoch": 3.750707299329076, "step": 23200}, {"loss": 0.6015, "grad_norm": 0.9649890661239624, "learning_rate": 0.0002, "epoch": 3.752323983509821, "step": 23210}, {"loss": 0.6735, "grad_norm": 0.824442446231842, "learning_rate": 0.0002, "epoch": 3.753940667690567, "step": 23220}, {"loss": 0.5992, "grad_norm": 0.8896150588989258, "learning_rate": 0.0002, "epoch": 3.755557351871312, "step": 23230}, {"loss": 0.6081, "grad_norm": 0.751249372959137, "learning_rate": 0.0002, "epoch": 3.7571740360520574, "step": 23240}, {"loss": 0.629, "grad_norm": 0.9392193555831909, "learning_rate": 0.0002, "epoch": 3.7587907202328026, "step": 23250}, {"loss": 0.6209, "grad_norm": 0.9284586310386658, "learning_rate": 0.0002, "epoch": 3.760407404413548, "step": 23260}, {"loss": 0.6414, "grad_norm": 0.7738175392150879, "learning_rate": 0.0002, "epoch": 3.762024088594293, "step": 23270}, {"loss": 0.6743, "grad_norm": 0.9252978563308716, "learning_rate": 0.0002, "epoch": 3.7636407727750383, "step": 23280}, {"loss": 0.5984, "grad_norm": 0.9501895904541016, "learning_rate": 0.0002, "epoch": 3.7652574569557835, "step": 23290}, {"loss": 0.6568, "grad_norm": 0.9416276216506958, "learning_rate": 0.0002, "epoch": 3.7668741411365287, "step": 23300}, {"loss": 0.6507, "grad_norm": 0.7076631784439087, "learning_rate": 0.0002, "epoch": 3.7684908253172744, "step": 23310}, {"loss": 0.6329, "grad_norm": 0.9864492416381836, "learning_rate": 0.0002, "epoch": 3.7701075094980196, "step": 23320}, {"loss": 0.6537, "grad_norm": 0.8450456261634827, "learning_rate": 0.0002, "epoch": 3.771724193678765, "step": 23330}, {"loss": 0.658, "grad_norm": 1.0768941640853882, "learning_rate": 0.0002, "epoch": 3.77334087785951, "step": 23340}, {"loss": 0.6408, "grad_norm": 0.9956819415092468, "learning_rate": 0.0002, "epoch": 3.7749575620402553, "step": 23350}, {"loss": 0.6464, "grad_norm": 0.9234658479690552, "learning_rate": 0.0002, "epoch": 3.7765742462210006, "step": 23360}, {"loss": 0.6542, "grad_norm": 1.0993858575820923, "learning_rate": 0.0002, "epoch": 3.7781909304017463, "step": 23370}, {"loss": 0.6391, "grad_norm": 0.923159658908844, "learning_rate": 0.0002, "epoch": 3.7798076145824915, "step": 23380}, {"loss": 0.6625, "grad_norm": 0.9311541318893433, "learning_rate": 0.0002, "epoch": 3.7814242987632367, "step": 23390}, {"loss": 0.6535, "grad_norm": 0.919681191444397, "learning_rate": 0.0002, "epoch": 3.783040982943982, "step": 23400}, {"loss": 0.6138, "grad_norm": 1.7406195402145386, "learning_rate": 0.0002, "epoch": 3.784657667124727, "step": 23410}, {"loss": 0.657, "grad_norm": 0.7789074182510376, "learning_rate": 0.0002, "epoch": 3.7862743513054724, "step": 23420}, {"loss": 0.658, "grad_norm": 0.8302814960479736, "learning_rate": 0.0002, "epoch": 3.7878910354862176, "step": 23430}, {"loss": 0.649, "grad_norm": 0.8089349269866943, "learning_rate": 0.0002, "epoch": 3.789507719666963, "step": 23440}, {"loss": 0.6682, "grad_norm": 0.9006284475326538, "learning_rate": 0.0002, "epoch": 3.791124403847708, "step": 23450}, {"loss": 0.6335, "grad_norm": 0.8426766991615295, "learning_rate": 0.0002, "epoch": 3.7927410880284538, "step": 23460}, {"loss": 0.6364, "grad_norm": 1.2576252222061157, "learning_rate": 0.0002, "epoch": 3.794357772209199, "step": 23470}, {"loss": 0.6324, "grad_norm": 1.0307610034942627, "learning_rate": 0.0002, "epoch": 3.7959744563899442, "step": 23480}, {"loss": 0.6262, "grad_norm": 0.8525972962379456, "learning_rate": 0.0002, "epoch": 3.7975911405706895, "step": 23490}, {"loss": 0.6757, "grad_norm": 1.159039855003357, "learning_rate": 0.0002, "epoch": 3.7992078247514347, "step": 23500}, {"loss": 0.6414, "grad_norm": 1.4193549156188965, "learning_rate": 0.0002, "epoch": 3.80082450893218, "step": 23510}, {"loss": 0.6413, "grad_norm": 0.8245543837547302, "learning_rate": 0.0002, "epoch": 3.8024411931129256, "step": 23520}, {"loss": 0.6417, "grad_norm": 0.8847230076789856, "learning_rate": 0.0002, "epoch": 3.804057877293671, "step": 23530}, {"loss": 0.6415, "grad_norm": 0.9574624300003052, "learning_rate": 0.0002, "epoch": 3.805674561474416, "step": 23540}, {"loss": 0.5765, "grad_norm": 1.048020601272583, "learning_rate": 0.0002, "epoch": 3.8072912456551613, "step": 23550}, {"loss": 0.6497, "grad_norm": 0.8302255868911743, "learning_rate": 0.0002, "epoch": 3.8089079298359065, "step": 23560}, {"loss": 0.6534, "grad_norm": 0.8269215822219849, "learning_rate": 0.0002, "epoch": 3.8105246140166518, "step": 23570}, {"loss": 0.6294, "grad_norm": 0.9375753402709961, "learning_rate": 0.0002, "epoch": 3.812141298197397, "step": 23580}, {"loss": 0.6132, "grad_norm": 1.0234097242355347, "learning_rate": 0.0002, "epoch": 3.8137579823781422, "step": 23590}, {"loss": 0.6625, "grad_norm": 0.8978445529937744, "learning_rate": 0.0002, "epoch": 3.8153746665588875, "step": 23600}, {"loss": 0.6315, "grad_norm": 0.7929515838623047, "learning_rate": 0.0002, "epoch": 3.816991350739633, "step": 23610}, {"loss": 0.6387, "grad_norm": 1.3255881071090698, "learning_rate": 0.0002, "epoch": 3.8186080349203784, "step": 23620}, {"loss": 0.5947, "grad_norm": 0.9188598990440369, "learning_rate": 0.0002, "epoch": 3.8202247191011236, "step": 23630}, {"loss": 0.6152, "grad_norm": 0.8811675906181335, "learning_rate": 0.0002, "epoch": 3.821841403281869, "step": 23640}, {"loss": 0.6253, "grad_norm": 0.8061038255691528, "learning_rate": 0.0002, "epoch": 3.823458087462614, "step": 23650}, {"loss": 0.6517, "grad_norm": 0.9975376129150391, "learning_rate": 0.0002, "epoch": 3.8250747716433597, "step": 23660}, {"loss": 0.6288, "grad_norm": 0.8036105036735535, "learning_rate": 0.0002, "epoch": 3.826691455824105, "step": 23670}, {"loss": 0.6845, "grad_norm": 0.7401984333992004, "learning_rate": 0.0002, "epoch": 3.82830814000485, "step": 23680}, {"loss": 0.6423, "grad_norm": 0.829753041267395, "learning_rate": 0.0002, "epoch": 3.8299248241855954, "step": 23690}, {"loss": 0.6611, "grad_norm": 0.8753240704536438, "learning_rate": 0.0002, "epoch": 3.8315415083663407, "step": 23700}, {"loss": 0.6686, "grad_norm": 0.8157842755317688, "learning_rate": 0.0002, "epoch": 3.833158192547086, "step": 23710}, {"loss": 0.6181, "grad_norm": 0.6183798909187317, "learning_rate": 0.0002, "epoch": 3.834774876727831, "step": 23720}, {"loss": 0.5965, "grad_norm": 0.9548442363739014, "learning_rate": 0.0002, "epoch": 3.8363915609085764, "step": 23730}, {"loss": 0.6456, "grad_norm": 0.8319669961929321, "learning_rate": 0.0002, "epoch": 3.8380082450893216, "step": 23740}, {"loss": 0.6585, "grad_norm": 0.9718693494796753, "learning_rate": 0.0002, "epoch": 3.839624929270067, "step": 23750}, {"loss": 0.6518, "grad_norm": 0.8672235012054443, "learning_rate": 0.0002, "epoch": 3.8412416134508125, "step": 23760}, {"loss": 0.6774, "grad_norm": 1.1210707426071167, "learning_rate": 0.0002, "epoch": 3.8428582976315577, "step": 23770}, {"loss": 0.5923, "grad_norm": 0.9177767634391785, "learning_rate": 0.0002, "epoch": 3.844474981812303, "step": 23780}, {"loss": 0.6286, "grad_norm": 0.8714171648025513, "learning_rate": 0.0002, "epoch": 3.846091665993048, "step": 23790}, {"loss": 0.6302, "grad_norm": 1.1853246688842773, "learning_rate": 0.0002, "epoch": 3.8477083501737934, "step": 23800}, {"loss": 0.6144, "grad_norm": 0.8091260194778442, "learning_rate": 0.0002, "epoch": 3.849325034354539, "step": 23810}, {"loss": 0.658, "grad_norm": 0.9710774421691895, "learning_rate": 0.0002, "epoch": 3.8509417185352843, "step": 23820}, {"loss": 0.6151, "grad_norm": 0.7648707628250122, "learning_rate": 0.0002, "epoch": 3.8525584027160296, "step": 23830}, {"loss": 0.6013, "grad_norm": 0.7809253931045532, "learning_rate": 0.0002, "epoch": 3.854175086896775, "step": 23840}, {"loss": 0.6006, "grad_norm": 0.8337951898574829, "learning_rate": 0.0002, "epoch": 3.85579177107752, "step": 23850}, {"loss": 0.6456, "grad_norm": 0.9271913170814514, "learning_rate": 0.0002, "epoch": 3.8574084552582653, "step": 23860}, {"loss": 0.6671, "grad_norm": 0.985334038734436, "learning_rate": 0.0002, "epoch": 3.8590251394390105, "step": 23870}, {"loss": 0.6693, "grad_norm": 0.8458583354949951, "learning_rate": 0.0002, "epoch": 3.8606418236197557, "step": 23880}, {"loss": 0.6207, "grad_norm": 1.015348196029663, "learning_rate": 0.0002, "epoch": 3.862258507800501, "step": 23890}, {"loss": 0.649, "grad_norm": 1.0121688842773438, "learning_rate": 0.0002, "epoch": 3.8638751919812466, "step": 23900}, {"loss": 0.5921, "grad_norm": 0.8883971571922302, "learning_rate": 0.0002, "epoch": 3.865491876161992, "step": 23910}, {"loss": 0.6597, "grad_norm": 1.028086543083191, "learning_rate": 0.0002, "epoch": 3.867108560342737, "step": 23920}, {"loss": 0.6654, "grad_norm": 0.9645734429359436, "learning_rate": 0.0002, "epoch": 3.8687252445234823, "step": 23930}, {"loss": 0.6328, "grad_norm": 0.8235350251197815, "learning_rate": 0.0002, "epoch": 3.8703419287042276, "step": 23940}, {"loss": 0.6387, "grad_norm": 1.0298916101455688, "learning_rate": 0.0002, "epoch": 3.871958612884973, "step": 23950}, {"loss": 0.5966, "grad_norm": 1.0063377618789673, "learning_rate": 0.0002, "epoch": 3.8735752970657185, "step": 23960}, {"loss": 0.6234, "grad_norm": 0.9230626821517944, "learning_rate": 0.0002, "epoch": 3.8751919812464637, "step": 23970}, {"loss": 0.6159, "grad_norm": 0.9243063926696777, "learning_rate": 0.0002, "epoch": 3.876808665427209, "step": 23980}, {"loss": 0.6035, "grad_norm": 1.0211291313171387, "learning_rate": 0.0002, "epoch": 3.878425349607954, "step": 23990}, {"loss": 0.6351, "grad_norm": 0.7800535559654236, "learning_rate": 0.0002, "epoch": 3.8800420337886994, "step": 24000}, {"loss": 0.7, "grad_norm": 0.7904248833656311, "learning_rate": 0.0002, "epoch": 3.8816587179694446, "step": 24010}, {"loss": 0.6516, "grad_norm": 1.1975988149642944, "learning_rate": 0.0002, "epoch": 3.88327540215019, "step": 24020}, {"loss": 0.6006, "grad_norm": 1.0626593828201294, "learning_rate": 0.0002, "epoch": 3.884892086330935, "step": 24030}, {"loss": 0.6115, "grad_norm": 0.9012193083763123, "learning_rate": 0.0002, "epoch": 3.8865087705116803, "step": 24040}, {"loss": 0.6786, "grad_norm": 1.1159172058105469, "learning_rate": 0.0002, "epoch": 3.888125454692426, "step": 24050}, {"loss": 0.6635, "grad_norm": 1.276838779449463, "learning_rate": 0.0002, "epoch": 3.889742138873171, "step": 24060}, {"loss": 0.5985, "grad_norm": 0.8467690348625183, "learning_rate": 0.0002, "epoch": 3.8913588230539164, "step": 24070}, {"loss": 0.6655, "grad_norm": 0.9862841963768005, "learning_rate": 0.0002, "epoch": 3.8929755072346617, "step": 24080}, {"loss": 0.6098, "grad_norm": 0.7134621739387512, "learning_rate": 0.0002, "epoch": 3.894592191415407, "step": 24090}, {"loss": 0.618, "grad_norm": 0.8178175091743469, "learning_rate": 0.0002, "epoch": 3.896208875596152, "step": 24100}, {"loss": 0.6147, "grad_norm": 0.9229172468185425, "learning_rate": 0.0002, "epoch": 3.897825559776898, "step": 24110}, {"loss": 0.6554, "grad_norm": 1.0878316164016724, "learning_rate": 0.0002, "epoch": 3.899442243957643, "step": 24120}, {"loss": 0.6616, "grad_norm": 0.971645712852478, "learning_rate": 0.0002, "epoch": 3.9010589281383883, "step": 24130}, {"loss": 0.6228, "grad_norm": 0.8862188458442688, "learning_rate": 0.0002, "epoch": 3.9026756123191335, "step": 24140}, {"loss": 0.6192, "grad_norm": 0.9126982688903809, "learning_rate": 0.0002, "epoch": 3.9042922964998787, "step": 24150}, {"loss": 0.6734, "grad_norm": 0.8833470940589905, "learning_rate": 0.0002, "epoch": 3.905908980680624, "step": 24160}, {"loss": 0.5832, "grad_norm": 0.8320947885513306, "learning_rate": 0.0002, "epoch": 3.907525664861369, "step": 24170}, {"loss": 0.6247, "grad_norm": 0.9156602025032043, "learning_rate": 0.0002, "epoch": 3.9091423490421144, "step": 24180}, {"loss": 0.6678, "grad_norm": 1.029181957244873, "learning_rate": 0.0002, "epoch": 3.9107590332228597, "step": 24190}, {"loss": 0.6565, "grad_norm": 0.9052802324295044, "learning_rate": 0.0002, "epoch": 3.9123757174036053, "step": 24200}, {"loss": 0.6346, "grad_norm": 0.8847255110740662, "learning_rate": 0.0002, "epoch": 3.9139924015843506, "step": 24210}, {"loss": 0.6343, "grad_norm": 0.9642062187194824, "learning_rate": 0.0002, "epoch": 3.915609085765096, "step": 24220}, {"loss": 0.6557, "grad_norm": 0.8629093766212463, "learning_rate": 0.0002, "epoch": 3.917225769945841, "step": 24230}, {"loss": 0.6086, "grad_norm": 0.8674976825714111, "learning_rate": 0.0002, "epoch": 3.9188424541265863, "step": 24240}, {"loss": 0.5874, "grad_norm": 1.104846477508545, "learning_rate": 0.0002, "epoch": 3.9204591383073315, "step": 24250}, {"loss": 0.6501, "grad_norm": 1.0874955654144287, "learning_rate": 0.0002, "epoch": 3.922075822488077, "step": 24260}, {"loss": 0.6455, "grad_norm": 0.8689812421798706, "learning_rate": 0.0002, "epoch": 3.9236925066688224, "step": 24270}, {"loss": 0.5893, "grad_norm": 0.9724617004394531, "learning_rate": 0.0002, "epoch": 3.9253091908495676, "step": 24280}, {"loss": 0.6616, "grad_norm": 0.9165538549423218, "learning_rate": 0.0002, "epoch": 3.926925875030313, "step": 24290}, {"loss": 0.645, "grad_norm": 0.9307710528373718, "learning_rate": 0.0002, "epoch": 3.928542559211058, "step": 24300}, {"loss": 0.6071, "grad_norm": 0.8589295148849487, "learning_rate": 0.0002, "epoch": 3.9301592433918033, "step": 24310}, {"loss": 0.6662, "grad_norm": 0.9151099920272827, "learning_rate": 0.0002, "epoch": 3.9317759275725486, "step": 24320}, {"loss": 0.7075, "grad_norm": 0.9633517265319824, "learning_rate": 0.0002, "epoch": 3.933392611753294, "step": 24330}, {"loss": 0.6432, "grad_norm": 0.9521116018295288, "learning_rate": 0.0002, "epoch": 3.935009295934039, "step": 24340}, {"loss": 0.6457, "grad_norm": 0.8366776704788208, "learning_rate": 0.0002, "epoch": 3.9366259801147847, "step": 24350}, {"loss": 0.6139, "grad_norm": 0.8972663283348083, "learning_rate": 0.0002, "epoch": 3.93824266429553, "step": 24360}, {"loss": 0.661, "grad_norm": 0.8102919459342957, "learning_rate": 0.0002, "epoch": 3.939859348476275, "step": 24370}, {"loss": 0.6388, "grad_norm": 0.8189975023269653, "learning_rate": 0.0002, "epoch": 3.9414760326570204, "step": 24380}, {"loss": 0.6818, "grad_norm": 0.9569464921951294, "learning_rate": 0.0002, "epoch": 3.9430927168377656, "step": 24390}, {"loss": 0.6999, "grad_norm": 0.7459101676940918, "learning_rate": 0.0002, "epoch": 3.9447094010185113, "step": 24400}, {"loss": 0.6069, "grad_norm": 0.8536974787712097, "learning_rate": 0.0002, "epoch": 3.9463260851992565, "step": 24410}, {"loss": 0.5683, "grad_norm": 0.8763698935508728, "learning_rate": 0.0002, "epoch": 3.9479427693800018, "step": 24420}, {"loss": 0.6478, "grad_norm": 0.9381106495857239, "learning_rate": 0.0002, "epoch": 3.949559453560747, "step": 24430}, {"loss": 0.6371, "grad_norm": 0.934440016746521, "learning_rate": 0.0002, "epoch": 3.9511761377414922, "step": 24440}, {"loss": 0.6393, "grad_norm": 0.903918981552124, "learning_rate": 0.0002, "epoch": 3.9527928219222375, "step": 24450}, {"loss": 0.6175, "grad_norm": 0.8771953582763672, "learning_rate": 0.0002, "epoch": 3.9544095061029827, "step": 24460}, {"loss": 0.6971, "grad_norm": 1.0375410318374634, "learning_rate": 0.0002, "epoch": 3.956026190283728, "step": 24470}, {"loss": 0.6313, "grad_norm": 0.9439185261726379, "learning_rate": 0.0002, "epoch": 3.957642874464473, "step": 24480}, {"loss": 0.6076, "grad_norm": 0.935467004776001, "learning_rate": 0.0002, "epoch": 3.9592595586452184, "step": 24490}, {"loss": 0.6437, "grad_norm": 0.6900772452354431, "learning_rate": 0.0002, "epoch": 3.960876242825964, "step": 24500}, {"loss": 0.6445, "grad_norm": 1.0172916650772095, "learning_rate": 0.0002, "epoch": 3.9624929270067093, "step": 24510}, {"loss": 0.6308, "grad_norm": 0.9167046546936035, "learning_rate": 0.0002, "epoch": 3.9641096111874545, "step": 24520}, {"loss": 0.6519, "grad_norm": 0.7230527997016907, "learning_rate": 0.0002, "epoch": 3.9657262953681998, "step": 24530}, {"loss": 0.6564, "grad_norm": 0.8980403542518616, "learning_rate": 0.0002, "epoch": 3.967342979548945, "step": 24540}, {"loss": 0.6099, "grad_norm": 0.8555465936660767, "learning_rate": 0.0002, "epoch": 3.9689596637296907, "step": 24550}, {"loss": 0.6617, "grad_norm": 0.7825445532798767, "learning_rate": 0.0002, "epoch": 3.970576347910436, "step": 24560}, {"loss": 0.604, "grad_norm": 0.7273133993148804, "learning_rate": 0.0002, "epoch": 3.972193032091181, "step": 24570}, {"loss": 0.6427, "grad_norm": 0.9612047672271729, "learning_rate": 0.0002, "epoch": 3.9738097162719264, "step": 24580}, {"loss": 0.6426, "grad_norm": 0.9865460991859436, "learning_rate": 0.0002, "epoch": 3.9754264004526716, "step": 24590}, {"loss": 0.6052, "grad_norm": 0.8638762831687927, "learning_rate": 0.0002, "epoch": 3.977043084633417, "step": 24600}, {"loss": 0.6097, "grad_norm": 1.0096198320388794, "learning_rate": 0.0002, "epoch": 3.978659768814162, "step": 24610}, {"loss": 0.6664, "grad_norm": 0.8475532531738281, "learning_rate": 0.0002, "epoch": 3.9802764529949073, "step": 24620}, {"loss": 0.6711, "grad_norm": 0.9696195721626282, "learning_rate": 0.0002, "epoch": 3.9818931371756525, "step": 24630}, {"loss": 0.6446, "grad_norm": 0.7499843239784241, "learning_rate": 0.0002, "epoch": 3.9835098213563978, "step": 24640}, {"loss": 0.6054, "grad_norm": 0.8865424990653992, "learning_rate": 0.0002, "epoch": 3.9851265055371434, "step": 24650}, {"loss": 0.5975, "grad_norm": 0.8089959025382996, "learning_rate": 0.0002, "epoch": 3.9867431897178887, "step": 24660}, {"loss": 0.6677, "grad_norm": 0.6946012377738953, "learning_rate": 0.0002, "epoch": 3.988359873898634, "step": 24670}, {"loss": 0.6329, "grad_norm": 0.7991759181022644, "learning_rate": 0.0002, "epoch": 3.989976558079379, "step": 24680}, {"loss": 0.6449, "grad_norm": 0.8803931474685669, "learning_rate": 0.0002, "epoch": 3.9915932422601244, "step": 24690}, {"loss": 0.7091, "grad_norm": 0.8848299980163574, "learning_rate": 0.0002, "epoch": 3.99320992644087, "step": 24700}, {"loss": 0.6551, "grad_norm": 0.7448889017105103, "learning_rate": 0.0002, "epoch": 3.9948266106216153, "step": 24710}, {"loss": 0.6432, "grad_norm": 0.9361620545387268, "learning_rate": 0.0002, "epoch": 3.9964432948023605, "step": 24720}, {"loss": 0.5917, "grad_norm": 0.9958081245422363, "learning_rate": 0.0002, "epoch": 3.9980599789831057, "step": 24730}, {"loss": 0.6567, "grad_norm": 1.026004672050476, "learning_rate": 0.0002, "epoch": 3.999676663163851, "step": 24740}]} +{"epoch": 4.9999191657909625, "step": 30927, "epoch_duration": 16879.647474765778, "total_accumulated_duration": 84415.05347633362, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.6636, "grad_norm": 0.9894065856933594, "learning_rate": 0.0002, "epoch": 0.0016166841807452913, "step": 10}, {"loss": 1.1528, "grad_norm": 1.7810699939727783, "learning_rate": 0.0002, "epoch": 0.0032333683614905826, "step": 20}, {"loss": 0.9767, "grad_norm": 0.5969577431678772, "learning_rate": 0.0002, "epoch": 0.004850052542235874, "step": 30}, {"loss": 0.9772, "grad_norm": 0.6354120969772339, "learning_rate": 0.0002, "epoch": 0.006466736722981165, "step": 40}, {"loss": 0.8643, "grad_norm": 0.5604607462882996, "learning_rate": 0.0002, "epoch": 0.008083420903726457, "step": 50}, {"loss": 0.8841, "grad_norm": 0.4676193594932556, "learning_rate": 0.0002, "epoch": 0.009700105084471748, "step": 60}, {"loss": 0.9022, "grad_norm": 0.6099211573600769, "learning_rate": 0.0002, "epoch": 0.01131678926521704, "step": 70}, {"loss": 0.9133, "grad_norm": 0.48639994859695435, "learning_rate": 0.0002, "epoch": 0.01293347344596233, "step": 80}, {"loss": 0.8704, "grad_norm": 0.4904264509677887, "learning_rate": 0.0002, "epoch": 0.014550157626707623, "step": 90}, {"loss": 0.8855, "grad_norm": 2.8334362506866455, "learning_rate": 0.0002, "epoch": 0.016166841807452915, "step": 100}, {"loss": 0.8958, "grad_norm": 0.43221670389175415, "learning_rate": 0.0002, "epoch": 0.017783525988198205, "step": 110}, {"loss": 0.8412, "grad_norm": 0.42244166135787964, "learning_rate": 0.0002, "epoch": 0.019400210168943496, "step": 120}, {"loss": 0.8467, "grad_norm": 0.45363298058509827, "learning_rate": 0.0002, "epoch": 0.02101689434968879, "step": 130}, {"loss": 0.8641, "grad_norm": 0.44816508889198303, "learning_rate": 0.0002, "epoch": 0.02263357853043408, "step": 140}, {"loss": 0.8496, "grad_norm": 0.43308213353157043, "learning_rate": 0.0002, "epoch": 0.02425026271117937, "step": 150}, {"loss": 0.8213, "grad_norm": 0.4084763526916504, "learning_rate": 0.0002, "epoch": 0.02586694689192466, "step": 160}, {"loss": 0.8343, "grad_norm": 0.5363703966140747, "learning_rate": 0.0002, "epoch": 0.027483631072669955, "step": 170}, {"loss": 0.8558, "grad_norm": 0.4619699716567993, "learning_rate": 0.0002, "epoch": 0.029100315253415245, "step": 180}, {"loss": 0.8878, "grad_norm": 0.49069908261299133, "learning_rate": 0.0002, "epoch": 0.030716999434160536, "step": 190}, {"loss": 0.8867, "grad_norm": 0.4645835757255554, "learning_rate": 0.0002, "epoch": 0.03233368361490583, "step": 200}, {"loss": 0.8842, "grad_norm": 1.2411243915557861, "learning_rate": 0.0002, "epoch": 0.03395036779565112, "step": 210}, {"loss": 0.8245, "grad_norm": 0.5211851596832275, "learning_rate": 0.0002, "epoch": 0.03556705197639641, "step": 220}, {"loss": 0.8194, "grad_norm": 0.5253691673278809, "learning_rate": 0.0002, "epoch": 0.037183736157141704, "step": 230}, {"loss": 0.8856, "grad_norm": 0.4567478895187378, "learning_rate": 0.0002, "epoch": 0.03880042033788699, "step": 240}, {"loss": 0.838, "grad_norm": 0.5472128391265869, "learning_rate": 0.0002, "epoch": 0.040417104518632285, "step": 250}, {"loss": 0.8201, "grad_norm": 0.42978546023368835, "learning_rate": 0.0002, "epoch": 0.04203378869937758, "step": 260}, {"loss": 0.8334, "grad_norm": 0.601734459400177, "learning_rate": 0.0002, "epoch": 0.043650472880122866, "step": 270}, {"loss": 0.815, "grad_norm": 0.4286513328552246, "learning_rate": 0.0002, "epoch": 0.04526715706086816, "step": 280}, {"loss": 0.8758, "grad_norm": 0.5230861902236938, "learning_rate": 0.0002, "epoch": 0.046883841241613454, "step": 290}, {"loss": 0.8636, "grad_norm": 0.6504611968994141, "learning_rate": 0.0002, "epoch": 0.04850052542235874, "step": 300}, {"loss": 0.8102, "grad_norm": 0.43485215306282043, "learning_rate": 0.0002, "epoch": 0.050117209603104035, "step": 310}, {"loss": 0.8221, "grad_norm": 0.4717007875442505, "learning_rate": 0.0002, "epoch": 0.05173389378384932, "step": 320}, {"loss": 0.8469, "grad_norm": 0.4059787690639496, "learning_rate": 0.0002, "epoch": 0.053350577964594616, "step": 330}, {"loss": 0.8866, "grad_norm": 0.4366913437843323, "learning_rate": 0.0002, "epoch": 0.05496726214533991, "step": 340}, {"loss": 0.7976, "grad_norm": 0.4233848452568054, "learning_rate": 0.0002, "epoch": 0.0565839463260852, "step": 350}, {"loss": 0.8456, "grad_norm": 0.4209108352661133, "learning_rate": 0.0002, "epoch": 0.05820063050683049, "step": 360}, {"loss": 0.816, "grad_norm": 0.41637396812438965, "learning_rate": 0.0002, "epoch": 0.059817314687575784, "step": 370}, {"loss": 0.7976, "grad_norm": 0.46235376596450806, "learning_rate": 0.0002, "epoch": 0.06143399886832107, "step": 380}, {"loss": 0.7966, "grad_norm": 0.4013484716415405, "learning_rate": 0.0002, "epoch": 0.06305068304906636, "step": 390}, {"loss": 0.8253, "grad_norm": 0.47443896532058716, "learning_rate": 0.0002, "epoch": 0.06466736722981166, "step": 400}, {"loss": 0.8666, "grad_norm": 0.3942156434059143, "learning_rate": 0.0002, "epoch": 0.06628405141055695, "step": 410}, {"loss": 0.8402, "grad_norm": 0.4965320825576782, "learning_rate": 0.0002, "epoch": 0.06790073559130223, "step": 420}, {"loss": 0.8317, "grad_norm": 0.4304835796356201, "learning_rate": 0.0002, "epoch": 0.06951741977204753, "step": 430}, {"loss": 0.8528, "grad_norm": 0.511726975440979, "learning_rate": 0.0002, "epoch": 0.07113410395279282, "step": 440}, {"loss": 0.8675, "grad_norm": 0.4040689170360565, "learning_rate": 0.0002, "epoch": 0.07275078813353811, "step": 450}, {"loss": 0.8788, "grad_norm": 0.5402171015739441, "learning_rate": 0.0002, "epoch": 0.07436747231428341, "step": 460}, {"loss": 0.8737, "grad_norm": 0.4174517095088959, "learning_rate": 0.0002, "epoch": 0.0759841564950287, "step": 470}, {"loss": 0.7605, "grad_norm": 0.4306182265281677, "learning_rate": 0.0002, "epoch": 0.07760084067577398, "step": 480}, {"loss": 0.799, "grad_norm": 0.535210132598877, "learning_rate": 0.0002, "epoch": 0.07921752485651928, "step": 490}, {"loss": 0.7825, "grad_norm": 0.5339109897613525, "learning_rate": 0.0002, "epoch": 0.08083420903726457, "step": 500}, {"loss": 0.8985, "grad_norm": 0.45754891633987427, "learning_rate": 0.0002, "epoch": 0.08245089321800986, "step": 510}, {"loss": 0.8144, "grad_norm": 0.43820783495903015, "learning_rate": 0.0002, "epoch": 0.08406757739875516, "step": 520}, {"loss": 0.8001, "grad_norm": 0.4434749186038971, "learning_rate": 0.0002, "epoch": 0.08568426157950045, "step": 530}, {"loss": 0.7857, "grad_norm": 0.43111467361450195, "learning_rate": 0.0002, "epoch": 0.08730094576024573, "step": 540}, {"loss": 0.8418, "grad_norm": 0.4378940165042877, "learning_rate": 0.0002, "epoch": 0.08891762994099103, "step": 550}, {"loss": 0.8361, "grad_norm": 0.4772215187549591, "learning_rate": 0.0002, "epoch": 0.09053431412173632, "step": 560}, {"loss": 0.8268, "grad_norm": 0.6837629079818726, "learning_rate": 0.0002, "epoch": 0.09215099830248161, "step": 570}, {"loss": 0.8607, "grad_norm": 0.42241212725639343, "learning_rate": 0.0002, "epoch": 0.09376768248322691, "step": 580}, {"loss": 0.852, "grad_norm": 0.5165936350822449, "learning_rate": 0.0002, "epoch": 0.0953843666639722, "step": 590}, {"loss": 0.8664, "grad_norm": 0.48737478256225586, "learning_rate": 0.0002, "epoch": 0.09700105084471748, "step": 600}, {"loss": 0.8806, "grad_norm": 0.47419852018356323, "learning_rate": 0.0002, "epoch": 0.09861773502546278, "step": 610}, {"loss": 0.8254, "grad_norm": 0.4975486099720001, "learning_rate": 0.0002, "epoch": 0.10023441920620807, "step": 620}, {"loss": 0.8548, "grad_norm": 0.49123844504356384, "learning_rate": 0.0002, "epoch": 0.10185110338695336, "step": 630}, {"loss": 0.8911, "grad_norm": 0.6288952827453613, "learning_rate": 0.0002, "epoch": 0.10346778756769864, "step": 640}, {"loss": 0.827, "grad_norm": 0.4277345836162567, "learning_rate": 0.0002, "epoch": 0.10508447174844394, "step": 650}, {"loss": 0.7996, "grad_norm": 0.4021061956882477, "learning_rate": 0.0002, "epoch": 0.10670115592918923, "step": 660}, {"loss": 0.87, "grad_norm": 0.3492237329483032, "learning_rate": 0.0002, "epoch": 0.10831784010993452, "step": 670}, {"loss": 0.8698, "grad_norm": 0.4341012239456177, "learning_rate": 0.0002, "epoch": 0.10993452429067982, "step": 680}, {"loss": 0.781, "grad_norm": 0.7296304106712341, "learning_rate": 0.0002, "epoch": 0.1115512084714251, "step": 690}, {"loss": 0.8433, "grad_norm": 0.397494912147522, "learning_rate": 0.0002, "epoch": 0.1131678926521704, "step": 700}, {"loss": 0.827, "grad_norm": 0.396431028842926, "learning_rate": 0.0002, "epoch": 0.1147845768329157, "step": 710}, {"loss": 0.8379, "grad_norm": 0.48842838406562805, "learning_rate": 0.0002, "epoch": 0.11640126101366098, "step": 720}, {"loss": 0.8238, "grad_norm": 0.46322616934776306, "learning_rate": 0.0002, "epoch": 0.11801794519440627, "step": 730}, {"loss": 0.8041, "grad_norm": 0.47990912199020386, "learning_rate": 0.0002, "epoch": 0.11963462937515157, "step": 740}, {"loss": 0.82, "grad_norm": 0.4997142255306244, "learning_rate": 0.0002, "epoch": 0.12125131355589686, "step": 750}, {"loss": 0.7702, "grad_norm": 0.4040526747703552, "learning_rate": 0.0002, "epoch": 0.12286799773664214, "step": 760}, {"loss": 0.863, "grad_norm": 0.453095942735672, "learning_rate": 0.0002, "epoch": 0.12448468191738744, "step": 770}, {"loss": 0.8792, "grad_norm": 0.4636971950531006, "learning_rate": 0.0002, "epoch": 0.12610136609813272, "step": 780}, {"loss": 0.8112, "grad_norm": 0.4279276132583618, "learning_rate": 0.0002, "epoch": 0.12771805027887803, "step": 790}, {"loss": 0.8711, "grad_norm": 0.46212655305862427, "learning_rate": 0.0002, "epoch": 0.12933473445962332, "step": 800}, {"loss": 0.8368, "grad_norm": 0.43127650022506714, "learning_rate": 0.0002, "epoch": 0.1309514186403686, "step": 810}, {"loss": 0.8476, "grad_norm": 0.4201301336288452, "learning_rate": 0.0002, "epoch": 0.1325681028211139, "step": 820}, {"loss": 0.8078, "grad_norm": 0.42583167552948, "learning_rate": 0.0002, "epoch": 0.13418478700185918, "step": 830}, {"loss": 0.8219, "grad_norm": 0.4535622000694275, "learning_rate": 0.0002, "epoch": 0.13580147118260447, "step": 840}, {"loss": 0.8423, "grad_norm": 0.4116036891937256, "learning_rate": 0.0002, "epoch": 0.13741815536334978, "step": 850}, {"loss": 0.8466, "grad_norm": 0.45997580885887146, "learning_rate": 0.0002, "epoch": 0.13903483954409507, "step": 860}, {"loss": 0.8917, "grad_norm": 0.4487837255001068, "learning_rate": 0.0002, "epoch": 0.14065152372484035, "step": 870}, {"loss": 0.8217, "grad_norm": 0.43650057911872864, "learning_rate": 0.0002, "epoch": 0.14226820790558564, "step": 880}, {"loss": 0.8178, "grad_norm": 0.5335358381271362, "learning_rate": 0.0002, "epoch": 0.14388489208633093, "step": 890}, {"loss": 0.7957, "grad_norm": 0.5989000201225281, "learning_rate": 0.0002, "epoch": 0.14550157626707622, "step": 900}, {"loss": 0.8385, "grad_norm": 0.517179012298584, "learning_rate": 0.0002, "epoch": 0.14711826044782153, "step": 910}, {"loss": 0.8255, "grad_norm": 0.44435232877731323, "learning_rate": 0.0002, "epoch": 0.14873494462856682, "step": 920}, {"loss": 0.8305, "grad_norm": 0.42635923624038696, "learning_rate": 0.0002, "epoch": 0.1503516288093121, "step": 930}, {"loss": 0.8043, "grad_norm": 0.49603334069252014, "learning_rate": 0.0002, "epoch": 0.1519683129900574, "step": 940}, {"loss": 0.8377, "grad_norm": 0.40639808773994446, "learning_rate": 0.0002, "epoch": 0.15358499717080268, "step": 950}, {"loss": 0.8529, "grad_norm": 0.4850759208202362, "learning_rate": 0.0002, "epoch": 0.15520168135154797, "step": 960}, {"loss": 0.846, "grad_norm": 0.4427442252635956, "learning_rate": 0.0002, "epoch": 0.15681836553229328, "step": 970}, {"loss": 0.8705, "grad_norm": 0.3760930001735687, "learning_rate": 0.0002, "epoch": 0.15843504971303857, "step": 980}, {"loss": 0.8644, "grad_norm": 0.4794144332408905, "learning_rate": 0.0002, "epoch": 0.16005173389378385, "step": 990}, {"loss": 0.8002, "grad_norm": 0.45828768610954285, "learning_rate": 0.0002, "epoch": 0.16166841807452914, "step": 1000}, {"loss": 0.7658, "grad_norm": 0.6313053369522095, "learning_rate": 0.0002, "epoch": 0.16328510225527443, "step": 1010}, {"loss": 0.8047, "grad_norm": 0.45041006803512573, "learning_rate": 0.0002, "epoch": 0.16490178643601971, "step": 1020}, {"loss": 0.8423, "grad_norm": 0.441403865814209, "learning_rate": 0.0002, "epoch": 0.166518470616765, "step": 1030}, {"loss": 0.8475, "grad_norm": 0.8171296119689941, "learning_rate": 0.0002, "epoch": 0.16813515479751032, "step": 1040}, {"loss": 0.845, "grad_norm": 0.7137420773506165, "learning_rate": 0.0002, "epoch": 0.1697518389782556, "step": 1050}, {"loss": 0.8213, "grad_norm": 0.5236809849739075, "learning_rate": 0.0002, "epoch": 0.1713685231590009, "step": 1060}, {"loss": 0.8265, "grad_norm": 0.5021864175796509, "learning_rate": 0.0002, "epoch": 0.17298520733974618, "step": 1070}, {"loss": 0.8305, "grad_norm": 0.47347521781921387, "learning_rate": 0.0002, "epoch": 0.17460189152049146, "step": 1080}, {"loss": 0.8105, "grad_norm": 0.4631653428077698, "learning_rate": 0.0002, "epoch": 0.17621857570123675, "step": 1090}, {"loss": 0.8166, "grad_norm": 0.49169182777404785, "learning_rate": 0.0002, "epoch": 0.17783525988198207, "step": 1100}, {"loss": 0.8012, "grad_norm": 0.5019739270210266, "learning_rate": 0.0002, "epoch": 0.17945194406272735, "step": 1110}, {"loss": 0.8247, "grad_norm": 0.5100422501564026, "learning_rate": 0.0002, "epoch": 0.18106862824347264, "step": 1120}, {"loss": 0.8142, "grad_norm": 0.3888324499130249, "learning_rate": 0.0002, "epoch": 0.18268531242421793, "step": 1130}, {"loss": 0.8533, "grad_norm": 0.39765217900276184, "learning_rate": 0.0002, "epoch": 0.18430199660496321, "step": 1140}, {"loss": 0.8541, "grad_norm": 0.47190186381340027, "learning_rate": 0.0002, "epoch": 0.1859186807857085, "step": 1150}, {"loss": 0.8301, "grad_norm": 0.4464188814163208, "learning_rate": 0.0002, "epoch": 0.18753536496645382, "step": 1160}, {"loss": 0.8341, "grad_norm": 0.5153930187225342, "learning_rate": 0.0002, "epoch": 0.1891520491471991, "step": 1170}, {"loss": 0.8033, "grad_norm": 0.4779708683490753, "learning_rate": 0.0002, "epoch": 0.1907687333279444, "step": 1180}, {"loss": 0.8187, "grad_norm": 0.4834315776824951, "learning_rate": 0.0002, "epoch": 0.19238541750868968, "step": 1190}, {"loss": 0.7721, "grad_norm": 0.402357816696167, "learning_rate": 0.0002, "epoch": 0.19400210168943496, "step": 1200}, {"loss": 0.7941, "grad_norm": 0.45899084210395813, "learning_rate": 0.0002, "epoch": 0.19561878587018025, "step": 1210}, {"loss": 0.8353, "grad_norm": 0.5106529593467712, "learning_rate": 0.0002, "epoch": 0.19723547005092557, "step": 1220}, {"loss": 0.7816, "grad_norm": 0.45261722803115845, "learning_rate": 0.0002, "epoch": 0.19885215423167085, "step": 1230}, {"loss": 0.8068, "grad_norm": 0.4647127091884613, "learning_rate": 0.0002, "epoch": 0.20046883841241614, "step": 1240}, {"loss": 0.8239, "grad_norm": 0.4849368929862976, "learning_rate": 0.0002, "epoch": 0.20208552259316143, "step": 1250}, {"loss": 0.8514, "grad_norm": 0.4518061578273773, "learning_rate": 0.0002, "epoch": 0.2037022067739067, "step": 1260}, {"loss": 0.8158, "grad_norm": 0.49535325169563293, "learning_rate": 0.0002, "epoch": 0.205318890954652, "step": 1270}, {"loss": 0.8348, "grad_norm": 0.4835205376148224, "learning_rate": 0.0002, "epoch": 0.2069355751353973, "step": 1280}, {"loss": 0.8428, "grad_norm": 0.45308539271354675, "learning_rate": 0.0002, "epoch": 0.2085522593161426, "step": 1290}, {"loss": 0.7993, "grad_norm": 0.5369905233383179, "learning_rate": 0.0002, "epoch": 0.2101689434968879, "step": 1300}, {"loss": 0.8676, "grad_norm": 0.5031622052192688, "learning_rate": 0.0002, "epoch": 0.21178562767763318, "step": 1310}, {"loss": 0.7686, "grad_norm": 0.48010334372520447, "learning_rate": 0.0002, "epoch": 0.21340231185837846, "step": 1320}, {"loss": 0.806, "grad_norm": 0.4905701279640198, "learning_rate": 0.0002, "epoch": 0.21501899603912375, "step": 1330}, {"loss": 0.7885, "grad_norm": 0.43531742691993713, "learning_rate": 0.0002, "epoch": 0.21663568021986904, "step": 1340}, {"loss": 0.8191, "grad_norm": 0.44330692291259766, "learning_rate": 0.0002, "epoch": 0.21825236440061435, "step": 1350}, {"loss": 0.8205, "grad_norm": 0.5384416580200195, "learning_rate": 0.0002, "epoch": 0.21986904858135964, "step": 1360}, {"loss": 0.7726, "grad_norm": 0.4181833863258362, "learning_rate": 0.0002, "epoch": 0.22148573276210493, "step": 1370}, {"loss": 0.8311, "grad_norm": 0.523833692073822, "learning_rate": 0.0002, "epoch": 0.2231024169428502, "step": 1380}, {"loss": 0.7913, "grad_norm": 0.5528736710548401, "learning_rate": 0.0002, "epoch": 0.2247191011235955, "step": 1390}, {"loss": 0.8079, "grad_norm": 0.43515023589134216, "learning_rate": 0.0002, "epoch": 0.2263357853043408, "step": 1400}, {"loss": 0.8403, "grad_norm": 0.48809877038002014, "learning_rate": 0.0002, "epoch": 0.2279524694850861, "step": 1410}, {"loss": 0.8165, "grad_norm": 0.43591251969337463, "learning_rate": 0.0002, "epoch": 0.2295691536658314, "step": 1420}, {"loss": 0.8147, "grad_norm": 0.44625312089920044, "learning_rate": 0.0002, "epoch": 0.23118583784657668, "step": 1430}, {"loss": 0.8134, "grad_norm": 0.4390665292739868, "learning_rate": 0.0002, "epoch": 0.23280252202732196, "step": 1440}, {"loss": 0.8465, "grad_norm": 0.48496049642562866, "learning_rate": 0.0002, "epoch": 0.23441920620806725, "step": 1450}, {"loss": 0.775, "grad_norm": 0.45919957756996155, "learning_rate": 0.0002, "epoch": 0.23603589038881254, "step": 1460}, {"loss": 0.8659, "grad_norm": 0.5471845865249634, "learning_rate": 0.0002, "epoch": 0.23765257456955785, "step": 1470}, {"loss": 0.8164, "grad_norm": 0.47269317507743835, "learning_rate": 0.0002, "epoch": 0.23926925875030314, "step": 1480}, {"loss": 0.854, "grad_norm": 0.4930245578289032, "learning_rate": 0.0002, "epoch": 0.24088594293104842, "step": 1490}, {"loss": 0.8139, "grad_norm": 0.5605630278587341, "learning_rate": 0.0002, "epoch": 0.2425026271117937, "step": 1500}, {"loss": 0.8125, "grad_norm": 0.4435870945453644, "learning_rate": 0.0002, "epoch": 0.244119311292539, "step": 1510}, {"loss": 0.8123, "grad_norm": 0.4941999912261963, "learning_rate": 0.0002, "epoch": 0.24573599547328429, "step": 1520}, {"loss": 0.8427, "grad_norm": 0.5100624561309814, "learning_rate": 0.0002, "epoch": 0.24735267965402957, "step": 1530}, {"loss": 0.8405, "grad_norm": 0.4638267457485199, "learning_rate": 0.0002, "epoch": 0.2489693638347749, "step": 1540}, {"loss": 0.81, "grad_norm": 0.5071570873260498, "learning_rate": 0.0002, "epoch": 0.25058604801552015, "step": 1550}, {"loss": 0.7724, "grad_norm": 0.4291319251060486, "learning_rate": 0.0002, "epoch": 0.25220273219626543, "step": 1560}, {"loss": 0.7984, "grad_norm": 0.5388049483299255, "learning_rate": 0.0002, "epoch": 0.2538194163770108, "step": 1570}, {"loss": 0.8176, "grad_norm": 0.5083683729171753, "learning_rate": 0.0002, "epoch": 0.25543610055775606, "step": 1580}, {"loss": 0.843, "grad_norm": 0.4824463725090027, "learning_rate": 0.0002, "epoch": 0.25705278473850135, "step": 1590}, {"loss": 0.7996, "grad_norm": 0.41177722811698914, "learning_rate": 0.0002, "epoch": 0.25866946891924664, "step": 1600}, {"loss": 0.7772, "grad_norm": 0.5656219124794006, "learning_rate": 0.0002, "epoch": 0.2602861530999919, "step": 1610}, {"loss": 0.7955, "grad_norm": 0.41063204407691956, "learning_rate": 0.0002, "epoch": 0.2619028372807372, "step": 1620}, {"loss": 0.7998, "grad_norm": 0.4897061288356781, "learning_rate": 0.0002, "epoch": 0.2635195214614825, "step": 1630}, {"loss": 0.8198, "grad_norm": 0.4454376697540283, "learning_rate": 0.0002, "epoch": 0.2651362056422278, "step": 1640}, {"loss": 0.8684, "grad_norm": 0.4355238378047943, "learning_rate": 0.0002, "epoch": 0.26675288982297307, "step": 1650}, {"loss": 0.7801, "grad_norm": 0.458310067653656, "learning_rate": 0.0002, "epoch": 0.26836957400371836, "step": 1660}, {"loss": 0.7935, "grad_norm": 0.4752083718776703, "learning_rate": 0.0002, "epoch": 0.26998625818446365, "step": 1670}, {"loss": 0.8267, "grad_norm": 0.4666106402873993, "learning_rate": 0.0002, "epoch": 0.27160294236520893, "step": 1680}, {"loss": 0.8252, "grad_norm": 0.4213818609714508, "learning_rate": 0.0002, "epoch": 0.2732196265459543, "step": 1690}, {"loss": 0.8559, "grad_norm": 0.5768913626670837, "learning_rate": 0.0002, "epoch": 0.27483631072669956, "step": 1700}, {"loss": 0.7931, "grad_norm": 0.4209914803504944, "learning_rate": 0.0002, "epoch": 0.27645299490744485, "step": 1710}, {"loss": 0.8167, "grad_norm": 0.501909613609314, "learning_rate": 0.0002, "epoch": 0.27806967908819014, "step": 1720}, {"loss": 0.7832, "grad_norm": 0.5266261100769043, "learning_rate": 0.0002, "epoch": 0.2796863632689354, "step": 1730}, {"loss": 0.8102, "grad_norm": 0.43806859850883484, "learning_rate": 0.0002, "epoch": 0.2813030474496807, "step": 1740}, {"loss": 0.8157, "grad_norm": 0.46048814058303833, "learning_rate": 0.0002, "epoch": 0.282919731630426, "step": 1750}, {"loss": 0.8596, "grad_norm": 0.44972819089889526, "learning_rate": 0.0002, "epoch": 0.2845364158111713, "step": 1760}, {"loss": 0.8421, "grad_norm": 0.5114831328392029, "learning_rate": 0.0002, "epoch": 0.28615309999191657, "step": 1770}, {"loss": 0.8361, "grad_norm": 0.47931742668151855, "learning_rate": 0.0002, "epoch": 0.28776978417266186, "step": 1780}, {"loss": 0.8265, "grad_norm": 0.5092599987983704, "learning_rate": 0.0002, "epoch": 0.28938646835340714, "step": 1790}, {"loss": 0.8506, "grad_norm": 0.37581443786621094, "learning_rate": 0.0002, "epoch": 0.29100315253415243, "step": 1800}, {"loss": 0.7932, "grad_norm": 0.47097381949424744, "learning_rate": 0.0002, "epoch": 0.2926198367148977, "step": 1810}, {"loss": 0.7787, "grad_norm": 0.48300236463546753, "learning_rate": 0.0002, "epoch": 0.29423652089564306, "step": 1820}, {"loss": 0.8391, "grad_norm": 0.5600419640541077, "learning_rate": 0.0002, "epoch": 0.29585320507638835, "step": 1830}, {"loss": 0.8507, "grad_norm": 0.48555272817611694, "learning_rate": 0.0002, "epoch": 0.29746988925713364, "step": 1840}, {"loss": 0.7657, "grad_norm": 0.3752668499946594, "learning_rate": 0.0002, "epoch": 0.2990865734378789, "step": 1850}, {"loss": 0.7915, "grad_norm": 0.5328747034072876, "learning_rate": 0.0002, "epoch": 0.3007032576186242, "step": 1860}, {"loss": 0.8426, "grad_norm": 0.48716455698013306, "learning_rate": 0.0002, "epoch": 0.3023199417993695, "step": 1870}, {"loss": 0.8335, "grad_norm": 0.5011493563652039, "learning_rate": 0.0002, "epoch": 0.3039366259801148, "step": 1880}, {"loss": 0.852, "grad_norm": 0.46461427211761475, "learning_rate": 0.0002, "epoch": 0.30555331016086007, "step": 1890}, {"loss": 0.8478, "grad_norm": 0.36630210280418396, "learning_rate": 0.0002, "epoch": 0.30716999434160536, "step": 1900}, {"loss": 0.8162, "grad_norm": 0.4217296242713928, "learning_rate": 0.0002, "epoch": 0.30878667852235064, "step": 1910}, {"loss": 0.8128, "grad_norm": 0.4394875466823578, "learning_rate": 0.0002, "epoch": 0.31040336270309593, "step": 1920}, {"loss": 0.8471, "grad_norm": 0.6587965488433838, "learning_rate": 0.0002, "epoch": 0.3120200468838412, "step": 1930}, {"loss": 0.8565, "grad_norm": 0.5469298958778381, "learning_rate": 0.0002, "epoch": 0.31363673106458656, "step": 1940}, {"loss": 0.8236, "grad_norm": 0.4371595084667206, "learning_rate": 0.0002, "epoch": 0.31525341524533185, "step": 1950}, {"loss": 0.887, "grad_norm": 0.4809541404247284, "learning_rate": 0.0002, "epoch": 0.31687009942607713, "step": 1960}, {"loss": 0.7855, "grad_norm": 0.6061086654663086, "learning_rate": 0.0002, "epoch": 0.3184867836068224, "step": 1970}, {"loss": 0.7679, "grad_norm": 0.5342657566070557, "learning_rate": 0.0002, "epoch": 0.3201034677875677, "step": 1980}, {"loss": 0.7955, "grad_norm": 0.5057743787765503, "learning_rate": 0.0002, "epoch": 0.321720151968313, "step": 1990}, {"loss": 0.7774, "grad_norm": 0.528626024723053, "learning_rate": 0.0002, "epoch": 0.3233368361490583, "step": 2000}, {"loss": 0.8845, "grad_norm": 0.46742770075798035, "learning_rate": 0.0002, "epoch": 0.32495352032980357, "step": 2010}, {"loss": 0.8484, "grad_norm": 0.515101432800293, "learning_rate": 0.0002, "epoch": 0.32657020451054886, "step": 2020}, {"loss": 0.8139, "grad_norm": 0.41941216588020325, "learning_rate": 0.0002, "epoch": 0.32818688869129414, "step": 2030}, {"loss": 0.7637, "grad_norm": 0.49902522563934326, "learning_rate": 0.0002, "epoch": 0.32980357287203943, "step": 2040}, {"loss": 0.7822, "grad_norm": 0.4120897650718689, "learning_rate": 0.0002, "epoch": 0.3314202570527847, "step": 2050}, {"loss": 0.8057, "grad_norm": 0.45352041721343994, "learning_rate": 0.0002, "epoch": 0.33303694123353, "step": 2060}, {"loss": 0.7913, "grad_norm": 0.523199737071991, "learning_rate": 0.0002, "epoch": 0.33465362541427535, "step": 2070}, {"loss": 0.8036, "grad_norm": 0.4390358626842499, "learning_rate": 0.0002, "epoch": 0.33627030959502063, "step": 2080}, {"loss": 0.8145, "grad_norm": 0.6752901077270508, "learning_rate": 0.0002, "epoch": 0.3378869937757659, "step": 2090}, {"loss": 0.7807, "grad_norm": 0.547821044921875, "learning_rate": 0.0002, "epoch": 0.3395036779565112, "step": 2100}, {"loss": 0.8561, "grad_norm": 0.5161308646202087, "learning_rate": 0.0002, "epoch": 0.3411203621372565, "step": 2110}, {"loss": 0.7697, "grad_norm": 0.4565401077270508, "learning_rate": 0.0002, "epoch": 0.3427370463180018, "step": 2120}, {"loss": 0.7964, "grad_norm": 0.4666115939617157, "learning_rate": 0.0002, "epoch": 0.34435373049874707, "step": 2130}, {"loss": 0.8189, "grad_norm": 0.4090428352355957, "learning_rate": 0.0002, "epoch": 0.34597041467949236, "step": 2140}, {"loss": 0.8817, "grad_norm": 0.510845422744751, "learning_rate": 0.0002, "epoch": 0.34758709886023764, "step": 2150}, {"loss": 0.8398, "grad_norm": 0.42861923575401306, "learning_rate": 0.0002, "epoch": 0.34920378304098293, "step": 2160}, {"loss": 0.7716, "grad_norm": 0.4476332664489746, "learning_rate": 0.0002, "epoch": 0.3508204672217282, "step": 2170}, {"loss": 0.7845, "grad_norm": 0.6065791249275208, "learning_rate": 0.0002, "epoch": 0.3524371514024735, "step": 2180}, {"loss": 0.8187, "grad_norm": 0.42335066199302673, "learning_rate": 0.0002, "epoch": 0.35405383558321885, "step": 2190}, {"loss": 0.8239, "grad_norm": 0.5094629526138306, "learning_rate": 0.0002, "epoch": 0.35567051976396413, "step": 2200}, {"loss": 0.7807, "grad_norm": 0.5476373434066772, "learning_rate": 0.0002, "epoch": 0.3572872039447094, "step": 2210}, {"loss": 0.814, "grad_norm": 0.3911719024181366, "learning_rate": 0.0002, "epoch": 0.3589038881254547, "step": 2220}, {"loss": 0.8599, "grad_norm": 0.6599636077880859, "learning_rate": 0.0002, "epoch": 0.3605205723062, "step": 2230}, {"loss": 0.7482, "grad_norm": 0.40381914377212524, "learning_rate": 0.0002, "epoch": 0.3621372564869453, "step": 2240}, {"loss": 0.7772, "grad_norm": 0.4433908462524414, "learning_rate": 0.0002, "epoch": 0.36375394066769057, "step": 2250}, {"loss": 0.8503, "grad_norm": 0.578326940536499, "learning_rate": 0.0002, "epoch": 0.36537062484843585, "step": 2260}, {"loss": 0.8178, "grad_norm": 0.5734784007072449, "learning_rate": 0.0002, "epoch": 0.36698730902918114, "step": 2270}, {"loss": 0.8193, "grad_norm": 0.45555487275123596, "learning_rate": 0.0002, "epoch": 0.36860399320992643, "step": 2280}, {"loss": 0.7929, "grad_norm": 0.5666276216506958, "learning_rate": 0.0002, "epoch": 0.3702206773906717, "step": 2290}, {"loss": 0.8292, "grad_norm": 0.5461117625236511, "learning_rate": 0.0002, "epoch": 0.371837361571417, "step": 2300}, {"loss": 0.8204, "grad_norm": 0.6318911910057068, "learning_rate": 0.0002, "epoch": 0.3734540457521623, "step": 2310}, {"loss": 0.7964, "grad_norm": 0.493263304233551, "learning_rate": 0.0002, "epoch": 0.37507072993290763, "step": 2320}, {"loss": 0.8339, "grad_norm": 0.5888760089874268, "learning_rate": 0.0002, "epoch": 0.3766874141136529, "step": 2330}, {"loss": 0.7737, "grad_norm": 0.48671841621398926, "learning_rate": 0.0002, "epoch": 0.3783040982943982, "step": 2340}, {"loss": 0.8367, "grad_norm": 0.4385145306587219, "learning_rate": 0.0002, "epoch": 0.3799207824751435, "step": 2350}, {"loss": 0.812, "grad_norm": 0.5523318648338318, "learning_rate": 0.0002, "epoch": 0.3815374666558888, "step": 2360}, {"loss": 0.8351, "grad_norm": 0.7308220267295837, "learning_rate": 0.0002, "epoch": 0.38315415083663407, "step": 2370}, {"loss": 0.859, "grad_norm": 0.554214358329773, "learning_rate": 0.0002, "epoch": 0.38477083501737935, "step": 2380}, {"loss": 0.8146, "grad_norm": 0.5425800085067749, "learning_rate": 0.0002, "epoch": 0.38638751919812464, "step": 2390}, {"loss": 0.8282, "grad_norm": 0.48811158537864685, "learning_rate": 0.0002, "epoch": 0.3880042033788699, "step": 2400}, {"loss": 0.8074, "grad_norm": 0.49212366342544556, "learning_rate": 0.0002, "epoch": 0.3896208875596152, "step": 2410}, {"loss": 0.7991, "grad_norm": 0.5222218632698059, "learning_rate": 0.0002, "epoch": 0.3912375717403605, "step": 2420}, {"loss": 0.8182, "grad_norm": 0.4699819087982178, "learning_rate": 0.0002, "epoch": 0.3928542559211058, "step": 2430}, {"loss": 0.7919, "grad_norm": 0.46153587102890015, "learning_rate": 0.0002, "epoch": 0.39447094010185113, "step": 2440}, {"loss": 0.8111, "grad_norm": 0.4150611162185669, "learning_rate": 0.0002, "epoch": 0.3960876242825964, "step": 2450}, {"loss": 0.8589, "grad_norm": 0.5799614787101746, "learning_rate": 0.0002, "epoch": 0.3977043084633417, "step": 2460}, {"loss": 0.8085, "grad_norm": 0.56536865234375, "learning_rate": 0.0002, "epoch": 0.399320992644087, "step": 2470}, {"loss": 0.8022, "grad_norm": 0.5451247096061707, "learning_rate": 0.0002, "epoch": 0.4009376768248323, "step": 2480}, {"loss": 0.8217, "grad_norm": 0.5914521217346191, "learning_rate": 0.0002, "epoch": 0.40255436100557757, "step": 2490}, {"loss": 0.7859, "grad_norm": 0.4428117275238037, "learning_rate": 0.0002, "epoch": 0.40417104518632285, "step": 2500}, {"loss": 0.8054, "grad_norm": 0.48580947518348694, "learning_rate": 0.0002, "epoch": 0.40578772936706814, "step": 2510}, {"loss": 0.8405, "grad_norm": 0.436734676361084, "learning_rate": 0.0002, "epoch": 0.4074044135478134, "step": 2520}, {"loss": 0.8209, "grad_norm": 0.5752223134040833, "learning_rate": 0.0002, "epoch": 0.4090210977285587, "step": 2530}, {"loss": 0.8181, "grad_norm": 0.4271308183670044, "learning_rate": 0.0002, "epoch": 0.410637781909304, "step": 2540}, {"loss": 0.8058, "grad_norm": 0.46294718980789185, "learning_rate": 0.0002, "epoch": 0.4122544660900493, "step": 2550}, {"loss": 0.8473, "grad_norm": 0.49407583475112915, "learning_rate": 0.0002, "epoch": 0.4138711502707946, "step": 2560}, {"loss": 0.7881, "grad_norm": 0.4729035496711731, "learning_rate": 0.0002, "epoch": 0.4154878344515399, "step": 2570}, {"loss": 0.7834, "grad_norm": 0.4129747152328491, "learning_rate": 0.0002, "epoch": 0.4171045186322852, "step": 2580}, {"loss": 0.7859, "grad_norm": 0.5684236288070679, "learning_rate": 0.0002, "epoch": 0.4187212028130305, "step": 2590}, {"loss": 0.811, "grad_norm": 0.4862157106399536, "learning_rate": 0.0002, "epoch": 0.4203378869937758, "step": 2600}, {"loss": 0.7582, "grad_norm": 0.46567976474761963, "learning_rate": 0.0002, "epoch": 0.42195457117452106, "step": 2610}, {"loss": 0.7755, "grad_norm": 0.5710650682449341, "learning_rate": 0.0002, "epoch": 0.42357125535526635, "step": 2620}, {"loss": 0.8573, "grad_norm": 0.5660041570663452, "learning_rate": 0.0002, "epoch": 0.42518793953601164, "step": 2630}, {"loss": 0.7812, "grad_norm": 0.47944375872612, "learning_rate": 0.0002, "epoch": 0.4268046237167569, "step": 2640}, {"loss": 0.7459, "grad_norm": 0.537223756313324, "learning_rate": 0.0002, "epoch": 0.4284213078975022, "step": 2650}, {"loss": 0.8246, "grad_norm": 0.41669997572898865, "learning_rate": 0.0002, "epoch": 0.4300379920782475, "step": 2660}, {"loss": 0.7785, "grad_norm": 0.44727686047554016, "learning_rate": 0.0002, "epoch": 0.4316546762589928, "step": 2670}, {"loss": 0.8241, "grad_norm": 0.5600888729095459, "learning_rate": 0.0002, "epoch": 0.4332713604397381, "step": 2680}, {"loss": 0.7708, "grad_norm": 0.39820605516433716, "learning_rate": 0.0002, "epoch": 0.4348880446204834, "step": 2690}, {"loss": 0.8202, "grad_norm": 0.5637655854225159, "learning_rate": 0.0002, "epoch": 0.4365047288012287, "step": 2700}, {"loss": 0.855, "grad_norm": 0.6363666653633118, "learning_rate": 0.0002, "epoch": 0.438121412981974, "step": 2710}, {"loss": 0.8468, "grad_norm": 0.5656129121780396, "learning_rate": 0.0002, "epoch": 0.4397380971627193, "step": 2720}, {"loss": 0.7845, "grad_norm": 0.5600156188011169, "learning_rate": 0.0002, "epoch": 0.44135478134346456, "step": 2730}, {"loss": 0.8405, "grad_norm": 0.5506579875946045, "learning_rate": 0.0002, "epoch": 0.44297146552420985, "step": 2740}, {"loss": 0.7725, "grad_norm": 0.49878305196762085, "learning_rate": 0.0002, "epoch": 0.44458814970495514, "step": 2750}, {"loss": 0.8292, "grad_norm": 0.4569213092327118, "learning_rate": 0.0002, "epoch": 0.4462048338857004, "step": 2760}, {"loss": 0.8028, "grad_norm": 0.6056680083274841, "learning_rate": 0.0002, "epoch": 0.4478215180664457, "step": 2770}, {"loss": 0.8242, "grad_norm": 0.44474557042121887, "learning_rate": 0.0002, "epoch": 0.449438202247191, "step": 2780}, {"loss": 0.801, "grad_norm": 0.46055394411087036, "learning_rate": 0.0002, "epoch": 0.4510548864279363, "step": 2790}, {"loss": 0.7521, "grad_norm": 0.4904133379459381, "learning_rate": 0.0002, "epoch": 0.4526715706086816, "step": 2800}, {"loss": 0.8829, "grad_norm": 0.5647031664848328, "learning_rate": 0.0002, "epoch": 0.45428825478942686, "step": 2810}, {"loss": 0.8622, "grad_norm": 0.5759473443031311, "learning_rate": 0.0002, "epoch": 0.4559049389701722, "step": 2820}, {"loss": 0.7812, "grad_norm": 0.5161895751953125, "learning_rate": 0.0002, "epoch": 0.4575216231509175, "step": 2830}, {"loss": 0.8045, "grad_norm": 0.4248254597187042, "learning_rate": 0.0002, "epoch": 0.4591383073316628, "step": 2840}, {"loss": 0.7838, "grad_norm": 0.45395001769065857, "learning_rate": 0.0002, "epoch": 0.46075499151240806, "step": 2850}, {"loss": 0.8208, "grad_norm": 0.5358697772026062, "learning_rate": 0.0002, "epoch": 0.46237167569315335, "step": 2860}, {"loss": 0.8147, "grad_norm": 0.5379165410995483, "learning_rate": 0.0002, "epoch": 0.46398835987389864, "step": 2870}, {"loss": 0.7403, "grad_norm": 0.4601989686489105, "learning_rate": 0.0002, "epoch": 0.4656050440546439, "step": 2880}, {"loss": 0.8523, "grad_norm": 0.671115517616272, "learning_rate": 0.0002, "epoch": 0.4672217282353892, "step": 2890}, {"loss": 0.8262, "grad_norm": 0.4425133168697357, "learning_rate": 0.0002, "epoch": 0.4688384124161345, "step": 2900}, {"loss": 0.8178, "grad_norm": 0.5446155071258545, "learning_rate": 0.0002, "epoch": 0.4704550965968798, "step": 2910}, {"loss": 0.8106, "grad_norm": 0.603306233882904, "learning_rate": 0.0002, "epoch": 0.47207178077762507, "step": 2920}, {"loss": 0.8044, "grad_norm": 0.5377997159957886, "learning_rate": 0.0002, "epoch": 0.47368846495837036, "step": 2930}, {"loss": 0.8075, "grad_norm": 0.4931027591228485, "learning_rate": 0.0002, "epoch": 0.4753051491391157, "step": 2940}, {"loss": 0.8004, "grad_norm": 0.4711960256099701, "learning_rate": 0.0002, "epoch": 0.476921833319861, "step": 2950}, {"loss": 0.8121, "grad_norm": 0.5020492672920227, "learning_rate": 0.0002, "epoch": 0.4785385175006063, "step": 2960}, {"loss": 0.8221, "grad_norm": 0.5428946614265442, "learning_rate": 0.0002, "epoch": 0.48015520168135156, "step": 2970}, {"loss": 0.7849, "grad_norm": 0.5294089317321777, "learning_rate": 0.0002, "epoch": 0.48177188586209685, "step": 2980}, {"loss": 0.8553, "grad_norm": 0.648289144039154, "learning_rate": 0.0002, "epoch": 0.48338857004284214, "step": 2990}, {"loss": 0.7874, "grad_norm": 0.47916680574417114, "learning_rate": 0.0002, "epoch": 0.4850052542235874, "step": 3000}, {"loss": 0.8087, "grad_norm": 0.43849772214889526, "learning_rate": 0.0002, "epoch": 0.4866219384043327, "step": 3010}, {"loss": 0.7662, "grad_norm": 0.47007861733436584, "learning_rate": 0.0002, "epoch": 0.488238622585078, "step": 3020}, {"loss": 0.757, "grad_norm": 0.6314331293106079, "learning_rate": 0.0002, "epoch": 0.4898553067658233, "step": 3030}, {"loss": 0.7863, "grad_norm": 0.49211493134498596, "learning_rate": 0.0002, "epoch": 0.49147199094656857, "step": 3040}, {"loss": 0.8335, "grad_norm": 0.4537973403930664, "learning_rate": 0.0002, "epoch": 0.49308867512731386, "step": 3050}, {"loss": 0.8095, "grad_norm": 0.47326919436454773, "learning_rate": 0.0002, "epoch": 0.49470535930805914, "step": 3060}, {"loss": 0.8447, "grad_norm": 0.525874137878418, "learning_rate": 0.0002, "epoch": 0.4963220434888045, "step": 3070}, {"loss": 0.8339, "grad_norm": 0.6361091732978821, "learning_rate": 0.0002, "epoch": 0.4979387276695498, "step": 3080}, {"loss": 0.821, "grad_norm": 0.5850642919540405, "learning_rate": 0.0002, "epoch": 0.49955541185029506, "step": 3090}, {"loss": 0.8279, "grad_norm": 0.47299543023109436, "learning_rate": 0.0002, "epoch": 0.5011720960310403, "step": 3100}, {"loss": 0.8681, "grad_norm": 0.473099946975708, "learning_rate": 0.0002, "epoch": 0.5027887802117856, "step": 3110}, {"loss": 0.8223, "grad_norm": 0.48186397552490234, "learning_rate": 0.0002, "epoch": 0.5044054643925309, "step": 3120}, {"loss": 0.8292, "grad_norm": 0.5015401840209961, "learning_rate": 0.0002, "epoch": 0.5060221485732762, "step": 3130}, {"loss": 0.7692, "grad_norm": 0.5617750287055969, "learning_rate": 0.0002, "epoch": 0.5076388327540216, "step": 3140}, {"loss": 0.8708, "grad_norm": 0.5169327259063721, "learning_rate": 0.0002, "epoch": 0.5092555169347668, "step": 3150}, {"loss": 0.7845, "grad_norm": 0.545657753944397, "learning_rate": 0.0002, "epoch": 0.5108722011155121, "step": 3160}, {"loss": 0.799, "grad_norm": 0.512864351272583, "learning_rate": 0.0002, "epoch": 0.5124888852962574, "step": 3170}, {"loss": 0.7794, "grad_norm": 0.4113546311855316, "learning_rate": 0.0002, "epoch": 0.5141055694770027, "step": 3180}, {"loss": 0.8206, "grad_norm": 0.44532445073127747, "learning_rate": 0.0002, "epoch": 0.5157222536577479, "step": 3190}, {"loss": 0.8213, "grad_norm": 0.5623497366905212, "learning_rate": 0.0002, "epoch": 0.5173389378384933, "step": 3200}, {"loss": 0.7928, "grad_norm": 0.5084741115570068, "learning_rate": 0.0002, "epoch": 0.5189556220192385, "step": 3210}, {"loss": 0.8174, "grad_norm": 0.5305403470993042, "learning_rate": 0.0002, "epoch": 0.5205723061999838, "step": 3220}, {"loss": 0.8139, "grad_norm": 0.4708254337310791, "learning_rate": 0.0002, "epoch": 0.5221889903807291, "step": 3230}, {"loss": 0.7639, "grad_norm": 0.43827131390571594, "learning_rate": 0.0002, "epoch": 0.5238056745614744, "step": 3240}, {"loss": 0.7993, "grad_norm": 0.5630002617835999, "learning_rate": 0.0002, "epoch": 0.5254223587422197, "step": 3250}, {"loss": 0.7522, "grad_norm": 0.5010961890220642, "learning_rate": 0.0002, "epoch": 0.527039042922965, "step": 3260}, {"loss": 0.8374, "grad_norm": 0.6303122043609619, "learning_rate": 0.0002, "epoch": 0.5286557271037103, "step": 3270}, {"loss": 0.7727, "grad_norm": 0.5107331275939941, "learning_rate": 0.0002, "epoch": 0.5302724112844556, "step": 3280}, {"loss": 0.8495, "grad_norm": 0.5700443387031555, "learning_rate": 0.0002, "epoch": 0.5318890954652009, "step": 3290}, {"loss": 0.7776, "grad_norm": 0.46296367049217224, "learning_rate": 0.0002, "epoch": 0.5335057796459461, "step": 3300}, {"loss": 0.7931, "grad_norm": 0.531568706035614, "learning_rate": 0.0002, "epoch": 0.5351224638266915, "step": 3310}, {"loss": 0.843, "grad_norm": 0.4686741530895233, "learning_rate": 0.0002, "epoch": 0.5367391480074367, "step": 3320}, {"loss": 0.8104, "grad_norm": 0.5404331088066101, "learning_rate": 0.0002, "epoch": 0.5383558321881821, "step": 3330}, {"loss": 0.7686, "grad_norm": 0.6368790864944458, "learning_rate": 0.0002, "epoch": 0.5399725163689273, "step": 3340}, {"loss": 0.8514, "grad_norm": 0.42300888895988464, "learning_rate": 0.0002, "epoch": 0.5415892005496726, "step": 3350}, {"loss": 0.8236, "grad_norm": 0.5362542867660522, "learning_rate": 0.0002, "epoch": 0.5432058847304179, "step": 3360}, {"loss": 0.858, "grad_norm": 0.497128963470459, "learning_rate": 0.0002, "epoch": 0.5448225689111632, "step": 3370}, {"loss": 0.8519, "grad_norm": 0.5006386041641235, "learning_rate": 0.0002, "epoch": 0.5464392530919085, "step": 3380}, {"loss": 0.7867, "grad_norm": 0.44136837124824524, "learning_rate": 0.0002, "epoch": 0.5480559372726538, "step": 3390}, {"loss": 0.773, "grad_norm": 0.5897833108901978, "learning_rate": 0.0002, "epoch": 0.5496726214533991, "step": 3400}, {"loss": 0.8895, "grad_norm": 0.641075611114502, "learning_rate": 0.0002, "epoch": 0.5512893056341444, "step": 3410}, {"loss": 0.7827, "grad_norm": 0.7251322269439697, "learning_rate": 0.0002, "epoch": 0.5529059898148897, "step": 3420}, {"loss": 0.7626, "grad_norm": 0.47411349415779114, "learning_rate": 0.0002, "epoch": 0.5545226739956349, "step": 3430}, {"loss": 0.8196, "grad_norm": 0.4994310438632965, "learning_rate": 0.0002, "epoch": 0.5561393581763803, "step": 3440}, {"loss": 0.7812, "grad_norm": 0.5814438462257385, "learning_rate": 0.0002, "epoch": 0.5577560423571255, "step": 3450}, {"loss": 0.8805, "grad_norm": 0.6278898119926453, "learning_rate": 0.0002, "epoch": 0.5593727265378708, "step": 3460}, {"loss": 0.813, "grad_norm": 0.46208274364471436, "learning_rate": 0.0002, "epoch": 0.5609894107186161, "step": 3470}, {"loss": 0.8295, "grad_norm": 0.5718930959701538, "learning_rate": 0.0002, "epoch": 0.5626060948993614, "step": 3480}, {"loss": 0.8152, "grad_norm": 0.48178744316101074, "learning_rate": 0.0002, "epoch": 0.5642227790801067, "step": 3490}, {"loss": 0.8244, "grad_norm": 0.47336965799331665, "learning_rate": 0.0002, "epoch": 0.565839463260852, "step": 3500}, {"loss": 0.8099, "grad_norm": 0.43442684412002563, "learning_rate": 0.0002, "epoch": 0.5674561474415973, "step": 3510}, {"loss": 0.7564, "grad_norm": 0.6463358998298645, "learning_rate": 0.0002, "epoch": 0.5690728316223426, "step": 3520}, {"loss": 0.836, "grad_norm": 0.5286486744880676, "learning_rate": 0.0002, "epoch": 0.5706895158030879, "step": 3530}, {"loss": 0.8421, "grad_norm": 0.5405499935150146, "learning_rate": 0.0002, "epoch": 0.5723061999838331, "step": 3540}, {"loss": 0.7614, "grad_norm": 0.6654391884803772, "learning_rate": 0.0002, "epoch": 0.5739228841645785, "step": 3550}, {"loss": 0.7803, "grad_norm": 0.5081980228424072, "learning_rate": 0.0002, "epoch": 0.5755395683453237, "step": 3560}, {"loss": 0.7753, "grad_norm": 0.48978179693222046, "learning_rate": 0.0002, "epoch": 0.5771562525260691, "step": 3570}, {"loss": 0.8151, "grad_norm": 0.5840612053871155, "learning_rate": 0.0002, "epoch": 0.5787729367068143, "step": 3580}, {"loss": 0.8937, "grad_norm": 0.5235261917114258, "learning_rate": 0.0002, "epoch": 0.5803896208875596, "step": 3590}, {"loss": 0.7894, "grad_norm": 0.5672075748443604, "learning_rate": 0.0002, "epoch": 0.5820063050683049, "step": 3600}, {"loss": 0.8347, "grad_norm": 0.5613429546356201, "learning_rate": 0.0002, "epoch": 0.5836229892490502, "step": 3610}, {"loss": 0.8274, "grad_norm": 0.4032273590564728, "learning_rate": 0.0002, "epoch": 0.5852396734297954, "step": 3620}, {"loss": 0.8421, "grad_norm": 0.49559324979782104, "learning_rate": 0.0002, "epoch": 0.5868563576105408, "step": 3630}, {"loss": 0.8332, "grad_norm": 0.6895697712898254, "learning_rate": 0.0002, "epoch": 0.5884730417912861, "step": 3640}, {"loss": 0.7877, "grad_norm": 0.4750136435031891, "learning_rate": 0.0002, "epoch": 0.5900897259720314, "step": 3650}, {"loss": 0.8219, "grad_norm": 0.5176819562911987, "learning_rate": 0.0002, "epoch": 0.5917064101527767, "step": 3660}, {"loss": 0.8151, "grad_norm": 0.5817760229110718, "learning_rate": 0.0002, "epoch": 0.5933230943335219, "step": 3670}, {"loss": 0.7823, "grad_norm": 0.6064626574516296, "learning_rate": 0.0002, "epoch": 0.5949397785142673, "step": 3680}, {"loss": 0.8422, "grad_norm": 0.6728700995445251, "learning_rate": 0.0002, "epoch": 0.5965564626950125, "step": 3690}, {"loss": 0.7679, "grad_norm": 0.609305202960968, "learning_rate": 0.0002, "epoch": 0.5981731468757578, "step": 3700}, {"loss": 0.8048, "grad_norm": 0.4615488350391388, "learning_rate": 0.0002, "epoch": 0.5997898310565031, "step": 3710}, {"loss": 0.8214, "grad_norm": 2.0531179904937744, "learning_rate": 0.0002, "epoch": 0.6014065152372484, "step": 3720}, {"loss": 0.8158, "grad_norm": 0.5091132521629333, "learning_rate": 0.0002, "epoch": 0.6030231994179936, "step": 3730}, {"loss": 0.7833, "grad_norm": 0.5951124429702759, "learning_rate": 0.0002, "epoch": 0.604639883598739, "step": 3740}, {"loss": 0.7784, "grad_norm": 0.5870208144187927, "learning_rate": 0.0002, "epoch": 0.6062565677794842, "step": 3750}, {"loss": 0.8044, "grad_norm": 0.6254619359970093, "learning_rate": 0.0002, "epoch": 0.6078732519602296, "step": 3760}, {"loss": 0.7868, "grad_norm": 0.5577626824378967, "learning_rate": 0.0002, "epoch": 0.6094899361409749, "step": 3770}, {"loss": 0.8108, "grad_norm": 0.5004405379295349, "learning_rate": 0.0002, "epoch": 0.6111066203217201, "step": 3780}, {"loss": 0.8092, "grad_norm": 0.5527383685112, "learning_rate": 0.0002, "epoch": 0.6127233045024655, "step": 3790}, {"loss": 0.8036, "grad_norm": 0.49116113781929016, "learning_rate": 0.0002, "epoch": 0.6143399886832107, "step": 3800}, {"loss": 0.8352, "grad_norm": 0.5299299359321594, "learning_rate": 0.0002, "epoch": 0.6159566728639561, "step": 3810}, {"loss": 0.7737, "grad_norm": 0.464897483587265, "learning_rate": 0.0002, "epoch": 0.6175733570447013, "step": 3820}, {"loss": 0.7923, "grad_norm": 0.6505740880966187, "learning_rate": 0.0002, "epoch": 0.6191900412254466, "step": 3830}, {"loss": 0.8123, "grad_norm": 0.5512559413909912, "learning_rate": 0.0002, "epoch": 0.6208067254061919, "step": 3840}, {"loss": 0.8856, "grad_norm": 0.49427518248558044, "learning_rate": 0.0002, "epoch": 0.6224234095869372, "step": 3850}, {"loss": 0.7751, "grad_norm": 0.3839147090911865, "learning_rate": 0.0002, "epoch": 0.6240400937676824, "step": 3860}, {"loss": 0.8006, "grad_norm": 0.5760218501091003, "learning_rate": 0.0002, "epoch": 0.6256567779484278, "step": 3870}, {"loss": 0.7836, "grad_norm": 0.7226507067680359, "learning_rate": 0.0002, "epoch": 0.6272734621291731, "step": 3880}, {"loss": 0.8244, "grad_norm": 0.676781415939331, "learning_rate": 0.0002, "epoch": 0.6288901463099184, "step": 3890}, {"loss": 0.8239, "grad_norm": 0.4284018278121948, "learning_rate": 0.0002, "epoch": 0.6305068304906637, "step": 3900}, {"loss": 0.7996, "grad_norm": 0.5060628056526184, "learning_rate": 0.0002, "epoch": 0.6321235146714089, "step": 3910}, {"loss": 0.8089, "grad_norm": 0.5524522066116333, "learning_rate": 0.0002, "epoch": 0.6337401988521543, "step": 3920}, {"loss": 0.8276, "grad_norm": 0.6099881529808044, "learning_rate": 0.0002, "epoch": 0.6353568830328995, "step": 3930}, {"loss": 0.809, "grad_norm": 0.43155938386917114, "learning_rate": 0.0002, "epoch": 0.6369735672136448, "step": 3940}, {"loss": 0.8404, "grad_norm": 0.6427084803581238, "learning_rate": 0.0002, "epoch": 0.6385902513943901, "step": 3950}, {"loss": 0.8368, "grad_norm": 0.541220486164093, "learning_rate": 0.0002, "epoch": 0.6402069355751354, "step": 3960}, {"loss": 0.8539, "grad_norm": 0.5414294600486755, "learning_rate": 0.0002, "epoch": 0.6418236197558806, "step": 3970}, {"loss": 0.7996, "grad_norm": 0.46344003081321716, "learning_rate": 0.0002, "epoch": 0.643440303936626, "step": 3980}, {"loss": 0.7474, "grad_norm": 0.45209285616874695, "learning_rate": 0.0002, "epoch": 0.6450569881173712, "step": 3990}, {"loss": 0.8202, "grad_norm": 0.5417284369468689, "learning_rate": 0.0002, "epoch": 0.6466736722981166, "step": 4000}, {"loss": 0.7563, "grad_norm": 0.7995685935020447, "learning_rate": 0.0002, "epoch": 0.6482903564788619, "step": 4010}, {"loss": 0.7812, "grad_norm": 0.6384002566337585, "learning_rate": 0.0002, "epoch": 0.6499070406596071, "step": 4020}, {"loss": 0.732, "grad_norm": 0.4472815692424774, "learning_rate": 0.0002, "epoch": 0.6515237248403525, "step": 4030}, {"loss": 0.8071, "grad_norm": 0.6834294199943542, "learning_rate": 0.0002, "epoch": 0.6531404090210977, "step": 4040}, {"loss": 0.7812, "grad_norm": 0.4612339735031128, "learning_rate": 0.0002, "epoch": 0.654757093201843, "step": 4050}, {"loss": 0.8141, "grad_norm": 0.9266576170921326, "learning_rate": 0.0002, "epoch": 0.6563737773825883, "step": 4060}, {"loss": 0.7991, "grad_norm": 0.4470861852169037, "learning_rate": 0.0002, "epoch": 0.6579904615633336, "step": 4070}, {"loss": 0.8293, "grad_norm": 0.45544925332069397, "learning_rate": 0.0002, "epoch": 0.6596071457440789, "step": 4080}, {"loss": 0.8455, "grad_norm": 0.6144481301307678, "learning_rate": 0.0002, "epoch": 0.6612238299248242, "step": 4090}, {"loss": 0.7877, "grad_norm": 0.5936288237571716, "learning_rate": 0.0002, "epoch": 0.6628405141055694, "step": 4100}, {"loss": 0.7617, "grad_norm": 0.4822963774204254, "learning_rate": 0.0002, "epoch": 0.6644571982863148, "step": 4110}, {"loss": 0.7997, "grad_norm": 0.48432496190071106, "learning_rate": 0.0002, "epoch": 0.66607388246706, "step": 4120}, {"loss": 0.8404, "grad_norm": 0.4901607930660248, "learning_rate": 0.0002, "epoch": 0.6676905666478054, "step": 4130}, {"loss": 0.8085, "grad_norm": 0.5018393397331238, "learning_rate": 0.0002, "epoch": 0.6693072508285507, "step": 4140}, {"loss": 0.8065, "grad_norm": 0.6946378946304321, "learning_rate": 0.0002, "epoch": 0.6709239350092959, "step": 4150}, {"loss": 0.8147, "grad_norm": 0.5997390747070312, "learning_rate": 0.0002, "epoch": 0.6725406191900413, "step": 4160}, {"loss": 0.8268, "grad_norm": 0.6738849878311157, "learning_rate": 0.0002, "epoch": 0.6741573033707865, "step": 4170}, {"loss": 0.7704, "grad_norm": 0.6110581159591675, "learning_rate": 0.0002, "epoch": 0.6757739875515318, "step": 4180}, {"loss": 0.8043, "grad_norm": 0.5703322291374207, "learning_rate": 0.0002, "epoch": 0.6773906717322771, "step": 4190}, {"loss": 0.8099, "grad_norm": 0.4686066210269928, "learning_rate": 0.0002, "epoch": 0.6790073559130224, "step": 4200}, {"loss": 0.8441, "grad_norm": 0.6394643783569336, "learning_rate": 0.0002, "epoch": 0.6806240400937676, "step": 4210}, {"loss": 0.8011, "grad_norm": 0.5454841256141663, "learning_rate": 0.0002, "epoch": 0.682240724274513, "step": 4220}, {"loss": 0.8307, "grad_norm": 0.4859732985496521, "learning_rate": 0.0002, "epoch": 0.6838574084552582, "step": 4230}, {"loss": 0.8161, "grad_norm": 0.5544065833091736, "learning_rate": 0.0002, "epoch": 0.6854740926360036, "step": 4240}, {"loss": 0.7839, "grad_norm": 0.4902505576610565, "learning_rate": 0.0002, "epoch": 0.6870907768167488, "step": 4250}, {"loss": 0.7977, "grad_norm": 0.4768051505088806, "learning_rate": 0.0002, "epoch": 0.6887074609974941, "step": 4260}, {"loss": 0.7539, "grad_norm": 0.49982190132141113, "learning_rate": 0.0002, "epoch": 0.6903241451782395, "step": 4270}, {"loss": 0.7353, "grad_norm": 0.6351838111877441, "learning_rate": 0.0002, "epoch": 0.6919408293589847, "step": 4280}, {"loss": 0.7664, "grad_norm": 0.5647561550140381, "learning_rate": 0.0002, "epoch": 0.69355751353973, "step": 4290}, {"loss": 0.7618, "grad_norm": 0.5340486764907837, "learning_rate": 0.0002, "epoch": 0.6951741977204753, "step": 4300}, {"loss": 0.8526, "grad_norm": 0.5649092793464661, "learning_rate": 0.0002, "epoch": 0.6967908819012206, "step": 4310}, {"loss": 0.8246, "grad_norm": 0.6183916926383972, "learning_rate": 0.0002, "epoch": 0.6984075660819659, "step": 4320}, {"loss": 0.792, "grad_norm": 0.6154509782791138, "learning_rate": 0.0002, "epoch": 0.7000242502627112, "step": 4330}, {"loss": 0.8397, "grad_norm": 0.5156264305114746, "learning_rate": 0.0002, "epoch": 0.7016409344434564, "step": 4340}, {"loss": 0.8512, "grad_norm": 0.562171459197998, "learning_rate": 0.0002, "epoch": 0.7032576186242018, "step": 4350}, {"loss": 0.7882, "grad_norm": 0.4949502646923065, "learning_rate": 0.0002, "epoch": 0.704874302804947, "step": 4360}, {"loss": 0.738, "grad_norm": 0.5171684622764587, "learning_rate": 0.0002, "epoch": 0.7064909869856923, "step": 4370}, {"loss": 0.8001, "grad_norm": 0.6198443174362183, "learning_rate": 0.0002, "epoch": 0.7081076711664377, "step": 4380}, {"loss": 0.7606, "grad_norm": 0.5802276134490967, "learning_rate": 0.0002, "epoch": 0.7097243553471829, "step": 4390}, {"loss": 0.8797, "grad_norm": 0.41096967458724976, "learning_rate": 0.0002, "epoch": 0.7113410395279283, "step": 4400}, {"loss": 0.805, "grad_norm": 0.4397392272949219, "learning_rate": 0.0002, "epoch": 0.7129577237086735, "step": 4410}, {"loss": 0.7651, "grad_norm": 0.45228442549705505, "learning_rate": 0.0002, "epoch": 0.7145744078894188, "step": 4420}, {"loss": 0.7938, "grad_norm": 0.4839673936367035, "learning_rate": 0.0002, "epoch": 0.7161910920701641, "step": 4430}, {"loss": 0.8362, "grad_norm": 0.6140755414962769, "learning_rate": 0.0002, "epoch": 0.7178077762509094, "step": 4440}, {"loss": 0.7722, "grad_norm": 0.6841378808021545, "learning_rate": 0.0002, "epoch": 0.7194244604316546, "step": 4450}, {"loss": 0.8177, "grad_norm": 0.6664239168167114, "learning_rate": 0.0002, "epoch": 0.7210411446124, "step": 4460}, {"loss": 0.7983, "grad_norm": 0.47552719712257385, "learning_rate": 0.0002, "epoch": 0.7226578287931452, "step": 4470}, {"loss": 0.8982, "grad_norm": 0.6649776101112366, "learning_rate": 0.0002, "epoch": 0.7242745129738906, "step": 4480}, {"loss": 0.8074, "grad_norm": 0.5159541964530945, "learning_rate": 0.0002, "epoch": 0.7258911971546358, "step": 4490}, {"loss": 0.7786, "grad_norm": 0.6693112850189209, "learning_rate": 0.0002, "epoch": 0.7275078813353811, "step": 4500}, {"loss": 0.8655, "grad_norm": 0.48870977759361267, "learning_rate": 0.0002, "epoch": 0.7291245655161265, "step": 4510}, {"loss": 0.7337, "grad_norm": 0.4857887923717499, "learning_rate": 0.0002, "epoch": 0.7307412496968717, "step": 4520}, {"loss": 0.8026, "grad_norm": 0.5515662431716919, "learning_rate": 0.0002, "epoch": 0.732357933877617, "step": 4530}, {"loss": 0.8031, "grad_norm": 0.6292222738265991, "learning_rate": 0.0002, "epoch": 0.7339746180583623, "step": 4540}, {"loss": 0.7749, "grad_norm": 0.48265689611434937, "learning_rate": 0.0002, "epoch": 0.7355913022391076, "step": 4550}, {"loss": 0.8499, "grad_norm": 0.8044266104698181, "learning_rate": 0.0002, "epoch": 0.7372079864198529, "step": 4560}, {"loss": 0.8162, "grad_norm": 0.6111769676208496, "learning_rate": 0.0002, "epoch": 0.7388246706005982, "step": 4570}, {"loss": 0.7291, "grad_norm": 0.5229553580284119, "learning_rate": 0.0002, "epoch": 0.7404413547813434, "step": 4580}, {"loss": 0.8038, "grad_norm": 0.6054152250289917, "learning_rate": 0.0002, "epoch": 0.7420580389620888, "step": 4590}, {"loss": 0.8169, "grad_norm": 0.5574966669082642, "learning_rate": 0.0002, "epoch": 0.743674723142834, "step": 4600}, {"loss": 0.8439, "grad_norm": 0.5395817160606384, "learning_rate": 0.0002, "epoch": 0.7452914073235793, "step": 4610}, {"loss": 0.8495, "grad_norm": 0.7116472721099854, "learning_rate": 0.0002, "epoch": 0.7469080915043246, "step": 4620}, {"loss": 0.7743, "grad_norm": 0.5618700981140137, "learning_rate": 0.0002, "epoch": 0.7485247756850699, "step": 4630}, {"loss": 0.7744, "grad_norm": 0.5802770853042603, "learning_rate": 0.0002, "epoch": 0.7501414598658153, "step": 4640}, {"loss": 0.7924, "grad_norm": 0.5690428018569946, "learning_rate": 0.0002, "epoch": 0.7517581440465605, "step": 4650}, {"loss": 0.8017, "grad_norm": 0.4813360273838043, "learning_rate": 0.0002, "epoch": 0.7533748282273058, "step": 4660}, {"loss": 0.8108, "grad_norm": 0.5434042811393738, "learning_rate": 0.0002, "epoch": 0.7549915124080511, "step": 4670}, {"loss": 0.7824, "grad_norm": 0.5502099990844727, "learning_rate": 0.0002, "epoch": 0.7566081965887964, "step": 4680}, {"loss": 0.8598, "grad_norm": 0.6020621061325073, "learning_rate": 0.0002, "epoch": 0.7582248807695416, "step": 4690}, {"loss": 0.7937, "grad_norm": 0.4922301471233368, "learning_rate": 0.0002, "epoch": 0.759841564950287, "step": 4700}, {"loss": 0.788, "grad_norm": 0.6492828726768494, "learning_rate": 0.0002, "epoch": 0.7614582491310322, "step": 4710}, {"loss": 0.8313, "grad_norm": 0.4865580201148987, "learning_rate": 0.0002, "epoch": 0.7630749333117776, "step": 4720}, {"loss": 0.7966, "grad_norm": 0.5971422791481018, "learning_rate": 0.0002, "epoch": 0.7646916174925228, "step": 4730}, {"loss": 0.8298, "grad_norm": 0.6832674145698547, "learning_rate": 0.0002, "epoch": 0.7663083016732681, "step": 4740}, {"loss": 0.8156, "grad_norm": 0.500908613204956, "learning_rate": 0.0002, "epoch": 0.7679249858540134, "step": 4750}, {"loss": 0.8383, "grad_norm": 0.6112465858459473, "learning_rate": 0.0002, "epoch": 0.7695416700347587, "step": 4760}, {"loss": 0.76, "grad_norm": 0.5753506422042847, "learning_rate": 0.0002, "epoch": 0.771158354215504, "step": 4770}, {"loss": 0.8297, "grad_norm": 0.6529405117034912, "learning_rate": 0.0002, "epoch": 0.7727750383962493, "step": 4780}, {"loss": 0.8171, "grad_norm": 0.5916843414306641, "learning_rate": 0.0002, "epoch": 0.7743917225769946, "step": 4790}, {"loss": 0.83, "grad_norm": 0.4821224510669708, "learning_rate": 0.0002, "epoch": 0.7760084067577399, "step": 4800}, {"loss": 0.7703, "grad_norm": 0.5532580018043518, "learning_rate": 0.0002, "epoch": 0.7776250909384852, "step": 4810}, {"loss": 0.7363, "grad_norm": 0.4604877233505249, "learning_rate": 0.0002, "epoch": 0.7792417751192304, "step": 4820}, {"loss": 0.7506, "grad_norm": 0.5009613037109375, "learning_rate": 0.0002, "epoch": 0.7808584592999758, "step": 4830}, {"loss": 0.7863, "grad_norm": 0.6448560357093811, "learning_rate": 0.0002, "epoch": 0.782475143480721, "step": 4840}, {"loss": 0.7957, "grad_norm": 0.44327953457832336, "learning_rate": 0.0002, "epoch": 0.7840918276614663, "step": 4850}, {"loss": 0.7925, "grad_norm": 0.5355411171913147, "learning_rate": 0.0002, "epoch": 0.7857085118422116, "step": 4860}, {"loss": 0.7754, "grad_norm": 0.5635677576065063, "learning_rate": 0.0002, "epoch": 0.7873251960229569, "step": 4870}, {"loss": 0.7931, "grad_norm": 0.5417491793632507, "learning_rate": 0.0002, "epoch": 0.7889418802037023, "step": 4880}, {"loss": 0.7819, "grad_norm": 0.4567430913448334, "learning_rate": 0.0002, "epoch": 0.7905585643844475, "step": 4890}, {"loss": 0.8454, "grad_norm": 0.44651296734809875, "learning_rate": 0.0002, "epoch": 0.7921752485651928, "step": 4900}, {"loss": 0.7959, "grad_norm": 0.5741217136383057, "learning_rate": 0.0002, "epoch": 0.7937919327459381, "step": 4910}, {"loss": 0.8093, "grad_norm": 0.6605045199394226, "learning_rate": 0.0002, "epoch": 0.7954086169266834, "step": 4920}, {"loss": 0.77, "grad_norm": 0.5126531720161438, "learning_rate": 0.0002, "epoch": 0.7970253011074286, "step": 4930}, {"loss": 0.7793, "grad_norm": 0.513648271560669, "learning_rate": 0.0002, "epoch": 0.798641985288174, "step": 4940}, {"loss": 0.8314, "grad_norm": 0.5350404381752014, "learning_rate": 0.0002, "epoch": 0.8002586694689192, "step": 4950}, {"loss": 0.7649, "grad_norm": 0.5731674432754517, "learning_rate": 0.0002, "epoch": 0.8018753536496646, "step": 4960}, {"loss": 0.8572, "grad_norm": 0.5974258184432983, "learning_rate": 0.0002, "epoch": 0.8034920378304098, "step": 4970}, {"loss": 0.7972, "grad_norm": 0.8774799704551697, "learning_rate": 0.0002, "epoch": 0.8051087220111551, "step": 4980}, {"loss": 0.7899, "grad_norm": 0.5994430184364319, "learning_rate": 0.0002, "epoch": 0.8067254061919004, "step": 4990}, {"loss": 0.7736, "grad_norm": 0.4894903004169464, "learning_rate": 0.0002, "epoch": 0.8083420903726457, "step": 5000}, {"loss": 0.78, "grad_norm": 0.5218459367752075, "learning_rate": 0.0002, "epoch": 0.809958774553391, "step": 5010}, {"loss": 0.817, "grad_norm": 0.5232468843460083, "learning_rate": 0.0002, "epoch": 0.8115754587341363, "step": 5020}, {"loss": 0.7704, "grad_norm": 0.44358372688293457, "learning_rate": 0.0002, "epoch": 0.8131921429148816, "step": 5030}, {"loss": 0.785, "grad_norm": 0.6202037334442139, "learning_rate": 0.0002, "epoch": 0.8148088270956269, "step": 5040}, {"loss": 0.7351, "grad_norm": 0.7721474170684814, "learning_rate": 0.0002, "epoch": 0.8164255112763722, "step": 5050}, {"loss": 0.8297, "grad_norm": 0.5568501353263855, "learning_rate": 0.0002, "epoch": 0.8180421954571174, "step": 5060}, {"loss": 0.7733, "grad_norm": 0.49148809909820557, "learning_rate": 0.0002, "epoch": 0.8196588796378628, "step": 5070}, {"loss": 0.8054, "grad_norm": 0.4956012964248657, "learning_rate": 0.0002, "epoch": 0.821275563818608, "step": 5080}, {"loss": 0.8201, "grad_norm": 0.6078833937644958, "learning_rate": 0.0002, "epoch": 0.8228922479993533, "step": 5090}, {"loss": 0.828, "grad_norm": 0.46906954050064087, "learning_rate": 0.0002, "epoch": 0.8245089321800986, "step": 5100}, {"loss": 0.7703, "grad_norm": 0.50812166929245, "learning_rate": 0.0002, "epoch": 0.8261256163608439, "step": 5110}, {"loss": 0.8243, "grad_norm": 0.5319661498069763, "learning_rate": 0.0002, "epoch": 0.8277423005415891, "step": 5120}, {"loss": 0.7798, "grad_norm": 0.4949689209461212, "learning_rate": 0.0002, "epoch": 0.8293589847223345, "step": 5130}, {"loss": 0.7428, "grad_norm": 0.5151591300964355, "learning_rate": 0.0002, "epoch": 0.8309756689030798, "step": 5140}, {"loss": 0.8147, "grad_norm": 0.5530214309692383, "learning_rate": 0.0002, "epoch": 0.8325923530838251, "step": 5150}, {"loss": 0.8251, "grad_norm": 0.6297410130500793, "learning_rate": 0.0002, "epoch": 0.8342090372645704, "step": 5160}, {"loss": 0.8067, "grad_norm": 0.5466840267181396, "learning_rate": 0.0002, "epoch": 0.8358257214453156, "step": 5170}, {"loss": 0.7875, "grad_norm": 0.652913510799408, "learning_rate": 0.0002, "epoch": 0.837442405626061, "step": 5180}, {"loss": 0.8295, "grad_norm": 0.5811293125152588, "learning_rate": 0.0002, "epoch": 0.8390590898068062, "step": 5190}, {"loss": 0.7412, "grad_norm": 0.5109550952911377, "learning_rate": 0.0002, "epoch": 0.8406757739875516, "step": 5200}, {"loss": 0.8077, "grad_norm": 0.4551706612110138, "learning_rate": 0.0002, "epoch": 0.8422924581682968, "step": 5210}, {"loss": 0.7827, "grad_norm": 0.5813754200935364, "learning_rate": 0.0002, "epoch": 0.8439091423490421, "step": 5220}, {"loss": 0.802, "grad_norm": 0.5856947898864746, "learning_rate": 0.0002, "epoch": 0.8455258265297874, "step": 5230}, {"loss": 0.7957, "grad_norm": 0.5482739210128784, "learning_rate": 0.0002, "epoch": 0.8471425107105327, "step": 5240}, {"loss": 0.8295, "grad_norm": 0.49023720622062683, "learning_rate": 0.0002, "epoch": 0.8487591948912779, "step": 5250}, {"loss": 0.8022, "grad_norm": 0.49472475051879883, "learning_rate": 0.0002, "epoch": 0.8503758790720233, "step": 5260}, {"loss": 0.8001, "grad_norm": 0.5490226745605469, "learning_rate": 0.0002, "epoch": 0.8519925632527686, "step": 5270}, {"loss": 0.8333, "grad_norm": 0.5340665578842163, "learning_rate": 0.0002, "epoch": 0.8536092474335139, "step": 5280}, {"loss": 0.8277, "grad_norm": 0.5962483882904053, "learning_rate": 0.0002, "epoch": 0.8552259316142592, "step": 5290}, {"loss": 0.8765, "grad_norm": 0.586358368396759, "learning_rate": 0.0002, "epoch": 0.8568426157950044, "step": 5300}, {"loss": 0.7831, "grad_norm": 0.49120277166366577, "learning_rate": 0.0002, "epoch": 0.8584592999757498, "step": 5310}, {"loss": 0.8162, "grad_norm": 0.5887332558631897, "learning_rate": 0.0002, "epoch": 0.860075984156495, "step": 5320}, {"loss": 0.7464, "grad_norm": 0.42496153712272644, "learning_rate": 0.0002, "epoch": 0.8616926683372403, "step": 5330}, {"loss": 0.7905, "grad_norm": 0.5489874482154846, "learning_rate": 0.0002, "epoch": 0.8633093525179856, "step": 5340}, {"loss": 0.7958, "grad_norm": 0.5850813984870911, "learning_rate": 0.0002, "epoch": 0.8649260366987309, "step": 5350}, {"loss": 0.7642, "grad_norm": 0.517487108707428, "learning_rate": 0.0002, "epoch": 0.8665427208794761, "step": 5360}, {"loss": 0.7801, "grad_norm": 0.5339142680168152, "learning_rate": 0.0002, "epoch": 0.8681594050602215, "step": 5370}, {"loss": 0.818, "grad_norm": 0.6236387491226196, "learning_rate": 0.0002, "epoch": 0.8697760892409668, "step": 5380}, {"loss": 0.7708, "grad_norm": 0.5752192735671997, "learning_rate": 0.0002, "epoch": 0.8713927734217121, "step": 5390}, {"loss": 0.8542, "grad_norm": 0.6724614500999451, "learning_rate": 0.0002, "epoch": 0.8730094576024574, "step": 5400}, {"loss": 0.7581, "grad_norm": 0.5280613303184509, "learning_rate": 0.0002, "epoch": 0.8746261417832026, "step": 5410}, {"loss": 0.8231, "grad_norm": 0.44033288955688477, "learning_rate": 0.0002, "epoch": 0.876242825963948, "step": 5420}, {"loss": 0.8839, "grad_norm": 0.5199708342552185, "learning_rate": 0.0002, "epoch": 0.8778595101446932, "step": 5430}, {"loss": 0.7852, "grad_norm": 0.46778348088264465, "learning_rate": 0.0002, "epoch": 0.8794761943254386, "step": 5440}, {"loss": 0.7834, "grad_norm": 0.4657754898071289, "learning_rate": 0.0002, "epoch": 0.8810928785061838, "step": 5450}, {"loss": 0.7799, "grad_norm": 0.5472902655601501, "learning_rate": 0.0002, "epoch": 0.8827095626869291, "step": 5460}, {"loss": 0.8253, "grad_norm": 0.4876766800880432, "learning_rate": 0.0002, "epoch": 0.8843262468676744, "step": 5470}, {"loss": 0.7906, "grad_norm": 0.5057248473167419, "learning_rate": 0.0002, "epoch": 0.8859429310484197, "step": 5480}, {"loss": 0.8124, "grad_norm": 0.4637320637702942, "learning_rate": 0.0002, "epoch": 0.8875596152291649, "step": 5490}, {"loss": 0.781, "grad_norm": 0.471955806016922, "learning_rate": 0.0002, "epoch": 0.8891762994099103, "step": 5500}, {"loss": 0.8057, "grad_norm": 0.5209813714027405, "learning_rate": 0.0002, "epoch": 0.8907929835906556, "step": 5510}, {"loss": 0.8106, "grad_norm": 0.6213834285736084, "learning_rate": 0.0002, "epoch": 0.8924096677714008, "step": 5520}, {"loss": 0.7787, "grad_norm": 0.5215408205986023, "learning_rate": 0.0002, "epoch": 0.8940263519521462, "step": 5530}, {"loss": 0.8174, "grad_norm": 0.580478310585022, "learning_rate": 0.0002, "epoch": 0.8956430361328914, "step": 5540}, {"loss": 0.8371, "grad_norm": 0.49102169275283813, "learning_rate": 0.0002, "epoch": 0.8972597203136368, "step": 5550}, {"loss": 0.7806, "grad_norm": 0.6043479442596436, "learning_rate": 0.0002, "epoch": 0.898876404494382, "step": 5560}, {"loss": 0.7754, "grad_norm": 0.5636463165283203, "learning_rate": 0.0002, "epoch": 0.9004930886751273, "step": 5570}, {"loss": 0.8145, "grad_norm": 0.5620124340057373, "learning_rate": 0.0002, "epoch": 0.9021097728558726, "step": 5580}, {"loss": 0.8083, "grad_norm": 0.5206354856491089, "learning_rate": 0.0002, "epoch": 0.9037264570366179, "step": 5590}, {"loss": 0.8557, "grad_norm": 0.5798229575157166, "learning_rate": 0.0002, "epoch": 0.9053431412173631, "step": 5600}, {"loss": 0.8097, "grad_norm": 0.6428212523460388, "learning_rate": 0.0002, "epoch": 0.9069598253981085, "step": 5610}, {"loss": 0.7839, "grad_norm": 0.48064687848091125, "learning_rate": 0.0002, "epoch": 0.9085765095788537, "step": 5620}, {"loss": 0.8343, "grad_norm": 0.6347860097885132, "learning_rate": 0.0002, "epoch": 0.9101931937595991, "step": 5630}, {"loss": 0.851, "grad_norm": 0.5353913307189941, "learning_rate": 0.0002, "epoch": 0.9118098779403444, "step": 5640}, {"loss": 0.7736, "grad_norm": 0.5323944091796875, "learning_rate": 0.0002, "epoch": 0.9134265621210896, "step": 5650}, {"loss": 0.8393, "grad_norm": 0.5261843204498291, "learning_rate": 0.0002, "epoch": 0.915043246301835, "step": 5660}, {"loss": 0.7355, "grad_norm": 0.5451326966285706, "learning_rate": 0.0002, "epoch": 0.9166599304825802, "step": 5670}, {"loss": 0.8012, "grad_norm": 0.5183324217796326, "learning_rate": 0.0002, "epoch": 0.9182766146633256, "step": 5680}, {"loss": 0.7659, "grad_norm": 0.47229018807411194, "learning_rate": 0.0002, "epoch": 0.9198932988440708, "step": 5690}, {"loss": 0.7757, "grad_norm": 0.49180513620376587, "learning_rate": 0.0002, "epoch": 0.9215099830248161, "step": 5700}, {"loss": 0.8735, "grad_norm": 0.5419785380363464, "learning_rate": 0.0002, "epoch": 0.9231266672055614, "step": 5710}, {"loss": 0.7378, "grad_norm": 0.5408698916435242, "learning_rate": 0.0002, "epoch": 0.9247433513863067, "step": 5720}, {"loss": 0.7701, "grad_norm": 0.5286232829093933, "learning_rate": 0.0002, "epoch": 0.9263600355670519, "step": 5730}, {"loss": 0.8242, "grad_norm": 0.7539758086204529, "learning_rate": 0.0002, "epoch": 0.9279767197477973, "step": 5740}, {"loss": 0.8118, "grad_norm": 0.5166944861412048, "learning_rate": 0.0002, "epoch": 0.9295934039285425, "step": 5750}, {"loss": 0.783, "grad_norm": 0.6601425409317017, "learning_rate": 0.0002, "epoch": 0.9312100881092878, "step": 5760}, {"loss": 0.7873, "grad_norm": 0.5029960870742798, "learning_rate": 0.0002, "epoch": 0.9328267722900332, "step": 5770}, {"loss": 0.7989, "grad_norm": 0.4926645755767822, "learning_rate": 0.0002, "epoch": 0.9344434564707784, "step": 5780}, {"loss": 0.8174, "grad_norm": 0.5739615559577942, "learning_rate": 0.0002, "epoch": 0.9360601406515238, "step": 5790}, {"loss": 0.8037, "grad_norm": 0.5058279037475586, "learning_rate": 0.0002, "epoch": 0.937676824832269, "step": 5800}, {"loss": 0.8537, "grad_norm": 0.5260962247848511, "learning_rate": 0.0002, "epoch": 0.9392935090130143, "step": 5810}, {"loss": 0.7486, "grad_norm": 0.5768588185310364, "learning_rate": 0.0002, "epoch": 0.9409101931937596, "step": 5820}, {"loss": 0.8215, "grad_norm": 0.5170126557350159, "learning_rate": 0.0002, "epoch": 0.9425268773745049, "step": 5830}, {"loss": 0.7422, "grad_norm": 0.5745864510536194, "learning_rate": 0.0002, "epoch": 0.9441435615552501, "step": 5840}, {"loss": 0.7824, "grad_norm": 0.5551357865333557, "learning_rate": 0.0002, "epoch": 0.9457602457359955, "step": 5850}, {"loss": 0.8529, "grad_norm": 0.5776078701019287, "learning_rate": 0.0002, "epoch": 0.9473769299167407, "step": 5860}, {"loss": 0.8527, "grad_norm": 0.5340062379837036, "learning_rate": 0.0002, "epoch": 0.9489936140974861, "step": 5870}, {"loss": 0.8217, "grad_norm": 0.6447290182113647, "learning_rate": 0.0002, "epoch": 0.9506102982782314, "step": 5880}, {"loss": 0.7945, "grad_norm": 0.5123815536499023, "learning_rate": 0.0002, "epoch": 0.9522269824589766, "step": 5890}, {"loss": 0.8209, "grad_norm": 0.48547613620758057, "learning_rate": 0.0002, "epoch": 0.953843666639722, "step": 5900}, {"loss": 0.7896, "grad_norm": 0.5791414976119995, "learning_rate": 0.0002, "epoch": 0.9554603508204672, "step": 5910}, {"loss": 0.8408, "grad_norm": 0.6195011734962463, "learning_rate": 0.0002, "epoch": 0.9570770350012126, "step": 5920}, {"loss": 0.7805, "grad_norm": 0.6323803067207336, "learning_rate": 0.0002, "epoch": 0.9586937191819578, "step": 5930}, {"loss": 0.8484, "grad_norm": 0.45552879571914673, "learning_rate": 0.0002, "epoch": 0.9603104033627031, "step": 5940}, {"loss": 0.7367, "grad_norm": 0.5796473622322083, "learning_rate": 0.0002, "epoch": 0.9619270875434484, "step": 5950}, {"loss": 0.7672, "grad_norm": 0.647261381149292, "learning_rate": 0.0002, "epoch": 0.9635437717241937, "step": 5960}, {"loss": 0.8086, "grad_norm": 0.5487682819366455, "learning_rate": 0.0002, "epoch": 0.9651604559049389, "step": 5970}, {"loss": 0.7973, "grad_norm": 0.5743663907051086, "learning_rate": 0.0002, "epoch": 0.9667771400856843, "step": 5980}, {"loss": 0.8153, "grad_norm": 0.5470591187477112, "learning_rate": 0.0002, "epoch": 0.9683938242664295, "step": 5990}, {"loss": 0.8119, "grad_norm": 0.5901660323143005, "learning_rate": 0.0002, "epoch": 0.9700105084471748, "step": 6000}, {"loss": 0.8147, "grad_norm": 0.6544759273529053, "learning_rate": 0.0002, "epoch": 0.9716271926279202, "step": 6010}, {"loss": 0.7536, "grad_norm": 0.6288470029830933, "learning_rate": 0.0002, "epoch": 0.9732438768086654, "step": 6020}, {"loss": 0.7989, "grad_norm": 0.673153817653656, "learning_rate": 0.0002, "epoch": 0.9748605609894108, "step": 6030}, {"loss": 0.7556, "grad_norm": 0.42854753136634827, "learning_rate": 0.0002, "epoch": 0.976477245170156, "step": 6040}, {"loss": 0.8006, "grad_norm": 0.5227066278457642, "learning_rate": 0.0002, "epoch": 0.9780939293509013, "step": 6050}, {"loss": 0.795, "grad_norm": 0.5372416973114014, "learning_rate": 0.0002, "epoch": 0.9797106135316466, "step": 6060}, {"loss": 0.7591, "grad_norm": 0.6026402115821838, "learning_rate": 0.0002, "epoch": 0.9813272977123919, "step": 6070}, {"loss": 0.8347, "grad_norm": 0.49547791481018066, "learning_rate": 0.0002, "epoch": 0.9829439818931371, "step": 6080}, {"loss": 0.7722, "grad_norm": 0.4641951322555542, "learning_rate": 0.0002, "epoch": 0.9845606660738825, "step": 6090}, {"loss": 0.8125, "grad_norm": 0.5818535089492798, "learning_rate": 0.0002, "epoch": 0.9861773502546277, "step": 6100}, {"loss": 0.81, "grad_norm": 0.63955157995224, "learning_rate": 0.0002, "epoch": 0.9877940344353731, "step": 6110}, {"loss": 0.7547, "grad_norm": 0.5649438500404358, "learning_rate": 0.0002, "epoch": 0.9894107186161183, "step": 6120}, {"loss": 0.7861, "grad_norm": 0.5290433168411255, "learning_rate": 0.0002, "epoch": 0.9910274027968636, "step": 6130}, {"loss": 0.8109, "grad_norm": 0.6399374008178711, "learning_rate": 0.0002, "epoch": 0.992644086977609, "step": 6140}, {"loss": 0.8373, "grad_norm": 0.6736576557159424, "learning_rate": 0.0002, "epoch": 0.9942607711583542, "step": 6150}, {"loss": 0.7915, "grad_norm": 0.515420138835907, "learning_rate": 0.0002, "epoch": 0.9958774553390995, "step": 6160}, {"loss": 0.8032, "grad_norm": 0.562677800655365, "learning_rate": 0.0002, "epoch": 0.9974941395198448, "step": 6170}, {"loss": 0.8187, "grad_norm": 0.7113858461380005, "learning_rate": 0.0002, "epoch": 0.9991108237005901, "step": 6180}, {"eval_loss": 1.0871200561523438, "eval_runtime": 122.2071, "eval_samples_per_second": 5.998, "eval_steps_per_second": 0.753, "epoch": 0.9999191657909627, "step": 6185}, {"loss": 0.7507, "grad_norm": 0.7111801505088806, "learning_rate": 0.0002, "epoch": 1.0007275078813354, "step": 6190}, {"loss": 0.6865, "grad_norm": 0.5402125716209412, "learning_rate": 0.0002, "epoch": 1.0023441920620806, "step": 6200}, {"loss": 0.7625, "grad_norm": 0.6098830103874207, "learning_rate": 0.0002, "epoch": 1.003960876242826, "step": 6210}, {"loss": 0.7631, "grad_norm": 0.5829983353614807, "learning_rate": 0.0002, "epoch": 1.0055775604235713, "step": 6220}, {"loss": 0.7188, "grad_norm": 0.5614621043205261, "learning_rate": 0.0002, "epoch": 1.0071942446043165, "step": 6230}, {"loss": 0.7505, "grad_norm": 0.5954238772392273, "learning_rate": 0.0002, "epoch": 1.0088109287850617, "step": 6240}, {"loss": 0.7448, "grad_norm": 0.6480574607849121, "learning_rate": 0.0002, "epoch": 1.0104276129658072, "step": 6250}, {"loss": 0.7514, "grad_norm": 0.6051128506660461, "learning_rate": 0.0002, "epoch": 1.0120442971465524, "step": 6260}, {"loss": 0.7237, "grad_norm": 0.6318870782852173, "learning_rate": 0.0002, "epoch": 1.0136609813272976, "step": 6270}, {"loss": 0.7178, "grad_norm": 0.5048980116844177, "learning_rate": 0.0002, "epoch": 1.015277665508043, "step": 6280}, {"loss": 0.7391, "grad_norm": 0.6346936225891113, "learning_rate": 0.0002, "epoch": 1.0168943496887883, "step": 6290}, {"loss": 0.7486, "grad_norm": 0.5711665749549866, "learning_rate": 0.0002, "epoch": 1.0185110338695336, "step": 6300}, {"loss": 0.6808, "grad_norm": 0.5175361037254333, "learning_rate": 0.0002, "epoch": 1.0201277180502788, "step": 6310}, {"loss": 0.7539, "grad_norm": 0.5360831618309021, "learning_rate": 0.0002, "epoch": 1.0217444022310243, "step": 6320}, {"loss": 0.7112, "grad_norm": 0.614675760269165, "learning_rate": 0.0002, "epoch": 1.0233610864117695, "step": 6330}, {"loss": 0.7748, "grad_norm": 0.5626118183135986, "learning_rate": 0.0002, "epoch": 1.0249777705925147, "step": 6340}, {"loss": 0.7375, "grad_norm": 0.574897289276123, "learning_rate": 0.0002, "epoch": 1.02659445477326, "step": 6350}, {"loss": 0.759, "grad_norm": 0.7185447812080383, "learning_rate": 0.0002, "epoch": 1.0282111389540054, "step": 6360}, {"loss": 0.703, "grad_norm": 0.6705799698829651, "learning_rate": 0.0002, "epoch": 1.0298278231347506, "step": 6370}, {"loss": 0.7139, "grad_norm": 0.6740428805351257, "learning_rate": 0.0002, "epoch": 1.0314445073154959, "step": 6380}, {"loss": 0.7252, "grad_norm": 0.663902759552002, "learning_rate": 0.0002, "epoch": 1.0330611914962413, "step": 6390}, {"loss": 0.7065, "grad_norm": 0.5029543042182922, "learning_rate": 0.0002, "epoch": 1.0346778756769865, "step": 6400}, {"loss": 0.711, "grad_norm": 0.7813863158226013, "learning_rate": 0.0002, "epoch": 1.0362945598577318, "step": 6410}, {"loss": 0.7433, "grad_norm": 0.5396282076835632, "learning_rate": 0.0002, "epoch": 1.037911244038477, "step": 6420}, {"loss": 0.7222, "grad_norm": 0.5253293514251709, "learning_rate": 0.0002, "epoch": 1.0395279282192225, "step": 6430}, {"loss": 0.715, "grad_norm": 0.7236770987510681, "learning_rate": 0.0002, "epoch": 1.0411446123999677, "step": 6440}, {"loss": 0.7259, "grad_norm": 0.5670917630195618, "learning_rate": 0.0002, "epoch": 1.042761296580713, "step": 6450}, {"loss": 0.7195, "grad_norm": 0.6031978726387024, "learning_rate": 0.0002, "epoch": 1.0443779807614582, "step": 6460}, {"loss": 0.7648, "grad_norm": 0.5309213399887085, "learning_rate": 0.0002, "epoch": 1.0459946649422036, "step": 6470}, {"loss": 0.7161, "grad_norm": 0.7114651799201965, "learning_rate": 0.0002, "epoch": 1.0476113491229488, "step": 6480}, {"loss": 0.7583, "grad_norm": 0.5591610670089722, "learning_rate": 0.0002, "epoch": 1.049228033303694, "step": 6490}, {"loss": 0.6645, "grad_norm": 0.5185961127281189, "learning_rate": 0.0002, "epoch": 1.0508447174844395, "step": 6500}, {"loss": 0.7654, "grad_norm": 0.6510552167892456, "learning_rate": 0.0002, "epoch": 1.0524614016651848, "step": 6510}, {"loss": 0.7057, "grad_norm": 0.6557928919792175, "learning_rate": 0.0002, "epoch": 1.05407808584593, "step": 6520}, {"loss": 0.8056, "grad_norm": 0.6973192691802979, "learning_rate": 0.0002, "epoch": 1.0556947700266752, "step": 6530}, {"loss": 0.6793, "grad_norm": 0.6226583123207092, "learning_rate": 0.0002, "epoch": 1.0573114542074207, "step": 6540}, {"loss": 0.7151, "grad_norm": 0.5633195638656616, "learning_rate": 0.0002, "epoch": 1.058928138388166, "step": 6550}, {"loss": 0.7082, "grad_norm": 0.7466658353805542, "learning_rate": 0.0002, "epoch": 1.0605448225689111, "step": 6560}, {"loss": 0.7059, "grad_norm": 0.6462772488594055, "learning_rate": 0.0002, "epoch": 1.0621615067496564, "step": 6570}, {"loss": 0.7046, "grad_norm": 0.5266856551170349, "learning_rate": 0.0002, "epoch": 1.0637781909304018, "step": 6580}, {"loss": 0.7157, "grad_norm": 0.534392774105072, "learning_rate": 0.0002, "epoch": 1.065394875111147, "step": 6590}, {"loss": 0.7115, "grad_norm": 0.7514177560806274, "learning_rate": 0.0002, "epoch": 1.0670115592918923, "step": 6600}, {"loss": 0.7545, "grad_norm": 0.7593035697937012, "learning_rate": 0.0002, "epoch": 1.0686282434726375, "step": 6610}, {"loss": 0.6836, "grad_norm": 0.5277858972549438, "learning_rate": 0.0002, "epoch": 1.070244927653383, "step": 6620}, {"loss": 0.7405, "grad_norm": 0.5573670268058777, "learning_rate": 0.0002, "epoch": 1.0718616118341282, "step": 6630}, {"loss": 0.6774, "grad_norm": 0.6802396774291992, "learning_rate": 0.0002, "epoch": 1.0734782960148734, "step": 6640}, {"loss": 0.723, "grad_norm": 0.7367215752601624, "learning_rate": 0.0002, "epoch": 1.0750949801956189, "step": 6650}, {"loss": 0.7429, "grad_norm": 0.5961891412734985, "learning_rate": 0.0002, "epoch": 1.0767116643763641, "step": 6660}, {"loss": 0.6791, "grad_norm": 0.5736313462257385, "learning_rate": 0.0002, "epoch": 1.0783283485571094, "step": 6670}, {"loss": 0.7178, "grad_norm": 0.619219183921814, "learning_rate": 0.0002, "epoch": 1.0799450327378546, "step": 6680}, {"loss": 0.7318, "grad_norm": 0.6214390993118286, "learning_rate": 0.0002, "epoch": 1.0815617169186, "step": 6690}, {"loss": 0.7554, "grad_norm": 0.564536988735199, "learning_rate": 0.0002, "epoch": 1.0831784010993453, "step": 6700}, {"loss": 0.7362, "grad_norm": 0.5838140249252319, "learning_rate": 0.0002, "epoch": 1.0847950852800905, "step": 6710}, {"loss": 0.739, "grad_norm": 0.7000553607940674, "learning_rate": 0.0002, "epoch": 1.0864117694608357, "step": 6720}, {"loss": 0.7369, "grad_norm": 0.7078263759613037, "learning_rate": 0.0002, "epoch": 1.0880284536415812, "step": 6730}, {"loss": 0.7654, "grad_norm": 0.8353848457336426, "learning_rate": 0.0002, "epoch": 1.0896451378223264, "step": 6740}, {"loss": 0.7015, "grad_norm": 0.5615518689155579, "learning_rate": 0.0002, "epoch": 1.0912618220030716, "step": 6750}, {"loss": 0.7396, "grad_norm": 0.5475581288337708, "learning_rate": 0.0002, "epoch": 1.0928785061838169, "step": 6760}, {"loss": 0.7652, "grad_norm": 0.5835978388786316, "learning_rate": 0.0002, "epoch": 1.0944951903645623, "step": 6770}, {"loss": 0.7541, "grad_norm": 0.5516105890274048, "learning_rate": 0.0002, "epoch": 1.0961118745453076, "step": 6780}, {"loss": 0.6842, "grad_norm": 0.5875251889228821, "learning_rate": 0.0002, "epoch": 1.0977285587260528, "step": 6790}, {"loss": 0.6903, "grad_norm": 0.7376947999000549, "learning_rate": 0.0002, "epoch": 1.0993452429067982, "step": 6800}, {"loss": 0.7512, "grad_norm": 0.5656165480613708, "learning_rate": 0.0002, "epoch": 1.1009619270875435, "step": 6810}, {"loss": 0.7409, "grad_norm": 0.6365954279899597, "learning_rate": 0.0002, "epoch": 1.1025786112682887, "step": 6820}, {"loss": 0.7392, "grad_norm": 0.5033080577850342, "learning_rate": 0.0002, "epoch": 1.104195295449034, "step": 6830}, {"loss": 0.6909, "grad_norm": 0.617396891117096, "learning_rate": 0.0002, "epoch": 1.1058119796297794, "step": 6840}, {"loss": 0.7006, "grad_norm": 0.6395374536514282, "learning_rate": 0.0002, "epoch": 1.1074286638105246, "step": 6850}, {"loss": 0.7335, "grad_norm": 0.6775295734405518, "learning_rate": 0.0002, "epoch": 1.1090453479912699, "step": 6860}, {"loss": 0.764, "grad_norm": 0.6655223965644836, "learning_rate": 0.0002, "epoch": 1.1106620321720153, "step": 6870}, {"loss": 0.7553, "grad_norm": 0.676655113697052, "learning_rate": 0.0002, "epoch": 1.1122787163527605, "step": 6880}, {"loss": 0.7342, "grad_norm": 0.6062718629837036, "learning_rate": 0.0002, "epoch": 1.1138954005335058, "step": 6890}, {"loss": 0.7446, "grad_norm": 0.590943455696106, "learning_rate": 0.0002, "epoch": 1.115512084714251, "step": 6900}, {"loss": 0.6705, "grad_norm": 0.6315317153930664, "learning_rate": 0.0002, "epoch": 1.1171287688949965, "step": 6910}, {"loss": 0.6912, "grad_norm": 0.47979024052619934, "learning_rate": 0.0002, "epoch": 1.1187454530757417, "step": 6920}, {"loss": 0.7002, "grad_norm": 0.647298276424408, "learning_rate": 0.0002, "epoch": 1.120362137256487, "step": 6930}, {"loss": 0.7502, "grad_norm": 0.7336484789848328, "learning_rate": 0.0002, "epoch": 1.1219788214372322, "step": 6940}, {"loss": 0.693, "grad_norm": 0.5071424245834351, "learning_rate": 0.0002, "epoch": 1.1235955056179776, "step": 6950}, {"loss": 0.7378, "grad_norm": 0.6527144312858582, "learning_rate": 0.0002, "epoch": 1.1252121897987228, "step": 6960}, {"loss": 0.7228, "grad_norm": 0.6935935020446777, "learning_rate": 0.0002, "epoch": 1.126828873979468, "step": 6970}, {"loss": 0.699, "grad_norm": 0.8026931881904602, "learning_rate": 0.0002, "epoch": 1.1284455581602133, "step": 6980}, {"loss": 0.7361, "grad_norm": 0.5210393667221069, "learning_rate": 0.0002, "epoch": 1.1300622423409588, "step": 6990}, {"loss": 0.7456, "grad_norm": 0.60475093126297, "learning_rate": 0.0002, "epoch": 1.131678926521704, "step": 7000}, {"loss": 0.7495, "grad_norm": 0.6417073607444763, "learning_rate": 0.0002, "epoch": 1.1332956107024492, "step": 7010}, {"loss": 0.7459, "grad_norm": 0.6732175946235657, "learning_rate": 0.0002, "epoch": 1.1349122948831947, "step": 7020}, {"loss": 0.7278, "grad_norm": 0.6719491481781006, "learning_rate": 0.0002, "epoch": 1.13652897906394, "step": 7030}, {"loss": 0.7694, "grad_norm": 0.5708295106887817, "learning_rate": 0.0002, "epoch": 1.1381456632446851, "step": 7040}, {"loss": 0.7823, "grad_norm": 0.7141719460487366, "learning_rate": 0.0002, "epoch": 1.1397623474254304, "step": 7050}, {"loss": 0.764, "grad_norm": 0.6187017560005188, "learning_rate": 0.0002, "epoch": 1.1413790316061758, "step": 7060}, {"loss": 0.7657, "grad_norm": 0.50581294298172, "learning_rate": 0.0002, "epoch": 1.142995715786921, "step": 7070}, {"loss": 0.7357, "grad_norm": 0.5620143413543701, "learning_rate": 0.0002, "epoch": 1.1446123999676663, "step": 7080}, {"loss": 0.7287, "grad_norm": 0.6231929659843445, "learning_rate": 0.0002, "epoch": 1.1462290841484115, "step": 7090}, {"loss": 0.7328, "grad_norm": 0.5775774121284485, "learning_rate": 0.0002, "epoch": 1.147845768329157, "step": 7100}, {"loss": 0.7728, "grad_norm": 0.6492809653282166, "learning_rate": 0.0002, "epoch": 1.1494624525099022, "step": 7110}, {"loss": 0.7545, "grad_norm": 0.6434972286224365, "learning_rate": 0.0002, "epoch": 1.1510791366906474, "step": 7120}, {"loss": 0.7374, "grad_norm": 0.6191812753677368, "learning_rate": 0.0002, "epoch": 1.1526958208713927, "step": 7130}, {"loss": 0.7276, "grad_norm": 0.6690331697463989, "learning_rate": 0.0002, "epoch": 1.1543125050521381, "step": 7140}, {"loss": 0.7704, "grad_norm": 0.5977938175201416, "learning_rate": 0.0002, "epoch": 1.1559291892328833, "step": 7150}, {"loss": 0.7251, "grad_norm": 0.6195854544639587, "learning_rate": 0.0002, "epoch": 1.1575458734136286, "step": 7160}, {"loss": 0.7249, "grad_norm": 0.5752048492431641, "learning_rate": 0.0002, "epoch": 1.159162557594374, "step": 7170}, {"loss": 0.7593, "grad_norm": 0.589081883430481, "learning_rate": 0.0002, "epoch": 1.1607792417751193, "step": 7180}, {"loss": 0.704, "grad_norm": 0.756996750831604, "learning_rate": 0.0002, "epoch": 1.1623959259558645, "step": 7190}, {"loss": 0.7404, "grad_norm": 0.7614967226982117, "learning_rate": 0.0002, "epoch": 1.1640126101366097, "step": 7200}, {"loss": 0.7867, "grad_norm": 0.6120437979698181, "learning_rate": 0.0002, "epoch": 1.1656292943173552, "step": 7210}, {"loss": 0.7384, "grad_norm": 0.6210004687309265, "learning_rate": 0.0002, "epoch": 1.1672459784981004, "step": 7220}, {"loss": 0.7251, "grad_norm": 0.6044116020202637, "learning_rate": 0.0002, "epoch": 1.1688626626788456, "step": 7230}, {"loss": 0.7361, "grad_norm": 0.5418457388877869, "learning_rate": 0.0002, "epoch": 1.170479346859591, "step": 7240}, {"loss": 0.6938, "grad_norm": 0.6413537263870239, "learning_rate": 0.0002, "epoch": 1.1720960310403363, "step": 7250}, {"loss": 0.6978, "grad_norm": 0.5777867436408997, "learning_rate": 0.0002, "epoch": 1.1737127152210816, "step": 7260}, {"loss": 0.7503, "grad_norm": 0.7092402577400208, "learning_rate": 0.0002, "epoch": 1.1753293994018268, "step": 7270}, {"loss": 0.7487, "grad_norm": 0.6351709365844727, "learning_rate": 0.0002, "epoch": 1.176946083582572, "step": 7280}, {"loss": 0.7527, "grad_norm": 0.6172189712524414, "learning_rate": 0.0002, "epoch": 1.1785627677633175, "step": 7290}, {"loss": 0.7319, "grad_norm": 0.6801714897155762, "learning_rate": 0.0002, "epoch": 1.1801794519440627, "step": 7300}, {"loss": 0.6941, "grad_norm": 0.6044712066650391, "learning_rate": 0.0002, "epoch": 1.181796136124808, "step": 7310}, {"loss": 0.6951, "grad_norm": 0.7413212060928345, "learning_rate": 0.0002, "epoch": 1.1834128203055534, "step": 7320}, {"loss": 0.7396, "grad_norm": 0.5303856134414673, "learning_rate": 0.0002, "epoch": 1.1850295044862986, "step": 7330}, {"loss": 0.6915, "grad_norm": 0.5647098422050476, "learning_rate": 0.0002, "epoch": 1.1866461886670439, "step": 7340}, {"loss": 0.7506, "grad_norm": 0.7374135255813599, "learning_rate": 0.0002, "epoch": 1.188262872847789, "step": 7350}, {"loss": 0.7041, "grad_norm": 0.5710089206695557, "learning_rate": 0.0002, "epoch": 1.1898795570285345, "step": 7360}, {"loss": 0.8289, "grad_norm": 0.6073619723320007, "learning_rate": 0.0002, "epoch": 1.1914962412092798, "step": 7370}, {"loss": 0.7722, "grad_norm": 0.5899916887283325, "learning_rate": 0.0002, "epoch": 1.193112925390025, "step": 7380}, {"loss": 0.756, "grad_norm": 0.7762434482574463, "learning_rate": 0.0002, "epoch": 1.1947296095707705, "step": 7390}, {"loss": 0.7319, "grad_norm": 0.679949939250946, "learning_rate": 0.0002, "epoch": 1.1963462937515157, "step": 7400}, {"loss": 0.7599, "grad_norm": 0.6106849312782288, "learning_rate": 0.0002, "epoch": 1.197962977932261, "step": 7410}, {"loss": 0.7648, "grad_norm": 0.682461678981781, "learning_rate": 0.0002, "epoch": 1.1995796621130062, "step": 7420}, {"loss": 0.7741, "grad_norm": 0.6087017059326172, "learning_rate": 0.0002, "epoch": 1.2011963462937516, "step": 7430}, {"loss": 0.7642, "grad_norm": 0.63739013671875, "learning_rate": 0.0002, "epoch": 1.2028130304744968, "step": 7440}, {"loss": 0.7611, "grad_norm": 0.6154777407646179, "learning_rate": 0.0002, "epoch": 1.204429714655242, "step": 7450}, {"loss": 0.7565, "grad_norm": 0.7491534948348999, "learning_rate": 0.0002, "epoch": 1.2060463988359873, "step": 7460}, {"loss": 0.698, "grad_norm": 0.6664797067642212, "learning_rate": 0.0002, "epoch": 1.2076630830167328, "step": 7470}, {"loss": 0.7456, "grad_norm": 0.6660266518592834, "learning_rate": 0.0002, "epoch": 1.209279767197478, "step": 7480}, {"loss": 0.714, "grad_norm": 0.6972551345825195, "learning_rate": 0.0002, "epoch": 1.2108964513782232, "step": 7490}, {"loss": 0.7023, "grad_norm": 0.6157945990562439, "learning_rate": 0.0002, "epoch": 1.2125131355589684, "step": 7500}, {"loss": 0.7326, "grad_norm": 0.5199310183525085, "learning_rate": 0.0002, "epoch": 1.214129819739714, "step": 7510}, {"loss": 0.7586, "grad_norm": 0.577610433101654, "learning_rate": 0.0002, "epoch": 1.2157465039204591, "step": 7520}, {"loss": 0.7179, "grad_norm": 0.53652423620224, "learning_rate": 0.0002, "epoch": 1.2173631881012044, "step": 7530}, {"loss": 0.7393, "grad_norm": 0.6479050517082214, "learning_rate": 0.0002, "epoch": 1.2189798722819498, "step": 7540}, {"loss": 0.7534, "grad_norm": 0.618748128414154, "learning_rate": 0.0002, "epoch": 1.220596556462695, "step": 7550}, {"loss": 0.6886, "grad_norm": 0.6311424374580383, "learning_rate": 0.0002, "epoch": 1.2222132406434403, "step": 7560}, {"loss": 0.7272, "grad_norm": 0.6595825552940369, "learning_rate": 0.0002, "epoch": 1.2238299248241855, "step": 7570}, {"loss": 0.7353, "grad_norm": 0.5198960900306702, "learning_rate": 0.0002, "epoch": 1.225446609004931, "step": 7580}, {"loss": 0.674, "grad_norm": 0.578650712966919, "learning_rate": 0.0002, "epoch": 1.2270632931856762, "step": 7590}, {"loss": 0.7507, "grad_norm": 0.6080220937728882, "learning_rate": 0.0002, "epoch": 1.2286799773664214, "step": 7600}, {"loss": 0.7733, "grad_norm": 0.7050248384475708, "learning_rate": 0.0002, "epoch": 1.2302966615471669, "step": 7610}, {"loss": 0.7032, "grad_norm": 0.6652196049690247, "learning_rate": 0.0002, "epoch": 1.2319133457279121, "step": 7620}, {"loss": 0.7085, "grad_norm": 0.7322776317596436, "learning_rate": 0.0002, "epoch": 1.2335300299086573, "step": 7630}, {"loss": 0.7402, "grad_norm": 0.4998728036880493, "learning_rate": 0.0002, "epoch": 1.2351467140894026, "step": 7640}, {"loss": 0.7214, "grad_norm": 0.6428788900375366, "learning_rate": 0.0002, "epoch": 1.2367633982701478, "step": 7650}, {"loss": 0.7699, "grad_norm": 0.585242509841919, "learning_rate": 0.0002, "epoch": 1.2383800824508933, "step": 7660}, {"loss": 0.7621, "grad_norm": 0.5211917757987976, "learning_rate": 0.0002, "epoch": 1.2399967666316385, "step": 7670}, {"loss": 0.746, "grad_norm": 0.6490384340286255, "learning_rate": 0.0002, "epoch": 1.2416134508123837, "step": 7680}, {"loss": 0.7186, "grad_norm": 0.6249763369560242, "learning_rate": 0.0002, "epoch": 1.2432301349931292, "step": 7690}, {"loss": 0.7761, "grad_norm": 0.71870356798172, "learning_rate": 0.0002, "epoch": 1.2448468191738744, "step": 7700}, {"loss": 0.7525, "grad_norm": 0.6761967539787292, "learning_rate": 0.0002, "epoch": 1.2464635033546196, "step": 7710}, {"loss": 0.7501, "grad_norm": 0.6500617265701294, "learning_rate": 0.0002, "epoch": 1.2480801875353649, "step": 7720}, {"loss": 0.7903, "grad_norm": 0.8069869875907898, "learning_rate": 0.0002, "epoch": 1.2496968717161103, "step": 7730}, {"loss": 0.6747, "grad_norm": 0.6044608950614929, "learning_rate": 0.0002, "epoch": 1.2513135558968556, "step": 7740}, {"loss": 0.6825, "grad_norm": 0.6573283076286316, "learning_rate": 0.0002, "epoch": 1.2529302400776008, "step": 7750}, {"loss": 0.7617, "grad_norm": 0.625430166721344, "learning_rate": 0.0002, "epoch": 1.2545469242583462, "step": 7760}, {"loss": 0.7041, "grad_norm": 0.5442022681236267, "learning_rate": 0.0002, "epoch": 1.2561636084390915, "step": 7770}, {"loss": 0.7172, "grad_norm": 0.6818386912345886, "learning_rate": 0.0002, "epoch": 1.2577802926198367, "step": 7780}, {"loss": 0.696, "grad_norm": 0.6381874084472656, "learning_rate": 0.0002, "epoch": 1.259396976800582, "step": 7790}, {"loss": 0.6834, "grad_norm": 0.6269212961196899, "learning_rate": 0.0002, "epoch": 1.2610136609813272, "step": 7800}, {"loss": 0.7821, "grad_norm": 0.600121259689331, "learning_rate": 0.0002, "epoch": 1.2626303451620726, "step": 7810}, {"loss": 0.7761, "grad_norm": 0.6337703466415405, "learning_rate": 0.0002, "epoch": 1.2642470293428179, "step": 7820}, {"loss": 0.732, "grad_norm": 0.7234963774681091, "learning_rate": 0.0002, "epoch": 1.2658637135235633, "step": 7830}, {"loss": 0.785, "grad_norm": 0.800184965133667, "learning_rate": 0.0002, "epoch": 1.2674803977043085, "step": 7840}, {"loss": 0.7426, "grad_norm": 0.7539464831352234, "learning_rate": 0.0002, "epoch": 1.2690970818850538, "step": 7850}, {"loss": 0.7496, "grad_norm": 0.5493760704994202, "learning_rate": 0.0002, "epoch": 1.270713766065799, "step": 7860}, {"loss": 0.7537, "grad_norm": 0.7477145791053772, "learning_rate": 0.0002, "epoch": 1.2723304502465442, "step": 7870}, {"loss": 0.7573, "grad_norm": 0.6366362571716309, "learning_rate": 0.0002, "epoch": 1.2739471344272897, "step": 7880}, {"loss": 0.7608, "grad_norm": 0.7419533729553223, "learning_rate": 0.0002, "epoch": 1.275563818608035, "step": 7890}, {"loss": 0.7873, "grad_norm": 0.6141223311424255, "learning_rate": 0.0002, "epoch": 1.2771805027887801, "step": 7900}, {"loss": 0.6916, "grad_norm": 0.7522598505020142, "learning_rate": 0.0002, "epoch": 1.2787971869695256, "step": 7910}, {"loss": 0.7097, "grad_norm": 0.6935804486274719, "learning_rate": 0.0002, "epoch": 1.2804138711502708, "step": 7920}, {"loss": 0.7185, "grad_norm": 0.7239290475845337, "learning_rate": 0.0002, "epoch": 1.282030555331016, "step": 7930}, {"loss": 0.7145, "grad_norm": 0.8800187110900879, "learning_rate": 0.0002, "epoch": 1.2836472395117613, "step": 7940}, {"loss": 0.6991, "grad_norm": 0.540458083152771, "learning_rate": 0.0002, "epoch": 1.2852639236925067, "step": 7950}, {"loss": 0.7139, "grad_norm": 0.6492934226989746, "learning_rate": 0.0002, "epoch": 1.286880607873252, "step": 7960}, {"loss": 0.7742, "grad_norm": 0.6543959379196167, "learning_rate": 0.0002, "epoch": 1.2884972920539972, "step": 7970}, {"loss": 0.7316, "grad_norm": 0.5804705619812012, "learning_rate": 0.0002, "epoch": 1.2901139762347427, "step": 7980}, {"loss": 0.796, "grad_norm": 0.7074727416038513, "learning_rate": 0.0002, "epoch": 1.291730660415488, "step": 7990}, {"loss": 0.7034, "grad_norm": 0.5347974300384521, "learning_rate": 0.0002, "epoch": 1.2933473445962331, "step": 8000}, {"loss": 0.738, "grad_norm": 0.6457298398017883, "learning_rate": 0.0002, "epoch": 1.2949640287769784, "step": 8010}, {"loss": 0.7634, "grad_norm": 0.6407219171524048, "learning_rate": 0.0002, "epoch": 1.2965807129577236, "step": 8020}, {"loss": 0.7506, "grad_norm": 0.828439474105835, "learning_rate": 0.0002, "epoch": 1.298197397138469, "step": 8030}, {"loss": 0.735, "grad_norm": 0.4840380549430847, "learning_rate": 0.0002, "epoch": 1.2998140813192143, "step": 8040}, {"loss": 0.7283, "grad_norm": 0.5921024680137634, "learning_rate": 0.0002, "epoch": 1.3014307654999595, "step": 8050}, {"loss": 0.7477, "grad_norm": 0.6170315146446228, "learning_rate": 0.0002, "epoch": 1.303047449680705, "step": 8060}, {"loss": 0.7534, "grad_norm": 0.5374847054481506, "learning_rate": 0.0002, "epoch": 1.3046641338614502, "step": 8070}, {"loss": 0.7593, "grad_norm": 0.545758068561554, "learning_rate": 0.0002, "epoch": 1.3062808180421954, "step": 8080}, {"loss": 0.7463, "grad_norm": 0.55641770362854, "learning_rate": 0.0002, "epoch": 1.3078975022229407, "step": 8090}, {"loss": 0.7594, "grad_norm": 0.6724897027015686, "learning_rate": 0.0002, "epoch": 1.309514186403686, "step": 8100}, {"loss": 0.7105, "grad_norm": 0.6923972368240356, "learning_rate": 0.0002, "epoch": 1.3111308705844313, "step": 8110}, {"loss": 0.7149, "grad_norm": 0.5136841535568237, "learning_rate": 0.0002, "epoch": 1.3127475547651766, "step": 8120}, {"loss": 0.7504, "grad_norm": 0.6766283512115479, "learning_rate": 0.0002, "epoch": 1.314364238945922, "step": 8130}, {"loss": 0.7489, "grad_norm": 0.6283926367759705, "learning_rate": 0.0002, "epoch": 1.3159809231266673, "step": 8140}, {"loss": 0.7459, "grad_norm": 0.644216001033783, "learning_rate": 0.0002, "epoch": 1.3175976073074125, "step": 8150}, {"loss": 0.7125, "grad_norm": 0.7827503085136414, "learning_rate": 0.0002, "epoch": 1.3192142914881577, "step": 8160}, {"loss": 0.7271, "grad_norm": 0.6651390790939331, "learning_rate": 0.0002, "epoch": 1.320830975668903, "step": 8170}, {"loss": 0.7778, "grad_norm": 0.5547412633895874, "learning_rate": 0.0002, "epoch": 1.3224476598496484, "step": 8180}, {"loss": 0.7402, "grad_norm": 0.6765179634094238, "learning_rate": 0.0002, "epoch": 1.3240643440303936, "step": 8190}, {"loss": 0.7106, "grad_norm": 0.6822077035903931, "learning_rate": 0.0002, "epoch": 1.325681028211139, "step": 8200}, {"loss": 0.7288, "grad_norm": 0.5941002368927002, "learning_rate": 0.0002, "epoch": 1.3272977123918843, "step": 8210}, {"loss": 0.7494, "grad_norm": 0.4850037097930908, "learning_rate": 0.0002, "epoch": 1.3289143965726296, "step": 8220}, {"loss": 0.7474, "grad_norm": 0.6162990927696228, "learning_rate": 0.0002, "epoch": 1.3305310807533748, "step": 8230}, {"loss": 0.7751, "grad_norm": 0.6665613651275635, "learning_rate": 0.0002, "epoch": 1.33214776493412, "step": 8240}, {"loss": 0.759, "grad_norm": 0.618192732334137, "learning_rate": 0.0002, "epoch": 1.3337644491148655, "step": 8250}, {"loss": 0.7532, "grad_norm": 0.710418701171875, "learning_rate": 0.0002, "epoch": 1.3353811332956107, "step": 8260}, {"loss": 0.7306, "grad_norm": 0.5109876990318298, "learning_rate": 0.0002, "epoch": 1.336997817476356, "step": 8270}, {"loss": 0.7303, "grad_norm": 0.6791711449623108, "learning_rate": 0.0002, "epoch": 1.3386145016571014, "step": 8280}, {"loss": 0.7594, "grad_norm": 0.6836432814598083, "learning_rate": 0.0002, "epoch": 1.3402311858378466, "step": 8290}, {"loss": 0.7594, "grad_norm": 0.5579386353492737, "learning_rate": 0.0002, "epoch": 1.3418478700185918, "step": 8300}, {"loss": 0.7377, "grad_norm": 0.6713546514511108, "learning_rate": 0.0002, "epoch": 1.343464554199337, "step": 8310}, {"loss": 0.7756, "grad_norm": 0.5353720188140869, "learning_rate": 0.0002, "epoch": 1.3450812383800825, "step": 8320}, {"loss": 0.718, "grad_norm": 0.5813682675361633, "learning_rate": 0.0002, "epoch": 1.3466979225608278, "step": 8330}, {"loss": 0.7294, "grad_norm": 0.8158791661262512, "learning_rate": 0.0002, "epoch": 1.348314606741573, "step": 8340}, {"loss": 0.6992, "grad_norm": 0.6193785071372986, "learning_rate": 0.0002, "epoch": 1.3499312909223184, "step": 8350}, {"loss": 0.7654, "grad_norm": 0.6353939771652222, "learning_rate": 0.0002, "epoch": 1.3515479751030637, "step": 8360}, {"loss": 0.7519, "grad_norm": 0.6925048232078552, "learning_rate": 0.0002, "epoch": 1.353164659283809, "step": 8370}, {"loss": 0.736, "grad_norm": 0.988264799118042, "learning_rate": 0.0002, "epoch": 1.3547813434645541, "step": 8380}, {"loss": 0.7744, "grad_norm": 0.6476002931594849, "learning_rate": 0.0002, "epoch": 1.3563980276452994, "step": 8390}, {"loss": 0.776, "grad_norm": 0.7120398879051208, "learning_rate": 0.0002, "epoch": 1.3580147118260448, "step": 8400}, {"loss": 0.7368, "grad_norm": 0.9048416614532471, "learning_rate": 0.0002, "epoch": 1.35963139600679, "step": 8410}, {"loss": 0.7544, "grad_norm": 0.7000672817230225, "learning_rate": 0.0002, "epoch": 1.3612480801875353, "step": 8420}, {"loss": 0.7358, "grad_norm": 0.6015632152557373, "learning_rate": 0.0002, "epoch": 1.3628647643682807, "step": 8430}, {"loss": 0.7298, "grad_norm": 0.612516462802887, "learning_rate": 0.0002, "epoch": 1.364481448549026, "step": 8440}, {"loss": 0.7055, "grad_norm": 0.5969301462173462, "learning_rate": 0.0002, "epoch": 1.3660981327297712, "step": 8450}, {"loss": 0.7754, "grad_norm": 0.6730654239654541, "learning_rate": 0.0002, "epoch": 1.3677148169105164, "step": 8460}, {"loss": 0.7465, "grad_norm": 0.6386392116546631, "learning_rate": 0.0002, "epoch": 1.369331501091262, "step": 8470}, {"loss": 0.7433, "grad_norm": 0.739544153213501, "learning_rate": 0.0002, "epoch": 1.3709481852720071, "step": 8480}, {"loss": 0.7892, "grad_norm": 0.6462782621383667, "learning_rate": 0.0002, "epoch": 1.3725648694527524, "step": 8490}, {"loss": 0.7302, "grad_norm": 0.7346843481063843, "learning_rate": 0.0002, "epoch": 1.3741815536334978, "step": 8500}, {"loss": 0.7634, "grad_norm": 0.6884821057319641, "learning_rate": 0.0002, "epoch": 1.375798237814243, "step": 8510}, {"loss": 0.7614, "grad_norm": 0.6999333500862122, "learning_rate": 0.0002, "epoch": 1.3774149219949883, "step": 8520}, {"loss": 0.729, "grad_norm": 0.5378713011741638, "learning_rate": 0.0002, "epoch": 1.3790316061757335, "step": 8530}, {"loss": 0.6797, "grad_norm": 0.5417906641960144, "learning_rate": 0.0002, "epoch": 1.3806482903564787, "step": 8540}, {"loss": 0.7499, "grad_norm": 0.6602526307106018, "learning_rate": 0.0002, "epoch": 1.3822649745372242, "step": 8550}, {"loss": 0.7356, "grad_norm": 0.7073674201965332, "learning_rate": 0.0002, "epoch": 1.3838816587179694, "step": 8560}, {"loss": 0.75, "grad_norm": 0.5841707587242126, "learning_rate": 0.0002, "epoch": 1.3854983428987149, "step": 8570}, {"loss": 0.732, "grad_norm": 0.7031095027923584, "learning_rate": 0.0002, "epoch": 1.38711502707946, "step": 8580}, {"loss": 0.7464, "grad_norm": 0.5198570489883423, "learning_rate": 0.0002, "epoch": 1.3887317112602053, "step": 8590}, {"loss": 0.7354, "grad_norm": 0.7261320352554321, "learning_rate": 0.0002, "epoch": 1.3903483954409506, "step": 8600}, {"loss": 0.7339, "grad_norm": 0.5616350173950195, "learning_rate": 0.0002, "epoch": 1.3919650796216958, "step": 8610}, {"loss": 0.7382, "grad_norm": 0.5185914635658264, "learning_rate": 0.0002, "epoch": 1.3935817638024413, "step": 8620}, {"loss": 0.7456, "grad_norm": 0.5814694762229919, "learning_rate": 0.0002, "epoch": 1.3951984479831865, "step": 8630}, {"loss": 0.7413, "grad_norm": 0.6977371573448181, "learning_rate": 0.0002, "epoch": 1.3968151321639317, "step": 8640}, {"loss": 0.7574, "grad_norm": 0.6855689883232117, "learning_rate": 0.0002, "epoch": 1.3984318163446772, "step": 8650}, {"loss": 0.7802, "grad_norm": 0.5414357781410217, "learning_rate": 0.0002, "epoch": 1.4000485005254224, "step": 8660}, {"loss": 0.7487, "grad_norm": 0.6970012784004211, "learning_rate": 0.0002, "epoch": 1.4016651847061676, "step": 8670}, {"loss": 0.7421, "grad_norm": 0.526079535484314, "learning_rate": 0.0002, "epoch": 1.4032818688869129, "step": 8680}, {"loss": 0.737, "grad_norm": 0.758712887763977, "learning_rate": 0.0002, "epoch": 1.404898553067658, "step": 8690}, {"loss": 0.7612, "grad_norm": 0.7118762731552124, "learning_rate": 0.0002, "epoch": 1.4065152372484035, "step": 8700}, {"loss": 0.7628, "grad_norm": 0.5696909427642822, "learning_rate": 0.0002, "epoch": 1.4081319214291488, "step": 8710}, {"loss": 0.7156, "grad_norm": 0.7995436787605286, "learning_rate": 0.0002, "epoch": 1.4097486056098942, "step": 8720}, {"loss": 0.7521, "grad_norm": 0.7237521409988403, "learning_rate": 0.0002, "epoch": 1.4113652897906395, "step": 8730}, {"loss": 0.7661, "grad_norm": 0.744628369808197, "learning_rate": 0.0002, "epoch": 1.4129819739713847, "step": 8740}, {"loss": 0.7073, "grad_norm": 0.6082926988601685, "learning_rate": 0.0002, "epoch": 1.41459865815213, "step": 8750}, {"loss": 0.7282, "grad_norm": 0.5185243487358093, "learning_rate": 0.0002, "epoch": 1.4162153423328752, "step": 8760}, {"loss": 0.7592, "grad_norm": 0.5183082222938538, "learning_rate": 0.0002, "epoch": 1.4178320265136206, "step": 8770}, {"loss": 0.7509, "grad_norm": 0.7326041460037231, "learning_rate": 0.0002, "epoch": 1.4194487106943658, "step": 8780}, {"loss": 0.7398, "grad_norm": 0.7174660563468933, "learning_rate": 0.0002, "epoch": 1.421065394875111, "step": 8790}, {"loss": 0.7507, "grad_norm": 0.8080165982246399, "learning_rate": 0.0002, "epoch": 1.4226820790558565, "step": 8800}, {"loss": 0.72, "grad_norm": 0.5061507821083069, "learning_rate": 0.0002, "epoch": 1.4242987632366018, "step": 8810}, {"loss": 0.7563, "grad_norm": 0.801602840423584, "learning_rate": 0.0002, "epoch": 1.425915447417347, "step": 8820}, {"loss": 0.7287, "grad_norm": 0.6150273084640503, "learning_rate": 0.0002, "epoch": 1.4275321315980922, "step": 8830}, {"loss": 0.7452, "grad_norm": 0.8786525726318359, "learning_rate": 0.0002, "epoch": 1.4291488157788377, "step": 8840}, {"loss": 0.7257, "grad_norm": 0.6371538639068604, "learning_rate": 0.0002, "epoch": 1.430765499959583, "step": 8850}, {"loss": 0.711, "grad_norm": 0.6409295797348022, "learning_rate": 0.0002, "epoch": 1.4323821841403281, "step": 8860}, {"loss": 0.7891, "grad_norm": 0.6452359557151794, "learning_rate": 0.0002, "epoch": 1.4339988683210736, "step": 8870}, {"loss": 0.7588, "grad_norm": 0.5842334628105164, "learning_rate": 0.0002, "epoch": 1.4356155525018188, "step": 8880}, {"loss": 0.7446, "grad_norm": 0.696761965751648, "learning_rate": 0.0002, "epoch": 1.437232236682564, "step": 8890}, {"loss": 0.7541, "grad_norm": 0.6384600400924683, "learning_rate": 0.0002, "epoch": 1.4388489208633093, "step": 8900}, {"loss": 0.7049, "grad_norm": 0.5981136560440063, "learning_rate": 0.0002, "epoch": 1.4404656050440545, "step": 8910}, {"loss": 0.795, "grad_norm": 0.6355637907981873, "learning_rate": 0.0002, "epoch": 1.4420822892248, "step": 8920}, {"loss": 0.7653, "grad_norm": 0.6374830603599548, "learning_rate": 0.0002, "epoch": 1.4436989734055452, "step": 8930}, {"loss": 0.8108, "grad_norm": 0.559013307094574, "learning_rate": 0.0002, "epoch": 1.4453156575862904, "step": 8940}, {"loss": 0.7045, "grad_norm": 0.7289170026779175, "learning_rate": 0.0002, "epoch": 1.446932341767036, "step": 8950}, {"loss": 0.7484, "grad_norm": 0.8649206757545471, "learning_rate": 0.0002, "epoch": 1.4485490259477811, "step": 8960}, {"loss": 0.7745, "grad_norm": 0.7664689421653748, "learning_rate": 0.0002, "epoch": 1.4501657101285264, "step": 8970}, {"loss": 0.7431, "grad_norm": 0.7109952569007874, "learning_rate": 0.0002, "epoch": 1.4517823943092716, "step": 8980}, {"loss": 0.7997, "grad_norm": 0.6312844753265381, "learning_rate": 0.0002, "epoch": 1.453399078490017, "step": 8990}, {"loss": 0.7467, "grad_norm": 0.6616617441177368, "learning_rate": 0.0002, "epoch": 1.4550157626707623, "step": 9000}, {"loss": 0.7518, "grad_norm": 0.7384068965911865, "learning_rate": 0.0002, "epoch": 1.4566324468515075, "step": 9010}, {"loss": 0.7483, "grad_norm": 0.6549670100212097, "learning_rate": 0.0002, "epoch": 1.458249131032253, "step": 9020}, {"loss": 0.7423, "grad_norm": 0.6254119277000427, "learning_rate": 0.0002, "epoch": 1.4598658152129982, "step": 9030}, {"loss": 0.7645, "grad_norm": 0.6806328892707825, "learning_rate": 0.0002, "epoch": 1.4614824993937434, "step": 9040}, {"loss": 0.7221, "grad_norm": 0.6803115010261536, "learning_rate": 0.0002, "epoch": 1.4630991835744886, "step": 9050}, {"loss": 0.7264, "grad_norm": 0.48529282212257385, "learning_rate": 0.0002, "epoch": 1.4647158677552339, "step": 9060}, {"loss": 0.7542, "grad_norm": 0.5995030999183655, "learning_rate": 0.0002, "epoch": 1.4663325519359793, "step": 9070}, {"loss": 0.7894, "grad_norm": 0.6005427837371826, "learning_rate": 0.0002, "epoch": 1.4679492361167246, "step": 9080}, {"loss": 0.7288, "grad_norm": 0.718564510345459, "learning_rate": 0.0002, "epoch": 1.46956592029747, "step": 9090}, {"loss": 0.7089, "grad_norm": 0.7003577351570129, "learning_rate": 0.0002, "epoch": 1.4711826044782153, "step": 9100}, {"loss": 0.8069, "grad_norm": 0.5888323783874512, "learning_rate": 0.0002, "epoch": 1.4727992886589605, "step": 9110}, {"loss": 0.7275, "grad_norm": 0.6417609453201294, "learning_rate": 0.0002, "epoch": 1.4744159728397057, "step": 9120}, {"loss": 0.7441, "grad_norm": 0.572294294834137, "learning_rate": 0.0002, "epoch": 1.476032657020451, "step": 9130}, {"loss": 0.8053, "grad_norm": 0.8200714588165283, "learning_rate": 0.0002, "epoch": 1.4776493412011964, "step": 9140}, {"loss": 0.7382, "grad_norm": 0.6343288421630859, "learning_rate": 0.0002, "epoch": 1.4792660253819416, "step": 9150}, {"loss": 0.7641, "grad_norm": 0.7017961144447327, "learning_rate": 0.0002, "epoch": 1.4808827095626869, "step": 9160}, {"loss": 0.7619, "grad_norm": 0.6202912926673889, "learning_rate": 0.0002, "epoch": 1.4824993937434323, "step": 9170}, {"loss": 0.7428, "grad_norm": 0.6677869558334351, "learning_rate": 0.0002, "epoch": 1.4841160779241775, "step": 9180}, {"loss": 0.7648, "grad_norm": 0.6052267551422119, "learning_rate": 0.0002, "epoch": 1.4857327621049228, "step": 9190}, {"loss": 0.7152, "grad_norm": 0.6638872027397156, "learning_rate": 0.0002, "epoch": 1.487349446285668, "step": 9200}, {"loss": 0.7448, "grad_norm": 0.6245523691177368, "learning_rate": 0.0002, "epoch": 1.4889661304664135, "step": 9210}, {"loss": 0.6958, "grad_norm": 0.5761767625808716, "learning_rate": 0.0002, "epoch": 1.4905828146471587, "step": 9220}, {"loss": 0.8012, "grad_norm": 0.8175981640815735, "learning_rate": 0.0002, "epoch": 1.492199498827904, "step": 9230}, {"loss": 0.683, "grad_norm": 0.9144009947776794, "learning_rate": 0.0002, "epoch": 1.4938161830086494, "step": 9240}, {"loss": 0.7623, "grad_norm": 0.5742552876472473, "learning_rate": 0.0002, "epoch": 1.4954328671893946, "step": 9250}, {"loss": 0.7418, "grad_norm": 0.534534215927124, "learning_rate": 0.0002, "epoch": 1.4970495513701398, "step": 9260}, {"loss": 0.7194, "grad_norm": 0.7836225032806396, "learning_rate": 0.0002, "epoch": 1.498666235550885, "step": 9270}, {"loss": 0.7453, "grad_norm": 0.5292993187904358, "learning_rate": 0.0002, "epoch": 1.5002829197316303, "step": 9280}, {"loss": 0.7168, "grad_norm": 0.8044071793556213, "learning_rate": 0.0002, "epoch": 1.5018996039123758, "step": 9290}, {"loss": 0.7229, "grad_norm": 0.6185805201530457, "learning_rate": 0.0002, "epoch": 1.503516288093121, "step": 9300}, {"loss": 0.684, "grad_norm": 0.6093607544898987, "learning_rate": 0.0002, "epoch": 1.5051329722738664, "step": 9310}, {"loss": 0.7973, "grad_norm": 0.5891730189323425, "learning_rate": 0.0002, "epoch": 1.5067496564546117, "step": 9320}, {"loss": 0.7474, "grad_norm": 0.6331129670143127, "learning_rate": 0.0002, "epoch": 1.508366340635357, "step": 9330}, {"loss": 0.7074, "grad_norm": 0.7690958380699158, "learning_rate": 0.0002, "epoch": 1.5099830248161021, "step": 9340}, {"loss": 0.672, "grad_norm": 0.6548877358436584, "learning_rate": 0.0002, "epoch": 1.5115997089968474, "step": 9350}, {"loss": 0.7408, "grad_norm": 0.6545143127441406, "learning_rate": 0.0002, "epoch": 1.5132163931775926, "step": 9360}, {"loss": 0.7432, "grad_norm": 0.553247332572937, "learning_rate": 0.0002, "epoch": 1.514833077358338, "step": 9370}, {"loss": 0.7265, "grad_norm": 0.8145074844360352, "learning_rate": 0.0002, "epoch": 1.5164497615390833, "step": 9380}, {"loss": 0.7379, "grad_norm": 0.7636994123458862, "learning_rate": 0.0002, "epoch": 1.5180664457198287, "step": 9390}, {"loss": 0.7413, "grad_norm": 0.6838982701301575, "learning_rate": 0.0002, "epoch": 1.519683129900574, "step": 9400}, {"loss": 0.7367, "grad_norm": 0.8599441647529602, "learning_rate": 0.0002, "epoch": 1.5212998140813192, "step": 9410}, {"loss": 0.7663, "grad_norm": 0.7020329833030701, "learning_rate": 0.0002, "epoch": 1.5229164982620644, "step": 9420}, {"loss": 0.7928, "grad_norm": 0.6964772343635559, "learning_rate": 0.0002, "epoch": 1.5245331824428097, "step": 9430}, {"loss": 0.7168, "grad_norm": 0.6916600465774536, "learning_rate": 0.0002, "epoch": 1.5261498666235551, "step": 9440}, {"loss": 0.7519, "grad_norm": 0.7282621264457703, "learning_rate": 0.0002, "epoch": 1.5277665508043003, "step": 9450}, {"loss": 0.7628, "grad_norm": 0.5363983511924744, "learning_rate": 0.0002, "epoch": 1.5293832349850458, "step": 9460}, {"loss": 0.7154, "grad_norm": 0.6184861063957214, "learning_rate": 0.0002, "epoch": 1.530999919165791, "step": 9470}, {"loss": 0.7837, "grad_norm": 0.5991285443305969, "learning_rate": 0.0002, "epoch": 1.5326166033465363, "step": 9480}, {"loss": 0.7827, "grad_norm": 0.8176587820053101, "learning_rate": 0.0002, "epoch": 1.5342332875272815, "step": 9490}, {"loss": 0.7415, "grad_norm": 0.6473721861839294, "learning_rate": 0.0002, "epoch": 1.5358499717080267, "step": 9500}, {"loss": 0.7632, "grad_norm": 0.7319952845573425, "learning_rate": 0.0002, "epoch": 1.5374666558887722, "step": 9510}, {"loss": 0.7706, "grad_norm": 0.702900230884552, "learning_rate": 0.0002, "epoch": 1.5390833400695174, "step": 9520}, {"loss": 0.7754, "grad_norm": 0.7971600294113159, "learning_rate": 0.0002, "epoch": 1.5407000242502629, "step": 9530}, {"loss": 0.7352, "grad_norm": 0.6527525186538696, "learning_rate": 0.0002, "epoch": 1.542316708431008, "step": 9540}, {"loss": 0.7425, "grad_norm": 0.5791676044464111, "learning_rate": 0.0002, "epoch": 1.5439333926117533, "step": 9550}, {"loss": 0.7585, "grad_norm": 0.5619390606880188, "learning_rate": 0.0002, "epoch": 1.5455500767924986, "step": 9560}, {"loss": 0.7894, "grad_norm": 0.5701689124107361, "learning_rate": 0.0002, "epoch": 1.5471667609732438, "step": 9570}, {"loss": 0.793, "grad_norm": 0.47549352049827576, "learning_rate": 0.0002, "epoch": 1.548783445153989, "step": 9580}, {"loss": 0.7276, "grad_norm": 0.8730611205101013, "learning_rate": 0.0002, "epoch": 1.5504001293347345, "step": 9590}, {"loss": 0.798, "grad_norm": 0.6842091083526611, "learning_rate": 0.0002, "epoch": 1.5520168135154797, "step": 9600}, {"loss": 0.7528, "grad_norm": 0.6675129532814026, "learning_rate": 0.0002, "epoch": 1.5536334976962252, "step": 9610}, {"loss": 0.7954, "grad_norm": 0.8173956274986267, "learning_rate": 0.0002, "epoch": 1.5552501818769704, "step": 9620}, {"loss": 0.7535, "grad_norm": 0.724947452545166, "learning_rate": 0.0002, "epoch": 1.5568668660577156, "step": 9630}, {"loss": 0.7738, "grad_norm": 0.6154758930206299, "learning_rate": 0.0002, "epoch": 1.5584835502384609, "step": 9640}, {"loss": 0.7568, "grad_norm": 0.6072008013725281, "learning_rate": 0.0002, "epoch": 1.560100234419206, "step": 9650}, {"loss": 0.7219, "grad_norm": 0.659010648727417, "learning_rate": 0.0002, "epoch": 1.5617169185999515, "step": 9660}, {"loss": 0.673, "grad_norm": 0.65857994556427, "learning_rate": 0.0002, "epoch": 1.5633336027806968, "step": 9670}, {"loss": 0.7156, "grad_norm": 0.5914267301559448, "learning_rate": 0.0002, "epoch": 1.5649502869614422, "step": 9680}, {"loss": 0.7414, "grad_norm": 0.6248020529747009, "learning_rate": 0.0002, "epoch": 1.5665669711421875, "step": 9690}, {"loss": 0.694, "grad_norm": 0.7147795557975769, "learning_rate": 0.0002, "epoch": 1.5681836553229327, "step": 9700}, {"loss": 0.7335, "grad_norm": 0.7076232433319092, "learning_rate": 0.0002, "epoch": 1.569800339503678, "step": 9710}, {"loss": 0.7413, "grad_norm": 0.6217400431632996, "learning_rate": 0.0002, "epoch": 1.5714170236844232, "step": 9720}, {"loss": 0.7296, "grad_norm": 0.6709911227226257, "learning_rate": 0.0002, "epoch": 1.5730337078651684, "step": 9730}, {"loss": 0.7306, "grad_norm": 0.749171257019043, "learning_rate": 0.0002, "epoch": 1.5746503920459138, "step": 9740}, {"loss": 0.7242, "grad_norm": 0.6241145730018616, "learning_rate": 0.0002, "epoch": 1.576267076226659, "step": 9750}, {"loss": 0.7384, "grad_norm": 0.4960934817790985, "learning_rate": 0.0002, "epoch": 1.5778837604074045, "step": 9760}, {"loss": 0.725, "grad_norm": 0.6593309640884399, "learning_rate": 0.0002, "epoch": 1.5795004445881498, "step": 9770}, {"loss": 0.7531, "grad_norm": 0.5814042091369629, "learning_rate": 0.0002, "epoch": 1.581117128768895, "step": 9780}, {"loss": 0.7109, "grad_norm": 0.5936070680618286, "learning_rate": 0.0002, "epoch": 1.5827338129496402, "step": 9790}, {"loss": 0.7769, "grad_norm": 0.6454403400421143, "learning_rate": 0.0002, "epoch": 1.5843504971303854, "step": 9800}, {"loss": 0.7677, "grad_norm": 0.7612107992172241, "learning_rate": 0.0002, "epoch": 1.585967181311131, "step": 9810}, {"loss": 0.7649, "grad_norm": 0.6494482755661011, "learning_rate": 0.0002, "epoch": 1.5875838654918761, "step": 9820}, {"loss": 0.7569, "grad_norm": 0.7825694680213928, "learning_rate": 0.0002, "epoch": 1.5892005496726216, "step": 9830}, {"loss": 0.706, "grad_norm": 0.6757757663726807, "learning_rate": 0.0002, "epoch": 1.5908172338533668, "step": 9840}, {"loss": 0.7803, "grad_norm": 0.7105609178543091, "learning_rate": 0.0002, "epoch": 1.592433918034112, "step": 9850}, {"loss": 0.7925, "grad_norm": 0.7596991062164307, "learning_rate": 0.0002, "epoch": 1.5940506022148573, "step": 9860}, {"loss": 0.7108, "grad_norm": 0.5681525468826294, "learning_rate": 0.0002, "epoch": 1.5956672863956025, "step": 9870}, {"loss": 0.7811, "grad_norm": 0.6090980768203735, "learning_rate": 0.0002, "epoch": 1.5972839705763477, "step": 9880}, {"loss": 0.7339, "grad_norm": 0.6271613240242004, "learning_rate": 0.0002, "epoch": 1.5989006547570932, "step": 9890}, {"loss": 0.7419, "grad_norm": 0.7656369805335999, "learning_rate": 0.0002, "epoch": 1.6005173389378387, "step": 9900}, {"loss": 0.7336, "grad_norm": 0.7504446506500244, "learning_rate": 0.0002, "epoch": 1.6021340231185839, "step": 9910}, {"loss": 0.7479, "grad_norm": 0.659656286239624, "learning_rate": 0.0002, "epoch": 1.6037507072993291, "step": 9920}, {"loss": 0.7483, "grad_norm": 0.6006826162338257, "learning_rate": 0.0002, "epoch": 1.6053673914800743, "step": 9930}, {"loss": 0.732, "grad_norm": 0.7872757911682129, "learning_rate": 0.0002, "epoch": 1.6069840756608196, "step": 9940}, {"loss": 0.768, "grad_norm": 0.5545852780342102, "learning_rate": 0.0002, "epoch": 1.6086007598415648, "step": 9950}, {"loss": 0.8064, "grad_norm": 0.7429468631744385, "learning_rate": 0.0002, "epoch": 1.6102174440223103, "step": 9960}, {"loss": 0.714, "grad_norm": 0.6873556971549988, "learning_rate": 0.0002, "epoch": 1.6118341282030555, "step": 9970}, {"loss": 0.7324, "grad_norm": 0.5874287486076355, "learning_rate": 0.0002, "epoch": 1.613450812383801, "step": 9980}, {"loss": 0.7141, "grad_norm": 0.6039386987686157, "learning_rate": 0.0002, "epoch": 1.6150674965645462, "step": 9990}, {"loss": 0.6674, "grad_norm": 0.6233575940132141, "learning_rate": 0.0002, "epoch": 1.6166841807452914, "step": 10000}, {"loss": 0.7602, "grad_norm": 0.7676448225975037, "learning_rate": 0.0002, "epoch": 1.6183008649260366, "step": 10010}, {"loss": 0.7784, "grad_norm": 0.6565698385238647, "learning_rate": 0.0002, "epoch": 1.6199175491067819, "step": 10020}, {"loss": 0.7104, "grad_norm": 0.6787590384483337, "learning_rate": 0.0002, "epoch": 1.6215342332875273, "step": 10030}, {"loss": 0.7464, "grad_norm": 0.6137678027153015, "learning_rate": 0.0002, "epoch": 1.6231509174682726, "step": 10040}, {"loss": 0.7646, "grad_norm": 0.5236800312995911, "learning_rate": 0.0002, "epoch": 1.624767601649018, "step": 10050}, {"loss": 0.7437, "grad_norm": 0.7626367807388306, "learning_rate": 0.0002, "epoch": 1.6263842858297632, "step": 10060}, {"loss": 0.7273, "grad_norm": 0.5657260417938232, "learning_rate": 0.0002, "epoch": 1.6280009700105085, "step": 10070}, {"loss": 0.7354, "grad_norm": 0.4913991391658783, "learning_rate": 0.0002, "epoch": 1.6296176541912537, "step": 10080}, {"loss": 0.7596, "grad_norm": 0.7715556621551514, "learning_rate": 0.0002, "epoch": 1.631234338371999, "step": 10090}, {"loss": 0.7105, "grad_norm": 0.6509000062942505, "learning_rate": 0.0002, "epoch": 1.6328510225527442, "step": 10100}, {"loss": 0.7274, "grad_norm": 0.6215850114822388, "learning_rate": 0.0002, "epoch": 1.6344677067334896, "step": 10110}, {"loss": 0.7705, "grad_norm": 0.6956844329833984, "learning_rate": 0.0002, "epoch": 1.6360843909142349, "step": 10120}, {"loss": 0.7129, "grad_norm": 0.6111597418785095, "learning_rate": 0.0002, "epoch": 1.6377010750949803, "step": 10130}, {"loss": 0.6955, "grad_norm": 0.6518288850784302, "learning_rate": 0.0002, "epoch": 1.6393177592757255, "step": 10140}, {"loss": 0.731, "grad_norm": 0.6914522051811218, "learning_rate": 0.0002, "epoch": 1.6409344434564708, "step": 10150}, {"loss": 0.7295, "grad_norm": 0.63785719871521, "learning_rate": 0.0002, "epoch": 1.642551127637216, "step": 10160}, {"loss": 0.7355, "grad_norm": 0.6379287838935852, "learning_rate": 0.0002, "epoch": 1.6441678118179612, "step": 10170}, {"loss": 0.7359, "grad_norm": 0.6793403029441833, "learning_rate": 0.0002, "epoch": 1.6457844959987067, "step": 10180}, {"loss": 0.7402, "grad_norm": 0.6099132895469666, "learning_rate": 0.0002, "epoch": 1.647401180179452, "step": 10190}, {"loss": 0.7353, "grad_norm": 0.5869854092597961, "learning_rate": 0.0002, "epoch": 1.6490178643601974, "step": 10200}, {"loss": 0.8308, "grad_norm": 0.7716999053955078, "learning_rate": 0.0002, "epoch": 1.6506345485409426, "step": 10210}, {"loss": 0.7215, "grad_norm": 0.6854110360145569, "learning_rate": 0.0002, "epoch": 1.6522512327216878, "step": 10220}, {"loss": 0.782, "grad_norm": 0.6957170367240906, "learning_rate": 0.0002, "epoch": 1.653867916902433, "step": 10230}, {"loss": 0.7282, "grad_norm": 0.6932903528213501, "learning_rate": 0.0002, "epoch": 1.6554846010831783, "step": 10240}, {"loss": 0.7478, "grad_norm": 0.7713165283203125, "learning_rate": 0.0002, "epoch": 1.6571012852639235, "step": 10250}, {"loss": 0.7099, "grad_norm": 0.7455793619155884, "learning_rate": 0.0002, "epoch": 1.658717969444669, "step": 10260}, {"loss": 0.7524, "grad_norm": 0.5464168190956116, "learning_rate": 0.0002, "epoch": 1.6603346536254144, "step": 10270}, {"loss": 0.7328, "grad_norm": 0.6782926321029663, "learning_rate": 0.0002, "epoch": 1.6619513378061597, "step": 10280}, {"loss": 0.7801, "grad_norm": 0.7962649464607239, "learning_rate": 0.0002, "epoch": 1.663568021986905, "step": 10290}, {"loss": 0.7142, "grad_norm": 0.6814526319503784, "learning_rate": 0.0002, "epoch": 1.6651847061676501, "step": 10300}, {"loss": 0.7285, "grad_norm": 0.656895101070404, "learning_rate": 0.0002, "epoch": 1.6668013903483954, "step": 10310}, {"loss": 0.7358, "grad_norm": 0.6085672378540039, "learning_rate": 0.0002, "epoch": 1.6684180745291406, "step": 10320}, {"loss": 0.7074, "grad_norm": 0.585508406162262, "learning_rate": 0.0002, "epoch": 1.670034758709886, "step": 10330}, {"loss": 0.7604, "grad_norm": 0.6930184364318848, "learning_rate": 0.0002, "epoch": 1.6716514428906313, "step": 10340}, {"loss": 0.7169, "grad_norm": 0.575663149356842, "learning_rate": 0.0002, "epoch": 1.6732681270713767, "step": 10350}, {"loss": 0.7198, "grad_norm": 0.582502543926239, "learning_rate": 0.0002, "epoch": 1.674884811252122, "step": 10360}, {"loss": 0.7793, "grad_norm": 0.5668916702270508, "learning_rate": 0.0002, "epoch": 1.6765014954328672, "step": 10370}, {"loss": 0.7478, "grad_norm": 0.6070065498352051, "learning_rate": 0.0002, "epoch": 1.6781181796136124, "step": 10380}, {"loss": 0.7939, "grad_norm": 0.6141316294670105, "learning_rate": 0.0002, "epoch": 1.6797348637943577, "step": 10390}, {"loss": 0.7573, "grad_norm": 0.8359124064445496, "learning_rate": 0.0002, "epoch": 1.6813515479751031, "step": 10400}, {"loss": 0.7488, "grad_norm": 0.5378185510635376, "learning_rate": 0.0002, "epoch": 1.6829682321558483, "step": 10410}, {"loss": 0.7588, "grad_norm": 0.6959536075592041, "learning_rate": 0.0002, "epoch": 1.6845849163365938, "step": 10420}, {"loss": 0.7872, "grad_norm": 0.6514357328414917, "learning_rate": 0.0002, "epoch": 1.686201600517339, "step": 10430}, {"loss": 0.725, "grad_norm": 0.7706646919250488, "learning_rate": 0.0002, "epoch": 1.6878182846980843, "step": 10440}, {"loss": 0.7673, "grad_norm": 0.6183337569236755, "learning_rate": 0.0002, "epoch": 1.6894349688788295, "step": 10450}, {"loss": 0.7566, "grad_norm": 0.6123278141021729, "learning_rate": 0.0002, "epoch": 1.6910516530595747, "step": 10460}, {"loss": 0.7169, "grad_norm": 0.6894851326942444, "learning_rate": 0.0002, "epoch": 1.69266833724032, "step": 10470}, {"loss": 0.7435, "grad_norm": 0.7497312426567078, "learning_rate": 0.0002, "epoch": 1.6942850214210654, "step": 10480}, {"loss": 0.7544, "grad_norm": 0.5968214273452759, "learning_rate": 0.0002, "epoch": 1.6959017056018106, "step": 10490}, {"loss": 0.6793, "grad_norm": 0.6747927069664001, "learning_rate": 0.0002, "epoch": 1.697518389782556, "step": 10500}, {"loss": 0.7415, "grad_norm": 0.5708310008049011, "learning_rate": 0.0002, "epoch": 1.6991350739633013, "step": 10510}, {"loss": 0.7385, "grad_norm": 0.606526792049408, "learning_rate": 0.0002, "epoch": 1.7007517581440466, "step": 10520}, {"loss": 0.7204, "grad_norm": 0.662011981010437, "learning_rate": 0.0002, "epoch": 1.7023684423247918, "step": 10530}, {"loss": 0.7999, "grad_norm": 0.7583045363426208, "learning_rate": 0.0002, "epoch": 1.703985126505537, "step": 10540}, {"loss": 0.7563, "grad_norm": 0.721632182598114, "learning_rate": 0.0002, "epoch": 1.7056018106862825, "step": 10550}, {"loss": 0.7407, "grad_norm": 0.6107715368270874, "learning_rate": 0.0002, "epoch": 1.7072184948670277, "step": 10560}, {"loss": 0.7519, "grad_norm": 0.6652471423149109, "learning_rate": 0.0002, "epoch": 1.7088351790477732, "step": 10570}, {"loss": 0.7767, "grad_norm": 0.6308087110519409, "learning_rate": 0.0002, "epoch": 1.7104518632285184, "step": 10580}, {"loss": 0.7659, "grad_norm": 0.5464386940002441, "learning_rate": 0.0002, "epoch": 1.7120685474092636, "step": 10590}, {"loss": 0.7063, "grad_norm": 0.6558911204338074, "learning_rate": 0.0002, "epoch": 1.7136852315900089, "step": 10600}, {"loss": 0.7126, "grad_norm": 0.5665024518966675, "learning_rate": 0.0002, "epoch": 1.715301915770754, "step": 10610}, {"loss": 0.6958, "grad_norm": 0.7888094186782837, "learning_rate": 0.0002, "epoch": 1.7169185999514993, "step": 10620}, {"loss": 0.7785, "grad_norm": 0.7084909081459045, "learning_rate": 0.0002, "epoch": 1.7185352841322448, "step": 10630}, {"loss": 0.7557, "grad_norm": 0.7982324361801147, "learning_rate": 0.0002, "epoch": 1.7201519683129902, "step": 10640}, {"loss": 0.7345, "grad_norm": 0.6418732404708862, "learning_rate": 0.0002, "epoch": 1.7217686524937355, "step": 10650}, {"loss": 0.7734, "grad_norm": 0.7636681795120239, "learning_rate": 0.0002, "epoch": 1.7233853366744807, "step": 10660}, {"loss": 0.7541, "grad_norm": 0.5646875500679016, "learning_rate": 0.0002, "epoch": 1.725002020855226, "step": 10670}, {"loss": 0.7642, "grad_norm": 0.5231260657310486, "learning_rate": 0.0002, "epoch": 1.7266187050359711, "step": 10680}, {"loss": 0.7846, "grad_norm": 0.7635011672973633, "learning_rate": 0.0002, "epoch": 1.7282353892167164, "step": 10690}, {"loss": 0.7471, "grad_norm": 0.7518259286880493, "learning_rate": 0.0002, "epoch": 1.7298520733974618, "step": 10700}, {"loss": 0.751, "grad_norm": 0.7295602560043335, "learning_rate": 0.0002, "epoch": 1.731468757578207, "step": 10710}, {"loss": 0.731, "grad_norm": 0.6984632015228271, "learning_rate": 0.0002, "epoch": 1.7330854417589525, "step": 10720}, {"loss": 0.7921, "grad_norm": 0.6198219060897827, "learning_rate": 0.0002, "epoch": 1.7347021259396977, "step": 10730}, {"loss": 0.7642, "grad_norm": 0.6957576274871826, "learning_rate": 0.0002, "epoch": 1.736318810120443, "step": 10740}, {"loss": 0.7917, "grad_norm": 0.6430263519287109, "learning_rate": 0.0002, "epoch": 1.7379354943011882, "step": 10750}, {"loss": 0.7156, "grad_norm": 0.6134995222091675, "learning_rate": 0.0002, "epoch": 1.7395521784819334, "step": 10760}, {"loss": 0.7584, "grad_norm": 0.7209452986717224, "learning_rate": 0.0002, "epoch": 1.741168862662679, "step": 10770}, {"loss": 0.7528, "grad_norm": 0.6735447645187378, "learning_rate": 0.0002, "epoch": 1.7427855468434241, "step": 10780}, {"loss": 0.756, "grad_norm": 0.5605693459510803, "learning_rate": 0.0002, "epoch": 1.7444022310241696, "step": 10790}, {"loss": 0.7759, "grad_norm": 0.6882363557815552, "learning_rate": 0.0002, "epoch": 1.7460189152049148, "step": 10800}, {"loss": 0.7544, "grad_norm": 0.6386259198188782, "learning_rate": 0.0002, "epoch": 1.74763559938566, "step": 10810}, {"loss": 0.7697, "grad_norm": 0.6529015302658081, "learning_rate": 0.0002, "epoch": 1.7492522835664053, "step": 10820}, {"loss": 0.7219, "grad_norm": 0.5664082765579224, "learning_rate": 0.0002, "epoch": 1.7508689677471505, "step": 10830}, {"loss": 0.7586, "grad_norm": 0.7532684206962585, "learning_rate": 0.0002, "epoch": 1.7524856519278957, "step": 10840}, {"loss": 0.6919, "grad_norm": 0.77171391248703, "learning_rate": 0.0002, "epoch": 1.7541023361086412, "step": 10850}, {"loss": 0.785, "grad_norm": 0.7255431413650513, "learning_rate": 0.0002, "epoch": 1.7557190202893864, "step": 10860}, {"loss": 0.7458, "grad_norm": 0.763083279132843, "learning_rate": 0.0002, "epoch": 1.7573357044701319, "step": 10870}, {"loss": 0.7846, "grad_norm": 0.6042402982711792, "learning_rate": 0.0002, "epoch": 1.758952388650877, "step": 10880}, {"loss": 0.7027, "grad_norm": 0.7642518281936646, "learning_rate": 0.0002, "epoch": 1.7605690728316223, "step": 10890}, {"loss": 0.746, "grad_norm": 0.6347904801368713, "learning_rate": 0.0002, "epoch": 1.7621857570123676, "step": 10900}, {"loss": 0.7458, "grad_norm": 0.5371627807617188, "learning_rate": 0.0002, "epoch": 1.7638024411931128, "step": 10910}, {"loss": 0.7466, "grad_norm": 0.6840225458145142, "learning_rate": 0.0002, "epoch": 1.7654191253738583, "step": 10920}, {"loss": 0.725, "grad_norm": 0.5288469195365906, "learning_rate": 0.0002, "epoch": 1.7670358095546035, "step": 10930}, {"loss": 0.7863, "grad_norm": 0.69020676612854, "learning_rate": 0.0002, "epoch": 1.768652493735349, "step": 10940}, {"loss": 0.7468, "grad_norm": 0.5943242311477661, "learning_rate": 0.0002, "epoch": 1.7702691779160942, "step": 10950}, {"loss": 0.7244, "grad_norm": 0.5616418123245239, "learning_rate": 0.0002, "epoch": 1.7718858620968394, "step": 10960}, {"loss": 0.7137, "grad_norm": 0.7209470868110657, "learning_rate": 0.0002, "epoch": 1.7735025462775846, "step": 10970}, {"loss": 0.7459, "grad_norm": 0.6657957434654236, "learning_rate": 0.0002, "epoch": 1.7751192304583299, "step": 10980}, {"loss": 0.7076, "grad_norm": 0.6469064950942993, "learning_rate": 0.0002, "epoch": 1.776735914639075, "step": 10990}, {"loss": 0.7321, "grad_norm": 0.6615678071975708, "learning_rate": 0.0002, "epoch": 1.7783525988198206, "step": 11000}, {"loss": 0.747, "grad_norm": 0.6722439527511597, "learning_rate": 0.0002, "epoch": 1.779969283000566, "step": 11010}, {"loss": 0.7302, "grad_norm": 0.634136974811554, "learning_rate": 0.0002, "epoch": 1.7815859671813112, "step": 11020}, {"loss": 0.8105, "grad_norm": 0.6024377346038818, "learning_rate": 0.0002, "epoch": 1.7832026513620565, "step": 11030}, {"loss": 0.7855, "grad_norm": 0.6909403800964355, "learning_rate": 0.0002, "epoch": 1.7848193355428017, "step": 11040}, {"loss": 0.7471, "grad_norm": 0.7148767709732056, "learning_rate": 0.0002, "epoch": 1.786436019723547, "step": 11050}, {"loss": 0.7145, "grad_norm": 0.7442979216575623, "learning_rate": 0.0002, "epoch": 1.7880527039042922, "step": 11060}, {"loss": 0.7215, "grad_norm": 0.6830431818962097, "learning_rate": 0.0002, "epoch": 1.7896693880850376, "step": 11070}, {"loss": 0.7625, "grad_norm": 0.9172667264938354, "learning_rate": 0.0002, "epoch": 1.7912860722657828, "step": 11080}, {"loss": 0.76, "grad_norm": 0.6799490451812744, "learning_rate": 0.0002, "epoch": 1.7929027564465283, "step": 11090}, {"loss": 0.7716, "grad_norm": 0.7617024779319763, "learning_rate": 0.0002, "epoch": 1.7945194406272735, "step": 11100}, {"loss": 0.7586, "grad_norm": 0.7701810002326965, "learning_rate": 0.0002, "epoch": 1.7961361248080188, "step": 11110}, {"loss": 0.7843, "grad_norm": 0.7454385757446289, "learning_rate": 0.0002, "epoch": 1.797752808988764, "step": 11120}, {"loss": 0.7873, "grad_norm": 0.6121436953544617, "learning_rate": 0.0002, "epoch": 1.7993694931695092, "step": 11130}, {"loss": 0.7305, "grad_norm": 0.6237571835517883, "learning_rate": 0.0002, "epoch": 1.8009861773502547, "step": 11140}, {"loss": 0.6827, "grad_norm": 0.6818515658378601, "learning_rate": 0.0002, "epoch": 1.802602861531, "step": 11150}, {"loss": 0.6876, "grad_norm": 0.7768308520317078, "learning_rate": 0.0002, "epoch": 1.8042195457117454, "step": 11160}, {"loss": 0.7533, "grad_norm": 0.6875537633895874, "learning_rate": 0.0002, "epoch": 1.8058362298924906, "step": 11170}, {"loss": 0.761, "grad_norm": 0.7950584888458252, "learning_rate": 0.0002, "epoch": 1.8074529140732358, "step": 11180}, {"loss": 0.7623, "grad_norm": 0.8210248351097107, "learning_rate": 0.0002, "epoch": 1.809069598253981, "step": 11190}, {"loss": 0.7556, "grad_norm": 0.6674110889434814, "learning_rate": 0.0002, "epoch": 1.8106862824347263, "step": 11200}, {"loss": 0.7663, "grad_norm": 0.6261674761772156, "learning_rate": 0.0002, "epoch": 1.8123029666154715, "step": 11210}, {"loss": 0.7122, "grad_norm": 0.6484741568565369, "learning_rate": 0.0002, "epoch": 1.813919650796217, "step": 11220}, {"loss": 0.7718, "grad_norm": 0.6231244206428528, "learning_rate": 0.0002, "epoch": 1.8155363349769622, "step": 11230}, {"loss": 0.7152, "grad_norm": 0.7243146896362305, "learning_rate": 0.0002, "epoch": 1.8171530191577077, "step": 11240}, {"loss": 0.7448, "grad_norm": 0.6776193380355835, "learning_rate": 0.0002, "epoch": 1.818769703338453, "step": 11250}, {"loss": 0.7317, "grad_norm": 0.5973618030548096, "learning_rate": 0.0002, "epoch": 1.8203863875191981, "step": 11260}, {"loss": 0.7961, "grad_norm": 0.6451361179351807, "learning_rate": 0.0002, "epoch": 1.8220030716999434, "step": 11270}, {"loss": 0.7611, "grad_norm": 0.5963068008422852, "learning_rate": 0.0002, "epoch": 1.8236197558806886, "step": 11280}, {"loss": 0.7466, "grad_norm": 0.536902129650116, "learning_rate": 0.0002, "epoch": 1.825236440061434, "step": 11290}, {"loss": 0.708, "grad_norm": 0.6993787288665771, "learning_rate": 0.0002, "epoch": 1.8268531242421793, "step": 11300}, {"loss": 0.7153, "grad_norm": 0.6135255098342896, "learning_rate": 0.0002, "epoch": 1.8284698084229247, "step": 11310}, {"loss": 0.7423, "grad_norm": 0.6057423949241638, "learning_rate": 0.0002, "epoch": 1.83008649260367, "step": 11320}, {"loss": 0.735, "grad_norm": 0.6598812341690063, "learning_rate": 0.0002, "epoch": 1.8317031767844152, "step": 11330}, {"loss": 0.7278, "grad_norm": 0.6075948476791382, "learning_rate": 0.0002, "epoch": 1.8333198609651604, "step": 11340}, {"loss": 0.7846, "grad_norm": 0.7065447568893433, "learning_rate": 0.0002, "epoch": 1.8349365451459057, "step": 11350}, {"loss": 0.7365, "grad_norm": 0.680526614189148, "learning_rate": 0.0002, "epoch": 1.8365532293266509, "step": 11360}, {"loss": 0.7152, "grad_norm": 0.6356695294380188, "learning_rate": 0.0002, "epoch": 1.8381699135073963, "step": 11370}, {"loss": 0.721, "grad_norm": 0.6399052143096924, "learning_rate": 0.0002, "epoch": 1.8397865976881416, "step": 11380}, {"loss": 0.7618, "grad_norm": 0.6125704050064087, "learning_rate": 0.0002, "epoch": 1.841403281868887, "step": 11390}, {"loss": 0.755, "grad_norm": 0.7124643325805664, "learning_rate": 0.0002, "epoch": 1.8430199660496323, "step": 11400}, {"loss": 0.7972, "grad_norm": 0.6099604964256287, "learning_rate": 0.0002, "epoch": 1.8446366502303775, "step": 11410}, {"loss": 0.7187, "grad_norm": 0.7338208556175232, "learning_rate": 0.0002, "epoch": 1.8462533344111227, "step": 11420}, {"loss": 0.7007, "grad_norm": 0.7534668445587158, "learning_rate": 0.0002, "epoch": 1.847870018591868, "step": 11430}, {"loss": 0.7464, "grad_norm": 0.6135470271110535, "learning_rate": 0.0002, "epoch": 1.8494867027726134, "step": 11440}, {"loss": 0.7955, "grad_norm": 0.6229309439659119, "learning_rate": 0.0002, "epoch": 1.8511033869533586, "step": 11450}, {"loss": 0.7594, "grad_norm": 0.706423282623291, "learning_rate": 0.0002, "epoch": 1.852720071134104, "step": 11460}, {"loss": 0.7411, "grad_norm": 0.5460049510002136, "learning_rate": 0.0002, "epoch": 1.8543367553148493, "step": 11470}, {"loss": 0.7416, "grad_norm": 0.6616711020469666, "learning_rate": 0.0002, "epoch": 1.8559534394955945, "step": 11480}, {"loss": 0.729, "grad_norm": 0.6372783184051514, "learning_rate": 0.0002, "epoch": 1.8575701236763398, "step": 11490}, {"loss": 0.7333, "grad_norm": 0.7162668108940125, "learning_rate": 0.0002, "epoch": 1.859186807857085, "step": 11500}, {"loss": 0.7747, "grad_norm": 0.6605209708213806, "learning_rate": 0.0002, "epoch": 1.8608034920378305, "step": 11510}, {"loss": 0.7258, "grad_norm": 0.6933956742286682, "learning_rate": 0.0002, "epoch": 1.8624201762185757, "step": 11520}, {"loss": 0.7243, "grad_norm": 0.6582090854644775, "learning_rate": 0.0002, "epoch": 1.8640368603993211, "step": 11530}, {"loss": 0.7313, "grad_norm": 0.6416500806808472, "learning_rate": 0.0002, "epoch": 1.8656535445800664, "step": 11540}, {"loss": 0.7372, "grad_norm": 0.5434312224388123, "learning_rate": 0.0002, "epoch": 1.8672702287608116, "step": 11550}, {"loss": 0.7635, "grad_norm": 0.6827567219734192, "learning_rate": 0.0002, "epoch": 1.8688869129415568, "step": 11560}, {"loss": 0.7137, "grad_norm": 0.7354370951652527, "learning_rate": 0.0002, "epoch": 1.870503597122302, "step": 11570}, {"loss": 0.7526, "grad_norm": 0.590372622013092, "learning_rate": 0.0002, "epoch": 1.8721202813030473, "step": 11580}, {"loss": 0.731, "grad_norm": 0.853183925151825, "learning_rate": 0.0002, "epoch": 1.8737369654837928, "step": 11590}, {"loss": 0.7487, "grad_norm": 0.822678804397583, "learning_rate": 0.0002, "epoch": 1.875353649664538, "step": 11600}, {"loss": 0.7427, "grad_norm": 0.6591550707817078, "learning_rate": 0.0002, "epoch": 1.8769703338452834, "step": 11610}, {"loss": 0.7054, "grad_norm": 0.7475301623344421, "learning_rate": 0.0002, "epoch": 1.8785870180260287, "step": 11620}, {"loss": 0.811, "grad_norm": 0.6390765309333801, "learning_rate": 0.0002, "epoch": 1.880203702206774, "step": 11630}, {"loss": 0.7531, "grad_norm": 0.6589758992195129, "learning_rate": 0.0002, "epoch": 1.8818203863875191, "step": 11640}, {"loss": 0.7475, "grad_norm": 0.6765508651733398, "learning_rate": 0.0002, "epoch": 1.8834370705682644, "step": 11650}, {"loss": 0.738, "grad_norm": 0.6527857780456543, "learning_rate": 0.0002, "epoch": 1.8850537547490098, "step": 11660}, {"loss": 0.7504, "grad_norm": 0.6642923951148987, "learning_rate": 0.0002, "epoch": 1.886670438929755, "step": 11670}, {"loss": 0.7701, "grad_norm": 0.6945584416389465, "learning_rate": 0.0002, "epoch": 1.8882871231105005, "step": 11680}, {"loss": 0.7711, "grad_norm": 0.694018542766571, "learning_rate": 0.0002, "epoch": 1.8899038072912457, "step": 11690}, {"loss": 0.7195, "grad_norm": 0.7237417101860046, "learning_rate": 0.0002, "epoch": 1.891520491471991, "step": 11700}, {"loss": 0.7491, "grad_norm": 0.7401309609413147, "learning_rate": 0.0002, "epoch": 1.8931371756527362, "step": 11710}, {"loss": 0.805, "grad_norm": 0.6537784337997437, "learning_rate": 0.0002, "epoch": 1.8947538598334814, "step": 11720}, {"loss": 0.793, "grad_norm": 0.7398539185523987, "learning_rate": 0.0002, "epoch": 1.8963705440142267, "step": 11730}, {"loss": 0.7561, "grad_norm": 0.6696075797080994, "learning_rate": 0.0002, "epoch": 1.8979872281949721, "step": 11740}, {"loss": 0.7353, "grad_norm": 0.6014142036437988, "learning_rate": 0.0002, "epoch": 1.8996039123757174, "step": 11750}, {"loss": 0.7714, "grad_norm": 0.7023524641990662, "learning_rate": 0.0002, "epoch": 1.9012205965564628, "step": 11760}, {"loss": 0.7088, "grad_norm": 0.739973783493042, "learning_rate": 0.0002, "epoch": 1.902837280737208, "step": 11770}, {"loss": 0.7848, "grad_norm": 0.5576770901679993, "learning_rate": 0.0002, "epoch": 1.9044539649179533, "step": 11780}, {"loss": 0.7483, "grad_norm": 0.6907393932342529, "learning_rate": 0.0002, "epoch": 1.9060706490986985, "step": 11790}, {"loss": 0.7827, "grad_norm": 0.6934581995010376, "learning_rate": 0.0002, "epoch": 1.9076873332794437, "step": 11800}, {"loss": 0.7199, "grad_norm": 0.591774582862854, "learning_rate": 0.0002, "epoch": 1.9093040174601892, "step": 11810}, {"loss": 0.7333, "grad_norm": 0.6249791383743286, "learning_rate": 0.0002, "epoch": 1.9109207016409344, "step": 11820}, {"loss": 0.7581, "grad_norm": 0.6755744218826294, "learning_rate": 0.0002, "epoch": 1.9125373858216799, "step": 11830}, {"loss": 0.696, "grad_norm": 0.7286285161972046, "learning_rate": 0.0002, "epoch": 1.914154070002425, "step": 11840}, {"loss": 0.7509, "grad_norm": 0.7867850065231323, "learning_rate": 0.0002, "epoch": 1.9157707541831703, "step": 11850}, {"loss": 0.735, "grad_norm": 0.6283972859382629, "learning_rate": 0.0002, "epoch": 1.9173874383639156, "step": 11860}, {"loss": 0.7296, "grad_norm": 0.605823814868927, "learning_rate": 0.0002, "epoch": 1.9190041225446608, "step": 11870}, {"loss": 0.6598, "grad_norm": 0.5927976965904236, "learning_rate": 0.0002, "epoch": 1.920620806725406, "step": 11880}, {"loss": 0.7649, "grad_norm": 0.5974002480506897, "learning_rate": 0.0002, "epoch": 1.9222374909061515, "step": 11890}, {"loss": 0.7843, "grad_norm": 0.7091866135597229, "learning_rate": 0.0002, "epoch": 1.923854175086897, "step": 11900}, {"loss": 0.775, "grad_norm": 0.72496497631073, "learning_rate": 0.0002, "epoch": 1.9254708592676422, "step": 11910}, {"loss": 0.7153, "grad_norm": 0.6131896376609802, "learning_rate": 0.0002, "epoch": 1.9270875434483874, "step": 11920}, {"loss": 0.7228, "grad_norm": 0.6556436419487, "learning_rate": 0.0002, "epoch": 1.9287042276291326, "step": 11930}, {"loss": 0.7319, "grad_norm": 0.622932493686676, "learning_rate": 0.0002, "epoch": 1.9303209118098779, "step": 11940}, {"loss": 0.7592, "grad_norm": 0.6618631482124329, "learning_rate": 0.0002, "epoch": 1.931937595990623, "step": 11950}, {"loss": 0.8332, "grad_norm": 0.630966305732727, "learning_rate": 0.0002, "epoch": 1.9335542801713685, "step": 11960}, {"loss": 0.6854, "grad_norm": 0.6336734890937805, "learning_rate": 0.0002, "epoch": 1.9351709643521138, "step": 11970}, {"loss": 0.7433, "grad_norm": 0.655403196811676, "learning_rate": 0.0002, "epoch": 1.9367876485328592, "step": 11980}, {"loss": 0.7282, "grad_norm": 0.5640574097633362, "learning_rate": 0.0002, "epoch": 1.9384043327136045, "step": 11990}, {"loss": 0.7289, "grad_norm": 0.6322951316833496, "learning_rate": 0.0002, "epoch": 1.9400210168943497, "step": 12000}, {"loss": 0.7627, "grad_norm": 0.615703821182251, "learning_rate": 0.0002, "epoch": 1.941637701075095, "step": 12010}, {"loss": 0.786, "grad_norm": 0.6487536430358887, "learning_rate": 0.0002, "epoch": 1.9432543852558402, "step": 12020}, {"loss": 0.7435, "grad_norm": 0.9209630489349365, "learning_rate": 0.0002, "epoch": 1.9448710694365856, "step": 12030}, {"loss": 0.7274, "grad_norm": 0.67485511302948, "learning_rate": 0.0002, "epoch": 1.9464877536173308, "step": 12040}, {"loss": 0.7551, "grad_norm": 0.6831230521202087, "learning_rate": 0.0002, "epoch": 1.9481044377980763, "step": 12050}, {"loss": 0.7546, "grad_norm": 0.6578302383422852, "learning_rate": 0.0002, "epoch": 1.9497211219788215, "step": 12060}, {"loss": 0.6989, "grad_norm": 0.9975938200950623, "learning_rate": 0.0002, "epoch": 1.9513378061595668, "step": 12070}, {"loss": 0.7952, "grad_norm": 0.6637365221977234, "learning_rate": 0.0002, "epoch": 1.952954490340312, "step": 12080}, {"loss": 0.7482, "grad_norm": 0.605707049369812, "learning_rate": 0.0002, "epoch": 1.9545711745210572, "step": 12090}, {"loss": 0.7768, "grad_norm": 0.6584440469741821, "learning_rate": 0.0002, "epoch": 1.9561878587018025, "step": 12100}, {"loss": 0.7187, "grad_norm": 0.6070835590362549, "learning_rate": 0.0002, "epoch": 1.957804542882548, "step": 12110}, {"loss": 0.7491, "grad_norm": 0.7862601280212402, "learning_rate": 0.0002, "epoch": 1.9594212270632931, "step": 12120}, {"loss": 0.7972, "grad_norm": 0.8175255060195923, "learning_rate": 0.0002, "epoch": 1.9610379112440386, "step": 12130}, {"loss": 0.7242, "grad_norm": 0.5648472905158997, "learning_rate": 0.0002, "epoch": 1.9626545954247838, "step": 12140}, {"loss": 0.7321, "grad_norm": 0.6591973304748535, "learning_rate": 0.0002, "epoch": 1.964271279605529, "step": 12150}, {"loss": 0.739, "grad_norm": 0.5960676074028015, "learning_rate": 0.0002, "epoch": 1.9658879637862743, "step": 12160}, {"loss": 0.7254, "grad_norm": 0.7272544503211975, "learning_rate": 0.0002, "epoch": 1.9675046479670195, "step": 12170}, {"loss": 0.7376, "grad_norm": 0.7176699042320251, "learning_rate": 0.0002, "epoch": 1.969121332147765, "step": 12180}, {"loss": 0.7525, "grad_norm": 0.6927123665809631, "learning_rate": 0.0002, "epoch": 1.9707380163285102, "step": 12190}, {"loss": 0.7318, "grad_norm": 0.5536034107208252, "learning_rate": 0.0002, "epoch": 1.9723547005092557, "step": 12200}, {"loss": 0.7737, "grad_norm": 0.8348390460014343, "learning_rate": 0.0002, "epoch": 1.9739713846900009, "step": 12210}, {"loss": 0.7494, "grad_norm": 0.6591181755065918, "learning_rate": 0.0002, "epoch": 1.9755880688707461, "step": 12220}, {"loss": 0.763, "grad_norm": 1.0624109506607056, "learning_rate": 0.0002, "epoch": 1.9772047530514913, "step": 12230}, {"loss": 0.7541, "grad_norm": 0.9265586137771606, "learning_rate": 0.0002, "epoch": 1.9788214372322366, "step": 12240}, {"loss": 0.7533, "grad_norm": 0.5998196005821228, "learning_rate": 0.0002, "epoch": 1.9804381214129818, "step": 12250}, {"loss": 0.7225, "grad_norm": 0.6960851550102234, "learning_rate": 0.0002, "epoch": 1.9820548055937273, "step": 12260}, {"loss": 0.7398, "grad_norm": 0.7674502730369568, "learning_rate": 0.0002, "epoch": 1.9836714897744727, "step": 12270}, {"loss": 0.7185, "grad_norm": 0.6407275795936584, "learning_rate": 0.0002, "epoch": 1.985288173955218, "step": 12280}, {"loss": 0.7382, "grad_norm": 0.6673079133033752, "learning_rate": 0.0002, "epoch": 1.9869048581359632, "step": 12290}, {"loss": 0.7326, "grad_norm": 0.6989844441413879, "learning_rate": 0.0002, "epoch": 1.9885215423167084, "step": 12300}, {"loss": 0.7559, "grad_norm": 0.7564442157745361, "learning_rate": 0.0002, "epoch": 1.9901382264974536, "step": 12310}, {"loss": 0.7719, "grad_norm": 0.6385478973388672, "learning_rate": 0.0002, "epoch": 1.9917549106781989, "step": 12320}, {"loss": 0.7369, "grad_norm": 0.7193717956542969, "learning_rate": 0.0002, "epoch": 1.9933715948589443, "step": 12330}, {"loss": 0.7583, "grad_norm": 0.7987112402915955, "learning_rate": 0.0002, "epoch": 1.9949882790396896, "step": 12340}, {"loss": 0.7793, "grad_norm": 0.7260826826095581, "learning_rate": 0.0002, "epoch": 1.996604963220435, "step": 12350}, {"loss": 0.7505, "grad_norm": 0.7968255281448364, "learning_rate": 0.0002, "epoch": 1.9982216474011802, "step": 12360}, {"loss": 0.717, "grad_norm": 0.6893062591552734, "learning_rate": 0.0002, "epoch": 1.9998383315819255, "step": 12370}, {"eval_loss": 1.1044032573699951, "eval_runtime": 122.1508, "eval_samples_per_second": 6.001, "eval_steps_per_second": 0.753, "epoch": 2.0, "step": 12371}, {"loss": 0.6604, "grad_norm": 0.7775409817695618, "learning_rate": 0.0002, "epoch": 2.0014550157626707, "step": 12380}, {"loss": 0.6845, "grad_norm": 0.76218581199646, "learning_rate": 0.0002, "epoch": 2.003071699943416, "step": 12390}, {"loss": 0.6909, "grad_norm": 0.5677764415740967, "learning_rate": 0.0002, "epoch": 2.004688384124161, "step": 12400}, {"loss": 0.6584, "grad_norm": 0.808442234992981, "learning_rate": 0.0002, "epoch": 2.006305068304907, "step": 12410}, {"loss": 0.659, "grad_norm": 0.7144765257835388, "learning_rate": 0.0002, "epoch": 2.007921752485652, "step": 12420}, {"loss": 0.6666, "grad_norm": 0.6914031505584717, "learning_rate": 0.0002, "epoch": 2.0095384366663973, "step": 12430}, {"loss": 0.6596, "grad_norm": 0.7581454515457153, "learning_rate": 0.0002, "epoch": 2.0111551208471425, "step": 12440}, {"loss": 0.6785, "grad_norm": 0.8388504981994629, "learning_rate": 0.0002, "epoch": 2.0127718050278878, "step": 12450}, {"loss": 0.6942, "grad_norm": 0.6716406941413879, "learning_rate": 0.0002, "epoch": 2.014388489208633, "step": 12460}, {"loss": 0.6441, "grad_norm": 0.898902416229248, "learning_rate": 0.0002, "epoch": 2.0160051733893782, "step": 12470}, {"loss": 0.6655, "grad_norm": 0.6432679891586304, "learning_rate": 0.0002, "epoch": 2.0176218575701235, "step": 12480}, {"loss": 0.6521, "grad_norm": 0.8021109104156494, "learning_rate": 0.0002, "epoch": 2.019238541750869, "step": 12490}, {"loss": 0.6581, "grad_norm": 0.7039216756820679, "learning_rate": 0.0002, "epoch": 2.0208552259316144, "step": 12500}, {"loss": 0.6521, "grad_norm": 0.646531879901886, "learning_rate": 0.0002, "epoch": 2.0224719101123596, "step": 12510}, {"loss": 0.6302, "grad_norm": 0.783704400062561, "learning_rate": 0.0002, "epoch": 2.024088594293105, "step": 12520}, {"loss": 0.6288, "grad_norm": 0.8805046677589417, "learning_rate": 0.0002, "epoch": 2.02570527847385, "step": 12530}, {"loss": 0.6288, "grad_norm": 0.7289270758628845, "learning_rate": 0.0002, "epoch": 2.0273219626545953, "step": 12540}, {"loss": 0.6663, "grad_norm": 0.71653151512146, "learning_rate": 0.0002, "epoch": 2.0289386468353405, "step": 12550}, {"loss": 0.625, "grad_norm": 0.73281329870224, "learning_rate": 0.0002, "epoch": 2.030555331016086, "step": 12560}, {"loss": 0.6448, "grad_norm": 0.6657090187072754, "learning_rate": 0.0002, "epoch": 2.0321720151968314, "step": 12570}, {"loss": 0.6983, "grad_norm": 0.8241133093833923, "learning_rate": 0.0002, "epoch": 2.0337886993775767, "step": 12580}, {"loss": 0.6488, "grad_norm": 0.5834135413169861, "learning_rate": 0.0002, "epoch": 2.035405383558322, "step": 12590}, {"loss": 0.6188, "grad_norm": 0.84502112865448, "learning_rate": 0.0002, "epoch": 2.037022067739067, "step": 12600}, {"loss": 0.6349, "grad_norm": 0.8952481746673584, "learning_rate": 0.0002, "epoch": 2.0386387519198124, "step": 12610}, {"loss": 0.6923, "grad_norm": 0.7801461815834045, "learning_rate": 0.0002, "epoch": 2.0402554361005576, "step": 12620}, {"loss": 0.6176, "grad_norm": 0.6788367033004761, "learning_rate": 0.0002, "epoch": 2.041872120281303, "step": 12630}, {"loss": 0.6162, "grad_norm": 0.7241756319999695, "learning_rate": 0.0002, "epoch": 2.0434888044620485, "step": 12640}, {"loss": 0.655, "grad_norm": 0.6933388113975525, "learning_rate": 0.0002, "epoch": 2.0451054886427937, "step": 12650}, {"loss": 0.6431, "grad_norm": 0.8029746413230896, "learning_rate": 0.0002, "epoch": 2.046722172823539, "step": 12660}, {"loss": 0.7164, "grad_norm": 0.946399986743927, "learning_rate": 0.0002, "epoch": 2.048338857004284, "step": 12670}, {"loss": 0.638, "grad_norm": 0.7072678804397583, "learning_rate": 0.0002, "epoch": 2.0499555411850294, "step": 12680}, {"loss": 0.6487, "grad_norm": 0.6810618042945862, "learning_rate": 0.0002, "epoch": 2.0515722253657747, "step": 12690}, {"loss": 0.6554, "grad_norm": 0.7661160230636597, "learning_rate": 0.0002, "epoch": 2.05318890954652, "step": 12700}, {"loss": 0.6799, "grad_norm": 0.6350653767585754, "learning_rate": 0.0002, "epoch": 2.0548055937272656, "step": 12710}, {"loss": 0.6654, "grad_norm": 0.861890971660614, "learning_rate": 0.0002, "epoch": 2.056422277908011, "step": 12720}, {"loss": 0.6286, "grad_norm": 0.6489875912666321, "learning_rate": 0.0002, "epoch": 2.058038962088756, "step": 12730}, {"loss": 0.6811, "grad_norm": 0.8268506526947021, "learning_rate": 0.0002, "epoch": 2.0596556462695013, "step": 12740}, {"loss": 0.6524, "grad_norm": 0.607679545879364, "learning_rate": 0.0002, "epoch": 2.0612723304502465, "step": 12750}, {"loss": 0.6649, "grad_norm": 0.6754153370857239, "learning_rate": 0.0002, "epoch": 2.0628890146309917, "step": 12760}, {"loss": 0.6549, "grad_norm": 0.7263124585151672, "learning_rate": 0.0002, "epoch": 2.064505698811737, "step": 12770}, {"loss": 0.6189, "grad_norm": 0.6986154317855835, "learning_rate": 0.0002, "epoch": 2.0661223829924826, "step": 12780}, {"loss": 0.6723, "grad_norm": 0.7768576741218567, "learning_rate": 0.0002, "epoch": 2.067739067173228, "step": 12790}, {"loss": 0.677, "grad_norm": 0.7546762824058533, "learning_rate": 0.0002, "epoch": 2.069355751353973, "step": 12800}, {"loss": 0.6485, "grad_norm": 0.7588880062103271, "learning_rate": 0.0002, "epoch": 2.0709724355347183, "step": 12810}, {"loss": 0.6989, "grad_norm": 0.7457242608070374, "learning_rate": 0.0002, "epoch": 2.0725891197154636, "step": 12820}, {"loss": 0.6489, "grad_norm": 0.6983516812324524, "learning_rate": 0.0002, "epoch": 2.074205803896209, "step": 12830}, {"loss": 0.651, "grad_norm": 0.7950928807258606, "learning_rate": 0.0002, "epoch": 2.075822488076954, "step": 12840}, {"loss": 0.6603, "grad_norm": 0.9248087406158447, "learning_rate": 0.0002, "epoch": 2.0774391722576993, "step": 12850}, {"loss": 0.6847, "grad_norm": 0.7229493260383606, "learning_rate": 0.0002, "epoch": 2.079055856438445, "step": 12860}, {"loss": 0.6702, "grad_norm": 0.5710847973823547, "learning_rate": 0.0002, "epoch": 2.08067254061919, "step": 12870}, {"loss": 0.6974, "grad_norm": 0.9580423831939697, "learning_rate": 0.0002, "epoch": 2.0822892247999354, "step": 12880}, {"loss": 0.6341, "grad_norm": 0.7399665713310242, "learning_rate": 0.0002, "epoch": 2.0839059089806806, "step": 12890}, {"loss": 0.6993, "grad_norm": 0.7981410622596741, "learning_rate": 0.0002, "epoch": 2.085522593161426, "step": 12900}, {"loss": 0.6976, "grad_norm": 0.870759904384613, "learning_rate": 0.0002, "epoch": 2.087139277342171, "step": 12910}, {"loss": 0.7194, "grad_norm": 0.7001481652259827, "learning_rate": 0.0002, "epoch": 2.0887559615229163, "step": 12920}, {"loss": 0.6383, "grad_norm": 0.6745418310165405, "learning_rate": 0.0002, "epoch": 2.090372645703662, "step": 12930}, {"loss": 0.6519, "grad_norm": 0.7739067673683167, "learning_rate": 0.0002, "epoch": 2.0919893298844072, "step": 12940}, {"loss": 0.6856, "grad_norm": 0.6742934584617615, "learning_rate": 0.0002, "epoch": 2.0936060140651525, "step": 12950}, {"loss": 0.6279, "grad_norm": 0.7270349860191345, "learning_rate": 0.0002, "epoch": 2.0952226982458977, "step": 12960}, {"loss": 0.6783, "grad_norm": 0.7150624394416809, "learning_rate": 0.0002, "epoch": 2.096839382426643, "step": 12970}, {"loss": 0.6093, "grad_norm": 0.7734767198562622, "learning_rate": 0.0002, "epoch": 2.098456066607388, "step": 12980}, {"loss": 0.6534, "grad_norm": 0.7618662118911743, "learning_rate": 0.0002, "epoch": 2.1000727507881334, "step": 12990}, {"loss": 0.6707, "grad_norm": 0.6557944416999817, "learning_rate": 0.0002, "epoch": 2.101689434968879, "step": 13000}, {"loss": 0.7268, "grad_norm": 0.8786448240280151, "learning_rate": 0.0002, "epoch": 2.1033061191496243, "step": 13010}, {"loss": 0.6677, "grad_norm": 0.6878724098205566, "learning_rate": 0.0002, "epoch": 2.1049228033303695, "step": 13020}, {"loss": 0.6824, "grad_norm": 0.822318971157074, "learning_rate": 0.0002, "epoch": 2.1065394875111147, "step": 13030}, {"loss": 0.6228, "grad_norm": 0.831468939781189, "learning_rate": 0.0002, "epoch": 2.10815617169186, "step": 13040}, {"loss": 0.6511, "grad_norm": 0.7699505686759949, "learning_rate": 0.0002, "epoch": 2.109772855872605, "step": 13050}, {"loss": 0.6671, "grad_norm": 0.7559016346931458, "learning_rate": 0.0002, "epoch": 2.1113895400533504, "step": 13060}, {"loss": 0.6215, "grad_norm": 0.6942209601402283, "learning_rate": 0.0002, "epoch": 2.1130062242340957, "step": 13070}, {"loss": 0.6449, "grad_norm": 0.6098947525024414, "learning_rate": 0.0002, "epoch": 2.1146229084148414, "step": 13080}, {"loss": 0.7091, "grad_norm": 0.6499016284942627, "learning_rate": 0.0002, "epoch": 2.1162395925955866, "step": 13090}, {"loss": 0.6247, "grad_norm": 0.7719953060150146, "learning_rate": 0.0002, "epoch": 2.117856276776332, "step": 13100}, {"loss": 0.6064, "grad_norm": 0.6708134412765503, "learning_rate": 0.0002, "epoch": 2.119472960957077, "step": 13110}, {"loss": 0.6056, "grad_norm": 0.8119585514068604, "learning_rate": 0.0002, "epoch": 2.1210896451378223, "step": 13120}, {"loss": 0.6628, "grad_norm": 0.6947157979011536, "learning_rate": 0.0002, "epoch": 2.1227063293185675, "step": 13130}, {"loss": 0.6375, "grad_norm": 0.8831837773323059, "learning_rate": 0.0002, "epoch": 2.1243230134993127, "step": 13140}, {"loss": 0.6997, "grad_norm": 0.7266910672187805, "learning_rate": 0.0002, "epoch": 2.1259396976800584, "step": 13150}, {"loss": 0.6446, "grad_norm": 0.8864351511001587, "learning_rate": 0.0002, "epoch": 2.1275563818608036, "step": 13160}, {"loss": 0.6762, "grad_norm": 0.8104248046875, "learning_rate": 0.0002, "epoch": 2.129173066041549, "step": 13170}, {"loss": 0.6581, "grad_norm": 0.6077079772949219, "learning_rate": 0.0002, "epoch": 2.130789750222294, "step": 13180}, {"loss": 0.6572, "grad_norm": 0.6874213814735413, "learning_rate": 0.0002, "epoch": 2.1324064344030393, "step": 13190}, {"loss": 0.642, "grad_norm": 0.7134367823600769, "learning_rate": 0.0002, "epoch": 2.1340231185837846, "step": 13200}, {"loss": 0.7016, "grad_norm": 0.6101235151290894, "learning_rate": 0.0002, "epoch": 2.13563980276453, "step": 13210}, {"loss": 0.6529, "grad_norm": 0.6042411923408508, "learning_rate": 0.0002, "epoch": 2.137256486945275, "step": 13220}, {"loss": 0.7179, "grad_norm": 0.914601743221283, "learning_rate": 0.0002, "epoch": 2.1388731711260207, "step": 13230}, {"loss": 0.6513, "grad_norm": 0.7104284167289734, "learning_rate": 0.0002, "epoch": 2.140489855306766, "step": 13240}, {"loss": 0.6607, "grad_norm": 0.664395272731781, "learning_rate": 0.0002, "epoch": 2.142106539487511, "step": 13250}, {"loss": 0.7211, "grad_norm": 0.6991241574287415, "learning_rate": 0.0002, "epoch": 2.1437232236682564, "step": 13260}, {"loss": 0.6484, "grad_norm": 0.5469560623168945, "learning_rate": 0.0002, "epoch": 2.1453399078490016, "step": 13270}, {"loss": 0.6765, "grad_norm": 0.8454998135566711, "learning_rate": 0.0002, "epoch": 2.146956592029747, "step": 13280}, {"loss": 0.6683, "grad_norm": 0.7088868618011475, "learning_rate": 0.0002, "epoch": 2.148573276210492, "step": 13290}, {"loss": 0.6835, "grad_norm": 0.7002687454223633, "learning_rate": 0.0002, "epoch": 2.1501899603912378, "step": 13300}, {"loss": 0.6399, "grad_norm": 0.7785214781761169, "learning_rate": 0.0002, "epoch": 2.151806644571983, "step": 13310}, {"loss": 0.67, "grad_norm": 0.8049132227897644, "learning_rate": 0.0002, "epoch": 2.1534233287527282, "step": 13320}, {"loss": 0.6495, "grad_norm": 0.8062595129013062, "learning_rate": 0.0002, "epoch": 2.1550400129334735, "step": 13330}, {"loss": 0.6603, "grad_norm": 0.6208319067955017, "learning_rate": 0.0002, "epoch": 2.1566566971142187, "step": 13340}, {"loss": 0.6584, "grad_norm": 0.7519655823707581, "learning_rate": 0.0002, "epoch": 2.158273381294964, "step": 13350}, {"loss": 0.6457, "grad_norm": 0.7645747065544128, "learning_rate": 0.0002, "epoch": 2.159890065475709, "step": 13360}, {"loss": 0.645, "grad_norm": 0.6847302913665771, "learning_rate": 0.0002, "epoch": 2.1615067496564544, "step": 13370}, {"loss": 0.6903, "grad_norm": 0.8630441427230835, "learning_rate": 0.0002, "epoch": 2.1631234338372, "step": 13380}, {"loss": 0.6742, "grad_norm": 0.7947702407836914, "learning_rate": 0.0002, "epoch": 2.1647401180179453, "step": 13390}, {"loss": 0.7206, "grad_norm": 0.6836977005004883, "learning_rate": 0.0002, "epoch": 2.1663568021986905, "step": 13400}, {"loss": 0.6304, "grad_norm": 0.7340566515922546, "learning_rate": 0.0002, "epoch": 2.1679734863794358, "step": 13410}, {"loss": 0.6528, "grad_norm": 0.7075738906860352, "learning_rate": 0.0002, "epoch": 2.169590170560181, "step": 13420}, {"loss": 0.6585, "grad_norm": 0.7080879807472229, "learning_rate": 0.0002, "epoch": 2.1712068547409262, "step": 13430}, {"loss": 0.6615, "grad_norm": 0.6218613386154175, "learning_rate": 0.0002, "epoch": 2.1728235389216715, "step": 13440}, {"loss": 0.6488, "grad_norm": 0.8211479187011719, "learning_rate": 0.0002, "epoch": 2.174440223102417, "step": 13450}, {"loss": 0.6738, "grad_norm": 0.864466667175293, "learning_rate": 0.0002, "epoch": 2.1760569072831624, "step": 13460}, {"loss": 0.679, "grad_norm": 0.7943857908248901, "learning_rate": 0.0002, "epoch": 2.1776735914639076, "step": 13470}, {"loss": 0.6838, "grad_norm": 0.78728187084198, "learning_rate": 0.0002, "epoch": 2.179290275644653, "step": 13480}, {"loss": 0.6397, "grad_norm": 0.697527289390564, "learning_rate": 0.0002, "epoch": 2.180906959825398, "step": 13490}, {"loss": 0.669, "grad_norm": 0.8205804228782654, "learning_rate": 0.0002, "epoch": 2.1825236440061433, "step": 13500}, {"loss": 0.7227, "grad_norm": 0.8709042072296143, "learning_rate": 0.0002, "epoch": 2.1841403281868885, "step": 13510}, {"loss": 0.6313, "grad_norm": 0.6228537559509277, "learning_rate": 0.0002, "epoch": 2.1857570123676338, "step": 13520}, {"loss": 0.7025, "grad_norm": 0.9566980004310608, "learning_rate": 0.0002, "epoch": 2.1873736965483794, "step": 13530}, {"loss": 0.6755, "grad_norm": 0.7128894329071045, "learning_rate": 0.0002, "epoch": 2.1889903807291247, "step": 13540}, {"loss": 0.6827, "grad_norm": 0.6888654232025146, "learning_rate": 0.0002, "epoch": 2.19060706490987, "step": 13550}, {"loss": 0.6961, "grad_norm": 0.6444337368011475, "learning_rate": 0.0002, "epoch": 2.192223749090615, "step": 13560}, {"loss": 0.656, "grad_norm": 0.8008806705474854, "learning_rate": 0.0002, "epoch": 2.1938404332713604, "step": 13570}, {"loss": 0.7, "grad_norm": 0.8482748866081238, "learning_rate": 0.0002, "epoch": 2.1954571174521056, "step": 13580}, {"loss": 0.7326, "grad_norm": 0.8584157228469849, "learning_rate": 0.0002, "epoch": 2.197073801632851, "step": 13590}, {"loss": 0.7014, "grad_norm": 0.7513734698295593, "learning_rate": 0.0002, "epoch": 2.1986904858135965, "step": 13600}, {"loss": 0.6632, "grad_norm": 0.7864262461662292, "learning_rate": 0.0002, "epoch": 2.2003071699943417, "step": 13610}, {"loss": 0.6879, "grad_norm": 0.8493645191192627, "learning_rate": 0.0002, "epoch": 2.201923854175087, "step": 13620}, {"loss": 0.6617, "grad_norm": 0.6902140974998474, "learning_rate": 0.0002, "epoch": 2.203540538355832, "step": 13630}, {"loss": 0.6655, "grad_norm": 0.8711254596710205, "learning_rate": 0.0002, "epoch": 2.2051572225365774, "step": 13640}, {"loss": 0.6359, "grad_norm": 0.7832191586494446, "learning_rate": 0.0002, "epoch": 2.2067739067173227, "step": 13650}, {"loss": 0.6723, "grad_norm": 0.5668176412582397, "learning_rate": 0.0002, "epoch": 2.208390590898068, "step": 13660}, {"loss": 0.635, "grad_norm": 0.8648375272750854, "learning_rate": 0.0002, "epoch": 2.2100072750788136, "step": 13670}, {"loss": 0.653, "grad_norm": 0.7643089890480042, "learning_rate": 0.0002, "epoch": 2.211623959259559, "step": 13680}, {"loss": 0.6765, "grad_norm": 0.6293777823448181, "learning_rate": 0.0002, "epoch": 2.213240643440304, "step": 13690}, {"loss": 0.6842, "grad_norm": 0.6459372639656067, "learning_rate": 0.0002, "epoch": 2.2148573276210493, "step": 13700}, {"loss": 0.6526, "grad_norm": 0.7060744166374207, "learning_rate": 0.0002, "epoch": 2.2164740118017945, "step": 13710}, {"loss": 0.7101, "grad_norm": 0.674109160900116, "learning_rate": 0.0002, "epoch": 2.2180906959825397, "step": 13720}, {"loss": 0.6529, "grad_norm": 0.830392062664032, "learning_rate": 0.0002, "epoch": 2.219707380163285, "step": 13730}, {"loss": 0.6733, "grad_norm": 0.6474477052688599, "learning_rate": 0.0002, "epoch": 2.2213240643440306, "step": 13740}, {"loss": 0.6413, "grad_norm": 0.7037909626960754, "learning_rate": 0.0002, "epoch": 2.222940748524776, "step": 13750}, {"loss": 0.6417, "grad_norm": 0.6554131507873535, "learning_rate": 0.0002, "epoch": 2.224557432705521, "step": 13760}, {"loss": 0.6907, "grad_norm": 0.7822230458259583, "learning_rate": 0.0002, "epoch": 2.2261741168862663, "step": 13770}, {"loss": 0.6505, "grad_norm": 0.9082167744636536, "learning_rate": 0.0002, "epoch": 2.2277908010670116, "step": 13780}, {"loss": 0.6878, "grad_norm": 0.7918276190757751, "learning_rate": 0.0002, "epoch": 2.229407485247757, "step": 13790}, {"loss": 0.6669, "grad_norm": 0.7354569435119629, "learning_rate": 0.0002, "epoch": 2.231024169428502, "step": 13800}, {"loss": 0.6503, "grad_norm": 0.8265249133110046, "learning_rate": 0.0002, "epoch": 2.2326408536092472, "step": 13810}, {"loss": 0.6871, "grad_norm": 0.6653847098350525, "learning_rate": 0.0002, "epoch": 2.234257537789993, "step": 13820}, {"loss": 0.6413, "grad_norm": 0.7157923579216003, "learning_rate": 0.0002, "epoch": 2.235874221970738, "step": 13830}, {"loss": 0.6306, "grad_norm": 0.7110323309898376, "learning_rate": 0.0002, "epoch": 2.2374909061514834, "step": 13840}, {"loss": 0.6913, "grad_norm": 0.7155357599258423, "learning_rate": 0.0002, "epoch": 2.2391075903322286, "step": 13850}, {"loss": 0.6579, "grad_norm": 1.0177817344665527, "learning_rate": 0.0002, "epoch": 2.240724274512974, "step": 13860}, {"loss": 0.635, "grad_norm": 0.7601948380470276, "learning_rate": 0.0002, "epoch": 2.242340958693719, "step": 13870}, {"loss": 0.6679, "grad_norm": 0.7628820538520813, "learning_rate": 0.0002, "epoch": 2.2439576428744643, "step": 13880}, {"loss": 0.6805, "grad_norm": 0.7089297771453857, "learning_rate": 0.0002, "epoch": 2.24557432705521, "step": 13890}, {"loss": 0.7236, "grad_norm": 0.695178210735321, "learning_rate": 0.0002, "epoch": 2.247191011235955, "step": 13900}, {"loss": 0.7084, "grad_norm": 0.7631948590278625, "learning_rate": 0.0002, "epoch": 2.2488076954167004, "step": 13910}, {"loss": 0.685, "grad_norm": 0.8203101754188538, "learning_rate": 0.0002, "epoch": 2.2504243795974457, "step": 13920}, {"loss": 0.653, "grad_norm": 0.8099079728126526, "learning_rate": 0.0002, "epoch": 2.252041063778191, "step": 13930}, {"loss": 0.694, "grad_norm": 0.6498546004295349, "learning_rate": 0.0002, "epoch": 2.253657747958936, "step": 13940}, {"loss": 0.6684, "grad_norm": 0.7797415256500244, "learning_rate": 0.0002, "epoch": 2.2552744321396814, "step": 13950}, {"loss": 0.683, "grad_norm": 0.8254124522209167, "learning_rate": 0.0002, "epoch": 2.2568911163204266, "step": 13960}, {"loss": 0.6806, "grad_norm": 0.6327953338623047, "learning_rate": 0.0002, "epoch": 2.2585078005011723, "step": 13970}, {"loss": 0.668, "grad_norm": 0.734194278717041, "learning_rate": 0.0002, "epoch": 2.2601244846819175, "step": 13980}, {"loss": 0.6912, "grad_norm": 0.9014202952384949, "learning_rate": 0.0002, "epoch": 2.2617411688626627, "step": 13990}, {"loss": 0.692, "grad_norm": 0.7643631100654602, "learning_rate": 0.0002, "epoch": 2.263357853043408, "step": 14000}, {"loss": 0.6657, "grad_norm": 0.8882834911346436, "learning_rate": 0.0002, "epoch": 2.264974537224153, "step": 14010}, {"loss": 0.6453, "grad_norm": 0.7975873351097107, "learning_rate": 0.0002, "epoch": 2.2665912214048984, "step": 14020}, {"loss": 0.7193, "grad_norm": 0.7765783071517944, "learning_rate": 0.0002, "epoch": 2.2682079055856437, "step": 14030}, {"loss": 0.662, "grad_norm": 0.8846288323402405, "learning_rate": 0.0002, "epoch": 2.2698245897663893, "step": 14040}, {"loss": 0.6494, "grad_norm": 0.9006744027137756, "learning_rate": 0.0002, "epoch": 2.2714412739471346, "step": 14050}, {"loss": 0.6423, "grad_norm": 0.7420173287391663, "learning_rate": 0.0002, "epoch": 2.27305795812788, "step": 14060}, {"loss": 0.7068, "grad_norm": 0.7956424951553345, "learning_rate": 0.0002, "epoch": 2.274674642308625, "step": 14070}, {"loss": 0.6581, "grad_norm": 0.7783209085464478, "learning_rate": 0.0002, "epoch": 2.2762913264893703, "step": 14080}, {"loss": 0.7202, "grad_norm": 0.7597188949584961, "learning_rate": 0.0002, "epoch": 2.2779080106701155, "step": 14090}, {"loss": 0.6778, "grad_norm": 0.6718921661376953, "learning_rate": 0.0002, "epoch": 2.2795246948508607, "step": 14100}, {"loss": 0.632, "grad_norm": 0.7528082132339478, "learning_rate": 0.0002, "epoch": 2.281141379031606, "step": 14110}, {"loss": 0.7608, "grad_norm": 0.8379864692687988, "learning_rate": 0.0002, "epoch": 2.2827580632123516, "step": 14120}, {"loss": 0.6767, "grad_norm": 0.748613715171814, "learning_rate": 0.0002, "epoch": 2.284374747393097, "step": 14130}, {"loss": 0.6641, "grad_norm": 0.7435423135757446, "learning_rate": 0.0002, "epoch": 2.285991431573842, "step": 14140}, {"loss": 0.6849, "grad_norm": 0.7580803632736206, "learning_rate": 0.0002, "epoch": 2.2876081157545873, "step": 14150}, {"loss": 0.6604, "grad_norm": 0.6278321146965027, "learning_rate": 0.0002, "epoch": 2.2892247999353326, "step": 14160}, {"loss": 0.6573, "grad_norm": 0.7663896083831787, "learning_rate": 0.0002, "epoch": 2.290841484116078, "step": 14170}, {"loss": 0.6655, "grad_norm": 0.9716812372207642, "learning_rate": 0.0002, "epoch": 2.292458168296823, "step": 14180}, {"loss": 0.7067, "grad_norm": 0.8993458151817322, "learning_rate": 0.0002, "epoch": 2.2940748524775687, "step": 14190}, {"loss": 0.6172, "grad_norm": 0.6156117916107178, "learning_rate": 0.0002, "epoch": 2.295691536658314, "step": 14200}, {"loss": 0.6318, "grad_norm": 0.8911278247833252, "learning_rate": 0.0002, "epoch": 2.297308220839059, "step": 14210}, {"loss": 0.6364, "grad_norm": 0.6422147154808044, "learning_rate": 0.0002, "epoch": 2.2989249050198044, "step": 14220}, {"loss": 0.6795, "grad_norm": 0.6866879463195801, "learning_rate": 0.0002, "epoch": 2.3005415892005496, "step": 14230}, {"loss": 0.6907, "grad_norm": 0.9297130107879639, "learning_rate": 0.0002, "epoch": 2.302158273381295, "step": 14240}, {"loss": 0.6823, "grad_norm": 0.7501356601715088, "learning_rate": 0.0002, "epoch": 2.30377495756204, "step": 14250}, {"loss": 0.6414, "grad_norm": 0.8363515138626099, "learning_rate": 0.0002, "epoch": 2.3053916417427853, "step": 14260}, {"loss": 0.6362, "grad_norm": 0.9083868265151978, "learning_rate": 0.0002, "epoch": 2.307008325923531, "step": 14270}, {"loss": 0.6862, "grad_norm": 0.7791516780853271, "learning_rate": 0.0002, "epoch": 2.3086250101042762, "step": 14280}, {"loss": 0.6569, "grad_norm": 0.8766953349113464, "learning_rate": 0.0002, "epoch": 2.3102416942850215, "step": 14290}, {"loss": 0.6698, "grad_norm": 0.7916635274887085, "learning_rate": 0.0002, "epoch": 2.3118583784657667, "step": 14300}, {"loss": 0.6927, "grad_norm": 0.627525269985199, "learning_rate": 0.0002, "epoch": 2.313475062646512, "step": 14310}, {"loss": 0.6541, "grad_norm": 0.8856783509254456, "learning_rate": 0.0002, "epoch": 2.315091746827257, "step": 14320}, {"loss": 0.6806, "grad_norm": 0.6758689284324646, "learning_rate": 0.0002, "epoch": 2.316708431008003, "step": 14330}, {"loss": 0.6794, "grad_norm": 0.6428321003913879, "learning_rate": 0.0002, "epoch": 2.318325115188748, "step": 14340}, {"loss": 0.682, "grad_norm": 0.9032121300697327, "learning_rate": 0.0002, "epoch": 2.3199417993694933, "step": 14350}, {"loss": 0.6569, "grad_norm": 0.8035986423492432, "learning_rate": 0.0002, "epoch": 2.3215584835502385, "step": 14360}, {"loss": 0.7067, "grad_norm": 0.7974579334259033, "learning_rate": 0.0002, "epoch": 2.3231751677309838, "step": 14370}, {"loss": 0.6451, "grad_norm": 0.8356034755706787, "learning_rate": 0.0002, "epoch": 2.324791851911729, "step": 14380}, {"loss": 0.6623, "grad_norm": 0.998760998249054, "learning_rate": 0.0002, "epoch": 2.326408536092474, "step": 14390}, {"loss": 0.649, "grad_norm": 0.6518142223358154, "learning_rate": 0.0002, "epoch": 2.3280252202732195, "step": 14400}, {"loss": 0.7146, "grad_norm": 0.7443506717681885, "learning_rate": 0.0002, "epoch": 2.3296419044539647, "step": 14410}, {"loss": 0.648, "grad_norm": 0.8436172604560852, "learning_rate": 0.0002, "epoch": 2.3312585886347104, "step": 14420}, {"loss": 0.6585, "grad_norm": 0.7411080598831177, "learning_rate": 0.0002, "epoch": 2.3328752728154556, "step": 14430}, {"loss": 0.6781, "grad_norm": 0.8839048743247986, "learning_rate": 0.0002, "epoch": 2.334491956996201, "step": 14440}, {"loss": 0.6565, "grad_norm": 0.8360885977745056, "learning_rate": 0.0002, "epoch": 2.336108641176946, "step": 14450}, {"loss": 0.6662, "grad_norm": 0.7608986496925354, "learning_rate": 0.0002, "epoch": 2.3377253253576913, "step": 14460}, {"loss": 0.6685, "grad_norm": 0.8179867267608643, "learning_rate": 0.0002, "epoch": 2.3393420095384365, "step": 14470}, {"loss": 0.7055, "grad_norm": 0.5989999771118164, "learning_rate": 0.0002, "epoch": 2.340958693719182, "step": 14480}, {"loss": 0.644, "grad_norm": 0.9450054168701172, "learning_rate": 0.0002, "epoch": 2.3425753778999274, "step": 14490}, {"loss": 0.6983, "grad_norm": 0.7885149717330933, "learning_rate": 0.0002, "epoch": 2.3441920620806727, "step": 14500}, {"loss": 0.6819, "grad_norm": 0.8152616620063782, "learning_rate": 0.0002, "epoch": 2.345808746261418, "step": 14510}, {"loss": 0.6989, "grad_norm": 0.7193838953971863, "learning_rate": 0.0002, "epoch": 2.347425430442163, "step": 14520}, {"loss": 0.6594, "grad_norm": 0.6701092720031738, "learning_rate": 0.0002, "epoch": 2.3490421146229084, "step": 14530}, {"loss": 0.6559, "grad_norm": 0.7529364228248596, "learning_rate": 0.0002, "epoch": 2.3506587988036536, "step": 14540}, {"loss": 0.6306, "grad_norm": 0.6599733829498291, "learning_rate": 0.0002, "epoch": 2.352275482984399, "step": 14550}, {"loss": 0.706, "grad_norm": 0.9502474069595337, "learning_rate": 0.0002, "epoch": 2.353892167165144, "step": 14560}, {"loss": 0.717, "grad_norm": 0.7619650959968567, "learning_rate": 0.0002, "epoch": 2.3555088513458897, "step": 14570}, {"loss": 0.6684, "grad_norm": 0.9854652285575867, "learning_rate": 0.0002, "epoch": 2.357125535526635, "step": 14580}, {"loss": 0.6455, "grad_norm": 0.727439284324646, "learning_rate": 0.0002, "epoch": 2.35874221970738, "step": 14590}, {"loss": 0.6645, "grad_norm": 0.6994746327400208, "learning_rate": 0.0002, "epoch": 2.3603589038881254, "step": 14600}, {"loss": 0.6587, "grad_norm": 0.7117531299591064, "learning_rate": 0.0002, "epoch": 2.3619755880688706, "step": 14610}, {"loss": 0.6804, "grad_norm": 0.6403067708015442, "learning_rate": 0.0002, "epoch": 2.363592272249616, "step": 14620}, {"loss": 0.7055, "grad_norm": 0.8377841711044312, "learning_rate": 0.0002, "epoch": 2.3652089564303616, "step": 14630}, {"loss": 0.6778, "grad_norm": 0.749171257019043, "learning_rate": 0.0002, "epoch": 2.366825640611107, "step": 14640}, {"loss": 0.6552, "grad_norm": 0.8418586254119873, "learning_rate": 0.0002, "epoch": 2.368442324791852, "step": 14650}, {"loss": 0.6685, "grad_norm": 0.6178573369979858, "learning_rate": 0.0002, "epoch": 2.3700590089725972, "step": 14660}, {"loss": 0.6774, "grad_norm": 0.6368302702903748, "learning_rate": 0.0002, "epoch": 2.3716756931533425, "step": 14670}, {"loss": 0.6136, "grad_norm": 0.9122977256774902, "learning_rate": 0.0002, "epoch": 2.3732923773340877, "step": 14680}, {"loss": 0.6675, "grad_norm": 0.7086195349693298, "learning_rate": 0.0002, "epoch": 2.374909061514833, "step": 14690}, {"loss": 0.6582, "grad_norm": 0.7500800490379333, "learning_rate": 0.0002, "epoch": 2.376525745695578, "step": 14700}, {"loss": 0.6792, "grad_norm": 0.6634900569915771, "learning_rate": 0.0002, "epoch": 2.378142429876324, "step": 14710}, {"loss": 0.6614, "grad_norm": 0.839898407459259, "learning_rate": 0.0002, "epoch": 2.379759114057069, "step": 14720}, {"loss": 0.6453, "grad_norm": 0.7578426003456116, "learning_rate": 0.0002, "epoch": 2.3813757982378143, "step": 14730}, {"loss": 0.7282, "grad_norm": 1.0213173627853394, "learning_rate": 0.0002, "epoch": 2.3829924824185595, "step": 14740}, {"loss": 0.6704, "grad_norm": 0.7855949401855469, "learning_rate": 0.0002, "epoch": 2.3846091665993048, "step": 14750}, {"loss": 0.6694, "grad_norm": 0.7224128842353821, "learning_rate": 0.0002, "epoch": 2.38622585078005, "step": 14760}, {"loss": 0.7017, "grad_norm": 0.8040381669998169, "learning_rate": 0.0002, "epoch": 2.3878425349607952, "step": 14770}, {"loss": 0.6799, "grad_norm": 0.7705281376838684, "learning_rate": 0.0002, "epoch": 2.389459219141541, "step": 14780}, {"loss": 0.6326, "grad_norm": 0.667966902256012, "learning_rate": 0.0002, "epoch": 2.391075903322286, "step": 14790}, {"loss": 0.7061, "grad_norm": 0.6611011028289795, "learning_rate": 0.0002, "epoch": 2.3926925875030314, "step": 14800}, {"loss": 0.6527, "grad_norm": 0.6862651705741882, "learning_rate": 0.0002, "epoch": 2.3943092716837766, "step": 14810}, {"loss": 0.6537, "grad_norm": 0.8086010217666626, "learning_rate": 0.0002, "epoch": 2.395925955864522, "step": 14820}, {"loss": 0.7189, "grad_norm": 0.7189689874649048, "learning_rate": 0.0002, "epoch": 2.397542640045267, "step": 14830}, {"loss": 0.6709, "grad_norm": 0.6280009150505066, "learning_rate": 0.0002, "epoch": 2.3991593242260123, "step": 14840}, {"loss": 0.706, "grad_norm": 0.7826612591743469, "learning_rate": 0.0002, "epoch": 2.4007760084067575, "step": 14850}, {"loss": 0.6738, "grad_norm": 0.7681610584259033, "learning_rate": 0.0002, "epoch": 2.402392692587503, "step": 14860}, {"loss": 0.636, "grad_norm": 0.720966100692749, "learning_rate": 0.0002, "epoch": 2.4040093767682484, "step": 14870}, {"loss": 0.6667, "grad_norm": 0.8202250599861145, "learning_rate": 0.0002, "epoch": 2.4056260609489937, "step": 14880}, {"loss": 0.6935, "grad_norm": 0.786212682723999, "learning_rate": 0.0002, "epoch": 2.407242745129739, "step": 14890}, {"loss": 0.6628, "grad_norm": 0.6647164821624756, "learning_rate": 0.0002, "epoch": 2.408859429310484, "step": 14900}, {"loss": 0.6706, "grad_norm": 0.7566399574279785, "learning_rate": 0.0002, "epoch": 2.4104761134912294, "step": 14910}, {"loss": 0.7188, "grad_norm": 0.748814582824707, "learning_rate": 0.0002, "epoch": 2.4120927976719746, "step": 14920}, {"loss": 0.6684, "grad_norm": 0.7624038457870483, "learning_rate": 0.0002, "epoch": 2.4137094818527203, "step": 14930}, {"loss": 0.6483, "grad_norm": 0.8267335295677185, "learning_rate": 0.0002, "epoch": 2.4153261660334655, "step": 14940}, {"loss": 0.6612, "grad_norm": 0.8785360455513, "learning_rate": 0.0002, "epoch": 2.4169428502142107, "step": 14950}, {"loss": 0.6718, "grad_norm": 0.679887592792511, "learning_rate": 0.0002, "epoch": 2.418559534394956, "step": 14960}, {"loss": 0.6136, "grad_norm": 0.7218474745750427, "learning_rate": 0.0002, "epoch": 2.420176218575701, "step": 14970}, {"loss": 0.648, "grad_norm": 0.6342799663543701, "learning_rate": 0.0002, "epoch": 2.4217929027564464, "step": 14980}, {"loss": 0.6617, "grad_norm": 0.7098712921142578, "learning_rate": 0.0002, "epoch": 2.4234095869371917, "step": 14990}, {"loss": 0.6942, "grad_norm": 0.7497431635856628, "learning_rate": 0.0002, "epoch": 2.425026271117937, "step": 15000}, {"loss": 0.6772, "grad_norm": 0.934836208820343, "learning_rate": 0.0002, "epoch": 2.4266429552986826, "step": 15010}, {"loss": 0.7221, "grad_norm": 0.8430966734886169, "learning_rate": 0.0002, "epoch": 2.428259639479428, "step": 15020}, {"loss": 0.6985, "grad_norm": 0.7032104730606079, "learning_rate": 0.0002, "epoch": 2.429876323660173, "step": 15030}, {"loss": 0.6715, "grad_norm": 0.7746111750602722, "learning_rate": 0.0002, "epoch": 2.4314930078409183, "step": 15040}, {"loss": 0.7177, "grad_norm": 0.7661406397819519, "learning_rate": 0.0002, "epoch": 2.4331096920216635, "step": 15050}, {"loss": 0.6517, "grad_norm": 0.6941645741462708, "learning_rate": 0.0002, "epoch": 2.4347263762024087, "step": 15060}, {"loss": 0.6421, "grad_norm": 0.7487249374389648, "learning_rate": 0.0002, "epoch": 2.436343060383154, "step": 15070}, {"loss": 0.6796, "grad_norm": 0.7639912962913513, "learning_rate": 0.0002, "epoch": 2.4379597445638996, "step": 15080}, {"loss": 0.7087, "grad_norm": 0.7708953619003296, "learning_rate": 0.0002, "epoch": 2.439576428744645, "step": 15090}, {"loss": 0.7065, "grad_norm": 0.9135832190513611, "learning_rate": 0.0002, "epoch": 2.44119311292539, "step": 15100}, {"loss": 0.672, "grad_norm": 0.8283005356788635, "learning_rate": 0.0002, "epoch": 2.4428097971061353, "step": 15110}, {"loss": 0.6551, "grad_norm": 0.925299346446991, "learning_rate": 0.0002, "epoch": 2.4444264812868806, "step": 15120}, {"loss": 0.687, "grad_norm": 0.7013528943061829, "learning_rate": 0.0002, "epoch": 2.446043165467626, "step": 15130}, {"loss": 0.6842, "grad_norm": 0.622303307056427, "learning_rate": 0.0002, "epoch": 2.447659849648371, "step": 15140}, {"loss": 0.6676, "grad_norm": 0.876569390296936, "learning_rate": 0.0002, "epoch": 2.4492765338291163, "step": 15150}, {"loss": 0.6463, "grad_norm": 0.6836351752281189, "learning_rate": 0.0002, "epoch": 2.450893218009862, "step": 15160}, {"loss": 0.6781, "grad_norm": 0.7886684536933899, "learning_rate": 0.0002, "epoch": 2.452509902190607, "step": 15170}, {"loss": 0.6794, "grad_norm": 0.6647440791130066, "learning_rate": 0.0002, "epoch": 2.4541265863713524, "step": 15180}, {"loss": 0.6353, "grad_norm": 0.7477722764015198, "learning_rate": 0.0002, "epoch": 2.4557432705520976, "step": 15190}, {"loss": 0.698, "grad_norm": 0.8192033767700195, "learning_rate": 0.0002, "epoch": 2.457359954732843, "step": 15200}, {"loss": 0.6735, "grad_norm": 0.847537100315094, "learning_rate": 0.0002, "epoch": 2.458976638913588, "step": 15210}, {"loss": 0.6962, "grad_norm": 0.9027776122093201, "learning_rate": 0.0002, "epoch": 2.4605933230943338, "step": 15220}, {"loss": 0.7084, "grad_norm": 0.7217772006988525, "learning_rate": 0.0002, "epoch": 2.462210007275079, "step": 15230}, {"loss": 0.691, "grad_norm": 0.7994546294212341, "learning_rate": 0.0002, "epoch": 2.4638266914558242, "step": 15240}, {"loss": 0.6828, "grad_norm": 0.939916729927063, "learning_rate": 0.0002, "epoch": 2.4654433756365695, "step": 15250}, {"loss": 0.6893, "grad_norm": 1.0009053945541382, "learning_rate": 0.0002, "epoch": 2.4670600598173147, "step": 15260}, {"loss": 0.643, "grad_norm": 0.625555694103241, "learning_rate": 0.0002, "epoch": 2.46867674399806, "step": 15270}, {"loss": 0.688, "grad_norm": 0.7924878597259521, "learning_rate": 0.0002, "epoch": 2.470293428178805, "step": 15280}, {"loss": 0.6789, "grad_norm": 0.8536689877510071, "learning_rate": 0.0002, "epoch": 2.4719101123595504, "step": 15290}, {"loss": 0.6924, "grad_norm": 0.8572589755058289, "learning_rate": 0.0002, "epoch": 2.4735267965402956, "step": 15300}, {"loss": 0.604, "grad_norm": 0.773279070854187, "learning_rate": 0.0002, "epoch": 2.4751434807210413, "step": 15310}, {"loss": 0.6573, "grad_norm": 0.7708749771118164, "learning_rate": 0.0002, "epoch": 2.4767601649017865, "step": 15320}, {"loss": 0.7065, "grad_norm": 0.770905077457428, "learning_rate": 0.0002, "epoch": 2.4783768490825318, "step": 15330}, {"loss": 0.6878, "grad_norm": 0.8238571882247925, "learning_rate": 0.0002, "epoch": 2.479993533263277, "step": 15340}, {"loss": 0.6772, "grad_norm": 0.7670477032661438, "learning_rate": 0.0002, "epoch": 2.481610217444022, "step": 15350}, {"loss": 0.7759, "grad_norm": 0.905036985874176, "learning_rate": 0.0002, "epoch": 2.4832269016247674, "step": 15360}, {"loss": 0.706, "grad_norm": 0.6672089695930481, "learning_rate": 0.0002, "epoch": 2.484843585805513, "step": 15370}, {"loss": 0.6722, "grad_norm": 0.625095784664154, "learning_rate": 0.0002, "epoch": 2.4864602699862584, "step": 15380}, {"loss": 0.6396, "grad_norm": 0.679772675037384, "learning_rate": 0.0002, "epoch": 2.4880769541670036, "step": 15390}, {"loss": 0.6778, "grad_norm": 0.711492121219635, "learning_rate": 0.0002, "epoch": 2.489693638347749, "step": 15400}, {"loss": 0.6966, "grad_norm": 0.876189112663269, "learning_rate": 0.0002, "epoch": 2.491310322528494, "step": 15410}, {"loss": 0.7307, "grad_norm": 0.7236915230751038, "learning_rate": 0.0002, "epoch": 2.4929270067092393, "step": 15420}, {"loss": 0.647, "grad_norm": 0.6629832983016968, "learning_rate": 0.0002, "epoch": 2.4945436908899845, "step": 15430}, {"loss": 0.6669, "grad_norm": 0.9756859540939331, "learning_rate": 0.0002, "epoch": 2.4961603750707297, "step": 15440}, {"loss": 0.7559, "grad_norm": 0.6896940469741821, "learning_rate": 0.0002, "epoch": 2.4977770592514754, "step": 15450}, {"loss": 0.6818, "grad_norm": 0.7105149626731873, "learning_rate": 0.0002, "epoch": 2.4993937434322206, "step": 15460}, {"loss": 0.6859, "grad_norm": 0.8374546766281128, "learning_rate": 0.0002, "epoch": 2.501010427612966, "step": 15470}, {"loss": 0.6512, "grad_norm": 0.7320070266723633, "learning_rate": 0.0002, "epoch": 2.502627111793711, "step": 15480}, {"loss": 0.685, "grad_norm": 0.8306367993354797, "learning_rate": 0.0002, "epoch": 2.5042437959744563, "step": 15490}, {"loss": 0.7253, "grad_norm": 0.7472721338272095, "learning_rate": 0.0002, "epoch": 2.5058604801552016, "step": 15500}, {"loss": 0.6699, "grad_norm": 0.6147692203521729, "learning_rate": 0.0002, "epoch": 2.507477164335947, "step": 15510}, {"loss": 0.7158, "grad_norm": 0.7788505554199219, "learning_rate": 0.0002, "epoch": 2.5090938485166925, "step": 15520}, {"loss": 0.6521, "grad_norm": 0.8807527422904968, "learning_rate": 0.0002, "epoch": 2.5107105326974377, "step": 15530}, {"loss": 0.6792, "grad_norm": 0.7521643042564392, "learning_rate": 0.0002, "epoch": 2.512327216878183, "step": 15540}, {"loss": 0.6772, "grad_norm": 0.6900225281715393, "learning_rate": 0.0002, "epoch": 2.513943901058928, "step": 15550}, {"loss": 0.6769, "grad_norm": 0.6601938605308533, "learning_rate": 0.0002, "epoch": 2.5155605852396734, "step": 15560}, {"loss": 0.6648, "grad_norm": 0.8179984092712402, "learning_rate": 0.0002, "epoch": 2.5171772694204186, "step": 15570}, {"loss": 0.7028, "grad_norm": 0.792556881904602, "learning_rate": 0.0002, "epoch": 2.518793953601164, "step": 15580}, {"loss": 0.6464, "grad_norm": 0.7081938982009888, "learning_rate": 0.0002, "epoch": 2.520410637781909, "step": 15590}, {"loss": 0.6691, "grad_norm": 0.8733121156692505, "learning_rate": 0.0002, "epoch": 2.5220273219626543, "step": 15600}, {"loss": 0.6969, "grad_norm": 0.7980992794036865, "learning_rate": 0.0002, "epoch": 2.5236440061434, "step": 15610}, {"loss": 0.7124, "grad_norm": 0.883664071559906, "learning_rate": 0.0002, "epoch": 2.5252606903241452, "step": 15620}, {"loss": 0.7022, "grad_norm": 0.6963341236114502, "learning_rate": 0.0002, "epoch": 2.5268773745048905, "step": 15630}, {"loss": 0.7334, "grad_norm": 0.6433573365211487, "learning_rate": 0.0002, "epoch": 2.5284940586856357, "step": 15640}, {"loss": 0.6889, "grad_norm": 0.8538183569908142, "learning_rate": 0.0002, "epoch": 2.530110742866381, "step": 15650}, {"loss": 0.6841, "grad_norm": 0.9748201370239258, "learning_rate": 0.0002, "epoch": 2.5317274270471266, "step": 15660}, {"loss": 0.6765, "grad_norm": 0.7670575380325317, "learning_rate": 0.0002, "epoch": 2.533344111227872, "step": 15670}, {"loss": 0.6435, "grad_norm": 0.8738890290260315, "learning_rate": 0.0002, "epoch": 2.534960795408617, "step": 15680}, {"loss": 0.6802, "grad_norm": 0.8391636610031128, "learning_rate": 0.0002, "epoch": 2.5365774795893623, "step": 15690}, {"loss": 0.6901, "grad_norm": 0.7239366769790649, "learning_rate": 0.0002, "epoch": 2.5381941637701075, "step": 15700}, {"loss": 0.7011, "grad_norm": 0.8498379588127136, "learning_rate": 0.0002, "epoch": 2.5398108479508528, "step": 15710}, {"loss": 0.6998, "grad_norm": 0.8029484152793884, "learning_rate": 0.0002, "epoch": 2.541427532131598, "step": 15720}, {"loss": 0.6678, "grad_norm": 1.0639333724975586, "learning_rate": 0.0002, "epoch": 2.5430442163123432, "step": 15730}, {"loss": 0.6341, "grad_norm": 0.6401297450065613, "learning_rate": 0.0002, "epoch": 2.5446609004930885, "step": 15740}, {"loss": 0.7196, "grad_norm": 0.7123814821243286, "learning_rate": 0.0002, "epoch": 2.5462775846738337, "step": 15750}, {"loss": 0.654, "grad_norm": 0.7874974608421326, "learning_rate": 0.0002, "epoch": 2.5478942688545794, "step": 15760}, {"loss": 0.6721, "grad_norm": 0.8046808838844299, "learning_rate": 0.0002, "epoch": 2.5495109530353246, "step": 15770}, {"loss": 0.6665, "grad_norm": 0.7888661623001099, "learning_rate": 0.0002, "epoch": 2.55112763721607, "step": 15780}, {"loss": 0.6893, "grad_norm": 0.8445866107940674, "learning_rate": 0.0002, "epoch": 2.552744321396815, "step": 15790}, {"loss": 0.6815, "grad_norm": 0.7475846409797668, "learning_rate": 0.0002, "epoch": 2.5543610055775603, "step": 15800}, {"loss": 0.6711, "grad_norm": 0.7455102801322937, "learning_rate": 0.0002, "epoch": 2.555977689758306, "step": 15810}, {"loss": 0.6932, "grad_norm": 0.8226983547210693, "learning_rate": 0.0002, "epoch": 2.557594373939051, "step": 15820}, {"loss": 0.651, "grad_norm": 0.8920368552207947, "learning_rate": 0.0002, "epoch": 2.5592110581197964, "step": 15830}, {"loss": 0.6297, "grad_norm": 0.8413904905319214, "learning_rate": 0.0002, "epoch": 2.5608277423005417, "step": 15840}, {"loss": 0.7106, "grad_norm": 0.8483649492263794, "learning_rate": 0.0002, "epoch": 2.562444426481287, "step": 15850}, {"loss": 0.6957, "grad_norm": 0.5923284292221069, "learning_rate": 0.0002, "epoch": 2.564061110662032, "step": 15860}, {"loss": 0.6847, "grad_norm": 0.8518726229667664, "learning_rate": 0.0002, "epoch": 2.5656777948427774, "step": 15870}, {"loss": 0.6362, "grad_norm": 0.731235146522522, "learning_rate": 0.0002, "epoch": 2.5672944790235226, "step": 15880}, {"loss": 0.7611, "grad_norm": 0.7517194151878357, "learning_rate": 0.0002, "epoch": 2.568911163204268, "step": 15890}, {"loss": 0.6907, "grad_norm": 0.8378692269325256, "learning_rate": 0.0002, "epoch": 2.5705278473850135, "step": 15900}, {"loss": 0.7055, "grad_norm": 0.843701958656311, "learning_rate": 0.0002, "epoch": 2.5721445315657587, "step": 15910}, {"loss": 0.6882, "grad_norm": 0.7254629731178284, "learning_rate": 0.0002, "epoch": 2.573761215746504, "step": 15920}, {"loss": 0.6872, "grad_norm": 0.8863335847854614, "learning_rate": 0.0002, "epoch": 2.575377899927249, "step": 15930}, {"loss": 0.6813, "grad_norm": 0.7675097584724426, "learning_rate": 0.0002, "epoch": 2.5769945841079944, "step": 15940}, {"loss": 0.7357, "grad_norm": 0.82063889503479, "learning_rate": 0.0002, "epoch": 2.5786112682887397, "step": 15950}, {"loss": 0.662, "grad_norm": 0.7729717493057251, "learning_rate": 0.0002, "epoch": 2.5802279524694853, "step": 15960}, {"loss": 0.633, "grad_norm": 0.8301846981048584, "learning_rate": 0.0002, "epoch": 2.5818446366502306, "step": 15970}, {"loss": 0.6897, "grad_norm": 0.7906861305236816, "learning_rate": 0.0002, "epoch": 2.583461320830976, "step": 15980}, {"loss": 0.7175, "grad_norm": 0.6749057173728943, "learning_rate": 0.0002, "epoch": 2.585078005011721, "step": 15990}, {"loss": 0.7212, "grad_norm": 0.9386842846870422, "learning_rate": 0.0002, "epoch": 2.5866946891924663, "step": 16000}, {"loss": 0.6934, "grad_norm": 0.7868891358375549, "learning_rate": 0.0002, "epoch": 2.5883113733732115, "step": 16010}, {"loss": 0.7036, "grad_norm": 0.8674671053886414, "learning_rate": 0.0002, "epoch": 2.5899280575539567, "step": 16020}, {"loss": 0.7217, "grad_norm": 0.7043559551239014, "learning_rate": 0.0002, "epoch": 2.591544741734702, "step": 16030}, {"loss": 0.6967, "grad_norm": 0.5846083760261536, "learning_rate": 0.0002, "epoch": 2.593161425915447, "step": 16040}, {"loss": 0.7322, "grad_norm": 0.7323982119560242, "learning_rate": 0.0002, "epoch": 2.594778110096193, "step": 16050}, {"loss": 0.6794, "grad_norm": 0.9069556593894958, "learning_rate": 0.0002, "epoch": 2.596394794276938, "step": 16060}, {"loss": 0.7076, "grad_norm": 0.7522736191749573, "learning_rate": 0.0002, "epoch": 2.5980114784576833, "step": 16070}, {"loss": 0.6477, "grad_norm": 0.8149648308753967, "learning_rate": 0.0002, "epoch": 2.5996281626384286, "step": 16080}, {"loss": 0.6664, "grad_norm": 0.6214233040809631, "learning_rate": 0.0002, "epoch": 2.601244846819174, "step": 16090}, {"loss": 0.7307, "grad_norm": 0.6803743839263916, "learning_rate": 0.0002, "epoch": 2.602861530999919, "step": 16100}, {"loss": 0.7244, "grad_norm": 0.7223997116088867, "learning_rate": 0.0002, "epoch": 2.6044782151806647, "step": 16110}, {"loss": 0.6867, "grad_norm": 0.7324174642562866, "learning_rate": 0.0002, "epoch": 2.60609489936141, "step": 16120}, {"loss": 0.7159, "grad_norm": 0.9594739675521851, "learning_rate": 0.0002, "epoch": 2.607711583542155, "step": 16130}, {"loss": 0.6451, "grad_norm": 0.9485327005386353, "learning_rate": 0.0002, "epoch": 2.6093282677229004, "step": 16140}, {"loss": 0.6815, "grad_norm": 0.8449000120162964, "learning_rate": 0.0002, "epoch": 2.6109449519036456, "step": 16150}, {"loss": 0.7152, "grad_norm": 0.8520140051841736, "learning_rate": 0.0002, "epoch": 2.612561636084391, "step": 16160}, {"loss": 0.6759, "grad_norm": 0.7456524968147278, "learning_rate": 0.0002, "epoch": 2.614178320265136, "step": 16170}, {"loss": 0.6893, "grad_norm": 0.9912857413291931, "learning_rate": 0.0002, "epoch": 2.6157950044458813, "step": 16180}, {"loss": 0.7243, "grad_norm": 0.9001946449279785, "learning_rate": 0.0002, "epoch": 2.6174116886266265, "step": 16190}, {"loss": 0.6825, "grad_norm": 0.6568667888641357, "learning_rate": 0.0002, "epoch": 2.619028372807372, "step": 16200}, {"loss": 0.7013, "grad_norm": 1.0248128175735474, "learning_rate": 0.0002, "epoch": 2.6206450569881174, "step": 16210}, {"loss": 0.7045, "grad_norm": 0.6509039998054504, "learning_rate": 0.0002, "epoch": 2.6222617411688627, "step": 16220}, {"loss": 0.72, "grad_norm": 0.7626351118087769, "learning_rate": 0.0002, "epoch": 2.623878425349608, "step": 16230}, {"loss": 0.6556, "grad_norm": 0.6938552260398865, "learning_rate": 0.0002, "epoch": 2.625495109530353, "step": 16240}, {"loss": 0.65, "grad_norm": 0.6434680819511414, "learning_rate": 0.0002, "epoch": 2.6271117937110984, "step": 16250}, {"loss": 0.6943, "grad_norm": 0.7111515998840332, "learning_rate": 0.0002, "epoch": 2.628728477891844, "step": 16260}, {"loss": 0.679, "grad_norm": 0.7712395787239075, "learning_rate": 0.0002, "epoch": 2.6303451620725893, "step": 16270}, {"loss": 0.6886, "grad_norm": 0.792209267616272, "learning_rate": 0.0002, "epoch": 2.6319618462533345, "step": 16280}, {"loss": 0.6554, "grad_norm": 0.6801066398620605, "learning_rate": 0.0002, "epoch": 2.6335785304340797, "step": 16290}, {"loss": 0.73, "grad_norm": 0.7802573442459106, "learning_rate": 0.0002, "epoch": 2.635195214614825, "step": 16300}, {"loss": 0.7484, "grad_norm": 0.7742244601249695, "learning_rate": 0.0002, "epoch": 2.63681189879557, "step": 16310}, {"loss": 0.6524, "grad_norm": 0.664184033870697, "learning_rate": 0.0002, "epoch": 2.6384285829763154, "step": 16320}, {"loss": 0.6442, "grad_norm": 0.9242228865623474, "learning_rate": 0.0002, "epoch": 2.6400452671570607, "step": 16330}, {"loss": 0.6792, "grad_norm": 0.9661325216293335, "learning_rate": 0.0002, "epoch": 2.641661951337806, "step": 16340}, {"loss": 0.6847, "grad_norm": 0.837526798248291, "learning_rate": 0.0002, "epoch": 2.6432786355185516, "step": 16350}, {"loss": 0.7686, "grad_norm": 1.1834373474121094, "learning_rate": 0.0002, "epoch": 2.644895319699297, "step": 16360}, {"loss": 0.6746, "grad_norm": 0.7467831373214722, "learning_rate": 0.0002, "epoch": 2.646512003880042, "step": 16370}, {"loss": 0.6935, "grad_norm": 0.8627146482467651, "learning_rate": 0.0002, "epoch": 2.6481286880607873, "step": 16380}, {"loss": 0.715, "grad_norm": 0.790447473526001, "learning_rate": 0.0002, "epoch": 2.6497453722415325, "step": 16390}, {"loss": 0.723, "grad_norm": 0.8447365164756775, "learning_rate": 0.0002, "epoch": 2.651362056422278, "step": 16400}, {"loss": 0.6628, "grad_norm": 0.7831417918205261, "learning_rate": 0.0002, "epoch": 2.6529787406030234, "step": 16410}, {"loss": 0.6691, "grad_norm": 0.6837952136993408, "learning_rate": 0.0002, "epoch": 2.6545954247837686, "step": 16420}, {"loss": 0.6139, "grad_norm": 0.7031801342964172, "learning_rate": 0.0002, "epoch": 2.656212108964514, "step": 16430}, {"loss": 0.7382, "grad_norm": 0.8963770866394043, "learning_rate": 0.0002, "epoch": 2.657828793145259, "step": 16440}, {"loss": 0.6439, "grad_norm": 0.6852328181266785, "learning_rate": 0.0002, "epoch": 2.6594454773260043, "step": 16450}, {"loss": 0.6278, "grad_norm": 0.8069294095039368, "learning_rate": 0.0002, "epoch": 2.6610621615067496, "step": 16460}, {"loss": 0.6939, "grad_norm": 0.7503686547279358, "learning_rate": 0.0002, "epoch": 2.662678845687495, "step": 16470}, {"loss": 0.6777, "grad_norm": 0.6430956125259399, "learning_rate": 0.0002, "epoch": 2.66429552986824, "step": 16480}, {"loss": 0.6863, "grad_norm": 0.7894312739372253, "learning_rate": 0.0002, "epoch": 2.6659122140489853, "step": 16490}, {"loss": 0.7165, "grad_norm": 0.7277431488037109, "learning_rate": 0.0002, "epoch": 2.667528898229731, "step": 16500}, {"loss": 0.6772, "grad_norm": 0.6816153526306152, "learning_rate": 0.0002, "epoch": 2.669145582410476, "step": 16510}, {"loss": 0.691, "grad_norm": 0.8145235776901245, "learning_rate": 0.0002, "epoch": 2.6707622665912214, "step": 16520}, {"loss": 0.709, "grad_norm": 0.8645890355110168, "learning_rate": 0.0002, "epoch": 2.6723789507719666, "step": 16530}, {"loss": 0.6946, "grad_norm": 0.704393208026886, "learning_rate": 0.0002, "epoch": 2.673995634952712, "step": 16540}, {"loss": 0.6378, "grad_norm": 1.0120846033096313, "learning_rate": 0.0002, "epoch": 2.6756123191334575, "step": 16550}, {"loss": 0.7241, "grad_norm": 0.6919328570365906, "learning_rate": 0.0002, "epoch": 2.6772290033142028, "step": 16560}, {"loss": 0.7098, "grad_norm": 0.6924574971199036, "learning_rate": 0.0002, "epoch": 2.678845687494948, "step": 16570}, {"loss": 0.731, "grad_norm": 0.9679301381111145, "learning_rate": 0.0002, "epoch": 2.6804623716756932, "step": 16580}, {"loss": 0.7124, "grad_norm": 0.6810211539268494, "learning_rate": 0.0002, "epoch": 2.6820790558564385, "step": 16590}, {"loss": 0.6688, "grad_norm": 0.9730555415153503, "learning_rate": 0.0002, "epoch": 2.6836957400371837, "step": 16600}, {"loss": 0.7344, "grad_norm": 0.7852821350097656, "learning_rate": 0.0002, "epoch": 2.685312424217929, "step": 16610}, {"loss": 0.6401, "grad_norm": 0.6059057116508484, "learning_rate": 0.0002, "epoch": 2.686929108398674, "step": 16620}, {"loss": 0.6796, "grad_norm": 0.9395958781242371, "learning_rate": 0.0002, "epoch": 2.6885457925794194, "step": 16630}, {"loss": 0.7174, "grad_norm": 0.7473729848861694, "learning_rate": 0.0002, "epoch": 2.690162476760165, "step": 16640}, {"loss": 0.7087, "grad_norm": 0.765934407711029, "learning_rate": 0.0002, "epoch": 2.6917791609409103, "step": 16650}, {"loss": 0.707, "grad_norm": 0.8496677279472351, "learning_rate": 0.0002, "epoch": 2.6933958451216555, "step": 16660}, {"loss": 0.7084, "grad_norm": 0.7641879916191101, "learning_rate": 0.0002, "epoch": 2.6950125293024008, "step": 16670}, {"loss": 0.6566, "grad_norm": 0.8471952676773071, "learning_rate": 0.0002, "epoch": 2.696629213483146, "step": 16680}, {"loss": 0.6635, "grad_norm": 0.6946060657501221, "learning_rate": 0.0002, "epoch": 2.6982458976638912, "step": 16690}, {"loss": 0.7027, "grad_norm": 0.7361312508583069, "learning_rate": 0.0002, "epoch": 2.699862581844637, "step": 16700}, {"loss": 0.6767, "grad_norm": 0.6605038046836853, "learning_rate": 0.0002, "epoch": 2.701479266025382, "step": 16710}, {"loss": 0.6885, "grad_norm": 0.7164411544799805, "learning_rate": 0.0002, "epoch": 2.7030959502061274, "step": 16720}, {"loss": 0.6736, "grad_norm": 0.6496201157569885, "learning_rate": 0.0002, "epoch": 2.7047126343868726, "step": 16730}, {"loss": 0.6942, "grad_norm": 0.7826663851737976, "learning_rate": 0.0002, "epoch": 2.706329318567618, "step": 16740}, {"loss": 0.6773, "grad_norm": 0.7639131546020508, "learning_rate": 0.0002, "epoch": 2.707946002748363, "step": 16750}, {"loss": 0.69, "grad_norm": 0.7976210713386536, "learning_rate": 0.0002, "epoch": 2.7095626869291083, "step": 16760}, {"loss": 0.6735, "grad_norm": 0.6836577653884888, "learning_rate": 0.0002, "epoch": 2.7111793711098535, "step": 16770}, {"loss": 0.6596, "grad_norm": 0.8025202751159668, "learning_rate": 0.0002, "epoch": 2.7127960552905988, "step": 16780}, {"loss": 0.6324, "grad_norm": 0.7636463642120361, "learning_rate": 0.0002, "epoch": 2.7144127394713444, "step": 16790}, {"loss": 0.6227, "grad_norm": 0.7481677532196045, "learning_rate": 0.0002, "epoch": 2.7160294236520897, "step": 16800}, {"loss": 0.6925, "grad_norm": 0.7566834688186646, "learning_rate": 0.0002, "epoch": 2.717646107832835, "step": 16810}, {"loss": 0.6531, "grad_norm": 0.7931267619132996, "learning_rate": 0.0002, "epoch": 2.71926279201358, "step": 16820}, {"loss": 0.6672, "grad_norm": 0.8811662197113037, "learning_rate": 0.0002, "epoch": 2.7208794761943254, "step": 16830}, {"loss": 0.6675, "grad_norm": 0.8561240434646606, "learning_rate": 0.0002, "epoch": 2.7224961603750706, "step": 16840}, {"loss": 0.7135, "grad_norm": 0.7121599316596985, "learning_rate": 0.0002, "epoch": 2.7241128445558163, "step": 16850}, {"loss": 0.6825, "grad_norm": 0.8066257238388062, "learning_rate": 0.0002, "epoch": 2.7257295287365615, "step": 16860}, {"loss": 0.6839, "grad_norm": 0.7699271440505981, "learning_rate": 0.0002, "epoch": 2.7273462129173067, "step": 16870}, {"loss": 0.699, "grad_norm": 1.1828432083129883, "learning_rate": 0.0002, "epoch": 2.728962897098052, "step": 16880}, {"loss": 0.6518, "grad_norm": 0.9989302754402161, "learning_rate": 0.0002, "epoch": 2.730579581278797, "step": 16890}, {"loss": 0.7015, "grad_norm": 0.8100560307502747, "learning_rate": 0.0002, "epoch": 2.7321962654595424, "step": 16900}, {"loss": 0.6851, "grad_norm": 0.8615233898162842, "learning_rate": 0.0002, "epoch": 2.7338129496402876, "step": 16910}, {"loss": 0.6322, "grad_norm": 0.8633756041526794, "learning_rate": 0.0002, "epoch": 2.735429633821033, "step": 16920}, {"loss": 0.6488, "grad_norm": 0.7769348621368408, "learning_rate": 0.0002, "epoch": 2.737046318001778, "step": 16930}, {"loss": 0.6582, "grad_norm": 0.6943058371543884, "learning_rate": 0.0002, "epoch": 2.738663002182524, "step": 16940}, {"loss": 0.6516, "grad_norm": 0.8510736227035522, "learning_rate": 0.0002, "epoch": 2.740279686363269, "step": 16950}, {"loss": 0.7275, "grad_norm": 0.7732602953910828, "learning_rate": 0.0002, "epoch": 2.7418963705440142, "step": 16960}, {"loss": 0.6553, "grad_norm": 0.5981788635253906, "learning_rate": 0.0002, "epoch": 2.7435130547247595, "step": 16970}, {"loss": 0.6777, "grad_norm": 0.7604416012763977, "learning_rate": 0.0002, "epoch": 2.7451297389055047, "step": 16980}, {"loss": 0.6981, "grad_norm": 0.7377738356590271, "learning_rate": 0.0002, "epoch": 2.74674642308625, "step": 16990}, {"loss": 0.6294, "grad_norm": 0.9400289058685303, "learning_rate": 0.0002, "epoch": 2.7483631072669956, "step": 17000}, {"loss": 0.6952, "grad_norm": 0.6340599656105042, "learning_rate": 0.0002, "epoch": 2.749979791447741, "step": 17010}, {"loss": 0.7222, "grad_norm": 0.7297601103782654, "learning_rate": 0.0002, "epoch": 2.751596475628486, "step": 17020}, {"loss": 0.6659, "grad_norm": 0.9479979872703552, "learning_rate": 0.0002, "epoch": 2.7532131598092313, "step": 17030}, {"loss": 0.691, "grad_norm": 0.8461511135101318, "learning_rate": 0.0002, "epoch": 2.7548298439899765, "step": 17040}, {"loss": 0.6764, "grad_norm": 0.7477551698684692, "learning_rate": 0.0002, "epoch": 2.7564465281707218, "step": 17050}, {"loss": 0.684, "grad_norm": 1.019270420074463, "learning_rate": 0.0002, "epoch": 2.758063212351467, "step": 17060}, {"loss": 0.7119, "grad_norm": 0.7730235457420349, "learning_rate": 0.0002, "epoch": 2.7596798965322122, "step": 17070}, {"loss": 0.6886, "grad_norm": 0.8216866254806519, "learning_rate": 0.0002, "epoch": 2.7612965807129575, "step": 17080}, {"loss": 0.6811, "grad_norm": 0.7235931754112244, "learning_rate": 0.0002, "epoch": 2.762913264893703, "step": 17090}, {"loss": 0.7031, "grad_norm": 0.7352296710014343, "learning_rate": 0.0002, "epoch": 2.7645299490744484, "step": 17100}, {"loss": 0.6951, "grad_norm": 0.8129373788833618, "learning_rate": 0.0002, "epoch": 2.7661466332551936, "step": 17110}, {"loss": 0.6703, "grad_norm": 0.7387019991874695, "learning_rate": 0.0002, "epoch": 2.767763317435939, "step": 17120}, {"loss": 0.6789, "grad_norm": 0.9149190187454224, "learning_rate": 0.0002, "epoch": 2.769380001616684, "step": 17130}, {"loss": 0.6038, "grad_norm": 0.7352971434593201, "learning_rate": 0.0002, "epoch": 2.7709966857974297, "step": 17140}, {"loss": 0.6728, "grad_norm": 0.7903780341148376, "learning_rate": 0.0002, "epoch": 2.772613369978175, "step": 17150}, {"loss": 0.6988, "grad_norm": 0.8255927562713623, "learning_rate": 0.0002, "epoch": 2.77423005415892, "step": 17160}, {"loss": 0.6694, "grad_norm": 0.7235927581787109, "learning_rate": 0.0002, "epoch": 2.7758467383396654, "step": 17170}, {"loss": 0.7161, "grad_norm": 0.8281434774398804, "learning_rate": 0.0002, "epoch": 2.7774634225204107, "step": 17180}, {"loss": 0.682, "grad_norm": 0.7586921453475952, "learning_rate": 0.0002, "epoch": 2.779080106701156, "step": 17190}, {"loss": 0.6427, "grad_norm": 0.7161715030670166, "learning_rate": 0.0002, "epoch": 2.780696790881901, "step": 17200}, {"loss": 0.6426, "grad_norm": 0.762868344783783, "learning_rate": 0.0002, "epoch": 2.7823134750626464, "step": 17210}, {"loss": 0.705, "grad_norm": 0.9285483360290527, "learning_rate": 0.0002, "epoch": 2.7839301592433916, "step": 17220}, {"loss": 0.7084, "grad_norm": 0.6900462508201599, "learning_rate": 0.0002, "epoch": 2.785546843424137, "step": 17230}, {"loss": 0.6988, "grad_norm": 0.780384361743927, "learning_rate": 0.0002, "epoch": 2.7871635276048825, "step": 17240}, {"loss": 0.7073, "grad_norm": 0.7580406665802002, "learning_rate": 0.0002, "epoch": 2.7887802117856277, "step": 17250}, {"loss": 0.6833, "grad_norm": 0.8145199418067932, "learning_rate": 0.0002, "epoch": 2.790396895966373, "step": 17260}, {"loss": 0.6909, "grad_norm": 0.9159596562385559, "learning_rate": 0.0002, "epoch": 2.792013580147118, "step": 17270}, {"loss": 0.6008, "grad_norm": 0.9590014219284058, "learning_rate": 0.0002, "epoch": 2.7936302643278634, "step": 17280}, {"loss": 0.6704, "grad_norm": 0.7603529691696167, "learning_rate": 0.0002, "epoch": 2.795246948508609, "step": 17290}, {"loss": 0.7165, "grad_norm": 0.8039976358413696, "learning_rate": 0.0002, "epoch": 2.7968636326893543, "step": 17300}, {"loss": 0.7037, "grad_norm": 0.8364847302436829, "learning_rate": 0.0002, "epoch": 2.7984803168700996, "step": 17310}, {"loss": 0.6749, "grad_norm": 0.8763046860694885, "learning_rate": 0.0002, "epoch": 2.800097001050845, "step": 17320}, {"loss": 0.6844, "grad_norm": 0.8409647941589355, "learning_rate": 0.0002, "epoch": 2.80171368523159, "step": 17330}, {"loss": 0.6936, "grad_norm": 0.7649006247520447, "learning_rate": 0.0002, "epoch": 2.8033303694123353, "step": 17340}, {"loss": 0.7051, "grad_norm": 0.7970262169837952, "learning_rate": 0.0002, "epoch": 2.8049470535930805, "step": 17350}, {"loss": 0.6533, "grad_norm": 0.9088607430458069, "learning_rate": 0.0002, "epoch": 2.8065637377738257, "step": 17360}, {"loss": 0.675, "grad_norm": 0.6454846858978271, "learning_rate": 0.0002, "epoch": 2.808180421954571, "step": 17370}, {"loss": 0.7069, "grad_norm": 0.7744787931442261, "learning_rate": 0.0002, "epoch": 2.809797106135316, "step": 17380}, {"loss": 0.6772, "grad_norm": 0.6678640842437744, "learning_rate": 0.0002, "epoch": 2.811413790316062, "step": 17390}, {"loss": 0.6784, "grad_norm": 0.772676944732666, "learning_rate": 0.0002, "epoch": 2.813030474496807, "step": 17400}, {"loss": 0.7252, "grad_norm": 0.7088175415992737, "learning_rate": 0.0002, "epoch": 2.8146471586775523, "step": 17410}, {"loss": 0.7086, "grad_norm": 0.8280573487281799, "learning_rate": 0.0002, "epoch": 2.8162638428582976, "step": 17420}, {"loss": 0.6732, "grad_norm": 0.6665388345718384, "learning_rate": 0.0002, "epoch": 2.817880527039043, "step": 17430}, {"loss": 0.6675, "grad_norm": 0.6427883505821228, "learning_rate": 0.0002, "epoch": 2.8194972112197885, "step": 17440}, {"loss": 0.6972, "grad_norm": 0.9697760343551636, "learning_rate": 0.0002, "epoch": 2.8211138954005337, "step": 17450}, {"loss": 0.6838, "grad_norm": 0.7573966383934021, "learning_rate": 0.0002, "epoch": 2.822730579581279, "step": 17460}, {"loss": 0.7243, "grad_norm": 0.878688633441925, "learning_rate": 0.0002, "epoch": 2.824347263762024, "step": 17470}, {"loss": 0.6666, "grad_norm": 0.7752242684364319, "learning_rate": 0.0002, "epoch": 2.8259639479427694, "step": 17480}, {"loss": 0.6638, "grad_norm": 0.6135398745536804, "learning_rate": 0.0002, "epoch": 2.8275806321235146, "step": 17490}, {"loss": 0.6829, "grad_norm": 0.6924924850463867, "learning_rate": 0.0002, "epoch": 2.82919731630426, "step": 17500}, {"loss": 0.6731, "grad_norm": 0.7471627593040466, "learning_rate": 0.0002, "epoch": 2.830814000485005, "step": 17510}, {"loss": 0.7016, "grad_norm": 0.7145499587059021, "learning_rate": 0.0002, "epoch": 2.8324306846657503, "step": 17520}, {"loss": 0.6787, "grad_norm": 0.7415414452552795, "learning_rate": 0.0002, "epoch": 2.834047368846496, "step": 17530}, {"loss": 0.6811, "grad_norm": 0.7328441739082336, "learning_rate": 0.0002, "epoch": 2.8356640530272412, "step": 17540}, {"loss": 0.6866, "grad_norm": 0.8267839550971985, "learning_rate": 0.0002, "epoch": 2.8372807372079865, "step": 17550}, {"loss": 0.6787, "grad_norm": 0.8877885341644287, "learning_rate": 0.0002, "epoch": 2.8388974213887317, "step": 17560}, {"loss": 0.7136, "grad_norm": 0.857138454914093, "learning_rate": 0.0002, "epoch": 2.840514105569477, "step": 17570}, {"loss": 0.6454, "grad_norm": 0.8470779657363892, "learning_rate": 0.0002, "epoch": 2.842130789750222, "step": 17580}, {"loss": 0.6976, "grad_norm": 0.8553254008293152, "learning_rate": 0.0002, "epoch": 2.843747473930968, "step": 17590}, {"loss": 0.7297, "grad_norm": 0.8033196926116943, "learning_rate": 0.0002, "epoch": 2.845364158111713, "step": 17600}, {"loss": 0.7062, "grad_norm": 0.7949087023735046, "learning_rate": 0.0002, "epoch": 2.8469808422924583, "step": 17610}, {"loss": 0.651, "grad_norm": 0.9241406321525574, "learning_rate": 0.0002, "epoch": 2.8485975264732035, "step": 17620}, {"loss": 0.6601, "grad_norm": 0.7721285223960876, "learning_rate": 0.0002, "epoch": 2.8502142106539488, "step": 17630}, {"loss": 0.6183, "grad_norm": 1.0246692895889282, "learning_rate": 0.0002, "epoch": 2.851830894834694, "step": 17640}, {"loss": 0.7007, "grad_norm": 0.9244589805603027, "learning_rate": 0.0002, "epoch": 2.853447579015439, "step": 17650}, {"loss": 0.7274, "grad_norm": 0.7243508696556091, "learning_rate": 0.0002, "epoch": 2.8550642631961844, "step": 17660}, {"loss": 0.6471, "grad_norm": 0.8943371176719666, "learning_rate": 0.0002, "epoch": 2.8566809473769297, "step": 17670}, {"loss": 0.686, "grad_norm": 0.6531758904457092, "learning_rate": 0.0002, "epoch": 2.8582976315576754, "step": 17680}, {"loss": 0.6253, "grad_norm": 0.8367000818252563, "learning_rate": 0.0002, "epoch": 2.8599143157384206, "step": 17690}, {"loss": 0.6943, "grad_norm": 0.7868556380271912, "learning_rate": 0.0002, "epoch": 2.861530999919166, "step": 17700}, {"loss": 0.6919, "grad_norm": 0.7213859558105469, "learning_rate": 0.0002, "epoch": 2.863147684099911, "step": 17710}, {"loss": 0.6657, "grad_norm": 0.7383931279182434, "learning_rate": 0.0002, "epoch": 2.8647643682806563, "step": 17720}, {"loss": 0.6841, "grad_norm": 0.7566812634468079, "learning_rate": 0.0002, "epoch": 2.8663810524614015, "step": 17730}, {"loss": 0.6449, "grad_norm": 0.6930373311042786, "learning_rate": 0.0002, "epoch": 2.867997736642147, "step": 17740}, {"loss": 0.6764, "grad_norm": 0.7911090850830078, "learning_rate": 0.0002, "epoch": 2.8696144208228924, "step": 17750}, {"loss": 0.6554, "grad_norm": 0.8484548926353455, "learning_rate": 0.0002, "epoch": 2.8712311050036377, "step": 17760}, {"loss": 0.6931, "grad_norm": 0.7647597193717957, "learning_rate": 0.0002, "epoch": 2.872847789184383, "step": 17770}, {"loss": 0.6945, "grad_norm": 0.8791151642799377, "learning_rate": 0.0002, "epoch": 2.874464473365128, "step": 17780}, {"loss": 0.7078, "grad_norm": 0.7253178358078003, "learning_rate": 0.0002, "epoch": 2.8760811575458733, "step": 17790}, {"loss": 0.6474, "grad_norm": 0.7956077456474304, "learning_rate": 0.0002, "epoch": 2.8776978417266186, "step": 17800}, {"loss": 0.6687, "grad_norm": 0.8657688498497009, "learning_rate": 0.0002, "epoch": 2.879314525907364, "step": 17810}, {"loss": 0.7171, "grad_norm": 0.7059141993522644, "learning_rate": 0.0002, "epoch": 2.880931210088109, "step": 17820}, {"loss": 0.683, "grad_norm": 0.8886896967887878, "learning_rate": 0.0002, "epoch": 2.8825478942688547, "step": 17830}, {"loss": 0.669, "grad_norm": 0.821032702922821, "learning_rate": 0.0002, "epoch": 2.8841645784496, "step": 17840}, {"loss": 0.6805, "grad_norm": 0.7183963656425476, "learning_rate": 0.0002, "epoch": 2.885781262630345, "step": 17850}, {"loss": 0.7088, "grad_norm": 0.6222899556159973, "learning_rate": 0.0002, "epoch": 2.8873979468110904, "step": 17860}, {"loss": 0.6626, "grad_norm": 0.8187434077262878, "learning_rate": 0.0002, "epoch": 2.8890146309918356, "step": 17870}, {"loss": 0.6815, "grad_norm": 0.9838479161262512, "learning_rate": 0.0002, "epoch": 2.890631315172581, "step": 17880}, {"loss": 0.6967, "grad_norm": 0.7567742466926575, "learning_rate": 0.0002, "epoch": 2.8922479993533265, "step": 17890}, {"loss": 0.7073, "grad_norm": 0.6875903606414795, "learning_rate": 0.0002, "epoch": 2.893864683534072, "step": 17900}, {"loss": 0.6415, "grad_norm": 0.8043789267539978, "learning_rate": 0.0002, "epoch": 2.895481367714817, "step": 17910}, {"loss": 0.6588, "grad_norm": 0.8062626719474792, "learning_rate": 0.0002, "epoch": 2.8970980518955622, "step": 17920}, {"loss": 0.7151, "grad_norm": 1.0251191854476929, "learning_rate": 0.0002, "epoch": 2.8987147360763075, "step": 17930}, {"loss": 0.6605, "grad_norm": 0.882253110408783, "learning_rate": 0.0002, "epoch": 2.9003314202570527, "step": 17940}, {"loss": 0.6719, "grad_norm": 0.8683299422264099, "learning_rate": 0.0002, "epoch": 2.901948104437798, "step": 17950}, {"loss": 0.6896, "grad_norm": 0.7167282104492188, "learning_rate": 0.0002, "epoch": 2.903564788618543, "step": 17960}, {"loss": 0.663, "grad_norm": 0.7093694806098938, "learning_rate": 0.0002, "epoch": 2.9051814727992884, "step": 17970}, {"loss": 0.6591, "grad_norm": 0.8549879193305969, "learning_rate": 0.0002, "epoch": 2.906798156980034, "step": 17980}, {"loss": 0.6962, "grad_norm": 0.6989606618881226, "learning_rate": 0.0002, "epoch": 2.9084148411607793, "step": 17990}, {"loss": 0.6635, "grad_norm": 0.9482976794242859, "learning_rate": 0.0002, "epoch": 2.9100315253415245, "step": 18000}, {"loss": 0.6586, "grad_norm": 0.7182440161705017, "learning_rate": 0.0002, "epoch": 2.9116482095222698, "step": 18010}, {"loss": 0.6827, "grad_norm": 0.7732226252555847, "learning_rate": 0.0002, "epoch": 2.913264893703015, "step": 18020}, {"loss": 0.7123, "grad_norm": 0.7936875224113464, "learning_rate": 0.0002, "epoch": 2.9148815778837607, "step": 18030}, {"loss": 0.6736, "grad_norm": 0.8825615644454956, "learning_rate": 0.0002, "epoch": 2.916498262064506, "step": 18040}, {"loss": 0.7139, "grad_norm": 0.6778587102890015, "learning_rate": 0.0002, "epoch": 2.918114946245251, "step": 18050}, {"loss": 0.6588, "grad_norm": 0.7529265880584717, "learning_rate": 0.0002, "epoch": 2.9197316304259964, "step": 18060}, {"loss": 0.737, "grad_norm": 0.7111883163452148, "learning_rate": 0.0002, "epoch": 2.9213483146067416, "step": 18070}, {"loss": 0.7475, "grad_norm": 0.7214767932891846, "learning_rate": 0.0002, "epoch": 2.922964998787487, "step": 18080}, {"loss": 0.6672, "grad_norm": 0.800417423248291, "learning_rate": 0.0002, "epoch": 2.924581682968232, "step": 18090}, {"loss": 0.6694, "grad_norm": 1.248575210571289, "learning_rate": 0.0002, "epoch": 2.9261983671489773, "step": 18100}, {"loss": 0.7004, "grad_norm": 0.757788360118866, "learning_rate": 0.0002, "epoch": 2.9278150513297225, "step": 18110}, {"loss": 0.6999, "grad_norm": 1.0583995580673218, "learning_rate": 0.0002, "epoch": 2.9294317355104678, "step": 18120}, {"loss": 0.6365, "grad_norm": 0.8228777647018433, "learning_rate": 0.0002, "epoch": 2.9310484196912134, "step": 18130}, {"loss": 0.6791, "grad_norm": 0.8374035358428955, "learning_rate": 0.0002, "epoch": 2.9326651038719587, "step": 18140}, {"loss": 0.6399, "grad_norm": 0.7976473569869995, "learning_rate": 0.0002, "epoch": 2.934281788052704, "step": 18150}, {"loss": 0.6585, "grad_norm": 0.8009907603263855, "learning_rate": 0.0002, "epoch": 2.935898472233449, "step": 18160}, {"loss": 0.7485, "grad_norm": 0.835213303565979, "learning_rate": 0.0002, "epoch": 2.9375151564141944, "step": 18170}, {"loss": 0.7376, "grad_norm": 0.7982219457626343, "learning_rate": 0.0002, "epoch": 2.93913184059494, "step": 18180}, {"loss": 0.6348, "grad_norm": 0.7070978879928589, "learning_rate": 0.0002, "epoch": 2.9407485247756853, "step": 18190}, {"loss": 0.6608, "grad_norm": 0.8619440197944641, "learning_rate": 0.0002, "epoch": 2.9423652089564305, "step": 18200}, {"loss": 0.666, "grad_norm": 0.6693987250328064, "learning_rate": 0.0002, "epoch": 2.9439818931371757, "step": 18210}, {"loss": 0.728, "grad_norm": 0.6747021079063416, "learning_rate": 0.0002, "epoch": 2.945598577317921, "step": 18220}, {"loss": 0.6686, "grad_norm": 0.860387921333313, "learning_rate": 0.0002, "epoch": 2.947215261498666, "step": 18230}, {"loss": 0.6945, "grad_norm": 0.799976646900177, "learning_rate": 0.0002, "epoch": 2.9488319456794114, "step": 18240}, {"loss": 0.7243, "grad_norm": 0.7864769101142883, "learning_rate": 0.0002, "epoch": 2.9504486298601567, "step": 18250}, {"loss": 0.6785, "grad_norm": 0.6713884472846985, "learning_rate": 0.0002, "epoch": 2.952065314040902, "step": 18260}, {"loss": 0.7429, "grad_norm": 0.9031508564949036, "learning_rate": 0.0002, "epoch": 2.9536819982216476, "step": 18270}, {"loss": 0.7055, "grad_norm": 0.7205073237419128, "learning_rate": 0.0002, "epoch": 2.955298682402393, "step": 18280}, {"loss": 0.7298, "grad_norm": 0.7746205925941467, "learning_rate": 0.0002, "epoch": 2.956915366583138, "step": 18290}, {"loss": 0.6218, "grad_norm": 0.6533427834510803, "learning_rate": 0.0002, "epoch": 2.9585320507638833, "step": 18300}, {"loss": 0.6674, "grad_norm": 0.9083208441734314, "learning_rate": 0.0002, "epoch": 2.9601487349446285, "step": 18310}, {"loss": 0.7359, "grad_norm": 0.7446991801261902, "learning_rate": 0.0002, "epoch": 2.9617654191253737, "step": 18320}, {"loss": 0.6738, "grad_norm": 0.6514461636543274, "learning_rate": 0.0002, "epoch": 2.9633821033061194, "step": 18330}, {"loss": 0.6677, "grad_norm": 0.8580465912818909, "learning_rate": 0.0002, "epoch": 2.9649987874868646, "step": 18340}, {"loss": 0.6971, "grad_norm": 0.7074266076087952, "learning_rate": 0.0002, "epoch": 2.96661547166761, "step": 18350}, {"loss": 0.6804, "grad_norm": 0.899892270565033, "learning_rate": 0.0002, "epoch": 2.968232155848355, "step": 18360}, {"loss": 0.7094, "grad_norm": 0.8217641711235046, "learning_rate": 0.0002, "epoch": 2.9698488400291003, "step": 18370}, {"loss": 0.6916, "grad_norm": 0.8611799478530884, "learning_rate": 0.0002, "epoch": 2.9714655242098456, "step": 18380}, {"loss": 0.6677, "grad_norm": 0.6909302473068237, "learning_rate": 0.0002, "epoch": 2.973082208390591, "step": 18390}, {"loss": 0.7247, "grad_norm": 0.6554358005523682, "learning_rate": 0.0002, "epoch": 2.974698892571336, "step": 18400}, {"loss": 0.6516, "grad_norm": 0.7803071737289429, "learning_rate": 0.0002, "epoch": 2.9763155767520812, "step": 18410}, {"loss": 0.7322, "grad_norm": 0.7838954925537109, "learning_rate": 0.0002, "epoch": 2.977932260932827, "step": 18420}, {"loss": 0.6522, "grad_norm": 0.7098495364189148, "learning_rate": 0.0002, "epoch": 2.979548945113572, "step": 18430}, {"loss": 0.739, "grad_norm": 0.8981785774230957, "learning_rate": 0.0002, "epoch": 2.9811656292943174, "step": 18440}, {"loss": 0.6689, "grad_norm": 0.7197171449661255, "learning_rate": 0.0002, "epoch": 2.9827823134750626, "step": 18450}, {"loss": 0.706, "grad_norm": 0.793185293674469, "learning_rate": 0.0002, "epoch": 2.984398997655808, "step": 18460}, {"loss": 0.7124, "grad_norm": 0.8531473875045776, "learning_rate": 0.0002, "epoch": 2.986015681836553, "step": 18470}, {"loss": 0.6901, "grad_norm": 0.6627361178398132, "learning_rate": 0.0002, "epoch": 2.9876323660172988, "step": 18480}, {"loss": 0.6591, "grad_norm": 0.5708155035972595, "learning_rate": 0.0002, "epoch": 2.989249050198044, "step": 18490}, {"loss": 0.6725, "grad_norm": 0.8227280378341675, "learning_rate": 0.0002, "epoch": 2.990865734378789, "step": 18500}, {"loss": 0.6701, "grad_norm": 0.7102749943733215, "learning_rate": 0.0002, "epoch": 2.9924824185595345, "step": 18510}, {"loss": 0.7091, "grad_norm": 0.839485228061676, "learning_rate": 0.0002, "epoch": 2.9940991027402797, "step": 18520}, {"loss": 0.6521, "grad_norm": 0.9038704037666321, "learning_rate": 0.0002, "epoch": 2.995715786921025, "step": 18530}, {"loss": 0.7186, "grad_norm": 0.8737510442733765, "learning_rate": 0.0002, "epoch": 2.99733247110177, "step": 18540}, {"loss": 0.6819, "grad_norm": 0.7323142886161804, "learning_rate": 0.0002, "epoch": 2.9989491552825154, "step": 18550}, {"eval_loss": 1.1262480020523071, "eval_runtime": 122.0868, "eval_samples_per_second": 6.004, "eval_steps_per_second": 0.754, "epoch": 2.9999191657909625, "step": 18556}, {"loss": 0.6337, "grad_norm": 0.8465463519096375, "learning_rate": 0.0002, "epoch": 3.000565839463261, "step": 18560}, {"loss": 0.6064, "grad_norm": 0.9134138822555542, "learning_rate": 0.0002, "epoch": 3.0021825236440063, "step": 18570}, {"loss": 0.5804, "grad_norm": 0.760715126991272, "learning_rate": 0.0002, "epoch": 3.0037992078247515, "step": 18580}, {"loss": 0.5571, "grad_norm": 0.9208743572235107, "learning_rate": 0.0002, "epoch": 3.0054158920054967, "step": 18590}, {"loss": 0.5731, "grad_norm": 0.9232364892959595, "learning_rate": 0.0002, "epoch": 3.007032576186242, "step": 18600}, {"loss": 0.6299, "grad_norm": 1.1881544589996338, "learning_rate": 0.0002, "epoch": 3.008649260366987, "step": 18610}, {"loss": 0.5482, "grad_norm": 0.9372987747192383, "learning_rate": 0.0002, "epoch": 3.0102659445477324, "step": 18620}, {"loss": 0.5709, "grad_norm": 0.6900241374969482, "learning_rate": 0.0002, "epoch": 3.0118826287284777, "step": 18630}, {"loss": 0.5256, "grad_norm": 0.8451071381568909, "learning_rate": 0.0002, "epoch": 3.0134993129092233, "step": 18640}, {"loss": 0.5916, "grad_norm": 0.7763112187385559, "learning_rate": 0.0002, "epoch": 3.0151159970899686, "step": 18650}, {"loss": 0.6095, "grad_norm": 1.043653964996338, "learning_rate": 0.0002, "epoch": 3.016732681270714, "step": 18660}, {"loss": 0.6228, "grad_norm": 1.0170660018920898, "learning_rate": 0.0002, "epoch": 3.018349365451459, "step": 18670}, {"loss": 0.5671, "grad_norm": 0.7534180283546448, "learning_rate": 0.0002, "epoch": 3.0199660496322043, "step": 18680}, {"loss": 0.6015, "grad_norm": 0.7507367730140686, "learning_rate": 0.0002, "epoch": 3.0215827338129495, "step": 18690}, {"loss": 0.6201, "grad_norm": 0.7861620187759399, "learning_rate": 0.0002, "epoch": 3.0231994179936947, "step": 18700}, {"loss": 0.5802, "grad_norm": 1.0580339431762695, "learning_rate": 0.0002, "epoch": 3.0248161021744404, "step": 18710}, {"loss": 0.5975, "grad_norm": 0.7542710900306702, "learning_rate": 0.0002, "epoch": 3.0264327863551856, "step": 18720}, {"loss": 0.5695, "grad_norm": 0.8189544677734375, "learning_rate": 0.0002, "epoch": 3.028049470535931, "step": 18730}, {"loss": 0.6109, "grad_norm": 0.9126611351966858, "learning_rate": 0.0002, "epoch": 3.029666154716676, "step": 18740}, {"loss": 0.6443, "grad_norm": 0.8891341686248779, "learning_rate": 0.0002, "epoch": 3.0312828388974213, "step": 18750}, {"loss": 0.6207, "grad_norm": 0.8419283032417297, "learning_rate": 0.0002, "epoch": 3.0328995230781666, "step": 18760}, {"loss": 0.5818, "grad_norm": 0.8048048615455627, "learning_rate": 0.0002, "epoch": 3.034516207258912, "step": 18770}, {"loss": 0.6381, "grad_norm": 0.7820217609405518, "learning_rate": 0.0002, "epoch": 3.0361328914396575, "step": 18780}, {"loss": 0.5843, "grad_norm": 0.854721188545227, "learning_rate": 0.0002, "epoch": 3.0377495756204027, "step": 18790}, {"loss": 0.5784, "grad_norm": 0.912092924118042, "learning_rate": 0.0002, "epoch": 3.039366259801148, "step": 18800}, {"loss": 0.5734, "grad_norm": 0.6596226096153259, "learning_rate": 0.0002, "epoch": 3.040982943981893, "step": 18810}, {"loss": 0.5969, "grad_norm": 0.6351348757743835, "learning_rate": 0.0002, "epoch": 3.0425996281626384, "step": 18820}, {"loss": 0.5953, "grad_norm": 0.778188943862915, "learning_rate": 0.0002, "epoch": 3.0442163123433836, "step": 18830}, {"loss": 0.602, "grad_norm": 0.68234783411026, "learning_rate": 0.0002, "epoch": 3.045832996524129, "step": 18840}, {"loss": 0.5785, "grad_norm": 0.998628556728363, "learning_rate": 0.0002, "epoch": 3.047449680704874, "step": 18850}, {"loss": 0.6231, "grad_norm": 0.7393841743469238, "learning_rate": 0.0002, "epoch": 3.0490663648856198, "step": 18860}, {"loss": 0.568, "grad_norm": 0.84438556432724, "learning_rate": 0.0002, "epoch": 3.050683049066365, "step": 18870}, {"loss": 0.6205, "grad_norm": 0.8857501745223999, "learning_rate": 0.0002, "epoch": 3.0522997332471102, "step": 18880}, {"loss": 0.6335, "grad_norm": 0.7208474278450012, "learning_rate": 0.0002, "epoch": 3.0539164174278555, "step": 18890}, {"loss": 0.5998, "grad_norm": 0.7135229110717773, "learning_rate": 0.0002, "epoch": 3.0555331016086007, "step": 18900}, {"loss": 0.5575, "grad_norm": 0.9130001664161682, "learning_rate": 0.0002, "epoch": 3.057149785789346, "step": 18910}, {"loss": 0.5955, "grad_norm": 0.9001716375350952, "learning_rate": 0.0002, "epoch": 3.058766469970091, "step": 18920}, {"loss": 0.6052, "grad_norm": 0.8667559623718262, "learning_rate": 0.0002, "epoch": 3.060383154150837, "step": 18930}, {"loss": 0.5818, "grad_norm": 0.8943959474563599, "learning_rate": 0.0002, "epoch": 3.061999838331582, "step": 18940}, {"loss": 0.5978, "grad_norm": 0.8298377990722656, "learning_rate": 0.0002, "epoch": 3.0636165225123273, "step": 18950}, {"loss": 0.5782, "grad_norm": 0.7935267686843872, "learning_rate": 0.0002, "epoch": 3.0652332066930725, "step": 18960}, {"loss": 0.6434, "grad_norm": 1.1506379842758179, "learning_rate": 0.0002, "epoch": 3.0668498908738178, "step": 18970}, {"loss": 0.5571, "grad_norm": 0.7693049907684326, "learning_rate": 0.0002, "epoch": 3.068466575054563, "step": 18980}, {"loss": 0.5971, "grad_norm": 0.8040135502815247, "learning_rate": 0.0002, "epoch": 3.0700832592353082, "step": 18990}, {"loss": 0.5541, "grad_norm": 0.828404426574707, "learning_rate": 0.0002, "epoch": 3.0716999434160535, "step": 19000}, {"loss": 0.6048, "grad_norm": 0.8811164498329163, "learning_rate": 0.0002, "epoch": 3.073316627596799, "step": 19010}, {"loss": 0.5845, "grad_norm": 1.036205768585205, "learning_rate": 0.0002, "epoch": 3.0749333117775444, "step": 19020}, {"loss": 0.5838, "grad_norm": 0.8857285976409912, "learning_rate": 0.0002, "epoch": 3.0765499959582896, "step": 19030}, {"loss": 0.592, "grad_norm": 0.8392079472541809, "learning_rate": 0.0002, "epoch": 3.078166680139035, "step": 19040}, {"loss": 0.5927, "grad_norm": 1.0287401676177979, "learning_rate": 0.0002, "epoch": 3.07978336431978, "step": 19050}, {"loss": 0.5964, "grad_norm": 1.0086315870285034, "learning_rate": 0.0002, "epoch": 3.0814000485005253, "step": 19060}, {"loss": 0.5567, "grad_norm": 0.9245324730873108, "learning_rate": 0.0002, "epoch": 3.0830167326812705, "step": 19070}, {"loss": 0.5797, "grad_norm": 0.8680877089500427, "learning_rate": 0.0002, "epoch": 3.084633416862016, "step": 19080}, {"loss": 0.5611, "grad_norm": 0.8814793825149536, "learning_rate": 0.0002, "epoch": 3.0862501010427614, "step": 19090}, {"loss": 0.6051, "grad_norm": 0.9234458208084106, "learning_rate": 0.0002, "epoch": 3.0878667852235067, "step": 19100}, {"loss": 0.6209, "grad_norm": 1.1291664838790894, "learning_rate": 0.0002, "epoch": 3.089483469404252, "step": 19110}, {"loss": 0.5695, "grad_norm": 0.9191402792930603, "learning_rate": 0.0002, "epoch": 3.091100153584997, "step": 19120}, {"loss": 0.5856, "grad_norm": 0.7103154063224792, "learning_rate": 0.0002, "epoch": 3.0927168377657424, "step": 19130}, {"loss": 0.6479, "grad_norm": 0.9368883967399597, "learning_rate": 0.0002, "epoch": 3.0943335219464876, "step": 19140}, {"loss": 0.6167, "grad_norm": 0.9676656723022461, "learning_rate": 0.0002, "epoch": 3.095950206127233, "step": 19150}, {"loss": 0.5794, "grad_norm": 0.8739792704582214, "learning_rate": 0.0002, "epoch": 3.0975668903079785, "step": 19160}, {"loss": 0.6112, "grad_norm": 0.8530174493789673, "learning_rate": 0.0002, "epoch": 3.0991835744887237, "step": 19170}, {"loss": 0.6568, "grad_norm": 0.794945478439331, "learning_rate": 0.0002, "epoch": 3.100800258669469, "step": 19180}, {"loss": 0.5928, "grad_norm": 0.9508888125419617, "learning_rate": 0.0002, "epoch": 3.102416942850214, "step": 19190}, {"loss": 0.5757, "grad_norm": 1.0599955320358276, "learning_rate": 0.0002, "epoch": 3.1040336270309594, "step": 19200}, {"loss": 0.6151, "grad_norm": 1.0673625469207764, "learning_rate": 0.0002, "epoch": 3.1056503112117047, "step": 19210}, {"loss": 0.6043, "grad_norm": 0.7739115953445435, "learning_rate": 0.0002, "epoch": 3.10726699539245, "step": 19220}, {"loss": 0.6046, "grad_norm": 0.9884951114654541, "learning_rate": 0.0002, "epoch": 3.1088836795731956, "step": 19230}, {"loss": 0.5932, "grad_norm": 0.862260103225708, "learning_rate": 0.0002, "epoch": 3.110500363753941, "step": 19240}, {"loss": 0.6098, "grad_norm": 0.7690284848213196, "learning_rate": 0.0002, "epoch": 3.112117047934686, "step": 19250}, {"loss": 0.5791, "grad_norm": 0.8758958578109741, "learning_rate": 0.0002, "epoch": 3.1137337321154313, "step": 19260}, {"loss": 0.6136, "grad_norm": 1.0356395244598389, "learning_rate": 0.0002, "epoch": 3.1153504162961765, "step": 19270}, {"loss": 0.6159, "grad_norm": 0.6950937509536743, "learning_rate": 0.0002, "epoch": 3.1169671004769217, "step": 19280}, {"loss": 0.592, "grad_norm": 0.760998010635376, "learning_rate": 0.0002, "epoch": 3.118583784657667, "step": 19290}, {"loss": 0.575, "grad_norm": 0.9335789084434509, "learning_rate": 0.0002, "epoch": 3.1202004688384126, "step": 19300}, {"loss": 0.6139, "grad_norm": 0.9636204242706299, "learning_rate": 0.0002, "epoch": 3.121817153019158, "step": 19310}, {"loss": 0.6001, "grad_norm": 1.0820997953414917, "learning_rate": 0.0002, "epoch": 3.123433837199903, "step": 19320}, {"loss": 0.6542, "grad_norm": 0.7333487272262573, "learning_rate": 0.0002, "epoch": 3.1250505213806483, "step": 19330}, {"loss": 0.6178, "grad_norm": 1.0417509078979492, "learning_rate": 0.0002, "epoch": 3.1266672055613935, "step": 19340}, {"loss": 0.603, "grad_norm": 0.9267749190330505, "learning_rate": 0.0002, "epoch": 3.128283889742139, "step": 19350}, {"loss": 0.6063, "grad_norm": 0.777798593044281, "learning_rate": 0.0002, "epoch": 3.129900573922884, "step": 19360}, {"loss": 0.5913, "grad_norm": 0.8425456881523132, "learning_rate": 0.0002, "epoch": 3.1315172581036297, "step": 19370}, {"loss": 0.6042, "grad_norm": 0.9617102146148682, "learning_rate": 0.0002, "epoch": 3.133133942284375, "step": 19380}, {"loss": 0.633, "grad_norm": 1.0052828788757324, "learning_rate": 0.0002, "epoch": 3.13475062646512, "step": 19390}, {"loss": 0.5713, "grad_norm": 0.7637009024620056, "learning_rate": 0.0002, "epoch": 3.1363673106458654, "step": 19400}, {"loss": 0.5497, "grad_norm": 0.7958088517189026, "learning_rate": 0.0002, "epoch": 3.1379839948266106, "step": 19410}, {"loss": 0.6283, "grad_norm": 0.9161727428436279, "learning_rate": 0.0002, "epoch": 3.139600679007356, "step": 19420}, {"loss": 0.5638, "grad_norm": 0.8402149677276611, "learning_rate": 0.0002, "epoch": 3.141217363188101, "step": 19430}, {"loss": 0.5848, "grad_norm": 1.0056525468826294, "learning_rate": 0.0002, "epoch": 3.1428340473688463, "step": 19440}, {"loss": 0.5954, "grad_norm": 1.0129190683364868, "learning_rate": 0.0002, "epoch": 3.144450731549592, "step": 19450}, {"loss": 0.5808, "grad_norm": 0.790825366973877, "learning_rate": 0.0002, "epoch": 3.146067415730337, "step": 19460}, {"loss": 0.5607, "grad_norm": 1.441665530204773, "learning_rate": 0.0002, "epoch": 3.1476840999110824, "step": 19470}, {"loss": 0.5785, "grad_norm": 0.7846331596374512, "learning_rate": 0.0002, "epoch": 3.1493007840918277, "step": 19480}, {"loss": 0.5892, "grad_norm": 0.7915332913398743, "learning_rate": 0.0002, "epoch": 3.150917468272573, "step": 19490}, {"loss": 0.5759, "grad_norm": 0.933982253074646, "learning_rate": 0.0002, "epoch": 3.152534152453318, "step": 19500}, {"loss": 0.6206, "grad_norm": 1.038408637046814, "learning_rate": 0.0002, "epoch": 3.1541508366340634, "step": 19510}, {"loss": 0.6271, "grad_norm": 1.018935203552246, "learning_rate": 0.0002, "epoch": 3.155767520814809, "step": 19520}, {"loss": 0.6173, "grad_norm": 0.9618112444877625, "learning_rate": 0.0002, "epoch": 3.1573842049955543, "step": 19530}, {"loss": 0.5972, "grad_norm": 0.8900452852249146, "learning_rate": 0.0002, "epoch": 3.1590008891762995, "step": 19540}, {"loss": 0.5925, "grad_norm": 0.8254160284996033, "learning_rate": 0.0002, "epoch": 3.1606175733570447, "step": 19550}, {"loss": 0.625, "grad_norm": 1.004376769065857, "learning_rate": 0.0002, "epoch": 3.16223425753779, "step": 19560}, {"loss": 0.5775, "grad_norm": 1.0490446090698242, "learning_rate": 0.0002, "epoch": 3.163850941718535, "step": 19570}, {"loss": 0.5986, "grad_norm": 0.7387403845787048, "learning_rate": 0.0002, "epoch": 3.1654676258992804, "step": 19580}, {"loss": 0.5898, "grad_norm": 0.7611538171768188, "learning_rate": 0.0002, "epoch": 3.1670843100800257, "step": 19590}, {"loss": 0.5937, "grad_norm": 0.8239886164665222, "learning_rate": 0.0002, "epoch": 3.1687009942607713, "step": 19600}, {"loss": 0.6068, "grad_norm": 0.9327243566513062, "learning_rate": 0.0002, "epoch": 3.1703176784415166, "step": 19610}, {"loss": 0.572, "grad_norm": 0.9662560224533081, "learning_rate": 0.0002, "epoch": 3.171934362622262, "step": 19620}, {"loss": 0.5988, "grad_norm": 0.9183341860771179, "learning_rate": 0.0002, "epoch": 3.173551046803007, "step": 19630}, {"loss": 0.5909, "grad_norm": 0.875066876411438, "learning_rate": 0.0002, "epoch": 3.1751677309837523, "step": 19640}, {"loss": 0.5956, "grad_norm": 0.8567508459091187, "learning_rate": 0.0002, "epoch": 3.1767844151644975, "step": 19650}, {"loss": 0.5805, "grad_norm": 0.6805780529975891, "learning_rate": 0.0002, "epoch": 3.1784010993452427, "step": 19660}, {"loss": 0.6204, "grad_norm": 0.8776944279670715, "learning_rate": 0.0002, "epoch": 3.1800177835259884, "step": 19670}, {"loss": 0.6108, "grad_norm": 0.9036329984664917, "learning_rate": 0.0002, "epoch": 3.1816344677067336, "step": 19680}, {"loss": 0.6238, "grad_norm": 0.8527372479438782, "learning_rate": 0.0002, "epoch": 3.183251151887479, "step": 19690}, {"loss": 0.6089, "grad_norm": 1.1045585870742798, "learning_rate": 0.0002, "epoch": 3.184867836068224, "step": 19700}, {"loss": 0.5491, "grad_norm": 0.9213830828666687, "learning_rate": 0.0002, "epoch": 3.1864845202489693, "step": 19710}, {"loss": 0.618, "grad_norm": 0.8865814805030823, "learning_rate": 0.0002, "epoch": 3.1881012044297146, "step": 19720}, {"loss": 0.5785, "grad_norm": 0.7939388751983643, "learning_rate": 0.0002, "epoch": 3.18971788861046, "step": 19730}, {"loss": 0.5682, "grad_norm": 0.6966729760169983, "learning_rate": 0.0002, "epoch": 3.191334572791205, "step": 19740}, {"loss": 0.5839, "grad_norm": 0.8023673295974731, "learning_rate": 0.0002, "epoch": 3.1929512569719507, "step": 19750}, {"loss": 0.6267, "grad_norm": 0.7992037534713745, "learning_rate": 0.0002, "epoch": 3.194567941152696, "step": 19760}, {"loss": 0.6141, "grad_norm": 0.7412247657775879, "learning_rate": 0.0002, "epoch": 3.196184625333441, "step": 19770}, {"loss": 0.6179, "grad_norm": 0.9598729014396667, "learning_rate": 0.0002, "epoch": 3.1978013095141864, "step": 19780}, {"loss": 0.5685, "grad_norm": 0.8331366777420044, "learning_rate": 0.0002, "epoch": 3.1994179936949316, "step": 19790}, {"loss": 0.6104, "grad_norm": 0.8939169645309448, "learning_rate": 0.0002, "epoch": 3.201034677875677, "step": 19800}, {"loss": 0.6147, "grad_norm": 0.9219734072685242, "learning_rate": 0.0002, "epoch": 3.202651362056422, "step": 19810}, {"loss": 0.6051, "grad_norm": 0.869490385055542, "learning_rate": 0.0002, "epoch": 3.2042680462371678, "step": 19820}, {"loss": 0.5946, "grad_norm": 0.8989706635475159, "learning_rate": 0.0002, "epoch": 3.205884730417913, "step": 19830}, {"loss": 0.5866, "grad_norm": 0.8477165102958679, "learning_rate": 0.0002, "epoch": 3.2075014145986582, "step": 19840}, {"loss": 0.6176, "grad_norm": 0.8720678687095642, "learning_rate": 0.0002, "epoch": 3.2091180987794035, "step": 19850}, {"loss": 0.5694, "grad_norm": 0.861406683921814, "learning_rate": 0.0002, "epoch": 3.2107347829601487, "step": 19860}, {"loss": 0.6264, "grad_norm": 0.8228686451911926, "learning_rate": 0.0002, "epoch": 3.212351467140894, "step": 19870}, {"loss": 0.625, "grad_norm": 0.7936596870422363, "learning_rate": 0.0002, "epoch": 3.213968151321639, "step": 19880}, {"loss": 0.5698, "grad_norm": 1.097377896308899, "learning_rate": 0.0002, "epoch": 3.2155848355023844, "step": 19890}, {"loss": 0.6725, "grad_norm": 0.9544782638549805, "learning_rate": 0.0002, "epoch": 3.21720151968313, "step": 19900}, {"loss": 0.6022, "grad_norm": 0.8240751624107361, "learning_rate": 0.0002, "epoch": 3.2188182038638753, "step": 19910}, {"loss": 0.5659, "grad_norm": 0.8332096338272095, "learning_rate": 0.0002, "epoch": 3.2204348880446205, "step": 19920}, {"loss": 0.6274, "grad_norm": 1.0954567193984985, "learning_rate": 0.0002, "epoch": 3.2220515722253658, "step": 19930}, {"loss": 0.652, "grad_norm": 0.7790525555610657, "learning_rate": 0.0002, "epoch": 3.223668256406111, "step": 19940}, {"loss": 0.5986, "grad_norm": 0.7966814041137695, "learning_rate": 0.0002, "epoch": 3.225284940586856, "step": 19950}, {"loss": 0.5911, "grad_norm": 0.9751881957054138, "learning_rate": 0.0002, "epoch": 3.2269016247676015, "step": 19960}, {"loss": 0.6071, "grad_norm": 0.9856047630310059, "learning_rate": 0.0002, "epoch": 3.228518308948347, "step": 19970}, {"loss": 0.5837, "grad_norm": 1.3062353134155273, "learning_rate": 0.0002, "epoch": 3.2301349931290924, "step": 19980}, {"loss": 0.6588, "grad_norm": 0.9510692358016968, "learning_rate": 0.0002, "epoch": 3.2317516773098376, "step": 19990}, {"loss": 0.6264, "grad_norm": 0.8630342483520508, "learning_rate": 0.0002, "epoch": 3.233368361490583, "step": 20000}, {"loss": 0.6073, "grad_norm": 0.8966519236564636, "learning_rate": 0.0002, "epoch": 3.234985045671328, "step": 20010}, {"loss": 0.612, "grad_norm": 0.7093510627746582, "learning_rate": 0.0002, "epoch": 3.2366017298520733, "step": 20020}, {"loss": 0.585, "grad_norm": 0.7771096229553223, "learning_rate": 0.0002, "epoch": 3.2382184140328185, "step": 20030}, {"loss": 0.5821, "grad_norm": 0.841058075428009, "learning_rate": 0.0002, "epoch": 3.2398350982135637, "step": 20040}, {"loss": 0.6519, "grad_norm": 0.909712553024292, "learning_rate": 0.0002, "epoch": 3.2414517823943094, "step": 20050}, {"loss": 0.6089, "grad_norm": 0.8321019411087036, "learning_rate": 0.0002, "epoch": 3.2430684665750547, "step": 20060}, {"loss": 0.6115, "grad_norm": 0.779901921749115, "learning_rate": 0.0002, "epoch": 3.2446851507558, "step": 20070}, {"loss": 0.6107, "grad_norm": 0.6249170303344727, "learning_rate": 0.0002, "epoch": 3.246301834936545, "step": 20080}, {"loss": 0.603, "grad_norm": 0.8000940680503845, "learning_rate": 0.0002, "epoch": 3.2479185191172903, "step": 20090}, {"loss": 0.6273, "grad_norm": 0.7627735137939453, "learning_rate": 0.0002, "epoch": 3.2495352032980356, "step": 20100}, {"loss": 0.6223, "grad_norm": 0.8780747056007385, "learning_rate": 0.0002, "epoch": 3.2511518874787813, "step": 20110}, {"loss": 0.5969, "grad_norm": 0.772037148475647, "learning_rate": 0.0002, "epoch": 3.2527685716595265, "step": 20120}, {"loss": 0.5843, "grad_norm": 1.0086580514907837, "learning_rate": 0.0002, "epoch": 3.2543852558402717, "step": 20130}, {"loss": 0.5777, "grad_norm": 0.9360289573669434, "learning_rate": 0.0002, "epoch": 3.256001940021017, "step": 20140}, {"loss": 0.5777, "grad_norm": 1.2099586725234985, "learning_rate": 0.0002, "epoch": 3.257618624201762, "step": 20150}, {"loss": 0.624, "grad_norm": 0.8368481397628784, "learning_rate": 0.0002, "epoch": 3.2592353083825074, "step": 20160}, {"loss": 0.5626, "grad_norm": 0.7391039133071899, "learning_rate": 0.0002, "epoch": 3.2608519925632526, "step": 20170}, {"loss": 0.6041, "grad_norm": 0.9122273325920105, "learning_rate": 0.0002, "epoch": 3.262468676743998, "step": 20180}, {"loss": 0.5868, "grad_norm": 0.8502281904220581, "learning_rate": 0.0002, "epoch": 3.264085360924743, "step": 20190}, {"loss": 0.5841, "grad_norm": 1.0926852226257324, "learning_rate": 0.0002, "epoch": 3.265702045105489, "step": 20200}, {"loss": 0.6027, "grad_norm": 0.7902828454971313, "learning_rate": 0.0002, "epoch": 3.267318729286234, "step": 20210}, {"loss": 0.6089, "grad_norm": 0.8724729418754578, "learning_rate": 0.0002, "epoch": 3.2689354134669792, "step": 20220}, {"loss": 0.6242, "grad_norm": 0.8469277024269104, "learning_rate": 0.0002, "epoch": 3.2705520976477245, "step": 20230}, {"loss": 0.644, "grad_norm": 0.8865092992782593, "learning_rate": 0.0002, "epoch": 3.2721687818284697, "step": 20240}, {"loss": 0.6464, "grad_norm": 1.0979334115982056, "learning_rate": 0.0002, "epoch": 3.273785466009215, "step": 20250}, {"loss": 0.647, "grad_norm": 1.0860793590545654, "learning_rate": 0.0002, "epoch": 3.2754021501899606, "step": 20260}, {"loss": 0.6105, "grad_norm": 0.981745183467865, "learning_rate": 0.0002, "epoch": 3.277018834370706, "step": 20270}, {"loss": 0.627, "grad_norm": 0.9155020713806152, "learning_rate": 0.0002, "epoch": 3.278635518551451, "step": 20280}, {"loss": 0.5899, "grad_norm": 0.8436718583106995, "learning_rate": 0.0002, "epoch": 3.2802522027321963, "step": 20290}, {"loss": 0.6371, "grad_norm": 1.0329409837722778, "learning_rate": 0.0002, "epoch": 3.2818688869129415, "step": 20300}, {"loss": 0.6, "grad_norm": 0.9876394271850586, "learning_rate": 0.0002, "epoch": 3.2834855710936868, "step": 20310}, {"loss": 0.5463, "grad_norm": 0.8052917718887329, "learning_rate": 0.0002, "epoch": 3.285102255274432, "step": 20320}, {"loss": 0.5949, "grad_norm": 0.8390680551528931, "learning_rate": 0.0002, "epoch": 3.2867189394551772, "step": 20330}, {"loss": 0.6492, "grad_norm": 0.9515735507011414, "learning_rate": 0.0002, "epoch": 3.288335623635923, "step": 20340}, {"loss": 0.596, "grad_norm": 0.8028870224952698, "learning_rate": 0.0002, "epoch": 3.289952307816668, "step": 20350}, {"loss": 0.634, "grad_norm": 0.862592339515686, "learning_rate": 0.0002, "epoch": 3.2915689919974134, "step": 20360}, {"loss": 0.6345, "grad_norm": 0.7451621890068054, "learning_rate": 0.0002, "epoch": 3.2931856761781586, "step": 20370}, {"loss": 0.6458, "grad_norm": 0.8966776728630066, "learning_rate": 0.0002, "epoch": 3.294802360358904, "step": 20380}, {"loss": 0.5967, "grad_norm": 0.9289216995239258, "learning_rate": 0.0002, "epoch": 3.296419044539649, "step": 20390}, {"loss": 0.6599, "grad_norm": 0.9649626612663269, "learning_rate": 0.0002, "epoch": 3.2980357287203943, "step": 20400}, {"loss": 0.5781, "grad_norm": 1.1953798532485962, "learning_rate": 0.0002, "epoch": 3.29965241290114, "step": 20410}, {"loss": 0.5997, "grad_norm": 0.8929083943367004, "learning_rate": 0.0002, "epoch": 3.301269097081885, "step": 20420}, {"loss": 0.597, "grad_norm": 0.8922014236450195, "learning_rate": 0.0002, "epoch": 3.3028857812626304, "step": 20430}, {"loss": 0.5766, "grad_norm": 0.9754860401153564, "learning_rate": 0.0002, "epoch": 3.3045024654433757, "step": 20440}, {"loss": 0.5653, "grad_norm": 0.8873140215873718, "learning_rate": 0.0002, "epoch": 3.306119149624121, "step": 20450}, {"loss": 0.6138, "grad_norm": 0.857271671295166, "learning_rate": 0.0002, "epoch": 3.307735833804866, "step": 20460}, {"loss": 0.633, "grad_norm": 0.9022141098976135, "learning_rate": 0.0002, "epoch": 3.3093525179856114, "step": 20470}, {"loss": 0.6654, "grad_norm": 0.8614798188209534, "learning_rate": 0.0002, "epoch": 3.3109692021663566, "step": 20480}, {"loss": 0.6254, "grad_norm": 0.8838164210319519, "learning_rate": 0.0002, "epoch": 3.3125858863471023, "step": 20490}, {"loss": 0.5849, "grad_norm": 0.8709736466407776, "learning_rate": 0.0002, "epoch": 3.3142025705278475, "step": 20500}, {"loss": 0.6146, "grad_norm": 0.9533300995826721, "learning_rate": 0.0002, "epoch": 3.3158192547085927, "step": 20510}, {"loss": 0.6029, "grad_norm": 0.8259269595146179, "learning_rate": 0.0002, "epoch": 3.317435938889338, "step": 20520}, {"loss": 0.6268, "grad_norm": 0.8607608079910278, "learning_rate": 0.0002, "epoch": 3.319052623070083, "step": 20530}, {"loss": 0.5676, "grad_norm": 1.0863020420074463, "learning_rate": 0.0002, "epoch": 3.3206693072508284, "step": 20540}, {"loss": 0.6412, "grad_norm": 1.011489987373352, "learning_rate": 0.0002, "epoch": 3.3222859914315737, "step": 20550}, {"loss": 0.6247, "grad_norm": 0.6952177882194519, "learning_rate": 0.0002, "epoch": 3.3239026756123193, "step": 20560}, {"loss": 0.6229, "grad_norm": 0.9638974070549011, "learning_rate": 0.0002, "epoch": 3.3255193597930646, "step": 20570}, {"loss": 0.5882, "grad_norm": 1.0310138463974, "learning_rate": 0.0002, "epoch": 3.32713604397381, "step": 20580}, {"loss": 0.594, "grad_norm": 0.9371318221092224, "learning_rate": 0.0002, "epoch": 3.328752728154555, "step": 20590}, {"loss": 0.6137, "grad_norm": 0.8756691813468933, "learning_rate": 0.0002, "epoch": 3.3303694123353003, "step": 20600}, {"loss": 0.5994, "grad_norm": 1.054175853729248, "learning_rate": 0.0002, "epoch": 3.3319860965160455, "step": 20610}, {"loss": 0.6169, "grad_norm": 0.9074128270149231, "learning_rate": 0.0002, "epoch": 3.3336027806967907, "step": 20620}, {"loss": 0.6138, "grad_norm": 0.906900942325592, "learning_rate": 0.0002, "epoch": 3.335219464877536, "step": 20630}, {"loss": 0.571, "grad_norm": 0.8689333200454712, "learning_rate": 0.0002, "epoch": 3.3368361490582816, "step": 20640}, {"loss": 0.6079, "grad_norm": 0.9889747500419617, "learning_rate": 0.0002, "epoch": 3.338452833239027, "step": 20650}, {"loss": 0.6073, "grad_norm": 1.0685805082321167, "learning_rate": 0.0002, "epoch": 3.340069517419772, "step": 20660}, {"loss": 0.6091, "grad_norm": 0.7495010495185852, "learning_rate": 0.0002, "epoch": 3.3416862016005173, "step": 20670}, {"loss": 0.5883, "grad_norm": 0.8747848272323608, "learning_rate": 0.0002, "epoch": 3.3433028857812626, "step": 20680}, {"loss": 0.604, "grad_norm": 0.9762673377990723, "learning_rate": 0.0002, "epoch": 3.344919569962008, "step": 20690}, {"loss": 0.6784, "grad_norm": 1.0284489393234253, "learning_rate": 0.0002, "epoch": 3.346536254142753, "step": 20700}, {"loss": 0.6464, "grad_norm": 0.7293812036514282, "learning_rate": 0.0002, "epoch": 3.3481529383234987, "step": 20710}, {"loss": 0.609, "grad_norm": 0.8330199122428894, "learning_rate": 0.0002, "epoch": 3.349769622504244, "step": 20720}, {"loss": 0.5729, "grad_norm": 0.9808499217033386, "learning_rate": 0.0002, "epoch": 3.351386306684989, "step": 20730}, {"loss": 0.6315, "grad_norm": 0.9508825540542603, "learning_rate": 0.0002, "epoch": 3.3530029908657344, "step": 20740}, {"loss": 0.5965, "grad_norm": 0.790483832359314, "learning_rate": 0.0002, "epoch": 3.3546196750464796, "step": 20750}, {"loss": 0.6327, "grad_norm": 1.022793173789978, "learning_rate": 0.0002, "epoch": 3.356236359227225, "step": 20760}, {"loss": 0.6439, "grad_norm": 0.8318950533866882, "learning_rate": 0.0002, "epoch": 3.35785304340797, "step": 20770}, {"loss": 0.6037, "grad_norm": 0.7980858087539673, "learning_rate": 0.0002, "epoch": 3.3594697275887153, "step": 20780}, {"loss": 0.6746, "grad_norm": 0.8114802241325378, "learning_rate": 0.0002, "epoch": 3.361086411769461, "step": 20790}, {"loss": 0.6017, "grad_norm": 0.8522519469261169, "learning_rate": 0.0002, "epoch": 3.3627030959502062, "step": 20800}, {"loss": 0.5864, "grad_norm": 0.9142431616783142, "learning_rate": 0.0002, "epoch": 3.3643197801309515, "step": 20810}, {"loss": 0.6331, "grad_norm": 0.771170437335968, "learning_rate": 0.0002, "epoch": 3.3659364643116967, "step": 20820}, {"loss": 0.5879, "grad_norm": 1.0628231763839722, "learning_rate": 0.0002, "epoch": 3.367553148492442, "step": 20830}, {"loss": 0.6533, "grad_norm": 0.9384352564811707, "learning_rate": 0.0002, "epoch": 3.369169832673187, "step": 20840}, {"loss": 0.6292, "grad_norm": 1.1286591291427612, "learning_rate": 0.0002, "epoch": 3.370786516853933, "step": 20850}, {"loss": 0.5986, "grad_norm": 1.1349513530731201, "learning_rate": 0.0002, "epoch": 3.372403201034678, "step": 20860}, {"loss": 0.6413, "grad_norm": 1.0127464532852173, "learning_rate": 0.0002, "epoch": 3.3740198852154233, "step": 20870}, {"loss": 0.6414, "grad_norm": 0.9111971855163574, "learning_rate": 0.0002, "epoch": 3.3756365693961685, "step": 20880}, {"loss": 0.6101, "grad_norm": 0.871356725692749, "learning_rate": 0.0002, "epoch": 3.3772532535769137, "step": 20890}, {"loss": 0.5995, "grad_norm": 0.7774117588996887, "learning_rate": 0.0002, "epoch": 3.378869937757659, "step": 20900}, {"loss": 0.6062, "grad_norm": 1.0089964866638184, "learning_rate": 0.0002, "epoch": 3.380486621938404, "step": 20910}, {"loss": 0.5908, "grad_norm": 0.7855867147445679, "learning_rate": 0.0002, "epoch": 3.3821033061191494, "step": 20920}, {"loss": 0.6373, "grad_norm": 1.3713710308074951, "learning_rate": 0.0002, "epoch": 3.3837199902998947, "step": 20930}, {"loss": 0.6627, "grad_norm": 0.8599116206169128, "learning_rate": 0.0002, "epoch": 3.3853366744806404, "step": 20940}, {"loss": 0.6224, "grad_norm": 0.9392673373222351, "learning_rate": 0.0002, "epoch": 3.3869533586613856, "step": 20950}, {"loss": 0.5855, "grad_norm": 0.8764075040817261, "learning_rate": 0.0002, "epoch": 3.388570042842131, "step": 20960}, {"loss": 0.5734, "grad_norm": 0.8240136504173279, "learning_rate": 0.0002, "epoch": 3.390186727022876, "step": 20970}, {"loss": 0.5783, "grad_norm": 1.0982369184494019, "learning_rate": 0.0002, "epoch": 3.3918034112036213, "step": 20980}, {"loss": 0.5451, "grad_norm": 1.0599013566970825, "learning_rate": 0.0002, "epoch": 3.3934200953843665, "step": 20990}, {"loss": 0.6356, "grad_norm": 0.895438015460968, "learning_rate": 0.0002, "epoch": 3.395036779565112, "step": 21000}, {"loss": 0.6065, "grad_norm": 0.6974841356277466, "learning_rate": 0.0002, "epoch": 3.3966534637458574, "step": 21010}, {"loss": 0.5704, "grad_norm": 0.9571719765663147, "learning_rate": 0.0002, "epoch": 3.3982701479266026, "step": 21020}, {"loss": 0.679, "grad_norm": 0.831912636756897, "learning_rate": 0.0002, "epoch": 3.399886832107348, "step": 21030}, {"loss": 0.6051, "grad_norm": 0.831936240196228, "learning_rate": 0.0002, "epoch": 3.401503516288093, "step": 21040}, {"loss": 0.5857, "grad_norm": 0.7388373613357544, "learning_rate": 0.0002, "epoch": 3.4031202004688383, "step": 21050}, {"loss": 0.6245, "grad_norm": 0.938667356967926, "learning_rate": 0.0002, "epoch": 3.4047368846495836, "step": 21060}, {"loss": 0.6121, "grad_norm": 0.9202313423156738, "learning_rate": 0.0002, "epoch": 3.406353568830329, "step": 21070}, {"loss": 0.6388, "grad_norm": 0.9888381958007812, "learning_rate": 0.0002, "epoch": 3.4079702530110745, "step": 21080}, {"loss": 0.6245, "grad_norm": 0.8526970744132996, "learning_rate": 0.0002, "epoch": 3.4095869371918197, "step": 21090}, {"loss": 0.5914, "grad_norm": 0.7939383387565613, "learning_rate": 0.0002, "epoch": 3.411203621372565, "step": 21100}, {"loss": 0.6066, "grad_norm": 0.9986352920532227, "learning_rate": 0.0002, "epoch": 3.41282030555331, "step": 21110}, {"loss": 0.5947, "grad_norm": 0.8895300030708313, "learning_rate": 0.0002, "epoch": 3.4144369897340554, "step": 21120}, {"loss": 0.6264, "grad_norm": 0.9559482932090759, "learning_rate": 0.0002, "epoch": 3.4160536739148006, "step": 21130}, {"loss": 0.6491, "grad_norm": 0.8351506590843201, "learning_rate": 0.0002, "epoch": 3.417670358095546, "step": 21140}, {"loss": 0.567, "grad_norm": 0.8224456906318665, "learning_rate": 0.0002, "epoch": 3.4192870422762915, "step": 21150}, {"loss": 0.5871, "grad_norm": 1.0110299587249756, "learning_rate": 0.0002, "epoch": 3.4209037264570368, "step": 21160}, {"loss": 0.6116, "grad_norm": 0.82564777135849, "learning_rate": 0.0002, "epoch": 3.422520410637782, "step": 21170}, {"loss": 0.595, "grad_norm": 1.004738688468933, "learning_rate": 0.0002, "epoch": 3.4241370948185272, "step": 21180}, {"loss": 0.6286, "grad_norm": 0.7545676827430725, "learning_rate": 0.0002, "epoch": 3.4257537789992725, "step": 21190}, {"loss": 0.5868, "grad_norm": 0.8918704390525818, "learning_rate": 0.0002, "epoch": 3.4273704631800177, "step": 21200}, {"loss": 0.6542, "grad_norm": 0.8336876034736633, "learning_rate": 0.0002, "epoch": 3.428987147360763, "step": 21210}, {"loss": 0.5824, "grad_norm": 0.8928771018981934, "learning_rate": 0.0002, "epoch": 3.430603831541508, "step": 21220}, {"loss": 0.6468, "grad_norm": 0.7663705945014954, "learning_rate": 0.0002, "epoch": 3.432220515722254, "step": 21230}, {"loss": 0.6693, "grad_norm": 0.8392598628997803, "learning_rate": 0.0002, "epoch": 3.433837199902999, "step": 21240}, {"loss": 0.5971, "grad_norm": 0.8819600343704224, "learning_rate": 0.0002, "epoch": 3.4354538840837443, "step": 21250}, {"loss": 0.6791, "grad_norm": 0.9124642014503479, "learning_rate": 0.0002, "epoch": 3.4370705682644895, "step": 21260}, {"loss": 0.5925, "grad_norm": 0.8329763412475586, "learning_rate": 0.0002, "epoch": 3.4386872524452348, "step": 21270}, {"loss": 0.6541, "grad_norm": 0.9982839822769165, "learning_rate": 0.0002, "epoch": 3.44030393662598, "step": 21280}, {"loss": 0.6441, "grad_norm": 0.9105954766273499, "learning_rate": 0.0002, "epoch": 3.4419206208067252, "step": 21290}, {"loss": 0.6028, "grad_norm": 0.8182359337806702, "learning_rate": 0.0002, "epoch": 3.443537304987471, "step": 21300}, {"loss": 0.5991, "grad_norm": 1.0568904876708984, "learning_rate": 0.0002, "epoch": 3.445153989168216, "step": 21310}, {"loss": 0.6117, "grad_norm": 0.968539834022522, "learning_rate": 0.0002, "epoch": 3.4467706733489614, "step": 21320}, {"loss": 0.6219, "grad_norm": 0.8774511218070984, "learning_rate": 0.0002, "epoch": 3.4483873575297066, "step": 21330}, {"loss": 0.6438, "grad_norm": 0.7598156332969666, "learning_rate": 0.0002, "epoch": 3.450004041710452, "step": 21340}, {"loss": 0.6033, "grad_norm": 1.1012897491455078, "learning_rate": 0.0002, "epoch": 3.451620725891197, "step": 21350}, {"loss": 0.6137, "grad_norm": 0.8040637373924255, "learning_rate": 0.0002, "epoch": 3.4532374100719423, "step": 21360}, {"loss": 0.6173, "grad_norm": 0.8497496247291565, "learning_rate": 0.0002, "epoch": 3.4548540942526875, "step": 21370}, {"loss": 0.6005, "grad_norm": 0.8429915904998779, "learning_rate": 0.0002, "epoch": 3.456470778433433, "step": 21380}, {"loss": 0.6182, "grad_norm": 0.8107112646102905, "learning_rate": 0.0002, "epoch": 3.4580874626141784, "step": 21390}, {"loss": 0.6109, "grad_norm": 1.00872004032135, "learning_rate": 0.0002, "epoch": 3.4597041467949237, "step": 21400}, {"loss": 0.5712, "grad_norm": 0.8266542553901672, "learning_rate": 0.0002, "epoch": 3.461320830975669, "step": 21410}, {"loss": 0.6457, "grad_norm": 0.8972568511962891, "learning_rate": 0.0002, "epoch": 3.462937515156414, "step": 21420}, {"loss": 0.6081, "grad_norm": 1.0781476497650146, "learning_rate": 0.0002, "epoch": 3.4645541993371594, "step": 21430}, {"loss": 0.6303, "grad_norm": 0.9571592807769775, "learning_rate": 0.0002, "epoch": 3.4661708835179046, "step": 21440}, {"loss": 0.6309, "grad_norm": 0.881547212600708, "learning_rate": 0.0002, "epoch": 3.4677875676986503, "step": 21450}, {"loss": 0.6076, "grad_norm": 0.6955338716506958, "learning_rate": 0.0002, "epoch": 3.4694042518793955, "step": 21460}, {"loss": 0.6205, "grad_norm": 0.901187539100647, "learning_rate": 0.0002, "epoch": 3.4710209360601407, "step": 21470}, {"loss": 0.639, "grad_norm": 0.7063511610031128, "learning_rate": 0.0002, "epoch": 3.472637620240886, "step": 21480}, {"loss": 0.6154, "grad_norm": 0.8462792038917542, "learning_rate": 0.0002, "epoch": 3.474254304421631, "step": 21490}, {"loss": 0.61, "grad_norm": 1.1861060857772827, "learning_rate": 0.0002, "epoch": 3.4758709886023764, "step": 21500}, {"loss": 0.6586, "grad_norm": 0.70503169298172, "learning_rate": 0.0002, "epoch": 3.4774876727831217, "step": 21510}, {"loss": 0.6475, "grad_norm": 0.9650066494941711, "learning_rate": 0.0002, "epoch": 3.479104356963867, "step": 21520}, {"loss": 0.6452, "grad_norm": 1.0266852378845215, "learning_rate": 0.0002, "epoch": 3.4807210411446126, "step": 21530}, {"loss": 0.6553, "grad_norm": 0.956372857093811, "learning_rate": 0.0002, "epoch": 3.482337725325358, "step": 21540}, {"loss": 0.6667, "grad_norm": 0.8848432898521423, "learning_rate": 0.0002, "epoch": 3.483954409506103, "step": 21550}, {"loss": 0.6375, "grad_norm": 1.0805351734161377, "learning_rate": 0.0002, "epoch": 3.4855710936868483, "step": 21560}, {"loss": 0.6958, "grad_norm": 0.9279725551605225, "learning_rate": 0.0002, "epoch": 3.4871877778675935, "step": 21570}, {"loss": 0.6354, "grad_norm": 0.9049562215805054, "learning_rate": 0.0002, "epoch": 3.4888044620483387, "step": 21580}, {"loss": 0.6071, "grad_norm": 0.9619429111480713, "learning_rate": 0.0002, "epoch": 3.4904211462290844, "step": 21590}, {"loss": 0.5927, "grad_norm": 0.8508906960487366, "learning_rate": 0.0002, "epoch": 3.4920378304098296, "step": 21600}, {"loss": 0.6115, "grad_norm": 0.8692502379417419, "learning_rate": 0.0002, "epoch": 3.493654514590575, "step": 21610}, {"loss": 0.5878, "grad_norm": 0.8187332153320312, "learning_rate": 0.0002, "epoch": 3.49527119877132, "step": 21620}, {"loss": 0.5874, "grad_norm": 1.145400047302246, "learning_rate": 0.0002, "epoch": 3.4968878829520653, "step": 21630}, {"loss": 0.6313, "grad_norm": 0.8281388282775879, "learning_rate": 0.0002, "epoch": 3.4985045671328105, "step": 21640}, {"loss": 0.6624, "grad_norm": 0.82256019115448, "learning_rate": 0.0002, "epoch": 3.500121251313556, "step": 21650}, {"loss": 0.6346, "grad_norm": 0.9315484762191772, "learning_rate": 0.0002, "epoch": 3.501737935494301, "step": 21660}, {"loss": 0.6086, "grad_norm": 0.7626111507415771, "learning_rate": 0.0002, "epoch": 3.5033546196750462, "step": 21670}, {"loss": 0.6177, "grad_norm": 0.9275059103965759, "learning_rate": 0.0002, "epoch": 3.504971303855792, "step": 21680}, {"loss": 0.64, "grad_norm": 0.7906724810600281, "learning_rate": 0.0002, "epoch": 3.506587988036537, "step": 21690}, {"loss": 0.6015, "grad_norm": 0.8289761543273926, "learning_rate": 0.0002, "epoch": 3.5082046722172824, "step": 21700}, {"loss": 0.6246, "grad_norm": 0.8316431045532227, "learning_rate": 0.0002, "epoch": 3.5098213563980276, "step": 21710}, {"loss": 0.619, "grad_norm": 1.0451812744140625, "learning_rate": 0.0002, "epoch": 3.511438040578773, "step": 21720}, {"loss": 0.632, "grad_norm": 0.928252637386322, "learning_rate": 0.0002, "epoch": 3.513054724759518, "step": 21730}, {"loss": 0.6062, "grad_norm": 0.7985895276069641, "learning_rate": 0.0002, "epoch": 3.5146714089402638, "step": 21740}, {"loss": 0.6463, "grad_norm": 0.6740974187850952, "learning_rate": 0.0002, "epoch": 3.516288093121009, "step": 21750}, {"loss": 0.6138, "grad_norm": 0.8482223749160767, "learning_rate": 0.0002, "epoch": 3.517904777301754, "step": 21760}, {"loss": 0.6277, "grad_norm": 0.889947772026062, "learning_rate": 0.0002, "epoch": 3.5195214614824994, "step": 21770}, {"loss": 0.6174, "grad_norm": 0.8304598927497864, "learning_rate": 0.0002, "epoch": 3.5211381456632447, "step": 21780}, {"loss": 0.6156, "grad_norm": 0.8002981543540955, "learning_rate": 0.0002, "epoch": 3.52275482984399, "step": 21790}, {"loss": 0.5896, "grad_norm": 0.8115083575248718, "learning_rate": 0.0002, "epoch": 3.524371514024735, "step": 21800}, {"loss": 0.6041, "grad_norm": 0.9715048670768738, "learning_rate": 0.0002, "epoch": 3.5259881982054804, "step": 21810}, {"loss": 0.6715, "grad_norm": 1.0910786390304565, "learning_rate": 0.0002, "epoch": 3.5276048823862256, "step": 21820}, {"loss": 0.6543, "grad_norm": 0.8438942432403564, "learning_rate": 0.0002, "epoch": 3.5292215665669713, "step": 21830}, {"loss": 0.6509, "grad_norm": 0.8813382983207703, "learning_rate": 0.0002, "epoch": 3.5308382507477165, "step": 21840}, {"loss": 0.6049, "grad_norm": 0.7092908024787903, "learning_rate": 0.0002, "epoch": 3.5324549349284617, "step": 21850}, {"loss": 0.5678, "grad_norm": 0.8332187533378601, "learning_rate": 0.0002, "epoch": 3.534071619109207, "step": 21860}, {"loss": 0.5896, "grad_norm": 0.8958209156990051, "learning_rate": 0.0002, "epoch": 3.535688303289952, "step": 21870}, {"loss": 0.6476, "grad_norm": 0.824138879776001, "learning_rate": 0.0002, "epoch": 3.5373049874706974, "step": 21880}, {"loss": 0.6022, "grad_norm": 0.8375158309936523, "learning_rate": 0.0002, "epoch": 3.538921671651443, "step": 21890}, {"loss": 0.6019, "grad_norm": 1.0274608135223389, "learning_rate": 0.0002, "epoch": 3.5405383558321883, "step": 21900}, {"loss": 0.6194, "grad_norm": 0.7088932394981384, "learning_rate": 0.0002, "epoch": 3.5421550400129336, "step": 21910}, {"loss": 0.6554, "grad_norm": 0.8172445297241211, "learning_rate": 0.0002, "epoch": 3.543771724193679, "step": 21920}, {"loss": 0.6711, "grad_norm": 0.9904135465621948, "learning_rate": 0.0002, "epoch": 3.545388408374424, "step": 21930}, {"loss": 0.6001, "grad_norm": 0.9900432229042053, "learning_rate": 0.0002, "epoch": 3.5470050925551693, "step": 21940}, {"loss": 0.6195, "grad_norm": 0.8963301181793213, "learning_rate": 0.0002, "epoch": 3.5486217767359145, "step": 21950}, {"loss": 0.5972, "grad_norm": 0.8551464676856995, "learning_rate": 0.0002, "epoch": 3.5502384609166597, "step": 21960}, {"loss": 0.6206, "grad_norm": 1.0916603803634644, "learning_rate": 0.0002, "epoch": 3.551855145097405, "step": 21970}, {"loss": 0.6523, "grad_norm": 0.841598391532898, "learning_rate": 0.0002, "epoch": 3.5534718292781506, "step": 21980}, {"loss": 0.617, "grad_norm": 0.8566757440567017, "learning_rate": 0.0002, "epoch": 3.555088513458896, "step": 21990}, {"loss": 0.6192, "grad_norm": 1.0145052671432495, "learning_rate": 0.0002, "epoch": 3.556705197639641, "step": 22000}, {"loss": 0.6173, "grad_norm": 0.9293754696846008, "learning_rate": 0.0002, "epoch": 3.5583218818203863, "step": 22010}, {"loss": 0.612, "grad_norm": 0.9568536281585693, "learning_rate": 0.0002, "epoch": 3.5599385660011316, "step": 22020}, {"loss": 0.641, "grad_norm": 0.8613139986991882, "learning_rate": 0.0002, "epoch": 3.5615552501818772, "step": 22030}, {"loss": 0.6496, "grad_norm": 0.8179237246513367, "learning_rate": 0.0002, "epoch": 3.5631719343626225, "step": 22040}, {"loss": 0.574, "grad_norm": 0.9059830904006958, "learning_rate": 0.0002, "epoch": 3.5647886185433677, "step": 22050}, {"loss": 0.6448, "grad_norm": 1.0068252086639404, "learning_rate": 0.0002, "epoch": 3.566405302724113, "step": 22060}, {"loss": 0.6239, "grad_norm": 0.9682072997093201, "learning_rate": 0.0002, "epoch": 3.568021986904858, "step": 22070}, {"loss": 0.6808, "grad_norm": 0.8514005541801453, "learning_rate": 0.0002, "epoch": 3.5696386710856034, "step": 22080}, {"loss": 0.5956, "grad_norm": 0.8327770829200745, "learning_rate": 0.0002, "epoch": 3.5712553552663486, "step": 22090}, {"loss": 0.5976, "grad_norm": 1.024976372718811, "learning_rate": 0.0002, "epoch": 3.572872039447094, "step": 22100}, {"loss": 0.624, "grad_norm": 0.7721174955368042, "learning_rate": 0.0002, "epoch": 3.574488723627839, "step": 22110}, {"loss": 0.5896, "grad_norm": 1.0351054668426514, "learning_rate": 0.0002, "epoch": 3.5761054078085843, "step": 22120}, {"loss": 0.6379, "grad_norm": 0.9680907130241394, "learning_rate": 0.0002, "epoch": 3.57772209198933, "step": 22130}, {"loss": 0.6194, "grad_norm": 0.8016974925994873, "learning_rate": 0.0002, "epoch": 3.5793387761700752, "step": 22140}, {"loss": 0.6387, "grad_norm": 1.0109003782272339, "learning_rate": 0.0002, "epoch": 3.5809554603508205, "step": 22150}, {"loss": 0.6368, "grad_norm": 1.0473392009735107, "learning_rate": 0.0002, "epoch": 3.5825721445315657, "step": 22160}, {"loss": 0.6353, "grad_norm": 0.8686613440513611, "learning_rate": 0.0002, "epoch": 3.584188828712311, "step": 22170}, {"loss": 0.5791, "grad_norm": 0.869149923324585, "learning_rate": 0.0002, "epoch": 3.5858055128930566, "step": 22180}, {"loss": 0.5895, "grad_norm": 0.9769062995910645, "learning_rate": 0.0002, "epoch": 3.587422197073802, "step": 22190}, {"loss": 0.5939, "grad_norm": 0.779636561870575, "learning_rate": 0.0002, "epoch": 3.589038881254547, "step": 22200}, {"loss": 0.5875, "grad_norm": 0.9063841104507446, "learning_rate": 0.0002, "epoch": 3.5906555654352923, "step": 22210}, {"loss": 0.5671, "grad_norm": 0.9216037392616272, "learning_rate": 0.0002, "epoch": 3.5922722496160375, "step": 22220}, {"loss": 0.6484, "grad_norm": 1.0217336416244507, "learning_rate": 0.0002, "epoch": 3.5938889337967828, "step": 22230}, {"loss": 0.6511, "grad_norm": 0.8513161540031433, "learning_rate": 0.0002, "epoch": 3.595505617977528, "step": 22240}, {"loss": 0.6301, "grad_norm": 0.8084813952445984, "learning_rate": 0.0002, "epoch": 3.597122302158273, "step": 22250}, {"loss": 0.6197, "grad_norm": 0.8524802923202515, "learning_rate": 0.0002, "epoch": 3.5987389863390185, "step": 22260}, {"loss": 0.5599, "grad_norm": 0.9356237649917603, "learning_rate": 0.0002, "epoch": 3.600355670519764, "step": 22270}, {"loss": 0.628, "grad_norm": 1.009600281715393, "learning_rate": 0.0002, "epoch": 3.6019723547005094, "step": 22280}, {"loss": 0.6179, "grad_norm": 0.9900581240653992, "learning_rate": 0.0002, "epoch": 3.6035890388812546, "step": 22290}, {"loss": 0.5725, "grad_norm": 1.062495231628418, "learning_rate": 0.0002, "epoch": 3.605205723062, "step": 22300}, {"loss": 0.607, "grad_norm": 0.8832381367683411, "learning_rate": 0.0002, "epoch": 3.606822407242745, "step": 22310}, {"loss": 0.6215, "grad_norm": 0.9284297823905945, "learning_rate": 0.0002, "epoch": 3.6084390914234903, "step": 22320}, {"loss": 0.685, "grad_norm": 1.2381829023361206, "learning_rate": 0.0002, "epoch": 3.610055775604236, "step": 22330}, {"loss": 0.6181, "grad_norm": 0.929434597492218, "learning_rate": 0.0002, "epoch": 3.611672459784981, "step": 22340}, {"loss": 0.6141, "grad_norm": 0.9714490175247192, "learning_rate": 0.0002, "epoch": 3.6132891439657264, "step": 22350}, {"loss": 0.6861, "grad_norm": 0.808014988899231, "learning_rate": 0.0002, "epoch": 3.6149058281464717, "step": 22360}, {"loss": 0.6428, "grad_norm": 1.0364398956298828, "learning_rate": 0.0002, "epoch": 3.616522512327217, "step": 22370}, {"loss": 0.6337, "grad_norm": 0.7858489751815796, "learning_rate": 0.0002, "epoch": 3.618139196507962, "step": 22380}, {"loss": 0.6214, "grad_norm": 0.9920870065689087, "learning_rate": 0.0002, "epoch": 3.6197558806887074, "step": 22390}, {"loss": 0.6659, "grad_norm": 0.9183220863342285, "learning_rate": 0.0002, "epoch": 3.6213725648694526, "step": 22400}, {"loss": 0.6036, "grad_norm": 0.9826246500015259, "learning_rate": 0.0002, "epoch": 3.622989249050198, "step": 22410}, {"loss": 0.6441, "grad_norm": 0.8632931113243103, "learning_rate": 0.0002, "epoch": 3.6246059332309435, "step": 22420}, {"loss": 0.6124, "grad_norm": 0.8468965291976929, "learning_rate": 0.0002, "epoch": 3.6262226174116887, "step": 22430}, {"loss": 0.6328, "grad_norm": 0.8466871976852417, "learning_rate": 0.0002, "epoch": 3.627839301592434, "step": 22440}, {"loss": 0.5941, "grad_norm": 0.9501169919967651, "learning_rate": 0.0002, "epoch": 3.629455985773179, "step": 22450}, {"loss": 0.6069, "grad_norm": 0.8906720876693726, "learning_rate": 0.0002, "epoch": 3.6310726699539244, "step": 22460}, {"loss": 0.6928, "grad_norm": 0.7400227189064026, "learning_rate": 0.0002, "epoch": 3.6326893541346696, "step": 22470}, {"loss": 0.6337, "grad_norm": 0.9756355881690979, "learning_rate": 0.0002, "epoch": 3.6343060383154153, "step": 22480}, {"loss": 0.6203, "grad_norm": 0.7504993081092834, "learning_rate": 0.0002, "epoch": 3.6359227224961606, "step": 22490}, {"loss": 0.6302, "grad_norm": 0.9270039200782776, "learning_rate": 0.0002, "epoch": 3.637539406676906, "step": 22500}, {"loss": 0.6026, "grad_norm": 0.8841686844825745, "learning_rate": 0.0002, "epoch": 3.639156090857651, "step": 22510}, {"loss": 0.6098, "grad_norm": 0.8533213138580322, "learning_rate": 0.0002, "epoch": 3.6407727750383962, "step": 22520}, {"loss": 0.6412, "grad_norm": 1.0052043199539185, "learning_rate": 0.0002, "epoch": 3.6423894592191415, "step": 22530}, {"loss": 0.6363, "grad_norm": 1.0323461294174194, "learning_rate": 0.0002, "epoch": 3.6440061433998867, "step": 22540}, {"loss": 0.6545, "grad_norm": 0.8654312491416931, "learning_rate": 0.0002, "epoch": 3.645622827580632, "step": 22550}, {"loss": 0.6155, "grad_norm": 0.6400038003921509, "learning_rate": 0.0002, "epoch": 3.647239511761377, "step": 22560}, {"loss": 0.5829, "grad_norm": 0.8061298727989197, "learning_rate": 0.0002, "epoch": 3.648856195942123, "step": 22570}, {"loss": 0.6388, "grad_norm": 0.9257854223251343, "learning_rate": 0.0002, "epoch": 3.650472880122868, "step": 22580}, {"loss": 0.6409, "grad_norm": 0.8439396619796753, "learning_rate": 0.0002, "epoch": 3.6520895643036133, "step": 22590}, {"loss": 0.5996, "grad_norm": 0.7764544486999512, "learning_rate": 0.0002, "epoch": 3.6537062484843585, "step": 22600}, {"loss": 0.6434, "grad_norm": 1.125451683998108, "learning_rate": 0.0002, "epoch": 3.6553229326651038, "step": 22610}, {"loss": 0.6579, "grad_norm": 0.7523018717765808, "learning_rate": 0.0002, "epoch": 3.656939616845849, "step": 22620}, {"loss": 0.6476, "grad_norm": 1.071026086807251, "learning_rate": 0.0002, "epoch": 3.6585563010265947, "step": 22630}, {"loss": 0.6459, "grad_norm": 0.945791482925415, "learning_rate": 0.0002, "epoch": 3.66017298520734, "step": 22640}, {"loss": 0.659, "grad_norm": 0.8001811504364014, "learning_rate": 0.0002, "epoch": 3.661789669388085, "step": 22650}, {"loss": 0.6385, "grad_norm": 0.9700816869735718, "learning_rate": 0.0002, "epoch": 3.6634063535688304, "step": 22660}, {"loss": 0.6337, "grad_norm": 0.9053242206573486, "learning_rate": 0.0002, "epoch": 3.6650230377495756, "step": 22670}, {"loss": 0.6335, "grad_norm": 0.944362461566925, "learning_rate": 0.0002, "epoch": 3.666639721930321, "step": 22680}, {"loss": 0.6235, "grad_norm": 1.067489504814148, "learning_rate": 0.0002, "epoch": 3.668256406111066, "step": 22690}, {"loss": 0.698, "grad_norm": 1.0984995365142822, "learning_rate": 0.0002, "epoch": 3.6698730902918113, "step": 22700}, {"loss": 0.6717, "grad_norm": 0.9336317777633667, "learning_rate": 0.0002, "epoch": 3.6714897744725565, "step": 22710}, {"loss": 0.6195, "grad_norm": 0.9261918663978577, "learning_rate": 0.0002, "epoch": 3.673106458653302, "step": 22720}, {"loss": 0.6332, "grad_norm": 0.8648008704185486, "learning_rate": 0.0002, "epoch": 3.6747231428340474, "step": 22730}, {"loss": 0.6576, "grad_norm": 0.7225083708763123, "learning_rate": 0.0002, "epoch": 3.6763398270147927, "step": 22740}, {"loss": 0.6406, "grad_norm": 0.9258282780647278, "learning_rate": 0.0002, "epoch": 3.677956511195538, "step": 22750}, {"loss": 0.6397, "grad_norm": 0.70876145362854, "learning_rate": 0.0002, "epoch": 3.679573195376283, "step": 22760}, {"loss": 0.6821, "grad_norm": 0.8780210018157959, "learning_rate": 0.0002, "epoch": 3.681189879557029, "step": 22770}, {"loss": 0.6036, "grad_norm": 0.8075440526008606, "learning_rate": 0.0002, "epoch": 3.682806563737774, "step": 22780}, {"loss": 0.6561, "grad_norm": 0.8503130674362183, "learning_rate": 0.0002, "epoch": 3.6844232479185193, "step": 22790}, {"loss": 0.6082, "grad_norm": 0.8413618206977844, "learning_rate": 0.0002, "epoch": 3.6860399320992645, "step": 22800}, {"loss": 0.614, "grad_norm": 0.8675165176391602, "learning_rate": 0.0002, "epoch": 3.6876566162800097, "step": 22810}, {"loss": 0.6157, "grad_norm": 0.8235884308815002, "learning_rate": 0.0002, "epoch": 3.689273300460755, "step": 22820}, {"loss": 0.5708, "grad_norm": 0.9477725625038147, "learning_rate": 0.0002, "epoch": 3.6908899846415, "step": 22830}, {"loss": 0.6481, "grad_norm": 0.7883533835411072, "learning_rate": 0.0002, "epoch": 3.6925066688222454, "step": 22840}, {"loss": 0.5872, "grad_norm": 1.047913908958435, "learning_rate": 0.0002, "epoch": 3.6941233530029907, "step": 22850}, {"loss": 0.6176, "grad_norm": 0.9171528816223145, "learning_rate": 0.0002, "epoch": 3.695740037183736, "step": 22860}, {"loss": 0.6204, "grad_norm": 0.9338192343711853, "learning_rate": 0.0002, "epoch": 3.6973567213644816, "step": 22870}, {"loss": 0.686, "grad_norm": 0.8799443244934082, "learning_rate": 0.0002, "epoch": 3.698973405545227, "step": 22880}, {"loss": 0.6206, "grad_norm": 0.8515434861183167, "learning_rate": 0.0002, "epoch": 3.700590089725972, "step": 22890}, {"loss": 0.5954, "grad_norm": 0.7805591821670532, "learning_rate": 0.0002, "epoch": 3.7022067739067173, "step": 22900}, {"loss": 0.6108, "grad_norm": 0.8470911979675293, "learning_rate": 0.0002, "epoch": 3.7038234580874625, "step": 22910}, {"loss": 0.6557, "grad_norm": 0.9452309012413025, "learning_rate": 0.0002, "epoch": 3.705440142268208, "step": 22920}, {"loss": 0.6529, "grad_norm": 0.950243353843689, "learning_rate": 0.0002, "epoch": 3.7070568264489534, "step": 22930}, {"loss": 0.6364, "grad_norm": 0.7882499098777771, "learning_rate": 0.0002, "epoch": 3.7086735106296986, "step": 22940}, {"loss": 0.6462, "grad_norm": 0.8307787775993347, "learning_rate": 0.0002, "epoch": 3.710290194810444, "step": 22950}, {"loss": 0.6371, "grad_norm": 1.0970630645751953, "learning_rate": 0.0002, "epoch": 3.711906878991189, "step": 22960}, {"loss": 0.6281, "grad_norm": 0.8269566297531128, "learning_rate": 0.0002, "epoch": 3.7135235631719343, "step": 22970}, {"loss": 0.6561, "grad_norm": 0.8306704759597778, "learning_rate": 0.0002, "epoch": 3.7151402473526796, "step": 22980}, {"loss": 0.6418, "grad_norm": 0.9710225462913513, "learning_rate": 0.0002, "epoch": 3.716756931533425, "step": 22990}, {"loss": 0.6639, "grad_norm": 0.8890530467033386, "learning_rate": 0.0002, "epoch": 3.71837361571417, "step": 23000}, {"loss": 0.6084, "grad_norm": 0.883522629737854, "learning_rate": 0.0002, "epoch": 3.7199902998949153, "step": 23010}, {"loss": 0.6183, "grad_norm": 0.8662652373313904, "learning_rate": 0.0002, "epoch": 3.721606984075661, "step": 23020}, {"loss": 0.6266, "grad_norm": 0.7228406667709351, "learning_rate": 0.0002, "epoch": 3.723223668256406, "step": 23030}, {"loss": 0.6417, "grad_norm": 1.060792088508606, "learning_rate": 0.0002, "epoch": 3.7248403524371514, "step": 23040}, {"loss": 0.6346, "grad_norm": 1.0119613409042358, "learning_rate": 0.0002, "epoch": 3.7264570366178966, "step": 23050}, {"loss": 0.6466, "grad_norm": 0.9212996959686279, "learning_rate": 0.0002, "epoch": 3.728073720798642, "step": 23060}, {"loss": 0.6454, "grad_norm": 0.925690233707428, "learning_rate": 0.0002, "epoch": 3.7296904049793875, "step": 23070}, {"loss": 0.615, "grad_norm": 0.8323310613632202, "learning_rate": 0.0002, "epoch": 3.7313070891601328, "step": 23080}, {"loss": 0.679, "grad_norm": 0.8966048955917358, "learning_rate": 0.0002, "epoch": 3.732923773340878, "step": 23090}, {"loss": 0.6151, "grad_norm": 0.8995837569236755, "learning_rate": 0.0002, "epoch": 3.7345404575216232, "step": 23100}, {"loss": 0.6143, "grad_norm": 0.8748890161514282, "learning_rate": 0.0002, "epoch": 3.7361571417023685, "step": 23110}, {"loss": 0.6246, "grad_norm": 0.7985540628433228, "learning_rate": 0.0002, "epoch": 3.7377738258831137, "step": 23120}, {"loss": 0.6279, "grad_norm": 1.0240917205810547, "learning_rate": 0.0002, "epoch": 3.739390510063859, "step": 23130}, {"loss": 0.6747, "grad_norm": 0.9181789755821228, "learning_rate": 0.0002, "epoch": 3.741007194244604, "step": 23140}, {"loss": 0.6026, "grad_norm": 0.8896583914756775, "learning_rate": 0.0002, "epoch": 3.7426238784253494, "step": 23150}, {"loss": 0.5972, "grad_norm": 0.8635515570640564, "learning_rate": 0.0002, "epoch": 3.744240562606095, "step": 23160}, {"loss": 0.6683, "grad_norm": 0.8873575329780579, "learning_rate": 0.0002, "epoch": 3.7458572467868403, "step": 23170}, {"loss": 0.6143, "grad_norm": 0.9807148575782776, "learning_rate": 0.0002, "epoch": 3.7474739309675855, "step": 23180}, {"loss": 0.6381, "grad_norm": 0.900477945804596, "learning_rate": 0.0002, "epoch": 3.7490906151483308, "step": 23190}, {"loss": 0.6542, "grad_norm": 0.9379992485046387, "learning_rate": 0.0002, "epoch": 3.750707299329076, "step": 23200}, {"loss": 0.6015, "grad_norm": 0.9649890661239624, "learning_rate": 0.0002, "epoch": 3.752323983509821, "step": 23210}, {"loss": 0.6735, "grad_norm": 0.824442446231842, "learning_rate": 0.0002, "epoch": 3.753940667690567, "step": 23220}, {"loss": 0.5992, "grad_norm": 0.8896150588989258, "learning_rate": 0.0002, "epoch": 3.755557351871312, "step": 23230}, {"loss": 0.6081, "grad_norm": 0.751249372959137, "learning_rate": 0.0002, "epoch": 3.7571740360520574, "step": 23240}, {"loss": 0.629, "grad_norm": 0.9392193555831909, "learning_rate": 0.0002, "epoch": 3.7587907202328026, "step": 23250}, {"loss": 0.6209, "grad_norm": 0.9284586310386658, "learning_rate": 0.0002, "epoch": 3.760407404413548, "step": 23260}, {"loss": 0.6414, "grad_norm": 0.7738175392150879, "learning_rate": 0.0002, "epoch": 3.762024088594293, "step": 23270}, {"loss": 0.6743, "grad_norm": 0.9252978563308716, "learning_rate": 0.0002, "epoch": 3.7636407727750383, "step": 23280}, {"loss": 0.5984, "grad_norm": 0.9501895904541016, "learning_rate": 0.0002, "epoch": 3.7652574569557835, "step": 23290}, {"loss": 0.6568, "grad_norm": 0.9416276216506958, "learning_rate": 0.0002, "epoch": 3.7668741411365287, "step": 23300}, {"loss": 0.6507, "grad_norm": 0.7076631784439087, "learning_rate": 0.0002, "epoch": 3.7684908253172744, "step": 23310}, {"loss": 0.6329, "grad_norm": 0.9864492416381836, "learning_rate": 0.0002, "epoch": 3.7701075094980196, "step": 23320}, {"loss": 0.6537, "grad_norm": 0.8450456261634827, "learning_rate": 0.0002, "epoch": 3.771724193678765, "step": 23330}, {"loss": 0.658, "grad_norm": 1.0768941640853882, "learning_rate": 0.0002, "epoch": 3.77334087785951, "step": 23340}, {"loss": 0.6408, "grad_norm": 0.9956819415092468, "learning_rate": 0.0002, "epoch": 3.7749575620402553, "step": 23350}, {"loss": 0.6464, "grad_norm": 0.9234658479690552, "learning_rate": 0.0002, "epoch": 3.7765742462210006, "step": 23360}, {"loss": 0.6542, "grad_norm": 1.0993858575820923, "learning_rate": 0.0002, "epoch": 3.7781909304017463, "step": 23370}, {"loss": 0.6391, "grad_norm": 0.923159658908844, "learning_rate": 0.0002, "epoch": 3.7798076145824915, "step": 23380}, {"loss": 0.6625, "grad_norm": 0.9311541318893433, "learning_rate": 0.0002, "epoch": 3.7814242987632367, "step": 23390}, {"loss": 0.6535, "grad_norm": 0.919681191444397, "learning_rate": 0.0002, "epoch": 3.783040982943982, "step": 23400}, {"loss": 0.6138, "grad_norm": 1.7406195402145386, "learning_rate": 0.0002, "epoch": 3.784657667124727, "step": 23410}, {"loss": 0.657, "grad_norm": 0.7789074182510376, "learning_rate": 0.0002, "epoch": 3.7862743513054724, "step": 23420}, {"loss": 0.658, "grad_norm": 0.8302814960479736, "learning_rate": 0.0002, "epoch": 3.7878910354862176, "step": 23430}, {"loss": 0.649, "grad_norm": 0.8089349269866943, "learning_rate": 0.0002, "epoch": 3.789507719666963, "step": 23440}, {"loss": 0.6682, "grad_norm": 0.9006284475326538, "learning_rate": 0.0002, "epoch": 3.791124403847708, "step": 23450}, {"loss": 0.6335, "grad_norm": 0.8426766991615295, "learning_rate": 0.0002, "epoch": 3.7927410880284538, "step": 23460}, {"loss": 0.6364, "grad_norm": 1.2576252222061157, "learning_rate": 0.0002, "epoch": 3.794357772209199, "step": 23470}, {"loss": 0.6324, "grad_norm": 1.0307610034942627, "learning_rate": 0.0002, "epoch": 3.7959744563899442, "step": 23480}, {"loss": 0.6262, "grad_norm": 0.8525972962379456, "learning_rate": 0.0002, "epoch": 3.7975911405706895, "step": 23490}, {"loss": 0.6757, "grad_norm": 1.159039855003357, "learning_rate": 0.0002, "epoch": 3.7992078247514347, "step": 23500}, {"loss": 0.6414, "grad_norm": 1.4193549156188965, "learning_rate": 0.0002, "epoch": 3.80082450893218, "step": 23510}, {"loss": 0.6413, "grad_norm": 0.8245543837547302, "learning_rate": 0.0002, "epoch": 3.8024411931129256, "step": 23520}, {"loss": 0.6417, "grad_norm": 0.8847230076789856, "learning_rate": 0.0002, "epoch": 3.804057877293671, "step": 23530}, {"loss": 0.6415, "grad_norm": 0.9574624300003052, "learning_rate": 0.0002, "epoch": 3.805674561474416, "step": 23540}, {"loss": 0.5765, "grad_norm": 1.048020601272583, "learning_rate": 0.0002, "epoch": 3.8072912456551613, "step": 23550}, {"loss": 0.6497, "grad_norm": 0.8302255868911743, "learning_rate": 0.0002, "epoch": 3.8089079298359065, "step": 23560}, {"loss": 0.6534, "grad_norm": 0.8269215822219849, "learning_rate": 0.0002, "epoch": 3.8105246140166518, "step": 23570}, {"loss": 0.6294, "grad_norm": 0.9375753402709961, "learning_rate": 0.0002, "epoch": 3.812141298197397, "step": 23580}, {"loss": 0.6132, "grad_norm": 1.0234097242355347, "learning_rate": 0.0002, "epoch": 3.8137579823781422, "step": 23590}, {"loss": 0.6625, "grad_norm": 0.8978445529937744, "learning_rate": 0.0002, "epoch": 3.8153746665588875, "step": 23600}, {"loss": 0.6315, "grad_norm": 0.7929515838623047, "learning_rate": 0.0002, "epoch": 3.816991350739633, "step": 23610}, {"loss": 0.6387, "grad_norm": 1.3255881071090698, "learning_rate": 0.0002, "epoch": 3.8186080349203784, "step": 23620}, {"loss": 0.5947, "grad_norm": 0.9188598990440369, "learning_rate": 0.0002, "epoch": 3.8202247191011236, "step": 23630}, {"loss": 0.6152, "grad_norm": 0.8811675906181335, "learning_rate": 0.0002, "epoch": 3.821841403281869, "step": 23640}, {"loss": 0.6253, "grad_norm": 0.8061038255691528, "learning_rate": 0.0002, "epoch": 3.823458087462614, "step": 23650}, {"loss": 0.6517, "grad_norm": 0.9975376129150391, "learning_rate": 0.0002, "epoch": 3.8250747716433597, "step": 23660}, {"loss": 0.6288, "grad_norm": 0.8036105036735535, "learning_rate": 0.0002, "epoch": 3.826691455824105, "step": 23670}, {"loss": 0.6845, "grad_norm": 0.7401984333992004, "learning_rate": 0.0002, "epoch": 3.82830814000485, "step": 23680}, {"loss": 0.6423, "grad_norm": 0.829753041267395, "learning_rate": 0.0002, "epoch": 3.8299248241855954, "step": 23690}, {"loss": 0.6611, "grad_norm": 0.8753240704536438, "learning_rate": 0.0002, "epoch": 3.8315415083663407, "step": 23700}, {"loss": 0.6686, "grad_norm": 0.8157842755317688, "learning_rate": 0.0002, "epoch": 3.833158192547086, "step": 23710}, {"loss": 0.6181, "grad_norm": 0.6183798909187317, "learning_rate": 0.0002, "epoch": 3.834774876727831, "step": 23720}, {"loss": 0.5965, "grad_norm": 0.9548442363739014, "learning_rate": 0.0002, "epoch": 3.8363915609085764, "step": 23730}, {"loss": 0.6456, "grad_norm": 0.8319669961929321, "learning_rate": 0.0002, "epoch": 3.8380082450893216, "step": 23740}, {"loss": 0.6585, "grad_norm": 0.9718693494796753, "learning_rate": 0.0002, "epoch": 3.839624929270067, "step": 23750}, {"loss": 0.6518, "grad_norm": 0.8672235012054443, "learning_rate": 0.0002, "epoch": 3.8412416134508125, "step": 23760}, {"loss": 0.6774, "grad_norm": 1.1210707426071167, "learning_rate": 0.0002, "epoch": 3.8428582976315577, "step": 23770}, {"loss": 0.5923, "grad_norm": 0.9177767634391785, "learning_rate": 0.0002, "epoch": 3.844474981812303, "step": 23780}, {"loss": 0.6286, "grad_norm": 0.8714171648025513, "learning_rate": 0.0002, "epoch": 3.846091665993048, "step": 23790}, {"loss": 0.6302, "grad_norm": 1.1853246688842773, "learning_rate": 0.0002, "epoch": 3.8477083501737934, "step": 23800}, {"loss": 0.6144, "grad_norm": 0.8091260194778442, "learning_rate": 0.0002, "epoch": 3.849325034354539, "step": 23810}, {"loss": 0.658, "grad_norm": 0.9710774421691895, "learning_rate": 0.0002, "epoch": 3.8509417185352843, "step": 23820}, {"loss": 0.6151, "grad_norm": 0.7648707628250122, "learning_rate": 0.0002, "epoch": 3.8525584027160296, "step": 23830}, {"loss": 0.6013, "grad_norm": 0.7809253931045532, "learning_rate": 0.0002, "epoch": 3.854175086896775, "step": 23840}, {"loss": 0.6006, "grad_norm": 0.8337951898574829, "learning_rate": 0.0002, "epoch": 3.85579177107752, "step": 23850}, {"loss": 0.6456, "grad_norm": 0.9271913170814514, "learning_rate": 0.0002, "epoch": 3.8574084552582653, "step": 23860}, {"loss": 0.6671, "grad_norm": 0.985334038734436, "learning_rate": 0.0002, "epoch": 3.8590251394390105, "step": 23870}, {"loss": 0.6693, "grad_norm": 0.8458583354949951, "learning_rate": 0.0002, "epoch": 3.8606418236197557, "step": 23880}, {"loss": 0.6207, "grad_norm": 1.015348196029663, "learning_rate": 0.0002, "epoch": 3.862258507800501, "step": 23890}, {"loss": 0.649, "grad_norm": 1.0121688842773438, "learning_rate": 0.0002, "epoch": 3.8638751919812466, "step": 23900}, {"loss": 0.5921, "grad_norm": 0.8883971571922302, "learning_rate": 0.0002, "epoch": 3.865491876161992, "step": 23910}, {"loss": 0.6597, "grad_norm": 1.028086543083191, "learning_rate": 0.0002, "epoch": 3.867108560342737, "step": 23920}, {"loss": 0.6654, "grad_norm": 0.9645734429359436, "learning_rate": 0.0002, "epoch": 3.8687252445234823, "step": 23930}, {"loss": 0.6328, "grad_norm": 0.8235350251197815, "learning_rate": 0.0002, "epoch": 3.8703419287042276, "step": 23940}, {"loss": 0.6387, "grad_norm": 1.0298916101455688, "learning_rate": 0.0002, "epoch": 3.871958612884973, "step": 23950}, {"loss": 0.5966, "grad_norm": 1.0063377618789673, "learning_rate": 0.0002, "epoch": 3.8735752970657185, "step": 23960}, {"loss": 0.6234, "grad_norm": 0.9230626821517944, "learning_rate": 0.0002, "epoch": 3.8751919812464637, "step": 23970}, {"loss": 0.6159, "grad_norm": 0.9243063926696777, "learning_rate": 0.0002, "epoch": 3.876808665427209, "step": 23980}, {"loss": 0.6035, "grad_norm": 1.0211291313171387, "learning_rate": 0.0002, "epoch": 3.878425349607954, "step": 23990}, {"loss": 0.6351, "grad_norm": 0.7800535559654236, "learning_rate": 0.0002, "epoch": 3.8800420337886994, "step": 24000}, {"loss": 0.7, "grad_norm": 0.7904248833656311, "learning_rate": 0.0002, "epoch": 3.8816587179694446, "step": 24010}, {"loss": 0.6516, "grad_norm": 1.1975988149642944, "learning_rate": 0.0002, "epoch": 3.88327540215019, "step": 24020}, {"loss": 0.6006, "grad_norm": 1.0626593828201294, "learning_rate": 0.0002, "epoch": 3.884892086330935, "step": 24030}, {"loss": 0.6115, "grad_norm": 0.9012193083763123, "learning_rate": 0.0002, "epoch": 3.8865087705116803, "step": 24040}, {"loss": 0.6786, "grad_norm": 1.1159172058105469, "learning_rate": 0.0002, "epoch": 3.888125454692426, "step": 24050}, {"loss": 0.6635, "grad_norm": 1.276838779449463, "learning_rate": 0.0002, "epoch": 3.889742138873171, "step": 24060}, {"loss": 0.5985, "grad_norm": 0.8467690348625183, "learning_rate": 0.0002, "epoch": 3.8913588230539164, "step": 24070}, {"loss": 0.6655, "grad_norm": 0.9862841963768005, "learning_rate": 0.0002, "epoch": 3.8929755072346617, "step": 24080}, {"loss": 0.6098, "grad_norm": 0.7134621739387512, "learning_rate": 0.0002, "epoch": 3.894592191415407, "step": 24090}, {"loss": 0.618, "grad_norm": 0.8178175091743469, "learning_rate": 0.0002, "epoch": 3.896208875596152, "step": 24100}, {"loss": 0.6147, "grad_norm": 0.9229172468185425, "learning_rate": 0.0002, "epoch": 3.897825559776898, "step": 24110}, {"loss": 0.6554, "grad_norm": 1.0878316164016724, "learning_rate": 0.0002, "epoch": 3.899442243957643, "step": 24120}, {"loss": 0.6616, "grad_norm": 0.971645712852478, "learning_rate": 0.0002, "epoch": 3.9010589281383883, "step": 24130}, {"loss": 0.6228, "grad_norm": 0.8862188458442688, "learning_rate": 0.0002, "epoch": 3.9026756123191335, "step": 24140}, {"loss": 0.6192, "grad_norm": 0.9126982688903809, "learning_rate": 0.0002, "epoch": 3.9042922964998787, "step": 24150}, {"loss": 0.6734, "grad_norm": 0.8833470940589905, "learning_rate": 0.0002, "epoch": 3.905908980680624, "step": 24160}, {"loss": 0.5832, "grad_norm": 0.8320947885513306, "learning_rate": 0.0002, "epoch": 3.907525664861369, "step": 24170}, {"loss": 0.6247, "grad_norm": 0.9156602025032043, "learning_rate": 0.0002, "epoch": 3.9091423490421144, "step": 24180}, {"loss": 0.6678, "grad_norm": 1.029181957244873, "learning_rate": 0.0002, "epoch": 3.9107590332228597, "step": 24190}, {"loss": 0.6565, "grad_norm": 0.9052802324295044, "learning_rate": 0.0002, "epoch": 3.9123757174036053, "step": 24200}, {"loss": 0.6346, "grad_norm": 0.8847255110740662, "learning_rate": 0.0002, "epoch": 3.9139924015843506, "step": 24210}, {"loss": 0.6343, "grad_norm": 0.9642062187194824, "learning_rate": 0.0002, "epoch": 3.915609085765096, "step": 24220}, {"loss": 0.6557, "grad_norm": 0.8629093766212463, "learning_rate": 0.0002, "epoch": 3.917225769945841, "step": 24230}, {"loss": 0.6086, "grad_norm": 0.8674976825714111, "learning_rate": 0.0002, "epoch": 3.9188424541265863, "step": 24240}, {"loss": 0.5874, "grad_norm": 1.104846477508545, "learning_rate": 0.0002, "epoch": 3.9204591383073315, "step": 24250}, {"loss": 0.6501, "grad_norm": 1.0874955654144287, "learning_rate": 0.0002, "epoch": 3.922075822488077, "step": 24260}, {"loss": 0.6455, "grad_norm": 0.8689812421798706, "learning_rate": 0.0002, "epoch": 3.9236925066688224, "step": 24270}, {"loss": 0.5893, "grad_norm": 0.9724617004394531, "learning_rate": 0.0002, "epoch": 3.9253091908495676, "step": 24280}, {"loss": 0.6616, "grad_norm": 0.9165538549423218, "learning_rate": 0.0002, "epoch": 3.926925875030313, "step": 24290}, {"loss": 0.645, "grad_norm": 0.9307710528373718, "learning_rate": 0.0002, "epoch": 3.928542559211058, "step": 24300}, {"loss": 0.6071, "grad_norm": 0.8589295148849487, "learning_rate": 0.0002, "epoch": 3.9301592433918033, "step": 24310}, {"loss": 0.6662, "grad_norm": 0.9151099920272827, "learning_rate": 0.0002, "epoch": 3.9317759275725486, "step": 24320}, {"loss": 0.7075, "grad_norm": 0.9633517265319824, "learning_rate": 0.0002, "epoch": 3.933392611753294, "step": 24330}, {"loss": 0.6432, "grad_norm": 0.9521116018295288, "learning_rate": 0.0002, "epoch": 3.935009295934039, "step": 24340}, {"loss": 0.6457, "grad_norm": 0.8366776704788208, "learning_rate": 0.0002, "epoch": 3.9366259801147847, "step": 24350}, {"loss": 0.6139, "grad_norm": 0.8972663283348083, "learning_rate": 0.0002, "epoch": 3.93824266429553, "step": 24360}, {"loss": 0.661, "grad_norm": 0.8102919459342957, "learning_rate": 0.0002, "epoch": 3.939859348476275, "step": 24370}, {"loss": 0.6388, "grad_norm": 0.8189975023269653, "learning_rate": 0.0002, "epoch": 3.9414760326570204, "step": 24380}, {"loss": 0.6818, "grad_norm": 0.9569464921951294, "learning_rate": 0.0002, "epoch": 3.9430927168377656, "step": 24390}, {"loss": 0.6999, "grad_norm": 0.7459101676940918, "learning_rate": 0.0002, "epoch": 3.9447094010185113, "step": 24400}, {"loss": 0.6069, "grad_norm": 0.8536974787712097, "learning_rate": 0.0002, "epoch": 3.9463260851992565, "step": 24410}, {"loss": 0.5683, "grad_norm": 0.8763698935508728, "learning_rate": 0.0002, "epoch": 3.9479427693800018, "step": 24420}, {"loss": 0.6478, "grad_norm": 0.9381106495857239, "learning_rate": 0.0002, "epoch": 3.949559453560747, "step": 24430}, {"loss": 0.6371, "grad_norm": 0.934440016746521, "learning_rate": 0.0002, "epoch": 3.9511761377414922, "step": 24440}, {"loss": 0.6393, "grad_norm": 0.903918981552124, "learning_rate": 0.0002, "epoch": 3.9527928219222375, "step": 24450}, {"loss": 0.6175, "grad_norm": 0.8771953582763672, "learning_rate": 0.0002, "epoch": 3.9544095061029827, "step": 24460}, {"loss": 0.6971, "grad_norm": 1.0375410318374634, "learning_rate": 0.0002, "epoch": 3.956026190283728, "step": 24470}, {"loss": 0.6313, "grad_norm": 0.9439185261726379, "learning_rate": 0.0002, "epoch": 3.957642874464473, "step": 24480}, {"loss": 0.6076, "grad_norm": 0.935467004776001, "learning_rate": 0.0002, "epoch": 3.9592595586452184, "step": 24490}, {"loss": 0.6437, "grad_norm": 0.6900772452354431, "learning_rate": 0.0002, "epoch": 3.960876242825964, "step": 24500}, {"loss": 0.6445, "grad_norm": 1.0172916650772095, "learning_rate": 0.0002, "epoch": 3.9624929270067093, "step": 24510}, {"loss": 0.6308, "grad_norm": 0.9167046546936035, "learning_rate": 0.0002, "epoch": 3.9641096111874545, "step": 24520}, {"loss": 0.6519, "grad_norm": 0.7230527997016907, "learning_rate": 0.0002, "epoch": 3.9657262953681998, "step": 24530}, {"loss": 0.6564, "grad_norm": 0.8980403542518616, "learning_rate": 0.0002, "epoch": 3.967342979548945, "step": 24540}, {"loss": 0.6099, "grad_norm": 0.8555465936660767, "learning_rate": 0.0002, "epoch": 3.9689596637296907, "step": 24550}, {"loss": 0.6617, "grad_norm": 0.7825445532798767, "learning_rate": 0.0002, "epoch": 3.970576347910436, "step": 24560}, {"loss": 0.604, "grad_norm": 0.7273133993148804, "learning_rate": 0.0002, "epoch": 3.972193032091181, "step": 24570}, {"loss": 0.6427, "grad_norm": 0.9612047672271729, "learning_rate": 0.0002, "epoch": 3.9738097162719264, "step": 24580}, {"loss": 0.6426, "grad_norm": 0.9865460991859436, "learning_rate": 0.0002, "epoch": 3.9754264004526716, "step": 24590}, {"loss": 0.6052, "grad_norm": 0.8638762831687927, "learning_rate": 0.0002, "epoch": 3.977043084633417, "step": 24600}, {"loss": 0.6097, "grad_norm": 1.0096198320388794, "learning_rate": 0.0002, "epoch": 3.978659768814162, "step": 24610}, {"loss": 0.6664, "grad_norm": 0.8475532531738281, "learning_rate": 0.0002, "epoch": 3.9802764529949073, "step": 24620}, {"loss": 0.6711, "grad_norm": 0.9696195721626282, "learning_rate": 0.0002, "epoch": 3.9818931371756525, "step": 24630}, {"loss": 0.6446, "grad_norm": 0.7499843239784241, "learning_rate": 0.0002, "epoch": 3.9835098213563978, "step": 24640}, {"loss": 0.6054, "grad_norm": 0.8865424990653992, "learning_rate": 0.0002, "epoch": 3.9851265055371434, "step": 24650}, {"loss": 0.5975, "grad_norm": 0.8089959025382996, "learning_rate": 0.0002, "epoch": 3.9867431897178887, "step": 24660}, {"loss": 0.6677, "grad_norm": 0.6946012377738953, "learning_rate": 0.0002, "epoch": 3.988359873898634, "step": 24670}, {"loss": 0.6329, "grad_norm": 0.7991759181022644, "learning_rate": 0.0002, "epoch": 3.989976558079379, "step": 24680}, {"loss": 0.6449, "grad_norm": 0.8803931474685669, "learning_rate": 0.0002, "epoch": 3.9915932422601244, "step": 24690}, {"loss": 0.7091, "grad_norm": 0.8848299980163574, "learning_rate": 0.0002, "epoch": 3.99320992644087, "step": 24700}, {"loss": 0.6551, "grad_norm": 0.7448889017105103, "learning_rate": 0.0002, "epoch": 3.9948266106216153, "step": 24710}, {"loss": 0.6432, "grad_norm": 0.9361620545387268, "learning_rate": 0.0002, "epoch": 3.9964432948023605, "step": 24720}, {"loss": 0.5917, "grad_norm": 0.9958081245422363, "learning_rate": 0.0002, "epoch": 3.9980599789831057, "step": 24730}, {"loss": 0.6567, "grad_norm": 1.026004672050476, "learning_rate": 0.0002, "epoch": 3.999676663163851, "step": 24740}, {"eval_loss": 1.1524168252944946, "eval_runtime": 122.1585, "eval_samples_per_second": 6.0, "eval_steps_per_second": 0.753, "epoch": 4.0, "step": 24742}, {"loss": 0.6057, "grad_norm": 1.0664808750152588, "learning_rate": 0.0002, "epoch": 4.001293347344596, "step": 24750}, {"loss": 0.5644, "grad_norm": 1.0113720893859863, "learning_rate": 0.0002, "epoch": 4.002910031525341, "step": 24760}, {"loss": 0.5628, "grad_norm": 0.991486668586731, "learning_rate": 0.0002, "epoch": 4.004526715706087, "step": 24770}, {"loss": 0.508, "grad_norm": 0.951754629611969, "learning_rate": 0.0002, "epoch": 4.006143399886832, "step": 24780}, {"loss": 0.5314, "grad_norm": 1.13059401512146, "learning_rate": 0.0002, "epoch": 4.007760084067577, "step": 24790}, {"loss": 0.5323, "grad_norm": 0.9343926310539246, "learning_rate": 0.0002, "epoch": 4.009376768248322, "step": 24800}, {"loss": 0.5161, "grad_norm": 1.0680590867996216, "learning_rate": 0.0002, "epoch": 4.010993452429068, "step": 24810}, {"loss": 0.513, "grad_norm": 1.0022706985473633, "learning_rate": 0.0002, "epoch": 4.012610136609814, "step": 24820}, {"loss": 0.543, "grad_norm": 1.0285297632217407, "learning_rate": 0.0002, "epoch": 4.014226820790559, "step": 24830}, {"loss": 0.5311, "grad_norm": 0.8347002863883972, "learning_rate": 0.0002, "epoch": 4.015843504971304, "step": 24840}, {"loss": 0.5655, "grad_norm": 0.9675396680831909, "learning_rate": 0.0002, "epoch": 4.017460189152049, "step": 24850}, {"loss": 0.5625, "grad_norm": 0.9238511323928833, "learning_rate": 0.0002, "epoch": 4.019076873332795, "step": 24860}, {"loss": 0.5327, "grad_norm": 1.1576941013336182, "learning_rate": 0.0002, "epoch": 4.02069355751354, "step": 24870}, {"loss": 0.5533, "grad_norm": 0.8583757281303406, "learning_rate": 0.0002, "epoch": 4.022310241694285, "step": 24880}, {"loss": 0.5483, "grad_norm": 0.9816817045211792, "learning_rate": 0.0002, "epoch": 4.02392692587503, "step": 24890}, {"loss": 0.5605, "grad_norm": 0.955073893070221, "learning_rate": 0.0002, "epoch": 4.0255436100557755, "step": 24900}, {"loss": 0.4896, "grad_norm": 1.1054974794387817, "learning_rate": 0.0002, "epoch": 4.027160294236521, "step": 24910}, {"loss": 0.5246, "grad_norm": 1.1240060329437256, "learning_rate": 0.0002, "epoch": 4.028776978417266, "step": 24920}, {"loss": 0.5451, "grad_norm": 0.9512825012207031, "learning_rate": 0.0002, "epoch": 4.030393662598011, "step": 24930}, {"loss": 0.5584, "grad_norm": 0.85965496301651, "learning_rate": 0.0002, "epoch": 4.0320103467787565, "step": 24940}, {"loss": 0.5564, "grad_norm": 0.9378061294555664, "learning_rate": 0.0002, "epoch": 4.033627030959502, "step": 24950}, {"loss": 0.5008, "grad_norm": 0.9655424356460571, "learning_rate": 0.0002, "epoch": 4.035243715140247, "step": 24960}, {"loss": 0.5538, "grad_norm": 1.1393707990646362, "learning_rate": 0.0002, "epoch": 4.036860399320993, "step": 24970}, {"loss": 0.5785, "grad_norm": 1.0220451354980469, "learning_rate": 0.0002, "epoch": 4.038477083501738, "step": 24980}, {"loss": 0.5813, "grad_norm": 0.9785808324813843, "learning_rate": 0.0002, "epoch": 4.0400937676824835, "step": 24990}, {"loss": 0.5153, "grad_norm": 1.0257649421691895, "learning_rate": 0.0002, "epoch": 4.041710451863229, "step": 25000}, {"loss": 0.5658, "grad_norm": 0.9737892150878906, "learning_rate": 0.0002, "epoch": 4.043327136043974, "step": 25010}, {"loss": 0.5515, "grad_norm": 0.7416959404945374, "learning_rate": 0.0002, "epoch": 4.044943820224719, "step": 25020}, {"loss": 0.5372, "grad_norm": 0.7909596562385559, "learning_rate": 0.0002, "epoch": 4.046560504405464, "step": 25030}, {"loss": 0.5265, "grad_norm": 0.8923130631446838, "learning_rate": 0.0002, "epoch": 4.04817718858621, "step": 25040}, {"loss": 0.5035, "grad_norm": 0.9044941663742065, "learning_rate": 0.0002, "epoch": 4.049793872766955, "step": 25050}, {"loss": 0.5135, "grad_norm": 0.866352379322052, "learning_rate": 0.0002, "epoch": 4.0514105569477, "step": 25060}, {"loss": 0.5956, "grad_norm": 1.544549822807312, "learning_rate": 0.0002, "epoch": 4.053027241128445, "step": 25070}, {"loss": 0.5418, "grad_norm": 0.8426995277404785, "learning_rate": 0.0002, "epoch": 4.054643925309191, "step": 25080}, {"loss": 0.5537, "grad_norm": 0.9797548651695251, "learning_rate": 0.0002, "epoch": 4.056260609489936, "step": 25090}, {"loss": 0.55, "grad_norm": 0.8468434810638428, "learning_rate": 0.0002, "epoch": 4.057877293670681, "step": 25100}, {"loss": 0.5242, "grad_norm": 0.9294559955596924, "learning_rate": 0.0002, "epoch": 4.059493977851426, "step": 25110}, {"loss": 0.5295, "grad_norm": 0.9686688780784607, "learning_rate": 0.0002, "epoch": 4.061110662032172, "step": 25120}, {"loss": 0.5642, "grad_norm": 0.8042728304862976, "learning_rate": 0.0002, "epoch": 4.062727346212918, "step": 25130}, {"loss": 0.548, "grad_norm": 1.165160894393921, "learning_rate": 0.0002, "epoch": 4.064344030393663, "step": 25140}, {"loss": 0.5473, "grad_norm": 1.2161961793899536, "learning_rate": 0.0002, "epoch": 4.065960714574408, "step": 25150}, {"loss": 0.5217, "grad_norm": 1.0762810707092285, "learning_rate": 0.0002, "epoch": 4.067577398755153, "step": 25160}, {"loss": 0.5886, "grad_norm": 0.7580869793891907, "learning_rate": 0.0002, "epoch": 4.069194082935899, "step": 25170}, {"loss": 0.5401, "grad_norm": 0.9630117416381836, "learning_rate": 0.0002, "epoch": 4.070810767116644, "step": 25180}, {"loss": 0.5378, "grad_norm": 0.9049716591835022, "learning_rate": 0.0002, "epoch": 4.072427451297389, "step": 25190}, {"loss": 0.5266, "grad_norm": 1.1536930799484253, "learning_rate": 0.0002, "epoch": 4.074044135478134, "step": 25200}, {"loss": 0.5523, "grad_norm": 0.901461124420166, "learning_rate": 0.0002, "epoch": 4.0756608196588795, "step": 25210}, {"loss": 0.5132, "grad_norm": 1.3318437337875366, "learning_rate": 0.0002, "epoch": 4.077277503839625, "step": 25220}, {"loss": 0.5317, "grad_norm": 0.8811455368995667, "learning_rate": 0.0002, "epoch": 4.07889418802037, "step": 25230}, {"loss": 0.5798, "grad_norm": 1.0564165115356445, "learning_rate": 0.0002, "epoch": 4.080510872201115, "step": 25240}, {"loss": 0.5472, "grad_norm": 1.1008027791976929, "learning_rate": 0.0002, "epoch": 4.08212755638186, "step": 25250}, {"loss": 0.5195, "grad_norm": 1.150097131729126, "learning_rate": 0.0002, "epoch": 4.083744240562606, "step": 25260}, {"loss": 0.5321, "grad_norm": 0.9339924454689026, "learning_rate": 0.0002, "epoch": 4.085360924743352, "step": 25270}, {"loss": 0.5597, "grad_norm": 1.0902045965194702, "learning_rate": 0.0002, "epoch": 4.086977608924097, "step": 25280}, {"loss": 0.5203, "grad_norm": 0.8483911156654358, "learning_rate": 0.0002, "epoch": 4.088594293104842, "step": 25290}, {"loss": 0.5697, "grad_norm": 0.9477024674415588, "learning_rate": 0.0002, "epoch": 4.0902109772855875, "step": 25300}, {"loss": 0.5384, "grad_norm": 0.9500215649604797, "learning_rate": 0.0002, "epoch": 4.091827661466333, "step": 25310}, {"loss": 0.5045, "grad_norm": 1.040468454360962, "learning_rate": 0.0002, "epoch": 4.093444345647078, "step": 25320}, {"loss": 0.5488, "grad_norm": 0.7457592487335205, "learning_rate": 0.0002, "epoch": 4.095061029827823, "step": 25330}, {"loss": 0.609, "grad_norm": 1.2092097997665405, "learning_rate": 0.0002, "epoch": 4.096677714008568, "step": 25340}, {"loss": 0.5174, "grad_norm": 0.9652107954025269, "learning_rate": 0.0002, "epoch": 4.098294398189314, "step": 25350}, {"loss": 0.5559, "grad_norm": 0.8464955687522888, "learning_rate": 0.0002, "epoch": 4.099911082370059, "step": 25360}, {"loss": 0.5635, "grad_norm": 0.875026285648346, "learning_rate": 0.0002, "epoch": 4.101527766550804, "step": 25370}, {"loss": 0.5774, "grad_norm": 0.9241740107536316, "learning_rate": 0.0002, "epoch": 4.103144450731549, "step": 25380}, {"loss": 0.5578, "grad_norm": 0.9769546389579773, "learning_rate": 0.0002, "epoch": 4.1047611349122946, "step": 25390}, {"loss": 0.567, "grad_norm": 1.1501960754394531, "learning_rate": 0.0002, "epoch": 4.10637781909304, "step": 25400}, {"loss": 0.5241, "grad_norm": 0.9135243892669678, "learning_rate": 0.0002, "epoch": 4.107994503273786, "step": 25410}, {"loss": 0.5152, "grad_norm": 0.9905396103858948, "learning_rate": 0.0002, "epoch": 4.109611187454531, "step": 25420}, {"loss": 0.5064, "grad_norm": 0.9845104217529297, "learning_rate": 0.0002, "epoch": 4.111227871635276, "step": 25430}, {"loss": 0.5029, "grad_norm": 0.8326883912086487, "learning_rate": 0.0002, "epoch": 4.112844555816022, "step": 25440}, {"loss": 0.5312, "grad_norm": 0.9264556765556335, "learning_rate": 0.0002, "epoch": 4.114461239996767, "step": 25450}, {"loss": 0.5968, "grad_norm": 1.043080449104309, "learning_rate": 0.0002, "epoch": 4.116077924177512, "step": 25460}, {"loss": 0.5773, "grad_norm": 0.8533386588096619, "learning_rate": 0.0002, "epoch": 4.117694608358257, "step": 25470}, {"loss": 0.5584, "grad_norm": 1.0133965015411377, "learning_rate": 0.0002, "epoch": 4.1193112925390025, "step": 25480}, {"loss": 0.566, "grad_norm": 0.7476310133934021, "learning_rate": 0.0002, "epoch": 4.120927976719748, "step": 25490}, {"loss": 0.5189, "grad_norm": 1.1247259378433228, "learning_rate": 0.0002, "epoch": 4.122544660900493, "step": 25500}, {"loss": 0.5751, "grad_norm": 1.0764678716659546, "learning_rate": 0.0002, "epoch": 4.124161345081238, "step": 25510}, {"loss": 0.5391, "grad_norm": 0.7679798007011414, "learning_rate": 0.0002, "epoch": 4.1257780292619834, "step": 25520}, {"loss": 0.5233, "grad_norm": 0.8877071142196655, "learning_rate": 0.0002, "epoch": 4.127394713442729, "step": 25530}, {"loss": 0.5769, "grad_norm": 1.0440239906311035, "learning_rate": 0.0002, "epoch": 4.129011397623474, "step": 25540}, {"loss": 0.5723, "grad_norm": 0.984145998954773, "learning_rate": 0.0002, "epoch": 4.130628081804219, "step": 25550}, {"loss": 0.5741, "grad_norm": 0.8667055368423462, "learning_rate": 0.0002, "epoch": 4.132244765984965, "step": 25560}, {"loss": 0.5816, "grad_norm": 1.1300835609436035, "learning_rate": 0.0002, "epoch": 4.1338614501657105, "step": 25570}, {"loss": 0.524, "grad_norm": 0.9314348101615906, "learning_rate": 0.0002, "epoch": 4.135478134346456, "step": 25580}, {"loss": 0.5283, "grad_norm": 0.7731879949569702, "learning_rate": 0.0002, "epoch": 4.137094818527201, "step": 25590}, {"loss": 0.5307, "grad_norm": 1.0080097913742065, "learning_rate": 0.0002, "epoch": 4.138711502707946, "step": 25600}, {"loss": 0.5759, "grad_norm": 1.2475038766860962, "learning_rate": 0.0002, "epoch": 4.140328186888691, "step": 25610}, {"loss": 0.55, "grad_norm": 0.9912930727005005, "learning_rate": 0.0002, "epoch": 4.141944871069437, "step": 25620}, {"loss": 0.5624, "grad_norm": 0.9088651537895203, "learning_rate": 0.0002, "epoch": 4.143561555250182, "step": 25630}, {"loss": 0.5393, "grad_norm": 0.8940697312355042, "learning_rate": 0.0002, "epoch": 4.145178239430927, "step": 25640}, {"loss": 0.5341, "grad_norm": 1.0798203945159912, "learning_rate": 0.0002, "epoch": 4.146794923611672, "step": 25650}, {"loss": 0.5987, "grad_norm": 0.955172061920166, "learning_rate": 0.0002, "epoch": 4.148411607792418, "step": 25660}, {"loss": 0.569, "grad_norm": 0.9692716002464294, "learning_rate": 0.0002, "epoch": 4.150028291973163, "step": 25670}, {"loss": 0.5478, "grad_norm": 1.0813939571380615, "learning_rate": 0.0002, "epoch": 4.151644976153908, "step": 25680}, {"loss": 0.5383, "grad_norm": 1.135675072669983, "learning_rate": 0.0002, "epoch": 4.153261660334653, "step": 25690}, {"loss": 0.5247, "grad_norm": 1.0392236709594727, "learning_rate": 0.0002, "epoch": 4.1548783445153985, "step": 25700}, {"loss": 0.5204, "grad_norm": 0.9473116993904114, "learning_rate": 0.0002, "epoch": 4.156495028696145, "step": 25710}, {"loss": 0.5339, "grad_norm": 0.712493896484375, "learning_rate": 0.0002, "epoch": 4.15811171287689, "step": 25720}, {"loss": 0.5781, "grad_norm": 0.8724465370178223, "learning_rate": 0.0002, "epoch": 4.159728397057635, "step": 25730}, {"loss": 0.5325, "grad_norm": 0.9870015978813171, "learning_rate": 0.0002, "epoch": 4.16134508123838, "step": 25740}, {"loss": 0.5503, "grad_norm": 1.025273084640503, "learning_rate": 0.0002, "epoch": 4.1629617654191255, "step": 25750}, {"loss": 0.5223, "grad_norm": 0.9243090152740479, "learning_rate": 0.0002, "epoch": 4.164578449599871, "step": 25760}, {"loss": 0.5177, "grad_norm": 1.1656451225280762, "learning_rate": 0.0002, "epoch": 4.166195133780616, "step": 25770}, {"loss": 0.5334, "grad_norm": 0.936358630657196, "learning_rate": 0.0002, "epoch": 4.167811817961361, "step": 25780}, {"loss": 0.5236, "grad_norm": 0.8618208169937134, "learning_rate": 0.0002, "epoch": 4.1694285021421065, "step": 25790}, {"loss": 0.5186, "grad_norm": 0.8580600023269653, "learning_rate": 0.0002, "epoch": 4.171045186322852, "step": 25800}, {"loss": 0.5212, "grad_norm": 1.0128562450408936, "learning_rate": 0.0002, "epoch": 4.172661870503597, "step": 25810}, {"loss": 0.5404, "grad_norm": 0.854865312576294, "learning_rate": 0.0002, "epoch": 4.174278554684342, "step": 25820}, {"loss": 0.5377, "grad_norm": 1.235082745552063, "learning_rate": 0.0002, "epoch": 4.175895238865087, "step": 25830}, {"loss": 0.5614, "grad_norm": 0.9796220660209656, "learning_rate": 0.0002, "epoch": 4.177511923045833, "step": 25840}, {"loss": 0.5689, "grad_norm": 0.8922094702720642, "learning_rate": 0.0002, "epoch": 4.179128607226578, "step": 25850}, {"loss": 0.5806, "grad_norm": 0.9672530293464661, "learning_rate": 0.0002, "epoch": 4.180745291407324, "step": 25860}, {"loss": 0.5074, "grad_norm": 0.8662548661231995, "learning_rate": 0.0002, "epoch": 4.182361975588069, "step": 25870}, {"loss": 0.5329, "grad_norm": 0.7938798069953918, "learning_rate": 0.0002, "epoch": 4.1839786597688144, "step": 25880}, {"loss": 0.5427, "grad_norm": 1.0517958402633667, "learning_rate": 0.0002, "epoch": 4.18559534394956, "step": 25890}, {"loss": 0.5147, "grad_norm": 0.8939275145530701, "learning_rate": 0.0002, "epoch": 4.187212028130305, "step": 25900}, {"loss": 0.5199, "grad_norm": 1.0296672582626343, "learning_rate": 0.0002, "epoch": 4.18882871231105, "step": 25910}, {"loss": 0.5522, "grad_norm": 0.8104017972946167, "learning_rate": 0.0002, "epoch": 4.190445396491795, "step": 25920}, {"loss": 0.596, "grad_norm": 0.9984509944915771, "learning_rate": 0.0002, "epoch": 4.192062080672541, "step": 25930}, {"loss": 0.5356, "grad_norm": 0.9844784736633301, "learning_rate": 0.0002, "epoch": 4.193678764853286, "step": 25940}, {"loss": 0.5198, "grad_norm": 0.8168622255325317, "learning_rate": 0.0002, "epoch": 4.195295449034031, "step": 25950}, {"loss": 0.542, "grad_norm": 1.0878913402557373, "learning_rate": 0.0002, "epoch": 4.196912133214776, "step": 25960}, {"loss": 0.5414, "grad_norm": 0.927126407623291, "learning_rate": 0.0002, "epoch": 4.1985288173955215, "step": 25970}, {"loss": 0.5794, "grad_norm": 0.838586688041687, "learning_rate": 0.0002, "epoch": 4.200145501576267, "step": 25980}, {"loss": 0.5454, "grad_norm": 1.2572145462036133, "learning_rate": 0.0002, "epoch": 4.201762185757012, "step": 25990}, {"loss": 0.5048, "grad_norm": 1.0476740598678589, "learning_rate": 0.0002, "epoch": 4.203378869937758, "step": 26000}, {"loss": 0.5127, "grad_norm": 1.0873368978500366, "learning_rate": 0.0002, "epoch": 4.204995554118503, "step": 26010}, {"loss": 0.5679, "grad_norm": 1.2664896249771118, "learning_rate": 0.0002, "epoch": 4.206612238299249, "step": 26020}, {"loss": 0.5814, "grad_norm": 1.0312391519546509, "learning_rate": 0.0002, "epoch": 4.208228922479994, "step": 26030}, {"loss": 0.571, "grad_norm": 1.0235042572021484, "learning_rate": 0.0002, "epoch": 4.209845606660739, "step": 26040}, {"loss": 0.5766, "grad_norm": 0.8882219195365906, "learning_rate": 0.0002, "epoch": 4.211462290841484, "step": 26050}, {"loss": 0.5557, "grad_norm": 0.9115961790084839, "learning_rate": 0.0002, "epoch": 4.2130789750222295, "step": 26060}, {"loss": 0.5455, "grad_norm": 1.0218228101730347, "learning_rate": 0.0002, "epoch": 4.214695659202975, "step": 26070}, {"loss": 0.5462, "grad_norm": 1.0802232027053833, "learning_rate": 0.0002, "epoch": 4.21631234338372, "step": 26080}, {"loss": 0.557, "grad_norm": 1.1488053798675537, "learning_rate": 0.0002, "epoch": 4.217929027564465, "step": 26090}, {"loss": 0.52, "grad_norm": 1.0487725734710693, "learning_rate": 0.0002, "epoch": 4.21954571174521, "step": 26100}, {"loss": 0.5568, "grad_norm": 0.9131165742874146, "learning_rate": 0.0002, "epoch": 4.221162395925956, "step": 26110}, {"loss": 0.5206, "grad_norm": 0.9012845158576965, "learning_rate": 0.0002, "epoch": 4.222779080106701, "step": 26120}, {"loss": 0.561, "grad_norm": 0.8389840126037598, "learning_rate": 0.0002, "epoch": 4.224395764287446, "step": 26130}, {"loss": 0.5268, "grad_norm": 0.8924660682678223, "learning_rate": 0.0002, "epoch": 4.226012448468191, "step": 26140}, {"loss": 0.5715, "grad_norm": 0.8556463718414307, "learning_rate": 0.0002, "epoch": 4.2276291326489375, "step": 26150}, {"loss": 0.5695, "grad_norm": 0.9643129110336304, "learning_rate": 0.0002, "epoch": 4.229245816829683, "step": 26160}, {"loss": 0.5321, "grad_norm": 0.9865712523460388, "learning_rate": 0.0002, "epoch": 4.230862501010428, "step": 26170}, {"loss": 0.5406, "grad_norm": 1.152641773223877, "learning_rate": 0.0002, "epoch": 4.232479185191173, "step": 26180}, {"loss": 0.5632, "grad_norm": 0.9157698154449463, "learning_rate": 0.0002, "epoch": 4.234095869371918, "step": 26190}, {"loss": 0.5717, "grad_norm": 0.8418048620223999, "learning_rate": 0.0002, "epoch": 4.235712553552664, "step": 26200}, {"loss": 0.5624, "grad_norm": 0.9430168867111206, "learning_rate": 0.0002, "epoch": 4.237329237733409, "step": 26210}, {"loss": 0.5574, "grad_norm": 1.012582778930664, "learning_rate": 0.0002, "epoch": 4.238945921914154, "step": 26220}, {"loss": 0.5693, "grad_norm": 1.112619400024414, "learning_rate": 0.0002, "epoch": 4.240562606094899, "step": 26230}, {"loss": 0.6037, "grad_norm": 0.9243621826171875, "learning_rate": 0.0002, "epoch": 4.2421792902756446, "step": 26240}, {"loss": 0.569, "grad_norm": 0.6977595686912537, "learning_rate": 0.0002, "epoch": 4.24379597445639, "step": 26250}, {"loss": 0.5379, "grad_norm": 0.9600721597671509, "learning_rate": 0.0002, "epoch": 4.245412658637135, "step": 26260}, {"loss": 0.5658, "grad_norm": 0.882641613483429, "learning_rate": 0.0002, "epoch": 4.24702934281788, "step": 26270}, {"loss": 0.55, "grad_norm": 1.010920763015747, "learning_rate": 0.0002, "epoch": 4.2486460269986255, "step": 26280}, {"loss": 0.5803, "grad_norm": 0.9289400577545166, "learning_rate": 0.0002, "epoch": 4.250262711179371, "step": 26290}, {"loss": 0.541, "grad_norm": 1.137397289276123, "learning_rate": 0.0002, "epoch": 4.251879395360117, "step": 26300}, {"loss": 0.5204, "grad_norm": 1.0136182308197021, "learning_rate": 0.0002, "epoch": 4.253496079540862, "step": 26310}, {"loss": 0.5708, "grad_norm": 0.9387356042861938, "learning_rate": 0.0002, "epoch": 4.255112763721607, "step": 26320}, {"loss": 0.5948, "grad_norm": 1.1833957433700562, "learning_rate": 0.0002, "epoch": 4.2567294479023525, "step": 26330}, {"loss": 0.5905, "grad_norm": 0.9415934681892395, "learning_rate": 0.0002, "epoch": 4.258346132083098, "step": 26340}, {"loss": 0.5539, "grad_norm": 0.8550165891647339, "learning_rate": 0.0002, "epoch": 4.259962816263843, "step": 26350}, {"loss": 0.555, "grad_norm": 9.924622535705566, "learning_rate": 0.0002, "epoch": 4.261579500444588, "step": 26360}, {"loss": 0.5689, "grad_norm": 1.0104902982711792, "learning_rate": 0.0002, "epoch": 4.2631961846253335, "step": 26370}, {"loss": 0.5698, "grad_norm": 0.890794038772583, "learning_rate": 0.0002, "epoch": 4.264812868806079, "step": 26380}, {"loss": 0.563, "grad_norm": 1.0560191869735718, "learning_rate": 0.0002, "epoch": 4.266429552986824, "step": 26390}, {"loss": 0.5119, "grad_norm": 1.0135581493377686, "learning_rate": 0.0002, "epoch": 4.268046237167569, "step": 26400}, {"loss": 0.5359, "grad_norm": 1.1304140090942383, "learning_rate": 0.0002, "epoch": 4.269662921348314, "step": 26410}, {"loss": 0.5615, "grad_norm": 0.9899303913116455, "learning_rate": 0.0002, "epoch": 4.27127960552906, "step": 26420}, {"loss": 0.5815, "grad_norm": 1.0505329370498657, "learning_rate": 0.0002, "epoch": 4.272896289709805, "step": 26430}, {"loss": 0.5384, "grad_norm": 0.9389396905899048, "learning_rate": 0.0002, "epoch": 4.27451297389055, "step": 26440}, {"loss": 0.5558, "grad_norm": 0.875328779220581, "learning_rate": 0.0002, "epoch": 4.276129658071296, "step": 26450}, {"loss": 0.5601, "grad_norm": 1.0689256191253662, "learning_rate": 0.0002, "epoch": 4.277746342252041, "step": 26460}, {"loss": 0.546, "grad_norm": 0.9988957643508911, "learning_rate": 0.0002, "epoch": 4.279363026432787, "step": 26470}, {"loss": 0.5478, "grad_norm": 0.8721813559532166, "learning_rate": 0.0002, "epoch": 4.280979710613532, "step": 26480}, {"loss": 0.5424, "grad_norm": 1.100109577178955, "learning_rate": 0.0002, "epoch": 4.282596394794277, "step": 26490}, {"loss": 0.572, "grad_norm": 1.1607271432876587, "learning_rate": 0.0002, "epoch": 4.284213078975022, "step": 26500}, {"loss": 0.6287, "grad_norm": 0.879088819026947, "learning_rate": 0.0002, "epoch": 4.285829763155768, "step": 26510}, {"loss": 0.573, "grad_norm": 0.9891700744628906, "learning_rate": 0.0002, "epoch": 4.287446447336513, "step": 26520}, {"loss": 0.6018, "grad_norm": 1.0831127166748047, "learning_rate": 0.0002, "epoch": 4.289063131517258, "step": 26530}, {"loss": 0.5693, "grad_norm": 1.4108285903930664, "learning_rate": 0.0002, "epoch": 4.290679815698003, "step": 26540}, {"loss": 0.5888, "grad_norm": 1.0630289316177368, "learning_rate": 0.0002, "epoch": 4.2922964998787485, "step": 26550}, {"loss": 0.5817, "grad_norm": 1.0854572057724, "learning_rate": 0.0002, "epoch": 4.293913184059494, "step": 26560}, {"loss": 0.5586, "grad_norm": 0.9561646580696106, "learning_rate": 0.0002, "epoch": 4.295529868240239, "step": 26570}, {"loss": 0.5674, "grad_norm": 0.9064981937408447, "learning_rate": 0.0002, "epoch": 4.297146552420984, "step": 26580}, {"loss": 0.5847, "grad_norm": 1.0082972049713135, "learning_rate": 0.0002, "epoch": 4.298763236601729, "step": 26590}, {"loss": 0.5711, "grad_norm": 1.1613214015960693, "learning_rate": 0.0002, "epoch": 4.3003799207824756, "step": 26600}, {"loss": 0.551, "grad_norm": 0.9847695231437683, "learning_rate": 0.0002, "epoch": 4.301996604963221, "step": 26610}, {"loss": 0.6089, "grad_norm": 1.0980697870254517, "learning_rate": 0.0002, "epoch": 4.303613289143966, "step": 26620}, {"loss": 0.5797, "grad_norm": 0.8861175179481506, "learning_rate": 0.0002, "epoch": 4.305229973324711, "step": 26630}, {"loss": 0.5716, "grad_norm": 0.8917363286018372, "learning_rate": 0.0002, "epoch": 4.3068466575054565, "step": 26640}, {"loss": 0.5892, "grad_norm": 1.0458378791809082, "learning_rate": 0.0002, "epoch": 4.308463341686202, "step": 26650}, {"loss": 0.5883, "grad_norm": 1.4859240055084229, "learning_rate": 0.0002, "epoch": 4.310080025866947, "step": 26660}, {"loss": 0.5296, "grad_norm": 1.1376359462738037, "learning_rate": 0.0002, "epoch": 4.311696710047692, "step": 26670}, {"loss": 0.5671, "grad_norm": 0.991349995136261, "learning_rate": 0.0002, "epoch": 4.313313394228437, "step": 26680}, {"loss": 0.5338, "grad_norm": 0.9995543956756592, "learning_rate": 0.0002, "epoch": 4.314930078409183, "step": 26690}, {"loss": 0.5542, "grad_norm": 1.0515851974487305, "learning_rate": 0.0002, "epoch": 4.316546762589928, "step": 26700}, {"loss": 0.5473, "grad_norm": 1.008023977279663, "learning_rate": 0.0002, "epoch": 4.318163446770673, "step": 26710}, {"loss": 0.5506, "grad_norm": 1.0184582471847534, "learning_rate": 0.0002, "epoch": 4.319780130951418, "step": 26720}, {"loss": 0.5828, "grad_norm": 1.161071538925171, "learning_rate": 0.0002, "epoch": 4.321396815132164, "step": 26730}, {"loss": 0.5633, "grad_norm": 0.9580779671669006, "learning_rate": 0.0002, "epoch": 4.323013499312909, "step": 26740}, {"loss": 0.5785, "grad_norm": 1.0189911127090454, "learning_rate": 0.0002, "epoch": 4.324630183493655, "step": 26750}, {"loss": 0.5237, "grad_norm": 0.7484358549118042, "learning_rate": 0.0002, "epoch": 4.3262468676744, "step": 26760}, {"loss": 0.5728, "grad_norm": 1.0015908479690552, "learning_rate": 0.0002, "epoch": 4.327863551855145, "step": 26770}, {"loss": 0.5597, "grad_norm": 0.8972945809364319, "learning_rate": 0.0002, "epoch": 4.329480236035891, "step": 26780}, {"loss": 0.5857, "grad_norm": 1.01099693775177, "learning_rate": 0.0002, "epoch": 4.331096920216636, "step": 26790}, {"loss": 0.5591, "grad_norm": 0.846958339214325, "learning_rate": 0.0002, "epoch": 4.332713604397381, "step": 26800}, {"loss": 0.5547, "grad_norm": 1.0792603492736816, "learning_rate": 0.0002, "epoch": 4.334330288578126, "step": 26810}, {"loss": 0.5747, "grad_norm": 1.0373345613479614, "learning_rate": 0.0002, "epoch": 4.3359469727588715, "step": 26820}, {"loss": 0.558, "grad_norm": 0.9779167771339417, "learning_rate": 0.0002, "epoch": 4.337563656939617, "step": 26830}, {"loss": 0.5821, "grad_norm": 1.0235520601272583, "learning_rate": 0.0002, "epoch": 4.339180341120362, "step": 26840}, {"loss": 0.5843, "grad_norm": 1.04195237159729, "learning_rate": 0.0002, "epoch": 4.340797025301107, "step": 26850}, {"loss": 0.5474, "grad_norm": 0.9479565620422363, "learning_rate": 0.0002, "epoch": 4.3424137094818525, "step": 26860}, {"loss": 0.5646, "grad_norm": 0.9526172280311584, "learning_rate": 0.0002, "epoch": 4.344030393662598, "step": 26870}, {"loss": 0.521, "grad_norm": 0.8571456074714661, "learning_rate": 0.0002, "epoch": 4.345647077843343, "step": 26880}, {"loss": 0.5846, "grad_norm": 0.9475828409194946, "learning_rate": 0.0002, "epoch": 4.347263762024088, "step": 26890}, {"loss": 0.5815, "grad_norm": 1.0529576539993286, "learning_rate": 0.0002, "epoch": 4.348880446204834, "step": 26900}, {"loss": 0.56, "grad_norm": 0.9648140072822571, "learning_rate": 0.0002, "epoch": 4.3504971303855795, "step": 26910}, {"loss": 0.5162, "grad_norm": 1.0488841533660889, "learning_rate": 0.0002, "epoch": 4.352113814566325, "step": 26920}, {"loss": 0.5842, "grad_norm": 0.8771942257881165, "learning_rate": 0.0002, "epoch": 4.35373049874707, "step": 26930}, {"loss": 0.5966, "grad_norm": 0.9411202073097229, "learning_rate": 0.0002, "epoch": 4.355347182927815, "step": 26940}, {"loss": 0.6001, "grad_norm": 1.0997588634490967, "learning_rate": 0.0002, "epoch": 4.35696386710856, "step": 26950}, {"loss": 0.5528, "grad_norm": 0.968754768371582, "learning_rate": 0.0002, "epoch": 4.358580551289306, "step": 26960}, {"loss": 0.5881, "grad_norm": 0.9990773797035217, "learning_rate": 0.0002, "epoch": 4.360197235470051, "step": 26970}, {"loss": 0.5761, "grad_norm": 1.0210620164871216, "learning_rate": 0.0002, "epoch": 4.361813919650796, "step": 26980}, {"loss": 0.5768, "grad_norm": 0.855462908744812, "learning_rate": 0.0002, "epoch": 4.363430603831541, "step": 26990}, {"loss": 0.5493, "grad_norm": 0.9169660806655884, "learning_rate": 0.0002, "epoch": 4.365047288012287, "step": 27000}, {"loss": 0.5697, "grad_norm": 1.089629888534546, "learning_rate": 0.0002, "epoch": 4.366663972193032, "step": 27010}, {"loss": 0.5854, "grad_norm": 1.0932867527008057, "learning_rate": 0.0002, "epoch": 4.368280656373777, "step": 27020}, {"loss": 0.5656, "grad_norm": 0.9290956854820251, "learning_rate": 0.0002, "epoch": 4.369897340554522, "step": 27030}, {"loss": 0.5727, "grad_norm": 1.2800624370574951, "learning_rate": 0.0002, "epoch": 4.3715140247352675, "step": 27040}, {"loss": 0.5837, "grad_norm": 0.8993493318557739, "learning_rate": 0.0002, "epoch": 4.373130708916014, "step": 27050}, {"loss": 0.6232, "grad_norm": 1.1566431522369385, "learning_rate": 0.0002, "epoch": 4.374747393096759, "step": 27060}, {"loss": 0.5902, "grad_norm": 0.9479052424430847, "learning_rate": 0.0002, "epoch": 4.376364077277504, "step": 27070}, {"loss": 0.6189, "grad_norm": 1.0063648223876953, "learning_rate": 0.0002, "epoch": 4.377980761458249, "step": 27080}, {"loss": 0.561, "grad_norm": 0.8342045545578003, "learning_rate": 0.0002, "epoch": 4.379597445638995, "step": 27090}, {"loss": 0.5515, "grad_norm": 1.1390739679336548, "learning_rate": 0.0002, "epoch": 4.38121412981974, "step": 27100}, {"loss": 0.5372, "grad_norm": 0.9547637104988098, "learning_rate": 0.0002, "epoch": 4.382830814000485, "step": 27110}, {"loss": 0.5728, "grad_norm": 1.0503804683685303, "learning_rate": 0.0002, "epoch": 4.38444749818123, "step": 27120}, {"loss": 0.5787, "grad_norm": 0.9064017534255981, "learning_rate": 0.0002, "epoch": 4.3860641823619755, "step": 27130}, {"loss": 0.5798, "grad_norm": 0.9382519125938416, "learning_rate": 0.0002, "epoch": 4.387680866542721, "step": 27140}, {"loss": 0.5791, "grad_norm": 1.0410341024398804, "learning_rate": 0.0002, "epoch": 4.389297550723466, "step": 27150}, {"loss": 0.6034, "grad_norm": 0.9218655824661255, "learning_rate": 0.0002, "epoch": 4.390914234904211, "step": 27160}, {"loss": 0.5204, "grad_norm": 0.8119737505912781, "learning_rate": 0.0002, "epoch": 4.392530919084956, "step": 27170}, {"loss": 0.5612, "grad_norm": 0.8584722876548767, "learning_rate": 0.0002, "epoch": 4.394147603265702, "step": 27180}, {"loss": 0.5772, "grad_norm": 0.9668293595314026, "learning_rate": 0.0002, "epoch": 4.395764287446447, "step": 27190}, {"loss": 0.6009, "grad_norm": 1.022334098815918, "learning_rate": 0.0002, "epoch": 4.397380971627193, "step": 27200}, {"loss": 0.5573, "grad_norm": 0.9553216099739075, "learning_rate": 0.0002, "epoch": 4.398997655807938, "step": 27210}, {"loss": 0.5604, "grad_norm": 0.9282339215278625, "learning_rate": 0.0002, "epoch": 4.4006143399886835, "step": 27220}, {"loss": 0.5599, "grad_norm": 1.0232292413711548, "learning_rate": 0.0002, "epoch": 4.402231024169429, "step": 27230}, {"loss": 0.6078, "grad_norm": 0.9915700554847717, "learning_rate": 0.0002, "epoch": 4.403847708350174, "step": 27240}, {"loss": 0.5778, "grad_norm": 1.0014961957931519, "learning_rate": 0.0002, "epoch": 4.405464392530919, "step": 27250}, {"loss": 0.5824, "grad_norm": 1.1172103881835938, "learning_rate": 0.0002, "epoch": 4.407081076711664, "step": 27260}, {"loss": 0.5286, "grad_norm": 0.8583093285560608, "learning_rate": 0.0002, "epoch": 4.40869776089241, "step": 27270}, {"loss": 0.5507, "grad_norm": 0.7609201669692993, "learning_rate": 0.0002, "epoch": 4.410314445073155, "step": 27280}, {"loss": 0.575, "grad_norm": 1.0619351863861084, "learning_rate": 0.0002, "epoch": 4.4119311292539, "step": 27290}, {"loss": 0.5579, "grad_norm": 1.0177674293518066, "learning_rate": 0.0002, "epoch": 4.413547813434645, "step": 27300}, {"loss": 0.5628, "grad_norm": 0.9921218156814575, "learning_rate": 0.0002, "epoch": 4.4151644976153905, "step": 27310}, {"loss": 0.6018, "grad_norm": 1.126244306564331, "learning_rate": 0.0002, "epoch": 4.416781181796136, "step": 27320}, {"loss": 0.5743, "grad_norm": 1.0678540468215942, "learning_rate": 0.0002, "epoch": 4.418397865976881, "step": 27330}, {"loss": 0.5665, "grad_norm": 0.8705704212188721, "learning_rate": 0.0002, "epoch": 4.420014550157627, "step": 27340}, {"loss": 0.5763, "grad_norm": 1.272074818611145, "learning_rate": 0.0002, "epoch": 4.421631234338372, "step": 27350}, {"loss": 0.561, "grad_norm": 0.8740444183349609, "learning_rate": 0.0002, "epoch": 4.423247918519118, "step": 27360}, {"loss": 0.5492, "grad_norm": 1.0584250688552856, "learning_rate": 0.0002, "epoch": 4.424864602699863, "step": 27370}, {"loss": 0.589, "grad_norm": 1.059870719909668, "learning_rate": 0.0002, "epoch": 4.426481286880608, "step": 27380}, {"loss": 0.5551, "grad_norm": 1.072265863418579, "learning_rate": 0.0002, "epoch": 4.428097971061353, "step": 27390}, {"loss": 0.5584, "grad_norm": 0.871481716632843, "learning_rate": 0.0002, "epoch": 4.4297146552420985, "step": 27400}, {"loss": 0.5372, "grad_norm": 0.9555448293685913, "learning_rate": 0.0002, "epoch": 4.431331339422844, "step": 27410}, {"loss": 0.5593, "grad_norm": 1.0402292013168335, "learning_rate": 0.0002, "epoch": 4.432948023603589, "step": 27420}, {"loss": 0.5532, "grad_norm": 1.12587571144104, "learning_rate": 0.0002, "epoch": 4.434564707784334, "step": 27430}, {"loss": 0.5403, "grad_norm": 1.0783193111419678, "learning_rate": 0.0002, "epoch": 4.436181391965079, "step": 27440}, {"loss": 0.5313, "grad_norm": 1.024133563041687, "learning_rate": 0.0002, "epoch": 4.437798076145825, "step": 27450}, {"loss": 0.5621, "grad_norm": 0.9156768918037415, "learning_rate": 0.0002, "epoch": 4.43941476032657, "step": 27460}, {"loss": 0.5307, "grad_norm": 1.0215224027633667, "learning_rate": 0.0002, "epoch": 4.441031444507315, "step": 27470}, {"loss": 0.5188, "grad_norm": 1.082116961479187, "learning_rate": 0.0002, "epoch": 4.442648128688061, "step": 27480}, {"loss": 0.6203, "grad_norm": 1.0412873029708862, "learning_rate": 0.0002, "epoch": 4.4442648128688065, "step": 27490}, {"loss": 0.5939, "grad_norm": 1.0509289503097534, "learning_rate": 0.0002, "epoch": 4.445881497049552, "step": 27500}, {"loss": 0.5503, "grad_norm": 0.9291498064994812, "learning_rate": 0.0002, "epoch": 4.447498181230297, "step": 27510}, {"loss": 0.5408, "grad_norm": 0.970184326171875, "learning_rate": 0.0002, "epoch": 4.449114865411042, "step": 27520}, {"loss": 0.5705, "grad_norm": 0.8418883681297302, "learning_rate": 0.0002, "epoch": 4.450731549591787, "step": 27530}, {"loss": 0.5124, "grad_norm": 0.8823825120925903, "learning_rate": 0.0002, "epoch": 4.452348233772533, "step": 27540}, {"loss": 0.5867, "grad_norm": 1.1909019947052002, "learning_rate": 0.0002, "epoch": 4.453964917953278, "step": 27550}, {"loss": 0.5685, "grad_norm": 1.0317302942276, "learning_rate": 0.0002, "epoch": 4.455581602134023, "step": 27560}, {"loss": 0.5538, "grad_norm": 0.9977751970291138, "learning_rate": 0.0002, "epoch": 4.457198286314768, "step": 27570}, {"loss": 0.5628, "grad_norm": 0.8909519910812378, "learning_rate": 0.0002, "epoch": 4.458814970495514, "step": 27580}, {"loss": 0.6099, "grad_norm": 0.8653029799461365, "learning_rate": 0.0002, "epoch": 4.460431654676259, "step": 27590}, {"loss": 0.5622, "grad_norm": 1.0783653259277344, "learning_rate": 0.0002, "epoch": 4.462048338857004, "step": 27600}, {"loss": 0.579, "grad_norm": 1.1235394477844238, "learning_rate": 0.0002, "epoch": 4.463665023037749, "step": 27610}, {"loss": 0.5545, "grad_norm": 0.9386643767356873, "learning_rate": 0.0002, "epoch": 4.4652817072184945, "step": 27620}, {"loss": 0.5554, "grad_norm": 1.0605148077011108, "learning_rate": 0.0002, "epoch": 4.466898391399241, "step": 27630}, {"loss": 0.5886, "grad_norm": 1.1283893585205078, "learning_rate": 0.0002, "epoch": 4.468515075579986, "step": 27640}, {"loss": 0.5801, "grad_norm": 1.0583468675613403, "learning_rate": 0.0002, "epoch": 4.470131759760731, "step": 27650}, {"loss": 0.5601, "grad_norm": 0.9563992023468018, "learning_rate": 0.0002, "epoch": 4.471748443941476, "step": 27660}, {"loss": 0.5687, "grad_norm": 1.100598931312561, "learning_rate": 0.0002, "epoch": 4.4733651281222215, "step": 27670}, {"loss": 0.589, "grad_norm": 0.9386957287788391, "learning_rate": 0.0002, "epoch": 4.474981812302967, "step": 27680}, {"loss": 0.6241, "grad_norm": 1.2946288585662842, "learning_rate": 0.0002, "epoch": 4.476598496483712, "step": 27690}, {"loss": 0.6075, "grad_norm": 1.0325199365615845, "learning_rate": 0.0002, "epoch": 4.478215180664457, "step": 27700}, {"loss": 0.588, "grad_norm": 1.0318928956985474, "learning_rate": 0.0002, "epoch": 4.4798318648452025, "step": 27710}, {"loss": 0.5656, "grad_norm": 0.8721024394035339, "learning_rate": 0.0002, "epoch": 4.481448549025948, "step": 27720}, {"loss": 0.5421, "grad_norm": 1.17376708984375, "learning_rate": 0.0002, "epoch": 4.483065233206693, "step": 27730}, {"loss": 0.5657, "grad_norm": 1.0926326513290405, "learning_rate": 0.0002, "epoch": 4.484681917387438, "step": 27740}, {"loss": 0.5514, "grad_norm": 0.9043852686882019, "learning_rate": 0.0002, "epoch": 4.486298601568183, "step": 27750}, {"loss": 0.582, "grad_norm": 1.064600944519043, "learning_rate": 0.0002, "epoch": 4.487915285748929, "step": 27760}, {"loss": 0.6108, "grad_norm": 0.7833460569381714, "learning_rate": 0.0002, "epoch": 4.489531969929674, "step": 27770}, {"loss": 0.5985, "grad_norm": 1.1073496341705322, "learning_rate": 0.0002, "epoch": 4.49114865411042, "step": 27780}, {"loss": 0.5577, "grad_norm": 1.0799397230148315, "learning_rate": 0.0002, "epoch": 4.492765338291165, "step": 27790}, {"loss": 0.5601, "grad_norm": 1.1062238216400146, "learning_rate": 0.0002, "epoch": 4.49438202247191, "step": 27800}, {"loss": 0.6126, "grad_norm": 1.0568242073059082, "learning_rate": 0.0002, "epoch": 4.495998706652656, "step": 27810}, {"loss": 0.5913, "grad_norm": 0.8861091732978821, "learning_rate": 0.0002, "epoch": 4.497615390833401, "step": 27820}, {"loss": 0.5858, "grad_norm": 1.2297543287277222, "learning_rate": 0.0002, "epoch": 4.499232075014146, "step": 27830}, {"loss": 0.5859, "grad_norm": 0.9600302577018738, "learning_rate": 0.0002, "epoch": 4.500848759194891, "step": 27840}, {"loss": 0.6124, "grad_norm": 1.057051181793213, "learning_rate": 0.0002, "epoch": 4.502465443375637, "step": 27850}, {"loss": 0.5788, "grad_norm": 0.9839690923690796, "learning_rate": 0.0002, "epoch": 4.504082127556382, "step": 27860}, {"loss": 0.555, "grad_norm": 1.1479853391647339, "learning_rate": 0.0002, "epoch": 4.505698811737127, "step": 27870}, {"loss": 0.6039, "grad_norm": 1.0550768375396729, "learning_rate": 0.0002, "epoch": 4.507315495917872, "step": 27880}, {"loss": 0.563, "grad_norm": 0.898209273815155, "learning_rate": 0.0002, "epoch": 4.5089321800986175, "step": 27890}, {"loss": 0.5734, "grad_norm": 0.9460315108299255, "learning_rate": 0.0002, "epoch": 4.510548864279363, "step": 27900}, {"loss": 0.5702, "grad_norm": 0.9499884247779846, "learning_rate": 0.0002, "epoch": 4.512165548460108, "step": 27910}, {"loss": 0.5385, "grad_norm": 0.7801318764686584, "learning_rate": 0.0002, "epoch": 4.513782232640853, "step": 27920}, {"loss": 0.5391, "grad_norm": 0.9286966323852539, "learning_rate": 0.0002, "epoch": 4.515398916821599, "step": 27930}, {"loss": 0.5717, "grad_norm": 0.9539980292320251, "learning_rate": 0.0002, "epoch": 4.517015601002345, "step": 27940}, {"loss": 0.6073, "grad_norm": 1.1053401231765747, "learning_rate": 0.0002, "epoch": 4.51863228518309, "step": 27950}, {"loss": 0.6087, "grad_norm": 0.7535534501075745, "learning_rate": 0.0002, "epoch": 4.520248969363835, "step": 27960}, {"loss": 0.5701, "grad_norm": 1.076926589012146, "learning_rate": 0.0002, "epoch": 4.52186565354458, "step": 27970}, {"loss": 0.6028, "grad_norm": 1.181935429573059, "learning_rate": 0.0002, "epoch": 4.5234823377253255, "step": 27980}, {"loss": 0.6033, "grad_norm": 0.9293407201766968, "learning_rate": 0.0002, "epoch": 4.525099021906071, "step": 27990}, {"loss": 0.5815, "grad_norm": 0.8953009247779846, "learning_rate": 0.0002, "epoch": 4.526715706086816, "step": 28000}, {"loss": 0.5564, "grad_norm": 1.0850225687026978, "learning_rate": 0.0002, "epoch": 4.528332390267561, "step": 28010}, {"loss": 0.5459, "grad_norm": 0.9125663042068481, "learning_rate": 0.0002, "epoch": 4.529949074448306, "step": 28020}, {"loss": 0.5922, "grad_norm": 0.8745216727256775, "learning_rate": 0.0002, "epoch": 4.531565758629052, "step": 28030}, {"loss": 0.567, "grad_norm": 1.0783463716506958, "learning_rate": 0.0002, "epoch": 4.533182442809797, "step": 28040}, {"loss": 0.5754, "grad_norm": 0.7513844966888428, "learning_rate": 0.0002, "epoch": 4.534799126990542, "step": 28050}, {"loss": 0.5608, "grad_norm": 1.0135776996612549, "learning_rate": 0.0002, "epoch": 4.536415811171287, "step": 28060}, {"loss": 0.5827, "grad_norm": 0.8886825442314148, "learning_rate": 0.0002, "epoch": 4.538032495352033, "step": 28070}, {"loss": 0.5605, "grad_norm": 0.8153995275497437, "learning_rate": 0.0002, "epoch": 4.539649179532779, "step": 28080}, {"loss": 0.6377, "grad_norm": 0.9853341579437256, "learning_rate": 0.0002, "epoch": 4.541265863713524, "step": 28090}, {"loss": 0.5957, "grad_norm": 0.9365800023078918, "learning_rate": 0.0002, "epoch": 4.542882547894269, "step": 28100}, {"loss": 0.5477, "grad_norm": 0.9765017628669739, "learning_rate": 0.0002, "epoch": 4.544499232075014, "step": 28110}, {"loss": 0.6185, "grad_norm": 0.9811279773712158, "learning_rate": 0.0002, "epoch": 4.54611591625576, "step": 28120}, {"loss": 0.6095, "grad_norm": 1.0387924909591675, "learning_rate": 0.0002, "epoch": 4.547732600436505, "step": 28130}, {"loss": 0.6534, "grad_norm": 1.0684878826141357, "learning_rate": 0.0002, "epoch": 4.54934928461725, "step": 28140}, {"loss": 0.5701, "grad_norm": 1.0000102519989014, "learning_rate": 0.0002, "epoch": 4.550965968797995, "step": 28150}, {"loss": 0.5327, "grad_norm": 1.0717930793762207, "learning_rate": 0.0002, "epoch": 4.5525826529787405, "step": 28160}, {"loss": 0.5594, "grad_norm": 0.990074634552002, "learning_rate": 0.0002, "epoch": 4.554199337159486, "step": 28170}, {"loss": 0.5452, "grad_norm": 0.8673754930496216, "learning_rate": 0.0002, "epoch": 4.555816021340231, "step": 28180}, {"loss": 0.5773, "grad_norm": 0.864247739315033, "learning_rate": 0.0002, "epoch": 4.557432705520976, "step": 28190}, {"loss": 0.5516, "grad_norm": 0.8280200958251953, "learning_rate": 0.0002, "epoch": 4.5590493897017215, "step": 28200}, {"loss": 0.5709, "grad_norm": 1.1312172412872314, "learning_rate": 0.0002, "epoch": 4.560666073882467, "step": 28210}, {"loss": 0.5776, "grad_norm": 0.9147403240203857, "learning_rate": 0.0002, "epoch": 4.562282758063212, "step": 28220}, {"loss": 0.5591, "grad_norm": 1.0321218967437744, "learning_rate": 0.0002, "epoch": 4.563899442243958, "step": 28230}, {"loss": 0.5508, "grad_norm": 1.168332815170288, "learning_rate": 0.0002, "epoch": 4.565516126424703, "step": 28240}, {"loss": 0.5649, "grad_norm": 1.0067222118377686, "learning_rate": 0.0002, "epoch": 4.5671328106054485, "step": 28250}, {"loss": 0.5853, "grad_norm": 1.0283393859863281, "learning_rate": 0.0002, "epoch": 4.568749494786194, "step": 28260}, {"loss": 0.5772, "grad_norm": 0.9912363886833191, "learning_rate": 0.0002, "epoch": 4.570366178966939, "step": 28270}, {"loss": 0.5757, "grad_norm": 1.108032464981079, "learning_rate": 0.0002, "epoch": 4.571982863147684, "step": 28280}, {"loss": 0.5529, "grad_norm": 0.8260078430175781, "learning_rate": 0.0002, "epoch": 4.573599547328429, "step": 28290}, {"loss": 0.5625, "grad_norm": 0.8946247100830078, "learning_rate": 0.0002, "epoch": 4.575216231509175, "step": 28300}, {"loss": 0.5533, "grad_norm": 0.8273587822914124, "learning_rate": 0.0002, "epoch": 4.57683291568992, "step": 28310}, {"loss": 0.6058, "grad_norm": 0.9040093421936035, "learning_rate": 0.0002, "epoch": 4.578449599870665, "step": 28320}, {"loss": 0.5521, "grad_norm": 0.8435290455818176, "learning_rate": 0.0002, "epoch": 4.58006628405141, "step": 28330}, {"loss": 0.6086, "grad_norm": 1.164088249206543, "learning_rate": 0.0002, "epoch": 4.581682968232156, "step": 28340}, {"loss": 0.5603, "grad_norm": 0.9861085414886475, "learning_rate": 0.0002, "epoch": 4.583299652412901, "step": 28350}, {"loss": 0.5701, "grad_norm": 0.8892980813980103, "learning_rate": 0.0002, "epoch": 4.584916336593646, "step": 28360}, {"loss": 0.598, "grad_norm": 1.240574836730957, "learning_rate": 0.0002, "epoch": 4.586533020774391, "step": 28370}, {"loss": 0.5797, "grad_norm": 0.8669408559799194, "learning_rate": 0.0002, "epoch": 4.588149704955137, "step": 28380}, {"loss": 0.5603, "grad_norm": 0.9145985841751099, "learning_rate": 0.0002, "epoch": 4.589766389135883, "step": 28390}, {"loss": 0.5765, "grad_norm": 0.8584614992141724, "learning_rate": 0.0002, "epoch": 4.591383073316628, "step": 28400}, {"loss": 0.5898, "grad_norm": 1.118829369544983, "learning_rate": 0.0002, "epoch": 4.592999757497373, "step": 28410}, {"loss": 0.5641, "grad_norm": 1.1411553621292114, "learning_rate": 0.0002, "epoch": 4.594616441678118, "step": 28420}, {"loss": 0.549, "grad_norm": 0.9433278441429138, "learning_rate": 0.0002, "epoch": 4.596233125858864, "step": 28430}, {"loss": 0.5496, "grad_norm": 0.816830039024353, "learning_rate": 0.0002, "epoch": 4.597849810039609, "step": 28440}, {"loss": 0.5543, "grad_norm": 1.2124968767166138, "learning_rate": 0.0002, "epoch": 4.599466494220354, "step": 28450}, {"loss": 0.5759, "grad_norm": 0.9658762216567993, "learning_rate": 0.0002, "epoch": 4.601083178401099, "step": 28460}, {"loss": 0.5902, "grad_norm": 0.836100161075592, "learning_rate": 0.0002, "epoch": 4.6026998625818445, "step": 28470}, {"loss": 0.5749, "grad_norm": 0.9989104270935059, "learning_rate": 0.0002, "epoch": 4.60431654676259, "step": 28480}, {"loss": 0.5616, "grad_norm": 1.1298956871032715, "learning_rate": 0.0002, "epoch": 4.605933230943335, "step": 28490}, {"loss": 0.5846, "grad_norm": 1.1731704473495483, "learning_rate": 0.0002, "epoch": 4.60754991512408, "step": 28500}, {"loss": 0.5816, "grad_norm": 0.9624714255332947, "learning_rate": 0.0002, "epoch": 4.609166599304825, "step": 28510}, {"loss": 0.5868, "grad_norm": 1.364073634147644, "learning_rate": 0.0002, "epoch": 4.610783283485571, "step": 28520}, {"loss": 0.6237, "grad_norm": 1.1827356815338135, "learning_rate": 0.0002, "epoch": 4.612399967666317, "step": 28530}, {"loss": 0.5643, "grad_norm": 0.6651531457901001, "learning_rate": 0.0002, "epoch": 4.614016651847062, "step": 28540}, {"loss": 0.6051, "grad_norm": 1.1640995740890503, "learning_rate": 0.0002, "epoch": 4.615633336027807, "step": 28550}, {"loss": 0.5995, "grad_norm": 1.028918743133545, "learning_rate": 0.0002, "epoch": 4.6172500202085525, "step": 28560}, {"loss": 0.5607, "grad_norm": 0.8252120614051819, "learning_rate": 0.0002, "epoch": 4.618866704389298, "step": 28570}, {"loss": 0.5769, "grad_norm": 1.3536735773086548, "learning_rate": 0.0002, "epoch": 4.620483388570043, "step": 28580}, {"loss": 0.6006, "grad_norm": 1.2146915197372437, "learning_rate": 0.0002, "epoch": 4.622100072750788, "step": 28590}, {"loss": 0.5503, "grad_norm": 1.0122549533843994, "learning_rate": 0.0002, "epoch": 4.623716756931533, "step": 28600}, {"loss": 0.6072, "grad_norm": 0.9977872967720032, "learning_rate": 0.0002, "epoch": 4.625333441112279, "step": 28610}, {"loss": 0.5669, "grad_norm": 1.0159751176834106, "learning_rate": 0.0002, "epoch": 4.626950125293024, "step": 28620}, {"loss": 0.5935, "grad_norm": 1.0028325319290161, "learning_rate": 0.0002, "epoch": 4.628566809473769, "step": 28630}, {"loss": 0.5515, "grad_norm": 0.901638388633728, "learning_rate": 0.0002, "epoch": 4.630183493654514, "step": 28640}, {"loss": 0.595, "grad_norm": 0.9450507164001465, "learning_rate": 0.0002, "epoch": 4.6318001778352595, "step": 28650}, {"loss": 0.5972, "grad_norm": 0.9987545013427734, "learning_rate": 0.0002, "epoch": 4.633416862016006, "step": 28660}, {"loss": 0.5863, "grad_norm": 0.9574332237243652, "learning_rate": 0.0002, "epoch": 4.63503354619675, "step": 28670}, {"loss": 0.5804, "grad_norm": 1.2215653657913208, "learning_rate": 0.0002, "epoch": 4.636650230377496, "step": 28680}, {"loss": 0.5798, "grad_norm": 0.9798858761787415, "learning_rate": 0.0002, "epoch": 4.638266914558241, "step": 28690}, {"loss": 0.5773, "grad_norm": 1.0648466348648071, "learning_rate": 0.0002, "epoch": 4.639883598738987, "step": 28700}, {"loss": 0.6108, "grad_norm": 1.0606504678726196, "learning_rate": 0.0002, "epoch": 4.641500282919732, "step": 28710}, {"loss": 0.5801, "grad_norm": 1.0892442464828491, "learning_rate": 0.0002, "epoch": 4.643116967100477, "step": 28720}, {"loss": 0.5492, "grad_norm": 0.914391040802002, "learning_rate": 0.0002, "epoch": 4.644733651281222, "step": 28730}, {"loss": 0.5439, "grad_norm": 0.9782370328903198, "learning_rate": 0.0002, "epoch": 4.6463503354619675, "step": 28740}, {"loss": 0.6035, "grad_norm": 1.0344339609146118, "learning_rate": 0.0002, "epoch": 4.647967019642713, "step": 28750}, {"loss": 0.5775, "grad_norm": 1.0513931512832642, "learning_rate": 0.0002, "epoch": 4.649583703823458, "step": 28760}, {"loss": 0.546, "grad_norm": 0.9711475968360901, "learning_rate": 0.0002, "epoch": 4.651200388004203, "step": 28770}, {"loss": 0.5472, "grad_norm": 0.977519690990448, "learning_rate": 0.0002, "epoch": 4.652817072184948, "step": 28780}, {"loss": 0.5826, "grad_norm": 0.9150224924087524, "learning_rate": 0.0002, "epoch": 4.654433756365694, "step": 28790}, {"loss": 0.5382, "grad_norm": 1.0973542928695679, "learning_rate": 0.0002, "epoch": 4.656050440546439, "step": 28800}, {"loss": 0.6147, "grad_norm": 0.944877564907074, "learning_rate": 0.0002, "epoch": 4.657667124727185, "step": 28810}, {"loss": 0.5537, "grad_norm": 0.9508748650550842, "learning_rate": 0.0002, "epoch": 4.659283808907929, "step": 28820}, {"loss": 0.5537, "grad_norm": 0.9681721329689026, "learning_rate": 0.0002, "epoch": 4.6609004930886755, "step": 28830}, {"loss": 0.592, "grad_norm": 1.0214351415634155, "learning_rate": 0.0002, "epoch": 4.662517177269421, "step": 28840}, {"loss": 0.6031, "grad_norm": 0.9748611450195312, "learning_rate": 0.0002, "epoch": 4.664133861450166, "step": 28850}, {"loss": 0.572, "grad_norm": 0.8484147191047668, "learning_rate": 0.0002, "epoch": 4.665750545630911, "step": 28860}, {"loss": 0.5699, "grad_norm": 1.1252986192703247, "learning_rate": 0.0002, "epoch": 4.667367229811656, "step": 28870}, {"loss": 0.5724, "grad_norm": 0.8706206679344177, "learning_rate": 0.0002, "epoch": 4.668983913992402, "step": 28880}, {"loss": 0.6002, "grad_norm": 1.1432424783706665, "learning_rate": 0.0002, "epoch": 4.670600598173147, "step": 28890}, {"loss": 0.5675, "grad_norm": 1.017029047012329, "learning_rate": 0.0002, "epoch": 4.672217282353892, "step": 28900}, {"loss": 0.5831, "grad_norm": 1.085597038269043, "learning_rate": 0.0002, "epoch": 4.673833966534637, "step": 28910}, {"loss": 0.5678, "grad_norm": 0.9275796413421631, "learning_rate": 0.0002, "epoch": 4.675450650715383, "step": 28920}, {"loss": 0.5603, "grad_norm": 0.9518964886665344, "learning_rate": 0.0002, "epoch": 4.677067334896128, "step": 28930}, {"loss": 0.6232, "grad_norm": 1.0352122783660889, "learning_rate": 0.0002, "epoch": 4.678684019076873, "step": 28940}, {"loss": 0.5786, "grad_norm": 1.090124249458313, "learning_rate": 0.0002, "epoch": 4.680300703257618, "step": 28950}, {"loss": 0.5728, "grad_norm": 0.8799563050270081, "learning_rate": 0.0002, "epoch": 4.681917387438364, "step": 28960}, {"loss": 0.5787, "grad_norm": 1.0929821729660034, "learning_rate": 0.0002, "epoch": 4.683534071619109, "step": 28970}, {"loss": 0.6134, "grad_norm": 0.903727650642395, "learning_rate": 0.0002, "epoch": 4.685150755799855, "step": 28980}, {"loss": 0.5522, "grad_norm": 0.9752424955368042, "learning_rate": 0.0002, "epoch": 4.6867674399806, "step": 28990}, {"loss": 0.5762, "grad_norm": 0.9351571202278137, "learning_rate": 0.0002, "epoch": 4.688384124161345, "step": 29000}, {"loss": 0.5811, "grad_norm": 0.923877477645874, "learning_rate": 0.0002, "epoch": 4.6900008083420905, "step": 29010}, {"loss": 0.5682, "grad_norm": 1.045389175415039, "learning_rate": 0.0002, "epoch": 4.691617492522836, "step": 29020}, {"loss": 0.584, "grad_norm": 1.0200831890106201, "learning_rate": 0.0002, "epoch": 4.693234176703581, "step": 29030}, {"loss": 0.5514, "grad_norm": 1.1499706506729126, "learning_rate": 0.0002, "epoch": 4.694850860884326, "step": 29040}, {"loss": 0.5745, "grad_norm": 0.860118567943573, "learning_rate": 0.0002, "epoch": 4.6964675450650715, "step": 29050}, {"loss": 0.5741, "grad_norm": 0.9774864315986633, "learning_rate": 0.0002, "epoch": 4.698084229245817, "step": 29060}, {"loss": 0.5765, "grad_norm": 1.0323210954666138, "learning_rate": 0.0002, "epoch": 4.699700913426562, "step": 29070}, {"loss": 0.5452, "grad_norm": 0.8492481112480164, "learning_rate": 0.0002, "epoch": 4.701317597607307, "step": 29080}, {"loss": 0.5985, "grad_norm": 1.131951093673706, "learning_rate": 0.0002, "epoch": 4.702934281788052, "step": 29090}, {"loss": 0.6412, "grad_norm": 0.8763113021850586, "learning_rate": 0.0002, "epoch": 4.704550965968798, "step": 29100}, {"loss": 0.575, "grad_norm": 1.045028805732727, "learning_rate": 0.0002, "epoch": 4.706167650149544, "step": 29110}, {"loss": 0.5548, "grad_norm": 0.9961401224136353, "learning_rate": 0.0002, "epoch": 4.707784334330288, "step": 29120}, {"loss": 0.559, "grad_norm": 0.9282503724098206, "learning_rate": 0.0002, "epoch": 4.709401018511034, "step": 29130}, {"loss": 0.5744, "grad_norm": 1.1418932676315308, "learning_rate": 0.0002, "epoch": 4.711017702691779, "step": 29140}, {"loss": 0.5394, "grad_norm": 0.9950099587440491, "learning_rate": 0.0002, "epoch": 4.712634386872525, "step": 29150}, {"loss": 0.6177, "grad_norm": 0.8304893374443054, "learning_rate": 0.0002, "epoch": 4.71425107105327, "step": 29160}, {"loss": 0.6074, "grad_norm": 1.115626335144043, "learning_rate": 0.0002, "epoch": 4.715867755234015, "step": 29170}, {"loss": 0.6265, "grad_norm": 1.079818606376648, "learning_rate": 0.0002, "epoch": 4.71748443941476, "step": 29180}, {"loss": 0.561, "grad_norm": 1.1929082870483398, "learning_rate": 0.0002, "epoch": 4.719101123595506, "step": 29190}, {"loss": 0.5708, "grad_norm": 0.9621080756187439, "learning_rate": 0.0002, "epoch": 4.720717807776251, "step": 29200}, {"loss": 0.546, "grad_norm": 0.8549222350120544, "learning_rate": 0.0002, "epoch": 4.722334491956996, "step": 29210}, {"loss": 0.5775, "grad_norm": 0.9341941475868225, "learning_rate": 0.0002, "epoch": 4.723951176137741, "step": 29220}, {"loss": 0.5436, "grad_norm": 1.075406789779663, "learning_rate": 0.0002, "epoch": 4.7255678603184865, "step": 29230}, {"loss": 0.576, "grad_norm": 1.0859880447387695, "learning_rate": 0.0002, "epoch": 4.727184544499232, "step": 29240}, {"loss": 0.5525, "grad_norm": 0.8475605249404907, "learning_rate": 0.0002, "epoch": 4.728801228679977, "step": 29250}, {"loss": 0.5659, "grad_norm": 0.9331845641136169, "learning_rate": 0.0002, "epoch": 4.730417912860723, "step": 29260}, {"loss": 0.5901, "grad_norm": 0.9279314279556274, "learning_rate": 0.0002, "epoch": 4.7320345970414674, "step": 29270}, {"loss": 0.597, "grad_norm": 0.7803558707237244, "learning_rate": 0.0002, "epoch": 4.733651281222214, "step": 29280}, {"loss": 0.5968, "grad_norm": 1.0159329175949097, "learning_rate": 0.0002, "epoch": 4.735267965402959, "step": 29290}, {"loss": 0.5333, "grad_norm": 0.9448670744895935, "learning_rate": 0.0002, "epoch": 4.736884649583704, "step": 29300}, {"loss": 0.574, "grad_norm": 1.0732197761535645, "learning_rate": 0.0002, "epoch": 4.738501333764449, "step": 29310}, {"loss": 0.6066, "grad_norm": 0.901830792427063, "learning_rate": 0.0002, "epoch": 4.7401180179451945, "step": 29320}, {"loss": 0.6105, "grad_norm": 0.9141789674758911, "learning_rate": 0.0002, "epoch": 4.74173470212594, "step": 29330}, {"loss": 0.5481, "grad_norm": 0.9733418226242065, "learning_rate": 0.0002, "epoch": 4.743351386306685, "step": 29340}, {"loss": 0.612, "grad_norm": 0.909810483455658, "learning_rate": 0.0002, "epoch": 4.74496807048743, "step": 29350}, {"loss": 0.5911, "grad_norm": 0.909541666507721, "learning_rate": 0.0002, "epoch": 4.746584754668175, "step": 29360}, {"loss": 0.5579, "grad_norm": 0.9383015632629395, "learning_rate": 0.0002, "epoch": 4.748201438848921, "step": 29370}, {"loss": 0.5529, "grad_norm": 0.9275668263435364, "learning_rate": 0.0002, "epoch": 4.749818123029666, "step": 29380}, {"loss": 0.5623, "grad_norm": 1.1146225929260254, "learning_rate": 0.0002, "epoch": 4.751434807210411, "step": 29390}, {"loss": 0.6018, "grad_norm": 1.0062453746795654, "learning_rate": 0.0002, "epoch": 4.753051491391156, "step": 29400}, {"loss": 0.5872, "grad_norm": 0.9451895952224731, "learning_rate": 0.0002, "epoch": 4.7546681755719025, "step": 29410}, {"loss": 0.5767, "grad_norm": 0.870457649230957, "learning_rate": 0.0002, "epoch": 4.756284859752648, "step": 29420}, {"loss": 0.57, "grad_norm": 1.0411282777786255, "learning_rate": 0.0002, "epoch": 4.757901543933393, "step": 29430}, {"loss": 0.5688, "grad_norm": 1.1648986339569092, "learning_rate": 0.0002, "epoch": 4.759518228114138, "step": 29440}, {"loss": 0.5432, "grad_norm": 0.8999572992324829, "learning_rate": 0.0002, "epoch": 4.761134912294883, "step": 29450}, {"loss": 0.5667, "grad_norm": 0.9863559007644653, "learning_rate": 0.0002, "epoch": 4.762751596475629, "step": 29460}, {"loss": 0.5779, "grad_norm": 0.9676542282104492, "learning_rate": 0.0002, "epoch": 4.764368280656374, "step": 29470}, {"loss": 0.6075, "grad_norm": 1.004775047302246, "learning_rate": 0.0002, "epoch": 4.765984964837119, "step": 29480}, {"loss": 0.6044, "grad_norm": 1.0937515497207642, "learning_rate": 0.0002, "epoch": 4.767601649017864, "step": 29490}, {"loss": 0.5433, "grad_norm": 0.9551598429679871, "learning_rate": 0.0002, "epoch": 4.7692183331986095, "step": 29500}, {"loss": 0.5609, "grad_norm": 1.0757228136062622, "learning_rate": 0.0002, "epoch": 4.770835017379355, "step": 29510}, {"loss": 0.567, "grad_norm": 1.0588841438293457, "learning_rate": 0.0002, "epoch": 4.7724517015601, "step": 29520}, {"loss": 0.5814, "grad_norm": 1.0744032859802246, "learning_rate": 0.0002, "epoch": 4.774068385740845, "step": 29530}, {"loss": 0.5681, "grad_norm": 1.0066277980804443, "learning_rate": 0.0002, "epoch": 4.7756850699215905, "step": 29540}, {"loss": 0.545, "grad_norm": 1.082319736480713, "learning_rate": 0.0002, "epoch": 4.777301754102336, "step": 29550}, {"loss": 0.5709, "grad_norm": 0.8252472877502441, "learning_rate": 0.0002, "epoch": 4.778918438283082, "step": 29560}, {"loss": 0.5666, "grad_norm": 0.9855340123176575, "learning_rate": 0.0002, "epoch": 4.780535122463827, "step": 29570}, {"loss": 0.6117, "grad_norm": 0.9991421699523926, "learning_rate": 0.0002, "epoch": 4.782151806644572, "step": 29580}, {"loss": 0.5966, "grad_norm": 1.316841959953308, "learning_rate": 0.0002, "epoch": 4.7837684908253175, "step": 29590}, {"loss": 0.6102, "grad_norm": 1.1513035297393799, "learning_rate": 0.0002, "epoch": 4.785385175006063, "step": 29600}, {"loss": 0.5785, "grad_norm": 0.9767683744430542, "learning_rate": 0.0002, "epoch": 4.787001859186808, "step": 29610}, {"loss": 0.6037, "grad_norm": 0.9786278605461121, "learning_rate": 0.0002, "epoch": 4.788618543367553, "step": 29620}, {"loss": 0.6108, "grad_norm": 0.8004973530769348, "learning_rate": 0.0002, "epoch": 4.7902352275482984, "step": 29630}, {"loss": 0.5932, "grad_norm": 1.0997767448425293, "learning_rate": 0.0002, "epoch": 4.791851911729044, "step": 29640}, {"loss": 0.5655, "grad_norm": 0.9752856492996216, "learning_rate": 0.0002, "epoch": 4.793468595909789, "step": 29650}, {"loss": 0.5916, "grad_norm": 1.0518392324447632, "learning_rate": 0.0002, "epoch": 4.795085280090534, "step": 29660}, {"loss": 0.6042, "grad_norm": 1.1050055027008057, "learning_rate": 0.0002, "epoch": 4.796701964271279, "step": 29670}, {"loss": 0.6089, "grad_norm": 0.9933857917785645, "learning_rate": 0.0002, "epoch": 4.798318648452025, "step": 29680}, {"loss": 0.6041, "grad_norm": 1.2804018259048462, "learning_rate": 0.0002, "epoch": 4.79993533263277, "step": 29690}, {"loss": 0.636, "grad_norm": 1.0133371353149414, "learning_rate": 0.0002, "epoch": 4.801552016813515, "step": 29700}, {"loss": 0.5662, "grad_norm": 1.080350637435913, "learning_rate": 0.0002, "epoch": 4.803168700994261, "step": 29710}, {"loss": 0.5603, "grad_norm": 0.9986529350280762, "learning_rate": 0.0002, "epoch": 4.804785385175006, "step": 29720}, {"loss": 0.5894, "grad_norm": 0.975665807723999, "learning_rate": 0.0002, "epoch": 4.806402069355752, "step": 29730}, {"loss": 0.6328, "grad_norm": 0.8458138704299927, "learning_rate": 0.0002, "epoch": 4.808018753536497, "step": 29740}, {"loss": 0.5837, "grad_norm": 0.99330073595047, "learning_rate": 0.0002, "epoch": 4.809635437717242, "step": 29750}, {"loss": 0.5507, "grad_norm": 0.898274302482605, "learning_rate": 0.0002, "epoch": 4.811252121897987, "step": 29760}, {"loss": 0.5842, "grad_norm": 1.0504480600357056, "learning_rate": 0.0002, "epoch": 4.812868806078733, "step": 29770}, {"loss": 0.5821, "grad_norm": 0.937919020652771, "learning_rate": 0.0002, "epoch": 4.814485490259478, "step": 29780}, {"loss": 0.5885, "grad_norm": 0.9593307971954346, "learning_rate": 0.0002, "epoch": 4.816102174440223, "step": 29790}, {"loss": 0.578, "grad_norm": 0.9431198835372925, "learning_rate": 0.0002, "epoch": 4.817718858620968, "step": 29800}, {"loss": 0.5739, "grad_norm": 1.2729957103729248, "learning_rate": 0.0002, "epoch": 4.8193355428017135, "step": 29810}, {"loss": 0.6124, "grad_norm": 0.8876838684082031, "learning_rate": 0.0002, "epoch": 4.820952226982459, "step": 29820}, {"loss": 0.5583, "grad_norm": 1.0185000896453857, "learning_rate": 0.0002, "epoch": 4.822568911163204, "step": 29830}, {"loss": 0.5686, "grad_norm": 1.064276099205017, "learning_rate": 0.0002, "epoch": 4.824185595343949, "step": 29840}, {"loss": 0.5698, "grad_norm": 0.9774803519248962, "learning_rate": 0.0002, "epoch": 4.825802279524694, "step": 29850}, {"loss": 0.5533, "grad_norm": 1.131646990776062, "learning_rate": 0.0002, "epoch": 4.8274189637054405, "step": 29860}, {"loss": 0.6371, "grad_norm": 1.081455945968628, "learning_rate": 0.0002, "epoch": 4.829035647886186, "step": 29870}, {"loss": 0.5793, "grad_norm": 0.990538477897644, "learning_rate": 0.0002, "epoch": 4.830652332066931, "step": 29880}, {"loss": 0.5833, "grad_norm": 0.9750600457191467, "learning_rate": 0.0002, "epoch": 4.832269016247676, "step": 29890}, {"loss": 0.619, "grad_norm": 1.0600621700286865, "learning_rate": 0.0002, "epoch": 4.8338857004284215, "step": 29900}, {"loss": 0.5841, "grad_norm": 0.9237320423126221, "learning_rate": 0.0002, "epoch": 4.835502384609167, "step": 29910}, {"loss": 0.5513, "grad_norm": 0.9739177227020264, "learning_rate": 0.0002, "epoch": 4.837119068789912, "step": 29920}, {"loss": 0.587, "grad_norm": 1.128677248954773, "learning_rate": 0.0002, "epoch": 4.838735752970657, "step": 29930}, {"loss": 0.564, "grad_norm": 1.042604923248291, "learning_rate": 0.0002, "epoch": 4.840352437151402, "step": 29940}, {"loss": 0.5885, "grad_norm": 0.849758505821228, "learning_rate": 0.0002, "epoch": 4.841969121332148, "step": 29950}, {"loss": 0.5952, "grad_norm": 1.2809888124465942, "learning_rate": 0.0002, "epoch": 4.843585805512893, "step": 29960}, {"loss": 0.5703, "grad_norm": 1.0177865028381348, "learning_rate": 0.0002, "epoch": 4.845202489693638, "step": 29970}, {"loss": 0.5946, "grad_norm": 1.0026639699935913, "learning_rate": 0.0002, "epoch": 4.846819173874383, "step": 29980}, {"loss": 0.5897, "grad_norm": 0.9679505228996277, "learning_rate": 0.0002, "epoch": 4.8484358580551286, "step": 29990}, {"loss": 0.5621, "grad_norm": 0.8939532041549683, "learning_rate": 0.0002, "epoch": 4.850052542235874, "step": 30000}, {"loss": 0.5852, "grad_norm": 0.9957457780838013, "learning_rate": 0.0002, "epoch": 4.85166922641662, "step": 30010}, {"loss": 0.6117, "grad_norm": 1.1646790504455566, "learning_rate": 0.0002, "epoch": 4.853285910597365, "step": 30020}, {"loss": 0.5711, "grad_norm": 0.8804680705070496, "learning_rate": 0.0002, "epoch": 4.85490259477811, "step": 30030}, {"loss": 0.5397, "grad_norm": 1.161970853805542, "learning_rate": 0.0002, "epoch": 4.856519278958856, "step": 30040}, {"loss": 0.5552, "grad_norm": 0.9081037640571594, "learning_rate": 0.0002, "epoch": 4.858135963139601, "step": 30050}, {"loss": 0.6024, "grad_norm": 0.9402848482131958, "learning_rate": 0.0002, "epoch": 4.859752647320346, "step": 30060}, {"loss": 0.6256, "grad_norm": 0.9023865461349487, "learning_rate": 0.0002, "epoch": 4.861369331501091, "step": 30070}, {"loss": 0.5926, "grad_norm": 1.0173414945602417, "learning_rate": 0.0002, "epoch": 4.8629860156818365, "step": 30080}, {"loss": 0.6274, "grad_norm": 1.084402322769165, "learning_rate": 0.0002, "epoch": 4.864602699862582, "step": 30090}, {"loss": 0.6311, "grad_norm": 0.9577937126159668, "learning_rate": 0.0002, "epoch": 4.866219384043327, "step": 30100}, {"loss": 0.5724, "grad_norm": 0.9807606935501099, "learning_rate": 0.0002, "epoch": 4.867836068224072, "step": 30110}, {"loss": 0.5786, "grad_norm": 0.978784441947937, "learning_rate": 0.0002, "epoch": 4.8694527524048175, "step": 30120}, {"loss": 0.6194, "grad_norm": 0.9762914776802063, "learning_rate": 0.0002, "epoch": 4.871069436585563, "step": 30130}, {"loss": 0.5892, "grad_norm": 0.9404871463775635, "learning_rate": 0.0002, "epoch": 4.872686120766308, "step": 30140}, {"loss": 0.6182, "grad_norm": 1.0069509744644165, "learning_rate": 0.0002, "epoch": 4.874302804947053, "step": 30150}, {"loss": 0.6225, "grad_norm": 1.1770923137664795, "learning_rate": 0.0002, "epoch": 4.875919489127799, "step": 30160}, {"loss": 0.5657, "grad_norm": 1.021210789680481, "learning_rate": 0.0002, "epoch": 4.8775361733085445, "step": 30170}, {"loss": 0.6033, "grad_norm": 0.8512648940086365, "learning_rate": 0.0002, "epoch": 4.87915285748929, "step": 30180}, {"loss": 0.5519, "grad_norm": 0.9345870018005371, "learning_rate": 0.0002, "epoch": 4.880769541670035, "step": 30190}, {"loss": 0.5682, "grad_norm": 1.0224418640136719, "learning_rate": 0.0002, "epoch": 4.88238622585078, "step": 30200}, {"loss": 0.5807, "grad_norm": 1.0316044092178345, "learning_rate": 0.0002, "epoch": 4.884002910031525, "step": 30210}, {"loss": 0.6065, "grad_norm": 1.102437973022461, "learning_rate": 0.0002, "epoch": 4.885619594212271, "step": 30220}, {"loss": 0.586, "grad_norm": 1.0220023393630981, "learning_rate": 0.0002, "epoch": 4.887236278393016, "step": 30230}, {"loss": 0.5781, "grad_norm": 1.0934523344039917, "learning_rate": 0.0002, "epoch": 4.888852962573761, "step": 30240}, {"loss": 0.6313, "grad_norm": 1.264630913734436, "learning_rate": 0.0002, "epoch": 4.890469646754506, "step": 30250}, {"loss": 0.5712, "grad_norm": 1.0999879837036133, "learning_rate": 0.0002, "epoch": 4.892086330935252, "step": 30260}, {"loss": 0.6413, "grad_norm": 0.9124550223350525, "learning_rate": 0.0002, "epoch": 4.893703015115997, "step": 30270}, {"loss": 0.596, "grad_norm": 0.9853624105453491, "learning_rate": 0.0002, "epoch": 4.895319699296742, "step": 30280}, {"loss": 0.595, "grad_norm": 1.0589802265167236, "learning_rate": 0.0002, "epoch": 4.896936383477488, "step": 30290}, {"loss": 0.6129, "grad_norm": 0.8487226366996765, "learning_rate": 0.0002, "epoch": 4.8985530676582325, "step": 30300}, {"loss": 0.5514, "grad_norm": 1.0212191343307495, "learning_rate": 0.0002, "epoch": 4.900169751838979, "step": 30310}, {"loss": 0.5896, "grad_norm": 1.0187491178512573, "learning_rate": 0.0002, "epoch": 4.901786436019724, "step": 30320}, {"loss": 0.5809, "grad_norm": 1.0013091564178467, "learning_rate": 0.0002, "epoch": 4.903403120200469, "step": 30330}, {"loss": 0.5658, "grad_norm": 1.0017542839050293, "learning_rate": 0.0002, "epoch": 4.905019804381214, "step": 30340}, {"loss": 0.6002, "grad_norm": 0.9665151238441467, "learning_rate": 0.0002, "epoch": 4.9066364885619596, "step": 30350}, {"loss": 0.5864, "grad_norm": 0.8774822950363159, "learning_rate": 0.0002, "epoch": 4.908253172742705, "step": 30360}, {"loss": 0.5771, "grad_norm": 0.9449850916862488, "learning_rate": 0.0002, "epoch": 4.90986985692345, "step": 30370}, {"loss": 0.58, "grad_norm": 0.7368341088294983, "learning_rate": 0.0002, "epoch": 4.911486541104195, "step": 30380}, {"loss": 0.5992, "grad_norm": 0.9669167995452881, "learning_rate": 0.0002, "epoch": 4.9131032252849405, "step": 30390}, {"loss": 0.6202, "grad_norm": 1.1227794885635376, "learning_rate": 0.0002, "epoch": 4.914719909465686, "step": 30400}, {"loss": 0.6181, "grad_norm": 0.9884361028671265, "learning_rate": 0.0002, "epoch": 4.916336593646431, "step": 30410}, {"loss": 0.6185, "grad_norm": 0.9949551224708557, "learning_rate": 0.0002, "epoch": 4.917953277827176, "step": 30420}, {"loss": 0.5866, "grad_norm": 0.9491621851921082, "learning_rate": 0.0002, "epoch": 4.919569962007921, "step": 30430}, {"loss": 0.6005, "grad_norm": 0.78848797082901, "learning_rate": 0.0002, "epoch": 4.9211866461886675, "step": 30440}, {"loss": 0.5561, "grad_norm": 1.0693835020065308, "learning_rate": 0.0002, "epoch": 4.922803330369412, "step": 30450}, {"loss": 0.566, "grad_norm": 0.9573729634284973, "learning_rate": 0.0002, "epoch": 4.924420014550158, "step": 30460}, {"loss": 0.6084, "grad_norm": 0.9975152611732483, "learning_rate": 0.0002, "epoch": 4.926036698730903, "step": 30470}, {"loss": 0.5969, "grad_norm": 0.8695693016052246, "learning_rate": 0.0002, "epoch": 4.9276533829116484, "step": 30480}, {"loss": 0.6144, "grad_norm": 1.145394206047058, "learning_rate": 0.0002, "epoch": 4.929270067092394, "step": 30490}, {"loss": 0.5736, "grad_norm": 0.7668989896774292, "learning_rate": 0.0002, "epoch": 4.930886751273139, "step": 30500}, {"loss": 0.6052, "grad_norm": 0.9630151391029358, "learning_rate": 0.0002, "epoch": 4.932503435453884, "step": 30510}, {"loss": 0.6461, "grad_norm": 0.940705418586731, "learning_rate": 0.0002, "epoch": 4.934120119634629, "step": 30520}, {"loss": 0.6326, "grad_norm": 1.3243348598480225, "learning_rate": 0.0002, "epoch": 4.935736803815375, "step": 30530}, {"loss": 0.6174, "grad_norm": 1.004347801208496, "learning_rate": 0.0002, "epoch": 4.93735348799612, "step": 30540}, {"loss": 0.583, "grad_norm": 0.8711541295051575, "learning_rate": 0.0002, "epoch": 4.938970172176865, "step": 30550}, {"loss": 0.599, "grad_norm": 0.8980631828308105, "learning_rate": 0.0002, "epoch": 4.94058685635761, "step": 30560}, {"loss": 0.6024, "grad_norm": 0.8388893604278564, "learning_rate": 0.0002, "epoch": 4.9422035405383555, "step": 30570}, {"loss": 0.6189, "grad_norm": 1.0991183519363403, "learning_rate": 0.0002, "epoch": 4.943820224719101, "step": 30580}, {"loss": 0.5906, "grad_norm": 0.9731075763702393, "learning_rate": 0.0002, "epoch": 4.945436908899847, "step": 30590}, {"loss": 0.5883, "grad_norm": 1.3904452323913574, "learning_rate": 0.0002, "epoch": 4.947053593080591, "step": 30600}, {"loss": 0.5952, "grad_norm": 1.2489882707595825, "learning_rate": 0.0002, "epoch": 4.948670277261337, "step": 30610}, {"loss": 0.5887, "grad_norm": 1.240072250366211, "learning_rate": 0.0002, "epoch": 4.950286961442083, "step": 30620}, {"loss": 0.5762, "grad_norm": 0.9191411733627319, "learning_rate": 0.0002, "epoch": 4.951903645622828, "step": 30630}, {"loss": 0.5597, "grad_norm": 0.8888895511627197, "learning_rate": 0.0002, "epoch": 4.953520329803573, "step": 30640}, {"loss": 0.6594, "grad_norm": 0.9001450538635254, "learning_rate": 0.0002, "epoch": 4.955137013984318, "step": 30650}, {"loss": 0.6047, "grad_norm": 1.053971767425537, "learning_rate": 0.0002, "epoch": 4.9567536981650635, "step": 30660}, {"loss": 0.6107, "grad_norm": 1.2224042415618896, "learning_rate": 0.0002, "epoch": 4.958370382345809, "step": 30670}, {"loss": 0.6211, "grad_norm": 0.8855111598968506, "learning_rate": 0.0002, "epoch": 4.959987066526554, "step": 30680}, {"loss": 0.5764, "grad_norm": 0.9489575624465942, "learning_rate": 0.0002, "epoch": 4.961603750707299, "step": 30690}, {"loss": 0.5371, "grad_norm": 0.9635404944419861, "learning_rate": 0.0002, "epoch": 4.963220434888044, "step": 30700}, {"loss": 0.6043, "grad_norm": 1.1784121990203857, "learning_rate": 0.0002, "epoch": 4.96483711906879, "step": 30710}, {"loss": 0.5803, "grad_norm": 1.0059462785720825, "learning_rate": 0.0002, "epoch": 4.966453803249535, "step": 30720}, {"loss": 0.5759, "grad_norm": 0.9479738473892212, "learning_rate": 0.0002, "epoch": 4.96807048743028, "step": 30730}, {"loss": 0.584, "grad_norm": 1.0624593496322632, "learning_rate": 0.0002, "epoch": 4.969687171611026, "step": 30740}, {"loss": 0.6202, "grad_norm": 1.1429259777069092, "learning_rate": 0.0002, "epoch": 4.971303855791771, "step": 30750}, {"loss": 0.6174, "grad_norm": 0.9102491140365601, "learning_rate": 0.0002, "epoch": 4.972920539972517, "step": 30760}, {"loss": 0.6025, "grad_norm": 1.1262688636779785, "learning_rate": 0.0002, "epoch": 4.974537224153262, "step": 30770}, {"loss": 0.588, "grad_norm": 1.1415393352508545, "learning_rate": 0.0002, "epoch": 4.976153908334007, "step": 30780}, {"loss": 0.5832, "grad_norm": 1.083078384399414, "learning_rate": 0.0002, "epoch": 4.977770592514752, "step": 30790}, {"loss": 0.6025, "grad_norm": 0.964859127998352, "learning_rate": 0.0002, "epoch": 4.979387276695498, "step": 30800}, {"loss": 0.6095, "grad_norm": 0.8704743385314941, "learning_rate": 0.0002, "epoch": 4.981003960876243, "step": 30810}, {"loss": 0.5666, "grad_norm": 1.0714856386184692, "learning_rate": 0.0002, "epoch": 4.982620645056988, "step": 30820}, {"loss": 0.565, "grad_norm": 0.6818771362304688, "learning_rate": 0.0002, "epoch": 4.984237329237733, "step": 30830}, {"loss": 0.5999, "grad_norm": 1.0454156398773193, "learning_rate": 0.0002, "epoch": 4.985854013418479, "step": 30840}, {"loss": 0.5683, "grad_norm": 0.9410776495933533, "learning_rate": 0.0002, "epoch": 4.987470697599224, "step": 30850}, {"loss": 0.5899, "grad_norm": 1.0878902673721313, "learning_rate": 0.0002, "epoch": 4.989087381779969, "step": 30860}, {"loss": 0.5914, "grad_norm": 0.8916727304458618, "learning_rate": 0.0002, "epoch": 4.990704065960714, "step": 30870}, {"loss": 0.6066, "grad_norm": 1.045776128768921, "learning_rate": 0.0002, "epoch": 4.9923207501414595, "step": 30880}, {"loss": 0.5767, "grad_norm": 0.9861903786659241, "learning_rate": 0.0002, "epoch": 4.993937434322206, "step": 30890}, {"loss": 0.6192, "grad_norm": 0.9275050759315491, "learning_rate": 0.0002, "epoch": 4.995554118502951, "step": 30900}, {"loss": 0.6181, "grad_norm": 0.94013911485672, "learning_rate": 0.0002, "epoch": 4.997170802683696, "step": 30910}, {"loss": 0.614, "grad_norm": 0.9771268367767334, "learning_rate": 0.0002, "epoch": 4.998787486864441, "step": 30920}]} +{"epoch": 6.0, "step": 37113, "epoch_duration": 16916.863001823425, "total_accumulated_duration": 101331.91647815704, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.6636, "grad_norm": 0.9894065856933594, "learning_rate": 0.0002, "epoch": 0.0016166841807452913, "step": 10}, {"loss": 1.1528, "grad_norm": 1.7810699939727783, "learning_rate": 0.0002, "epoch": 0.0032333683614905826, "step": 20}, {"loss": 0.9767, "grad_norm": 0.5969577431678772, "learning_rate": 0.0002, "epoch": 0.004850052542235874, "step": 30}, {"loss": 0.9772, "grad_norm": 0.6354120969772339, "learning_rate": 0.0002, "epoch": 0.006466736722981165, "step": 40}, {"loss": 0.8643, "grad_norm": 0.5604607462882996, "learning_rate": 0.0002, "epoch": 0.008083420903726457, "step": 50}, {"loss": 0.8841, "grad_norm": 0.4676193594932556, "learning_rate": 0.0002, "epoch": 0.009700105084471748, "step": 60}, {"loss": 0.9022, "grad_norm": 0.6099211573600769, "learning_rate": 0.0002, "epoch": 0.01131678926521704, "step": 70}, {"loss": 0.9133, "grad_norm": 0.48639994859695435, "learning_rate": 0.0002, "epoch": 0.01293347344596233, "step": 80}, {"loss": 0.8704, "grad_norm": 0.4904264509677887, "learning_rate": 0.0002, "epoch": 0.014550157626707623, "step": 90}, {"loss": 0.8855, "grad_norm": 2.8334362506866455, "learning_rate": 0.0002, "epoch": 0.016166841807452915, "step": 100}, {"loss": 0.8958, "grad_norm": 0.43221670389175415, "learning_rate": 0.0002, "epoch": 0.017783525988198205, "step": 110}, {"loss": 0.8412, "grad_norm": 0.42244166135787964, "learning_rate": 0.0002, "epoch": 0.019400210168943496, "step": 120}, {"loss": 0.8467, "grad_norm": 0.45363298058509827, "learning_rate": 0.0002, "epoch": 0.02101689434968879, "step": 130}, {"loss": 0.8641, "grad_norm": 0.44816508889198303, "learning_rate": 0.0002, "epoch": 0.02263357853043408, "step": 140}, {"loss": 0.8496, "grad_norm": 0.43308213353157043, "learning_rate": 0.0002, "epoch": 0.02425026271117937, "step": 150}, {"loss": 0.8213, "grad_norm": 0.4084763526916504, "learning_rate": 0.0002, "epoch": 0.02586694689192466, "step": 160}, {"loss": 0.8343, "grad_norm": 0.5363703966140747, "learning_rate": 0.0002, "epoch": 0.027483631072669955, "step": 170}, {"loss": 0.8558, "grad_norm": 0.4619699716567993, "learning_rate": 0.0002, "epoch": 0.029100315253415245, "step": 180}, {"loss": 0.8878, "grad_norm": 0.49069908261299133, "learning_rate": 0.0002, "epoch": 0.030716999434160536, "step": 190}, {"loss": 0.8867, "grad_norm": 0.4645835757255554, "learning_rate": 0.0002, "epoch": 0.03233368361490583, "step": 200}, {"loss": 0.8842, "grad_norm": 1.2411243915557861, "learning_rate": 0.0002, "epoch": 0.03395036779565112, "step": 210}, {"loss": 0.8245, "grad_norm": 0.5211851596832275, "learning_rate": 0.0002, "epoch": 0.03556705197639641, "step": 220}, {"loss": 0.8194, "grad_norm": 0.5253691673278809, "learning_rate": 0.0002, "epoch": 0.037183736157141704, "step": 230}, {"loss": 0.8856, "grad_norm": 0.4567478895187378, "learning_rate": 0.0002, "epoch": 0.03880042033788699, "step": 240}, {"loss": 0.838, "grad_norm": 0.5472128391265869, "learning_rate": 0.0002, "epoch": 0.040417104518632285, "step": 250}, {"loss": 0.8201, "grad_norm": 0.42978546023368835, "learning_rate": 0.0002, "epoch": 0.04203378869937758, "step": 260}, {"loss": 0.8334, "grad_norm": 0.601734459400177, "learning_rate": 0.0002, "epoch": 0.043650472880122866, "step": 270}, {"loss": 0.815, "grad_norm": 0.4286513328552246, "learning_rate": 0.0002, "epoch": 0.04526715706086816, "step": 280}, {"loss": 0.8758, "grad_norm": 0.5230861902236938, "learning_rate": 0.0002, "epoch": 0.046883841241613454, "step": 290}, {"loss": 0.8636, "grad_norm": 0.6504611968994141, "learning_rate": 0.0002, "epoch": 0.04850052542235874, "step": 300}, {"loss": 0.8102, "grad_norm": 0.43485215306282043, "learning_rate": 0.0002, "epoch": 0.050117209603104035, "step": 310}, {"loss": 0.8221, "grad_norm": 0.4717007875442505, "learning_rate": 0.0002, "epoch": 0.05173389378384932, "step": 320}, {"loss": 0.8469, "grad_norm": 0.4059787690639496, "learning_rate": 0.0002, "epoch": 0.053350577964594616, "step": 330}, {"loss": 0.8866, "grad_norm": 0.4366913437843323, "learning_rate": 0.0002, "epoch": 0.05496726214533991, "step": 340}, {"loss": 0.7976, "grad_norm": 0.4233848452568054, "learning_rate": 0.0002, "epoch": 0.0565839463260852, "step": 350}, {"loss": 0.8456, "grad_norm": 0.4209108352661133, "learning_rate": 0.0002, "epoch": 0.05820063050683049, "step": 360}, {"loss": 0.816, "grad_norm": 0.41637396812438965, "learning_rate": 0.0002, "epoch": 0.059817314687575784, "step": 370}, {"loss": 0.7976, "grad_norm": 0.46235376596450806, "learning_rate": 0.0002, "epoch": 0.06143399886832107, "step": 380}, {"loss": 0.7966, "grad_norm": 0.4013484716415405, "learning_rate": 0.0002, "epoch": 0.06305068304906636, "step": 390}, {"loss": 0.8253, "grad_norm": 0.47443896532058716, "learning_rate": 0.0002, "epoch": 0.06466736722981166, "step": 400}, {"loss": 0.8666, "grad_norm": 0.3942156434059143, "learning_rate": 0.0002, "epoch": 0.06628405141055695, "step": 410}, {"loss": 0.8402, "grad_norm": 0.4965320825576782, "learning_rate": 0.0002, "epoch": 0.06790073559130223, "step": 420}, {"loss": 0.8317, "grad_norm": 0.4304835796356201, "learning_rate": 0.0002, "epoch": 0.06951741977204753, "step": 430}, {"loss": 0.8528, "grad_norm": 0.511726975440979, "learning_rate": 0.0002, "epoch": 0.07113410395279282, "step": 440}, {"loss": 0.8675, "grad_norm": 0.4040689170360565, "learning_rate": 0.0002, "epoch": 0.07275078813353811, "step": 450}, {"loss": 0.8788, "grad_norm": 0.5402171015739441, "learning_rate": 0.0002, "epoch": 0.07436747231428341, "step": 460}, {"loss": 0.8737, "grad_norm": 0.4174517095088959, "learning_rate": 0.0002, "epoch": 0.0759841564950287, "step": 470}, {"loss": 0.7605, "grad_norm": 0.4306182265281677, "learning_rate": 0.0002, "epoch": 0.07760084067577398, "step": 480}, {"loss": 0.799, "grad_norm": 0.535210132598877, "learning_rate": 0.0002, "epoch": 0.07921752485651928, "step": 490}, {"loss": 0.7825, "grad_norm": 0.5339109897613525, "learning_rate": 0.0002, "epoch": 0.08083420903726457, "step": 500}, {"loss": 0.8985, "grad_norm": 0.45754891633987427, "learning_rate": 0.0002, "epoch": 0.08245089321800986, "step": 510}, {"loss": 0.8144, "grad_norm": 0.43820783495903015, "learning_rate": 0.0002, "epoch": 0.08406757739875516, "step": 520}, {"loss": 0.8001, "grad_norm": 0.4434749186038971, "learning_rate": 0.0002, "epoch": 0.08568426157950045, "step": 530}, {"loss": 0.7857, "grad_norm": 0.43111467361450195, "learning_rate": 0.0002, "epoch": 0.08730094576024573, "step": 540}, {"loss": 0.8418, "grad_norm": 0.4378940165042877, "learning_rate": 0.0002, "epoch": 0.08891762994099103, "step": 550}, {"loss": 0.8361, "grad_norm": 0.4772215187549591, "learning_rate": 0.0002, "epoch": 0.09053431412173632, "step": 560}, {"loss": 0.8268, "grad_norm": 0.6837629079818726, "learning_rate": 0.0002, "epoch": 0.09215099830248161, "step": 570}, {"loss": 0.8607, "grad_norm": 0.42241212725639343, "learning_rate": 0.0002, "epoch": 0.09376768248322691, "step": 580}, {"loss": 0.852, "grad_norm": 0.5165936350822449, "learning_rate": 0.0002, "epoch": 0.0953843666639722, "step": 590}, {"loss": 0.8664, "grad_norm": 0.48737478256225586, "learning_rate": 0.0002, "epoch": 0.09700105084471748, "step": 600}, {"loss": 0.8806, "grad_norm": 0.47419852018356323, "learning_rate": 0.0002, "epoch": 0.09861773502546278, "step": 610}, {"loss": 0.8254, "grad_norm": 0.4975486099720001, "learning_rate": 0.0002, "epoch": 0.10023441920620807, "step": 620}, {"loss": 0.8548, "grad_norm": 0.49123844504356384, "learning_rate": 0.0002, "epoch": 0.10185110338695336, "step": 630}, {"loss": 0.8911, "grad_norm": 0.6288952827453613, "learning_rate": 0.0002, "epoch": 0.10346778756769864, "step": 640}, {"loss": 0.827, "grad_norm": 0.4277345836162567, "learning_rate": 0.0002, "epoch": 0.10508447174844394, "step": 650}, {"loss": 0.7996, "grad_norm": 0.4021061956882477, "learning_rate": 0.0002, "epoch": 0.10670115592918923, "step": 660}, {"loss": 0.87, "grad_norm": 0.3492237329483032, "learning_rate": 0.0002, "epoch": 0.10831784010993452, "step": 670}, {"loss": 0.8698, "grad_norm": 0.4341012239456177, "learning_rate": 0.0002, "epoch": 0.10993452429067982, "step": 680}, {"loss": 0.781, "grad_norm": 0.7296304106712341, "learning_rate": 0.0002, "epoch": 0.1115512084714251, "step": 690}, {"loss": 0.8433, "grad_norm": 0.397494912147522, "learning_rate": 0.0002, "epoch": 0.1131678926521704, "step": 700}, {"loss": 0.827, "grad_norm": 0.396431028842926, "learning_rate": 0.0002, "epoch": 0.1147845768329157, "step": 710}, {"loss": 0.8379, "grad_norm": 0.48842838406562805, "learning_rate": 0.0002, "epoch": 0.11640126101366098, "step": 720}, {"loss": 0.8238, "grad_norm": 0.46322616934776306, "learning_rate": 0.0002, "epoch": 0.11801794519440627, "step": 730}, {"loss": 0.8041, "grad_norm": 0.47990912199020386, "learning_rate": 0.0002, "epoch": 0.11963462937515157, "step": 740}, {"loss": 0.82, "grad_norm": 0.4997142255306244, "learning_rate": 0.0002, "epoch": 0.12125131355589686, "step": 750}, {"loss": 0.7702, "grad_norm": 0.4040526747703552, "learning_rate": 0.0002, "epoch": 0.12286799773664214, "step": 760}, {"loss": 0.863, "grad_norm": 0.453095942735672, "learning_rate": 0.0002, "epoch": 0.12448468191738744, "step": 770}, {"loss": 0.8792, "grad_norm": 0.4636971950531006, "learning_rate": 0.0002, "epoch": 0.12610136609813272, "step": 780}, {"loss": 0.8112, "grad_norm": 0.4279276132583618, "learning_rate": 0.0002, "epoch": 0.12771805027887803, "step": 790}, {"loss": 0.8711, "grad_norm": 0.46212655305862427, "learning_rate": 0.0002, "epoch": 0.12933473445962332, "step": 800}, {"loss": 0.8368, "grad_norm": 0.43127650022506714, "learning_rate": 0.0002, "epoch": 0.1309514186403686, "step": 810}, {"loss": 0.8476, "grad_norm": 0.4201301336288452, "learning_rate": 0.0002, "epoch": 0.1325681028211139, "step": 820}, {"loss": 0.8078, "grad_norm": 0.42583167552948, "learning_rate": 0.0002, "epoch": 0.13418478700185918, "step": 830}, {"loss": 0.8219, "grad_norm": 0.4535622000694275, "learning_rate": 0.0002, "epoch": 0.13580147118260447, "step": 840}, {"loss": 0.8423, "grad_norm": 0.4116036891937256, "learning_rate": 0.0002, "epoch": 0.13741815536334978, "step": 850}, {"loss": 0.8466, "grad_norm": 0.45997580885887146, "learning_rate": 0.0002, "epoch": 0.13903483954409507, "step": 860}, {"loss": 0.8917, "grad_norm": 0.4487837255001068, "learning_rate": 0.0002, "epoch": 0.14065152372484035, "step": 870}, {"loss": 0.8217, "grad_norm": 0.43650057911872864, "learning_rate": 0.0002, "epoch": 0.14226820790558564, "step": 880}, {"loss": 0.8178, "grad_norm": 0.5335358381271362, "learning_rate": 0.0002, "epoch": 0.14388489208633093, "step": 890}, {"loss": 0.7957, "grad_norm": 0.5989000201225281, "learning_rate": 0.0002, "epoch": 0.14550157626707622, "step": 900}, {"loss": 0.8385, "grad_norm": 0.517179012298584, "learning_rate": 0.0002, "epoch": 0.14711826044782153, "step": 910}, {"loss": 0.8255, "grad_norm": 0.44435232877731323, "learning_rate": 0.0002, "epoch": 0.14873494462856682, "step": 920}, {"loss": 0.8305, "grad_norm": 0.42635923624038696, "learning_rate": 0.0002, "epoch": 0.1503516288093121, "step": 930}, {"loss": 0.8043, "grad_norm": 0.49603334069252014, "learning_rate": 0.0002, "epoch": 0.1519683129900574, "step": 940}, {"loss": 0.8377, "grad_norm": 0.40639808773994446, "learning_rate": 0.0002, "epoch": 0.15358499717080268, "step": 950}, {"loss": 0.8529, "grad_norm": 0.4850759208202362, "learning_rate": 0.0002, "epoch": 0.15520168135154797, "step": 960}, {"loss": 0.846, "grad_norm": 0.4427442252635956, "learning_rate": 0.0002, "epoch": 0.15681836553229328, "step": 970}, {"loss": 0.8705, "grad_norm": 0.3760930001735687, "learning_rate": 0.0002, "epoch": 0.15843504971303857, "step": 980}, {"loss": 0.8644, "grad_norm": 0.4794144332408905, "learning_rate": 0.0002, "epoch": 0.16005173389378385, "step": 990}, {"loss": 0.8002, "grad_norm": 0.45828768610954285, "learning_rate": 0.0002, "epoch": 0.16166841807452914, "step": 1000}, {"loss": 0.7658, "grad_norm": 0.6313053369522095, "learning_rate": 0.0002, "epoch": 0.16328510225527443, "step": 1010}, {"loss": 0.8047, "grad_norm": 0.45041006803512573, "learning_rate": 0.0002, "epoch": 0.16490178643601971, "step": 1020}, {"loss": 0.8423, "grad_norm": 0.441403865814209, "learning_rate": 0.0002, "epoch": 0.166518470616765, "step": 1030}, {"loss": 0.8475, "grad_norm": 0.8171296119689941, "learning_rate": 0.0002, "epoch": 0.16813515479751032, "step": 1040}, {"loss": 0.845, "grad_norm": 0.7137420773506165, "learning_rate": 0.0002, "epoch": 0.1697518389782556, "step": 1050}, {"loss": 0.8213, "grad_norm": 0.5236809849739075, "learning_rate": 0.0002, "epoch": 0.1713685231590009, "step": 1060}, {"loss": 0.8265, "grad_norm": 0.5021864175796509, "learning_rate": 0.0002, "epoch": 0.17298520733974618, "step": 1070}, {"loss": 0.8305, "grad_norm": 0.47347521781921387, "learning_rate": 0.0002, "epoch": 0.17460189152049146, "step": 1080}, {"loss": 0.8105, "grad_norm": 0.4631653428077698, "learning_rate": 0.0002, "epoch": 0.17621857570123675, "step": 1090}, {"loss": 0.8166, "grad_norm": 0.49169182777404785, "learning_rate": 0.0002, "epoch": 0.17783525988198207, "step": 1100}, {"loss": 0.8012, "grad_norm": 0.5019739270210266, "learning_rate": 0.0002, "epoch": 0.17945194406272735, "step": 1110}, {"loss": 0.8247, "grad_norm": 0.5100422501564026, "learning_rate": 0.0002, "epoch": 0.18106862824347264, "step": 1120}, {"loss": 0.8142, "grad_norm": 0.3888324499130249, "learning_rate": 0.0002, "epoch": 0.18268531242421793, "step": 1130}, {"loss": 0.8533, "grad_norm": 0.39765217900276184, "learning_rate": 0.0002, "epoch": 0.18430199660496321, "step": 1140}, {"loss": 0.8541, "grad_norm": 0.47190186381340027, "learning_rate": 0.0002, "epoch": 0.1859186807857085, "step": 1150}, {"loss": 0.8301, "grad_norm": 0.4464188814163208, "learning_rate": 0.0002, "epoch": 0.18753536496645382, "step": 1160}, {"loss": 0.8341, "grad_norm": 0.5153930187225342, "learning_rate": 0.0002, "epoch": 0.1891520491471991, "step": 1170}, {"loss": 0.8033, "grad_norm": 0.4779708683490753, "learning_rate": 0.0002, "epoch": 0.1907687333279444, "step": 1180}, {"loss": 0.8187, "grad_norm": 0.4834315776824951, "learning_rate": 0.0002, "epoch": 0.19238541750868968, "step": 1190}, {"loss": 0.7721, "grad_norm": 0.402357816696167, "learning_rate": 0.0002, "epoch": 0.19400210168943496, "step": 1200}, {"loss": 0.7941, "grad_norm": 0.45899084210395813, "learning_rate": 0.0002, "epoch": 0.19561878587018025, "step": 1210}, {"loss": 0.8353, "grad_norm": 0.5106529593467712, "learning_rate": 0.0002, "epoch": 0.19723547005092557, "step": 1220}, {"loss": 0.7816, "grad_norm": 0.45261722803115845, "learning_rate": 0.0002, "epoch": 0.19885215423167085, "step": 1230}, {"loss": 0.8068, "grad_norm": 0.4647127091884613, "learning_rate": 0.0002, "epoch": 0.20046883841241614, "step": 1240}, {"loss": 0.8239, "grad_norm": 0.4849368929862976, "learning_rate": 0.0002, "epoch": 0.20208552259316143, "step": 1250}, {"loss": 0.8514, "grad_norm": 0.4518061578273773, "learning_rate": 0.0002, "epoch": 0.2037022067739067, "step": 1260}, {"loss": 0.8158, "grad_norm": 0.49535325169563293, "learning_rate": 0.0002, "epoch": 0.205318890954652, "step": 1270}, {"loss": 0.8348, "grad_norm": 0.4835205376148224, "learning_rate": 0.0002, "epoch": 0.2069355751353973, "step": 1280}, {"loss": 0.8428, "grad_norm": 0.45308539271354675, "learning_rate": 0.0002, "epoch": 0.2085522593161426, "step": 1290}, {"loss": 0.7993, "grad_norm": 0.5369905233383179, "learning_rate": 0.0002, "epoch": 0.2101689434968879, "step": 1300}, {"loss": 0.8676, "grad_norm": 0.5031622052192688, "learning_rate": 0.0002, "epoch": 0.21178562767763318, "step": 1310}, {"loss": 0.7686, "grad_norm": 0.48010334372520447, "learning_rate": 0.0002, "epoch": 0.21340231185837846, "step": 1320}, {"loss": 0.806, "grad_norm": 0.4905701279640198, "learning_rate": 0.0002, "epoch": 0.21501899603912375, "step": 1330}, {"loss": 0.7885, "grad_norm": 0.43531742691993713, "learning_rate": 0.0002, "epoch": 0.21663568021986904, "step": 1340}, {"loss": 0.8191, "grad_norm": 0.44330692291259766, "learning_rate": 0.0002, "epoch": 0.21825236440061435, "step": 1350}, {"loss": 0.8205, "grad_norm": 0.5384416580200195, "learning_rate": 0.0002, "epoch": 0.21986904858135964, "step": 1360}, {"loss": 0.7726, "grad_norm": 0.4181833863258362, "learning_rate": 0.0002, "epoch": 0.22148573276210493, "step": 1370}, {"loss": 0.8311, "grad_norm": 0.523833692073822, "learning_rate": 0.0002, "epoch": 0.2231024169428502, "step": 1380}, {"loss": 0.7913, "grad_norm": 0.5528736710548401, "learning_rate": 0.0002, "epoch": 0.2247191011235955, "step": 1390}, {"loss": 0.8079, "grad_norm": 0.43515023589134216, "learning_rate": 0.0002, "epoch": 0.2263357853043408, "step": 1400}, {"loss": 0.8403, "grad_norm": 0.48809877038002014, "learning_rate": 0.0002, "epoch": 0.2279524694850861, "step": 1410}, {"loss": 0.8165, "grad_norm": 0.43591251969337463, "learning_rate": 0.0002, "epoch": 0.2295691536658314, "step": 1420}, {"loss": 0.8147, "grad_norm": 0.44625312089920044, "learning_rate": 0.0002, "epoch": 0.23118583784657668, "step": 1430}, {"loss": 0.8134, "grad_norm": 0.4390665292739868, "learning_rate": 0.0002, "epoch": 0.23280252202732196, "step": 1440}, {"loss": 0.8465, "grad_norm": 0.48496049642562866, "learning_rate": 0.0002, "epoch": 0.23441920620806725, "step": 1450}, {"loss": 0.775, "grad_norm": 0.45919957756996155, "learning_rate": 0.0002, "epoch": 0.23603589038881254, "step": 1460}, {"loss": 0.8659, "grad_norm": 0.5471845865249634, "learning_rate": 0.0002, "epoch": 0.23765257456955785, "step": 1470}, {"loss": 0.8164, "grad_norm": 0.47269317507743835, "learning_rate": 0.0002, "epoch": 0.23926925875030314, "step": 1480}, {"loss": 0.854, "grad_norm": 0.4930245578289032, "learning_rate": 0.0002, "epoch": 0.24088594293104842, "step": 1490}, {"loss": 0.8139, "grad_norm": 0.5605630278587341, "learning_rate": 0.0002, "epoch": 0.2425026271117937, "step": 1500}, {"loss": 0.8125, "grad_norm": 0.4435870945453644, "learning_rate": 0.0002, "epoch": 0.244119311292539, "step": 1510}, {"loss": 0.8123, "grad_norm": 0.4941999912261963, "learning_rate": 0.0002, "epoch": 0.24573599547328429, "step": 1520}, {"loss": 0.8427, "grad_norm": 0.5100624561309814, "learning_rate": 0.0002, "epoch": 0.24735267965402957, "step": 1530}, {"loss": 0.8405, "grad_norm": 0.4638267457485199, "learning_rate": 0.0002, "epoch": 0.2489693638347749, "step": 1540}, {"loss": 0.81, "grad_norm": 0.5071570873260498, "learning_rate": 0.0002, "epoch": 0.25058604801552015, "step": 1550}, {"loss": 0.7724, "grad_norm": 0.4291319251060486, "learning_rate": 0.0002, "epoch": 0.25220273219626543, "step": 1560}, {"loss": 0.7984, "grad_norm": 0.5388049483299255, "learning_rate": 0.0002, "epoch": 0.2538194163770108, "step": 1570}, {"loss": 0.8176, "grad_norm": 0.5083683729171753, "learning_rate": 0.0002, "epoch": 0.25543610055775606, "step": 1580}, {"loss": 0.843, "grad_norm": 0.4824463725090027, "learning_rate": 0.0002, "epoch": 0.25705278473850135, "step": 1590}, {"loss": 0.7996, "grad_norm": 0.41177722811698914, "learning_rate": 0.0002, "epoch": 0.25866946891924664, "step": 1600}, {"loss": 0.7772, "grad_norm": 0.5656219124794006, "learning_rate": 0.0002, "epoch": 0.2602861530999919, "step": 1610}, {"loss": 0.7955, "grad_norm": 0.41063204407691956, "learning_rate": 0.0002, "epoch": 0.2619028372807372, "step": 1620}, {"loss": 0.7998, "grad_norm": 0.4897061288356781, "learning_rate": 0.0002, "epoch": 0.2635195214614825, "step": 1630}, {"loss": 0.8198, "grad_norm": 0.4454376697540283, "learning_rate": 0.0002, "epoch": 0.2651362056422278, "step": 1640}, {"loss": 0.8684, "grad_norm": 0.4355238378047943, "learning_rate": 0.0002, "epoch": 0.26675288982297307, "step": 1650}, {"loss": 0.7801, "grad_norm": 0.458310067653656, "learning_rate": 0.0002, "epoch": 0.26836957400371836, "step": 1660}, {"loss": 0.7935, "grad_norm": 0.4752083718776703, "learning_rate": 0.0002, "epoch": 0.26998625818446365, "step": 1670}, {"loss": 0.8267, "grad_norm": 0.4666106402873993, "learning_rate": 0.0002, "epoch": 0.27160294236520893, "step": 1680}, {"loss": 0.8252, "grad_norm": 0.4213818609714508, "learning_rate": 0.0002, "epoch": 0.2732196265459543, "step": 1690}, {"loss": 0.8559, "grad_norm": 0.5768913626670837, "learning_rate": 0.0002, "epoch": 0.27483631072669956, "step": 1700}, {"loss": 0.7931, "grad_norm": 0.4209914803504944, "learning_rate": 0.0002, "epoch": 0.27645299490744485, "step": 1710}, {"loss": 0.8167, "grad_norm": 0.501909613609314, "learning_rate": 0.0002, "epoch": 0.27806967908819014, "step": 1720}, {"loss": 0.7832, "grad_norm": 0.5266261100769043, "learning_rate": 0.0002, "epoch": 0.2796863632689354, "step": 1730}, {"loss": 0.8102, "grad_norm": 0.43806859850883484, "learning_rate": 0.0002, "epoch": 0.2813030474496807, "step": 1740}, {"loss": 0.8157, "grad_norm": 0.46048814058303833, "learning_rate": 0.0002, "epoch": 0.282919731630426, "step": 1750}, {"loss": 0.8596, "grad_norm": 0.44972819089889526, "learning_rate": 0.0002, "epoch": 0.2845364158111713, "step": 1760}, {"loss": 0.8421, "grad_norm": 0.5114831328392029, "learning_rate": 0.0002, "epoch": 0.28615309999191657, "step": 1770}, {"loss": 0.8361, "grad_norm": 0.47931742668151855, "learning_rate": 0.0002, "epoch": 0.28776978417266186, "step": 1780}, {"loss": 0.8265, "grad_norm": 0.5092599987983704, "learning_rate": 0.0002, "epoch": 0.28938646835340714, "step": 1790}, {"loss": 0.8506, "grad_norm": 0.37581443786621094, "learning_rate": 0.0002, "epoch": 0.29100315253415243, "step": 1800}, {"loss": 0.7932, "grad_norm": 0.47097381949424744, "learning_rate": 0.0002, "epoch": 0.2926198367148977, "step": 1810}, {"loss": 0.7787, "grad_norm": 0.48300236463546753, "learning_rate": 0.0002, "epoch": 0.29423652089564306, "step": 1820}, {"loss": 0.8391, "grad_norm": 0.5600419640541077, "learning_rate": 0.0002, "epoch": 0.29585320507638835, "step": 1830}, {"loss": 0.8507, "grad_norm": 0.48555272817611694, "learning_rate": 0.0002, "epoch": 0.29746988925713364, "step": 1840}, {"loss": 0.7657, "grad_norm": 0.3752668499946594, "learning_rate": 0.0002, "epoch": 0.2990865734378789, "step": 1850}, {"loss": 0.7915, "grad_norm": 0.5328747034072876, "learning_rate": 0.0002, "epoch": 0.3007032576186242, "step": 1860}, {"loss": 0.8426, "grad_norm": 0.48716455698013306, "learning_rate": 0.0002, "epoch": 0.3023199417993695, "step": 1870}, {"loss": 0.8335, "grad_norm": 0.5011493563652039, "learning_rate": 0.0002, "epoch": 0.3039366259801148, "step": 1880}, {"loss": 0.852, "grad_norm": 0.46461427211761475, "learning_rate": 0.0002, "epoch": 0.30555331016086007, "step": 1890}, {"loss": 0.8478, "grad_norm": 0.36630210280418396, "learning_rate": 0.0002, "epoch": 0.30716999434160536, "step": 1900}, {"loss": 0.8162, "grad_norm": 0.4217296242713928, "learning_rate": 0.0002, "epoch": 0.30878667852235064, "step": 1910}, {"loss": 0.8128, "grad_norm": 0.4394875466823578, "learning_rate": 0.0002, "epoch": 0.31040336270309593, "step": 1920}, {"loss": 0.8471, "grad_norm": 0.6587965488433838, "learning_rate": 0.0002, "epoch": 0.3120200468838412, "step": 1930}, {"loss": 0.8565, "grad_norm": 0.5469298958778381, "learning_rate": 0.0002, "epoch": 0.31363673106458656, "step": 1940}, {"loss": 0.8236, "grad_norm": 0.4371595084667206, "learning_rate": 0.0002, "epoch": 0.31525341524533185, "step": 1950}, {"loss": 0.887, "grad_norm": 0.4809541404247284, "learning_rate": 0.0002, "epoch": 0.31687009942607713, "step": 1960}, {"loss": 0.7855, "grad_norm": 0.6061086654663086, "learning_rate": 0.0002, "epoch": 0.3184867836068224, "step": 1970}, {"loss": 0.7679, "grad_norm": 0.5342657566070557, "learning_rate": 0.0002, "epoch": 0.3201034677875677, "step": 1980}, {"loss": 0.7955, "grad_norm": 0.5057743787765503, "learning_rate": 0.0002, "epoch": 0.321720151968313, "step": 1990}, {"loss": 0.7774, "grad_norm": 0.528626024723053, "learning_rate": 0.0002, "epoch": 0.3233368361490583, "step": 2000}, {"loss": 0.8845, "grad_norm": 0.46742770075798035, "learning_rate": 0.0002, "epoch": 0.32495352032980357, "step": 2010}, {"loss": 0.8484, "grad_norm": 0.515101432800293, "learning_rate": 0.0002, "epoch": 0.32657020451054886, "step": 2020}, {"loss": 0.8139, "grad_norm": 0.41941216588020325, "learning_rate": 0.0002, "epoch": 0.32818688869129414, "step": 2030}, {"loss": 0.7637, "grad_norm": 0.49902522563934326, "learning_rate": 0.0002, "epoch": 0.32980357287203943, "step": 2040}, {"loss": 0.7822, "grad_norm": 0.4120897650718689, "learning_rate": 0.0002, "epoch": 0.3314202570527847, "step": 2050}, {"loss": 0.8057, "grad_norm": 0.45352041721343994, "learning_rate": 0.0002, "epoch": 0.33303694123353, "step": 2060}, {"loss": 0.7913, "grad_norm": 0.523199737071991, "learning_rate": 0.0002, "epoch": 0.33465362541427535, "step": 2070}, {"loss": 0.8036, "grad_norm": 0.4390358626842499, "learning_rate": 0.0002, "epoch": 0.33627030959502063, "step": 2080}, {"loss": 0.8145, "grad_norm": 0.6752901077270508, "learning_rate": 0.0002, "epoch": 0.3378869937757659, "step": 2090}, {"loss": 0.7807, "grad_norm": 0.547821044921875, "learning_rate": 0.0002, "epoch": 0.3395036779565112, "step": 2100}, {"loss": 0.8561, "grad_norm": 0.5161308646202087, "learning_rate": 0.0002, "epoch": 0.3411203621372565, "step": 2110}, {"loss": 0.7697, "grad_norm": 0.4565401077270508, "learning_rate": 0.0002, "epoch": 0.3427370463180018, "step": 2120}, {"loss": 0.7964, "grad_norm": 0.4666115939617157, "learning_rate": 0.0002, "epoch": 0.34435373049874707, "step": 2130}, {"loss": 0.8189, "grad_norm": 0.4090428352355957, "learning_rate": 0.0002, "epoch": 0.34597041467949236, "step": 2140}, {"loss": 0.8817, "grad_norm": 0.510845422744751, "learning_rate": 0.0002, "epoch": 0.34758709886023764, "step": 2150}, {"loss": 0.8398, "grad_norm": 0.42861923575401306, "learning_rate": 0.0002, "epoch": 0.34920378304098293, "step": 2160}, {"loss": 0.7716, "grad_norm": 0.4476332664489746, "learning_rate": 0.0002, "epoch": 0.3508204672217282, "step": 2170}, {"loss": 0.7845, "grad_norm": 0.6065791249275208, "learning_rate": 0.0002, "epoch": 0.3524371514024735, "step": 2180}, {"loss": 0.8187, "grad_norm": 0.42335066199302673, "learning_rate": 0.0002, "epoch": 0.35405383558321885, "step": 2190}, {"loss": 0.8239, "grad_norm": 0.5094629526138306, "learning_rate": 0.0002, "epoch": 0.35567051976396413, "step": 2200}, {"loss": 0.7807, "grad_norm": 0.5476373434066772, "learning_rate": 0.0002, "epoch": 0.3572872039447094, "step": 2210}, {"loss": 0.814, "grad_norm": 0.3911719024181366, "learning_rate": 0.0002, "epoch": 0.3589038881254547, "step": 2220}, {"loss": 0.8599, "grad_norm": 0.6599636077880859, "learning_rate": 0.0002, "epoch": 0.3605205723062, "step": 2230}, {"loss": 0.7482, "grad_norm": 0.40381914377212524, "learning_rate": 0.0002, "epoch": 0.3621372564869453, "step": 2240}, {"loss": 0.7772, "grad_norm": 0.4433908462524414, "learning_rate": 0.0002, "epoch": 0.36375394066769057, "step": 2250}, {"loss": 0.8503, "grad_norm": 0.578326940536499, "learning_rate": 0.0002, "epoch": 0.36537062484843585, "step": 2260}, {"loss": 0.8178, "grad_norm": 0.5734784007072449, "learning_rate": 0.0002, "epoch": 0.36698730902918114, "step": 2270}, {"loss": 0.8193, "grad_norm": 0.45555487275123596, "learning_rate": 0.0002, "epoch": 0.36860399320992643, "step": 2280}, {"loss": 0.7929, "grad_norm": 0.5666276216506958, "learning_rate": 0.0002, "epoch": 0.3702206773906717, "step": 2290}, {"loss": 0.8292, "grad_norm": 0.5461117625236511, "learning_rate": 0.0002, "epoch": 0.371837361571417, "step": 2300}, {"loss": 0.8204, "grad_norm": 0.6318911910057068, "learning_rate": 0.0002, "epoch": 0.3734540457521623, "step": 2310}, {"loss": 0.7964, "grad_norm": 0.493263304233551, "learning_rate": 0.0002, "epoch": 0.37507072993290763, "step": 2320}, {"loss": 0.8339, "grad_norm": 0.5888760089874268, "learning_rate": 0.0002, "epoch": 0.3766874141136529, "step": 2330}, {"loss": 0.7737, "grad_norm": 0.48671841621398926, "learning_rate": 0.0002, "epoch": 0.3783040982943982, "step": 2340}, {"loss": 0.8367, "grad_norm": 0.4385145306587219, "learning_rate": 0.0002, "epoch": 0.3799207824751435, "step": 2350}, {"loss": 0.812, "grad_norm": 0.5523318648338318, "learning_rate": 0.0002, "epoch": 0.3815374666558888, "step": 2360}, {"loss": 0.8351, "grad_norm": 0.7308220267295837, "learning_rate": 0.0002, "epoch": 0.38315415083663407, "step": 2370}, {"loss": 0.859, "grad_norm": 0.554214358329773, "learning_rate": 0.0002, "epoch": 0.38477083501737935, "step": 2380}, {"loss": 0.8146, "grad_norm": 0.5425800085067749, "learning_rate": 0.0002, "epoch": 0.38638751919812464, "step": 2390}, {"loss": 0.8282, "grad_norm": 0.48811158537864685, "learning_rate": 0.0002, "epoch": 0.3880042033788699, "step": 2400}, {"loss": 0.8074, "grad_norm": 0.49212366342544556, "learning_rate": 0.0002, "epoch": 0.3896208875596152, "step": 2410}, {"loss": 0.7991, "grad_norm": 0.5222218632698059, "learning_rate": 0.0002, "epoch": 0.3912375717403605, "step": 2420}, {"loss": 0.8182, "grad_norm": 0.4699819087982178, "learning_rate": 0.0002, "epoch": 0.3928542559211058, "step": 2430}, {"loss": 0.7919, "grad_norm": 0.46153587102890015, "learning_rate": 0.0002, "epoch": 0.39447094010185113, "step": 2440}, {"loss": 0.8111, "grad_norm": 0.4150611162185669, "learning_rate": 0.0002, "epoch": 0.3960876242825964, "step": 2450}, {"loss": 0.8589, "grad_norm": 0.5799614787101746, "learning_rate": 0.0002, "epoch": 0.3977043084633417, "step": 2460}, {"loss": 0.8085, "grad_norm": 0.56536865234375, "learning_rate": 0.0002, "epoch": 0.399320992644087, "step": 2470}, {"loss": 0.8022, "grad_norm": 0.5451247096061707, "learning_rate": 0.0002, "epoch": 0.4009376768248323, "step": 2480}, {"loss": 0.8217, "grad_norm": 0.5914521217346191, "learning_rate": 0.0002, "epoch": 0.40255436100557757, "step": 2490}, {"loss": 0.7859, "grad_norm": 0.4428117275238037, "learning_rate": 0.0002, "epoch": 0.40417104518632285, "step": 2500}, {"loss": 0.8054, "grad_norm": 0.48580947518348694, "learning_rate": 0.0002, "epoch": 0.40578772936706814, "step": 2510}, {"loss": 0.8405, "grad_norm": 0.436734676361084, "learning_rate": 0.0002, "epoch": 0.4074044135478134, "step": 2520}, {"loss": 0.8209, "grad_norm": 0.5752223134040833, "learning_rate": 0.0002, "epoch": 0.4090210977285587, "step": 2530}, {"loss": 0.8181, "grad_norm": 0.4271308183670044, "learning_rate": 0.0002, "epoch": 0.410637781909304, "step": 2540}, {"loss": 0.8058, "grad_norm": 0.46294718980789185, "learning_rate": 0.0002, "epoch": 0.4122544660900493, "step": 2550}, {"loss": 0.8473, "grad_norm": 0.49407583475112915, "learning_rate": 0.0002, "epoch": 0.4138711502707946, "step": 2560}, {"loss": 0.7881, "grad_norm": 0.4729035496711731, "learning_rate": 0.0002, "epoch": 0.4154878344515399, "step": 2570}, {"loss": 0.7834, "grad_norm": 0.4129747152328491, "learning_rate": 0.0002, "epoch": 0.4171045186322852, "step": 2580}, {"loss": 0.7859, "grad_norm": 0.5684236288070679, "learning_rate": 0.0002, "epoch": 0.4187212028130305, "step": 2590}, {"loss": 0.811, "grad_norm": 0.4862157106399536, "learning_rate": 0.0002, "epoch": 0.4203378869937758, "step": 2600}, {"loss": 0.7582, "grad_norm": 0.46567976474761963, "learning_rate": 0.0002, "epoch": 0.42195457117452106, "step": 2610}, {"loss": 0.7755, "grad_norm": 0.5710650682449341, "learning_rate": 0.0002, "epoch": 0.42357125535526635, "step": 2620}, {"loss": 0.8573, "grad_norm": 0.5660041570663452, "learning_rate": 0.0002, "epoch": 0.42518793953601164, "step": 2630}, {"loss": 0.7812, "grad_norm": 0.47944375872612, "learning_rate": 0.0002, "epoch": 0.4268046237167569, "step": 2640}, {"loss": 0.7459, "grad_norm": 0.537223756313324, "learning_rate": 0.0002, "epoch": 0.4284213078975022, "step": 2650}, {"loss": 0.8246, "grad_norm": 0.41669997572898865, "learning_rate": 0.0002, "epoch": 0.4300379920782475, "step": 2660}, {"loss": 0.7785, "grad_norm": 0.44727686047554016, "learning_rate": 0.0002, "epoch": 0.4316546762589928, "step": 2670}, {"loss": 0.8241, "grad_norm": 0.5600888729095459, "learning_rate": 0.0002, "epoch": 0.4332713604397381, "step": 2680}, {"loss": 0.7708, "grad_norm": 0.39820605516433716, "learning_rate": 0.0002, "epoch": 0.4348880446204834, "step": 2690}, {"loss": 0.8202, "grad_norm": 0.5637655854225159, "learning_rate": 0.0002, "epoch": 0.4365047288012287, "step": 2700}, {"loss": 0.855, "grad_norm": 0.6363666653633118, "learning_rate": 0.0002, "epoch": 0.438121412981974, "step": 2710}, {"loss": 0.8468, "grad_norm": 0.5656129121780396, "learning_rate": 0.0002, "epoch": 0.4397380971627193, "step": 2720}, {"loss": 0.7845, "grad_norm": 0.5600156188011169, "learning_rate": 0.0002, "epoch": 0.44135478134346456, "step": 2730}, {"loss": 0.8405, "grad_norm": 0.5506579875946045, "learning_rate": 0.0002, "epoch": 0.44297146552420985, "step": 2740}, {"loss": 0.7725, "grad_norm": 0.49878305196762085, "learning_rate": 0.0002, "epoch": 0.44458814970495514, "step": 2750}, {"loss": 0.8292, "grad_norm": 0.4569213092327118, "learning_rate": 0.0002, "epoch": 0.4462048338857004, "step": 2760}, {"loss": 0.8028, "grad_norm": 0.6056680083274841, "learning_rate": 0.0002, "epoch": 0.4478215180664457, "step": 2770}, {"loss": 0.8242, "grad_norm": 0.44474557042121887, "learning_rate": 0.0002, "epoch": 0.449438202247191, "step": 2780}, {"loss": 0.801, "grad_norm": 0.46055394411087036, "learning_rate": 0.0002, "epoch": 0.4510548864279363, "step": 2790}, {"loss": 0.7521, "grad_norm": 0.4904133379459381, "learning_rate": 0.0002, "epoch": 0.4526715706086816, "step": 2800}, {"loss": 0.8829, "grad_norm": 0.5647031664848328, "learning_rate": 0.0002, "epoch": 0.45428825478942686, "step": 2810}, {"loss": 0.8622, "grad_norm": 0.5759473443031311, "learning_rate": 0.0002, "epoch": 0.4559049389701722, "step": 2820}, {"loss": 0.7812, "grad_norm": 0.5161895751953125, "learning_rate": 0.0002, "epoch": 0.4575216231509175, "step": 2830}, {"loss": 0.8045, "grad_norm": 0.4248254597187042, "learning_rate": 0.0002, "epoch": 0.4591383073316628, "step": 2840}, {"loss": 0.7838, "grad_norm": 0.45395001769065857, "learning_rate": 0.0002, "epoch": 0.46075499151240806, "step": 2850}, {"loss": 0.8208, "grad_norm": 0.5358697772026062, "learning_rate": 0.0002, "epoch": 0.46237167569315335, "step": 2860}, {"loss": 0.8147, "grad_norm": 0.5379165410995483, "learning_rate": 0.0002, "epoch": 0.46398835987389864, "step": 2870}, {"loss": 0.7403, "grad_norm": 0.4601989686489105, "learning_rate": 0.0002, "epoch": 0.4656050440546439, "step": 2880}, {"loss": 0.8523, "grad_norm": 0.671115517616272, "learning_rate": 0.0002, "epoch": 0.4672217282353892, "step": 2890}, {"loss": 0.8262, "grad_norm": 0.4425133168697357, "learning_rate": 0.0002, "epoch": 0.4688384124161345, "step": 2900}, {"loss": 0.8178, "grad_norm": 0.5446155071258545, "learning_rate": 0.0002, "epoch": 0.4704550965968798, "step": 2910}, {"loss": 0.8106, "grad_norm": 0.603306233882904, "learning_rate": 0.0002, "epoch": 0.47207178077762507, "step": 2920}, {"loss": 0.8044, "grad_norm": 0.5377997159957886, "learning_rate": 0.0002, "epoch": 0.47368846495837036, "step": 2930}, {"loss": 0.8075, "grad_norm": 0.4931027591228485, "learning_rate": 0.0002, "epoch": 0.4753051491391157, "step": 2940}, {"loss": 0.8004, "grad_norm": 0.4711960256099701, "learning_rate": 0.0002, "epoch": 0.476921833319861, "step": 2950}, {"loss": 0.8121, "grad_norm": 0.5020492672920227, "learning_rate": 0.0002, "epoch": 0.4785385175006063, "step": 2960}, {"loss": 0.8221, "grad_norm": 0.5428946614265442, "learning_rate": 0.0002, "epoch": 0.48015520168135156, "step": 2970}, {"loss": 0.7849, "grad_norm": 0.5294089317321777, "learning_rate": 0.0002, "epoch": 0.48177188586209685, "step": 2980}, {"loss": 0.8553, "grad_norm": 0.648289144039154, "learning_rate": 0.0002, "epoch": 0.48338857004284214, "step": 2990}, {"loss": 0.7874, "grad_norm": 0.47916680574417114, "learning_rate": 0.0002, "epoch": 0.4850052542235874, "step": 3000}, {"loss": 0.8087, "grad_norm": 0.43849772214889526, "learning_rate": 0.0002, "epoch": 0.4866219384043327, "step": 3010}, {"loss": 0.7662, "grad_norm": 0.47007861733436584, "learning_rate": 0.0002, "epoch": 0.488238622585078, "step": 3020}, {"loss": 0.757, "grad_norm": 0.6314331293106079, "learning_rate": 0.0002, "epoch": 0.4898553067658233, "step": 3030}, {"loss": 0.7863, "grad_norm": 0.49211493134498596, "learning_rate": 0.0002, "epoch": 0.49147199094656857, "step": 3040}, {"loss": 0.8335, "grad_norm": 0.4537973403930664, "learning_rate": 0.0002, "epoch": 0.49308867512731386, "step": 3050}, {"loss": 0.8095, "grad_norm": 0.47326919436454773, "learning_rate": 0.0002, "epoch": 0.49470535930805914, "step": 3060}, {"loss": 0.8447, "grad_norm": 0.525874137878418, "learning_rate": 0.0002, "epoch": 0.4963220434888045, "step": 3070}, {"loss": 0.8339, "grad_norm": 0.6361091732978821, "learning_rate": 0.0002, "epoch": 0.4979387276695498, "step": 3080}, {"loss": 0.821, "grad_norm": 0.5850642919540405, "learning_rate": 0.0002, "epoch": 0.49955541185029506, "step": 3090}, {"loss": 0.8279, "grad_norm": 0.47299543023109436, "learning_rate": 0.0002, "epoch": 0.5011720960310403, "step": 3100}, {"loss": 0.8681, "grad_norm": 0.473099946975708, "learning_rate": 0.0002, "epoch": 0.5027887802117856, "step": 3110}, {"loss": 0.8223, "grad_norm": 0.48186397552490234, "learning_rate": 0.0002, "epoch": 0.5044054643925309, "step": 3120}, {"loss": 0.8292, "grad_norm": 0.5015401840209961, "learning_rate": 0.0002, "epoch": 0.5060221485732762, "step": 3130}, {"loss": 0.7692, "grad_norm": 0.5617750287055969, "learning_rate": 0.0002, "epoch": 0.5076388327540216, "step": 3140}, {"loss": 0.8708, "grad_norm": 0.5169327259063721, "learning_rate": 0.0002, "epoch": 0.5092555169347668, "step": 3150}, {"loss": 0.7845, "grad_norm": 0.545657753944397, "learning_rate": 0.0002, "epoch": 0.5108722011155121, "step": 3160}, {"loss": 0.799, "grad_norm": 0.512864351272583, "learning_rate": 0.0002, "epoch": 0.5124888852962574, "step": 3170}, {"loss": 0.7794, "grad_norm": 0.4113546311855316, "learning_rate": 0.0002, "epoch": 0.5141055694770027, "step": 3180}, {"loss": 0.8206, "grad_norm": 0.44532445073127747, "learning_rate": 0.0002, "epoch": 0.5157222536577479, "step": 3190}, {"loss": 0.8213, "grad_norm": 0.5623497366905212, "learning_rate": 0.0002, "epoch": 0.5173389378384933, "step": 3200}, {"loss": 0.7928, "grad_norm": 0.5084741115570068, "learning_rate": 0.0002, "epoch": 0.5189556220192385, "step": 3210}, {"loss": 0.8174, "grad_norm": 0.5305403470993042, "learning_rate": 0.0002, "epoch": 0.5205723061999838, "step": 3220}, {"loss": 0.8139, "grad_norm": 0.4708254337310791, "learning_rate": 0.0002, "epoch": 0.5221889903807291, "step": 3230}, {"loss": 0.7639, "grad_norm": 0.43827131390571594, "learning_rate": 0.0002, "epoch": 0.5238056745614744, "step": 3240}, {"loss": 0.7993, "grad_norm": 0.5630002617835999, "learning_rate": 0.0002, "epoch": 0.5254223587422197, "step": 3250}, {"loss": 0.7522, "grad_norm": 0.5010961890220642, "learning_rate": 0.0002, "epoch": 0.527039042922965, "step": 3260}, {"loss": 0.8374, "grad_norm": 0.6303122043609619, "learning_rate": 0.0002, "epoch": 0.5286557271037103, "step": 3270}, {"loss": 0.7727, "grad_norm": 0.5107331275939941, "learning_rate": 0.0002, "epoch": 0.5302724112844556, "step": 3280}, {"loss": 0.8495, "grad_norm": 0.5700443387031555, "learning_rate": 0.0002, "epoch": 0.5318890954652009, "step": 3290}, {"loss": 0.7776, "grad_norm": 0.46296367049217224, "learning_rate": 0.0002, "epoch": 0.5335057796459461, "step": 3300}, {"loss": 0.7931, "grad_norm": 0.531568706035614, "learning_rate": 0.0002, "epoch": 0.5351224638266915, "step": 3310}, {"loss": 0.843, "grad_norm": 0.4686741530895233, "learning_rate": 0.0002, "epoch": 0.5367391480074367, "step": 3320}, {"loss": 0.8104, "grad_norm": 0.5404331088066101, "learning_rate": 0.0002, "epoch": 0.5383558321881821, "step": 3330}, {"loss": 0.7686, "grad_norm": 0.6368790864944458, "learning_rate": 0.0002, "epoch": 0.5399725163689273, "step": 3340}, {"loss": 0.8514, "grad_norm": 0.42300888895988464, "learning_rate": 0.0002, "epoch": 0.5415892005496726, "step": 3350}, {"loss": 0.8236, "grad_norm": 0.5362542867660522, "learning_rate": 0.0002, "epoch": 0.5432058847304179, "step": 3360}, {"loss": 0.858, "grad_norm": 0.497128963470459, "learning_rate": 0.0002, "epoch": 0.5448225689111632, "step": 3370}, {"loss": 0.8519, "grad_norm": 0.5006386041641235, "learning_rate": 0.0002, "epoch": 0.5464392530919085, "step": 3380}, {"loss": 0.7867, "grad_norm": 0.44136837124824524, "learning_rate": 0.0002, "epoch": 0.5480559372726538, "step": 3390}, {"loss": 0.773, "grad_norm": 0.5897833108901978, "learning_rate": 0.0002, "epoch": 0.5496726214533991, "step": 3400}, {"loss": 0.8895, "grad_norm": 0.641075611114502, "learning_rate": 0.0002, "epoch": 0.5512893056341444, "step": 3410}, {"loss": 0.7827, "grad_norm": 0.7251322269439697, "learning_rate": 0.0002, "epoch": 0.5529059898148897, "step": 3420}, {"loss": 0.7626, "grad_norm": 0.47411349415779114, "learning_rate": 0.0002, "epoch": 0.5545226739956349, "step": 3430}, {"loss": 0.8196, "grad_norm": 0.4994310438632965, "learning_rate": 0.0002, "epoch": 0.5561393581763803, "step": 3440}, {"loss": 0.7812, "grad_norm": 0.5814438462257385, "learning_rate": 0.0002, "epoch": 0.5577560423571255, "step": 3450}, {"loss": 0.8805, "grad_norm": 0.6278898119926453, "learning_rate": 0.0002, "epoch": 0.5593727265378708, "step": 3460}, {"loss": 0.813, "grad_norm": 0.46208274364471436, "learning_rate": 0.0002, "epoch": 0.5609894107186161, "step": 3470}, {"loss": 0.8295, "grad_norm": 0.5718930959701538, "learning_rate": 0.0002, "epoch": 0.5626060948993614, "step": 3480}, {"loss": 0.8152, "grad_norm": 0.48178744316101074, "learning_rate": 0.0002, "epoch": 0.5642227790801067, "step": 3490}, {"loss": 0.8244, "grad_norm": 0.47336965799331665, "learning_rate": 0.0002, "epoch": 0.565839463260852, "step": 3500}, {"loss": 0.8099, "grad_norm": 0.43442684412002563, "learning_rate": 0.0002, "epoch": 0.5674561474415973, "step": 3510}, {"loss": 0.7564, "grad_norm": 0.6463358998298645, "learning_rate": 0.0002, "epoch": 0.5690728316223426, "step": 3520}, {"loss": 0.836, "grad_norm": 0.5286486744880676, "learning_rate": 0.0002, "epoch": 0.5706895158030879, "step": 3530}, {"loss": 0.8421, "grad_norm": 0.5405499935150146, "learning_rate": 0.0002, "epoch": 0.5723061999838331, "step": 3540}, {"loss": 0.7614, "grad_norm": 0.6654391884803772, "learning_rate": 0.0002, "epoch": 0.5739228841645785, "step": 3550}, {"loss": 0.7803, "grad_norm": 0.5081980228424072, "learning_rate": 0.0002, "epoch": 0.5755395683453237, "step": 3560}, {"loss": 0.7753, "grad_norm": 0.48978179693222046, "learning_rate": 0.0002, "epoch": 0.5771562525260691, "step": 3570}, {"loss": 0.8151, "grad_norm": 0.5840612053871155, "learning_rate": 0.0002, "epoch": 0.5787729367068143, "step": 3580}, {"loss": 0.8937, "grad_norm": 0.5235261917114258, "learning_rate": 0.0002, "epoch": 0.5803896208875596, "step": 3590}, {"loss": 0.7894, "grad_norm": 0.5672075748443604, "learning_rate": 0.0002, "epoch": 0.5820063050683049, "step": 3600}, {"loss": 0.8347, "grad_norm": 0.5613429546356201, "learning_rate": 0.0002, "epoch": 0.5836229892490502, "step": 3610}, {"loss": 0.8274, "grad_norm": 0.4032273590564728, "learning_rate": 0.0002, "epoch": 0.5852396734297954, "step": 3620}, {"loss": 0.8421, "grad_norm": 0.49559324979782104, "learning_rate": 0.0002, "epoch": 0.5868563576105408, "step": 3630}, {"loss": 0.8332, "grad_norm": 0.6895697712898254, "learning_rate": 0.0002, "epoch": 0.5884730417912861, "step": 3640}, {"loss": 0.7877, "grad_norm": 0.4750136435031891, "learning_rate": 0.0002, "epoch": 0.5900897259720314, "step": 3650}, {"loss": 0.8219, "grad_norm": 0.5176819562911987, "learning_rate": 0.0002, "epoch": 0.5917064101527767, "step": 3660}, {"loss": 0.8151, "grad_norm": 0.5817760229110718, "learning_rate": 0.0002, "epoch": 0.5933230943335219, "step": 3670}, {"loss": 0.7823, "grad_norm": 0.6064626574516296, "learning_rate": 0.0002, "epoch": 0.5949397785142673, "step": 3680}, {"loss": 0.8422, "grad_norm": 0.6728700995445251, "learning_rate": 0.0002, "epoch": 0.5965564626950125, "step": 3690}, {"loss": 0.7679, "grad_norm": 0.609305202960968, "learning_rate": 0.0002, "epoch": 0.5981731468757578, "step": 3700}, {"loss": 0.8048, "grad_norm": 0.4615488350391388, "learning_rate": 0.0002, "epoch": 0.5997898310565031, "step": 3710}, {"loss": 0.8214, "grad_norm": 2.0531179904937744, "learning_rate": 0.0002, "epoch": 0.6014065152372484, "step": 3720}, {"loss": 0.8158, "grad_norm": 0.5091132521629333, "learning_rate": 0.0002, "epoch": 0.6030231994179936, "step": 3730}, {"loss": 0.7833, "grad_norm": 0.5951124429702759, "learning_rate": 0.0002, "epoch": 0.604639883598739, "step": 3740}, {"loss": 0.7784, "grad_norm": 0.5870208144187927, "learning_rate": 0.0002, "epoch": 0.6062565677794842, "step": 3750}, {"loss": 0.8044, "grad_norm": 0.6254619359970093, "learning_rate": 0.0002, "epoch": 0.6078732519602296, "step": 3760}, {"loss": 0.7868, "grad_norm": 0.5577626824378967, "learning_rate": 0.0002, "epoch": 0.6094899361409749, "step": 3770}, {"loss": 0.8108, "grad_norm": 0.5004405379295349, "learning_rate": 0.0002, "epoch": 0.6111066203217201, "step": 3780}, {"loss": 0.8092, "grad_norm": 0.5527383685112, "learning_rate": 0.0002, "epoch": 0.6127233045024655, "step": 3790}, {"loss": 0.8036, "grad_norm": 0.49116113781929016, "learning_rate": 0.0002, "epoch": 0.6143399886832107, "step": 3800}, {"loss": 0.8352, "grad_norm": 0.5299299359321594, "learning_rate": 0.0002, "epoch": 0.6159566728639561, "step": 3810}, {"loss": 0.7737, "grad_norm": 0.464897483587265, "learning_rate": 0.0002, "epoch": 0.6175733570447013, "step": 3820}, {"loss": 0.7923, "grad_norm": 0.6505740880966187, "learning_rate": 0.0002, "epoch": 0.6191900412254466, "step": 3830}, {"loss": 0.8123, "grad_norm": 0.5512559413909912, "learning_rate": 0.0002, "epoch": 0.6208067254061919, "step": 3840}, {"loss": 0.8856, "grad_norm": 0.49427518248558044, "learning_rate": 0.0002, "epoch": 0.6224234095869372, "step": 3850}, {"loss": 0.7751, "grad_norm": 0.3839147090911865, "learning_rate": 0.0002, "epoch": 0.6240400937676824, "step": 3860}, {"loss": 0.8006, "grad_norm": 0.5760218501091003, "learning_rate": 0.0002, "epoch": 0.6256567779484278, "step": 3870}, {"loss": 0.7836, "grad_norm": 0.7226507067680359, "learning_rate": 0.0002, "epoch": 0.6272734621291731, "step": 3880}, {"loss": 0.8244, "grad_norm": 0.676781415939331, "learning_rate": 0.0002, "epoch": 0.6288901463099184, "step": 3890}, {"loss": 0.8239, "grad_norm": 0.4284018278121948, "learning_rate": 0.0002, "epoch": 0.6305068304906637, "step": 3900}, {"loss": 0.7996, "grad_norm": 0.5060628056526184, "learning_rate": 0.0002, "epoch": 0.6321235146714089, "step": 3910}, {"loss": 0.8089, "grad_norm": 0.5524522066116333, "learning_rate": 0.0002, "epoch": 0.6337401988521543, "step": 3920}, {"loss": 0.8276, "grad_norm": 0.6099881529808044, "learning_rate": 0.0002, "epoch": 0.6353568830328995, "step": 3930}, {"loss": 0.809, "grad_norm": 0.43155938386917114, "learning_rate": 0.0002, "epoch": 0.6369735672136448, "step": 3940}, {"loss": 0.8404, "grad_norm": 0.6427084803581238, "learning_rate": 0.0002, "epoch": 0.6385902513943901, "step": 3950}, {"loss": 0.8368, "grad_norm": 0.541220486164093, "learning_rate": 0.0002, "epoch": 0.6402069355751354, "step": 3960}, {"loss": 0.8539, "grad_norm": 0.5414294600486755, "learning_rate": 0.0002, "epoch": 0.6418236197558806, "step": 3970}, {"loss": 0.7996, "grad_norm": 0.46344003081321716, "learning_rate": 0.0002, "epoch": 0.643440303936626, "step": 3980}, {"loss": 0.7474, "grad_norm": 0.45209285616874695, "learning_rate": 0.0002, "epoch": 0.6450569881173712, "step": 3990}, {"loss": 0.8202, "grad_norm": 0.5417284369468689, "learning_rate": 0.0002, "epoch": 0.6466736722981166, "step": 4000}, {"loss": 0.7563, "grad_norm": 0.7995685935020447, "learning_rate": 0.0002, "epoch": 0.6482903564788619, "step": 4010}, {"loss": 0.7812, "grad_norm": 0.6384002566337585, "learning_rate": 0.0002, "epoch": 0.6499070406596071, "step": 4020}, {"loss": 0.732, "grad_norm": 0.4472815692424774, "learning_rate": 0.0002, "epoch": 0.6515237248403525, "step": 4030}, {"loss": 0.8071, "grad_norm": 0.6834294199943542, "learning_rate": 0.0002, "epoch": 0.6531404090210977, "step": 4040}, {"loss": 0.7812, "grad_norm": 0.4612339735031128, "learning_rate": 0.0002, "epoch": 0.654757093201843, "step": 4050}, {"loss": 0.8141, "grad_norm": 0.9266576170921326, "learning_rate": 0.0002, "epoch": 0.6563737773825883, "step": 4060}, {"loss": 0.7991, "grad_norm": 0.4470861852169037, "learning_rate": 0.0002, "epoch": 0.6579904615633336, "step": 4070}, {"loss": 0.8293, "grad_norm": 0.45544925332069397, "learning_rate": 0.0002, "epoch": 0.6596071457440789, "step": 4080}, {"loss": 0.8455, "grad_norm": 0.6144481301307678, "learning_rate": 0.0002, "epoch": 0.6612238299248242, "step": 4090}, {"loss": 0.7877, "grad_norm": 0.5936288237571716, "learning_rate": 0.0002, "epoch": 0.6628405141055694, "step": 4100}, {"loss": 0.7617, "grad_norm": 0.4822963774204254, "learning_rate": 0.0002, "epoch": 0.6644571982863148, "step": 4110}, {"loss": 0.7997, "grad_norm": 0.48432496190071106, "learning_rate": 0.0002, "epoch": 0.66607388246706, "step": 4120}, {"loss": 0.8404, "grad_norm": 0.4901607930660248, "learning_rate": 0.0002, "epoch": 0.6676905666478054, "step": 4130}, {"loss": 0.8085, "grad_norm": 0.5018393397331238, "learning_rate": 0.0002, "epoch": 0.6693072508285507, "step": 4140}, {"loss": 0.8065, "grad_norm": 0.6946378946304321, "learning_rate": 0.0002, "epoch": 0.6709239350092959, "step": 4150}, {"loss": 0.8147, "grad_norm": 0.5997390747070312, "learning_rate": 0.0002, "epoch": 0.6725406191900413, "step": 4160}, {"loss": 0.8268, "grad_norm": 0.6738849878311157, "learning_rate": 0.0002, "epoch": 0.6741573033707865, "step": 4170}, {"loss": 0.7704, "grad_norm": 0.6110581159591675, "learning_rate": 0.0002, "epoch": 0.6757739875515318, "step": 4180}, {"loss": 0.8043, "grad_norm": 0.5703322291374207, "learning_rate": 0.0002, "epoch": 0.6773906717322771, "step": 4190}, {"loss": 0.8099, "grad_norm": 0.4686066210269928, "learning_rate": 0.0002, "epoch": 0.6790073559130224, "step": 4200}, {"loss": 0.8441, "grad_norm": 0.6394643783569336, "learning_rate": 0.0002, "epoch": 0.6806240400937676, "step": 4210}, {"loss": 0.8011, "grad_norm": 0.5454841256141663, "learning_rate": 0.0002, "epoch": 0.682240724274513, "step": 4220}, {"loss": 0.8307, "grad_norm": 0.4859732985496521, "learning_rate": 0.0002, "epoch": 0.6838574084552582, "step": 4230}, {"loss": 0.8161, "grad_norm": 0.5544065833091736, "learning_rate": 0.0002, "epoch": 0.6854740926360036, "step": 4240}, {"loss": 0.7839, "grad_norm": 0.4902505576610565, "learning_rate": 0.0002, "epoch": 0.6870907768167488, "step": 4250}, {"loss": 0.7977, "grad_norm": 0.4768051505088806, "learning_rate": 0.0002, "epoch": 0.6887074609974941, "step": 4260}, {"loss": 0.7539, "grad_norm": 0.49982190132141113, "learning_rate": 0.0002, "epoch": 0.6903241451782395, "step": 4270}, {"loss": 0.7353, "grad_norm": 0.6351838111877441, "learning_rate": 0.0002, "epoch": 0.6919408293589847, "step": 4280}, {"loss": 0.7664, "grad_norm": 0.5647561550140381, "learning_rate": 0.0002, "epoch": 0.69355751353973, "step": 4290}, {"loss": 0.7618, "grad_norm": 0.5340486764907837, "learning_rate": 0.0002, "epoch": 0.6951741977204753, "step": 4300}, {"loss": 0.8526, "grad_norm": 0.5649092793464661, "learning_rate": 0.0002, "epoch": 0.6967908819012206, "step": 4310}, {"loss": 0.8246, "grad_norm": 0.6183916926383972, "learning_rate": 0.0002, "epoch": 0.6984075660819659, "step": 4320}, {"loss": 0.792, "grad_norm": 0.6154509782791138, "learning_rate": 0.0002, "epoch": 0.7000242502627112, "step": 4330}, {"loss": 0.8397, "grad_norm": 0.5156264305114746, "learning_rate": 0.0002, "epoch": 0.7016409344434564, "step": 4340}, {"loss": 0.8512, "grad_norm": 0.562171459197998, "learning_rate": 0.0002, "epoch": 0.7032576186242018, "step": 4350}, {"loss": 0.7882, "grad_norm": 0.4949502646923065, "learning_rate": 0.0002, "epoch": 0.704874302804947, "step": 4360}, {"loss": 0.738, "grad_norm": 0.5171684622764587, "learning_rate": 0.0002, "epoch": 0.7064909869856923, "step": 4370}, {"loss": 0.8001, "grad_norm": 0.6198443174362183, "learning_rate": 0.0002, "epoch": 0.7081076711664377, "step": 4380}, {"loss": 0.7606, "grad_norm": 0.5802276134490967, "learning_rate": 0.0002, "epoch": 0.7097243553471829, "step": 4390}, {"loss": 0.8797, "grad_norm": 0.41096967458724976, "learning_rate": 0.0002, "epoch": 0.7113410395279283, "step": 4400}, {"loss": 0.805, "grad_norm": 0.4397392272949219, "learning_rate": 0.0002, "epoch": 0.7129577237086735, "step": 4410}, {"loss": 0.7651, "grad_norm": 0.45228442549705505, "learning_rate": 0.0002, "epoch": 0.7145744078894188, "step": 4420}, {"loss": 0.7938, "grad_norm": 0.4839673936367035, "learning_rate": 0.0002, "epoch": 0.7161910920701641, "step": 4430}, {"loss": 0.8362, "grad_norm": 0.6140755414962769, "learning_rate": 0.0002, "epoch": 0.7178077762509094, "step": 4440}, {"loss": 0.7722, "grad_norm": 0.6841378808021545, "learning_rate": 0.0002, "epoch": 0.7194244604316546, "step": 4450}, {"loss": 0.8177, "grad_norm": 0.6664239168167114, "learning_rate": 0.0002, "epoch": 0.7210411446124, "step": 4460}, {"loss": 0.7983, "grad_norm": 0.47552719712257385, "learning_rate": 0.0002, "epoch": 0.7226578287931452, "step": 4470}, {"loss": 0.8982, "grad_norm": 0.6649776101112366, "learning_rate": 0.0002, "epoch": 0.7242745129738906, "step": 4480}, {"loss": 0.8074, "grad_norm": 0.5159541964530945, "learning_rate": 0.0002, "epoch": 0.7258911971546358, "step": 4490}, {"loss": 0.7786, "grad_norm": 0.6693112850189209, "learning_rate": 0.0002, "epoch": 0.7275078813353811, "step": 4500}, {"loss": 0.8655, "grad_norm": 0.48870977759361267, "learning_rate": 0.0002, "epoch": 0.7291245655161265, "step": 4510}, {"loss": 0.7337, "grad_norm": 0.4857887923717499, "learning_rate": 0.0002, "epoch": 0.7307412496968717, "step": 4520}, {"loss": 0.8026, "grad_norm": 0.5515662431716919, "learning_rate": 0.0002, "epoch": 0.732357933877617, "step": 4530}, {"loss": 0.8031, "grad_norm": 0.6292222738265991, "learning_rate": 0.0002, "epoch": 0.7339746180583623, "step": 4540}, {"loss": 0.7749, "grad_norm": 0.48265689611434937, "learning_rate": 0.0002, "epoch": 0.7355913022391076, "step": 4550}, {"loss": 0.8499, "grad_norm": 0.8044266104698181, "learning_rate": 0.0002, "epoch": 0.7372079864198529, "step": 4560}, {"loss": 0.8162, "grad_norm": 0.6111769676208496, "learning_rate": 0.0002, "epoch": 0.7388246706005982, "step": 4570}, {"loss": 0.7291, "grad_norm": 0.5229553580284119, "learning_rate": 0.0002, "epoch": 0.7404413547813434, "step": 4580}, {"loss": 0.8038, "grad_norm": 0.6054152250289917, "learning_rate": 0.0002, "epoch": 0.7420580389620888, "step": 4590}, {"loss": 0.8169, "grad_norm": 0.5574966669082642, "learning_rate": 0.0002, "epoch": 0.743674723142834, "step": 4600}, {"loss": 0.8439, "grad_norm": 0.5395817160606384, "learning_rate": 0.0002, "epoch": 0.7452914073235793, "step": 4610}, {"loss": 0.8495, "grad_norm": 0.7116472721099854, "learning_rate": 0.0002, "epoch": 0.7469080915043246, "step": 4620}, {"loss": 0.7743, "grad_norm": 0.5618700981140137, "learning_rate": 0.0002, "epoch": 0.7485247756850699, "step": 4630}, {"loss": 0.7744, "grad_norm": 0.5802770853042603, "learning_rate": 0.0002, "epoch": 0.7501414598658153, "step": 4640}, {"loss": 0.7924, "grad_norm": 0.5690428018569946, "learning_rate": 0.0002, "epoch": 0.7517581440465605, "step": 4650}, {"loss": 0.8017, "grad_norm": 0.4813360273838043, "learning_rate": 0.0002, "epoch": 0.7533748282273058, "step": 4660}, {"loss": 0.8108, "grad_norm": 0.5434042811393738, "learning_rate": 0.0002, "epoch": 0.7549915124080511, "step": 4670}, {"loss": 0.7824, "grad_norm": 0.5502099990844727, "learning_rate": 0.0002, "epoch": 0.7566081965887964, "step": 4680}, {"loss": 0.8598, "grad_norm": 0.6020621061325073, "learning_rate": 0.0002, "epoch": 0.7582248807695416, "step": 4690}, {"loss": 0.7937, "grad_norm": 0.4922301471233368, "learning_rate": 0.0002, "epoch": 0.759841564950287, "step": 4700}, {"loss": 0.788, "grad_norm": 0.6492828726768494, "learning_rate": 0.0002, "epoch": 0.7614582491310322, "step": 4710}, {"loss": 0.8313, "grad_norm": 0.4865580201148987, "learning_rate": 0.0002, "epoch": 0.7630749333117776, "step": 4720}, {"loss": 0.7966, "grad_norm": 0.5971422791481018, "learning_rate": 0.0002, "epoch": 0.7646916174925228, "step": 4730}, {"loss": 0.8298, "grad_norm": 0.6832674145698547, "learning_rate": 0.0002, "epoch": 0.7663083016732681, "step": 4740}, {"loss": 0.8156, "grad_norm": 0.500908613204956, "learning_rate": 0.0002, "epoch": 0.7679249858540134, "step": 4750}, {"loss": 0.8383, "grad_norm": 0.6112465858459473, "learning_rate": 0.0002, "epoch": 0.7695416700347587, "step": 4760}, {"loss": 0.76, "grad_norm": 0.5753506422042847, "learning_rate": 0.0002, "epoch": 0.771158354215504, "step": 4770}, {"loss": 0.8297, "grad_norm": 0.6529405117034912, "learning_rate": 0.0002, "epoch": 0.7727750383962493, "step": 4780}, {"loss": 0.8171, "grad_norm": 0.5916843414306641, "learning_rate": 0.0002, "epoch": 0.7743917225769946, "step": 4790}, {"loss": 0.83, "grad_norm": 0.4821224510669708, "learning_rate": 0.0002, "epoch": 0.7760084067577399, "step": 4800}, {"loss": 0.7703, "grad_norm": 0.5532580018043518, "learning_rate": 0.0002, "epoch": 0.7776250909384852, "step": 4810}, {"loss": 0.7363, "grad_norm": 0.4604877233505249, "learning_rate": 0.0002, "epoch": 0.7792417751192304, "step": 4820}, {"loss": 0.7506, "grad_norm": 0.5009613037109375, "learning_rate": 0.0002, "epoch": 0.7808584592999758, "step": 4830}, {"loss": 0.7863, "grad_norm": 0.6448560357093811, "learning_rate": 0.0002, "epoch": 0.782475143480721, "step": 4840}, {"loss": 0.7957, "grad_norm": 0.44327953457832336, "learning_rate": 0.0002, "epoch": 0.7840918276614663, "step": 4850}, {"loss": 0.7925, "grad_norm": 0.5355411171913147, "learning_rate": 0.0002, "epoch": 0.7857085118422116, "step": 4860}, {"loss": 0.7754, "grad_norm": 0.5635677576065063, "learning_rate": 0.0002, "epoch": 0.7873251960229569, "step": 4870}, {"loss": 0.7931, "grad_norm": 0.5417491793632507, "learning_rate": 0.0002, "epoch": 0.7889418802037023, "step": 4880}, {"loss": 0.7819, "grad_norm": 0.4567430913448334, "learning_rate": 0.0002, "epoch": 0.7905585643844475, "step": 4890}, {"loss": 0.8454, "grad_norm": 0.44651296734809875, "learning_rate": 0.0002, "epoch": 0.7921752485651928, "step": 4900}, {"loss": 0.7959, "grad_norm": 0.5741217136383057, "learning_rate": 0.0002, "epoch": 0.7937919327459381, "step": 4910}, {"loss": 0.8093, "grad_norm": 0.6605045199394226, "learning_rate": 0.0002, "epoch": 0.7954086169266834, "step": 4920}, {"loss": 0.77, "grad_norm": 0.5126531720161438, "learning_rate": 0.0002, "epoch": 0.7970253011074286, "step": 4930}, {"loss": 0.7793, "grad_norm": 0.513648271560669, "learning_rate": 0.0002, "epoch": 0.798641985288174, "step": 4940}, {"loss": 0.8314, "grad_norm": 0.5350404381752014, "learning_rate": 0.0002, "epoch": 0.8002586694689192, "step": 4950}, {"loss": 0.7649, "grad_norm": 0.5731674432754517, "learning_rate": 0.0002, "epoch": 0.8018753536496646, "step": 4960}, {"loss": 0.8572, "grad_norm": 0.5974258184432983, "learning_rate": 0.0002, "epoch": 0.8034920378304098, "step": 4970}, {"loss": 0.7972, "grad_norm": 0.8774799704551697, "learning_rate": 0.0002, "epoch": 0.8051087220111551, "step": 4980}, {"loss": 0.7899, "grad_norm": 0.5994430184364319, "learning_rate": 0.0002, "epoch": 0.8067254061919004, "step": 4990}, {"loss": 0.7736, "grad_norm": 0.4894903004169464, "learning_rate": 0.0002, "epoch": 0.8083420903726457, "step": 5000}, {"loss": 0.78, "grad_norm": 0.5218459367752075, "learning_rate": 0.0002, "epoch": 0.809958774553391, "step": 5010}, {"loss": 0.817, "grad_norm": 0.5232468843460083, "learning_rate": 0.0002, "epoch": 0.8115754587341363, "step": 5020}, {"loss": 0.7704, "grad_norm": 0.44358372688293457, "learning_rate": 0.0002, "epoch": 0.8131921429148816, "step": 5030}, {"loss": 0.785, "grad_norm": 0.6202037334442139, "learning_rate": 0.0002, "epoch": 0.8148088270956269, "step": 5040}, {"loss": 0.7351, "grad_norm": 0.7721474170684814, "learning_rate": 0.0002, "epoch": 0.8164255112763722, "step": 5050}, {"loss": 0.8297, "grad_norm": 0.5568501353263855, "learning_rate": 0.0002, "epoch": 0.8180421954571174, "step": 5060}, {"loss": 0.7733, "grad_norm": 0.49148809909820557, "learning_rate": 0.0002, "epoch": 0.8196588796378628, "step": 5070}, {"loss": 0.8054, "grad_norm": 0.4956012964248657, "learning_rate": 0.0002, "epoch": 0.821275563818608, "step": 5080}, {"loss": 0.8201, "grad_norm": 0.6078833937644958, "learning_rate": 0.0002, "epoch": 0.8228922479993533, "step": 5090}, {"loss": 0.828, "grad_norm": 0.46906954050064087, "learning_rate": 0.0002, "epoch": 0.8245089321800986, "step": 5100}, {"loss": 0.7703, "grad_norm": 0.50812166929245, "learning_rate": 0.0002, "epoch": 0.8261256163608439, "step": 5110}, {"loss": 0.8243, "grad_norm": 0.5319661498069763, "learning_rate": 0.0002, "epoch": 0.8277423005415891, "step": 5120}, {"loss": 0.7798, "grad_norm": 0.4949689209461212, "learning_rate": 0.0002, "epoch": 0.8293589847223345, "step": 5130}, {"loss": 0.7428, "grad_norm": 0.5151591300964355, "learning_rate": 0.0002, "epoch": 0.8309756689030798, "step": 5140}, {"loss": 0.8147, "grad_norm": 0.5530214309692383, "learning_rate": 0.0002, "epoch": 0.8325923530838251, "step": 5150}, {"loss": 0.8251, "grad_norm": 0.6297410130500793, "learning_rate": 0.0002, "epoch": 0.8342090372645704, "step": 5160}, {"loss": 0.8067, "grad_norm": 0.5466840267181396, "learning_rate": 0.0002, "epoch": 0.8358257214453156, "step": 5170}, {"loss": 0.7875, "grad_norm": 0.652913510799408, "learning_rate": 0.0002, "epoch": 0.837442405626061, "step": 5180}, {"loss": 0.8295, "grad_norm": 0.5811293125152588, "learning_rate": 0.0002, "epoch": 0.8390590898068062, "step": 5190}, {"loss": 0.7412, "grad_norm": 0.5109550952911377, "learning_rate": 0.0002, "epoch": 0.8406757739875516, "step": 5200}, {"loss": 0.8077, "grad_norm": 0.4551706612110138, "learning_rate": 0.0002, "epoch": 0.8422924581682968, "step": 5210}, {"loss": 0.7827, "grad_norm": 0.5813754200935364, "learning_rate": 0.0002, "epoch": 0.8439091423490421, "step": 5220}, {"loss": 0.802, "grad_norm": 0.5856947898864746, "learning_rate": 0.0002, "epoch": 0.8455258265297874, "step": 5230}, {"loss": 0.7957, "grad_norm": 0.5482739210128784, "learning_rate": 0.0002, "epoch": 0.8471425107105327, "step": 5240}, {"loss": 0.8295, "grad_norm": 0.49023720622062683, "learning_rate": 0.0002, "epoch": 0.8487591948912779, "step": 5250}, {"loss": 0.8022, "grad_norm": 0.49472475051879883, "learning_rate": 0.0002, "epoch": 0.8503758790720233, "step": 5260}, {"loss": 0.8001, "grad_norm": 0.5490226745605469, "learning_rate": 0.0002, "epoch": 0.8519925632527686, "step": 5270}, {"loss": 0.8333, "grad_norm": 0.5340665578842163, "learning_rate": 0.0002, "epoch": 0.8536092474335139, "step": 5280}, {"loss": 0.8277, "grad_norm": 0.5962483882904053, "learning_rate": 0.0002, "epoch": 0.8552259316142592, "step": 5290}, {"loss": 0.8765, "grad_norm": 0.586358368396759, "learning_rate": 0.0002, "epoch": 0.8568426157950044, "step": 5300}, {"loss": 0.7831, "grad_norm": 0.49120277166366577, "learning_rate": 0.0002, "epoch": 0.8584592999757498, "step": 5310}, {"loss": 0.8162, "grad_norm": 0.5887332558631897, "learning_rate": 0.0002, "epoch": 0.860075984156495, "step": 5320}, {"loss": 0.7464, "grad_norm": 0.42496153712272644, "learning_rate": 0.0002, "epoch": 0.8616926683372403, "step": 5330}, {"loss": 0.7905, "grad_norm": 0.5489874482154846, "learning_rate": 0.0002, "epoch": 0.8633093525179856, "step": 5340}, {"loss": 0.7958, "grad_norm": 0.5850813984870911, "learning_rate": 0.0002, "epoch": 0.8649260366987309, "step": 5350}, {"loss": 0.7642, "grad_norm": 0.517487108707428, "learning_rate": 0.0002, "epoch": 0.8665427208794761, "step": 5360}, {"loss": 0.7801, "grad_norm": 0.5339142680168152, "learning_rate": 0.0002, "epoch": 0.8681594050602215, "step": 5370}, {"loss": 0.818, "grad_norm": 0.6236387491226196, "learning_rate": 0.0002, "epoch": 0.8697760892409668, "step": 5380}, {"loss": 0.7708, "grad_norm": 0.5752192735671997, "learning_rate": 0.0002, "epoch": 0.8713927734217121, "step": 5390}, {"loss": 0.8542, "grad_norm": 0.6724614500999451, "learning_rate": 0.0002, "epoch": 0.8730094576024574, "step": 5400}, {"loss": 0.7581, "grad_norm": 0.5280613303184509, "learning_rate": 0.0002, "epoch": 0.8746261417832026, "step": 5410}, {"loss": 0.8231, "grad_norm": 0.44033288955688477, "learning_rate": 0.0002, "epoch": 0.876242825963948, "step": 5420}, {"loss": 0.8839, "grad_norm": 0.5199708342552185, "learning_rate": 0.0002, "epoch": 0.8778595101446932, "step": 5430}, {"loss": 0.7852, "grad_norm": 0.46778348088264465, "learning_rate": 0.0002, "epoch": 0.8794761943254386, "step": 5440}, {"loss": 0.7834, "grad_norm": 0.4657754898071289, "learning_rate": 0.0002, "epoch": 0.8810928785061838, "step": 5450}, {"loss": 0.7799, "grad_norm": 0.5472902655601501, "learning_rate": 0.0002, "epoch": 0.8827095626869291, "step": 5460}, {"loss": 0.8253, "grad_norm": 0.4876766800880432, "learning_rate": 0.0002, "epoch": 0.8843262468676744, "step": 5470}, {"loss": 0.7906, "grad_norm": 0.5057248473167419, "learning_rate": 0.0002, "epoch": 0.8859429310484197, "step": 5480}, {"loss": 0.8124, "grad_norm": 0.4637320637702942, "learning_rate": 0.0002, "epoch": 0.8875596152291649, "step": 5490}, {"loss": 0.781, "grad_norm": 0.471955806016922, "learning_rate": 0.0002, "epoch": 0.8891762994099103, "step": 5500}, {"loss": 0.8057, "grad_norm": 0.5209813714027405, "learning_rate": 0.0002, "epoch": 0.8907929835906556, "step": 5510}, {"loss": 0.8106, "grad_norm": 0.6213834285736084, "learning_rate": 0.0002, "epoch": 0.8924096677714008, "step": 5520}, {"loss": 0.7787, "grad_norm": 0.5215408205986023, "learning_rate": 0.0002, "epoch": 0.8940263519521462, "step": 5530}, {"loss": 0.8174, "grad_norm": 0.580478310585022, "learning_rate": 0.0002, "epoch": 0.8956430361328914, "step": 5540}, {"loss": 0.8371, "grad_norm": 0.49102169275283813, "learning_rate": 0.0002, "epoch": 0.8972597203136368, "step": 5550}, {"loss": 0.7806, "grad_norm": 0.6043479442596436, "learning_rate": 0.0002, "epoch": 0.898876404494382, "step": 5560}, {"loss": 0.7754, "grad_norm": 0.5636463165283203, "learning_rate": 0.0002, "epoch": 0.9004930886751273, "step": 5570}, {"loss": 0.8145, "grad_norm": 0.5620124340057373, "learning_rate": 0.0002, "epoch": 0.9021097728558726, "step": 5580}, {"loss": 0.8083, "grad_norm": 0.5206354856491089, "learning_rate": 0.0002, "epoch": 0.9037264570366179, "step": 5590}, {"loss": 0.8557, "grad_norm": 0.5798229575157166, "learning_rate": 0.0002, "epoch": 0.9053431412173631, "step": 5600}, {"loss": 0.8097, "grad_norm": 0.6428212523460388, "learning_rate": 0.0002, "epoch": 0.9069598253981085, "step": 5610}, {"loss": 0.7839, "grad_norm": 0.48064687848091125, "learning_rate": 0.0002, "epoch": 0.9085765095788537, "step": 5620}, {"loss": 0.8343, "grad_norm": 0.6347860097885132, "learning_rate": 0.0002, "epoch": 0.9101931937595991, "step": 5630}, {"loss": 0.851, "grad_norm": 0.5353913307189941, "learning_rate": 0.0002, "epoch": 0.9118098779403444, "step": 5640}, {"loss": 0.7736, "grad_norm": 0.5323944091796875, "learning_rate": 0.0002, "epoch": 0.9134265621210896, "step": 5650}, {"loss": 0.8393, "grad_norm": 0.5261843204498291, "learning_rate": 0.0002, "epoch": 0.915043246301835, "step": 5660}, {"loss": 0.7355, "grad_norm": 0.5451326966285706, "learning_rate": 0.0002, "epoch": 0.9166599304825802, "step": 5670}, {"loss": 0.8012, "grad_norm": 0.5183324217796326, "learning_rate": 0.0002, "epoch": 0.9182766146633256, "step": 5680}, {"loss": 0.7659, "grad_norm": 0.47229018807411194, "learning_rate": 0.0002, "epoch": 0.9198932988440708, "step": 5690}, {"loss": 0.7757, "grad_norm": 0.49180513620376587, "learning_rate": 0.0002, "epoch": 0.9215099830248161, "step": 5700}, {"loss": 0.8735, "grad_norm": 0.5419785380363464, "learning_rate": 0.0002, "epoch": 0.9231266672055614, "step": 5710}, {"loss": 0.7378, "grad_norm": 0.5408698916435242, "learning_rate": 0.0002, "epoch": 0.9247433513863067, "step": 5720}, {"loss": 0.7701, "grad_norm": 0.5286232829093933, "learning_rate": 0.0002, "epoch": 0.9263600355670519, "step": 5730}, {"loss": 0.8242, "grad_norm": 0.7539758086204529, "learning_rate": 0.0002, "epoch": 0.9279767197477973, "step": 5740}, {"loss": 0.8118, "grad_norm": 0.5166944861412048, "learning_rate": 0.0002, "epoch": 0.9295934039285425, "step": 5750}, {"loss": 0.783, "grad_norm": 0.6601425409317017, "learning_rate": 0.0002, "epoch": 0.9312100881092878, "step": 5760}, {"loss": 0.7873, "grad_norm": 0.5029960870742798, "learning_rate": 0.0002, "epoch": 0.9328267722900332, "step": 5770}, {"loss": 0.7989, "grad_norm": 0.4926645755767822, "learning_rate": 0.0002, "epoch": 0.9344434564707784, "step": 5780}, {"loss": 0.8174, "grad_norm": 0.5739615559577942, "learning_rate": 0.0002, "epoch": 0.9360601406515238, "step": 5790}, {"loss": 0.8037, "grad_norm": 0.5058279037475586, "learning_rate": 0.0002, "epoch": 0.937676824832269, "step": 5800}, {"loss": 0.8537, "grad_norm": 0.5260962247848511, "learning_rate": 0.0002, "epoch": 0.9392935090130143, "step": 5810}, {"loss": 0.7486, "grad_norm": 0.5768588185310364, "learning_rate": 0.0002, "epoch": 0.9409101931937596, "step": 5820}, {"loss": 0.8215, "grad_norm": 0.5170126557350159, "learning_rate": 0.0002, "epoch": 0.9425268773745049, "step": 5830}, {"loss": 0.7422, "grad_norm": 0.5745864510536194, "learning_rate": 0.0002, "epoch": 0.9441435615552501, "step": 5840}, {"loss": 0.7824, "grad_norm": 0.5551357865333557, "learning_rate": 0.0002, "epoch": 0.9457602457359955, "step": 5850}, {"loss": 0.8529, "grad_norm": 0.5776078701019287, "learning_rate": 0.0002, "epoch": 0.9473769299167407, "step": 5860}, {"loss": 0.8527, "grad_norm": 0.5340062379837036, "learning_rate": 0.0002, "epoch": 0.9489936140974861, "step": 5870}, {"loss": 0.8217, "grad_norm": 0.6447290182113647, "learning_rate": 0.0002, "epoch": 0.9506102982782314, "step": 5880}, {"loss": 0.7945, "grad_norm": 0.5123815536499023, "learning_rate": 0.0002, "epoch": 0.9522269824589766, "step": 5890}, {"loss": 0.8209, "grad_norm": 0.48547613620758057, "learning_rate": 0.0002, "epoch": 0.953843666639722, "step": 5900}, {"loss": 0.7896, "grad_norm": 0.5791414976119995, "learning_rate": 0.0002, "epoch": 0.9554603508204672, "step": 5910}, {"loss": 0.8408, "grad_norm": 0.6195011734962463, "learning_rate": 0.0002, "epoch": 0.9570770350012126, "step": 5920}, {"loss": 0.7805, "grad_norm": 0.6323803067207336, "learning_rate": 0.0002, "epoch": 0.9586937191819578, "step": 5930}, {"loss": 0.8484, "grad_norm": 0.45552879571914673, "learning_rate": 0.0002, "epoch": 0.9603104033627031, "step": 5940}, {"loss": 0.7367, "grad_norm": 0.5796473622322083, "learning_rate": 0.0002, "epoch": 0.9619270875434484, "step": 5950}, {"loss": 0.7672, "grad_norm": 0.647261381149292, "learning_rate": 0.0002, "epoch": 0.9635437717241937, "step": 5960}, {"loss": 0.8086, "grad_norm": 0.5487682819366455, "learning_rate": 0.0002, "epoch": 0.9651604559049389, "step": 5970}, {"loss": 0.7973, "grad_norm": 0.5743663907051086, "learning_rate": 0.0002, "epoch": 0.9667771400856843, "step": 5980}, {"loss": 0.8153, "grad_norm": 0.5470591187477112, "learning_rate": 0.0002, "epoch": 0.9683938242664295, "step": 5990}, {"loss": 0.8119, "grad_norm": 0.5901660323143005, "learning_rate": 0.0002, "epoch": 0.9700105084471748, "step": 6000}, {"loss": 0.8147, "grad_norm": 0.6544759273529053, "learning_rate": 0.0002, "epoch": 0.9716271926279202, "step": 6010}, {"loss": 0.7536, "grad_norm": 0.6288470029830933, "learning_rate": 0.0002, "epoch": 0.9732438768086654, "step": 6020}, {"loss": 0.7989, "grad_norm": 0.673153817653656, "learning_rate": 0.0002, "epoch": 0.9748605609894108, "step": 6030}, {"loss": 0.7556, "grad_norm": 0.42854753136634827, "learning_rate": 0.0002, "epoch": 0.976477245170156, "step": 6040}, {"loss": 0.8006, "grad_norm": 0.5227066278457642, "learning_rate": 0.0002, "epoch": 0.9780939293509013, "step": 6050}, {"loss": 0.795, "grad_norm": 0.5372416973114014, "learning_rate": 0.0002, "epoch": 0.9797106135316466, "step": 6060}, {"loss": 0.7591, "grad_norm": 0.6026402115821838, "learning_rate": 0.0002, "epoch": 0.9813272977123919, "step": 6070}, {"loss": 0.8347, "grad_norm": 0.49547791481018066, "learning_rate": 0.0002, "epoch": 0.9829439818931371, "step": 6080}, {"loss": 0.7722, "grad_norm": 0.4641951322555542, "learning_rate": 0.0002, "epoch": 0.9845606660738825, "step": 6090}, {"loss": 0.8125, "grad_norm": 0.5818535089492798, "learning_rate": 0.0002, "epoch": 0.9861773502546277, "step": 6100}, {"loss": 0.81, "grad_norm": 0.63955157995224, "learning_rate": 0.0002, "epoch": 0.9877940344353731, "step": 6110}, {"loss": 0.7547, "grad_norm": 0.5649438500404358, "learning_rate": 0.0002, "epoch": 0.9894107186161183, "step": 6120}, {"loss": 0.7861, "grad_norm": 0.5290433168411255, "learning_rate": 0.0002, "epoch": 0.9910274027968636, "step": 6130}, {"loss": 0.8109, "grad_norm": 0.6399374008178711, "learning_rate": 0.0002, "epoch": 0.992644086977609, "step": 6140}, {"loss": 0.8373, "grad_norm": 0.6736576557159424, "learning_rate": 0.0002, "epoch": 0.9942607711583542, "step": 6150}, {"loss": 0.7915, "grad_norm": 0.515420138835907, "learning_rate": 0.0002, "epoch": 0.9958774553390995, "step": 6160}, {"loss": 0.8032, "grad_norm": 0.562677800655365, "learning_rate": 0.0002, "epoch": 0.9974941395198448, "step": 6170}, {"loss": 0.8187, "grad_norm": 0.7113858461380005, "learning_rate": 0.0002, "epoch": 0.9991108237005901, "step": 6180}, {"eval_loss": 1.0871200561523438, "eval_runtime": 122.2071, "eval_samples_per_second": 5.998, "eval_steps_per_second": 0.753, "epoch": 0.9999191657909627, "step": 6185}, {"loss": 0.7507, "grad_norm": 0.7111801505088806, "learning_rate": 0.0002, "epoch": 1.0007275078813354, "step": 6190}, {"loss": 0.6865, "grad_norm": 0.5402125716209412, "learning_rate": 0.0002, "epoch": 1.0023441920620806, "step": 6200}, {"loss": 0.7625, "grad_norm": 0.6098830103874207, "learning_rate": 0.0002, "epoch": 1.003960876242826, "step": 6210}, {"loss": 0.7631, "grad_norm": 0.5829983353614807, "learning_rate": 0.0002, "epoch": 1.0055775604235713, "step": 6220}, {"loss": 0.7188, "grad_norm": 0.5614621043205261, "learning_rate": 0.0002, "epoch": 1.0071942446043165, "step": 6230}, {"loss": 0.7505, "grad_norm": 0.5954238772392273, "learning_rate": 0.0002, "epoch": 1.0088109287850617, "step": 6240}, {"loss": 0.7448, "grad_norm": 0.6480574607849121, "learning_rate": 0.0002, "epoch": 1.0104276129658072, "step": 6250}, {"loss": 0.7514, "grad_norm": 0.6051128506660461, "learning_rate": 0.0002, "epoch": 1.0120442971465524, "step": 6260}, {"loss": 0.7237, "grad_norm": 0.6318870782852173, "learning_rate": 0.0002, "epoch": 1.0136609813272976, "step": 6270}, {"loss": 0.7178, "grad_norm": 0.5048980116844177, "learning_rate": 0.0002, "epoch": 1.015277665508043, "step": 6280}, {"loss": 0.7391, "grad_norm": 0.6346936225891113, "learning_rate": 0.0002, "epoch": 1.0168943496887883, "step": 6290}, {"loss": 0.7486, "grad_norm": 0.5711665749549866, "learning_rate": 0.0002, "epoch": 1.0185110338695336, "step": 6300}, {"loss": 0.6808, "grad_norm": 0.5175361037254333, "learning_rate": 0.0002, "epoch": 1.0201277180502788, "step": 6310}, {"loss": 0.7539, "grad_norm": 0.5360831618309021, "learning_rate": 0.0002, "epoch": 1.0217444022310243, "step": 6320}, {"loss": 0.7112, "grad_norm": 0.614675760269165, "learning_rate": 0.0002, "epoch": 1.0233610864117695, "step": 6330}, {"loss": 0.7748, "grad_norm": 0.5626118183135986, "learning_rate": 0.0002, "epoch": 1.0249777705925147, "step": 6340}, {"loss": 0.7375, "grad_norm": 0.574897289276123, "learning_rate": 0.0002, "epoch": 1.02659445477326, "step": 6350}, {"loss": 0.759, "grad_norm": 0.7185447812080383, "learning_rate": 0.0002, "epoch": 1.0282111389540054, "step": 6360}, {"loss": 0.703, "grad_norm": 0.6705799698829651, "learning_rate": 0.0002, "epoch": 1.0298278231347506, "step": 6370}, {"loss": 0.7139, "grad_norm": 0.6740428805351257, "learning_rate": 0.0002, "epoch": 1.0314445073154959, "step": 6380}, {"loss": 0.7252, "grad_norm": 0.663902759552002, "learning_rate": 0.0002, "epoch": 1.0330611914962413, "step": 6390}, {"loss": 0.7065, "grad_norm": 0.5029543042182922, "learning_rate": 0.0002, "epoch": 1.0346778756769865, "step": 6400}, {"loss": 0.711, "grad_norm": 0.7813863158226013, "learning_rate": 0.0002, "epoch": 1.0362945598577318, "step": 6410}, {"loss": 0.7433, "grad_norm": 0.5396282076835632, "learning_rate": 0.0002, "epoch": 1.037911244038477, "step": 6420}, {"loss": 0.7222, "grad_norm": 0.5253293514251709, "learning_rate": 0.0002, "epoch": 1.0395279282192225, "step": 6430}, {"loss": 0.715, "grad_norm": 0.7236770987510681, "learning_rate": 0.0002, "epoch": 1.0411446123999677, "step": 6440}, {"loss": 0.7259, "grad_norm": 0.5670917630195618, "learning_rate": 0.0002, "epoch": 1.042761296580713, "step": 6450}, {"loss": 0.7195, "grad_norm": 0.6031978726387024, "learning_rate": 0.0002, "epoch": 1.0443779807614582, "step": 6460}, {"loss": 0.7648, "grad_norm": 0.5309213399887085, "learning_rate": 0.0002, "epoch": 1.0459946649422036, "step": 6470}, {"loss": 0.7161, "grad_norm": 0.7114651799201965, "learning_rate": 0.0002, "epoch": 1.0476113491229488, "step": 6480}, {"loss": 0.7583, "grad_norm": 0.5591610670089722, "learning_rate": 0.0002, "epoch": 1.049228033303694, "step": 6490}, {"loss": 0.6645, "grad_norm": 0.5185961127281189, "learning_rate": 0.0002, "epoch": 1.0508447174844395, "step": 6500}, {"loss": 0.7654, "grad_norm": 0.6510552167892456, "learning_rate": 0.0002, "epoch": 1.0524614016651848, "step": 6510}, {"loss": 0.7057, "grad_norm": 0.6557928919792175, "learning_rate": 0.0002, "epoch": 1.05407808584593, "step": 6520}, {"loss": 0.8056, "grad_norm": 0.6973192691802979, "learning_rate": 0.0002, "epoch": 1.0556947700266752, "step": 6530}, {"loss": 0.6793, "grad_norm": 0.6226583123207092, "learning_rate": 0.0002, "epoch": 1.0573114542074207, "step": 6540}, {"loss": 0.7151, "grad_norm": 0.5633195638656616, "learning_rate": 0.0002, "epoch": 1.058928138388166, "step": 6550}, {"loss": 0.7082, "grad_norm": 0.7466658353805542, "learning_rate": 0.0002, "epoch": 1.0605448225689111, "step": 6560}, {"loss": 0.7059, "grad_norm": 0.6462772488594055, "learning_rate": 0.0002, "epoch": 1.0621615067496564, "step": 6570}, {"loss": 0.7046, "grad_norm": 0.5266856551170349, "learning_rate": 0.0002, "epoch": 1.0637781909304018, "step": 6580}, {"loss": 0.7157, "grad_norm": 0.534392774105072, "learning_rate": 0.0002, "epoch": 1.065394875111147, "step": 6590}, {"loss": 0.7115, "grad_norm": 0.7514177560806274, "learning_rate": 0.0002, "epoch": 1.0670115592918923, "step": 6600}, {"loss": 0.7545, "grad_norm": 0.7593035697937012, "learning_rate": 0.0002, "epoch": 1.0686282434726375, "step": 6610}, {"loss": 0.6836, "grad_norm": 0.5277858972549438, "learning_rate": 0.0002, "epoch": 1.070244927653383, "step": 6620}, {"loss": 0.7405, "grad_norm": 0.5573670268058777, "learning_rate": 0.0002, "epoch": 1.0718616118341282, "step": 6630}, {"loss": 0.6774, "grad_norm": 0.6802396774291992, "learning_rate": 0.0002, "epoch": 1.0734782960148734, "step": 6640}, {"loss": 0.723, "grad_norm": 0.7367215752601624, "learning_rate": 0.0002, "epoch": 1.0750949801956189, "step": 6650}, {"loss": 0.7429, "grad_norm": 0.5961891412734985, "learning_rate": 0.0002, "epoch": 1.0767116643763641, "step": 6660}, {"loss": 0.6791, "grad_norm": 0.5736313462257385, "learning_rate": 0.0002, "epoch": 1.0783283485571094, "step": 6670}, {"loss": 0.7178, "grad_norm": 0.619219183921814, "learning_rate": 0.0002, "epoch": 1.0799450327378546, "step": 6680}, {"loss": 0.7318, "grad_norm": 0.6214390993118286, "learning_rate": 0.0002, "epoch": 1.0815617169186, "step": 6690}, {"loss": 0.7554, "grad_norm": 0.564536988735199, "learning_rate": 0.0002, "epoch": 1.0831784010993453, "step": 6700}, {"loss": 0.7362, "grad_norm": 0.5838140249252319, "learning_rate": 0.0002, "epoch": 1.0847950852800905, "step": 6710}, {"loss": 0.739, "grad_norm": 0.7000553607940674, "learning_rate": 0.0002, "epoch": 1.0864117694608357, "step": 6720}, {"loss": 0.7369, "grad_norm": 0.7078263759613037, "learning_rate": 0.0002, "epoch": 1.0880284536415812, "step": 6730}, {"loss": 0.7654, "grad_norm": 0.8353848457336426, "learning_rate": 0.0002, "epoch": 1.0896451378223264, "step": 6740}, {"loss": 0.7015, "grad_norm": 0.5615518689155579, "learning_rate": 0.0002, "epoch": 1.0912618220030716, "step": 6750}, {"loss": 0.7396, "grad_norm": 0.5475581288337708, "learning_rate": 0.0002, "epoch": 1.0928785061838169, "step": 6760}, {"loss": 0.7652, "grad_norm": 0.5835978388786316, "learning_rate": 0.0002, "epoch": 1.0944951903645623, "step": 6770}, {"loss": 0.7541, "grad_norm": 0.5516105890274048, "learning_rate": 0.0002, "epoch": 1.0961118745453076, "step": 6780}, {"loss": 0.6842, "grad_norm": 0.5875251889228821, "learning_rate": 0.0002, "epoch": 1.0977285587260528, "step": 6790}, {"loss": 0.6903, "grad_norm": 0.7376947999000549, "learning_rate": 0.0002, "epoch": 1.0993452429067982, "step": 6800}, {"loss": 0.7512, "grad_norm": 0.5656165480613708, "learning_rate": 0.0002, "epoch": 1.1009619270875435, "step": 6810}, {"loss": 0.7409, "grad_norm": 0.6365954279899597, "learning_rate": 0.0002, "epoch": 1.1025786112682887, "step": 6820}, {"loss": 0.7392, "grad_norm": 0.5033080577850342, "learning_rate": 0.0002, "epoch": 1.104195295449034, "step": 6830}, {"loss": 0.6909, "grad_norm": 0.617396891117096, "learning_rate": 0.0002, "epoch": 1.1058119796297794, "step": 6840}, {"loss": 0.7006, "grad_norm": 0.6395374536514282, "learning_rate": 0.0002, "epoch": 1.1074286638105246, "step": 6850}, {"loss": 0.7335, "grad_norm": 0.6775295734405518, "learning_rate": 0.0002, "epoch": 1.1090453479912699, "step": 6860}, {"loss": 0.764, "grad_norm": 0.6655223965644836, "learning_rate": 0.0002, "epoch": 1.1106620321720153, "step": 6870}, {"loss": 0.7553, "grad_norm": 0.676655113697052, "learning_rate": 0.0002, "epoch": 1.1122787163527605, "step": 6880}, {"loss": 0.7342, "grad_norm": 0.6062718629837036, "learning_rate": 0.0002, "epoch": 1.1138954005335058, "step": 6890}, {"loss": 0.7446, "grad_norm": 0.590943455696106, "learning_rate": 0.0002, "epoch": 1.115512084714251, "step": 6900}, {"loss": 0.6705, "grad_norm": 0.6315317153930664, "learning_rate": 0.0002, "epoch": 1.1171287688949965, "step": 6910}, {"loss": 0.6912, "grad_norm": 0.47979024052619934, "learning_rate": 0.0002, "epoch": 1.1187454530757417, "step": 6920}, {"loss": 0.7002, "grad_norm": 0.647298276424408, "learning_rate": 0.0002, "epoch": 1.120362137256487, "step": 6930}, {"loss": 0.7502, "grad_norm": 0.7336484789848328, "learning_rate": 0.0002, "epoch": 1.1219788214372322, "step": 6940}, {"loss": 0.693, "grad_norm": 0.5071424245834351, "learning_rate": 0.0002, "epoch": 1.1235955056179776, "step": 6950}, {"loss": 0.7378, "grad_norm": 0.6527144312858582, "learning_rate": 0.0002, "epoch": 1.1252121897987228, "step": 6960}, {"loss": 0.7228, "grad_norm": 0.6935935020446777, "learning_rate": 0.0002, "epoch": 1.126828873979468, "step": 6970}, {"loss": 0.699, "grad_norm": 0.8026931881904602, "learning_rate": 0.0002, "epoch": 1.1284455581602133, "step": 6980}, {"loss": 0.7361, "grad_norm": 0.5210393667221069, "learning_rate": 0.0002, "epoch": 1.1300622423409588, "step": 6990}, {"loss": 0.7456, "grad_norm": 0.60475093126297, "learning_rate": 0.0002, "epoch": 1.131678926521704, "step": 7000}, {"loss": 0.7495, "grad_norm": 0.6417073607444763, "learning_rate": 0.0002, "epoch": 1.1332956107024492, "step": 7010}, {"loss": 0.7459, "grad_norm": 0.6732175946235657, "learning_rate": 0.0002, "epoch": 1.1349122948831947, "step": 7020}, {"loss": 0.7278, "grad_norm": 0.6719491481781006, "learning_rate": 0.0002, "epoch": 1.13652897906394, "step": 7030}, {"loss": 0.7694, "grad_norm": 0.5708295106887817, "learning_rate": 0.0002, "epoch": 1.1381456632446851, "step": 7040}, {"loss": 0.7823, "grad_norm": 0.7141719460487366, "learning_rate": 0.0002, "epoch": 1.1397623474254304, "step": 7050}, {"loss": 0.764, "grad_norm": 0.6187017560005188, "learning_rate": 0.0002, "epoch": 1.1413790316061758, "step": 7060}, {"loss": 0.7657, "grad_norm": 0.50581294298172, "learning_rate": 0.0002, "epoch": 1.142995715786921, "step": 7070}, {"loss": 0.7357, "grad_norm": 0.5620143413543701, "learning_rate": 0.0002, "epoch": 1.1446123999676663, "step": 7080}, {"loss": 0.7287, "grad_norm": 0.6231929659843445, "learning_rate": 0.0002, "epoch": 1.1462290841484115, "step": 7090}, {"loss": 0.7328, "grad_norm": 0.5775774121284485, "learning_rate": 0.0002, "epoch": 1.147845768329157, "step": 7100}, {"loss": 0.7728, "grad_norm": 0.6492809653282166, "learning_rate": 0.0002, "epoch": 1.1494624525099022, "step": 7110}, {"loss": 0.7545, "grad_norm": 0.6434972286224365, "learning_rate": 0.0002, "epoch": 1.1510791366906474, "step": 7120}, {"loss": 0.7374, "grad_norm": 0.6191812753677368, "learning_rate": 0.0002, "epoch": 1.1526958208713927, "step": 7130}, {"loss": 0.7276, "grad_norm": 0.6690331697463989, "learning_rate": 0.0002, "epoch": 1.1543125050521381, "step": 7140}, {"loss": 0.7704, "grad_norm": 0.5977938175201416, "learning_rate": 0.0002, "epoch": 1.1559291892328833, "step": 7150}, {"loss": 0.7251, "grad_norm": 0.6195854544639587, "learning_rate": 0.0002, "epoch": 1.1575458734136286, "step": 7160}, {"loss": 0.7249, "grad_norm": 0.5752048492431641, "learning_rate": 0.0002, "epoch": 1.159162557594374, "step": 7170}, {"loss": 0.7593, "grad_norm": 0.589081883430481, "learning_rate": 0.0002, "epoch": 1.1607792417751193, "step": 7180}, {"loss": 0.704, "grad_norm": 0.756996750831604, "learning_rate": 0.0002, "epoch": 1.1623959259558645, "step": 7190}, {"loss": 0.7404, "grad_norm": 0.7614967226982117, "learning_rate": 0.0002, "epoch": 1.1640126101366097, "step": 7200}, {"loss": 0.7867, "grad_norm": 0.6120437979698181, "learning_rate": 0.0002, "epoch": 1.1656292943173552, "step": 7210}, {"loss": 0.7384, "grad_norm": 0.6210004687309265, "learning_rate": 0.0002, "epoch": 1.1672459784981004, "step": 7220}, {"loss": 0.7251, "grad_norm": 0.6044116020202637, "learning_rate": 0.0002, "epoch": 1.1688626626788456, "step": 7230}, {"loss": 0.7361, "grad_norm": 0.5418457388877869, "learning_rate": 0.0002, "epoch": 1.170479346859591, "step": 7240}, {"loss": 0.6938, "grad_norm": 0.6413537263870239, "learning_rate": 0.0002, "epoch": 1.1720960310403363, "step": 7250}, {"loss": 0.6978, "grad_norm": 0.5777867436408997, "learning_rate": 0.0002, "epoch": 1.1737127152210816, "step": 7260}, {"loss": 0.7503, "grad_norm": 0.7092402577400208, "learning_rate": 0.0002, "epoch": 1.1753293994018268, "step": 7270}, {"loss": 0.7487, "grad_norm": 0.6351709365844727, "learning_rate": 0.0002, "epoch": 1.176946083582572, "step": 7280}, {"loss": 0.7527, "grad_norm": 0.6172189712524414, "learning_rate": 0.0002, "epoch": 1.1785627677633175, "step": 7290}, {"loss": 0.7319, "grad_norm": 0.6801714897155762, "learning_rate": 0.0002, "epoch": 1.1801794519440627, "step": 7300}, {"loss": 0.6941, "grad_norm": 0.6044712066650391, "learning_rate": 0.0002, "epoch": 1.181796136124808, "step": 7310}, {"loss": 0.6951, "grad_norm": 0.7413212060928345, "learning_rate": 0.0002, "epoch": 1.1834128203055534, "step": 7320}, {"loss": 0.7396, "grad_norm": 0.5303856134414673, "learning_rate": 0.0002, "epoch": 1.1850295044862986, "step": 7330}, {"loss": 0.6915, "grad_norm": 0.5647098422050476, "learning_rate": 0.0002, "epoch": 1.1866461886670439, "step": 7340}, {"loss": 0.7506, "grad_norm": 0.7374135255813599, "learning_rate": 0.0002, "epoch": 1.188262872847789, "step": 7350}, {"loss": 0.7041, "grad_norm": 0.5710089206695557, "learning_rate": 0.0002, "epoch": 1.1898795570285345, "step": 7360}, {"loss": 0.8289, "grad_norm": 0.6073619723320007, "learning_rate": 0.0002, "epoch": 1.1914962412092798, "step": 7370}, {"loss": 0.7722, "grad_norm": 0.5899916887283325, "learning_rate": 0.0002, "epoch": 1.193112925390025, "step": 7380}, {"loss": 0.756, "grad_norm": 0.7762434482574463, "learning_rate": 0.0002, "epoch": 1.1947296095707705, "step": 7390}, {"loss": 0.7319, "grad_norm": 0.679949939250946, "learning_rate": 0.0002, "epoch": 1.1963462937515157, "step": 7400}, {"loss": 0.7599, "grad_norm": 0.6106849312782288, "learning_rate": 0.0002, "epoch": 1.197962977932261, "step": 7410}, {"loss": 0.7648, "grad_norm": 0.682461678981781, "learning_rate": 0.0002, "epoch": 1.1995796621130062, "step": 7420}, {"loss": 0.7741, "grad_norm": 0.6087017059326172, "learning_rate": 0.0002, "epoch": 1.2011963462937516, "step": 7430}, {"loss": 0.7642, "grad_norm": 0.63739013671875, "learning_rate": 0.0002, "epoch": 1.2028130304744968, "step": 7440}, {"loss": 0.7611, "grad_norm": 0.6154777407646179, "learning_rate": 0.0002, "epoch": 1.204429714655242, "step": 7450}, {"loss": 0.7565, "grad_norm": 0.7491534948348999, "learning_rate": 0.0002, "epoch": 1.2060463988359873, "step": 7460}, {"loss": 0.698, "grad_norm": 0.6664797067642212, "learning_rate": 0.0002, "epoch": 1.2076630830167328, "step": 7470}, {"loss": 0.7456, "grad_norm": 0.6660266518592834, "learning_rate": 0.0002, "epoch": 1.209279767197478, "step": 7480}, {"loss": 0.714, "grad_norm": 0.6972551345825195, "learning_rate": 0.0002, "epoch": 1.2108964513782232, "step": 7490}, {"loss": 0.7023, "grad_norm": 0.6157945990562439, "learning_rate": 0.0002, "epoch": 1.2125131355589684, "step": 7500}, {"loss": 0.7326, "grad_norm": 0.5199310183525085, "learning_rate": 0.0002, "epoch": 1.214129819739714, "step": 7510}, {"loss": 0.7586, "grad_norm": 0.577610433101654, "learning_rate": 0.0002, "epoch": 1.2157465039204591, "step": 7520}, {"loss": 0.7179, "grad_norm": 0.53652423620224, "learning_rate": 0.0002, "epoch": 1.2173631881012044, "step": 7530}, {"loss": 0.7393, "grad_norm": 0.6479050517082214, "learning_rate": 0.0002, "epoch": 1.2189798722819498, "step": 7540}, {"loss": 0.7534, "grad_norm": 0.618748128414154, "learning_rate": 0.0002, "epoch": 1.220596556462695, "step": 7550}, {"loss": 0.6886, "grad_norm": 0.6311424374580383, "learning_rate": 0.0002, "epoch": 1.2222132406434403, "step": 7560}, {"loss": 0.7272, "grad_norm": 0.6595825552940369, "learning_rate": 0.0002, "epoch": 1.2238299248241855, "step": 7570}, {"loss": 0.7353, "grad_norm": 0.5198960900306702, "learning_rate": 0.0002, "epoch": 1.225446609004931, "step": 7580}, {"loss": 0.674, "grad_norm": 0.578650712966919, "learning_rate": 0.0002, "epoch": 1.2270632931856762, "step": 7590}, {"loss": 0.7507, "grad_norm": 0.6080220937728882, "learning_rate": 0.0002, "epoch": 1.2286799773664214, "step": 7600}, {"loss": 0.7733, "grad_norm": 0.7050248384475708, "learning_rate": 0.0002, "epoch": 1.2302966615471669, "step": 7610}, {"loss": 0.7032, "grad_norm": 0.6652196049690247, "learning_rate": 0.0002, "epoch": 1.2319133457279121, "step": 7620}, {"loss": 0.7085, "grad_norm": 0.7322776317596436, "learning_rate": 0.0002, "epoch": 1.2335300299086573, "step": 7630}, {"loss": 0.7402, "grad_norm": 0.4998728036880493, "learning_rate": 0.0002, "epoch": 1.2351467140894026, "step": 7640}, {"loss": 0.7214, "grad_norm": 0.6428788900375366, "learning_rate": 0.0002, "epoch": 1.2367633982701478, "step": 7650}, {"loss": 0.7699, "grad_norm": 0.585242509841919, "learning_rate": 0.0002, "epoch": 1.2383800824508933, "step": 7660}, {"loss": 0.7621, "grad_norm": 0.5211917757987976, "learning_rate": 0.0002, "epoch": 1.2399967666316385, "step": 7670}, {"loss": 0.746, "grad_norm": 0.6490384340286255, "learning_rate": 0.0002, "epoch": 1.2416134508123837, "step": 7680}, {"loss": 0.7186, "grad_norm": 0.6249763369560242, "learning_rate": 0.0002, "epoch": 1.2432301349931292, "step": 7690}, {"loss": 0.7761, "grad_norm": 0.71870356798172, "learning_rate": 0.0002, "epoch": 1.2448468191738744, "step": 7700}, {"loss": 0.7525, "grad_norm": 0.6761967539787292, "learning_rate": 0.0002, "epoch": 1.2464635033546196, "step": 7710}, {"loss": 0.7501, "grad_norm": 0.6500617265701294, "learning_rate": 0.0002, "epoch": 1.2480801875353649, "step": 7720}, {"loss": 0.7903, "grad_norm": 0.8069869875907898, "learning_rate": 0.0002, "epoch": 1.2496968717161103, "step": 7730}, {"loss": 0.6747, "grad_norm": 0.6044608950614929, "learning_rate": 0.0002, "epoch": 1.2513135558968556, "step": 7740}, {"loss": 0.6825, "grad_norm": 0.6573283076286316, "learning_rate": 0.0002, "epoch": 1.2529302400776008, "step": 7750}, {"loss": 0.7617, "grad_norm": 0.625430166721344, "learning_rate": 0.0002, "epoch": 1.2545469242583462, "step": 7760}, {"loss": 0.7041, "grad_norm": 0.5442022681236267, "learning_rate": 0.0002, "epoch": 1.2561636084390915, "step": 7770}, {"loss": 0.7172, "grad_norm": 0.6818386912345886, "learning_rate": 0.0002, "epoch": 1.2577802926198367, "step": 7780}, {"loss": 0.696, "grad_norm": 0.6381874084472656, "learning_rate": 0.0002, "epoch": 1.259396976800582, "step": 7790}, {"loss": 0.6834, "grad_norm": 0.6269212961196899, "learning_rate": 0.0002, "epoch": 1.2610136609813272, "step": 7800}, {"loss": 0.7821, "grad_norm": 0.600121259689331, "learning_rate": 0.0002, "epoch": 1.2626303451620726, "step": 7810}, {"loss": 0.7761, "grad_norm": 0.6337703466415405, "learning_rate": 0.0002, "epoch": 1.2642470293428179, "step": 7820}, {"loss": 0.732, "grad_norm": 0.7234963774681091, "learning_rate": 0.0002, "epoch": 1.2658637135235633, "step": 7830}, {"loss": 0.785, "grad_norm": 0.800184965133667, "learning_rate": 0.0002, "epoch": 1.2674803977043085, "step": 7840}, {"loss": 0.7426, "grad_norm": 0.7539464831352234, "learning_rate": 0.0002, "epoch": 1.2690970818850538, "step": 7850}, {"loss": 0.7496, "grad_norm": 0.5493760704994202, "learning_rate": 0.0002, "epoch": 1.270713766065799, "step": 7860}, {"loss": 0.7537, "grad_norm": 0.7477145791053772, "learning_rate": 0.0002, "epoch": 1.2723304502465442, "step": 7870}, {"loss": 0.7573, "grad_norm": 0.6366362571716309, "learning_rate": 0.0002, "epoch": 1.2739471344272897, "step": 7880}, {"loss": 0.7608, "grad_norm": 0.7419533729553223, "learning_rate": 0.0002, "epoch": 1.275563818608035, "step": 7890}, {"loss": 0.7873, "grad_norm": 0.6141223311424255, "learning_rate": 0.0002, "epoch": 1.2771805027887801, "step": 7900}, {"loss": 0.6916, "grad_norm": 0.7522598505020142, "learning_rate": 0.0002, "epoch": 1.2787971869695256, "step": 7910}, {"loss": 0.7097, "grad_norm": 0.6935804486274719, "learning_rate": 0.0002, "epoch": 1.2804138711502708, "step": 7920}, {"loss": 0.7185, "grad_norm": 0.7239290475845337, "learning_rate": 0.0002, "epoch": 1.282030555331016, "step": 7930}, {"loss": 0.7145, "grad_norm": 0.8800187110900879, "learning_rate": 0.0002, "epoch": 1.2836472395117613, "step": 7940}, {"loss": 0.6991, "grad_norm": 0.540458083152771, "learning_rate": 0.0002, "epoch": 1.2852639236925067, "step": 7950}, {"loss": 0.7139, "grad_norm": 0.6492934226989746, "learning_rate": 0.0002, "epoch": 1.286880607873252, "step": 7960}, {"loss": 0.7742, "grad_norm": 0.6543959379196167, "learning_rate": 0.0002, "epoch": 1.2884972920539972, "step": 7970}, {"loss": 0.7316, "grad_norm": 0.5804705619812012, "learning_rate": 0.0002, "epoch": 1.2901139762347427, "step": 7980}, {"loss": 0.796, "grad_norm": 0.7074727416038513, "learning_rate": 0.0002, "epoch": 1.291730660415488, "step": 7990}, {"loss": 0.7034, "grad_norm": 0.5347974300384521, "learning_rate": 0.0002, "epoch": 1.2933473445962331, "step": 8000}, {"loss": 0.738, "grad_norm": 0.6457298398017883, "learning_rate": 0.0002, "epoch": 1.2949640287769784, "step": 8010}, {"loss": 0.7634, "grad_norm": 0.6407219171524048, "learning_rate": 0.0002, "epoch": 1.2965807129577236, "step": 8020}, {"loss": 0.7506, "grad_norm": 0.828439474105835, "learning_rate": 0.0002, "epoch": 1.298197397138469, "step": 8030}, {"loss": 0.735, "grad_norm": 0.4840380549430847, "learning_rate": 0.0002, "epoch": 1.2998140813192143, "step": 8040}, {"loss": 0.7283, "grad_norm": 0.5921024680137634, "learning_rate": 0.0002, "epoch": 1.3014307654999595, "step": 8050}, {"loss": 0.7477, "grad_norm": 0.6170315146446228, "learning_rate": 0.0002, "epoch": 1.303047449680705, "step": 8060}, {"loss": 0.7534, "grad_norm": 0.5374847054481506, "learning_rate": 0.0002, "epoch": 1.3046641338614502, "step": 8070}, {"loss": 0.7593, "grad_norm": 0.545758068561554, "learning_rate": 0.0002, "epoch": 1.3062808180421954, "step": 8080}, {"loss": 0.7463, "grad_norm": 0.55641770362854, "learning_rate": 0.0002, "epoch": 1.3078975022229407, "step": 8090}, {"loss": 0.7594, "grad_norm": 0.6724897027015686, "learning_rate": 0.0002, "epoch": 1.309514186403686, "step": 8100}, {"loss": 0.7105, "grad_norm": 0.6923972368240356, "learning_rate": 0.0002, "epoch": 1.3111308705844313, "step": 8110}, {"loss": 0.7149, "grad_norm": 0.5136841535568237, "learning_rate": 0.0002, "epoch": 1.3127475547651766, "step": 8120}, {"loss": 0.7504, "grad_norm": 0.6766283512115479, "learning_rate": 0.0002, "epoch": 1.314364238945922, "step": 8130}, {"loss": 0.7489, "grad_norm": 0.6283926367759705, "learning_rate": 0.0002, "epoch": 1.3159809231266673, "step": 8140}, {"loss": 0.7459, "grad_norm": 0.644216001033783, "learning_rate": 0.0002, "epoch": 1.3175976073074125, "step": 8150}, {"loss": 0.7125, "grad_norm": 0.7827503085136414, "learning_rate": 0.0002, "epoch": 1.3192142914881577, "step": 8160}, {"loss": 0.7271, "grad_norm": 0.6651390790939331, "learning_rate": 0.0002, "epoch": 1.320830975668903, "step": 8170}, {"loss": 0.7778, "grad_norm": 0.5547412633895874, "learning_rate": 0.0002, "epoch": 1.3224476598496484, "step": 8180}, {"loss": 0.7402, "grad_norm": 0.6765179634094238, "learning_rate": 0.0002, "epoch": 1.3240643440303936, "step": 8190}, {"loss": 0.7106, "grad_norm": 0.6822077035903931, "learning_rate": 0.0002, "epoch": 1.325681028211139, "step": 8200}, {"loss": 0.7288, "grad_norm": 0.5941002368927002, "learning_rate": 0.0002, "epoch": 1.3272977123918843, "step": 8210}, {"loss": 0.7494, "grad_norm": 0.4850037097930908, "learning_rate": 0.0002, "epoch": 1.3289143965726296, "step": 8220}, {"loss": 0.7474, "grad_norm": 0.6162990927696228, "learning_rate": 0.0002, "epoch": 1.3305310807533748, "step": 8230}, {"loss": 0.7751, "grad_norm": 0.6665613651275635, "learning_rate": 0.0002, "epoch": 1.33214776493412, "step": 8240}, {"loss": 0.759, "grad_norm": 0.618192732334137, "learning_rate": 0.0002, "epoch": 1.3337644491148655, "step": 8250}, {"loss": 0.7532, "grad_norm": 0.710418701171875, "learning_rate": 0.0002, "epoch": 1.3353811332956107, "step": 8260}, {"loss": 0.7306, "grad_norm": 0.5109876990318298, "learning_rate": 0.0002, "epoch": 1.336997817476356, "step": 8270}, {"loss": 0.7303, "grad_norm": 0.6791711449623108, "learning_rate": 0.0002, "epoch": 1.3386145016571014, "step": 8280}, {"loss": 0.7594, "grad_norm": 0.6836432814598083, "learning_rate": 0.0002, "epoch": 1.3402311858378466, "step": 8290}, {"loss": 0.7594, "grad_norm": 0.5579386353492737, "learning_rate": 0.0002, "epoch": 1.3418478700185918, "step": 8300}, {"loss": 0.7377, "grad_norm": 0.6713546514511108, "learning_rate": 0.0002, "epoch": 1.343464554199337, "step": 8310}, {"loss": 0.7756, "grad_norm": 0.5353720188140869, "learning_rate": 0.0002, "epoch": 1.3450812383800825, "step": 8320}, {"loss": 0.718, "grad_norm": 0.5813682675361633, "learning_rate": 0.0002, "epoch": 1.3466979225608278, "step": 8330}, {"loss": 0.7294, "grad_norm": 0.8158791661262512, "learning_rate": 0.0002, "epoch": 1.348314606741573, "step": 8340}, {"loss": 0.6992, "grad_norm": 0.6193785071372986, "learning_rate": 0.0002, "epoch": 1.3499312909223184, "step": 8350}, {"loss": 0.7654, "grad_norm": 0.6353939771652222, "learning_rate": 0.0002, "epoch": 1.3515479751030637, "step": 8360}, {"loss": 0.7519, "grad_norm": 0.6925048232078552, "learning_rate": 0.0002, "epoch": 1.353164659283809, "step": 8370}, {"loss": 0.736, "grad_norm": 0.988264799118042, "learning_rate": 0.0002, "epoch": 1.3547813434645541, "step": 8380}, {"loss": 0.7744, "grad_norm": 0.6476002931594849, "learning_rate": 0.0002, "epoch": 1.3563980276452994, "step": 8390}, {"loss": 0.776, "grad_norm": 0.7120398879051208, "learning_rate": 0.0002, "epoch": 1.3580147118260448, "step": 8400}, {"loss": 0.7368, "grad_norm": 0.9048416614532471, "learning_rate": 0.0002, "epoch": 1.35963139600679, "step": 8410}, {"loss": 0.7544, "grad_norm": 0.7000672817230225, "learning_rate": 0.0002, "epoch": 1.3612480801875353, "step": 8420}, {"loss": 0.7358, "grad_norm": 0.6015632152557373, "learning_rate": 0.0002, "epoch": 1.3628647643682807, "step": 8430}, {"loss": 0.7298, "grad_norm": 0.612516462802887, "learning_rate": 0.0002, "epoch": 1.364481448549026, "step": 8440}, {"loss": 0.7055, "grad_norm": 0.5969301462173462, "learning_rate": 0.0002, "epoch": 1.3660981327297712, "step": 8450}, {"loss": 0.7754, "grad_norm": 0.6730654239654541, "learning_rate": 0.0002, "epoch": 1.3677148169105164, "step": 8460}, {"loss": 0.7465, "grad_norm": 0.6386392116546631, "learning_rate": 0.0002, "epoch": 1.369331501091262, "step": 8470}, {"loss": 0.7433, "grad_norm": 0.739544153213501, "learning_rate": 0.0002, "epoch": 1.3709481852720071, "step": 8480}, {"loss": 0.7892, "grad_norm": 0.6462782621383667, "learning_rate": 0.0002, "epoch": 1.3725648694527524, "step": 8490}, {"loss": 0.7302, "grad_norm": 0.7346843481063843, "learning_rate": 0.0002, "epoch": 1.3741815536334978, "step": 8500}, {"loss": 0.7634, "grad_norm": 0.6884821057319641, "learning_rate": 0.0002, "epoch": 1.375798237814243, "step": 8510}, {"loss": 0.7614, "grad_norm": 0.6999333500862122, "learning_rate": 0.0002, "epoch": 1.3774149219949883, "step": 8520}, {"loss": 0.729, "grad_norm": 0.5378713011741638, "learning_rate": 0.0002, "epoch": 1.3790316061757335, "step": 8530}, {"loss": 0.6797, "grad_norm": 0.5417906641960144, "learning_rate": 0.0002, "epoch": 1.3806482903564787, "step": 8540}, {"loss": 0.7499, "grad_norm": 0.6602526307106018, "learning_rate": 0.0002, "epoch": 1.3822649745372242, "step": 8550}, {"loss": 0.7356, "grad_norm": 0.7073674201965332, "learning_rate": 0.0002, "epoch": 1.3838816587179694, "step": 8560}, {"loss": 0.75, "grad_norm": 0.5841707587242126, "learning_rate": 0.0002, "epoch": 1.3854983428987149, "step": 8570}, {"loss": 0.732, "grad_norm": 0.7031095027923584, "learning_rate": 0.0002, "epoch": 1.38711502707946, "step": 8580}, {"loss": 0.7464, "grad_norm": 0.5198570489883423, "learning_rate": 0.0002, "epoch": 1.3887317112602053, "step": 8590}, {"loss": 0.7354, "grad_norm": 0.7261320352554321, "learning_rate": 0.0002, "epoch": 1.3903483954409506, "step": 8600}, {"loss": 0.7339, "grad_norm": 0.5616350173950195, "learning_rate": 0.0002, "epoch": 1.3919650796216958, "step": 8610}, {"loss": 0.7382, "grad_norm": 0.5185914635658264, "learning_rate": 0.0002, "epoch": 1.3935817638024413, "step": 8620}, {"loss": 0.7456, "grad_norm": 0.5814694762229919, "learning_rate": 0.0002, "epoch": 1.3951984479831865, "step": 8630}, {"loss": 0.7413, "grad_norm": 0.6977371573448181, "learning_rate": 0.0002, "epoch": 1.3968151321639317, "step": 8640}, {"loss": 0.7574, "grad_norm": 0.6855689883232117, "learning_rate": 0.0002, "epoch": 1.3984318163446772, "step": 8650}, {"loss": 0.7802, "grad_norm": 0.5414357781410217, "learning_rate": 0.0002, "epoch": 1.4000485005254224, "step": 8660}, {"loss": 0.7487, "grad_norm": 0.6970012784004211, "learning_rate": 0.0002, "epoch": 1.4016651847061676, "step": 8670}, {"loss": 0.7421, "grad_norm": 0.526079535484314, "learning_rate": 0.0002, "epoch": 1.4032818688869129, "step": 8680}, {"loss": 0.737, "grad_norm": 0.758712887763977, "learning_rate": 0.0002, "epoch": 1.404898553067658, "step": 8690}, {"loss": 0.7612, "grad_norm": 0.7118762731552124, "learning_rate": 0.0002, "epoch": 1.4065152372484035, "step": 8700}, {"loss": 0.7628, "grad_norm": 0.5696909427642822, "learning_rate": 0.0002, "epoch": 1.4081319214291488, "step": 8710}, {"loss": 0.7156, "grad_norm": 0.7995436787605286, "learning_rate": 0.0002, "epoch": 1.4097486056098942, "step": 8720}, {"loss": 0.7521, "grad_norm": 0.7237521409988403, "learning_rate": 0.0002, "epoch": 1.4113652897906395, "step": 8730}, {"loss": 0.7661, "grad_norm": 0.744628369808197, "learning_rate": 0.0002, "epoch": 1.4129819739713847, "step": 8740}, {"loss": 0.7073, "grad_norm": 0.6082926988601685, "learning_rate": 0.0002, "epoch": 1.41459865815213, "step": 8750}, {"loss": 0.7282, "grad_norm": 0.5185243487358093, "learning_rate": 0.0002, "epoch": 1.4162153423328752, "step": 8760}, {"loss": 0.7592, "grad_norm": 0.5183082222938538, "learning_rate": 0.0002, "epoch": 1.4178320265136206, "step": 8770}, {"loss": 0.7509, "grad_norm": 0.7326041460037231, "learning_rate": 0.0002, "epoch": 1.4194487106943658, "step": 8780}, {"loss": 0.7398, "grad_norm": 0.7174660563468933, "learning_rate": 0.0002, "epoch": 1.421065394875111, "step": 8790}, {"loss": 0.7507, "grad_norm": 0.8080165982246399, "learning_rate": 0.0002, "epoch": 1.4226820790558565, "step": 8800}, {"loss": 0.72, "grad_norm": 0.5061507821083069, "learning_rate": 0.0002, "epoch": 1.4242987632366018, "step": 8810}, {"loss": 0.7563, "grad_norm": 0.801602840423584, "learning_rate": 0.0002, "epoch": 1.425915447417347, "step": 8820}, {"loss": 0.7287, "grad_norm": 0.6150273084640503, "learning_rate": 0.0002, "epoch": 1.4275321315980922, "step": 8830}, {"loss": 0.7452, "grad_norm": 0.8786525726318359, "learning_rate": 0.0002, "epoch": 1.4291488157788377, "step": 8840}, {"loss": 0.7257, "grad_norm": 0.6371538639068604, "learning_rate": 0.0002, "epoch": 1.430765499959583, "step": 8850}, {"loss": 0.711, "grad_norm": 0.6409295797348022, "learning_rate": 0.0002, "epoch": 1.4323821841403281, "step": 8860}, {"loss": 0.7891, "grad_norm": 0.6452359557151794, "learning_rate": 0.0002, "epoch": 1.4339988683210736, "step": 8870}, {"loss": 0.7588, "grad_norm": 0.5842334628105164, "learning_rate": 0.0002, "epoch": 1.4356155525018188, "step": 8880}, {"loss": 0.7446, "grad_norm": 0.696761965751648, "learning_rate": 0.0002, "epoch": 1.437232236682564, "step": 8890}, {"loss": 0.7541, "grad_norm": 0.6384600400924683, "learning_rate": 0.0002, "epoch": 1.4388489208633093, "step": 8900}, {"loss": 0.7049, "grad_norm": 0.5981136560440063, "learning_rate": 0.0002, "epoch": 1.4404656050440545, "step": 8910}, {"loss": 0.795, "grad_norm": 0.6355637907981873, "learning_rate": 0.0002, "epoch": 1.4420822892248, "step": 8920}, {"loss": 0.7653, "grad_norm": 0.6374830603599548, "learning_rate": 0.0002, "epoch": 1.4436989734055452, "step": 8930}, {"loss": 0.8108, "grad_norm": 0.559013307094574, "learning_rate": 0.0002, "epoch": 1.4453156575862904, "step": 8940}, {"loss": 0.7045, "grad_norm": 0.7289170026779175, "learning_rate": 0.0002, "epoch": 1.446932341767036, "step": 8950}, {"loss": 0.7484, "grad_norm": 0.8649206757545471, "learning_rate": 0.0002, "epoch": 1.4485490259477811, "step": 8960}, {"loss": 0.7745, "grad_norm": 0.7664689421653748, "learning_rate": 0.0002, "epoch": 1.4501657101285264, "step": 8970}, {"loss": 0.7431, "grad_norm": 0.7109952569007874, "learning_rate": 0.0002, "epoch": 1.4517823943092716, "step": 8980}, {"loss": 0.7997, "grad_norm": 0.6312844753265381, "learning_rate": 0.0002, "epoch": 1.453399078490017, "step": 8990}, {"loss": 0.7467, "grad_norm": 0.6616617441177368, "learning_rate": 0.0002, "epoch": 1.4550157626707623, "step": 9000}, {"loss": 0.7518, "grad_norm": 0.7384068965911865, "learning_rate": 0.0002, "epoch": 1.4566324468515075, "step": 9010}, {"loss": 0.7483, "grad_norm": 0.6549670100212097, "learning_rate": 0.0002, "epoch": 1.458249131032253, "step": 9020}, {"loss": 0.7423, "grad_norm": 0.6254119277000427, "learning_rate": 0.0002, "epoch": 1.4598658152129982, "step": 9030}, {"loss": 0.7645, "grad_norm": 0.6806328892707825, "learning_rate": 0.0002, "epoch": 1.4614824993937434, "step": 9040}, {"loss": 0.7221, "grad_norm": 0.6803115010261536, "learning_rate": 0.0002, "epoch": 1.4630991835744886, "step": 9050}, {"loss": 0.7264, "grad_norm": 0.48529282212257385, "learning_rate": 0.0002, "epoch": 1.4647158677552339, "step": 9060}, {"loss": 0.7542, "grad_norm": 0.5995030999183655, "learning_rate": 0.0002, "epoch": 1.4663325519359793, "step": 9070}, {"loss": 0.7894, "grad_norm": 0.6005427837371826, "learning_rate": 0.0002, "epoch": 1.4679492361167246, "step": 9080}, {"loss": 0.7288, "grad_norm": 0.718564510345459, "learning_rate": 0.0002, "epoch": 1.46956592029747, "step": 9090}, {"loss": 0.7089, "grad_norm": 0.7003577351570129, "learning_rate": 0.0002, "epoch": 1.4711826044782153, "step": 9100}, {"loss": 0.8069, "grad_norm": 0.5888323783874512, "learning_rate": 0.0002, "epoch": 1.4727992886589605, "step": 9110}, {"loss": 0.7275, "grad_norm": 0.6417609453201294, "learning_rate": 0.0002, "epoch": 1.4744159728397057, "step": 9120}, {"loss": 0.7441, "grad_norm": 0.572294294834137, "learning_rate": 0.0002, "epoch": 1.476032657020451, "step": 9130}, {"loss": 0.8053, "grad_norm": 0.8200714588165283, "learning_rate": 0.0002, "epoch": 1.4776493412011964, "step": 9140}, {"loss": 0.7382, "grad_norm": 0.6343288421630859, "learning_rate": 0.0002, "epoch": 1.4792660253819416, "step": 9150}, {"loss": 0.7641, "grad_norm": 0.7017961144447327, "learning_rate": 0.0002, "epoch": 1.4808827095626869, "step": 9160}, {"loss": 0.7619, "grad_norm": 0.6202912926673889, "learning_rate": 0.0002, "epoch": 1.4824993937434323, "step": 9170}, {"loss": 0.7428, "grad_norm": 0.6677869558334351, "learning_rate": 0.0002, "epoch": 1.4841160779241775, "step": 9180}, {"loss": 0.7648, "grad_norm": 0.6052267551422119, "learning_rate": 0.0002, "epoch": 1.4857327621049228, "step": 9190}, {"loss": 0.7152, "grad_norm": 0.6638872027397156, "learning_rate": 0.0002, "epoch": 1.487349446285668, "step": 9200}, {"loss": 0.7448, "grad_norm": 0.6245523691177368, "learning_rate": 0.0002, "epoch": 1.4889661304664135, "step": 9210}, {"loss": 0.6958, "grad_norm": 0.5761767625808716, "learning_rate": 0.0002, "epoch": 1.4905828146471587, "step": 9220}, {"loss": 0.8012, "grad_norm": 0.8175981640815735, "learning_rate": 0.0002, "epoch": 1.492199498827904, "step": 9230}, {"loss": 0.683, "grad_norm": 0.9144009947776794, "learning_rate": 0.0002, "epoch": 1.4938161830086494, "step": 9240}, {"loss": 0.7623, "grad_norm": 0.5742552876472473, "learning_rate": 0.0002, "epoch": 1.4954328671893946, "step": 9250}, {"loss": 0.7418, "grad_norm": 0.534534215927124, "learning_rate": 0.0002, "epoch": 1.4970495513701398, "step": 9260}, {"loss": 0.7194, "grad_norm": 0.7836225032806396, "learning_rate": 0.0002, "epoch": 1.498666235550885, "step": 9270}, {"loss": 0.7453, "grad_norm": 0.5292993187904358, "learning_rate": 0.0002, "epoch": 1.5002829197316303, "step": 9280}, {"loss": 0.7168, "grad_norm": 0.8044071793556213, "learning_rate": 0.0002, "epoch": 1.5018996039123758, "step": 9290}, {"loss": 0.7229, "grad_norm": 0.6185805201530457, "learning_rate": 0.0002, "epoch": 1.503516288093121, "step": 9300}, {"loss": 0.684, "grad_norm": 0.6093607544898987, "learning_rate": 0.0002, "epoch": 1.5051329722738664, "step": 9310}, {"loss": 0.7973, "grad_norm": 0.5891730189323425, "learning_rate": 0.0002, "epoch": 1.5067496564546117, "step": 9320}, {"loss": 0.7474, "grad_norm": 0.6331129670143127, "learning_rate": 0.0002, "epoch": 1.508366340635357, "step": 9330}, {"loss": 0.7074, "grad_norm": 0.7690958380699158, "learning_rate": 0.0002, "epoch": 1.5099830248161021, "step": 9340}, {"loss": 0.672, "grad_norm": 0.6548877358436584, "learning_rate": 0.0002, "epoch": 1.5115997089968474, "step": 9350}, {"loss": 0.7408, "grad_norm": 0.6545143127441406, "learning_rate": 0.0002, "epoch": 1.5132163931775926, "step": 9360}, {"loss": 0.7432, "grad_norm": 0.553247332572937, "learning_rate": 0.0002, "epoch": 1.514833077358338, "step": 9370}, {"loss": 0.7265, "grad_norm": 0.8145074844360352, "learning_rate": 0.0002, "epoch": 1.5164497615390833, "step": 9380}, {"loss": 0.7379, "grad_norm": 0.7636994123458862, "learning_rate": 0.0002, "epoch": 1.5180664457198287, "step": 9390}, {"loss": 0.7413, "grad_norm": 0.6838982701301575, "learning_rate": 0.0002, "epoch": 1.519683129900574, "step": 9400}, {"loss": 0.7367, "grad_norm": 0.8599441647529602, "learning_rate": 0.0002, "epoch": 1.5212998140813192, "step": 9410}, {"loss": 0.7663, "grad_norm": 0.7020329833030701, "learning_rate": 0.0002, "epoch": 1.5229164982620644, "step": 9420}, {"loss": 0.7928, "grad_norm": 0.6964772343635559, "learning_rate": 0.0002, "epoch": 1.5245331824428097, "step": 9430}, {"loss": 0.7168, "grad_norm": 0.6916600465774536, "learning_rate": 0.0002, "epoch": 1.5261498666235551, "step": 9440}, {"loss": 0.7519, "grad_norm": 0.7282621264457703, "learning_rate": 0.0002, "epoch": 1.5277665508043003, "step": 9450}, {"loss": 0.7628, "grad_norm": 0.5363983511924744, "learning_rate": 0.0002, "epoch": 1.5293832349850458, "step": 9460}, {"loss": 0.7154, "grad_norm": 0.6184861063957214, "learning_rate": 0.0002, "epoch": 1.530999919165791, "step": 9470}, {"loss": 0.7837, "grad_norm": 0.5991285443305969, "learning_rate": 0.0002, "epoch": 1.5326166033465363, "step": 9480}, {"loss": 0.7827, "grad_norm": 0.8176587820053101, "learning_rate": 0.0002, "epoch": 1.5342332875272815, "step": 9490}, {"loss": 0.7415, "grad_norm": 0.6473721861839294, "learning_rate": 0.0002, "epoch": 1.5358499717080267, "step": 9500}, {"loss": 0.7632, "grad_norm": 0.7319952845573425, "learning_rate": 0.0002, "epoch": 1.5374666558887722, "step": 9510}, {"loss": 0.7706, "grad_norm": 0.702900230884552, "learning_rate": 0.0002, "epoch": 1.5390833400695174, "step": 9520}, {"loss": 0.7754, "grad_norm": 0.7971600294113159, "learning_rate": 0.0002, "epoch": 1.5407000242502629, "step": 9530}, {"loss": 0.7352, "grad_norm": 0.6527525186538696, "learning_rate": 0.0002, "epoch": 1.542316708431008, "step": 9540}, {"loss": 0.7425, "grad_norm": 0.5791676044464111, "learning_rate": 0.0002, "epoch": 1.5439333926117533, "step": 9550}, {"loss": 0.7585, "grad_norm": 0.5619390606880188, "learning_rate": 0.0002, "epoch": 1.5455500767924986, "step": 9560}, {"loss": 0.7894, "grad_norm": 0.5701689124107361, "learning_rate": 0.0002, "epoch": 1.5471667609732438, "step": 9570}, {"loss": 0.793, "grad_norm": 0.47549352049827576, "learning_rate": 0.0002, "epoch": 1.548783445153989, "step": 9580}, {"loss": 0.7276, "grad_norm": 0.8730611205101013, "learning_rate": 0.0002, "epoch": 1.5504001293347345, "step": 9590}, {"loss": 0.798, "grad_norm": 0.6842091083526611, "learning_rate": 0.0002, "epoch": 1.5520168135154797, "step": 9600}, {"loss": 0.7528, "grad_norm": 0.6675129532814026, "learning_rate": 0.0002, "epoch": 1.5536334976962252, "step": 9610}, {"loss": 0.7954, "grad_norm": 0.8173956274986267, "learning_rate": 0.0002, "epoch": 1.5552501818769704, "step": 9620}, {"loss": 0.7535, "grad_norm": 0.724947452545166, "learning_rate": 0.0002, "epoch": 1.5568668660577156, "step": 9630}, {"loss": 0.7738, "grad_norm": 0.6154758930206299, "learning_rate": 0.0002, "epoch": 1.5584835502384609, "step": 9640}, {"loss": 0.7568, "grad_norm": 0.6072008013725281, "learning_rate": 0.0002, "epoch": 1.560100234419206, "step": 9650}, {"loss": 0.7219, "grad_norm": 0.659010648727417, "learning_rate": 0.0002, "epoch": 1.5617169185999515, "step": 9660}, {"loss": 0.673, "grad_norm": 0.65857994556427, "learning_rate": 0.0002, "epoch": 1.5633336027806968, "step": 9670}, {"loss": 0.7156, "grad_norm": 0.5914267301559448, "learning_rate": 0.0002, "epoch": 1.5649502869614422, "step": 9680}, {"loss": 0.7414, "grad_norm": 0.6248020529747009, "learning_rate": 0.0002, "epoch": 1.5665669711421875, "step": 9690}, {"loss": 0.694, "grad_norm": 0.7147795557975769, "learning_rate": 0.0002, "epoch": 1.5681836553229327, "step": 9700}, {"loss": 0.7335, "grad_norm": 0.7076232433319092, "learning_rate": 0.0002, "epoch": 1.569800339503678, "step": 9710}, {"loss": 0.7413, "grad_norm": 0.6217400431632996, "learning_rate": 0.0002, "epoch": 1.5714170236844232, "step": 9720}, {"loss": 0.7296, "grad_norm": 0.6709911227226257, "learning_rate": 0.0002, "epoch": 1.5730337078651684, "step": 9730}, {"loss": 0.7306, "grad_norm": 0.749171257019043, "learning_rate": 0.0002, "epoch": 1.5746503920459138, "step": 9740}, {"loss": 0.7242, "grad_norm": 0.6241145730018616, "learning_rate": 0.0002, "epoch": 1.576267076226659, "step": 9750}, {"loss": 0.7384, "grad_norm": 0.4960934817790985, "learning_rate": 0.0002, "epoch": 1.5778837604074045, "step": 9760}, {"loss": 0.725, "grad_norm": 0.6593309640884399, "learning_rate": 0.0002, "epoch": 1.5795004445881498, "step": 9770}, {"loss": 0.7531, "grad_norm": 0.5814042091369629, "learning_rate": 0.0002, "epoch": 1.581117128768895, "step": 9780}, {"loss": 0.7109, "grad_norm": 0.5936070680618286, "learning_rate": 0.0002, "epoch": 1.5827338129496402, "step": 9790}, {"loss": 0.7769, "grad_norm": 0.6454403400421143, "learning_rate": 0.0002, "epoch": 1.5843504971303854, "step": 9800}, {"loss": 0.7677, "grad_norm": 0.7612107992172241, "learning_rate": 0.0002, "epoch": 1.585967181311131, "step": 9810}, {"loss": 0.7649, "grad_norm": 0.6494482755661011, "learning_rate": 0.0002, "epoch": 1.5875838654918761, "step": 9820}, {"loss": 0.7569, "grad_norm": 0.7825694680213928, "learning_rate": 0.0002, "epoch": 1.5892005496726216, "step": 9830}, {"loss": 0.706, "grad_norm": 0.6757757663726807, "learning_rate": 0.0002, "epoch": 1.5908172338533668, "step": 9840}, {"loss": 0.7803, "grad_norm": 0.7105609178543091, "learning_rate": 0.0002, "epoch": 1.592433918034112, "step": 9850}, {"loss": 0.7925, "grad_norm": 0.7596991062164307, "learning_rate": 0.0002, "epoch": 1.5940506022148573, "step": 9860}, {"loss": 0.7108, "grad_norm": 0.5681525468826294, "learning_rate": 0.0002, "epoch": 1.5956672863956025, "step": 9870}, {"loss": 0.7811, "grad_norm": 0.6090980768203735, "learning_rate": 0.0002, "epoch": 1.5972839705763477, "step": 9880}, {"loss": 0.7339, "grad_norm": 0.6271613240242004, "learning_rate": 0.0002, "epoch": 1.5989006547570932, "step": 9890}, {"loss": 0.7419, "grad_norm": 0.7656369805335999, "learning_rate": 0.0002, "epoch": 1.6005173389378387, "step": 9900}, {"loss": 0.7336, "grad_norm": 0.7504446506500244, "learning_rate": 0.0002, "epoch": 1.6021340231185839, "step": 9910}, {"loss": 0.7479, "grad_norm": 0.659656286239624, "learning_rate": 0.0002, "epoch": 1.6037507072993291, "step": 9920}, {"loss": 0.7483, "grad_norm": 0.6006826162338257, "learning_rate": 0.0002, "epoch": 1.6053673914800743, "step": 9930}, {"loss": 0.732, "grad_norm": 0.7872757911682129, "learning_rate": 0.0002, "epoch": 1.6069840756608196, "step": 9940}, {"loss": 0.768, "grad_norm": 0.5545852780342102, "learning_rate": 0.0002, "epoch": 1.6086007598415648, "step": 9950}, {"loss": 0.8064, "grad_norm": 0.7429468631744385, "learning_rate": 0.0002, "epoch": 1.6102174440223103, "step": 9960}, {"loss": 0.714, "grad_norm": 0.6873556971549988, "learning_rate": 0.0002, "epoch": 1.6118341282030555, "step": 9970}, {"loss": 0.7324, "grad_norm": 0.5874287486076355, "learning_rate": 0.0002, "epoch": 1.613450812383801, "step": 9980}, {"loss": 0.7141, "grad_norm": 0.6039386987686157, "learning_rate": 0.0002, "epoch": 1.6150674965645462, "step": 9990}, {"loss": 0.6674, "grad_norm": 0.6233575940132141, "learning_rate": 0.0002, "epoch": 1.6166841807452914, "step": 10000}, {"loss": 0.7602, "grad_norm": 0.7676448225975037, "learning_rate": 0.0002, "epoch": 1.6183008649260366, "step": 10010}, {"loss": 0.7784, "grad_norm": 0.6565698385238647, "learning_rate": 0.0002, "epoch": 1.6199175491067819, "step": 10020}, {"loss": 0.7104, "grad_norm": 0.6787590384483337, "learning_rate": 0.0002, "epoch": 1.6215342332875273, "step": 10030}, {"loss": 0.7464, "grad_norm": 0.6137678027153015, "learning_rate": 0.0002, "epoch": 1.6231509174682726, "step": 10040}, {"loss": 0.7646, "grad_norm": 0.5236800312995911, "learning_rate": 0.0002, "epoch": 1.624767601649018, "step": 10050}, {"loss": 0.7437, "grad_norm": 0.7626367807388306, "learning_rate": 0.0002, "epoch": 1.6263842858297632, "step": 10060}, {"loss": 0.7273, "grad_norm": 0.5657260417938232, "learning_rate": 0.0002, "epoch": 1.6280009700105085, "step": 10070}, {"loss": 0.7354, "grad_norm": 0.4913991391658783, "learning_rate": 0.0002, "epoch": 1.6296176541912537, "step": 10080}, {"loss": 0.7596, "grad_norm": 0.7715556621551514, "learning_rate": 0.0002, "epoch": 1.631234338371999, "step": 10090}, {"loss": 0.7105, "grad_norm": 0.6509000062942505, "learning_rate": 0.0002, "epoch": 1.6328510225527442, "step": 10100}, {"loss": 0.7274, "grad_norm": 0.6215850114822388, "learning_rate": 0.0002, "epoch": 1.6344677067334896, "step": 10110}, {"loss": 0.7705, "grad_norm": 0.6956844329833984, "learning_rate": 0.0002, "epoch": 1.6360843909142349, "step": 10120}, {"loss": 0.7129, "grad_norm": 0.6111597418785095, "learning_rate": 0.0002, "epoch": 1.6377010750949803, "step": 10130}, {"loss": 0.6955, "grad_norm": 0.6518288850784302, "learning_rate": 0.0002, "epoch": 1.6393177592757255, "step": 10140}, {"loss": 0.731, "grad_norm": 0.6914522051811218, "learning_rate": 0.0002, "epoch": 1.6409344434564708, "step": 10150}, {"loss": 0.7295, "grad_norm": 0.63785719871521, "learning_rate": 0.0002, "epoch": 1.642551127637216, "step": 10160}, {"loss": 0.7355, "grad_norm": 0.6379287838935852, "learning_rate": 0.0002, "epoch": 1.6441678118179612, "step": 10170}, {"loss": 0.7359, "grad_norm": 0.6793403029441833, "learning_rate": 0.0002, "epoch": 1.6457844959987067, "step": 10180}, {"loss": 0.7402, "grad_norm": 0.6099132895469666, "learning_rate": 0.0002, "epoch": 1.647401180179452, "step": 10190}, {"loss": 0.7353, "grad_norm": 0.5869854092597961, "learning_rate": 0.0002, "epoch": 1.6490178643601974, "step": 10200}, {"loss": 0.8308, "grad_norm": 0.7716999053955078, "learning_rate": 0.0002, "epoch": 1.6506345485409426, "step": 10210}, {"loss": 0.7215, "grad_norm": 0.6854110360145569, "learning_rate": 0.0002, "epoch": 1.6522512327216878, "step": 10220}, {"loss": 0.782, "grad_norm": 0.6957170367240906, "learning_rate": 0.0002, "epoch": 1.653867916902433, "step": 10230}, {"loss": 0.7282, "grad_norm": 0.6932903528213501, "learning_rate": 0.0002, "epoch": 1.6554846010831783, "step": 10240}, {"loss": 0.7478, "grad_norm": 0.7713165283203125, "learning_rate": 0.0002, "epoch": 1.6571012852639235, "step": 10250}, {"loss": 0.7099, "grad_norm": 0.7455793619155884, "learning_rate": 0.0002, "epoch": 1.658717969444669, "step": 10260}, {"loss": 0.7524, "grad_norm": 0.5464168190956116, "learning_rate": 0.0002, "epoch": 1.6603346536254144, "step": 10270}, {"loss": 0.7328, "grad_norm": 0.6782926321029663, "learning_rate": 0.0002, "epoch": 1.6619513378061597, "step": 10280}, {"loss": 0.7801, "grad_norm": 0.7962649464607239, "learning_rate": 0.0002, "epoch": 1.663568021986905, "step": 10290}, {"loss": 0.7142, "grad_norm": 0.6814526319503784, "learning_rate": 0.0002, "epoch": 1.6651847061676501, "step": 10300}, {"loss": 0.7285, "grad_norm": 0.656895101070404, "learning_rate": 0.0002, "epoch": 1.6668013903483954, "step": 10310}, {"loss": 0.7358, "grad_norm": 0.6085672378540039, "learning_rate": 0.0002, "epoch": 1.6684180745291406, "step": 10320}, {"loss": 0.7074, "grad_norm": 0.585508406162262, "learning_rate": 0.0002, "epoch": 1.670034758709886, "step": 10330}, {"loss": 0.7604, "grad_norm": 0.6930184364318848, "learning_rate": 0.0002, "epoch": 1.6716514428906313, "step": 10340}, {"loss": 0.7169, "grad_norm": 0.575663149356842, "learning_rate": 0.0002, "epoch": 1.6732681270713767, "step": 10350}, {"loss": 0.7198, "grad_norm": 0.582502543926239, "learning_rate": 0.0002, "epoch": 1.674884811252122, "step": 10360}, {"loss": 0.7793, "grad_norm": 0.5668916702270508, "learning_rate": 0.0002, "epoch": 1.6765014954328672, "step": 10370}, {"loss": 0.7478, "grad_norm": 0.6070065498352051, "learning_rate": 0.0002, "epoch": 1.6781181796136124, "step": 10380}, {"loss": 0.7939, "grad_norm": 0.6141316294670105, "learning_rate": 0.0002, "epoch": 1.6797348637943577, "step": 10390}, {"loss": 0.7573, "grad_norm": 0.8359124064445496, "learning_rate": 0.0002, "epoch": 1.6813515479751031, "step": 10400}, {"loss": 0.7488, "grad_norm": 0.5378185510635376, "learning_rate": 0.0002, "epoch": 1.6829682321558483, "step": 10410}, {"loss": 0.7588, "grad_norm": 0.6959536075592041, "learning_rate": 0.0002, "epoch": 1.6845849163365938, "step": 10420}, {"loss": 0.7872, "grad_norm": 0.6514357328414917, "learning_rate": 0.0002, "epoch": 1.686201600517339, "step": 10430}, {"loss": 0.725, "grad_norm": 0.7706646919250488, "learning_rate": 0.0002, "epoch": 1.6878182846980843, "step": 10440}, {"loss": 0.7673, "grad_norm": 0.6183337569236755, "learning_rate": 0.0002, "epoch": 1.6894349688788295, "step": 10450}, {"loss": 0.7566, "grad_norm": 0.6123278141021729, "learning_rate": 0.0002, "epoch": 1.6910516530595747, "step": 10460}, {"loss": 0.7169, "grad_norm": 0.6894851326942444, "learning_rate": 0.0002, "epoch": 1.69266833724032, "step": 10470}, {"loss": 0.7435, "grad_norm": 0.7497312426567078, "learning_rate": 0.0002, "epoch": 1.6942850214210654, "step": 10480}, {"loss": 0.7544, "grad_norm": 0.5968214273452759, "learning_rate": 0.0002, "epoch": 1.6959017056018106, "step": 10490}, {"loss": 0.6793, "grad_norm": 0.6747927069664001, "learning_rate": 0.0002, "epoch": 1.697518389782556, "step": 10500}, {"loss": 0.7415, "grad_norm": 0.5708310008049011, "learning_rate": 0.0002, "epoch": 1.6991350739633013, "step": 10510}, {"loss": 0.7385, "grad_norm": 0.606526792049408, "learning_rate": 0.0002, "epoch": 1.7007517581440466, "step": 10520}, {"loss": 0.7204, "grad_norm": 0.662011981010437, "learning_rate": 0.0002, "epoch": 1.7023684423247918, "step": 10530}, {"loss": 0.7999, "grad_norm": 0.7583045363426208, "learning_rate": 0.0002, "epoch": 1.703985126505537, "step": 10540}, {"loss": 0.7563, "grad_norm": 0.721632182598114, "learning_rate": 0.0002, "epoch": 1.7056018106862825, "step": 10550}, {"loss": 0.7407, "grad_norm": 0.6107715368270874, "learning_rate": 0.0002, "epoch": 1.7072184948670277, "step": 10560}, {"loss": 0.7519, "grad_norm": 0.6652471423149109, "learning_rate": 0.0002, "epoch": 1.7088351790477732, "step": 10570}, {"loss": 0.7767, "grad_norm": 0.6308087110519409, "learning_rate": 0.0002, "epoch": 1.7104518632285184, "step": 10580}, {"loss": 0.7659, "grad_norm": 0.5464386940002441, "learning_rate": 0.0002, "epoch": 1.7120685474092636, "step": 10590}, {"loss": 0.7063, "grad_norm": 0.6558911204338074, "learning_rate": 0.0002, "epoch": 1.7136852315900089, "step": 10600}, {"loss": 0.7126, "grad_norm": 0.5665024518966675, "learning_rate": 0.0002, "epoch": 1.715301915770754, "step": 10610}, {"loss": 0.6958, "grad_norm": 0.7888094186782837, "learning_rate": 0.0002, "epoch": 1.7169185999514993, "step": 10620}, {"loss": 0.7785, "grad_norm": 0.7084909081459045, "learning_rate": 0.0002, "epoch": 1.7185352841322448, "step": 10630}, {"loss": 0.7557, "grad_norm": 0.7982324361801147, "learning_rate": 0.0002, "epoch": 1.7201519683129902, "step": 10640}, {"loss": 0.7345, "grad_norm": 0.6418732404708862, "learning_rate": 0.0002, "epoch": 1.7217686524937355, "step": 10650}, {"loss": 0.7734, "grad_norm": 0.7636681795120239, "learning_rate": 0.0002, "epoch": 1.7233853366744807, "step": 10660}, {"loss": 0.7541, "grad_norm": 0.5646875500679016, "learning_rate": 0.0002, "epoch": 1.725002020855226, "step": 10670}, {"loss": 0.7642, "grad_norm": 0.5231260657310486, "learning_rate": 0.0002, "epoch": 1.7266187050359711, "step": 10680}, {"loss": 0.7846, "grad_norm": 0.7635011672973633, "learning_rate": 0.0002, "epoch": 1.7282353892167164, "step": 10690}, {"loss": 0.7471, "grad_norm": 0.7518259286880493, "learning_rate": 0.0002, "epoch": 1.7298520733974618, "step": 10700}, {"loss": 0.751, "grad_norm": 0.7295602560043335, "learning_rate": 0.0002, "epoch": 1.731468757578207, "step": 10710}, {"loss": 0.731, "grad_norm": 0.6984632015228271, "learning_rate": 0.0002, "epoch": 1.7330854417589525, "step": 10720}, {"loss": 0.7921, "grad_norm": 0.6198219060897827, "learning_rate": 0.0002, "epoch": 1.7347021259396977, "step": 10730}, {"loss": 0.7642, "grad_norm": 0.6957576274871826, "learning_rate": 0.0002, "epoch": 1.736318810120443, "step": 10740}, {"loss": 0.7917, "grad_norm": 0.6430263519287109, "learning_rate": 0.0002, "epoch": 1.7379354943011882, "step": 10750}, {"loss": 0.7156, "grad_norm": 0.6134995222091675, "learning_rate": 0.0002, "epoch": 1.7395521784819334, "step": 10760}, {"loss": 0.7584, "grad_norm": 0.7209452986717224, "learning_rate": 0.0002, "epoch": 1.741168862662679, "step": 10770}, {"loss": 0.7528, "grad_norm": 0.6735447645187378, "learning_rate": 0.0002, "epoch": 1.7427855468434241, "step": 10780}, {"loss": 0.756, "grad_norm": 0.5605693459510803, "learning_rate": 0.0002, "epoch": 1.7444022310241696, "step": 10790}, {"loss": 0.7759, "grad_norm": 0.6882363557815552, "learning_rate": 0.0002, "epoch": 1.7460189152049148, "step": 10800}, {"loss": 0.7544, "grad_norm": 0.6386259198188782, "learning_rate": 0.0002, "epoch": 1.74763559938566, "step": 10810}, {"loss": 0.7697, "grad_norm": 0.6529015302658081, "learning_rate": 0.0002, "epoch": 1.7492522835664053, "step": 10820}, {"loss": 0.7219, "grad_norm": 0.5664082765579224, "learning_rate": 0.0002, "epoch": 1.7508689677471505, "step": 10830}, {"loss": 0.7586, "grad_norm": 0.7532684206962585, "learning_rate": 0.0002, "epoch": 1.7524856519278957, "step": 10840}, {"loss": 0.6919, "grad_norm": 0.77171391248703, "learning_rate": 0.0002, "epoch": 1.7541023361086412, "step": 10850}, {"loss": 0.785, "grad_norm": 0.7255431413650513, "learning_rate": 0.0002, "epoch": 1.7557190202893864, "step": 10860}, {"loss": 0.7458, "grad_norm": 0.763083279132843, "learning_rate": 0.0002, "epoch": 1.7573357044701319, "step": 10870}, {"loss": 0.7846, "grad_norm": 0.6042402982711792, "learning_rate": 0.0002, "epoch": 1.758952388650877, "step": 10880}, {"loss": 0.7027, "grad_norm": 0.7642518281936646, "learning_rate": 0.0002, "epoch": 1.7605690728316223, "step": 10890}, {"loss": 0.746, "grad_norm": 0.6347904801368713, "learning_rate": 0.0002, "epoch": 1.7621857570123676, "step": 10900}, {"loss": 0.7458, "grad_norm": 0.5371627807617188, "learning_rate": 0.0002, "epoch": 1.7638024411931128, "step": 10910}, {"loss": 0.7466, "grad_norm": 0.6840225458145142, "learning_rate": 0.0002, "epoch": 1.7654191253738583, "step": 10920}, {"loss": 0.725, "grad_norm": 0.5288469195365906, "learning_rate": 0.0002, "epoch": 1.7670358095546035, "step": 10930}, {"loss": 0.7863, "grad_norm": 0.69020676612854, "learning_rate": 0.0002, "epoch": 1.768652493735349, "step": 10940}, {"loss": 0.7468, "grad_norm": 0.5943242311477661, "learning_rate": 0.0002, "epoch": 1.7702691779160942, "step": 10950}, {"loss": 0.7244, "grad_norm": 0.5616418123245239, "learning_rate": 0.0002, "epoch": 1.7718858620968394, "step": 10960}, {"loss": 0.7137, "grad_norm": 0.7209470868110657, "learning_rate": 0.0002, "epoch": 1.7735025462775846, "step": 10970}, {"loss": 0.7459, "grad_norm": 0.6657957434654236, "learning_rate": 0.0002, "epoch": 1.7751192304583299, "step": 10980}, {"loss": 0.7076, "grad_norm": 0.6469064950942993, "learning_rate": 0.0002, "epoch": 1.776735914639075, "step": 10990}, {"loss": 0.7321, "grad_norm": 0.6615678071975708, "learning_rate": 0.0002, "epoch": 1.7783525988198206, "step": 11000}, {"loss": 0.747, "grad_norm": 0.6722439527511597, "learning_rate": 0.0002, "epoch": 1.779969283000566, "step": 11010}, {"loss": 0.7302, "grad_norm": 0.634136974811554, "learning_rate": 0.0002, "epoch": 1.7815859671813112, "step": 11020}, {"loss": 0.8105, "grad_norm": 0.6024377346038818, "learning_rate": 0.0002, "epoch": 1.7832026513620565, "step": 11030}, {"loss": 0.7855, "grad_norm": 0.6909403800964355, "learning_rate": 0.0002, "epoch": 1.7848193355428017, "step": 11040}, {"loss": 0.7471, "grad_norm": 0.7148767709732056, "learning_rate": 0.0002, "epoch": 1.786436019723547, "step": 11050}, {"loss": 0.7145, "grad_norm": 0.7442979216575623, "learning_rate": 0.0002, "epoch": 1.7880527039042922, "step": 11060}, {"loss": 0.7215, "grad_norm": 0.6830431818962097, "learning_rate": 0.0002, "epoch": 1.7896693880850376, "step": 11070}, {"loss": 0.7625, "grad_norm": 0.9172667264938354, "learning_rate": 0.0002, "epoch": 1.7912860722657828, "step": 11080}, {"loss": 0.76, "grad_norm": 0.6799490451812744, "learning_rate": 0.0002, "epoch": 1.7929027564465283, "step": 11090}, {"loss": 0.7716, "grad_norm": 0.7617024779319763, "learning_rate": 0.0002, "epoch": 1.7945194406272735, "step": 11100}, {"loss": 0.7586, "grad_norm": 0.7701810002326965, "learning_rate": 0.0002, "epoch": 1.7961361248080188, "step": 11110}, {"loss": 0.7843, "grad_norm": 0.7454385757446289, "learning_rate": 0.0002, "epoch": 1.797752808988764, "step": 11120}, {"loss": 0.7873, "grad_norm": 0.6121436953544617, "learning_rate": 0.0002, "epoch": 1.7993694931695092, "step": 11130}, {"loss": 0.7305, "grad_norm": 0.6237571835517883, "learning_rate": 0.0002, "epoch": 1.8009861773502547, "step": 11140}, {"loss": 0.6827, "grad_norm": 0.6818515658378601, "learning_rate": 0.0002, "epoch": 1.802602861531, "step": 11150}, {"loss": 0.6876, "grad_norm": 0.7768308520317078, "learning_rate": 0.0002, "epoch": 1.8042195457117454, "step": 11160}, {"loss": 0.7533, "grad_norm": 0.6875537633895874, "learning_rate": 0.0002, "epoch": 1.8058362298924906, "step": 11170}, {"loss": 0.761, "grad_norm": 0.7950584888458252, "learning_rate": 0.0002, "epoch": 1.8074529140732358, "step": 11180}, {"loss": 0.7623, "grad_norm": 0.8210248351097107, "learning_rate": 0.0002, "epoch": 1.809069598253981, "step": 11190}, {"loss": 0.7556, "grad_norm": 0.6674110889434814, "learning_rate": 0.0002, "epoch": 1.8106862824347263, "step": 11200}, {"loss": 0.7663, "grad_norm": 0.6261674761772156, "learning_rate": 0.0002, "epoch": 1.8123029666154715, "step": 11210}, {"loss": 0.7122, "grad_norm": 0.6484741568565369, "learning_rate": 0.0002, "epoch": 1.813919650796217, "step": 11220}, {"loss": 0.7718, "grad_norm": 0.6231244206428528, "learning_rate": 0.0002, "epoch": 1.8155363349769622, "step": 11230}, {"loss": 0.7152, "grad_norm": 0.7243146896362305, "learning_rate": 0.0002, "epoch": 1.8171530191577077, "step": 11240}, {"loss": 0.7448, "grad_norm": 0.6776193380355835, "learning_rate": 0.0002, "epoch": 1.818769703338453, "step": 11250}, {"loss": 0.7317, "grad_norm": 0.5973618030548096, "learning_rate": 0.0002, "epoch": 1.8203863875191981, "step": 11260}, {"loss": 0.7961, "grad_norm": 0.6451361179351807, "learning_rate": 0.0002, "epoch": 1.8220030716999434, "step": 11270}, {"loss": 0.7611, "grad_norm": 0.5963068008422852, "learning_rate": 0.0002, "epoch": 1.8236197558806886, "step": 11280}, {"loss": 0.7466, "grad_norm": 0.536902129650116, "learning_rate": 0.0002, "epoch": 1.825236440061434, "step": 11290}, {"loss": 0.708, "grad_norm": 0.6993787288665771, "learning_rate": 0.0002, "epoch": 1.8268531242421793, "step": 11300}, {"loss": 0.7153, "grad_norm": 0.6135255098342896, "learning_rate": 0.0002, "epoch": 1.8284698084229247, "step": 11310}, {"loss": 0.7423, "grad_norm": 0.6057423949241638, "learning_rate": 0.0002, "epoch": 1.83008649260367, "step": 11320}, {"loss": 0.735, "grad_norm": 0.6598812341690063, "learning_rate": 0.0002, "epoch": 1.8317031767844152, "step": 11330}, {"loss": 0.7278, "grad_norm": 0.6075948476791382, "learning_rate": 0.0002, "epoch": 1.8333198609651604, "step": 11340}, {"loss": 0.7846, "grad_norm": 0.7065447568893433, "learning_rate": 0.0002, "epoch": 1.8349365451459057, "step": 11350}, {"loss": 0.7365, "grad_norm": 0.680526614189148, "learning_rate": 0.0002, "epoch": 1.8365532293266509, "step": 11360}, {"loss": 0.7152, "grad_norm": 0.6356695294380188, "learning_rate": 0.0002, "epoch": 1.8381699135073963, "step": 11370}, {"loss": 0.721, "grad_norm": 0.6399052143096924, "learning_rate": 0.0002, "epoch": 1.8397865976881416, "step": 11380}, {"loss": 0.7618, "grad_norm": 0.6125704050064087, "learning_rate": 0.0002, "epoch": 1.841403281868887, "step": 11390}, {"loss": 0.755, "grad_norm": 0.7124643325805664, "learning_rate": 0.0002, "epoch": 1.8430199660496323, "step": 11400}, {"loss": 0.7972, "grad_norm": 0.6099604964256287, "learning_rate": 0.0002, "epoch": 1.8446366502303775, "step": 11410}, {"loss": 0.7187, "grad_norm": 0.7338208556175232, "learning_rate": 0.0002, "epoch": 1.8462533344111227, "step": 11420}, {"loss": 0.7007, "grad_norm": 0.7534668445587158, "learning_rate": 0.0002, "epoch": 1.847870018591868, "step": 11430}, {"loss": 0.7464, "grad_norm": 0.6135470271110535, "learning_rate": 0.0002, "epoch": 1.8494867027726134, "step": 11440}, {"loss": 0.7955, "grad_norm": 0.6229309439659119, "learning_rate": 0.0002, "epoch": 1.8511033869533586, "step": 11450}, {"loss": 0.7594, "grad_norm": 0.706423282623291, "learning_rate": 0.0002, "epoch": 1.852720071134104, "step": 11460}, {"loss": 0.7411, "grad_norm": 0.5460049510002136, "learning_rate": 0.0002, "epoch": 1.8543367553148493, "step": 11470}, {"loss": 0.7416, "grad_norm": 0.6616711020469666, "learning_rate": 0.0002, "epoch": 1.8559534394955945, "step": 11480}, {"loss": 0.729, "grad_norm": 0.6372783184051514, "learning_rate": 0.0002, "epoch": 1.8575701236763398, "step": 11490}, {"loss": 0.7333, "grad_norm": 0.7162668108940125, "learning_rate": 0.0002, "epoch": 1.859186807857085, "step": 11500}, {"loss": 0.7747, "grad_norm": 0.6605209708213806, "learning_rate": 0.0002, "epoch": 1.8608034920378305, "step": 11510}, {"loss": 0.7258, "grad_norm": 0.6933956742286682, "learning_rate": 0.0002, "epoch": 1.8624201762185757, "step": 11520}, {"loss": 0.7243, "grad_norm": 0.6582090854644775, "learning_rate": 0.0002, "epoch": 1.8640368603993211, "step": 11530}, {"loss": 0.7313, "grad_norm": 0.6416500806808472, "learning_rate": 0.0002, "epoch": 1.8656535445800664, "step": 11540}, {"loss": 0.7372, "grad_norm": 0.5434312224388123, "learning_rate": 0.0002, "epoch": 1.8672702287608116, "step": 11550}, {"loss": 0.7635, "grad_norm": 0.6827567219734192, "learning_rate": 0.0002, "epoch": 1.8688869129415568, "step": 11560}, {"loss": 0.7137, "grad_norm": 0.7354370951652527, "learning_rate": 0.0002, "epoch": 1.870503597122302, "step": 11570}, {"loss": 0.7526, "grad_norm": 0.590372622013092, "learning_rate": 0.0002, "epoch": 1.8721202813030473, "step": 11580}, {"loss": 0.731, "grad_norm": 0.853183925151825, "learning_rate": 0.0002, "epoch": 1.8737369654837928, "step": 11590}, {"loss": 0.7487, "grad_norm": 0.822678804397583, "learning_rate": 0.0002, "epoch": 1.875353649664538, "step": 11600}, {"loss": 0.7427, "grad_norm": 0.6591550707817078, "learning_rate": 0.0002, "epoch": 1.8769703338452834, "step": 11610}, {"loss": 0.7054, "grad_norm": 0.7475301623344421, "learning_rate": 0.0002, "epoch": 1.8785870180260287, "step": 11620}, {"loss": 0.811, "grad_norm": 0.6390765309333801, "learning_rate": 0.0002, "epoch": 1.880203702206774, "step": 11630}, {"loss": 0.7531, "grad_norm": 0.6589758992195129, "learning_rate": 0.0002, "epoch": 1.8818203863875191, "step": 11640}, {"loss": 0.7475, "grad_norm": 0.6765508651733398, "learning_rate": 0.0002, "epoch": 1.8834370705682644, "step": 11650}, {"loss": 0.738, "grad_norm": 0.6527857780456543, "learning_rate": 0.0002, "epoch": 1.8850537547490098, "step": 11660}, {"loss": 0.7504, "grad_norm": 0.6642923951148987, "learning_rate": 0.0002, "epoch": 1.886670438929755, "step": 11670}, {"loss": 0.7701, "grad_norm": 0.6945584416389465, "learning_rate": 0.0002, "epoch": 1.8882871231105005, "step": 11680}, {"loss": 0.7711, "grad_norm": 0.694018542766571, "learning_rate": 0.0002, "epoch": 1.8899038072912457, "step": 11690}, {"loss": 0.7195, "grad_norm": 0.7237417101860046, "learning_rate": 0.0002, "epoch": 1.891520491471991, "step": 11700}, {"loss": 0.7491, "grad_norm": 0.7401309609413147, "learning_rate": 0.0002, "epoch": 1.8931371756527362, "step": 11710}, {"loss": 0.805, "grad_norm": 0.6537784337997437, "learning_rate": 0.0002, "epoch": 1.8947538598334814, "step": 11720}, {"loss": 0.793, "grad_norm": 0.7398539185523987, "learning_rate": 0.0002, "epoch": 1.8963705440142267, "step": 11730}, {"loss": 0.7561, "grad_norm": 0.6696075797080994, "learning_rate": 0.0002, "epoch": 1.8979872281949721, "step": 11740}, {"loss": 0.7353, "grad_norm": 0.6014142036437988, "learning_rate": 0.0002, "epoch": 1.8996039123757174, "step": 11750}, {"loss": 0.7714, "grad_norm": 0.7023524641990662, "learning_rate": 0.0002, "epoch": 1.9012205965564628, "step": 11760}, {"loss": 0.7088, "grad_norm": 0.739973783493042, "learning_rate": 0.0002, "epoch": 1.902837280737208, "step": 11770}, {"loss": 0.7848, "grad_norm": 0.5576770901679993, "learning_rate": 0.0002, "epoch": 1.9044539649179533, "step": 11780}, {"loss": 0.7483, "grad_norm": 0.6907393932342529, "learning_rate": 0.0002, "epoch": 1.9060706490986985, "step": 11790}, {"loss": 0.7827, "grad_norm": 0.6934581995010376, "learning_rate": 0.0002, "epoch": 1.9076873332794437, "step": 11800}, {"loss": 0.7199, "grad_norm": 0.591774582862854, "learning_rate": 0.0002, "epoch": 1.9093040174601892, "step": 11810}, {"loss": 0.7333, "grad_norm": 0.6249791383743286, "learning_rate": 0.0002, "epoch": 1.9109207016409344, "step": 11820}, {"loss": 0.7581, "grad_norm": 0.6755744218826294, "learning_rate": 0.0002, "epoch": 1.9125373858216799, "step": 11830}, {"loss": 0.696, "grad_norm": 0.7286285161972046, "learning_rate": 0.0002, "epoch": 1.914154070002425, "step": 11840}, {"loss": 0.7509, "grad_norm": 0.7867850065231323, "learning_rate": 0.0002, "epoch": 1.9157707541831703, "step": 11850}, {"loss": 0.735, "grad_norm": 0.6283972859382629, "learning_rate": 0.0002, "epoch": 1.9173874383639156, "step": 11860}, {"loss": 0.7296, "grad_norm": 0.605823814868927, "learning_rate": 0.0002, "epoch": 1.9190041225446608, "step": 11870}, {"loss": 0.6598, "grad_norm": 0.5927976965904236, "learning_rate": 0.0002, "epoch": 1.920620806725406, "step": 11880}, {"loss": 0.7649, "grad_norm": 0.5974002480506897, "learning_rate": 0.0002, "epoch": 1.9222374909061515, "step": 11890}, {"loss": 0.7843, "grad_norm": 0.7091866135597229, "learning_rate": 0.0002, "epoch": 1.923854175086897, "step": 11900}, {"loss": 0.775, "grad_norm": 0.72496497631073, "learning_rate": 0.0002, "epoch": 1.9254708592676422, "step": 11910}, {"loss": 0.7153, "grad_norm": 0.6131896376609802, "learning_rate": 0.0002, "epoch": 1.9270875434483874, "step": 11920}, {"loss": 0.7228, "grad_norm": 0.6556436419487, "learning_rate": 0.0002, "epoch": 1.9287042276291326, "step": 11930}, {"loss": 0.7319, "grad_norm": 0.622932493686676, "learning_rate": 0.0002, "epoch": 1.9303209118098779, "step": 11940}, {"loss": 0.7592, "grad_norm": 0.6618631482124329, "learning_rate": 0.0002, "epoch": 1.931937595990623, "step": 11950}, {"loss": 0.8332, "grad_norm": 0.630966305732727, "learning_rate": 0.0002, "epoch": 1.9335542801713685, "step": 11960}, {"loss": 0.6854, "grad_norm": 0.6336734890937805, "learning_rate": 0.0002, "epoch": 1.9351709643521138, "step": 11970}, {"loss": 0.7433, "grad_norm": 0.655403196811676, "learning_rate": 0.0002, "epoch": 1.9367876485328592, "step": 11980}, {"loss": 0.7282, "grad_norm": 0.5640574097633362, "learning_rate": 0.0002, "epoch": 1.9384043327136045, "step": 11990}, {"loss": 0.7289, "grad_norm": 0.6322951316833496, "learning_rate": 0.0002, "epoch": 1.9400210168943497, "step": 12000}, {"loss": 0.7627, "grad_norm": 0.615703821182251, "learning_rate": 0.0002, "epoch": 1.941637701075095, "step": 12010}, {"loss": 0.786, "grad_norm": 0.6487536430358887, "learning_rate": 0.0002, "epoch": 1.9432543852558402, "step": 12020}, {"loss": 0.7435, "grad_norm": 0.9209630489349365, "learning_rate": 0.0002, "epoch": 1.9448710694365856, "step": 12030}, {"loss": 0.7274, "grad_norm": 0.67485511302948, "learning_rate": 0.0002, "epoch": 1.9464877536173308, "step": 12040}, {"loss": 0.7551, "grad_norm": 0.6831230521202087, "learning_rate": 0.0002, "epoch": 1.9481044377980763, "step": 12050}, {"loss": 0.7546, "grad_norm": 0.6578302383422852, "learning_rate": 0.0002, "epoch": 1.9497211219788215, "step": 12060}, {"loss": 0.6989, "grad_norm": 0.9975938200950623, "learning_rate": 0.0002, "epoch": 1.9513378061595668, "step": 12070}, {"loss": 0.7952, "grad_norm": 0.6637365221977234, "learning_rate": 0.0002, "epoch": 1.952954490340312, "step": 12080}, {"loss": 0.7482, "grad_norm": 0.605707049369812, "learning_rate": 0.0002, "epoch": 1.9545711745210572, "step": 12090}, {"loss": 0.7768, "grad_norm": 0.6584440469741821, "learning_rate": 0.0002, "epoch": 1.9561878587018025, "step": 12100}, {"loss": 0.7187, "grad_norm": 0.6070835590362549, "learning_rate": 0.0002, "epoch": 1.957804542882548, "step": 12110}, {"loss": 0.7491, "grad_norm": 0.7862601280212402, "learning_rate": 0.0002, "epoch": 1.9594212270632931, "step": 12120}, {"loss": 0.7972, "grad_norm": 0.8175255060195923, "learning_rate": 0.0002, "epoch": 1.9610379112440386, "step": 12130}, {"loss": 0.7242, "grad_norm": 0.5648472905158997, "learning_rate": 0.0002, "epoch": 1.9626545954247838, "step": 12140}, {"loss": 0.7321, "grad_norm": 0.6591973304748535, "learning_rate": 0.0002, "epoch": 1.964271279605529, "step": 12150}, {"loss": 0.739, "grad_norm": 0.5960676074028015, "learning_rate": 0.0002, "epoch": 1.9658879637862743, "step": 12160}, {"loss": 0.7254, "grad_norm": 0.7272544503211975, "learning_rate": 0.0002, "epoch": 1.9675046479670195, "step": 12170}, {"loss": 0.7376, "grad_norm": 0.7176699042320251, "learning_rate": 0.0002, "epoch": 1.969121332147765, "step": 12180}, {"loss": 0.7525, "grad_norm": 0.6927123665809631, "learning_rate": 0.0002, "epoch": 1.9707380163285102, "step": 12190}, {"loss": 0.7318, "grad_norm": 0.5536034107208252, "learning_rate": 0.0002, "epoch": 1.9723547005092557, "step": 12200}, {"loss": 0.7737, "grad_norm": 0.8348390460014343, "learning_rate": 0.0002, "epoch": 1.9739713846900009, "step": 12210}, {"loss": 0.7494, "grad_norm": 0.6591181755065918, "learning_rate": 0.0002, "epoch": 1.9755880688707461, "step": 12220}, {"loss": 0.763, "grad_norm": 1.0624109506607056, "learning_rate": 0.0002, "epoch": 1.9772047530514913, "step": 12230}, {"loss": 0.7541, "grad_norm": 0.9265586137771606, "learning_rate": 0.0002, "epoch": 1.9788214372322366, "step": 12240}, {"loss": 0.7533, "grad_norm": 0.5998196005821228, "learning_rate": 0.0002, "epoch": 1.9804381214129818, "step": 12250}, {"loss": 0.7225, "grad_norm": 0.6960851550102234, "learning_rate": 0.0002, "epoch": 1.9820548055937273, "step": 12260}, {"loss": 0.7398, "grad_norm": 0.7674502730369568, "learning_rate": 0.0002, "epoch": 1.9836714897744727, "step": 12270}, {"loss": 0.7185, "grad_norm": 0.6407275795936584, "learning_rate": 0.0002, "epoch": 1.985288173955218, "step": 12280}, {"loss": 0.7382, "grad_norm": 0.6673079133033752, "learning_rate": 0.0002, "epoch": 1.9869048581359632, "step": 12290}, {"loss": 0.7326, "grad_norm": 0.6989844441413879, "learning_rate": 0.0002, "epoch": 1.9885215423167084, "step": 12300}, {"loss": 0.7559, "grad_norm": 0.7564442157745361, "learning_rate": 0.0002, "epoch": 1.9901382264974536, "step": 12310}, {"loss": 0.7719, "grad_norm": 0.6385478973388672, "learning_rate": 0.0002, "epoch": 1.9917549106781989, "step": 12320}, {"loss": 0.7369, "grad_norm": 0.7193717956542969, "learning_rate": 0.0002, "epoch": 1.9933715948589443, "step": 12330}, {"loss": 0.7583, "grad_norm": 0.7987112402915955, "learning_rate": 0.0002, "epoch": 1.9949882790396896, "step": 12340}, {"loss": 0.7793, "grad_norm": 0.7260826826095581, "learning_rate": 0.0002, "epoch": 1.996604963220435, "step": 12350}, {"loss": 0.7505, "grad_norm": 0.7968255281448364, "learning_rate": 0.0002, "epoch": 1.9982216474011802, "step": 12360}, {"loss": 0.717, "grad_norm": 0.6893062591552734, "learning_rate": 0.0002, "epoch": 1.9998383315819255, "step": 12370}, {"eval_loss": 1.1044032573699951, "eval_runtime": 122.1508, "eval_samples_per_second": 6.001, "eval_steps_per_second": 0.753, "epoch": 2.0, "step": 12371}, {"loss": 0.6604, "grad_norm": 0.7775409817695618, "learning_rate": 0.0002, "epoch": 2.0014550157626707, "step": 12380}, {"loss": 0.6845, "grad_norm": 0.76218581199646, "learning_rate": 0.0002, "epoch": 2.003071699943416, "step": 12390}, {"loss": 0.6909, "grad_norm": 0.5677764415740967, "learning_rate": 0.0002, "epoch": 2.004688384124161, "step": 12400}, {"loss": 0.6584, "grad_norm": 0.808442234992981, "learning_rate": 0.0002, "epoch": 2.006305068304907, "step": 12410}, {"loss": 0.659, "grad_norm": 0.7144765257835388, "learning_rate": 0.0002, "epoch": 2.007921752485652, "step": 12420}, {"loss": 0.6666, "grad_norm": 0.6914031505584717, "learning_rate": 0.0002, "epoch": 2.0095384366663973, "step": 12430}, {"loss": 0.6596, "grad_norm": 0.7581454515457153, "learning_rate": 0.0002, "epoch": 2.0111551208471425, "step": 12440}, {"loss": 0.6785, "grad_norm": 0.8388504981994629, "learning_rate": 0.0002, "epoch": 2.0127718050278878, "step": 12450}, {"loss": 0.6942, "grad_norm": 0.6716406941413879, "learning_rate": 0.0002, "epoch": 2.014388489208633, "step": 12460}, {"loss": 0.6441, "grad_norm": 0.898902416229248, "learning_rate": 0.0002, "epoch": 2.0160051733893782, "step": 12470}, {"loss": 0.6655, "grad_norm": 0.6432679891586304, "learning_rate": 0.0002, "epoch": 2.0176218575701235, "step": 12480}, {"loss": 0.6521, "grad_norm": 0.8021109104156494, "learning_rate": 0.0002, "epoch": 2.019238541750869, "step": 12490}, {"loss": 0.6581, "grad_norm": 0.7039216756820679, "learning_rate": 0.0002, "epoch": 2.0208552259316144, "step": 12500}, {"loss": 0.6521, "grad_norm": 0.646531879901886, "learning_rate": 0.0002, "epoch": 2.0224719101123596, "step": 12510}, {"loss": 0.6302, "grad_norm": 0.783704400062561, "learning_rate": 0.0002, "epoch": 2.024088594293105, "step": 12520}, {"loss": 0.6288, "grad_norm": 0.8805046677589417, "learning_rate": 0.0002, "epoch": 2.02570527847385, "step": 12530}, {"loss": 0.6288, "grad_norm": 0.7289270758628845, "learning_rate": 0.0002, "epoch": 2.0273219626545953, "step": 12540}, {"loss": 0.6663, "grad_norm": 0.71653151512146, "learning_rate": 0.0002, "epoch": 2.0289386468353405, "step": 12550}, {"loss": 0.625, "grad_norm": 0.73281329870224, "learning_rate": 0.0002, "epoch": 2.030555331016086, "step": 12560}, {"loss": 0.6448, "grad_norm": 0.6657090187072754, "learning_rate": 0.0002, "epoch": 2.0321720151968314, "step": 12570}, {"loss": 0.6983, "grad_norm": 0.8241133093833923, "learning_rate": 0.0002, "epoch": 2.0337886993775767, "step": 12580}, {"loss": 0.6488, "grad_norm": 0.5834135413169861, "learning_rate": 0.0002, "epoch": 2.035405383558322, "step": 12590}, {"loss": 0.6188, "grad_norm": 0.84502112865448, "learning_rate": 0.0002, "epoch": 2.037022067739067, "step": 12600}, {"loss": 0.6349, "grad_norm": 0.8952481746673584, "learning_rate": 0.0002, "epoch": 2.0386387519198124, "step": 12610}, {"loss": 0.6923, "grad_norm": 0.7801461815834045, "learning_rate": 0.0002, "epoch": 2.0402554361005576, "step": 12620}, {"loss": 0.6176, "grad_norm": 0.6788367033004761, "learning_rate": 0.0002, "epoch": 2.041872120281303, "step": 12630}, {"loss": 0.6162, "grad_norm": 0.7241756319999695, "learning_rate": 0.0002, "epoch": 2.0434888044620485, "step": 12640}, {"loss": 0.655, "grad_norm": 0.6933388113975525, "learning_rate": 0.0002, "epoch": 2.0451054886427937, "step": 12650}, {"loss": 0.6431, "grad_norm": 0.8029746413230896, "learning_rate": 0.0002, "epoch": 2.046722172823539, "step": 12660}, {"loss": 0.7164, "grad_norm": 0.946399986743927, "learning_rate": 0.0002, "epoch": 2.048338857004284, "step": 12670}, {"loss": 0.638, "grad_norm": 0.7072678804397583, "learning_rate": 0.0002, "epoch": 2.0499555411850294, "step": 12680}, {"loss": 0.6487, "grad_norm": 0.6810618042945862, "learning_rate": 0.0002, "epoch": 2.0515722253657747, "step": 12690}, {"loss": 0.6554, "grad_norm": 0.7661160230636597, "learning_rate": 0.0002, "epoch": 2.05318890954652, "step": 12700}, {"loss": 0.6799, "grad_norm": 0.6350653767585754, "learning_rate": 0.0002, "epoch": 2.0548055937272656, "step": 12710}, {"loss": 0.6654, "grad_norm": 0.861890971660614, "learning_rate": 0.0002, "epoch": 2.056422277908011, "step": 12720}, {"loss": 0.6286, "grad_norm": 0.6489875912666321, "learning_rate": 0.0002, "epoch": 2.058038962088756, "step": 12730}, {"loss": 0.6811, "grad_norm": 0.8268506526947021, "learning_rate": 0.0002, "epoch": 2.0596556462695013, "step": 12740}, {"loss": 0.6524, "grad_norm": 0.607679545879364, "learning_rate": 0.0002, "epoch": 2.0612723304502465, "step": 12750}, {"loss": 0.6649, "grad_norm": 0.6754153370857239, "learning_rate": 0.0002, "epoch": 2.0628890146309917, "step": 12760}, {"loss": 0.6549, "grad_norm": 0.7263124585151672, "learning_rate": 0.0002, "epoch": 2.064505698811737, "step": 12770}, {"loss": 0.6189, "grad_norm": 0.6986154317855835, "learning_rate": 0.0002, "epoch": 2.0661223829924826, "step": 12780}, {"loss": 0.6723, "grad_norm": 0.7768576741218567, "learning_rate": 0.0002, "epoch": 2.067739067173228, "step": 12790}, {"loss": 0.677, "grad_norm": 0.7546762824058533, "learning_rate": 0.0002, "epoch": 2.069355751353973, "step": 12800}, {"loss": 0.6485, "grad_norm": 0.7588880062103271, "learning_rate": 0.0002, "epoch": 2.0709724355347183, "step": 12810}, {"loss": 0.6989, "grad_norm": 0.7457242608070374, "learning_rate": 0.0002, "epoch": 2.0725891197154636, "step": 12820}, {"loss": 0.6489, "grad_norm": 0.6983516812324524, "learning_rate": 0.0002, "epoch": 2.074205803896209, "step": 12830}, {"loss": 0.651, "grad_norm": 0.7950928807258606, "learning_rate": 0.0002, "epoch": 2.075822488076954, "step": 12840}, {"loss": 0.6603, "grad_norm": 0.9248087406158447, "learning_rate": 0.0002, "epoch": 2.0774391722576993, "step": 12850}, {"loss": 0.6847, "grad_norm": 0.7229493260383606, "learning_rate": 0.0002, "epoch": 2.079055856438445, "step": 12860}, {"loss": 0.6702, "grad_norm": 0.5710847973823547, "learning_rate": 0.0002, "epoch": 2.08067254061919, "step": 12870}, {"loss": 0.6974, "grad_norm": 0.9580423831939697, "learning_rate": 0.0002, "epoch": 2.0822892247999354, "step": 12880}, {"loss": 0.6341, "grad_norm": 0.7399665713310242, "learning_rate": 0.0002, "epoch": 2.0839059089806806, "step": 12890}, {"loss": 0.6993, "grad_norm": 0.7981410622596741, "learning_rate": 0.0002, "epoch": 2.085522593161426, "step": 12900}, {"loss": 0.6976, "grad_norm": 0.870759904384613, "learning_rate": 0.0002, "epoch": 2.087139277342171, "step": 12910}, {"loss": 0.7194, "grad_norm": 0.7001481652259827, "learning_rate": 0.0002, "epoch": 2.0887559615229163, "step": 12920}, {"loss": 0.6383, "grad_norm": 0.6745418310165405, "learning_rate": 0.0002, "epoch": 2.090372645703662, "step": 12930}, {"loss": 0.6519, "grad_norm": 0.7739067673683167, "learning_rate": 0.0002, "epoch": 2.0919893298844072, "step": 12940}, {"loss": 0.6856, "grad_norm": 0.6742934584617615, "learning_rate": 0.0002, "epoch": 2.0936060140651525, "step": 12950}, {"loss": 0.6279, "grad_norm": 0.7270349860191345, "learning_rate": 0.0002, "epoch": 2.0952226982458977, "step": 12960}, {"loss": 0.6783, "grad_norm": 0.7150624394416809, "learning_rate": 0.0002, "epoch": 2.096839382426643, "step": 12970}, {"loss": 0.6093, "grad_norm": 0.7734767198562622, "learning_rate": 0.0002, "epoch": 2.098456066607388, "step": 12980}, {"loss": 0.6534, "grad_norm": 0.7618662118911743, "learning_rate": 0.0002, "epoch": 2.1000727507881334, "step": 12990}, {"loss": 0.6707, "grad_norm": 0.6557944416999817, "learning_rate": 0.0002, "epoch": 2.101689434968879, "step": 13000}, {"loss": 0.7268, "grad_norm": 0.8786448240280151, "learning_rate": 0.0002, "epoch": 2.1033061191496243, "step": 13010}, {"loss": 0.6677, "grad_norm": 0.6878724098205566, "learning_rate": 0.0002, "epoch": 2.1049228033303695, "step": 13020}, {"loss": 0.6824, "grad_norm": 0.822318971157074, "learning_rate": 0.0002, "epoch": 2.1065394875111147, "step": 13030}, {"loss": 0.6228, "grad_norm": 0.831468939781189, "learning_rate": 0.0002, "epoch": 2.10815617169186, "step": 13040}, {"loss": 0.6511, "grad_norm": 0.7699505686759949, "learning_rate": 0.0002, "epoch": 2.109772855872605, "step": 13050}, {"loss": 0.6671, "grad_norm": 0.7559016346931458, "learning_rate": 0.0002, "epoch": 2.1113895400533504, "step": 13060}, {"loss": 0.6215, "grad_norm": 0.6942209601402283, "learning_rate": 0.0002, "epoch": 2.1130062242340957, "step": 13070}, {"loss": 0.6449, "grad_norm": 0.6098947525024414, "learning_rate": 0.0002, "epoch": 2.1146229084148414, "step": 13080}, {"loss": 0.7091, "grad_norm": 0.6499016284942627, "learning_rate": 0.0002, "epoch": 2.1162395925955866, "step": 13090}, {"loss": 0.6247, "grad_norm": 0.7719953060150146, "learning_rate": 0.0002, "epoch": 2.117856276776332, "step": 13100}, {"loss": 0.6064, "grad_norm": 0.6708134412765503, "learning_rate": 0.0002, "epoch": 2.119472960957077, "step": 13110}, {"loss": 0.6056, "grad_norm": 0.8119585514068604, "learning_rate": 0.0002, "epoch": 2.1210896451378223, "step": 13120}, {"loss": 0.6628, "grad_norm": 0.6947157979011536, "learning_rate": 0.0002, "epoch": 2.1227063293185675, "step": 13130}, {"loss": 0.6375, "grad_norm": 0.8831837773323059, "learning_rate": 0.0002, "epoch": 2.1243230134993127, "step": 13140}, {"loss": 0.6997, "grad_norm": 0.7266910672187805, "learning_rate": 0.0002, "epoch": 2.1259396976800584, "step": 13150}, {"loss": 0.6446, "grad_norm": 0.8864351511001587, "learning_rate": 0.0002, "epoch": 2.1275563818608036, "step": 13160}, {"loss": 0.6762, "grad_norm": 0.8104248046875, "learning_rate": 0.0002, "epoch": 2.129173066041549, "step": 13170}, {"loss": 0.6581, "grad_norm": 0.6077079772949219, "learning_rate": 0.0002, "epoch": 2.130789750222294, "step": 13180}, {"loss": 0.6572, "grad_norm": 0.6874213814735413, "learning_rate": 0.0002, "epoch": 2.1324064344030393, "step": 13190}, {"loss": 0.642, "grad_norm": 0.7134367823600769, "learning_rate": 0.0002, "epoch": 2.1340231185837846, "step": 13200}, {"loss": 0.7016, "grad_norm": 0.6101235151290894, "learning_rate": 0.0002, "epoch": 2.13563980276453, "step": 13210}, {"loss": 0.6529, "grad_norm": 0.6042411923408508, "learning_rate": 0.0002, "epoch": 2.137256486945275, "step": 13220}, {"loss": 0.7179, "grad_norm": 0.914601743221283, "learning_rate": 0.0002, "epoch": 2.1388731711260207, "step": 13230}, {"loss": 0.6513, "grad_norm": 0.7104284167289734, "learning_rate": 0.0002, "epoch": 2.140489855306766, "step": 13240}, {"loss": 0.6607, "grad_norm": 0.664395272731781, "learning_rate": 0.0002, "epoch": 2.142106539487511, "step": 13250}, {"loss": 0.7211, "grad_norm": 0.6991241574287415, "learning_rate": 0.0002, "epoch": 2.1437232236682564, "step": 13260}, {"loss": 0.6484, "grad_norm": 0.5469560623168945, "learning_rate": 0.0002, "epoch": 2.1453399078490016, "step": 13270}, {"loss": 0.6765, "grad_norm": 0.8454998135566711, "learning_rate": 0.0002, "epoch": 2.146956592029747, "step": 13280}, {"loss": 0.6683, "grad_norm": 0.7088868618011475, "learning_rate": 0.0002, "epoch": 2.148573276210492, "step": 13290}, {"loss": 0.6835, "grad_norm": 0.7002687454223633, "learning_rate": 0.0002, "epoch": 2.1501899603912378, "step": 13300}, {"loss": 0.6399, "grad_norm": 0.7785214781761169, "learning_rate": 0.0002, "epoch": 2.151806644571983, "step": 13310}, {"loss": 0.67, "grad_norm": 0.8049132227897644, "learning_rate": 0.0002, "epoch": 2.1534233287527282, "step": 13320}, {"loss": 0.6495, "grad_norm": 0.8062595129013062, "learning_rate": 0.0002, "epoch": 2.1550400129334735, "step": 13330}, {"loss": 0.6603, "grad_norm": 0.6208319067955017, "learning_rate": 0.0002, "epoch": 2.1566566971142187, "step": 13340}, {"loss": 0.6584, "grad_norm": 0.7519655823707581, "learning_rate": 0.0002, "epoch": 2.158273381294964, "step": 13350}, {"loss": 0.6457, "grad_norm": 0.7645747065544128, "learning_rate": 0.0002, "epoch": 2.159890065475709, "step": 13360}, {"loss": 0.645, "grad_norm": 0.6847302913665771, "learning_rate": 0.0002, "epoch": 2.1615067496564544, "step": 13370}, {"loss": 0.6903, "grad_norm": 0.8630441427230835, "learning_rate": 0.0002, "epoch": 2.1631234338372, "step": 13380}, {"loss": 0.6742, "grad_norm": 0.7947702407836914, "learning_rate": 0.0002, "epoch": 2.1647401180179453, "step": 13390}, {"loss": 0.7206, "grad_norm": 0.6836977005004883, "learning_rate": 0.0002, "epoch": 2.1663568021986905, "step": 13400}, {"loss": 0.6304, "grad_norm": 0.7340566515922546, "learning_rate": 0.0002, "epoch": 2.1679734863794358, "step": 13410}, {"loss": 0.6528, "grad_norm": 0.7075738906860352, "learning_rate": 0.0002, "epoch": 2.169590170560181, "step": 13420}, {"loss": 0.6585, "grad_norm": 0.7080879807472229, "learning_rate": 0.0002, "epoch": 2.1712068547409262, "step": 13430}, {"loss": 0.6615, "grad_norm": 0.6218613386154175, "learning_rate": 0.0002, "epoch": 2.1728235389216715, "step": 13440}, {"loss": 0.6488, "grad_norm": 0.8211479187011719, "learning_rate": 0.0002, "epoch": 2.174440223102417, "step": 13450}, {"loss": 0.6738, "grad_norm": 0.864466667175293, "learning_rate": 0.0002, "epoch": 2.1760569072831624, "step": 13460}, {"loss": 0.679, "grad_norm": 0.7943857908248901, "learning_rate": 0.0002, "epoch": 2.1776735914639076, "step": 13470}, {"loss": 0.6838, "grad_norm": 0.78728187084198, "learning_rate": 0.0002, "epoch": 2.179290275644653, "step": 13480}, {"loss": 0.6397, "grad_norm": 0.697527289390564, "learning_rate": 0.0002, "epoch": 2.180906959825398, "step": 13490}, {"loss": 0.669, "grad_norm": 0.8205804228782654, "learning_rate": 0.0002, "epoch": 2.1825236440061433, "step": 13500}, {"loss": 0.7227, "grad_norm": 0.8709042072296143, "learning_rate": 0.0002, "epoch": 2.1841403281868885, "step": 13510}, {"loss": 0.6313, "grad_norm": 0.6228537559509277, "learning_rate": 0.0002, "epoch": 2.1857570123676338, "step": 13520}, {"loss": 0.7025, "grad_norm": 0.9566980004310608, "learning_rate": 0.0002, "epoch": 2.1873736965483794, "step": 13530}, {"loss": 0.6755, "grad_norm": 0.7128894329071045, "learning_rate": 0.0002, "epoch": 2.1889903807291247, "step": 13540}, {"loss": 0.6827, "grad_norm": 0.6888654232025146, "learning_rate": 0.0002, "epoch": 2.19060706490987, "step": 13550}, {"loss": 0.6961, "grad_norm": 0.6444337368011475, "learning_rate": 0.0002, "epoch": 2.192223749090615, "step": 13560}, {"loss": 0.656, "grad_norm": 0.8008806705474854, "learning_rate": 0.0002, "epoch": 2.1938404332713604, "step": 13570}, {"loss": 0.7, "grad_norm": 0.8482748866081238, "learning_rate": 0.0002, "epoch": 2.1954571174521056, "step": 13580}, {"loss": 0.7326, "grad_norm": 0.8584157228469849, "learning_rate": 0.0002, "epoch": 2.197073801632851, "step": 13590}, {"loss": 0.7014, "grad_norm": 0.7513734698295593, "learning_rate": 0.0002, "epoch": 2.1986904858135965, "step": 13600}, {"loss": 0.6632, "grad_norm": 0.7864262461662292, "learning_rate": 0.0002, "epoch": 2.2003071699943417, "step": 13610}, {"loss": 0.6879, "grad_norm": 0.8493645191192627, "learning_rate": 0.0002, "epoch": 2.201923854175087, "step": 13620}, {"loss": 0.6617, "grad_norm": 0.6902140974998474, "learning_rate": 0.0002, "epoch": 2.203540538355832, "step": 13630}, {"loss": 0.6655, "grad_norm": 0.8711254596710205, "learning_rate": 0.0002, "epoch": 2.2051572225365774, "step": 13640}, {"loss": 0.6359, "grad_norm": 0.7832191586494446, "learning_rate": 0.0002, "epoch": 2.2067739067173227, "step": 13650}, {"loss": 0.6723, "grad_norm": 0.5668176412582397, "learning_rate": 0.0002, "epoch": 2.208390590898068, "step": 13660}, {"loss": 0.635, "grad_norm": 0.8648375272750854, "learning_rate": 0.0002, "epoch": 2.2100072750788136, "step": 13670}, {"loss": 0.653, "grad_norm": 0.7643089890480042, "learning_rate": 0.0002, "epoch": 2.211623959259559, "step": 13680}, {"loss": 0.6765, "grad_norm": 0.6293777823448181, "learning_rate": 0.0002, "epoch": 2.213240643440304, "step": 13690}, {"loss": 0.6842, "grad_norm": 0.6459372639656067, "learning_rate": 0.0002, "epoch": 2.2148573276210493, "step": 13700}, {"loss": 0.6526, "grad_norm": 0.7060744166374207, "learning_rate": 0.0002, "epoch": 2.2164740118017945, "step": 13710}, {"loss": 0.7101, "grad_norm": 0.674109160900116, "learning_rate": 0.0002, "epoch": 2.2180906959825397, "step": 13720}, {"loss": 0.6529, "grad_norm": 0.830392062664032, "learning_rate": 0.0002, "epoch": 2.219707380163285, "step": 13730}, {"loss": 0.6733, "grad_norm": 0.6474477052688599, "learning_rate": 0.0002, "epoch": 2.2213240643440306, "step": 13740}, {"loss": 0.6413, "grad_norm": 0.7037909626960754, "learning_rate": 0.0002, "epoch": 2.222940748524776, "step": 13750}, {"loss": 0.6417, "grad_norm": 0.6554131507873535, "learning_rate": 0.0002, "epoch": 2.224557432705521, "step": 13760}, {"loss": 0.6907, "grad_norm": 0.7822230458259583, "learning_rate": 0.0002, "epoch": 2.2261741168862663, "step": 13770}, {"loss": 0.6505, "grad_norm": 0.9082167744636536, "learning_rate": 0.0002, "epoch": 2.2277908010670116, "step": 13780}, {"loss": 0.6878, "grad_norm": 0.7918276190757751, "learning_rate": 0.0002, "epoch": 2.229407485247757, "step": 13790}, {"loss": 0.6669, "grad_norm": 0.7354569435119629, "learning_rate": 0.0002, "epoch": 2.231024169428502, "step": 13800}, {"loss": 0.6503, "grad_norm": 0.8265249133110046, "learning_rate": 0.0002, "epoch": 2.2326408536092472, "step": 13810}, {"loss": 0.6871, "grad_norm": 0.6653847098350525, "learning_rate": 0.0002, "epoch": 2.234257537789993, "step": 13820}, {"loss": 0.6413, "grad_norm": 0.7157923579216003, "learning_rate": 0.0002, "epoch": 2.235874221970738, "step": 13830}, {"loss": 0.6306, "grad_norm": 0.7110323309898376, "learning_rate": 0.0002, "epoch": 2.2374909061514834, "step": 13840}, {"loss": 0.6913, "grad_norm": 0.7155357599258423, "learning_rate": 0.0002, "epoch": 2.2391075903322286, "step": 13850}, {"loss": 0.6579, "grad_norm": 1.0177817344665527, "learning_rate": 0.0002, "epoch": 2.240724274512974, "step": 13860}, {"loss": 0.635, "grad_norm": 0.7601948380470276, "learning_rate": 0.0002, "epoch": 2.242340958693719, "step": 13870}, {"loss": 0.6679, "grad_norm": 0.7628820538520813, "learning_rate": 0.0002, "epoch": 2.2439576428744643, "step": 13880}, {"loss": 0.6805, "grad_norm": 0.7089297771453857, "learning_rate": 0.0002, "epoch": 2.24557432705521, "step": 13890}, {"loss": 0.7236, "grad_norm": 0.695178210735321, "learning_rate": 0.0002, "epoch": 2.247191011235955, "step": 13900}, {"loss": 0.7084, "grad_norm": 0.7631948590278625, "learning_rate": 0.0002, "epoch": 2.2488076954167004, "step": 13910}, {"loss": 0.685, "grad_norm": 0.8203101754188538, "learning_rate": 0.0002, "epoch": 2.2504243795974457, "step": 13920}, {"loss": 0.653, "grad_norm": 0.8099079728126526, "learning_rate": 0.0002, "epoch": 2.252041063778191, "step": 13930}, {"loss": 0.694, "grad_norm": 0.6498546004295349, "learning_rate": 0.0002, "epoch": 2.253657747958936, "step": 13940}, {"loss": 0.6684, "grad_norm": 0.7797415256500244, "learning_rate": 0.0002, "epoch": 2.2552744321396814, "step": 13950}, {"loss": 0.683, "grad_norm": 0.8254124522209167, "learning_rate": 0.0002, "epoch": 2.2568911163204266, "step": 13960}, {"loss": 0.6806, "grad_norm": 0.6327953338623047, "learning_rate": 0.0002, "epoch": 2.2585078005011723, "step": 13970}, {"loss": 0.668, "grad_norm": 0.734194278717041, "learning_rate": 0.0002, "epoch": 2.2601244846819175, "step": 13980}, {"loss": 0.6912, "grad_norm": 0.9014202952384949, "learning_rate": 0.0002, "epoch": 2.2617411688626627, "step": 13990}, {"loss": 0.692, "grad_norm": 0.7643631100654602, "learning_rate": 0.0002, "epoch": 2.263357853043408, "step": 14000}, {"loss": 0.6657, "grad_norm": 0.8882834911346436, "learning_rate": 0.0002, "epoch": 2.264974537224153, "step": 14010}, {"loss": 0.6453, "grad_norm": 0.7975873351097107, "learning_rate": 0.0002, "epoch": 2.2665912214048984, "step": 14020}, {"loss": 0.7193, "grad_norm": 0.7765783071517944, "learning_rate": 0.0002, "epoch": 2.2682079055856437, "step": 14030}, {"loss": 0.662, "grad_norm": 0.8846288323402405, "learning_rate": 0.0002, "epoch": 2.2698245897663893, "step": 14040}, {"loss": 0.6494, "grad_norm": 0.9006744027137756, "learning_rate": 0.0002, "epoch": 2.2714412739471346, "step": 14050}, {"loss": 0.6423, "grad_norm": 0.7420173287391663, "learning_rate": 0.0002, "epoch": 2.27305795812788, "step": 14060}, {"loss": 0.7068, "grad_norm": 0.7956424951553345, "learning_rate": 0.0002, "epoch": 2.274674642308625, "step": 14070}, {"loss": 0.6581, "grad_norm": 0.7783209085464478, "learning_rate": 0.0002, "epoch": 2.2762913264893703, "step": 14080}, {"loss": 0.7202, "grad_norm": 0.7597188949584961, "learning_rate": 0.0002, "epoch": 2.2779080106701155, "step": 14090}, {"loss": 0.6778, "grad_norm": 0.6718921661376953, "learning_rate": 0.0002, "epoch": 2.2795246948508607, "step": 14100}, {"loss": 0.632, "grad_norm": 0.7528082132339478, "learning_rate": 0.0002, "epoch": 2.281141379031606, "step": 14110}, {"loss": 0.7608, "grad_norm": 0.8379864692687988, "learning_rate": 0.0002, "epoch": 2.2827580632123516, "step": 14120}, {"loss": 0.6767, "grad_norm": 0.748613715171814, "learning_rate": 0.0002, "epoch": 2.284374747393097, "step": 14130}, {"loss": 0.6641, "grad_norm": 0.7435423135757446, "learning_rate": 0.0002, "epoch": 2.285991431573842, "step": 14140}, {"loss": 0.6849, "grad_norm": 0.7580803632736206, "learning_rate": 0.0002, "epoch": 2.2876081157545873, "step": 14150}, {"loss": 0.6604, "grad_norm": 0.6278321146965027, "learning_rate": 0.0002, "epoch": 2.2892247999353326, "step": 14160}, {"loss": 0.6573, "grad_norm": 0.7663896083831787, "learning_rate": 0.0002, "epoch": 2.290841484116078, "step": 14170}, {"loss": 0.6655, "grad_norm": 0.9716812372207642, "learning_rate": 0.0002, "epoch": 2.292458168296823, "step": 14180}, {"loss": 0.7067, "grad_norm": 0.8993458151817322, "learning_rate": 0.0002, "epoch": 2.2940748524775687, "step": 14190}, {"loss": 0.6172, "grad_norm": 0.6156117916107178, "learning_rate": 0.0002, "epoch": 2.295691536658314, "step": 14200}, {"loss": 0.6318, "grad_norm": 0.8911278247833252, "learning_rate": 0.0002, "epoch": 2.297308220839059, "step": 14210}, {"loss": 0.6364, "grad_norm": 0.6422147154808044, "learning_rate": 0.0002, "epoch": 2.2989249050198044, "step": 14220}, {"loss": 0.6795, "grad_norm": 0.6866879463195801, "learning_rate": 0.0002, "epoch": 2.3005415892005496, "step": 14230}, {"loss": 0.6907, "grad_norm": 0.9297130107879639, "learning_rate": 0.0002, "epoch": 2.302158273381295, "step": 14240}, {"loss": 0.6823, "grad_norm": 0.7501356601715088, "learning_rate": 0.0002, "epoch": 2.30377495756204, "step": 14250}, {"loss": 0.6414, "grad_norm": 0.8363515138626099, "learning_rate": 0.0002, "epoch": 2.3053916417427853, "step": 14260}, {"loss": 0.6362, "grad_norm": 0.9083868265151978, "learning_rate": 0.0002, "epoch": 2.307008325923531, "step": 14270}, {"loss": 0.6862, "grad_norm": 0.7791516780853271, "learning_rate": 0.0002, "epoch": 2.3086250101042762, "step": 14280}, {"loss": 0.6569, "grad_norm": 0.8766953349113464, "learning_rate": 0.0002, "epoch": 2.3102416942850215, "step": 14290}, {"loss": 0.6698, "grad_norm": 0.7916635274887085, "learning_rate": 0.0002, "epoch": 2.3118583784657667, "step": 14300}, {"loss": 0.6927, "grad_norm": 0.627525269985199, "learning_rate": 0.0002, "epoch": 2.313475062646512, "step": 14310}, {"loss": 0.6541, "grad_norm": 0.8856783509254456, "learning_rate": 0.0002, "epoch": 2.315091746827257, "step": 14320}, {"loss": 0.6806, "grad_norm": 0.6758689284324646, "learning_rate": 0.0002, "epoch": 2.316708431008003, "step": 14330}, {"loss": 0.6794, "grad_norm": 0.6428321003913879, "learning_rate": 0.0002, "epoch": 2.318325115188748, "step": 14340}, {"loss": 0.682, "grad_norm": 0.9032121300697327, "learning_rate": 0.0002, "epoch": 2.3199417993694933, "step": 14350}, {"loss": 0.6569, "grad_norm": 0.8035986423492432, "learning_rate": 0.0002, "epoch": 2.3215584835502385, "step": 14360}, {"loss": 0.7067, "grad_norm": 0.7974579334259033, "learning_rate": 0.0002, "epoch": 2.3231751677309838, "step": 14370}, {"loss": 0.6451, "grad_norm": 0.8356034755706787, "learning_rate": 0.0002, "epoch": 2.324791851911729, "step": 14380}, {"loss": 0.6623, "grad_norm": 0.998760998249054, "learning_rate": 0.0002, "epoch": 2.326408536092474, "step": 14390}, {"loss": 0.649, "grad_norm": 0.6518142223358154, "learning_rate": 0.0002, "epoch": 2.3280252202732195, "step": 14400}, {"loss": 0.7146, "grad_norm": 0.7443506717681885, "learning_rate": 0.0002, "epoch": 2.3296419044539647, "step": 14410}, {"loss": 0.648, "grad_norm": 0.8436172604560852, "learning_rate": 0.0002, "epoch": 2.3312585886347104, "step": 14420}, {"loss": 0.6585, "grad_norm": 0.7411080598831177, "learning_rate": 0.0002, "epoch": 2.3328752728154556, "step": 14430}, {"loss": 0.6781, "grad_norm": 0.8839048743247986, "learning_rate": 0.0002, "epoch": 2.334491956996201, "step": 14440}, {"loss": 0.6565, "grad_norm": 0.8360885977745056, "learning_rate": 0.0002, "epoch": 2.336108641176946, "step": 14450}, {"loss": 0.6662, "grad_norm": 0.7608986496925354, "learning_rate": 0.0002, "epoch": 2.3377253253576913, "step": 14460}, {"loss": 0.6685, "grad_norm": 0.8179867267608643, "learning_rate": 0.0002, "epoch": 2.3393420095384365, "step": 14470}, {"loss": 0.7055, "grad_norm": 0.5989999771118164, "learning_rate": 0.0002, "epoch": 2.340958693719182, "step": 14480}, {"loss": 0.644, "grad_norm": 0.9450054168701172, "learning_rate": 0.0002, "epoch": 2.3425753778999274, "step": 14490}, {"loss": 0.6983, "grad_norm": 0.7885149717330933, "learning_rate": 0.0002, "epoch": 2.3441920620806727, "step": 14500}, {"loss": 0.6819, "grad_norm": 0.8152616620063782, "learning_rate": 0.0002, "epoch": 2.345808746261418, "step": 14510}, {"loss": 0.6989, "grad_norm": 0.7193838953971863, "learning_rate": 0.0002, "epoch": 2.347425430442163, "step": 14520}, {"loss": 0.6594, "grad_norm": 0.6701092720031738, "learning_rate": 0.0002, "epoch": 2.3490421146229084, "step": 14530}, {"loss": 0.6559, "grad_norm": 0.7529364228248596, "learning_rate": 0.0002, "epoch": 2.3506587988036536, "step": 14540}, {"loss": 0.6306, "grad_norm": 0.6599733829498291, "learning_rate": 0.0002, "epoch": 2.352275482984399, "step": 14550}, {"loss": 0.706, "grad_norm": 0.9502474069595337, "learning_rate": 0.0002, "epoch": 2.353892167165144, "step": 14560}, {"loss": 0.717, "grad_norm": 0.7619650959968567, "learning_rate": 0.0002, "epoch": 2.3555088513458897, "step": 14570}, {"loss": 0.6684, "grad_norm": 0.9854652285575867, "learning_rate": 0.0002, "epoch": 2.357125535526635, "step": 14580}, {"loss": 0.6455, "grad_norm": 0.727439284324646, "learning_rate": 0.0002, "epoch": 2.35874221970738, "step": 14590}, {"loss": 0.6645, "grad_norm": 0.6994746327400208, "learning_rate": 0.0002, "epoch": 2.3603589038881254, "step": 14600}, {"loss": 0.6587, "grad_norm": 0.7117531299591064, "learning_rate": 0.0002, "epoch": 2.3619755880688706, "step": 14610}, {"loss": 0.6804, "grad_norm": 0.6403067708015442, "learning_rate": 0.0002, "epoch": 2.363592272249616, "step": 14620}, {"loss": 0.7055, "grad_norm": 0.8377841711044312, "learning_rate": 0.0002, "epoch": 2.3652089564303616, "step": 14630}, {"loss": 0.6778, "grad_norm": 0.749171257019043, "learning_rate": 0.0002, "epoch": 2.366825640611107, "step": 14640}, {"loss": 0.6552, "grad_norm": 0.8418586254119873, "learning_rate": 0.0002, "epoch": 2.368442324791852, "step": 14650}, {"loss": 0.6685, "grad_norm": 0.6178573369979858, "learning_rate": 0.0002, "epoch": 2.3700590089725972, "step": 14660}, {"loss": 0.6774, "grad_norm": 0.6368302702903748, "learning_rate": 0.0002, "epoch": 2.3716756931533425, "step": 14670}, {"loss": 0.6136, "grad_norm": 0.9122977256774902, "learning_rate": 0.0002, "epoch": 2.3732923773340877, "step": 14680}, {"loss": 0.6675, "grad_norm": 0.7086195349693298, "learning_rate": 0.0002, "epoch": 2.374909061514833, "step": 14690}, {"loss": 0.6582, "grad_norm": 0.7500800490379333, "learning_rate": 0.0002, "epoch": 2.376525745695578, "step": 14700}, {"loss": 0.6792, "grad_norm": 0.6634900569915771, "learning_rate": 0.0002, "epoch": 2.378142429876324, "step": 14710}, {"loss": 0.6614, "grad_norm": 0.839898407459259, "learning_rate": 0.0002, "epoch": 2.379759114057069, "step": 14720}, {"loss": 0.6453, "grad_norm": 0.7578426003456116, "learning_rate": 0.0002, "epoch": 2.3813757982378143, "step": 14730}, {"loss": 0.7282, "grad_norm": 1.0213173627853394, "learning_rate": 0.0002, "epoch": 2.3829924824185595, "step": 14740}, {"loss": 0.6704, "grad_norm": 0.7855949401855469, "learning_rate": 0.0002, "epoch": 2.3846091665993048, "step": 14750}, {"loss": 0.6694, "grad_norm": 0.7224128842353821, "learning_rate": 0.0002, "epoch": 2.38622585078005, "step": 14760}, {"loss": 0.7017, "grad_norm": 0.8040381669998169, "learning_rate": 0.0002, "epoch": 2.3878425349607952, "step": 14770}, {"loss": 0.6799, "grad_norm": 0.7705281376838684, "learning_rate": 0.0002, "epoch": 2.389459219141541, "step": 14780}, {"loss": 0.6326, "grad_norm": 0.667966902256012, "learning_rate": 0.0002, "epoch": 2.391075903322286, "step": 14790}, {"loss": 0.7061, "grad_norm": 0.6611011028289795, "learning_rate": 0.0002, "epoch": 2.3926925875030314, "step": 14800}, {"loss": 0.6527, "grad_norm": 0.6862651705741882, "learning_rate": 0.0002, "epoch": 2.3943092716837766, "step": 14810}, {"loss": 0.6537, "grad_norm": 0.8086010217666626, "learning_rate": 0.0002, "epoch": 2.395925955864522, "step": 14820}, {"loss": 0.7189, "grad_norm": 0.7189689874649048, "learning_rate": 0.0002, "epoch": 2.397542640045267, "step": 14830}, {"loss": 0.6709, "grad_norm": 0.6280009150505066, "learning_rate": 0.0002, "epoch": 2.3991593242260123, "step": 14840}, {"loss": 0.706, "grad_norm": 0.7826612591743469, "learning_rate": 0.0002, "epoch": 2.4007760084067575, "step": 14850}, {"loss": 0.6738, "grad_norm": 0.7681610584259033, "learning_rate": 0.0002, "epoch": 2.402392692587503, "step": 14860}, {"loss": 0.636, "grad_norm": 0.720966100692749, "learning_rate": 0.0002, "epoch": 2.4040093767682484, "step": 14870}, {"loss": 0.6667, "grad_norm": 0.8202250599861145, "learning_rate": 0.0002, "epoch": 2.4056260609489937, "step": 14880}, {"loss": 0.6935, "grad_norm": 0.786212682723999, "learning_rate": 0.0002, "epoch": 2.407242745129739, "step": 14890}, {"loss": 0.6628, "grad_norm": 0.6647164821624756, "learning_rate": 0.0002, "epoch": 2.408859429310484, "step": 14900}, {"loss": 0.6706, "grad_norm": 0.7566399574279785, "learning_rate": 0.0002, "epoch": 2.4104761134912294, "step": 14910}, {"loss": 0.7188, "grad_norm": 0.748814582824707, "learning_rate": 0.0002, "epoch": 2.4120927976719746, "step": 14920}, {"loss": 0.6684, "grad_norm": 0.7624038457870483, "learning_rate": 0.0002, "epoch": 2.4137094818527203, "step": 14930}, {"loss": 0.6483, "grad_norm": 0.8267335295677185, "learning_rate": 0.0002, "epoch": 2.4153261660334655, "step": 14940}, {"loss": 0.6612, "grad_norm": 0.8785360455513, "learning_rate": 0.0002, "epoch": 2.4169428502142107, "step": 14950}, {"loss": 0.6718, "grad_norm": 0.679887592792511, "learning_rate": 0.0002, "epoch": 2.418559534394956, "step": 14960}, {"loss": 0.6136, "grad_norm": 0.7218474745750427, "learning_rate": 0.0002, "epoch": 2.420176218575701, "step": 14970}, {"loss": 0.648, "grad_norm": 0.6342799663543701, "learning_rate": 0.0002, "epoch": 2.4217929027564464, "step": 14980}, {"loss": 0.6617, "grad_norm": 0.7098712921142578, "learning_rate": 0.0002, "epoch": 2.4234095869371917, "step": 14990}, {"loss": 0.6942, "grad_norm": 0.7497431635856628, "learning_rate": 0.0002, "epoch": 2.425026271117937, "step": 15000}, {"loss": 0.6772, "grad_norm": 0.934836208820343, "learning_rate": 0.0002, "epoch": 2.4266429552986826, "step": 15010}, {"loss": 0.7221, "grad_norm": 0.8430966734886169, "learning_rate": 0.0002, "epoch": 2.428259639479428, "step": 15020}, {"loss": 0.6985, "grad_norm": 0.7032104730606079, "learning_rate": 0.0002, "epoch": 2.429876323660173, "step": 15030}, {"loss": 0.6715, "grad_norm": 0.7746111750602722, "learning_rate": 0.0002, "epoch": 2.4314930078409183, "step": 15040}, {"loss": 0.7177, "grad_norm": 0.7661406397819519, "learning_rate": 0.0002, "epoch": 2.4331096920216635, "step": 15050}, {"loss": 0.6517, "grad_norm": 0.6941645741462708, "learning_rate": 0.0002, "epoch": 2.4347263762024087, "step": 15060}, {"loss": 0.6421, "grad_norm": 0.7487249374389648, "learning_rate": 0.0002, "epoch": 2.436343060383154, "step": 15070}, {"loss": 0.6796, "grad_norm": 0.7639912962913513, "learning_rate": 0.0002, "epoch": 2.4379597445638996, "step": 15080}, {"loss": 0.7087, "grad_norm": 0.7708953619003296, "learning_rate": 0.0002, "epoch": 2.439576428744645, "step": 15090}, {"loss": 0.7065, "grad_norm": 0.9135832190513611, "learning_rate": 0.0002, "epoch": 2.44119311292539, "step": 15100}, {"loss": 0.672, "grad_norm": 0.8283005356788635, "learning_rate": 0.0002, "epoch": 2.4428097971061353, "step": 15110}, {"loss": 0.6551, "grad_norm": 0.925299346446991, "learning_rate": 0.0002, "epoch": 2.4444264812868806, "step": 15120}, {"loss": 0.687, "grad_norm": 0.7013528943061829, "learning_rate": 0.0002, "epoch": 2.446043165467626, "step": 15130}, {"loss": 0.6842, "grad_norm": 0.622303307056427, "learning_rate": 0.0002, "epoch": 2.447659849648371, "step": 15140}, {"loss": 0.6676, "grad_norm": 0.876569390296936, "learning_rate": 0.0002, "epoch": 2.4492765338291163, "step": 15150}, {"loss": 0.6463, "grad_norm": 0.6836351752281189, "learning_rate": 0.0002, "epoch": 2.450893218009862, "step": 15160}, {"loss": 0.6781, "grad_norm": 0.7886684536933899, "learning_rate": 0.0002, "epoch": 2.452509902190607, "step": 15170}, {"loss": 0.6794, "grad_norm": 0.6647440791130066, "learning_rate": 0.0002, "epoch": 2.4541265863713524, "step": 15180}, {"loss": 0.6353, "grad_norm": 0.7477722764015198, "learning_rate": 0.0002, "epoch": 2.4557432705520976, "step": 15190}, {"loss": 0.698, "grad_norm": 0.8192033767700195, "learning_rate": 0.0002, "epoch": 2.457359954732843, "step": 15200}, {"loss": 0.6735, "grad_norm": 0.847537100315094, "learning_rate": 0.0002, "epoch": 2.458976638913588, "step": 15210}, {"loss": 0.6962, "grad_norm": 0.9027776122093201, "learning_rate": 0.0002, "epoch": 2.4605933230943338, "step": 15220}, {"loss": 0.7084, "grad_norm": 0.7217772006988525, "learning_rate": 0.0002, "epoch": 2.462210007275079, "step": 15230}, {"loss": 0.691, "grad_norm": 0.7994546294212341, "learning_rate": 0.0002, "epoch": 2.4638266914558242, "step": 15240}, {"loss": 0.6828, "grad_norm": 0.939916729927063, "learning_rate": 0.0002, "epoch": 2.4654433756365695, "step": 15250}, {"loss": 0.6893, "grad_norm": 1.0009053945541382, "learning_rate": 0.0002, "epoch": 2.4670600598173147, "step": 15260}, {"loss": 0.643, "grad_norm": 0.625555694103241, "learning_rate": 0.0002, "epoch": 2.46867674399806, "step": 15270}, {"loss": 0.688, "grad_norm": 0.7924878597259521, "learning_rate": 0.0002, "epoch": 2.470293428178805, "step": 15280}, {"loss": 0.6789, "grad_norm": 0.8536689877510071, "learning_rate": 0.0002, "epoch": 2.4719101123595504, "step": 15290}, {"loss": 0.6924, "grad_norm": 0.8572589755058289, "learning_rate": 0.0002, "epoch": 2.4735267965402956, "step": 15300}, {"loss": 0.604, "grad_norm": 0.773279070854187, "learning_rate": 0.0002, "epoch": 2.4751434807210413, "step": 15310}, {"loss": 0.6573, "grad_norm": 0.7708749771118164, "learning_rate": 0.0002, "epoch": 2.4767601649017865, "step": 15320}, {"loss": 0.7065, "grad_norm": 0.770905077457428, "learning_rate": 0.0002, "epoch": 2.4783768490825318, "step": 15330}, {"loss": 0.6878, "grad_norm": 0.8238571882247925, "learning_rate": 0.0002, "epoch": 2.479993533263277, "step": 15340}, {"loss": 0.6772, "grad_norm": 0.7670477032661438, "learning_rate": 0.0002, "epoch": 2.481610217444022, "step": 15350}, {"loss": 0.7759, "grad_norm": 0.905036985874176, "learning_rate": 0.0002, "epoch": 2.4832269016247674, "step": 15360}, {"loss": 0.706, "grad_norm": 0.6672089695930481, "learning_rate": 0.0002, "epoch": 2.484843585805513, "step": 15370}, {"loss": 0.6722, "grad_norm": 0.625095784664154, "learning_rate": 0.0002, "epoch": 2.4864602699862584, "step": 15380}, {"loss": 0.6396, "grad_norm": 0.679772675037384, "learning_rate": 0.0002, "epoch": 2.4880769541670036, "step": 15390}, {"loss": 0.6778, "grad_norm": 0.711492121219635, "learning_rate": 0.0002, "epoch": 2.489693638347749, "step": 15400}, {"loss": 0.6966, "grad_norm": 0.876189112663269, "learning_rate": 0.0002, "epoch": 2.491310322528494, "step": 15410}, {"loss": 0.7307, "grad_norm": 0.7236915230751038, "learning_rate": 0.0002, "epoch": 2.4929270067092393, "step": 15420}, {"loss": 0.647, "grad_norm": 0.6629832983016968, "learning_rate": 0.0002, "epoch": 2.4945436908899845, "step": 15430}, {"loss": 0.6669, "grad_norm": 0.9756859540939331, "learning_rate": 0.0002, "epoch": 2.4961603750707297, "step": 15440}, {"loss": 0.7559, "grad_norm": 0.6896940469741821, "learning_rate": 0.0002, "epoch": 2.4977770592514754, "step": 15450}, {"loss": 0.6818, "grad_norm": 0.7105149626731873, "learning_rate": 0.0002, "epoch": 2.4993937434322206, "step": 15460}, {"loss": 0.6859, "grad_norm": 0.8374546766281128, "learning_rate": 0.0002, "epoch": 2.501010427612966, "step": 15470}, {"loss": 0.6512, "grad_norm": 0.7320070266723633, "learning_rate": 0.0002, "epoch": 2.502627111793711, "step": 15480}, {"loss": 0.685, "grad_norm": 0.8306367993354797, "learning_rate": 0.0002, "epoch": 2.5042437959744563, "step": 15490}, {"loss": 0.7253, "grad_norm": 0.7472721338272095, "learning_rate": 0.0002, "epoch": 2.5058604801552016, "step": 15500}, {"loss": 0.6699, "grad_norm": 0.6147692203521729, "learning_rate": 0.0002, "epoch": 2.507477164335947, "step": 15510}, {"loss": 0.7158, "grad_norm": 0.7788505554199219, "learning_rate": 0.0002, "epoch": 2.5090938485166925, "step": 15520}, {"loss": 0.6521, "grad_norm": 0.8807527422904968, "learning_rate": 0.0002, "epoch": 2.5107105326974377, "step": 15530}, {"loss": 0.6792, "grad_norm": 0.7521643042564392, "learning_rate": 0.0002, "epoch": 2.512327216878183, "step": 15540}, {"loss": 0.6772, "grad_norm": 0.6900225281715393, "learning_rate": 0.0002, "epoch": 2.513943901058928, "step": 15550}, {"loss": 0.6769, "grad_norm": 0.6601938605308533, "learning_rate": 0.0002, "epoch": 2.5155605852396734, "step": 15560}, {"loss": 0.6648, "grad_norm": 0.8179984092712402, "learning_rate": 0.0002, "epoch": 2.5171772694204186, "step": 15570}, {"loss": 0.7028, "grad_norm": 0.792556881904602, "learning_rate": 0.0002, "epoch": 2.518793953601164, "step": 15580}, {"loss": 0.6464, "grad_norm": 0.7081938982009888, "learning_rate": 0.0002, "epoch": 2.520410637781909, "step": 15590}, {"loss": 0.6691, "grad_norm": 0.8733121156692505, "learning_rate": 0.0002, "epoch": 2.5220273219626543, "step": 15600}, {"loss": 0.6969, "grad_norm": 0.7980992794036865, "learning_rate": 0.0002, "epoch": 2.5236440061434, "step": 15610}, {"loss": 0.7124, "grad_norm": 0.883664071559906, "learning_rate": 0.0002, "epoch": 2.5252606903241452, "step": 15620}, {"loss": 0.7022, "grad_norm": 0.6963341236114502, "learning_rate": 0.0002, "epoch": 2.5268773745048905, "step": 15630}, {"loss": 0.7334, "grad_norm": 0.6433573365211487, "learning_rate": 0.0002, "epoch": 2.5284940586856357, "step": 15640}, {"loss": 0.6889, "grad_norm": 0.8538183569908142, "learning_rate": 0.0002, "epoch": 2.530110742866381, "step": 15650}, {"loss": 0.6841, "grad_norm": 0.9748201370239258, "learning_rate": 0.0002, "epoch": 2.5317274270471266, "step": 15660}, {"loss": 0.6765, "grad_norm": 0.7670575380325317, "learning_rate": 0.0002, "epoch": 2.533344111227872, "step": 15670}, {"loss": 0.6435, "grad_norm": 0.8738890290260315, "learning_rate": 0.0002, "epoch": 2.534960795408617, "step": 15680}, {"loss": 0.6802, "grad_norm": 0.8391636610031128, "learning_rate": 0.0002, "epoch": 2.5365774795893623, "step": 15690}, {"loss": 0.6901, "grad_norm": 0.7239366769790649, "learning_rate": 0.0002, "epoch": 2.5381941637701075, "step": 15700}, {"loss": 0.7011, "grad_norm": 0.8498379588127136, "learning_rate": 0.0002, "epoch": 2.5398108479508528, "step": 15710}, {"loss": 0.6998, "grad_norm": 0.8029484152793884, "learning_rate": 0.0002, "epoch": 2.541427532131598, "step": 15720}, {"loss": 0.6678, "grad_norm": 1.0639333724975586, "learning_rate": 0.0002, "epoch": 2.5430442163123432, "step": 15730}, {"loss": 0.6341, "grad_norm": 0.6401297450065613, "learning_rate": 0.0002, "epoch": 2.5446609004930885, "step": 15740}, {"loss": 0.7196, "grad_norm": 0.7123814821243286, "learning_rate": 0.0002, "epoch": 2.5462775846738337, "step": 15750}, {"loss": 0.654, "grad_norm": 0.7874974608421326, "learning_rate": 0.0002, "epoch": 2.5478942688545794, "step": 15760}, {"loss": 0.6721, "grad_norm": 0.8046808838844299, "learning_rate": 0.0002, "epoch": 2.5495109530353246, "step": 15770}, {"loss": 0.6665, "grad_norm": 0.7888661623001099, "learning_rate": 0.0002, "epoch": 2.55112763721607, "step": 15780}, {"loss": 0.6893, "grad_norm": 0.8445866107940674, "learning_rate": 0.0002, "epoch": 2.552744321396815, "step": 15790}, {"loss": 0.6815, "grad_norm": 0.7475846409797668, "learning_rate": 0.0002, "epoch": 2.5543610055775603, "step": 15800}, {"loss": 0.6711, "grad_norm": 0.7455102801322937, "learning_rate": 0.0002, "epoch": 2.555977689758306, "step": 15810}, {"loss": 0.6932, "grad_norm": 0.8226983547210693, "learning_rate": 0.0002, "epoch": 2.557594373939051, "step": 15820}, {"loss": 0.651, "grad_norm": 0.8920368552207947, "learning_rate": 0.0002, "epoch": 2.5592110581197964, "step": 15830}, {"loss": 0.6297, "grad_norm": 0.8413904905319214, "learning_rate": 0.0002, "epoch": 2.5608277423005417, "step": 15840}, {"loss": 0.7106, "grad_norm": 0.8483649492263794, "learning_rate": 0.0002, "epoch": 2.562444426481287, "step": 15850}, {"loss": 0.6957, "grad_norm": 0.5923284292221069, "learning_rate": 0.0002, "epoch": 2.564061110662032, "step": 15860}, {"loss": 0.6847, "grad_norm": 0.8518726229667664, "learning_rate": 0.0002, "epoch": 2.5656777948427774, "step": 15870}, {"loss": 0.6362, "grad_norm": 0.731235146522522, "learning_rate": 0.0002, "epoch": 2.5672944790235226, "step": 15880}, {"loss": 0.7611, "grad_norm": 0.7517194151878357, "learning_rate": 0.0002, "epoch": 2.568911163204268, "step": 15890}, {"loss": 0.6907, "grad_norm": 0.8378692269325256, "learning_rate": 0.0002, "epoch": 2.5705278473850135, "step": 15900}, {"loss": 0.7055, "grad_norm": 0.843701958656311, "learning_rate": 0.0002, "epoch": 2.5721445315657587, "step": 15910}, {"loss": 0.6882, "grad_norm": 0.7254629731178284, "learning_rate": 0.0002, "epoch": 2.573761215746504, "step": 15920}, {"loss": 0.6872, "grad_norm": 0.8863335847854614, "learning_rate": 0.0002, "epoch": 2.575377899927249, "step": 15930}, {"loss": 0.6813, "grad_norm": 0.7675097584724426, "learning_rate": 0.0002, "epoch": 2.5769945841079944, "step": 15940}, {"loss": 0.7357, "grad_norm": 0.82063889503479, "learning_rate": 0.0002, "epoch": 2.5786112682887397, "step": 15950}, {"loss": 0.662, "grad_norm": 0.7729717493057251, "learning_rate": 0.0002, "epoch": 2.5802279524694853, "step": 15960}, {"loss": 0.633, "grad_norm": 0.8301846981048584, "learning_rate": 0.0002, "epoch": 2.5818446366502306, "step": 15970}, {"loss": 0.6897, "grad_norm": 0.7906861305236816, "learning_rate": 0.0002, "epoch": 2.583461320830976, "step": 15980}, {"loss": 0.7175, "grad_norm": 0.6749057173728943, "learning_rate": 0.0002, "epoch": 2.585078005011721, "step": 15990}, {"loss": 0.7212, "grad_norm": 0.9386842846870422, "learning_rate": 0.0002, "epoch": 2.5866946891924663, "step": 16000}, {"loss": 0.6934, "grad_norm": 0.7868891358375549, "learning_rate": 0.0002, "epoch": 2.5883113733732115, "step": 16010}, {"loss": 0.7036, "grad_norm": 0.8674671053886414, "learning_rate": 0.0002, "epoch": 2.5899280575539567, "step": 16020}, {"loss": 0.7217, "grad_norm": 0.7043559551239014, "learning_rate": 0.0002, "epoch": 2.591544741734702, "step": 16030}, {"loss": 0.6967, "grad_norm": 0.5846083760261536, "learning_rate": 0.0002, "epoch": 2.593161425915447, "step": 16040}, {"loss": 0.7322, "grad_norm": 0.7323982119560242, "learning_rate": 0.0002, "epoch": 2.594778110096193, "step": 16050}, {"loss": 0.6794, "grad_norm": 0.9069556593894958, "learning_rate": 0.0002, "epoch": 2.596394794276938, "step": 16060}, {"loss": 0.7076, "grad_norm": 0.7522736191749573, "learning_rate": 0.0002, "epoch": 2.5980114784576833, "step": 16070}, {"loss": 0.6477, "grad_norm": 0.8149648308753967, "learning_rate": 0.0002, "epoch": 2.5996281626384286, "step": 16080}, {"loss": 0.6664, "grad_norm": 0.6214233040809631, "learning_rate": 0.0002, "epoch": 2.601244846819174, "step": 16090}, {"loss": 0.7307, "grad_norm": 0.6803743839263916, "learning_rate": 0.0002, "epoch": 2.602861530999919, "step": 16100}, {"loss": 0.7244, "grad_norm": 0.7223997116088867, "learning_rate": 0.0002, "epoch": 2.6044782151806647, "step": 16110}, {"loss": 0.6867, "grad_norm": 0.7324174642562866, "learning_rate": 0.0002, "epoch": 2.60609489936141, "step": 16120}, {"loss": 0.7159, "grad_norm": 0.9594739675521851, "learning_rate": 0.0002, "epoch": 2.607711583542155, "step": 16130}, {"loss": 0.6451, "grad_norm": 0.9485327005386353, "learning_rate": 0.0002, "epoch": 2.6093282677229004, "step": 16140}, {"loss": 0.6815, "grad_norm": 0.8449000120162964, "learning_rate": 0.0002, "epoch": 2.6109449519036456, "step": 16150}, {"loss": 0.7152, "grad_norm": 0.8520140051841736, "learning_rate": 0.0002, "epoch": 2.612561636084391, "step": 16160}, {"loss": 0.6759, "grad_norm": 0.7456524968147278, "learning_rate": 0.0002, "epoch": 2.614178320265136, "step": 16170}, {"loss": 0.6893, "grad_norm": 0.9912857413291931, "learning_rate": 0.0002, "epoch": 2.6157950044458813, "step": 16180}, {"loss": 0.7243, "grad_norm": 0.9001946449279785, "learning_rate": 0.0002, "epoch": 2.6174116886266265, "step": 16190}, {"loss": 0.6825, "grad_norm": 0.6568667888641357, "learning_rate": 0.0002, "epoch": 2.619028372807372, "step": 16200}, {"loss": 0.7013, "grad_norm": 1.0248128175735474, "learning_rate": 0.0002, "epoch": 2.6206450569881174, "step": 16210}, {"loss": 0.7045, "grad_norm": 0.6509039998054504, "learning_rate": 0.0002, "epoch": 2.6222617411688627, "step": 16220}, {"loss": 0.72, "grad_norm": 0.7626351118087769, "learning_rate": 0.0002, "epoch": 2.623878425349608, "step": 16230}, {"loss": 0.6556, "grad_norm": 0.6938552260398865, "learning_rate": 0.0002, "epoch": 2.625495109530353, "step": 16240}, {"loss": 0.65, "grad_norm": 0.6434680819511414, "learning_rate": 0.0002, "epoch": 2.6271117937110984, "step": 16250}, {"loss": 0.6943, "grad_norm": 0.7111515998840332, "learning_rate": 0.0002, "epoch": 2.628728477891844, "step": 16260}, {"loss": 0.679, "grad_norm": 0.7712395787239075, "learning_rate": 0.0002, "epoch": 2.6303451620725893, "step": 16270}, {"loss": 0.6886, "grad_norm": 0.792209267616272, "learning_rate": 0.0002, "epoch": 2.6319618462533345, "step": 16280}, {"loss": 0.6554, "grad_norm": 0.6801066398620605, "learning_rate": 0.0002, "epoch": 2.6335785304340797, "step": 16290}, {"loss": 0.73, "grad_norm": 0.7802573442459106, "learning_rate": 0.0002, "epoch": 2.635195214614825, "step": 16300}, {"loss": 0.7484, "grad_norm": 0.7742244601249695, "learning_rate": 0.0002, "epoch": 2.63681189879557, "step": 16310}, {"loss": 0.6524, "grad_norm": 0.664184033870697, "learning_rate": 0.0002, "epoch": 2.6384285829763154, "step": 16320}, {"loss": 0.6442, "grad_norm": 0.9242228865623474, "learning_rate": 0.0002, "epoch": 2.6400452671570607, "step": 16330}, {"loss": 0.6792, "grad_norm": 0.9661325216293335, "learning_rate": 0.0002, "epoch": 2.641661951337806, "step": 16340}, {"loss": 0.6847, "grad_norm": 0.837526798248291, "learning_rate": 0.0002, "epoch": 2.6432786355185516, "step": 16350}, {"loss": 0.7686, "grad_norm": 1.1834373474121094, "learning_rate": 0.0002, "epoch": 2.644895319699297, "step": 16360}, {"loss": 0.6746, "grad_norm": 0.7467831373214722, "learning_rate": 0.0002, "epoch": 2.646512003880042, "step": 16370}, {"loss": 0.6935, "grad_norm": 0.8627146482467651, "learning_rate": 0.0002, "epoch": 2.6481286880607873, "step": 16380}, {"loss": 0.715, "grad_norm": 0.790447473526001, "learning_rate": 0.0002, "epoch": 2.6497453722415325, "step": 16390}, {"loss": 0.723, "grad_norm": 0.8447365164756775, "learning_rate": 0.0002, "epoch": 2.651362056422278, "step": 16400}, {"loss": 0.6628, "grad_norm": 0.7831417918205261, "learning_rate": 0.0002, "epoch": 2.6529787406030234, "step": 16410}, {"loss": 0.6691, "grad_norm": 0.6837952136993408, "learning_rate": 0.0002, "epoch": 2.6545954247837686, "step": 16420}, {"loss": 0.6139, "grad_norm": 0.7031801342964172, "learning_rate": 0.0002, "epoch": 2.656212108964514, "step": 16430}, {"loss": 0.7382, "grad_norm": 0.8963770866394043, "learning_rate": 0.0002, "epoch": 2.657828793145259, "step": 16440}, {"loss": 0.6439, "grad_norm": 0.6852328181266785, "learning_rate": 0.0002, "epoch": 2.6594454773260043, "step": 16450}, {"loss": 0.6278, "grad_norm": 0.8069294095039368, "learning_rate": 0.0002, "epoch": 2.6610621615067496, "step": 16460}, {"loss": 0.6939, "grad_norm": 0.7503686547279358, "learning_rate": 0.0002, "epoch": 2.662678845687495, "step": 16470}, {"loss": 0.6777, "grad_norm": 0.6430956125259399, "learning_rate": 0.0002, "epoch": 2.66429552986824, "step": 16480}, {"loss": 0.6863, "grad_norm": 0.7894312739372253, "learning_rate": 0.0002, "epoch": 2.6659122140489853, "step": 16490}, {"loss": 0.7165, "grad_norm": 0.7277431488037109, "learning_rate": 0.0002, "epoch": 2.667528898229731, "step": 16500}, {"loss": 0.6772, "grad_norm": 0.6816153526306152, "learning_rate": 0.0002, "epoch": 2.669145582410476, "step": 16510}, {"loss": 0.691, "grad_norm": 0.8145235776901245, "learning_rate": 0.0002, "epoch": 2.6707622665912214, "step": 16520}, {"loss": 0.709, "grad_norm": 0.8645890355110168, "learning_rate": 0.0002, "epoch": 2.6723789507719666, "step": 16530}, {"loss": 0.6946, "grad_norm": 0.704393208026886, "learning_rate": 0.0002, "epoch": 2.673995634952712, "step": 16540}, {"loss": 0.6378, "grad_norm": 1.0120846033096313, "learning_rate": 0.0002, "epoch": 2.6756123191334575, "step": 16550}, {"loss": 0.7241, "grad_norm": 0.6919328570365906, "learning_rate": 0.0002, "epoch": 2.6772290033142028, "step": 16560}, {"loss": 0.7098, "grad_norm": 0.6924574971199036, "learning_rate": 0.0002, "epoch": 2.678845687494948, "step": 16570}, {"loss": 0.731, "grad_norm": 0.9679301381111145, "learning_rate": 0.0002, "epoch": 2.6804623716756932, "step": 16580}, {"loss": 0.7124, "grad_norm": 0.6810211539268494, "learning_rate": 0.0002, "epoch": 2.6820790558564385, "step": 16590}, {"loss": 0.6688, "grad_norm": 0.9730555415153503, "learning_rate": 0.0002, "epoch": 2.6836957400371837, "step": 16600}, {"loss": 0.7344, "grad_norm": 0.7852821350097656, "learning_rate": 0.0002, "epoch": 2.685312424217929, "step": 16610}, {"loss": 0.6401, "grad_norm": 0.6059057116508484, "learning_rate": 0.0002, "epoch": 2.686929108398674, "step": 16620}, {"loss": 0.6796, "grad_norm": 0.9395958781242371, "learning_rate": 0.0002, "epoch": 2.6885457925794194, "step": 16630}, {"loss": 0.7174, "grad_norm": 0.7473729848861694, "learning_rate": 0.0002, "epoch": 2.690162476760165, "step": 16640}, {"loss": 0.7087, "grad_norm": 0.765934407711029, "learning_rate": 0.0002, "epoch": 2.6917791609409103, "step": 16650}, {"loss": 0.707, "grad_norm": 0.8496677279472351, "learning_rate": 0.0002, "epoch": 2.6933958451216555, "step": 16660}, {"loss": 0.7084, "grad_norm": 0.7641879916191101, "learning_rate": 0.0002, "epoch": 2.6950125293024008, "step": 16670}, {"loss": 0.6566, "grad_norm": 0.8471952676773071, "learning_rate": 0.0002, "epoch": 2.696629213483146, "step": 16680}, {"loss": 0.6635, "grad_norm": 0.6946060657501221, "learning_rate": 0.0002, "epoch": 2.6982458976638912, "step": 16690}, {"loss": 0.7027, "grad_norm": 0.7361312508583069, "learning_rate": 0.0002, "epoch": 2.699862581844637, "step": 16700}, {"loss": 0.6767, "grad_norm": 0.6605038046836853, "learning_rate": 0.0002, "epoch": 2.701479266025382, "step": 16710}, {"loss": 0.6885, "grad_norm": 0.7164411544799805, "learning_rate": 0.0002, "epoch": 2.7030959502061274, "step": 16720}, {"loss": 0.6736, "grad_norm": 0.6496201157569885, "learning_rate": 0.0002, "epoch": 2.7047126343868726, "step": 16730}, {"loss": 0.6942, "grad_norm": 0.7826663851737976, "learning_rate": 0.0002, "epoch": 2.706329318567618, "step": 16740}, {"loss": 0.6773, "grad_norm": 0.7639131546020508, "learning_rate": 0.0002, "epoch": 2.707946002748363, "step": 16750}, {"loss": 0.69, "grad_norm": 0.7976210713386536, "learning_rate": 0.0002, "epoch": 2.7095626869291083, "step": 16760}, {"loss": 0.6735, "grad_norm": 0.6836577653884888, "learning_rate": 0.0002, "epoch": 2.7111793711098535, "step": 16770}, {"loss": 0.6596, "grad_norm": 0.8025202751159668, "learning_rate": 0.0002, "epoch": 2.7127960552905988, "step": 16780}, {"loss": 0.6324, "grad_norm": 0.7636463642120361, "learning_rate": 0.0002, "epoch": 2.7144127394713444, "step": 16790}, {"loss": 0.6227, "grad_norm": 0.7481677532196045, "learning_rate": 0.0002, "epoch": 2.7160294236520897, "step": 16800}, {"loss": 0.6925, "grad_norm": 0.7566834688186646, "learning_rate": 0.0002, "epoch": 2.717646107832835, "step": 16810}, {"loss": 0.6531, "grad_norm": 0.7931267619132996, "learning_rate": 0.0002, "epoch": 2.71926279201358, "step": 16820}, {"loss": 0.6672, "grad_norm": 0.8811662197113037, "learning_rate": 0.0002, "epoch": 2.7208794761943254, "step": 16830}, {"loss": 0.6675, "grad_norm": 0.8561240434646606, "learning_rate": 0.0002, "epoch": 2.7224961603750706, "step": 16840}, {"loss": 0.7135, "grad_norm": 0.7121599316596985, "learning_rate": 0.0002, "epoch": 2.7241128445558163, "step": 16850}, {"loss": 0.6825, "grad_norm": 0.8066257238388062, "learning_rate": 0.0002, "epoch": 2.7257295287365615, "step": 16860}, {"loss": 0.6839, "grad_norm": 0.7699271440505981, "learning_rate": 0.0002, "epoch": 2.7273462129173067, "step": 16870}, {"loss": 0.699, "grad_norm": 1.1828432083129883, "learning_rate": 0.0002, "epoch": 2.728962897098052, "step": 16880}, {"loss": 0.6518, "grad_norm": 0.9989302754402161, "learning_rate": 0.0002, "epoch": 2.730579581278797, "step": 16890}, {"loss": 0.7015, "grad_norm": 0.8100560307502747, "learning_rate": 0.0002, "epoch": 2.7321962654595424, "step": 16900}, {"loss": 0.6851, "grad_norm": 0.8615233898162842, "learning_rate": 0.0002, "epoch": 2.7338129496402876, "step": 16910}, {"loss": 0.6322, "grad_norm": 0.8633756041526794, "learning_rate": 0.0002, "epoch": 2.735429633821033, "step": 16920}, {"loss": 0.6488, "grad_norm": 0.7769348621368408, "learning_rate": 0.0002, "epoch": 2.737046318001778, "step": 16930}, {"loss": 0.6582, "grad_norm": 0.6943058371543884, "learning_rate": 0.0002, "epoch": 2.738663002182524, "step": 16940}, {"loss": 0.6516, "grad_norm": 0.8510736227035522, "learning_rate": 0.0002, "epoch": 2.740279686363269, "step": 16950}, {"loss": 0.7275, "grad_norm": 0.7732602953910828, "learning_rate": 0.0002, "epoch": 2.7418963705440142, "step": 16960}, {"loss": 0.6553, "grad_norm": 0.5981788635253906, "learning_rate": 0.0002, "epoch": 2.7435130547247595, "step": 16970}, {"loss": 0.6777, "grad_norm": 0.7604416012763977, "learning_rate": 0.0002, "epoch": 2.7451297389055047, "step": 16980}, {"loss": 0.6981, "grad_norm": 0.7377738356590271, "learning_rate": 0.0002, "epoch": 2.74674642308625, "step": 16990}, {"loss": 0.6294, "grad_norm": 0.9400289058685303, "learning_rate": 0.0002, "epoch": 2.7483631072669956, "step": 17000}, {"loss": 0.6952, "grad_norm": 0.6340599656105042, "learning_rate": 0.0002, "epoch": 2.749979791447741, "step": 17010}, {"loss": 0.7222, "grad_norm": 0.7297601103782654, "learning_rate": 0.0002, "epoch": 2.751596475628486, "step": 17020}, {"loss": 0.6659, "grad_norm": 0.9479979872703552, "learning_rate": 0.0002, "epoch": 2.7532131598092313, "step": 17030}, {"loss": 0.691, "grad_norm": 0.8461511135101318, "learning_rate": 0.0002, "epoch": 2.7548298439899765, "step": 17040}, {"loss": 0.6764, "grad_norm": 0.7477551698684692, "learning_rate": 0.0002, "epoch": 2.7564465281707218, "step": 17050}, {"loss": 0.684, "grad_norm": 1.019270420074463, "learning_rate": 0.0002, "epoch": 2.758063212351467, "step": 17060}, {"loss": 0.7119, "grad_norm": 0.7730235457420349, "learning_rate": 0.0002, "epoch": 2.7596798965322122, "step": 17070}, {"loss": 0.6886, "grad_norm": 0.8216866254806519, "learning_rate": 0.0002, "epoch": 2.7612965807129575, "step": 17080}, {"loss": 0.6811, "grad_norm": 0.7235931754112244, "learning_rate": 0.0002, "epoch": 2.762913264893703, "step": 17090}, {"loss": 0.7031, "grad_norm": 0.7352296710014343, "learning_rate": 0.0002, "epoch": 2.7645299490744484, "step": 17100}, {"loss": 0.6951, "grad_norm": 0.8129373788833618, "learning_rate": 0.0002, "epoch": 2.7661466332551936, "step": 17110}, {"loss": 0.6703, "grad_norm": 0.7387019991874695, "learning_rate": 0.0002, "epoch": 2.767763317435939, "step": 17120}, {"loss": 0.6789, "grad_norm": 0.9149190187454224, "learning_rate": 0.0002, "epoch": 2.769380001616684, "step": 17130}, {"loss": 0.6038, "grad_norm": 0.7352971434593201, "learning_rate": 0.0002, "epoch": 2.7709966857974297, "step": 17140}, {"loss": 0.6728, "grad_norm": 0.7903780341148376, "learning_rate": 0.0002, "epoch": 2.772613369978175, "step": 17150}, {"loss": 0.6988, "grad_norm": 0.8255927562713623, "learning_rate": 0.0002, "epoch": 2.77423005415892, "step": 17160}, {"loss": 0.6694, "grad_norm": 0.7235927581787109, "learning_rate": 0.0002, "epoch": 2.7758467383396654, "step": 17170}, {"loss": 0.7161, "grad_norm": 0.8281434774398804, "learning_rate": 0.0002, "epoch": 2.7774634225204107, "step": 17180}, {"loss": 0.682, "grad_norm": 0.7586921453475952, "learning_rate": 0.0002, "epoch": 2.779080106701156, "step": 17190}, {"loss": 0.6427, "grad_norm": 0.7161715030670166, "learning_rate": 0.0002, "epoch": 2.780696790881901, "step": 17200}, {"loss": 0.6426, "grad_norm": 0.762868344783783, "learning_rate": 0.0002, "epoch": 2.7823134750626464, "step": 17210}, {"loss": 0.705, "grad_norm": 0.9285483360290527, "learning_rate": 0.0002, "epoch": 2.7839301592433916, "step": 17220}, {"loss": 0.7084, "grad_norm": 0.6900462508201599, "learning_rate": 0.0002, "epoch": 2.785546843424137, "step": 17230}, {"loss": 0.6988, "grad_norm": 0.780384361743927, "learning_rate": 0.0002, "epoch": 2.7871635276048825, "step": 17240}, {"loss": 0.7073, "grad_norm": 0.7580406665802002, "learning_rate": 0.0002, "epoch": 2.7887802117856277, "step": 17250}, {"loss": 0.6833, "grad_norm": 0.8145199418067932, "learning_rate": 0.0002, "epoch": 2.790396895966373, "step": 17260}, {"loss": 0.6909, "grad_norm": 0.9159596562385559, "learning_rate": 0.0002, "epoch": 2.792013580147118, "step": 17270}, {"loss": 0.6008, "grad_norm": 0.9590014219284058, "learning_rate": 0.0002, "epoch": 2.7936302643278634, "step": 17280}, {"loss": 0.6704, "grad_norm": 0.7603529691696167, "learning_rate": 0.0002, "epoch": 2.795246948508609, "step": 17290}, {"loss": 0.7165, "grad_norm": 0.8039976358413696, "learning_rate": 0.0002, "epoch": 2.7968636326893543, "step": 17300}, {"loss": 0.7037, "grad_norm": 0.8364847302436829, "learning_rate": 0.0002, "epoch": 2.7984803168700996, "step": 17310}, {"loss": 0.6749, "grad_norm": 0.8763046860694885, "learning_rate": 0.0002, "epoch": 2.800097001050845, "step": 17320}, {"loss": 0.6844, "grad_norm": 0.8409647941589355, "learning_rate": 0.0002, "epoch": 2.80171368523159, "step": 17330}, {"loss": 0.6936, "grad_norm": 0.7649006247520447, "learning_rate": 0.0002, "epoch": 2.8033303694123353, "step": 17340}, {"loss": 0.7051, "grad_norm": 0.7970262169837952, "learning_rate": 0.0002, "epoch": 2.8049470535930805, "step": 17350}, {"loss": 0.6533, "grad_norm": 0.9088607430458069, "learning_rate": 0.0002, "epoch": 2.8065637377738257, "step": 17360}, {"loss": 0.675, "grad_norm": 0.6454846858978271, "learning_rate": 0.0002, "epoch": 2.808180421954571, "step": 17370}, {"loss": 0.7069, "grad_norm": 0.7744787931442261, "learning_rate": 0.0002, "epoch": 2.809797106135316, "step": 17380}, {"loss": 0.6772, "grad_norm": 0.6678640842437744, "learning_rate": 0.0002, "epoch": 2.811413790316062, "step": 17390}, {"loss": 0.6784, "grad_norm": 0.772676944732666, "learning_rate": 0.0002, "epoch": 2.813030474496807, "step": 17400}, {"loss": 0.7252, "grad_norm": 0.7088175415992737, "learning_rate": 0.0002, "epoch": 2.8146471586775523, "step": 17410}, {"loss": 0.7086, "grad_norm": 0.8280573487281799, "learning_rate": 0.0002, "epoch": 2.8162638428582976, "step": 17420}, {"loss": 0.6732, "grad_norm": 0.6665388345718384, "learning_rate": 0.0002, "epoch": 2.817880527039043, "step": 17430}, {"loss": 0.6675, "grad_norm": 0.6427883505821228, "learning_rate": 0.0002, "epoch": 2.8194972112197885, "step": 17440}, {"loss": 0.6972, "grad_norm": 0.9697760343551636, "learning_rate": 0.0002, "epoch": 2.8211138954005337, "step": 17450}, {"loss": 0.6838, "grad_norm": 0.7573966383934021, "learning_rate": 0.0002, "epoch": 2.822730579581279, "step": 17460}, {"loss": 0.7243, "grad_norm": 0.878688633441925, "learning_rate": 0.0002, "epoch": 2.824347263762024, "step": 17470}, {"loss": 0.6666, "grad_norm": 0.7752242684364319, "learning_rate": 0.0002, "epoch": 2.8259639479427694, "step": 17480}, {"loss": 0.6638, "grad_norm": 0.6135398745536804, "learning_rate": 0.0002, "epoch": 2.8275806321235146, "step": 17490}, {"loss": 0.6829, "grad_norm": 0.6924924850463867, "learning_rate": 0.0002, "epoch": 2.82919731630426, "step": 17500}, {"loss": 0.6731, "grad_norm": 0.7471627593040466, "learning_rate": 0.0002, "epoch": 2.830814000485005, "step": 17510}, {"loss": 0.7016, "grad_norm": 0.7145499587059021, "learning_rate": 0.0002, "epoch": 2.8324306846657503, "step": 17520}, {"loss": 0.6787, "grad_norm": 0.7415414452552795, "learning_rate": 0.0002, "epoch": 2.834047368846496, "step": 17530}, {"loss": 0.6811, "grad_norm": 0.7328441739082336, "learning_rate": 0.0002, "epoch": 2.8356640530272412, "step": 17540}, {"loss": 0.6866, "grad_norm": 0.8267839550971985, "learning_rate": 0.0002, "epoch": 2.8372807372079865, "step": 17550}, {"loss": 0.6787, "grad_norm": 0.8877885341644287, "learning_rate": 0.0002, "epoch": 2.8388974213887317, "step": 17560}, {"loss": 0.7136, "grad_norm": 0.857138454914093, "learning_rate": 0.0002, "epoch": 2.840514105569477, "step": 17570}, {"loss": 0.6454, "grad_norm": 0.8470779657363892, "learning_rate": 0.0002, "epoch": 2.842130789750222, "step": 17580}, {"loss": 0.6976, "grad_norm": 0.8553254008293152, "learning_rate": 0.0002, "epoch": 2.843747473930968, "step": 17590}, {"loss": 0.7297, "grad_norm": 0.8033196926116943, "learning_rate": 0.0002, "epoch": 2.845364158111713, "step": 17600}, {"loss": 0.7062, "grad_norm": 0.7949087023735046, "learning_rate": 0.0002, "epoch": 2.8469808422924583, "step": 17610}, {"loss": 0.651, "grad_norm": 0.9241406321525574, "learning_rate": 0.0002, "epoch": 2.8485975264732035, "step": 17620}, {"loss": 0.6601, "grad_norm": 0.7721285223960876, "learning_rate": 0.0002, "epoch": 2.8502142106539488, "step": 17630}, {"loss": 0.6183, "grad_norm": 1.0246692895889282, "learning_rate": 0.0002, "epoch": 2.851830894834694, "step": 17640}, {"loss": 0.7007, "grad_norm": 0.9244589805603027, "learning_rate": 0.0002, "epoch": 2.853447579015439, "step": 17650}, {"loss": 0.7274, "grad_norm": 0.7243508696556091, "learning_rate": 0.0002, "epoch": 2.8550642631961844, "step": 17660}, {"loss": 0.6471, "grad_norm": 0.8943371176719666, "learning_rate": 0.0002, "epoch": 2.8566809473769297, "step": 17670}, {"loss": 0.686, "grad_norm": 0.6531758904457092, "learning_rate": 0.0002, "epoch": 2.8582976315576754, "step": 17680}, {"loss": 0.6253, "grad_norm": 0.8367000818252563, "learning_rate": 0.0002, "epoch": 2.8599143157384206, "step": 17690}, {"loss": 0.6943, "grad_norm": 0.7868556380271912, "learning_rate": 0.0002, "epoch": 2.861530999919166, "step": 17700}, {"loss": 0.6919, "grad_norm": 0.7213859558105469, "learning_rate": 0.0002, "epoch": 2.863147684099911, "step": 17710}, {"loss": 0.6657, "grad_norm": 0.7383931279182434, "learning_rate": 0.0002, "epoch": 2.8647643682806563, "step": 17720}, {"loss": 0.6841, "grad_norm": 0.7566812634468079, "learning_rate": 0.0002, "epoch": 2.8663810524614015, "step": 17730}, {"loss": 0.6449, "grad_norm": 0.6930373311042786, "learning_rate": 0.0002, "epoch": 2.867997736642147, "step": 17740}, {"loss": 0.6764, "grad_norm": 0.7911090850830078, "learning_rate": 0.0002, "epoch": 2.8696144208228924, "step": 17750}, {"loss": 0.6554, "grad_norm": 0.8484548926353455, "learning_rate": 0.0002, "epoch": 2.8712311050036377, "step": 17760}, {"loss": 0.6931, "grad_norm": 0.7647597193717957, "learning_rate": 0.0002, "epoch": 2.872847789184383, "step": 17770}, {"loss": 0.6945, "grad_norm": 0.8791151642799377, "learning_rate": 0.0002, "epoch": 2.874464473365128, "step": 17780}, {"loss": 0.7078, "grad_norm": 0.7253178358078003, "learning_rate": 0.0002, "epoch": 2.8760811575458733, "step": 17790}, {"loss": 0.6474, "grad_norm": 0.7956077456474304, "learning_rate": 0.0002, "epoch": 2.8776978417266186, "step": 17800}, {"loss": 0.6687, "grad_norm": 0.8657688498497009, "learning_rate": 0.0002, "epoch": 2.879314525907364, "step": 17810}, {"loss": 0.7171, "grad_norm": 0.7059141993522644, "learning_rate": 0.0002, "epoch": 2.880931210088109, "step": 17820}, {"loss": 0.683, "grad_norm": 0.8886896967887878, "learning_rate": 0.0002, "epoch": 2.8825478942688547, "step": 17830}, {"loss": 0.669, "grad_norm": 0.821032702922821, "learning_rate": 0.0002, "epoch": 2.8841645784496, "step": 17840}, {"loss": 0.6805, "grad_norm": 0.7183963656425476, "learning_rate": 0.0002, "epoch": 2.885781262630345, "step": 17850}, {"loss": 0.7088, "grad_norm": 0.6222899556159973, "learning_rate": 0.0002, "epoch": 2.8873979468110904, "step": 17860}, {"loss": 0.6626, "grad_norm": 0.8187434077262878, "learning_rate": 0.0002, "epoch": 2.8890146309918356, "step": 17870}, {"loss": 0.6815, "grad_norm": 0.9838479161262512, "learning_rate": 0.0002, "epoch": 2.890631315172581, "step": 17880}, {"loss": 0.6967, "grad_norm": 0.7567742466926575, "learning_rate": 0.0002, "epoch": 2.8922479993533265, "step": 17890}, {"loss": 0.7073, "grad_norm": 0.6875903606414795, "learning_rate": 0.0002, "epoch": 2.893864683534072, "step": 17900}, {"loss": 0.6415, "grad_norm": 0.8043789267539978, "learning_rate": 0.0002, "epoch": 2.895481367714817, "step": 17910}, {"loss": 0.6588, "grad_norm": 0.8062626719474792, "learning_rate": 0.0002, "epoch": 2.8970980518955622, "step": 17920}, {"loss": 0.7151, "grad_norm": 1.0251191854476929, "learning_rate": 0.0002, "epoch": 2.8987147360763075, "step": 17930}, {"loss": 0.6605, "grad_norm": 0.882253110408783, "learning_rate": 0.0002, "epoch": 2.9003314202570527, "step": 17940}, {"loss": 0.6719, "grad_norm": 0.8683299422264099, "learning_rate": 0.0002, "epoch": 2.901948104437798, "step": 17950}, {"loss": 0.6896, "grad_norm": 0.7167282104492188, "learning_rate": 0.0002, "epoch": 2.903564788618543, "step": 17960}, {"loss": 0.663, "grad_norm": 0.7093694806098938, "learning_rate": 0.0002, "epoch": 2.9051814727992884, "step": 17970}, {"loss": 0.6591, "grad_norm": 0.8549879193305969, "learning_rate": 0.0002, "epoch": 2.906798156980034, "step": 17980}, {"loss": 0.6962, "grad_norm": 0.6989606618881226, "learning_rate": 0.0002, "epoch": 2.9084148411607793, "step": 17990}, {"loss": 0.6635, "grad_norm": 0.9482976794242859, "learning_rate": 0.0002, "epoch": 2.9100315253415245, "step": 18000}, {"loss": 0.6586, "grad_norm": 0.7182440161705017, "learning_rate": 0.0002, "epoch": 2.9116482095222698, "step": 18010}, {"loss": 0.6827, "grad_norm": 0.7732226252555847, "learning_rate": 0.0002, "epoch": 2.913264893703015, "step": 18020}, {"loss": 0.7123, "grad_norm": 0.7936875224113464, "learning_rate": 0.0002, "epoch": 2.9148815778837607, "step": 18030}, {"loss": 0.6736, "grad_norm": 0.8825615644454956, "learning_rate": 0.0002, "epoch": 2.916498262064506, "step": 18040}, {"loss": 0.7139, "grad_norm": 0.6778587102890015, "learning_rate": 0.0002, "epoch": 2.918114946245251, "step": 18050}, {"loss": 0.6588, "grad_norm": 0.7529265880584717, "learning_rate": 0.0002, "epoch": 2.9197316304259964, "step": 18060}, {"loss": 0.737, "grad_norm": 0.7111883163452148, "learning_rate": 0.0002, "epoch": 2.9213483146067416, "step": 18070}, {"loss": 0.7475, "grad_norm": 0.7214767932891846, "learning_rate": 0.0002, "epoch": 2.922964998787487, "step": 18080}, {"loss": 0.6672, "grad_norm": 0.800417423248291, "learning_rate": 0.0002, "epoch": 2.924581682968232, "step": 18090}, {"loss": 0.6694, "grad_norm": 1.248575210571289, "learning_rate": 0.0002, "epoch": 2.9261983671489773, "step": 18100}, {"loss": 0.7004, "grad_norm": 0.757788360118866, "learning_rate": 0.0002, "epoch": 2.9278150513297225, "step": 18110}, {"loss": 0.6999, "grad_norm": 1.0583995580673218, "learning_rate": 0.0002, "epoch": 2.9294317355104678, "step": 18120}, {"loss": 0.6365, "grad_norm": 0.8228777647018433, "learning_rate": 0.0002, "epoch": 2.9310484196912134, "step": 18130}, {"loss": 0.6791, "grad_norm": 0.8374035358428955, "learning_rate": 0.0002, "epoch": 2.9326651038719587, "step": 18140}, {"loss": 0.6399, "grad_norm": 0.7976473569869995, "learning_rate": 0.0002, "epoch": 2.934281788052704, "step": 18150}, {"loss": 0.6585, "grad_norm": 0.8009907603263855, "learning_rate": 0.0002, "epoch": 2.935898472233449, "step": 18160}, {"loss": 0.7485, "grad_norm": 0.835213303565979, "learning_rate": 0.0002, "epoch": 2.9375151564141944, "step": 18170}, {"loss": 0.7376, "grad_norm": 0.7982219457626343, "learning_rate": 0.0002, "epoch": 2.93913184059494, "step": 18180}, {"loss": 0.6348, "grad_norm": 0.7070978879928589, "learning_rate": 0.0002, "epoch": 2.9407485247756853, "step": 18190}, {"loss": 0.6608, "grad_norm": 0.8619440197944641, "learning_rate": 0.0002, "epoch": 2.9423652089564305, "step": 18200}, {"loss": 0.666, "grad_norm": 0.6693987250328064, "learning_rate": 0.0002, "epoch": 2.9439818931371757, "step": 18210}, {"loss": 0.728, "grad_norm": 0.6747021079063416, "learning_rate": 0.0002, "epoch": 2.945598577317921, "step": 18220}, {"loss": 0.6686, "grad_norm": 0.860387921333313, "learning_rate": 0.0002, "epoch": 2.947215261498666, "step": 18230}, {"loss": 0.6945, "grad_norm": 0.799976646900177, "learning_rate": 0.0002, "epoch": 2.9488319456794114, "step": 18240}, {"loss": 0.7243, "grad_norm": 0.7864769101142883, "learning_rate": 0.0002, "epoch": 2.9504486298601567, "step": 18250}, {"loss": 0.6785, "grad_norm": 0.6713884472846985, "learning_rate": 0.0002, "epoch": 2.952065314040902, "step": 18260}, {"loss": 0.7429, "grad_norm": 0.9031508564949036, "learning_rate": 0.0002, "epoch": 2.9536819982216476, "step": 18270}, {"loss": 0.7055, "grad_norm": 0.7205073237419128, "learning_rate": 0.0002, "epoch": 2.955298682402393, "step": 18280}, {"loss": 0.7298, "grad_norm": 0.7746205925941467, "learning_rate": 0.0002, "epoch": 2.956915366583138, "step": 18290}, {"loss": 0.6218, "grad_norm": 0.6533427834510803, "learning_rate": 0.0002, "epoch": 2.9585320507638833, "step": 18300}, {"loss": 0.6674, "grad_norm": 0.9083208441734314, "learning_rate": 0.0002, "epoch": 2.9601487349446285, "step": 18310}, {"loss": 0.7359, "grad_norm": 0.7446991801261902, "learning_rate": 0.0002, "epoch": 2.9617654191253737, "step": 18320}, {"loss": 0.6738, "grad_norm": 0.6514461636543274, "learning_rate": 0.0002, "epoch": 2.9633821033061194, "step": 18330}, {"loss": 0.6677, "grad_norm": 0.8580465912818909, "learning_rate": 0.0002, "epoch": 2.9649987874868646, "step": 18340}, {"loss": 0.6971, "grad_norm": 0.7074266076087952, "learning_rate": 0.0002, "epoch": 2.96661547166761, "step": 18350}, {"loss": 0.6804, "grad_norm": 0.899892270565033, "learning_rate": 0.0002, "epoch": 2.968232155848355, "step": 18360}, {"loss": 0.7094, "grad_norm": 0.8217641711235046, "learning_rate": 0.0002, "epoch": 2.9698488400291003, "step": 18370}, {"loss": 0.6916, "grad_norm": 0.8611799478530884, "learning_rate": 0.0002, "epoch": 2.9714655242098456, "step": 18380}, {"loss": 0.6677, "grad_norm": 0.6909302473068237, "learning_rate": 0.0002, "epoch": 2.973082208390591, "step": 18390}, {"loss": 0.7247, "grad_norm": 0.6554358005523682, "learning_rate": 0.0002, "epoch": 2.974698892571336, "step": 18400}, {"loss": 0.6516, "grad_norm": 0.7803071737289429, "learning_rate": 0.0002, "epoch": 2.9763155767520812, "step": 18410}, {"loss": 0.7322, "grad_norm": 0.7838954925537109, "learning_rate": 0.0002, "epoch": 2.977932260932827, "step": 18420}, {"loss": 0.6522, "grad_norm": 0.7098495364189148, "learning_rate": 0.0002, "epoch": 2.979548945113572, "step": 18430}, {"loss": 0.739, "grad_norm": 0.8981785774230957, "learning_rate": 0.0002, "epoch": 2.9811656292943174, "step": 18440}, {"loss": 0.6689, "grad_norm": 0.7197171449661255, "learning_rate": 0.0002, "epoch": 2.9827823134750626, "step": 18450}, {"loss": 0.706, "grad_norm": 0.793185293674469, "learning_rate": 0.0002, "epoch": 2.984398997655808, "step": 18460}, {"loss": 0.7124, "grad_norm": 0.8531473875045776, "learning_rate": 0.0002, "epoch": 2.986015681836553, "step": 18470}, {"loss": 0.6901, "grad_norm": 0.6627361178398132, "learning_rate": 0.0002, "epoch": 2.9876323660172988, "step": 18480}, {"loss": 0.6591, "grad_norm": 0.5708155035972595, "learning_rate": 0.0002, "epoch": 2.989249050198044, "step": 18490}, {"loss": 0.6725, "grad_norm": 0.8227280378341675, "learning_rate": 0.0002, "epoch": 2.990865734378789, "step": 18500}, {"loss": 0.6701, "grad_norm": 0.7102749943733215, "learning_rate": 0.0002, "epoch": 2.9924824185595345, "step": 18510}, {"loss": 0.7091, "grad_norm": 0.839485228061676, "learning_rate": 0.0002, "epoch": 2.9940991027402797, "step": 18520}, {"loss": 0.6521, "grad_norm": 0.9038704037666321, "learning_rate": 0.0002, "epoch": 2.995715786921025, "step": 18530}, {"loss": 0.7186, "grad_norm": 0.8737510442733765, "learning_rate": 0.0002, "epoch": 2.99733247110177, "step": 18540}, {"loss": 0.6819, "grad_norm": 0.7323142886161804, "learning_rate": 0.0002, "epoch": 2.9989491552825154, "step": 18550}, {"eval_loss": 1.1262480020523071, "eval_runtime": 122.0868, "eval_samples_per_second": 6.004, "eval_steps_per_second": 0.754, "epoch": 2.9999191657909625, "step": 18556}, {"loss": 0.6337, "grad_norm": 0.8465463519096375, "learning_rate": 0.0002, "epoch": 3.000565839463261, "step": 18560}, {"loss": 0.6064, "grad_norm": 0.9134138822555542, "learning_rate": 0.0002, "epoch": 3.0021825236440063, "step": 18570}, {"loss": 0.5804, "grad_norm": 0.760715126991272, "learning_rate": 0.0002, "epoch": 3.0037992078247515, "step": 18580}, {"loss": 0.5571, "grad_norm": 0.9208743572235107, "learning_rate": 0.0002, "epoch": 3.0054158920054967, "step": 18590}, {"loss": 0.5731, "grad_norm": 0.9232364892959595, "learning_rate": 0.0002, "epoch": 3.007032576186242, "step": 18600}, {"loss": 0.6299, "grad_norm": 1.1881544589996338, "learning_rate": 0.0002, "epoch": 3.008649260366987, "step": 18610}, {"loss": 0.5482, "grad_norm": 0.9372987747192383, "learning_rate": 0.0002, "epoch": 3.0102659445477324, "step": 18620}, {"loss": 0.5709, "grad_norm": 0.6900241374969482, "learning_rate": 0.0002, "epoch": 3.0118826287284777, "step": 18630}, {"loss": 0.5256, "grad_norm": 0.8451071381568909, "learning_rate": 0.0002, "epoch": 3.0134993129092233, "step": 18640}, {"loss": 0.5916, "grad_norm": 0.7763112187385559, "learning_rate": 0.0002, "epoch": 3.0151159970899686, "step": 18650}, {"loss": 0.6095, "grad_norm": 1.043653964996338, "learning_rate": 0.0002, "epoch": 3.016732681270714, "step": 18660}, {"loss": 0.6228, "grad_norm": 1.0170660018920898, "learning_rate": 0.0002, "epoch": 3.018349365451459, "step": 18670}, {"loss": 0.5671, "grad_norm": 0.7534180283546448, "learning_rate": 0.0002, "epoch": 3.0199660496322043, "step": 18680}, {"loss": 0.6015, "grad_norm": 0.7507367730140686, "learning_rate": 0.0002, "epoch": 3.0215827338129495, "step": 18690}, {"loss": 0.6201, "grad_norm": 0.7861620187759399, "learning_rate": 0.0002, "epoch": 3.0231994179936947, "step": 18700}, {"loss": 0.5802, "grad_norm": 1.0580339431762695, "learning_rate": 0.0002, "epoch": 3.0248161021744404, "step": 18710}, {"loss": 0.5975, "grad_norm": 0.7542710900306702, "learning_rate": 0.0002, "epoch": 3.0264327863551856, "step": 18720}, {"loss": 0.5695, "grad_norm": 0.8189544677734375, "learning_rate": 0.0002, "epoch": 3.028049470535931, "step": 18730}, {"loss": 0.6109, "grad_norm": 0.9126611351966858, "learning_rate": 0.0002, "epoch": 3.029666154716676, "step": 18740}, {"loss": 0.6443, "grad_norm": 0.8891341686248779, "learning_rate": 0.0002, "epoch": 3.0312828388974213, "step": 18750}, {"loss": 0.6207, "grad_norm": 0.8419283032417297, "learning_rate": 0.0002, "epoch": 3.0328995230781666, "step": 18760}, {"loss": 0.5818, "grad_norm": 0.8048048615455627, "learning_rate": 0.0002, "epoch": 3.034516207258912, "step": 18770}, {"loss": 0.6381, "grad_norm": 0.7820217609405518, "learning_rate": 0.0002, "epoch": 3.0361328914396575, "step": 18780}, {"loss": 0.5843, "grad_norm": 0.854721188545227, "learning_rate": 0.0002, "epoch": 3.0377495756204027, "step": 18790}, {"loss": 0.5784, "grad_norm": 0.912092924118042, "learning_rate": 0.0002, "epoch": 3.039366259801148, "step": 18800}, {"loss": 0.5734, "grad_norm": 0.6596226096153259, "learning_rate": 0.0002, "epoch": 3.040982943981893, "step": 18810}, {"loss": 0.5969, "grad_norm": 0.6351348757743835, "learning_rate": 0.0002, "epoch": 3.0425996281626384, "step": 18820}, {"loss": 0.5953, "grad_norm": 0.778188943862915, "learning_rate": 0.0002, "epoch": 3.0442163123433836, "step": 18830}, {"loss": 0.602, "grad_norm": 0.68234783411026, "learning_rate": 0.0002, "epoch": 3.045832996524129, "step": 18840}, {"loss": 0.5785, "grad_norm": 0.998628556728363, "learning_rate": 0.0002, "epoch": 3.047449680704874, "step": 18850}, {"loss": 0.6231, "grad_norm": 0.7393841743469238, "learning_rate": 0.0002, "epoch": 3.0490663648856198, "step": 18860}, {"loss": 0.568, "grad_norm": 0.84438556432724, "learning_rate": 0.0002, "epoch": 3.050683049066365, "step": 18870}, {"loss": 0.6205, "grad_norm": 0.8857501745223999, "learning_rate": 0.0002, "epoch": 3.0522997332471102, "step": 18880}, {"loss": 0.6335, "grad_norm": 0.7208474278450012, "learning_rate": 0.0002, "epoch": 3.0539164174278555, "step": 18890}, {"loss": 0.5998, "grad_norm": 0.7135229110717773, "learning_rate": 0.0002, "epoch": 3.0555331016086007, "step": 18900}, {"loss": 0.5575, "grad_norm": 0.9130001664161682, "learning_rate": 0.0002, "epoch": 3.057149785789346, "step": 18910}, {"loss": 0.5955, "grad_norm": 0.9001716375350952, "learning_rate": 0.0002, "epoch": 3.058766469970091, "step": 18920}, {"loss": 0.6052, "grad_norm": 0.8667559623718262, "learning_rate": 0.0002, "epoch": 3.060383154150837, "step": 18930}, {"loss": 0.5818, "grad_norm": 0.8943959474563599, "learning_rate": 0.0002, "epoch": 3.061999838331582, "step": 18940}, {"loss": 0.5978, "grad_norm": 0.8298377990722656, "learning_rate": 0.0002, "epoch": 3.0636165225123273, "step": 18950}, {"loss": 0.5782, "grad_norm": 0.7935267686843872, "learning_rate": 0.0002, "epoch": 3.0652332066930725, "step": 18960}, {"loss": 0.6434, "grad_norm": 1.1506379842758179, "learning_rate": 0.0002, "epoch": 3.0668498908738178, "step": 18970}, {"loss": 0.5571, "grad_norm": 0.7693049907684326, "learning_rate": 0.0002, "epoch": 3.068466575054563, "step": 18980}, {"loss": 0.5971, "grad_norm": 0.8040135502815247, "learning_rate": 0.0002, "epoch": 3.0700832592353082, "step": 18990}, {"loss": 0.5541, "grad_norm": 0.828404426574707, "learning_rate": 0.0002, "epoch": 3.0716999434160535, "step": 19000}, {"loss": 0.6048, "grad_norm": 0.8811164498329163, "learning_rate": 0.0002, "epoch": 3.073316627596799, "step": 19010}, {"loss": 0.5845, "grad_norm": 1.036205768585205, "learning_rate": 0.0002, "epoch": 3.0749333117775444, "step": 19020}, {"loss": 0.5838, "grad_norm": 0.8857285976409912, "learning_rate": 0.0002, "epoch": 3.0765499959582896, "step": 19030}, {"loss": 0.592, "grad_norm": 0.8392079472541809, "learning_rate": 0.0002, "epoch": 3.078166680139035, "step": 19040}, {"loss": 0.5927, "grad_norm": 1.0287401676177979, "learning_rate": 0.0002, "epoch": 3.07978336431978, "step": 19050}, {"loss": 0.5964, "grad_norm": 1.0086315870285034, "learning_rate": 0.0002, "epoch": 3.0814000485005253, "step": 19060}, {"loss": 0.5567, "grad_norm": 0.9245324730873108, "learning_rate": 0.0002, "epoch": 3.0830167326812705, "step": 19070}, {"loss": 0.5797, "grad_norm": 0.8680877089500427, "learning_rate": 0.0002, "epoch": 3.084633416862016, "step": 19080}, {"loss": 0.5611, "grad_norm": 0.8814793825149536, "learning_rate": 0.0002, "epoch": 3.0862501010427614, "step": 19090}, {"loss": 0.6051, "grad_norm": 0.9234458208084106, "learning_rate": 0.0002, "epoch": 3.0878667852235067, "step": 19100}, {"loss": 0.6209, "grad_norm": 1.1291664838790894, "learning_rate": 0.0002, "epoch": 3.089483469404252, "step": 19110}, {"loss": 0.5695, "grad_norm": 0.9191402792930603, "learning_rate": 0.0002, "epoch": 3.091100153584997, "step": 19120}, {"loss": 0.5856, "grad_norm": 0.7103154063224792, "learning_rate": 0.0002, "epoch": 3.0927168377657424, "step": 19130}, {"loss": 0.6479, "grad_norm": 0.9368883967399597, "learning_rate": 0.0002, "epoch": 3.0943335219464876, "step": 19140}, {"loss": 0.6167, "grad_norm": 0.9676656723022461, "learning_rate": 0.0002, "epoch": 3.095950206127233, "step": 19150}, {"loss": 0.5794, "grad_norm": 0.8739792704582214, "learning_rate": 0.0002, "epoch": 3.0975668903079785, "step": 19160}, {"loss": 0.6112, "grad_norm": 0.8530174493789673, "learning_rate": 0.0002, "epoch": 3.0991835744887237, "step": 19170}, {"loss": 0.6568, "grad_norm": 0.794945478439331, "learning_rate": 0.0002, "epoch": 3.100800258669469, "step": 19180}, {"loss": 0.5928, "grad_norm": 0.9508888125419617, "learning_rate": 0.0002, "epoch": 3.102416942850214, "step": 19190}, {"loss": 0.5757, "grad_norm": 1.0599955320358276, "learning_rate": 0.0002, "epoch": 3.1040336270309594, "step": 19200}, {"loss": 0.6151, "grad_norm": 1.0673625469207764, "learning_rate": 0.0002, "epoch": 3.1056503112117047, "step": 19210}, {"loss": 0.6043, "grad_norm": 0.7739115953445435, "learning_rate": 0.0002, "epoch": 3.10726699539245, "step": 19220}, {"loss": 0.6046, "grad_norm": 0.9884951114654541, "learning_rate": 0.0002, "epoch": 3.1088836795731956, "step": 19230}, {"loss": 0.5932, "grad_norm": 0.862260103225708, "learning_rate": 0.0002, "epoch": 3.110500363753941, "step": 19240}, {"loss": 0.6098, "grad_norm": 0.7690284848213196, "learning_rate": 0.0002, "epoch": 3.112117047934686, "step": 19250}, {"loss": 0.5791, "grad_norm": 0.8758958578109741, "learning_rate": 0.0002, "epoch": 3.1137337321154313, "step": 19260}, {"loss": 0.6136, "grad_norm": 1.0356395244598389, "learning_rate": 0.0002, "epoch": 3.1153504162961765, "step": 19270}, {"loss": 0.6159, "grad_norm": 0.6950937509536743, "learning_rate": 0.0002, "epoch": 3.1169671004769217, "step": 19280}, {"loss": 0.592, "grad_norm": 0.760998010635376, "learning_rate": 0.0002, "epoch": 3.118583784657667, "step": 19290}, {"loss": 0.575, "grad_norm": 0.9335789084434509, "learning_rate": 0.0002, "epoch": 3.1202004688384126, "step": 19300}, {"loss": 0.6139, "grad_norm": 0.9636204242706299, "learning_rate": 0.0002, "epoch": 3.121817153019158, "step": 19310}, {"loss": 0.6001, "grad_norm": 1.0820997953414917, "learning_rate": 0.0002, "epoch": 3.123433837199903, "step": 19320}, {"loss": 0.6542, "grad_norm": 0.7333487272262573, "learning_rate": 0.0002, "epoch": 3.1250505213806483, "step": 19330}, {"loss": 0.6178, "grad_norm": 1.0417509078979492, "learning_rate": 0.0002, "epoch": 3.1266672055613935, "step": 19340}, {"loss": 0.603, "grad_norm": 0.9267749190330505, "learning_rate": 0.0002, "epoch": 3.128283889742139, "step": 19350}, {"loss": 0.6063, "grad_norm": 0.777798593044281, "learning_rate": 0.0002, "epoch": 3.129900573922884, "step": 19360}, {"loss": 0.5913, "grad_norm": 0.8425456881523132, "learning_rate": 0.0002, "epoch": 3.1315172581036297, "step": 19370}, {"loss": 0.6042, "grad_norm": 0.9617102146148682, "learning_rate": 0.0002, "epoch": 3.133133942284375, "step": 19380}, {"loss": 0.633, "grad_norm": 1.0052828788757324, "learning_rate": 0.0002, "epoch": 3.13475062646512, "step": 19390}, {"loss": 0.5713, "grad_norm": 0.7637009024620056, "learning_rate": 0.0002, "epoch": 3.1363673106458654, "step": 19400}, {"loss": 0.5497, "grad_norm": 0.7958088517189026, "learning_rate": 0.0002, "epoch": 3.1379839948266106, "step": 19410}, {"loss": 0.6283, "grad_norm": 0.9161727428436279, "learning_rate": 0.0002, "epoch": 3.139600679007356, "step": 19420}, {"loss": 0.5638, "grad_norm": 0.8402149677276611, "learning_rate": 0.0002, "epoch": 3.141217363188101, "step": 19430}, {"loss": 0.5848, "grad_norm": 1.0056525468826294, "learning_rate": 0.0002, "epoch": 3.1428340473688463, "step": 19440}, {"loss": 0.5954, "grad_norm": 1.0129190683364868, "learning_rate": 0.0002, "epoch": 3.144450731549592, "step": 19450}, {"loss": 0.5808, "grad_norm": 0.790825366973877, "learning_rate": 0.0002, "epoch": 3.146067415730337, "step": 19460}, {"loss": 0.5607, "grad_norm": 1.441665530204773, "learning_rate": 0.0002, "epoch": 3.1476840999110824, "step": 19470}, {"loss": 0.5785, "grad_norm": 0.7846331596374512, "learning_rate": 0.0002, "epoch": 3.1493007840918277, "step": 19480}, {"loss": 0.5892, "grad_norm": 0.7915332913398743, "learning_rate": 0.0002, "epoch": 3.150917468272573, "step": 19490}, {"loss": 0.5759, "grad_norm": 0.933982253074646, "learning_rate": 0.0002, "epoch": 3.152534152453318, "step": 19500}, {"loss": 0.6206, "grad_norm": 1.038408637046814, "learning_rate": 0.0002, "epoch": 3.1541508366340634, "step": 19510}, {"loss": 0.6271, "grad_norm": 1.018935203552246, "learning_rate": 0.0002, "epoch": 3.155767520814809, "step": 19520}, {"loss": 0.6173, "grad_norm": 0.9618112444877625, "learning_rate": 0.0002, "epoch": 3.1573842049955543, "step": 19530}, {"loss": 0.5972, "grad_norm": 0.8900452852249146, "learning_rate": 0.0002, "epoch": 3.1590008891762995, "step": 19540}, {"loss": 0.5925, "grad_norm": 0.8254160284996033, "learning_rate": 0.0002, "epoch": 3.1606175733570447, "step": 19550}, {"loss": 0.625, "grad_norm": 1.004376769065857, "learning_rate": 0.0002, "epoch": 3.16223425753779, "step": 19560}, {"loss": 0.5775, "grad_norm": 1.0490446090698242, "learning_rate": 0.0002, "epoch": 3.163850941718535, "step": 19570}, {"loss": 0.5986, "grad_norm": 0.7387403845787048, "learning_rate": 0.0002, "epoch": 3.1654676258992804, "step": 19580}, {"loss": 0.5898, "grad_norm": 0.7611538171768188, "learning_rate": 0.0002, "epoch": 3.1670843100800257, "step": 19590}, {"loss": 0.5937, "grad_norm": 0.8239886164665222, "learning_rate": 0.0002, "epoch": 3.1687009942607713, "step": 19600}, {"loss": 0.6068, "grad_norm": 0.9327243566513062, "learning_rate": 0.0002, "epoch": 3.1703176784415166, "step": 19610}, {"loss": 0.572, "grad_norm": 0.9662560224533081, "learning_rate": 0.0002, "epoch": 3.171934362622262, "step": 19620}, {"loss": 0.5988, "grad_norm": 0.9183341860771179, "learning_rate": 0.0002, "epoch": 3.173551046803007, "step": 19630}, {"loss": 0.5909, "grad_norm": 0.875066876411438, "learning_rate": 0.0002, "epoch": 3.1751677309837523, "step": 19640}, {"loss": 0.5956, "grad_norm": 0.8567508459091187, "learning_rate": 0.0002, "epoch": 3.1767844151644975, "step": 19650}, {"loss": 0.5805, "grad_norm": 0.6805780529975891, "learning_rate": 0.0002, "epoch": 3.1784010993452427, "step": 19660}, {"loss": 0.6204, "grad_norm": 0.8776944279670715, "learning_rate": 0.0002, "epoch": 3.1800177835259884, "step": 19670}, {"loss": 0.6108, "grad_norm": 0.9036329984664917, "learning_rate": 0.0002, "epoch": 3.1816344677067336, "step": 19680}, {"loss": 0.6238, "grad_norm": 0.8527372479438782, "learning_rate": 0.0002, "epoch": 3.183251151887479, "step": 19690}, {"loss": 0.6089, "grad_norm": 1.1045585870742798, "learning_rate": 0.0002, "epoch": 3.184867836068224, "step": 19700}, {"loss": 0.5491, "grad_norm": 0.9213830828666687, "learning_rate": 0.0002, "epoch": 3.1864845202489693, "step": 19710}, {"loss": 0.618, "grad_norm": 0.8865814805030823, "learning_rate": 0.0002, "epoch": 3.1881012044297146, "step": 19720}, {"loss": 0.5785, "grad_norm": 0.7939388751983643, "learning_rate": 0.0002, "epoch": 3.18971788861046, "step": 19730}, {"loss": 0.5682, "grad_norm": 0.6966729760169983, "learning_rate": 0.0002, "epoch": 3.191334572791205, "step": 19740}, {"loss": 0.5839, "grad_norm": 0.8023673295974731, "learning_rate": 0.0002, "epoch": 3.1929512569719507, "step": 19750}, {"loss": 0.6267, "grad_norm": 0.7992037534713745, "learning_rate": 0.0002, "epoch": 3.194567941152696, "step": 19760}, {"loss": 0.6141, "grad_norm": 0.7412247657775879, "learning_rate": 0.0002, "epoch": 3.196184625333441, "step": 19770}, {"loss": 0.6179, "grad_norm": 0.9598729014396667, "learning_rate": 0.0002, "epoch": 3.1978013095141864, "step": 19780}, {"loss": 0.5685, "grad_norm": 0.8331366777420044, "learning_rate": 0.0002, "epoch": 3.1994179936949316, "step": 19790}, {"loss": 0.6104, "grad_norm": 0.8939169645309448, "learning_rate": 0.0002, "epoch": 3.201034677875677, "step": 19800}, {"loss": 0.6147, "grad_norm": 0.9219734072685242, "learning_rate": 0.0002, "epoch": 3.202651362056422, "step": 19810}, {"loss": 0.6051, "grad_norm": 0.869490385055542, "learning_rate": 0.0002, "epoch": 3.2042680462371678, "step": 19820}, {"loss": 0.5946, "grad_norm": 0.8989706635475159, "learning_rate": 0.0002, "epoch": 3.205884730417913, "step": 19830}, {"loss": 0.5866, "grad_norm": 0.8477165102958679, "learning_rate": 0.0002, "epoch": 3.2075014145986582, "step": 19840}, {"loss": 0.6176, "grad_norm": 0.8720678687095642, "learning_rate": 0.0002, "epoch": 3.2091180987794035, "step": 19850}, {"loss": 0.5694, "grad_norm": 0.861406683921814, "learning_rate": 0.0002, "epoch": 3.2107347829601487, "step": 19860}, {"loss": 0.6264, "grad_norm": 0.8228686451911926, "learning_rate": 0.0002, "epoch": 3.212351467140894, "step": 19870}, {"loss": 0.625, "grad_norm": 0.7936596870422363, "learning_rate": 0.0002, "epoch": 3.213968151321639, "step": 19880}, {"loss": 0.5698, "grad_norm": 1.097377896308899, "learning_rate": 0.0002, "epoch": 3.2155848355023844, "step": 19890}, {"loss": 0.6725, "grad_norm": 0.9544782638549805, "learning_rate": 0.0002, "epoch": 3.21720151968313, "step": 19900}, {"loss": 0.6022, "grad_norm": 0.8240751624107361, "learning_rate": 0.0002, "epoch": 3.2188182038638753, "step": 19910}, {"loss": 0.5659, "grad_norm": 0.8332096338272095, "learning_rate": 0.0002, "epoch": 3.2204348880446205, "step": 19920}, {"loss": 0.6274, "grad_norm": 1.0954567193984985, "learning_rate": 0.0002, "epoch": 3.2220515722253658, "step": 19930}, {"loss": 0.652, "grad_norm": 0.7790525555610657, "learning_rate": 0.0002, "epoch": 3.223668256406111, "step": 19940}, {"loss": 0.5986, "grad_norm": 0.7966814041137695, "learning_rate": 0.0002, "epoch": 3.225284940586856, "step": 19950}, {"loss": 0.5911, "grad_norm": 0.9751881957054138, "learning_rate": 0.0002, "epoch": 3.2269016247676015, "step": 19960}, {"loss": 0.6071, "grad_norm": 0.9856047630310059, "learning_rate": 0.0002, "epoch": 3.228518308948347, "step": 19970}, {"loss": 0.5837, "grad_norm": 1.3062353134155273, "learning_rate": 0.0002, "epoch": 3.2301349931290924, "step": 19980}, {"loss": 0.6588, "grad_norm": 0.9510692358016968, "learning_rate": 0.0002, "epoch": 3.2317516773098376, "step": 19990}, {"loss": 0.6264, "grad_norm": 0.8630342483520508, "learning_rate": 0.0002, "epoch": 3.233368361490583, "step": 20000}, {"loss": 0.6073, "grad_norm": 0.8966519236564636, "learning_rate": 0.0002, "epoch": 3.234985045671328, "step": 20010}, {"loss": 0.612, "grad_norm": 0.7093510627746582, "learning_rate": 0.0002, "epoch": 3.2366017298520733, "step": 20020}, {"loss": 0.585, "grad_norm": 0.7771096229553223, "learning_rate": 0.0002, "epoch": 3.2382184140328185, "step": 20030}, {"loss": 0.5821, "grad_norm": 0.841058075428009, "learning_rate": 0.0002, "epoch": 3.2398350982135637, "step": 20040}, {"loss": 0.6519, "grad_norm": 0.909712553024292, "learning_rate": 0.0002, "epoch": 3.2414517823943094, "step": 20050}, {"loss": 0.6089, "grad_norm": 0.8321019411087036, "learning_rate": 0.0002, "epoch": 3.2430684665750547, "step": 20060}, {"loss": 0.6115, "grad_norm": 0.779901921749115, "learning_rate": 0.0002, "epoch": 3.2446851507558, "step": 20070}, {"loss": 0.6107, "grad_norm": 0.6249170303344727, "learning_rate": 0.0002, "epoch": 3.246301834936545, "step": 20080}, {"loss": 0.603, "grad_norm": 0.8000940680503845, "learning_rate": 0.0002, "epoch": 3.2479185191172903, "step": 20090}, {"loss": 0.6273, "grad_norm": 0.7627735137939453, "learning_rate": 0.0002, "epoch": 3.2495352032980356, "step": 20100}, {"loss": 0.6223, "grad_norm": 0.8780747056007385, "learning_rate": 0.0002, "epoch": 3.2511518874787813, "step": 20110}, {"loss": 0.5969, "grad_norm": 0.772037148475647, "learning_rate": 0.0002, "epoch": 3.2527685716595265, "step": 20120}, {"loss": 0.5843, "grad_norm": 1.0086580514907837, "learning_rate": 0.0002, "epoch": 3.2543852558402717, "step": 20130}, {"loss": 0.5777, "grad_norm": 0.9360289573669434, "learning_rate": 0.0002, "epoch": 3.256001940021017, "step": 20140}, {"loss": 0.5777, "grad_norm": 1.2099586725234985, "learning_rate": 0.0002, "epoch": 3.257618624201762, "step": 20150}, {"loss": 0.624, "grad_norm": 0.8368481397628784, "learning_rate": 0.0002, "epoch": 3.2592353083825074, "step": 20160}, {"loss": 0.5626, "grad_norm": 0.7391039133071899, "learning_rate": 0.0002, "epoch": 3.2608519925632526, "step": 20170}, {"loss": 0.6041, "grad_norm": 0.9122273325920105, "learning_rate": 0.0002, "epoch": 3.262468676743998, "step": 20180}, {"loss": 0.5868, "grad_norm": 0.8502281904220581, "learning_rate": 0.0002, "epoch": 3.264085360924743, "step": 20190}, {"loss": 0.5841, "grad_norm": 1.0926852226257324, "learning_rate": 0.0002, "epoch": 3.265702045105489, "step": 20200}, {"loss": 0.6027, "grad_norm": 0.7902828454971313, "learning_rate": 0.0002, "epoch": 3.267318729286234, "step": 20210}, {"loss": 0.6089, "grad_norm": 0.8724729418754578, "learning_rate": 0.0002, "epoch": 3.2689354134669792, "step": 20220}, {"loss": 0.6242, "grad_norm": 0.8469277024269104, "learning_rate": 0.0002, "epoch": 3.2705520976477245, "step": 20230}, {"loss": 0.644, "grad_norm": 0.8865092992782593, "learning_rate": 0.0002, "epoch": 3.2721687818284697, "step": 20240}, {"loss": 0.6464, "grad_norm": 1.0979334115982056, "learning_rate": 0.0002, "epoch": 3.273785466009215, "step": 20250}, {"loss": 0.647, "grad_norm": 1.0860793590545654, "learning_rate": 0.0002, "epoch": 3.2754021501899606, "step": 20260}, {"loss": 0.6105, "grad_norm": 0.981745183467865, "learning_rate": 0.0002, "epoch": 3.277018834370706, "step": 20270}, {"loss": 0.627, "grad_norm": 0.9155020713806152, "learning_rate": 0.0002, "epoch": 3.278635518551451, "step": 20280}, {"loss": 0.5899, "grad_norm": 0.8436718583106995, "learning_rate": 0.0002, "epoch": 3.2802522027321963, "step": 20290}, {"loss": 0.6371, "grad_norm": 1.0329409837722778, "learning_rate": 0.0002, "epoch": 3.2818688869129415, "step": 20300}, {"loss": 0.6, "grad_norm": 0.9876394271850586, "learning_rate": 0.0002, "epoch": 3.2834855710936868, "step": 20310}, {"loss": 0.5463, "grad_norm": 0.8052917718887329, "learning_rate": 0.0002, "epoch": 3.285102255274432, "step": 20320}, {"loss": 0.5949, "grad_norm": 0.8390680551528931, "learning_rate": 0.0002, "epoch": 3.2867189394551772, "step": 20330}, {"loss": 0.6492, "grad_norm": 0.9515735507011414, "learning_rate": 0.0002, "epoch": 3.288335623635923, "step": 20340}, {"loss": 0.596, "grad_norm": 0.8028870224952698, "learning_rate": 0.0002, "epoch": 3.289952307816668, "step": 20350}, {"loss": 0.634, "grad_norm": 0.862592339515686, "learning_rate": 0.0002, "epoch": 3.2915689919974134, "step": 20360}, {"loss": 0.6345, "grad_norm": 0.7451621890068054, "learning_rate": 0.0002, "epoch": 3.2931856761781586, "step": 20370}, {"loss": 0.6458, "grad_norm": 0.8966776728630066, "learning_rate": 0.0002, "epoch": 3.294802360358904, "step": 20380}, {"loss": 0.5967, "grad_norm": 0.9289216995239258, "learning_rate": 0.0002, "epoch": 3.296419044539649, "step": 20390}, {"loss": 0.6599, "grad_norm": 0.9649626612663269, "learning_rate": 0.0002, "epoch": 3.2980357287203943, "step": 20400}, {"loss": 0.5781, "grad_norm": 1.1953798532485962, "learning_rate": 0.0002, "epoch": 3.29965241290114, "step": 20410}, {"loss": 0.5997, "grad_norm": 0.8929083943367004, "learning_rate": 0.0002, "epoch": 3.301269097081885, "step": 20420}, {"loss": 0.597, "grad_norm": 0.8922014236450195, "learning_rate": 0.0002, "epoch": 3.3028857812626304, "step": 20430}, {"loss": 0.5766, "grad_norm": 0.9754860401153564, "learning_rate": 0.0002, "epoch": 3.3045024654433757, "step": 20440}, {"loss": 0.5653, "grad_norm": 0.8873140215873718, "learning_rate": 0.0002, "epoch": 3.306119149624121, "step": 20450}, {"loss": 0.6138, "grad_norm": 0.857271671295166, "learning_rate": 0.0002, "epoch": 3.307735833804866, "step": 20460}, {"loss": 0.633, "grad_norm": 0.9022141098976135, "learning_rate": 0.0002, "epoch": 3.3093525179856114, "step": 20470}, {"loss": 0.6654, "grad_norm": 0.8614798188209534, "learning_rate": 0.0002, "epoch": 3.3109692021663566, "step": 20480}, {"loss": 0.6254, "grad_norm": 0.8838164210319519, "learning_rate": 0.0002, "epoch": 3.3125858863471023, "step": 20490}, {"loss": 0.5849, "grad_norm": 0.8709736466407776, "learning_rate": 0.0002, "epoch": 3.3142025705278475, "step": 20500}, {"loss": 0.6146, "grad_norm": 0.9533300995826721, "learning_rate": 0.0002, "epoch": 3.3158192547085927, "step": 20510}, {"loss": 0.6029, "grad_norm": 0.8259269595146179, "learning_rate": 0.0002, "epoch": 3.317435938889338, "step": 20520}, {"loss": 0.6268, "grad_norm": 0.8607608079910278, "learning_rate": 0.0002, "epoch": 3.319052623070083, "step": 20530}, {"loss": 0.5676, "grad_norm": 1.0863020420074463, "learning_rate": 0.0002, "epoch": 3.3206693072508284, "step": 20540}, {"loss": 0.6412, "grad_norm": 1.011489987373352, "learning_rate": 0.0002, "epoch": 3.3222859914315737, "step": 20550}, {"loss": 0.6247, "grad_norm": 0.6952177882194519, "learning_rate": 0.0002, "epoch": 3.3239026756123193, "step": 20560}, {"loss": 0.6229, "grad_norm": 0.9638974070549011, "learning_rate": 0.0002, "epoch": 3.3255193597930646, "step": 20570}, {"loss": 0.5882, "grad_norm": 1.0310138463974, "learning_rate": 0.0002, "epoch": 3.32713604397381, "step": 20580}, {"loss": 0.594, "grad_norm": 0.9371318221092224, "learning_rate": 0.0002, "epoch": 3.328752728154555, "step": 20590}, {"loss": 0.6137, "grad_norm": 0.8756691813468933, "learning_rate": 0.0002, "epoch": 3.3303694123353003, "step": 20600}, {"loss": 0.5994, "grad_norm": 1.054175853729248, "learning_rate": 0.0002, "epoch": 3.3319860965160455, "step": 20610}, {"loss": 0.6169, "grad_norm": 0.9074128270149231, "learning_rate": 0.0002, "epoch": 3.3336027806967907, "step": 20620}, {"loss": 0.6138, "grad_norm": 0.906900942325592, "learning_rate": 0.0002, "epoch": 3.335219464877536, "step": 20630}, {"loss": 0.571, "grad_norm": 0.8689333200454712, "learning_rate": 0.0002, "epoch": 3.3368361490582816, "step": 20640}, {"loss": 0.6079, "grad_norm": 0.9889747500419617, "learning_rate": 0.0002, "epoch": 3.338452833239027, "step": 20650}, {"loss": 0.6073, "grad_norm": 1.0685805082321167, "learning_rate": 0.0002, "epoch": 3.340069517419772, "step": 20660}, {"loss": 0.6091, "grad_norm": 0.7495010495185852, "learning_rate": 0.0002, "epoch": 3.3416862016005173, "step": 20670}, {"loss": 0.5883, "grad_norm": 0.8747848272323608, "learning_rate": 0.0002, "epoch": 3.3433028857812626, "step": 20680}, {"loss": 0.604, "grad_norm": 0.9762673377990723, "learning_rate": 0.0002, "epoch": 3.344919569962008, "step": 20690}, {"loss": 0.6784, "grad_norm": 1.0284489393234253, "learning_rate": 0.0002, "epoch": 3.346536254142753, "step": 20700}, {"loss": 0.6464, "grad_norm": 0.7293812036514282, "learning_rate": 0.0002, "epoch": 3.3481529383234987, "step": 20710}, {"loss": 0.609, "grad_norm": 0.8330199122428894, "learning_rate": 0.0002, "epoch": 3.349769622504244, "step": 20720}, {"loss": 0.5729, "grad_norm": 0.9808499217033386, "learning_rate": 0.0002, "epoch": 3.351386306684989, "step": 20730}, {"loss": 0.6315, "grad_norm": 0.9508825540542603, "learning_rate": 0.0002, "epoch": 3.3530029908657344, "step": 20740}, {"loss": 0.5965, "grad_norm": 0.790483832359314, "learning_rate": 0.0002, "epoch": 3.3546196750464796, "step": 20750}, {"loss": 0.6327, "grad_norm": 1.022793173789978, "learning_rate": 0.0002, "epoch": 3.356236359227225, "step": 20760}, {"loss": 0.6439, "grad_norm": 0.8318950533866882, "learning_rate": 0.0002, "epoch": 3.35785304340797, "step": 20770}, {"loss": 0.6037, "grad_norm": 0.7980858087539673, "learning_rate": 0.0002, "epoch": 3.3594697275887153, "step": 20780}, {"loss": 0.6746, "grad_norm": 0.8114802241325378, "learning_rate": 0.0002, "epoch": 3.361086411769461, "step": 20790}, {"loss": 0.6017, "grad_norm": 0.8522519469261169, "learning_rate": 0.0002, "epoch": 3.3627030959502062, "step": 20800}, {"loss": 0.5864, "grad_norm": 0.9142431616783142, "learning_rate": 0.0002, "epoch": 3.3643197801309515, "step": 20810}, {"loss": 0.6331, "grad_norm": 0.771170437335968, "learning_rate": 0.0002, "epoch": 3.3659364643116967, "step": 20820}, {"loss": 0.5879, "grad_norm": 1.0628231763839722, "learning_rate": 0.0002, "epoch": 3.367553148492442, "step": 20830}, {"loss": 0.6533, "grad_norm": 0.9384352564811707, "learning_rate": 0.0002, "epoch": 3.369169832673187, "step": 20840}, {"loss": 0.6292, "grad_norm": 1.1286591291427612, "learning_rate": 0.0002, "epoch": 3.370786516853933, "step": 20850}, {"loss": 0.5986, "grad_norm": 1.1349513530731201, "learning_rate": 0.0002, "epoch": 3.372403201034678, "step": 20860}, {"loss": 0.6413, "grad_norm": 1.0127464532852173, "learning_rate": 0.0002, "epoch": 3.3740198852154233, "step": 20870}, {"loss": 0.6414, "grad_norm": 0.9111971855163574, "learning_rate": 0.0002, "epoch": 3.3756365693961685, "step": 20880}, {"loss": 0.6101, "grad_norm": 0.871356725692749, "learning_rate": 0.0002, "epoch": 3.3772532535769137, "step": 20890}, {"loss": 0.5995, "grad_norm": 0.7774117588996887, "learning_rate": 0.0002, "epoch": 3.378869937757659, "step": 20900}, {"loss": 0.6062, "grad_norm": 1.0089964866638184, "learning_rate": 0.0002, "epoch": 3.380486621938404, "step": 20910}, {"loss": 0.5908, "grad_norm": 0.7855867147445679, "learning_rate": 0.0002, "epoch": 3.3821033061191494, "step": 20920}, {"loss": 0.6373, "grad_norm": 1.3713710308074951, "learning_rate": 0.0002, "epoch": 3.3837199902998947, "step": 20930}, {"loss": 0.6627, "grad_norm": 0.8599116206169128, "learning_rate": 0.0002, "epoch": 3.3853366744806404, "step": 20940}, {"loss": 0.6224, "grad_norm": 0.9392673373222351, "learning_rate": 0.0002, "epoch": 3.3869533586613856, "step": 20950}, {"loss": 0.5855, "grad_norm": 0.8764075040817261, "learning_rate": 0.0002, "epoch": 3.388570042842131, "step": 20960}, {"loss": 0.5734, "grad_norm": 0.8240136504173279, "learning_rate": 0.0002, "epoch": 3.390186727022876, "step": 20970}, {"loss": 0.5783, "grad_norm": 1.0982369184494019, "learning_rate": 0.0002, "epoch": 3.3918034112036213, "step": 20980}, {"loss": 0.5451, "grad_norm": 1.0599013566970825, "learning_rate": 0.0002, "epoch": 3.3934200953843665, "step": 20990}, {"loss": 0.6356, "grad_norm": 0.895438015460968, "learning_rate": 0.0002, "epoch": 3.395036779565112, "step": 21000}, {"loss": 0.6065, "grad_norm": 0.6974841356277466, "learning_rate": 0.0002, "epoch": 3.3966534637458574, "step": 21010}, {"loss": 0.5704, "grad_norm": 0.9571719765663147, "learning_rate": 0.0002, "epoch": 3.3982701479266026, "step": 21020}, {"loss": 0.679, "grad_norm": 0.831912636756897, "learning_rate": 0.0002, "epoch": 3.399886832107348, "step": 21030}, {"loss": 0.6051, "grad_norm": 0.831936240196228, "learning_rate": 0.0002, "epoch": 3.401503516288093, "step": 21040}, {"loss": 0.5857, "grad_norm": 0.7388373613357544, "learning_rate": 0.0002, "epoch": 3.4031202004688383, "step": 21050}, {"loss": 0.6245, "grad_norm": 0.938667356967926, "learning_rate": 0.0002, "epoch": 3.4047368846495836, "step": 21060}, {"loss": 0.6121, "grad_norm": 0.9202313423156738, "learning_rate": 0.0002, "epoch": 3.406353568830329, "step": 21070}, {"loss": 0.6388, "grad_norm": 0.9888381958007812, "learning_rate": 0.0002, "epoch": 3.4079702530110745, "step": 21080}, {"loss": 0.6245, "grad_norm": 0.8526970744132996, "learning_rate": 0.0002, "epoch": 3.4095869371918197, "step": 21090}, {"loss": 0.5914, "grad_norm": 0.7939383387565613, "learning_rate": 0.0002, "epoch": 3.411203621372565, "step": 21100}, {"loss": 0.6066, "grad_norm": 0.9986352920532227, "learning_rate": 0.0002, "epoch": 3.41282030555331, "step": 21110}, {"loss": 0.5947, "grad_norm": 0.8895300030708313, "learning_rate": 0.0002, "epoch": 3.4144369897340554, "step": 21120}, {"loss": 0.6264, "grad_norm": 0.9559482932090759, "learning_rate": 0.0002, "epoch": 3.4160536739148006, "step": 21130}, {"loss": 0.6491, "grad_norm": 0.8351506590843201, "learning_rate": 0.0002, "epoch": 3.417670358095546, "step": 21140}, {"loss": 0.567, "grad_norm": 0.8224456906318665, "learning_rate": 0.0002, "epoch": 3.4192870422762915, "step": 21150}, {"loss": 0.5871, "grad_norm": 1.0110299587249756, "learning_rate": 0.0002, "epoch": 3.4209037264570368, "step": 21160}, {"loss": 0.6116, "grad_norm": 0.82564777135849, "learning_rate": 0.0002, "epoch": 3.422520410637782, "step": 21170}, {"loss": 0.595, "grad_norm": 1.004738688468933, "learning_rate": 0.0002, "epoch": 3.4241370948185272, "step": 21180}, {"loss": 0.6286, "grad_norm": 0.7545676827430725, "learning_rate": 0.0002, "epoch": 3.4257537789992725, "step": 21190}, {"loss": 0.5868, "grad_norm": 0.8918704390525818, "learning_rate": 0.0002, "epoch": 3.4273704631800177, "step": 21200}, {"loss": 0.6542, "grad_norm": 0.8336876034736633, "learning_rate": 0.0002, "epoch": 3.428987147360763, "step": 21210}, {"loss": 0.5824, "grad_norm": 0.8928771018981934, "learning_rate": 0.0002, "epoch": 3.430603831541508, "step": 21220}, {"loss": 0.6468, "grad_norm": 0.7663705945014954, "learning_rate": 0.0002, "epoch": 3.432220515722254, "step": 21230}, {"loss": 0.6693, "grad_norm": 0.8392598628997803, "learning_rate": 0.0002, "epoch": 3.433837199902999, "step": 21240}, {"loss": 0.5971, "grad_norm": 0.8819600343704224, "learning_rate": 0.0002, "epoch": 3.4354538840837443, "step": 21250}, {"loss": 0.6791, "grad_norm": 0.9124642014503479, "learning_rate": 0.0002, "epoch": 3.4370705682644895, "step": 21260}, {"loss": 0.5925, "grad_norm": 0.8329763412475586, "learning_rate": 0.0002, "epoch": 3.4386872524452348, "step": 21270}, {"loss": 0.6541, "grad_norm": 0.9982839822769165, "learning_rate": 0.0002, "epoch": 3.44030393662598, "step": 21280}, {"loss": 0.6441, "grad_norm": 0.9105954766273499, "learning_rate": 0.0002, "epoch": 3.4419206208067252, "step": 21290}, {"loss": 0.6028, "grad_norm": 0.8182359337806702, "learning_rate": 0.0002, "epoch": 3.443537304987471, "step": 21300}, {"loss": 0.5991, "grad_norm": 1.0568904876708984, "learning_rate": 0.0002, "epoch": 3.445153989168216, "step": 21310}, {"loss": 0.6117, "grad_norm": 0.968539834022522, "learning_rate": 0.0002, "epoch": 3.4467706733489614, "step": 21320}, {"loss": 0.6219, "grad_norm": 0.8774511218070984, "learning_rate": 0.0002, "epoch": 3.4483873575297066, "step": 21330}, {"loss": 0.6438, "grad_norm": 0.7598156332969666, "learning_rate": 0.0002, "epoch": 3.450004041710452, "step": 21340}, {"loss": 0.6033, "grad_norm": 1.1012897491455078, "learning_rate": 0.0002, "epoch": 3.451620725891197, "step": 21350}, {"loss": 0.6137, "grad_norm": 0.8040637373924255, "learning_rate": 0.0002, "epoch": 3.4532374100719423, "step": 21360}, {"loss": 0.6173, "grad_norm": 0.8497496247291565, "learning_rate": 0.0002, "epoch": 3.4548540942526875, "step": 21370}, {"loss": 0.6005, "grad_norm": 0.8429915904998779, "learning_rate": 0.0002, "epoch": 3.456470778433433, "step": 21380}, {"loss": 0.6182, "grad_norm": 0.8107112646102905, "learning_rate": 0.0002, "epoch": 3.4580874626141784, "step": 21390}, {"loss": 0.6109, "grad_norm": 1.00872004032135, "learning_rate": 0.0002, "epoch": 3.4597041467949237, "step": 21400}, {"loss": 0.5712, "grad_norm": 0.8266542553901672, "learning_rate": 0.0002, "epoch": 3.461320830975669, "step": 21410}, {"loss": 0.6457, "grad_norm": 0.8972568511962891, "learning_rate": 0.0002, "epoch": 3.462937515156414, "step": 21420}, {"loss": 0.6081, "grad_norm": 1.0781476497650146, "learning_rate": 0.0002, "epoch": 3.4645541993371594, "step": 21430}, {"loss": 0.6303, "grad_norm": 0.9571592807769775, "learning_rate": 0.0002, "epoch": 3.4661708835179046, "step": 21440}, {"loss": 0.6309, "grad_norm": 0.881547212600708, "learning_rate": 0.0002, "epoch": 3.4677875676986503, "step": 21450}, {"loss": 0.6076, "grad_norm": 0.6955338716506958, "learning_rate": 0.0002, "epoch": 3.4694042518793955, "step": 21460}, {"loss": 0.6205, "grad_norm": 0.901187539100647, "learning_rate": 0.0002, "epoch": 3.4710209360601407, "step": 21470}, {"loss": 0.639, "grad_norm": 0.7063511610031128, "learning_rate": 0.0002, "epoch": 3.472637620240886, "step": 21480}, {"loss": 0.6154, "grad_norm": 0.8462792038917542, "learning_rate": 0.0002, "epoch": 3.474254304421631, "step": 21490}, {"loss": 0.61, "grad_norm": 1.1861060857772827, "learning_rate": 0.0002, "epoch": 3.4758709886023764, "step": 21500}, {"loss": 0.6586, "grad_norm": 0.70503169298172, "learning_rate": 0.0002, "epoch": 3.4774876727831217, "step": 21510}, {"loss": 0.6475, "grad_norm": 0.9650066494941711, "learning_rate": 0.0002, "epoch": 3.479104356963867, "step": 21520}, {"loss": 0.6452, "grad_norm": 1.0266852378845215, "learning_rate": 0.0002, "epoch": 3.4807210411446126, "step": 21530}, {"loss": 0.6553, "grad_norm": 0.956372857093811, "learning_rate": 0.0002, "epoch": 3.482337725325358, "step": 21540}, {"loss": 0.6667, "grad_norm": 0.8848432898521423, "learning_rate": 0.0002, "epoch": 3.483954409506103, "step": 21550}, {"loss": 0.6375, "grad_norm": 1.0805351734161377, "learning_rate": 0.0002, "epoch": 3.4855710936868483, "step": 21560}, {"loss": 0.6958, "grad_norm": 0.9279725551605225, "learning_rate": 0.0002, "epoch": 3.4871877778675935, "step": 21570}, {"loss": 0.6354, "grad_norm": 0.9049562215805054, "learning_rate": 0.0002, "epoch": 3.4888044620483387, "step": 21580}, {"loss": 0.6071, "grad_norm": 0.9619429111480713, "learning_rate": 0.0002, "epoch": 3.4904211462290844, "step": 21590}, {"loss": 0.5927, "grad_norm": 0.8508906960487366, "learning_rate": 0.0002, "epoch": 3.4920378304098296, "step": 21600}, {"loss": 0.6115, "grad_norm": 0.8692502379417419, "learning_rate": 0.0002, "epoch": 3.493654514590575, "step": 21610}, {"loss": 0.5878, "grad_norm": 0.8187332153320312, "learning_rate": 0.0002, "epoch": 3.49527119877132, "step": 21620}, {"loss": 0.5874, "grad_norm": 1.145400047302246, "learning_rate": 0.0002, "epoch": 3.4968878829520653, "step": 21630}, {"loss": 0.6313, "grad_norm": 0.8281388282775879, "learning_rate": 0.0002, "epoch": 3.4985045671328105, "step": 21640}, {"loss": 0.6624, "grad_norm": 0.82256019115448, "learning_rate": 0.0002, "epoch": 3.500121251313556, "step": 21650}, {"loss": 0.6346, "grad_norm": 0.9315484762191772, "learning_rate": 0.0002, "epoch": 3.501737935494301, "step": 21660}, {"loss": 0.6086, "grad_norm": 0.7626111507415771, "learning_rate": 0.0002, "epoch": 3.5033546196750462, "step": 21670}, {"loss": 0.6177, "grad_norm": 0.9275059103965759, "learning_rate": 0.0002, "epoch": 3.504971303855792, "step": 21680}, {"loss": 0.64, "grad_norm": 0.7906724810600281, "learning_rate": 0.0002, "epoch": 3.506587988036537, "step": 21690}, {"loss": 0.6015, "grad_norm": 0.8289761543273926, "learning_rate": 0.0002, "epoch": 3.5082046722172824, "step": 21700}, {"loss": 0.6246, "grad_norm": 0.8316431045532227, "learning_rate": 0.0002, "epoch": 3.5098213563980276, "step": 21710}, {"loss": 0.619, "grad_norm": 1.0451812744140625, "learning_rate": 0.0002, "epoch": 3.511438040578773, "step": 21720}, {"loss": 0.632, "grad_norm": 0.928252637386322, "learning_rate": 0.0002, "epoch": 3.513054724759518, "step": 21730}, {"loss": 0.6062, "grad_norm": 0.7985895276069641, "learning_rate": 0.0002, "epoch": 3.5146714089402638, "step": 21740}, {"loss": 0.6463, "grad_norm": 0.6740974187850952, "learning_rate": 0.0002, "epoch": 3.516288093121009, "step": 21750}, {"loss": 0.6138, "grad_norm": 0.8482223749160767, "learning_rate": 0.0002, "epoch": 3.517904777301754, "step": 21760}, {"loss": 0.6277, "grad_norm": 0.889947772026062, "learning_rate": 0.0002, "epoch": 3.5195214614824994, "step": 21770}, {"loss": 0.6174, "grad_norm": 0.8304598927497864, "learning_rate": 0.0002, "epoch": 3.5211381456632447, "step": 21780}, {"loss": 0.6156, "grad_norm": 0.8002981543540955, "learning_rate": 0.0002, "epoch": 3.52275482984399, "step": 21790}, {"loss": 0.5896, "grad_norm": 0.8115083575248718, "learning_rate": 0.0002, "epoch": 3.524371514024735, "step": 21800}, {"loss": 0.6041, "grad_norm": 0.9715048670768738, "learning_rate": 0.0002, "epoch": 3.5259881982054804, "step": 21810}, {"loss": 0.6715, "grad_norm": 1.0910786390304565, "learning_rate": 0.0002, "epoch": 3.5276048823862256, "step": 21820}, {"loss": 0.6543, "grad_norm": 0.8438942432403564, "learning_rate": 0.0002, "epoch": 3.5292215665669713, "step": 21830}, {"loss": 0.6509, "grad_norm": 0.8813382983207703, "learning_rate": 0.0002, "epoch": 3.5308382507477165, "step": 21840}, {"loss": 0.6049, "grad_norm": 0.7092908024787903, "learning_rate": 0.0002, "epoch": 3.5324549349284617, "step": 21850}, {"loss": 0.5678, "grad_norm": 0.8332187533378601, "learning_rate": 0.0002, "epoch": 3.534071619109207, "step": 21860}, {"loss": 0.5896, "grad_norm": 0.8958209156990051, "learning_rate": 0.0002, "epoch": 3.535688303289952, "step": 21870}, {"loss": 0.6476, "grad_norm": 0.824138879776001, "learning_rate": 0.0002, "epoch": 3.5373049874706974, "step": 21880}, {"loss": 0.6022, "grad_norm": 0.8375158309936523, "learning_rate": 0.0002, "epoch": 3.538921671651443, "step": 21890}, {"loss": 0.6019, "grad_norm": 1.0274608135223389, "learning_rate": 0.0002, "epoch": 3.5405383558321883, "step": 21900}, {"loss": 0.6194, "grad_norm": 0.7088932394981384, "learning_rate": 0.0002, "epoch": 3.5421550400129336, "step": 21910}, {"loss": 0.6554, "grad_norm": 0.8172445297241211, "learning_rate": 0.0002, "epoch": 3.543771724193679, "step": 21920}, {"loss": 0.6711, "grad_norm": 0.9904135465621948, "learning_rate": 0.0002, "epoch": 3.545388408374424, "step": 21930}, {"loss": 0.6001, "grad_norm": 0.9900432229042053, "learning_rate": 0.0002, "epoch": 3.5470050925551693, "step": 21940}, {"loss": 0.6195, "grad_norm": 0.8963301181793213, "learning_rate": 0.0002, "epoch": 3.5486217767359145, "step": 21950}, {"loss": 0.5972, "grad_norm": 0.8551464676856995, "learning_rate": 0.0002, "epoch": 3.5502384609166597, "step": 21960}, {"loss": 0.6206, "grad_norm": 1.0916603803634644, "learning_rate": 0.0002, "epoch": 3.551855145097405, "step": 21970}, {"loss": 0.6523, "grad_norm": 0.841598391532898, "learning_rate": 0.0002, "epoch": 3.5534718292781506, "step": 21980}, {"loss": 0.617, "grad_norm": 0.8566757440567017, "learning_rate": 0.0002, "epoch": 3.555088513458896, "step": 21990}, {"loss": 0.6192, "grad_norm": 1.0145052671432495, "learning_rate": 0.0002, "epoch": 3.556705197639641, "step": 22000}, {"loss": 0.6173, "grad_norm": 0.9293754696846008, "learning_rate": 0.0002, "epoch": 3.5583218818203863, "step": 22010}, {"loss": 0.612, "grad_norm": 0.9568536281585693, "learning_rate": 0.0002, "epoch": 3.5599385660011316, "step": 22020}, {"loss": 0.641, "grad_norm": 0.8613139986991882, "learning_rate": 0.0002, "epoch": 3.5615552501818772, "step": 22030}, {"loss": 0.6496, "grad_norm": 0.8179237246513367, "learning_rate": 0.0002, "epoch": 3.5631719343626225, "step": 22040}, {"loss": 0.574, "grad_norm": 0.9059830904006958, "learning_rate": 0.0002, "epoch": 3.5647886185433677, "step": 22050}, {"loss": 0.6448, "grad_norm": 1.0068252086639404, "learning_rate": 0.0002, "epoch": 3.566405302724113, "step": 22060}, {"loss": 0.6239, "grad_norm": 0.9682072997093201, "learning_rate": 0.0002, "epoch": 3.568021986904858, "step": 22070}, {"loss": 0.6808, "grad_norm": 0.8514005541801453, "learning_rate": 0.0002, "epoch": 3.5696386710856034, "step": 22080}, {"loss": 0.5956, "grad_norm": 0.8327770829200745, "learning_rate": 0.0002, "epoch": 3.5712553552663486, "step": 22090}, {"loss": 0.5976, "grad_norm": 1.024976372718811, "learning_rate": 0.0002, "epoch": 3.572872039447094, "step": 22100}, {"loss": 0.624, "grad_norm": 0.7721174955368042, "learning_rate": 0.0002, "epoch": 3.574488723627839, "step": 22110}, {"loss": 0.5896, "grad_norm": 1.0351054668426514, "learning_rate": 0.0002, "epoch": 3.5761054078085843, "step": 22120}, {"loss": 0.6379, "grad_norm": 0.9680907130241394, "learning_rate": 0.0002, "epoch": 3.57772209198933, "step": 22130}, {"loss": 0.6194, "grad_norm": 0.8016974925994873, "learning_rate": 0.0002, "epoch": 3.5793387761700752, "step": 22140}, {"loss": 0.6387, "grad_norm": 1.0109003782272339, "learning_rate": 0.0002, "epoch": 3.5809554603508205, "step": 22150}, {"loss": 0.6368, "grad_norm": 1.0473392009735107, "learning_rate": 0.0002, "epoch": 3.5825721445315657, "step": 22160}, {"loss": 0.6353, "grad_norm": 0.8686613440513611, "learning_rate": 0.0002, "epoch": 3.584188828712311, "step": 22170}, {"loss": 0.5791, "grad_norm": 0.869149923324585, "learning_rate": 0.0002, "epoch": 3.5858055128930566, "step": 22180}, {"loss": 0.5895, "grad_norm": 0.9769062995910645, "learning_rate": 0.0002, "epoch": 3.587422197073802, "step": 22190}, {"loss": 0.5939, "grad_norm": 0.779636561870575, "learning_rate": 0.0002, "epoch": 3.589038881254547, "step": 22200}, {"loss": 0.5875, "grad_norm": 0.9063841104507446, "learning_rate": 0.0002, "epoch": 3.5906555654352923, "step": 22210}, {"loss": 0.5671, "grad_norm": 0.9216037392616272, "learning_rate": 0.0002, "epoch": 3.5922722496160375, "step": 22220}, {"loss": 0.6484, "grad_norm": 1.0217336416244507, "learning_rate": 0.0002, "epoch": 3.5938889337967828, "step": 22230}, {"loss": 0.6511, "grad_norm": 0.8513161540031433, "learning_rate": 0.0002, "epoch": 3.595505617977528, "step": 22240}, {"loss": 0.6301, "grad_norm": 0.8084813952445984, "learning_rate": 0.0002, "epoch": 3.597122302158273, "step": 22250}, {"loss": 0.6197, "grad_norm": 0.8524802923202515, "learning_rate": 0.0002, "epoch": 3.5987389863390185, "step": 22260}, {"loss": 0.5599, "grad_norm": 0.9356237649917603, "learning_rate": 0.0002, "epoch": 3.600355670519764, "step": 22270}, {"loss": 0.628, "grad_norm": 1.009600281715393, "learning_rate": 0.0002, "epoch": 3.6019723547005094, "step": 22280}, {"loss": 0.6179, "grad_norm": 0.9900581240653992, "learning_rate": 0.0002, "epoch": 3.6035890388812546, "step": 22290}, {"loss": 0.5725, "grad_norm": 1.062495231628418, "learning_rate": 0.0002, "epoch": 3.605205723062, "step": 22300}, {"loss": 0.607, "grad_norm": 0.8832381367683411, "learning_rate": 0.0002, "epoch": 3.606822407242745, "step": 22310}, {"loss": 0.6215, "grad_norm": 0.9284297823905945, "learning_rate": 0.0002, "epoch": 3.6084390914234903, "step": 22320}, {"loss": 0.685, "grad_norm": 1.2381829023361206, "learning_rate": 0.0002, "epoch": 3.610055775604236, "step": 22330}, {"loss": 0.6181, "grad_norm": 0.929434597492218, "learning_rate": 0.0002, "epoch": 3.611672459784981, "step": 22340}, {"loss": 0.6141, "grad_norm": 0.9714490175247192, "learning_rate": 0.0002, "epoch": 3.6132891439657264, "step": 22350}, {"loss": 0.6861, "grad_norm": 0.808014988899231, "learning_rate": 0.0002, "epoch": 3.6149058281464717, "step": 22360}, {"loss": 0.6428, "grad_norm": 1.0364398956298828, "learning_rate": 0.0002, "epoch": 3.616522512327217, "step": 22370}, {"loss": 0.6337, "grad_norm": 0.7858489751815796, "learning_rate": 0.0002, "epoch": 3.618139196507962, "step": 22380}, {"loss": 0.6214, "grad_norm": 0.9920870065689087, "learning_rate": 0.0002, "epoch": 3.6197558806887074, "step": 22390}, {"loss": 0.6659, "grad_norm": 0.9183220863342285, "learning_rate": 0.0002, "epoch": 3.6213725648694526, "step": 22400}, {"loss": 0.6036, "grad_norm": 0.9826246500015259, "learning_rate": 0.0002, "epoch": 3.622989249050198, "step": 22410}, {"loss": 0.6441, "grad_norm": 0.8632931113243103, "learning_rate": 0.0002, "epoch": 3.6246059332309435, "step": 22420}, {"loss": 0.6124, "grad_norm": 0.8468965291976929, "learning_rate": 0.0002, "epoch": 3.6262226174116887, "step": 22430}, {"loss": 0.6328, "grad_norm": 0.8466871976852417, "learning_rate": 0.0002, "epoch": 3.627839301592434, "step": 22440}, {"loss": 0.5941, "grad_norm": 0.9501169919967651, "learning_rate": 0.0002, "epoch": 3.629455985773179, "step": 22450}, {"loss": 0.6069, "grad_norm": 0.8906720876693726, "learning_rate": 0.0002, "epoch": 3.6310726699539244, "step": 22460}, {"loss": 0.6928, "grad_norm": 0.7400227189064026, "learning_rate": 0.0002, "epoch": 3.6326893541346696, "step": 22470}, {"loss": 0.6337, "grad_norm": 0.9756355881690979, "learning_rate": 0.0002, "epoch": 3.6343060383154153, "step": 22480}, {"loss": 0.6203, "grad_norm": 0.7504993081092834, "learning_rate": 0.0002, "epoch": 3.6359227224961606, "step": 22490}, {"loss": 0.6302, "grad_norm": 0.9270039200782776, "learning_rate": 0.0002, "epoch": 3.637539406676906, "step": 22500}, {"loss": 0.6026, "grad_norm": 0.8841686844825745, "learning_rate": 0.0002, "epoch": 3.639156090857651, "step": 22510}, {"loss": 0.6098, "grad_norm": 0.8533213138580322, "learning_rate": 0.0002, "epoch": 3.6407727750383962, "step": 22520}, {"loss": 0.6412, "grad_norm": 1.0052043199539185, "learning_rate": 0.0002, "epoch": 3.6423894592191415, "step": 22530}, {"loss": 0.6363, "grad_norm": 1.0323461294174194, "learning_rate": 0.0002, "epoch": 3.6440061433998867, "step": 22540}, {"loss": 0.6545, "grad_norm": 0.8654312491416931, "learning_rate": 0.0002, "epoch": 3.645622827580632, "step": 22550}, {"loss": 0.6155, "grad_norm": 0.6400038003921509, "learning_rate": 0.0002, "epoch": 3.647239511761377, "step": 22560}, {"loss": 0.5829, "grad_norm": 0.8061298727989197, "learning_rate": 0.0002, "epoch": 3.648856195942123, "step": 22570}, {"loss": 0.6388, "grad_norm": 0.9257854223251343, "learning_rate": 0.0002, "epoch": 3.650472880122868, "step": 22580}, {"loss": 0.6409, "grad_norm": 0.8439396619796753, "learning_rate": 0.0002, "epoch": 3.6520895643036133, "step": 22590}, {"loss": 0.5996, "grad_norm": 0.7764544486999512, "learning_rate": 0.0002, "epoch": 3.6537062484843585, "step": 22600}, {"loss": 0.6434, "grad_norm": 1.125451683998108, "learning_rate": 0.0002, "epoch": 3.6553229326651038, "step": 22610}, {"loss": 0.6579, "grad_norm": 0.7523018717765808, "learning_rate": 0.0002, "epoch": 3.656939616845849, "step": 22620}, {"loss": 0.6476, "grad_norm": 1.071026086807251, "learning_rate": 0.0002, "epoch": 3.6585563010265947, "step": 22630}, {"loss": 0.6459, "grad_norm": 0.945791482925415, "learning_rate": 0.0002, "epoch": 3.66017298520734, "step": 22640}, {"loss": 0.659, "grad_norm": 0.8001811504364014, "learning_rate": 0.0002, "epoch": 3.661789669388085, "step": 22650}, {"loss": 0.6385, "grad_norm": 0.9700816869735718, "learning_rate": 0.0002, "epoch": 3.6634063535688304, "step": 22660}, {"loss": 0.6337, "grad_norm": 0.9053242206573486, "learning_rate": 0.0002, "epoch": 3.6650230377495756, "step": 22670}, {"loss": 0.6335, "grad_norm": 0.944362461566925, "learning_rate": 0.0002, "epoch": 3.666639721930321, "step": 22680}, {"loss": 0.6235, "grad_norm": 1.067489504814148, "learning_rate": 0.0002, "epoch": 3.668256406111066, "step": 22690}, {"loss": 0.698, "grad_norm": 1.0984995365142822, "learning_rate": 0.0002, "epoch": 3.6698730902918113, "step": 22700}, {"loss": 0.6717, "grad_norm": 0.9336317777633667, "learning_rate": 0.0002, "epoch": 3.6714897744725565, "step": 22710}, {"loss": 0.6195, "grad_norm": 0.9261918663978577, "learning_rate": 0.0002, "epoch": 3.673106458653302, "step": 22720}, {"loss": 0.6332, "grad_norm": 0.8648008704185486, "learning_rate": 0.0002, "epoch": 3.6747231428340474, "step": 22730}, {"loss": 0.6576, "grad_norm": 0.7225083708763123, "learning_rate": 0.0002, "epoch": 3.6763398270147927, "step": 22740}, {"loss": 0.6406, "grad_norm": 0.9258282780647278, "learning_rate": 0.0002, "epoch": 3.677956511195538, "step": 22750}, {"loss": 0.6397, "grad_norm": 0.70876145362854, "learning_rate": 0.0002, "epoch": 3.679573195376283, "step": 22760}, {"loss": 0.6821, "grad_norm": 0.8780210018157959, "learning_rate": 0.0002, "epoch": 3.681189879557029, "step": 22770}, {"loss": 0.6036, "grad_norm": 0.8075440526008606, "learning_rate": 0.0002, "epoch": 3.682806563737774, "step": 22780}, {"loss": 0.6561, "grad_norm": 0.8503130674362183, "learning_rate": 0.0002, "epoch": 3.6844232479185193, "step": 22790}, {"loss": 0.6082, "grad_norm": 0.8413618206977844, "learning_rate": 0.0002, "epoch": 3.6860399320992645, "step": 22800}, {"loss": 0.614, "grad_norm": 0.8675165176391602, "learning_rate": 0.0002, "epoch": 3.6876566162800097, "step": 22810}, {"loss": 0.6157, "grad_norm": 0.8235884308815002, "learning_rate": 0.0002, "epoch": 3.689273300460755, "step": 22820}, {"loss": 0.5708, "grad_norm": 0.9477725625038147, "learning_rate": 0.0002, "epoch": 3.6908899846415, "step": 22830}, {"loss": 0.6481, "grad_norm": 0.7883533835411072, "learning_rate": 0.0002, "epoch": 3.6925066688222454, "step": 22840}, {"loss": 0.5872, "grad_norm": 1.047913908958435, "learning_rate": 0.0002, "epoch": 3.6941233530029907, "step": 22850}, {"loss": 0.6176, "grad_norm": 0.9171528816223145, "learning_rate": 0.0002, "epoch": 3.695740037183736, "step": 22860}, {"loss": 0.6204, "grad_norm": 0.9338192343711853, "learning_rate": 0.0002, "epoch": 3.6973567213644816, "step": 22870}, {"loss": 0.686, "grad_norm": 0.8799443244934082, "learning_rate": 0.0002, "epoch": 3.698973405545227, "step": 22880}, {"loss": 0.6206, "grad_norm": 0.8515434861183167, "learning_rate": 0.0002, "epoch": 3.700590089725972, "step": 22890}, {"loss": 0.5954, "grad_norm": 0.7805591821670532, "learning_rate": 0.0002, "epoch": 3.7022067739067173, "step": 22900}, {"loss": 0.6108, "grad_norm": 0.8470911979675293, "learning_rate": 0.0002, "epoch": 3.7038234580874625, "step": 22910}, {"loss": 0.6557, "grad_norm": 0.9452309012413025, "learning_rate": 0.0002, "epoch": 3.705440142268208, "step": 22920}, {"loss": 0.6529, "grad_norm": 0.950243353843689, "learning_rate": 0.0002, "epoch": 3.7070568264489534, "step": 22930}, {"loss": 0.6364, "grad_norm": 0.7882499098777771, "learning_rate": 0.0002, "epoch": 3.7086735106296986, "step": 22940}, {"loss": 0.6462, "grad_norm": 0.8307787775993347, "learning_rate": 0.0002, "epoch": 3.710290194810444, "step": 22950}, {"loss": 0.6371, "grad_norm": 1.0970630645751953, "learning_rate": 0.0002, "epoch": 3.711906878991189, "step": 22960}, {"loss": 0.6281, "grad_norm": 0.8269566297531128, "learning_rate": 0.0002, "epoch": 3.7135235631719343, "step": 22970}, {"loss": 0.6561, "grad_norm": 0.8306704759597778, "learning_rate": 0.0002, "epoch": 3.7151402473526796, "step": 22980}, {"loss": 0.6418, "grad_norm": 0.9710225462913513, "learning_rate": 0.0002, "epoch": 3.716756931533425, "step": 22990}, {"loss": 0.6639, "grad_norm": 0.8890530467033386, "learning_rate": 0.0002, "epoch": 3.71837361571417, "step": 23000}, {"loss": 0.6084, "grad_norm": 0.883522629737854, "learning_rate": 0.0002, "epoch": 3.7199902998949153, "step": 23010}, {"loss": 0.6183, "grad_norm": 0.8662652373313904, "learning_rate": 0.0002, "epoch": 3.721606984075661, "step": 23020}, {"loss": 0.6266, "grad_norm": 0.7228406667709351, "learning_rate": 0.0002, "epoch": 3.723223668256406, "step": 23030}, {"loss": 0.6417, "grad_norm": 1.060792088508606, "learning_rate": 0.0002, "epoch": 3.7248403524371514, "step": 23040}, {"loss": 0.6346, "grad_norm": 1.0119613409042358, "learning_rate": 0.0002, "epoch": 3.7264570366178966, "step": 23050}, {"loss": 0.6466, "grad_norm": 0.9212996959686279, "learning_rate": 0.0002, "epoch": 3.728073720798642, "step": 23060}, {"loss": 0.6454, "grad_norm": 0.925690233707428, "learning_rate": 0.0002, "epoch": 3.7296904049793875, "step": 23070}, {"loss": 0.615, "grad_norm": 0.8323310613632202, "learning_rate": 0.0002, "epoch": 3.7313070891601328, "step": 23080}, {"loss": 0.679, "grad_norm": 0.8966048955917358, "learning_rate": 0.0002, "epoch": 3.732923773340878, "step": 23090}, {"loss": 0.6151, "grad_norm": 0.8995837569236755, "learning_rate": 0.0002, "epoch": 3.7345404575216232, "step": 23100}, {"loss": 0.6143, "grad_norm": 0.8748890161514282, "learning_rate": 0.0002, "epoch": 3.7361571417023685, "step": 23110}, {"loss": 0.6246, "grad_norm": 0.7985540628433228, "learning_rate": 0.0002, "epoch": 3.7377738258831137, "step": 23120}, {"loss": 0.6279, "grad_norm": 1.0240917205810547, "learning_rate": 0.0002, "epoch": 3.739390510063859, "step": 23130}, {"loss": 0.6747, "grad_norm": 0.9181789755821228, "learning_rate": 0.0002, "epoch": 3.741007194244604, "step": 23140}, {"loss": 0.6026, "grad_norm": 0.8896583914756775, "learning_rate": 0.0002, "epoch": 3.7426238784253494, "step": 23150}, {"loss": 0.5972, "grad_norm": 0.8635515570640564, "learning_rate": 0.0002, "epoch": 3.744240562606095, "step": 23160}, {"loss": 0.6683, "grad_norm": 0.8873575329780579, "learning_rate": 0.0002, "epoch": 3.7458572467868403, "step": 23170}, {"loss": 0.6143, "grad_norm": 0.9807148575782776, "learning_rate": 0.0002, "epoch": 3.7474739309675855, "step": 23180}, {"loss": 0.6381, "grad_norm": 0.900477945804596, "learning_rate": 0.0002, "epoch": 3.7490906151483308, "step": 23190}, {"loss": 0.6542, "grad_norm": 0.9379992485046387, "learning_rate": 0.0002, "epoch": 3.750707299329076, "step": 23200}, {"loss": 0.6015, "grad_norm": 0.9649890661239624, "learning_rate": 0.0002, "epoch": 3.752323983509821, "step": 23210}, {"loss": 0.6735, "grad_norm": 0.824442446231842, "learning_rate": 0.0002, "epoch": 3.753940667690567, "step": 23220}, {"loss": 0.5992, "grad_norm": 0.8896150588989258, "learning_rate": 0.0002, "epoch": 3.755557351871312, "step": 23230}, {"loss": 0.6081, "grad_norm": 0.751249372959137, "learning_rate": 0.0002, "epoch": 3.7571740360520574, "step": 23240}, {"loss": 0.629, "grad_norm": 0.9392193555831909, "learning_rate": 0.0002, "epoch": 3.7587907202328026, "step": 23250}, {"loss": 0.6209, "grad_norm": 0.9284586310386658, "learning_rate": 0.0002, "epoch": 3.760407404413548, "step": 23260}, {"loss": 0.6414, "grad_norm": 0.7738175392150879, "learning_rate": 0.0002, "epoch": 3.762024088594293, "step": 23270}, {"loss": 0.6743, "grad_norm": 0.9252978563308716, "learning_rate": 0.0002, "epoch": 3.7636407727750383, "step": 23280}, {"loss": 0.5984, "grad_norm": 0.9501895904541016, "learning_rate": 0.0002, "epoch": 3.7652574569557835, "step": 23290}, {"loss": 0.6568, "grad_norm": 0.9416276216506958, "learning_rate": 0.0002, "epoch": 3.7668741411365287, "step": 23300}, {"loss": 0.6507, "grad_norm": 0.7076631784439087, "learning_rate": 0.0002, "epoch": 3.7684908253172744, "step": 23310}, {"loss": 0.6329, "grad_norm": 0.9864492416381836, "learning_rate": 0.0002, "epoch": 3.7701075094980196, "step": 23320}, {"loss": 0.6537, "grad_norm": 0.8450456261634827, "learning_rate": 0.0002, "epoch": 3.771724193678765, "step": 23330}, {"loss": 0.658, "grad_norm": 1.0768941640853882, "learning_rate": 0.0002, "epoch": 3.77334087785951, "step": 23340}, {"loss": 0.6408, "grad_norm": 0.9956819415092468, "learning_rate": 0.0002, "epoch": 3.7749575620402553, "step": 23350}, {"loss": 0.6464, "grad_norm": 0.9234658479690552, "learning_rate": 0.0002, "epoch": 3.7765742462210006, "step": 23360}, {"loss": 0.6542, "grad_norm": 1.0993858575820923, "learning_rate": 0.0002, "epoch": 3.7781909304017463, "step": 23370}, {"loss": 0.6391, "grad_norm": 0.923159658908844, "learning_rate": 0.0002, "epoch": 3.7798076145824915, "step": 23380}, {"loss": 0.6625, "grad_norm": 0.9311541318893433, "learning_rate": 0.0002, "epoch": 3.7814242987632367, "step": 23390}, {"loss": 0.6535, "grad_norm": 0.919681191444397, "learning_rate": 0.0002, "epoch": 3.783040982943982, "step": 23400}, {"loss": 0.6138, "grad_norm": 1.7406195402145386, "learning_rate": 0.0002, "epoch": 3.784657667124727, "step": 23410}, {"loss": 0.657, "grad_norm": 0.7789074182510376, "learning_rate": 0.0002, "epoch": 3.7862743513054724, "step": 23420}, {"loss": 0.658, "grad_norm": 0.8302814960479736, "learning_rate": 0.0002, "epoch": 3.7878910354862176, "step": 23430}, {"loss": 0.649, "grad_norm": 0.8089349269866943, "learning_rate": 0.0002, "epoch": 3.789507719666963, "step": 23440}, {"loss": 0.6682, "grad_norm": 0.9006284475326538, "learning_rate": 0.0002, "epoch": 3.791124403847708, "step": 23450}, {"loss": 0.6335, "grad_norm": 0.8426766991615295, "learning_rate": 0.0002, "epoch": 3.7927410880284538, "step": 23460}, {"loss": 0.6364, "grad_norm": 1.2576252222061157, "learning_rate": 0.0002, "epoch": 3.794357772209199, "step": 23470}, {"loss": 0.6324, "grad_norm": 1.0307610034942627, "learning_rate": 0.0002, "epoch": 3.7959744563899442, "step": 23480}, {"loss": 0.6262, "grad_norm": 0.8525972962379456, "learning_rate": 0.0002, "epoch": 3.7975911405706895, "step": 23490}, {"loss": 0.6757, "grad_norm": 1.159039855003357, "learning_rate": 0.0002, "epoch": 3.7992078247514347, "step": 23500}, {"loss": 0.6414, "grad_norm": 1.4193549156188965, "learning_rate": 0.0002, "epoch": 3.80082450893218, "step": 23510}, {"loss": 0.6413, "grad_norm": 0.8245543837547302, "learning_rate": 0.0002, "epoch": 3.8024411931129256, "step": 23520}, {"loss": 0.6417, "grad_norm": 0.8847230076789856, "learning_rate": 0.0002, "epoch": 3.804057877293671, "step": 23530}, {"loss": 0.6415, "grad_norm": 0.9574624300003052, "learning_rate": 0.0002, "epoch": 3.805674561474416, "step": 23540}, {"loss": 0.5765, "grad_norm": 1.048020601272583, "learning_rate": 0.0002, "epoch": 3.8072912456551613, "step": 23550}, {"loss": 0.6497, "grad_norm": 0.8302255868911743, "learning_rate": 0.0002, "epoch": 3.8089079298359065, "step": 23560}, {"loss": 0.6534, "grad_norm": 0.8269215822219849, "learning_rate": 0.0002, "epoch": 3.8105246140166518, "step": 23570}, {"loss": 0.6294, "grad_norm": 0.9375753402709961, "learning_rate": 0.0002, "epoch": 3.812141298197397, "step": 23580}, {"loss": 0.6132, "grad_norm": 1.0234097242355347, "learning_rate": 0.0002, "epoch": 3.8137579823781422, "step": 23590}, {"loss": 0.6625, "grad_norm": 0.8978445529937744, "learning_rate": 0.0002, "epoch": 3.8153746665588875, "step": 23600}, {"loss": 0.6315, "grad_norm": 0.7929515838623047, "learning_rate": 0.0002, "epoch": 3.816991350739633, "step": 23610}, {"loss": 0.6387, "grad_norm": 1.3255881071090698, "learning_rate": 0.0002, "epoch": 3.8186080349203784, "step": 23620}, {"loss": 0.5947, "grad_norm": 0.9188598990440369, "learning_rate": 0.0002, "epoch": 3.8202247191011236, "step": 23630}, {"loss": 0.6152, "grad_norm": 0.8811675906181335, "learning_rate": 0.0002, "epoch": 3.821841403281869, "step": 23640}, {"loss": 0.6253, "grad_norm": 0.8061038255691528, "learning_rate": 0.0002, "epoch": 3.823458087462614, "step": 23650}, {"loss": 0.6517, "grad_norm": 0.9975376129150391, "learning_rate": 0.0002, "epoch": 3.8250747716433597, "step": 23660}, {"loss": 0.6288, "grad_norm": 0.8036105036735535, "learning_rate": 0.0002, "epoch": 3.826691455824105, "step": 23670}, {"loss": 0.6845, "grad_norm": 0.7401984333992004, "learning_rate": 0.0002, "epoch": 3.82830814000485, "step": 23680}, {"loss": 0.6423, "grad_norm": 0.829753041267395, "learning_rate": 0.0002, "epoch": 3.8299248241855954, "step": 23690}, {"loss": 0.6611, "grad_norm": 0.8753240704536438, "learning_rate": 0.0002, "epoch": 3.8315415083663407, "step": 23700}, {"loss": 0.6686, "grad_norm": 0.8157842755317688, "learning_rate": 0.0002, "epoch": 3.833158192547086, "step": 23710}, {"loss": 0.6181, "grad_norm": 0.6183798909187317, "learning_rate": 0.0002, "epoch": 3.834774876727831, "step": 23720}, {"loss": 0.5965, "grad_norm": 0.9548442363739014, "learning_rate": 0.0002, "epoch": 3.8363915609085764, "step": 23730}, {"loss": 0.6456, "grad_norm": 0.8319669961929321, "learning_rate": 0.0002, "epoch": 3.8380082450893216, "step": 23740}, {"loss": 0.6585, "grad_norm": 0.9718693494796753, "learning_rate": 0.0002, "epoch": 3.839624929270067, "step": 23750}, {"loss": 0.6518, "grad_norm": 0.8672235012054443, "learning_rate": 0.0002, "epoch": 3.8412416134508125, "step": 23760}, {"loss": 0.6774, "grad_norm": 1.1210707426071167, "learning_rate": 0.0002, "epoch": 3.8428582976315577, "step": 23770}, {"loss": 0.5923, "grad_norm": 0.9177767634391785, "learning_rate": 0.0002, "epoch": 3.844474981812303, "step": 23780}, {"loss": 0.6286, "grad_norm": 0.8714171648025513, "learning_rate": 0.0002, "epoch": 3.846091665993048, "step": 23790}, {"loss": 0.6302, "grad_norm": 1.1853246688842773, "learning_rate": 0.0002, "epoch": 3.8477083501737934, "step": 23800}, {"loss": 0.6144, "grad_norm": 0.8091260194778442, "learning_rate": 0.0002, "epoch": 3.849325034354539, "step": 23810}, {"loss": 0.658, "grad_norm": 0.9710774421691895, "learning_rate": 0.0002, "epoch": 3.8509417185352843, "step": 23820}, {"loss": 0.6151, "grad_norm": 0.7648707628250122, "learning_rate": 0.0002, "epoch": 3.8525584027160296, "step": 23830}, {"loss": 0.6013, "grad_norm": 0.7809253931045532, "learning_rate": 0.0002, "epoch": 3.854175086896775, "step": 23840}, {"loss": 0.6006, "grad_norm": 0.8337951898574829, "learning_rate": 0.0002, "epoch": 3.85579177107752, "step": 23850}, {"loss": 0.6456, "grad_norm": 0.9271913170814514, "learning_rate": 0.0002, "epoch": 3.8574084552582653, "step": 23860}, {"loss": 0.6671, "grad_norm": 0.985334038734436, "learning_rate": 0.0002, "epoch": 3.8590251394390105, "step": 23870}, {"loss": 0.6693, "grad_norm": 0.8458583354949951, "learning_rate": 0.0002, "epoch": 3.8606418236197557, "step": 23880}, {"loss": 0.6207, "grad_norm": 1.015348196029663, "learning_rate": 0.0002, "epoch": 3.862258507800501, "step": 23890}, {"loss": 0.649, "grad_norm": 1.0121688842773438, "learning_rate": 0.0002, "epoch": 3.8638751919812466, "step": 23900}, {"loss": 0.5921, "grad_norm": 0.8883971571922302, "learning_rate": 0.0002, "epoch": 3.865491876161992, "step": 23910}, {"loss": 0.6597, "grad_norm": 1.028086543083191, "learning_rate": 0.0002, "epoch": 3.867108560342737, "step": 23920}, {"loss": 0.6654, "grad_norm": 0.9645734429359436, "learning_rate": 0.0002, "epoch": 3.8687252445234823, "step": 23930}, {"loss": 0.6328, "grad_norm": 0.8235350251197815, "learning_rate": 0.0002, "epoch": 3.8703419287042276, "step": 23940}, {"loss": 0.6387, "grad_norm": 1.0298916101455688, "learning_rate": 0.0002, "epoch": 3.871958612884973, "step": 23950}, {"loss": 0.5966, "grad_norm": 1.0063377618789673, "learning_rate": 0.0002, "epoch": 3.8735752970657185, "step": 23960}, {"loss": 0.6234, "grad_norm": 0.9230626821517944, "learning_rate": 0.0002, "epoch": 3.8751919812464637, "step": 23970}, {"loss": 0.6159, "grad_norm": 0.9243063926696777, "learning_rate": 0.0002, "epoch": 3.876808665427209, "step": 23980}, {"loss": 0.6035, "grad_norm": 1.0211291313171387, "learning_rate": 0.0002, "epoch": 3.878425349607954, "step": 23990}, {"loss": 0.6351, "grad_norm": 0.7800535559654236, "learning_rate": 0.0002, "epoch": 3.8800420337886994, "step": 24000}, {"loss": 0.7, "grad_norm": 0.7904248833656311, "learning_rate": 0.0002, "epoch": 3.8816587179694446, "step": 24010}, {"loss": 0.6516, "grad_norm": 1.1975988149642944, "learning_rate": 0.0002, "epoch": 3.88327540215019, "step": 24020}, {"loss": 0.6006, "grad_norm": 1.0626593828201294, "learning_rate": 0.0002, "epoch": 3.884892086330935, "step": 24030}, {"loss": 0.6115, "grad_norm": 0.9012193083763123, "learning_rate": 0.0002, "epoch": 3.8865087705116803, "step": 24040}, {"loss": 0.6786, "grad_norm": 1.1159172058105469, "learning_rate": 0.0002, "epoch": 3.888125454692426, "step": 24050}, {"loss": 0.6635, "grad_norm": 1.276838779449463, "learning_rate": 0.0002, "epoch": 3.889742138873171, "step": 24060}, {"loss": 0.5985, "grad_norm": 0.8467690348625183, "learning_rate": 0.0002, "epoch": 3.8913588230539164, "step": 24070}, {"loss": 0.6655, "grad_norm": 0.9862841963768005, "learning_rate": 0.0002, "epoch": 3.8929755072346617, "step": 24080}, {"loss": 0.6098, "grad_norm": 0.7134621739387512, "learning_rate": 0.0002, "epoch": 3.894592191415407, "step": 24090}, {"loss": 0.618, "grad_norm": 0.8178175091743469, "learning_rate": 0.0002, "epoch": 3.896208875596152, "step": 24100}, {"loss": 0.6147, "grad_norm": 0.9229172468185425, "learning_rate": 0.0002, "epoch": 3.897825559776898, "step": 24110}, {"loss": 0.6554, "grad_norm": 1.0878316164016724, "learning_rate": 0.0002, "epoch": 3.899442243957643, "step": 24120}, {"loss": 0.6616, "grad_norm": 0.971645712852478, "learning_rate": 0.0002, "epoch": 3.9010589281383883, "step": 24130}, {"loss": 0.6228, "grad_norm": 0.8862188458442688, "learning_rate": 0.0002, "epoch": 3.9026756123191335, "step": 24140}, {"loss": 0.6192, "grad_norm": 0.9126982688903809, "learning_rate": 0.0002, "epoch": 3.9042922964998787, "step": 24150}, {"loss": 0.6734, "grad_norm": 0.8833470940589905, "learning_rate": 0.0002, "epoch": 3.905908980680624, "step": 24160}, {"loss": 0.5832, "grad_norm": 0.8320947885513306, "learning_rate": 0.0002, "epoch": 3.907525664861369, "step": 24170}, {"loss": 0.6247, "grad_norm": 0.9156602025032043, "learning_rate": 0.0002, "epoch": 3.9091423490421144, "step": 24180}, {"loss": 0.6678, "grad_norm": 1.029181957244873, "learning_rate": 0.0002, "epoch": 3.9107590332228597, "step": 24190}, {"loss": 0.6565, "grad_norm": 0.9052802324295044, "learning_rate": 0.0002, "epoch": 3.9123757174036053, "step": 24200}, {"loss": 0.6346, "grad_norm": 0.8847255110740662, "learning_rate": 0.0002, "epoch": 3.9139924015843506, "step": 24210}, {"loss": 0.6343, "grad_norm": 0.9642062187194824, "learning_rate": 0.0002, "epoch": 3.915609085765096, "step": 24220}, {"loss": 0.6557, "grad_norm": 0.8629093766212463, "learning_rate": 0.0002, "epoch": 3.917225769945841, "step": 24230}, {"loss": 0.6086, "grad_norm": 0.8674976825714111, "learning_rate": 0.0002, "epoch": 3.9188424541265863, "step": 24240}, {"loss": 0.5874, "grad_norm": 1.104846477508545, "learning_rate": 0.0002, "epoch": 3.9204591383073315, "step": 24250}, {"loss": 0.6501, "grad_norm": 1.0874955654144287, "learning_rate": 0.0002, "epoch": 3.922075822488077, "step": 24260}, {"loss": 0.6455, "grad_norm": 0.8689812421798706, "learning_rate": 0.0002, "epoch": 3.9236925066688224, "step": 24270}, {"loss": 0.5893, "grad_norm": 0.9724617004394531, "learning_rate": 0.0002, "epoch": 3.9253091908495676, "step": 24280}, {"loss": 0.6616, "grad_norm": 0.9165538549423218, "learning_rate": 0.0002, "epoch": 3.926925875030313, "step": 24290}, {"loss": 0.645, "grad_norm": 0.9307710528373718, "learning_rate": 0.0002, "epoch": 3.928542559211058, "step": 24300}, {"loss": 0.6071, "grad_norm": 0.8589295148849487, "learning_rate": 0.0002, "epoch": 3.9301592433918033, "step": 24310}, {"loss": 0.6662, "grad_norm": 0.9151099920272827, "learning_rate": 0.0002, "epoch": 3.9317759275725486, "step": 24320}, {"loss": 0.7075, "grad_norm": 0.9633517265319824, "learning_rate": 0.0002, "epoch": 3.933392611753294, "step": 24330}, {"loss": 0.6432, "grad_norm": 0.9521116018295288, "learning_rate": 0.0002, "epoch": 3.935009295934039, "step": 24340}, {"loss": 0.6457, "grad_norm": 0.8366776704788208, "learning_rate": 0.0002, "epoch": 3.9366259801147847, "step": 24350}, {"loss": 0.6139, "grad_norm": 0.8972663283348083, "learning_rate": 0.0002, "epoch": 3.93824266429553, "step": 24360}, {"loss": 0.661, "grad_norm": 0.8102919459342957, "learning_rate": 0.0002, "epoch": 3.939859348476275, "step": 24370}, {"loss": 0.6388, "grad_norm": 0.8189975023269653, "learning_rate": 0.0002, "epoch": 3.9414760326570204, "step": 24380}, {"loss": 0.6818, "grad_norm": 0.9569464921951294, "learning_rate": 0.0002, "epoch": 3.9430927168377656, "step": 24390}, {"loss": 0.6999, "grad_norm": 0.7459101676940918, "learning_rate": 0.0002, "epoch": 3.9447094010185113, "step": 24400}, {"loss": 0.6069, "grad_norm": 0.8536974787712097, "learning_rate": 0.0002, "epoch": 3.9463260851992565, "step": 24410}, {"loss": 0.5683, "grad_norm": 0.8763698935508728, "learning_rate": 0.0002, "epoch": 3.9479427693800018, "step": 24420}, {"loss": 0.6478, "grad_norm": 0.9381106495857239, "learning_rate": 0.0002, "epoch": 3.949559453560747, "step": 24430}, {"loss": 0.6371, "grad_norm": 0.934440016746521, "learning_rate": 0.0002, "epoch": 3.9511761377414922, "step": 24440}, {"loss": 0.6393, "grad_norm": 0.903918981552124, "learning_rate": 0.0002, "epoch": 3.9527928219222375, "step": 24450}, {"loss": 0.6175, "grad_norm": 0.8771953582763672, "learning_rate": 0.0002, "epoch": 3.9544095061029827, "step": 24460}, {"loss": 0.6971, "grad_norm": 1.0375410318374634, "learning_rate": 0.0002, "epoch": 3.956026190283728, "step": 24470}, {"loss": 0.6313, "grad_norm": 0.9439185261726379, "learning_rate": 0.0002, "epoch": 3.957642874464473, "step": 24480}, {"loss": 0.6076, "grad_norm": 0.935467004776001, "learning_rate": 0.0002, "epoch": 3.9592595586452184, "step": 24490}, {"loss": 0.6437, "grad_norm": 0.6900772452354431, "learning_rate": 0.0002, "epoch": 3.960876242825964, "step": 24500}, {"loss": 0.6445, "grad_norm": 1.0172916650772095, "learning_rate": 0.0002, "epoch": 3.9624929270067093, "step": 24510}, {"loss": 0.6308, "grad_norm": 0.9167046546936035, "learning_rate": 0.0002, "epoch": 3.9641096111874545, "step": 24520}, {"loss": 0.6519, "grad_norm": 0.7230527997016907, "learning_rate": 0.0002, "epoch": 3.9657262953681998, "step": 24530}, {"loss": 0.6564, "grad_norm": 0.8980403542518616, "learning_rate": 0.0002, "epoch": 3.967342979548945, "step": 24540}, {"loss": 0.6099, "grad_norm": 0.8555465936660767, "learning_rate": 0.0002, "epoch": 3.9689596637296907, "step": 24550}, {"loss": 0.6617, "grad_norm": 0.7825445532798767, "learning_rate": 0.0002, "epoch": 3.970576347910436, "step": 24560}, {"loss": 0.604, "grad_norm": 0.7273133993148804, "learning_rate": 0.0002, "epoch": 3.972193032091181, "step": 24570}, {"loss": 0.6427, "grad_norm": 0.9612047672271729, "learning_rate": 0.0002, "epoch": 3.9738097162719264, "step": 24580}, {"loss": 0.6426, "grad_norm": 0.9865460991859436, "learning_rate": 0.0002, "epoch": 3.9754264004526716, "step": 24590}, {"loss": 0.6052, "grad_norm": 0.8638762831687927, "learning_rate": 0.0002, "epoch": 3.977043084633417, "step": 24600}, {"loss": 0.6097, "grad_norm": 1.0096198320388794, "learning_rate": 0.0002, "epoch": 3.978659768814162, "step": 24610}, {"loss": 0.6664, "grad_norm": 0.8475532531738281, "learning_rate": 0.0002, "epoch": 3.9802764529949073, "step": 24620}, {"loss": 0.6711, "grad_norm": 0.9696195721626282, "learning_rate": 0.0002, "epoch": 3.9818931371756525, "step": 24630}, {"loss": 0.6446, "grad_norm": 0.7499843239784241, "learning_rate": 0.0002, "epoch": 3.9835098213563978, "step": 24640}, {"loss": 0.6054, "grad_norm": 0.8865424990653992, "learning_rate": 0.0002, "epoch": 3.9851265055371434, "step": 24650}, {"loss": 0.5975, "grad_norm": 0.8089959025382996, "learning_rate": 0.0002, "epoch": 3.9867431897178887, "step": 24660}, {"loss": 0.6677, "grad_norm": 0.6946012377738953, "learning_rate": 0.0002, "epoch": 3.988359873898634, "step": 24670}, {"loss": 0.6329, "grad_norm": 0.7991759181022644, "learning_rate": 0.0002, "epoch": 3.989976558079379, "step": 24680}, {"loss": 0.6449, "grad_norm": 0.8803931474685669, "learning_rate": 0.0002, "epoch": 3.9915932422601244, "step": 24690}, {"loss": 0.7091, "grad_norm": 0.8848299980163574, "learning_rate": 0.0002, "epoch": 3.99320992644087, "step": 24700}, {"loss": 0.6551, "grad_norm": 0.7448889017105103, "learning_rate": 0.0002, "epoch": 3.9948266106216153, "step": 24710}, {"loss": 0.6432, "grad_norm": 0.9361620545387268, "learning_rate": 0.0002, "epoch": 3.9964432948023605, "step": 24720}, {"loss": 0.5917, "grad_norm": 0.9958081245422363, "learning_rate": 0.0002, "epoch": 3.9980599789831057, "step": 24730}, {"loss": 0.6567, "grad_norm": 1.026004672050476, "learning_rate": 0.0002, "epoch": 3.999676663163851, "step": 24740}, {"eval_loss": 1.1524168252944946, "eval_runtime": 122.1585, "eval_samples_per_second": 6.0, "eval_steps_per_second": 0.753, "epoch": 4.0, "step": 24742}, {"loss": 0.6057, "grad_norm": 1.0664808750152588, "learning_rate": 0.0002, "epoch": 4.001293347344596, "step": 24750}, {"loss": 0.5644, "grad_norm": 1.0113720893859863, "learning_rate": 0.0002, "epoch": 4.002910031525341, "step": 24760}, {"loss": 0.5628, "grad_norm": 0.991486668586731, "learning_rate": 0.0002, "epoch": 4.004526715706087, "step": 24770}, {"loss": 0.508, "grad_norm": 0.951754629611969, "learning_rate": 0.0002, "epoch": 4.006143399886832, "step": 24780}, {"loss": 0.5314, "grad_norm": 1.13059401512146, "learning_rate": 0.0002, "epoch": 4.007760084067577, "step": 24790}, {"loss": 0.5323, "grad_norm": 0.9343926310539246, "learning_rate": 0.0002, "epoch": 4.009376768248322, "step": 24800}, {"loss": 0.5161, "grad_norm": 1.0680590867996216, "learning_rate": 0.0002, "epoch": 4.010993452429068, "step": 24810}, {"loss": 0.513, "grad_norm": 1.0022706985473633, "learning_rate": 0.0002, "epoch": 4.012610136609814, "step": 24820}, {"loss": 0.543, "grad_norm": 1.0285297632217407, "learning_rate": 0.0002, "epoch": 4.014226820790559, "step": 24830}, {"loss": 0.5311, "grad_norm": 0.8347002863883972, "learning_rate": 0.0002, "epoch": 4.015843504971304, "step": 24840}, {"loss": 0.5655, "grad_norm": 0.9675396680831909, "learning_rate": 0.0002, "epoch": 4.017460189152049, "step": 24850}, {"loss": 0.5625, "grad_norm": 0.9238511323928833, "learning_rate": 0.0002, "epoch": 4.019076873332795, "step": 24860}, {"loss": 0.5327, "grad_norm": 1.1576941013336182, "learning_rate": 0.0002, "epoch": 4.02069355751354, "step": 24870}, {"loss": 0.5533, "grad_norm": 0.8583757281303406, "learning_rate": 0.0002, "epoch": 4.022310241694285, "step": 24880}, {"loss": 0.5483, "grad_norm": 0.9816817045211792, "learning_rate": 0.0002, "epoch": 4.02392692587503, "step": 24890}, {"loss": 0.5605, "grad_norm": 0.955073893070221, "learning_rate": 0.0002, "epoch": 4.0255436100557755, "step": 24900}, {"loss": 0.4896, "grad_norm": 1.1054974794387817, "learning_rate": 0.0002, "epoch": 4.027160294236521, "step": 24910}, {"loss": 0.5246, "grad_norm": 1.1240060329437256, "learning_rate": 0.0002, "epoch": 4.028776978417266, "step": 24920}, {"loss": 0.5451, "grad_norm": 0.9512825012207031, "learning_rate": 0.0002, "epoch": 4.030393662598011, "step": 24930}, {"loss": 0.5584, "grad_norm": 0.85965496301651, "learning_rate": 0.0002, "epoch": 4.0320103467787565, "step": 24940}, {"loss": 0.5564, "grad_norm": 0.9378061294555664, "learning_rate": 0.0002, "epoch": 4.033627030959502, "step": 24950}, {"loss": 0.5008, "grad_norm": 0.9655424356460571, "learning_rate": 0.0002, "epoch": 4.035243715140247, "step": 24960}, {"loss": 0.5538, "grad_norm": 1.1393707990646362, "learning_rate": 0.0002, "epoch": 4.036860399320993, "step": 24970}, {"loss": 0.5785, "grad_norm": 1.0220451354980469, "learning_rate": 0.0002, "epoch": 4.038477083501738, "step": 24980}, {"loss": 0.5813, "grad_norm": 0.9785808324813843, "learning_rate": 0.0002, "epoch": 4.0400937676824835, "step": 24990}, {"loss": 0.5153, "grad_norm": 1.0257649421691895, "learning_rate": 0.0002, "epoch": 4.041710451863229, "step": 25000}, {"loss": 0.5658, "grad_norm": 0.9737892150878906, "learning_rate": 0.0002, "epoch": 4.043327136043974, "step": 25010}, {"loss": 0.5515, "grad_norm": 0.7416959404945374, "learning_rate": 0.0002, "epoch": 4.044943820224719, "step": 25020}, {"loss": 0.5372, "grad_norm": 0.7909596562385559, "learning_rate": 0.0002, "epoch": 4.046560504405464, "step": 25030}, {"loss": 0.5265, "grad_norm": 0.8923130631446838, "learning_rate": 0.0002, "epoch": 4.04817718858621, "step": 25040}, {"loss": 0.5035, "grad_norm": 0.9044941663742065, "learning_rate": 0.0002, "epoch": 4.049793872766955, "step": 25050}, {"loss": 0.5135, "grad_norm": 0.866352379322052, "learning_rate": 0.0002, "epoch": 4.0514105569477, "step": 25060}, {"loss": 0.5956, "grad_norm": 1.544549822807312, "learning_rate": 0.0002, "epoch": 4.053027241128445, "step": 25070}, {"loss": 0.5418, "grad_norm": 0.8426995277404785, "learning_rate": 0.0002, "epoch": 4.054643925309191, "step": 25080}, {"loss": 0.5537, "grad_norm": 0.9797548651695251, "learning_rate": 0.0002, "epoch": 4.056260609489936, "step": 25090}, {"loss": 0.55, "grad_norm": 0.8468434810638428, "learning_rate": 0.0002, "epoch": 4.057877293670681, "step": 25100}, {"loss": 0.5242, "grad_norm": 0.9294559955596924, "learning_rate": 0.0002, "epoch": 4.059493977851426, "step": 25110}, {"loss": 0.5295, "grad_norm": 0.9686688780784607, "learning_rate": 0.0002, "epoch": 4.061110662032172, "step": 25120}, {"loss": 0.5642, "grad_norm": 0.8042728304862976, "learning_rate": 0.0002, "epoch": 4.062727346212918, "step": 25130}, {"loss": 0.548, "grad_norm": 1.165160894393921, "learning_rate": 0.0002, "epoch": 4.064344030393663, "step": 25140}, {"loss": 0.5473, "grad_norm": 1.2161961793899536, "learning_rate": 0.0002, "epoch": 4.065960714574408, "step": 25150}, {"loss": 0.5217, "grad_norm": 1.0762810707092285, "learning_rate": 0.0002, "epoch": 4.067577398755153, "step": 25160}, {"loss": 0.5886, "grad_norm": 0.7580869793891907, "learning_rate": 0.0002, "epoch": 4.069194082935899, "step": 25170}, {"loss": 0.5401, "grad_norm": 0.9630117416381836, "learning_rate": 0.0002, "epoch": 4.070810767116644, "step": 25180}, {"loss": 0.5378, "grad_norm": 0.9049716591835022, "learning_rate": 0.0002, "epoch": 4.072427451297389, "step": 25190}, {"loss": 0.5266, "grad_norm": 1.1536930799484253, "learning_rate": 0.0002, "epoch": 4.074044135478134, "step": 25200}, {"loss": 0.5523, "grad_norm": 0.901461124420166, "learning_rate": 0.0002, "epoch": 4.0756608196588795, "step": 25210}, {"loss": 0.5132, "grad_norm": 1.3318437337875366, "learning_rate": 0.0002, "epoch": 4.077277503839625, "step": 25220}, {"loss": 0.5317, "grad_norm": 0.8811455368995667, "learning_rate": 0.0002, "epoch": 4.07889418802037, "step": 25230}, {"loss": 0.5798, "grad_norm": 1.0564165115356445, "learning_rate": 0.0002, "epoch": 4.080510872201115, "step": 25240}, {"loss": 0.5472, "grad_norm": 1.1008027791976929, "learning_rate": 0.0002, "epoch": 4.08212755638186, "step": 25250}, {"loss": 0.5195, "grad_norm": 1.150097131729126, "learning_rate": 0.0002, "epoch": 4.083744240562606, "step": 25260}, {"loss": 0.5321, "grad_norm": 0.9339924454689026, "learning_rate": 0.0002, "epoch": 4.085360924743352, "step": 25270}, {"loss": 0.5597, "grad_norm": 1.0902045965194702, "learning_rate": 0.0002, "epoch": 4.086977608924097, "step": 25280}, {"loss": 0.5203, "grad_norm": 0.8483911156654358, "learning_rate": 0.0002, "epoch": 4.088594293104842, "step": 25290}, {"loss": 0.5697, "grad_norm": 0.9477024674415588, "learning_rate": 0.0002, "epoch": 4.0902109772855875, "step": 25300}, {"loss": 0.5384, "grad_norm": 0.9500215649604797, "learning_rate": 0.0002, "epoch": 4.091827661466333, "step": 25310}, {"loss": 0.5045, "grad_norm": 1.040468454360962, "learning_rate": 0.0002, "epoch": 4.093444345647078, "step": 25320}, {"loss": 0.5488, "grad_norm": 0.7457592487335205, "learning_rate": 0.0002, "epoch": 4.095061029827823, "step": 25330}, {"loss": 0.609, "grad_norm": 1.2092097997665405, "learning_rate": 0.0002, "epoch": 4.096677714008568, "step": 25340}, {"loss": 0.5174, "grad_norm": 0.9652107954025269, "learning_rate": 0.0002, "epoch": 4.098294398189314, "step": 25350}, {"loss": 0.5559, "grad_norm": 0.8464955687522888, "learning_rate": 0.0002, "epoch": 4.099911082370059, "step": 25360}, {"loss": 0.5635, "grad_norm": 0.875026285648346, "learning_rate": 0.0002, "epoch": 4.101527766550804, "step": 25370}, {"loss": 0.5774, "grad_norm": 0.9241740107536316, "learning_rate": 0.0002, "epoch": 4.103144450731549, "step": 25380}, {"loss": 0.5578, "grad_norm": 0.9769546389579773, "learning_rate": 0.0002, "epoch": 4.1047611349122946, "step": 25390}, {"loss": 0.567, "grad_norm": 1.1501960754394531, "learning_rate": 0.0002, "epoch": 4.10637781909304, "step": 25400}, {"loss": 0.5241, "grad_norm": 0.9135243892669678, "learning_rate": 0.0002, "epoch": 4.107994503273786, "step": 25410}, {"loss": 0.5152, "grad_norm": 0.9905396103858948, "learning_rate": 0.0002, "epoch": 4.109611187454531, "step": 25420}, {"loss": 0.5064, "grad_norm": 0.9845104217529297, "learning_rate": 0.0002, "epoch": 4.111227871635276, "step": 25430}, {"loss": 0.5029, "grad_norm": 0.8326883912086487, "learning_rate": 0.0002, "epoch": 4.112844555816022, "step": 25440}, {"loss": 0.5312, "grad_norm": 0.9264556765556335, "learning_rate": 0.0002, "epoch": 4.114461239996767, "step": 25450}, {"loss": 0.5968, "grad_norm": 1.043080449104309, "learning_rate": 0.0002, "epoch": 4.116077924177512, "step": 25460}, {"loss": 0.5773, "grad_norm": 0.8533386588096619, "learning_rate": 0.0002, "epoch": 4.117694608358257, "step": 25470}, {"loss": 0.5584, "grad_norm": 1.0133965015411377, "learning_rate": 0.0002, "epoch": 4.1193112925390025, "step": 25480}, {"loss": 0.566, "grad_norm": 0.7476310133934021, "learning_rate": 0.0002, "epoch": 4.120927976719748, "step": 25490}, {"loss": 0.5189, "grad_norm": 1.1247259378433228, "learning_rate": 0.0002, "epoch": 4.122544660900493, "step": 25500}, {"loss": 0.5751, "grad_norm": 1.0764678716659546, "learning_rate": 0.0002, "epoch": 4.124161345081238, "step": 25510}, {"loss": 0.5391, "grad_norm": 0.7679798007011414, "learning_rate": 0.0002, "epoch": 4.1257780292619834, "step": 25520}, {"loss": 0.5233, "grad_norm": 0.8877071142196655, "learning_rate": 0.0002, "epoch": 4.127394713442729, "step": 25530}, {"loss": 0.5769, "grad_norm": 1.0440239906311035, "learning_rate": 0.0002, "epoch": 4.129011397623474, "step": 25540}, {"loss": 0.5723, "grad_norm": 0.984145998954773, "learning_rate": 0.0002, "epoch": 4.130628081804219, "step": 25550}, {"loss": 0.5741, "grad_norm": 0.8667055368423462, "learning_rate": 0.0002, "epoch": 4.132244765984965, "step": 25560}, {"loss": 0.5816, "grad_norm": 1.1300835609436035, "learning_rate": 0.0002, "epoch": 4.1338614501657105, "step": 25570}, {"loss": 0.524, "grad_norm": 0.9314348101615906, "learning_rate": 0.0002, "epoch": 4.135478134346456, "step": 25580}, {"loss": 0.5283, "grad_norm": 0.7731879949569702, "learning_rate": 0.0002, "epoch": 4.137094818527201, "step": 25590}, {"loss": 0.5307, "grad_norm": 1.0080097913742065, "learning_rate": 0.0002, "epoch": 4.138711502707946, "step": 25600}, {"loss": 0.5759, "grad_norm": 1.2475038766860962, "learning_rate": 0.0002, "epoch": 4.140328186888691, "step": 25610}, {"loss": 0.55, "grad_norm": 0.9912930727005005, "learning_rate": 0.0002, "epoch": 4.141944871069437, "step": 25620}, {"loss": 0.5624, "grad_norm": 0.9088651537895203, "learning_rate": 0.0002, "epoch": 4.143561555250182, "step": 25630}, {"loss": 0.5393, "grad_norm": 0.8940697312355042, "learning_rate": 0.0002, "epoch": 4.145178239430927, "step": 25640}, {"loss": 0.5341, "grad_norm": 1.0798203945159912, "learning_rate": 0.0002, "epoch": 4.146794923611672, "step": 25650}, {"loss": 0.5987, "grad_norm": 0.955172061920166, "learning_rate": 0.0002, "epoch": 4.148411607792418, "step": 25660}, {"loss": 0.569, "grad_norm": 0.9692716002464294, "learning_rate": 0.0002, "epoch": 4.150028291973163, "step": 25670}, {"loss": 0.5478, "grad_norm": 1.0813939571380615, "learning_rate": 0.0002, "epoch": 4.151644976153908, "step": 25680}, {"loss": 0.5383, "grad_norm": 1.135675072669983, "learning_rate": 0.0002, "epoch": 4.153261660334653, "step": 25690}, {"loss": 0.5247, "grad_norm": 1.0392236709594727, "learning_rate": 0.0002, "epoch": 4.1548783445153985, "step": 25700}, {"loss": 0.5204, "grad_norm": 0.9473116993904114, "learning_rate": 0.0002, "epoch": 4.156495028696145, "step": 25710}, {"loss": 0.5339, "grad_norm": 0.712493896484375, "learning_rate": 0.0002, "epoch": 4.15811171287689, "step": 25720}, {"loss": 0.5781, "grad_norm": 0.8724465370178223, "learning_rate": 0.0002, "epoch": 4.159728397057635, "step": 25730}, {"loss": 0.5325, "grad_norm": 0.9870015978813171, "learning_rate": 0.0002, "epoch": 4.16134508123838, "step": 25740}, {"loss": 0.5503, "grad_norm": 1.025273084640503, "learning_rate": 0.0002, "epoch": 4.1629617654191255, "step": 25750}, {"loss": 0.5223, "grad_norm": 0.9243090152740479, "learning_rate": 0.0002, "epoch": 4.164578449599871, "step": 25760}, {"loss": 0.5177, "grad_norm": 1.1656451225280762, "learning_rate": 0.0002, "epoch": 4.166195133780616, "step": 25770}, {"loss": 0.5334, "grad_norm": 0.936358630657196, "learning_rate": 0.0002, "epoch": 4.167811817961361, "step": 25780}, {"loss": 0.5236, "grad_norm": 0.8618208169937134, "learning_rate": 0.0002, "epoch": 4.1694285021421065, "step": 25790}, {"loss": 0.5186, "grad_norm": 0.8580600023269653, "learning_rate": 0.0002, "epoch": 4.171045186322852, "step": 25800}, {"loss": 0.5212, "grad_norm": 1.0128562450408936, "learning_rate": 0.0002, "epoch": 4.172661870503597, "step": 25810}, {"loss": 0.5404, "grad_norm": 0.854865312576294, "learning_rate": 0.0002, "epoch": 4.174278554684342, "step": 25820}, {"loss": 0.5377, "grad_norm": 1.235082745552063, "learning_rate": 0.0002, "epoch": 4.175895238865087, "step": 25830}, {"loss": 0.5614, "grad_norm": 0.9796220660209656, "learning_rate": 0.0002, "epoch": 4.177511923045833, "step": 25840}, {"loss": 0.5689, "grad_norm": 0.8922094702720642, "learning_rate": 0.0002, "epoch": 4.179128607226578, "step": 25850}, {"loss": 0.5806, "grad_norm": 0.9672530293464661, "learning_rate": 0.0002, "epoch": 4.180745291407324, "step": 25860}, {"loss": 0.5074, "grad_norm": 0.8662548661231995, "learning_rate": 0.0002, "epoch": 4.182361975588069, "step": 25870}, {"loss": 0.5329, "grad_norm": 0.7938798069953918, "learning_rate": 0.0002, "epoch": 4.1839786597688144, "step": 25880}, {"loss": 0.5427, "grad_norm": 1.0517958402633667, "learning_rate": 0.0002, "epoch": 4.18559534394956, "step": 25890}, {"loss": 0.5147, "grad_norm": 0.8939275145530701, "learning_rate": 0.0002, "epoch": 4.187212028130305, "step": 25900}, {"loss": 0.5199, "grad_norm": 1.0296672582626343, "learning_rate": 0.0002, "epoch": 4.18882871231105, "step": 25910}, {"loss": 0.5522, "grad_norm": 0.8104017972946167, "learning_rate": 0.0002, "epoch": 4.190445396491795, "step": 25920}, {"loss": 0.596, "grad_norm": 0.9984509944915771, "learning_rate": 0.0002, "epoch": 4.192062080672541, "step": 25930}, {"loss": 0.5356, "grad_norm": 0.9844784736633301, "learning_rate": 0.0002, "epoch": 4.193678764853286, "step": 25940}, {"loss": 0.5198, "grad_norm": 0.8168622255325317, "learning_rate": 0.0002, "epoch": 4.195295449034031, "step": 25950}, {"loss": 0.542, "grad_norm": 1.0878913402557373, "learning_rate": 0.0002, "epoch": 4.196912133214776, "step": 25960}, {"loss": 0.5414, "grad_norm": 0.927126407623291, "learning_rate": 0.0002, "epoch": 4.1985288173955215, "step": 25970}, {"loss": 0.5794, "grad_norm": 0.838586688041687, "learning_rate": 0.0002, "epoch": 4.200145501576267, "step": 25980}, {"loss": 0.5454, "grad_norm": 1.2572145462036133, "learning_rate": 0.0002, "epoch": 4.201762185757012, "step": 25990}, {"loss": 0.5048, "grad_norm": 1.0476740598678589, "learning_rate": 0.0002, "epoch": 4.203378869937758, "step": 26000}, {"loss": 0.5127, "grad_norm": 1.0873368978500366, "learning_rate": 0.0002, "epoch": 4.204995554118503, "step": 26010}, {"loss": 0.5679, "grad_norm": 1.2664896249771118, "learning_rate": 0.0002, "epoch": 4.206612238299249, "step": 26020}, {"loss": 0.5814, "grad_norm": 1.0312391519546509, "learning_rate": 0.0002, "epoch": 4.208228922479994, "step": 26030}, {"loss": 0.571, "grad_norm": 1.0235042572021484, "learning_rate": 0.0002, "epoch": 4.209845606660739, "step": 26040}, {"loss": 0.5766, "grad_norm": 0.8882219195365906, "learning_rate": 0.0002, "epoch": 4.211462290841484, "step": 26050}, {"loss": 0.5557, "grad_norm": 0.9115961790084839, "learning_rate": 0.0002, "epoch": 4.2130789750222295, "step": 26060}, {"loss": 0.5455, "grad_norm": 1.0218228101730347, "learning_rate": 0.0002, "epoch": 4.214695659202975, "step": 26070}, {"loss": 0.5462, "grad_norm": 1.0802232027053833, "learning_rate": 0.0002, "epoch": 4.21631234338372, "step": 26080}, {"loss": 0.557, "grad_norm": 1.1488053798675537, "learning_rate": 0.0002, "epoch": 4.217929027564465, "step": 26090}, {"loss": 0.52, "grad_norm": 1.0487725734710693, "learning_rate": 0.0002, "epoch": 4.21954571174521, "step": 26100}, {"loss": 0.5568, "grad_norm": 0.9131165742874146, "learning_rate": 0.0002, "epoch": 4.221162395925956, "step": 26110}, {"loss": 0.5206, "grad_norm": 0.9012845158576965, "learning_rate": 0.0002, "epoch": 4.222779080106701, "step": 26120}, {"loss": 0.561, "grad_norm": 0.8389840126037598, "learning_rate": 0.0002, "epoch": 4.224395764287446, "step": 26130}, {"loss": 0.5268, "grad_norm": 0.8924660682678223, "learning_rate": 0.0002, "epoch": 4.226012448468191, "step": 26140}, {"loss": 0.5715, "grad_norm": 0.8556463718414307, "learning_rate": 0.0002, "epoch": 4.2276291326489375, "step": 26150}, {"loss": 0.5695, "grad_norm": 0.9643129110336304, "learning_rate": 0.0002, "epoch": 4.229245816829683, "step": 26160}, {"loss": 0.5321, "grad_norm": 0.9865712523460388, "learning_rate": 0.0002, "epoch": 4.230862501010428, "step": 26170}, {"loss": 0.5406, "grad_norm": 1.152641773223877, "learning_rate": 0.0002, "epoch": 4.232479185191173, "step": 26180}, {"loss": 0.5632, "grad_norm": 0.9157698154449463, "learning_rate": 0.0002, "epoch": 4.234095869371918, "step": 26190}, {"loss": 0.5717, "grad_norm": 0.8418048620223999, "learning_rate": 0.0002, "epoch": 4.235712553552664, "step": 26200}, {"loss": 0.5624, "grad_norm": 0.9430168867111206, "learning_rate": 0.0002, "epoch": 4.237329237733409, "step": 26210}, {"loss": 0.5574, "grad_norm": 1.012582778930664, "learning_rate": 0.0002, "epoch": 4.238945921914154, "step": 26220}, {"loss": 0.5693, "grad_norm": 1.112619400024414, "learning_rate": 0.0002, "epoch": 4.240562606094899, "step": 26230}, {"loss": 0.6037, "grad_norm": 0.9243621826171875, "learning_rate": 0.0002, "epoch": 4.2421792902756446, "step": 26240}, {"loss": 0.569, "grad_norm": 0.6977595686912537, "learning_rate": 0.0002, "epoch": 4.24379597445639, "step": 26250}, {"loss": 0.5379, "grad_norm": 0.9600721597671509, "learning_rate": 0.0002, "epoch": 4.245412658637135, "step": 26260}, {"loss": 0.5658, "grad_norm": 0.882641613483429, "learning_rate": 0.0002, "epoch": 4.24702934281788, "step": 26270}, {"loss": 0.55, "grad_norm": 1.010920763015747, "learning_rate": 0.0002, "epoch": 4.2486460269986255, "step": 26280}, {"loss": 0.5803, "grad_norm": 0.9289400577545166, "learning_rate": 0.0002, "epoch": 4.250262711179371, "step": 26290}, {"loss": 0.541, "grad_norm": 1.137397289276123, "learning_rate": 0.0002, "epoch": 4.251879395360117, "step": 26300}, {"loss": 0.5204, "grad_norm": 1.0136182308197021, "learning_rate": 0.0002, "epoch": 4.253496079540862, "step": 26310}, {"loss": 0.5708, "grad_norm": 0.9387356042861938, "learning_rate": 0.0002, "epoch": 4.255112763721607, "step": 26320}, {"loss": 0.5948, "grad_norm": 1.1833957433700562, "learning_rate": 0.0002, "epoch": 4.2567294479023525, "step": 26330}, {"loss": 0.5905, "grad_norm": 0.9415934681892395, "learning_rate": 0.0002, "epoch": 4.258346132083098, "step": 26340}, {"loss": 0.5539, "grad_norm": 0.8550165891647339, "learning_rate": 0.0002, "epoch": 4.259962816263843, "step": 26350}, {"loss": 0.555, "grad_norm": 9.924622535705566, "learning_rate": 0.0002, "epoch": 4.261579500444588, "step": 26360}, {"loss": 0.5689, "grad_norm": 1.0104902982711792, "learning_rate": 0.0002, "epoch": 4.2631961846253335, "step": 26370}, {"loss": 0.5698, "grad_norm": 0.890794038772583, "learning_rate": 0.0002, "epoch": 4.264812868806079, "step": 26380}, {"loss": 0.563, "grad_norm": 1.0560191869735718, "learning_rate": 0.0002, "epoch": 4.266429552986824, "step": 26390}, {"loss": 0.5119, "grad_norm": 1.0135581493377686, "learning_rate": 0.0002, "epoch": 4.268046237167569, "step": 26400}, {"loss": 0.5359, "grad_norm": 1.1304140090942383, "learning_rate": 0.0002, "epoch": 4.269662921348314, "step": 26410}, {"loss": 0.5615, "grad_norm": 0.9899303913116455, "learning_rate": 0.0002, "epoch": 4.27127960552906, "step": 26420}, {"loss": 0.5815, "grad_norm": 1.0505329370498657, "learning_rate": 0.0002, "epoch": 4.272896289709805, "step": 26430}, {"loss": 0.5384, "grad_norm": 0.9389396905899048, "learning_rate": 0.0002, "epoch": 4.27451297389055, "step": 26440}, {"loss": 0.5558, "grad_norm": 0.875328779220581, "learning_rate": 0.0002, "epoch": 4.276129658071296, "step": 26450}, {"loss": 0.5601, "grad_norm": 1.0689256191253662, "learning_rate": 0.0002, "epoch": 4.277746342252041, "step": 26460}, {"loss": 0.546, "grad_norm": 0.9988957643508911, "learning_rate": 0.0002, "epoch": 4.279363026432787, "step": 26470}, {"loss": 0.5478, "grad_norm": 0.8721813559532166, "learning_rate": 0.0002, "epoch": 4.280979710613532, "step": 26480}, {"loss": 0.5424, "grad_norm": 1.100109577178955, "learning_rate": 0.0002, "epoch": 4.282596394794277, "step": 26490}, {"loss": 0.572, "grad_norm": 1.1607271432876587, "learning_rate": 0.0002, "epoch": 4.284213078975022, "step": 26500}, {"loss": 0.6287, "grad_norm": 0.879088819026947, "learning_rate": 0.0002, "epoch": 4.285829763155768, "step": 26510}, {"loss": 0.573, "grad_norm": 0.9891700744628906, "learning_rate": 0.0002, "epoch": 4.287446447336513, "step": 26520}, {"loss": 0.6018, "grad_norm": 1.0831127166748047, "learning_rate": 0.0002, "epoch": 4.289063131517258, "step": 26530}, {"loss": 0.5693, "grad_norm": 1.4108285903930664, "learning_rate": 0.0002, "epoch": 4.290679815698003, "step": 26540}, {"loss": 0.5888, "grad_norm": 1.0630289316177368, "learning_rate": 0.0002, "epoch": 4.2922964998787485, "step": 26550}, {"loss": 0.5817, "grad_norm": 1.0854572057724, "learning_rate": 0.0002, "epoch": 4.293913184059494, "step": 26560}, {"loss": 0.5586, "grad_norm": 0.9561646580696106, "learning_rate": 0.0002, "epoch": 4.295529868240239, "step": 26570}, {"loss": 0.5674, "grad_norm": 0.9064981937408447, "learning_rate": 0.0002, "epoch": 4.297146552420984, "step": 26580}, {"loss": 0.5847, "grad_norm": 1.0082972049713135, "learning_rate": 0.0002, "epoch": 4.298763236601729, "step": 26590}, {"loss": 0.5711, "grad_norm": 1.1613214015960693, "learning_rate": 0.0002, "epoch": 4.3003799207824756, "step": 26600}, {"loss": 0.551, "grad_norm": 0.9847695231437683, "learning_rate": 0.0002, "epoch": 4.301996604963221, "step": 26610}, {"loss": 0.6089, "grad_norm": 1.0980697870254517, "learning_rate": 0.0002, "epoch": 4.303613289143966, "step": 26620}, {"loss": 0.5797, "grad_norm": 0.8861175179481506, "learning_rate": 0.0002, "epoch": 4.305229973324711, "step": 26630}, {"loss": 0.5716, "grad_norm": 0.8917363286018372, "learning_rate": 0.0002, "epoch": 4.3068466575054565, "step": 26640}, {"loss": 0.5892, "grad_norm": 1.0458378791809082, "learning_rate": 0.0002, "epoch": 4.308463341686202, "step": 26650}, {"loss": 0.5883, "grad_norm": 1.4859240055084229, "learning_rate": 0.0002, "epoch": 4.310080025866947, "step": 26660}, {"loss": 0.5296, "grad_norm": 1.1376359462738037, "learning_rate": 0.0002, "epoch": 4.311696710047692, "step": 26670}, {"loss": 0.5671, "grad_norm": 0.991349995136261, "learning_rate": 0.0002, "epoch": 4.313313394228437, "step": 26680}, {"loss": 0.5338, "grad_norm": 0.9995543956756592, "learning_rate": 0.0002, "epoch": 4.314930078409183, "step": 26690}, {"loss": 0.5542, "grad_norm": 1.0515851974487305, "learning_rate": 0.0002, "epoch": 4.316546762589928, "step": 26700}, {"loss": 0.5473, "grad_norm": 1.008023977279663, "learning_rate": 0.0002, "epoch": 4.318163446770673, "step": 26710}, {"loss": 0.5506, "grad_norm": 1.0184582471847534, "learning_rate": 0.0002, "epoch": 4.319780130951418, "step": 26720}, {"loss": 0.5828, "grad_norm": 1.161071538925171, "learning_rate": 0.0002, "epoch": 4.321396815132164, "step": 26730}, {"loss": 0.5633, "grad_norm": 0.9580779671669006, "learning_rate": 0.0002, "epoch": 4.323013499312909, "step": 26740}, {"loss": 0.5785, "grad_norm": 1.0189911127090454, "learning_rate": 0.0002, "epoch": 4.324630183493655, "step": 26750}, {"loss": 0.5237, "grad_norm": 0.7484358549118042, "learning_rate": 0.0002, "epoch": 4.3262468676744, "step": 26760}, {"loss": 0.5728, "grad_norm": 1.0015908479690552, "learning_rate": 0.0002, "epoch": 4.327863551855145, "step": 26770}, {"loss": 0.5597, "grad_norm": 0.8972945809364319, "learning_rate": 0.0002, "epoch": 4.329480236035891, "step": 26780}, {"loss": 0.5857, "grad_norm": 1.01099693775177, "learning_rate": 0.0002, "epoch": 4.331096920216636, "step": 26790}, {"loss": 0.5591, "grad_norm": 0.846958339214325, "learning_rate": 0.0002, "epoch": 4.332713604397381, "step": 26800}, {"loss": 0.5547, "grad_norm": 1.0792603492736816, "learning_rate": 0.0002, "epoch": 4.334330288578126, "step": 26810}, {"loss": 0.5747, "grad_norm": 1.0373345613479614, "learning_rate": 0.0002, "epoch": 4.3359469727588715, "step": 26820}, {"loss": 0.558, "grad_norm": 0.9779167771339417, "learning_rate": 0.0002, "epoch": 4.337563656939617, "step": 26830}, {"loss": 0.5821, "grad_norm": 1.0235520601272583, "learning_rate": 0.0002, "epoch": 4.339180341120362, "step": 26840}, {"loss": 0.5843, "grad_norm": 1.04195237159729, "learning_rate": 0.0002, "epoch": 4.340797025301107, "step": 26850}, {"loss": 0.5474, "grad_norm": 0.9479565620422363, "learning_rate": 0.0002, "epoch": 4.3424137094818525, "step": 26860}, {"loss": 0.5646, "grad_norm": 0.9526172280311584, "learning_rate": 0.0002, "epoch": 4.344030393662598, "step": 26870}, {"loss": 0.521, "grad_norm": 0.8571456074714661, "learning_rate": 0.0002, "epoch": 4.345647077843343, "step": 26880}, {"loss": 0.5846, "grad_norm": 0.9475828409194946, "learning_rate": 0.0002, "epoch": 4.347263762024088, "step": 26890}, {"loss": 0.5815, "grad_norm": 1.0529576539993286, "learning_rate": 0.0002, "epoch": 4.348880446204834, "step": 26900}, {"loss": 0.56, "grad_norm": 0.9648140072822571, "learning_rate": 0.0002, "epoch": 4.3504971303855795, "step": 26910}, {"loss": 0.5162, "grad_norm": 1.0488841533660889, "learning_rate": 0.0002, "epoch": 4.352113814566325, "step": 26920}, {"loss": 0.5842, "grad_norm": 0.8771942257881165, "learning_rate": 0.0002, "epoch": 4.35373049874707, "step": 26930}, {"loss": 0.5966, "grad_norm": 0.9411202073097229, "learning_rate": 0.0002, "epoch": 4.355347182927815, "step": 26940}, {"loss": 0.6001, "grad_norm": 1.0997588634490967, "learning_rate": 0.0002, "epoch": 4.35696386710856, "step": 26950}, {"loss": 0.5528, "grad_norm": 0.968754768371582, "learning_rate": 0.0002, "epoch": 4.358580551289306, "step": 26960}, {"loss": 0.5881, "grad_norm": 0.9990773797035217, "learning_rate": 0.0002, "epoch": 4.360197235470051, "step": 26970}, {"loss": 0.5761, "grad_norm": 1.0210620164871216, "learning_rate": 0.0002, "epoch": 4.361813919650796, "step": 26980}, {"loss": 0.5768, "grad_norm": 0.855462908744812, "learning_rate": 0.0002, "epoch": 4.363430603831541, "step": 26990}, {"loss": 0.5493, "grad_norm": 0.9169660806655884, "learning_rate": 0.0002, "epoch": 4.365047288012287, "step": 27000}, {"loss": 0.5697, "grad_norm": 1.089629888534546, "learning_rate": 0.0002, "epoch": 4.366663972193032, "step": 27010}, {"loss": 0.5854, "grad_norm": 1.0932867527008057, "learning_rate": 0.0002, "epoch": 4.368280656373777, "step": 27020}, {"loss": 0.5656, "grad_norm": 0.9290956854820251, "learning_rate": 0.0002, "epoch": 4.369897340554522, "step": 27030}, {"loss": 0.5727, "grad_norm": 1.2800624370574951, "learning_rate": 0.0002, "epoch": 4.3715140247352675, "step": 27040}, {"loss": 0.5837, "grad_norm": 0.8993493318557739, "learning_rate": 0.0002, "epoch": 4.373130708916014, "step": 27050}, {"loss": 0.6232, "grad_norm": 1.1566431522369385, "learning_rate": 0.0002, "epoch": 4.374747393096759, "step": 27060}, {"loss": 0.5902, "grad_norm": 0.9479052424430847, "learning_rate": 0.0002, "epoch": 4.376364077277504, "step": 27070}, {"loss": 0.6189, "grad_norm": 1.0063648223876953, "learning_rate": 0.0002, "epoch": 4.377980761458249, "step": 27080}, {"loss": 0.561, "grad_norm": 0.8342045545578003, "learning_rate": 0.0002, "epoch": 4.379597445638995, "step": 27090}, {"loss": 0.5515, "grad_norm": 1.1390739679336548, "learning_rate": 0.0002, "epoch": 4.38121412981974, "step": 27100}, {"loss": 0.5372, "grad_norm": 0.9547637104988098, "learning_rate": 0.0002, "epoch": 4.382830814000485, "step": 27110}, {"loss": 0.5728, "grad_norm": 1.0503804683685303, "learning_rate": 0.0002, "epoch": 4.38444749818123, "step": 27120}, {"loss": 0.5787, "grad_norm": 0.9064017534255981, "learning_rate": 0.0002, "epoch": 4.3860641823619755, "step": 27130}, {"loss": 0.5798, "grad_norm": 0.9382519125938416, "learning_rate": 0.0002, "epoch": 4.387680866542721, "step": 27140}, {"loss": 0.5791, "grad_norm": 1.0410341024398804, "learning_rate": 0.0002, "epoch": 4.389297550723466, "step": 27150}, {"loss": 0.6034, "grad_norm": 0.9218655824661255, "learning_rate": 0.0002, "epoch": 4.390914234904211, "step": 27160}, {"loss": 0.5204, "grad_norm": 0.8119737505912781, "learning_rate": 0.0002, "epoch": 4.392530919084956, "step": 27170}, {"loss": 0.5612, "grad_norm": 0.8584722876548767, "learning_rate": 0.0002, "epoch": 4.394147603265702, "step": 27180}, {"loss": 0.5772, "grad_norm": 0.9668293595314026, "learning_rate": 0.0002, "epoch": 4.395764287446447, "step": 27190}, {"loss": 0.6009, "grad_norm": 1.022334098815918, "learning_rate": 0.0002, "epoch": 4.397380971627193, "step": 27200}, {"loss": 0.5573, "grad_norm": 0.9553216099739075, "learning_rate": 0.0002, "epoch": 4.398997655807938, "step": 27210}, {"loss": 0.5604, "grad_norm": 0.9282339215278625, "learning_rate": 0.0002, "epoch": 4.4006143399886835, "step": 27220}, {"loss": 0.5599, "grad_norm": 1.0232292413711548, "learning_rate": 0.0002, "epoch": 4.402231024169429, "step": 27230}, {"loss": 0.6078, "grad_norm": 0.9915700554847717, "learning_rate": 0.0002, "epoch": 4.403847708350174, "step": 27240}, {"loss": 0.5778, "grad_norm": 1.0014961957931519, "learning_rate": 0.0002, "epoch": 4.405464392530919, "step": 27250}, {"loss": 0.5824, "grad_norm": 1.1172103881835938, "learning_rate": 0.0002, "epoch": 4.407081076711664, "step": 27260}, {"loss": 0.5286, "grad_norm": 0.8583093285560608, "learning_rate": 0.0002, "epoch": 4.40869776089241, "step": 27270}, {"loss": 0.5507, "grad_norm": 0.7609201669692993, "learning_rate": 0.0002, "epoch": 4.410314445073155, "step": 27280}, {"loss": 0.575, "grad_norm": 1.0619351863861084, "learning_rate": 0.0002, "epoch": 4.4119311292539, "step": 27290}, {"loss": 0.5579, "grad_norm": 1.0177674293518066, "learning_rate": 0.0002, "epoch": 4.413547813434645, "step": 27300}, {"loss": 0.5628, "grad_norm": 0.9921218156814575, "learning_rate": 0.0002, "epoch": 4.4151644976153905, "step": 27310}, {"loss": 0.6018, "grad_norm": 1.126244306564331, "learning_rate": 0.0002, "epoch": 4.416781181796136, "step": 27320}, {"loss": 0.5743, "grad_norm": 1.0678540468215942, "learning_rate": 0.0002, "epoch": 4.418397865976881, "step": 27330}, {"loss": 0.5665, "grad_norm": 0.8705704212188721, "learning_rate": 0.0002, "epoch": 4.420014550157627, "step": 27340}, {"loss": 0.5763, "grad_norm": 1.272074818611145, "learning_rate": 0.0002, "epoch": 4.421631234338372, "step": 27350}, {"loss": 0.561, "grad_norm": 0.8740444183349609, "learning_rate": 0.0002, "epoch": 4.423247918519118, "step": 27360}, {"loss": 0.5492, "grad_norm": 1.0584250688552856, "learning_rate": 0.0002, "epoch": 4.424864602699863, "step": 27370}, {"loss": 0.589, "grad_norm": 1.059870719909668, "learning_rate": 0.0002, "epoch": 4.426481286880608, "step": 27380}, {"loss": 0.5551, "grad_norm": 1.072265863418579, "learning_rate": 0.0002, "epoch": 4.428097971061353, "step": 27390}, {"loss": 0.5584, "grad_norm": 0.871481716632843, "learning_rate": 0.0002, "epoch": 4.4297146552420985, "step": 27400}, {"loss": 0.5372, "grad_norm": 0.9555448293685913, "learning_rate": 0.0002, "epoch": 4.431331339422844, "step": 27410}, {"loss": 0.5593, "grad_norm": 1.0402292013168335, "learning_rate": 0.0002, "epoch": 4.432948023603589, "step": 27420}, {"loss": 0.5532, "grad_norm": 1.12587571144104, "learning_rate": 0.0002, "epoch": 4.434564707784334, "step": 27430}, {"loss": 0.5403, "grad_norm": 1.0783193111419678, "learning_rate": 0.0002, "epoch": 4.436181391965079, "step": 27440}, {"loss": 0.5313, "grad_norm": 1.024133563041687, "learning_rate": 0.0002, "epoch": 4.437798076145825, "step": 27450}, {"loss": 0.5621, "grad_norm": 0.9156768918037415, "learning_rate": 0.0002, "epoch": 4.43941476032657, "step": 27460}, {"loss": 0.5307, "grad_norm": 1.0215224027633667, "learning_rate": 0.0002, "epoch": 4.441031444507315, "step": 27470}, {"loss": 0.5188, "grad_norm": 1.082116961479187, "learning_rate": 0.0002, "epoch": 4.442648128688061, "step": 27480}, {"loss": 0.6203, "grad_norm": 1.0412873029708862, "learning_rate": 0.0002, "epoch": 4.4442648128688065, "step": 27490}, {"loss": 0.5939, "grad_norm": 1.0509289503097534, "learning_rate": 0.0002, "epoch": 4.445881497049552, "step": 27500}, {"loss": 0.5503, "grad_norm": 0.9291498064994812, "learning_rate": 0.0002, "epoch": 4.447498181230297, "step": 27510}, {"loss": 0.5408, "grad_norm": 0.970184326171875, "learning_rate": 0.0002, "epoch": 4.449114865411042, "step": 27520}, {"loss": 0.5705, "grad_norm": 0.8418883681297302, "learning_rate": 0.0002, "epoch": 4.450731549591787, "step": 27530}, {"loss": 0.5124, "grad_norm": 0.8823825120925903, "learning_rate": 0.0002, "epoch": 4.452348233772533, "step": 27540}, {"loss": 0.5867, "grad_norm": 1.1909019947052002, "learning_rate": 0.0002, "epoch": 4.453964917953278, "step": 27550}, {"loss": 0.5685, "grad_norm": 1.0317302942276, "learning_rate": 0.0002, "epoch": 4.455581602134023, "step": 27560}, {"loss": 0.5538, "grad_norm": 0.9977751970291138, "learning_rate": 0.0002, "epoch": 4.457198286314768, "step": 27570}, {"loss": 0.5628, "grad_norm": 0.8909519910812378, "learning_rate": 0.0002, "epoch": 4.458814970495514, "step": 27580}, {"loss": 0.6099, "grad_norm": 0.8653029799461365, "learning_rate": 0.0002, "epoch": 4.460431654676259, "step": 27590}, {"loss": 0.5622, "grad_norm": 1.0783653259277344, "learning_rate": 0.0002, "epoch": 4.462048338857004, "step": 27600}, {"loss": 0.579, "grad_norm": 1.1235394477844238, "learning_rate": 0.0002, "epoch": 4.463665023037749, "step": 27610}, {"loss": 0.5545, "grad_norm": 0.9386643767356873, "learning_rate": 0.0002, "epoch": 4.4652817072184945, "step": 27620}, {"loss": 0.5554, "grad_norm": 1.0605148077011108, "learning_rate": 0.0002, "epoch": 4.466898391399241, "step": 27630}, {"loss": 0.5886, "grad_norm": 1.1283893585205078, "learning_rate": 0.0002, "epoch": 4.468515075579986, "step": 27640}, {"loss": 0.5801, "grad_norm": 1.0583468675613403, "learning_rate": 0.0002, "epoch": 4.470131759760731, "step": 27650}, {"loss": 0.5601, "grad_norm": 0.9563992023468018, "learning_rate": 0.0002, "epoch": 4.471748443941476, "step": 27660}, {"loss": 0.5687, "grad_norm": 1.100598931312561, "learning_rate": 0.0002, "epoch": 4.4733651281222215, "step": 27670}, {"loss": 0.589, "grad_norm": 0.9386957287788391, "learning_rate": 0.0002, "epoch": 4.474981812302967, "step": 27680}, {"loss": 0.6241, "grad_norm": 1.2946288585662842, "learning_rate": 0.0002, "epoch": 4.476598496483712, "step": 27690}, {"loss": 0.6075, "grad_norm": 1.0325199365615845, "learning_rate": 0.0002, "epoch": 4.478215180664457, "step": 27700}, {"loss": 0.588, "grad_norm": 1.0318928956985474, "learning_rate": 0.0002, "epoch": 4.4798318648452025, "step": 27710}, {"loss": 0.5656, "grad_norm": 0.8721024394035339, "learning_rate": 0.0002, "epoch": 4.481448549025948, "step": 27720}, {"loss": 0.5421, "grad_norm": 1.17376708984375, "learning_rate": 0.0002, "epoch": 4.483065233206693, "step": 27730}, {"loss": 0.5657, "grad_norm": 1.0926326513290405, "learning_rate": 0.0002, "epoch": 4.484681917387438, "step": 27740}, {"loss": 0.5514, "grad_norm": 0.9043852686882019, "learning_rate": 0.0002, "epoch": 4.486298601568183, "step": 27750}, {"loss": 0.582, "grad_norm": 1.064600944519043, "learning_rate": 0.0002, "epoch": 4.487915285748929, "step": 27760}, {"loss": 0.6108, "grad_norm": 0.7833460569381714, "learning_rate": 0.0002, "epoch": 4.489531969929674, "step": 27770}, {"loss": 0.5985, "grad_norm": 1.1073496341705322, "learning_rate": 0.0002, "epoch": 4.49114865411042, "step": 27780}, {"loss": 0.5577, "grad_norm": 1.0799397230148315, "learning_rate": 0.0002, "epoch": 4.492765338291165, "step": 27790}, {"loss": 0.5601, "grad_norm": 1.1062238216400146, "learning_rate": 0.0002, "epoch": 4.49438202247191, "step": 27800}, {"loss": 0.6126, "grad_norm": 1.0568242073059082, "learning_rate": 0.0002, "epoch": 4.495998706652656, "step": 27810}, {"loss": 0.5913, "grad_norm": 0.8861091732978821, "learning_rate": 0.0002, "epoch": 4.497615390833401, "step": 27820}, {"loss": 0.5858, "grad_norm": 1.2297543287277222, "learning_rate": 0.0002, "epoch": 4.499232075014146, "step": 27830}, {"loss": 0.5859, "grad_norm": 0.9600302577018738, "learning_rate": 0.0002, "epoch": 4.500848759194891, "step": 27840}, {"loss": 0.6124, "grad_norm": 1.057051181793213, "learning_rate": 0.0002, "epoch": 4.502465443375637, "step": 27850}, {"loss": 0.5788, "grad_norm": 0.9839690923690796, "learning_rate": 0.0002, "epoch": 4.504082127556382, "step": 27860}, {"loss": 0.555, "grad_norm": 1.1479853391647339, "learning_rate": 0.0002, "epoch": 4.505698811737127, "step": 27870}, {"loss": 0.6039, "grad_norm": 1.0550768375396729, "learning_rate": 0.0002, "epoch": 4.507315495917872, "step": 27880}, {"loss": 0.563, "grad_norm": 0.898209273815155, "learning_rate": 0.0002, "epoch": 4.5089321800986175, "step": 27890}, {"loss": 0.5734, "grad_norm": 0.9460315108299255, "learning_rate": 0.0002, "epoch": 4.510548864279363, "step": 27900}, {"loss": 0.5702, "grad_norm": 0.9499884247779846, "learning_rate": 0.0002, "epoch": 4.512165548460108, "step": 27910}, {"loss": 0.5385, "grad_norm": 0.7801318764686584, "learning_rate": 0.0002, "epoch": 4.513782232640853, "step": 27920}, {"loss": 0.5391, "grad_norm": 0.9286966323852539, "learning_rate": 0.0002, "epoch": 4.515398916821599, "step": 27930}, {"loss": 0.5717, "grad_norm": 0.9539980292320251, "learning_rate": 0.0002, "epoch": 4.517015601002345, "step": 27940}, {"loss": 0.6073, "grad_norm": 1.1053401231765747, "learning_rate": 0.0002, "epoch": 4.51863228518309, "step": 27950}, {"loss": 0.6087, "grad_norm": 0.7535534501075745, "learning_rate": 0.0002, "epoch": 4.520248969363835, "step": 27960}, {"loss": 0.5701, "grad_norm": 1.076926589012146, "learning_rate": 0.0002, "epoch": 4.52186565354458, "step": 27970}, {"loss": 0.6028, "grad_norm": 1.181935429573059, "learning_rate": 0.0002, "epoch": 4.5234823377253255, "step": 27980}, {"loss": 0.6033, "grad_norm": 0.9293407201766968, "learning_rate": 0.0002, "epoch": 4.525099021906071, "step": 27990}, {"loss": 0.5815, "grad_norm": 0.8953009247779846, "learning_rate": 0.0002, "epoch": 4.526715706086816, "step": 28000}, {"loss": 0.5564, "grad_norm": 1.0850225687026978, "learning_rate": 0.0002, "epoch": 4.528332390267561, "step": 28010}, {"loss": 0.5459, "grad_norm": 0.9125663042068481, "learning_rate": 0.0002, "epoch": 4.529949074448306, "step": 28020}, {"loss": 0.5922, "grad_norm": 0.8745216727256775, "learning_rate": 0.0002, "epoch": 4.531565758629052, "step": 28030}, {"loss": 0.567, "grad_norm": 1.0783463716506958, "learning_rate": 0.0002, "epoch": 4.533182442809797, "step": 28040}, {"loss": 0.5754, "grad_norm": 0.7513844966888428, "learning_rate": 0.0002, "epoch": 4.534799126990542, "step": 28050}, {"loss": 0.5608, "grad_norm": 1.0135776996612549, "learning_rate": 0.0002, "epoch": 4.536415811171287, "step": 28060}, {"loss": 0.5827, "grad_norm": 0.8886825442314148, "learning_rate": 0.0002, "epoch": 4.538032495352033, "step": 28070}, {"loss": 0.5605, "grad_norm": 0.8153995275497437, "learning_rate": 0.0002, "epoch": 4.539649179532779, "step": 28080}, {"loss": 0.6377, "grad_norm": 0.9853341579437256, "learning_rate": 0.0002, "epoch": 4.541265863713524, "step": 28090}, {"loss": 0.5957, "grad_norm": 0.9365800023078918, "learning_rate": 0.0002, "epoch": 4.542882547894269, "step": 28100}, {"loss": 0.5477, "grad_norm": 0.9765017628669739, "learning_rate": 0.0002, "epoch": 4.544499232075014, "step": 28110}, {"loss": 0.6185, "grad_norm": 0.9811279773712158, "learning_rate": 0.0002, "epoch": 4.54611591625576, "step": 28120}, {"loss": 0.6095, "grad_norm": 1.0387924909591675, "learning_rate": 0.0002, "epoch": 4.547732600436505, "step": 28130}, {"loss": 0.6534, "grad_norm": 1.0684878826141357, "learning_rate": 0.0002, "epoch": 4.54934928461725, "step": 28140}, {"loss": 0.5701, "grad_norm": 1.0000102519989014, "learning_rate": 0.0002, "epoch": 4.550965968797995, "step": 28150}, {"loss": 0.5327, "grad_norm": 1.0717930793762207, "learning_rate": 0.0002, "epoch": 4.5525826529787405, "step": 28160}, {"loss": 0.5594, "grad_norm": 0.990074634552002, "learning_rate": 0.0002, "epoch": 4.554199337159486, "step": 28170}, {"loss": 0.5452, "grad_norm": 0.8673754930496216, "learning_rate": 0.0002, "epoch": 4.555816021340231, "step": 28180}, {"loss": 0.5773, "grad_norm": 0.864247739315033, "learning_rate": 0.0002, "epoch": 4.557432705520976, "step": 28190}, {"loss": 0.5516, "grad_norm": 0.8280200958251953, "learning_rate": 0.0002, "epoch": 4.5590493897017215, "step": 28200}, {"loss": 0.5709, "grad_norm": 1.1312172412872314, "learning_rate": 0.0002, "epoch": 4.560666073882467, "step": 28210}, {"loss": 0.5776, "grad_norm": 0.9147403240203857, "learning_rate": 0.0002, "epoch": 4.562282758063212, "step": 28220}, {"loss": 0.5591, "grad_norm": 1.0321218967437744, "learning_rate": 0.0002, "epoch": 4.563899442243958, "step": 28230}, {"loss": 0.5508, "grad_norm": 1.168332815170288, "learning_rate": 0.0002, "epoch": 4.565516126424703, "step": 28240}, {"loss": 0.5649, "grad_norm": 1.0067222118377686, "learning_rate": 0.0002, "epoch": 4.5671328106054485, "step": 28250}, {"loss": 0.5853, "grad_norm": 1.0283393859863281, "learning_rate": 0.0002, "epoch": 4.568749494786194, "step": 28260}, {"loss": 0.5772, "grad_norm": 0.9912363886833191, "learning_rate": 0.0002, "epoch": 4.570366178966939, "step": 28270}, {"loss": 0.5757, "grad_norm": 1.108032464981079, "learning_rate": 0.0002, "epoch": 4.571982863147684, "step": 28280}, {"loss": 0.5529, "grad_norm": 0.8260078430175781, "learning_rate": 0.0002, "epoch": 4.573599547328429, "step": 28290}, {"loss": 0.5625, "grad_norm": 0.8946247100830078, "learning_rate": 0.0002, "epoch": 4.575216231509175, "step": 28300}, {"loss": 0.5533, "grad_norm": 0.8273587822914124, "learning_rate": 0.0002, "epoch": 4.57683291568992, "step": 28310}, {"loss": 0.6058, "grad_norm": 0.9040093421936035, "learning_rate": 0.0002, "epoch": 4.578449599870665, "step": 28320}, {"loss": 0.5521, "grad_norm": 0.8435290455818176, "learning_rate": 0.0002, "epoch": 4.58006628405141, "step": 28330}, {"loss": 0.6086, "grad_norm": 1.164088249206543, "learning_rate": 0.0002, "epoch": 4.581682968232156, "step": 28340}, {"loss": 0.5603, "grad_norm": 0.9861085414886475, "learning_rate": 0.0002, "epoch": 4.583299652412901, "step": 28350}, {"loss": 0.5701, "grad_norm": 0.8892980813980103, "learning_rate": 0.0002, "epoch": 4.584916336593646, "step": 28360}, {"loss": 0.598, "grad_norm": 1.240574836730957, "learning_rate": 0.0002, "epoch": 4.586533020774391, "step": 28370}, {"loss": 0.5797, "grad_norm": 0.8669408559799194, "learning_rate": 0.0002, "epoch": 4.588149704955137, "step": 28380}, {"loss": 0.5603, "grad_norm": 0.9145985841751099, "learning_rate": 0.0002, "epoch": 4.589766389135883, "step": 28390}, {"loss": 0.5765, "grad_norm": 0.8584614992141724, "learning_rate": 0.0002, "epoch": 4.591383073316628, "step": 28400}, {"loss": 0.5898, "grad_norm": 1.118829369544983, "learning_rate": 0.0002, "epoch": 4.592999757497373, "step": 28410}, {"loss": 0.5641, "grad_norm": 1.1411553621292114, "learning_rate": 0.0002, "epoch": 4.594616441678118, "step": 28420}, {"loss": 0.549, "grad_norm": 0.9433278441429138, "learning_rate": 0.0002, "epoch": 4.596233125858864, "step": 28430}, {"loss": 0.5496, "grad_norm": 0.816830039024353, "learning_rate": 0.0002, "epoch": 4.597849810039609, "step": 28440}, {"loss": 0.5543, "grad_norm": 1.2124968767166138, "learning_rate": 0.0002, "epoch": 4.599466494220354, "step": 28450}, {"loss": 0.5759, "grad_norm": 0.9658762216567993, "learning_rate": 0.0002, "epoch": 4.601083178401099, "step": 28460}, {"loss": 0.5902, "grad_norm": 0.836100161075592, "learning_rate": 0.0002, "epoch": 4.6026998625818445, "step": 28470}, {"loss": 0.5749, "grad_norm": 0.9989104270935059, "learning_rate": 0.0002, "epoch": 4.60431654676259, "step": 28480}, {"loss": 0.5616, "grad_norm": 1.1298956871032715, "learning_rate": 0.0002, "epoch": 4.605933230943335, "step": 28490}, {"loss": 0.5846, "grad_norm": 1.1731704473495483, "learning_rate": 0.0002, "epoch": 4.60754991512408, "step": 28500}, {"loss": 0.5816, "grad_norm": 0.9624714255332947, "learning_rate": 0.0002, "epoch": 4.609166599304825, "step": 28510}, {"loss": 0.5868, "grad_norm": 1.364073634147644, "learning_rate": 0.0002, "epoch": 4.610783283485571, "step": 28520}, {"loss": 0.6237, "grad_norm": 1.1827356815338135, "learning_rate": 0.0002, "epoch": 4.612399967666317, "step": 28530}, {"loss": 0.5643, "grad_norm": 0.6651531457901001, "learning_rate": 0.0002, "epoch": 4.614016651847062, "step": 28540}, {"loss": 0.6051, "grad_norm": 1.1640995740890503, "learning_rate": 0.0002, "epoch": 4.615633336027807, "step": 28550}, {"loss": 0.5995, "grad_norm": 1.028918743133545, "learning_rate": 0.0002, "epoch": 4.6172500202085525, "step": 28560}, {"loss": 0.5607, "grad_norm": 0.8252120614051819, "learning_rate": 0.0002, "epoch": 4.618866704389298, "step": 28570}, {"loss": 0.5769, "grad_norm": 1.3536735773086548, "learning_rate": 0.0002, "epoch": 4.620483388570043, "step": 28580}, {"loss": 0.6006, "grad_norm": 1.2146915197372437, "learning_rate": 0.0002, "epoch": 4.622100072750788, "step": 28590}, {"loss": 0.5503, "grad_norm": 1.0122549533843994, "learning_rate": 0.0002, "epoch": 4.623716756931533, "step": 28600}, {"loss": 0.6072, "grad_norm": 0.9977872967720032, "learning_rate": 0.0002, "epoch": 4.625333441112279, "step": 28610}, {"loss": 0.5669, "grad_norm": 1.0159751176834106, "learning_rate": 0.0002, "epoch": 4.626950125293024, "step": 28620}, {"loss": 0.5935, "grad_norm": 1.0028325319290161, "learning_rate": 0.0002, "epoch": 4.628566809473769, "step": 28630}, {"loss": 0.5515, "grad_norm": 0.901638388633728, "learning_rate": 0.0002, "epoch": 4.630183493654514, "step": 28640}, {"loss": 0.595, "grad_norm": 0.9450507164001465, "learning_rate": 0.0002, "epoch": 4.6318001778352595, "step": 28650}, {"loss": 0.5972, "grad_norm": 0.9987545013427734, "learning_rate": 0.0002, "epoch": 4.633416862016006, "step": 28660}, {"loss": 0.5863, "grad_norm": 0.9574332237243652, "learning_rate": 0.0002, "epoch": 4.63503354619675, "step": 28670}, {"loss": 0.5804, "grad_norm": 1.2215653657913208, "learning_rate": 0.0002, "epoch": 4.636650230377496, "step": 28680}, {"loss": 0.5798, "grad_norm": 0.9798858761787415, "learning_rate": 0.0002, "epoch": 4.638266914558241, "step": 28690}, {"loss": 0.5773, "grad_norm": 1.0648466348648071, "learning_rate": 0.0002, "epoch": 4.639883598738987, "step": 28700}, {"loss": 0.6108, "grad_norm": 1.0606504678726196, "learning_rate": 0.0002, "epoch": 4.641500282919732, "step": 28710}, {"loss": 0.5801, "grad_norm": 1.0892442464828491, "learning_rate": 0.0002, "epoch": 4.643116967100477, "step": 28720}, {"loss": 0.5492, "grad_norm": 0.914391040802002, "learning_rate": 0.0002, "epoch": 4.644733651281222, "step": 28730}, {"loss": 0.5439, "grad_norm": 0.9782370328903198, "learning_rate": 0.0002, "epoch": 4.6463503354619675, "step": 28740}, {"loss": 0.6035, "grad_norm": 1.0344339609146118, "learning_rate": 0.0002, "epoch": 4.647967019642713, "step": 28750}, {"loss": 0.5775, "grad_norm": 1.0513931512832642, "learning_rate": 0.0002, "epoch": 4.649583703823458, "step": 28760}, {"loss": 0.546, "grad_norm": 0.9711475968360901, "learning_rate": 0.0002, "epoch": 4.651200388004203, "step": 28770}, {"loss": 0.5472, "grad_norm": 0.977519690990448, "learning_rate": 0.0002, "epoch": 4.652817072184948, "step": 28780}, {"loss": 0.5826, "grad_norm": 0.9150224924087524, "learning_rate": 0.0002, "epoch": 4.654433756365694, "step": 28790}, {"loss": 0.5382, "grad_norm": 1.0973542928695679, "learning_rate": 0.0002, "epoch": 4.656050440546439, "step": 28800}, {"loss": 0.6147, "grad_norm": 0.944877564907074, "learning_rate": 0.0002, "epoch": 4.657667124727185, "step": 28810}, {"loss": 0.5537, "grad_norm": 0.9508748650550842, "learning_rate": 0.0002, "epoch": 4.659283808907929, "step": 28820}, {"loss": 0.5537, "grad_norm": 0.9681721329689026, "learning_rate": 0.0002, "epoch": 4.6609004930886755, "step": 28830}, {"loss": 0.592, "grad_norm": 1.0214351415634155, "learning_rate": 0.0002, "epoch": 4.662517177269421, "step": 28840}, {"loss": 0.6031, "grad_norm": 0.9748611450195312, "learning_rate": 0.0002, "epoch": 4.664133861450166, "step": 28850}, {"loss": 0.572, "grad_norm": 0.8484147191047668, "learning_rate": 0.0002, "epoch": 4.665750545630911, "step": 28860}, {"loss": 0.5699, "grad_norm": 1.1252986192703247, "learning_rate": 0.0002, "epoch": 4.667367229811656, "step": 28870}, {"loss": 0.5724, "grad_norm": 0.8706206679344177, "learning_rate": 0.0002, "epoch": 4.668983913992402, "step": 28880}, {"loss": 0.6002, "grad_norm": 1.1432424783706665, "learning_rate": 0.0002, "epoch": 4.670600598173147, "step": 28890}, {"loss": 0.5675, "grad_norm": 1.017029047012329, "learning_rate": 0.0002, "epoch": 4.672217282353892, "step": 28900}, {"loss": 0.5831, "grad_norm": 1.085597038269043, "learning_rate": 0.0002, "epoch": 4.673833966534637, "step": 28910}, {"loss": 0.5678, "grad_norm": 0.9275796413421631, "learning_rate": 0.0002, "epoch": 4.675450650715383, "step": 28920}, {"loss": 0.5603, "grad_norm": 0.9518964886665344, "learning_rate": 0.0002, "epoch": 4.677067334896128, "step": 28930}, {"loss": 0.6232, "grad_norm": 1.0352122783660889, "learning_rate": 0.0002, "epoch": 4.678684019076873, "step": 28940}, {"loss": 0.5786, "grad_norm": 1.090124249458313, "learning_rate": 0.0002, "epoch": 4.680300703257618, "step": 28950}, {"loss": 0.5728, "grad_norm": 0.8799563050270081, "learning_rate": 0.0002, "epoch": 4.681917387438364, "step": 28960}, {"loss": 0.5787, "grad_norm": 1.0929821729660034, "learning_rate": 0.0002, "epoch": 4.683534071619109, "step": 28970}, {"loss": 0.6134, "grad_norm": 0.903727650642395, "learning_rate": 0.0002, "epoch": 4.685150755799855, "step": 28980}, {"loss": 0.5522, "grad_norm": 0.9752424955368042, "learning_rate": 0.0002, "epoch": 4.6867674399806, "step": 28990}, {"loss": 0.5762, "grad_norm": 0.9351571202278137, "learning_rate": 0.0002, "epoch": 4.688384124161345, "step": 29000}, {"loss": 0.5811, "grad_norm": 0.923877477645874, "learning_rate": 0.0002, "epoch": 4.6900008083420905, "step": 29010}, {"loss": 0.5682, "grad_norm": 1.045389175415039, "learning_rate": 0.0002, "epoch": 4.691617492522836, "step": 29020}, {"loss": 0.584, "grad_norm": 1.0200831890106201, "learning_rate": 0.0002, "epoch": 4.693234176703581, "step": 29030}, {"loss": 0.5514, "grad_norm": 1.1499706506729126, "learning_rate": 0.0002, "epoch": 4.694850860884326, "step": 29040}, {"loss": 0.5745, "grad_norm": 0.860118567943573, "learning_rate": 0.0002, "epoch": 4.6964675450650715, "step": 29050}, {"loss": 0.5741, "grad_norm": 0.9774864315986633, "learning_rate": 0.0002, "epoch": 4.698084229245817, "step": 29060}, {"loss": 0.5765, "grad_norm": 1.0323210954666138, "learning_rate": 0.0002, "epoch": 4.699700913426562, "step": 29070}, {"loss": 0.5452, "grad_norm": 0.8492481112480164, "learning_rate": 0.0002, "epoch": 4.701317597607307, "step": 29080}, {"loss": 0.5985, "grad_norm": 1.131951093673706, "learning_rate": 0.0002, "epoch": 4.702934281788052, "step": 29090}, {"loss": 0.6412, "grad_norm": 0.8763113021850586, "learning_rate": 0.0002, "epoch": 4.704550965968798, "step": 29100}, {"loss": 0.575, "grad_norm": 1.045028805732727, "learning_rate": 0.0002, "epoch": 4.706167650149544, "step": 29110}, {"loss": 0.5548, "grad_norm": 0.9961401224136353, "learning_rate": 0.0002, "epoch": 4.707784334330288, "step": 29120}, {"loss": 0.559, "grad_norm": 0.9282503724098206, "learning_rate": 0.0002, "epoch": 4.709401018511034, "step": 29130}, {"loss": 0.5744, "grad_norm": 1.1418932676315308, "learning_rate": 0.0002, "epoch": 4.711017702691779, "step": 29140}, {"loss": 0.5394, "grad_norm": 0.9950099587440491, "learning_rate": 0.0002, "epoch": 4.712634386872525, "step": 29150}, {"loss": 0.6177, "grad_norm": 0.8304893374443054, "learning_rate": 0.0002, "epoch": 4.71425107105327, "step": 29160}, {"loss": 0.6074, "grad_norm": 1.115626335144043, "learning_rate": 0.0002, "epoch": 4.715867755234015, "step": 29170}, {"loss": 0.6265, "grad_norm": 1.079818606376648, "learning_rate": 0.0002, "epoch": 4.71748443941476, "step": 29180}, {"loss": 0.561, "grad_norm": 1.1929082870483398, "learning_rate": 0.0002, "epoch": 4.719101123595506, "step": 29190}, {"loss": 0.5708, "grad_norm": 0.9621080756187439, "learning_rate": 0.0002, "epoch": 4.720717807776251, "step": 29200}, {"loss": 0.546, "grad_norm": 0.8549222350120544, "learning_rate": 0.0002, "epoch": 4.722334491956996, "step": 29210}, {"loss": 0.5775, "grad_norm": 0.9341941475868225, "learning_rate": 0.0002, "epoch": 4.723951176137741, "step": 29220}, {"loss": 0.5436, "grad_norm": 1.075406789779663, "learning_rate": 0.0002, "epoch": 4.7255678603184865, "step": 29230}, {"loss": 0.576, "grad_norm": 1.0859880447387695, "learning_rate": 0.0002, "epoch": 4.727184544499232, "step": 29240}, {"loss": 0.5525, "grad_norm": 0.8475605249404907, "learning_rate": 0.0002, "epoch": 4.728801228679977, "step": 29250}, {"loss": 0.5659, "grad_norm": 0.9331845641136169, "learning_rate": 0.0002, "epoch": 4.730417912860723, "step": 29260}, {"loss": 0.5901, "grad_norm": 0.9279314279556274, "learning_rate": 0.0002, "epoch": 4.7320345970414674, "step": 29270}, {"loss": 0.597, "grad_norm": 0.7803558707237244, "learning_rate": 0.0002, "epoch": 4.733651281222214, "step": 29280}, {"loss": 0.5968, "grad_norm": 1.0159329175949097, "learning_rate": 0.0002, "epoch": 4.735267965402959, "step": 29290}, {"loss": 0.5333, "grad_norm": 0.9448670744895935, "learning_rate": 0.0002, "epoch": 4.736884649583704, "step": 29300}, {"loss": 0.574, "grad_norm": 1.0732197761535645, "learning_rate": 0.0002, "epoch": 4.738501333764449, "step": 29310}, {"loss": 0.6066, "grad_norm": 0.901830792427063, "learning_rate": 0.0002, "epoch": 4.7401180179451945, "step": 29320}, {"loss": 0.6105, "grad_norm": 0.9141789674758911, "learning_rate": 0.0002, "epoch": 4.74173470212594, "step": 29330}, {"loss": 0.5481, "grad_norm": 0.9733418226242065, "learning_rate": 0.0002, "epoch": 4.743351386306685, "step": 29340}, {"loss": 0.612, "grad_norm": 0.909810483455658, "learning_rate": 0.0002, "epoch": 4.74496807048743, "step": 29350}, {"loss": 0.5911, "grad_norm": 0.909541666507721, "learning_rate": 0.0002, "epoch": 4.746584754668175, "step": 29360}, {"loss": 0.5579, "grad_norm": 0.9383015632629395, "learning_rate": 0.0002, "epoch": 4.748201438848921, "step": 29370}, {"loss": 0.5529, "grad_norm": 0.9275668263435364, "learning_rate": 0.0002, "epoch": 4.749818123029666, "step": 29380}, {"loss": 0.5623, "grad_norm": 1.1146225929260254, "learning_rate": 0.0002, "epoch": 4.751434807210411, "step": 29390}, {"loss": 0.6018, "grad_norm": 1.0062453746795654, "learning_rate": 0.0002, "epoch": 4.753051491391156, "step": 29400}, {"loss": 0.5872, "grad_norm": 0.9451895952224731, "learning_rate": 0.0002, "epoch": 4.7546681755719025, "step": 29410}, {"loss": 0.5767, "grad_norm": 0.870457649230957, "learning_rate": 0.0002, "epoch": 4.756284859752648, "step": 29420}, {"loss": 0.57, "grad_norm": 1.0411282777786255, "learning_rate": 0.0002, "epoch": 4.757901543933393, "step": 29430}, {"loss": 0.5688, "grad_norm": 1.1648986339569092, "learning_rate": 0.0002, "epoch": 4.759518228114138, "step": 29440}, {"loss": 0.5432, "grad_norm": 0.8999572992324829, "learning_rate": 0.0002, "epoch": 4.761134912294883, "step": 29450}, {"loss": 0.5667, "grad_norm": 0.9863559007644653, "learning_rate": 0.0002, "epoch": 4.762751596475629, "step": 29460}, {"loss": 0.5779, "grad_norm": 0.9676542282104492, "learning_rate": 0.0002, "epoch": 4.764368280656374, "step": 29470}, {"loss": 0.6075, "grad_norm": 1.004775047302246, "learning_rate": 0.0002, "epoch": 4.765984964837119, "step": 29480}, {"loss": 0.6044, "grad_norm": 1.0937515497207642, "learning_rate": 0.0002, "epoch": 4.767601649017864, "step": 29490}, {"loss": 0.5433, "grad_norm": 0.9551598429679871, "learning_rate": 0.0002, "epoch": 4.7692183331986095, "step": 29500}, {"loss": 0.5609, "grad_norm": 1.0757228136062622, "learning_rate": 0.0002, "epoch": 4.770835017379355, "step": 29510}, {"loss": 0.567, "grad_norm": 1.0588841438293457, "learning_rate": 0.0002, "epoch": 4.7724517015601, "step": 29520}, {"loss": 0.5814, "grad_norm": 1.0744032859802246, "learning_rate": 0.0002, "epoch": 4.774068385740845, "step": 29530}, {"loss": 0.5681, "grad_norm": 1.0066277980804443, "learning_rate": 0.0002, "epoch": 4.7756850699215905, "step": 29540}, {"loss": 0.545, "grad_norm": 1.082319736480713, "learning_rate": 0.0002, "epoch": 4.777301754102336, "step": 29550}, {"loss": 0.5709, "grad_norm": 0.8252472877502441, "learning_rate": 0.0002, "epoch": 4.778918438283082, "step": 29560}, {"loss": 0.5666, "grad_norm": 0.9855340123176575, "learning_rate": 0.0002, "epoch": 4.780535122463827, "step": 29570}, {"loss": 0.6117, "grad_norm": 0.9991421699523926, "learning_rate": 0.0002, "epoch": 4.782151806644572, "step": 29580}, {"loss": 0.5966, "grad_norm": 1.316841959953308, "learning_rate": 0.0002, "epoch": 4.7837684908253175, "step": 29590}, {"loss": 0.6102, "grad_norm": 1.1513035297393799, "learning_rate": 0.0002, "epoch": 4.785385175006063, "step": 29600}, {"loss": 0.5785, "grad_norm": 0.9767683744430542, "learning_rate": 0.0002, "epoch": 4.787001859186808, "step": 29610}, {"loss": 0.6037, "grad_norm": 0.9786278605461121, "learning_rate": 0.0002, "epoch": 4.788618543367553, "step": 29620}, {"loss": 0.6108, "grad_norm": 0.8004973530769348, "learning_rate": 0.0002, "epoch": 4.7902352275482984, "step": 29630}, {"loss": 0.5932, "grad_norm": 1.0997767448425293, "learning_rate": 0.0002, "epoch": 4.791851911729044, "step": 29640}, {"loss": 0.5655, "grad_norm": 0.9752856492996216, "learning_rate": 0.0002, "epoch": 4.793468595909789, "step": 29650}, {"loss": 0.5916, "grad_norm": 1.0518392324447632, "learning_rate": 0.0002, "epoch": 4.795085280090534, "step": 29660}, {"loss": 0.6042, "grad_norm": 1.1050055027008057, "learning_rate": 0.0002, "epoch": 4.796701964271279, "step": 29670}, {"loss": 0.6089, "grad_norm": 0.9933857917785645, "learning_rate": 0.0002, "epoch": 4.798318648452025, "step": 29680}, {"loss": 0.6041, "grad_norm": 1.2804018259048462, "learning_rate": 0.0002, "epoch": 4.79993533263277, "step": 29690}, {"loss": 0.636, "grad_norm": 1.0133371353149414, "learning_rate": 0.0002, "epoch": 4.801552016813515, "step": 29700}, {"loss": 0.5662, "grad_norm": 1.080350637435913, "learning_rate": 0.0002, "epoch": 4.803168700994261, "step": 29710}, {"loss": 0.5603, "grad_norm": 0.9986529350280762, "learning_rate": 0.0002, "epoch": 4.804785385175006, "step": 29720}, {"loss": 0.5894, "grad_norm": 0.975665807723999, "learning_rate": 0.0002, "epoch": 4.806402069355752, "step": 29730}, {"loss": 0.6328, "grad_norm": 0.8458138704299927, "learning_rate": 0.0002, "epoch": 4.808018753536497, "step": 29740}, {"loss": 0.5837, "grad_norm": 0.99330073595047, "learning_rate": 0.0002, "epoch": 4.809635437717242, "step": 29750}, {"loss": 0.5507, "grad_norm": 0.898274302482605, "learning_rate": 0.0002, "epoch": 4.811252121897987, "step": 29760}, {"loss": 0.5842, "grad_norm": 1.0504480600357056, "learning_rate": 0.0002, "epoch": 4.812868806078733, "step": 29770}, {"loss": 0.5821, "grad_norm": 0.937919020652771, "learning_rate": 0.0002, "epoch": 4.814485490259478, "step": 29780}, {"loss": 0.5885, "grad_norm": 0.9593307971954346, "learning_rate": 0.0002, "epoch": 4.816102174440223, "step": 29790}, {"loss": 0.578, "grad_norm": 0.9431198835372925, "learning_rate": 0.0002, "epoch": 4.817718858620968, "step": 29800}, {"loss": 0.5739, "grad_norm": 1.2729957103729248, "learning_rate": 0.0002, "epoch": 4.8193355428017135, "step": 29810}, {"loss": 0.6124, "grad_norm": 0.8876838684082031, "learning_rate": 0.0002, "epoch": 4.820952226982459, "step": 29820}, {"loss": 0.5583, "grad_norm": 1.0185000896453857, "learning_rate": 0.0002, "epoch": 4.822568911163204, "step": 29830}, {"loss": 0.5686, "grad_norm": 1.064276099205017, "learning_rate": 0.0002, "epoch": 4.824185595343949, "step": 29840}, {"loss": 0.5698, "grad_norm": 0.9774803519248962, "learning_rate": 0.0002, "epoch": 4.825802279524694, "step": 29850}, {"loss": 0.5533, "grad_norm": 1.131646990776062, "learning_rate": 0.0002, "epoch": 4.8274189637054405, "step": 29860}, {"loss": 0.6371, "grad_norm": 1.081455945968628, "learning_rate": 0.0002, "epoch": 4.829035647886186, "step": 29870}, {"loss": 0.5793, "grad_norm": 0.990538477897644, "learning_rate": 0.0002, "epoch": 4.830652332066931, "step": 29880}, {"loss": 0.5833, "grad_norm": 0.9750600457191467, "learning_rate": 0.0002, "epoch": 4.832269016247676, "step": 29890}, {"loss": 0.619, "grad_norm": 1.0600621700286865, "learning_rate": 0.0002, "epoch": 4.8338857004284215, "step": 29900}, {"loss": 0.5841, "grad_norm": 0.9237320423126221, "learning_rate": 0.0002, "epoch": 4.835502384609167, "step": 29910}, {"loss": 0.5513, "grad_norm": 0.9739177227020264, "learning_rate": 0.0002, "epoch": 4.837119068789912, "step": 29920}, {"loss": 0.587, "grad_norm": 1.128677248954773, "learning_rate": 0.0002, "epoch": 4.838735752970657, "step": 29930}, {"loss": 0.564, "grad_norm": 1.042604923248291, "learning_rate": 0.0002, "epoch": 4.840352437151402, "step": 29940}, {"loss": 0.5885, "grad_norm": 0.849758505821228, "learning_rate": 0.0002, "epoch": 4.841969121332148, "step": 29950}, {"loss": 0.5952, "grad_norm": 1.2809888124465942, "learning_rate": 0.0002, "epoch": 4.843585805512893, "step": 29960}, {"loss": 0.5703, "grad_norm": 1.0177865028381348, "learning_rate": 0.0002, "epoch": 4.845202489693638, "step": 29970}, {"loss": 0.5946, "grad_norm": 1.0026639699935913, "learning_rate": 0.0002, "epoch": 4.846819173874383, "step": 29980}, {"loss": 0.5897, "grad_norm": 0.9679505228996277, "learning_rate": 0.0002, "epoch": 4.8484358580551286, "step": 29990}, {"loss": 0.5621, "grad_norm": 0.8939532041549683, "learning_rate": 0.0002, "epoch": 4.850052542235874, "step": 30000}, {"loss": 0.5852, "grad_norm": 0.9957457780838013, "learning_rate": 0.0002, "epoch": 4.85166922641662, "step": 30010}, {"loss": 0.6117, "grad_norm": 1.1646790504455566, "learning_rate": 0.0002, "epoch": 4.853285910597365, "step": 30020}, {"loss": 0.5711, "grad_norm": 0.8804680705070496, "learning_rate": 0.0002, "epoch": 4.85490259477811, "step": 30030}, {"loss": 0.5397, "grad_norm": 1.161970853805542, "learning_rate": 0.0002, "epoch": 4.856519278958856, "step": 30040}, {"loss": 0.5552, "grad_norm": 0.9081037640571594, "learning_rate": 0.0002, "epoch": 4.858135963139601, "step": 30050}, {"loss": 0.6024, "grad_norm": 0.9402848482131958, "learning_rate": 0.0002, "epoch": 4.859752647320346, "step": 30060}, {"loss": 0.6256, "grad_norm": 0.9023865461349487, "learning_rate": 0.0002, "epoch": 4.861369331501091, "step": 30070}, {"loss": 0.5926, "grad_norm": 1.0173414945602417, "learning_rate": 0.0002, "epoch": 4.8629860156818365, "step": 30080}, {"loss": 0.6274, "grad_norm": 1.084402322769165, "learning_rate": 0.0002, "epoch": 4.864602699862582, "step": 30090}, {"loss": 0.6311, "grad_norm": 0.9577937126159668, "learning_rate": 0.0002, "epoch": 4.866219384043327, "step": 30100}, {"loss": 0.5724, "grad_norm": 0.9807606935501099, "learning_rate": 0.0002, "epoch": 4.867836068224072, "step": 30110}, {"loss": 0.5786, "grad_norm": 0.978784441947937, "learning_rate": 0.0002, "epoch": 4.8694527524048175, "step": 30120}, {"loss": 0.6194, "grad_norm": 0.9762914776802063, "learning_rate": 0.0002, "epoch": 4.871069436585563, "step": 30130}, {"loss": 0.5892, "grad_norm": 0.9404871463775635, "learning_rate": 0.0002, "epoch": 4.872686120766308, "step": 30140}, {"loss": 0.6182, "grad_norm": 1.0069509744644165, "learning_rate": 0.0002, "epoch": 4.874302804947053, "step": 30150}, {"loss": 0.6225, "grad_norm": 1.1770923137664795, "learning_rate": 0.0002, "epoch": 4.875919489127799, "step": 30160}, {"loss": 0.5657, "grad_norm": 1.021210789680481, "learning_rate": 0.0002, "epoch": 4.8775361733085445, "step": 30170}, {"loss": 0.6033, "grad_norm": 0.8512648940086365, "learning_rate": 0.0002, "epoch": 4.87915285748929, "step": 30180}, {"loss": 0.5519, "grad_norm": 0.9345870018005371, "learning_rate": 0.0002, "epoch": 4.880769541670035, "step": 30190}, {"loss": 0.5682, "grad_norm": 1.0224418640136719, "learning_rate": 0.0002, "epoch": 4.88238622585078, "step": 30200}, {"loss": 0.5807, "grad_norm": 1.0316044092178345, "learning_rate": 0.0002, "epoch": 4.884002910031525, "step": 30210}, {"loss": 0.6065, "grad_norm": 1.102437973022461, "learning_rate": 0.0002, "epoch": 4.885619594212271, "step": 30220}, {"loss": 0.586, "grad_norm": 1.0220023393630981, "learning_rate": 0.0002, "epoch": 4.887236278393016, "step": 30230}, {"loss": 0.5781, "grad_norm": 1.0934523344039917, "learning_rate": 0.0002, "epoch": 4.888852962573761, "step": 30240}, {"loss": 0.6313, "grad_norm": 1.264630913734436, "learning_rate": 0.0002, "epoch": 4.890469646754506, "step": 30250}, {"loss": 0.5712, "grad_norm": 1.0999879837036133, "learning_rate": 0.0002, "epoch": 4.892086330935252, "step": 30260}, {"loss": 0.6413, "grad_norm": 0.9124550223350525, "learning_rate": 0.0002, "epoch": 4.893703015115997, "step": 30270}, {"loss": 0.596, "grad_norm": 0.9853624105453491, "learning_rate": 0.0002, "epoch": 4.895319699296742, "step": 30280}, {"loss": 0.595, "grad_norm": 1.0589802265167236, "learning_rate": 0.0002, "epoch": 4.896936383477488, "step": 30290}, {"loss": 0.6129, "grad_norm": 0.8487226366996765, "learning_rate": 0.0002, "epoch": 4.8985530676582325, "step": 30300}, {"loss": 0.5514, "grad_norm": 1.0212191343307495, "learning_rate": 0.0002, "epoch": 4.900169751838979, "step": 30310}, {"loss": 0.5896, "grad_norm": 1.0187491178512573, "learning_rate": 0.0002, "epoch": 4.901786436019724, "step": 30320}, {"loss": 0.5809, "grad_norm": 1.0013091564178467, "learning_rate": 0.0002, "epoch": 4.903403120200469, "step": 30330}, {"loss": 0.5658, "grad_norm": 1.0017542839050293, "learning_rate": 0.0002, "epoch": 4.905019804381214, "step": 30340}, {"loss": 0.6002, "grad_norm": 0.9665151238441467, "learning_rate": 0.0002, "epoch": 4.9066364885619596, "step": 30350}, {"loss": 0.5864, "grad_norm": 0.8774822950363159, "learning_rate": 0.0002, "epoch": 4.908253172742705, "step": 30360}, {"loss": 0.5771, "grad_norm": 0.9449850916862488, "learning_rate": 0.0002, "epoch": 4.90986985692345, "step": 30370}, {"loss": 0.58, "grad_norm": 0.7368341088294983, "learning_rate": 0.0002, "epoch": 4.911486541104195, "step": 30380}, {"loss": 0.5992, "grad_norm": 0.9669167995452881, "learning_rate": 0.0002, "epoch": 4.9131032252849405, "step": 30390}, {"loss": 0.6202, "grad_norm": 1.1227794885635376, "learning_rate": 0.0002, "epoch": 4.914719909465686, "step": 30400}, {"loss": 0.6181, "grad_norm": 0.9884361028671265, "learning_rate": 0.0002, "epoch": 4.916336593646431, "step": 30410}, {"loss": 0.6185, "grad_norm": 0.9949551224708557, "learning_rate": 0.0002, "epoch": 4.917953277827176, "step": 30420}, {"loss": 0.5866, "grad_norm": 0.9491621851921082, "learning_rate": 0.0002, "epoch": 4.919569962007921, "step": 30430}, {"loss": 0.6005, "grad_norm": 0.78848797082901, "learning_rate": 0.0002, "epoch": 4.9211866461886675, "step": 30440}, {"loss": 0.5561, "grad_norm": 1.0693835020065308, "learning_rate": 0.0002, "epoch": 4.922803330369412, "step": 30450}, {"loss": 0.566, "grad_norm": 0.9573729634284973, "learning_rate": 0.0002, "epoch": 4.924420014550158, "step": 30460}, {"loss": 0.6084, "grad_norm": 0.9975152611732483, "learning_rate": 0.0002, "epoch": 4.926036698730903, "step": 30470}, {"loss": 0.5969, "grad_norm": 0.8695693016052246, "learning_rate": 0.0002, "epoch": 4.9276533829116484, "step": 30480}, {"loss": 0.6144, "grad_norm": 1.145394206047058, "learning_rate": 0.0002, "epoch": 4.929270067092394, "step": 30490}, {"loss": 0.5736, "grad_norm": 0.7668989896774292, "learning_rate": 0.0002, "epoch": 4.930886751273139, "step": 30500}, {"loss": 0.6052, "grad_norm": 0.9630151391029358, "learning_rate": 0.0002, "epoch": 4.932503435453884, "step": 30510}, {"loss": 0.6461, "grad_norm": 0.940705418586731, "learning_rate": 0.0002, "epoch": 4.934120119634629, "step": 30520}, {"loss": 0.6326, "grad_norm": 1.3243348598480225, "learning_rate": 0.0002, "epoch": 4.935736803815375, "step": 30530}, {"loss": 0.6174, "grad_norm": 1.004347801208496, "learning_rate": 0.0002, "epoch": 4.93735348799612, "step": 30540}, {"loss": 0.583, "grad_norm": 0.8711541295051575, "learning_rate": 0.0002, "epoch": 4.938970172176865, "step": 30550}, {"loss": 0.599, "grad_norm": 0.8980631828308105, "learning_rate": 0.0002, "epoch": 4.94058685635761, "step": 30560}, {"loss": 0.6024, "grad_norm": 0.8388893604278564, "learning_rate": 0.0002, "epoch": 4.9422035405383555, "step": 30570}, {"loss": 0.6189, "grad_norm": 1.0991183519363403, "learning_rate": 0.0002, "epoch": 4.943820224719101, "step": 30580}, {"loss": 0.5906, "grad_norm": 0.9731075763702393, "learning_rate": 0.0002, "epoch": 4.945436908899847, "step": 30590}, {"loss": 0.5883, "grad_norm": 1.3904452323913574, "learning_rate": 0.0002, "epoch": 4.947053593080591, "step": 30600}, {"loss": 0.5952, "grad_norm": 1.2489882707595825, "learning_rate": 0.0002, "epoch": 4.948670277261337, "step": 30610}, {"loss": 0.5887, "grad_norm": 1.240072250366211, "learning_rate": 0.0002, "epoch": 4.950286961442083, "step": 30620}, {"loss": 0.5762, "grad_norm": 0.9191411733627319, "learning_rate": 0.0002, "epoch": 4.951903645622828, "step": 30630}, {"loss": 0.5597, "grad_norm": 0.8888895511627197, "learning_rate": 0.0002, "epoch": 4.953520329803573, "step": 30640}, {"loss": 0.6594, "grad_norm": 0.9001450538635254, "learning_rate": 0.0002, "epoch": 4.955137013984318, "step": 30650}, {"loss": 0.6047, "grad_norm": 1.053971767425537, "learning_rate": 0.0002, "epoch": 4.9567536981650635, "step": 30660}, {"loss": 0.6107, "grad_norm": 1.2224042415618896, "learning_rate": 0.0002, "epoch": 4.958370382345809, "step": 30670}, {"loss": 0.6211, "grad_norm": 0.8855111598968506, "learning_rate": 0.0002, "epoch": 4.959987066526554, "step": 30680}, {"loss": 0.5764, "grad_norm": 0.9489575624465942, "learning_rate": 0.0002, "epoch": 4.961603750707299, "step": 30690}, {"loss": 0.5371, "grad_norm": 0.9635404944419861, "learning_rate": 0.0002, "epoch": 4.963220434888044, "step": 30700}, {"loss": 0.6043, "grad_norm": 1.1784121990203857, "learning_rate": 0.0002, "epoch": 4.96483711906879, "step": 30710}, {"loss": 0.5803, "grad_norm": 1.0059462785720825, "learning_rate": 0.0002, "epoch": 4.966453803249535, "step": 30720}, {"loss": 0.5759, "grad_norm": 0.9479738473892212, "learning_rate": 0.0002, "epoch": 4.96807048743028, "step": 30730}, {"loss": 0.584, "grad_norm": 1.0624593496322632, "learning_rate": 0.0002, "epoch": 4.969687171611026, "step": 30740}, {"loss": 0.6202, "grad_norm": 1.1429259777069092, "learning_rate": 0.0002, "epoch": 4.971303855791771, "step": 30750}, {"loss": 0.6174, "grad_norm": 0.9102491140365601, "learning_rate": 0.0002, "epoch": 4.972920539972517, "step": 30760}, {"loss": 0.6025, "grad_norm": 1.1262688636779785, "learning_rate": 0.0002, "epoch": 4.974537224153262, "step": 30770}, {"loss": 0.588, "grad_norm": 1.1415393352508545, "learning_rate": 0.0002, "epoch": 4.976153908334007, "step": 30780}, {"loss": 0.5832, "grad_norm": 1.083078384399414, "learning_rate": 0.0002, "epoch": 4.977770592514752, "step": 30790}, {"loss": 0.6025, "grad_norm": 0.964859127998352, "learning_rate": 0.0002, "epoch": 4.979387276695498, "step": 30800}, {"loss": 0.6095, "grad_norm": 0.8704743385314941, "learning_rate": 0.0002, "epoch": 4.981003960876243, "step": 30810}, {"loss": 0.5666, "grad_norm": 1.0714856386184692, "learning_rate": 0.0002, "epoch": 4.982620645056988, "step": 30820}, {"loss": 0.565, "grad_norm": 0.6818771362304688, "learning_rate": 0.0002, "epoch": 4.984237329237733, "step": 30830}, {"loss": 0.5999, "grad_norm": 1.0454156398773193, "learning_rate": 0.0002, "epoch": 4.985854013418479, "step": 30840}, {"loss": 0.5683, "grad_norm": 0.9410776495933533, "learning_rate": 0.0002, "epoch": 4.987470697599224, "step": 30850}, {"loss": 0.5899, "grad_norm": 1.0878902673721313, "learning_rate": 0.0002, "epoch": 4.989087381779969, "step": 30860}, {"loss": 0.5914, "grad_norm": 0.8916727304458618, "learning_rate": 0.0002, "epoch": 4.990704065960714, "step": 30870}, {"loss": 0.6066, "grad_norm": 1.045776128768921, "learning_rate": 0.0002, "epoch": 4.9923207501414595, "step": 30880}, {"loss": 0.5767, "grad_norm": 0.9861903786659241, "learning_rate": 0.0002, "epoch": 4.993937434322206, "step": 30890}, {"loss": 0.6192, "grad_norm": 0.9275050759315491, "learning_rate": 0.0002, "epoch": 4.995554118502951, "step": 30900}, {"loss": 0.6181, "grad_norm": 0.94013911485672, "learning_rate": 0.0002, "epoch": 4.997170802683696, "step": 30910}, {"loss": 0.614, "grad_norm": 0.9771268367767334, "learning_rate": 0.0002, "epoch": 4.998787486864441, "step": 30920}, {"eval_loss": 1.1968598365783691, "eval_runtime": 122.2519, "eval_samples_per_second": 5.996, "eval_steps_per_second": 0.753, "epoch": 4.9999191657909625, "step": 30927}, {"loss": 0.5238, "grad_norm": 0.8021580576896667, "learning_rate": 0.0002, "epoch": 5.0004041710451865, "step": 30930}, {"loss": 0.4984, "grad_norm": 1.0807327032089233, "learning_rate": 0.0002, "epoch": 5.002020855225932, "step": 30940}, {"loss": 0.514, "grad_norm": 1.1638425588607788, "learning_rate": 0.0002, "epoch": 5.003637539406677, "step": 30950}, {"loss": 0.4621, "grad_norm": 1.1700230836868286, "learning_rate": 0.0002, "epoch": 5.005254223587422, "step": 30960}, {"loss": 0.4657, "grad_norm": 0.9053420424461365, "learning_rate": 0.0002, "epoch": 5.0068709077681675, "step": 30970}, {"loss": 0.4865, "grad_norm": 0.9226111769676208, "learning_rate": 0.0002, "epoch": 5.008487591948913, "step": 30980}, {"loss": 0.5011, "grad_norm": 1.238669514656067, "learning_rate": 0.0002, "epoch": 5.010104276129658, "step": 30990}, {"loss": 0.4754, "grad_norm": 1.0668327808380127, "learning_rate": 0.0002, "epoch": 5.011720960310403, "step": 31000}, {"loss": 0.5414, "grad_norm": 1.0903944969177246, "learning_rate": 0.0002, "epoch": 5.013337644491148, "step": 31010}, {"loss": 0.5117, "grad_norm": 1.0763911008834839, "learning_rate": 0.0002, "epoch": 5.014954328671894, "step": 31020}, {"loss": 0.4908, "grad_norm": 1.0108771324157715, "learning_rate": 0.0002, "epoch": 5.016571012852639, "step": 31030}, {"loss": 0.5052, "grad_norm": 0.8816103935241699, "learning_rate": 0.0002, "epoch": 5.018187697033385, "step": 31040}, {"loss": 0.4985, "grad_norm": 1.11434805393219, "learning_rate": 0.0002, "epoch": 5.01980438121413, "step": 31050}, {"loss": 0.5074, "grad_norm": 1.0727789402008057, "learning_rate": 0.0002, "epoch": 5.021421065394875, "step": 31060}, {"loss": 0.4938, "grad_norm": 1.1480379104614258, "learning_rate": 0.0002, "epoch": 5.023037749575621, "step": 31070}, {"loss": 0.491, "grad_norm": 1.0913071632385254, "learning_rate": 0.0002, "epoch": 5.024654433756366, "step": 31080}, {"loss": 0.4896, "grad_norm": 0.9891864657402039, "learning_rate": 0.0002, "epoch": 5.026271117937111, "step": 31090}, {"loss": 0.4965, "grad_norm": 0.9167473912239075, "learning_rate": 0.0002, "epoch": 5.027887802117856, "step": 31100}, {"loss": 0.5098, "grad_norm": 1.2259035110473633, "learning_rate": 0.0002, "epoch": 5.029504486298602, "step": 31110}, {"loss": 0.5206, "grad_norm": 1.1812787055969238, "learning_rate": 0.0002, "epoch": 5.031121170479347, "step": 31120}, {"loss": 0.4725, "grad_norm": 1.0890522003173828, "learning_rate": 0.0002, "epoch": 5.032737854660092, "step": 31130}, {"loss": 0.4768, "grad_norm": 1.0521091222763062, "learning_rate": 0.0002, "epoch": 5.034354538840837, "step": 31140}, {"loss": 0.4718, "grad_norm": 1.1274569034576416, "learning_rate": 0.0002, "epoch": 5.0359712230215825, "step": 31150}, {"loss": 0.4604, "grad_norm": 1.140974998474121, "learning_rate": 0.0002, "epoch": 5.037587907202328, "step": 31160}, {"loss": 0.5077, "grad_norm": 1.1215609312057495, "learning_rate": 0.0002, "epoch": 5.039204591383073, "step": 31170}, {"loss": 0.4746, "grad_norm": 1.0107218027114868, "learning_rate": 0.0002, "epoch": 5.040821275563818, "step": 31180}, {"loss": 0.5126, "grad_norm": 1.0198770761489868, "learning_rate": 0.0002, "epoch": 5.042437959744564, "step": 31190}, {"loss": 0.5004, "grad_norm": 1.1613430976867676, "learning_rate": 0.0002, "epoch": 5.0440546439253096, "step": 31200}, {"loss": 0.5181, "grad_norm": 0.8555458188056946, "learning_rate": 0.0002, "epoch": 5.045671328106055, "step": 31210}, {"loss": 0.4878, "grad_norm": 1.0235545635223389, "learning_rate": 0.0002, "epoch": 5.0472880122868, "step": 31220}, {"loss": 0.499, "grad_norm": 1.0228750705718994, "learning_rate": 0.0002, "epoch": 5.048904696467545, "step": 31230}, {"loss": 0.4544, "grad_norm": 0.8216419816017151, "learning_rate": 0.0002, "epoch": 5.0505213806482905, "step": 31240}, {"loss": 0.4947, "grad_norm": 0.925828218460083, "learning_rate": 0.0002, "epoch": 5.052138064829036, "step": 31250}, {"loss": 0.4835, "grad_norm": 0.9229369759559631, "learning_rate": 0.0002, "epoch": 5.053754749009781, "step": 31260}, {"loss": 0.5136, "grad_norm": 0.9531727433204651, "learning_rate": 0.0002, "epoch": 5.055371433190526, "step": 31270}, {"loss": 0.5161, "grad_norm": 0.7738548517227173, "learning_rate": 0.0002, "epoch": 5.056988117371271, "step": 31280}, {"loss": 0.5166, "grad_norm": 1.0551451444625854, "learning_rate": 0.0002, "epoch": 5.058604801552017, "step": 31290}, {"loss": 0.4953, "grad_norm": 0.9782299399375916, "learning_rate": 0.0002, "epoch": 5.060221485732762, "step": 31300}, {"loss": 0.4776, "grad_norm": 1.0220632553100586, "learning_rate": 0.0002, "epoch": 5.061838169913507, "step": 31310}, {"loss": 0.5117, "grad_norm": 0.9808892607688904, "learning_rate": 0.0002, "epoch": 5.063454854094252, "step": 31320}, {"loss": 0.501, "grad_norm": 1.0662003755569458, "learning_rate": 0.0002, "epoch": 5.065071538274998, "step": 31330}, {"loss": 0.4844, "grad_norm": 1.0036940574645996, "learning_rate": 0.0002, "epoch": 5.066688222455744, "step": 31340}, {"loss": 0.5299, "grad_norm": 1.1931052207946777, "learning_rate": 0.0002, "epoch": 5.068304906636489, "step": 31350}, {"loss": 0.4646, "grad_norm": 0.9370693564414978, "learning_rate": 0.0002, "epoch": 5.069921590817234, "step": 31360}, {"loss": 0.5274, "grad_norm": 0.9589039087295532, "learning_rate": 0.0002, "epoch": 5.071538274997979, "step": 31370}, {"loss": 0.4669, "grad_norm": 1.0052711963653564, "learning_rate": 0.0002, "epoch": 5.073154959178725, "step": 31380}, {"loss": 0.5283, "grad_norm": 0.9991368651390076, "learning_rate": 0.0002, "epoch": 5.07477164335947, "step": 31390}, {"loss": 0.4579, "grad_norm": 0.8539695739746094, "learning_rate": 0.0002, "epoch": 5.076388327540215, "step": 31400}, {"loss": 0.4609, "grad_norm": 1.048775553703308, "learning_rate": 0.0002, "epoch": 5.07800501172096, "step": 31410}, {"loss": 0.4915, "grad_norm": 0.9983724355697632, "learning_rate": 0.0002, "epoch": 5.0796216959017055, "step": 31420}, {"loss": 0.4594, "grad_norm": 1.0189813375473022, "learning_rate": 0.0002, "epoch": 5.081238380082451, "step": 31430}, {"loss": 0.5449, "grad_norm": 0.9781646728515625, "learning_rate": 0.0002, "epoch": 5.082855064263196, "step": 31440}, {"loss": 0.4698, "grad_norm": 0.9424566030502319, "learning_rate": 0.0002, "epoch": 5.084471748443941, "step": 31450}, {"loss": 0.4768, "grad_norm": 1.0036484003067017, "learning_rate": 0.0002, "epoch": 5.0860884326246865, "step": 31460}, {"loss": 0.487, "grad_norm": 1.0983147621154785, "learning_rate": 0.0002, "epoch": 5.087705116805432, "step": 31470}, {"loss": 0.5236, "grad_norm": 1.0856730937957764, "learning_rate": 0.0002, "epoch": 5.089321800986177, "step": 31480}, {"loss": 0.485, "grad_norm": 1.2191699743270874, "learning_rate": 0.0002, "epoch": 5.090938485166923, "step": 31490}, {"loss": 0.4936, "grad_norm": 0.939346194267273, "learning_rate": 0.0002, "epoch": 5.092555169347668, "step": 31500}, {"loss": 0.5107, "grad_norm": 0.9730121493339539, "learning_rate": 0.0002, "epoch": 5.0941718535284135, "step": 31510}, {"loss": 0.4973, "grad_norm": 0.923686146736145, "learning_rate": 0.0002, "epoch": 5.095788537709159, "step": 31520}, {"loss": 0.4906, "grad_norm": 1.1734349727630615, "learning_rate": 0.0002, "epoch": 5.097405221889904, "step": 31530}, {"loss": 0.5165, "grad_norm": 1.084509015083313, "learning_rate": 0.0002, "epoch": 5.099021906070649, "step": 31540}, {"loss": 0.5078, "grad_norm": 1.0144678354263306, "learning_rate": 0.0002, "epoch": 5.100638590251394, "step": 31550}, {"loss": 0.4719, "grad_norm": 0.9958019256591797, "learning_rate": 0.0002, "epoch": 5.10225527443214, "step": 31560}, {"loss": 0.4876, "grad_norm": 0.8900736570358276, "learning_rate": 0.0002, "epoch": 5.103871958612885, "step": 31570}, {"loss": 0.463, "grad_norm": 1.0921649932861328, "learning_rate": 0.0002, "epoch": 5.10548864279363, "step": 31580}, {"loss": 0.5148, "grad_norm": 1.1613792181015015, "learning_rate": 0.0002, "epoch": 5.107105326974375, "step": 31590}, {"loss": 0.5055, "grad_norm": 0.9211367964744568, "learning_rate": 0.0002, "epoch": 5.108722011155121, "step": 31600}, {"loss": 0.5364, "grad_norm": 1.3315813541412354, "learning_rate": 0.0002, "epoch": 5.110338695335866, "step": 31610}, {"loss": 0.5336, "grad_norm": 1.3765019178390503, "learning_rate": 0.0002, "epoch": 5.111955379516611, "step": 31620}, {"loss": 0.4861, "grad_norm": 1.070198893547058, "learning_rate": 0.0002, "epoch": 5.113572063697356, "step": 31630}, {"loss": 0.5046, "grad_norm": 0.947631299495697, "learning_rate": 0.0002, "epoch": 5.115188747878102, "step": 31640}, {"loss": 0.5297, "grad_norm": 1.0197371244430542, "learning_rate": 0.0002, "epoch": 5.116805432058848, "step": 31650}, {"loss": 0.5014, "grad_norm": 0.8647911548614502, "learning_rate": 0.0002, "epoch": 5.118422116239593, "step": 31660}, {"loss": 0.4705, "grad_norm": 0.8944075107574463, "learning_rate": 0.0002, "epoch": 5.120038800420338, "step": 31670}, {"loss": 0.5175, "grad_norm": 1.124497652053833, "learning_rate": 0.0002, "epoch": 5.121655484601083, "step": 31680}, {"loss": 0.5109, "grad_norm": 0.893131673336029, "learning_rate": 0.0002, "epoch": 5.123272168781829, "step": 31690}, {"loss": 0.4937, "grad_norm": 1.0122284889221191, "learning_rate": 0.0002, "epoch": 5.124888852962574, "step": 31700}, {"loss": 0.5522, "grad_norm": 0.9493719935417175, "learning_rate": 0.0002, "epoch": 5.126505537143319, "step": 31710}, {"loss": 0.5031, "grad_norm": 0.9700539112091064, "learning_rate": 0.0002, "epoch": 5.128122221324064, "step": 31720}, {"loss": 0.5126, "grad_norm": 1.111677646636963, "learning_rate": 0.0002, "epoch": 5.1297389055048095, "step": 31730}, {"loss": 0.5272, "grad_norm": 0.8204274773597717, "learning_rate": 0.0002, "epoch": 5.131355589685555, "step": 31740}, {"loss": 0.5029, "grad_norm": 1.1029267311096191, "learning_rate": 0.0002, "epoch": 5.1329722738663, "step": 31750}, {"loss": 0.505, "grad_norm": 1.065575122833252, "learning_rate": 0.0002, "epoch": 5.134588958047045, "step": 31760}, {"loss": 0.502, "grad_norm": 0.8208706974983215, "learning_rate": 0.0002, "epoch": 5.13620564222779, "step": 31770}, {"loss": 0.5352, "grad_norm": 1.0520979166030884, "learning_rate": 0.0002, "epoch": 5.137822326408536, "step": 31780}, {"loss": 0.4911, "grad_norm": 0.8585538268089294, "learning_rate": 0.0002, "epoch": 5.139439010589282, "step": 31790}, {"loss": 0.5159, "grad_norm": 1.1491447687149048, "learning_rate": 0.0002, "epoch": 5.141055694770027, "step": 31800}, {"loss": 0.5157, "grad_norm": 0.9441081285476685, "learning_rate": 0.0002, "epoch": 5.142672378950772, "step": 31810}, {"loss": 0.5383, "grad_norm": 1.4146889448165894, "learning_rate": 0.0002, "epoch": 5.1442890631315175, "step": 31820}, {"loss": 0.5159, "grad_norm": 1.0326547622680664, "learning_rate": 0.0002, "epoch": 5.145905747312263, "step": 31830}, {"loss": 0.5348, "grad_norm": 0.9879202842712402, "learning_rate": 0.0002, "epoch": 5.147522431493008, "step": 31840}, {"loss": 0.5083, "grad_norm": 1.0374281406402588, "learning_rate": 0.0002, "epoch": 5.149139115673753, "step": 31850}, {"loss": 0.4827, "grad_norm": 1.181229591369629, "learning_rate": 0.0002, "epoch": 5.150755799854498, "step": 31860}, {"loss": 0.5313, "grad_norm": 1.2078537940979004, "learning_rate": 0.0002, "epoch": 5.152372484035244, "step": 31870}, {"loss": 0.5329, "grad_norm": 0.9599190354347229, "learning_rate": 0.0002, "epoch": 5.153989168215989, "step": 31880}, {"loss": 0.4953, "grad_norm": 1.0378568172454834, "learning_rate": 0.0002, "epoch": 5.155605852396734, "step": 31890}, {"loss": 0.5069, "grad_norm": 0.8746536374092102, "learning_rate": 0.0002, "epoch": 5.157222536577479, "step": 31900}, {"loss": 0.5272, "grad_norm": 1.0232136249542236, "learning_rate": 0.0002, "epoch": 5.1588392207582245, "step": 31910}, {"loss": 0.4844, "grad_norm": 0.9827565550804138, "learning_rate": 0.0002, "epoch": 5.16045590493897, "step": 31920}, {"loss": 0.5029, "grad_norm": 1.342657208442688, "learning_rate": 0.0002, "epoch": 5.162072589119716, "step": 31930}, {"loss": 0.513, "grad_norm": 1.18390691280365, "learning_rate": 0.0002, "epoch": 5.163689273300461, "step": 31940}, {"loss": 0.5267, "grad_norm": 0.996350109577179, "learning_rate": 0.0002, "epoch": 5.165305957481206, "step": 31950}, {"loss": 0.5063, "grad_norm": 0.9710391163825989, "learning_rate": 0.0002, "epoch": 5.166922641661952, "step": 31960}, {"loss": 0.5115, "grad_norm": 1.0264002084732056, "learning_rate": 0.0002, "epoch": 5.168539325842697, "step": 31970}, {"loss": 0.4972, "grad_norm": 1.0028311014175415, "learning_rate": 0.0002, "epoch": 5.170156010023442, "step": 31980}, {"loss": 0.5103, "grad_norm": 1.1078234910964966, "learning_rate": 0.0002, "epoch": 5.171772694204187, "step": 31990}, {"loss": 0.495, "grad_norm": 0.9659610390663147, "learning_rate": 0.0002, "epoch": 5.1733893783849325, "step": 32000}, {"loss": 0.5114, "grad_norm": 0.841986894607544, "learning_rate": 0.0002, "epoch": 5.175006062565678, "step": 32010}, {"loss": 0.48, "grad_norm": 1.095332384109497, "learning_rate": 0.0002, "epoch": 5.176622746746423, "step": 32020}, {"loss": 0.4741, "grad_norm": 1.1242377758026123, "learning_rate": 0.0002, "epoch": 5.178239430927168, "step": 32030}, {"loss": 0.5573, "grad_norm": 0.9872292280197144, "learning_rate": 0.0002, "epoch": 5.179856115107913, "step": 32040}, {"loss": 0.48, "grad_norm": 0.936161994934082, "learning_rate": 0.0002, "epoch": 5.181472799288659, "step": 32050}, {"loss": 0.5093, "grad_norm": 1.166100025177002, "learning_rate": 0.0002, "epoch": 5.183089483469404, "step": 32060}, {"loss": 0.5438, "grad_norm": 1.0764425992965698, "learning_rate": 0.0002, "epoch": 5.184706167650149, "step": 32070}, {"loss": 0.4843, "grad_norm": 1.0480051040649414, "learning_rate": 0.0002, "epoch": 5.186322851830895, "step": 32080}, {"loss": 0.5386, "grad_norm": 1.0874916315078735, "learning_rate": 0.0002, "epoch": 5.1879395360116405, "step": 32090}, {"loss": 0.4975, "grad_norm": 1.0817396640777588, "learning_rate": 0.0002, "epoch": 5.189556220192386, "step": 32100}, {"loss": 0.5177, "grad_norm": 1.054111361503601, "learning_rate": 0.0002, "epoch": 5.191172904373131, "step": 32110}, {"loss": 0.5229, "grad_norm": 0.9655823707580566, "learning_rate": 0.0002, "epoch": 5.192789588553876, "step": 32120}, {"loss": 0.5105, "grad_norm": 1.1384109258651733, "learning_rate": 0.0002, "epoch": 5.194406272734621, "step": 32130}, {"loss": 0.5073, "grad_norm": 1.0149348974227905, "learning_rate": 0.0002, "epoch": 5.196022956915367, "step": 32140}, {"loss": 0.5293, "grad_norm": 1.1084046363830566, "learning_rate": 0.0002, "epoch": 5.197639641096112, "step": 32150}, {"loss": 0.4936, "grad_norm": 1.1209309101104736, "learning_rate": 0.0002, "epoch": 5.199256325276857, "step": 32160}, {"loss": 0.5101, "grad_norm": 1.133089542388916, "learning_rate": 0.0002, "epoch": 5.200873009457602, "step": 32170}, {"loss": 0.5242, "grad_norm": 1.0893020629882812, "learning_rate": 0.0002, "epoch": 5.202489693638348, "step": 32180}, {"loss": 0.4872, "grad_norm": 0.90018630027771, "learning_rate": 0.0002, "epoch": 5.204106377819093, "step": 32190}, {"loss": 0.4999, "grad_norm": 0.977622926235199, "learning_rate": 0.0002, "epoch": 5.205723061999838, "step": 32200}, {"loss": 0.5028, "grad_norm": 1.2940177917480469, "learning_rate": 0.0002, "epoch": 5.207339746180583, "step": 32210}, {"loss": 0.5396, "grad_norm": 1.2131710052490234, "learning_rate": 0.0002, "epoch": 5.2089564303613285, "step": 32220}, {"loss": 0.5189, "grad_norm": 1.0234841108322144, "learning_rate": 0.0002, "epoch": 5.210573114542075, "step": 32230}, {"loss": 0.5424, "grad_norm": 1.157975435256958, "learning_rate": 0.0002, "epoch": 5.21218979872282, "step": 32240}, {"loss": 0.5396, "grad_norm": 1.0381282567977905, "learning_rate": 0.0002, "epoch": 5.213806482903565, "step": 32250}, {"loss": 0.5192, "grad_norm": 1.0125395059585571, "learning_rate": 0.0002, "epoch": 5.21542316708431, "step": 32260}, {"loss": 0.5216, "grad_norm": 1.272691011428833, "learning_rate": 0.0002, "epoch": 5.2170398512650555, "step": 32270}, {"loss": 0.52, "grad_norm": 1.0061250925064087, "learning_rate": 0.0002, "epoch": 5.218656535445801, "step": 32280}, {"loss": 0.4739, "grad_norm": 0.9752234816551208, "learning_rate": 0.0002, "epoch": 5.220273219626546, "step": 32290}, {"loss": 0.5471, "grad_norm": 1.1193140745162964, "learning_rate": 0.0002, "epoch": 5.221889903807291, "step": 32300}, {"loss": 0.4976, "grad_norm": 1.0126434564590454, "learning_rate": 0.0002, "epoch": 5.2235065879880365, "step": 32310}, {"loss": 0.5257, "grad_norm": 1.4338394403457642, "learning_rate": 0.0002, "epoch": 5.225123272168782, "step": 32320}, {"loss": 0.5235, "grad_norm": 1.004101276397705, "learning_rate": 0.0002, "epoch": 5.226739956349527, "step": 32330}, {"loss": 0.5091, "grad_norm": 0.8744166493415833, "learning_rate": 0.0002, "epoch": 5.228356640530272, "step": 32340}, {"loss": 0.5388, "grad_norm": 1.0165376663208008, "learning_rate": 0.0002, "epoch": 5.229973324711017, "step": 32350}, {"loss": 0.5469, "grad_norm": 0.8635954260826111, "learning_rate": 0.0002, "epoch": 5.231590008891763, "step": 32360}, {"loss": 0.5609, "grad_norm": 1.1392399072647095, "learning_rate": 0.0002, "epoch": 5.233206693072509, "step": 32370}, {"loss": 0.5173, "grad_norm": 1.0202113389968872, "learning_rate": 0.0002, "epoch": 5.234823377253254, "step": 32380}, {"loss": 0.4983, "grad_norm": 1.0417983531951904, "learning_rate": 0.0002, "epoch": 5.236440061433999, "step": 32390}, {"loss": 0.507, "grad_norm": 0.8729333877563477, "learning_rate": 0.0002, "epoch": 5.238056745614744, "step": 32400}, {"loss": 0.5426, "grad_norm": 1.1626229286193848, "learning_rate": 0.0002, "epoch": 5.23967342979549, "step": 32410}, {"loss": 0.5355, "grad_norm": 0.9086161851882935, "learning_rate": 0.0002, "epoch": 5.241290113976235, "step": 32420}, {"loss": 0.4927, "grad_norm": 1.3999892473220825, "learning_rate": 0.0002, "epoch": 5.24290679815698, "step": 32430}, {"loss": 0.4795, "grad_norm": 1.0356311798095703, "learning_rate": 0.0002, "epoch": 5.244523482337725, "step": 32440}, {"loss": 0.5035, "grad_norm": 0.9655531644821167, "learning_rate": 0.0002, "epoch": 5.246140166518471, "step": 32450}, {"loss": 0.5166, "grad_norm": 1.0411828756332397, "learning_rate": 0.0002, "epoch": 5.247756850699216, "step": 32460}, {"loss": 0.5141, "grad_norm": 1.1199816465377808, "learning_rate": 0.0002, "epoch": 5.249373534879961, "step": 32470}, {"loss": 0.4864, "grad_norm": 1.260321855545044, "learning_rate": 0.0002, "epoch": 5.250990219060706, "step": 32480}, {"loss": 0.4893, "grad_norm": 1.2950857877731323, "learning_rate": 0.0002, "epoch": 5.2526069032414515, "step": 32490}, {"loss": 0.4952, "grad_norm": 0.8982820510864258, "learning_rate": 0.0002, "epoch": 5.254223587422197, "step": 32500}, {"loss": 0.5138, "grad_norm": 0.8512987494468689, "learning_rate": 0.0002, "epoch": 5.255840271602942, "step": 32510}, {"loss": 0.5341, "grad_norm": 1.067443609237671, "learning_rate": 0.0002, "epoch": 5.257456955783688, "step": 32520}, {"loss": 0.4928, "grad_norm": 1.0957417488098145, "learning_rate": 0.0002, "epoch": 5.259073639964433, "step": 32530}, {"loss": 0.5169, "grad_norm": 1.4161807298660278, "learning_rate": 0.0002, "epoch": 5.260690324145179, "step": 32540}, {"loss": 0.5599, "grad_norm": 1.2264093160629272, "learning_rate": 0.0002, "epoch": 5.262307008325924, "step": 32550}, {"loss": 0.5221, "grad_norm": 1.0015931129455566, "learning_rate": 0.0002, "epoch": 5.263923692506669, "step": 32560}, {"loss": 0.5253, "grad_norm": 1.0743094682693481, "learning_rate": 0.0002, "epoch": 5.265540376687414, "step": 32570}, {"loss": 0.5289, "grad_norm": 1.1386840343475342, "learning_rate": 0.0002, "epoch": 5.2671570608681595, "step": 32580}, {"loss": 0.5315, "grad_norm": 1.0093860626220703, "learning_rate": 0.0002, "epoch": 5.268773745048905, "step": 32590}, {"loss": 0.5175, "grad_norm": 0.9593744874000549, "learning_rate": 0.0002, "epoch": 5.27039042922965, "step": 32600}, {"loss": 0.528, "grad_norm": 1.146021842956543, "learning_rate": 0.0002, "epoch": 5.272007113410395, "step": 32610}, {"loss": 0.4983, "grad_norm": 0.9579031467437744, "learning_rate": 0.0002, "epoch": 5.27362379759114, "step": 32620}, {"loss": 0.5376, "grad_norm": 1.0548793077468872, "learning_rate": 0.0002, "epoch": 5.275240481771886, "step": 32630}, {"loss": 0.5267, "grad_norm": 1.0380561351776123, "learning_rate": 0.0002, "epoch": 5.276857165952631, "step": 32640}, {"loss": 0.5182, "grad_norm": 1.2119969129562378, "learning_rate": 0.0002, "epoch": 5.278473850133376, "step": 32650}, {"loss": 0.5298, "grad_norm": 1.0507797002792358, "learning_rate": 0.0002, "epoch": 5.280090534314121, "step": 32660}, {"loss": 0.5253, "grad_norm": 1.0185176134109497, "learning_rate": 0.0002, "epoch": 5.2817072184948675, "step": 32670}, {"loss": 0.4904, "grad_norm": 1.2358098030090332, "learning_rate": 0.0002, "epoch": 5.283323902675613, "step": 32680}, {"loss": 0.5169, "grad_norm": 0.7937114238739014, "learning_rate": 0.0002, "epoch": 5.284940586856358, "step": 32690}, {"loss": 0.495, "grad_norm": 0.9825124740600586, "learning_rate": 0.0002, "epoch": 5.286557271037103, "step": 32700}, {"loss": 0.5149, "grad_norm": 1.2059301137924194, "learning_rate": 0.0002, "epoch": 5.288173955217848, "step": 32710}, {"loss": 0.5272, "grad_norm": 1.0828571319580078, "learning_rate": 0.0002, "epoch": 5.289790639398594, "step": 32720}, {"loss": 0.5383, "grad_norm": 1.0129735469818115, "learning_rate": 0.0002, "epoch": 5.291407323579339, "step": 32730}, {"loss": 0.5216, "grad_norm": 1.0591634511947632, "learning_rate": 0.0002, "epoch": 5.293024007760084, "step": 32740}, {"loss": 0.522, "grad_norm": 0.9256815910339355, "learning_rate": 0.0002, "epoch": 5.294640691940829, "step": 32750}, {"loss": 0.5396, "grad_norm": 1.0928633213043213, "learning_rate": 0.0002, "epoch": 5.2962573761215745, "step": 32760}, {"loss": 0.5093, "grad_norm": 0.9415594935417175, "learning_rate": 0.0002, "epoch": 5.29787406030232, "step": 32770}, {"loss": 0.5252, "grad_norm": 1.141316294670105, "learning_rate": 0.0002, "epoch": 5.299490744483065, "step": 32780}, {"loss": 0.4837, "grad_norm": 1.0646510124206543, "learning_rate": 0.0002, "epoch": 5.30110742866381, "step": 32790}, {"loss": 0.5547, "grad_norm": 1.189661979675293, "learning_rate": 0.0002, "epoch": 5.3027241128445555, "step": 32800}, {"loss": 0.5664, "grad_norm": 0.9568731188774109, "learning_rate": 0.0002, "epoch": 5.304340797025301, "step": 32810}, {"loss": 0.5344, "grad_norm": 1.1556824445724487, "learning_rate": 0.0002, "epoch": 5.305957481206047, "step": 32820}, {"loss": 0.4894, "grad_norm": 0.9353463649749756, "learning_rate": 0.0002, "epoch": 5.307574165386792, "step": 32830}, {"loss": 0.5052, "grad_norm": 1.1208295822143555, "learning_rate": 0.0002, "epoch": 5.309190849567537, "step": 32840}, {"loss": 0.5126, "grad_norm": 1.0894153118133545, "learning_rate": 0.0002, "epoch": 5.3108075337482825, "step": 32850}, {"loss": 0.5046, "grad_norm": 1.090329647064209, "learning_rate": 0.0002, "epoch": 5.312424217929028, "step": 32860}, {"loss": 0.5237, "grad_norm": 1.0781712532043457, "learning_rate": 0.0002, "epoch": 5.314040902109773, "step": 32870}, {"loss": 0.57, "grad_norm": 1.1785295009613037, "learning_rate": 0.0002, "epoch": 5.315657586290518, "step": 32880}, {"loss": 0.4953, "grad_norm": 1.0406851768493652, "learning_rate": 0.0002, "epoch": 5.317274270471263, "step": 32890}, {"loss": 0.514, "grad_norm": 1.0982953310012817, "learning_rate": 0.0002, "epoch": 5.318890954652009, "step": 32900}, {"loss": 0.4944, "grad_norm": 1.2969383001327515, "learning_rate": 0.0002, "epoch": 5.320507638832754, "step": 32910}, {"loss": 0.4786, "grad_norm": 0.9687288999557495, "learning_rate": 0.0002, "epoch": 5.322124323013499, "step": 32920}, {"loss": 0.5286, "grad_norm": 1.136760950088501, "learning_rate": 0.0002, "epoch": 5.323741007194244, "step": 32930}, {"loss": 0.5321, "grad_norm": 1.3045495748519897, "learning_rate": 0.0002, "epoch": 5.32535769137499, "step": 32940}, {"loss": 0.5413, "grad_norm": 1.221675992012024, "learning_rate": 0.0002, "epoch": 5.326974375555735, "step": 32950}, {"loss": 0.4999, "grad_norm": 1.1380633115768433, "learning_rate": 0.0002, "epoch": 5.32859105973648, "step": 32960}, {"loss": 0.5037, "grad_norm": 1.1065956354141235, "learning_rate": 0.0002, "epoch": 5.330207743917226, "step": 32970}, {"loss": 0.4913, "grad_norm": 1.0187175273895264, "learning_rate": 0.0002, "epoch": 5.331824428097971, "step": 32980}, {"loss": 0.5234, "grad_norm": 0.9077118039131165, "learning_rate": 0.0002, "epoch": 5.333441112278717, "step": 32990}, {"loss": 0.5071, "grad_norm": 1.0092815160751343, "learning_rate": 0.0002, "epoch": 5.335057796459462, "step": 33000}, {"loss": 0.498, "grad_norm": 1.0168777704238892, "learning_rate": 0.0002, "epoch": 5.336674480640207, "step": 33010}, {"loss": 0.4952, "grad_norm": 0.996161937713623, "learning_rate": 0.0002, "epoch": 5.338291164820952, "step": 33020}, {"loss": 0.5024, "grad_norm": 0.794463038444519, "learning_rate": 0.0002, "epoch": 5.339907849001698, "step": 33030}, {"loss": 0.5112, "grad_norm": 0.9750674962997437, "learning_rate": 0.0002, "epoch": 5.341524533182443, "step": 33040}, {"loss": 0.528, "grad_norm": 1.2770029306411743, "learning_rate": 0.0002, "epoch": 5.343141217363188, "step": 33050}, {"loss": 0.52, "grad_norm": 1.1500186920166016, "learning_rate": 0.0002, "epoch": 5.344757901543933, "step": 33060}, {"loss": 0.4906, "grad_norm": 1.0726377964019775, "learning_rate": 0.0002, "epoch": 5.3463745857246785, "step": 33070}, {"loss": 0.5212, "grad_norm": 0.9314153790473938, "learning_rate": 0.0002, "epoch": 5.347991269905424, "step": 33080}, {"loss": 0.5434, "grad_norm": 1.344988465309143, "learning_rate": 0.0002, "epoch": 5.349607954086169, "step": 33090}, {"loss": 0.4874, "grad_norm": 0.863196611404419, "learning_rate": 0.0002, "epoch": 5.351224638266914, "step": 33100}, {"loss": 0.534, "grad_norm": 1.128100037574768, "learning_rate": 0.0002, "epoch": 5.352841322447659, "step": 33110}, {"loss": 0.5293, "grad_norm": 1.1673583984375, "learning_rate": 0.0002, "epoch": 5.3544580066284055, "step": 33120}, {"loss": 0.4787, "grad_norm": 0.9416789412498474, "learning_rate": 0.0002, "epoch": 5.356074690809151, "step": 33130}, {"loss": 0.5155, "grad_norm": 1.1855236291885376, "learning_rate": 0.0002, "epoch": 5.357691374989896, "step": 33140}, {"loss": 0.515, "grad_norm": 1.0415170192718506, "learning_rate": 0.0002, "epoch": 5.359308059170641, "step": 33150}, {"loss": 0.545, "grad_norm": 0.9953004121780396, "learning_rate": 0.0002, "epoch": 5.3609247433513865, "step": 33160}, {"loss": 0.5305, "grad_norm": 0.96138596534729, "learning_rate": 0.0002, "epoch": 5.362541427532132, "step": 33170}, {"loss": 0.5064, "grad_norm": 1.341979742050171, "learning_rate": 0.0002, "epoch": 5.364158111712877, "step": 33180}, {"loss": 0.4986, "grad_norm": 1.0136911869049072, "learning_rate": 0.0002, "epoch": 5.365774795893622, "step": 33190}, {"loss": 0.5459, "grad_norm": 0.8685575127601624, "learning_rate": 0.0002, "epoch": 5.367391480074367, "step": 33200}, {"loss": 0.5146, "grad_norm": 0.8833574652671814, "learning_rate": 0.0002, "epoch": 5.369008164255113, "step": 33210}, {"loss": 0.4982, "grad_norm": 0.9123612642288208, "learning_rate": 0.0002, "epoch": 5.370624848435858, "step": 33220}, {"loss": 0.5047, "grad_norm": 1.2720599174499512, "learning_rate": 0.0002, "epoch": 5.372241532616603, "step": 33230}, {"loss": 0.5175, "grad_norm": 1.0596648454666138, "learning_rate": 0.0002, "epoch": 5.373858216797348, "step": 33240}, {"loss": 0.5284, "grad_norm": 1.119701623916626, "learning_rate": 0.0002, "epoch": 5.3754749009780936, "step": 33250}, {"loss": 0.5217, "grad_norm": 1.3000061511993408, "learning_rate": 0.0002, "epoch": 5.377091585158839, "step": 33260}, {"loss": 0.5125, "grad_norm": 1.083891749382019, "learning_rate": 0.0002, "epoch": 5.378708269339585, "step": 33270}, {"loss": 0.5065, "grad_norm": 0.9402718544006348, "learning_rate": 0.0002, "epoch": 5.38032495352033, "step": 33280}, {"loss": 0.5559, "grad_norm": 1.3376892805099487, "learning_rate": 0.0002, "epoch": 5.381941637701075, "step": 33290}, {"loss": 0.5193, "grad_norm": 1.1600074768066406, "learning_rate": 0.0002, "epoch": 5.383558321881821, "step": 33300}, {"loss": 0.4907, "grad_norm": 1.1449427604675293, "learning_rate": 0.0002, "epoch": 5.385175006062566, "step": 33310}, {"loss": 0.5449, "grad_norm": 1.3118891716003418, "learning_rate": 0.0002, "epoch": 5.386791690243311, "step": 33320}, {"loss": 0.547, "grad_norm": 0.743449866771698, "learning_rate": 0.0002, "epoch": 5.388408374424056, "step": 33330}, {"loss": 0.5555, "grad_norm": 0.9358304142951965, "learning_rate": 0.0002, "epoch": 5.3900250586048015, "step": 33340}, {"loss": 0.5558, "grad_norm": 1.0447142124176025, "learning_rate": 0.0002, "epoch": 5.391641742785547, "step": 33350}, {"loss": 0.5106, "grad_norm": 1.1088626384735107, "learning_rate": 0.0002, "epoch": 5.393258426966292, "step": 33360}, {"loss": 0.4929, "grad_norm": 1.1267958879470825, "learning_rate": 0.0002, "epoch": 5.394875111147037, "step": 33370}, {"loss": 0.5165, "grad_norm": 0.9709370136260986, "learning_rate": 0.0002, "epoch": 5.3964917953277824, "step": 33380}, {"loss": 0.5206, "grad_norm": 1.0939103364944458, "learning_rate": 0.0002, "epoch": 5.398108479508528, "step": 33390}, {"loss": 0.5177, "grad_norm": 0.9559304714202881, "learning_rate": 0.0002, "epoch": 5.399725163689273, "step": 33400}, {"loss": 0.5064, "grad_norm": 1.199580430984497, "learning_rate": 0.0002, "epoch": 5.401341847870018, "step": 33410}, {"loss": 0.52, "grad_norm": 0.9097000360488892, "learning_rate": 0.0002, "epoch": 5.402958532050764, "step": 33420}, {"loss": 0.514, "grad_norm": 1.1940981149673462, "learning_rate": 0.0002, "epoch": 5.4045752162315095, "step": 33430}, {"loss": 0.5069, "grad_norm": 1.0530916452407837, "learning_rate": 0.0002, "epoch": 5.406191900412255, "step": 33440}, {"loss": 0.5482, "grad_norm": 1.0482549667358398, "learning_rate": 0.0002, "epoch": 5.407808584593, "step": 33450}, {"loss": 0.501, "grad_norm": 1.2524714469909668, "learning_rate": 0.0002, "epoch": 5.409425268773745, "step": 33460}, {"loss": 0.5597, "grad_norm": 1.1091666221618652, "learning_rate": 0.0002, "epoch": 5.41104195295449, "step": 33470}, {"loss": 0.546, "grad_norm": 0.9981587529182434, "learning_rate": 0.0002, "epoch": 5.412658637135236, "step": 33480}, {"loss": 0.4977, "grad_norm": 1.016681432723999, "learning_rate": 0.0002, "epoch": 5.414275321315981, "step": 33490}, {"loss": 0.5388, "grad_norm": 1.1456854343414307, "learning_rate": 0.0002, "epoch": 5.415892005496726, "step": 33500}, {"loss": 0.5292, "grad_norm": 1.1454259157180786, "learning_rate": 0.0002, "epoch": 5.417508689677471, "step": 33510}, {"loss": 0.5061, "grad_norm": 0.9858416318893433, "learning_rate": 0.0002, "epoch": 5.419125373858217, "step": 33520}, {"loss": 0.5139, "grad_norm": 0.9764766693115234, "learning_rate": 0.0002, "epoch": 5.420742058038962, "step": 33530}, {"loss": 0.5518, "grad_norm": 1.199920892715454, "learning_rate": 0.0002, "epoch": 5.422358742219707, "step": 33540}, {"loss": 0.5182, "grad_norm": 1.3107370138168335, "learning_rate": 0.0002, "epoch": 5.423975426400452, "step": 33550}, {"loss": 0.5149, "grad_norm": 0.9637970328330994, "learning_rate": 0.0002, "epoch": 5.4255921105811975, "step": 33560}, {"loss": 0.526, "grad_norm": 1.023359775543213, "learning_rate": 0.0002, "epoch": 5.427208794761944, "step": 33570}, {"loss": 0.5206, "grad_norm": 1.060417652130127, "learning_rate": 0.0002, "epoch": 5.428825478942689, "step": 33580}, {"loss": 0.5052, "grad_norm": 0.9971120953559875, "learning_rate": 0.0002, "epoch": 5.430442163123434, "step": 33590}, {"loss": 0.5044, "grad_norm": 0.9213743209838867, "learning_rate": 0.0002, "epoch": 5.432058847304179, "step": 33600}, {"loss": 0.5714, "grad_norm": 1.1512309312820435, "learning_rate": 0.0002, "epoch": 5.4336755314849245, "step": 33610}, {"loss": 0.5317, "grad_norm": 1.2198847532272339, "learning_rate": 0.0002, "epoch": 5.43529221566567, "step": 33620}, {"loss": 0.5237, "grad_norm": 1.0329595804214478, "learning_rate": 0.0002, "epoch": 5.436908899846415, "step": 33630}, {"loss": 0.5364, "grad_norm": 1.1075750589370728, "learning_rate": 0.0002, "epoch": 5.43852558402716, "step": 33640}, {"loss": 0.5295, "grad_norm": 1.006342887878418, "learning_rate": 0.0002, "epoch": 5.4401422682079055, "step": 33650}, {"loss": 0.5394, "grad_norm": 0.9179885983467102, "learning_rate": 0.0002, "epoch": 5.441758952388651, "step": 33660}, {"loss": 0.5124, "grad_norm": 1.2799493074417114, "learning_rate": 0.0002, "epoch": 5.443375636569396, "step": 33670}, {"loss": 0.5426, "grad_norm": 1.1153863668441772, "learning_rate": 0.0002, "epoch": 5.444992320750141, "step": 33680}, {"loss": 0.5087, "grad_norm": 1.0681028366088867, "learning_rate": 0.0002, "epoch": 5.446609004930886, "step": 33690}, {"loss": 0.5272, "grad_norm": 0.9788817167282104, "learning_rate": 0.0002, "epoch": 5.448225689111632, "step": 33700}, {"loss": 0.5308, "grad_norm": 0.8481608629226685, "learning_rate": 0.0002, "epoch": 5.449842373292377, "step": 33710}, {"loss": 0.5225, "grad_norm": 1.113756537437439, "learning_rate": 0.0002, "epoch": 5.451459057473123, "step": 33720}, {"loss": 0.5213, "grad_norm": 0.8425475358963013, "learning_rate": 0.0002, "epoch": 5.453075741653868, "step": 33730}, {"loss": 0.571, "grad_norm": 1.0852208137512207, "learning_rate": 0.0002, "epoch": 5.4546924258346134, "step": 33740}, {"loss": 0.5535, "grad_norm": 1.1664748191833496, "learning_rate": 0.0002, "epoch": 5.456309110015359, "step": 33750}, {"loss": 0.5419, "grad_norm": 1.217241644859314, "learning_rate": 0.0002, "epoch": 5.457925794196104, "step": 33760}, {"loss": 0.5351, "grad_norm": 1.1572928428649902, "learning_rate": 0.0002, "epoch": 5.459542478376849, "step": 33770}, {"loss": 0.5161, "grad_norm": 1.0437318086624146, "learning_rate": 0.0002, "epoch": 5.461159162557594, "step": 33780}, {"loss": 0.5266, "grad_norm": 0.9807571768760681, "learning_rate": 0.0002, "epoch": 5.46277584673834, "step": 33790}, {"loss": 0.5384, "grad_norm": 1.1436342000961304, "learning_rate": 0.0002, "epoch": 5.464392530919085, "step": 33800}, {"loss": 0.5338, "grad_norm": 1.1004794836044312, "learning_rate": 0.0002, "epoch": 5.46600921509983, "step": 33810}, {"loss": 0.4868, "grad_norm": 1.2130268812179565, "learning_rate": 0.0002, "epoch": 5.467625899280575, "step": 33820}, {"loss": 0.516, "grad_norm": 1.3154419660568237, "learning_rate": 0.0002, "epoch": 5.4692425834613205, "step": 33830}, {"loss": 0.4934, "grad_norm": 0.7934383749961853, "learning_rate": 0.0002, "epoch": 5.470859267642066, "step": 33840}, {"loss": 0.5133, "grad_norm": 0.7838410139083862, "learning_rate": 0.0002, "epoch": 5.472475951822812, "step": 33850}, {"loss": 0.4926, "grad_norm": 1.0415139198303223, "learning_rate": 0.0002, "epoch": 5.474092636003557, "step": 33860}, {"loss": 0.5323, "grad_norm": 0.9213164448738098, "learning_rate": 0.0002, "epoch": 5.475709320184302, "step": 33870}, {"loss": 0.5125, "grad_norm": 1.0364776849746704, "learning_rate": 0.0002, "epoch": 5.477326004365048, "step": 33880}, {"loss": 0.5212, "grad_norm": 0.9994072318077087, "learning_rate": 0.0002, "epoch": 5.478942688545793, "step": 33890}, {"loss": 0.5396, "grad_norm": 1.196730136871338, "learning_rate": 0.0002, "epoch": 5.480559372726538, "step": 33900}, {"loss": 0.538, "grad_norm": 0.9955780506134033, "learning_rate": 0.0002, "epoch": 5.482176056907283, "step": 33910}, {"loss": 0.5307, "grad_norm": 1.168188214302063, "learning_rate": 0.0002, "epoch": 5.4837927410880285, "step": 33920}, {"loss": 0.5548, "grad_norm": 1.1816450357437134, "learning_rate": 0.0002, "epoch": 5.485409425268774, "step": 33930}, {"loss": 0.5535, "grad_norm": 1.079715609550476, "learning_rate": 0.0002, "epoch": 5.487026109449519, "step": 33940}, {"loss": 0.5262, "grad_norm": 1.153850793838501, "learning_rate": 0.0002, "epoch": 5.488642793630264, "step": 33950}, {"loss": 0.5248, "grad_norm": 1.0207297801971436, "learning_rate": 0.0002, "epoch": 5.490259477811009, "step": 33960}, {"loss": 0.5142, "grad_norm": 1.1290855407714844, "learning_rate": 0.0002, "epoch": 5.491876161991755, "step": 33970}, {"loss": 0.5168, "grad_norm": 1.068058967590332, "learning_rate": 0.0002, "epoch": 5.4934928461725, "step": 33980}, {"loss": 0.5317, "grad_norm": 0.9789979457855225, "learning_rate": 0.0002, "epoch": 5.495109530353245, "step": 33990}, {"loss": 0.5113, "grad_norm": 0.9696692824363708, "learning_rate": 0.0002, "epoch": 5.496726214533991, "step": 34000}, {"loss": 0.5413, "grad_norm": 1.0539981126785278, "learning_rate": 0.0002, "epoch": 5.4983428987147365, "step": 34010}, {"loss": 0.5783, "grad_norm": 1.0249929428100586, "learning_rate": 0.0002, "epoch": 5.499959582895482, "step": 34020}, {"loss": 0.4888, "grad_norm": 0.9577504992485046, "learning_rate": 0.0002, "epoch": 5.501576267076227, "step": 34030}, {"loss": 0.5291, "grad_norm": 1.0963513851165771, "learning_rate": 0.0002, "epoch": 5.503192951256972, "step": 34040}, {"loss": 0.5315, "grad_norm": 0.8339345455169678, "learning_rate": 0.0002, "epoch": 5.504809635437717, "step": 34050}, {"loss": 0.5191, "grad_norm": 1.0138782262802124, "learning_rate": 0.0002, "epoch": 5.506426319618463, "step": 34060}, {"loss": 0.5463, "grad_norm": 1.0180109739303589, "learning_rate": 0.0002, "epoch": 5.508043003799208, "step": 34070}, {"loss": 0.5083, "grad_norm": 1.2790818214416504, "learning_rate": 0.0002, "epoch": 5.509659687979953, "step": 34080}, {"loss": 0.5195, "grad_norm": 1.428247332572937, "learning_rate": 0.0002, "epoch": 5.511276372160698, "step": 34090}, {"loss": 0.5291, "grad_norm": 1.0926059484481812, "learning_rate": 0.0002, "epoch": 5.5128930563414436, "step": 34100}, {"loss": 0.5665, "grad_norm": 1.2353343963623047, "learning_rate": 0.0002, "epoch": 5.514509740522189, "step": 34110}, {"loss": 0.5331, "grad_norm": 0.935587465763092, "learning_rate": 0.0002, "epoch": 5.516126424702934, "step": 34120}, {"loss": 0.5512, "grad_norm": 0.9767586588859558, "learning_rate": 0.0002, "epoch": 5.517743108883679, "step": 34130}, {"loss": 0.5315, "grad_norm": 1.1660610437393188, "learning_rate": 0.0002, "epoch": 5.5193597930644245, "step": 34140}, {"loss": 0.52, "grad_norm": 0.9828870892524719, "learning_rate": 0.0002, "epoch": 5.520976477245171, "step": 34150}, {"loss": 0.5198, "grad_norm": 1.0097278356552124, "learning_rate": 0.0002, "epoch": 5.522593161425916, "step": 34160}, {"loss": 0.5293, "grad_norm": 1.1766167879104614, "learning_rate": 0.0002, "epoch": 5.524209845606661, "step": 34170}, {"loss": 0.5258, "grad_norm": 0.982292115688324, "learning_rate": 0.0002, "epoch": 5.525826529787406, "step": 34180}, {"loss": 0.5114, "grad_norm": 1.0744609832763672, "learning_rate": 0.0002, "epoch": 5.5274432139681515, "step": 34190}, {"loss": 0.5469, "grad_norm": 1.3831160068511963, "learning_rate": 0.0002, "epoch": 5.529059898148897, "step": 34200}, {"loss": 0.5819, "grad_norm": 1.074771761894226, "learning_rate": 0.0002, "epoch": 5.530676582329642, "step": 34210}, {"loss": 0.5399, "grad_norm": 1.016652226448059, "learning_rate": 0.0002, "epoch": 5.532293266510387, "step": 34220}, {"loss": 0.5158, "grad_norm": 1.2231552600860596, "learning_rate": 0.0002, "epoch": 5.5339099506911325, "step": 34230}, {"loss": 0.5091, "grad_norm": 0.8051198720932007, "learning_rate": 0.0002, "epoch": 5.535526634871878, "step": 34240}, {"loss": 0.5583, "grad_norm": 1.1779674291610718, "learning_rate": 0.0002, "epoch": 5.537143319052623, "step": 34250}, {"loss": 0.5044, "grad_norm": 1.2468291521072388, "learning_rate": 0.0002, "epoch": 5.538760003233368, "step": 34260}, {"loss": 0.523, "grad_norm": 1.14818274974823, "learning_rate": 0.0002, "epoch": 5.540376687414113, "step": 34270}, {"loss": 0.5375, "grad_norm": 1.2362616062164307, "learning_rate": 0.0002, "epoch": 5.541993371594859, "step": 34280}, {"loss": 0.4996, "grad_norm": 1.0206977128982544, "learning_rate": 0.0002, "epoch": 5.543610055775604, "step": 34290}, {"loss": 0.5212, "grad_norm": 1.2018457651138306, "learning_rate": 0.0002, "epoch": 5.54522673995635, "step": 34300}, {"loss": 0.5462, "grad_norm": 1.0349043607711792, "learning_rate": 0.0002, "epoch": 5.546843424137095, "step": 34310}, {"loss": 0.5231, "grad_norm": 1.2022006511688232, "learning_rate": 0.0002, "epoch": 5.54846010831784, "step": 34320}, {"loss": 0.5173, "grad_norm": 1.0810624361038208, "learning_rate": 0.0002, "epoch": 5.550076792498586, "step": 34330}, {"loss": 0.5821, "grad_norm": 1.3297529220581055, "learning_rate": 0.0002, "epoch": 5.551693476679331, "step": 34340}, {"loss": 0.5321, "grad_norm": 0.9722549915313721, "learning_rate": 0.0002, "epoch": 5.553310160860076, "step": 34350}, {"loss": 0.4823, "grad_norm": 0.9903425574302673, "learning_rate": 0.0002, "epoch": 5.554926845040821, "step": 34360}, {"loss": 0.5601, "grad_norm": 0.9568067789077759, "learning_rate": 0.0002, "epoch": 5.556543529221567, "step": 34370}, {"loss": 0.5242, "grad_norm": 1.113870620727539, "learning_rate": 0.0002, "epoch": 5.558160213402312, "step": 34380}, {"loss": 0.5278, "grad_norm": 1.0557632446289062, "learning_rate": 0.0002, "epoch": 5.559776897583057, "step": 34390}, {"loss": 0.5501, "grad_norm": 0.9615673422813416, "learning_rate": 0.0002, "epoch": 5.561393581763802, "step": 34400}, {"loss": 0.5066, "grad_norm": 0.9536027312278748, "learning_rate": 0.0002, "epoch": 5.5630102659445475, "step": 34410}, {"loss": 0.4949, "grad_norm": 0.8808749318122864, "learning_rate": 0.0002, "epoch": 5.564626950125293, "step": 34420}, {"loss": 0.5954, "grad_norm": 1.286132574081421, "learning_rate": 0.0002, "epoch": 5.566243634306038, "step": 34430}, {"loss": 0.5507, "grad_norm": 1.259644865989685, "learning_rate": 0.0002, "epoch": 5.567860318486783, "step": 34440}, {"loss": 0.4922, "grad_norm": 0.9920216798782349, "learning_rate": 0.0002, "epoch": 5.569477002667529, "step": 34450}, {"loss": 0.5527, "grad_norm": 1.182926893234253, "learning_rate": 0.0002, "epoch": 5.5710936868482746, "step": 34460}, {"loss": 0.5185, "grad_norm": 1.1434749364852905, "learning_rate": 0.0002, "epoch": 5.57271037102902, "step": 34470}, {"loss": 0.5256, "grad_norm": 1.2420979738235474, "learning_rate": 0.0002, "epoch": 5.574327055209765, "step": 34480}, {"loss": 0.5039, "grad_norm": 0.9338384866714478, "learning_rate": 0.0002, "epoch": 5.57594373939051, "step": 34490}, {"loss": 0.5634, "grad_norm": 1.0196425914764404, "learning_rate": 0.0002, "epoch": 5.5775604235712555, "step": 34500}, {"loss": 0.5132, "grad_norm": 0.9586997032165527, "learning_rate": 0.0002, "epoch": 5.579177107752001, "step": 34510}, {"loss": 0.5336, "grad_norm": 1.2409086227416992, "learning_rate": 0.0002, "epoch": 5.580793791932746, "step": 34520}, {"loss": 0.5364, "grad_norm": 1.1483757495880127, "learning_rate": 0.0002, "epoch": 5.582410476113491, "step": 34530}, {"loss": 0.5325, "grad_norm": 1.1624305248260498, "learning_rate": 0.0002, "epoch": 5.584027160294236, "step": 34540}, {"loss": 0.5342, "grad_norm": 1.2635223865509033, "learning_rate": 0.0002, "epoch": 5.585643844474982, "step": 34550}, {"loss": 0.4924, "grad_norm": 0.9824051856994629, "learning_rate": 0.0002, "epoch": 5.587260528655727, "step": 34560}, {"loss": 0.5395, "grad_norm": 1.0858620405197144, "learning_rate": 0.0002, "epoch": 5.588877212836472, "step": 34570}, {"loss": 0.5459, "grad_norm": 1.1452655792236328, "learning_rate": 0.0002, "epoch": 5.590493897017217, "step": 34580}, {"loss": 0.5746, "grad_norm": 1.110610842704773, "learning_rate": 0.0002, "epoch": 5.592110581197963, "step": 34590}, {"loss": 0.5285, "grad_norm": 0.9976194500923157, "learning_rate": 0.0002, "epoch": 5.593727265378709, "step": 34600}, {"loss": 0.548, "grad_norm": 1.0698920488357544, "learning_rate": 0.0002, "epoch": 5.595343949559454, "step": 34610}, {"loss": 0.5311, "grad_norm": 1.1505171060562134, "learning_rate": 0.0002, "epoch": 5.596960633740199, "step": 34620}, {"loss": 0.5471, "grad_norm": 1.1014643907546997, "learning_rate": 0.0002, "epoch": 5.598577317920944, "step": 34630}, {"loss": 0.55, "grad_norm": 0.915595293045044, "learning_rate": 0.0002, "epoch": 5.60019400210169, "step": 34640}, {"loss": 0.5821, "grad_norm": 1.1856765747070312, "learning_rate": 0.0002, "epoch": 5.601810686282435, "step": 34650}, {"loss": 0.5502, "grad_norm": 1.1357687711715698, "learning_rate": 0.0002, "epoch": 5.60342737046318, "step": 34660}, {"loss": 0.5034, "grad_norm": 1.0232492685317993, "learning_rate": 0.0002, "epoch": 5.605044054643925, "step": 34670}, {"loss": 0.5357, "grad_norm": 0.9375017881393433, "learning_rate": 0.0002, "epoch": 5.6066607388246705, "step": 34680}, {"loss": 0.5518, "grad_norm": 1.0796529054641724, "learning_rate": 0.0002, "epoch": 5.608277423005416, "step": 34690}, {"loss": 0.5173, "grad_norm": 1.1383336782455444, "learning_rate": 0.0002, "epoch": 5.609894107186161, "step": 34700}, {"loss": 0.5477, "grad_norm": 1.0248544216156006, "learning_rate": 0.0002, "epoch": 5.611510791366906, "step": 34710}, {"loss": 0.5669, "grad_norm": 1.0986040830612183, "learning_rate": 0.0002, "epoch": 5.6131274755476515, "step": 34720}, {"loss": 0.5188, "grad_norm": 1.2689568996429443, "learning_rate": 0.0002, "epoch": 5.614744159728397, "step": 34730}, {"loss": 0.5136, "grad_norm": 1.4044264554977417, "learning_rate": 0.0002, "epoch": 5.616360843909142, "step": 34740}, {"loss": 0.5699, "grad_norm": 1.2084474563598633, "learning_rate": 0.0002, "epoch": 5.617977528089888, "step": 34750}, {"loss": 0.5377, "grad_norm": 1.061248540878296, "learning_rate": 0.0002, "epoch": 5.619594212270633, "step": 34760}, {"loss": 0.5669, "grad_norm": 1.0220764875411987, "learning_rate": 0.0002, "epoch": 5.6212108964513785, "step": 34770}, {"loss": 0.54, "grad_norm": 1.0859092473983765, "learning_rate": 0.0002, "epoch": 5.622827580632124, "step": 34780}, {"loss": 0.5308, "grad_norm": 0.9049732089042664, "learning_rate": 0.0002, "epoch": 5.624444264812869, "step": 34790}, {"loss": 0.5433, "grad_norm": 1.2103937864303589, "learning_rate": 0.0002, "epoch": 5.626060948993614, "step": 34800}, {"loss": 0.5513, "grad_norm": 0.9854230284690857, "learning_rate": 0.0002, "epoch": 5.627677633174359, "step": 34810}, {"loss": 0.5274, "grad_norm": 0.9316635131835938, "learning_rate": 0.0002, "epoch": 5.629294317355105, "step": 34820}, {"loss": 0.5393, "grad_norm": 1.105296015739441, "learning_rate": 0.0002, "epoch": 5.63091100153585, "step": 34830}, {"loss": 0.5527, "grad_norm": 0.993383526802063, "learning_rate": 0.0002, "epoch": 5.632527685716595, "step": 34840}, {"loss": 0.5375, "grad_norm": 1.1544116735458374, "learning_rate": 0.0002, "epoch": 5.63414436989734, "step": 34850}, {"loss": 0.5448, "grad_norm": 1.284475326538086, "learning_rate": 0.0002, "epoch": 5.635761054078086, "step": 34860}, {"loss": 0.5069, "grad_norm": 1.121997594833374, "learning_rate": 0.0002, "epoch": 5.637377738258831, "step": 34870}, {"loss": 0.5335, "grad_norm": 1.213040828704834, "learning_rate": 0.0002, "epoch": 5.638994422439576, "step": 34880}, {"loss": 0.5623, "grad_norm": 1.23222017288208, "learning_rate": 0.0002, "epoch": 5.640611106620321, "step": 34890}, {"loss": 0.5622, "grad_norm": 0.9793637990951538, "learning_rate": 0.0002, "epoch": 5.642227790801067, "step": 34900}, {"loss": 0.5405, "grad_norm": 1.38919997215271, "learning_rate": 0.0002, "epoch": 5.643844474981813, "step": 34910}, {"loss": 0.5007, "grad_norm": 0.8390951156616211, "learning_rate": 0.0002, "epoch": 5.645461159162558, "step": 34920}, {"loss": 0.5974, "grad_norm": 0.9465909004211426, "learning_rate": 0.0002, "epoch": 5.647077843343303, "step": 34930}, {"loss": 0.5264, "grad_norm": 1.066957712173462, "learning_rate": 0.0002, "epoch": 5.648694527524048, "step": 34940}, {"loss": 0.5513, "grad_norm": 0.9842154383659363, "learning_rate": 0.0002, "epoch": 5.650311211704794, "step": 34950}, {"loss": 0.567, "grad_norm": 1.1766440868377686, "learning_rate": 0.0002, "epoch": 5.651927895885539, "step": 34960}, {"loss": 0.5462, "grad_norm": 0.9061306118965149, "learning_rate": 0.0002, "epoch": 5.653544580066284, "step": 34970}, {"loss": 0.5446, "grad_norm": 1.2941309213638306, "learning_rate": 0.0002, "epoch": 5.655161264247029, "step": 34980}, {"loss": 0.5704, "grad_norm": 0.9741247892379761, "learning_rate": 0.0002, "epoch": 5.6567779484277745, "step": 34990}, {"loss": 0.5152, "grad_norm": 1.0784187316894531, "learning_rate": 0.0002, "epoch": 5.65839463260852, "step": 35000}, {"loss": 0.5363, "grad_norm": 0.937889814376831, "learning_rate": 0.0002, "epoch": 5.660011316789265, "step": 35010}, {"loss": 0.5019, "grad_norm": 0.9667879939079285, "learning_rate": 0.0002, "epoch": 5.66162800097001, "step": 35020}, {"loss": 0.5209, "grad_norm": 1.0554876327514648, "learning_rate": 0.0002, "epoch": 5.663244685150756, "step": 35030}, {"loss": 0.523, "grad_norm": 1.2030539512634277, "learning_rate": 0.0002, "epoch": 5.664861369331501, "step": 35040}, {"loss": 0.5406, "grad_norm": 1.0849953889846802, "learning_rate": 0.0002, "epoch": 5.666478053512247, "step": 35050}, {"loss": 0.5747, "grad_norm": 1.1598973274230957, "learning_rate": 0.0002, "epoch": 5.668094737692992, "step": 35060}, {"loss": 0.5488, "grad_norm": 1.0233359336853027, "learning_rate": 0.0002, "epoch": 5.669711421873737, "step": 35070}, {"loss": 0.5409, "grad_norm": 1.1124799251556396, "learning_rate": 0.0002, "epoch": 5.6713281060544825, "step": 35080}, {"loss": 0.5578, "grad_norm": 1.2351475954055786, "learning_rate": 0.0002, "epoch": 5.672944790235228, "step": 35090}, {"loss": 0.5638, "grad_norm": 1.0240728855133057, "learning_rate": 0.0002, "epoch": 5.674561474415973, "step": 35100}, {"loss": 0.5192, "grad_norm": 1.0223692655563354, "learning_rate": 0.0002, "epoch": 5.676178158596718, "step": 35110}, {"loss": 0.524, "grad_norm": 1.4569132328033447, "learning_rate": 0.0002, "epoch": 5.677794842777463, "step": 35120}, {"loss": 0.555, "grad_norm": 0.8983587026596069, "learning_rate": 0.0002, "epoch": 5.679411526958209, "step": 35130}, {"loss": 0.5439, "grad_norm": 1.0775383710861206, "learning_rate": 0.0002, "epoch": 5.681028211138954, "step": 35140}, {"loss": 0.5289, "grad_norm": 0.9800270795822144, "learning_rate": 0.0002, "epoch": 5.682644895319699, "step": 35150}, {"loss": 0.533, "grad_norm": 0.9858237504959106, "learning_rate": 0.0002, "epoch": 5.684261579500444, "step": 35160}, {"loss": 0.5671, "grad_norm": 1.031087040901184, "learning_rate": 0.0002, "epoch": 5.6858782636811895, "step": 35170}, {"loss": 0.5528, "grad_norm": 1.0294365882873535, "learning_rate": 0.0002, "epoch": 5.687494947861936, "step": 35180}, {"loss": 0.5581, "grad_norm": 1.108144760131836, "learning_rate": 0.0002, "epoch": 5.68911163204268, "step": 35190}, {"loss": 0.5373, "grad_norm": 1.0813100337982178, "learning_rate": 0.0002, "epoch": 5.690728316223426, "step": 35200}, {"loss": 0.5429, "grad_norm": 1.3146867752075195, "learning_rate": 0.0002, "epoch": 5.692345000404171, "step": 35210}, {"loss": 0.5297, "grad_norm": 1.16780424118042, "learning_rate": 0.0002, "epoch": 5.693961684584917, "step": 35220}, {"loss": 0.577, "grad_norm": 0.9929125905036926, "learning_rate": 0.0002, "epoch": 5.695578368765662, "step": 35230}, {"loss": 0.5441, "grad_norm": 0.9049441814422607, "learning_rate": 0.0002, "epoch": 5.697195052946407, "step": 35240}, {"loss": 0.5349, "grad_norm": 0.9768866300582886, "learning_rate": 0.0002, "epoch": 5.698811737127152, "step": 35250}, {"loss": 0.542, "grad_norm": 0.8306029438972473, "learning_rate": 0.0002, "epoch": 5.7004284213078975, "step": 35260}, {"loss": 0.4771, "grad_norm": 0.8417280316352844, "learning_rate": 0.0002, "epoch": 5.702045105488643, "step": 35270}, {"loss": 0.574, "grad_norm": 0.9954485893249512, "learning_rate": 0.0002, "epoch": 5.703661789669388, "step": 35280}, {"loss": 0.5469, "grad_norm": 1.2417993545532227, "learning_rate": 0.0002, "epoch": 5.705278473850133, "step": 35290}, {"loss": 0.5275, "grad_norm": 1.1696544885635376, "learning_rate": 0.0002, "epoch": 5.706895158030878, "step": 35300}, {"loss": 0.5188, "grad_norm": 1.2424817085266113, "learning_rate": 0.0002, "epoch": 5.708511842211624, "step": 35310}, {"loss": 0.5595, "grad_norm": 1.1791106462478638, "learning_rate": 0.0002, "epoch": 5.710128526392369, "step": 35320}, {"loss": 0.5076, "grad_norm": 1.202181339263916, "learning_rate": 0.0002, "epoch": 5.711745210573115, "step": 35330}, {"loss": 0.5847, "grad_norm": 1.1006861925125122, "learning_rate": 0.0002, "epoch": 5.713361894753859, "step": 35340}, {"loss": 0.5627, "grad_norm": 1.0918344259262085, "learning_rate": 0.0002, "epoch": 5.7149785789346055, "step": 35350}, {"loss": 0.5677, "grad_norm": 1.0427305698394775, "learning_rate": 0.0002, "epoch": 5.716595263115351, "step": 35360}, {"loss": 0.5288, "grad_norm": 1.0818872451782227, "learning_rate": 0.0002, "epoch": 5.718211947296096, "step": 35370}, {"loss": 0.5296, "grad_norm": 1.186006784439087, "learning_rate": 0.0002, "epoch": 5.719828631476841, "step": 35380}, {"loss": 0.5507, "grad_norm": 1.2073674201965332, "learning_rate": 0.0002, "epoch": 5.721445315657586, "step": 35390}, {"loss": 0.5483, "grad_norm": 1.065338134765625, "learning_rate": 0.0002, "epoch": 5.723061999838332, "step": 35400}, {"loss": 0.5195, "grad_norm": 0.9448973536491394, "learning_rate": 0.0002, "epoch": 5.724678684019077, "step": 35410}, {"loss": 0.5276, "grad_norm": 1.1487499475479126, "learning_rate": 0.0002, "epoch": 5.726295368199822, "step": 35420}, {"loss": 0.5435, "grad_norm": 1.1334216594696045, "learning_rate": 0.0002, "epoch": 5.727912052380567, "step": 35430}, {"loss": 0.5074, "grad_norm": 1.1932826042175293, "learning_rate": 0.0002, "epoch": 5.729528736561313, "step": 35440}, {"loss": 0.5502, "grad_norm": 1.2615786790847778, "learning_rate": 0.0002, "epoch": 5.731145420742058, "step": 35450}, {"loss": 0.5612, "grad_norm": 1.2803694009780884, "learning_rate": 0.0002, "epoch": 5.732762104922803, "step": 35460}, {"loss": 0.5458, "grad_norm": 0.9271906614303589, "learning_rate": 0.0002, "epoch": 5.734378789103548, "step": 35470}, {"loss": 0.5342, "grad_norm": 1.0958917140960693, "learning_rate": 0.0002, "epoch": 5.735995473284294, "step": 35480}, {"loss": 0.538, "grad_norm": 1.1072784662246704, "learning_rate": 0.0002, "epoch": 5.737612157465039, "step": 35490}, {"loss": 0.5683, "grad_norm": 1.1641002893447876, "learning_rate": 0.0002, "epoch": 5.739228841645785, "step": 35500}, {"loss": 0.5252, "grad_norm": 1.0246447324752808, "learning_rate": 0.0002, "epoch": 5.74084552582653, "step": 35510}, {"loss": 0.55, "grad_norm": 1.032474398612976, "learning_rate": 0.0002, "epoch": 5.742462210007275, "step": 35520}, {"loss": 0.4965, "grad_norm": 1.1600854396820068, "learning_rate": 0.0002, "epoch": 5.7440788941880205, "step": 35530}, {"loss": 0.5543, "grad_norm": 1.0686054229736328, "learning_rate": 0.0002, "epoch": 5.745695578368766, "step": 35540}, {"loss": 0.5706, "grad_norm": 1.2314637899398804, "learning_rate": 0.0002, "epoch": 5.747312262549511, "step": 35550}, {"loss": 0.5492, "grad_norm": 0.922134280204773, "learning_rate": 0.0002, "epoch": 5.748928946730256, "step": 35560}, {"loss": 0.5495, "grad_norm": 0.933043360710144, "learning_rate": 0.0002, "epoch": 5.7505456309110015, "step": 35570}, {"loss": 0.5007, "grad_norm": 1.1911931037902832, "learning_rate": 0.0002, "epoch": 5.752162315091747, "step": 35580}, {"loss": 0.5244, "grad_norm": 0.8984857797622681, "learning_rate": 0.0002, "epoch": 5.753778999272492, "step": 35590}, {"loss": 0.5493, "grad_norm": 0.9495107531547546, "learning_rate": 0.0002, "epoch": 5.755395683453237, "step": 35600}, {"loss": 0.5326, "grad_norm": 1.2805472612380981, "learning_rate": 0.0002, "epoch": 5.757012367633982, "step": 35610}, {"loss": 0.5276, "grad_norm": 1.1236625909805298, "learning_rate": 0.0002, "epoch": 5.758629051814728, "step": 35620}, {"loss": 0.6102, "grad_norm": 1.0552798509597778, "learning_rate": 0.0002, "epoch": 5.760245735995474, "step": 35630}, {"loss": 0.5479, "grad_norm": 1.119909644126892, "learning_rate": 0.0002, "epoch": 5.761862420176218, "step": 35640}, {"loss": 0.5282, "grad_norm": 0.8786116242408752, "learning_rate": 0.0002, "epoch": 5.763479104356964, "step": 35650}, {"loss": 0.5406, "grad_norm": 1.2417117357254028, "learning_rate": 0.0002, "epoch": 5.765095788537709, "step": 35660}, {"loss": 0.537, "grad_norm": 1.255200982093811, "learning_rate": 0.0002, "epoch": 5.766712472718455, "step": 35670}, {"loss": 0.5308, "grad_norm": 1.0611358880996704, "learning_rate": 0.0002, "epoch": 5.7683291568992, "step": 35680}, {"loss": 0.5614, "grad_norm": 1.1443911790847778, "learning_rate": 0.0002, "epoch": 5.769945841079945, "step": 35690}, {"loss": 0.5386, "grad_norm": 1.1437989473342896, "learning_rate": 0.0002, "epoch": 5.77156252526069, "step": 35700}, {"loss": 0.537, "grad_norm": 1.1375046968460083, "learning_rate": 0.0002, "epoch": 5.773179209441436, "step": 35710}, {"loss": 0.5198, "grad_norm": 1.0777729749679565, "learning_rate": 0.0002, "epoch": 5.774795893622181, "step": 35720}, {"loss": 0.5521, "grad_norm": 1.1160215139389038, "learning_rate": 0.0002, "epoch": 5.776412577802926, "step": 35730}, {"loss": 0.5569, "grad_norm": 1.1268514394760132, "learning_rate": 0.0002, "epoch": 5.778029261983671, "step": 35740}, {"loss": 0.5311, "grad_norm": 1.2752262353897095, "learning_rate": 0.0002, "epoch": 5.7796459461644165, "step": 35750}, {"loss": 0.5625, "grad_norm": 1.0416184663772583, "learning_rate": 0.0002, "epoch": 5.781262630345162, "step": 35760}, {"loss": 0.5438, "grad_norm": 1.0622444152832031, "learning_rate": 0.0002, "epoch": 5.782879314525907, "step": 35770}, {"loss": 0.5268, "grad_norm": 1.1217877864837646, "learning_rate": 0.0002, "epoch": 5.784495998706653, "step": 35780}, {"loss": 0.5225, "grad_norm": 0.9363139867782593, "learning_rate": 0.0002, "epoch": 5.786112682887398, "step": 35790}, {"loss": 0.5524, "grad_norm": 0.96628737449646, "learning_rate": 0.0002, "epoch": 5.787729367068144, "step": 35800}, {"loss": 0.52, "grad_norm": 0.9572572112083435, "learning_rate": 0.0002, "epoch": 5.789346051248889, "step": 35810}, {"loss": 0.5615, "grad_norm": 0.938724935054779, "learning_rate": 0.0002, "epoch": 5.790962735429634, "step": 35820}, {"loss": 0.5391, "grad_norm": 1.3314417600631714, "learning_rate": 0.0002, "epoch": 5.792579419610379, "step": 35830}, {"loss": 0.5441, "grad_norm": 1.0097602605819702, "learning_rate": 0.0002, "epoch": 5.7941961037911245, "step": 35840}, {"loss": 0.591, "grad_norm": 1.1265122890472412, "learning_rate": 0.0002, "epoch": 5.79581278797187, "step": 35850}, {"loss": 0.5333, "grad_norm": 1.2191909551620483, "learning_rate": 0.0002, "epoch": 5.797429472152615, "step": 35860}, {"loss": 0.5274, "grad_norm": 0.9690808057785034, "learning_rate": 0.0002, "epoch": 5.79904615633336, "step": 35870}, {"loss": 0.5425, "grad_norm": 1.0871665477752686, "learning_rate": 0.0002, "epoch": 5.800662840514105, "step": 35880}, {"loss": 0.5602, "grad_norm": 1.1093597412109375, "learning_rate": 0.0002, "epoch": 5.802279524694851, "step": 35890}, {"loss": 0.5475, "grad_norm": 1.2434282302856445, "learning_rate": 0.0002, "epoch": 5.803896208875596, "step": 35900}, {"loss": 0.5288, "grad_norm": 1.2933623790740967, "learning_rate": 0.0002, "epoch": 5.805512893056341, "step": 35910}, {"loss": 0.5554, "grad_norm": 1.0005441904067993, "learning_rate": 0.0002, "epoch": 5.807129577237086, "step": 35920}, {"loss": 0.5318, "grad_norm": 1.2373108863830566, "learning_rate": 0.0002, "epoch": 5.8087462614178325, "step": 35930}, {"loss": 0.5413, "grad_norm": 1.2622692584991455, "learning_rate": 0.0002, "epoch": 5.810362945598578, "step": 35940}, {"loss": 0.5558, "grad_norm": 1.0112963914871216, "learning_rate": 0.0002, "epoch": 5.811979629779323, "step": 35950}, {"loss": 0.5115, "grad_norm": 1.050572395324707, "learning_rate": 0.0002, "epoch": 5.813596313960068, "step": 35960}, {"loss": 0.5288, "grad_norm": 0.9774560928344727, "learning_rate": 0.0002, "epoch": 5.815212998140813, "step": 35970}, {"loss": 0.585, "grad_norm": 1.19438898563385, "learning_rate": 0.0002, "epoch": 5.816829682321559, "step": 35980}, {"loss": 0.5798, "grad_norm": 1.0267130136489868, "learning_rate": 0.0002, "epoch": 5.818446366502304, "step": 35990}, {"loss": 0.5126, "grad_norm": 0.9813851714134216, "learning_rate": 0.0002, "epoch": 5.820063050683049, "step": 36000}, {"loss": 0.5138, "grad_norm": 0.9177457094192505, "learning_rate": 0.0002, "epoch": 5.821679734863794, "step": 36010}, {"loss": 0.5453, "grad_norm": 1.0020731687545776, "learning_rate": 0.0002, "epoch": 5.8232964190445395, "step": 36020}, {"loss": 0.5646, "grad_norm": 1.073222041130066, "learning_rate": 0.0002, "epoch": 5.824913103225285, "step": 36030}, {"loss": 0.5539, "grad_norm": 1.016337513923645, "learning_rate": 0.0002, "epoch": 5.82652978740603, "step": 36040}, {"loss": 0.5592, "grad_norm": 1.267364263534546, "learning_rate": 0.0002, "epoch": 5.828146471586775, "step": 36050}, {"loss": 0.595, "grad_norm": 1.2730127573013306, "learning_rate": 0.0002, "epoch": 5.8297631557675205, "step": 36060}, {"loss": 0.5247, "grad_norm": 1.108442783355713, "learning_rate": 0.0002, "epoch": 5.831379839948266, "step": 36070}, {"loss": 0.5103, "grad_norm": 1.198072075843811, "learning_rate": 0.0002, "epoch": 5.832996524129012, "step": 36080}, {"loss": 0.5479, "grad_norm": 1.0458786487579346, "learning_rate": 0.0002, "epoch": 5.834613208309757, "step": 36090}, {"loss": 0.5564, "grad_norm": 0.9096664786338806, "learning_rate": 0.0002, "epoch": 5.836229892490502, "step": 36100}, {"loss": 0.5602, "grad_norm": 0.9957793951034546, "learning_rate": 0.0002, "epoch": 5.8378465766712475, "step": 36110}, {"loss": 0.5799, "grad_norm": 1.3693058490753174, "learning_rate": 0.0002, "epoch": 5.839463260851993, "step": 36120}, {"loss": 0.5425, "grad_norm": 1.268608808517456, "learning_rate": 0.0002, "epoch": 5.841079945032738, "step": 36130}, {"loss": 0.5653, "grad_norm": 0.8516020178794861, "learning_rate": 0.0002, "epoch": 5.842696629213483, "step": 36140}, {"loss": 0.5475, "grad_norm": 0.90385502576828, "learning_rate": 0.0002, "epoch": 5.844313313394228, "step": 36150}, {"loss": 0.5274, "grad_norm": 1.0910571813583374, "learning_rate": 0.0002, "epoch": 5.845929997574974, "step": 36160}, {"loss": 0.555, "grad_norm": 0.9417795538902283, "learning_rate": 0.0002, "epoch": 5.847546681755719, "step": 36170}, {"loss": 0.5784, "grad_norm": 1.0027360916137695, "learning_rate": 0.0002, "epoch": 5.849163365936464, "step": 36180}, {"loss": 0.5423, "grad_norm": 1.1480516195297241, "learning_rate": 0.0002, "epoch": 5.850780050117209, "step": 36190}, {"loss": 0.5517, "grad_norm": 1.2431457042694092, "learning_rate": 0.0002, "epoch": 5.852396734297955, "step": 36200}, {"loss": 0.5404, "grad_norm": 1.091465950012207, "learning_rate": 0.0002, "epoch": 5.8540134184787, "step": 36210}, {"loss": 0.53, "grad_norm": 0.9693930745124817, "learning_rate": 0.0002, "epoch": 5.855630102659445, "step": 36220}, {"loss": 0.5453, "grad_norm": 0.9937465190887451, "learning_rate": 0.0002, "epoch": 5.857246786840191, "step": 36230}, {"loss": 0.5621, "grad_norm": 1.0731011629104614, "learning_rate": 0.0002, "epoch": 5.858863471020936, "step": 36240}, {"loss": 0.5687, "grad_norm": 1.0869048833847046, "learning_rate": 0.0002, "epoch": 5.860480155201682, "step": 36250}, {"loss": 0.5576, "grad_norm": 0.9226390719413757, "learning_rate": 0.0002, "epoch": 5.862096839382427, "step": 36260}, {"loss": 0.531, "grad_norm": 1.1755430698394775, "learning_rate": 0.0002, "epoch": 5.863713523563172, "step": 36270}, {"loss": 0.558, "grad_norm": 0.8815974593162537, "learning_rate": 0.0002, "epoch": 5.865330207743917, "step": 36280}, {"loss": 0.5065, "grad_norm": 1.3648751974105835, "learning_rate": 0.0002, "epoch": 5.866946891924663, "step": 36290}, {"loss": 0.536, "grad_norm": 0.8729211091995239, "learning_rate": 0.0002, "epoch": 5.868563576105408, "step": 36300}, {"loss": 0.5192, "grad_norm": 1.0870907306671143, "learning_rate": 0.0002, "epoch": 5.870180260286153, "step": 36310}, {"loss": 0.5609, "grad_norm": 1.1164259910583496, "learning_rate": 0.0002, "epoch": 5.871796944466898, "step": 36320}, {"loss": 0.551, "grad_norm": 1.1572535037994385, "learning_rate": 0.0002, "epoch": 5.8734136286476435, "step": 36330}, {"loss": 0.5898, "grad_norm": 1.0456238985061646, "learning_rate": 0.0002, "epoch": 5.875030312828389, "step": 36340}, {"loss": 0.5008, "grad_norm": 1.1310722827911377, "learning_rate": 0.0002, "epoch": 5.876646997009134, "step": 36350}, {"loss": 0.5352, "grad_norm": 1.0004712343215942, "learning_rate": 0.0002, "epoch": 5.878263681189879, "step": 36360}, {"loss": 0.5632, "grad_norm": 1.0991777181625366, "learning_rate": 0.0002, "epoch": 5.879880365370624, "step": 36370}, {"loss": 0.5815, "grad_norm": 1.2789239883422852, "learning_rate": 0.0002, "epoch": 5.8814970495513705, "step": 36380}, {"loss": 0.56, "grad_norm": 0.9524819850921631, "learning_rate": 0.0002, "epoch": 5.883113733732116, "step": 36390}, {"loss": 0.5701, "grad_norm": 1.1115771532058716, "learning_rate": 0.0002, "epoch": 5.884730417912861, "step": 36400}, {"loss": 0.5463, "grad_norm": 1.37419855594635, "learning_rate": 0.0002, "epoch": 5.886347102093606, "step": 36410}, {"loss": 0.5675, "grad_norm": 1.1449527740478516, "learning_rate": 0.0002, "epoch": 5.8879637862743515, "step": 36420}, {"loss": 0.5255, "grad_norm": 1.198046326637268, "learning_rate": 0.0002, "epoch": 5.889580470455097, "step": 36430}, {"loss": 0.5383, "grad_norm": 1.0180530548095703, "learning_rate": 0.0002, "epoch": 5.891197154635842, "step": 36440}, {"loss": 0.5319, "grad_norm": 1.0516417026519775, "learning_rate": 0.0002, "epoch": 5.892813838816587, "step": 36450}, {"loss": 0.5782, "grad_norm": 1.1658052206039429, "learning_rate": 0.0002, "epoch": 5.894430522997332, "step": 36460}, {"loss": 0.5864, "grad_norm": 1.190699577331543, "learning_rate": 0.0002, "epoch": 5.896047207178078, "step": 36470}, {"loss": 0.5451, "grad_norm": 1.1235495805740356, "learning_rate": 0.0002, "epoch": 5.897663891358823, "step": 36480}, {"loss": 0.5284, "grad_norm": 1.1926926374435425, "learning_rate": 0.0002, "epoch": 5.899280575539568, "step": 36490}, {"loss": 0.5686, "grad_norm": 1.1184662580490112, "learning_rate": 0.0002, "epoch": 5.900897259720313, "step": 36500}, {"loss": 0.5147, "grad_norm": 1.000970721244812, "learning_rate": 0.0002, "epoch": 5.9025139439010585, "step": 36510}, {"loss": 0.5351, "grad_norm": 1.0373306274414062, "learning_rate": 0.0002, "epoch": 5.904130628081804, "step": 36520}, {"loss": 0.535, "grad_norm": 1.0840669870376587, "learning_rate": 0.0002, "epoch": 5.90574731226255, "step": 36530}, {"loss": 0.538, "grad_norm": 0.9908381104469299, "learning_rate": 0.0002, "epoch": 5.907363996443295, "step": 36540}, {"loss": 0.5313, "grad_norm": 1.0456029176712036, "learning_rate": 0.0002, "epoch": 5.90898068062404, "step": 36550}, {"loss": 0.5693, "grad_norm": 1.1381454467773438, "learning_rate": 0.0002, "epoch": 5.910597364804786, "step": 36560}, {"loss": 0.5473, "grad_norm": 0.9440900087356567, "learning_rate": 0.0002, "epoch": 5.912214048985531, "step": 36570}, {"loss": 0.5542, "grad_norm": 1.1674573421478271, "learning_rate": 0.0002, "epoch": 5.913830733166276, "step": 36580}, {"loss": 0.526, "grad_norm": 1.1226966381072998, "learning_rate": 0.0002, "epoch": 5.915447417347021, "step": 36590}, {"loss": 0.6091, "grad_norm": 0.9696915745735168, "learning_rate": 0.0002, "epoch": 5.9170641015277665, "step": 36600}, {"loss": 0.5523, "grad_norm": 0.9593005180358887, "learning_rate": 0.0002, "epoch": 5.918680785708512, "step": 36610}, {"loss": 0.5536, "grad_norm": 1.122169852256775, "learning_rate": 0.0002, "epoch": 5.920297469889257, "step": 36620}, {"loss": 0.5039, "grad_norm": 0.9923415780067444, "learning_rate": 0.0002, "epoch": 5.921914154070002, "step": 36630}, {"loss": 0.5893, "grad_norm": 1.063838005065918, "learning_rate": 0.0002, "epoch": 5.923530838250747, "step": 36640}, {"loss": 0.5799, "grad_norm": 0.9083505272865295, "learning_rate": 0.0002, "epoch": 5.925147522431493, "step": 36650}, {"loss": 0.5264, "grad_norm": 0.9439437985420227, "learning_rate": 0.0002, "epoch": 5.926764206612239, "step": 36660}, {"loss": 0.5891, "grad_norm": 0.9778534173965454, "learning_rate": 0.0002, "epoch": 5.928380890792983, "step": 36670}, {"loss": 0.566, "grad_norm": 0.9723961353302002, "learning_rate": 0.0002, "epoch": 5.929997574973729, "step": 36680}, {"loss": 0.5741, "grad_norm": 1.162333607673645, "learning_rate": 0.0002, "epoch": 5.9316142591544745, "step": 36690}, {"loss": 0.5771, "grad_norm": 1.2784897089004517, "learning_rate": 0.0002, "epoch": 5.93323094333522, "step": 36700}, {"loss": 0.5343, "grad_norm": 1.0924867391586304, "learning_rate": 0.0002, "epoch": 5.934847627515965, "step": 36710}, {"loss": 0.5554, "grad_norm": 1.046922206878662, "learning_rate": 0.0002, "epoch": 5.93646431169671, "step": 36720}, {"loss": 0.5476, "grad_norm": 0.8632535338401794, "learning_rate": 0.0002, "epoch": 5.938080995877455, "step": 36730}, {"loss": 0.5456, "grad_norm": 1.358762502670288, "learning_rate": 0.0002, "epoch": 5.939697680058201, "step": 36740}, {"loss": 0.551, "grad_norm": 1.2058624029159546, "learning_rate": 0.0002, "epoch": 5.941314364238946, "step": 36750}, {"loss": 0.5462, "grad_norm": 1.1396408081054688, "learning_rate": 0.0002, "epoch": 5.942931048419691, "step": 36760}, {"loss": 0.5483, "grad_norm": 1.1510354280471802, "learning_rate": 0.0002, "epoch": 5.944547732600436, "step": 36770}, {"loss": 0.5659, "grad_norm": 1.1401607990264893, "learning_rate": 0.0002, "epoch": 5.946164416781182, "step": 36780}, {"loss": 0.5557, "grad_norm": 1.1871325969696045, "learning_rate": 0.0002, "epoch": 5.947781100961927, "step": 36790}, {"loss": 0.4945, "grad_norm": 0.9928333163261414, "learning_rate": 0.0002, "epoch": 5.949397785142672, "step": 36800}, {"loss": 0.5303, "grad_norm": 1.0549445152282715, "learning_rate": 0.0002, "epoch": 5.951014469323418, "step": 36810}, {"loss": 0.5532, "grad_norm": 0.9791563749313354, "learning_rate": 0.0002, "epoch": 5.9526311535041625, "step": 36820}, {"loss": 0.5317, "grad_norm": 1.1268441677093506, "learning_rate": 0.0002, "epoch": 5.954247837684909, "step": 36830}, {"loss": 0.5585, "grad_norm": 1.0533992052078247, "learning_rate": 0.0002, "epoch": 5.955864521865654, "step": 36840}, {"loss": 0.4972, "grad_norm": 1.023358941078186, "learning_rate": 0.0002, "epoch": 5.957481206046399, "step": 36850}, {"loss": 0.5557, "grad_norm": 1.2631961107254028, "learning_rate": 0.0002, "epoch": 5.959097890227144, "step": 36860}, {"loss": 0.5662, "grad_norm": 0.9397698640823364, "learning_rate": 0.0002, "epoch": 5.9607145744078895, "step": 36870}, {"loss": 0.5775, "grad_norm": 1.1678427457809448, "learning_rate": 0.0002, "epoch": 5.962331258588635, "step": 36880}, {"loss": 0.5435, "grad_norm": 1.1403759717941284, "learning_rate": 0.0002, "epoch": 5.96394794276938, "step": 36890}, {"loss": 0.5479, "grad_norm": 1.030572772026062, "learning_rate": 0.0002, "epoch": 5.965564626950125, "step": 36900}, {"loss": 0.5838, "grad_norm": 1.0992497205734253, "learning_rate": 0.0002, "epoch": 5.9671813111308705, "step": 36910}, {"loss": 0.5452, "grad_norm": 1.075466275215149, "learning_rate": 0.0002, "epoch": 5.968797995311616, "step": 36920}, {"loss": 0.5739, "grad_norm": 1.0153694152832031, "learning_rate": 0.0002, "epoch": 5.970414679492361, "step": 36930}, {"loss": 0.5672, "grad_norm": 0.973193883895874, "learning_rate": 0.0002, "epoch": 5.972031363673106, "step": 36940}, {"loss": 0.5585, "grad_norm": 0.8294678926467896, "learning_rate": 0.0002, "epoch": 5.973648047853851, "step": 36950}, {"loss": 0.5631, "grad_norm": 1.0048716068267822, "learning_rate": 0.0002, "epoch": 5.9752647320345975, "step": 36960}, {"loss": 0.5471, "grad_norm": 0.9714070558547974, "learning_rate": 0.0002, "epoch": 5.976881416215342, "step": 36970}, {"loss": 0.5419, "grad_norm": 0.8667682409286499, "learning_rate": 0.0002, "epoch": 5.978498100396088, "step": 36980}, {"loss": 0.5474, "grad_norm": 1.0461409091949463, "learning_rate": 0.0002, "epoch": 5.980114784576833, "step": 36990}, {"loss": 0.5454, "grad_norm": 0.9229754209518433, "learning_rate": 0.0002, "epoch": 5.981731468757578, "step": 37000}, {"loss": 0.5599, "grad_norm": 1.0406876802444458, "learning_rate": 0.0002, "epoch": 5.983348152938324, "step": 37010}, {"loss": 0.5569, "grad_norm": 0.8993828296661377, "learning_rate": 0.0002, "epoch": 5.984964837119069, "step": 37020}, {"loss": 0.5611, "grad_norm": 1.2260479927062988, "learning_rate": 0.0002, "epoch": 5.986581521299814, "step": 37030}, {"loss": 0.5523, "grad_norm": 1.0107380151748657, "learning_rate": 0.0002, "epoch": 5.988198205480559, "step": 37040}, {"loss": 0.5639, "grad_norm": 1.0240139961242676, "learning_rate": 0.0002, "epoch": 5.989814889661305, "step": 37050}, {"loss": 0.5209, "grad_norm": 1.0185275077819824, "learning_rate": 0.0002, "epoch": 5.99143157384205, "step": 37060}, {"loss": 0.5114, "grad_norm": 1.1361802816390991, "learning_rate": 0.0002, "epoch": 5.993048258022795, "step": 37070}, {"loss": 0.5692, "grad_norm": 1.0395532846450806, "learning_rate": 0.0002, "epoch": 5.99466494220354, "step": 37080}, {"loss": 0.594, "grad_norm": 0.9463558197021484, "learning_rate": 0.0002, "epoch": 5.9962816263842855, "step": 37090}, {"loss": 0.5775, "grad_norm": 1.2066948413848877, "learning_rate": 0.0002, "epoch": 5.997898310565031, "step": 37100}, {"loss": 0.5356, "grad_norm": 0.9749386310577393, "learning_rate": 0.0002, "epoch": 5.999514994745777, "step": 37110}]} +{"epoch": 6.9999191657909625, "step": 43298, "epoch_duration": 16911.370942354202, "total_accumulated_duration": 118243.28742051125, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7672.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.6636, "grad_norm": 0.9894065856933594, "learning_rate": 0.0002, "epoch": 0.0016166841807452913, "step": 10}, {"loss": 1.1528, "grad_norm": 1.7810699939727783, "learning_rate": 0.0002, "epoch": 0.0032333683614905826, "step": 20}, {"loss": 0.9767, "grad_norm": 0.5969577431678772, "learning_rate": 0.0002, "epoch": 0.004850052542235874, "step": 30}, {"loss": 0.9772, "grad_norm": 0.6354120969772339, "learning_rate": 0.0002, "epoch": 0.006466736722981165, "step": 40}, {"loss": 0.8643, "grad_norm": 0.5604607462882996, "learning_rate": 0.0002, "epoch": 0.008083420903726457, "step": 50}, {"loss": 0.8841, "grad_norm": 0.4676193594932556, "learning_rate": 0.0002, "epoch": 0.009700105084471748, "step": 60}, {"loss": 0.9022, "grad_norm": 0.6099211573600769, "learning_rate": 0.0002, "epoch": 0.01131678926521704, "step": 70}, {"loss": 0.9133, "grad_norm": 0.48639994859695435, "learning_rate": 0.0002, "epoch": 0.01293347344596233, "step": 80}, {"loss": 0.8704, "grad_norm": 0.4904264509677887, "learning_rate": 0.0002, "epoch": 0.014550157626707623, "step": 90}, {"loss": 0.8855, "grad_norm": 2.8334362506866455, "learning_rate": 0.0002, "epoch": 0.016166841807452915, "step": 100}, {"loss": 0.8958, "grad_norm": 0.43221670389175415, "learning_rate": 0.0002, "epoch": 0.017783525988198205, "step": 110}, {"loss": 0.8412, "grad_norm": 0.42244166135787964, "learning_rate": 0.0002, "epoch": 0.019400210168943496, "step": 120}, {"loss": 0.8467, "grad_norm": 0.45363298058509827, "learning_rate": 0.0002, "epoch": 0.02101689434968879, "step": 130}, {"loss": 0.8641, "grad_norm": 0.44816508889198303, "learning_rate": 0.0002, "epoch": 0.02263357853043408, "step": 140}, {"loss": 0.8496, "grad_norm": 0.43308213353157043, "learning_rate": 0.0002, "epoch": 0.02425026271117937, "step": 150}, {"loss": 0.8213, "grad_norm": 0.4084763526916504, "learning_rate": 0.0002, "epoch": 0.02586694689192466, "step": 160}, {"loss": 0.8343, "grad_norm": 0.5363703966140747, "learning_rate": 0.0002, "epoch": 0.027483631072669955, "step": 170}, {"loss": 0.8558, "grad_norm": 0.4619699716567993, "learning_rate": 0.0002, "epoch": 0.029100315253415245, "step": 180}, {"loss": 0.8878, "grad_norm": 0.49069908261299133, "learning_rate": 0.0002, "epoch": 0.030716999434160536, "step": 190}, {"loss": 0.8867, "grad_norm": 0.4645835757255554, "learning_rate": 0.0002, "epoch": 0.03233368361490583, "step": 200}, {"loss": 0.8842, "grad_norm": 1.2411243915557861, "learning_rate": 0.0002, "epoch": 0.03395036779565112, "step": 210}, {"loss": 0.8245, "grad_norm": 0.5211851596832275, "learning_rate": 0.0002, "epoch": 0.03556705197639641, "step": 220}, {"loss": 0.8194, "grad_norm": 0.5253691673278809, "learning_rate": 0.0002, "epoch": 0.037183736157141704, "step": 230}, {"loss": 0.8856, "grad_norm": 0.4567478895187378, "learning_rate": 0.0002, "epoch": 0.03880042033788699, "step": 240}, {"loss": 0.838, "grad_norm": 0.5472128391265869, "learning_rate": 0.0002, "epoch": 0.040417104518632285, "step": 250}, {"loss": 0.8201, "grad_norm": 0.42978546023368835, "learning_rate": 0.0002, "epoch": 0.04203378869937758, "step": 260}, {"loss": 0.8334, "grad_norm": 0.601734459400177, "learning_rate": 0.0002, "epoch": 0.043650472880122866, "step": 270}, {"loss": 0.815, "grad_norm": 0.4286513328552246, "learning_rate": 0.0002, "epoch": 0.04526715706086816, "step": 280}, {"loss": 0.8758, "grad_norm": 0.5230861902236938, "learning_rate": 0.0002, "epoch": 0.046883841241613454, "step": 290}, {"loss": 0.8636, "grad_norm": 0.6504611968994141, "learning_rate": 0.0002, "epoch": 0.04850052542235874, "step": 300}, {"loss": 0.8102, "grad_norm": 0.43485215306282043, "learning_rate": 0.0002, "epoch": 0.050117209603104035, "step": 310}, {"loss": 0.8221, "grad_norm": 0.4717007875442505, "learning_rate": 0.0002, "epoch": 0.05173389378384932, "step": 320}, {"loss": 0.8469, "grad_norm": 0.4059787690639496, "learning_rate": 0.0002, "epoch": 0.053350577964594616, "step": 330}, {"loss": 0.8866, "grad_norm": 0.4366913437843323, "learning_rate": 0.0002, "epoch": 0.05496726214533991, "step": 340}, {"loss": 0.7976, "grad_norm": 0.4233848452568054, "learning_rate": 0.0002, "epoch": 0.0565839463260852, "step": 350}, {"loss": 0.8456, "grad_norm": 0.4209108352661133, "learning_rate": 0.0002, "epoch": 0.05820063050683049, "step": 360}, {"loss": 0.816, "grad_norm": 0.41637396812438965, "learning_rate": 0.0002, "epoch": 0.059817314687575784, "step": 370}, {"loss": 0.7976, "grad_norm": 0.46235376596450806, "learning_rate": 0.0002, "epoch": 0.06143399886832107, "step": 380}, {"loss": 0.7966, "grad_norm": 0.4013484716415405, "learning_rate": 0.0002, "epoch": 0.06305068304906636, "step": 390}, {"loss": 0.8253, "grad_norm": 0.47443896532058716, "learning_rate": 0.0002, "epoch": 0.06466736722981166, "step": 400}, {"loss": 0.8666, "grad_norm": 0.3942156434059143, "learning_rate": 0.0002, "epoch": 0.06628405141055695, "step": 410}, {"loss": 0.8402, "grad_norm": 0.4965320825576782, "learning_rate": 0.0002, "epoch": 0.06790073559130223, "step": 420}, {"loss": 0.8317, "grad_norm": 0.4304835796356201, "learning_rate": 0.0002, "epoch": 0.06951741977204753, "step": 430}, {"loss": 0.8528, "grad_norm": 0.511726975440979, "learning_rate": 0.0002, "epoch": 0.07113410395279282, "step": 440}, {"loss": 0.8675, "grad_norm": 0.4040689170360565, "learning_rate": 0.0002, "epoch": 0.07275078813353811, "step": 450}, {"loss": 0.8788, "grad_norm": 0.5402171015739441, "learning_rate": 0.0002, "epoch": 0.07436747231428341, "step": 460}, {"loss": 0.8737, "grad_norm": 0.4174517095088959, "learning_rate": 0.0002, "epoch": 0.0759841564950287, "step": 470}, {"loss": 0.7605, "grad_norm": 0.4306182265281677, "learning_rate": 0.0002, "epoch": 0.07760084067577398, "step": 480}, {"loss": 0.799, "grad_norm": 0.535210132598877, "learning_rate": 0.0002, "epoch": 0.07921752485651928, "step": 490}, {"loss": 0.7825, "grad_norm": 0.5339109897613525, "learning_rate": 0.0002, "epoch": 0.08083420903726457, "step": 500}, {"loss": 0.8985, "grad_norm": 0.45754891633987427, "learning_rate": 0.0002, "epoch": 0.08245089321800986, "step": 510}, {"loss": 0.8144, "grad_norm": 0.43820783495903015, "learning_rate": 0.0002, "epoch": 0.08406757739875516, "step": 520}, {"loss": 0.8001, "grad_norm": 0.4434749186038971, "learning_rate": 0.0002, "epoch": 0.08568426157950045, "step": 530}, {"loss": 0.7857, "grad_norm": 0.43111467361450195, "learning_rate": 0.0002, "epoch": 0.08730094576024573, "step": 540}, {"loss": 0.8418, "grad_norm": 0.4378940165042877, "learning_rate": 0.0002, "epoch": 0.08891762994099103, "step": 550}, {"loss": 0.8361, "grad_norm": 0.4772215187549591, "learning_rate": 0.0002, "epoch": 0.09053431412173632, "step": 560}, {"loss": 0.8268, "grad_norm": 0.6837629079818726, "learning_rate": 0.0002, "epoch": 0.09215099830248161, "step": 570}, {"loss": 0.8607, "grad_norm": 0.42241212725639343, "learning_rate": 0.0002, "epoch": 0.09376768248322691, "step": 580}, {"loss": 0.852, "grad_norm": 0.5165936350822449, "learning_rate": 0.0002, "epoch": 0.0953843666639722, "step": 590}, {"loss": 0.8664, "grad_norm": 0.48737478256225586, "learning_rate": 0.0002, "epoch": 0.09700105084471748, "step": 600}, {"loss": 0.8806, "grad_norm": 0.47419852018356323, "learning_rate": 0.0002, "epoch": 0.09861773502546278, "step": 610}, {"loss": 0.8254, "grad_norm": 0.4975486099720001, "learning_rate": 0.0002, "epoch": 0.10023441920620807, "step": 620}, {"loss": 0.8548, "grad_norm": 0.49123844504356384, "learning_rate": 0.0002, "epoch": 0.10185110338695336, "step": 630}, {"loss": 0.8911, "grad_norm": 0.6288952827453613, "learning_rate": 0.0002, "epoch": 0.10346778756769864, "step": 640}, {"loss": 0.827, "grad_norm": 0.4277345836162567, "learning_rate": 0.0002, "epoch": 0.10508447174844394, "step": 650}, {"loss": 0.7996, "grad_norm": 0.4021061956882477, "learning_rate": 0.0002, "epoch": 0.10670115592918923, "step": 660}, {"loss": 0.87, "grad_norm": 0.3492237329483032, "learning_rate": 0.0002, "epoch": 0.10831784010993452, "step": 670}, {"loss": 0.8698, "grad_norm": 0.4341012239456177, "learning_rate": 0.0002, "epoch": 0.10993452429067982, "step": 680}, {"loss": 0.781, "grad_norm": 0.7296304106712341, "learning_rate": 0.0002, "epoch": 0.1115512084714251, "step": 690}, {"loss": 0.8433, "grad_norm": 0.397494912147522, "learning_rate": 0.0002, "epoch": 0.1131678926521704, "step": 700}, {"loss": 0.827, "grad_norm": 0.396431028842926, "learning_rate": 0.0002, "epoch": 0.1147845768329157, "step": 710}, {"loss": 0.8379, "grad_norm": 0.48842838406562805, "learning_rate": 0.0002, "epoch": 0.11640126101366098, "step": 720}, {"loss": 0.8238, "grad_norm": 0.46322616934776306, "learning_rate": 0.0002, "epoch": 0.11801794519440627, "step": 730}, {"loss": 0.8041, "grad_norm": 0.47990912199020386, "learning_rate": 0.0002, "epoch": 0.11963462937515157, "step": 740}, {"loss": 0.82, "grad_norm": 0.4997142255306244, "learning_rate": 0.0002, "epoch": 0.12125131355589686, "step": 750}, {"loss": 0.7702, "grad_norm": 0.4040526747703552, "learning_rate": 0.0002, "epoch": 0.12286799773664214, "step": 760}, {"loss": 0.863, "grad_norm": 0.453095942735672, "learning_rate": 0.0002, "epoch": 0.12448468191738744, "step": 770}, {"loss": 0.8792, "grad_norm": 0.4636971950531006, "learning_rate": 0.0002, "epoch": 0.12610136609813272, "step": 780}, {"loss": 0.8112, "grad_norm": 0.4279276132583618, "learning_rate": 0.0002, "epoch": 0.12771805027887803, "step": 790}, {"loss": 0.8711, "grad_norm": 0.46212655305862427, "learning_rate": 0.0002, "epoch": 0.12933473445962332, "step": 800}, {"loss": 0.8368, "grad_norm": 0.43127650022506714, "learning_rate": 0.0002, "epoch": 0.1309514186403686, "step": 810}, {"loss": 0.8476, "grad_norm": 0.4201301336288452, "learning_rate": 0.0002, "epoch": 0.1325681028211139, "step": 820}, {"loss": 0.8078, "grad_norm": 0.42583167552948, "learning_rate": 0.0002, "epoch": 0.13418478700185918, "step": 830}, {"loss": 0.8219, "grad_norm": 0.4535622000694275, "learning_rate": 0.0002, "epoch": 0.13580147118260447, "step": 840}, {"loss": 0.8423, "grad_norm": 0.4116036891937256, "learning_rate": 0.0002, "epoch": 0.13741815536334978, "step": 850}, {"loss": 0.8466, "grad_norm": 0.45997580885887146, "learning_rate": 0.0002, "epoch": 0.13903483954409507, "step": 860}, {"loss": 0.8917, "grad_norm": 0.4487837255001068, "learning_rate": 0.0002, "epoch": 0.14065152372484035, "step": 870}, {"loss": 0.8217, "grad_norm": 0.43650057911872864, "learning_rate": 0.0002, "epoch": 0.14226820790558564, "step": 880}, {"loss": 0.8178, "grad_norm": 0.5335358381271362, "learning_rate": 0.0002, "epoch": 0.14388489208633093, "step": 890}, {"loss": 0.7957, "grad_norm": 0.5989000201225281, "learning_rate": 0.0002, "epoch": 0.14550157626707622, "step": 900}, {"loss": 0.8385, "grad_norm": 0.517179012298584, "learning_rate": 0.0002, "epoch": 0.14711826044782153, "step": 910}, {"loss": 0.8255, "grad_norm": 0.44435232877731323, "learning_rate": 0.0002, "epoch": 0.14873494462856682, "step": 920}, {"loss": 0.8305, "grad_norm": 0.42635923624038696, "learning_rate": 0.0002, "epoch": 0.1503516288093121, "step": 930}, {"loss": 0.8043, "grad_norm": 0.49603334069252014, "learning_rate": 0.0002, "epoch": 0.1519683129900574, "step": 940}, {"loss": 0.8377, "grad_norm": 0.40639808773994446, "learning_rate": 0.0002, "epoch": 0.15358499717080268, "step": 950}, {"loss": 0.8529, "grad_norm": 0.4850759208202362, "learning_rate": 0.0002, "epoch": 0.15520168135154797, "step": 960}, {"loss": 0.846, "grad_norm": 0.4427442252635956, "learning_rate": 0.0002, "epoch": 0.15681836553229328, "step": 970}, {"loss": 0.8705, "grad_norm": 0.3760930001735687, "learning_rate": 0.0002, "epoch": 0.15843504971303857, "step": 980}, {"loss": 0.8644, "grad_norm": 0.4794144332408905, "learning_rate": 0.0002, "epoch": 0.16005173389378385, "step": 990}, {"loss": 0.8002, "grad_norm": 0.45828768610954285, "learning_rate": 0.0002, "epoch": 0.16166841807452914, "step": 1000}, {"loss": 0.7658, "grad_norm": 0.6313053369522095, "learning_rate": 0.0002, "epoch": 0.16328510225527443, "step": 1010}, {"loss": 0.8047, "grad_norm": 0.45041006803512573, "learning_rate": 0.0002, "epoch": 0.16490178643601971, "step": 1020}, {"loss": 0.8423, "grad_norm": 0.441403865814209, "learning_rate": 0.0002, "epoch": 0.166518470616765, "step": 1030}, {"loss": 0.8475, "grad_norm": 0.8171296119689941, "learning_rate": 0.0002, "epoch": 0.16813515479751032, "step": 1040}, {"loss": 0.845, "grad_norm": 0.7137420773506165, "learning_rate": 0.0002, "epoch": 0.1697518389782556, "step": 1050}, {"loss": 0.8213, "grad_norm": 0.5236809849739075, "learning_rate": 0.0002, "epoch": 0.1713685231590009, "step": 1060}, {"loss": 0.8265, "grad_norm": 0.5021864175796509, "learning_rate": 0.0002, "epoch": 0.17298520733974618, "step": 1070}, {"loss": 0.8305, "grad_norm": 0.47347521781921387, "learning_rate": 0.0002, "epoch": 0.17460189152049146, "step": 1080}, {"loss": 0.8105, "grad_norm": 0.4631653428077698, "learning_rate": 0.0002, "epoch": 0.17621857570123675, "step": 1090}, {"loss": 0.8166, "grad_norm": 0.49169182777404785, "learning_rate": 0.0002, "epoch": 0.17783525988198207, "step": 1100}, {"loss": 0.8012, "grad_norm": 0.5019739270210266, "learning_rate": 0.0002, "epoch": 0.17945194406272735, "step": 1110}, {"loss": 0.8247, "grad_norm": 0.5100422501564026, "learning_rate": 0.0002, "epoch": 0.18106862824347264, "step": 1120}, {"loss": 0.8142, "grad_norm": 0.3888324499130249, "learning_rate": 0.0002, "epoch": 0.18268531242421793, "step": 1130}, {"loss": 0.8533, "grad_norm": 0.39765217900276184, "learning_rate": 0.0002, "epoch": 0.18430199660496321, "step": 1140}, {"loss": 0.8541, "grad_norm": 0.47190186381340027, "learning_rate": 0.0002, "epoch": 0.1859186807857085, "step": 1150}, {"loss": 0.8301, "grad_norm": 0.4464188814163208, "learning_rate": 0.0002, "epoch": 0.18753536496645382, "step": 1160}, {"loss": 0.8341, "grad_norm": 0.5153930187225342, "learning_rate": 0.0002, "epoch": 0.1891520491471991, "step": 1170}, {"loss": 0.8033, "grad_norm": 0.4779708683490753, "learning_rate": 0.0002, "epoch": 0.1907687333279444, "step": 1180}, {"loss": 0.8187, "grad_norm": 0.4834315776824951, "learning_rate": 0.0002, "epoch": 0.19238541750868968, "step": 1190}, {"loss": 0.7721, "grad_norm": 0.402357816696167, "learning_rate": 0.0002, "epoch": 0.19400210168943496, "step": 1200}, {"loss": 0.7941, "grad_norm": 0.45899084210395813, "learning_rate": 0.0002, "epoch": 0.19561878587018025, "step": 1210}, {"loss": 0.8353, "grad_norm": 0.5106529593467712, "learning_rate": 0.0002, "epoch": 0.19723547005092557, "step": 1220}, {"loss": 0.7816, "grad_norm": 0.45261722803115845, "learning_rate": 0.0002, "epoch": 0.19885215423167085, "step": 1230}, {"loss": 0.8068, "grad_norm": 0.4647127091884613, "learning_rate": 0.0002, "epoch": 0.20046883841241614, "step": 1240}, {"loss": 0.8239, "grad_norm": 0.4849368929862976, "learning_rate": 0.0002, "epoch": 0.20208552259316143, "step": 1250}, {"loss": 0.8514, "grad_norm": 0.4518061578273773, "learning_rate": 0.0002, "epoch": 0.2037022067739067, "step": 1260}, {"loss": 0.8158, "grad_norm": 0.49535325169563293, "learning_rate": 0.0002, "epoch": 0.205318890954652, "step": 1270}, {"loss": 0.8348, "grad_norm": 0.4835205376148224, "learning_rate": 0.0002, "epoch": 0.2069355751353973, "step": 1280}, {"loss": 0.8428, "grad_norm": 0.45308539271354675, "learning_rate": 0.0002, "epoch": 0.2085522593161426, "step": 1290}, {"loss": 0.7993, "grad_norm": 0.5369905233383179, "learning_rate": 0.0002, "epoch": 0.2101689434968879, "step": 1300}, {"loss": 0.8676, "grad_norm": 0.5031622052192688, "learning_rate": 0.0002, "epoch": 0.21178562767763318, "step": 1310}, {"loss": 0.7686, "grad_norm": 0.48010334372520447, "learning_rate": 0.0002, "epoch": 0.21340231185837846, "step": 1320}, {"loss": 0.806, "grad_norm": 0.4905701279640198, "learning_rate": 0.0002, "epoch": 0.21501899603912375, "step": 1330}, {"loss": 0.7885, "grad_norm": 0.43531742691993713, "learning_rate": 0.0002, "epoch": 0.21663568021986904, "step": 1340}, {"loss": 0.8191, "grad_norm": 0.44330692291259766, "learning_rate": 0.0002, "epoch": 0.21825236440061435, "step": 1350}, {"loss": 0.8205, "grad_norm": 0.5384416580200195, "learning_rate": 0.0002, "epoch": 0.21986904858135964, "step": 1360}, {"loss": 0.7726, "grad_norm": 0.4181833863258362, "learning_rate": 0.0002, "epoch": 0.22148573276210493, "step": 1370}, {"loss": 0.8311, "grad_norm": 0.523833692073822, "learning_rate": 0.0002, "epoch": 0.2231024169428502, "step": 1380}, {"loss": 0.7913, "grad_norm": 0.5528736710548401, "learning_rate": 0.0002, "epoch": 0.2247191011235955, "step": 1390}, {"loss": 0.8079, "grad_norm": 0.43515023589134216, "learning_rate": 0.0002, "epoch": 0.2263357853043408, "step": 1400}, {"loss": 0.8403, "grad_norm": 0.48809877038002014, "learning_rate": 0.0002, "epoch": 0.2279524694850861, "step": 1410}, {"loss": 0.8165, "grad_norm": 0.43591251969337463, "learning_rate": 0.0002, "epoch": 0.2295691536658314, "step": 1420}, {"loss": 0.8147, "grad_norm": 0.44625312089920044, "learning_rate": 0.0002, "epoch": 0.23118583784657668, "step": 1430}, {"loss": 0.8134, "grad_norm": 0.4390665292739868, "learning_rate": 0.0002, "epoch": 0.23280252202732196, "step": 1440}, {"loss": 0.8465, "grad_norm": 0.48496049642562866, "learning_rate": 0.0002, "epoch": 0.23441920620806725, "step": 1450}, {"loss": 0.775, "grad_norm": 0.45919957756996155, "learning_rate": 0.0002, "epoch": 0.23603589038881254, "step": 1460}, {"loss": 0.8659, "grad_norm": 0.5471845865249634, "learning_rate": 0.0002, "epoch": 0.23765257456955785, "step": 1470}, {"loss": 0.8164, "grad_norm": 0.47269317507743835, "learning_rate": 0.0002, "epoch": 0.23926925875030314, "step": 1480}, {"loss": 0.854, "grad_norm": 0.4930245578289032, "learning_rate": 0.0002, "epoch": 0.24088594293104842, "step": 1490}, {"loss": 0.8139, "grad_norm": 0.5605630278587341, "learning_rate": 0.0002, "epoch": 0.2425026271117937, "step": 1500}, {"loss": 0.8125, "grad_norm": 0.4435870945453644, "learning_rate": 0.0002, "epoch": 0.244119311292539, "step": 1510}, {"loss": 0.8123, "grad_norm": 0.4941999912261963, "learning_rate": 0.0002, "epoch": 0.24573599547328429, "step": 1520}, {"loss": 0.8427, "grad_norm": 0.5100624561309814, "learning_rate": 0.0002, "epoch": 0.24735267965402957, "step": 1530}, {"loss": 0.8405, "grad_norm": 0.4638267457485199, "learning_rate": 0.0002, "epoch": 0.2489693638347749, "step": 1540}, {"loss": 0.81, "grad_norm": 0.5071570873260498, "learning_rate": 0.0002, "epoch": 0.25058604801552015, "step": 1550}, {"loss": 0.7724, "grad_norm": 0.4291319251060486, "learning_rate": 0.0002, "epoch": 0.25220273219626543, "step": 1560}, {"loss": 0.7984, "grad_norm": 0.5388049483299255, "learning_rate": 0.0002, "epoch": 0.2538194163770108, "step": 1570}, {"loss": 0.8176, "grad_norm": 0.5083683729171753, "learning_rate": 0.0002, "epoch": 0.25543610055775606, "step": 1580}, {"loss": 0.843, "grad_norm": 0.4824463725090027, "learning_rate": 0.0002, "epoch": 0.25705278473850135, "step": 1590}, {"loss": 0.7996, "grad_norm": 0.41177722811698914, "learning_rate": 0.0002, "epoch": 0.25866946891924664, "step": 1600}, {"loss": 0.7772, "grad_norm": 0.5656219124794006, "learning_rate": 0.0002, "epoch": 0.2602861530999919, "step": 1610}, {"loss": 0.7955, "grad_norm": 0.41063204407691956, "learning_rate": 0.0002, "epoch": 0.2619028372807372, "step": 1620}, {"loss": 0.7998, "grad_norm": 0.4897061288356781, "learning_rate": 0.0002, "epoch": 0.2635195214614825, "step": 1630}, {"loss": 0.8198, "grad_norm": 0.4454376697540283, "learning_rate": 0.0002, "epoch": 0.2651362056422278, "step": 1640}, {"loss": 0.8684, "grad_norm": 0.4355238378047943, "learning_rate": 0.0002, "epoch": 0.26675288982297307, "step": 1650}, {"loss": 0.7801, "grad_norm": 0.458310067653656, "learning_rate": 0.0002, "epoch": 0.26836957400371836, "step": 1660}, {"loss": 0.7935, "grad_norm": 0.4752083718776703, "learning_rate": 0.0002, "epoch": 0.26998625818446365, "step": 1670}, {"loss": 0.8267, "grad_norm": 0.4666106402873993, "learning_rate": 0.0002, "epoch": 0.27160294236520893, "step": 1680}, {"loss": 0.8252, "grad_norm": 0.4213818609714508, "learning_rate": 0.0002, "epoch": 0.2732196265459543, "step": 1690}, {"loss": 0.8559, "grad_norm": 0.5768913626670837, "learning_rate": 0.0002, "epoch": 0.27483631072669956, "step": 1700}, {"loss": 0.7931, "grad_norm": 0.4209914803504944, "learning_rate": 0.0002, "epoch": 0.27645299490744485, "step": 1710}, {"loss": 0.8167, "grad_norm": 0.501909613609314, "learning_rate": 0.0002, "epoch": 0.27806967908819014, "step": 1720}, {"loss": 0.7832, "grad_norm": 0.5266261100769043, "learning_rate": 0.0002, "epoch": 0.2796863632689354, "step": 1730}, {"loss": 0.8102, "grad_norm": 0.43806859850883484, "learning_rate": 0.0002, "epoch": 0.2813030474496807, "step": 1740}, {"loss": 0.8157, "grad_norm": 0.46048814058303833, "learning_rate": 0.0002, "epoch": 0.282919731630426, "step": 1750}, {"loss": 0.8596, "grad_norm": 0.44972819089889526, "learning_rate": 0.0002, "epoch": 0.2845364158111713, "step": 1760}, {"loss": 0.8421, "grad_norm": 0.5114831328392029, "learning_rate": 0.0002, "epoch": 0.28615309999191657, "step": 1770}, {"loss": 0.8361, "grad_norm": 0.47931742668151855, "learning_rate": 0.0002, "epoch": 0.28776978417266186, "step": 1780}, {"loss": 0.8265, "grad_norm": 0.5092599987983704, "learning_rate": 0.0002, "epoch": 0.28938646835340714, "step": 1790}, {"loss": 0.8506, "grad_norm": 0.37581443786621094, "learning_rate": 0.0002, "epoch": 0.29100315253415243, "step": 1800}, {"loss": 0.7932, "grad_norm": 0.47097381949424744, "learning_rate": 0.0002, "epoch": 0.2926198367148977, "step": 1810}, {"loss": 0.7787, "grad_norm": 0.48300236463546753, "learning_rate": 0.0002, "epoch": 0.29423652089564306, "step": 1820}, {"loss": 0.8391, "grad_norm": 0.5600419640541077, "learning_rate": 0.0002, "epoch": 0.29585320507638835, "step": 1830}, {"loss": 0.8507, "grad_norm": 0.48555272817611694, "learning_rate": 0.0002, "epoch": 0.29746988925713364, "step": 1840}, {"loss": 0.7657, "grad_norm": 0.3752668499946594, "learning_rate": 0.0002, "epoch": 0.2990865734378789, "step": 1850}, {"loss": 0.7915, "grad_norm": 0.5328747034072876, "learning_rate": 0.0002, "epoch": 0.3007032576186242, "step": 1860}, {"loss": 0.8426, "grad_norm": 0.48716455698013306, "learning_rate": 0.0002, "epoch": 0.3023199417993695, "step": 1870}, {"loss": 0.8335, "grad_norm": 0.5011493563652039, "learning_rate": 0.0002, "epoch": 0.3039366259801148, "step": 1880}, {"loss": 0.852, "grad_norm": 0.46461427211761475, "learning_rate": 0.0002, "epoch": 0.30555331016086007, "step": 1890}, {"loss": 0.8478, "grad_norm": 0.36630210280418396, "learning_rate": 0.0002, "epoch": 0.30716999434160536, "step": 1900}, {"loss": 0.8162, "grad_norm": 0.4217296242713928, "learning_rate": 0.0002, "epoch": 0.30878667852235064, "step": 1910}, {"loss": 0.8128, "grad_norm": 0.4394875466823578, "learning_rate": 0.0002, "epoch": 0.31040336270309593, "step": 1920}, {"loss": 0.8471, "grad_norm": 0.6587965488433838, "learning_rate": 0.0002, "epoch": 0.3120200468838412, "step": 1930}, {"loss": 0.8565, "grad_norm": 0.5469298958778381, "learning_rate": 0.0002, "epoch": 0.31363673106458656, "step": 1940}, {"loss": 0.8236, "grad_norm": 0.4371595084667206, "learning_rate": 0.0002, "epoch": 0.31525341524533185, "step": 1950}, {"loss": 0.887, "grad_norm": 0.4809541404247284, "learning_rate": 0.0002, "epoch": 0.31687009942607713, "step": 1960}, {"loss": 0.7855, "grad_norm": 0.6061086654663086, "learning_rate": 0.0002, "epoch": 0.3184867836068224, "step": 1970}, {"loss": 0.7679, "grad_norm": 0.5342657566070557, "learning_rate": 0.0002, "epoch": 0.3201034677875677, "step": 1980}, {"loss": 0.7955, "grad_norm": 0.5057743787765503, "learning_rate": 0.0002, "epoch": 0.321720151968313, "step": 1990}, {"loss": 0.7774, "grad_norm": 0.528626024723053, "learning_rate": 0.0002, "epoch": 0.3233368361490583, "step": 2000}, {"loss": 0.8845, "grad_norm": 0.46742770075798035, "learning_rate": 0.0002, "epoch": 0.32495352032980357, "step": 2010}, {"loss": 0.8484, "grad_norm": 0.515101432800293, "learning_rate": 0.0002, "epoch": 0.32657020451054886, "step": 2020}, {"loss": 0.8139, "grad_norm": 0.41941216588020325, "learning_rate": 0.0002, "epoch": 0.32818688869129414, "step": 2030}, {"loss": 0.7637, "grad_norm": 0.49902522563934326, "learning_rate": 0.0002, "epoch": 0.32980357287203943, "step": 2040}, {"loss": 0.7822, "grad_norm": 0.4120897650718689, "learning_rate": 0.0002, "epoch": 0.3314202570527847, "step": 2050}, {"loss": 0.8057, "grad_norm": 0.45352041721343994, "learning_rate": 0.0002, "epoch": 0.33303694123353, "step": 2060}, {"loss": 0.7913, "grad_norm": 0.523199737071991, "learning_rate": 0.0002, "epoch": 0.33465362541427535, "step": 2070}, {"loss": 0.8036, "grad_norm": 0.4390358626842499, "learning_rate": 0.0002, "epoch": 0.33627030959502063, "step": 2080}, {"loss": 0.8145, "grad_norm": 0.6752901077270508, "learning_rate": 0.0002, "epoch": 0.3378869937757659, "step": 2090}, {"loss": 0.7807, "grad_norm": 0.547821044921875, "learning_rate": 0.0002, "epoch": 0.3395036779565112, "step": 2100}, {"loss": 0.8561, "grad_norm": 0.5161308646202087, "learning_rate": 0.0002, "epoch": 0.3411203621372565, "step": 2110}, {"loss": 0.7697, "grad_norm": 0.4565401077270508, "learning_rate": 0.0002, "epoch": 0.3427370463180018, "step": 2120}, {"loss": 0.7964, "grad_norm": 0.4666115939617157, "learning_rate": 0.0002, "epoch": 0.34435373049874707, "step": 2130}, {"loss": 0.8189, "grad_norm": 0.4090428352355957, "learning_rate": 0.0002, "epoch": 0.34597041467949236, "step": 2140}, {"loss": 0.8817, "grad_norm": 0.510845422744751, "learning_rate": 0.0002, "epoch": 0.34758709886023764, "step": 2150}, {"loss": 0.8398, "grad_norm": 0.42861923575401306, "learning_rate": 0.0002, "epoch": 0.34920378304098293, "step": 2160}, {"loss": 0.7716, "grad_norm": 0.4476332664489746, "learning_rate": 0.0002, "epoch": 0.3508204672217282, "step": 2170}, {"loss": 0.7845, "grad_norm": 0.6065791249275208, "learning_rate": 0.0002, "epoch": 0.3524371514024735, "step": 2180}, {"loss": 0.8187, "grad_norm": 0.42335066199302673, "learning_rate": 0.0002, "epoch": 0.35405383558321885, "step": 2190}, {"loss": 0.8239, "grad_norm": 0.5094629526138306, "learning_rate": 0.0002, "epoch": 0.35567051976396413, "step": 2200}, {"loss": 0.7807, "grad_norm": 0.5476373434066772, "learning_rate": 0.0002, "epoch": 0.3572872039447094, "step": 2210}, {"loss": 0.814, "grad_norm": 0.3911719024181366, "learning_rate": 0.0002, "epoch": 0.3589038881254547, "step": 2220}, {"loss": 0.8599, "grad_norm": 0.6599636077880859, "learning_rate": 0.0002, "epoch": 0.3605205723062, "step": 2230}, {"loss": 0.7482, "grad_norm": 0.40381914377212524, "learning_rate": 0.0002, "epoch": 0.3621372564869453, "step": 2240}, {"loss": 0.7772, "grad_norm": 0.4433908462524414, "learning_rate": 0.0002, "epoch": 0.36375394066769057, "step": 2250}, {"loss": 0.8503, "grad_norm": 0.578326940536499, "learning_rate": 0.0002, "epoch": 0.36537062484843585, "step": 2260}, {"loss": 0.8178, "grad_norm": 0.5734784007072449, "learning_rate": 0.0002, "epoch": 0.36698730902918114, "step": 2270}, {"loss": 0.8193, "grad_norm": 0.45555487275123596, "learning_rate": 0.0002, "epoch": 0.36860399320992643, "step": 2280}, {"loss": 0.7929, "grad_norm": 0.5666276216506958, "learning_rate": 0.0002, "epoch": 0.3702206773906717, "step": 2290}, {"loss": 0.8292, "grad_norm": 0.5461117625236511, "learning_rate": 0.0002, "epoch": 0.371837361571417, "step": 2300}, {"loss": 0.8204, "grad_norm": 0.6318911910057068, "learning_rate": 0.0002, "epoch": 0.3734540457521623, "step": 2310}, {"loss": 0.7964, "grad_norm": 0.493263304233551, "learning_rate": 0.0002, "epoch": 0.37507072993290763, "step": 2320}, {"loss": 0.8339, "grad_norm": 0.5888760089874268, "learning_rate": 0.0002, "epoch": 0.3766874141136529, "step": 2330}, {"loss": 0.7737, "grad_norm": 0.48671841621398926, "learning_rate": 0.0002, "epoch": 0.3783040982943982, "step": 2340}, {"loss": 0.8367, "grad_norm": 0.4385145306587219, "learning_rate": 0.0002, "epoch": 0.3799207824751435, "step": 2350}, {"loss": 0.812, "grad_norm": 0.5523318648338318, "learning_rate": 0.0002, "epoch": 0.3815374666558888, "step": 2360}, {"loss": 0.8351, "grad_norm": 0.7308220267295837, "learning_rate": 0.0002, "epoch": 0.38315415083663407, "step": 2370}, {"loss": 0.859, "grad_norm": 0.554214358329773, "learning_rate": 0.0002, "epoch": 0.38477083501737935, "step": 2380}, {"loss": 0.8146, "grad_norm": 0.5425800085067749, "learning_rate": 0.0002, "epoch": 0.38638751919812464, "step": 2390}, {"loss": 0.8282, "grad_norm": 0.48811158537864685, "learning_rate": 0.0002, "epoch": 0.3880042033788699, "step": 2400}, {"loss": 0.8074, "grad_norm": 0.49212366342544556, "learning_rate": 0.0002, "epoch": 0.3896208875596152, "step": 2410}, {"loss": 0.7991, "grad_norm": 0.5222218632698059, "learning_rate": 0.0002, "epoch": 0.3912375717403605, "step": 2420}, {"loss": 0.8182, "grad_norm": 0.4699819087982178, "learning_rate": 0.0002, "epoch": 0.3928542559211058, "step": 2430}, {"loss": 0.7919, "grad_norm": 0.46153587102890015, "learning_rate": 0.0002, "epoch": 0.39447094010185113, "step": 2440}, {"loss": 0.8111, "grad_norm": 0.4150611162185669, "learning_rate": 0.0002, "epoch": 0.3960876242825964, "step": 2450}, {"loss": 0.8589, "grad_norm": 0.5799614787101746, "learning_rate": 0.0002, "epoch": 0.3977043084633417, "step": 2460}, {"loss": 0.8085, "grad_norm": 0.56536865234375, "learning_rate": 0.0002, "epoch": 0.399320992644087, "step": 2470}, {"loss": 0.8022, "grad_norm": 0.5451247096061707, "learning_rate": 0.0002, "epoch": 0.4009376768248323, "step": 2480}, {"loss": 0.8217, "grad_norm": 0.5914521217346191, "learning_rate": 0.0002, "epoch": 0.40255436100557757, "step": 2490}, {"loss": 0.7859, "grad_norm": 0.4428117275238037, "learning_rate": 0.0002, "epoch": 0.40417104518632285, "step": 2500}, {"loss": 0.8054, "grad_norm": 0.48580947518348694, "learning_rate": 0.0002, "epoch": 0.40578772936706814, "step": 2510}, {"loss": 0.8405, "grad_norm": 0.436734676361084, "learning_rate": 0.0002, "epoch": 0.4074044135478134, "step": 2520}, {"loss": 0.8209, "grad_norm": 0.5752223134040833, "learning_rate": 0.0002, "epoch": 0.4090210977285587, "step": 2530}, {"loss": 0.8181, "grad_norm": 0.4271308183670044, "learning_rate": 0.0002, "epoch": 0.410637781909304, "step": 2540}, {"loss": 0.8058, "grad_norm": 0.46294718980789185, "learning_rate": 0.0002, "epoch": 0.4122544660900493, "step": 2550}, {"loss": 0.8473, "grad_norm": 0.49407583475112915, "learning_rate": 0.0002, "epoch": 0.4138711502707946, "step": 2560}, {"loss": 0.7881, "grad_norm": 0.4729035496711731, "learning_rate": 0.0002, "epoch": 0.4154878344515399, "step": 2570}, {"loss": 0.7834, "grad_norm": 0.4129747152328491, "learning_rate": 0.0002, "epoch": 0.4171045186322852, "step": 2580}, {"loss": 0.7859, "grad_norm": 0.5684236288070679, "learning_rate": 0.0002, "epoch": 0.4187212028130305, "step": 2590}, {"loss": 0.811, "grad_norm": 0.4862157106399536, "learning_rate": 0.0002, "epoch": 0.4203378869937758, "step": 2600}, {"loss": 0.7582, "grad_norm": 0.46567976474761963, "learning_rate": 0.0002, "epoch": 0.42195457117452106, "step": 2610}, {"loss": 0.7755, "grad_norm": 0.5710650682449341, "learning_rate": 0.0002, "epoch": 0.42357125535526635, "step": 2620}, {"loss": 0.8573, "grad_norm": 0.5660041570663452, "learning_rate": 0.0002, "epoch": 0.42518793953601164, "step": 2630}, {"loss": 0.7812, "grad_norm": 0.47944375872612, "learning_rate": 0.0002, "epoch": 0.4268046237167569, "step": 2640}, {"loss": 0.7459, "grad_norm": 0.537223756313324, "learning_rate": 0.0002, "epoch": 0.4284213078975022, "step": 2650}, {"loss": 0.8246, "grad_norm": 0.41669997572898865, "learning_rate": 0.0002, "epoch": 0.4300379920782475, "step": 2660}, {"loss": 0.7785, "grad_norm": 0.44727686047554016, "learning_rate": 0.0002, "epoch": 0.4316546762589928, "step": 2670}, {"loss": 0.8241, "grad_norm": 0.5600888729095459, "learning_rate": 0.0002, "epoch": 0.4332713604397381, "step": 2680}, {"loss": 0.7708, "grad_norm": 0.39820605516433716, "learning_rate": 0.0002, "epoch": 0.4348880446204834, "step": 2690}, {"loss": 0.8202, "grad_norm": 0.5637655854225159, "learning_rate": 0.0002, "epoch": 0.4365047288012287, "step": 2700}, {"loss": 0.855, "grad_norm": 0.6363666653633118, "learning_rate": 0.0002, "epoch": 0.438121412981974, "step": 2710}, {"loss": 0.8468, "grad_norm": 0.5656129121780396, "learning_rate": 0.0002, "epoch": 0.4397380971627193, "step": 2720}, {"loss": 0.7845, "grad_norm": 0.5600156188011169, "learning_rate": 0.0002, "epoch": 0.44135478134346456, "step": 2730}, {"loss": 0.8405, "grad_norm": 0.5506579875946045, "learning_rate": 0.0002, "epoch": 0.44297146552420985, "step": 2740}, {"loss": 0.7725, "grad_norm": 0.49878305196762085, "learning_rate": 0.0002, "epoch": 0.44458814970495514, "step": 2750}, {"loss": 0.8292, "grad_norm": 0.4569213092327118, "learning_rate": 0.0002, "epoch": 0.4462048338857004, "step": 2760}, {"loss": 0.8028, "grad_norm": 0.6056680083274841, "learning_rate": 0.0002, "epoch": 0.4478215180664457, "step": 2770}, {"loss": 0.8242, "grad_norm": 0.44474557042121887, "learning_rate": 0.0002, "epoch": 0.449438202247191, "step": 2780}, {"loss": 0.801, "grad_norm": 0.46055394411087036, "learning_rate": 0.0002, "epoch": 0.4510548864279363, "step": 2790}, {"loss": 0.7521, "grad_norm": 0.4904133379459381, "learning_rate": 0.0002, "epoch": 0.4526715706086816, "step": 2800}, {"loss": 0.8829, "grad_norm": 0.5647031664848328, "learning_rate": 0.0002, "epoch": 0.45428825478942686, "step": 2810}, {"loss": 0.8622, "grad_norm": 0.5759473443031311, "learning_rate": 0.0002, "epoch": 0.4559049389701722, "step": 2820}, {"loss": 0.7812, "grad_norm": 0.5161895751953125, "learning_rate": 0.0002, "epoch": 0.4575216231509175, "step": 2830}, {"loss": 0.8045, "grad_norm": 0.4248254597187042, "learning_rate": 0.0002, "epoch": 0.4591383073316628, "step": 2840}, {"loss": 0.7838, "grad_norm": 0.45395001769065857, "learning_rate": 0.0002, "epoch": 0.46075499151240806, "step": 2850}, {"loss": 0.8208, "grad_norm": 0.5358697772026062, "learning_rate": 0.0002, "epoch": 0.46237167569315335, "step": 2860}, {"loss": 0.8147, "grad_norm": 0.5379165410995483, "learning_rate": 0.0002, "epoch": 0.46398835987389864, "step": 2870}, {"loss": 0.7403, "grad_norm": 0.4601989686489105, "learning_rate": 0.0002, "epoch": 0.4656050440546439, "step": 2880}, {"loss": 0.8523, "grad_norm": 0.671115517616272, "learning_rate": 0.0002, "epoch": 0.4672217282353892, "step": 2890}, {"loss": 0.8262, "grad_norm": 0.4425133168697357, "learning_rate": 0.0002, "epoch": 0.4688384124161345, "step": 2900}, {"loss": 0.8178, "grad_norm": 0.5446155071258545, "learning_rate": 0.0002, "epoch": 0.4704550965968798, "step": 2910}, {"loss": 0.8106, "grad_norm": 0.603306233882904, "learning_rate": 0.0002, "epoch": 0.47207178077762507, "step": 2920}, {"loss": 0.8044, "grad_norm": 0.5377997159957886, "learning_rate": 0.0002, "epoch": 0.47368846495837036, "step": 2930}, {"loss": 0.8075, "grad_norm": 0.4931027591228485, "learning_rate": 0.0002, "epoch": 0.4753051491391157, "step": 2940}, {"loss": 0.8004, "grad_norm": 0.4711960256099701, "learning_rate": 0.0002, "epoch": 0.476921833319861, "step": 2950}, {"loss": 0.8121, "grad_norm": 0.5020492672920227, "learning_rate": 0.0002, "epoch": 0.4785385175006063, "step": 2960}, {"loss": 0.8221, "grad_norm": 0.5428946614265442, "learning_rate": 0.0002, "epoch": 0.48015520168135156, "step": 2970}, {"loss": 0.7849, "grad_norm": 0.5294089317321777, "learning_rate": 0.0002, "epoch": 0.48177188586209685, "step": 2980}, {"loss": 0.8553, "grad_norm": 0.648289144039154, "learning_rate": 0.0002, "epoch": 0.48338857004284214, "step": 2990}, {"loss": 0.7874, "grad_norm": 0.47916680574417114, "learning_rate": 0.0002, "epoch": 0.4850052542235874, "step": 3000}, {"loss": 0.8087, "grad_norm": 0.43849772214889526, "learning_rate": 0.0002, "epoch": 0.4866219384043327, "step": 3010}, {"loss": 0.7662, "grad_norm": 0.47007861733436584, "learning_rate": 0.0002, "epoch": 0.488238622585078, "step": 3020}, {"loss": 0.757, "grad_norm": 0.6314331293106079, "learning_rate": 0.0002, "epoch": 0.4898553067658233, "step": 3030}, {"loss": 0.7863, "grad_norm": 0.49211493134498596, "learning_rate": 0.0002, "epoch": 0.49147199094656857, "step": 3040}, {"loss": 0.8335, "grad_norm": 0.4537973403930664, "learning_rate": 0.0002, "epoch": 0.49308867512731386, "step": 3050}, {"loss": 0.8095, "grad_norm": 0.47326919436454773, "learning_rate": 0.0002, "epoch": 0.49470535930805914, "step": 3060}, {"loss": 0.8447, "grad_norm": 0.525874137878418, "learning_rate": 0.0002, "epoch": 0.4963220434888045, "step": 3070}, {"loss": 0.8339, "grad_norm": 0.6361091732978821, "learning_rate": 0.0002, "epoch": 0.4979387276695498, "step": 3080}, {"loss": 0.821, "grad_norm": 0.5850642919540405, "learning_rate": 0.0002, "epoch": 0.49955541185029506, "step": 3090}, {"loss": 0.8279, "grad_norm": 0.47299543023109436, "learning_rate": 0.0002, "epoch": 0.5011720960310403, "step": 3100}, {"loss": 0.8681, "grad_norm": 0.473099946975708, "learning_rate": 0.0002, "epoch": 0.5027887802117856, "step": 3110}, {"loss": 0.8223, "grad_norm": 0.48186397552490234, "learning_rate": 0.0002, "epoch": 0.5044054643925309, "step": 3120}, {"loss": 0.8292, "grad_norm": 0.5015401840209961, "learning_rate": 0.0002, "epoch": 0.5060221485732762, "step": 3130}, {"loss": 0.7692, "grad_norm": 0.5617750287055969, "learning_rate": 0.0002, "epoch": 0.5076388327540216, "step": 3140}, {"loss": 0.8708, "grad_norm": 0.5169327259063721, "learning_rate": 0.0002, "epoch": 0.5092555169347668, "step": 3150}, {"loss": 0.7845, "grad_norm": 0.545657753944397, "learning_rate": 0.0002, "epoch": 0.5108722011155121, "step": 3160}, {"loss": 0.799, "grad_norm": 0.512864351272583, "learning_rate": 0.0002, "epoch": 0.5124888852962574, "step": 3170}, {"loss": 0.7794, "grad_norm": 0.4113546311855316, "learning_rate": 0.0002, "epoch": 0.5141055694770027, "step": 3180}, {"loss": 0.8206, "grad_norm": 0.44532445073127747, "learning_rate": 0.0002, "epoch": 0.5157222536577479, "step": 3190}, {"loss": 0.8213, "grad_norm": 0.5623497366905212, "learning_rate": 0.0002, "epoch": 0.5173389378384933, "step": 3200}, {"loss": 0.7928, "grad_norm": 0.5084741115570068, "learning_rate": 0.0002, "epoch": 0.5189556220192385, "step": 3210}, {"loss": 0.8174, "grad_norm": 0.5305403470993042, "learning_rate": 0.0002, "epoch": 0.5205723061999838, "step": 3220}, {"loss": 0.8139, "grad_norm": 0.4708254337310791, "learning_rate": 0.0002, "epoch": 0.5221889903807291, "step": 3230}, {"loss": 0.7639, "grad_norm": 0.43827131390571594, "learning_rate": 0.0002, "epoch": 0.5238056745614744, "step": 3240}, {"loss": 0.7993, "grad_norm": 0.5630002617835999, "learning_rate": 0.0002, "epoch": 0.5254223587422197, "step": 3250}, {"loss": 0.7522, "grad_norm": 0.5010961890220642, "learning_rate": 0.0002, "epoch": 0.527039042922965, "step": 3260}, {"loss": 0.8374, "grad_norm": 0.6303122043609619, "learning_rate": 0.0002, "epoch": 0.5286557271037103, "step": 3270}, {"loss": 0.7727, "grad_norm": 0.5107331275939941, "learning_rate": 0.0002, "epoch": 0.5302724112844556, "step": 3280}, {"loss": 0.8495, "grad_norm": 0.5700443387031555, "learning_rate": 0.0002, "epoch": 0.5318890954652009, "step": 3290}, {"loss": 0.7776, "grad_norm": 0.46296367049217224, "learning_rate": 0.0002, "epoch": 0.5335057796459461, "step": 3300}, {"loss": 0.7931, "grad_norm": 0.531568706035614, "learning_rate": 0.0002, "epoch": 0.5351224638266915, "step": 3310}, {"loss": 0.843, "grad_norm": 0.4686741530895233, "learning_rate": 0.0002, "epoch": 0.5367391480074367, "step": 3320}, {"loss": 0.8104, "grad_norm": 0.5404331088066101, "learning_rate": 0.0002, "epoch": 0.5383558321881821, "step": 3330}, {"loss": 0.7686, "grad_norm": 0.6368790864944458, "learning_rate": 0.0002, "epoch": 0.5399725163689273, "step": 3340}, {"loss": 0.8514, "grad_norm": 0.42300888895988464, "learning_rate": 0.0002, "epoch": 0.5415892005496726, "step": 3350}, {"loss": 0.8236, "grad_norm": 0.5362542867660522, "learning_rate": 0.0002, "epoch": 0.5432058847304179, "step": 3360}, {"loss": 0.858, "grad_norm": 0.497128963470459, "learning_rate": 0.0002, "epoch": 0.5448225689111632, "step": 3370}, {"loss": 0.8519, "grad_norm": 0.5006386041641235, "learning_rate": 0.0002, "epoch": 0.5464392530919085, "step": 3380}, {"loss": 0.7867, "grad_norm": 0.44136837124824524, "learning_rate": 0.0002, "epoch": 0.5480559372726538, "step": 3390}, {"loss": 0.773, "grad_norm": 0.5897833108901978, "learning_rate": 0.0002, "epoch": 0.5496726214533991, "step": 3400}, {"loss": 0.8895, "grad_norm": 0.641075611114502, "learning_rate": 0.0002, "epoch": 0.5512893056341444, "step": 3410}, {"loss": 0.7827, "grad_norm": 0.7251322269439697, "learning_rate": 0.0002, "epoch": 0.5529059898148897, "step": 3420}, {"loss": 0.7626, "grad_norm": 0.47411349415779114, "learning_rate": 0.0002, "epoch": 0.5545226739956349, "step": 3430}, {"loss": 0.8196, "grad_norm": 0.4994310438632965, "learning_rate": 0.0002, "epoch": 0.5561393581763803, "step": 3440}, {"loss": 0.7812, "grad_norm": 0.5814438462257385, "learning_rate": 0.0002, "epoch": 0.5577560423571255, "step": 3450}, {"loss": 0.8805, "grad_norm": 0.6278898119926453, "learning_rate": 0.0002, "epoch": 0.5593727265378708, "step": 3460}, {"loss": 0.813, "grad_norm": 0.46208274364471436, "learning_rate": 0.0002, "epoch": 0.5609894107186161, "step": 3470}, {"loss": 0.8295, "grad_norm": 0.5718930959701538, "learning_rate": 0.0002, "epoch": 0.5626060948993614, "step": 3480}, {"loss": 0.8152, "grad_norm": 0.48178744316101074, "learning_rate": 0.0002, "epoch": 0.5642227790801067, "step": 3490}, {"loss": 0.8244, "grad_norm": 0.47336965799331665, "learning_rate": 0.0002, "epoch": 0.565839463260852, "step": 3500}, {"loss": 0.8099, "grad_norm": 0.43442684412002563, "learning_rate": 0.0002, "epoch": 0.5674561474415973, "step": 3510}, {"loss": 0.7564, "grad_norm": 0.6463358998298645, "learning_rate": 0.0002, "epoch": 0.5690728316223426, "step": 3520}, {"loss": 0.836, "grad_norm": 0.5286486744880676, "learning_rate": 0.0002, "epoch": 0.5706895158030879, "step": 3530}, {"loss": 0.8421, "grad_norm": 0.5405499935150146, "learning_rate": 0.0002, "epoch": 0.5723061999838331, "step": 3540}, {"loss": 0.7614, "grad_norm": 0.6654391884803772, "learning_rate": 0.0002, "epoch": 0.5739228841645785, "step": 3550}, {"loss": 0.7803, "grad_norm": 0.5081980228424072, "learning_rate": 0.0002, "epoch": 0.5755395683453237, "step": 3560}, {"loss": 0.7753, "grad_norm": 0.48978179693222046, "learning_rate": 0.0002, "epoch": 0.5771562525260691, "step": 3570}, {"loss": 0.8151, "grad_norm": 0.5840612053871155, "learning_rate": 0.0002, "epoch": 0.5787729367068143, "step": 3580}, {"loss": 0.8937, "grad_norm": 0.5235261917114258, "learning_rate": 0.0002, "epoch": 0.5803896208875596, "step": 3590}, {"loss": 0.7894, "grad_norm": 0.5672075748443604, "learning_rate": 0.0002, "epoch": 0.5820063050683049, "step": 3600}, {"loss": 0.8347, "grad_norm": 0.5613429546356201, "learning_rate": 0.0002, "epoch": 0.5836229892490502, "step": 3610}, {"loss": 0.8274, "grad_norm": 0.4032273590564728, "learning_rate": 0.0002, "epoch": 0.5852396734297954, "step": 3620}, {"loss": 0.8421, "grad_norm": 0.49559324979782104, "learning_rate": 0.0002, "epoch": 0.5868563576105408, "step": 3630}, {"loss": 0.8332, "grad_norm": 0.6895697712898254, "learning_rate": 0.0002, "epoch": 0.5884730417912861, "step": 3640}, {"loss": 0.7877, "grad_norm": 0.4750136435031891, "learning_rate": 0.0002, "epoch": 0.5900897259720314, "step": 3650}, {"loss": 0.8219, "grad_norm": 0.5176819562911987, "learning_rate": 0.0002, "epoch": 0.5917064101527767, "step": 3660}, {"loss": 0.8151, "grad_norm": 0.5817760229110718, "learning_rate": 0.0002, "epoch": 0.5933230943335219, "step": 3670}, {"loss": 0.7823, "grad_norm": 0.6064626574516296, "learning_rate": 0.0002, "epoch": 0.5949397785142673, "step": 3680}, {"loss": 0.8422, "grad_norm": 0.6728700995445251, "learning_rate": 0.0002, "epoch": 0.5965564626950125, "step": 3690}, {"loss": 0.7679, "grad_norm": 0.609305202960968, "learning_rate": 0.0002, "epoch": 0.5981731468757578, "step": 3700}, {"loss": 0.8048, "grad_norm": 0.4615488350391388, "learning_rate": 0.0002, "epoch": 0.5997898310565031, "step": 3710}, {"loss": 0.8214, "grad_norm": 2.0531179904937744, "learning_rate": 0.0002, "epoch": 0.6014065152372484, "step": 3720}, {"loss": 0.8158, "grad_norm": 0.5091132521629333, "learning_rate": 0.0002, "epoch": 0.6030231994179936, "step": 3730}, {"loss": 0.7833, "grad_norm": 0.5951124429702759, "learning_rate": 0.0002, "epoch": 0.604639883598739, "step": 3740}, {"loss": 0.7784, "grad_norm": 0.5870208144187927, "learning_rate": 0.0002, "epoch": 0.6062565677794842, "step": 3750}, {"loss": 0.8044, "grad_norm": 0.6254619359970093, "learning_rate": 0.0002, "epoch": 0.6078732519602296, "step": 3760}, {"loss": 0.7868, "grad_norm": 0.5577626824378967, "learning_rate": 0.0002, "epoch": 0.6094899361409749, "step": 3770}, {"loss": 0.8108, "grad_norm": 0.5004405379295349, "learning_rate": 0.0002, "epoch": 0.6111066203217201, "step": 3780}, {"loss": 0.8092, "grad_norm": 0.5527383685112, "learning_rate": 0.0002, "epoch": 0.6127233045024655, "step": 3790}, {"loss": 0.8036, "grad_norm": 0.49116113781929016, "learning_rate": 0.0002, "epoch": 0.6143399886832107, "step": 3800}, {"loss": 0.8352, "grad_norm": 0.5299299359321594, "learning_rate": 0.0002, "epoch": 0.6159566728639561, "step": 3810}, {"loss": 0.7737, "grad_norm": 0.464897483587265, "learning_rate": 0.0002, "epoch": 0.6175733570447013, "step": 3820}, {"loss": 0.7923, "grad_norm": 0.6505740880966187, "learning_rate": 0.0002, "epoch": 0.6191900412254466, "step": 3830}, {"loss": 0.8123, "grad_norm": 0.5512559413909912, "learning_rate": 0.0002, "epoch": 0.6208067254061919, "step": 3840}, {"loss": 0.8856, "grad_norm": 0.49427518248558044, "learning_rate": 0.0002, "epoch": 0.6224234095869372, "step": 3850}, {"loss": 0.7751, "grad_norm": 0.3839147090911865, "learning_rate": 0.0002, "epoch": 0.6240400937676824, "step": 3860}, {"loss": 0.8006, "grad_norm": 0.5760218501091003, "learning_rate": 0.0002, "epoch": 0.6256567779484278, "step": 3870}, {"loss": 0.7836, "grad_norm": 0.7226507067680359, "learning_rate": 0.0002, "epoch": 0.6272734621291731, "step": 3880}, {"loss": 0.8244, "grad_norm": 0.676781415939331, "learning_rate": 0.0002, "epoch": 0.6288901463099184, "step": 3890}, {"loss": 0.8239, "grad_norm": 0.4284018278121948, "learning_rate": 0.0002, "epoch": 0.6305068304906637, "step": 3900}, {"loss": 0.7996, "grad_norm": 0.5060628056526184, "learning_rate": 0.0002, "epoch": 0.6321235146714089, "step": 3910}, {"loss": 0.8089, "grad_norm": 0.5524522066116333, "learning_rate": 0.0002, "epoch": 0.6337401988521543, "step": 3920}, {"loss": 0.8276, "grad_norm": 0.6099881529808044, "learning_rate": 0.0002, "epoch": 0.6353568830328995, "step": 3930}, {"loss": 0.809, "grad_norm": 0.43155938386917114, "learning_rate": 0.0002, "epoch": 0.6369735672136448, "step": 3940}, {"loss": 0.8404, "grad_norm": 0.6427084803581238, "learning_rate": 0.0002, "epoch": 0.6385902513943901, "step": 3950}, {"loss": 0.8368, "grad_norm": 0.541220486164093, "learning_rate": 0.0002, "epoch": 0.6402069355751354, "step": 3960}, {"loss": 0.8539, "grad_norm": 0.5414294600486755, "learning_rate": 0.0002, "epoch": 0.6418236197558806, "step": 3970}, {"loss": 0.7996, "grad_norm": 0.46344003081321716, "learning_rate": 0.0002, "epoch": 0.643440303936626, "step": 3980}, {"loss": 0.7474, "grad_norm": 0.45209285616874695, "learning_rate": 0.0002, "epoch": 0.6450569881173712, "step": 3990}, {"loss": 0.8202, "grad_norm": 0.5417284369468689, "learning_rate": 0.0002, "epoch": 0.6466736722981166, "step": 4000}, {"loss": 0.7563, "grad_norm": 0.7995685935020447, "learning_rate": 0.0002, "epoch": 0.6482903564788619, "step": 4010}, {"loss": 0.7812, "grad_norm": 0.6384002566337585, "learning_rate": 0.0002, "epoch": 0.6499070406596071, "step": 4020}, {"loss": 0.732, "grad_norm": 0.4472815692424774, "learning_rate": 0.0002, "epoch": 0.6515237248403525, "step": 4030}, {"loss": 0.8071, "grad_norm": 0.6834294199943542, "learning_rate": 0.0002, "epoch": 0.6531404090210977, "step": 4040}, {"loss": 0.7812, "grad_norm": 0.4612339735031128, "learning_rate": 0.0002, "epoch": 0.654757093201843, "step": 4050}, {"loss": 0.8141, "grad_norm": 0.9266576170921326, "learning_rate": 0.0002, "epoch": 0.6563737773825883, "step": 4060}, {"loss": 0.7991, "grad_norm": 0.4470861852169037, "learning_rate": 0.0002, "epoch": 0.6579904615633336, "step": 4070}, {"loss": 0.8293, "grad_norm": 0.45544925332069397, "learning_rate": 0.0002, "epoch": 0.6596071457440789, "step": 4080}, {"loss": 0.8455, "grad_norm": 0.6144481301307678, "learning_rate": 0.0002, "epoch": 0.6612238299248242, "step": 4090}, {"loss": 0.7877, "grad_norm": 0.5936288237571716, "learning_rate": 0.0002, "epoch": 0.6628405141055694, "step": 4100}, {"loss": 0.7617, "grad_norm": 0.4822963774204254, "learning_rate": 0.0002, "epoch": 0.6644571982863148, "step": 4110}, {"loss": 0.7997, "grad_norm": 0.48432496190071106, "learning_rate": 0.0002, "epoch": 0.66607388246706, "step": 4120}, {"loss": 0.8404, "grad_norm": 0.4901607930660248, "learning_rate": 0.0002, "epoch": 0.6676905666478054, "step": 4130}, {"loss": 0.8085, "grad_norm": 0.5018393397331238, "learning_rate": 0.0002, "epoch": 0.6693072508285507, "step": 4140}, {"loss": 0.8065, "grad_norm": 0.6946378946304321, "learning_rate": 0.0002, "epoch": 0.6709239350092959, "step": 4150}, {"loss": 0.8147, "grad_norm": 0.5997390747070312, "learning_rate": 0.0002, "epoch": 0.6725406191900413, "step": 4160}, {"loss": 0.8268, "grad_norm": 0.6738849878311157, "learning_rate": 0.0002, "epoch": 0.6741573033707865, "step": 4170}, {"loss": 0.7704, "grad_norm": 0.6110581159591675, "learning_rate": 0.0002, "epoch": 0.6757739875515318, "step": 4180}, {"loss": 0.8043, "grad_norm": 0.5703322291374207, "learning_rate": 0.0002, "epoch": 0.6773906717322771, "step": 4190}, {"loss": 0.8099, "grad_norm": 0.4686066210269928, "learning_rate": 0.0002, "epoch": 0.6790073559130224, "step": 4200}, {"loss": 0.8441, "grad_norm": 0.6394643783569336, "learning_rate": 0.0002, "epoch": 0.6806240400937676, "step": 4210}, {"loss": 0.8011, "grad_norm": 0.5454841256141663, "learning_rate": 0.0002, "epoch": 0.682240724274513, "step": 4220}, {"loss": 0.8307, "grad_norm": 0.4859732985496521, "learning_rate": 0.0002, "epoch": 0.6838574084552582, "step": 4230}, {"loss": 0.8161, "grad_norm": 0.5544065833091736, "learning_rate": 0.0002, "epoch": 0.6854740926360036, "step": 4240}, {"loss": 0.7839, "grad_norm": 0.4902505576610565, "learning_rate": 0.0002, "epoch": 0.6870907768167488, "step": 4250}, {"loss": 0.7977, "grad_norm": 0.4768051505088806, "learning_rate": 0.0002, "epoch": 0.6887074609974941, "step": 4260}, {"loss": 0.7539, "grad_norm": 0.49982190132141113, "learning_rate": 0.0002, "epoch": 0.6903241451782395, "step": 4270}, {"loss": 0.7353, "grad_norm": 0.6351838111877441, "learning_rate": 0.0002, "epoch": 0.6919408293589847, "step": 4280}, {"loss": 0.7664, "grad_norm": 0.5647561550140381, "learning_rate": 0.0002, "epoch": 0.69355751353973, "step": 4290}, {"loss": 0.7618, "grad_norm": 0.5340486764907837, "learning_rate": 0.0002, "epoch": 0.6951741977204753, "step": 4300}, {"loss": 0.8526, "grad_norm": 0.5649092793464661, "learning_rate": 0.0002, "epoch": 0.6967908819012206, "step": 4310}, {"loss": 0.8246, "grad_norm": 0.6183916926383972, "learning_rate": 0.0002, "epoch": 0.6984075660819659, "step": 4320}, {"loss": 0.792, "grad_norm": 0.6154509782791138, "learning_rate": 0.0002, "epoch": 0.7000242502627112, "step": 4330}, {"loss": 0.8397, "grad_norm": 0.5156264305114746, "learning_rate": 0.0002, "epoch": 0.7016409344434564, "step": 4340}, {"loss": 0.8512, "grad_norm": 0.562171459197998, "learning_rate": 0.0002, "epoch": 0.7032576186242018, "step": 4350}, {"loss": 0.7882, "grad_norm": 0.4949502646923065, "learning_rate": 0.0002, "epoch": 0.704874302804947, "step": 4360}, {"loss": 0.738, "grad_norm": 0.5171684622764587, "learning_rate": 0.0002, "epoch": 0.7064909869856923, "step": 4370}, {"loss": 0.8001, "grad_norm": 0.6198443174362183, "learning_rate": 0.0002, "epoch": 0.7081076711664377, "step": 4380}, {"loss": 0.7606, "grad_norm": 0.5802276134490967, "learning_rate": 0.0002, "epoch": 0.7097243553471829, "step": 4390}, {"loss": 0.8797, "grad_norm": 0.41096967458724976, "learning_rate": 0.0002, "epoch": 0.7113410395279283, "step": 4400}, {"loss": 0.805, "grad_norm": 0.4397392272949219, "learning_rate": 0.0002, "epoch": 0.7129577237086735, "step": 4410}, {"loss": 0.7651, "grad_norm": 0.45228442549705505, "learning_rate": 0.0002, "epoch": 0.7145744078894188, "step": 4420}, {"loss": 0.7938, "grad_norm": 0.4839673936367035, "learning_rate": 0.0002, "epoch": 0.7161910920701641, "step": 4430}, {"loss": 0.8362, "grad_norm": 0.6140755414962769, "learning_rate": 0.0002, "epoch": 0.7178077762509094, "step": 4440}, {"loss": 0.7722, "grad_norm": 0.6841378808021545, "learning_rate": 0.0002, "epoch": 0.7194244604316546, "step": 4450}, {"loss": 0.8177, "grad_norm": 0.6664239168167114, "learning_rate": 0.0002, "epoch": 0.7210411446124, "step": 4460}, {"loss": 0.7983, "grad_norm": 0.47552719712257385, "learning_rate": 0.0002, "epoch": 0.7226578287931452, "step": 4470}, {"loss": 0.8982, "grad_norm": 0.6649776101112366, "learning_rate": 0.0002, "epoch": 0.7242745129738906, "step": 4480}, {"loss": 0.8074, "grad_norm": 0.5159541964530945, "learning_rate": 0.0002, "epoch": 0.7258911971546358, "step": 4490}, {"loss": 0.7786, "grad_norm": 0.6693112850189209, "learning_rate": 0.0002, "epoch": 0.7275078813353811, "step": 4500}, {"loss": 0.8655, "grad_norm": 0.48870977759361267, "learning_rate": 0.0002, "epoch": 0.7291245655161265, "step": 4510}, {"loss": 0.7337, "grad_norm": 0.4857887923717499, "learning_rate": 0.0002, "epoch": 0.7307412496968717, "step": 4520}, {"loss": 0.8026, "grad_norm": 0.5515662431716919, "learning_rate": 0.0002, "epoch": 0.732357933877617, "step": 4530}, {"loss": 0.8031, "grad_norm": 0.6292222738265991, "learning_rate": 0.0002, "epoch": 0.7339746180583623, "step": 4540}, {"loss": 0.7749, "grad_norm": 0.48265689611434937, "learning_rate": 0.0002, "epoch": 0.7355913022391076, "step": 4550}, {"loss": 0.8499, "grad_norm": 0.8044266104698181, "learning_rate": 0.0002, "epoch": 0.7372079864198529, "step": 4560}, {"loss": 0.8162, "grad_norm": 0.6111769676208496, "learning_rate": 0.0002, "epoch": 0.7388246706005982, "step": 4570}, {"loss": 0.7291, "grad_norm": 0.5229553580284119, "learning_rate": 0.0002, "epoch": 0.7404413547813434, "step": 4580}, {"loss": 0.8038, "grad_norm": 0.6054152250289917, "learning_rate": 0.0002, "epoch": 0.7420580389620888, "step": 4590}, {"loss": 0.8169, "grad_norm": 0.5574966669082642, "learning_rate": 0.0002, "epoch": 0.743674723142834, "step": 4600}, {"loss": 0.8439, "grad_norm": 0.5395817160606384, "learning_rate": 0.0002, "epoch": 0.7452914073235793, "step": 4610}, {"loss": 0.8495, "grad_norm": 0.7116472721099854, "learning_rate": 0.0002, "epoch": 0.7469080915043246, "step": 4620}, {"loss": 0.7743, "grad_norm": 0.5618700981140137, "learning_rate": 0.0002, "epoch": 0.7485247756850699, "step": 4630}, {"loss": 0.7744, "grad_norm": 0.5802770853042603, "learning_rate": 0.0002, "epoch": 0.7501414598658153, "step": 4640}, {"loss": 0.7924, "grad_norm": 0.5690428018569946, "learning_rate": 0.0002, "epoch": 0.7517581440465605, "step": 4650}, {"loss": 0.8017, "grad_norm": 0.4813360273838043, "learning_rate": 0.0002, "epoch": 0.7533748282273058, "step": 4660}, {"loss": 0.8108, "grad_norm": 0.5434042811393738, "learning_rate": 0.0002, "epoch": 0.7549915124080511, "step": 4670}, {"loss": 0.7824, "grad_norm": 0.5502099990844727, "learning_rate": 0.0002, "epoch": 0.7566081965887964, "step": 4680}, {"loss": 0.8598, "grad_norm": 0.6020621061325073, "learning_rate": 0.0002, "epoch": 0.7582248807695416, "step": 4690}, {"loss": 0.7937, "grad_norm": 0.4922301471233368, "learning_rate": 0.0002, "epoch": 0.759841564950287, "step": 4700}, {"loss": 0.788, "grad_norm": 0.6492828726768494, "learning_rate": 0.0002, "epoch": 0.7614582491310322, "step": 4710}, {"loss": 0.8313, "grad_norm": 0.4865580201148987, "learning_rate": 0.0002, "epoch": 0.7630749333117776, "step": 4720}, {"loss": 0.7966, "grad_norm": 0.5971422791481018, "learning_rate": 0.0002, "epoch": 0.7646916174925228, "step": 4730}, {"loss": 0.8298, "grad_norm": 0.6832674145698547, "learning_rate": 0.0002, "epoch": 0.7663083016732681, "step": 4740}, {"loss": 0.8156, "grad_norm": 0.500908613204956, "learning_rate": 0.0002, "epoch": 0.7679249858540134, "step": 4750}, {"loss": 0.8383, "grad_norm": 0.6112465858459473, "learning_rate": 0.0002, "epoch": 0.7695416700347587, "step": 4760}, {"loss": 0.76, "grad_norm": 0.5753506422042847, "learning_rate": 0.0002, "epoch": 0.771158354215504, "step": 4770}, {"loss": 0.8297, "grad_norm": 0.6529405117034912, "learning_rate": 0.0002, "epoch": 0.7727750383962493, "step": 4780}, {"loss": 0.8171, "grad_norm": 0.5916843414306641, "learning_rate": 0.0002, "epoch": 0.7743917225769946, "step": 4790}, {"loss": 0.83, "grad_norm": 0.4821224510669708, "learning_rate": 0.0002, "epoch": 0.7760084067577399, "step": 4800}, {"loss": 0.7703, "grad_norm": 0.5532580018043518, "learning_rate": 0.0002, "epoch": 0.7776250909384852, "step": 4810}, {"loss": 0.7363, "grad_norm": 0.4604877233505249, "learning_rate": 0.0002, "epoch": 0.7792417751192304, "step": 4820}, {"loss": 0.7506, "grad_norm": 0.5009613037109375, "learning_rate": 0.0002, "epoch": 0.7808584592999758, "step": 4830}, {"loss": 0.7863, "grad_norm": 0.6448560357093811, "learning_rate": 0.0002, "epoch": 0.782475143480721, "step": 4840}, {"loss": 0.7957, "grad_norm": 0.44327953457832336, "learning_rate": 0.0002, "epoch": 0.7840918276614663, "step": 4850}, {"loss": 0.7925, "grad_norm": 0.5355411171913147, "learning_rate": 0.0002, "epoch": 0.7857085118422116, "step": 4860}, {"loss": 0.7754, "grad_norm": 0.5635677576065063, "learning_rate": 0.0002, "epoch": 0.7873251960229569, "step": 4870}, {"loss": 0.7931, "grad_norm": 0.5417491793632507, "learning_rate": 0.0002, "epoch": 0.7889418802037023, "step": 4880}, {"loss": 0.7819, "grad_norm": 0.4567430913448334, "learning_rate": 0.0002, "epoch": 0.7905585643844475, "step": 4890}, {"loss": 0.8454, "grad_norm": 0.44651296734809875, "learning_rate": 0.0002, "epoch": 0.7921752485651928, "step": 4900}, {"loss": 0.7959, "grad_norm": 0.5741217136383057, "learning_rate": 0.0002, "epoch": 0.7937919327459381, "step": 4910}, {"loss": 0.8093, "grad_norm": 0.6605045199394226, "learning_rate": 0.0002, "epoch": 0.7954086169266834, "step": 4920}, {"loss": 0.77, "grad_norm": 0.5126531720161438, "learning_rate": 0.0002, "epoch": 0.7970253011074286, "step": 4930}, {"loss": 0.7793, "grad_norm": 0.513648271560669, "learning_rate": 0.0002, "epoch": 0.798641985288174, "step": 4940}, {"loss": 0.8314, "grad_norm": 0.5350404381752014, "learning_rate": 0.0002, "epoch": 0.8002586694689192, "step": 4950}, {"loss": 0.7649, "grad_norm": 0.5731674432754517, "learning_rate": 0.0002, "epoch": 0.8018753536496646, "step": 4960}, {"loss": 0.8572, "grad_norm": 0.5974258184432983, "learning_rate": 0.0002, "epoch": 0.8034920378304098, "step": 4970}, {"loss": 0.7972, "grad_norm": 0.8774799704551697, "learning_rate": 0.0002, "epoch": 0.8051087220111551, "step": 4980}, {"loss": 0.7899, "grad_norm": 0.5994430184364319, "learning_rate": 0.0002, "epoch": 0.8067254061919004, "step": 4990}, {"loss": 0.7736, "grad_norm": 0.4894903004169464, "learning_rate": 0.0002, "epoch": 0.8083420903726457, "step": 5000}, {"loss": 0.78, "grad_norm": 0.5218459367752075, "learning_rate": 0.0002, "epoch": 0.809958774553391, "step": 5010}, {"loss": 0.817, "grad_norm": 0.5232468843460083, "learning_rate": 0.0002, "epoch": 0.8115754587341363, "step": 5020}, {"loss": 0.7704, "grad_norm": 0.44358372688293457, "learning_rate": 0.0002, "epoch": 0.8131921429148816, "step": 5030}, {"loss": 0.785, "grad_norm": 0.6202037334442139, "learning_rate": 0.0002, "epoch": 0.8148088270956269, "step": 5040}, {"loss": 0.7351, "grad_norm": 0.7721474170684814, "learning_rate": 0.0002, "epoch": 0.8164255112763722, "step": 5050}, {"loss": 0.8297, "grad_norm": 0.5568501353263855, "learning_rate": 0.0002, "epoch": 0.8180421954571174, "step": 5060}, {"loss": 0.7733, "grad_norm": 0.49148809909820557, "learning_rate": 0.0002, "epoch": 0.8196588796378628, "step": 5070}, {"loss": 0.8054, "grad_norm": 0.4956012964248657, "learning_rate": 0.0002, "epoch": 0.821275563818608, "step": 5080}, {"loss": 0.8201, "grad_norm": 0.6078833937644958, "learning_rate": 0.0002, "epoch": 0.8228922479993533, "step": 5090}, {"loss": 0.828, "grad_norm": 0.46906954050064087, "learning_rate": 0.0002, "epoch": 0.8245089321800986, "step": 5100}, {"loss": 0.7703, "grad_norm": 0.50812166929245, "learning_rate": 0.0002, "epoch": 0.8261256163608439, "step": 5110}, {"loss": 0.8243, "grad_norm": 0.5319661498069763, "learning_rate": 0.0002, "epoch": 0.8277423005415891, "step": 5120}, {"loss": 0.7798, "grad_norm": 0.4949689209461212, "learning_rate": 0.0002, "epoch": 0.8293589847223345, "step": 5130}, {"loss": 0.7428, "grad_norm": 0.5151591300964355, "learning_rate": 0.0002, "epoch": 0.8309756689030798, "step": 5140}, {"loss": 0.8147, "grad_norm": 0.5530214309692383, "learning_rate": 0.0002, "epoch": 0.8325923530838251, "step": 5150}, {"loss": 0.8251, "grad_norm": 0.6297410130500793, "learning_rate": 0.0002, "epoch": 0.8342090372645704, "step": 5160}, {"loss": 0.8067, "grad_norm": 0.5466840267181396, "learning_rate": 0.0002, "epoch": 0.8358257214453156, "step": 5170}, {"loss": 0.7875, "grad_norm": 0.652913510799408, "learning_rate": 0.0002, "epoch": 0.837442405626061, "step": 5180}, {"loss": 0.8295, "grad_norm": 0.5811293125152588, "learning_rate": 0.0002, "epoch": 0.8390590898068062, "step": 5190}, {"loss": 0.7412, "grad_norm": 0.5109550952911377, "learning_rate": 0.0002, "epoch": 0.8406757739875516, "step": 5200}, {"loss": 0.8077, "grad_norm": 0.4551706612110138, "learning_rate": 0.0002, "epoch": 0.8422924581682968, "step": 5210}, {"loss": 0.7827, "grad_norm": 0.5813754200935364, "learning_rate": 0.0002, "epoch": 0.8439091423490421, "step": 5220}, {"loss": 0.802, "grad_norm": 0.5856947898864746, "learning_rate": 0.0002, "epoch": 0.8455258265297874, "step": 5230}, {"loss": 0.7957, "grad_norm": 0.5482739210128784, "learning_rate": 0.0002, "epoch": 0.8471425107105327, "step": 5240}, {"loss": 0.8295, "grad_norm": 0.49023720622062683, "learning_rate": 0.0002, "epoch": 0.8487591948912779, "step": 5250}, {"loss": 0.8022, "grad_norm": 0.49472475051879883, "learning_rate": 0.0002, "epoch": 0.8503758790720233, "step": 5260}, {"loss": 0.8001, "grad_norm": 0.5490226745605469, "learning_rate": 0.0002, "epoch": 0.8519925632527686, "step": 5270}, {"loss": 0.8333, "grad_norm": 0.5340665578842163, "learning_rate": 0.0002, "epoch": 0.8536092474335139, "step": 5280}, {"loss": 0.8277, "grad_norm": 0.5962483882904053, "learning_rate": 0.0002, "epoch": 0.8552259316142592, "step": 5290}, {"loss": 0.8765, "grad_norm": 0.586358368396759, "learning_rate": 0.0002, "epoch": 0.8568426157950044, "step": 5300}, {"loss": 0.7831, "grad_norm": 0.49120277166366577, "learning_rate": 0.0002, "epoch": 0.8584592999757498, "step": 5310}, {"loss": 0.8162, "grad_norm": 0.5887332558631897, "learning_rate": 0.0002, "epoch": 0.860075984156495, "step": 5320}, {"loss": 0.7464, "grad_norm": 0.42496153712272644, "learning_rate": 0.0002, "epoch": 0.8616926683372403, "step": 5330}, {"loss": 0.7905, "grad_norm": 0.5489874482154846, "learning_rate": 0.0002, "epoch": 0.8633093525179856, "step": 5340}, {"loss": 0.7958, "grad_norm": 0.5850813984870911, "learning_rate": 0.0002, "epoch": 0.8649260366987309, "step": 5350}, {"loss": 0.7642, "grad_norm": 0.517487108707428, "learning_rate": 0.0002, "epoch": 0.8665427208794761, "step": 5360}, {"loss": 0.7801, "grad_norm": 0.5339142680168152, "learning_rate": 0.0002, "epoch": 0.8681594050602215, "step": 5370}, {"loss": 0.818, "grad_norm": 0.6236387491226196, "learning_rate": 0.0002, "epoch": 0.8697760892409668, "step": 5380}, {"loss": 0.7708, "grad_norm": 0.5752192735671997, "learning_rate": 0.0002, "epoch": 0.8713927734217121, "step": 5390}, {"loss": 0.8542, "grad_norm": 0.6724614500999451, "learning_rate": 0.0002, "epoch": 0.8730094576024574, "step": 5400}, {"loss": 0.7581, "grad_norm": 0.5280613303184509, "learning_rate": 0.0002, "epoch": 0.8746261417832026, "step": 5410}, {"loss": 0.8231, "grad_norm": 0.44033288955688477, "learning_rate": 0.0002, "epoch": 0.876242825963948, "step": 5420}, {"loss": 0.8839, "grad_norm": 0.5199708342552185, "learning_rate": 0.0002, "epoch": 0.8778595101446932, "step": 5430}, {"loss": 0.7852, "grad_norm": 0.46778348088264465, "learning_rate": 0.0002, "epoch": 0.8794761943254386, "step": 5440}, {"loss": 0.7834, "grad_norm": 0.4657754898071289, "learning_rate": 0.0002, "epoch": 0.8810928785061838, "step": 5450}, {"loss": 0.7799, "grad_norm": 0.5472902655601501, "learning_rate": 0.0002, "epoch": 0.8827095626869291, "step": 5460}, {"loss": 0.8253, "grad_norm": 0.4876766800880432, "learning_rate": 0.0002, "epoch": 0.8843262468676744, "step": 5470}, {"loss": 0.7906, "grad_norm": 0.5057248473167419, "learning_rate": 0.0002, "epoch": 0.8859429310484197, "step": 5480}, {"loss": 0.8124, "grad_norm": 0.4637320637702942, "learning_rate": 0.0002, "epoch": 0.8875596152291649, "step": 5490}, {"loss": 0.781, "grad_norm": 0.471955806016922, "learning_rate": 0.0002, "epoch": 0.8891762994099103, "step": 5500}, {"loss": 0.8057, "grad_norm": 0.5209813714027405, "learning_rate": 0.0002, "epoch": 0.8907929835906556, "step": 5510}, {"loss": 0.8106, "grad_norm": 0.6213834285736084, "learning_rate": 0.0002, "epoch": 0.8924096677714008, "step": 5520}, {"loss": 0.7787, "grad_norm": 0.5215408205986023, "learning_rate": 0.0002, "epoch": 0.8940263519521462, "step": 5530}, {"loss": 0.8174, "grad_norm": 0.580478310585022, "learning_rate": 0.0002, "epoch": 0.8956430361328914, "step": 5540}, {"loss": 0.8371, "grad_norm": 0.49102169275283813, "learning_rate": 0.0002, "epoch": 0.8972597203136368, "step": 5550}, {"loss": 0.7806, "grad_norm": 0.6043479442596436, "learning_rate": 0.0002, "epoch": 0.898876404494382, "step": 5560}, {"loss": 0.7754, "grad_norm": 0.5636463165283203, "learning_rate": 0.0002, "epoch": 0.9004930886751273, "step": 5570}, {"loss": 0.8145, "grad_norm": 0.5620124340057373, "learning_rate": 0.0002, "epoch": 0.9021097728558726, "step": 5580}, {"loss": 0.8083, "grad_norm": 0.5206354856491089, "learning_rate": 0.0002, "epoch": 0.9037264570366179, "step": 5590}, {"loss": 0.8557, "grad_norm": 0.5798229575157166, "learning_rate": 0.0002, "epoch": 0.9053431412173631, "step": 5600}, {"loss": 0.8097, "grad_norm": 0.6428212523460388, "learning_rate": 0.0002, "epoch": 0.9069598253981085, "step": 5610}, {"loss": 0.7839, "grad_norm": 0.48064687848091125, "learning_rate": 0.0002, "epoch": 0.9085765095788537, "step": 5620}, {"loss": 0.8343, "grad_norm": 0.6347860097885132, "learning_rate": 0.0002, "epoch": 0.9101931937595991, "step": 5630}, {"loss": 0.851, "grad_norm": 0.5353913307189941, "learning_rate": 0.0002, "epoch": 0.9118098779403444, "step": 5640}, {"loss": 0.7736, "grad_norm": 0.5323944091796875, "learning_rate": 0.0002, "epoch": 0.9134265621210896, "step": 5650}, {"loss": 0.8393, "grad_norm": 0.5261843204498291, "learning_rate": 0.0002, "epoch": 0.915043246301835, "step": 5660}, {"loss": 0.7355, "grad_norm": 0.5451326966285706, "learning_rate": 0.0002, "epoch": 0.9166599304825802, "step": 5670}, {"loss": 0.8012, "grad_norm": 0.5183324217796326, "learning_rate": 0.0002, "epoch": 0.9182766146633256, "step": 5680}, {"loss": 0.7659, "grad_norm": 0.47229018807411194, "learning_rate": 0.0002, "epoch": 0.9198932988440708, "step": 5690}, {"loss": 0.7757, "grad_norm": 0.49180513620376587, "learning_rate": 0.0002, "epoch": 0.9215099830248161, "step": 5700}, {"loss": 0.8735, "grad_norm": 0.5419785380363464, "learning_rate": 0.0002, "epoch": 0.9231266672055614, "step": 5710}, {"loss": 0.7378, "grad_norm": 0.5408698916435242, "learning_rate": 0.0002, "epoch": 0.9247433513863067, "step": 5720}, {"loss": 0.7701, "grad_norm": 0.5286232829093933, "learning_rate": 0.0002, "epoch": 0.9263600355670519, "step": 5730}, {"loss": 0.8242, "grad_norm": 0.7539758086204529, "learning_rate": 0.0002, "epoch": 0.9279767197477973, "step": 5740}, {"loss": 0.8118, "grad_norm": 0.5166944861412048, "learning_rate": 0.0002, "epoch": 0.9295934039285425, "step": 5750}, {"loss": 0.783, "grad_norm": 0.6601425409317017, "learning_rate": 0.0002, "epoch": 0.9312100881092878, "step": 5760}, {"loss": 0.7873, "grad_norm": 0.5029960870742798, "learning_rate": 0.0002, "epoch": 0.9328267722900332, "step": 5770}, {"loss": 0.7989, "grad_norm": 0.4926645755767822, "learning_rate": 0.0002, "epoch": 0.9344434564707784, "step": 5780}, {"loss": 0.8174, "grad_norm": 0.5739615559577942, "learning_rate": 0.0002, "epoch": 0.9360601406515238, "step": 5790}, {"loss": 0.8037, "grad_norm": 0.5058279037475586, "learning_rate": 0.0002, "epoch": 0.937676824832269, "step": 5800}, {"loss": 0.8537, "grad_norm": 0.5260962247848511, "learning_rate": 0.0002, "epoch": 0.9392935090130143, "step": 5810}, {"loss": 0.7486, "grad_norm": 0.5768588185310364, "learning_rate": 0.0002, "epoch": 0.9409101931937596, "step": 5820}, {"loss": 0.8215, "grad_norm": 0.5170126557350159, "learning_rate": 0.0002, "epoch": 0.9425268773745049, "step": 5830}, {"loss": 0.7422, "grad_norm": 0.5745864510536194, "learning_rate": 0.0002, "epoch": 0.9441435615552501, "step": 5840}, {"loss": 0.7824, "grad_norm": 0.5551357865333557, "learning_rate": 0.0002, "epoch": 0.9457602457359955, "step": 5850}, {"loss": 0.8529, "grad_norm": 0.5776078701019287, "learning_rate": 0.0002, "epoch": 0.9473769299167407, "step": 5860}, {"loss": 0.8527, "grad_norm": 0.5340062379837036, "learning_rate": 0.0002, "epoch": 0.9489936140974861, "step": 5870}, {"loss": 0.8217, "grad_norm": 0.6447290182113647, "learning_rate": 0.0002, "epoch": 0.9506102982782314, "step": 5880}, {"loss": 0.7945, "grad_norm": 0.5123815536499023, "learning_rate": 0.0002, "epoch": 0.9522269824589766, "step": 5890}, {"loss": 0.8209, "grad_norm": 0.48547613620758057, "learning_rate": 0.0002, "epoch": 0.953843666639722, "step": 5900}, {"loss": 0.7896, "grad_norm": 0.5791414976119995, "learning_rate": 0.0002, "epoch": 0.9554603508204672, "step": 5910}, {"loss": 0.8408, "grad_norm": 0.6195011734962463, "learning_rate": 0.0002, "epoch": 0.9570770350012126, "step": 5920}, {"loss": 0.7805, "grad_norm": 0.6323803067207336, "learning_rate": 0.0002, "epoch": 0.9586937191819578, "step": 5930}, {"loss": 0.8484, "grad_norm": 0.45552879571914673, "learning_rate": 0.0002, "epoch": 0.9603104033627031, "step": 5940}, {"loss": 0.7367, "grad_norm": 0.5796473622322083, "learning_rate": 0.0002, "epoch": 0.9619270875434484, "step": 5950}, {"loss": 0.7672, "grad_norm": 0.647261381149292, "learning_rate": 0.0002, "epoch": 0.9635437717241937, "step": 5960}, {"loss": 0.8086, "grad_norm": 0.5487682819366455, "learning_rate": 0.0002, "epoch": 0.9651604559049389, "step": 5970}, {"loss": 0.7973, "grad_norm": 0.5743663907051086, "learning_rate": 0.0002, "epoch": 0.9667771400856843, "step": 5980}, {"loss": 0.8153, "grad_norm": 0.5470591187477112, "learning_rate": 0.0002, "epoch": 0.9683938242664295, "step": 5990}, {"loss": 0.8119, "grad_norm": 0.5901660323143005, "learning_rate": 0.0002, "epoch": 0.9700105084471748, "step": 6000}, {"loss": 0.8147, "grad_norm": 0.6544759273529053, "learning_rate": 0.0002, "epoch": 0.9716271926279202, "step": 6010}, {"loss": 0.7536, "grad_norm": 0.6288470029830933, "learning_rate": 0.0002, "epoch": 0.9732438768086654, "step": 6020}, {"loss": 0.7989, "grad_norm": 0.673153817653656, "learning_rate": 0.0002, "epoch": 0.9748605609894108, "step": 6030}, {"loss": 0.7556, "grad_norm": 0.42854753136634827, "learning_rate": 0.0002, "epoch": 0.976477245170156, "step": 6040}, {"loss": 0.8006, "grad_norm": 0.5227066278457642, "learning_rate": 0.0002, "epoch": 0.9780939293509013, "step": 6050}, {"loss": 0.795, "grad_norm": 0.5372416973114014, "learning_rate": 0.0002, "epoch": 0.9797106135316466, "step": 6060}, {"loss": 0.7591, "grad_norm": 0.6026402115821838, "learning_rate": 0.0002, "epoch": 0.9813272977123919, "step": 6070}, {"loss": 0.8347, "grad_norm": 0.49547791481018066, "learning_rate": 0.0002, "epoch": 0.9829439818931371, "step": 6080}, {"loss": 0.7722, "grad_norm": 0.4641951322555542, "learning_rate": 0.0002, "epoch": 0.9845606660738825, "step": 6090}, {"loss": 0.8125, "grad_norm": 0.5818535089492798, "learning_rate": 0.0002, "epoch": 0.9861773502546277, "step": 6100}, {"loss": 0.81, "grad_norm": 0.63955157995224, "learning_rate": 0.0002, "epoch": 0.9877940344353731, "step": 6110}, {"loss": 0.7547, "grad_norm": 0.5649438500404358, "learning_rate": 0.0002, "epoch": 0.9894107186161183, "step": 6120}, {"loss": 0.7861, "grad_norm": 0.5290433168411255, "learning_rate": 0.0002, "epoch": 0.9910274027968636, "step": 6130}, {"loss": 0.8109, "grad_norm": 0.6399374008178711, "learning_rate": 0.0002, "epoch": 0.992644086977609, "step": 6140}, {"loss": 0.8373, "grad_norm": 0.6736576557159424, "learning_rate": 0.0002, "epoch": 0.9942607711583542, "step": 6150}, {"loss": 0.7915, "grad_norm": 0.515420138835907, "learning_rate": 0.0002, "epoch": 0.9958774553390995, "step": 6160}, {"loss": 0.8032, "grad_norm": 0.562677800655365, "learning_rate": 0.0002, "epoch": 0.9974941395198448, "step": 6170}, {"loss": 0.8187, "grad_norm": 0.7113858461380005, "learning_rate": 0.0002, "epoch": 0.9991108237005901, "step": 6180}, {"eval_loss": 1.0871200561523438, "eval_runtime": 122.2071, "eval_samples_per_second": 5.998, "eval_steps_per_second": 0.753, "epoch": 0.9999191657909627, "step": 6185}, {"loss": 0.7507, "grad_norm": 0.7111801505088806, "learning_rate": 0.0002, "epoch": 1.0007275078813354, "step": 6190}, {"loss": 0.6865, "grad_norm": 0.5402125716209412, "learning_rate": 0.0002, "epoch": 1.0023441920620806, "step": 6200}, {"loss": 0.7625, "grad_norm": 0.6098830103874207, "learning_rate": 0.0002, "epoch": 1.003960876242826, "step": 6210}, {"loss": 0.7631, "grad_norm": 0.5829983353614807, "learning_rate": 0.0002, "epoch": 1.0055775604235713, "step": 6220}, {"loss": 0.7188, "grad_norm": 0.5614621043205261, "learning_rate": 0.0002, "epoch": 1.0071942446043165, "step": 6230}, {"loss": 0.7505, "grad_norm": 0.5954238772392273, "learning_rate": 0.0002, "epoch": 1.0088109287850617, "step": 6240}, {"loss": 0.7448, "grad_norm": 0.6480574607849121, "learning_rate": 0.0002, "epoch": 1.0104276129658072, "step": 6250}, {"loss": 0.7514, "grad_norm": 0.6051128506660461, "learning_rate": 0.0002, "epoch": 1.0120442971465524, "step": 6260}, {"loss": 0.7237, "grad_norm": 0.6318870782852173, "learning_rate": 0.0002, "epoch": 1.0136609813272976, "step": 6270}, {"loss": 0.7178, "grad_norm": 0.5048980116844177, "learning_rate": 0.0002, "epoch": 1.015277665508043, "step": 6280}, {"loss": 0.7391, "grad_norm": 0.6346936225891113, "learning_rate": 0.0002, "epoch": 1.0168943496887883, "step": 6290}, {"loss": 0.7486, "grad_norm": 0.5711665749549866, "learning_rate": 0.0002, "epoch": 1.0185110338695336, "step": 6300}, {"loss": 0.6808, "grad_norm": 0.5175361037254333, "learning_rate": 0.0002, "epoch": 1.0201277180502788, "step": 6310}, {"loss": 0.7539, "grad_norm": 0.5360831618309021, "learning_rate": 0.0002, "epoch": 1.0217444022310243, "step": 6320}, {"loss": 0.7112, "grad_norm": 0.614675760269165, "learning_rate": 0.0002, "epoch": 1.0233610864117695, "step": 6330}, {"loss": 0.7748, "grad_norm": 0.5626118183135986, "learning_rate": 0.0002, "epoch": 1.0249777705925147, "step": 6340}, {"loss": 0.7375, "grad_norm": 0.574897289276123, "learning_rate": 0.0002, "epoch": 1.02659445477326, "step": 6350}, {"loss": 0.759, "grad_norm": 0.7185447812080383, "learning_rate": 0.0002, "epoch": 1.0282111389540054, "step": 6360}, {"loss": 0.703, "grad_norm": 0.6705799698829651, "learning_rate": 0.0002, "epoch": 1.0298278231347506, "step": 6370}, {"loss": 0.7139, "grad_norm": 0.6740428805351257, "learning_rate": 0.0002, "epoch": 1.0314445073154959, "step": 6380}, {"loss": 0.7252, "grad_norm": 0.663902759552002, "learning_rate": 0.0002, "epoch": 1.0330611914962413, "step": 6390}, {"loss": 0.7065, "grad_norm": 0.5029543042182922, "learning_rate": 0.0002, "epoch": 1.0346778756769865, "step": 6400}, {"loss": 0.711, "grad_norm": 0.7813863158226013, "learning_rate": 0.0002, "epoch": 1.0362945598577318, "step": 6410}, {"loss": 0.7433, "grad_norm": 0.5396282076835632, "learning_rate": 0.0002, "epoch": 1.037911244038477, "step": 6420}, {"loss": 0.7222, "grad_norm": 0.5253293514251709, "learning_rate": 0.0002, "epoch": 1.0395279282192225, "step": 6430}, {"loss": 0.715, "grad_norm": 0.7236770987510681, "learning_rate": 0.0002, "epoch": 1.0411446123999677, "step": 6440}, {"loss": 0.7259, "grad_norm": 0.5670917630195618, "learning_rate": 0.0002, "epoch": 1.042761296580713, "step": 6450}, {"loss": 0.7195, "grad_norm": 0.6031978726387024, "learning_rate": 0.0002, "epoch": 1.0443779807614582, "step": 6460}, {"loss": 0.7648, "grad_norm": 0.5309213399887085, "learning_rate": 0.0002, "epoch": 1.0459946649422036, "step": 6470}, {"loss": 0.7161, "grad_norm": 0.7114651799201965, "learning_rate": 0.0002, "epoch": 1.0476113491229488, "step": 6480}, {"loss": 0.7583, "grad_norm": 0.5591610670089722, "learning_rate": 0.0002, "epoch": 1.049228033303694, "step": 6490}, {"loss": 0.6645, "grad_norm": 0.5185961127281189, "learning_rate": 0.0002, "epoch": 1.0508447174844395, "step": 6500}, {"loss": 0.7654, "grad_norm": 0.6510552167892456, "learning_rate": 0.0002, "epoch": 1.0524614016651848, "step": 6510}, {"loss": 0.7057, "grad_norm": 0.6557928919792175, "learning_rate": 0.0002, "epoch": 1.05407808584593, "step": 6520}, {"loss": 0.8056, "grad_norm": 0.6973192691802979, "learning_rate": 0.0002, "epoch": 1.0556947700266752, "step": 6530}, {"loss": 0.6793, "grad_norm": 0.6226583123207092, "learning_rate": 0.0002, "epoch": 1.0573114542074207, "step": 6540}, {"loss": 0.7151, "grad_norm": 0.5633195638656616, "learning_rate": 0.0002, "epoch": 1.058928138388166, "step": 6550}, {"loss": 0.7082, "grad_norm": 0.7466658353805542, "learning_rate": 0.0002, "epoch": 1.0605448225689111, "step": 6560}, {"loss": 0.7059, "grad_norm": 0.6462772488594055, "learning_rate": 0.0002, "epoch": 1.0621615067496564, "step": 6570}, {"loss": 0.7046, "grad_norm": 0.5266856551170349, "learning_rate": 0.0002, "epoch": 1.0637781909304018, "step": 6580}, {"loss": 0.7157, "grad_norm": 0.534392774105072, "learning_rate": 0.0002, "epoch": 1.065394875111147, "step": 6590}, {"loss": 0.7115, "grad_norm": 0.7514177560806274, "learning_rate": 0.0002, "epoch": 1.0670115592918923, "step": 6600}, {"loss": 0.7545, "grad_norm": 0.7593035697937012, "learning_rate": 0.0002, "epoch": 1.0686282434726375, "step": 6610}, {"loss": 0.6836, "grad_norm": 0.5277858972549438, "learning_rate": 0.0002, "epoch": 1.070244927653383, "step": 6620}, {"loss": 0.7405, "grad_norm": 0.5573670268058777, "learning_rate": 0.0002, "epoch": 1.0718616118341282, "step": 6630}, {"loss": 0.6774, "grad_norm": 0.6802396774291992, "learning_rate": 0.0002, "epoch": 1.0734782960148734, "step": 6640}, {"loss": 0.723, "grad_norm": 0.7367215752601624, "learning_rate": 0.0002, "epoch": 1.0750949801956189, "step": 6650}, {"loss": 0.7429, "grad_norm": 0.5961891412734985, "learning_rate": 0.0002, "epoch": 1.0767116643763641, "step": 6660}, {"loss": 0.6791, "grad_norm": 0.5736313462257385, "learning_rate": 0.0002, "epoch": 1.0783283485571094, "step": 6670}, {"loss": 0.7178, "grad_norm": 0.619219183921814, "learning_rate": 0.0002, "epoch": 1.0799450327378546, "step": 6680}, {"loss": 0.7318, "grad_norm": 0.6214390993118286, "learning_rate": 0.0002, "epoch": 1.0815617169186, "step": 6690}, {"loss": 0.7554, "grad_norm": 0.564536988735199, "learning_rate": 0.0002, "epoch": 1.0831784010993453, "step": 6700}, {"loss": 0.7362, "grad_norm": 0.5838140249252319, "learning_rate": 0.0002, "epoch": 1.0847950852800905, "step": 6710}, {"loss": 0.739, "grad_norm": 0.7000553607940674, "learning_rate": 0.0002, "epoch": 1.0864117694608357, "step": 6720}, {"loss": 0.7369, "grad_norm": 0.7078263759613037, "learning_rate": 0.0002, "epoch": 1.0880284536415812, "step": 6730}, {"loss": 0.7654, "grad_norm": 0.8353848457336426, "learning_rate": 0.0002, "epoch": 1.0896451378223264, "step": 6740}, {"loss": 0.7015, "grad_norm": 0.5615518689155579, "learning_rate": 0.0002, "epoch": 1.0912618220030716, "step": 6750}, {"loss": 0.7396, "grad_norm": 0.5475581288337708, "learning_rate": 0.0002, "epoch": 1.0928785061838169, "step": 6760}, {"loss": 0.7652, "grad_norm": 0.5835978388786316, "learning_rate": 0.0002, "epoch": 1.0944951903645623, "step": 6770}, {"loss": 0.7541, "grad_norm": 0.5516105890274048, "learning_rate": 0.0002, "epoch": 1.0961118745453076, "step": 6780}, {"loss": 0.6842, "grad_norm": 0.5875251889228821, "learning_rate": 0.0002, "epoch": 1.0977285587260528, "step": 6790}, {"loss": 0.6903, "grad_norm": 0.7376947999000549, "learning_rate": 0.0002, "epoch": 1.0993452429067982, "step": 6800}, {"loss": 0.7512, "grad_norm": 0.5656165480613708, "learning_rate": 0.0002, "epoch": 1.1009619270875435, "step": 6810}, {"loss": 0.7409, "grad_norm": 0.6365954279899597, "learning_rate": 0.0002, "epoch": 1.1025786112682887, "step": 6820}, {"loss": 0.7392, "grad_norm": 0.5033080577850342, "learning_rate": 0.0002, "epoch": 1.104195295449034, "step": 6830}, {"loss": 0.6909, "grad_norm": 0.617396891117096, "learning_rate": 0.0002, "epoch": 1.1058119796297794, "step": 6840}, {"loss": 0.7006, "grad_norm": 0.6395374536514282, "learning_rate": 0.0002, "epoch": 1.1074286638105246, "step": 6850}, {"loss": 0.7335, "grad_norm": 0.6775295734405518, "learning_rate": 0.0002, "epoch": 1.1090453479912699, "step": 6860}, {"loss": 0.764, "grad_norm": 0.6655223965644836, "learning_rate": 0.0002, "epoch": 1.1106620321720153, "step": 6870}, {"loss": 0.7553, "grad_norm": 0.676655113697052, "learning_rate": 0.0002, "epoch": 1.1122787163527605, "step": 6880}, {"loss": 0.7342, "grad_norm": 0.6062718629837036, "learning_rate": 0.0002, "epoch": 1.1138954005335058, "step": 6890}, {"loss": 0.7446, "grad_norm": 0.590943455696106, "learning_rate": 0.0002, "epoch": 1.115512084714251, "step": 6900}, {"loss": 0.6705, "grad_norm": 0.6315317153930664, "learning_rate": 0.0002, "epoch": 1.1171287688949965, "step": 6910}, {"loss": 0.6912, "grad_norm": 0.47979024052619934, "learning_rate": 0.0002, "epoch": 1.1187454530757417, "step": 6920}, {"loss": 0.7002, "grad_norm": 0.647298276424408, "learning_rate": 0.0002, "epoch": 1.120362137256487, "step": 6930}, {"loss": 0.7502, "grad_norm": 0.7336484789848328, "learning_rate": 0.0002, "epoch": 1.1219788214372322, "step": 6940}, {"loss": 0.693, "grad_norm": 0.5071424245834351, "learning_rate": 0.0002, "epoch": 1.1235955056179776, "step": 6950}, {"loss": 0.7378, "grad_norm": 0.6527144312858582, "learning_rate": 0.0002, "epoch": 1.1252121897987228, "step": 6960}, {"loss": 0.7228, "grad_norm": 0.6935935020446777, "learning_rate": 0.0002, "epoch": 1.126828873979468, "step": 6970}, {"loss": 0.699, "grad_norm": 0.8026931881904602, "learning_rate": 0.0002, "epoch": 1.1284455581602133, "step": 6980}, {"loss": 0.7361, "grad_norm": 0.5210393667221069, "learning_rate": 0.0002, "epoch": 1.1300622423409588, "step": 6990}, {"loss": 0.7456, "grad_norm": 0.60475093126297, "learning_rate": 0.0002, "epoch": 1.131678926521704, "step": 7000}, {"loss": 0.7495, "grad_norm": 0.6417073607444763, "learning_rate": 0.0002, "epoch": 1.1332956107024492, "step": 7010}, {"loss": 0.7459, "grad_norm": 0.6732175946235657, "learning_rate": 0.0002, "epoch": 1.1349122948831947, "step": 7020}, {"loss": 0.7278, "grad_norm": 0.6719491481781006, "learning_rate": 0.0002, "epoch": 1.13652897906394, "step": 7030}, {"loss": 0.7694, "grad_norm": 0.5708295106887817, "learning_rate": 0.0002, "epoch": 1.1381456632446851, "step": 7040}, {"loss": 0.7823, "grad_norm": 0.7141719460487366, "learning_rate": 0.0002, "epoch": 1.1397623474254304, "step": 7050}, {"loss": 0.764, "grad_norm": 0.6187017560005188, "learning_rate": 0.0002, "epoch": 1.1413790316061758, "step": 7060}, {"loss": 0.7657, "grad_norm": 0.50581294298172, "learning_rate": 0.0002, "epoch": 1.142995715786921, "step": 7070}, {"loss": 0.7357, "grad_norm": 0.5620143413543701, "learning_rate": 0.0002, "epoch": 1.1446123999676663, "step": 7080}, {"loss": 0.7287, "grad_norm": 0.6231929659843445, "learning_rate": 0.0002, "epoch": 1.1462290841484115, "step": 7090}, {"loss": 0.7328, "grad_norm": 0.5775774121284485, "learning_rate": 0.0002, "epoch": 1.147845768329157, "step": 7100}, {"loss": 0.7728, "grad_norm": 0.6492809653282166, "learning_rate": 0.0002, "epoch": 1.1494624525099022, "step": 7110}, {"loss": 0.7545, "grad_norm": 0.6434972286224365, "learning_rate": 0.0002, "epoch": 1.1510791366906474, "step": 7120}, {"loss": 0.7374, "grad_norm": 0.6191812753677368, "learning_rate": 0.0002, "epoch": 1.1526958208713927, "step": 7130}, {"loss": 0.7276, "grad_norm": 0.6690331697463989, "learning_rate": 0.0002, "epoch": 1.1543125050521381, "step": 7140}, {"loss": 0.7704, "grad_norm": 0.5977938175201416, "learning_rate": 0.0002, "epoch": 1.1559291892328833, "step": 7150}, {"loss": 0.7251, "grad_norm": 0.6195854544639587, "learning_rate": 0.0002, "epoch": 1.1575458734136286, "step": 7160}, {"loss": 0.7249, "grad_norm": 0.5752048492431641, "learning_rate": 0.0002, "epoch": 1.159162557594374, "step": 7170}, {"loss": 0.7593, "grad_norm": 0.589081883430481, "learning_rate": 0.0002, "epoch": 1.1607792417751193, "step": 7180}, {"loss": 0.704, "grad_norm": 0.756996750831604, "learning_rate": 0.0002, "epoch": 1.1623959259558645, "step": 7190}, {"loss": 0.7404, "grad_norm": 0.7614967226982117, "learning_rate": 0.0002, "epoch": 1.1640126101366097, "step": 7200}, {"loss": 0.7867, "grad_norm": 0.6120437979698181, "learning_rate": 0.0002, "epoch": 1.1656292943173552, "step": 7210}, {"loss": 0.7384, "grad_norm": 0.6210004687309265, "learning_rate": 0.0002, "epoch": 1.1672459784981004, "step": 7220}, {"loss": 0.7251, "grad_norm": 0.6044116020202637, "learning_rate": 0.0002, "epoch": 1.1688626626788456, "step": 7230}, {"loss": 0.7361, "grad_norm": 0.5418457388877869, "learning_rate": 0.0002, "epoch": 1.170479346859591, "step": 7240}, {"loss": 0.6938, "grad_norm": 0.6413537263870239, "learning_rate": 0.0002, "epoch": 1.1720960310403363, "step": 7250}, {"loss": 0.6978, "grad_norm": 0.5777867436408997, "learning_rate": 0.0002, "epoch": 1.1737127152210816, "step": 7260}, {"loss": 0.7503, "grad_norm": 0.7092402577400208, "learning_rate": 0.0002, "epoch": 1.1753293994018268, "step": 7270}, {"loss": 0.7487, "grad_norm": 0.6351709365844727, "learning_rate": 0.0002, "epoch": 1.176946083582572, "step": 7280}, {"loss": 0.7527, "grad_norm": 0.6172189712524414, "learning_rate": 0.0002, "epoch": 1.1785627677633175, "step": 7290}, {"loss": 0.7319, "grad_norm": 0.6801714897155762, "learning_rate": 0.0002, "epoch": 1.1801794519440627, "step": 7300}, {"loss": 0.6941, "grad_norm": 0.6044712066650391, "learning_rate": 0.0002, "epoch": 1.181796136124808, "step": 7310}, {"loss": 0.6951, "grad_norm": 0.7413212060928345, "learning_rate": 0.0002, "epoch": 1.1834128203055534, "step": 7320}, {"loss": 0.7396, "grad_norm": 0.5303856134414673, "learning_rate": 0.0002, "epoch": 1.1850295044862986, "step": 7330}, {"loss": 0.6915, "grad_norm": 0.5647098422050476, "learning_rate": 0.0002, "epoch": 1.1866461886670439, "step": 7340}, {"loss": 0.7506, "grad_norm": 0.7374135255813599, "learning_rate": 0.0002, "epoch": 1.188262872847789, "step": 7350}, {"loss": 0.7041, "grad_norm": 0.5710089206695557, "learning_rate": 0.0002, "epoch": 1.1898795570285345, "step": 7360}, {"loss": 0.8289, "grad_norm": 0.6073619723320007, "learning_rate": 0.0002, "epoch": 1.1914962412092798, "step": 7370}, {"loss": 0.7722, "grad_norm": 0.5899916887283325, "learning_rate": 0.0002, "epoch": 1.193112925390025, "step": 7380}, {"loss": 0.756, "grad_norm": 0.7762434482574463, "learning_rate": 0.0002, "epoch": 1.1947296095707705, "step": 7390}, {"loss": 0.7319, "grad_norm": 0.679949939250946, "learning_rate": 0.0002, "epoch": 1.1963462937515157, "step": 7400}, {"loss": 0.7599, "grad_norm": 0.6106849312782288, "learning_rate": 0.0002, "epoch": 1.197962977932261, "step": 7410}, {"loss": 0.7648, "grad_norm": 0.682461678981781, "learning_rate": 0.0002, "epoch": 1.1995796621130062, "step": 7420}, {"loss": 0.7741, "grad_norm": 0.6087017059326172, "learning_rate": 0.0002, "epoch": 1.2011963462937516, "step": 7430}, {"loss": 0.7642, "grad_norm": 0.63739013671875, "learning_rate": 0.0002, "epoch": 1.2028130304744968, "step": 7440}, {"loss": 0.7611, "grad_norm": 0.6154777407646179, "learning_rate": 0.0002, "epoch": 1.204429714655242, "step": 7450}, {"loss": 0.7565, "grad_norm": 0.7491534948348999, "learning_rate": 0.0002, "epoch": 1.2060463988359873, "step": 7460}, {"loss": 0.698, "grad_norm": 0.6664797067642212, "learning_rate": 0.0002, "epoch": 1.2076630830167328, "step": 7470}, {"loss": 0.7456, "grad_norm": 0.6660266518592834, "learning_rate": 0.0002, "epoch": 1.209279767197478, "step": 7480}, {"loss": 0.714, "grad_norm": 0.6972551345825195, "learning_rate": 0.0002, "epoch": 1.2108964513782232, "step": 7490}, {"loss": 0.7023, "grad_norm": 0.6157945990562439, "learning_rate": 0.0002, "epoch": 1.2125131355589684, "step": 7500}, {"loss": 0.7326, "grad_norm": 0.5199310183525085, "learning_rate": 0.0002, "epoch": 1.214129819739714, "step": 7510}, {"loss": 0.7586, "grad_norm": 0.577610433101654, "learning_rate": 0.0002, "epoch": 1.2157465039204591, "step": 7520}, {"loss": 0.7179, "grad_norm": 0.53652423620224, "learning_rate": 0.0002, "epoch": 1.2173631881012044, "step": 7530}, {"loss": 0.7393, "grad_norm": 0.6479050517082214, "learning_rate": 0.0002, "epoch": 1.2189798722819498, "step": 7540}, {"loss": 0.7534, "grad_norm": 0.618748128414154, "learning_rate": 0.0002, "epoch": 1.220596556462695, "step": 7550}, {"loss": 0.6886, "grad_norm": 0.6311424374580383, "learning_rate": 0.0002, "epoch": 1.2222132406434403, "step": 7560}, {"loss": 0.7272, "grad_norm": 0.6595825552940369, "learning_rate": 0.0002, "epoch": 1.2238299248241855, "step": 7570}, {"loss": 0.7353, "grad_norm": 0.5198960900306702, "learning_rate": 0.0002, "epoch": 1.225446609004931, "step": 7580}, {"loss": 0.674, "grad_norm": 0.578650712966919, "learning_rate": 0.0002, "epoch": 1.2270632931856762, "step": 7590}, {"loss": 0.7507, "grad_norm": 0.6080220937728882, "learning_rate": 0.0002, "epoch": 1.2286799773664214, "step": 7600}, {"loss": 0.7733, "grad_norm": 0.7050248384475708, "learning_rate": 0.0002, "epoch": 1.2302966615471669, "step": 7610}, {"loss": 0.7032, "grad_norm": 0.6652196049690247, "learning_rate": 0.0002, "epoch": 1.2319133457279121, "step": 7620}, {"loss": 0.7085, "grad_norm": 0.7322776317596436, "learning_rate": 0.0002, "epoch": 1.2335300299086573, "step": 7630}, {"loss": 0.7402, "grad_norm": 0.4998728036880493, "learning_rate": 0.0002, "epoch": 1.2351467140894026, "step": 7640}, {"loss": 0.7214, "grad_norm": 0.6428788900375366, "learning_rate": 0.0002, "epoch": 1.2367633982701478, "step": 7650}, {"loss": 0.7699, "grad_norm": 0.585242509841919, "learning_rate": 0.0002, "epoch": 1.2383800824508933, "step": 7660}, {"loss": 0.7621, "grad_norm": 0.5211917757987976, "learning_rate": 0.0002, "epoch": 1.2399967666316385, "step": 7670}, {"loss": 0.746, "grad_norm": 0.6490384340286255, "learning_rate": 0.0002, "epoch": 1.2416134508123837, "step": 7680}, {"loss": 0.7186, "grad_norm": 0.6249763369560242, "learning_rate": 0.0002, "epoch": 1.2432301349931292, "step": 7690}, {"loss": 0.7761, "grad_norm": 0.71870356798172, "learning_rate": 0.0002, "epoch": 1.2448468191738744, "step": 7700}, {"loss": 0.7525, "grad_norm": 0.6761967539787292, "learning_rate": 0.0002, "epoch": 1.2464635033546196, "step": 7710}, {"loss": 0.7501, "grad_norm": 0.6500617265701294, "learning_rate": 0.0002, "epoch": 1.2480801875353649, "step": 7720}, {"loss": 0.7903, "grad_norm": 0.8069869875907898, "learning_rate": 0.0002, "epoch": 1.2496968717161103, "step": 7730}, {"loss": 0.6747, "grad_norm": 0.6044608950614929, "learning_rate": 0.0002, "epoch": 1.2513135558968556, "step": 7740}, {"loss": 0.6825, "grad_norm": 0.6573283076286316, "learning_rate": 0.0002, "epoch": 1.2529302400776008, "step": 7750}, {"loss": 0.7617, "grad_norm": 0.625430166721344, "learning_rate": 0.0002, "epoch": 1.2545469242583462, "step": 7760}, {"loss": 0.7041, "grad_norm": 0.5442022681236267, "learning_rate": 0.0002, "epoch": 1.2561636084390915, "step": 7770}, {"loss": 0.7172, "grad_norm": 0.6818386912345886, "learning_rate": 0.0002, "epoch": 1.2577802926198367, "step": 7780}, {"loss": 0.696, "grad_norm": 0.6381874084472656, "learning_rate": 0.0002, "epoch": 1.259396976800582, "step": 7790}, {"loss": 0.6834, "grad_norm": 0.6269212961196899, "learning_rate": 0.0002, "epoch": 1.2610136609813272, "step": 7800}, {"loss": 0.7821, "grad_norm": 0.600121259689331, "learning_rate": 0.0002, "epoch": 1.2626303451620726, "step": 7810}, {"loss": 0.7761, "grad_norm": 0.6337703466415405, "learning_rate": 0.0002, "epoch": 1.2642470293428179, "step": 7820}, {"loss": 0.732, "grad_norm": 0.7234963774681091, "learning_rate": 0.0002, "epoch": 1.2658637135235633, "step": 7830}, {"loss": 0.785, "grad_norm": 0.800184965133667, "learning_rate": 0.0002, "epoch": 1.2674803977043085, "step": 7840}, {"loss": 0.7426, "grad_norm": 0.7539464831352234, "learning_rate": 0.0002, "epoch": 1.2690970818850538, "step": 7850}, {"loss": 0.7496, "grad_norm": 0.5493760704994202, "learning_rate": 0.0002, "epoch": 1.270713766065799, "step": 7860}, {"loss": 0.7537, "grad_norm": 0.7477145791053772, "learning_rate": 0.0002, "epoch": 1.2723304502465442, "step": 7870}, {"loss": 0.7573, "grad_norm": 0.6366362571716309, "learning_rate": 0.0002, "epoch": 1.2739471344272897, "step": 7880}, {"loss": 0.7608, "grad_norm": 0.7419533729553223, "learning_rate": 0.0002, "epoch": 1.275563818608035, "step": 7890}, {"loss": 0.7873, "grad_norm": 0.6141223311424255, "learning_rate": 0.0002, "epoch": 1.2771805027887801, "step": 7900}, {"loss": 0.6916, "grad_norm": 0.7522598505020142, "learning_rate": 0.0002, "epoch": 1.2787971869695256, "step": 7910}, {"loss": 0.7097, "grad_norm": 0.6935804486274719, "learning_rate": 0.0002, "epoch": 1.2804138711502708, "step": 7920}, {"loss": 0.7185, "grad_norm": 0.7239290475845337, "learning_rate": 0.0002, "epoch": 1.282030555331016, "step": 7930}, {"loss": 0.7145, "grad_norm": 0.8800187110900879, "learning_rate": 0.0002, "epoch": 1.2836472395117613, "step": 7940}, {"loss": 0.6991, "grad_norm": 0.540458083152771, "learning_rate": 0.0002, "epoch": 1.2852639236925067, "step": 7950}, {"loss": 0.7139, "grad_norm": 0.6492934226989746, "learning_rate": 0.0002, "epoch": 1.286880607873252, "step": 7960}, {"loss": 0.7742, "grad_norm": 0.6543959379196167, "learning_rate": 0.0002, "epoch": 1.2884972920539972, "step": 7970}, {"loss": 0.7316, "grad_norm": 0.5804705619812012, "learning_rate": 0.0002, "epoch": 1.2901139762347427, "step": 7980}, {"loss": 0.796, "grad_norm": 0.7074727416038513, "learning_rate": 0.0002, "epoch": 1.291730660415488, "step": 7990}, {"loss": 0.7034, "grad_norm": 0.5347974300384521, "learning_rate": 0.0002, "epoch": 1.2933473445962331, "step": 8000}, {"loss": 0.738, "grad_norm": 0.6457298398017883, "learning_rate": 0.0002, "epoch": 1.2949640287769784, "step": 8010}, {"loss": 0.7634, "grad_norm": 0.6407219171524048, "learning_rate": 0.0002, "epoch": 1.2965807129577236, "step": 8020}, {"loss": 0.7506, "grad_norm": 0.828439474105835, "learning_rate": 0.0002, "epoch": 1.298197397138469, "step": 8030}, {"loss": 0.735, "grad_norm": 0.4840380549430847, "learning_rate": 0.0002, "epoch": 1.2998140813192143, "step": 8040}, {"loss": 0.7283, "grad_norm": 0.5921024680137634, "learning_rate": 0.0002, "epoch": 1.3014307654999595, "step": 8050}, {"loss": 0.7477, "grad_norm": 0.6170315146446228, "learning_rate": 0.0002, "epoch": 1.303047449680705, "step": 8060}, {"loss": 0.7534, "grad_norm": 0.5374847054481506, "learning_rate": 0.0002, "epoch": 1.3046641338614502, "step": 8070}, {"loss": 0.7593, "grad_norm": 0.545758068561554, "learning_rate": 0.0002, "epoch": 1.3062808180421954, "step": 8080}, {"loss": 0.7463, "grad_norm": 0.55641770362854, "learning_rate": 0.0002, "epoch": 1.3078975022229407, "step": 8090}, {"loss": 0.7594, "grad_norm": 0.6724897027015686, "learning_rate": 0.0002, "epoch": 1.309514186403686, "step": 8100}, {"loss": 0.7105, "grad_norm": 0.6923972368240356, "learning_rate": 0.0002, "epoch": 1.3111308705844313, "step": 8110}, {"loss": 0.7149, "grad_norm": 0.5136841535568237, "learning_rate": 0.0002, "epoch": 1.3127475547651766, "step": 8120}, {"loss": 0.7504, "grad_norm": 0.6766283512115479, "learning_rate": 0.0002, "epoch": 1.314364238945922, "step": 8130}, {"loss": 0.7489, "grad_norm": 0.6283926367759705, "learning_rate": 0.0002, "epoch": 1.3159809231266673, "step": 8140}, {"loss": 0.7459, "grad_norm": 0.644216001033783, "learning_rate": 0.0002, "epoch": 1.3175976073074125, "step": 8150}, {"loss": 0.7125, "grad_norm": 0.7827503085136414, "learning_rate": 0.0002, "epoch": 1.3192142914881577, "step": 8160}, {"loss": 0.7271, "grad_norm": 0.6651390790939331, "learning_rate": 0.0002, "epoch": 1.320830975668903, "step": 8170}, {"loss": 0.7778, "grad_norm": 0.5547412633895874, "learning_rate": 0.0002, "epoch": 1.3224476598496484, "step": 8180}, {"loss": 0.7402, "grad_norm": 0.6765179634094238, "learning_rate": 0.0002, "epoch": 1.3240643440303936, "step": 8190}, {"loss": 0.7106, "grad_norm": 0.6822077035903931, "learning_rate": 0.0002, "epoch": 1.325681028211139, "step": 8200}, {"loss": 0.7288, "grad_norm": 0.5941002368927002, "learning_rate": 0.0002, "epoch": 1.3272977123918843, "step": 8210}, {"loss": 0.7494, "grad_norm": 0.4850037097930908, "learning_rate": 0.0002, "epoch": 1.3289143965726296, "step": 8220}, {"loss": 0.7474, "grad_norm": 0.6162990927696228, "learning_rate": 0.0002, "epoch": 1.3305310807533748, "step": 8230}, {"loss": 0.7751, "grad_norm": 0.6665613651275635, "learning_rate": 0.0002, "epoch": 1.33214776493412, "step": 8240}, {"loss": 0.759, "grad_norm": 0.618192732334137, "learning_rate": 0.0002, "epoch": 1.3337644491148655, "step": 8250}, {"loss": 0.7532, "grad_norm": 0.710418701171875, "learning_rate": 0.0002, "epoch": 1.3353811332956107, "step": 8260}, {"loss": 0.7306, "grad_norm": 0.5109876990318298, "learning_rate": 0.0002, "epoch": 1.336997817476356, "step": 8270}, {"loss": 0.7303, "grad_norm": 0.6791711449623108, "learning_rate": 0.0002, "epoch": 1.3386145016571014, "step": 8280}, {"loss": 0.7594, "grad_norm": 0.6836432814598083, "learning_rate": 0.0002, "epoch": 1.3402311858378466, "step": 8290}, {"loss": 0.7594, "grad_norm": 0.5579386353492737, "learning_rate": 0.0002, "epoch": 1.3418478700185918, "step": 8300}, {"loss": 0.7377, "grad_norm": 0.6713546514511108, "learning_rate": 0.0002, "epoch": 1.343464554199337, "step": 8310}, {"loss": 0.7756, "grad_norm": 0.5353720188140869, "learning_rate": 0.0002, "epoch": 1.3450812383800825, "step": 8320}, {"loss": 0.718, "grad_norm": 0.5813682675361633, "learning_rate": 0.0002, "epoch": 1.3466979225608278, "step": 8330}, {"loss": 0.7294, "grad_norm": 0.8158791661262512, "learning_rate": 0.0002, "epoch": 1.348314606741573, "step": 8340}, {"loss": 0.6992, "grad_norm": 0.6193785071372986, "learning_rate": 0.0002, "epoch": 1.3499312909223184, "step": 8350}, {"loss": 0.7654, "grad_norm": 0.6353939771652222, "learning_rate": 0.0002, "epoch": 1.3515479751030637, "step": 8360}, {"loss": 0.7519, "grad_norm": 0.6925048232078552, "learning_rate": 0.0002, "epoch": 1.353164659283809, "step": 8370}, {"loss": 0.736, "grad_norm": 0.988264799118042, "learning_rate": 0.0002, "epoch": 1.3547813434645541, "step": 8380}, {"loss": 0.7744, "grad_norm": 0.6476002931594849, "learning_rate": 0.0002, "epoch": 1.3563980276452994, "step": 8390}, {"loss": 0.776, "grad_norm": 0.7120398879051208, "learning_rate": 0.0002, "epoch": 1.3580147118260448, "step": 8400}, {"loss": 0.7368, "grad_norm": 0.9048416614532471, "learning_rate": 0.0002, "epoch": 1.35963139600679, "step": 8410}, {"loss": 0.7544, "grad_norm": 0.7000672817230225, "learning_rate": 0.0002, "epoch": 1.3612480801875353, "step": 8420}, {"loss": 0.7358, "grad_norm": 0.6015632152557373, "learning_rate": 0.0002, "epoch": 1.3628647643682807, "step": 8430}, {"loss": 0.7298, "grad_norm": 0.612516462802887, "learning_rate": 0.0002, "epoch": 1.364481448549026, "step": 8440}, {"loss": 0.7055, "grad_norm": 0.5969301462173462, "learning_rate": 0.0002, "epoch": 1.3660981327297712, "step": 8450}, {"loss": 0.7754, "grad_norm": 0.6730654239654541, "learning_rate": 0.0002, "epoch": 1.3677148169105164, "step": 8460}, {"loss": 0.7465, "grad_norm": 0.6386392116546631, "learning_rate": 0.0002, "epoch": 1.369331501091262, "step": 8470}, {"loss": 0.7433, "grad_norm": 0.739544153213501, "learning_rate": 0.0002, "epoch": 1.3709481852720071, "step": 8480}, {"loss": 0.7892, "grad_norm": 0.6462782621383667, "learning_rate": 0.0002, "epoch": 1.3725648694527524, "step": 8490}, {"loss": 0.7302, "grad_norm": 0.7346843481063843, "learning_rate": 0.0002, "epoch": 1.3741815536334978, "step": 8500}, {"loss": 0.7634, "grad_norm": 0.6884821057319641, "learning_rate": 0.0002, "epoch": 1.375798237814243, "step": 8510}, {"loss": 0.7614, "grad_norm": 0.6999333500862122, "learning_rate": 0.0002, "epoch": 1.3774149219949883, "step": 8520}, {"loss": 0.729, "grad_norm": 0.5378713011741638, "learning_rate": 0.0002, "epoch": 1.3790316061757335, "step": 8530}, {"loss": 0.6797, "grad_norm": 0.5417906641960144, "learning_rate": 0.0002, "epoch": 1.3806482903564787, "step": 8540}, {"loss": 0.7499, "grad_norm": 0.6602526307106018, "learning_rate": 0.0002, "epoch": 1.3822649745372242, "step": 8550}, {"loss": 0.7356, "grad_norm": 0.7073674201965332, "learning_rate": 0.0002, "epoch": 1.3838816587179694, "step": 8560}, {"loss": 0.75, "grad_norm": 0.5841707587242126, "learning_rate": 0.0002, "epoch": 1.3854983428987149, "step": 8570}, {"loss": 0.732, "grad_norm": 0.7031095027923584, "learning_rate": 0.0002, "epoch": 1.38711502707946, "step": 8580}, {"loss": 0.7464, "grad_norm": 0.5198570489883423, "learning_rate": 0.0002, "epoch": 1.3887317112602053, "step": 8590}, {"loss": 0.7354, "grad_norm": 0.7261320352554321, "learning_rate": 0.0002, "epoch": 1.3903483954409506, "step": 8600}, {"loss": 0.7339, "grad_norm": 0.5616350173950195, "learning_rate": 0.0002, "epoch": 1.3919650796216958, "step": 8610}, {"loss": 0.7382, "grad_norm": 0.5185914635658264, "learning_rate": 0.0002, "epoch": 1.3935817638024413, "step": 8620}, {"loss": 0.7456, "grad_norm": 0.5814694762229919, "learning_rate": 0.0002, "epoch": 1.3951984479831865, "step": 8630}, {"loss": 0.7413, "grad_norm": 0.6977371573448181, "learning_rate": 0.0002, "epoch": 1.3968151321639317, "step": 8640}, {"loss": 0.7574, "grad_norm": 0.6855689883232117, "learning_rate": 0.0002, "epoch": 1.3984318163446772, "step": 8650}, {"loss": 0.7802, "grad_norm": 0.5414357781410217, "learning_rate": 0.0002, "epoch": 1.4000485005254224, "step": 8660}, {"loss": 0.7487, "grad_norm": 0.6970012784004211, "learning_rate": 0.0002, "epoch": 1.4016651847061676, "step": 8670}, {"loss": 0.7421, "grad_norm": 0.526079535484314, "learning_rate": 0.0002, "epoch": 1.4032818688869129, "step": 8680}, {"loss": 0.737, "grad_norm": 0.758712887763977, "learning_rate": 0.0002, "epoch": 1.404898553067658, "step": 8690}, {"loss": 0.7612, "grad_norm": 0.7118762731552124, "learning_rate": 0.0002, "epoch": 1.4065152372484035, "step": 8700}, {"loss": 0.7628, "grad_norm": 0.5696909427642822, "learning_rate": 0.0002, "epoch": 1.4081319214291488, "step": 8710}, {"loss": 0.7156, "grad_norm": 0.7995436787605286, "learning_rate": 0.0002, "epoch": 1.4097486056098942, "step": 8720}, {"loss": 0.7521, "grad_norm": 0.7237521409988403, "learning_rate": 0.0002, "epoch": 1.4113652897906395, "step": 8730}, {"loss": 0.7661, "grad_norm": 0.744628369808197, "learning_rate": 0.0002, "epoch": 1.4129819739713847, "step": 8740}, {"loss": 0.7073, "grad_norm": 0.6082926988601685, "learning_rate": 0.0002, "epoch": 1.41459865815213, "step": 8750}, {"loss": 0.7282, "grad_norm": 0.5185243487358093, "learning_rate": 0.0002, "epoch": 1.4162153423328752, "step": 8760}, {"loss": 0.7592, "grad_norm": 0.5183082222938538, "learning_rate": 0.0002, "epoch": 1.4178320265136206, "step": 8770}, {"loss": 0.7509, "grad_norm": 0.7326041460037231, "learning_rate": 0.0002, "epoch": 1.4194487106943658, "step": 8780}, {"loss": 0.7398, "grad_norm": 0.7174660563468933, "learning_rate": 0.0002, "epoch": 1.421065394875111, "step": 8790}, {"loss": 0.7507, "grad_norm": 0.8080165982246399, "learning_rate": 0.0002, "epoch": 1.4226820790558565, "step": 8800}, {"loss": 0.72, "grad_norm": 0.5061507821083069, "learning_rate": 0.0002, "epoch": 1.4242987632366018, "step": 8810}, {"loss": 0.7563, "grad_norm": 0.801602840423584, "learning_rate": 0.0002, "epoch": 1.425915447417347, "step": 8820}, {"loss": 0.7287, "grad_norm": 0.6150273084640503, "learning_rate": 0.0002, "epoch": 1.4275321315980922, "step": 8830}, {"loss": 0.7452, "grad_norm": 0.8786525726318359, "learning_rate": 0.0002, "epoch": 1.4291488157788377, "step": 8840}, {"loss": 0.7257, "grad_norm": 0.6371538639068604, "learning_rate": 0.0002, "epoch": 1.430765499959583, "step": 8850}, {"loss": 0.711, "grad_norm": 0.6409295797348022, "learning_rate": 0.0002, "epoch": 1.4323821841403281, "step": 8860}, {"loss": 0.7891, "grad_norm": 0.6452359557151794, "learning_rate": 0.0002, "epoch": 1.4339988683210736, "step": 8870}, {"loss": 0.7588, "grad_norm": 0.5842334628105164, "learning_rate": 0.0002, "epoch": 1.4356155525018188, "step": 8880}, {"loss": 0.7446, "grad_norm": 0.696761965751648, "learning_rate": 0.0002, "epoch": 1.437232236682564, "step": 8890}, {"loss": 0.7541, "grad_norm": 0.6384600400924683, "learning_rate": 0.0002, "epoch": 1.4388489208633093, "step": 8900}, {"loss": 0.7049, "grad_norm": 0.5981136560440063, "learning_rate": 0.0002, "epoch": 1.4404656050440545, "step": 8910}, {"loss": 0.795, "grad_norm": 0.6355637907981873, "learning_rate": 0.0002, "epoch": 1.4420822892248, "step": 8920}, {"loss": 0.7653, "grad_norm": 0.6374830603599548, "learning_rate": 0.0002, "epoch": 1.4436989734055452, "step": 8930}, {"loss": 0.8108, "grad_norm": 0.559013307094574, "learning_rate": 0.0002, "epoch": 1.4453156575862904, "step": 8940}, {"loss": 0.7045, "grad_norm": 0.7289170026779175, "learning_rate": 0.0002, "epoch": 1.446932341767036, "step": 8950}, {"loss": 0.7484, "grad_norm": 0.8649206757545471, "learning_rate": 0.0002, "epoch": 1.4485490259477811, "step": 8960}, {"loss": 0.7745, "grad_norm": 0.7664689421653748, "learning_rate": 0.0002, "epoch": 1.4501657101285264, "step": 8970}, {"loss": 0.7431, "grad_norm": 0.7109952569007874, "learning_rate": 0.0002, "epoch": 1.4517823943092716, "step": 8980}, {"loss": 0.7997, "grad_norm": 0.6312844753265381, "learning_rate": 0.0002, "epoch": 1.453399078490017, "step": 8990}, {"loss": 0.7467, "grad_norm": 0.6616617441177368, "learning_rate": 0.0002, "epoch": 1.4550157626707623, "step": 9000}, {"loss": 0.7518, "grad_norm": 0.7384068965911865, "learning_rate": 0.0002, "epoch": 1.4566324468515075, "step": 9010}, {"loss": 0.7483, "grad_norm": 0.6549670100212097, "learning_rate": 0.0002, "epoch": 1.458249131032253, "step": 9020}, {"loss": 0.7423, "grad_norm": 0.6254119277000427, "learning_rate": 0.0002, "epoch": 1.4598658152129982, "step": 9030}, {"loss": 0.7645, "grad_norm": 0.6806328892707825, "learning_rate": 0.0002, "epoch": 1.4614824993937434, "step": 9040}, {"loss": 0.7221, "grad_norm": 0.6803115010261536, "learning_rate": 0.0002, "epoch": 1.4630991835744886, "step": 9050}, {"loss": 0.7264, "grad_norm": 0.48529282212257385, "learning_rate": 0.0002, "epoch": 1.4647158677552339, "step": 9060}, {"loss": 0.7542, "grad_norm": 0.5995030999183655, "learning_rate": 0.0002, "epoch": 1.4663325519359793, "step": 9070}, {"loss": 0.7894, "grad_norm": 0.6005427837371826, "learning_rate": 0.0002, "epoch": 1.4679492361167246, "step": 9080}, {"loss": 0.7288, "grad_norm": 0.718564510345459, "learning_rate": 0.0002, "epoch": 1.46956592029747, "step": 9090}, {"loss": 0.7089, "grad_norm": 0.7003577351570129, "learning_rate": 0.0002, "epoch": 1.4711826044782153, "step": 9100}, {"loss": 0.8069, "grad_norm": 0.5888323783874512, "learning_rate": 0.0002, "epoch": 1.4727992886589605, "step": 9110}, {"loss": 0.7275, "grad_norm": 0.6417609453201294, "learning_rate": 0.0002, "epoch": 1.4744159728397057, "step": 9120}, {"loss": 0.7441, "grad_norm": 0.572294294834137, "learning_rate": 0.0002, "epoch": 1.476032657020451, "step": 9130}, {"loss": 0.8053, "grad_norm": 0.8200714588165283, "learning_rate": 0.0002, "epoch": 1.4776493412011964, "step": 9140}, {"loss": 0.7382, "grad_norm": 0.6343288421630859, "learning_rate": 0.0002, "epoch": 1.4792660253819416, "step": 9150}, {"loss": 0.7641, "grad_norm": 0.7017961144447327, "learning_rate": 0.0002, "epoch": 1.4808827095626869, "step": 9160}, {"loss": 0.7619, "grad_norm": 0.6202912926673889, "learning_rate": 0.0002, "epoch": 1.4824993937434323, "step": 9170}, {"loss": 0.7428, "grad_norm": 0.6677869558334351, "learning_rate": 0.0002, "epoch": 1.4841160779241775, "step": 9180}, {"loss": 0.7648, "grad_norm": 0.6052267551422119, "learning_rate": 0.0002, "epoch": 1.4857327621049228, "step": 9190}, {"loss": 0.7152, "grad_norm": 0.6638872027397156, "learning_rate": 0.0002, "epoch": 1.487349446285668, "step": 9200}, {"loss": 0.7448, "grad_norm": 0.6245523691177368, "learning_rate": 0.0002, "epoch": 1.4889661304664135, "step": 9210}, {"loss": 0.6958, "grad_norm": 0.5761767625808716, "learning_rate": 0.0002, "epoch": 1.4905828146471587, "step": 9220}, {"loss": 0.8012, "grad_norm": 0.8175981640815735, "learning_rate": 0.0002, "epoch": 1.492199498827904, "step": 9230}, {"loss": 0.683, "grad_norm": 0.9144009947776794, "learning_rate": 0.0002, "epoch": 1.4938161830086494, "step": 9240}, {"loss": 0.7623, "grad_norm": 0.5742552876472473, "learning_rate": 0.0002, "epoch": 1.4954328671893946, "step": 9250}, {"loss": 0.7418, "grad_norm": 0.534534215927124, "learning_rate": 0.0002, "epoch": 1.4970495513701398, "step": 9260}, {"loss": 0.7194, "grad_norm": 0.7836225032806396, "learning_rate": 0.0002, "epoch": 1.498666235550885, "step": 9270}, {"loss": 0.7453, "grad_norm": 0.5292993187904358, "learning_rate": 0.0002, "epoch": 1.5002829197316303, "step": 9280}, {"loss": 0.7168, "grad_norm": 0.8044071793556213, "learning_rate": 0.0002, "epoch": 1.5018996039123758, "step": 9290}, {"loss": 0.7229, "grad_norm": 0.6185805201530457, "learning_rate": 0.0002, "epoch": 1.503516288093121, "step": 9300}, {"loss": 0.684, "grad_norm": 0.6093607544898987, "learning_rate": 0.0002, "epoch": 1.5051329722738664, "step": 9310}, {"loss": 0.7973, "grad_norm": 0.5891730189323425, "learning_rate": 0.0002, "epoch": 1.5067496564546117, "step": 9320}, {"loss": 0.7474, "grad_norm": 0.6331129670143127, "learning_rate": 0.0002, "epoch": 1.508366340635357, "step": 9330}, {"loss": 0.7074, "grad_norm": 0.7690958380699158, "learning_rate": 0.0002, "epoch": 1.5099830248161021, "step": 9340}, {"loss": 0.672, "grad_norm": 0.6548877358436584, "learning_rate": 0.0002, "epoch": 1.5115997089968474, "step": 9350}, {"loss": 0.7408, "grad_norm": 0.6545143127441406, "learning_rate": 0.0002, "epoch": 1.5132163931775926, "step": 9360}, {"loss": 0.7432, "grad_norm": 0.553247332572937, "learning_rate": 0.0002, "epoch": 1.514833077358338, "step": 9370}, {"loss": 0.7265, "grad_norm": 0.8145074844360352, "learning_rate": 0.0002, "epoch": 1.5164497615390833, "step": 9380}, {"loss": 0.7379, "grad_norm": 0.7636994123458862, "learning_rate": 0.0002, "epoch": 1.5180664457198287, "step": 9390}, {"loss": 0.7413, "grad_norm": 0.6838982701301575, "learning_rate": 0.0002, "epoch": 1.519683129900574, "step": 9400}, {"loss": 0.7367, "grad_norm": 0.8599441647529602, "learning_rate": 0.0002, "epoch": 1.5212998140813192, "step": 9410}, {"loss": 0.7663, "grad_norm": 0.7020329833030701, "learning_rate": 0.0002, "epoch": 1.5229164982620644, "step": 9420}, {"loss": 0.7928, "grad_norm": 0.6964772343635559, "learning_rate": 0.0002, "epoch": 1.5245331824428097, "step": 9430}, {"loss": 0.7168, "grad_norm": 0.6916600465774536, "learning_rate": 0.0002, "epoch": 1.5261498666235551, "step": 9440}, {"loss": 0.7519, "grad_norm": 0.7282621264457703, "learning_rate": 0.0002, "epoch": 1.5277665508043003, "step": 9450}, {"loss": 0.7628, "grad_norm": 0.5363983511924744, "learning_rate": 0.0002, "epoch": 1.5293832349850458, "step": 9460}, {"loss": 0.7154, "grad_norm": 0.6184861063957214, "learning_rate": 0.0002, "epoch": 1.530999919165791, "step": 9470}, {"loss": 0.7837, "grad_norm": 0.5991285443305969, "learning_rate": 0.0002, "epoch": 1.5326166033465363, "step": 9480}, {"loss": 0.7827, "grad_norm": 0.8176587820053101, "learning_rate": 0.0002, "epoch": 1.5342332875272815, "step": 9490}, {"loss": 0.7415, "grad_norm": 0.6473721861839294, "learning_rate": 0.0002, "epoch": 1.5358499717080267, "step": 9500}, {"loss": 0.7632, "grad_norm": 0.7319952845573425, "learning_rate": 0.0002, "epoch": 1.5374666558887722, "step": 9510}, {"loss": 0.7706, "grad_norm": 0.702900230884552, "learning_rate": 0.0002, "epoch": 1.5390833400695174, "step": 9520}, {"loss": 0.7754, "grad_norm": 0.7971600294113159, "learning_rate": 0.0002, "epoch": 1.5407000242502629, "step": 9530}, {"loss": 0.7352, "grad_norm": 0.6527525186538696, "learning_rate": 0.0002, "epoch": 1.542316708431008, "step": 9540}, {"loss": 0.7425, "grad_norm": 0.5791676044464111, "learning_rate": 0.0002, "epoch": 1.5439333926117533, "step": 9550}, {"loss": 0.7585, "grad_norm": 0.5619390606880188, "learning_rate": 0.0002, "epoch": 1.5455500767924986, "step": 9560}, {"loss": 0.7894, "grad_norm": 0.5701689124107361, "learning_rate": 0.0002, "epoch": 1.5471667609732438, "step": 9570}, {"loss": 0.793, "grad_norm": 0.47549352049827576, "learning_rate": 0.0002, "epoch": 1.548783445153989, "step": 9580}, {"loss": 0.7276, "grad_norm": 0.8730611205101013, "learning_rate": 0.0002, "epoch": 1.5504001293347345, "step": 9590}, {"loss": 0.798, "grad_norm": 0.6842091083526611, "learning_rate": 0.0002, "epoch": 1.5520168135154797, "step": 9600}, {"loss": 0.7528, "grad_norm": 0.6675129532814026, "learning_rate": 0.0002, "epoch": 1.5536334976962252, "step": 9610}, {"loss": 0.7954, "grad_norm": 0.8173956274986267, "learning_rate": 0.0002, "epoch": 1.5552501818769704, "step": 9620}, {"loss": 0.7535, "grad_norm": 0.724947452545166, "learning_rate": 0.0002, "epoch": 1.5568668660577156, "step": 9630}, {"loss": 0.7738, "grad_norm": 0.6154758930206299, "learning_rate": 0.0002, "epoch": 1.5584835502384609, "step": 9640}, {"loss": 0.7568, "grad_norm": 0.6072008013725281, "learning_rate": 0.0002, "epoch": 1.560100234419206, "step": 9650}, {"loss": 0.7219, "grad_norm": 0.659010648727417, "learning_rate": 0.0002, "epoch": 1.5617169185999515, "step": 9660}, {"loss": 0.673, "grad_norm": 0.65857994556427, "learning_rate": 0.0002, "epoch": 1.5633336027806968, "step": 9670}, {"loss": 0.7156, "grad_norm": 0.5914267301559448, "learning_rate": 0.0002, "epoch": 1.5649502869614422, "step": 9680}, {"loss": 0.7414, "grad_norm": 0.6248020529747009, "learning_rate": 0.0002, "epoch": 1.5665669711421875, "step": 9690}, {"loss": 0.694, "grad_norm": 0.7147795557975769, "learning_rate": 0.0002, "epoch": 1.5681836553229327, "step": 9700}, {"loss": 0.7335, "grad_norm": 0.7076232433319092, "learning_rate": 0.0002, "epoch": 1.569800339503678, "step": 9710}, {"loss": 0.7413, "grad_norm": 0.6217400431632996, "learning_rate": 0.0002, "epoch": 1.5714170236844232, "step": 9720}, {"loss": 0.7296, "grad_norm": 0.6709911227226257, "learning_rate": 0.0002, "epoch": 1.5730337078651684, "step": 9730}, {"loss": 0.7306, "grad_norm": 0.749171257019043, "learning_rate": 0.0002, "epoch": 1.5746503920459138, "step": 9740}, {"loss": 0.7242, "grad_norm": 0.6241145730018616, "learning_rate": 0.0002, "epoch": 1.576267076226659, "step": 9750}, {"loss": 0.7384, "grad_norm": 0.4960934817790985, "learning_rate": 0.0002, "epoch": 1.5778837604074045, "step": 9760}, {"loss": 0.725, "grad_norm": 0.6593309640884399, "learning_rate": 0.0002, "epoch": 1.5795004445881498, "step": 9770}, {"loss": 0.7531, "grad_norm": 0.5814042091369629, "learning_rate": 0.0002, "epoch": 1.581117128768895, "step": 9780}, {"loss": 0.7109, "grad_norm": 0.5936070680618286, "learning_rate": 0.0002, "epoch": 1.5827338129496402, "step": 9790}, {"loss": 0.7769, "grad_norm": 0.6454403400421143, "learning_rate": 0.0002, "epoch": 1.5843504971303854, "step": 9800}, {"loss": 0.7677, "grad_norm": 0.7612107992172241, "learning_rate": 0.0002, "epoch": 1.585967181311131, "step": 9810}, {"loss": 0.7649, "grad_norm": 0.6494482755661011, "learning_rate": 0.0002, "epoch": 1.5875838654918761, "step": 9820}, {"loss": 0.7569, "grad_norm": 0.7825694680213928, "learning_rate": 0.0002, "epoch": 1.5892005496726216, "step": 9830}, {"loss": 0.706, "grad_norm": 0.6757757663726807, "learning_rate": 0.0002, "epoch": 1.5908172338533668, "step": 9840}, {"loss": 0.7803, "grad_norm": 0.7105609178543091, "learning_rate": 0.0002, "epoch": 1.592433918034112, "step": 9850}, {"loss": 0.7925, "grad_norm": 0.7596991062164307, "learning_rate": 0.0002, "epoch": 1.5940506022148573, "step": 9860}, {"loss": 0.7108, "grad_norm": 0.5681525468826294, "learning_rate": 0.0002, "epoch": 1.5956672863956025, "step": 9870}, {"loss": 0.7811, "grad_norm": 0.6090980768203735, "learning_rate": 0.0002, "epoch": 1.5972839705763477, "step": 9880}, {"loss": 0.7339, "grad_norm": 0.6271613240242004, "learning_rate": 0.0002, "epoch": 1.5989006547570932, "step": 9890}, {"loss": 0.7419, "grad_norm": 0.7656369805335999, "learning_rate": 0.0002, "epoch": 1.6005173389378387, "step": 9900}, {"loss": 0.7336, "grad_norm": 0.7504446506500244, "learning_rate": 0.0002, "epoch": 1.6021340231185839, "step": 9910}, {"loss": 0.7479, "grad_norm": 0.659656286239624, "learning_rate": 0.0002, "epoch": 1.6037507072993291, "step": 9920}, {"loss": 0.7483, "grad_norm": 0.6006826162338257, "learning_rate": 0.0002, "epoch": 1.6053673914800743, "step": 9930}, {"loss": 0.732, "grad_norm": 0.7872757911682129, "learning_rate": 0.0002, "epoch": 1.6069840756608196, "step": 9940}, {"loss": 0.768, "grad_norm": 0.5545852780342102, "learning_rate": 0.0002, "epoch": 1.6086007598415648, "step": 9950}, {"loss": 0.8064, "grad_norm": 0.7429468631744385, "learning_rate": 0.0002, "epoch": 1.6102174440223103, "step": 9960}, {"loss": 0.714, "grad_norm": 0.6873556971549988, "learning_rate": 0.0002, "epoch": 1.6118341282030555, "step": 9970}, {"loss": 0.7324, "grad_norm": 0.5874287486076355, "learning_rate": 0.0002, "epoch": 1.613450812383801, "step": 9980}, {"loss": 0.7141, "grad_norm": 0.6039386987686157, "learning_rate": 0.0002, "epoch": 1.6150674965645462, "step": 9990}, {"loss": 0.6674, "grad_norm": 0.6233575940132141, "learning_rate": 0.0002, "epoch": 1.6166841807452914, "step": 10000}, {"loss": 0.7602, "grad_norm": 0.7676448225975037, "learning_rate": 0.0002, "epoch": 1.6183008649260366, "step": 10010}, {"loss": 0.7784, "grad_norm": 0.6565698385238647, "learning_rate": 0.0002, "epoch": 1.6199175491067819, "step": 10020}, {"loss": 0.7104, "grad_norm": 0.6787590384483337, "learning_rate": 0.0002, "epoch": 1.6215342332875273, "step": 10030}, {"loss": 0.7464, "grad_norm": 0.6137678027153015, "learning_rate": 0.0002, "epoch": 1.6231509174682726, "step": 10040}, {"loss": 0.7646, "grad_norm": 0.5236800312995911, "learning_rate": 0.0002, "epoch": 1.624767601649018, "step": 10050}, {"loss": 0.7437, "grad_norm": 0.7626367807388306, "learning_rate": 0.0002, "epoch": 1.6263842858297632, "step": 10060}, {"loss": 0.7273, "grad_norm": 0.5657260417938232, "learning_rate": 0.0002, "epoch": 1.6280009700105085, "step": 10070}, {"loss": 0.7354, "grad_norm": 0.4913991391658783, "learning_rate": 0.0002, "epoch": 1.6296176541912537, "step": 10080}, {"loss": 0.7596, "grad_norm": 0.7715556621551514, "learning_rate": 0.0002, "epoch": 1.631234338371999, "step": 10090}, {"loss": 0.7105, "grad_norm": 0.6509000062942505, "learning_rate": 0.0002, "epoch": 1.6328510225527442, "step": 10100}, {"loss": 0.7274, "grad_norm": 0.6215850114822388, "learning_rate": 0.0002, "epoch": 1.6344677067334896, "step": 10110}, {"loss": 0.7705, "grad_norm": 0.6956844329833984, "learning_rate": 0.0002, "epoch": 1.6360843909142349, "step": 10120}, {"loss": 0.7129, "grad_norm": 0.6111597418785095, "learning_rate": 0.0002, "epoch": 1.6377010750949803, "step": 10130}, {"loss": 0.6955, "grad_norm": 0.6518288850784302, "learning_rate": 0.0002, "epoch": 1.6393177592757255, "step": 10140}, {"loss": 0.731, "grad_norm": 0.6914522051811218, "learning_rate": 0.0002, "epoch": 1.6409344434564708, "step": 10150}, {"loss": 0.7295, "grad_norm": 0.63785719871521, "learning_rate": 0.0002, "epoch": 1.642551127637216, "step": 10160}, {"loss": 0.7355, "grad_norm": 0.6379287838935852, "learning_rate": 0.0002, "epoch": 1.6441678118179612, "step": 10170}, {"loss": 0.7359, "grad_norm": 0.6793403029441833, "learning_rate": 0.0002, "epoch": 1.6457844959987067, "step": 10180}, {"loss": 0.7402, "grad_norm": 0.6099132895469666, "learning_rate": 0.0002, "epoch": 1.647401180179452, "step": 10190}, {"loss": 0.7353, "grad_norm": 0.5869854092597961, "learning_rate": 0.0002, "epoch": 1.6490178643601974, "step": 10200}, {"loss": 0.8308, "grad_norm": 0.7716999053955078, "learning_rate": 0.0002, "epoch": 1.6506345485409426, "step": 10210}, {"loss": 0.7215, "grad_norm": 0.6854110360145569, "learning_rate": 0.0002, "epoch": 1.6522512327216878, "step": 10220}, {"loss": 0.782, "grad_norm": 0.6957170367240906, "learning_rate": 0.0002, "epoch": 1.653867916902433, "step": 10230}, {"loss": 0.7282, "grad_norm": 0.6932903528213501, "learning_rate": 0.0002, "epoch": 1.6554846010831783, "step": 10240}, {"loss": 0.7478, "grad_norm": 0.7713165283203125, "learning_rate": 0.0002, "epoch": 1.6571012852639235, "step": 10250}, {"loss": 0.7099, "grad_norm": 0.7455793619155884, "learning_rate": 0.0002, "epoch": 1.658717969444669, "step": 10260}, {"loss": 0.7524, "grad_norm": 0.5464168190956116, "learning_rate": 0.0002, "epoch": 1.6603346536254144, "step": 10270}, {"loss": 0.7328, "grad_norm": 0.6782926321029663, "learning_rate": 0.0002, "epoch": 1.6619513378061597, "step": 10280}, {"loss": 0.7801, "grad_norm": 0.7962649464607239, "learning_rate": 0.0002, "epoch": 1.663568021986905, "step": 10290}, {"loss": 0.7142, "grad_norm": 0.6814526319503784, "learning_rate": 0.0002, "epoch": 1.6651847061676501, "step": 10300}, {"loss": 0.7285, "grad_norm": 0.656895101070404, "learning_rate": 0.0002, "epoch": 1.6668013903483954, "step": 10310}, {"loss": 0.7358, "grad_norm": 0.6085672378540039, "learning_rate": 0.0002, "epoch": 1.6684180745291406, "step": 10320}, {"loss": 0.7074, "grad_norm": 0.585508406162262, "learning_rate": 0.0002, "epoch": 1.670034758709886, "step": 10330}, {"loss": 0.7604, "grad_norm": 0.6930184364318848, "learning_rate": 0.0002, "epoch": 1.6716514428906313, "step": 10340}, {"loss": 0.7169, "grad_norm": 0.575663149356842, "learning_rate": 0.0002, "epoch": 1.6732681270713767, "step": 10350}, {"loss": 0.7198, "grad_norm": 0.582502543926239, "learning_rate": 0.0002, "epoch": 1.674884811252122, "step": 10360}, {"loss": 0.7793, "grad_norm": 0.5668916702270508, "learning_rate": 0.0002, "epoch": 1.6765014954328672, "step": 10370}, {"loss": 0.7478, "grad_norm": 0.6070065498352051, "learning_rate": 0.0002, "epoch": 1.6781181796136124, "step": 10380}, {"loss": 0.7939, "grad_norm": 0.6141316294670105, "learning_rate": 0.0002, "epoch": 1.6797348637943577, "step": 10390}, {"loss": 0.7573, "grad_norm": 0.8359124064445496, "learning_rate": 0.0002, "epoch": 1.6813515479751031, "step": 10400}, {"loss": 0.7488, "grad_norm": 0.5378185510635376, "learning_rate": 0.0002, "epoch": 1.6829682321558483, "step": 10410}, {"loss": 0.7588, "grad_norm": 0.6959536075592041, "learning_rate": 0.0002, "epoch": 1.6845849163365938, "step": 10420}, {"loss": 0.7872, "grad_norm": 0.6514357328414917, "learning_rate": 0.0002, "epoch": 1.686201600517339, "step": 10430}, {"loss": 0.725, "grad_norm": 0.7706646919250488, "learning_rate": 0.0002, "epoch": 1.6878182846980843, "step": 10440}, {"loss": 0.7673, "grad_norm": 0.6183337569236755, "learning_rate": 0.0002, "epoch": 1.6894349688788295, "step": 10450}, {"loss": 0.7566, "grad_norm": 0.6123278141021729, "learning_rate": 0.0002, "epoch": 1.6910516530595747, "step": 10460}, {"loss": 0.7169, "grad_norm": 0.6894851326942444, "learning_rate": 0.0002, "epoch": 1.69266833724032, "step": 10470}, {"loss": 0.7435, "grad_norm": 0.7497312426567078, "learning_rate": 0.0002, "epoch": 1.6942850214210654, "step": 10480}, {"loss": 0.7544, "grad_norm": 0.5968214273452759, "learning_rate": 0.0002, "epoch": 1.6959017056018106, "step": 10490}, {"loss": 0.6793, "grad_norm": 0.6747927069664001, "learning_rate": 0.0002, "epoch": 1.697518389782556, "step": 10500}, {"loss": 0.7415, "grad_norm": 0.5708310008049011, "learning_rate": 0.0002, "epoch": 1.6991350739633013, "step": 10510}, {"loss": 0.7385, "grad_norm": 0.606526792049408, "learning_rate": 0.0002, "epoch": 1.7007517581440466, "step": 10520}, {"loss": 0.7204, "grad_norm": 0.662011981010437, "learning_rate": 0.0002, "epoch": 1.7023684423247918, "step": 10530}, {"loss": 0.7999, "grad_norm": 0.7583045363426208, "learning_rate": 0.0002, "epoch": 1.703985126505537, "step": 10540}, {"loss": 0.7563, "grad_norm": 0.721632182598114, "learning_rate": 0.0002, "epoch": 1.7056018106862825, "step": 10550}, {"loss": 0.7407, "grad_norm": 0.6107715368270874, "learning_rate": 0.0002, "epoch": 1.7072184948670277, "step": 10560}, {"loss": 0.7519, "grad_norm": 0.6652471423149109, "learning_rate": 0.0002, "epoch": 1.7088351790477732, "step": 10570}, {"loss": 0.7767, "grad_norm": 0.6308087110519409, "learning_rate": 0.0002, "epoch": 1.7104518632285184, "step": 10580}, {"loss": 0.7659, "grad_norm": 0.5464386940002441, "learning_rate": 0.0002, "epoch": 1.7120685474092636, "step": 10590}, {"loss": 0.7063, "grad_norm": 0.6558911204338074, "learning_rate": 0.0002, "epoch": 1.7136852315900089, "step": 10600}, {"loss": 0.7126, "grad_norm": 0.5665024518966675, "learning_rate": 0.0002, "epoch": 1.715301915770754, "step": 10610}, {"loss": 0.6958, "grad_norm": 0.7888094186782837, "learning_rate": 0.0002, "epoch": 1.7169185999514993, "step": 10620}, {"loss": 0.7785, "grad_norm": 0.7084909081459045, "learning_rate": 0.0002, "epoch": 1.7185352841322448, "step": 10630}, {"loss": 0.7557, "grad_norm": 0.7982324361801147, "learning_rate": 0.0002, "epoch": 1.7201519683129902, "step": 10640}, {"loss": 0.7345, "grad_norm": 0.6418732404708862, "learning_rate": 0.0002, "epoch": 1.7217686524937355, "step": 10650}, {"loss": 0.7734, "grad_norm": 0.7636681795120239, "learning_rate": 0.0002, "epoch": 1.7233853366744807, "step": 10660}, {"loss": 0.7541, "grad_norm": 0.5646875500679016, "learning_rate": 0.0002, "epoch": 1.725002020855226, "step": 10670}, {"loss": 0.7642, "grad_norm": 0.5231260657310486, "learning_rate": 0.0002, "epoch": 1.7266187050359711, "step": 10680}, {"loss": 0.7846, "grad_norm": 0.7635011672973633, "learning_rate": 0.0002, "epoch": 1.7282353892167164, "step": 10690}, {"loss": 0.7471, "grad_norm": 0.7518259286880493, "learning_rate": 0.0002, "epoch": 1.7298520733974618, "step": 10700}, {"loss": 0.751, "grad_norm": 0.7295602560043335, "learning_rate": 0.0002, "epoch": 1.731468757578207, "step": 10710}, {"loss": 0.731, "grad_norm": 0.6984632015228271, "learning_rate": 0.0002, "epoch": 1.7330854417589525, "step": 10720}, {"loss": 0.7921, "grad_norm": 0.6198219060897827, "learning_rate": 0.0002, "epoch": 1.7347021259396977, "step": 10730}, {"loss": 0.7642, "grad_norm": 0.6957576274871826, "learning_rate": 0.0002, "epoch": 1.736318810120443, "step": 10740}, {"loss": 0.7917, "grad_norm": 0.6430263519287109, "learning_rate": 0.0002, "epoch": 1.7379354943011882, "step": 10750}, {"loss": 0.7156, "grad_norm": 0.6134995222091675, "learning_rate": 0.0002, "epoch": 1.7395521784819334, "step": 10760}, {"loss": 0.7584, "grad_norm": 0.7209452986717224, "learning_rate": 0.0002, "epoch": 1.741168862662679, "step": 10770}, {"loss": 0.7528, "grad_norm": 0.6735447645187378, "learning_rate": 0.0002, "epoch": 1.7427855468434241, "step": 10780}, {"loss": 0.756, "grad_norm": 0.5605693459510803, "learning_rate": 0.0002, "epoch": 1.7444022310241696, "step": 10790}, {"loss": 0.7759, "grad_norm": 0.6882363557815552, "learning_rate": 0.0002, "epoch": 1.7460189152049148, "step": 10800}, {"loss": 0.7544, "grad_norm": 0.6386259198188782, "learning_rate": 0.0002, "epoch": 1.74763559938566, "step": 10810}, {"loss": 0.7697, "grad_norm": 0.6529015302658081, "learning_rate": 0.0002, "epoch": 1.7492522835664053, "step": 10820}, {"loss": 0.7219, "grad_norm": 0.5664082765579224, "learning_rate": 0.0002, "epoch": 1.7508689677471505, "step": 10830}, {"loss": 0.7586, "grad_norm": 0.7532684206962585, "learning_rate": 0.0002, "epoch": 1.7524856519278957, "step": 10840}, {"loss": 0.6919, "grad_norm": 0.77171391248703, "learning_rate": 0.0002, "epoch": 1.7541023361086412, "step": 10850}, {"loss": 0.785, "grad_norm": 0.7255431413650513, "learning_rate": 0.0002, "epoch": 1.7557190202893864, "step": 10860}, {"loss": 0.7458, "grad_norm": 0.763083279132843, "learning_rate": 0.0002, "epoch": 1.7573357044701319, "step": 10870}, {"loss": 0.7846, "grad_norm": 0.6042402982711792, "learning_rate": 0.0002, "epoch": 1.758952388650877, "step": 10880}, {"loss": 0.7027, "grad_norm": 0.7642518281936646, "learning_rate": 0.0002, "epoch": 1.7605690728316223, "step": 10890}, {"loss": 0.746, "grad_norm": 0.6347904801368713, "learning_rate": 0.0002, "epoch": 1.7621857570123676, "step": 10900}, {"loss": 0.7458, "grad_norm": 0.5371627807617188, "learning_rate": 0.0002, "epoch": 1.7638024411931128, "step": 10910}, {"loss": 0.7466, "grad_norm": 0.6840225458145142, "learning_rate": 0.0002, "epoch": 1.7654191253738583, "step": 10920}, {"loss": 0.725, "grad_norm": 0.5288469195365906, "learning_rate": 0.0002, "epoch": 1.7670358095546035, "step": 10930}, {"loss": 0.7863, "grad_norm": 0.69020676612854, "learning_rate": 0.0002, "epoch": 1.768652493735349, "step": 10940}, {"loss": 0.7468, "grad_norm": 0.5943242311477661, "learning_rate": 0.0002, "epoch": 1.7702691779160942, "step": 10950}, {"loss": 0.7244, "grad_norm": 0.5616418123245239, "learning_rate": 0.0002, "epoch": 1.7718858620968394, "step": 10960}, {"loss": 0.7137, "grad_norm": 0.7209470868110657, "learning_rate": 0.0002, "epoch": 1.7735025462775846, "step": 10970}, {"loss": 0.7459, "grad_norm": 0.6657957434654236, "learning_rate": 0.0002, "epoch": 1.7751192304583299, "step": 10980}, {"loss": 0.7076, "grad_norm": 0.6469064950942993, "learning_rate": 0.0002, "epoch": 1.776735914639075, "step": 10990}, {"loss": 0.7321, "grad_norm": 0.6615678071975708, "learning_rate": 0.0002, "epoch": 1.7783525988198206, "step": 11000}, {"loss": 0.747, "grad_norm": 0.6722439527511597, "learning_rate": 0.0002, "epoch": 1.779969283000566, "step": 11010}, {"loss": 0.7302, "grad_norm": 0.634136974811554, "learning_rate": 0.0002, "epoch": 1.7815859671813112, "step": 11020}, {"loss": 0.8105, "grad_norm": 0.6024377346038818, "learning_rate": 0.0002, "epoch": 1.7832026513620565, "step": 11030}, {"loss": 0.7855, "grad_norm": 0.6909403800964355, "learning_rate": 0.0002, "epoch": 1.7848193355428017, "step": 11040}, {"loss": 0.7471, "grad_norm": 0.7148767709732056, "learning_rate": 0.0002, "epoch": 1.786436019723547, "step": 11050}, {"loss": 0.7145, "grad_norm": 0.7442979216575623, "learning_rate": 0.0002, "epoch": 1.7880527039042922, "step": 11060}, {"loss": 0.7215, "grad_norm": 0.6830431818962097, "learning_rate": 0.0002, "epoch": 1.7896693880850376, "step": 11070}, {"loss": 0.7625, "grad_norm": 0.9172667264938354, "learning_rate": 0.0002, "epoch": 1.7912860722657828, "step": 11080}, {"loss": 0.76, "grad_norm": 0.6799490451812744, "learning_rate": 0.0002, "epoch": 1.7929027564465283, "step": 11090}, {"loss": 0.7716, "grad_norm": 0.7617024779319763, "learning_rate": 0.0002, "epoch": 1.7945194406272735, "step": 11100}, {"loss": 0.7586, "grad_norm": 0.7701810002326965, "learning_rate": 0.0002, "epoch": 1.7961361248080188, "step": 11110}, {"loss": 0.7843, "grad_norm": 0.7454385757446289, "learning_rate": 0.0002, "epoch": 1.797752808988764, "step": 11120}, {"loss": 0.7873, "grad_norm": 0.6121436953544617, "learning_rate": 0.0002, "epoch": 1.7993694931695092, "step": 11130}, {"loss": 0.7305, "grad_norm": 0.6237571835517883, "learning_rate": 0.0002, "epoch": 1.8009861773502547, "step": 11140}, {"loss": 0.6827, "grad_norm": 0.6818515658378601, "learning_rate": 0.0002, "epoch": 1.802602861531, "step": 11150}, {"loss": 0.6876, "grad_norm": 0.7768308520317078, "learning_rate": 0.0002, "epoch": 1.8042195457117454, "step": 11160}, {"loss": 0.7533, "grad_norm": 0.6875537633895874, "learning_rate": 0.0002, "epoch": 1.8058362298924906, "step": 11170}, {"loss": 0.761, "grad_norm": 0.7950584888458252, "learning_rate": 0.0002, "epoch": 1.8074529140732358, "step": 11180}, {"loss": 0.7623, "grad_norm": 0.8210248351097107, "learning_rate": 0.0002, "epoch": 1.809069598253981, "step": 11190}, {"loss": 0.7556, "grad_norm": 0.6674110889434814, "learning_rate": 0.0002, "epoch": 1.8106862824347263, "step": 11200}, {"loss": 0.7663, "grad_norm": 0.6261674761772156, "learning_rate": 0.0002, "epoch": 1.8123029666154715, "step": 11210}, {"loss": 0.7122, "grad_norm": 0.6484741568565369, "learning_rate": 0.0002, "epoch": 1.813919650796217, "step": 11220}, {"loss": 0.7718, "grad_norm": 0.6231244206428528, "learning_rate": 0.0002, "epoch": 1.8155363349769622, "step": 11230}, {"loss": 0.7152, "grad_norm": 0.7243146896362305, "learning_rate": 0.0002, "epoch": 1.8171530191577077, "step": 11240}, {"loss": 0.7448, "grad_norm": 0.6776193380355835, "learning_rate": 0.0002, "epoch": 1.818769703338453, "step": 11250}, {"loss": 0.7317, "grad_norm": 0.5973618030548096, "learning_rate": 0.0002, "epoch": 1.8203863875191981, "step": 11260}, {"loss": 0.7961, "grad_norm": 0.6451361179351807, "learning_rate": 0.0002, "epoch": 1.8220030716999434, "step": 11270}, {"loss": 0.7611, "grad_norm": 0.5963068008422852, "learning_rate": 0.0002, "epoch": 1.8236197558806886, "step": 11280}, {"loss": 0.7466, "grad_norm": 0.536902129650116, "learning_rate": 0.0002, "epoch": 1.825236440061434, "step": 11290}, {"loss": 0.708, "grad_norm": 0.6993787288665771, "learning_rate": 0.0002, "epoch": 1.8268531242421793, "step": 11300}, {"loss": 0.7153, "grad_norm": 0.6135255098342896, "learning_rate": 0.0002, "epoch": 1.8284698084229247, "step": 11310}, {"loss": 0.7423, "grad_norm": 0.6057423949241638, "learning_rate": 0.0002, "epoch": 1.83008649260367, "step": 11320}, {"loss": 0.735, "grad_norm": 0.6598812341690063, "learning_rate": 0.0002, "epoch": 1.8317031767844152, "step": 11330}, {"loss": 0.7278, "grad_norm": 0.6075948476791382, "learning_rate": 0.0002, "epoch": 1.8333198609651604, "step": 11340}, {"loss": 0.7846, "grad_norm": 0.7065447568893433, "learning_rate": 0.0002, "epoch": 1.8349365451459057, "step": 11350}, {"loss": 0.7365, "grad_norm": 0.680526614189148, "learning_rate": 0.0002, "epoch": 1.8365532293266509, "step": 11360}, {"loss": 0.7152, "grad_norm": 0.6356695294380188, "learning_rate": 0.0002, "epoch": 1.8381699135073963, "step": 11370}, {"loss": 0.721, "grad_norm": 0.6399052143096924, "learning_rate": 0.0002, "epoch": 1.8397865976881416, "step": 11380}, {"loss": 0.7618, "grad_norm": 0.6125704050064087, "learning_rate": 0.0002, "epoch": 1.841403281868887, "step": 11390}, {"loss": 0.755, "grad_norm": 0.7124643325805664, "learning_rate": 0.0002, "epoch": 1.8430199660496323, "step": 11400}, {"loss": 0.7972, "grad_norm": 0.6099604964256287, "learning_rate": 0.0002, "epoch": 1.8446366502303775, "step": 11410}, {"loss": 0.7187, "grad_norm": 0.7338208556175232, "learning_rate": 0.0002, "epoch": 1.8462533344111227, "step": 11420}, {"loss": 0.7007, "grad_norm": 0.7534668445587158, "learning_rate": 0.0002, "epoch": 1.847870018591868, "step": 11430}, {"loss": 0.7464, "grad_norm": 0.6135470271110535, "learning_rate": 0.0002, "epoch": 1.8494867027726134, "step": 11440}, {"loss": 0.7955, "grad_norm": 0.6229309439659119, "learning_rate": 0.0002, "epoch": 1.8511033869533586, "step": 11450}, {"loss": 0.7594, "grad_norm": 0.706423282623291, "learning_rate": 0.0002, "epoch": 1.852720071134104, "step": 11460}, {"loss": 0.7411, "grad_norm": 0.5460049510002136, "learning_rate": 0.0002, "epoch": 1.8543367553148493, "step": 11470}, {"loss": 0.7416, "grad_norm": 0.6616711020469666, "learning_rate": 0.0002, "epoch": 1.8559534394955945, "step": 11480}, {"loss": 0.729, "grad_norm": 0.6372783184051514, "learning_rate": 0.0002, "epoch": 1.8575701236763398, "step": 11490}, {"loss": 0.7333, "grad_norm": 0.7162668108940125, "learning_rate": 0.0002, "epoch": 1.859186807857085, "step": 11500}, {"loss": 0.7747, "grad_norm": 0.6605209708213806, "learning_rate": 0.0002, "epoch": 1.8608034920378305, "step": 11510}, {"loss": 0.7258, "grad_norm": 0.6933956742286682, "learning_rate": 0.0002, "epoch": 1.8624201762185757, "step": 11520}, {"loss": 0.7243, "grad_norm": 0.6582090854644775, "learning_rate": 0.0002, "epoch": 1.8640368603993211, "step": 11530}, {"loss": 0.7313, "grad_norm": 0.6416500806808472, "learning_rate": 0.0002, "epoch": 1.8656535445800664, "step": 11540}, {"loss": 0.7372, "grad_norm": 0.5434312224388123, "learning_rate": 0.0002, "epoch": 1.8672702287608116, "step": 11550}, {"loss": 0.7635, "grad_norm": 0.6827567219734192, "learning_rate": 0.0002, "epoch": 1.8688869129415568, "step": 11560}, {"loss": 0.7137, "grad_norm": 0.7354370951652527, "learning_rate": 0.0002, "epoch": 1.870503597122302, "step": 11570}, {"loss": 0.7526, "grad_norm": 0.590372622013092, "learning_rate": 0.0002, "epoch": 1.8721202813030473, "step": 11580}, {"loss": 0.731, "grad_norm": 0.853183925151825, "learning_rate": 0.0002, "epoch": 1.8737369654837928, "step": 11590}, {"loss": 0.7487, "grad_norm": 0.822678804397583, "learning_rate": 0.0002, "epoch": 1.875353649664538, "step": 11600}, {"loss": 0.7427, "grad_norm": 0.6591550707817078, "learning_rate": 0.0002, "epoch": 1.8769703338452834, "step": 11610}, {"loss": 0.7054, "grad_norm": 0.7475301623344421, "learning_rate": 0.0002, "epoch": 1.8785870180260287, "step": 11620}, {"loss": 0.811, "grad_norm": 0.6390765309333801, "learning_rate": 0.0002, "epoch": 1.880203702206774, "step": 11630}, {"loss": 0.7531, "grad_norm": 0.6589758992195129, "learning_rate": 0.0002, "epoch": 1.8818203863875191, "step": 11640}, {"loss": 0.7475, "grad_norm": 0.6765508651733398, "learning_rate": 0.0002, "epoch": 1.8834370705682644, "step": 11650}, {"loss": 0.738, "grad_norm": 0.6527857780456543, "learning_rate": 0.0002, "epoch": 1.8850537547490098, "step": 11660}, {"loss": 0.7504, "grad_norm": 0.6642923951148987, "learning_rate": 0.0002, "epoch": 1.886670438929755, "step": 11670}, {"loss": 0.7701, "grad_norm": 0.6945584416389465, "learning_rate": 0.0002, "epoch": 1.8882871231105005, "step": 11680}, {"loss": 0.7711, "grad_norm": 0.694018542766571, "learning_rate": 0.0002, "epoch": 1.8899038072912457, "step": 11690}, {"loss": 0.7195, "grad_norm": 0.7237417101860046, "learning_rate": 0.0002, "epoch": 1.891520491471991, "step": 11700}, {"loss": 0.7491, "grad_norm": 0.7401309609413147, "learning_rate": 0.0002, "epoch": 1.8931371756527362, "step": 11710}, {"loss": 0.805, "grad_norm": 0.6537784337997437, "learning_rate": 0.0002, "epoch": 1.8947538598334814, "step": 11720}, {"loss": 0.793, "grad_norm": 0.7398539185523987, "learning_rate": 0.0002, "epoch": 1.8963705440142267, "step": 11730}, {"loss": 0.7561, "grad_norm": 0.6696075797080994, "learning_rate": 0.0002, "epoch": 1.8979872281949721, "step": 11740}, {"loss": 0.7353, "grad_norm": 0.6014142036437988, "learning_rate": 0.0002, "epoch": 1.8996039123757174, "step": 11750}, {"loss": 0.7714, "grad_norm": 0.7023524641990662, "learning_rate": 0.0002, "epoch": 1.9012205965564628, "step": 11760}, {"loss": 0.7088, "grad_norm": 0.739973783493042, "learning_rate": 0.0002, "epoch": 1.902837280737208, "step": 11770}, {"loss": 0.7848, "grad_norm": 0.5576770901679993, "learning_rate": 0.0002, "epoch": 1.9044539649179533, "step": 11780}, {"loss": 0.7483, "grad_norm": 0.6907393932342529, "learning_rate": 0.0002, "epoch": 1.9060706490986985, "step": 11790}, {"loss": 0.7827, "grad_norm": 0.6934581995010376, "learning_rate": 0.0002, "epoch": 1.9076873332794437, "step": 11800}, {"loss": 0.7199, "grad_norm": 0.591774582862854, "learning_rate": 0.0002, "epoch": 1.9093040174601892, "step": 11810}, {"loss": 0.7333, "grad_norm": 0.6249791383743286, "learning_rate": 0.0002, "epoch": 1.9109207016409344, "step": 11820}, {"loss": 0.7581, "grad_norm": 0.6755744218826294, "learning_rate": 0.0002, "epoch": 1.9125373858216799, "step": 11830}, {"loss": 0.696, "grad_norm": 0.7286285161972046, "learning_rate": 0.0002, "epoch": 1.914154070002425, "step": 11840}, {"loss": 0.7509, "grad_norm": 0.7867850065231323, "learning_rate": 0.0002, "epoch": 1.9157707541831703, "step": 11850}, {"loss": 0.735, "grad_norm": 0.6283972859382629, "learning_rate": 0.0002, "epoch": 1.9173874383639156, "step": 11860}, {"loss": 0.7296, "grad_norm": 0.605823814868927, "learning_rate": 0.0002, "epoch": 1.9190041225446608, "step": 11870}, {"loss": 0.6598, "grad_norm": 0.5927976965904236, "learning_rate": 0.0002, "epoch": 1.920620806725406, "step": 11880}, {"loss": 0.7649, "grad_norm": 0.5974002480506897, "learning_rate": 0.0002, "epoch": 1.9222374909061515, "step": 11890}, {"loss": 0.7843, "grad_norm": 0.7091866135597229, "learning_rate": 0.0002, "epoch": 1.923854175086897, "step": 11900}, {"loss": 0.775, "grad_norm": 0.72496497631073, "learning_rate": 0.0002, "epoch": 1.9254708592676422, "step": 11910}, {"loss": 0.7153, "grad_norm": 0.6131896376609802, "learning_rate": 0.0002, "epoch": 1.9270875434483874, "step": 11920}, {"loss": 0.7228, "grad_norm": 0.6556436419487, "learning_rate": 0.0002, "epoch": 1.9287042276291326, "step": 11930}, {"loss": 0.7319, "grad_norm": 0.622932493686676, "learning_rate": 0.0002, "epoch": 1.9303209118098779, "step": 11940}, {"loss": 0.7592, "grad_norm": 0.6618631482124329, "learning_rate": 0.0002, "epoch": 1.931937595990623, "step": 11950}, {"loss": 0.8332, "grad_norm": 0.630966305732727, "learning_rate": 0.0002, "epoch": 1.9335542801713685, "step": 11960}, {"loss": 0.6854, "grad_norm": 0.6336734890937805, "learning_rate": 0.0002, "epoch": 1.9351709643521138, "step": 11970}, {"loss": 0.7433, "grad_norm": 0.655403196811676, "learning_rate": 0.0002, "epoch": 1.9367876485328592, "step": 11980}, {"loss": 0.7282, "grad_norm": 0.5640574097633362, "learning_rate": 0.0002, "epoch": 1.9384043327136045, "step": 11990}, {"loss": 0.7289, "grad_norm": 0.6322951316833496, "learning_rate": 0.0002, "epoch": 1.9400210168943497, "step": 12000}, {"loss": 0.7627, "grad_norm": 0.615703821182251, "learning_rate": 0.0002, "epoch": 1.941637701075095, "step": 12010}, {"loss": 0.786, "grad_norm": 0.6487536430358887, "learning_rate": 0.0002, "epoch": 1.9432543852558402, "step": 12020}, {"loss": 0.7435, "grad_norm": 0.9209630489349365, "learning_rate": 0.0002, "epoch": 1.9448710694365856, "step": 12030}, {"loss": 0.7274, "grad_norm": 0.67485511302948, "learning_rate": 0.0002, "epoch": 1.9464877536173308, "step": 12040}, {"loss": 0.7551, "grad_norm": 0.6831230521202087, "learning_rate": 0.0002, "epoch": 1.9481044377980763, "step": 12050}, {"loss": 0.7546, "grad_norm": 0.6578302383422852, "learning_rate": 0.0002, "epoch": 1.9497211219788215, "step": 12060}, {"loss": 0.6989, "grad_norm": 0.9975938200950623, "learning_rate": 0.0002, "epoch": 1.9513378061595668, "step": 12070}, {"loss": 0.7952, "grad_norm": 0.6637365221977234, "learning_rate": 0.0002, "epoch": 1.952954490340312, "step": 12080}, {"loss": 0.7482, "grad_norm": 0.605707049369812, "learning_rate": 0.0002, "epoch": 1.9545711745210572, "step": 12090}, {"loss": 0.7768, "grad_norm": 0.6584440469741821, "learning_rate": 0.0002, "epoch": 1.9561878587018025, "step": 12100}, {"loss": 0.7187, "grad_norm": 0.6070835590362549, "learning_rate": 0.0002, "epoch": 1.957804542882548, "step": 12110}, {"loss": 0.7491, "grad_norm": 0.7862601280212402, "learning_rate": 0.0002, "epoch": 1.9594212270632931, "step": 12120}, {"loss": 0.7972, "grad_norm": 0.8175255060195923, "learning_rate": 0.0002, "epoch": 1.9610379112440386, "step": 12130}, {"loss": 0.7242, "grad_norm": 0.5648472905158997, "learning_rate": 0.0002, "epoch": 1.9626545954247838, "step": 12140}, {"loss": 0.7321, "grad_norm": 0.6591973304748535, "learning_rate": 0.0002, "epoch": 1.964271279605529, "step": 12150}, {"loss": 0.739, "grad_norm": 0.5960676074028015, "learning_rate": 0.0002, "epoch": 1.9658879637862743, "step": 12160}, {"loss": 0.7254, "grad_norm": 0.7272544503211975, "learning_rate": 0.0002, "epoch": 1.9675046479670195, "step": 12170}, {"loss": 0.7376, "grad_norm": 0.7176699042320251, "learning_rate": 0.0002, "epoch": 1.969121332147765, "step": 12180}, {"loss": 0.7525, "grad_norm": 0.6927123665809631, "learning_rate": 0.0002, "epoch": 1.9707380163285102, "step": 12190}, {"loss": 0.7318, "grad_norm": 0.5536034107208252, "learning_rate": 0.0002, "epoch": 1.9723547005092557, "step": 12200}, {"loss": 0.7737, "grad_norm": 0.8348390460014343, "learning_rate": 0.0002, "epoch": 1.9739713846900009, "step": 12210}, {"loss": 0.7494, "grad_norm": 0.6591181755065918, "learning_rate": 0.0002, "epoch": 1.9755880688707461, "step": 12220}, {"loss": 0.763, "grad_norm": 1.0624109506607056, "learning_rate": 0.0002, "epoch": 1.9772047530514913, "step": 12230}, {"loss": 0.7541, "grad_norm": 0.9265586137771606, "learning_rate": 0.0002, "epoch": 1.9788214372322366, "step": 12240}, {"loss": 0.7533, "grad_norm": 0.5998196005821228, "learning_rate": 0.0002, "epoch": 1.9804381214129818, "step": 12250}, {"loss": 0.7225, "grad_norm": 0.6960851550102234, "learning_rate": 0.0002, "epoch": 1.9820548055937273, "step": 12260}, {"loss": 0.7398, "grad_norm": 0.7674502730369568, "learning_rate": 0.0002, "epoch": 1.9836714897744727, "step": 12270}, {"loss": 0.7185, "grad_norm": 0.6407275795936584, "learning_rate": 0.0002, "epoch": 1.985288173955218, "step": 12280}, {"loss": 0.7382, "grad_norm": 0.6673079133033752, "learning_rate": 0.0002, "epoch": 1.9869048581359632, "step": 12290}, {"loss": 0.7326, "grad_norm": 0.6989844441413879, "learning_rate": 0.0002, "epoch": 1.9885215423167084, "step": 12300}, {"loss": 0.7559, "grad_norm": 0.7564442157745361, "learning_rate": 0.0002, "epoch": 1.9901382264974536, "step": 12310}, {"loss": 0.7719, "grad_norm": 0.6385478973388672, "learning_rate": 0.0002, "epoch": 1.9917549106781989, "step": 12320}, {"loss": 0.7369, "grad_norm": 0.7193717956542969, "learning_rate": 0.0002, "epoch": 1.9933715948589443, "step": 12330}, {"loss": 0.7583, "grad_norm": 0.7987112402915955, "learning_rate": 0.0002, "epoch": 1.9949882790396896, "step": 12340}, {"loss": 0.7793, "grad_norm": 0.7260826826095581, "learning_rate": 0.0002, "epoch": 1.996604963220435, "step": 12350}, {"loss": 0.7505, "grad_norm": 0.7968255281448364, "learning_rate": 0.0002, "epoch": 1.9982216474011802, "step": 12360}, {"loss": 0.717, "grad_norm": 0.6893062591552734, "learning_rate": 0.0002, "epoch": 1.9998383315819255, "step": 12370}, {"eval_loss": 1.1044032573699951, "eval_runtime": 122.1508, "eval_samples_per_second": 6.001, "eval_steps_per_second": 0.753, "epoch": 2.0, "step": 12371}, {"loss": 0.6604, "grad_norm": 0.7775409817695618, "learning_rate": 0.0002, "epoch": 2.0014550157626707, "step": 12380}, {"loss": 0.6845, "grad_norm": 0.76218581199646, "learning_rate": 0.0002, "epoch": 2.003071699943416, "step": 12390}, {"loss": 0.6909, "grad_norm": 0.5677764415740967, "learning_rate": 0.0002, "epoch": 2.004688384124161, "step": 12400}, {"loss": 0.6584, "grad_norm": 0.808442234992981, "learning_rate": 0.0002, "epoch": 2.006305068304907, "step": 12410}, {"loss": 0.659, "grad_norm": 0.7144765257835388, "learning_rate": 0.0002, "epoch": 2.007921752485652, "step": 12420}, {"loss": 0.6666, "grad_norm": 0.6914031505584717, "learning_rate": 0.0002, "epoch": 2.0095384366663973, "step": 12430}, {"loss": 0.6596, "grad_norm": 0.7581454515457153, "learning_rate": 0.0002, "epoch": 2.0111551208471425, "step": 12440}, {"loss": 0.6785, "grad_norm": 0.8388504981994629, "learning_rate": 0.0002, "epoch": 2.0127718050278878, "step": 12450}, {"loss": 0.6942, "grad_norm": 0.6716406941413879, "learning_rate": 0.0002, "epoch": 2.014388489208633, "step": 12460}, {"loss": 0.6441, "grad_norm": 0.898902416229248, "learning_rate": 0.0002, "epoch": 2.0160051733893782, "step": 12470}, {"loss": 0.6655, "grad_norm": 0.6432679891586304, "learning_rate": 0.0002, "epoch": 2.0176218575701235, "step": 12480}, {"loss": 0.6521, "grad_norm": 0.8021109104156494, "learning_rate": 0.0002, "epoch": 2.019238541750869, "step": 12490}, {"loss": 0.6581, "grad_norm": 0.7039216756820679, "learning_rate": 0.0002, "epoch": 2.0208552259316144, "step": 12500}, {"loss": 0.6521, "grad_norm": 0.646531879901886, "learning_rate": 0.0002, "epoch": 2.0224719101123596, "step": 12510}, {"loss": 0.6302, "grad_norm": 0.783704400062561, "learning_rate": 0.0002, "epoch": 2.024088594293105, "step": 12520}, {"loss": 0.6288, "grad_norm": 0.8805046677589417, "learning_rate": 0.0002, "epoch": 2.02570527847385, "step": 12530}, {"loss": 0.6288, "grad_norm": 0.7289270758628845, "learning_rate": 0.0002, "epoch": 2.0273219626545953, "step": 12540}, {"loss": 0.6663, "grad_norm": 0.71653151512146, "learning_rate": 0.0002, "epoch": 2.0289386468353405, "step": 12550}, {"loss": 0.625, "grad_norm": 0.73281329870224, "learning_rate": 0.0002, "epoch": 2.030555331016086, "step": 12560}, {"loss": 0.6448, "grad_norm": 0.6657090187072754, "learning_rate": 0.0002, "epoch": 2.0321720151968314, "step": 12570}, {"loss": 0.6983, "grad_norm": 0.8241133093833923, "learning_rate": 0.0002, "epoch": 2.0337886993775767, "step": 12580}, {"loss": 0.6488, "grad_norm": 0.5834135413169861, "learning_rate": 0.0002, "epoch": 2.035405383558322, "step": 12590}, {"loss": 0.6188, "grad_norm": 0.84502112865448, "learning_rate": 0.0002, "epoch": 2.037022067739067, "step": 12600}, {"loss": 0.6349, "grad_norm": 0.8952481746673584, "learning_rate": 0.0002, "epoch": 2.0386387519198124, "step": 12610}, {"loss": 0.6923, "grad_norm": 0.7801461815834045, "learning_rate": 0.0002, "epoch": 2.0402554361005576, "step": 12620}, {"loss": 0.6176, "grad_norm": 0.6788367033004761, "learning_rate": 0.0002, "epoch": 2.041872120281303, "step": 12630}, {"loss": 0.6162, "grad_norm": 0.7241756319999695, "learning_rate": 0.0002, "epoch": 2.0434888044620485, "step": 12640}, {"loss": 0.655, "grad_norm": 0.6933388113975525, "learning_rate": 0.0002, "epoch": 2.0451054886427937, "step": 12650}, {"loss": 0.6431, "grad_norm": 0.8029746413230896, "learning_rate": 0.0002, "epoch": 2.046722172823539, "step": 12660}, {"loss": 0.7164, "grad_norm": 0.946399986743927, "learning_rate": 0.0002, "epoch": 2.048338857004284, "step": 12670}, {"loss": 0.638, "grad_norm": 0.7072678804397583, "learning_rate": 0.0002, "epoch": 2.0499555411850294, "step": 12680}, {"loss": 0.6487, "grad_norm": 0.6810618042945862, "learning_rate": 0.0002, "epoch": 2.0515722253657747, "step": 12690}, {"loss": 0.6554, "grad_norm": 0.7661160230636597, "learning_rate": 0.0002, "epoch": 2.05318890954652, "step": 12700}, {"loss": 0.6799, "grad_norm": 0.6350653767585754, "learning_rate": 0.0002, "epoch": 2.0548055937272656, "step": 12710}, {"loss": 0.6654, "grad_norm": 0.861890971660614, "learning_rate": 0.0002, "epoch": 2.056422277908011, "step": 12720}, {"loss": 0.6286, "grad_norm": 0.6489875912666321, "learning_rate": 0.0002, "epoch": 2.058038962088756, "step": 12730}, {"loss": 0.6811, "grad_norm": 0.8268506526947021, "learning_rate": 0.0002, "epoch": 2.0596556462695013, "step": 12740}, {"loss": 0.6524, "grad_norm": 0.607679545879364, "learning_rate": 0.0002, "epoch": 2.0612723304502465, "step": 12750}, {"loss": 0.6649, "grad_norm": 0.6754153370857239, "learning_rate": 0.0002, "epoch": 2.0628890146309917, "step": 12760}, {"loss": 0.6549, "grad_norm": 0.7263124585151672, "learning_rate": 0.0002, "epoch": 2.064505698811737, "step": 12770}, {"loss": 0.6189, "grad_norm": 0.6986154317855835, "learning_rate": 0.0002, "epoch": 2.0661223829924826, "step": 12780}, {"loss": 0.6723, "grad_norm": 0.7768576741218567, "learning_rate": 0.0002, "epoch": 2.067739067173228, "step": 12790}, {"loss": 0.677, "grad_norm": 0.7546762824058533, "learning_rate": 0.0002, "epoch": 2.069355751353973, "step": 12800}, {"loss": 0.6485, "grad_norm": 0.7588880062103271, "learning_rate": 0.0002, "epoch": 2.0709724355347183, "step": 12810}, {"loss": 0.6989, "grad_norm": 0.7457242608070374, "learning_rate": 0.0002, "epoch": 2.0725891197154636, "step": 12820}, {"loss": 0.6489, "grad_norm": 0.6983516812324524, "learning_rate": 0.0002, "epoch": 2.074205803896209, "step": 12830}, {"loss": 0.651, "grad_norm": 0.7950928807258606, "learning_rate": 0.0002, "epoch": 2.075822488076954, "step": 12840}, {"loss": 0.6603, "grad_norm": 0.9248087406158447, "learning_rate": 0.0002, "epoch": 2.0774391722576993, "step": 12850}, {"loss": 0.6847, "grad_norm": 0.7229493260383606, "learning_rate": 0.0002, "epoch": 2.079055856438445, "step": 12860}, {"loss": 0.6702, "grad_norm": 0.5710847973823547, "learning_rate": 0.0002, "epoch": 2.08067254061919, "step": 12870}, {"loss": 0.6974, "grad_norm": 0.9580423831939697, "learning_rate": 0.0002, "epoch": 2.0822892247999354, "step": 12880}, {"loss": 0.6341, "grad_norm": 0.7399665713310242, "learning_rate": 0.0002, "epoch": 2.0839059089806806, "step": 12890}, {"loss": 0.6993, "grad_norm": 0.7981410622596741, "learning_rate": 0.0002, "epoch": 2.085522593161426, "step": 12900}, {"loss": 0.6976, "grad_norm": 0.870759904384613, "learning_rate": 0.0002, "epoch": 2.087139277342171, "step": 12910}, {"loss": 0.7194, "grad_norm": 0.7001481652259827, "learning_rate": 0.0002, "epoch": 2.0887559615229163, "step": 12920}, {"loss": 0.6383, "grad_norm": 0.6745418310165405, "learning_rate": 0.0002, "epoch": 2.090372645703662, "step": 12930}, {"loss": 0.6519, "grad_norm": 0.7739067673683167, "learning_rate": 0.0002, "epoch": 2.0919893298844072, "step": 12940}, {"loss": 0.6856, "grad_norm": 0.6742934584617615, "learning_rate": 0.0002, "epoch": 2.0936060140651525, "step": 12950}, {"loss": 0.6279, "grad_norm": 0.7270349860191345, "learning_rate": 0.0002, "epoch": 2.0952226982458977, "step": 12960}, {"loss": 0.6783, "grad_norm": 0.7150624394416809, "learning_rate": 0.0002, "epoch": 2.096839382426643, "step": 12970}, {"loss": 0.6093, "grad_norm": 0.7734767198562622, "learning_rate": 0.0002, "epoch": 2.098456066607388, "step": 12980}, {"loss": 0.6534, "grad_norm": 0.7618662118911743, "learning_rate": 0.0002, "epoch": 2.1000727507881334, "step": 12990}, {"loss": 0.6707, "grad_norm": 0.6557944416999817, "learning_rate": 0.0002, "epoch": 2.101689434968879, "step": 13000}, {"loss": 0.7268, "grad_norm": 0.8786448240280151, "learning_rate": 0.0002, "epoch": 2.1033061191496243, "step": 13010}, {"loss": 0.6677, "grad_norm": 0.6878724098205566, "learning_rate": 0.0002, "epoch": 2.1049228033303695, "step": 13020}, {"loss": 0.6824, "grad_norm": 0.822318971157074, "learning_rate": 0.0002, "epoch": 2.1065394875111147, "step": 13030}, {"loss": 0.6228, "grad_norm": 0.831468939781189, "learning_rate": 0.0002, "epoch": 2.10815617169186, "step": 13040}, {"loss": 0.6511, "grad_norm": 0.7699505686759949, "learning_rate": 0.0002, "epoch": 2.109772855872605, "step": 13050}, {"loss": 0.6671, "grad_norm": 0.7559016346931458, "learning_rate": 0.0002, "epoch": 2.1113895400533504, "step": 13060}, {"loss": 0.6215, "grad_norm": 0.6942209601402283, "learning_rate": 0.0002, "epoch": 2.1130062242340957, "step": 13070}, {"loss": 0.6449, "grad_norm": 0.6098947525024414, "learning_rate": 0.0002, "epoch": 2.1146229084148414, "step": 13080}, {"loss": 0.7091, "grad_norm": 0.6499016284942627, "learning_rate": 0.0002, "epoch": 2.1162395925955866, "step": 13090}, {"loss": 0.6247, "grad_norm": 0.7719953060150146, "learning_rate": 0.0002, "epoch": 2.117856276776332, "step": 13100}, {"loss": 0.6064, "grad_norm": 0.6708134412765503, "learning_rate": 0.0002, "epoch": 2.119472960957077, "step": 13110}, {"loss": 0.6056, "grad_norm": 0.8119585514068604, "learning_rate": 0.0002, "epoch": 2.1210896451378223, "step": 13120}, {"loss": 0.6628, "grad_norm": 0.6947157979011536, "learning_rate": 0.0002, "epoch": 2.1227063293185675, "step": 13130}, {"loss": 0.6375, "grad_norm": 0.8831837773323059, "learning_rate": 0.0002, "epoch": 2.1243230134993127, "step": 13140}, {"loss": 0.6997, "grad_norm": 0.7266910672187805, "learning_rate": 0.0002, "epoch": 2.1259396976800584, "step": 13150}, {"loss": 0.6446, "grad_norm": 0.8864351511001587, "learning_rate": 0.0002, "epoch": 2.1275563818608036, "step": 13160}, {"loss": 0.6762, "grad_norm": 0.8104248046875, "learning_rate": 0.0002, "epoch": 2.129173066041549, "step": 13170}, {"loss": 0.6581, "grad_norm": 0.6077079772949219, "learning_rate": 0.0002, "epoch": 2.130789750222294, "step": 13180}, {"loss": 0.6572, "grad_norm": 0.6874213814735413, "learning_rate": 0.0002, "epoch": 2.1324064344030393, "step": 13190}, {"loss": 0.642, "grad_norm": 0.7134367823600769, "learning_rate": 0.0002, "epoch": 2.1340231185837846, "step": 13200}, {"loss": 0.7016, "grad_norm": 0.6101235151290894, "learning_rate": 0.0002, "epoch": 2.13563980276453, "step": 13210}, {"loss": 0.6529, "grad_norm": 0.6042411923408508, "learning_rate": 0.0002, "epoch": 2.137256486945275, "step": 13220}, {"loss": 0.7179, "grad_norm": 0.914601743221283, "learning_rate": 0.0002, "epoch": 2.1388731711260207, "step": 13230}, {"loss": 0.6513, "grad_norm": 0.7104284167289734, "learning_rate": 0.0002, "epoch": 2.140489855306766, "step": 13240}, {"loss": 0.6607, "grad_norm": 0.664395272731781, "learning_rate": 0.0002, "epoch": 2.142106539487511, "step": 13250}, {"loss": 0.7211, "grad_norm": 0.6991241574287415, "learning_rate": 0.0002, "epoch": 2.1437232236682564, "step": 13260}, {"loss": 0.6484, "grad_norm": 0.5469560623168945, "learning_rate": 0.0002, "epoch": 2.1453399078490016, "step": 13270}, {"loss": 0.6765, "grad_norm": 0.8454998135566711, "learning_rate": 0.0002, "epoch": 2.146956592029747, "step": 13280}, {"loss": 0.6683, "grad_norm": 0.7088868618011475, "learning_rate": 0.0002, "epoch": 2.148573276210492, "step": 13290}, {"loss": 0.6835, "grad_norm": 0.7002687454223633, "learning_rate": 0.0002, "epoch": 2.1501899603912378, "step": 13300}, {"loss": 0.6399, "grad_norm": 0.7785214781761169, "learning_rate": 0.0002, "epoch": 2.151806644571983, "step": 13310}, {"loss": 0.67, "grad_norm": 0.8049132227897644, "learning_rate": 0.0002, "epoch": 2.1534233287527282, "step": 13320}, {"loss": 0.6495, "grad_norm": 0.8062595129013062, "learning_rate": 0.0002, "epoch": 2.1550400129334735, "step": 13330}, {"loss": 0.6603, "grad_norm": 0.6208319067955017, "learning_rate": 0.0002, "epoch": 2.1566566971142187, "step": 13340}, {"loss": 0.6584, "grad_norm": 0.7519655823707581, "learning_rate": 0.0002, "epoch": 2.158273381294964, "step": 13350}, {"loss": 0.6457, "grad_norm": 0.7645747065544128, "learning_rate": 0.0002, "epoch": 2.159890065475709, "step": 13360}, {"loss": 0.645, "grad_norm": 0.6847302913665771, "learning_rate": 0.0002, "epoch": 2.1615067496564544, "step": 13370}, {"loss": 0.6903, "grad_norm": 0.8630441427230835, "learning_rate": 0.0002, "epoch": 2.1631234338372, "step": 13380}, {"loss": 0.6742, "grad_norm": 0.7947702407836914, "learning_rate": 0.0002, "epoch": 2.1647401180179453, "step": 13390}, {"loss": 0.7206, "grad_norm": 0.6836977005004883, "learning_rate": 0.0002, "epoch": 2.1663568021986905, "step": 13400}, {"loss": 0.6304, "grad_norm": 0.7340566515922546, "learning_rate": 0.0002, "epoch": 2.1679734863794358, "step": 13410}, {"loss": 0.6528, "grad_norm": 0.7075738906860352, "learning_rate": 0.0002, "epoch": 2.169590170560181, "step": 13420}, {"loss": 0.6585, "grad_norm": 0.7080879807472229, "learning_rate": 0.0002, "epoch": 2.1712068547409262, "step": 13430}, {"loss": 0.6615, "grad_norm": 0.6218613386154175, "learning_rate": 0.0002, "epoch": 2.1728235389216715, "step": 13440}, {"loss": 0.6488, "grad_norm": 0.8211479187011719, "learning_rate": 0.0002, "epoch": 2.174440223102417, "step": 13450}, {"loss": 0.6738, "grad_norm": 0.864466667175293, "learning_rate": 0.0002, "epoch": 2.1760569072831624, "step": 13460}, {"loss": 0.679, "grad_norm": 0.7943857908248901, "learning_rate": 0.0002, "epoch": 2.1776735914639076, "step": 13470}, {"loss": 0.6838, "grad_norm": 0.78728187084198, "learning_rate": 0.0002, "epoch": 2.179290275644653, "step": 13480}, {"loss": 0.6397, "grad_norm": 0.697527289390564, "learning_rate": 0.0002, "epoch": 2.180906959825398, "step": 13490}, {"loss": 0.669, "grad_norm": 0.8205804228782654, "learning_rate": 0.0002, "epoch": 2.1825236440061433, "step": 13500}, {"loss": 0.7227, "grad_norm": 0.8709042072296143, "learning_rate": 0.0002, "epoch": 2.1841403281868885, "step": 13510}, {"loss": 0.6313, "grad_norm": 0.6228537559509277, "learning_rate": 0.0002, "epoch": 2.1857570123676338, "step": 13520}, {"loss": 0.7025, "grad_norm": 0.9566980004310608, "learning_rate": 0.0002, "epoch": 2.1873736965483794, "step": 13530}, {"loss": 0.6755, "grad_norm": 0.7128894329071045, "learning_rate": 0.0002, "epoch": 2.1889903807291247, "step": 13540}, {"loss": 0.6827, "grad_norm": 0.6888654232025146, "learning_rate": 0.0002, "epoch": 2.19060706490987, "step": 13550}, {"loss": 0.6961, "grad_norm": 0.6444337368011475, "learning_rate": 0.0002, "epoch": 2.192223749090615, "step": 13560}, {"loss": 0.656, "grad_norm": 0.8008806705474854, "learning_rate": 0.0002, "epoch": 2.1938404332713604, "step": 13570}, {"loss": 0.7, "grad_norm": 0.8482748866081238, "learning_rate": 0.0002, "epoch": 2.1954571174521056, "step": 13580}, {"loss": 0.7326, "grad_norm": 0.8584157228469849, "learning_rate": 0.0002, "epoch": 2.197073801632851, "step": 13590}, {"loss": 0.7014, "grad_norm": 0.7513734698295593, "learning_rate": 0.0002, "epoch": 2.1986904858135965, "step": 13600}, {"loss": 0.6632, "grad_norm": 0.7864262461662292, "learning_rate": 0.0002, "epoch": 2.2003071699943417, "step": 13610}, {"loss": 0.6879, "grad_norm": 0.8493645191192627, "learning_rate": 0.0002, "epoch": 2.201923854175087, "step": 13620}, {"loss": 0.6617, "grad_norm": 0.6902140974998474, "learning_rate": 0.0002, "epoch": 2.203540538355832, "step": 13630}, {"loss": 0.6655, "grad_norm": 0.8711254596710205, "learning_rate": 0.0002, "epoch": 2.2051572225365774, "step": 13640}, {"loss": 0.6359, "grad_norm": 0.7832191586494446, "learning_rate": 0.0002, "epoch": 2.2067739067173227, "step": 13650}, {"loss": 0.6723, "grad_norm": 0.5668176412582397, "learning_rate": 0.0002, "epoch": 2.208390590898068, "step": 13660}, {"loss": 0.635, "grad_norm": 0.8648375272750854, "learning_rate": 0.0002, "epoch": 2.2100072750788136, "step": 13670}, {"loss": 0.653, "grad_norm": 0.7643089890480042, "learning_rate": 0.0002, "epoch": 2.211623959259559, "step": 13680}, {"loss": 0.6765, "grad_norm": 0.6293777823448181, "learning_rate": 0.0002, "epoch": 2.213240643440304, "step": 13690}, {"loss": 0.6842, "grad_norm": 0.6459372639656067, "learning_rate": 0.0002, "epoch": 2.2148573276210493, "step": 13700}, {"loss": 0.6526, "grad_norm": 0.7060744166374207, "learning_rate": 0.0002, "epoch": 2.2164740118017945, "step": 13710}, {"loss": 0.7101, "grad_norm": 0.674109160900116, "learning_rate": 0.0002, "epoch": 2.2180906959825397, "step": 13720}, {"loss": 0.6529, "grad_norm": 0.830392062664032, "learning_rate": 0.0002, "epoch": 2.219707380163285, "step": 13730}, {"loss": 0.6733, "grad_norm": 0.6474477052688599, "learning_rate": 0.0002, "epoch": 2.2213240643440306, "step": 13740}, {"loss": 0.6413, "grad_norm": 0.7037909626960754, "learning_rate": 0.0002, "epoch": 2.222940748524776, "step": 13750}, {"loss": 0.6417, "grad_norm": 0.6554131507873535, "learning_rate": 0.0002, "epoch": 2.224557432705521, "step": 13760}, {"loss": 0.6907, "grad_norm": 0.7822230458259583, "learning_rate": 0.0002, "epoch": 2.2261741168862663, "step": 13770}, {"loss": 0.6505, "grad_norm": 0.9082167744636536, "learning_rate": 0.0002, "epoch": 2.2277908010670116, "step": 13780}, {"loss": 0.6878, "grad_norm": 0.7918276190757751, "learning_rate": 0.0002, "epoch": 2.229407485247757, "step": 13790}, {"loss": 0.6669, "grad_norm": 0.7354569435119629, "learning_rate": 0.0002, "epoch": 2.231024169428502, "step": 13800}, {"loss": 0.6503, "grad_norm": 0.8265249133110046, "learning_rate": 0.0002, "epoch": 2.2326408536092472, "step": 13810}, {"loss": 0.6871, "grad_norm": 0.6653847098350525, "learning_rate": 0.0002, "epoch": 2.234257537789993, "step": 13820}, {"loss": 0.6413, "grad_norm": 0.7157923579216003, "learning_rate": 0.0002, "epoch": 2.235874221970738, "step": 13830}, {"loss": 0.6306, "grad_norm": 0.7110323309898376, "learning_rate": 0.0002, "epoch": 2.2374909061514834, "step": 13840}, {"loss": 0.6913, "grad_norm": 0.7155357599258423, "learning_rate": 0.0002, "epoch": 2.2391075903322286, "step": 13850}, {"loss": 0.6579, "grad_norm": 1.0177817344665527, "learning_rate": 0.0002, "epoch": 2.240724274512974, "step": 13860}, {"loss": 0.635, "grad_norm": 0.7601948380470276, "learning_rate": 0.0002, "epoch": 2.242340958693719, "step": 13870}, {"loss": 0.6679, "grad_norm": 0.7628820538520813, "learning_rate": 0.0002, "epoch": 2.2439576428744643, "step": 13880}, {"loss": 0.6805, "grad_norm": 0.7089297771453857, "learning_rate": 0.0002, "epoch": 2.24557432705521, "step": 13890}, {"loss": 0.7236, "grad_norm": 0.695178210735321, "learning_rate": 0.0002, "epoch": 2.247191011235955, "step": 13900}, {"loss": 0.7084, "grad_norm": 0.7631948590278625, "learning_rate": 0.0002, "epoch": 2.2488076954167004, "step": 13910}, {"loss": 0.685, "grad_norm": 0.8203101754188538, "learning_rate": 0.0002, "epoch": 2.2504243795974457, "step": 13920}, {"loss": 0.653, "grad_norm": 0.8099079728126526, "learning_rate": 0.0002, "epoch": 2.252041063778191, "step": 13930}, {"loss": 0.694, "grad_norm": 0.6498546004295349, "learning_rate": 0.0002, "epoch": 2.253657747958936, "step": 13940}, {"loss": 0.6684, "grad_norm": 0.7797415256500244, "learning_rate": 0.0002, "epoch": 2.2552744321396814, "step": 13950}, {"loss": 0.683, "grad_norm": 0.8254124522209167, "learning_rate": 0.0002, "epoch": 2.2568911163204266, "step": 13960}, {"loss": 0.6806, "grad_norm": 0.6327953338623047, "learning_rate": 0.0002, "epoch": 2.2585078005011723, "step": 13970}, {"loss": 0.668, "grad_norm": 0.734194278717041, "learning_rate": 0.0002, "epoch": 2.2601244846819175, "step": 13980}, {"loss": 0.6912, "grad_norm": 0.9014202952384949, "learning_rate": 0.0002, "epoch": 2.2617411688626627, "step": 13990}, {"loss": 0.692, "grad_norm": 0.7643631100654602, "learning_rate": 0.0002, "epoch": 2.263357853043408, "step": 14000}, {"loss": 0.6657, "grad_norm": 0.8882834911346436, "learning_rate": 0.0002, "epoch": 2.264974537224153, "step": 14010}, {"loss": 0.6453, "grad_norm": 0.7975873351097107, "learning_rate": 0.0002, "epoch": 2.2665912214048984, "step": 14020}, {"loss": 0.7193, "grad_norm": 0.7765783071517944, "learning_rate": 0.0002, "epoch": 2.2682079055856437, "step": 14030}, {"loss": 0.662, "grad_norm": 0.8846288323402405, "learning_rate": 0.0002, "epoch": 2.2698245897663893, "step": 14040}, {"loss": 0.6494, "grad_norm": 0.9006744027137756, "learning_rate": 0.0002, "epoch": 2.2714412739471346, "step": 14050}, {"loss": 0.6423, "grad_norm": 0.7420173287391663, "learning_rate": 0.0002, "epoch": 2.27305795812788, "step": 14060}, {"loss": 0.7068, "grad_norm": 0.7956424951553345, "learning_rate": 0.0002, "epoch": 2.274674642308625, "step": 14070}, {"loss": 0.6581, "grad_norm": 0.7783209085464478, "learning_rate": 0.0002, "epoch": 2.2762913264893703, "step": 14080}, {"loss": 0.7202, "grad_norm": 0.7597188949584961, "learning_rate": 0.0002, "epoch": 2.2779080106701155, "step": 14090}, {"loss": 0.6778, "grad_norm": 0.6718921661376953, "learning_rate": 0.0002, "epoch": 2.2795246948508607, "step": 14100}, {"loss": 0.632, "grad_norm": 0.7528082132339478, "learning_rate": 0.0002, "epoch": 2.281141379031606, "step": 14110}, {"loss": 0.7608, "grad_norm": 0.8379864692687988, "learning_rate": 0.0002, "epoch": 2.2827580632123516, "step": 14120}, {"loss": 0.6767, "grad_norm": 0.748613715171814, "learning_rate": 0.0002, "epoch": 2.284374747393097, "step": 14130}, {"loss": 0.6641, "grad_norm": 0.7435423135757446, "learning_rate": 0.0002, "epoch": 2.285991431573842, "step": 14140}, {"loss": 0.6849, "grad_norm": 0.7580803632736206, "learning_rate": 0.0002, "epoch": 2.2876081157545873, "step": 14150}, {"loss": 0.6604, "grad_norm": 0.6278321146965027, "learning_rate": 0.0002, "epoch": 2.2892247999353326, "step": 14160}, {"loss": 0.6573, "grad_norm": 0.7663896083831787, "learning_rate": 0.0002, "epoch": 2.290841484116078, "step": 14170}, {"loss": 0.6655, "grad_norm": 0.9716812372207642, "learning_rate": 0.0002, "epoch": 2.292458168296823, "step": 14180}, {"loss": 0.7067, "grad_norm": 0.8993458151817322, "learning_rate": 0.0002, "epoch": 2.2940748524775687, "step": 14190}, {"loss": 0.6172, "grad_norm": 0.6156117916107178, "learning_rate": 0.0002, "epoch": 2.295691536658314, "step": 14200}, {"loss": 0.6318, "grad_norm": 0.8911278247833252, "learning_rate": 0.0002, "epoch": 2.297308220839059, "step": 14210}, {"loss": 0.6364, "grad_norm": 0.6422147154808044, "learning_rate": 0.0002, "epoch": 2.2989249050198044, "step": 14220}, {"loss": 0.6795, "grad_norm": 0.6866879463195801, "learning_rate": 0.0002, "epoch": 2.3005415892005496, "step": 14230}, {"loss": 0.6907, "grad_norm": 0.9297130107879639, "learning_rate": 0.0002, "epoch": 2.302158273381295, "step": 14240}, {"loss": 0.6823, "grad_norm": 0.7501356601715088, "learning_rate": 0.0002, "epoch": 2.30377495756204, "step": 14250}, {"loss": 0.6414, "grad_norm": 0.8363515138626099, "learning_rate": 0.0002, "epoch": 2.3053916417427853, "step": 14260}, {"loss": 0.6362, "grad_norm": 0.9083868265151978, "learning_rate": 0.0002, "epoch": 2.307008325923531, "step": 14270}, {"loss": 0.6862, "grad_norm": 0.7791516780853271, "learning_rate": 0.0002, "epoch": 2.3086250101042762, "step": 14280}, {"loss": 0.6569, "grad_norm": 0.8766953349113464, "learning_rate": 0.0002, "epoch": 2.3102416942850215, "step": 14290}, {"loss": 0.6698, "grad_norm": 0.7916635274887085, "learning_rate": 0.0002, "epoch": 2.3118583784657667, "step": 14300}, {"loss": 0.6927, "grad_norm": 0.627525269985199, "learning_rate": 0.0002, "epoch": 2.313475062646512, "step": 14310}, {"loss": 0.6541, "grad_norm": 0.8856783509254456, "learning_rate": 0.0002, "epoch": 2.315091746827257, "step": 14320}, {"loss": 0.6806, "grad_norm": 0.6758689284324646, "learning_rate": 0.0002, "epoch": 2.316708431008003, "step": 14330}, {"loss": 0.6794, "grad_norm": 0.6428321003913879, "learning_rate": 0.0002, "epoch": 2.318325115188748, "step": 14340}, {"loss": 0.682, "grad_norm": 0.9032121300697327, "learning_rate": 0.0002, "epoch": 2.3199417993694933, "step": 14350}, {"loss": 0.6569, "grad_norm": 0.8035986423492432, "learning_rate": 0.0002, "epoch": 2.3215584835502385, "step": 14360}, {"loss": 0.7067, "grad_norm": 0.7974579334259033, "learning_rate": 0.0002, "epoch": 2.3231751677309838, "step": 14370}, {"loss": 0.6451, "grad_norm": 0.8356034755706787, "learning_rate": 0.0002, "epoch": 2.324791851911729, "step": 14380}, {"loss": 0.6623, "grad_norm": 0.998760998249054, "learning_rate": 0.0002, "epoch": 2.326408536092474, "step": 14390}, {"loss": 0.649, "grad_norm": 0.6518142223358154, "learning_rate": 0.0002, "epoch": 2.3280252202732195, "step": 14400}, {"loss": 0.7146, "grad_norm": 0.7443506717681885, "learning_rate": 0.0002, "epoch": 2.3296419044539647, "step": 14410}, {"loss": 0.648, "grad_norm": 0.8436172604560852, "learning_rate": 0.0002, "epoch": 2.3312585886347104, "step": 14420}, {"loss": 0.6585, "grad_norm": 0.7411080598831177, "learning_rate": 0.0002, "epoch": 2.3328752728154556, "step": 14430}, {"loss": 0.6781, "grad_norm": 0.8839048743247986, "learning_rate": 0.0002, "epoch": 2.334491956996201, "step": 14440}, {"loss": 0.6565, "grad_norm": 0.8360885977745056, "learning_rate": 0.0002, "epoch": 2.336108641176946, "step": 14450}, {"loss": 0.6662, "grad_norm": 0.7608986496925354, "learning_rate": 0.0002, "epoch": 2.3377253253576913, "step": 14460}, {"loss": 0.6685, "grad_norm": 0.8179867267608643, "learning_rate": 0.0002, "epoch": 2.3393420095384365, "step": 14470}, {"loss": 0.7055, "grad_norm": 0.5989999771118164, "learning_rate": 0.0002, "epoch": 2.340958693719182, "step": 14480}, {"loss": 0.644, "grad_norm": 0.9450054168701172, "learning_rate": 0.0002, "epoch": 2.3425753778999274, "step": 14490}, {"loss": 0.6983, "grad_norm": 0.7885149717330933, "learning_rate": 0.0002, "epoch": 2.3441920620806727, "step": 14500}, {"loss": 0.6819, "grad_norm": 0.8152616620063782, "learning_rate": 0.0002, "epoch": 2.345808746261418, "step": 14510}, {"loss": 0.6989, "grad_norm": 0.7193838953971863, "learning_rate": 0.0002, "epoch": 2.347425430442163, "step": 14520}, {"loss": 0.6594, "grad_norm": 0.6701092720031738, "learning_rate": 0.0002, "epoch": 2.3490421146229084, "step": 14530}, {"loss": 0.6559, "grad_norm": 0.7529364228248596, "learning_rate": 0.0002, "epoch": 2.3506587988036536, "step": 14540}, {"loss": 0.6306, "grad_norm": 0.6599733829498291, "learning_rate": 0.0002, "epoch": 2.352275482984399, "step": 14550}, {"loss": 0.706, "grad_norm": 0.9502474069595337, "learning_rate": 0.0002, "epoch": 2.353892167165144, "step": 14560}, {"loss": 0.717, "grad_norm": 0.7619650959968567, "learning_rate": 0.0002, "epoch": 2.3555088513458897, "step": 14570}, {"loss": 0.6684, "grad_norm": 0.9854652285575867, "learning_rate": 0.0002, "epoch": 2.357125535526635, "step": 14580}, {"loss": 0.6455, "grad_norm": 0.727439284324646, "learning_rate": 0.0002, "epoch": 2.35874221970738, "step": 14590}, {"loss": 0.6645, "grad_norm": 0.6994746327400208, "learning_rate": 0.0002, "epoch": 2.3603589038881254, "step": 14600}, {"loss": 0.6587, "grad_norm": 0.7117531299591064, "learning_rate": 0.0002, "epoch": 2.3619755880688706, "step": 14610}, {"loss": 0.6804, "grad_norm": 0.6403067708015442, "learning_rate": 0.0002, "epoch": 2.363592272249616, "step": 14620}, {"loss": 0.7055, "grad_norm": 0.8377841711044312, "learning_rate": 0.0002, "epoch": 2.3652089564303616, "step": 14630}, {"loss": 0.6778, "grad_norm": 0.749171257019043, "learning_rate": 0.0002, "epoch": 2.366825640611107, "step": 14640}, {"loss": 0.6552, "grad_norm": 0.8418586254119873, "learning_rate": 0.0002, "epoch": 2.368442324791852, "step": 14650}, {"loss": 0.6685, "grad_norm": 0.6178573369979858, "learning_rate": 0.0002, "epoch": 2.3700590089725972, "step": 14660}, {"loss": 0.6774, "grad_norm": 0.6368302702903748, "learning_rate": 0.0002, "epoch": 2.3716756931533425, "step": 14670}, {"loss": 0.6136, "grad_norm": 0.9122977256774902, "learning_rate": 0.0002, "epoch": 2.3732923773340877, "step": 14680}, {"loss": 0.6675, "grad_norm": 0.7086195349693298, "learning_rate": 0.0002, "epoch": 2.374909061514833, "step": 14690}, {"loss": 0.6582, "grad_norm": 0.7500800490379333, "learning_rate": 0.0002, "epoch": 2.376525745695578, "step": 14700}, {"loss": 0.6792, "grad_norm": 0.6634900569915771, "learning_rate": 0.0002, "epoch": 2.378142429876324, "step": 14710}, {"loss": 0.6614, "grad_norm": 0.839898407459259, "learning_rate": 0.0002, "epoch": 2.379759114057069, "step": 14720}, {"loss": 0.6453, "grad_norm": 0.7578426003456116, "learning_rate": 0.0002, "epoch": 2.3813757982378143, "step": 14730}, {"loss": 0.7282, "grad_norm": 1.0213173627853394, "learning_rate": 0.0002, "epoch": 2.3829924824185595, "step": 14740}, {"loss": 0.6704, "grad_norm": 0.7855949401855469, "learning_rate": 0.0002, "epoch": 2.3846091665993048, "step": 14750}, {"loss": 0.6694, "grad_norm": 0.7224128842353821, "learning_rate": 0.0002, "epoch": 2.38622585078005, "step": 14760}, {"loss": 0.7017, "grad_norm": 0.8040381669998169, "learning_rate": 0.0002, "epoch": 2.3878425349607952, "step": 14770}, {"loss": 0.6799, "grad_norm": 0.7705281376838684, "learning_rate": 0.0002, "epoch": 2.389459219141541, "step": 14780}, {"loss": 0.6326, "grad_norm": 0.667966902256012, "learning_rate": 0.0002, "epoch": 2.391075903322286, "step": 14790}, {"loss": 0.7061, "grad_norm": 0.6611011028289795, "learning_rate": 0.0002, "epoch": 2.3926925875030314, "step": 14800}, {"loss": 0.6527, "grad_norm": 0.6862651705741882, "learning_rate": 0.0002, "epoch": 2.3943092716837766, "step": 14810}, {"loss": 0.6537, "grad_norm": 0.8086010217666626, "learning_rate": 0.0002, "epoch": 2.395925955864522, "step": 14820}, {"loss": 0.7189, "grad_norm": 0.7189689874649048, "learning_rate": 0.0002, "epoch": 2.397542640045267, "step": 14830}, {"loss": 0.6709, "grad_norm": 0.6280009150505066, "learning_rate": 0.0002, "epoch": 2.3991593242260123, "step": 14840}, {"loss": 0.706, "grad_norm": 0.7826612591743469, "learning_rate": 0.0002, "epoch": 2.4007760084067575, "step": 14850}, {"loss": 0.6738, "grad_norm": 0.7681610584259033, "learning_rate": 0.0002, "epoch": 2.402392692587503, "step": 14860}, {"loss": 0.636, "grad_norm": 0.720966100692749, "learning_rate": 0.0002, "epoch": 2.4040093767682484, "step": 14870}, {"loss": 0.6667, "grad_norm": 0.8202250599861145, "learning_rate": 0.0002, "epoch": 2.4056260609489937, "step": 14880}, {"loss": 0.6935, "grad_norm": 0.786212682723999, "learning_rate": 0.0002, "epoch": 2.407242745129739, "step": 14890}, {"loss": 0.6628, "grad_norm": 0.6647164821624756, "learning_rate": 0.0002, "epoch": 2.408859429310484, "step": 14900}, {"loss": 0.6706, "grad_norm": 0.7566399574279785, "learning_rate": 0.0002, "epoch": 2.4104761134912294, "step": 14910}, {"loss": 0.7188, "grad_norm": 0.748814582824707, "learning_rate": 0.0002, "epoch": 2.4120927976719746, "step": 14920}, {"loss": 0.6684, "grad_norm": 0.7624038457870483, "learning_rate": 0.0002, "epoch": 2.4137094818527203, "step": 14930}, {"loss": 0.6483, "grad_norm": 0.8267335295677185, "learning_rate": 0.0002, "epoch": 2.4153261660334655, "step": 14940}, {"loss": 0.6612, "grad_norm": 0.8785360455513, "learning_rate": 0.0002, "epoch": 2.4169428502142107, "step": 14950}, {"loss": 0.6718, "grad_norm": 0.679887592792511, "learning_rate": 0.0002, "epoch": 2.418559534394956, "step": 14960}, {"loss": 0.6136, "grad_norm": 0.7218474745750427, "learning_rate": 0.0002, "epoch": 2.420176218575701, "step": 14970}, {"loss": 0.648, "grad_norm": 0.6342799663543701, "learning_rate": 0.0002, "epoch": 2.4217929027564464, "step": 14980}, {"loss": 0.6617, "grad_norm": 0.7098712921142578, "learning_rate": 0.0002, "epoch": 2.4234095869371917, "step": 14990}, {"loss": 0.6942, "grad_norm": 0.7497431635856628, "learning_rate": 0.0002, "epoch": 2.425026271117937, "step": 15000}, {"loss": 0.6772, "grad_norm": 0.934836208820343, "learning_rate": 0.0002, "epoch": 2.4266429552986826, "step": 15010}, {"loss": 0.7221, "grad_norm": 0.8430966734886169, "learning_rate": 0.0002, "epoch": 2.428259639479428, "step": 15020}, {"loss": 0.6985, "grad_norm": 0.7032104730606079, "learning_rate": 0.0002, "epoch": 2.429876323660173, "step": 15030}, {"loss": 0.6715, "grad_norm": 0.7746111750602722, "learning_rate": 0.0002, "epoch": 2.4314930078409183, "step": 15040}, {"loss": 0.7177, "grad_norm": 0.7661406397819519, "learning_rate": 0.0002, "epoch": 2.4331096920216635, "step": 15050}, {"loss": 0.6517, "grad_norm": 0.6941645741462708, "learning_rate": 0.0002, "epoch": 2.4347263762024087, "step": 15060}, {"loss": 0.6421, "grad_norm": 0.7487249374389648, "learning_rate": 0.0002, "epoch": 2.436343060383154, "step": 15070}, {"loss": 0.6796, "grad_norm": 0.7639912962913513, "learning_rate": 0.0002, "epoch": 2.4379597445638996, "step": 15080}, {"loss": 0.7087, "grad_norm": 0.7708953619003296, "learning_rate": 0.0002, "epoch": 2.439576428744645, "step": 15090}, {"loss": 0.7065, "grad_norm": 0.9135832190513611, "learning_rate": 0.0002, "epoch": 2.44119311292539, "step": 15100}, {"loss": 0.672, "grad_norm": 0.8283005356788635, "learning_rate": 0.0002, "epoch": 2.4428097971061353, "step": 15110}, {"loss": 0.6551, "grad_norm": 0.925299346446991, "learning_rate": 0.0002, "epoch": 2.4444264812868806, "step": 15120}, {"loss": 0.687, "grad_norm": 0.7013528943061829, "learning_rate": 0.0002, "epoch": 2.446043165467626, "step": 15130}, {"loss": 0.6842, "grad_norm": 0.622303307056427, "learning_rate": 0.0002, "epoch": 2.447659849648371, "step": 15140}, {"loss": 0.6676, "grad_norm": 0.876569390296936, "learning_rate": 0.0002, "epoch": 2.4492765338291163, "step": 15150}, {"loss": 0.6463, "grad_norm": 0.6836351752281189, "learning_rate": 0.0002, "epoch": 2.450893218009862, "step": 15160}, {"loss": 0.6781, "grad_norm": 0.7886684536933899, "learning_rate": 0.0002, "epoch": 2.452509902190607, "step": 15170}, {"loss": 0.6794, "grad_norm": 0.6647440791130066, "learning_rate": 0.0002, "epoch": 2.4541265863713524, "step": 15180}, {"loss": 0.6353, "grad_norm": 0.7477722764015198, "learning_rate": 0.0002, "epoch": 2.4557432705520976, "step": 15190}, {"loss": 0.698, "grad_norm": 0.8192033767700195, "learning_rate": 0.0002, "epoch": 2.457359954732843, "step": 15200}, {"loss": 0.6735, "grad_norm": 0.847537100315094, "learning_rate": 0.0002, "epoch": 2.458976638913588, "step": 15210}, {"loss": 0.6962, "grad_norm": 0.9027776122093201, "learning_rate": 0.0002, "epoch": 2.4605933230943338, "step": 15220}, {"loss": 0.7084, "grad_norm": 0.7217772006988525, "learning_rate": 0.0002, "epoch": 2.462210007275079, "step": 15230}, {"loss": 0.691, "grad_norm": 0.7994546294212341, "learning_rate": 0.0002, "epoch": 2.4638266914558242, "step": 15240}, {"loss": 0.6828, "grad_norm": 0.939916729927063, "learning_rate": 0.0002, "epoch": 2.4654433756365695, "step": 15250}, {"loss": 0.6893, "grad_norm": 1.0009053945541382, "learning_rate": 0.0002, "epoch": 2.4670600598173147, "step": 15260}, {"loss": 0.643, "grad_norm": 0.625555694103241, "learning_rate": 0.0002, "epoch": 2.46867674399806, "step": 15270}, {"loss": 0.688, "grad_norm": 0.7924878597259521, "learning_rate": 0.0002, "epoch": 2.470293428178805, "step": 15280}, {"loss": 0.6789, "grad_norm": 0.8536689877510071, "learning_rate": 0.0002, "epoch": 2.4719101123595504, "step": 15290}, {"loss": 0.6924, "grad_norm": 0.8572589755058289, "learning_rate": 0.0002, "epoch": 2.4735267965402956, "step": 15300}, {"loss": 0.604, "grad_norm": 0.773279070854187, "learning_rate": 0.0002, "epoch": 2.4751434807210413, "step": 15310}, {"loss": 0.6573, "grad_norm": 0.7708749771118164, "learning_rate": 0.0002, "epoch": 2.4767601649017865, "step": 15320}, {"loss": 0.7065, "grad_norm": 0.770905077457428, "learning_rate": 0.0002, "epoch": 2.4783768490825318, "step": 15330}, {"loss": 0.6878, "grad_norm": 0.8238571882247925, "learning_rate": 0.0002, "epoch": 2.479993533263277, "step": 15340}, {"loss": 0.6772, "grad_norm": 0.7670477032661438, "learning_rate": 0.0002, "epoch": 2.481610217444022, "step": 15350}, {"loss": 0.7759, "grad_norm": 0.905036985874176, "learning_rate": 0.0002, "epoch": 2.4832269016247674, "step": 15360}, {"loss": 0.706, "grad_norm": 0.6672089695930481, "learning_rate": 0.0002, "epoch": 2.484843585805513, "step": 15370}, {"loss": 0.6722, "grad_norm": 0.625095784664154, "learning_rate": 0.0002, "epoch": 2.4864602699862584, "step": 15380}, {"loss": 0.6396, "grad_norm": 0.679772675037384, "learning_rate": 0.0002, "epoch": 2.4880769541670036, "step": 15390}, {"loss": 0.6778, "grad_norm": 0.711492121219635, "learning_rate": 0.0002, "epoch": 2.489693638347749, "step": 15400}, {"loss": 0.6966, "grad_norm": 0.876189112663269, "learning_rate": 0.0002, "epoch": 2.491310322528494, "step": 15410}, {"loss": 0.7307, "grad_norm": 0.7236915230751038, "learning_rate": 0.0002, "epoch": 2.4929270067092393, "step": 15420}, {"loss": 0.647, "grad_norm": 0.6629832983016968, "learning_rate": 0.0002, "epoch": 2.4945436908899845, "step": 15430}, {"loss": 0.6669, "grad_norm": 0.9756859540939331, "learning_rate": 0.0002, "epoch": 2.4961603750707297, "step": 15440}, {"loss": 0.7559, "grad_norm": 0.6896940469741821, "learning_rate": 0.0002, "epoch": 2.4977770592514754, "step": 15450}, {"loss": 0.6818, "grad_norm": 0.7105149626731873, "learning_rate": 0.0002, "epoch": 2.4993937434322206, "step": 15460}, {"loss": 0.6859, "grad_norm": 0.8374546766281128, "learning_rate": 0.0002, "epoch": 2.501010427612966, "step": 15470}, {"loss": 0.6512, "grad_norm": 0.7320070266723633, "learning_rate": 0.0002, "epoch": 2.502627111793711, "step": 15480}, {"loss": 0.685, "grad_norm": 0.8306367993354797, "learning_rate": 0.0002, "epoch": 2.5042437959744563, "step": 15490}, {"loss": 0.7253, "grad_norm": 0.7472721338272095, "learning_rate": 0.0002, "epoch": 2.5058604801552016, "step": 15500}, {"loss": 0.6699, "grad_norm": 0.6147692203521729, "learning_rate": 0.0002, "epoch": 2.507477164335947, "step": 15510}, {"loss": 0.7158, "grad_norm": 0.7788505554199219, "learning_rate": 0.0002, "epoch": 2.5090938485166925, "step": 15520}, {"loss": 0.6521, "grad_norm": 0.8807527422904968, "learning_rate": 0.0002, "epoch": 2.5107105326974377, "step": 15530}, {"loss": 0.6792, "grad_norm": 0.7521643042564392, "learning_rate": 0.0002, "epoch": 2.512327216878183, "step": 15540}, {"loss": 0.6772, "grad_norm": 0.6900225281715393, "learning_rate": 0.0002, "epoch": 2.513943901058928, "step": 15550}, {"loss": 0.6769, "grad_norm": 0.6601938605308533, "learning_rate": 0.0002, "epoch": 2.5155605852396734, "step": 15560}, {"loss": 0.6648, "grad_norm": 0.8179984092712402, "learning_rate": 0.0002, "epoch": 2.5171772694204186, "step": 15570}, {"loss": 0.7028, "grad_norm": 0.792556881904602, "learning_rate": 0.0002, "epoch": 2.518793953601164, "step": 15580}, {"loss": 0.6464, "grad_norm": 0.7081938982009888, "learning_rate": 0.0002, "epoch": 2.520410637781909, "step": 15590}, {"loss": 0.6691, "grad_norm": 0.8733121156692505, "learning_rate": 0.0002, "epoch": 2.5220273219626543, "step": 15600}, {"loss": 0.6969, "grad_norm": 0.7980992794036865, "learning_rate": 0.0002, "epoch": 2.5236440061434, "step": 15610}, {"loss": 0.7124, "grad_norm": 0.883664071559906, "learning_rate": 0.0002, "epoch": 2.5252606903241452, "step": 15620}, {"loss": 0.7022, "grad_norm": 0.6963341236114502, "learning_rate": 0.0002, "epoch": 2.5268773745048905, "step": 15630}, {"loss": 0.7334, "grad_norm": 0.6433573365211487, "learning_rate": 0.0002, "epoch": 2.5284940586856357, "step": 15640}, {"loss": 0.6889, "grad_norm": 0.8538183569908142, "learning_rate": 0.0002, "epoch": 2.530110742866381, "step": 15650}, {"loss": 0.6841, "grad_norm": 0.9748201370239258, "learning_rate": 0.0002, "epoch": 2.5317274270471266, "step": 15660}, {"loss": 0.6765, "grad_norm": 0.7670575380325317, "learning_rate": 0.0002, "epoch": 2.533344111227872, "step": 15670}, {"loss": 0.6435, "grad_norm": 0.8738890290260315, "learning_rate": 0.0002, "epoch": 2.534960795408617, "step": 15680}, {"loss": 0.6802, "grad_norm": 0.8391636610031128, "learning_rate": 0.0002, "epoch": 2.5365774795893623, "step": 15690}, {"loss": 0.6901, "grad_norm": 0.7239366769790649, "learning_rate": 0.0002, "epoch": 2.5381941637701075, "step": 15700}, {"loss": 0.7011, "grad_norm": 0.8498379588127136, "learning_rate": 0.0002, "epoch": 2.5398108479508528, "step": 15710}, {"loss": 0.6998, "grad_norm": 0.8029484152793884, "learning_rate": 0.0002, "epoch": 2.541427532131598, "step": 15720}, {"loss": 0.6678, "grad_norm": 1.0639333724975586, "learning_rate": 0.0002, "epoch": 2.5430442163123432, "step": 15730}, {"loss": 0.6341, "grad_norm": 0.6401297450065613, "learning_rate": 0.0002, "epoch": 2.5446609004930885, "step": 15740}, {"loss": 0.7196, "grad_norm": 0.7123814821243286, "learning_rate": 0.0002, "epoch": 2.5462775846738337, "step": 15750}, {"loss": 0.654, "grad_norm": 0.7874974608421326, "learning_rate": 0.0002, "epoch": 2.5478942688545794, "step": 15760}, {"loss": 0.6721, "grad_norm": 0.8046808838844299, "learning_rate": 0.0002, "epoch": 2.5495109530353246, "step": 15770}, {"loss": 0.6665, "grad_norm": 0.7888661623001099, "learning_rate": 0.0002, "epoch": 2.55112763721607, "step": 15780}, {"loss": 0.6893, "grad_norm": 0.8445866107940674, "learning_rate": 0.0002, "epoch": 2.552744321396815, "step": 15790}, {"loss": 0.6815, "grad_norm": 0.7475846409797668, "learning_rate": 0.0002, "epoch": 2.5543610055775603, "step": 15800}, {"loss": 0.6711, "grad_norm": 0.7455102801322937, "learning_rate": 0.0002, "epoch": 2.555977689758306, "step": 15810}, {"loss": 0.6932, "grad_norm": 0.8226983547210693, "learning_rate": 0.0002, "epoch": 2.557594373939051, "step": 15820}, {"loss": 0.651, "grad_norm": 0.8920368552207947, "learning_rate": 0.0002, "epoch": 2.5592110581197964, "step": 15830}, {"loss": 0.6297, "grad_norm": 0.8413904905319214, "learning_rate": 0.0002, "epoch": 2.5608277423005417, "step": 15840}, {"loss": 0.7106, "grad_norm": 0.8483649492263794, "learning_rate": 0.0002, "epoch": 2.562444426481287, "step": 15850}, {"loss": 0.6957, "grad_norm": 0.5923284292221069, "learning_rate": 0.0002, "epoch": 2.564061110662032, "step": 15860}, {"loss": 0.6847, "grad_norm": 0.8518726229667664, "learning_rate": 0.0002, "epoch": 2.5656777948427774, "step": 15870}, {"loss": 0.6362, "grad_norm": 0.731235146522522, "learning_rate": 0.0002, "epoch": 2.5672944790235226, "step": 15880}, {"loss": 0.7611, "grad_norm": 0.7517194151878357, "learning_rate": 0.0002, "epoch": 2.568911163204268, "step": 15890}, {"loss": 0.6907, "grad_norm": 0.8378692269325256, "learning_rate": 0.0002, "epoch": 2.5705278473850135, "step": 15900}, {"loss": 0.7055, "grad_norm": 0.843701958656311, "learning_rate": 0.0002, "epoch": 2.5721445315657587, "step": 15910}, {"loss": 0.6882, "grad_norm": 0.7254629731178284, "learning_rate": 0.0002, "epoch": 2.573761215746504, "step": 15920}, {"loss": 0.6872, "grad_norm": 0.8863335847854614, "learning_rate": 0.0002, "epoch": 2.575377899927249, "step": 15930}, {"loss": 0.6813, "grad_norm": 0.7675097584724426, "learning_rate": 0.0002, "epoch": 2.5769945841079944, "step": 15940}, {"loss": 0.7357, "grad_norm": 0.82063889503479, "learning_rate": 0.0002, "epoch": 2.5786112682887397, "step": 15950}, {"loss": 0.662, "grad_norm": 0.7729717493057251, "learning_rate": 0.0002, "epoch": 2.5802279524694853, "step": 15960}, {"loss": 0.633, "grad_norm": 0.8301846981048584, "learning_rate": 0.0002, "epoch": 2.5818446366502306, "step": 15970}, {"loss": 0.6897, "grad_norm": 0.7906861305236816, "learning_rate": 0.0002, "epoch": 2.583461320830976, "step": 15980}, {"loss": 0.7175, "grad_norm": 0.6749057173728943, "learning_rate": 0.0002, "epoch": 2.585078005011721, "step": 15990}, {"loss": 0.7212, "grad_norm": 0.9386842846870422, "learning_rate": 0.0002, "epoch": 2.5866946891924663, "step": 16000}, {"loss": 0.6934, "grad_norm": 0.7868891358375549, "learning_rate": 0.0002, "epoch": 2.5883113733732115, "step": 16010}, {"loss": 0.7036, "grad_norm": 0.8674671053886414, "learning_rate": 0.0002, "epoch": 2.5899280575539567, "step": 16020}, {"loss": 0.7217, "grad_norm": 0.7043559551239014, "learning_rate": 0.0002, "epoch": 2.591544741734702, "step": 16030}, {"loss": 0.6967, "grad_norm": 0.5846083760261536, "learning_rate": 0.0002, "epoch": 2.593161425915447, "step": 16040}, {"loss": 0.7322, "grad_norm": 0.7323982119560242, "learning_rate": 0.0002, "epoch": 2.594778110096193, "step": 16050}, {"loss": 0.6794, "grad_norm": 0.9069556593894958, "learning_rate": 0.0002, "epoch": 2.596394794276938, "step": 16060}, {"loss": 0.7076, "grad_norm": 0.7522736191749573, "learning_rate": 0.0002, "epoch": 2.5980114784576833, "step": 16070}, {"loss": 0.6477, "grad_norm": 0.8149648308753967, "learning_rate": 0.0002, "epoch": 2.5996281626384286, "step": 16080}, {"loss": 0.6664, "grad_norm": 0.6214233040809631, "learning_rate": 0.0002, "epoch": 2.601244846819174, "step": 16090}, {"loss": 0.7307, "grad_norm": 0.6803743839263916, "learning_rate": 0.0002, "epoch": 2.602861530999919, "step": 16100}, {"loss": 0.7244, "grad_norm": 0.7223997116088867, "learning_rate": 0.0002, "epoch": 2.6044782151806647, "step": 16110}, {"loss": 0.6867, "grad_norm": 0.7324174642562866, "learning_rate": 0.0002, "epoch": 2.60609489936141, "step": 16120}, {"loss": 0.7159, "grad_norm": 0.9594739675521851, "learning_rate": 0.0002, "epoch": 2.607711583542155, "step": 16130}, {"loss": 0.6451, "grad_norm": 0.9485327005386353, "learning_rate": 0.0002, "epoch": 2.6093282677229004, "step": 16140}, {"loss": 0.6815, "grad_norm": 0.8449000120162964, "learning_rate": 0.0002, "epoch": 2.6109449519036456, "step": 16150}, {"loss": 0.7152, "grad_norm": 0.8520140051841736, "learning_rate": 0.0002, "epoch": 2.612561636084391, "step": 16160}, {"loss": 0.6759, "grad_norm": 0.7456524968147278, "learning_rate": 0.0002, "epoch": 2.614178320265136, "step": 16170}, {"loss": 0.6893, "grad_norm": 0.9912857413291931, "learning_rate": 0.0002, "epoch": 2.6157950044458813, "step": 16180}, {"loss": 0.7243, "grad_norm": 0.9001946449279785, "learning_rate": 0.0002, "epoch": 2.6174116886266265, "step": 16190}, {"loss": 0.6825, "grad_norm": 0.6568667888641357, "learning_rate": 0.0002, "epoch": 2.619028372807372, "step": 16200}, {"loss": 0.7013, "grad_norm": 1.0248128175735474, "learning_rate": 0.0002, "epoch": 2.6206450569881174, "step": 16210}, {"loss": 0.7045, "grad_norm": 0.6509039998054504, "learning_rate": 0.0002, "epoch": 2.6222617411688627, "step": 16220}, {"loss": 0.72, "grad_norm": 0.7626351118087769, "learning_rate": 0.0002, "epoch": 2.623878425349608, "step": 16230}, {"loss": 0.6556, "grad_norm": 0.6938552260398865, "learning_rate": 0.0002, "epoch": 2.625495109530353, "step": 16240}, {"loss": 0.65, "grad_norm": 0.6434680819511414, "learning_rate": 0.0002, "epoch": 2.6271117937110984, "step": 16250}, {"loss": 0.6943, "grad_norm": 0.7111515998840332, "learning_rate": 0.0002, "epoch": 2.628728477891844, "step": 16260}, {"loss": 0.679, "grad_norm": 0.7712395787239075, "learning_rate": 0.0002, "epoch": 2.6303451620725893, "step": 16270}, {"loss": 0.6886, "grad_norm": 0.792209267616272, "learning_rate": 0.0002, "epoch": 2.6319618462533345, "step": 16280}, {"loss": 0.6554, "grad_norm": 0.6801066398620605, "learning_rate": 0.0002, "epoch": 2.6335785304340797, "step": 16290}, {"loss": 0.73, "grad_norm": 0.7802573442459106, "learning_rate": 0.0002, "epoch": 2.635195214614825, "step": 16300}, {"loss": 0.7484, "grad_norm": 0.7742244601249695, "learning_rate": 0.0002, "epoch": 2.63681189879557, "step": 16310}, {"loss": 0.6524, "grad_norm": 0.664184033870697, "learning_rate": 0.0002, "epoch": 2.6384285829763154, "step": 16320}, {"loss": 0.6442, "grad_norm": 0.9242228865623474, "learning_rate": 0.0002, "epoch": 2.6400452671570607, "step": 16330}, {"loss": 0.6792, "grad_norm": 0.9661325216293335, "learning_rate": 0.0002, "epoch": 2.641661951337806, "step": 16340}, {"loss": 0.6847, "grad_norm": 0.837526798248291, "learning_rate": 0.0002, "epoch": 2.6432786355185516, "step": 16350}, {"loss": 0.7686, "grad_norm": 1.1834373474121094, "learning_rate": 0.0002, "epoch": 2.644895319699297, "step": 16360}, {"loss": 0.6746, "grad_norm": 0.7467831373214722, "learning_rate": 0.0002, "epoch": 2.646512003880042, "step": 16370}, {"loss": 0.6935, "grad_norm": 0.8627146482467651, "learning_rate": 0.0002, "epoch": 2.6481286880607873, "step": 16380}, {"loss": 0.715, "grad_norm": 0.790447473526001, "learning_rate": 0.0002, "epoch": 2.6497453722415325, "step": 16390}, {"loss": 0.723, "grad_norm": 0.8447365164756775, "learning_rate": 0.0002, "epoch": 2.651362056422278, "step": 16400}, {"loss": 0.6628, "grad_norm": 0.7831417918205261, "learning_rate": 0.0002, "epoch": 2.6529787406030234, "step": 16410}, {"loss": 0.6691, "grad_norm": 0.6837952136993408, "learning_rate": 0.0002, "epoch": 2.6545954247837686, "step": 16420}, {"loss": 0.6139, "grad_norm": 0.7031801342964172, "learning_rate": 0.0002, "epoch": 2.656212108964514, "step": 16430}, {"loss": 0.7382, "grad_norm": 0.8963770866394043, "learning_rate": 0.0002, "epoch": 2.657828793145259, "step": 16440}, {"loss": 0.6439, "grad_norm": 0.6852328181266785, "learning_rate": 0.0002, "epoch": 2.6594454773260043, "step": 16450}, {"loss": 0.6278, "grad_norm": 0.8069294095039368, "learning_rate": 0.0002, "epoch": 2.6610621615067496, "step": 16460}, {"loss": 0.6939, "grad_norm": 0.7503686547279358, "learning_rate": 0.0002, "epoch": 2.662678845687495, "step": 16470}, {"loss": 0.6777, "grad_norm": 0.6430956125259399, "learning_rate": 0.0002, "epoch": 2.66429552986824, "step": 16480}, {"loss": 0.6863, "grad_norm": 0.7894312739372253, "learning_rate": 0.0002, "epoch": 2.6659122140489853, "step": 16490}, {"loss": 0.7165, "grad_norm": 0.7277431488037109, "learning_rate": 0.0002, "epoch": 2.667528898229731, "step": 16500}, {"loss": 0.6772, "grad_norm": 0.6816153526306152, "learning_rate": 0.0002, "epoch": 2.669145582410476, "step": 16510}, {"loss": 0.691, "grad_norm": 0.8145235776901245, "learning_rate": 0.0002, "epoch": 2.6707622665912214, "step": 16520}, {"loss": 0.709, "grad_norm": 0.8645890355110168, "learning_rate": 0.0002, "epoch": 2.6723789507719666, "step": 16530}, {"loss": 0.6946, "grad_norm": 0.704393208026886, "learning_rate": 0.0002, "epoch": 2.673995634952712, "step": 16540}, {"loss": 0.6378, "grad_norm": 1.0120846033096313, "learning_rate": 0.0002, "epoch": 2.6756123191334575, "step": 16550}, {"loss": 0.7241, "grad_norm": 0.6919328570365906, "learning_rate": 0.0002, "epoch": 2.6772290033142028, "step": 16560}, {"loss": 0.7098, "grad_norm": 0.6924574971199036, "learning_rate": 0.0002, "epoch": 2.678845687494948, "step": 16570}, {"loss": 0.731, "grad_norm": 0.9679301381111145, "learning_rate": 0.0002, "epoch": 2.6804623716756932, "step": 16580}, {"loss": 0.7124, "grad_norm": 0.6810211539268494, "learning_rate": 0.0002, "epoch": 2.6820790558564385, "step": 16590}, {"loss": 0.6688, "grad_norm": 0.9730555415153503, "learning_rate": 0.0002, "epoch": 2.6836957400371837, "step": 16600}, {"loss": 0.7344, "grad_norm": 0.7852821350097656, "learning_rate": 0.0002, "epoch": 2.685312424217929, "step": 16610}, {"loss": 0.6401, "grad_norm": 0.6059057116508484, "learning_rate": 0.0002, "epoch": 2.686929108398674, "step": 16620}, {"loss": 0.6796, "grad_norm": 0.9395958781242371, "learning_rate": 0.0002, "epoch": 2.6885457925794194, "step": 16630}, {"loss": 0.7174, "grad_norm": 0.7473729848861694, "learning_rate": 0.0002, "epoch": 2.690162476760165, "step": 16640}, {"loss": 0.7087, "grad_norm": 0.765934407711029, "learning_rate": 0.0002, "epoch": 2.6917791609409103, "step": 16650}, {"loss": 0.707, "grad_norm": 0.8496677279472351, "learning_rate": 0.0002, "epoch": 2.6933958451216555, "step": 16660}, {"loss": 0.7084, "grad_norm": 0.7641879916191101, "learning_rate": 0.0002, "epoch": 2.6950125293024008, "step": 16670}, {"loss": 0.6566, "grad_norm": 0.8471952676773071, "learning_rate": 0.0002, "epoch": 2.696629213483146, "step": 16680}, {"loss": 0.6635, "grad_norm": 0.6946060657501221, "learning_rate": 0.0002, "epoch": 2.6982458976638912, "step": 16690}, {"loss": 0.7027, "grad_norm": 0.7361312508583069, "learning_rate": 0.0002, "epoch": 2.699862581844637, "step": 16700}, {"loss": 0.6767, "grad_norm": 0.6605038046836853, "learning_rate": 0.0002, "epoch": 2.701479266025382, "step": 16710}, {"loss": 0.6885, "grad_norm": 0.7164411544799805, "learning_rate": 0.0002, "epoch": 2.7030959502061274, "step": 16720}, {"loss": 0.6736, "grad_norm": 0.6496201157569885, "learning_rate": 0.0002, "epoch": 2.7047126343868726, "step": 16730}, {"loss": 0.6942, "grad_norm": 0.7826663851737976, "learning_rate": 0.0002, "epoch": 2.706329318567618, "step": 16740}, {"loss": 0.6773, "grad_norm": 0.7639131546020508, "learning_rate": 0.0002, "epoch": 2.707946002748363, "step": 16750}, {"loss": 0.69, "grad_norm": 0.7976210713386536, "learning_rate": 0.0002, "epoch": 2.7095626869291083, "step": 16760}, {"loss": 0.6735, "grad_norm": 0.6836577653884888, "learning_rate": 0.0002, "epoch": 2.7111793711098535, "step": 16770}, {"loss": 0.6596, "grad_norm": 0.8025202751159668, "learning_rate": 0.0002, "epoch": 2.7127960552905988, "step": 16780}, {"loss": 0.6324, "grad_norm": 0.7636463642120361, "learning_rate": 0.0002, "epoch": 2.7144127394713444, "step": 16790}, {"loss": 0.6227, "grad_norm": 0.7481677532196045, "learning_rate": 0.0002, "epoch": 2.7160294236520897, "step": 16800}, {"loss": 0.6925, "grad_norm": 0.7566834688186646, "learning_rate": 0.0002, "epoch": 2.717646107832835, "step": 16810}, {"loss": 0.6531, "grad_norm": 0.7931267619132996, "learning_rate": 0.0002, "epoch": 2.71926279201358, "step": 16820}, {"loss": 0.6672, "grad_norm": 0.8811662197113037, "learning_rate": 0.0002, "epoch": 2.7208794761943254, "step": 16830}, {"loss": 0.6675, "grad_norm": 0.8561240434646606, "learning_rate": 0.0002, "epoch": 2.7224961603750706, "step": 16840}, {"loss": 0.7135, "grad_norm": 0.7121599316596985, "learning_rate": 0.0002, "epoch": 2.7241128445558163, "step": 16850}, {"loss": 0.6825, "grad_norm": 0.8066257238388062, "learning_rate": 0.0002, "epoch": 2.7257295287365615, "step": 16860}, {"loss": 0.6839, "grad_norm": 0.7699271440505981, "learning_rate": 0.0002, "epoch": 2.7273462129173067, "step": 16870}, {"loss": 0.699, "grad_norm": 1.1828432083129883, "learning_rate": 0.0002, "epoch": 2.728962897098052, "step": 16880}, {"loss": 0.6518, "grad_norm": 0.9989302754402161, "learning_rate": 0.0002, "epoch": 2.730579581278797, "step": 16890}, {"loss": 0.7015, "grad_norm": 0.8100560307502747, "learning_rate": 0.0002, "epoch": 2.7321962654595424, "step": 16900}, {"loss": 0.6851, "grad_norm": 0.8615233898162842, "learning_rate": 0.0002, "epoch": 2.7338129496402876, "step": 16910}, {"loss": 0.6322, "grad_norm": 0.8633756041526794, "learning_rate": 0.0002, "epoch": 2.735429633821033, "step": 16920}, {"loss": 0.6488, "grad_norm": 0.7769348621368408, "learning_rate": 0.0002, "epoch": 2.737046318001778, "step": 16930}, {"loss": 0.6582, "grad_norm": 0.6943058371543884, "learning_rate": 0.0002, "epoch": 2.738663002182524, "step": 16940}, {"loss": 0.6516, "grad_norm": 0.8510736227035522, "learning_rate": 0.0002, "epoch": 2.740279686363269, "step": 16950}, {"loss": 0.7275, "grad_norm": 0.7732602953910828, "learning_rate": 0.0002, "epoch": 2.7418963705440142, "step": 16960}, {"loss": 0.6553, "grad_norm": 0.5981788635253906, "learning_rate": 0.0002, "epoch": 2.7435130547247595, "step": 16970}, {"loss": 0.6777, "grad_norm": 0.7604416012763977, "learning_rate": 0.0002, "epoch": 2.7451297389055047, "step": 16980}, {"loss": 0.6981, "grad_norm": 0.7377738356590271, "learning_rate": 0.0002, "epoch": 2.74674642308625, "step": 16990}, {"loss": 0.6294, "grad_norm": 0.9400289058685303, "learning_rate": 0.0002, "epoch": 2.7483631072669956, "step": 17000}, {"loss": 0.6952, "grad_norm": 0.6340599656105042, "learning_rate": 0.0002, "epoch": 2.749979791447741, "step": 17010}, {"loss": 0.7222, "grad_norm": 0.7297601103782654, "learning_rate": 0.0002, "epoch": 2.751596475628486, "step": 17020}, {"loss": 0.6659, "grad_norm": 0.9479979872703552, "learning_rate": 0.0002, "epoch": 2.7532131598092313, "step": 17030}, {"loss": 0.691, "grad_norm": 0.8461511135101318, "learning_rate": 0.0002, "epoch": 2.7548298439899765, "step": 17040}, {"loss": 0.6764, "grad_norm": 0.7477551698684692, "learning_rate": 0.0002, "epoch": 2.7564465281707218, "step": 17050}, {"loss": 0.684, "grad_norm": 1.019270420074463, "learning_rate": 0.0002, "epoch": 2.758063212351467, "step": 17060}, {"loss": 0.7119, "grad_norm": 0.7730235457420349, "learning_rate": 0.0002, "epoch": 2.7596798965322122, "step": 17070}, {"loss": 0.6886, "grad_norm": 0.8216866254806519, "learning_rate": 0.0002, "epoch": 2.7612965807129575, "step": 17080}, {"loss": 0.6811, "grad_norm": 0.7235931754112244, "learning_rate": 0.0002, "epoch": 2.762913264893703, "step": 17090}, {"loss": 0.7031, "grad_norm": 0.7352296710014343, "learning_rate": 0.0002, "epoch": 2.7645299490744484, "step": 17100}, {"loss": 0.6951, "grad_norm": 0.8129373788833618, "learning_rate": 0.0002, "epoch": 2.7661466332551936, "step": 17110}, {"loss": 0.6703, "grad_norm": 0.7387019991874695, "learning_rate": 0.0002, "epoch": 2.767763317435939, "step": 17120}, {"loss": 0.6789, "grad_norm": 0.9149190187454224, "learning_rate": 0.0002, "epoch": 2.769380001616684, "step": 17130}, {"loss": 0.6038, "grad_norm": 0.7352971434593201, "learning_rate": 0.0002, "epoch": 2.7709966857974297, "step": 17140}, {"loss": 0.6728, "grad_norm": 0.7903780341148376, "learning_rate": 0.0002, "epoch": 2.772613369978175, "step": 17150}, {"loss": 0.6988, "grad_norm": 0.8255927562713623, "learning_rate": 0.0002, "epoch": 2.77423005415892, "step": 17160}, {"loss": 0.6694, "grad_norm": 0.7235927581787109, "learning_rate": 0.0002, "epoch": 2.7758467383396654, "step": 17170}, {"loss": 0.7161, "grad_norm": 0.8281434774398804, "learning_rate": 0.0002, "epoch": 2.7774634225204107, "step": 17180}, {"loss": 0.682, "grad_norm": 0.7586921453475952, "learning_rate": 0.0002, "epoch": 2.779080106701156, "step": 17190}, {"loss": 0.6427, "grad_norm": 0.7161715030670166, "learning_rate": 0.0002, "epoch": 2.780696790881901, "step": 17200}, {"loss": 0.6426, "grad_norm": 0.762868344783783, "learning_rate": 0.0002, "epoch": 2.7823134750626464, "step": 17210}, {"loss": 0.705, "grad_norm": 0.9285483360290527, "learning_rate": 0.0002, "epoch": 2.7839301592433916, "step": 17220}, {"loss": 0.7084, "grad_norm": 0.6900462508201599, "learning_rate": 0.0002, "epoch": 2.785546843424137, "step": 17230}, {"loss": 0.6988, "grad_norm": 0.780384361743927, "learning_rate": 0.0002, "epoch": 2.7871635276048825, "step": 17240}, {"loss": 0.7073, "grad_norm": 0.7580406665802002, "learning_rate": 0.0002, "epoch": 2.7887802117856277, "step": 17250}, {"loss": 0.6833, "grad_norm": 0.8145199418067932, "learning_rate": 0.0002, "epoch": 2.790396895966373, "step": 17260}, {"loss": 0.6909, "grad_norm": 0.9159596562385559, "learning_rate": 0.0002, "epoch": 2.792013580147118, "step": 17270}, {"loss": 0.6008, "grad_norm": 0.9590014219284058, "learning_rate": 0.0002, "epoch": 2.7936302643278634, "step": 17280}, {"loss": 0.6704, "grad_norm": 0.7603529691696167, "learning_rate": 0.0002, "epoch": 2.795246948508609, "step": 17290}, {"loss": 0.7165, "grad_norm": 0.8039976358413696, "learning_rate": 0.0002, "epoch": 2.7968636326893543, "step": 17300}, {"loss": 0.7037, "grad_norm": 0.8364847302436829, "learning_rate": 0.0002, "epoch": 2.7984803168700996, "step": 17310}, {"loss": 0.6749, "grad_norm": 0.8763046860694885, "learning_rate": 0.0002, "epoch": 2.800097001050845, "step": 17320}, {"loss": 0.6844, "grad_norm": 0.8409647941589355, "learning_rate": 0.0002, "epoch": 2.80171368523159, "step": 17330}, {"loss": 0.6936, "grad_norm": 0.7649006247520447, "learning_rate": 0.0002, "epoch": 2.8033303694123353, "step": 17340}, {"loss": 0.7051, "grad_norm": 0.7970262169837952, "learning_rate": 0.0002, "epoch": 2.8049470535930805, "step": 17350}, {"loss": 0.6533, "grad_norm": 0.9088607430458069, "learning_rate": 0.0002, "epoch": 2.8065637377738257, "step": 17360}, {"loss": 0.675, "grad_norm": 0.6454846858978271, "learning_rate": 0.0002, "epoch": 2.808180421954571, "step": 17370}, {"loss": 0.7069, "grad_norm": 0.7744787931442261, "learning_rate": 0.0002, "epoch": 2.809797106135316, "step": 17380}, {"loss": 0.6772, "grad_norm": 0.6678640842437744, "learning_rate": 0.0002, "epoch": 2.811413790316062, "step": 17390}, {"loss": 0.6784, "grad_norm": 0.772676944732666, "learning_rate": 0.0002, "epoch": 2.813030474496807, "step": 17400}, {"loss": 0.7252, "grad_norm": 0.7088175415992737, "learning_rate": 0.0002, "epoch": 2.8146471586775523, "step": 17410}, {"loss": 0.7086, "grad_norm": 0.8280573487281799, "learning_rate": 0.0002, "epoch": 2.8162638428582976, "step": 17420}, {"loss": 0.6732, "grad_norm": 0.6665388345718384, "learning_rate": 0.0002, "epoch": 2.817880527039043, "step": 17430}, {"loss": 0.6675, "grad_norm": 0.6427883505821228, "learning_rate": 0.0002, "epoch": 2.8194972112197885, "step": 17440}, {"loss": 0.6972, "grad_norm": 0.9697760343551636, "learning_rate": 0.0002, "epoch": 2.8211138954005337, "step": 17450}, {"loss": 0.6838, "grad_norm": 0.7573966383934021, "learning_rate": 0.0002, "epoch": 2.822730579581279, "step": 17460}, {"loss": 0.7243, "grad_norm": 0.878688633441925, "learning_rate": 0.0002, "epoch": 2.824347263762024, "step": 17470}, {"loss": 0.6666, "grad_norm": 0.7752242684364319, "learning_rate": 0.0002, "epoch": 2.8259639479427694, "step": 17480}, {"loss": 0.6638, "grad_norm": 0.6135398745536804, "learning_rate": 0.0002, "epoch": 2.8275806321235146, "step": 17490}, {"loss": 0.6829, "grad_norm": 0.6924924850463867, "learning_rate": 0.0002, "epoch": 2.82919731630426, "step": 17500}, {"loss": 0.6731, "grad_norm": 0.7471627593040466, "learning_rate": 0.0002, "epoch": 2.830814000485005, "step": 17510}, {"loss": 0.7016, "grad_norm": 0.7145499587059021, "learning_rate": 0.0002, "epoch": 2.8324306846657503, "step": 17520}, {"loss": 0.6787, "grad_norm": 0.7415414452552795, "learning_rate": 0.0002, "epoch": 2.834047368846496, "step": 17530}, {"loss": 0.6811, "grad_norm": 0.7328441739082336, "learning_rate": 0.0002, "epoch": 2.8356640530272412, "step": 17540}, {"loss": 0.6866, "grad_norm": 0.8267839550971985, "learning_rate": 0.0002, "epoch": 2.8372807372079865, "step": 17550}, {"loss": 0.6787, "grad_norm": 0.8877885341644287, "learning_rate": 0.0002, "epoch": 2.8388974213887317, "step": 17560}, {"loss": 0.7136, "grad_norm": 0.857138454914093, "learning_rate": 0.0002, "epoch": 2.840514105569477, "step": 17570}, {"loss": 0.6454, "grad_norm": 0.8470779657363892, "learning_rate": 0.0002, "epoch": 2.842130789750222, "step": 17580}, {"loss": 0.6976, "grad_norm": 0.8553254008293152, "learning_rate": 0.0002, "epoch": 2.843747473930968, "step": 17590}, {"loss": 0.7297, "grad_norm": 0.8033196926116943, "learning_rate": 0.0002, "epoch": 2.845364158111713, "step": 17600}, {"loss": 0.7062, "grad_norm": 0.7949087023735046, "learning_rate": 0.0002, "epoch": 2.8469808422924583, "step": 17610}, {"loss": 0.651, "grad_norm": 0.9241406321525574, "learning_rate": 0.0002, "epoch": 2.8485975264732035, "step": 17620}, {"loss": 0.6601, "grad_norm": 0.7721285223960876, "learning_rate": 0.0002, "epoch": 2.8502142106539488, "step": 17630}, {"loss": 0.6183, "grad_norm": 1.0246692895889282, "learning_rate": 0.0002, "epoch": 2.851830894834694, "step": 17640}, {"loss": 0.7007, "grad_norm": 0.9244589805603027, "learning_rate": 0.0002, "epoch": 2.853447579015439, "step": 17650}, {"loss": 0.7274, "grad_norm": 0.7243508696556091, "learning_rate": 0.0002, "epoch": 2.8550642631961844, "step": 17660}, {"loss": 0.6471, "grad_norm": 0.8943371176719666, "learning_rate": 0.0002, "epoch": 2.8566809473769297, "step": 17670}, {"loss": 0.686, "grad_norm": 0.6531758904457092, "learning_rate": 0.0002, "epoch": 2.8582976315576754, "step": 17680}, {"loss": 0.6253, "grad_norm": 0.8367000818252563, "learning_rate": 0.0002, "epoch": 2.8599143157384206, "step": 17690}, {"loss": 0.6943, "grad_norm": 0.7868556380271912, "learning_rate": 0.0002, "epoch": 2.861530999919166, "step": 17700}, {"loss": 0.6919, "grad_norm": 0.7213859558105469, "learning_rate": 0.0002, "epoch": 2.863147684099911, "step": 17710}, {"loss": 0.6657, "grad_norm": 0.7383931279182434, "learning_rate": 0.0002, "epoch": 2.8647643682806563, "step": 17720}, {"loss": 0.6841, "grad_norm": 0.7566812634468079, "learning_rate": 0.0002, "epoch": 2.8663810524614015, "step": 17730}, {"loss": 0.6449, "grad_norm": 0.6930373311042786, "learning_rate": 0.0002, "epoch": 2.867997736642147, "step": 17740}, {"loss": 0.6764, "grad_norm": 0.7911090850830078, "learning_rate": 0.0002, "epoch": 2.8696144208228924, "step": 17750}, {"loss": 0.6554, "grad_norm": 0.8484548926353455, "learning_rate": 0.0002, "epoch": 2.8712311050036377, "step": 17760}, {"loss": 0.6931, "grad_norm": 0.7647597193717957, "learning_rate": 0.0002, "epoch": 2.872847789184383, "step": 17770}, {"loss": 0.6945, "grad_norm": 0.8791151642799377, "learning_rate": 0.0002, "epoch": 2.874464473365128, "step": 17780}, {"loss": 0.7078, "grad_norm": 0.7253178358078003, "learning_rate": 0.0002, "epoch": 2.8760811575458733, "step": 17790}, {"loss": 0.6474, "grad_norm": 0.7956077456474304, "learning_rate": 0.0002, "epoch": 2.8776978417266186, "step": 17800}, {"loss": 0.6687, "grad_norm": 0.8657688498497009, "learning_rate": 0.0002, "epoch": 2.879314525907364, "step": 17810}, {"loss": 0.7171, "grad_norm": 0.7059141993522644, "learning_rate": 0.0002, "epoch": 2.880931210088109, "step": 17820}, {"loss": 0.683, "grad_norm": 0.8886896967887878, "learning_rate": 0.0002, "epoch": 2.8825478942688547, "step": 17830}, {"loss": 0.669, "grad_norm": 0.821032702922821, "learning_rate": 0.0002, "epoch": 2.8841645784496, "step": 17840}, {"loss": 0.6805, "grad_norm": 0.7183963656425476, "learning_rate": 0.0002, "epoch": 2.885781262630345, "step": 17850}, {"loss": 0.7088, "grad_norm": 0.6222899556159973, "learning_rate": 0.0002, "epoch": 2.8873979468110904, "step": 17860}, {"loss": 0.6626, "grad_norm": 0.8187434077262878, "learning_rate": 0.0002, "epoch": 2.8890146309918356, "step": 17870}, {"loss": 0.6815, "grad_norm": 0.9838479161262512, "learning_rate": 0.0002, "epoch": 2.890631315172581, "step": 17880}, {"loss": 0.6967, "grad_norm": 0.7567742466926575, "learning_rate": 0.0002, "epoch": 2.8922479993533265, "step": 17890}, {"loss": 0.7073, "grad_norm": 0.6875903606414795, "learning_rate": 0.0002, "epoch": 2.893864683534072, "step": 17900}, {"loss": 0.6415, "grad_norm": 0.8043789267539978, "learning_rate": 0.0002, "epoch": 2.895481367714817, "step": 17910}, {"loss": 0.6588, "grad_norm": 0.8062626719474792, "learning_rate": 0.0002, "epoch": 2.8970980518955622, "step": 17920}, {"loss": 0.7151, "grad_norm": 1.0251191854476929, "learning_rate": 0.0002, "epoch": 2.8987147360763075, "step": 17930}, {"loss": 0.6605, "grad_norm": 0.882253110408783, "learning_rate": 0.0002, "epoch": 2.9003314202570527, "step": 17940}, {"loss": 0.6719, "grad_norm": 0.8683299422264099, "learning_rate": 0.0002, "epoch": 2.901948104437798, "step": 17950}, {"loss": 0.6896, "grad_norm": 0.7167282104492188, "learning_rate": 0.0002, "epoch": 2.903564788618543, "step": 17960}, {"loss": 0.663, "grad_norm": 0.7093694806098938, "learning_rate": 0.0002, "epoch": 2.9051814727992884, "step": 17970}, {"loss": 0.6591, "grad_norm": 0.8549879193305969, "learning_rate": 0.0002, "epoch": 2.906798156980034, "step": 17980}, {"loss": 0.6962, "grad_norm": 0.6989606618881226, "learning_rate": 0.0002, "epoch": 2.9084148411607793, "step": 17990}, {"loss": 0.6635, "grad_norm": 0.9482976794242859, "learning_rate": 0.0002, "epoch": 2.9100315253415245, "step": 18000}, {"loss": 0.6586, "grad_norm": 0.7182440161705017, "learning_rate": 0.0002, "epoch": 2.9116482095222698, "step": 18010}, {"loss": 0.6827, "grad_norm": 0.7732226252555847, "learning_rate": 0.0002, "epoch": 2.913264893703015, "step": 18020}, {"loss": 0.7123, "grad_norm": 0.7936875224113464, "learning_rate": 0.0002, "epoch": 2.9148815778837607, "step": 18030}, {"loss": 0.6736, "grad_norm": 0.8825615644454956, "learning_rate": 0.0002, "epoch": 2.916498262064506, "step": 18040}, {"loss": 0.7139, "grad_norm": 0.6778587102890015, "learning_rate": 0.0002, "epoch": 2.918114946245251, "step": 18050}, {"loss": 0.6588, "grad_norm": 0.7529265880584717, "learning_rate": 0.0002, "epoch": 2.9197316304259964, "step": 18060}, {"loss": 0.737, "grad_norm": 0.7111883163452148, "learning_rate": 0.0002, "epoch": 2.9213483146067416, "step": 18070}, {"loss": 0.7475, "grad_norm": 0.7214767932891846, "learning_rate": 0.0002, "epoch": 2.922964998787487, "step": 18080}, {"loss": 0.6672, "grad_norm": 0.800417423248291, "learning_rate": 0.0002, "epoch": 2.924581682968232, "step": 18090}, {"loss": 0.6694, "grad_norm": 1.248575210571289, "learning_rate": 0.0002, "epoch": 2.9261983671489773, "step": 18100}, {"loss": 0.7004, "grad_norm": 0.757788360118866, "learning_rate": 0.0002, "epoch": 2.9278150513297225, "step": 18110}, {"loss": 0.6999, "grad_norm": 1.0583995580673218, "learning_rate": 0.0002, "epoch": 2.9294317355104678, "step": 18120}, {"loss": 0.6365, "grad_norm": 0.8228777647018433, "learning_rate": 0.0002, "epoch": 2.9310484196912134, "step": 18130}, {"loss": 0.6791, "grad_norm": 0.8374035358428955, "learning_rate": 0.0002, "epoch": 2.9326651038719587, "step": 18140}, {"loss": 0.6399, "grad_norm": 0.7976473569869995, "learning_rate": 0.0002, "epoch": 2.934281788052704, "step": 18150}, {"loss": 0.6585, "grad_norm": 0.8009907603263855, "learning_rate": 0.0002, "epoch": 2.935898472233449, "step": 18160}, {"loss": 0.7485, "grad_norm": 0.835213303565979, "learning_rate": 0.0002, "epoch": 2.9375151564141944, "step": 18170}, {"loss": 0.7376, "grad_norm": 0.7982219457626343, "learning_rate": 0.0002, "epoch": 2.93913184059494, "step": 18180}, {"loss": 0.6348, "grad_norm": 0.7070978879928589, "learning_rate": 0.0002, "epoch": 2.9407485247756853, "step": 18190}, {"loss": 0.6608, "grad_norm": 0.8619440197944641, "learning_rate": 0.0002, "epoch": 2.9423652089564305, "step": 18200}, {"loss": 0.666, "grad_norm": 0.6693987250328064, "learning_rate": 0.0002, "epoch": 2.9439818931371757, "step": 18210}, {"loss": 0.728, "grad_norm": 0.6747021079063416, "learning_rate": 0.0002, "epoch": 2.945598577317921, "step": 18220}, {"loss": 0.6686, "grad_norm": 0.860387921333313, "learning_rate": 0.0002, "epoch": 2.947215261498666, "step": 18230}, {"loss": 0.6945, "grad_norm": 0.799976646900177, "learning_rate": 0.0002, "epoch": 2.9488319456794114, "step": 18240}, {"loss": 0.7243, "grad_norm": 0.7864769101142883, "learning_rate": 0.0002, "epoch": 2.9504486298601567, "step": 18250}, {"loss": 0.6785, "grad_norm": 0.6713884472846985, "learning_rate": 0.0002, "epoch": 2.952065314040902, "step": 18260}, {"loss": 0.7429, "grad_norm": 0.9031508564949036, "learning_rate": 0.0002, "epoch": 2.9536819982216476, "step": 18270}, {"loss": 0.7055, "grad_norm": 0.7205073237419128, "learning_rate": 0.0002, "epoch": 2.955298682402393, "step": 18280}, {"loss": 0.7298, "grad_norm": 0.7746205925941467, "learning_rate": 0.0002, "epoch": 2.956915366583138, "step": 18290}, {"loss": 0.6218, "grad_norm": 0.6533427834510803, "learning_rate": 0.0002, "epoch": 2.9585320507638833, "step": 18300}, {"loss": 0.6674, "grad_norm": 0.9083208441734314, "learning_rate": 0.0002, "epoch": 2.9601487349446285, "step": 18310}, {"loss": 0.7359, "grad_norm": 0.7446991801261902, "learning_rate": 0.0002, "epoch": 2.9617654191253737, "step": 18320}, {"loss": 0.6738, "grad_norm": 0.6514461636543274, "learning_rate": 0.0002, "epoch": 2.9633821033061194, "step": 18330}, {"loss": 0.6677, "grad_norm": 0.8580465912818909, "learning_rate": 0.0002, "epoch": 2.9649987874868646, "step": 18340}, {"loss": 0.6971, "grad_norm": 0.7074266076087952, "learning_rate": 0.0002, "epoch": 2.96661547166761, "step": 18350}, {"loss": 0.6804, "grad_norm": 0.899892270565033, "learning_rate": 0.0002, "epoch": 2.968232155848355, "step": 18360}, {"loss": 0.7094, "grad_norm": 0.8217641711235046, "learning_rate": 0.0002, "epoch": 2.9698488400291003, "step": 18370}, {"loss": 0.6916, "grad_norm": 0.8611799478530884, "learning_rate": 0.0002, "epoch": 2.9714655242098456, "step": 18380}, {"loss": 0.6677, "grad_norm": 0.6909302473068237, "learning_rate": 0.0002, "epoch": 2.973082208390591, "step": 18390}, {"loss": 0.7247, "grad_norm": 0.6554358005523682, "learning_rate": 0.0002, "epoch": 2.974698892571336, "step": 18400}, {"loss": 0.6516, "grad_norm": 0.7803071737289429, "learning_rate": 0.0002, "epoch": 2.9763155767520812, "step": 18410}, {"loss": 0.7322, "grad_norm": 0.7838954925537109, "learning_rate": 0.0002, "epoch": 2.977932260932827, "step": 18420}, {"loss": 0.6522, "grad_norm": 0.7098495364189148, "learning_rate": 0.0002, "epoch": 2.979548945113572, "step": 18430}, {"loss": 0.739, "grad_norm": 0.8981785774230957, "learning_rate": 0.0002, "epoch": 2.9811656292943174, "step": 18440}, {"loss": 0.6689, "grad_norm": 0.7197171449661255, "learning_rate": 0.0002, "epoch": 2.9827823134750626, "step": 18450}, {"loss": 0.706, "grad_norm": 0.793185293674469, "learning_rate": 0.0002, "epoch": 2.984398997655808, "step": 18460}, {"loss": 0.7124, "grad_norm": 0.8531473875045776, "learning_rate": 0.0002, "epoch": 2.986015681836553, "step": 18470}, {"loss": 0.6901, "grad_norm": 0.6627361178398132, "learning_rate": 0.0002, "epoch": 2.9876323660172988, "step": 18480}, {"loss": 0.6591, "grad_norm": 0.5708155035972595, "learning_rate": 0.0002, "epoch": 2.989249050198044, "step": 18490}, {"loss": 0.6725, "grad_norm": 0.8227280378341675, "learning_rate": 0.0002, "epoch": 2.990865734378789, "step": 18500}, {"loss": 0.6701, "grad_norm": 0.7102749943733215, "learning_rate": 0.0002, "epoch": 2.9924824185595345, "step": 18510}, {"loss": 0.7091, "grad_norm": 0.839485228061676, "learning_rate": 0.0002, "epoch": 2.9940991027402797, "step": 18520}, {"loss": 0.6521, "grad_norm": 0.9038704037666321, "learning_rate": 0.0002, "epoch": 2.995715786921025, "step": 18530}, {"loss": 0.7186, "grad_norm": 0.8737510442733765, "learning_rate": 0.0002, "epoch": 2.99733247110177, "step": 18540}, {"loss": 0.6819, "grad_norm": 0.7323142886161804, "learning_rate": 0.0002, "epoch": 2.9989491552825154, "step": 18550}, {"eval_loss": 1.1262480020523071, "eval_runtime": 122.0868, "eval_samples_per_second": 6.004, "eval_steps_per_second": 0.754, "epoch": 2.9999191657909625, "step": 18556}, {"loss": 0.6337, "grad_norm": 0.8465463519096375, "learning_rate": 0.0002, "epoch": 3.000565839463261, "step": 18560}, {"loss": 0.6064, "grad_norm": 0.9134138822555542, "learning_rate": 0.0002, "epoch": 3.0021825236440063, "step": 18570}, {"loss": 0.5804, "grad_norm": 0.760715126991272, "learning_rate": 0.0002, "epoch": 3.0037992078247515, "step": 18580}, {"loss": 0.5571, "grad_norm": 0.9208743572235107, "learning_rate": 0.0002, "epoch": 3.0054158920054967, "step": 18590}, {"loss": 0.5731, "grad_norm": 0.9232364892959595, "learning_rate": 0.0002, "epoch": 3.007032576186242, "step": 18600}, {"loss": 0.6299, "grad_norm": 1.1881544589996338, "learning_rate": 0.0002, "epoch": 3.008649260366987, "step": 18610}, {"loss": 0.5482, "grad_norm": 0.9372987747192383, "learning_rate": 0.0002, "epoch": 3.0102659445477324, "step": 18620}, {"loss": 0.5709, "grad_norm": 0.6900241374969482, "learning_rate": 0.0002, "epoch": 3.0118826287284777, "step": 18630}, {"loss": 0.5256, "grad_norm": 0.8451071381568909, "learning_rate": 0.0002, "epoch": 3.0134993129092233, "step": 18640}, {"loss": 0.5916, "grad_norm": 0.7763112187385559, "learning_rate": 0.0002, "epoch": 3.0151159970899686, "step": 18650}, {"loss": 0.6095, "grad_norm": 1.043653964996338, "learning_rate": 0.0002, "epoch": 3.016732681270714, "step": 18660}, {"loss": 0.6228, "grad_norm": 1.0170660018920898, "learning_rate": 0.0002, "epoch": 3.018349365451459, "step": 18670}, {"loss": 0.5671, "grad_norm": 0.7534180283546448, "learning_rate": 0.0002, "epoch": 3.0199660496322043, "step": 18680}, {"loss": 0.6015, "grad_norm": 0.7507367730140686, "learning_rate": 0.0002, "epoch": 3.0215827338129495, "step": 18690}, {"loss": 0.6201, "grad_norm": 0.7861620187759399, "learning_rate": 0.0002, "epoch": 3.0231994179936947, "step": 18700}, {"loss": 0.5802, "grad_norm": 1.0580339431762695, "learning_rate": 0.0002, "epoch": 3.0248161021744404, "step": 18710}, {"loss": 0.5975, "grad_norm": 0.7542710900306702, "learning_rate": 0.0002, "epoch": 3.0264327863551856, "step": 18720}, {"loss": 0.5695, "grad_norm": 0.8189544677734375, "learning_rate": 0.0002, "epoch": 3.028049470535931, "step": 18730}, {"loss": 0.6109, "grad_norm": 0.9126611351966858, "learning_rate": 0.0002, "epoch": 3.029666154716676, "step": 18740}, {"loss": 0.6443, "grad_norm": 0.8891341686248779, "learning_rate": 0.0002, "epoch": 3.0312828388974213, "step": 18750}, {"loss": 0.6207, "grad_norm": 0.8419283032417297, "learning_rate": 0.0002, "epoch": 3.0328995230781666, "step": 18760}, {"loss": 0.5818, "grad_norm": 0.8048048615455627, "learning_rate": 0.0002, "epoch": 3.034516207258912, "step": 18770}, {"loss": 0.6381, "grad_norm": 0.7820217609405518, "learning_rate": 0.0002, "epoch": 3.0361328914396575, "step": 18780}, {"loss": 0.5843, "grad_norm": 0.854721188545227, "learning_rate": 0.0002, "epoch": 3.0377495756204027, "step": 18790}, {"loss": 0.5784, "grad_norm": 0.912092924118042, "learning_rate": 0.0002, "epoch": 3.039366259801148, "step": 18800}, {"loss": 0.5734, "grad_norm": 0.6596226096153259, "learning_rate": 0.0002, "epoch": 3.040982943981893, "step": 18810}, {"loss": 0.5969, "grad_norm": 0.6351348757743835, "learning_rate": 0.0002, "epoch": 3.0425996281626384, "step": 18820}, {"loss": 0.5953, "grad_norm": 0.778188943862915, "learning_rate": 0.0002, "epoch": 3.0442163123433836, "step": 18830}, {"loss": 0.602, "grad_norm": 0.68234783411026, "learning_rate": 0.0002, "epoch": 3.045832996524129, "step": 18840}, {"loss": 0.5785, "grad_norm": 0.998628556728363, "learning_rate": 0.0002, "epoch": 3.047449680704874, "step": 18850}, {"loss": 0.6231, "grad_norm": 0.7393841743469238, "learning_rate": 0.0002, "epoch": 3.0490663648856198, "step": 18860}, {"loss": 0.568, "grad_norm": 0.84438556432724, "learning_rate": 0.0002, "epoch": 3.050683049066365, "step": 18870}, {"loss": 0.6205, "grad_norm": 0.8857501745223999, "learning_rate": 0.0002, "epoch": 3.0522997332471102, "step": 18880}, {"loss": 0.6335, "grad_norm": 0.7208474278450012, "learning_rate": 0.0002, "epoch": 3.0539164174278555, "step": 18890}, {"loss": 0.5998, "grad_norm": 0.7135229110717773, "learning_rate": 0.0002, "epoch": 3.0555331016086007, "step": 18900}, {"loss": 0.5575, "grad_norm": 0.9130001664161682, "learning_rate": 0.0002, "epoch": 3.057149785789346, "step": 18910}, {"loss": 0.5955, "grad_norm": 0.9001716375350952, "learning_rate": 0.0002, "epoch": 3.058766469970091, "step": 18920}, {"loss": 0.6052, "grad_norm": 0.8667559623718262, "learning_rate": 0.0002, "epoch": 3.060383154150837, "step": 18930}, {"loss": 0.5818, "grad_norm": 0.8943959474563599, "learning_rate": 0.0002, "epoch": 3.061999838331582, "step": 18940}, {"loss": 0.5978, "grad_norm": 0.8298377990722656, "learning_rate": 0.0002, "epoch": 3.0636165225123273, "step": 18950}, {"loss": 0.5782, "grad_norm": 0.7935267686843872, "learning_rate": 0.0002, "epoch": 3.0652332066930725, "step": 18960}, {"loss": 0.6434, "grad_norm": 1.1506379842758179, "learning_rate": 0.0002, "epoch": 3.0668498908738178, "step": 18970}, {"loss": 0.5571, "grad_norm": 0.7693049907684326, "learning_rate": 0.0002, "epoch": 3.068466575054563, "step": 18980}, {"loss": 0.5971, "grad_norm": 0.8040135502815247, "learning_rate": 0.0002, "epoch": 3.0700832592353082, "step": 18990}, {"loss": 0.5541, "grad_norm": 0.828404426574707, "learning_rate": 0.0002, "epoch": 3.0716999434160535, "step": 19000}, {"loss": 0.6048, "grad_norm": 0.8811164498329163, "learning_rate": 0.0002, "epoch": 3.073316627596799, "step": 19010}, {"loss": 0.5845, "grad_norm": 1.036205768585205, "learning_rate": 0.0002, "epoch": 3.0749333117775444, "step": 19020}, {"loss": 0.5838, "grad_norm": 0.8857285976409912, "learning_rate": 0.0002, "epoch": 3.0765499959582896, "step": 19030}, {"loss": 0.592, "grad_norm": 0.8392079472541809, "learning_rate": 0.0002, "epoch": 3.078166680139035, "step": 19040}, {"loss": 0.5927, "grad_norm": 1.0287401676177979, "learning_rate": 0.0002, "epoch": 3.07978336431978, "step": 19050}, {"loss": 0.5964, "grad_norm": 1.0086315870285034, "learning_rate": 0.0002, "epoch": 3.0814000485005253, "step": 19060}, {"loss": 0.5567, "grad_norm": 0.9245324730873108, "learning_rate": 0.0002, "epoch": 3.0830167326812705, "step": 19070}, {"loss": 0.5797, "grad_norm": 0.8680877089500427, "learning_rate": 0.0002, "epoch": 3.084633416862016, "step": 19080}, {"loss": 0.5611, "grad_norm": 0.8814793825149536, "learning_rate": 0.0002, "epoch": 3.0862501010427614, "step": 19090}, {"loss": 0.6051, "grad_norm": 0.9234458208084106, "learning_rate": 0.0002, "epoch": 3.0878667852235067, "step": 19100}, {"loss": 0.6209, "grad_norm": 1.1291664838790894, "learning_rate": 0.0002, "epoch": 3.089483469404252, "step": 19110}, {"loss": 0.5695, "grad_norm": 0.9191402792930603, "learning_rate": 0.0002, "epoch": 3.091100153584997, "step": 19120}, {"loss": 0.5856, "grad_norm": 0.7103154063224792, "learning_rate": 0.0002, "epoch": 3.0927168377657424, "step": 19130}, {"loss": 0.6479, "grad_norm": 0.9368883967399597, "learning_rate": 0.0002, "epoch": 3.0943335219464876, "step": 19140}, {"loss": 0.6167, "grad_norm": 0.9676656723022461, "learning_rate": 0.0002, "epoch": 3.095950206127233, "step": 19150}, {"loss": 0.5794, "grad_norm": 0.8739792704582214, "learning_rate": 0.0002, "epoch": 3.0975668903079785, "step": 19160}, {"loss": 0.6112, "grad_norm": 0.8530174493789673, "learning_rate": 0.0002, "epoch": 3.0991835744887237, "step": 19170}, {"loss": 0.6568, "grad_norm": 0.794945478439331, "learning_rate": 0.0002, "epoch": 3.100800258669469, "step": 19180}, {"loss": 0.5928, "grad_norm": 0.9508888125419617, "learning_rate": 0.0002, "epoch": 3.102416942850214, "step": 19190}, {"loss": 0.5757, "grad_norm": 1.0599955320358276, "learning_rate": 0.0002, "epoch": 3.1040336270309594, "step": 19200}, {"loss": 0.6151, "grad_norm": 1.0673625469207764, "learning_rate": 0.0002, "epoch": 3.1056503112117047, "step": 19210}, {"loss": 0.6043, "grad_norm": 0.7739115953445435, "learning_rate": 0.0002, "epoch": 3.10726699539245, "step": 19220}, {"loss": 0.6046, "grad_norm": 0.9884951114654541, "learning_rate": 0.0002, "epoch": 3.1088836795731956, "step": 19230}, {"loss": 0.5932, "grad_norm": 0.862260103225708, "learning_rate": 0.0002, "epoch": 3.110500363753941, "step": 19240}, {"loss": 0.6098, "grad_norm": 0.7690284848213196, "learning_rate": 0.0002, "epoch": 3.112117047934686, "step": 19250}, {"loss": 0.5791, "grad_norm": 0.8758958578109741, "learning_rate": 0.0002, "epoch": 3.1137337321154313, "step": 19260}, {"loss": 0.6136, "grad_norm": 1.0356395244598389, "learning_rate": 0.0002, "epoch": 3.1153504162961765, "step": 19270}, {"loss": 0.6159, "grad_norm": 0.6950937509536743, "learning_rate": 0.0002, "epoch": 3.1169671004769217, "step": 19280}, {"loss": 0.592, "grad_norm": 0.760998010635376, "learning_rate": 0.0002, "epoch": 3.118583784657667, "step": 19290}, {"loss": 0.575, "grad_norm": 0.9335789084434509, "learning_rate": 0.0002, "epoch": 3.1202004688384126, "step": 19300}, {"loss": 0.6139, "grad_norm": 0.9636204242706299, "learning_rate": 0.0002, "epoch": 3.121817153019158, "step": 19310}, {"loss": 0.6001, "grad_norm": 1.0820997953414917, "learning_rate": 0.0002, "epoch": 3.123433837199903, "step": 19320}, {"loss": 0.6542, "grad_norm": 0.7333487272262573, "learning_rate": 0.0002, "epoch": 3.1250505213806483, "step": 19330}, {"loss": 0.6178, "grad_norm": 1.0417509078979492, "learning_rate": 0.0002, "epoch": 3.1266672055613935, "step": 19340}, {"loss": 0.603, "grad_norm": 0.9267749190330505, "learning_rate": 0.0002, "epoch": 3.128283889742139, "step": 19350}, {"loss": 0.6063, "grad_norm": 0.777798593044281, "learning_rate": 0.0002, "epoch": 3.129900573922884, "step": 19360}, {"loss": 0.5913, "grad_norm": 0.8425456881523132, "learning_rate": 0.0002, "epoch": 3.1315172581036297, "step": 19370}, {"loss": 0.6042, "grad_norm": 0.9617102146148682, "learning_rate": 0.0002, "epoch": 3.133133942284375, "step": 19380}, {"loss": 0.633, "grad_norm": 1.0052828788757324, "learning_rate": 0.0002, "epoch": 3.13475062646512, "step": 19390}, {"loss": 0.5713, "grad_norm": 0.7637009024620056, "learning_rate": 0.0002, "epoch": 3.1363673106458654, "step": 19400}, {"loss": 0.5497, "grad_norm": 0.7958088517189026, "learning_rate": 0.0002, "epoch": 3.1379839948266106, "step": 19410}, {"loss": 0.6283, "grad_norm": 0.9161727428436279, "learning_rate": 0.0002, "epoch": 3.139600679007356, "step": 19420}, {"loss": 0.5638, "grad_norm": 0.8402149677276611, "learning_rate": 0.0002, "epoch": 3.141217363188101, "step": 19430}, {"loss": 0.5848, "grad_norm": 1.0056525468826294, "learning_rate": 0.0002, "epoch": 3.1428340473688463, "step": 19440}, {"loss": 0.5954, "grad_norm": 1.0129190683364868, "learning_rate": 0.0002, "epoch": 3.144450731549592, "step": 19450}, {"loss": 0.5808, "grad_norm": 0.790825366973877, "learning_rate": 0.0002, "epoch": 3.146067415730337, "step": 19460}, {"loss": 0.5607, "grad_norm": 1.441665530204773, "learning_rate": 0.0002, "epoch": 3.1476840999110824, "step": 19470}, {"loss": 0.5785, "grad_norm": 0.7846331596374512, "learning_rate": 0.0002, "epoch": 3.1493007840918277, "step": 19480}, {"loss": 0.5892, "grad_norm": 0.7915332913398743, "learning_rate": 0.0002, "epoch": 3.150917468272573, "step": 19490}, {"loss": 0.5759, "grad_norm": 0.933982253074646, "learning_rate": 0.0002, "epoch": 3.152534152453318, "step": 19500}, {"loss": 0.6206, "grad_norm": 1.038408637046814, "learning_rate": 0.0002, "epoch": 3.1541508366340634, "step": 19510}, {"loss": 0.6271, "grad_norm": 1.018935203552246, "learning_rate": 0.0002, "epoch": 3.155767520814809, "step": 19520}, {"loss": 0.6173, "grad_norm": 0.9618112444877625, "learning_rate": 0.0002, "epoch": 3.1573842049955543, "step": 19530}, {"loss": 0.5972, "grad_norm": 0.8900452852249146, "learning_rate": 0.0002, "epoch": 3.1590008891762995, "step": 19540}, {"loss": 0.5925, "grad_norm": 0.8254160284996033, "learning_rate": 0.0002, "epoch": 3.1606175733570447, "step": 19550}, {"loss": 0.625, "grad_norm": 1.004376769065857, "learning_rate": 0.0002, "epoch": 3.16223425753779, "step": 19560}, {"loss": 0.5775, "grad_norm": 1.0490446090698242, "learning_rate": 0.0002, "epoch": 3.163850941718535, "step": 19570}, {"loss": 0.5986, "grad_norm": 0.7387403845787048, "learning_rate": 0.0002, "epoch": 3.1654676258992804, "step": 19580}, {"loss": 0.5898, "grad_norm": 0.7611538171768188, "learning_rate": 0.0002, "epoch": 3.1670843100800257, "step": 19590}, {"loss": 0.5937, "grad_norm": 0.8239886164665222, "learning_rate": 0.0002, "epoch": 3.1687009942607713, "step": 19600}, {"loss": 0.6068, "grad_norm": 0.9327243566513062, "learning_rate": 0.0002, "epoch": 3.1703176784415166, "step": 19610}, {"loss": 0.572, "grad_norm": 0.9662560224533081, "learning_rate": 0.0002, "epoch": 3.171934362622262, "step": 19620}, {"loss": 0.5988, "grad_norm": 0.9183341860771179, "learning_rate": 0.0002, "epoch": 3.173551046803007, "step": 19630}, {"loss": 0.5909, "grad_norm": 0.875066876411438, "learning_rate": 0.0002, "epoch": 3.1751677309837523, "step": 19640}, {"loss": 0.5956, "grad_norm": 0.8567508459091187, "learning_rate": 0.0002, "epoch": 3.1767844151644975, "step": 19650}, {"loss": 0.5805, "grad_norm": 0.6805780529975891, "learning_rate": 0.0002, "epoch": 3.1784010993452427, "step": 19660}, {"loss": 0.6204, "grad_norm": 0.8776944279670715, "learning_rate": 0.0002, "epoch": 3.1800177835259884, "step": 19670}, {"loss": 0.6108, "grad_norm": 0.9036329984664917, "learning_rate": 0.0002, "epoch": 3.1816344677067336, "step": 19680}, {"loss": 0.6238, "grad_norm": 0.8527372479438782, "learning_rate": 0.0002, "epoch": 3.183251151887479, "step": 19690}, {"loss": 0.6089, "grad_norm": 1.1045585870742798, "learning_rate": 0.0002, "epoch": 3.184867836068224, "step": 19700}, {"loss": 0.5491, "grad_norm": 0.9213830828666687, "learning_rate": 0.0002, "epoch": 3.1864845202489693, "step": 19710}, {"loss": 0.618, "grad_norm": 0.8865814805030823, "learning_rate": 0.0002, "epoch": 3.1881012044297146, "step": 19720}, {"loss": 0.5785, "grad_norm": 0.7939388751983643, "learning_rate": 0.0002, "epoch": 3.18971788861046, "step": 19730}, {"loss": 0.5682, "grad_norm": 0.6966729760169983, "learning_rate": 0.0002, "epoch": 3.191334572791205, "step": 19740}, {"loss": 0.5839, "grad_norm": 0.8023673295974731, "learning_rate": 0.0002, "epoch": 3.1929512569719507, "step": 19750}, {"loss": 0.6267, "grad_norm": 0.7992037534713745, "learning_rate": 0.0002, "epoch": 3.194567941152696, "step": 19760}, {"loss": 0.6141, "grad_norm": 0.7412247657775879, "learning_rate": 0.0002, "epoch": 3.196184625333441, "step": 19770}, {"loss": 0.6179, "grad_norm": 0.9598729014396667, "learning_rate": 0.0002, "epoch": 3.1978013095141864, "step": 19780}, {"loss": 0.5685, "grad_norm": 0.8331366777420044, "learning_rate": 0.0002, "epoch": 3.1994179936949316, "step": 19790}, {"loss": 0.6104, "grad_norm": 0.8939169645309448, "learning_rate": 0.0002, "epoch": 3.201034677875677, "step": 19800}, {"loss": 0.6147, "grad_norm": 0.9219734072685242, "learning_rate": 0.0002, "epoch": 3.202651362056422, "step": 19810}, {"loss": 0.6051, "grad_norm": 0.869490385055542, "learning_rate": 0.0002, "epoch": 3.2042680462371678, "step": 19820}, {"loss": 0.5946, "grad_norm": 0.8989706635475159, "learning_rate": 0.0002, "epoch": 3.205884730417913, "step": 19830}, {"loss": 0.5866, "grad_norm": 0.8477165102958679, "learning_rate": 0.0002, "epoch": 3.2075014145986582, "step": 19840}, {"loss": 0.6176, "grad_norm": 0.8720678687095642, "learning_rate": 0.0002, "epoch": 3.2091180987794035, "step": 19850}, {"loss": 0.5694, "grad_norm": 0.861406683921814, "learning_rate": 0.0002, "epoch": 3.2107347829601487, "step": 19860}, {"loss": 0.6264, "grad_norm": 0.8228686451911926, "learning_rate": 0.0002, "epoch": 3.212351467140894, "step": 19870}, {"loss": 0.625, "grad_norm": 0.7936596870422363, "learning_rate": 0.0002, "epoch": 3.213968151321639, "step": 19880}, {"loss": 0.5698, "grad_norm": 1.097377896308899, "learning_rate": 0.0002, "epoch": 3.2155848355023844, "step": 19890}, {"loss": 0.6725, "grad_norm": 0.9544782638549805, "learning_rate": 0.0002, "epoch": 3.21720151968313, "step": 19900}, {"loss": 0.6022, "grad_norm": 0.8240751624107361, "learning_rate": 0.0002, "epoch": 3.2188182038638753, "step": 19910}, {"loss": 0.5659, "grad_norm": 0.8332096338272095, "learning_rate": 0.0002, "epoch": 3.2204348880446205, "step": 19920}, {"loss": 0.6274, "grad_norm": 1.0954567193984985, "learning_rate": 0.0002, "epoch": 3.2220515722253658, "step": 19930}, {"loss": 0.652, "grad_norm": 0.7790525555610657, "learning_rate": 0.0002, "epoch": 3.223668256406111, "step": 19940}, {"loss": 0.5986, "grad_norm": 0.7966814041137695, "learning_rate": 0.0002, "epoch": 3.225284940586856, "step": 19950}, {"loss": 0.5911, "grad_norm": 0.9751881957054138, "learning_rate": 0.0002, "epoch": 3.2269016247676015, "step": 19960}, {"loss": 0.6071, "grad_norm": 0.9856047630310059, "learning_rate": 0.0002, "epoch": 3.228518308948347, "step": 19970}, {"loss": 0.5837, "grad_norm": 1.3062353134155273, "learning_rate": 0.0002, "epoch": 3.2301349931290924, "step": 19980}, {"loss": 0.6588, "grad_norm": 0.9510692358016968, "learning_rate": 0.0002, "epoch": 3.2317516773098376, "step": 19990}, {"loss": 0.6264, "grad_norm": 0.8630342483520508, "learning_rate": 0.0002, "epoch": 3.233368361490583, "step": 20000}, {"loss": 0.6073, "grad_norm": 0.8966519236564636, "learning_rate": 0.0002, "epoch": 3.234985045671328, "step": 20010}, {"loss": 0.612, "grad_norm": 0.7093510627746582, "learning_rate": 0.0002, "epoch": 3.2366017298520733, "step": 20020}, {"loss": 0.585, "grad_norm": 0.7771096229553223, "learning_rate": 0.0002, "epoch": 3.2382184140328185, "step": 20030}, {"loss": 0.5821, "grad_norm": 0.841058075428009, "learning_rate": 0.0002, "epoch": 3.2398350982135637, "step": 20040}, {"loss": 0.6519, "grad_norm": 0.909712553024292, "learning_rate": 0.0002, "epoch": 3.2414517823943094, "step": 20050}, {"loss": 0.6089, "grad_norm": 0.8321019411087036, "learning_rate": 0.0002, "epoch": 3.2430684665750547, "step": 20060}, {"loss": 0.6115, "grad_norm": 0.779901921749115, "learning_rate": 0.0002, "epoch": 3.2446851507558, "step": 20070}, {"loss": 0.6107, "grad_norm": 0.6249170303344727, "learning_rate": 0.0002, "epoch": 3.246301834936545, "step": 20080}, {"loss": 0.603, "grad_norm": 0.8000940680503845, "learning_rate": 0.0002, "epoch": 3.2479185191172903, "step": 20090}, {"loss": 0.6273, "grad_norm": 0.7627735137939453, "learning_rate": 0.0002, "epoch": 3.2495352032980356, "step": 20100}, {"loss": 0.6223, "grad_norm": 0.8780747056007385, "learning_rate": 0.0002, "epoch": 3.2511518874787813, "step": 20110}, {"loss": 0.5969, "grad_norm": 0.772037148475647, "learning_rate": 0.0002, "epoch": 3.2527685716595265, "step": 20120}, {"loss": 0.5843, "grad_norm": 1.0086580514907837, "learning_rate": 0.0002, "epoch": 3.2543852558402717, "step": 20130}, {"loss": 0.5777, "grad_norm": 0.9360289573669434, "learning_rate": 0.0002, "epoch": 3.256001940021017, "step": 20140}, {"loss": 0.5777, "grad_norm": 1.2099586725234985, "learning_rate": 0.0002, "epoch": 3.257618624201762, "step": 20150}, {"loss": 0.624, "grad_norm": 0.8368481397628784, "learning_rate": 0.0002, "epoch": 3.2592353083825074, "step": 20160}, {"loss": 0.5626, "grad_norm": 0.7391039133071899, "learning_rate": 0.0002, "epoch": 3.2608519925632526, "step": 20170}, {"loss": 0.6041, "grad_norm": 0.9122273325920105, "learning_rate": 0.0002, "epoch": 3.262468676743998, "step": 20180}, {"loss": 0.5868, "grad_norm": 0.8502281904220581, "learning_rate": 0.0002, "epoch": 3.264085360924743, "step": 20190}, {"loss": 0.5841, "grad_norm": 1.0926852226257324, "learning_rate": 0.0002, "epoch": 3.265702045105489, "step": 20200}, {"loss": 0.6027, "grad_norm": 0.7902828454971313, "learning_rate": 0.0002, "epoch": 3.267318729286234, "step": 20210}, {"loss": 0.6089, "grad_norm": 0.8724729418754578, "learning_rate": 0.0002, "epoch": 3.2689354134669792, "step": 20220}, {"loss": 0.6242, "grad_norm": 0.8469277024269104, "learning_rate": 0.0002, "epoch": 3.2705520976477245, "step": 20230}, {"loss": 0.644, "grad_norm": 0.8865092992782593, "learning_rate": 0.0002, "epoch": 3.2721687818284697, "step": 20240}, {"loss": 0.6464, "grad_norm": 1.0979334115982056, "learning_rate": 0.0002, "epoch": 3.273785466009215, "step": 20250}, {"loss": 0.647, "grad_norm": 1.0860793590545654, "learning_rate": 0.0002, "epoch": 3.2754021501899606, "step": 20260}, {"loss": 0.6105, "grad_norm": 0.981745183467865, "learning_rate": 0.0002, "epoch": 3.277018834370706, "step": 20270}, {"loss": 0.627, "grad_norm": 0.9155020713806152, "learning_rate": 0.0002, "epoch": 3.278635518551451, "step": 20280}, {"loss": 0.5899, "grad_norm": 0.8436718583106995, "learning_rate": 0.0002, "epoch": 3.2802522027321963, "step": 20290}, {"loss": 0.6371, "grad_norm": 1.0329409837722778, "learning_rate": 0.0002, "epoch": 3.2818688869129415, "step": 20300}, {"loss": 0.6, "grad_norm": 0.9876394271850586, "learning_rate": 0.0002, "epoch": 3.2834855710936868, "step": 20310}, {"loss": 0.5463, "grad_norm": 0.8052917718887329, "learning_rate": 0.0002, "epoch": 3.285102255274432, "step": 20320}, {"loss": 0.5949, "grad_norm": 0.8390680551528931, "learning_rate": 0.0002, "epoch": 3.2867189394551772, "step": 20330}, {"loss": 0.6492, "grad_norm": 0.9515735507011414, "learning_rate": 0.0002, "epoch": 3.288335623635923, "step": 20340}, {"loss": 0.596, "grad_norm": 0.8028870224952698, "learning_rate": 0.0002, "epoch": 3.289952307816668, "step": 20350}, {"loss": 0.634, "grad_norm": 0.862592339515686, "learning_rate": 0.0002, "epoch": 3.2915689919974134, "step": 20360}, {"loss": 0.6345, "grad_norm": 0.7451621890068054, "learning_rate": 0.0002, "epoch": 3.2931856761781586, "step": 20370}, {"loss": 0.6458, "grad_norm": 0.8966776728630066, "learning_rate": 0.0002, "epoch": 3.294802360358904, "step": 20380}, {"loss": 0.5967, "grad_norm": 0.9289216995239258, "learning_rate": 0.0002, "epoch": 3.296419044539649, "step": 20390}, {"loss": 0.6599, "grad_norm": 0.9649626612663269, "learning_rate": 0.0002, "epoch": 3.2980357287203943, "step": 20400}, {"loss": 0.5781, "grad_norm": 1.1953798532485962, "learning_rate": 0.0002, "epoch": 3.29965241290114, "step": 20410}, {"loss": 0.5997, "grad_norm": 0.8929083943367004, "learning_rate": 0.0002, "epoch": 3.301269097081885, "step": 20420}, {"loss": 0.597, "grad_norm": 0.8922014236450195, "learning_rate": 0.0002, "epoch": 3.3028857812626304, "step": 20430}, {"loss": 0.5766, "grad_norm": 0.9754860401153564, "learning_rate": 0.0002, "epoch": 3.3045024654433757, "step": 20440}, {"loss": 0.5653, "grad_norm": 0.8873140215873718, "learning_rate": 0.0002, "epoch": 3.306119149624121, "step": 20450}, {"loss": 0.6138, "grad_norm": 0.857271671295166, "learning_rate": 0.0002, "epoch": 3.307735833804866, "step": 20460}, {"loss": 0.633, "grad_norm": 0.9022141098976135, "learning_rate": 0.0002, "epoch": 3.3093525179856114, "step": 20470}, {"loss": 0.6654, "grad_norm": 0.8614798188209534, "learning_rate": 0.0002, "epoch": 3.3109692021663566, "step": 20480}, {"loss": 0.6254, "grad_norm": 0.8838164210319519, "learning_rate": 0.0002, "epoch": 3.3125858863471023, "step": 20490}, {"loss": 0.5849, "grad_norm": 0.8709736466407776, "learning_rate": 0.0002, "epoch": 3.3142025705278475, "step": 20500}, {"loss": 0.6146, "grad_norm": 0.9533300995826721, "learning_rate": 0.0002, "epoch": 3.3158192547085927, "step": 20510}, {"loss": 0.6029, "grad_norm": 0.8259269595146179, "learning_rate": 0.0002, "epoch": 3.317435938889338, "step": 20520}, {"loss": 0.6268, "grad_norm": 0.8607608079910278, "learning_rate": 0.0002, "epoch": 3.319052623070083, "step": 20530}, {"loss": 0.5676, "grad_norm": 1.0863020420074463, "learning_rate": 0.0002, "epoch": 3.3206693072508284, "step": 20540}, {"loss": 0.6412, "grad_norm": 1.011489987373352, "learning_rate": 0.0002, "epoch": 3.3222859914315737, "step": 20550}, {"loss": 0.6247, "grad_norm": 0.6952177882194519, "learning_rate": 0.0002, "epoch": 3.3239026756123193, "step": 20560}, {"loss": 0.6229, "grad_norm": 0.9638974070549011, "learning_rate": 0.0002, "epoch": 3.3255193597930646, "step": 20570}, {"loss": 0.5882, "grad_norm": 1.0310138463974, "learning_rate": 0.0002, "epoch": 3.32713604397381, "step": 20580}, {"loss": 0.594, "grad_norm": 0.9371318221092224, "learning_rate": 0.0002, "epoch": 3.328752728154555, "step": 20590}, {"loss": 0.6137, "grad_norm": 0.8756691813468933, "learning_rate": 0.0002, "epoch": 3.3303694123353003, "step": 20600}, {"loss": 0.5994, "grad_norm": 1.054175853729248, "learning_rate": 0.0002, "epoch": 3.3319860965160455, "step": 20610}, {"loss": 0.6169, "grad_norm": 0.9074128270149231, "learning_rate": 0.0002, "epoch": 3.3336027806967907, "step": 20620}, {"loss": 0.6138, "grad_norm": 0.906900942325592, "learning_rate": 0.0002, "epoch": 3.335219464877536, "step": 20630}, {"loss": 0.571, "grad_norm": 0.8689333200454712, "learning_rate": 0.0002, "epoch": 3.3368361490582816, "step": 20640}, {"loss": 0.6079, "grad_norm": 0.9889747500419617, "learning_rate": 0.0002, "epoch": 3.338452833239027, "step": 20650}, {"loss": 0.6073, "grad_norm": 1.0685805082321167, "learning_rate": 0.0002, "epoch": 3.340069517419772, "step": 20660}, {"loss": 0.6091, "grad_norm": 0.7495010495185852, "learning_rate": 0.0002, "epoch": 3.3416862016005173, "step": 20670}, {"loss": 0.5883, "grad_norm": 0.8747848272323608, "learning_rate": 0.0002, "epoch": 3.3433028857812626, "step": 20680}, {"loss": 0.604, "grad_norm": 0.9762673377990723, "learning_rate": 0.0002, "epoch": 3.344919569962008, "step": 20690}, {"loss": 0.6784, "grad_norm": 1.0284489393234253, "learning_rate": 0.0002, "epoch": 3.346536254142753, "step": 20700}, {"loss": 0.6464, "grad_norm": 0.7293812036514282, "learning_rate": 0.0002, "epoch": 3.3481529383234987, "step": 20710}, {"loss": 0.609, "grad_norm": 0.8330199122428894, "learning_rate": 0.0002, "epoch": 3.349769622504244, "step": 20720}, {"loss": 0.5729, "grad_norm": 0.9808499217033386, "learning_rate": 0.0002, "epoch": 3.351386306684989, "step": 20730}, {"loss": 0.6315, "grad_norm": 0.9508825540542603, "learning_rate": 0.0002, "epoch": 3.3530029908657344, "step": 20740}, {"loss": 0.5965, "grad_norm": 0.790483832359314, "learning_rate": 0.0002, "epoch": 3.3546196750464796, "step": 20750}, {"loss": 0.6327, "grad_norm": 1.022793173789978, "learning_rate": 0.0002, "epoch": 3.356236359227225, "step": 20760}, {"loss": 0.6439, "grad_norm": 0.8318950533866882, "learning_rate": 0.0002, "epoch": 3.35785304340797, "step": 20770}, {"loss": 0.6037, "grad_norm": 0.7980858087539673, "learning_rate": 0.0002, "epoch": 3.3594697275887153, "step": 20780}, {"loss": 0.6746, "grad_norm": 0.8114802241325378, "learning_rate": 0.0002, "epoch": 3.361086411769461, "step": 20790}, {"loss": 0.6017, "grad_norm": 0.8522519469261169, "learning_rate": 0.0002, "epoch": 3.3627030959502062, "step": 20800}, {"loss": 0.5864, "grad_norm": 0.9142431616783142, "learning_rate": 0.0002, "epoch": 3.3643197801309515, "step": 20810}, {"loss": 0.6331, "grad_norm": 0.771170437335968, "learning_rate": 0.0002, "epoch": 3.3659364643116967, "step": 20820}, {"loss": 0.5879, "grad_norm": 1.0628231763839722, "learning_rate": 0.0002, "epoch": 3.367553148492442, "step": 20830}, {"loss": 0.6533, "grad_norm": 0.9384352564811707, "learning_rate": 0.0002, "epoch": 3.369169832673187, "step": 20840}, {"loss": 0.6292, "grad_norm": 1.1286591291427612, "learning_rate": 0.0002, "epoch": 3.370786516853933, "step": 20850}, {"loss": 0.5986, "grad_norm": 1.1349513530731201, "learning_rate": 0.0002, "epoch": 3.372403201034678, "step": 20860}, {"loss": 0.6413, "grad_norm": 1.0127464532852173, "learning_rate": 0.0002, "epoch": 3.3740198852154233, "step": 20870}, {"loss": 0.6414, "grad_norm": 0.9111971855163574, "learning_rate": 0.0002, "epoch": 3.3756365693961685, "step": 20880}, {"loss": 0.6101, "grad_norm": 0.871356725692749, "learning_rate": 0.0002, "epoch": 3.3772532535769137, "step": 20890}, {"loss": 0.5995, "grad_norm": 0.7774117588996887, "learning_rate": 0.0002, "epoch": 3.378869937757659, "step": 20900}, {"loss": 0.6062, "grad_norm": 1.0089964866638184, "learning_rate": 0.0002, "epoch": 3.380486621938404, "step": 20910}, {"loss": 0.5908, "grad_norm": 0.7855867147445679, "learning_rate": 0.0002, "epoch": 3.3821033061191494, "step": 20920}, {"loss": 0.6373, "grad_norm": 1.3713710308074951, "learning_rate": 0.0002, "epoch": 3.3837199902998947, "step": 20930}, {"loss": 0.6627, "grad_norm": 0.8599116206169128, "learning_rate": 0.0002, "epoch": 3.3853366744806404, "step": 20940}, {"loss": 0.6224, "grad_norm": 0.9392673373222351, "learning_rate": 0.0002, "epoch": 3.3869533586613856, "step": 20950}, {"loss": 0.5855, "grad_norm": 0.8764075040817261, "learning_rate": 0.0002, "epoch": 3.388570042842131, "step": 20960}, {"loss": 0.5734, "grad_norm": 0.8240136504173279, "learning_rate": 0.0002, "epoch": 3.390186727022876, "step": 20970}, {"loss": 0.5783, "grad_norm": 1.0982369184494019, "learning_rate": 0.0002, "epoch": 3.3918034112036213, "step": 20980}, {"loss": 0.5451, "grad_norm": 1.0599013566970825, "learning_rate": 0.0002, "epoch": 3.3934200953843665, "step": 20990}, {"loss": 0.6356, "grad_norm": 0.895438015460968, "learning_rate": 0.0002, "epoch": 3.395036779565112, "step": 21000}, {"loss": 0.6065, "grad_norm": 0.6974841356277466, "learning_rate": 0.0002, "epoch": 3.3966534637458574, "step": 21010}, {"loss": 0.5704, "grad_norm": 0.9571719765663147, "learning_rate": 0.0002, "epoch": 3.3982701479266026, "step": 21020}, {"loss": 0.679, "grad_norm": 0.831912636756897, "learning_rate": 0.0002, "epoch": 3.399886832107348, "step": 21030}, {"loss": 0.6051, "grad_norm": 0.831936240196228, "learning_rate": 0.0002, "epoch": 3.401503516288093, "step": 21040}, {"loss": 0.5857, "grad_norm": 0.7388373613357544, "learning_rate": 0.0002, "epoch": 3.4031202004688383, "step": 21050}, {"loss": 0.6245, "grad_norm": 0.938667356967926, "learning_rate": 0.0002, "epoch": 3.4047368846495836, "step": 21060}, {"loss": 0.6121, "grad_norm": 0.9202313423156738, "learning_rate": 0.0002, "epoch": 3.406353568830329, "step": 21070}, {"loss": 0.6388, "grad_norm": 0.9888381958007812, "learning_rate": 0.0002, "epoch": 3.4079702530110745, "step": 21080}, {"loss": 0.6245, "grad_norm": 0.8526970744132996, "learning_rate": 0.0002, "epoch": 3.4095869371918197, "step": 21090}, {"loss": 0.5914, "grad_norm": 0.7939383387565613, "learning_rate": 0.0002, "epoch": 3.411203621372565, "step": 21100}, {"loss": 0.6066, "grad_norm": 0.9986352920532227, "learning_rate": 0.0002, "epoch": 3.41282030555331, "step": 21110}, {"loss": 0.5947, "grad_norm": 0.8895300030708313, "learning_rate": 0.0002, "epoch": 3.4144369897340554, "step": 21120}, {"loss": 0.6264, "grad_norm": 0.9559482932090759, "learning_rate": 0.0002, "epoch": 3.4160536739148006, "step": 21130}, {"loss": 0.6491, "grad_norm": 0.8351506590843201, "learning_rate": 0.0002, "epoch": 3.417670358095546, "step": 21140}, {"loss": 0.567, "grad_norm": 0.8224456906318665, "learning_rate": 0.0002, "epoch": 3.4192870422762915, "step": 21150}, {"loss": 0.5871, "grad_norm": 1.0110299587249756, "learning_rate": 0.0002, "epoch": 3.4209037264570368, "step": 21160}, {"loss": 0.6116, "grad_norm": 0.82564777135849, "learning_rate": 0.0002, "epoch": 3.422520410637782, "step": 21170}, {"loss": 0.595, "grad_norm": 1.004738688468933, "learning_rate": 0.0002, "epoch": 3.4241370948185272, "step": 21180}, {"loss": 0.6286, "grad_norm": 0.7545676827430725, "learning_rate": 0.0002, "epoch": 3.4257537789992725, "step": 21190}, {"loss": 0.5868, "grad_norm": 0.8918704390525818, "learning_rate": 0.0002, "epoch": 3.4273704631800177, "step": 21200}, {"loss": 0.6542, "grad_norm": 0.8336876034736633, "learning_rate": 0.0002, "epoch": 3.428987147360763, "step": 21210}, {"loss": 0.5824, "grad_norm": 0.8928771018981934, "learning_rate": 0.0002, "epoch": 3.430603831541508, "step": 21220}, {"loss": 0.6468, "grad_norm": 0.7663705945014954, "learning_rate": 0.0002, "epoch": 3.432220515722254, "step": 21230}, {"loss": 0.6693, "grad_norm": 0.8392598628997803, "learning_rate": 0.0002, "epoch": 3.433837199902999, "step": 21240}, {"loss": 0.5971, "grad_norm": 0.8819600343704224, "learning_rate": 0.0002, "epoch": 3.4354538840837443, "step": 21250}, {"loss": 0.6791, "grad_norm": 0.9124642014503479, "learning_rate": 0.0002, "epoch": 3.4370705682644895, "step": 21260}, {"loss": 0.5925, "grad_norm": 0.8329763412475586, "learning_rate": 0.0002, "epoch": 3.4386872524452348, "step": 21270}, {"loss": 0.6541, "grad_norm": 0.9982839822769165, "learning_rate": 0.0002, "epoch": 3.44030393662598, "step": 21280}, {"loss": 0.6441, "grad_norm": 0.9105954766273499, "learning_rate": 0.0002, "epoch": 3.4419206208067252, "step": 21290}, {"loss": 0.6028, "grad_norm": 0.8182359337806702, "learning_rate": 0.0002, "epoch": 3.443537304987471, "step": 21300}, {"loss": 0.5991, "grad_norm": 1.0568904876708984, "learning_rate": 0.0002, "epoch": 3.445153989168216, "step": 21310}, {"loss": 0.6117, "grad_norm": 0.968539834022522, "learning_rate": 0.0002, "epoch": 3.4467706733489614, "step": 21320}, {"loss": 0.6219, "grad_norm": 0.8774511218070984, "learning_rate": 0.0002, "epoch": 3.4483873575297066, "step": 21330}, {"loss": 0.6438, "grad_norm": 0.7598156332969666, "learning_rate": 0.0002, "epoch": 3.450004041710452, "step": 21340}, {"loss": 0.6033, "grad_norm": 1.1012897491455078, "learning_rate": 0.0002, "epoch": 3.451620725891197, "step": 21350}, {"loss": 0.6137, "grad_norm": 0.8040637373924255, "learning_rate": 0.0002, "epoch": 3.4532374100719423, "step": 21360}, {"loss": 0.6173, "grad_norm": 0.8497496247291565, "learning_rate": 0.0002, "epoch": 3.4548540942526875, "step": 21370}, {"loss": 0.6005, "grad_norm": 0.8429915904998779, "learning_rate": 0.0002, "epoch": 3.456470778433433, "step": 21380}, {"loss": 0.6182, "grad_norm": 0.8107112646102905, "learning_rate": 0.0002, "epoch": 3.4580874626141784, "step": 21390}, {"loss": 0.6109, "grad_norm": 1.00872004032135, "learning_rate": 0.0002, "epoch": 3.4597041467949237, "step": 21400}, {"loss": 0.5712, "grad_norm": 0.8266542553901672, "learning_rate": 0.0002, "epoch": 3.461320830975669, "step": 21410}, {"loss": 0.6457, "grad_norm": 0.8972568511962891, "learning_rate": 0.0002, "epoch": 3.462937515156414, "step": 21420}, {"loss": 0.6081, "grad_norm": 1.0781476497650146, "learning_rate": 0.0002, "epoch": 3.4645541993371594, "step": 21430}, {"loss": 0.6303, "grad_norm": 0.9571592807769775, "learning_rate": 0.0002, "epoch": 3.4661708835179046, "step": 21440}, {"loss": 0.6309, "grad_norm": 0.881547212600708, "learning_rate": 0.0002, "epoch": 3.4677875676986503, "step": 21450}, {"loss": 0.6076, "grad_norm": 0.6955338716506958, "learning_rate": 0.0002, "epoch": 3.4694042518793955, "step": 21460}, {"loss": 0.6205, "grad_norm": 0.901187539100647, "learning_rate": 0.0002, "epoch": 3.4710209360601407, "step": 21470}, {"loss": 0.639, "grad_norm": 0.7063511610031128, "learning_rate": 0.0002, "epoch": 3.472637620240886, "step": 21480}, {"loss": 0.6154, "grad_norm": 0.8462792038917542, "learning_rate": 0.0002, "epoch": 3.474254304421631, "step": 21490}, {"loss": 0.61, "grad_norm": 1.1861060857772827, "learning_rate": 0.0002, "epoch": 3.4758709886023764, "step": 21500}, {"loss": 0.6586, "grad_norm": 0.70503169298172, "learning_rate": 0.0002, "epoch": 3.4774876727831217, "step": 21510}, {"loss": 0.6475, "grad_norm": 0.9650066494941711, "learning_rate": 0.0002, "epoch": 3.479104356963867, "step": 21520}, {"loss": 0.6452, "grad_norm": 1.0266852378845215, "learning_rate": 0.0002, "epoch": 3.4807210411446126, "step": 21530}, {"loss": 0.6553, "grad_norm": 0.956372857093811, "learning_rate": 0.0002, "epoch": 3.482337725325358, "step": 21540}, {"loss": 0.6667, "grad_norm": 0.8848432898521423, "learning_rate": 0.0002, "epoch": 3.483954409506103, "step": 21550}, {"loss": 0.6375, "grad_norm": 1.0805351734161377, "learning_rate": 0.0002, "epoch": 3.4855710936868483, "step": 21560}, {"loss": 0.6958, "grad_norm": 0.9279725551605225, "learning_rate": 0.0002, "epoch": 3.4871877778675935, "step": 21570}, {"loss": 0.6354, "grad_norm": 0.9049562215805054, "learning_rate": 0.0002, "epoch": 3.4888044620483387, "step": 21580}, {"loss": 0.6071, "grad_norm": 0.9619429111480713, "learning_rate": 0.0002, "epoch": 3.4904211462290844, "step": 21590}, {"loss": 0.5927, "grad_norm": 0.8508906960487366, "learning_rate": 0.0002, "epoch": 3.4920378304098296, "step": 21600}, {"loss": 0.6115, "grad_norm": 0.8692502379417419, "learning_rate": 0.0002, "epoch": 3.493654514590575, "step": 21610}, {"loss": 0.5878, "grad_norm": 0.8187332153320312, "learning_rate": 0.0002, "epoch": 3.49527119877132, "step": 21620}, {"loss": 0.5874, "grad_norm": 1.145400047302246, "learning_rate": 0.0002, "epoch": 3.4968878829520653, "step": 21630}, {"loss": 0.6313, "grad_norm": 0.8281388282775879, "learning_rate": 0.0002, "epoch": 3.4985045671328105, "step": 21640}, {"loss": 0.6624, "grad_norm": 0.82256019115448, "learning_rate": 0.0002, "epoch": 3.500121251313556, "step": 21650}, {"loss": 0.6346, "grad_norm": 0.9315484762191772, "learning_rate": 0.0002, "epoch": 3.501737935494301, "step": 21660}, {"loss": 0.6086, "grad_norm": 0.7626111507415771, "learning_rate": 0.0002, "epoch": 3.5033546196750462, "step": 21670}, {"loss": 0.6177, "grad_norm": 0.9275059103965759, "learning_rate": 0.0002, "epoch": 3.504971303855792, "step": 21680}, {"loss": 0.64, "grad_norm": 0.7906724810600281, "learning_rate": 0.0002, "epoch": 3.506587988036537, "step": 21690}, {"loss": 0.6015, "grad_norm": 0.8289761543273926, "learning_rate": 0.0002, "epoch": 3.5082046722172824, "step": 21700}, {"loss": 0.6246, "grad_norm": 0.8316431045532227, "learning_rate": 0.0002, "epoch": 3.5098213563980276, "step": 21710}, {"loss": 0.619, "grad_norm": 1.0451812744140625, "learning_rate": 0.0002, "epoch": 3.511438040578773, "step": 21720}, {"loss": 0.632, "grad_norm": 0.928252637386322, "learning_rate": 0.0002, "epoch": 3.513054724759518, "step": 21730}, {"loss": 0.6062, "grad_norm": 0.7985895276069641, "learning_rate": 0.0002, "epoch": 3.5146714089402638, "step": 21740}, {"loss": 0.6463, "grad_norm": 0.6740974187850952, "learning_rate": 0.0002, "epoch": 3.516288093121009, "step": 21750}, {"loss": 0.6138, "grad_norm": 0.8482223749160767, "learning_rate": 0.0002, "epoch": 3.517904777301754, "step": 21760}, {"loss": 0.6277, "grad_norm": 0.889947772026062, "learning_rate": 0.0002, "epoch": 3.5195214614824994, "step": 21770}, {"loss": 0.6174, "grad_norm": 0.8304598927497864, "learning_rate": 0.0002, "epoch": 3.5211381456632447, "step": 21780}, {"loss": 0.6156, "grad_norm": 0.8002981543540955, "learning_rate": 0.0002, "epoch": 3.52275482984399, "step": 21790}, {"loss": 0.5896, "grad_norm": 0.8115083575248718, "learning_rate": 0.0002, "epoch": 3.524371514024735, "step": 21800}, {"loss": 0.6041, "grad_norm": 0.9715048670768738, "learning_rate": 0.0002, "epoch": 3.5259881982054804, "step": 21810}, {"loss": 0.6715, "grad_norm": 1.0910786390304565, "learning_rate": 0.0002, "epoch": 3.5276048823862256, "step": 21820}, {"loss": 0.6543, "grad_norm": 0.8438942432403564, "learning_rate": 0.0002, "epoch": 3.5292215665669713, "step": 21830}, {"loss": 0.6509, "grad_norm": 0.8813382983207703, "learning_rate": 0.0002, "epoch": 3.5308382507477165, "step": 21840}, {"loss": 0.6049, "grad_norm": 0.7092908024787903, "learning_rate": 0.0002, "epoch": 3.5324549349284617, "step": 21850}, {"loss": 0.5678, "grad_norm": 0.8332187533378601, "learning_rate": 0.0002, "epoch": 3.534071619109207, "step": 21860}, {"loss": 0.5896, "grad_norm": 0.8958209156990051, "learning_rate": 0.0002, "epoch": 3.535688303289952, "step": 21870}, {"loss": 0.6476, "grad_norm": 0.824138879776001, "learning_rate": 0.0002, "epoch": 3.5373049874706974, "step": 21880}, {"loss": 0.6022, "grad_norm": 0.8375158309936523, "learning_rate": 0.0002, "epoch": 3.538921671651443, "step": 21890}, {"loss": 0.6019, "grad_norm": 1.0274608135223389, "learning_rate": 0.0002, "epoch": 3.5405383558321883, "step": 21900}, {"loss": 0.6194, "grad_norm": 0.7088932394981384, "learning_rate": 0.0002, "epoch": 3.5421550400129336, "step": 21910}, {"loss": 0.6554, "grad_norm": 0.8172445297241211, "learning_rate": 0.0002, "epoch": 3.543771724193679, "step": 21920}, {"loss": 0.6711, "grad_norm": 0.9904135465621948, "learning_rate": 0.0002, "epoch": 3.545388408374424, "step": 21930}, {"loss": 0.6001, "grad_norm": 0.9900432229042053, "learning_rate": 0.0002, "epoch": 3.5470050925551693, "step": 21940}, {"loss": 0.6195, "grad_norm": 0.8963301181793213, "learning_rate": 0.0002, "epoch": 3.5486217767359145, "step": 21950}, {"loss": 0.5972, "grad_norm": 0.8551464676856995, "learning_rate": 0.0002, "epoch": 3.5502384609166597, "step": 21960}, {"loss": 0.6206, "grad_norm": 1.0916603803634644, "learning_rate": 0.0002, "epoch": 3.551855145097405, "step": 21970}, {"loss": 0.6523, "grad_norm": 0.841598391532898, "learning_rate": 0.0002, "epoch": 3.5534718292781506, "step": 21980}, {"loss": 0.617, "grad_norm": 0.8566757440567017, "learning_rate": 0.0002, "epoch": 3.555088513458896, "step": 21990}, {"loss": 0.6192, "grad_norm": 1.0145052671432495, "learning_rate": 0.0002, "epoch": 3.556705197639641, "step": 22000}, {"loss": 0.6173, "grad_norm": 0.9293754696846008, "learning_rate": 0.0002, "epoch": 3.5583218818203863, "step": 22010}, {"loss": 0.612, "grad_norm": 0.9568536281585693, "learning_rate": 0.0002, "epoch": 3.5599385660011316, "step": 22020}, {"loss": 0.641, "grad_norm": 0.8613139986991882, "learning_rate": 0.0002, "epoch": 3.5615552501818772, "step": 22030}, {"loss": 0.6496, "grad_norm": 0.8179237246513367, "learning_rate": 0.0002, "epoch": 3.5631719343626225, "step": 22040}, {"loss": 0.574, "grad_norm": 0.9059830904006958, "learning_rate": 0.0002, "epoch": 3.5647886185433677, "step": 22050}, {"loss": 0.6448, "grad_norm": 1.0068252086639404, "learning_rate": 0.0002, "epoch": 3.566405302724113, "step": 22060}, {"loss": 0.6239, "grad_norm": 0.9682072997093201, "learning_rate": 0.0002, "epoch": 3.568021986904858, "step": 22070}, {"loss": 0.6808, "grad_norm": 0.8514005541801453, "learning_rate": 0.0002, "epoch": 3.5696386710856034, "step": 22080}, {"loss": 0.5956, "grad_norm": 0.8327770829200745, "learning_rate": 0.0002, "epoch": 3.5712553552663486, "step": 22090}, {"loss": 0.5976, "grad_norm": 1.024976372718811, "learning_rate": 0.0002, "epoch": 3.572872039447094, "step": 22100}, {"loss": 0.624, "grad_norm": 0.7721174955368042, "learning_rate": 0.0002, "epoch": 3.574488723627839, "step": 22110}, {"loss": 0.5896, "grad_norm": 1.0351054668426514, "learning_rate": 0.0002, "epoch": 3.5761054078085843, "step": 22120}, {"loss": 0.6379, "grad_norm": 0.9680907130241394, "learning_rate": 0.0002, "epoch": 3.57772209198933, "step": 22130}, {"loss": 0.6194, "grad_norm": 0.8016974925994873, "learning_rate": 0.0002, "epoch": 3.5793387761700752, "step": 22140}, {"loss": 0.6387, "grad_norm": 1.0109003782272339, "learning_rate": 0.0002, "epoch": 3.5809554603508205, "step": 22150}, {"loss": 0.6368, "grad_norm": 1.0473392009735107, "learning_rate": 0.0002, "epoch": 3.5825721445315657, "step": 22160}, {"loss": 0.6353, "grad_norm": 0.8686613440513611, "learning_rate": 0.0002, "epoch": 3.584188828712311, "step": 22170}, {"loss": 0.5791, "grad_norm": 0.869149923324585, "learning_rate": 0.0002, "epoch": 3.5858055128930566, "step": 22180}, {"loss": 0.5895, "grad_norm": 0.9769062995910645, "learning_rate": 0.0002, "epoch": 3.587422197073802, "step": 22190}, {"loss": 0.5939, "grad_norm": 0.779636561870575, "learning_rate": 0.0002, "epoch": 3.589038881254547, "step": 22200}, {"loss": 0.5875, "grad_norm": 0.9063841104507446, "learning_rate": 0.0002, "epoch": 3.5906555654352923, "step": 22210}, {"loss": 0.5671, "grad_norm": 0.9216037392616272, "learning_rate": 0.0002, "epoch": 3.5922722496160375, "step": 22220}, {"loss": 0.6484, "grad_norm": 1.0217336416244507, "learning_rate": 0.0002, "epoch": 3.5938889337967828, "step": 22230}, {"loss": 0.6511, "grad_norm": 0.8513161540031433, "learning_rate": 0.0002, "epoch": 3.595505617977528, "step": 22240}, {"loss": 0.6301, "grad_norm": 0.8084813952445984, "learning_rate": 0.0002, "epoch": 3.597122302158273, "step": 22250}, {"loss": 0.6197, "grad_norm": 0.8524802923202515, "learning_rate": 0.0002, "epoch": 3.5987389863390185, "step": 22260}, {"loss": 0.5599, "grad_norm": 0.9356237649917603, "learning_rate": 0.0002, "epoch": 3.600355670519764, "step": 22270}, {"loss": 0.628, "grad_norm": 1.009600281715393, "learning_rate": 0.0002, "epoch": 3.6019723547005094, "step": 22280}, {"loss": 0.6179, "grad_norm": 0.9900581240653992, "learning_rate": 0.0002, "epoch": 3.6035890388812546, "step": 22290}, {"loss": 0.5725, "grad_norm": 1.062495231628418, "learning_rate": 0.0002, "epoch": 3.605205723062, "step": 22300}, {"loss": 0.607, "grad_norm": 0.8832381367683411, "learning_rate": 0.0002, "epoch": 3.606822407242745, "step": 22310}, {"loss": 0.6215, "grad_norm": 0.9284297823905945, "learning_rate": 0.0002, "epoch": 3.6084390914234903, "step": 22320}, {"loss": 0.685, "grad_norm": 1.2381829023361206, "learning_rate": 0.0002, "epoch": 3.610055775604236, "step": 22330}, {"loss": 0.6181, "grad_norm": 0.929434597492218, "learning_rate": 0.0002, "epoch": 3.611672459784981, "step": 22340}, {"loss": 0.6141, "grad_norm": 0.9714490175247192, "learning_rate": 0.0002, "epoch": 3.6132891439657264, "step": 22350}, {"loss": 0.6861, "grad_norm": 0.808014988899231, "learning_rate": 0.0002, "epoch": 3.6149058281464717, "step": 22360}, {"loss": 0.6428, "grad_norm": 1.0364398956298828, "learning_rate": 0.0002, "epoch": 3.616522512327217, "step": 22370}, {"loss": 0.6337, "grad_norm": 0.7858489751815796, "learning_rate": 0.0002, "epoch": 3.618139196507962, "step": 22380}, {"loss": 0.6214, "grad_norm": 0.9920870065689087, "learning_rate": 0.0002, "epoch": 3.6197558806887074, "step": 22390}, {"loss": 0.6659, "grad_norm": 0.9183220863342285, "learning_rate": 0.0002, "epoch": 3.6213725648694526, "step": 22400}, {"loss": 0.6036, "grad_norm": 0.9826246500015259, "learning_rate": 0.0002, "epoch": 3.622989249050198, "step": 22410}, {"loss": 0.6441, "grad_norm": 0.8632931113243103, "learning_rate": 0.0002, "epoch": 3.6246059332309435, "step": 22420}, {"loss": 0.6124, "grad_norm": 0.8468965291976929, "learning_rate": 0.0002, "epoch": 3.6262226174116887, "step": 22430}, {"loss": 0.6328, "grad_norm": 0.8466871976852417, "learning_rate": 0.0002, "epoch": 3.627839301592434, "step": 22440}, {"loss": 0.5941, "grad_norm": 0.9501169919967651, "learning_rate": 0.0002, "epoch": 3.629455985773179, "step": 22450}, {"loss": 0.6069, "grad_norm": 0.8906720876693726, "learning_rate": 0.0002, "epoch": 3.6310726699539244, "step": 22460}, {"loss": 0.6928, "grad_norm": 0.7400227189064026, "learning_rate": 0.0002, "epoch": 3.6326893541346696, "step": 22470}, {"loss": 0.6337, "grad_norm": 0.9756355881690979, "learning_rate": 0.0002, "epoch": 3.6343060383154153, "step": 22480}, {"loss": 0.6203, "grad_norm": 0.7504993081092834, "learning_rate": 0.0002, "epoch": 3.6359227224961606, "step": 22490}, {"loss": 0.6302, "grad_norm": 0.9270039200782776, "learning_rate": 0.0002, "epoch": 3.637539406676906, "step": 22500}, {"loss": 0.6026, "grad_norm": 0.8841686844825745, "learning_rate": 0.0002, "epoch": 3.639156090857651, "step": 22510}, {"loss": 0.6098, "grad_norm": 0.8533213138580322, "learning_rate": 0.0002, "epoch": 3.6407727750383962, "step": 22520}, {"loss": 0.6412, "grad_norm": 1.0052043199539185, "learning_rate": 0.0002, "epoch": 3.6423894592191415, "step": 22530}, {"loss": 0.6363, "grad_norm": 1.0323461294174194, "learning_rate": 0.0002, "epoch": 3.6440061433998867, "step": 22540}, {"loss": 0.6545, "grad_norm": 0.8654312491416931, "learning_rate": 0.0002, "epoch": 3.645622827580632, "step": 22550}, {"loss": 0.6155, "grad_norm": 0.6400038003921509, "learning_rate": 0.0002, "epoch": 3.647239511761377, "step": 22560}, {"loss": 0.5829, "grad_norm": 0.8061298727989197, "learning_rate": 0.0002, "epoch": 3.648856195942123, "step": 22570}, {"loss": 0.6388, "grad_norm": 0.9257854223251343, "learning_rate": 0.0002, "epoch": 3.650472880122868, "step": 22580}, {"loss": 0.6409, "grad_norm": 0.8439396619796753, "learning_rate": 0.0002, "epoch": 3.6520895643036133, "step": 22590}, {"loss": 0.5996, "grad_norm": 0.7764544486999512, "learning_rate": 0.0002, "epoch": 3.6537062484843585, "step": 22600}, {"loss": 0.6434, "grad_norm": 1.125451683998108, "learning_rate": 0.0002, "epoch": 3.6553229326651038, "step": 22610}, {"loss": 0.6579, "grad_norm": 0.7523018717765808, "learning_rate": 0.0002, "epoch": 3.656939616845849, "step": 22620}, {"loss": 0.6476, "grad_norm": 1.071026086807251, "learning_rate": 0.0002, "epoch": 3.6585563010265947, "step": 22630}, {"loss": 0.6459, "grad_norm": 0.945791482925415, "learning_rate": 0.0002, "epoch": 3.66017298520734, "step": 22640}, {"loss": 0.659, "grad_norm": 0.8001811504364014, "learning_rate": 0.0002, "epoch": 3.661789669388085, "step": 22650}, {"loss": 0.6385, "grad_norm": 0.9700816869735718, "learning_rate": 0.0002, "epoch": 3.6634063535688304, "step": 22660}, {"loss": 0.6337, "grad_norm": 0.9053242206573486, "learning_rate": 0.0002, "epoch": 3.6650230377495756, "step": 22670}, {"loss": 0.6335, "grad_norm": 0.944362461566925, "learning_rate": 0.0002, "epoch": 3.666639721930321, "step": 22680}, {"loss": 0.6235, "grad_norm": 1.067489504814148, "learning_rate": 0.0002, "epoch": 3.668256406111066, "step": 22690}, {"loss": 0.698, "grad_norm": 1.0984995365142822, "learning_rate": 0.0002, "epoch": 3.6698730902918113, "step": 22700}, {"loss": 0.6717, "grad_norm": 0.9336317777633667, "learning_rate": 0.0002, "epoch": 3.6714897744725565, "step": 22710}, {"loss": 0.6195, "grad_norm": 0.9261918663978577, "learning_rate": 0.0002, "epoch": 3.673106458653302, "step": 22720}, {"loss": 0.6332, "grad_norm": 0.8648008704185486, "learning_rate": 0.0002, "epoch": 3.6747231428340474, "step": 22730}, {"loss": 0.6576, "grad_norm": 0.7225083708763123, "learning_rate": 0.0002, "epoch": 3.6763398270147927, "step": 22740}, {"loss": 0.6406, "grad_norm": 0.9258282780647278, "learning_rate": 0.0002, "epoch": 3.677956511195538, "step": 22750}, {"loss": 0.6397, "grad_norm": 0.70876145362854, "learning_rate": 0.0002, "epoch": 3.679573195376283, "step": 22760}, {"loss": 0.6821, "grad_norm": 0.8780210018157959, "learning_rate": 0.0002, "epoch": 3.681189879557029, "step": 22770}, {"loss": 0.6036, "grad_norm": 0.8075440526008606, "learning_rate": 0.0002, "epoch": 3.682806563737774, "step": 22780}, {"loss": 0.6561, "grad_norm": 0.8503130674362183, "learning_rate": 0.0002, "epoch": 3.6844232479185193, "step": 22790}, {"loss": 0.6082, "grad_norm": 0.8413618206977844, "learning_rate": 0.0002, "epoch": 3.6860399320992645, "step": 22800}, {"loss": 0.614, "grad_norm": 0.8675165176391602, "learning_rate": 0.0002, "epoch": 3.6876566162800097, "step": 22810}, {"loss": 0.6157, "grad_norm": 0.8235884308815002, "learning_rate": 0.0002, "epoch": 3.689273300460755, "step": 22820}, {"loss": 0.5708, "grad_norm": 0.9477725625038147, "learning_rate": 0.0002, "epoch": 3.6908899846415, "step": 22830}, {"loss": 0.6481, "grad_norm": 0.7883533835411072, "learning_rate": 0.0002, "epoch": 3.6925066688222454, "step": 22840}, {"loss": 0.5872, "grad_norm": 1.047913908958435, "learning_rate": 0.0002, "epoch": 3.6941233530029907, "step": 22850}, {"loss": 0.6176, "grad_norm": 0.9171528816223145, "learning_rate": 0.0002, "epoch": 3.695740037183736, "step": 22860}, {"loss": 0.6204, "grad_norm": 0.9338192343711853, "learning_rate": 0.0002, "epoch": 3.6973567213644816, "step": 22870}, {"loss": 0.686, "grad_norm": 0.8799443244934082, "learning_rate": 0.0002, "epoch": 3.698973405545227, "step": 22880}, {"loss": 0.6206, "grad_norm": 0.8515434861183167, "learning_rate": 0.0002, "epoch": 3.700590089725972, "step": 22890}, {"loss": 0.5954, "grad_norm": 0.7805591821670532, "learning_rate": 0.0002, "epoch": 3.7022067739067173, "step": 22900}, {"loss": 0.6108, "grad_norm": 0.8470911979675293, "learning_rate": 0.0002, "epoch": 3.7038234580874625, "step": 22910}, {"loss": 0.6557, "grad_norm": 0.9452309012413025, "learning_rate": 0.0002, "epoch": 3.705440142268208, "step": 22920}, {"loss": 0.6529, "grad_norm": 0.950243353843689, "learning_rate": 0.0002, "epoch": 3.7070568264489534, "step": 22930}, {"loss": 0.6364, "grad_norm": 0.7882499098777771, "learning_rate": 0.0002, "epoch": 3.7086735106296986, "step": 22940}, {"loss": 0.6462, "grad_norm": 0.8307787775993347, "learning_rate": 0.0002, "epoch": 3.710290194810444, "step": 22950}, {"loss": 0.6371, "grad_norm": 1.0970630645751953, "learning_rate": 0.0002, "epoch": 3.711906878991189, "step": 22960}, {"loss": 0.6281, "grad_norm": 0.8269566297531128, "learning_rate": 0.0002, "epoch": 3.7135235631719343, "step": 22970}, {"loss": 0.6561, "grad_norm": 0.8306704759597778, "learning_rate": 0.0002, "epoch": 3.7151402473526796, "step": 22980}, {"loss": 0.6418, "grad_norm": 0.9710225462913513, "learning_rate": 0.0002, "epoch": 3.716756931533425, "step": 22990}, {"loss": 0.6639, "grad_norm": 0.8890530467033386, "learning_rate": 0.0002, "epoch": 3.71837361571417, "step": 23000}, {"loss": 0.6084, "grad_norm": 0.883522629737854, "learning_rate": 0.0002, "epoch": 3.7199902998949153, "step": 23010}, {"loss": 0.6183, "grad_norm": 0.8662652373313904, "learning_rate": 0.0002, "epoch": 3.721606984075661, "step": 23020}, {"loss": 0.6266, "grad_norm": 0.7228406667709351, "learning_rate": 0.0002, "epoch": 3.723223668256406, "step": 23030}, {"loss": 0.6417, "grad_norm": 1.060792088508606, "learning_rate": 0.0002, "epoch": 3.7248403524371514, "step": 23040}, {"loss": 0.6346, "grad_norm": 1.0119613409042358, "learning_rate": 0.0002, "epoch": 3.7264570366178966, "step": 23050}, {"loss": 0.6466, "grad_norm": 0.9212996959686279, "learning_rate": 0.0002, "epoch": 3.728073720798642, "step": 23060}, {"loss": 0.6454, "grad_norm": 0.925690233707428, "learning_rate": 0.0002, "epoch": 3.7296904049793875, "step": 23070}, {"loss": 0.615, "grad_norm": 0.8323310613632202, "learning_rate": 0.0002, "epoch": 3.7313070891601328, "step": 23080}, {"loss": 0.679, "grad_norm": 0.8966048955917358, "learning_rate": 0.0002, "epoch": 3.732923773340878, "step": 23090}, {"loss": 0.6151, "grad_norm": 0.8995837569236755, "learning_rate": 0.0002, "epoch": 3.7345404575216232, "step": 23100}, {"loss": 0.6143, "grad_norm": 0.8748890161514282, "learning_rate": 0.0002, "epoch": 3.7361571417023685, "step": 23110}, {"loss": 0.6246, "grad_norm": 0.7985540628433228, "learning_rate": 0.0002, "epoch": 3.7377738258831137, "step": 23120}, {"loss": 0.6279, "grad_norm": 1.0240917205810547, "learning_rate": 0.0002, "epoch": 3.739390510063859, "step": 23130}, {"loss": 0.6747, "grad_norm": 0.9181789755821228, "learning_rate": 0.0002, "epoch": 3.741007194244604, "step": 23140}, {"loss": 0.6026, "grad_norm": 0.8896583914756775, "learning_rate": 0.0002, "epoch": 3.7426238784253494, "step": 23150}, {"loss": 0.5972, "grad_norm": 0.8635515570640564, "learning_rate": 0.0002, "epoch": 3.744240562606095, "step": 23160}, {"loss": 0.6683, "grad_norm": 0.8873575329780579, "learning_rate": 0.0002, "epoch": 3.7458572467868403, "step": 23170}, {"loss": 0.6143, "grad_norm": 0.9807148575782776, "learning_rate": 0.0002, "epoch": 3.7474739309675855, "step": 23180}, {"loss": 0.6381, "grad_norm": 0.900477945804596, "learning_rate": 0.0002, "epoch": 3.7490906151483308, "step": 23190}, {"loss": 0.6542, "grad_norm": 0.9379992485046387, "learning_rate": 0.0002, "epoch": 3.750707299329076, "step": 23200}, {"loss": 0.6015, "grad_norm": 0.9649890661239624, "learning_rate": 0.0002, "epoch": 3.752323983509821, "step": 23210}, {"loss": 0.6735, "grad_norm": 0.824442446231842, "learning_rate": 0.0002, "epoch": 3.753940667690567, "step": 23220}, {"loss": 0.5992, "grad_norm": 0.8896150588989258, "learning_rate": 0.0002, "epoch": 3.755557351871312, "step": 23230}, {"loss": 0.6081, "grad_norm": 0.751249372959137, "learning_rate": 0.0002, "epoch": 3.7571740360520574, "step": 23240}, {"loss": 0.629, "grad_norm": 0.9392193555831909, "learning_rate": 0.0002, "epoch": 3.7587907202328026, "step": 23250}, {"loss": 0.6209, "grad_norm": 0.9284586310386658, "learning_rate": 0.0002, "epoch": 3.760407404413548, "step": 23260}, {"loss": 0.6414, "grad_norm": 0.7738175392150879, "learning_rate": 0.0002, "epoch": 3.762024088594293, "step": 23270}, {"loss": 0.6743, "grad_norm": 0.9252978563308716, "learning_rate": 0.0002, "epoch": 3.7636407727750383, "step": 23280}, {"loss": 0.5984, "grad_norm": 0.9501895904541016, "learning_rate": 0.0002, "epoch": 3.7652574569557835, "step": 23290}, {"loss": 0.6568, "grad_norm": 0.9416276216506958, "learning_rate": 0.0002, "epoch": 3.7668741411365287, "step": 23300}, {"loss": 0.6507, "grad_norm": 0.7076631784439087, "learning_rate": 0.0002, "epoch": 3.7684908253172744, "step": 23310}, {"loss": 0.6329, "grad_norm": 0.9864492416381836, "learning_rate": 0.0002, "epoch": 3.7701075094980196, "step": 23320}, {"loss": 0.6537, "grad_norm": 0.8450456261634827, "learning_rate": 0.0002, "epoch": 3.771724193678765, "step": 23330}, {"loss": 0.658, "grad_norm": 1.0768941640853882, "learning_rate": 0.0002, "epoch": 3.77334087785951, "step": 23340}, {"loss": 0.6408, "grad_norm": 0.9956819415092468, "learning_rate": 0.0002, "epoch": 3.7749575620402553, "step": 23350}, {"loss": 0.6464, "grad_norm": 0.9234658479690552, "learning_rate": 0.0002, "epoch": 3.7765742462210006, "step": 23360}, {"loss": 0.6542, "grad_norm": 1.0993858575820923, "learning_rate": 0.0002, "epoch": 3.7781909304017463, "step": 23370}, {"loss": 0.6391, "grad_norm": 0.923159658908844, "learning_rate": 0.0002, "epoch": 3.7798076145824915, "step": 23380}, {"loss": 0.6625, "grad_norm": 0.9311541318893433, "learning_rate": 0.0002, "epoch": 3.7814242987632367, "step": 23390}, {"loss": 0.6535, "grad_norm": 0.919681191444397, "learning_rate": 0.0002, "epoch": 3.783040982943982, "step": 23400}, {"loss": 0.6138, "grad_norm": 1.7406195402145386, "learning_rate": 0.0002, "epoch": 3.784657667124727, "step": 23410}, {"loss": 0.657, "grad_norm": 0.7789074182510376, "learning_rate": 0.0002, "epoch": 3.7862743513054724, "step": 23420}, {"loss": 0.658, "grad_norm": 0.8302814960479736, "learning_rate": 0.0002, "epoch": 3.7878910354862176, "step": 23430}, {"loss": 0.649, "grad_norm": 0.8089349269866943, "learning_rate": 0.0002, "epoch": 3.789507719666963, "step": 23440}, {"loss": 0.6682, "grad_norm": 0.9006284475326538, "learning_rate": 0.0002, "epoch": 3.791124403847708, "step": 23450}, {"loss": 0.6335, "grad_norm": 0.8426766991615295, "learning_rate": 0.0002, "epoch": 3.7927410880284538, "step": 23460}, {"loss": 0.6364, "grad_norm": 1.2576252222061157, "learning_rate": 0.0002, "epoch": 3.794357772209199, "step": 23470}, {"loss": 0.6324, "grad_norm": 1.0307610034942627, "learning_rate": 0.0002, "epoch": 3.7959744563899442, "step": 23480}, {"loss": 0.6262, "grad_norm": 0.8525972962379456, "learning_rate": 0.0002, "epoch": 3.7975911405706895, "step": 23490}, {"loss": 0.6757, "grad_norm": 1.159039855003357, "learning_rate": 0.0002, "epoch": 3.7992078247514347, "step": 23500}, {"loss": 0.6414, "grad_norm": 1.4193549156188965, "learning_rate": 0.0002, "epoch": 3.80082450893218, "step": 23510}, {"loss": 0.6413, "grad_norm": 0.8245543837547302, "learning_rate": 0.0002, "epoch": 3.8024411931129256, "step": 23520}, {"loss": 0.6417, "grad_norm": 0.8847230076789856, "learning_rate": 0.0002, "epoch": 3.804057877293671, "step": 23530}, {"loss": 0.6415, "grad_norm": 0.9574624300003052, "learning_rate": 0.0002, "epoch": 3.805674561474416, "step": 23540}, {"loss": 0.5765, "grad_norm": 1.048020601272583, "learning_rate": 0.0002, "epoch": 3.8072912456551613, "step": 23550}, {"loss": 0.6497, "grad_norm": 0.8302255868911743, "learning_rate": 0.0002, "epoch": 3.8089079298359065, "step": 23560}, {"loss": 0.6534, "grad_norm": 0.8269215822219849, "learning_rate": 0.0002, "epoch": 3.8105246140166518, "step": 23570}, {"loss": 0.6294, "grad_norm": 0.9375753402709961, "learning_rate": 0.0002, "epoch": 3.812141298197397, "step": 23580}, {"loss": 0.6132, "grad_norm": 1.0234097242355347, "learning_rate": 0.0002, "epoch": 3.8137579823781422, "step": 23590}, {"loss": 0.6625, "grad_norm": 0.8978445529937744, "learning_rate": 0.0002, "epoch": 3.8153746665588875, "step": 23600}, {"loss": 0.6315, "grad_norm": 0.7929515838623047, "learning_rate": 0.0002, "epoch": 3.816991350739633, "step": 23610}, {"loss": 0.6387, "grad_norm": 1.3255881071090698, "learning_rate": 0.0002, "epoch": 3.8186080349203784, "step": 23620}, {"loss": 0.5947, "grad_norm": 0.9188598990440369, "learning_rate": 0.0002, "epoch": 3.8202247191011236, "step": 23630}, {"loss": 0.6152, "grad_norm": 0.8811675906181335, "learning_rate": 0.0002, "epoch": 3.821841403281869, "step": 23640}, {"loss": 0.6253, "grad_norm": 0.8061038255691528, "learning_rate": 0.0002, "epoch": 3.823458087462614, "step": 23650}, {"loss": 0.6517, "grad_norm": 0.9975376129150391, "learning_rate": 0.0002, "epoch": 3.8250747716433597, "step": 23660}, {"loss": 0.6288, "grad_norm": 0.8036105036735535, "learning_rate": 0.0002, "epoch": 3.826691455824105, "step": 23670}, {"loss": 0.6845, "grad_norm": 0.7401984333992004, "learning_rate": 0.0002, "epoch": 3.82830814000485, "step": 23680}, {"loss": 0.6423, "grad_norm": 0.829753041267395, "learning_rate": 0.0002, "epoch": 3.8299248241855954, "step": 23690}, {"loss": 0.6611, "grad_norm": 0.8753240704536438, "learning_rate": 0.0002, "epoch": 3.8315415083663407, "step": 23700}, {"loss": 0.6686, "grad_norm": 0.8157842755317688, "learning_rate": 0.0002, "epoch": 3.833158192547086, "step": 23710}, {"loss": 0.6181, "grad_norm": 0.6183798909187317, "learning_rate": 0.0002, "epoch": 3.834774876727831, "step": 23720}, {"loss": 0.5965, "grad_norm": 0.9548442363739014, "learning_rate": 0.0002, "epoch": 3.8363915609085764, "step": 23730}, {"loss": 0.6456, "grad_norm": 0.8319669961929321, "learning_rate": 0.0002, "epoch": 3.8380082450893216, "step": 23740}, {"loss": 0.6585, "grad_norm": 0.9718693494796753, "learning_rate": 0.0002, "epoch": 3.839624929270067, "step": 23750}, {"loss": 0.6518, "grad_norm": 0.8672235012054443, "learning_rate": 0.0002, "epoch": 3.8412416134508125, "step": 23760}, {"loss": 0.6774, "grad_norm": 1.1210707426071167, "learning_rate": 0.0002, "epoch": 3.8428582976315577, "step": 23770}, {"loss": 0.5923, "grad_norm": 0.9177767634391785, "learning_rate": 0.0002, "epoch": 3.844474981812303, "step": 23780}, {"loss": 0.6286, "grad_norm": 0.8714171648025513, "learning_rate": 0.0002, "epoch": 3.846091665993048, "step": 23790}, {"loss": 0.6302, "grad_norm": 1.1853246688842773, "learning_rate": 0.0002, "epoch": 3.8477083501737934, "step": 23800}, {"loss": 0.6144, "grad_norm": 0.8091260194778442, "learning_rate": 0.0002, "epoch": 3.849325034354539, "step": 23810}, {"loss": 0.658, "grad_norm": 0.9710774421691895, "learning_rate": 0.0002, "epoch": 3.8509417185352843, "step": 23820}, {"loss": 0.6151, "grad_norm": 0.7648707628250122, "learning_rate": 0.0002, "epoch": 3.8525584027160296, "step": 23830}, {"loss": 0.6013, "grad_norm": 0.7809253931045532, "learning_rate": 0.0002, "epoch": 3.854175086896775, "step": 23840}, {"loss": 0.6006, "grad_norm": 0.8337951898574829, "learning_rate": 0.0002, "epoch": 3.85579177107752, "step": 23850}, {"loss": 0.6456, "grad_norm": 0.9271913170814514, "learning_rate": 0.0002, "epoch": 3.8574084552582653, "step": 23860}, {"loss": 0.6671, "grad_norm": 0.985334038734436, "learning_rate": 0.0002, "epoch": 3.8590251394390105, "step": 23870}, {"loss": 0.6693, "grad_norm": 0.8458583354949951, "learning_rate": 0.0002, "epoch": 3.8606418236197557, "step": 23880}, {"loss": 0.6207, "grad_norm": 1.015348196029663, "learning_rate": 0.0002, "epoch": 3.862258507800501, "step": 23890}, {"loss": 0.649, "grad_norm": 1.0121688842773438, "learning_rate": 0.0002, "epoch": 3.8638751919812466, "step": 23900}, {"loss": 0.5921, "grad_norm": 0.8883971571922302, "learning_rate": 0.0002, "epoch": 3.865491876161992, "step": 23910}, {"loss": 0.6597, "grad_norm": 1.028086543083191, "learning_rate": 0.0002, "epoch": 3.867108560342737, "step": 23920}, {"loss": 0.6654, "grad_norm": 0.9645734429359436, "learning_rate": 0.0002, "epoch": 3.8687252445234823, "step": 23930}, {"loss": 0.6328, "grad_norm": 0.8235350251197815, "learning_rate": 0.0002, "epoch": 3.8703419287042276, "step": 23940}, {"loss": 0.6387, "grad_norm": 1.0298916101455688, "learning_rate": 0.0002, "epoch": 3.871958612884973, "step": 23950}, {"loss": 0.5966, "grad_norm": 1.0063377618789673, "learning_rate": 0.0002, "epoch": 3.8735752970657185, "step": 23960}, {"loss": 0.6234, "grad_norm": 0.9230626821517944, "learning_rate": 0.0002, "epoch": 3.8751919812464637, "step": 23970}, {"loss": 0.6159, "grad_norm": 0.9243063926696777, "learning_rate": 0.0002, "epoch": 3.876808665427209, "step": 23980}, {"loss": 0.6035, "grad_norm": 1.0211291313171387, "learning_rate": 0.0002, "epoch": 3.878425349607954, "step": 23990}, {"loss": 0.6351, "grad_norm": 0.7800535559654236, "learning_rate": 0.0002, "epoch": 3.8800420337886994, "step": 24000}, {"loss": 0.7, "grad_norm": 0.7904248833656311, "learning_rate": 0.0002, "epoch": 3.8816587179694446, "step": 24010}, {"loss": 0.6516, "grad_norm": 1.1975988149642944, "learning_rate": 0.0002, "epoch": 3.88327540215019, "step": 24020}, {"loss": 0.6006, "grad_norm": 1.0626593828201294, "learning_rate": 0.0002, "epoch": 3.884892086330935, "step": 24030}, {"loss": 0.6115, "grad_norm": 0.9012193083763123, "learning_rate": 0.0002, "epoch": 3.8865087705116803, "step": 24040}, {"loss": 0.6786, "grad_norm": 1.1159172058105469, "learning_rate": 0.0002, "epoch": 3.888125454692426, "step": 24050}, {"loss": 0.6635, "grad_norm": 1.276838779449463, "learning_rate": 0.0002, "epoch": 3.889742138873171, "step": 24060}, {"loss": 0.5985, "grad_norm": 0.8467690348625183, "learning_rate": 0.0002, "epoch": 3.8913588230539164, "step": 24070}, {"loss": 0.6655, "grad_norm": 0.9862841963768005, "learning_rate": 0.0002, "epoch": 3.8929755072346617, "step": 24080}, {"loss": 0.6098, "grad_norm": 0.7134621739387512, "learning_rate": 0.0002, "epoch": 3.894592191415407, "step": 24090}, {"loss": 0.618, "grad_norm": 0.8178175091743469, "learning_rate": 0.0002, "epoch": 3.896208875596152, "step": 24100}, {"loss": 0.6147, "grad_norm": 0.9229172468185425, "learning_rate": 0.0002, "epoch": 3.897825559776898, "step": 24110}, {"loss": 0.6554, "grad_norm": 1.0878316164016724, "learning_rate": 0.0002, "epoch": 3.899442243957643, "step": 24120}, {"loss": 0.6616, "grad_norm": 0.971645712852478, "learning_rate": 0.0002, "epoch": 3.9010589281383883, "step": 24130}, {"loss": 0.6228, "grad_norm": 0.8862188458442688, "learning_rate": 0.0002, "epoch": 3.9026756123191335, "step": 24140}, {"loss": 0.6192, "grad_norm": 0.9126982688903809, "learning_rate": 0.0002, "epoch": 3.9042922964998787, "step": 24150}, {"loss": 0.6734, "grad_norm": 0.8833470940589905, "learning_rate": 0.0002, "epoch": 3.905908980680624, "step": 24160}, {"loss": 0.5832, "grad_norm": 0.8320947885513306, "learning_rate": 0.0002, "epoch": 3.907525664861369, "step": 24170}, {"loss": 0.6247, "grad_norm": 0.9156602025032043, "learning_rate": 0.0002, "epoch": 3.9091423490421144, "step": 24180}, {"loss": 0.6678, "grad_norm": 1.029181957244873, "learning_rate": 0.0002, "epoch": 3.9107590332228597, "step": 24190}, {"loss": 0.6565, "grad_norm": 0.9052802324295044, "learning_rate": 0.0002, "epoch": 3.9123757174036053, "step": 24200}, {"loss": 0.6346, "grad_norm": 0.8847255110740662, "learning_rate": 0.0002, "epoch": 3.9139924015843506, "step": 24210}, {"loss": 0.6343, "grad_norm": 0.9642062187194824, "learning_rate": 0.0002, "epoch": 3.915609085765096, "step": 24220}, {"loss": 0.6557, "grad_norm": 0.8629093766212463, "learning_rate": 0.0002, "epoch": 3.917225769945841, "step": 24230}, {"loss": 0.6086, "grad_norm": 0.8674976825714111, "learning_rate": 0.0002, "epoch": 3.9188424541265863, "step": 24240}, {"loss": 0.5874, "grad_norm": 1.104846477508545, "learning_rate": 0.0002, "epoch": 3.9204591383073315, "step": 24250}, {"loss": 0.6501, "grad_norm": 1.0874955654144287, "learning_rate": 0.0002, "epoch": 3.922075822488077, "step": 24260}, {"loss": 0.6455, "grad_norm": 0.8689812421798706, "learning_rate": 0.0002, "epoch": 3.9236925066688224, "step": 24270}, {"loss": 0.5893, "grad_norm": 0.9724617004394531, "learning_rate": 0.0002, "epoch": 3.9253091908495676, "step": 24280}, {"loss": 0.6616, "grad_norm": 0.9165538549423218, "learning_rate": 0.0002, "epoch": 3.926925875030313, "step": 24290}, {"loss": 0.645, "grad_norm": 0.9307710528373718, "learning_rate": 0.0002, "epoch": 3.928542559211058, "step": 24300}, {"loss": 0.6071, "grad_norm": 0.8589295148849487, "learning_rate": 0.0002, "epoch": 3.9301592433918033, "step": 24310}, {"loss": 0.6662, "grad_norm": 0.9151099920272827, "learning_rate": 0.0002, "epoch": 3.9317759275725486, "step": 24320}, {"loss": 0.7075, "grad_norm": 0.9633517265319824, "learning_rate": 0.0002, "epoch": 3.933392611753294, "step": 24330}, {"loss": 0.6432, "grad_norm": 0.9521116018295288, "learning_rate": 0.0002, "epoch": 3.935009295934039, "step": 24340}, {"loss": 0.6457, "grad_norm": 0.8366776704788208, "learning_rate": 0.0002, "epoch": 3.9366259801147847, "step": 24350}, {"loss": 0.6139, "grad_norm": 0.8972663283348083, "learning_rate": 0.0002, "epoch": 3.93824266429553, "step": 24360}, {"loss": 0.661, "grad_norm": 0.8102919459342957, "learning_rate": 0.0002, "epoch": 3.939859348476275, "step": 24370}, {"loss": 0.6388, "grad_norm": 0.8189975023269653, "learning_rate": 0.0002, "epoch": 3.9414760326570204, "step": 24380}, {"loss": 0.6818, "grad_norm": 0.9569464921951294, "learning_rate": 0.0002, "epoch": 3.9430927168377656, "step": 24390}, {"loss": 0.6999, "grad_norm": 0.7459101676940918, "learning_rate": 0.0002, "epoch": 3.9447094010185113, "step": 24400}, {"loss": 0.6069, "grad_norm": 0.8536974787712097, "learning_rate": 0.0002, "epoch": 3.9463260851992565, "step": 24410}, {"loss": 0.5683, "grad_norm": 0.8763698935508728, "learning_rate": 0.0002, "epoch": 3.9479427693800018, "step": 24420}, {"loss": 0.6478, "grad_norm": 0.9381106495857239, "learning_rate": 0.0002, "epoch": 3.949559453560747, "step": 24430}, {"loss": 0.6371, "grad_norm": 0.934440016746521, "learning_rate": 0.0002, "epoch": 3.9511761377414922, "step": 24440}, {"loss": 0.6393, "grad_norm": 0.903918981552124, "learning_rate": 0.0002, "epoch": 3.9527928219222375, "step": 24450}, {"loss": 0.6175, "grad_norm": 0.8771953582763672, "learning_rate": 0.0002, "epoch": 3.9544095061029827, "step": 24460}, {"loss": 0.6971, "grad_norm": 1.0375410318374634, "learning_rate": 0.0002, "epoch": 3.956026190283728, "step": 24470}, {"loss": 0.6313, "grad_norm": 0.9439185261726379, "learning_rate": 0.0002, "epoch": 3.957642874464473, "step": 24480}, {"loss": 0.6076, "grad_norm": 0.935467004776001, "learning_rate": 0.0002, "epoch": 3.9592595586452184, "step": 24490}, {"loss": 0.6437, "grad_norm": 0.6900772452354431, "learning_rate": 0.0002, "epoch": 3.960876242825964, "step": 24500}, {"loss": 0.6445, "grad_norm": 1.0172916650772095, "learning_rate": 0.0002, "epoch": 3.9624929270067093, "step": 24510}, {"loss": 0.6308, "grad_norm": 0.9167046546936035, "learning_rate": 0.0002, "epoch": 3.9641096111874545, "step": 24520}, {"loss": 0.6519, "grad_norm": 0.7230527997016907, "learning_rate": 0.0002, "epoch": 3.9657262953681998, "step": 24530}, {"loss": 0.6564, "grad_norm": 0.8980403542518616, "learning_rate": 0.0002, "epoch": 3.967342979548945, "step": 24540}, {"loss": 0.6099, "grad_norm": 0.8555465936660767, "learning_rate": 0.0002, "epoch": 3.9689596637296907, "step": 24550}, {"loss": 0.6617, "grad_norm": 0.7825445532798767, "learning_rate": 0.0002, "epoch": 3.970576347910436, "step": 24560}, {"loss": 0.604, "grad_norm": 0.7273133993148804, "learning_rate": 0.0002, "epoch": 3.972193032091181, "step": 24570}, {"loss": 0.6427, "grad_norm": 0.9612047672271729, "learning_rate": 0.0002, "epoch": 3.9738097162719264, "step": 24580}, {"loss": 0.6426, "grad_norm": 0.9865460991859436, "learning_rate": 0.0002, "epoch": 3.9754264004526716, "step": 24590}, {"loss": 0.6052, "grad_norm": 0.8638762831687927, "learning_rate": 0.0002, "epoch": 3.977043084633417, "step": 24600}, {"loss": 0.6097, "grad_norm": 1.0096198320388794, "learning_rate": 0.0002, "epoch": 3.978659768814162, "step": 24610}, {"loss": 0.6664, "grad_norm": 0.8475532531738281, "learning_rate": 0.0002, "epoch": 3.9802764529949073, "step": 24620}, {"loss": 0.6711, "grad_norm": 0.9696195721626282, "learning_rate": 0.0002, "epoch": 3.9818931371756525, "step": 24630}, {"loss": 0.6446, "grad_norm": 0.7499843239784241, "learning_rate": 0.0002, "epoch": 3.9835098213563978, "step": 24640}, {"loss": 0.6054, "grad_norm": 0.8865424990653992, "learning_rate": 0.0002, "epoch": 3.9851265055371434, "step": 24650}, {"loss": 0.5975, "grad_norm": 0.8089959025382996, "learning_rate": 0.0002, "epoch": 3.9867431897178887, "step": 24660}, {"loss": 0.6677, "grad_norm": 0.6946012377738953, "learning_rate": 0.0002, "epoch": 3.988359873898634, "step": 24670}, {"loss": 0.6329, "grad_norm": 0.7991759181022644, "learning_rate": 0.0002, "epoch": 3.989976558079379, "step": 24680}, {"loss": 0.6449, "grad_norm": 0.8803931474685669, "learning_rate": 0.0002, "epoch": 3.9915932422601244, "step": 24690}, {"loss": 0.7091, "grad_norm": 0.8848299980163574, "learning_rate": 0.0002, "epoch": 3.99320992644087, "step": 24700}, {"loss": 0.6551, "grad_norm": 0.7448889017105103, "learning_rate": 0.0002, "epoch": 3.9948266106216153, "step": 24710}, {"loss": 0.6432, "grad_norm": 0.9361620545387268, "learning_rate": 0.0002, "epoch": 3.9964432948023605, "step": 24720}, {"loss": 0.5917, "grad_norm": 0.9958081245422363, "learning_rate": 0.0002, "epoch": 3.9980599789831057, "step": 24730}, {"loss": 0.6567, "grad_norm": 1.026004672050476, "learning_rate": 0.0002, "epoch": 3.999676663163851, "step": 24740}, {"eval_loss": 1.1524168252944946, "eval_runtime": 122.1585, "eval_samples_per_second": 6.0, "eval_steps_per_second": 0.753, "epoch": 4.0, "step": 24742}, {"loss": 0.6057, "grad_norm": 1.0664808750152588, "learning_rate": 0.0002, "epoch": 4.001293347344596, "step": 24750}, {"loss": 0.5644, "grad_norm": 1.0113720893859863, "learning_rate": 0.0002, "epoch": 4.002910031525341, "step": 24760}, {"loss": 0.5628, "grad_norm": 0.991486668586731, "learning_rate": 0.0002, "epoch": 4.004526715706087, "step": 24770}, {"loss": 0.508, "grad_norm": 0.951754629611969, "learning_rate": 0.0002, "epoch": 4.006143399886832, "step": 24780}, {"loss": 0.5314, "grad_norm": 1.13059401512146, "learning_rate": 0.0002, "epoch": 4.007760084067577, "step": 24790}, {"loss": 0.5323, "grad_norm": 0.9343926310539246, "learning_rate": 0.0002, "epoch": 4.009376768248322, "step": 24800}, {"loss": 0.5161, "grad_norm": 1.0680590867996216, "learning_rate": 0.0002, "epoch": 4.010993452429068, "step": 24810}, {"loss": 0.513, "grad_norm": 1.0022706985473633, "learning_rate": 0.0002, "epoch": 4.012610136609814, "step": 24820}, {"loss": 0.543, "grad_norm": 1.0285297632217407, "learning_rate": 0.0002, "epoch": 4.014226820790559, "step": 24830}, {"loss": 0.5311, "grad_norm": 0.8347002863883972, "learning_rate": 0.0002, "epoch": 4.015843504971304, "step": 24840}, {"loss": 0.5655, "grad_norm": 0.9675396680831909, "learning_rate": 0.0002, "epoch": 4.017460189152049, "step": 24850}, {"loss": 0.5625, "grad_norm": 0.9238511323928833, "learning_rate": 0.0002, "epoch": 4.019076873332795, "step": 24860}, {"loss": 0.5327, "grad_norm": 1.1576941013336182, "learning_rate": 0.0002, "epoch": 4.02069355751354, "step": 24870}, {"loss": 0.5533, "grad_norm": 0.8583757281303406, "learning_rate": 0.0002, "epoch": 4.022310241694285, "step": 24880}, {"loss": 0.5483, "grad_norm": 0.9816817045211792, "learning_rate": 0.0002, "epoch": 4.02392692587503, "step": 24890}, {"loss": 0.5605, "grad_norm": 0.955073893070221, "learning_rate": 0.0002, "epoch": 4.0255436100557755, "step": 24900}, {"loss": 0.4896, "grad_norm": 1.1054974794387817, "learning_rate": 0.0002, "epoch": 4.027160294236521, "step": 24910}, {"loss": 0.5246, "grad_norm": 1.1240060329437256, "learning_rate": 0.0002, "epoch": 4.028776978417266, "step": 24920}, {"loss": 0.5451, "grad_norm": 0.9512825012207031, "learning_rate": 0.0002, "epoch": 4.030393662598011, "step": 24930}, {"loss": 0.5584, "grad_norm": 0.85965496301651, "learning_rate": 0.0002, "epoch": 4.0320103467787565, "step": 24940}, {"loss": 0.5564, "grad_norm": 0.9378061294555664, "learning_rate": 0.0002, "epoch": 4.033627030959502, "step": 24950}, {"loss": 0.5008, "grad_norm": 0.9655424356460571, "learning_rate": 0.0002, "epoch": 4.035243715140247, "step": 24960}, {"loss": 0.5538, "grad_norm": 1.1393707990646362, "learning_rate": 0.0002, "epoch": 4.036860399320993, "step": 24970}, {"loss": 0.5785, "grad_norm": 1.0220451354980469, "learning_rate": 0.0002, "epoch": 4.038477083501738, "step": 24980}, {"loss": 0.5813, "grad_norm": 0.9785808324813843, "learning_rate": 0.0002, "epoch": 4.0400937676824835, "step": 24990}, {"loss": 0.5153, "grad_norm": 1.0257649421691895, "learning_rate": 0.0002, "epoch": 4.041710451863229, "step": 25000}, {"loss": 0.5658, "grad_norm": 0.9737892150878906, "learning_rate": 0.0002, "epoch": 4.043327136043974, "step": 25010}, {"loss": 0.5515, "grad_norm": 0.7416959404945374, "learning_rate": 0.0002, "epoch": 4.044943820224719, "step": 25020}, {"loss": 0.5372, "grad_norm": 0.7909596562385559, "learning_rate": 0.0002, "epoch": 4.046560504405464, "step": 25030}, {"loss": 0.5265, "grad_norm": 0.8923130631446838, "learning_rate": 0.0002, "epoch": 4.04817718858621, "step": 25040}, {"loss": 0.5035, "grad_norm": 0.9044941663742065, "learning_rate": 0.0002, "epoch": 4.049793872766955, "step": 25050}, {"loss": 0.5135, "grad_norm": 0.866352379322052, "learning_rate": 0.0002, "epoch": 4.0514105569477, "step": 25060}, {"loss": 0.5956, "grad_norm": 1.544549822807312, "learning_rate": 0.0002, "epoch": 4.053027241128445, "step": 25070}, {"loss": 0.5418, "grad_norm": 0.8426995277404785, "learning_rate": 0.0002, "epoch": 4.054643925309191, "step": 25080}, {"loss": 0.5537, "grad_norm": 0.9797548651695251, "learning_rate": 0.0002, "epoch": 4.056260609489936, "step": 25090}, {"loss": 0.55, "grad_norm": 0.8468434810638428, "learning_rate": 0.0002, "epoch": 4.057877293670681, "step": 25100}, {"loss": 0.5242, "grad_norm": 0.9294559955596924, "learning_rate": 0.0002, "epoch": 4.059493977851426, "step": 25110}, {"loss": 0.5295, "grad_norm": 0.9686688780784607, "learning_rate": 0.0002, "epoch": 4.061110662032172, "step": 25120}, {"loss": 0.5642, "grad_norm": 0.8042728304862976, "learning_rate": 0.0002, "epoch": 4.062727346212918, "step": 25130}, {"loss": 0.548, "grad_norm": 1.165160894393921, "learning_rate": 0.0002, "epoch": 4.064344030393663, "step": 25140}, {"loss": 0.5473, "grad_norm": 1.2161961793899536, "learning_rate": 0.0002, "epoch": 4.065960714574408, "step": 25150}, {"loss": 0.5217, "grad_norm": 1.0762810707092285, "learning_rate": 0.0002, "epoch": 4.067577398755153, "step": 25160}, {"loss": 0.5886, "grad_norm": 0.7580869793891907, "learning_rate": 0.0002, "epoch": 4.069194082935899, "step": 25170}, {"loss": 0.5401, "grad_norm": 0.9630117416381836, "learning_rate": 0.0002, "epoch": 4.070810767116644, "step": 25180}, {"loss": 0.5378, "grad_norm": 0.9049716591835022, "learning_rate": 0.0002, "epoch": 4.072427451297389, "step": 25190}, {"loss": 0.5266, "grad_norm": 1.1536930799484253, "learning_rate": 0.0002, "epoch": 4.074044135478134, "step": 25200}, {"loss": 0.5523, "grad_norm": 0.901461124420166, "learning_rate": 0.0002, "epoch": 4.0756608196588795, "step": 25210}, {"loss": 0.5132, "grad_norm": 1.3318437337875366, "learning_rate": 0.0002, "epoch": 4.077277503839625, "step": 25220}, {"loss": 0.5317, "grad_norm": 0.8811455368995667, "learning_rate": 0.0002, "epoch": 4.07889418802037, "step": 25230}, {"loss": 0.5798, "grad_norm": 1.0564165115356445, "learning_rate": 0.0002, "epoch": 4.080510872201115, "step": 25240}, {"loss": 0.5472, "grad_norm": 1.1008027791976929, "learning_rate": 0.0002, "epoch": 4.08212755638186, "step": 25250}, {"loss": 0.5195, "grad_norm": 1.150097131729126, "learning_rate": 0.0002, "epoch": 4.083744240562606, "step": 25260}, {"loss": 0.5321, "grad_norm": 0.9339924454689026, "learning_rate": 0.0002, "epoch": 4.085360924743352, "step": 25270}, {"loss": 0.5597, "grad_norm": 1.0902045965194702, "learning_rate": 0.0002, "epoch": 4.086977608924097, "step": 25280}, {"loss": 0.5203, "grad_norm": 0.8483911156654358, "learning_rate": 0.0002, "epoch": 4.088594293104842, "step": 25290}, {"loss": 0.5697, "grad_norm": 0.9477024674415588, "learning_rate": 0.0002, "epoch": 4.0902109772855875, "step": 25300}, {"loss": 0.5384, "grad_norm": 0.9500215649604797, "learning_rate": 0.0002, "epoch": 4.091827661466333, "step": 25310}, {"loss": 0.5045, "grad_norm": 1.040468454360962, "learning_rate": 0.0002, "epoch": 4.093444345647078, "step": 25320}, {"loss": 0.5488, "grad_norm": 0.7457592487335205, "learning_rate": 0.0002, "epoch": 4.095061029827823, "step": 25330}, {"loss": 0.609, "grad_norm": 1.2092097997665405, "learning_rate": 0.0002, "epoch": 4.096677714008568, "step": 25340}, {"loss": 0.5174, "grad_norm": 0.9652107954025269, "learning_rate": 0.0002, "epoch": 4.098294398189314, "step": 25350}, {"loss": 0.5559, "grad_norm": 0.8464955687522888, "learning_rate": 0.0002, "epoch": 4.099911082370059, "step": 25360}, {"loss": 0.5635, "grad_norm": 0.875026285648346, "learning_rate": 0.0002, "epoch": 4.101527766550804, "step": 25370}, {"loss": 0.5774, "grad_norm": 0.9241740107536316, "learning_rate": 0.0002, "epoch": 4.103144450731549, "step": 25380}, {"loss": 0.5578, "grad_norm": 0.9769546389579773, "learning_rate": 0.0002, "epoch": 4.1047611349122946, "step": 25390}, {"loss": 0.567, "grad_norm": 1.1501960754394531, "learning_rate": 0.0002, "epoch": 4.10637781909304, "step": 25400}, {"loss": 0.5241, "grad_norm": 0.9135243892669678, "learning_rate": 0.0002, "epoch": 4.107994503273786, "step": 25410}, {"loss": 0.5152, "grad_norm": 0.9905396103858948, "learning_rate": 0.0002, "epoch": 4.109611187454531, "step": 25420}, {"loss": 0.5064, "grad_norm": 0.9845104217529297, "learning_rate": 0.0002, "epoch": 4.111227871635276, "step": 25430}, {"loss": 0.5029, "grad_norm": 0.8326883912086487, "learning_rate": 0.0002, "epoch": 4.112844555816022, "step": 25440}, {"loss": 0.5312, "grad_norm": 0.9264556765556335, "learning_rate": 0.0002, "epoch": 4.114461239996767, "step": 25450}, {"loss": 0.5968, "grad_norm": 1.043080449104309, "learning_rate": 0.0002, "epoch": 4.116077924177512, "step": 25460}, {"loss": 0.5773, "grad_norm": 0.8533386588096619, "learning_rate": 0.0002, "epoch": 4.117694608358257, "step": 25470}, {"loss": 0.5584, "grad_norm": 1.0133965015411377, "learning_rate": 0.0002, "epoch": 4.1193112925390025, "step": 25480}, {"loss": 0.566, "grad_norm": 0.7476310133934021, "learning_rate": 0.0002, "epoch": 4.120927976719748, "step": 25490}, {"loss": 0.5189, "grad_norm": 1.1247259378433228, "learning_rate": 0.0002, "epoch": 4.122544660900493, "step": 25500}, {"loss": 0.5751, "grad_norm": 1.0764678716659546, "learning_rate": 0.0002, "epoch": 4.124161345081238, "step": 25510}, {"loss": 0.5391, "grad_norm": 0.7679798007011414, "learning_rate": 0.0002, "epoch": 4.1257780292619834, "step": 25520}, {"loss": 0.5233, "grad_norm": 0.8877071142196655, "learning_rate": 0.0002, "epoch": 4.127394713442729, "step": 25530}, {"loss": 0.5769, "grad_norm": 1.0440239906311035, "learning_rate": 0.0002, "epoch": 4.129011397623474, "step": 25540}, {"loss": 0.5723, "grad_norm": 0.984145998954773, "learning_rate": 0.0002, "epoch": 4.130628081804219, "step": 25550}, {"loss": 0.5741, "grad_norm": 0.8667055368423462, "learning_rate": 0.0002, "epoch": 4.132244765984965, "step": 25560}, {"loss": 0.5816, "grad_norm": 1.1300835609436035, "learning_rate": 0.0002, "epoch": 4.1338614501657105, "step": 25570}, {"loss": 0.524, "grad_norm": 0.9314348101615906, "learning_rate": 0.0002, "epoch": 4.135478134346456, "step": 25580}, {"loss": 0.5283, "grad_norm": 0.7731879949569702, "learning_rate": 0.0002, "epoch": 4.137094818527201, "step": 25590}, {"loss": 0.5307, "grad_norm": 1.0080097913742065, "learning_rate": 0.0002, "epoch": 4.138711502707946, "step": 25600}, {"loss": 0.5759, "grad_norm": 1.2475038766860962, "learning_rate": 0.0002, "epoch": 4.140328186888691, "step": 25610}, {"loss": 0.55, "grad_norm": 0.9912930727005005, "learning_rate": 0.0002, "epoch": 4.141944871069437, "step": 25620}, {"loss": 0.5624, "grad_norm": 0.9088651537895203, "learning_rate": 0.0002, "epoch": 4.143561555250182, "step": 25630}, {"loss": 0.5393, "grad_norm": 0.8940697312355042, "learning_rate": 0.0002, "epoch": 4.145178239430927, "step": 25640}, {"loss": 0.5341, "grad_norm": 1.0798203945159912, "learning_rate": 0.0002, "epoch": 4.146794923611672, "step": 25650}, {"loss": 0.5987, "grad_norm": 0.955172061920166, "learning_rate": 0.0002, "epoch": 4.148411607792418, "step": 25660}, {"loss": 0.569, "grad_norm": 0.9692716002464294, "learning_rate": 0.0002, "epoch": 4.150028291973163, "step": 25670}, {"loss": 0.5478, "grad_norm": 1.0813939571380615, "learning_rate": 0.0002, "epoch": 4.151644976153908, "step": 25680}, {"loss": 0.5383, "grad_norm": 1.135675072669983, "learning_rate": 0.0002, "epoch": 4.153261660334653, "step": 25690}, {"loss": 0.5247, "grad_norm": 1.0392236709594727, "learning_rate": 0.0002, "epoch": 4.1548783445153985, "step": 25700}, {"loss": 0.5204, "grad_norm": 0.9473116993904114, "learning_rate": 0.0002, "epoch": 4.156495028696145, "step": 25710}, {"loss": 0.5339, "grad_norm": 0.712493896484375, "learning_rate": 0.0002, "epoch": 4.15811171287689, "step": 25720}, {"loss": 0.5781, "grad_norm": 0.8724465370178223, "learning_rate": 0.0002, "epoch": 4.159728397057635, "step": 25730}, {"loss": 0.5325, "grad_norm": 0.9870015978813171, "learning_rate": 0.0002, "epoch": 4.16134508123838, "step": 25740}, {"loss": 0.5503, "grad_norm": 1.025273084640503, "learning_rate": 0.0002, "epoch": 4.1629617654191255, "step": 25750}, {"loss": 0.5223, "grad_norm": 0.9243090152740479, "learning_rate": 0.0002, "epoch": 4.164578449599871, "step": 25760}, {"loss": 0.5177, "grad_norm": 1.1656451225280762, "learning_rate": 0.0002, "epoch": 4.166195133780616, "step": 25770}, {"loss": 0.5334, "grad_norm": 0.936358630657196, "learning_rate": 0.0002, "epoch": 4.167811817961361, "step": 25780}, {"loss": 0.5236, "grad_norm": 0.8618208169937134, "learning_rate": 0.0002, "epoch": 4.1694285021421065, "step": 25790}, {"loss": 0.5186, "grad_norm": 0.8580600023269653, "learning_rate": 0.0002, "epoch": 4.171045186322852, "step": 25800}, {"loss": 0.5212, "grad_norm": 1.0128562450408936, "learning_rate": 0.0002, "epoch": 4.172661870503597, "step": 25810}, {"loss": 0.5404, "grad_norm": 0.854865312576294, "learning_rate": 0.0002, "epoch": 4.174278554684342, "step": 25820}, {"loss": 0.5377, "grad_norm": 1.235082745552063, "learning_rate": 0.0002, "epoch": 4.175895238865087, "step": 25830}, {"loss": 0.5614, "grad_norm": 0.9796220660209656, "learning_rate": 0.0002, "epoch": 4.177511923045833, "step": 25840}, {"loss": 0.5689, "grad_norm": 0.8922094702720642, "learning_rate": 0.0002, "epoch": 4.179128607226578, "step": 25850}, {"loss": 0.5806, "grad_norm": 0.9672530293464661, "learning_rate": 0.0002, "epoch": 4.180745291407324, "step": 25860}, {"loss": 0.5074, "grad_norm": 0.8662548661231995, "learning_rate": 0.0002, "epoch": 4.182361975588069, "step": 25870}, {"loss": 0.5329, "grad_norm": 0.7938798069953918, "learning_rate": 0.0002, "epoch": 4.1839786597688144, "step": 25880}, {"loss": 0.5427, "grad_norm": 1.0517958402633667, "learning_rate": 0.0002, "epoch": 4.18559534394956, "step": 25890}, {"loss": 0.5147, "grad_norm": 0.8939275145530701, "learning_rate": 0.0002, "epoch": 4.187212028130305, "step": 25900}, {"loss": 0.5199, "grad_norm": 1.0296672582626343, "learning_rate": 0.0002, "epoch": 4.18882871231105, "step": 25910}, {"loss": 0.5522, "grad_norm": 0.8104017972946167, "learning_rate": 0.0002, "epoch": 4.190445396491795, "step": 25920}, {"loss": 0.596, "grad_norm": 0.9984509944915771, "learning_rate": 0.0002, "epoch": 4.192062080672541, "step": 25930}, {"loss": 0.5356, "grad_norm": 0.9844784736633301, "learning_rate": 0.0002, "epoch": 4.193678764853286, "step": 25940}, {"loss": 0.5198, "grad_norm": 0.8168622255325317, "learning_rate": 0.0002, "epoch": 4.195295449034031, "step": 25950}, {"loss": 0.542, "grad_norm": 1.0878913402557373, "learning_rate": 0.0002, "epoch": 4.196912133214776, "step": 25960}, {"loss": 0.5414, "grad_norm": 0.927126407623291, "learning_rate": 0.0002, "epoch": 4.1985288173955215, "step": 25970}, {"loss": 0.5794, "grad_norm": 0.838586688041687, "learning_rate": 0.0002, "epoch": 4.200145501576267, "step": 25980}, {"loss": 0.5454, "grad_norm": 1.2572145462036133, "learning_rate": 0.0002, "epoch": 4.201762185757012, "step": 25990}, {"loss": 0.5048, "grad_norm": 1.0476740598678589, "learning_rate": 0.0002, "epoch": 4.203378869937758, "step": 26000}, {"loss": 0.5127, "grad_norm": 1.0873368978500366, "learning_rate": 0.0002, "epoch": 4.204995554118503, "step": 26010}, {"loss": 0.5679, "grad_norm": 1.2664896249771118, "learning_rate": 0.0002, "epoch": 4.206612238299249, "step": 26020}, {"loss": 0.5814, "grad_norm": 1.0312391519546509, "learning_rate": 0.0002, "epoch": 4.208228922479994, "step": 26030}, {"loss": 0.571, "grad_norm": 1.0235042572021484, "learning_rate": 0.0002, "epoch": 4.209845606660739, "step": 26040}, {"loss": 0.5766, "grad_norm": 0.8882219195365906, "learning_rate": 0.0002, "epoch": 4.211462290841484, "step": 26050}, {"loss": 0.5557, "grad_norm": 0.9115961790084839, "learning_rate": 0.0002, "epoch": 4.2130789750222295, "step": 26060}, {"loss": 0.5455, "grad_norm": 1.0218228101730347, "learning_rate": 0.0002, "epoch": 4.214695659202975, "step": 26070}, {"loss": 0.5462, "grad_norm": 1.0802232027053833, "learning_rate": 0.0002, "epoch": 4.21631234338372, "step": 26080}, {"loss": 0.557, "grad_norm": 1.1488053798675537, "learning_rate": 0.0002, "epoch": 4.217929027564465, "step": 26090}, {"loss": 0.52, "grad_norm": 1.0487725734710693, "learning_rate": 0.0002, "epoch": 4.21954571174521, "step": 26100}, {"loss": 0.5568, "grad_norm": 0.9131165742874146, "learning_rate": 0.0002, "epoch": 4.221162395925956, "step": 26110}, {"loss": 0.5206, "grad_norm": 0.9012845158576965, "learning_rate": 0.0002, "epoch": 4.222779080106701, "step": 26120}, {"loss": 0.561, "grad_norm": 0.8389840126037598, "learning_rate": 0.0002, "epoch": 4.224395764287446, "step": 26130}, {"loss": 0.5268, "grad_norm": 0.8924660682678223, "learning_rate": 0.0002, "epoch": 4.226012448468191, "step": 26140}, {"loss": 0.5715, "grad_norm": 0.8556463718414307, "learning_rate": 0.0002, "epoch": 4.2276291326489375, "step": 26150}, {"loss": 0.5695, "grad_norm": 0.9643129110336304, "learning_rate": 0.0002, "epoch": 4.229245816829683, "step": 26160}, {"loss": 0.5321, "grad_norm": 0.9865712523460388, "learning_rate": 0.0002, "epoch": 4.230862501010428, "step": 26170}, {"loss": 0.5406, "grad_norm": 1.152641773223877, "learning_rate": 0.0002, "epoch": 4.232479185191173, "step": 26180}, {"loss": 0.5632, "grad_norm": 0.9157698154449463, "learning_rate": 0.0002, "epoch": 4.234095869371918, "step": 26190}, {"loss": 0.5717, "grad_norm": 0.8418048620223999, "learning_rate": 0.0002, "epoch": 4.235712553552664, "step": 26200}, {"loss": 0.5624, "grad_norm": 0.9430168867111206, "learning_rate": 0.0002, "epoch": 4.237329237733409, "step": 26210}, {"loss": 0.5574, "grad_norm": 1.012582778930664, "learning_rate": 0.0002, "epoch": 4.238945921914154, "step": 26220}, {"loss": 0.5693, "grad_norm": 1.112619400024414, "learning_rate": 0.0002, "epoch": 4.240562606094899, "step": 26230}, {"loss": 0.6037, "grad_norm": 0.9243621826171875, "learning_rate": 0.0002, "epoch": 4.2421792902756446, "step": 26240}, {"loss": 0.569, "grad_norm": 0.6977595686912537, "learning_rate": 0.0002, "epoch": 4.24379597445639, "step": 26250}, {"loss": 0.5379, "grad_norm": 0.9600721597671509, "learning_rate": 0.0002, "epoch": 4.245412658637135, "step": 26260}, {"loss": 0.5658, "grad_norm": 0.882641613483429, "learning_rate": 0.0002, "epoch": 4.24702934281788, "step": 26270}, {"loss": 0.55, "grad_norm": 1.010920763015747, "learning_rate": 0.0002, "epoch": 4.2486460269986255, "step": 26280}, {"loss": 0.5803, "grad_norm": 0.9289400577545166, "learning_rate": 0.0002, "epoch": 4.250262711179371, "step": 26290}, {"loss": 0.541, "grad_norm": 1.137397289276123, "learning_rate": 0.0002, "epoch": 4.251879395360117, "step": 26300}, {"loss": 0.5204, "grad_norm": 1.0136182308197021, "learning_rate": 0.0002, "epoch": 4.253496079540862, "step": 26310}, {"loss": 0.5708, "grad_norm": 0.9387356042861938, "learning_rate": 0.0002, "epoch": 4.255112763721607, "step": 26320}, {"loss": 0.5948, "grad_norm": 1.1833957433700562, "learning_rate": 0.0002, "epoch": 4.2567294479023525, "step": 26330}, {"loss": 0.5905, "grad_norm": 0.9415934681892395, "learning_rate": 0.0002, "epoch": 4.258346132083098, "step": 26340}, {"loss": 0.5539, "grad_norm": 0.8550165891647339, "learning_rate": 0.0002, "epoch": 4.259962816263843, "step": 26350}, {"loss": 0.555, "grad_norm": 9.924622535705566, "learning_rate": 0.0002, "epoch": 4.261579500444588, "step": 26360}, {"loss": 0.5689, "grad_norm": 1.0104902982711792, "learning_rate": 0.0002, "epoch": 4.2631961846253335, "step": 26370}, {"loss": 0.5698, "grad_norm": 0.890794038772583, "learning_rate": 0.0002, "epoch": 4.264812868806079, "step": 26380}, {"loss": 0.563, "grad_norm": 1.0560191869735718, "learning_rate": 0.0002, "epoch": 4.266429552986824, "step": 26390}, {"loss": 0.5119, "grad_norm": 1.0135581493377686, "learning_rate": 0.0002, "epoch": 4.268046237167569, "step": 26400}, {"loss": 0.5359, "grad_norm": 1.1304140090942383, "learning_rate": 0.0002, "epoch": 4.269662921348314, "step": 26410}, {"loss": 0.5615, "grad_norm": 0.9899303913116455, "learning_rate": 0.0002, "epoch": 4.27127960552906, "step": 26420}, {"loss": 0.5815, "grad_norm": 1.0505329370498657, "learning_rate": 0.0002, "epoch": 4.272896289709805, "step": 26430}, {"loss": 0.5384, "grad_norm": 0.9389396905899048, "learning_rate": 0.0002, "epoch": 4.27451297389055, "step": 26440}, {"loss": 0.5558, "grad_norm": 0.875328779220581, "learning_rate": 0.0002, "epoch": 4.276129658071296, "step": 26450}, {"loss": 0.5601, "grad_norm": 1.0689256191253662, "learning_rate": 0.0002, "epoch": 4.277746342252041, "step": 26460}, {"loss": 0.546, "grad_norm": 0.9988957643508911, "learning_rate": 0.0002, "epoch": 4.279363026432787, "step": 26470}, {"loss": 0.5478, "grad_norm": 0.8721813559532166, "learning_rate": 0.0002, "epoch": 4.280979710613532, "step": 26480}, {"loss": 0.5424, "grad_norm": 1.100109577178955, "learning_rate": 0.0002, "epoch": 4.282596394794277, "step": 26490}, {"loss": 0.572, "grad_norm": 1.1607271432876587, "learning_rate": 0.0002, "epoch": 4.284213078975022, "step": 26500}, {"loss": 0.6287, "grad_norm": 0.879088819026947, "learning_rate": 0.0002, "epoch": 4.285829763155768, "step": 26510}, {"loss": 0.573, "grad_norm": 0.9891700744628906, "learning_rate": 0.0002, "epoch": 4.287446447336513, "step": 26520}, {"loss": 0.6018, "grad_norm": 1.0831127166748047, "learning_rate": 0.0002, "epoch": 4.289063131517258, "step": 26530}, {"loss": 0.5693, "grad_norm": 1.4108285903930664, "learning_rate": 0.0002, "epoch": 4.290679815698003, "step": 26540}, {"loss": 0.5888, "grad_norm": 1.0630289316177368, "learning_rate": 0.0002, "epoch": 4.2922964998787485, "step": 26550}, {"loss": 0.5817, "grad_norm": 1.0854572057724, "learning_rate": 0.0002, "epoch": 4.293913184059494, "step": 26560}, {"loss": 0.5586, "grad_norm": 0.9561646580696106, "learning_rate": 0.0002, "epoch": 4.295529868240239, "step": 26570}, {"loss": 0.5674, "grad_norm": 0.9064981937408447, "learning_rate": 0.0002, "epoch": 4.297146552420984, "step": 26580}, {"loss": 0.5847, "grad_norm": 1.0082972049713135, "learning_rate": 0.0002, "epoch": 4.298763236601729, "step": 26590}, {"loss": 0.5711, "grad_norm": 1.1613214015960693, "learning_rate": 0.0002, "epoch": 4.3003799207824756, "step": 26600}, {"loss": 0.551, "grad_norm": 0.9847695231437683, "learning_rate": 0.0002, "epoch": 4.301996604963221, "step": 26610}, {"loss": 0.6089, "grad_norm": 1.0980697870254517, "learning_rate": 0.0002, "epoch": 4.303613289143966, "step": 26620}, {"loss": 0.5797, "grad_norm": 0.8861175179481506, "learning_rate": 0.0002, "epoch": 4.305229973324711, "step": 26630}, {"loss": 0.5716, "grad_norm": 0.8917363286018372, "learning_rate": 0.0002, "epoch": 4.3068466575054565, "step": 26640}, {"loss": 0.5892, "grad_norm": 1.0458378791809082, "learning_rate": 0.0002, "epoch": 4.308463341686202, "step": 26650}, {"loss": 0.5883, "grad_norm": 1.4859240055084229, "learning_rate": 0.0002, "epoch": 4.310080025866947, "step": 26660}, {"loss": 0.5296, "grad_norm": 1.1376359462738037, "learning_rate": 0.0002, "epoch": 4.311696710047692, "step": 26670}, {"loss": 0.5671, "grad_norm": 0.991349995136261, "learning_rate": 0.0002, "epoch": 4.313313394228437, "step": 26680}, {"loss": 0.5338, "grad_norm": 0.9995543956756592, "learning_rate": 0.0002, "epoch": 4.314930078409183, "step": 26690}, {"loss": 0.5542, "grad_norm": 1.0515851974487305, "learning_rate": 0.0002, "epoch": 4.316546762589928, "step": 26700}, {"loss": 0.5473, "grad_norm": 1.008023977279663, "learning_rate": 0.0002, "epoch": 4.318163446770673, "step": 26710}, {"loss": 0.5506, "grad_norm": 1.0184582471847534, "learning_rate": 0.0002, "epoch": 4.319780130951418, "step": 26720}, {"loss": 0.5828, "grad_norm": 1.161071538925171, "learning_rate": 0.0002, "epoch": 4.321396815132164, "step": 26730}, {"loss": 0.5633, "grad_norm": 0.9580779671669006, "learning_rate": 0.0002, "epoch": 4.323013499312909, "step": 26740}, {"loss": 0.5785, "grad_norm": 1.0189911127090454, "learning_rate": 0.0002, "epoch": 4.324630183493655, "step": 26750}, {"loss": 0.5237, "grad_norm": 0.7484358549118042, "learning_rate": 0.0002, "epoch": 4.3262468676744, "step": 26760}, {"loss": 0.5728, "grad_norm": 1.0015908479690552, "learning_rate": 0.0002, "epoch": 4.327863551855145, "step": 26770}, {"loss": 0.5597, "grad_norm": 0.8972945809364319, "learning_rate": 0.0002, "epoch": 4.329480236035891, "step": 26780}, {"loss": 0.5857, "grad_norm": 1.01099693775177, "learning_rate": 0.0002, "epoch": 4.331096920216636, "step": 26790}, {"loss": 0.5591, "grad_norm": 0.846958339214325, "learning_rate": 0.0002, "epoch": 4.332713604397381, "step": 26800}, {"loss": 0.5547, "grad_norm": 1.0792603492736816, "learning_rate": 0.0002, "epoch": 4.334330288578126, "step": 26810}, {"loss": 0.5747, "grad_norm": 1.0373345613479614, "learning_rate": 0.0002, "epoch": 4.3359469727588715, "step": 26820}, {"loss": 0.558, "grad_norm": 0.9779167771339417, "learning_rate": 0.0002, "epoch": 4.337563656939617, "step": 26830}, {"loss": 0.5821, "grad_norm": 1.0235520601272583, "learning_rate": 0.0002, "epoch": 4.339180341120362, "step": 26840}, {"loss": 0.5843, "grad_norm": 1.04195237159729, "learning_rate": 0.0002, "epoch": 4.340797025301107, "step": 26850}, {"loss": 0.5474, "grad_norm": 0.9479565620422363, "learning_rate": 0.0002, "epoch": 4.3424137094818525, "step": 26860}, {"loss": 0.5646, "grad_norm": 0.9526172280311584, "learning_rate": 0.0002, "epoch": 4.344030393662598, "step": 26870}, {"loss": 0.521, "grad_norm": 0.8571456074714661, "learning_rate": 0.0002, "epoch": 4.345647077843343, "step": 26880}, {"loss": 0.5846, "grad_norm": 0.9475828409194946, "learning_rate": 0.0002, "epoch": 4.347263762024088, "step": 26890}, {"loss": 0.5815, "grad_norm": 1.0529576539993286, "learning_rate": 0.0002, "epoch": 4.348880446204834, "step": 26900}, {"loss": 0.56, "grad_norm": 0.9648140072822571, "learning_rate": 0.0002, "epoch": 4.3504971303855795, "step": 26910}, {"loss": 0.5162, "grad_norm": 1.0488841533660889, "learning_rate": 0.0002, "epoch": 4.352113814566325, "step": 26920}, {"loss": 0.5842, "grad_norm": 0.8771942257881165, "learning_rate": 0.0002, "epoch": 4.35373049874707, "step": 26930}, {"loss": 0.5966, "grad_norm": 0.9411202073097229, "learning_rate": 0.0002, "epoch": 4.355347182927815, "step": 26940}, {"loss": 0.6001, "grad_norm": 1.0997588634490967, "learning_rate": 0.0002, "epoch": 4.35696386710856, "step": 26950}, {"loss": 0.5528, "grad_norm": 0.968754768371582, "learning_rate": 0.0002, "epoch": 4.358580551289306, "step": 26960}, {"loss": 0.5881, "grad_norm": 0.9990773797035217, "learning_rate": 0.0002, "epoch": 4.360197235470051, "step": 26970}, {"loss": 0.5761, "grad_norm": 1.0210620164871216, "learning_rate": 0.0002, "epoch": 4.361813919650796, "step": 26980}, {"loss": 0.5768, "grad_norm": 0.855462908744812, "learning_rate": 0.0002, "epoch": 4.363430603831541, "step": 26990}, {"loss": 0.5493, "grad_norm": 0.9169660806655884, "learning_rate": 0.0002, "epoch": 4.365047288012287, "step": 27000}, {"loss": 0.5697, "grad_norm": 1.089629888534546, "learning_rate": 0.0002, "epoch": 4.366663972193032, "step": 27010}, {"loss": 0.5854, "grad_norm": 1.0932867527008057, "learning_rate": 0.0002, "epoch": 4.368280656373777, "step": 27020}, {"loss": 0.5656, "grad_norm": 0.9290956854820251, "learning_rate": 0.0002, "epoch": 4.369897340554522, "step": 27030}, {"loss": 0.5727, "grad_norm": 1.2800624370574951, "learning_rate": 0.0002, "epoch": 4.3715140247352675, "step": 27040}, {"loss": 0.5837, "grad_norm": 0.8993493318557739, "learning_rate": 0.0002, "epoch": 4.373130708916014, "step": 27050}, {"loss": 0.6232, "grad_norm": 1.1566431522369385, "learning_rate": 0.0002, "epoch": 4.374747393096759, "step": 27060}, {"loss": 0.5902, "grad_norm": 0.9479052424430847, "learning_rate": 0.0002, "epoch": 4.376364077277504, "step": 27070}, {"loss": 0.6189, "grad_norm": 1.0063648223876953, "learning_rate": 0.0002, "epoch": 4.377980761458249, "step": 27080}, {"loss": 0.561, "grad_norm": 0.8342045545578003, "learning_rate": 0.0002, "epoch": 4.379597445638995, "step": 27090}, {"loss": 0.5515, "grad_norm": 1.1390739679336548, "learning_rate": 0.0002, "epoch": 4.38121412981974, "step": 27100}, {"loss": 0.5372, "grad_norm": 0.9547637104988098, "learning_rate": 0.0002, "epoch": 4.382830814000485, "step": 27110}, {"loss": 0.5728, "grad_norm": 1.0503804683685303, "learning_rate": 0.0002, "epoch": 4.38444749818123, "step": 27120}, {"loss": 0.5787, "grad_norm": 0.9064017534255981, "learning_rate": 0.0002, "epoch": 4.3860641823619755, "step": 27130}, {"loss": 0.5798, "grad_norm": 0.9382519125938416, "learning_rate": 0.0002, "epoch": 4.387680866542721, "step": 27140}, {"loss": 0.5791, "grad_norm": 1.0410341024398804, "learning_rate": 0.0002, "epoch": 4.389297550723466, "step": 27150}, {"loss": 0.6034, "grad_norm": 0.9218655824661255, "learning_rate": 0.0002, "epoch": 4.390914234904211, "step": 27160}, {"loss": 0.5204, "grad_norm": 0.8119737505912781, "learning_rate": 0.0002, "epoch": 4.392530919084956, "step": 27170}, {"loss": 0.5612, "grad_norm": 0.8584722876548767, "learning_rate": 0.0002, "epoch": 4.394147603265702, "step": 27180}, {"loss": 0.5772, "grad_norm": 0.9668293595314026, "learning_rate": 0.0002, "epoch": 4.395764287446447, "step": 27190}, {"loss": 0.6009, "grad_norm": 1.022334098815918, "learning_rate": 0.0002, "epoch": 4.397380971627193, "step": 27200}, {"loss": 0.5573, "grad_norm": 0.9553216099739075, "learning_rate": 0.0002, "epoch": 4.398997655807938, "step": 27210}, {"loss": 0.5604, "grad_norm": 0.9282339215278625, "learning_rate": 0.0002, "epoch": 4.4006143399886835, "step": 27220}, {"loss": 0.5599, "grad_norm": 1.0232292413711548, "learning_rate": 0.0002, "epoch": 4.402231024169429, "step": 27230}, {"loss": 0.6078, "grad_norm": 0.9915700554847717, "learning_rate": 0.0002, "epoch": 4.403847708350174, "step": 27240}, {"loss": 0.5778, "grad_norm": 1.0014961957931519, "learning_rate": 0.0002, "epoch": 4.405464392530919, "step": 27250}, {"loss": 0.5824, "grad_norm": 1.1172103881835938, "learning_rate": 0.0002, "epoch": 4.407081076711664, "step": 27260}, {"loss": 0.5286, "grad_norm": 0.8583093285560608, "learning_rate": 0.0002, "epoch": 4.40869776089241, "step": 27270}, {"loss": 0.5507, "grad_norm": 0.7609201669692993, "learning_rate": 0.0002, "epoch": 4.410314445073155, "step": 27280}, {"loss": 0.575, "grad_norm": 1.0619351863861084, "learning_rate": 0.0002, "epoch": 4.4119311292539, "step": 27290}, {"loss": 0.5579, "grad_norm": 1.0177674293518066, "learning_rate": 0.0002, "epoch": 4.413547813434645, "step": 27300}, {"loss": 0.5628, "grad_norm": 0.9921218156814575, "learning_rate": 0.0002, "epoch": 4.4151644976153905, "step": 27310}, {"loss": 0.6018, "grad_norm": 1.126244306564331, "learning_rate": 0.0002, "epoch": 4.416781181796136, "step": 27320}, {"loss": 0.5743, "grad_norm": 1.0678540468215942, "learning_rate": 0.0002, "epoch": 4.418397865976881, "step": 27330}, {"loss": 0.5665, "grad_norm": 0.8705704212188721, "learning_rate": 0.0002, "epoch": 4.420014550157627, "step": 27340}, {"loss": 0.5763, "grad_norm": 1.272074818611145, "learning_rate": 0.0002, "epoch": 4.421631234338372, "step": 27350}, {"loss": 0.561, "grad_norm": 0.8740444183349609, "learning_rate": 0.0002, "epoch": 4.423247918519118, "step": 27360}, {"loss": 0.5492, "grad_norm": 1.0584250688552856, "learning_rate": 0.0002, "epoch": 4.424864602699863, "step": 27370}, {"loss": 0.589, "grad_norm": 1.059870719909668, "learning_rate": 0.0002, "epoch": 4.426481286880608, "step": 27380}, {"loss": 0.5551, "grad_norm": 1.072265863418579, "learning_rate": 0.0002, "epoch": 4.428097971061353, "step": 27390}, {"loss": 0.5584, "grad_norm": 0.871481716632843, "learning_rate": 0.0002, "epoch": 4.4297146552420985, "step": 27400}, {"loss": 0.5372, "grad_norm": 0.9555448293685913, "learning_rate": 0.0002, "epoch": 4.431331339422844, "step": 27410}, {"loss": 0.5593, "grad_norm": 1.0402292013168335, "learning_rate": 0.0002, "epoch": 4.432948023603589, "step": 27420}, {"loss": 0.5532, "grad_norm": 1.12587571144104, "learning_rate": 0.0002, "epoch": 4.434564707784334, "step": 27430}, {"loss": 0.5403, "grad_norm": 1.0783193111419678, "learning_rate": 0.0002, "epoch": 4.436181391965079, "step": 27440}, {"loss": 0.5313, "grad_norm": 1.024133563041687, "learning_rate": 0.0002, "epoch": 4.437798076145825, "step": 27450}, {"loss": 0.5621, "grad_norm": 0.9156768918037415, "learning_rate": 0.0002, "epoch": 4.43941476032657, "step": 27460}, {"loss": 0.5307, "grad_norm": 1.0215224027633667, "learning_rate": 0.0002, "epoch": 4.441031444507315, "step": 27470}, {"loss": 0.5188, "grad_norm": 1.082116961479187, "learning_rate": 0.0002, "epoch": 4.442648128688061, "step": 27480}, {"loss": 0.6203, "grad_norm": 1.0412873029708862, "learning_rate": 0.0002, "epoch": 4.4442648128688065, "step": 27490}, {"loss": 0.5939, "grad_norm": 1.0509289503097534, "learning_rate": 0.0002, "epoch": 4.445881497049552, "step": 27500}, {"loss": 0.5503, "grad_norm": 0.9291498064994812, "learning_rate": 0.0002, "epoch": 4.447498181230297, "step": 27510}, {"loss": 0.5408, "grad_norm": 0.970184326171875, "learning_rate": 0.0002, "epoch": 4.449114865411042, "step": 27520}, {"loss": 0.5705, "grad_norm": 0.8418883681297302, "learning_rate": 0.0002, "epoch": 4.450731549591787, "step": 27530}, {"loss": 0.5124, "grad_norm": 0.8823825120925903, "learning_rate": 0.0002, "epoch": 4.452348233772533, "step": 27540}, {"loss": 0.5867, "grad_norm": 1.1909019947052002, "learning_rate": 0.0002, "epoch": 4.453964917953278, "step": 27550}, {"loss": 0.5685, "grad_norm": 1.0317302942276, "learning_rate": 0.0002, "epoch": 4.455581602134023, "step": 27560}, {"loss": 0.5538, "grad_norm": 0.9977751970291138, "learning_rate": 0.0002, "epoch": 4.457198286314768, "step": 27570}, {"loss": 0.5628, "grad_norm": 0.8909519910812378, "learning_rate": 0.0002, "epoch": 4.458814970495514, "step": 27580}, {"loss": 0.6099, "grad_norm": 0.8653029799461365, "learning_rate": 0.0002, "epoch": 4.460431654676259, "step": 27590}, {"loss": 0.5622, "grad_norm": 1.0783653259277344, "learning_rate": 0.0002, "epoch": 4.462048338857004, "step": 27600}, {"loss": 0.579, "grad_norm": 1.1235394477844238, "learning_rate": 0.0002, "epoch": 4.463665023037749, "step": 27610}, {"loss": 0.5545, "grad_norm": 0.9386643767356873, "learning_rate": 0.0002, "epoch": 4.4652817072184945, "step": 27620}, {"loss": 0.5554, "grad_norm": 1.0605148077011108, "learning_rate": 0.0002, "epoch": 4.466898391399241, "step": 27630}, {"loss": 0.5886, "grad_norm": 1.1283893585205078, "learning_rate": 0.0002, "epoch": 4.468515075579986, "step": 27640}, {"loss": 0.5801, "grad_norm": 1.0583468675613403, "learning_rate": 0.0002, "epoch": 4.470131759760731, "step": 27650}, {"loss": 0.5601, "grad_norm": 0.9563992023468018, "learning_rate": 0.0002, "epoch": 4.471748443941476, "step": 27660}, {"loss": 0.5687, "grad_norm": 1.100598931312561, "learning_rate": 0.0002, "epoch": 4.4733651281222215, "step": 27670}, {"loss": 0.589, "grad_norm": 0.9386957287788391, "learning_rate": 0.0002, "epoch": 4.474981812302967, "step": 27680}, {"loss": 0.6241, "grad_norm": 1.2946288585662842, "learning_rate": 0.0002, "epoch": 4.476598496483712, "step": 27690}, {"loss": 0.6075, "grad_norm": 1.0325199365615845, "learning_rate": 0.0002, "epoch": 4.478215180664457, "step": 27700}, {"loss": 0.588, "grad_norm": 1.0318928956985474, "learning_rate": 0.0002, "epoch": 4.4798318648452025, "step": 27710}, {"loss": 0.5656, "grad_norm": 0.8721024394035339, "learning_rate": 0.0002, "epoch": 4.481448549025948, "step": 27720}, {"loss": 0.5421, "grad_norm": 1.17376708984375, "learning_rate": 0.0002, "epoch": 4.483065233206693, "step": 27730}, {"loss": 0.5657, "grad_norm": 1.0926326513290405, "learning_rate": 0.0002, "epoch": 4.484681917387438, "step": 27740}, {"loss": 0.5514, "grad_norm": 0.9043852686882019, "learning_rate": 0.0002, "epoch": 4.486298601568183, "step": 27750}, {"loss": 0.582, "grad_norm": 1.064600944519043, "learning_rate": 0.0002, "epoch": 4.487915285748929, "step": 27760}, {"loss": 0.6108, "grad_norm": 0.7833460569381714, "learning_rate": 0.0002, "epoch": 4.489531969929674, "step": 27770}, {"loss": 0.5985, "grad_norm": 1.1073496341705322, "learning_rate": 0.0002, "epoch": 4.49114865411042, "step": 27780}, {"loss": 0.5577, "grad_norm": 1.0799397230148315, "learning_rate": 0.0002, "epoch": 4.492765338291165, "step": 27790}, {"loss": 0.5601, "grad_norm": 1.1062238216400146, "learning_rate": 0.0002, "epoch": 4.49438202247191, "step": 27800}, {"loss": 0.6126, "grad_norm": 1.0568242073059082, "learning_rate": 0.0002, "epoch": 4.495998706652656, "step": 27810}, {"loss": 0.5913, "grad_norm": 0.8861091732978821, "learning_rate": 0.0002, "epoch": 4.497615390833401, "step": 27820}, {"loss": 0.5858, "grad_norm": 1.2297543287277222, "learning_rate": 0.0002, "epoch": 4.499232075014146, "step": 27830}, {"loss": 0.5859, "grad_norm": 0.9600302577018738, "learning_rate": 0.0002, "epoch": 4.500848759194891, "step": 27840}, {"loss": 0.6124, "grad_norm": 1.057051181793213, "learning_rate": 0.0002, "epoch": 4.502465443375637, "step": 27850}, {"loss": 0.5788, "grad_norm": 0.9839690923690796, "learning_rate": 0.0002, "epoch": 4.504082127556382, "step": 27860}, {"loss": 0.555, "grad_norm": 1.1479853391647339, "learning_rate": 0.0002, "epoch": 4.505698811737127, "step": 27870}, {"loss": 0.6039, "grad_norm": 1.0550768375396729, "learning_rate": 0.0002, "epoch": 4.507315495917872, "step": 27880}, {"loss": 0.563, "grad_norm": 0.898209273815155, "learning_rate": 0.0002, "epoch": 4.5089321800986175, "step": 27890}, {"loss": 0.5734, "grad_norm": 0.9460315108299255, "learning_rate": 0.0002, "epoch": 4.510548864279363, "step": 27900}, {"loss": 0.5702, "grad_norm": 0.9499884247779846, "learning_rate": 0.0002, "epoch": 4.512165548460108, "step": 27910}, {"loss": 0.5385, "grad_norm": 0.7801318764686584, "learning_rate": 0.0002, "epoch": 4.513782232640853, "step": 27920}, {"loss": 0.5391, "grad_norm": 0.9286966323852539, "learning_rate": 0.0002, "epoch": 4.515398916821599, "step": 27930}, {"loss": 0.5717, "grad_norm": 0.9539980292320251, "learning_rate": 0.0002, "epoch": 4.517015601002345, "step": 27940}, {"loss": 0.6073, "grad_norm": 1.1053401231765747, "learning_rate": 0.0002, "epoch": 4.51863228518309, "step": 27950}, {"loss": 0.6087, "grad_norm": 0.7535534501075745, "learning_rate": 0.0002, "epoch": 4.520248969363835, "step": 27960}, {"loss": 0.5701, "grad_norm": 1.076926589012146, "learning_rate": 0.0002, "epoch": 4.52186565354458, "step": 27970}, {"loss": 0.6028, "grad_norm": 1.181935429573059, "learning_rate": 0.0002, "epoch": 4.5234823377253255, "step": 27980}, {"loss": 0.6033, "grad_norm": 0.9293407201766968, "learning_rate": 0.0002, "epoch": 4.525099021906071, "step": 27990}, {"loss": 0.5815, "grad_norm": 0.8953009247779846, "learning_rate": 0.0002, "epoch": 4.526715706086816, "step": 28000}, {"loss": 0.5564, "grad_norm": 1.0850225687026978, "learning_rate": 0.0002, "epoch": 4.528332390267561, "step": 28010}, {"loss": 0.5459, "grad_norm": 0.9125663042068481, "learning_rate": 0.0002, "epoch": 4.529949074448306, "step": 28020}, {"loss": 0.5922, "grad_norm": 0.8745216727256775, "learning_rate": 0.0002, "epoch": 4.531565758629052, "step": 28030}, {"loss": 0.567, "grad_norm": 1.0783463716506958, "learning_rate": 0.0002, "epoch": 4.533182442809797, "step": 28040}, {"loss": 0.5754, "grad_norm": 0.7513844966888428, "learning_rate": 0.0002, "epoch": 4.534799126990542, "step": 28050}, {"loss": 0.5608, "grad_norm": 1.0135776996612549, "learning_rate": 0.0002, "epoch": 4.536415811171287, "step": 28060}, {"loss": 0.5827, "grad_norm": 0.8886825442314148, "learning_rate": 0.0002, "epoch": 4.538032495352033, "step": 28070}, {"loss": 0.5605, "grad_norm": 0.8153995275497437, "learning_rate": 0.0002, "epoch": 4.539649179532779, "step": 28080}, {"loss": 0.6377, "grad_norm": 0.9853341579437256, "learning_rate": 0.0002, "epoch": 4.541265863713524, "step": 28090}, {"loss": 0.5957, "grad_norm": 0.9365800023078918, "learning_rate": 0.0002, "epoch": 4.542882547894269, "step": 28100}, {"loss": 0.5477, "grad_norm": 0.9765017628669739, "learning_rate": 0.0002, "epoch": 4.544499232075014, "step": 28110}, {"loss": 0.6185, "grad_norm": 0.9811279773712158, "learning_rate": 0.0002, "epoch": 4.54611591625576, "step": 28120}, {"loss": 0.6095, "grad_norm": 1.0387924909591675, "learning_rate": 0.0002, "epoch": 4.547732600436505, "step": 28130}, {"loss": 0.6534, "grad_norm": 1.0684878826141357, "learning_rate": 0.0002, "epoch": 4.54934928461725, "step": 28140}, {"loss": 0.5701, "grad_norm": 1.0000102519989014, "learning_rate": 0.0002, "epoch": 4.550965968797995, "step": 28150}, {"loss": 0.5327, "grad_norm": 1.0717930793762207, "learning_rate": 0.0002, "epoch": 4.5525826529787405, "step": 28160}, {"loss": 0.5594, "grad_norm": 0.990074634552002, "learning_rate": 0.0002, "epoch": 4.554199337159486, "step": 28170}, {"loss": 0.5452, "grad_norm": 0.8673754930496216, "learning_rate": 0.0002, "epoch": 4.555816021340231, "step": 28180}, {"loss": 0.5773, "grad_norm": 0.864247739315033, "learning_rate": 0.0002, "epoch": 4.557432705520976, "step": 28190}, {"loss": 0.5516, "grad_norm": 0.8280200958251953, "learning_rate": 0.0002, "epoch": 4.5590493897017215, "step": 28200}, {"loss": 0.5709, "grad_norm": 1.1312172412872314, "learning_rate": 0.0002, "epoch": 4.560666073882467, "step": 28210}, {"loss": 0.5776, "grad_norm": 0.9147403240203857, "learning_rate": 0.0002, "epoch": 4.562282758063212, "step": 28220}, {"loss": 0.5591, "grad_norm": 1.0321218967437744, "learning_rate": 0.0002, "epoch": 4.563899442243958, "step": 28230}, {"loss": 0.5508, "grad_norm": 1.168332815170288, "learning_rate": 0.0002, "epoch": 4.565516126424703, "step": 28240}, {"loss": 0.5649, "grad_norm": 1.0067222118377686, "learning_rate": 0.0002, "epoch": 4.5671328106054485, "step": 28250}, {"loss": 0.5853, "grad_norm": 1.0283393859863281, "learning_rate": 0.0002, "epoch": 4.568749494786194, "step": 28260}, {"loss": 0.5772, "grad_norm": 0.9912363886833191, "learning_rate": 0.0002, "epoch": 4.570366178966939, "step": 28270}, {"loss": 0.5757, "grad_norm": 1.108032464981079, "learning_rate": 0.0002, "epoch": 4.571982863147684, "step": 28280}, {"loss": 0.5529, "grad_norm": 0.8260078430175781, "learning_rate": 0.0002, "epoch": 4.573599547328429, "step": 28290}, {"loss": 0.5625, "grad_norm": 0.8946247100830078, "learning_rate": 0.0002, "epoch": 4.575216231509175, "step": 28300}, {"loss": 0.5533, "grad_norm": 0.8273587822914124, "learning_rate": 0.0002, "epoch": 4.57683291568992, "step": 28310}, {"loss": 0.6058, "grad_norm": 0.9040093421936035, "learning_rate": 0.0002, "epoch": 4.578449599870665, "step": 28320}, {"loss": 0.5521, "grad_norm": 0.8435290455818176, "learning_rate": 0.0002, "epoch": 4.58006628405141, "step": 28330}, {"loss": 0.6086, "grad_norm": 1.164088249206543, "learning_rate": 0.0002, "epoch": 4.581682968232156, "step": 28340}, {"loss": 0.5603, "grad_norm": 0.9861085414886475, "learning_rate": 0.0002, "epoch": 4.583299652412901, "step": 28350}, {"loss": 0.5701, "grad_norm": 0.8892980813980103, "learning_rate": 0.0002, "epoch": 4.584916336593646, "step": 28360}, {"loss": 0.598, "grad_norm": 1.240574836730957, "learning_rate": 0.0002, "epoch": 4.586533020774391, "step": 28370}, {"loss": 0.5797, "grad_norm": 0.8669408559799194, "learning_rate": 0.0002, "epoch": 4.588149704955137, "step": 28380}, {"loss": 0.5603, "grad_norm": 0.9145985841751099, "learning_rate": 0.0002, "epoch": 4.589766389135883, "step": 28390}, {"loss": 0.5765, "grad_norm": 0.8584614992141724, "learning_rate": 0.0002, "epoch": 4.591383073316628, "step": 28400}, {"loss": 0.5898, "grad_norm": 1.118829369544983, "learning_rate": 0.0002, "epoch": 4.592999757497373, "step": 28410}, {"loss": 0.5641, "grad_norm": 1.1411553621292114, "learning_rate": 0.0002, "epoch": 4.594616441678118, "step": 28420}, {"loss": 0.549, "grad_norm": 0.9433278441429138, "learning_rate": 0.0002, "epoch": 4.596233125858864, "step": 28430}, {"loss": 0.5496, "grad_norm": 0.816830039024353, "learning_rate": 0.0002, "epoch": 4.597849810039609, "step": 28440}, {"loss": 0.5543, "grad_norm": 1.2124968767166138, "learning_rate": 0.0002, "epoch": 4.599466494220354, "step": 28450}, {"loss": 0.5759, "grad_norm": 0.9658762216567993, "learning_rate": 0.0002, "epoch": 4.601083178401099, "step": 28460}, {"loss": 0.5902, "grad_norm": 0.836100161075592, "learning_rate": 0.0002, "epoch": 4.6026998625818445, "step": 28470}, {"loss": 0.5749, "grad_norm": 0.9989104270935059, "learning_rate": 0.0002, "epoch": 4.60431654676259, "step": 28480}, {"loss": 0.5616, "grad_norm": 1.1298956871032715, "learning_rate": 0.0002, "epoch": 4.605933230943335, "step": 28490}, {"loss": 0.5846, "grad_norm": 1.1731704473495483, "learning_rate": 0.0002, "epoch": 4.60754991512408, "step": 28500}, {"loss": 0.5816, "grad_norm": 0.9624714255332947, "learning_rate": 0.0002, "epoch": 4.609166599304825, "step": 28510}, {"loss": 0.5868, "grad_norm": 1.364073634147644, "learning_rate": 0.0002, "epoch": 4.610783283485571, "step": 28520}, {"loss": 0.6237, "grad_norm": 1.1827356815338135, "learning_rate": 0.0002, "epoch": 4.612399967666317, "step": 28530}, {"loss": 0.5643, "grad_norm": 0.6651531457901001, "learning_rate": 0.0002, "epoch": 4.614016651847062, "step": 28540}, {"loss": 0.6051, "grad_norm": 1.1640995740890503, "learning_rate": 0.0002, "epoch": 4.615633336027807, "step": 28550}, {"loss": 0.5995, "grad_norm": 1.028918743133545, "learning_rate": 0.0002, "epoch": 4.6172500202085525, "step": 28560}, {"loss": 0.5607, "grad_norm": 0.8252120614051819, "learning_rate": 0.0002, "epoch": 4.618866704389298, "step": 28570}, {"loss": 0.5769, "grad_norm": 1.3536735773086548, "learning_rate": 0.0002, "epoch": 4.620483388570043, "step": 28580}, {"loss": 0.6006, "grad_norm": 1.2146915197372437, "learning_rate": 0.0002, "epoch": 4.622100072750788, "step": 28590}, {"loss": 0.5503, "grad_norm": 1.0122549533843994, "learning_rate": 0.0002, "epoch": 4.623716756931533, "step": 28600}, {"loss": 0.6072, "grad_norm": 0.9977872967720032, "learning_rate": 0.0002, "epoch": 4.625333441112279, "step": 28610}, {"loss": 0.5669, "grad_norm": 1.0159751176834106, "learning_rate": 0.0002, "epoch": 4.626950125293024, "step": 28620}, {"loss": 0.5935, "grad_norm": 1.0028325319290161, "learning_rate": 0.0002, "epoch": 4.628566809473769, "step": 28630}, {"loss": 0.5515, "grad_norm": 0.901638388633728, "learning_rate": 0.0002, "epoch": 4.630183493654514, "step": 28640}, {"loss": 0.595, "grad_norm": 0.9450507164001465, "learning_rate": 0.0002, "epoch": 4.6318001778352595, "step": 28650}, {"loss": 0.5972, "grad_norm": 0.9987545013427734, "learning_rate": 0.0002, "epoch": 4.633416862016006, "step": 28660}, {"loss": 0.5863, "grad_norm": 0.9574332237243652, "learning_rate": 0.0002, "epoch": 4.63503354619675, "step": 28670}, {"loss": 0.5804, "grad_norm": 1.2215653657913208, "learning_rate": 0.0002, "epoch": 4.636650230377496, "step": 28680}, {"loss": 0.5798, "grad_norm": 0.9798858761787415, "learning_rate": 0.0002, "epoch": 4.638266914558241, "step": 28690}, {"loss": 0.5773, "grad_norm": 1.0648466348648071, "learning_rate": 0.0002, "epoch": 4.639883598738987, "step": 28700}, {"loss": 0.6108, "grad_norm": 1.0606504678726196, "learning_rate": 0.0002, "epoch": 4.641500282919732, "step": 28710}, {"loss": 0.5801, "grad_norm": 1.0892442464828491, "learning_rate": 0.0002, "epoch": 4.643116967100477, "step": 28720}, {"loss": 0.5492, "grad_norm": 0.914391040802002, "learning_rate": 0.0002, "epoch": 4.644733651281222, "step": 28730}, {"loss": 0.5439, "grad_norm": 0.9782370328903198, "learning_rate": 0.0002, "epoch": 4.6463503354619675, "step": 28740}, {"loss": 0.6035, "grad_norm": 1.0344339609146118, "learning_rate": 0.0002, "epoch": 4.647967019642713, "step": 28750}, {"loss": 0.5775, "grad_norm": 1.0513931512832642, "learning_rate": 0.0002, "epoch": 4.649583703823458, "step": 28760}, {"loss": 0.546, "grad_norm": 0.9711475968360901, "learning_rate": 0.0002, "epoch": 4.651200388004203, "step": 28770}, {"loss": 0.5472, "grad_norm": 0.977519690990448, "learning_rate": 0.0002, "epoch": 4.652817072184948, "step": 28780}, {"loss": 0.5826, "grad_norm": 0.9150224924087524, "learning_rate": 0.0002, "epoch": 4.654433756365694, "step": 28790}, {"loss": 0.5382, "grad_norm": 1.0973542928695679, "learning_rate": 0.0002, "epoch": 4.656050440546439, "step": 28800}, {"loss": 0.6147, "grad_norm": 0.944877564907074, "learning_rate": 0.0002, "epoch": 4.657667124727185, "step": 28810}, {"loss": 0.5537, "grad_norm": 0.9508748650550842, "learning_rate": 0.0002, "epoch": 4.659283808907929, "step": 28820}, {"loss": 0.5537, "grad_norm": 0.9681721329689026, "learning_rate": 0.0002, "epoch": 4.6609004930886755, "step": 28830}, {"loss": 0.592, "grad_norm": 1.0214351415634155, "learning_rate": 0.0002, "epoch": 4.662517177269421, "step": 28840}, {"loss": 0.6031, "grad_norm": 0.9748611450195312, "learning_rate": 0.0002, "epoch": 4.664133861450166, "step": 28850}, {"loss": 0.572, "grad_norm": 0.8484147191047668, "learning_rate": 0.0002, "epoch": 4.665750545630911, "step": 28860}, {"loss": 0.5699, "grad_norm": 1.1252986192703247, "learning_rate": 0.0002, "epoch": 4.667367229811656, "step": 28870}, {"loss": 0.5724, "grad_norm": 0.8706206679344177, "learning_rate": 0.0002, "epoch": 4.668983913992402, "step": 28880}, {"loss": 0.6002, "grad_norm": 1.1432424783706665, "learning_rate": 0.0002, "epoch": 4.670600598173147, "step": 28890}, {"loss": 0.5675, "grad_norm": 1.017029047012329, "learning_rate": 0.0002, "epoch": 4.672217282353892, "step": 28900}, {"loss": 0.5831, "grad_norm": 1.085597038269043, "learning_rate": 0.0002, "epoch": 4.673833966534637, "step": 28910}, {"loss": 0.5678, "grad_norm": 0.9275796413421631, "learning_rate": 0.0002, "epoch": 4.675450650715383, "step": 28920}, {"loss": 0.5603, "grad_norm": 0.9518964886665344, "learning_rate": 0.0002, "epoch": 4.677067334896128, "step": 28930}, {"loss": 0.6232, "grad_norm": 1.0352122783660889, "learning_rate": 0.0002, "epoch": 4.678684019076873, "step": 28940}, {"loss": 0.5786, "grad_norm": 1.090124249458313, "learning_rate": 0.0002, "epoch": 4.680300703257618, "step": 28950}, {"loss": 0.5728, "grad_norm": 0.8799563050270081, "learning_rate": 0.0002, "epoch": 4.681917387438364, "step": 28960}, {"loss": 0.5787, "grad_norm": 1.0929821729660034, "learning_rate": 0.0002, "epoch": 4.683534071619109, "step": 28970}, {"loss": 0.6134, "grad_norm": 0.903727650642395, "learning_rate": 0.0002, "epoch": 4.685150755799855, "step": 28980}, {"loss": 0.5522, "grad_norm": 0.9752424955368042, "learning_rate": 0.0002, "epoch": 4.6867674399806, "step": 28990}, {"loss": 0.5762, "grad_norm": 0.9351571202278137, "learning_rate": 0.0002, "epoch": 4.688384124161345, "step": 29000}, {"loss": 0.5811, "grad_norm": 0.923877477645874, "learning_rate": 0.0002, "epoch": 4.6900008083420905, "step": 29010}, {"loss": 0.5682, "grad_norm": 1.045389175415039, "learning_rate": 0.0002, "epoch": 4.691617492522836, "step": 29020}, {"loss": 0.584, "grad_norm": 1.0200831890106201, "learning_rate": 0.0002, "epoch": 4.693234176703581, "step": 29030}, {"loss": 0.5514, "grad_norm": 1.1499706506729126, "learning_rate": 0.0002, "epoch": 4.694850860884326, "step": 29040}, {"loss": 0.5745, "grad_norm": 0.860118567943573, "learning_rate": 0.0002, "epoch": 4.6964675450650715, "step": 29050}, {"loss": 0.5741, "grad_norm": 0.9774864315986633, "learning_rate": 0.0002, "epoch": 4.698084229245817, "step": 29060}, {"loss": 0.5765, "grad_norm": 1.0323210954666138, "learning_rate": 0.0002, "epoch": 4.699700913426562, "step": 29070}, {"loss": 0.5452, "grad_norm": 0.8492481112480164, "learning_rate": 0.0002, "epoch": 4.701317597607307, "step": 29080}, {"loss": 0.5985, "grad_norm": 1.131951093673706, "learning_rate": 0.0002, "epoch": 4.702934281788052, "step": 29090}, {"loss": 0.6412, "grad_norm": 0.8763113021850586, "learning_rate": 0.0002, "epoch": 4.704550965968798, "step": 29100}, {"loss": 0.575, "grad_norm": 1.045028805732727, "learning_rate": 0.0002, "epoch": 4.706167650149544, "step": 29110}, {"loss": 0.5548, "grad_norm": 0.9961401224136353, "learning_rate": 0.0002, "epoch": 4.707784334330288, "step": 29120}, {"loss": 0.559, "grad_norm": 0.9282503724098206, "learning_rate": 0.0002, "epoch": 4.709401018511034, "step": 29130}, {"loss": 0.5744, "grad_norm": 1.1418932676315308, "learning_rate": 0.0002, "epoch": 4.711017702691779, "step": 29140}, {"loss": 0.5394, "grad_norm": 0.9950099587440491, "learning_rate": 0.0002, "epoch": 4.712634386872525, "step": 29150}, {"loss": 0.6177, "grad_norm": 0.8304893374443054, "learning_rate": 0.0002, "epoch": 4.71425107105327, "step": 29160}, {"loss": 0.6074, "grad_norm": 1.115626335144043, "learning_rate": 0.0002, "epoch": 4.715867755234015, "step": 29170}, {"loss": 0.6265, "grad_norm": 1.079818606376648, "learning_rate": 0.0002, "epoch": 4.71748443941476, "step": 29180}, {"loss": 0.561, "grad_norm": 1.1929082870483398, "learning_rate": 0.0002, "epoch": 4.719101123595506, "step": 29190}, {"loss": 0.5708, "grad_norm": 0.9621080756187439, "learning_rate": 0.0002, "epoch": 4.720717807776251, "step": 29200}, {"loss": 0.546, "grad_norm": 0.8549222350120544, "learning_rate": 0.0002, "epoch": 4.722334491956996, "step": 29210}, {"loss": 0.5775, "grad_norm": 0.9341941475868225, "learning_rate": 0.0002, "epoch": 4.723951176137741, "step": 29220}, {"loss": 0.5436, "grad_norm": 1.075406789779663, "learning_rate": 0.0002, "epoch": 4.7255678603184865, "step": 29230}, {"loss": 0.576, "grad_norm": 1.0859880447387695, "learning_rate": 0.0002, "epoch": 4.727184544499232, "step": 29240}, {"loss": 0.5525, "grad_norm": 0.8475605249404907, "learning_rate": 0.0002, "epoch": 4.728801228679977, "step": 29250}, {"loss": 0.5659, "grad_norm": 0.9331845641136169, "learning_rate": 0.0002, "epoch": 4.730417912860723, "step": 29260}, {"loss": 0.5901, "grad_norm": 0.9279314279556274, "learning_rate": 0.0002, "epoch": 4.7320345970414674, "step": 29270}, {"loss": 0.597, "grad_norm": 0.7803558707237244, "learning_rate": 0.0002, "epoch": 4.733651281222214, "step": 29280}, {"loss": 0.5968, "grad_norm": 1.0159329175949097, "learning_rate": 0.0002, "epoch": 4.735267965402959, "step": 29290}, {"loss": 0.5333, "grad_norm": 0.9448670744895935, "learning_rate": 0.0002, "epoch": 4.736884649583704, "step": 29300}, {"loss": 0.574, "grad_norm": 1.0732197761535645, "learning_rate": 0.0002, "epoch": 4.738501333764449, "step": 29310}, {"loss": 0.6066, "grad_norm": 0.901830792427063, "learning_rate": 0.0002, "epoch": 4.7401180179451945, "step": 29320}, {"loss": 0.6105, "grad_norm": 0.9141789674758911, "learning_rate": 0.0002, "epoch": 4.74173470212594, "step": 29330}, {"loss": 0.5481, "grad_norm": 0.9733418226242065, "learning_rate": 0.0002, "epoch": 4.743351386306685, "step": 29340}, {"loss": 0.612, "grad_norm": 0.909810483455658, "learning_rate": 0.0002, "epoch": 4.74496807048743, "step": 29350}, {"loss": 0.5911, "grad_norm": 0.909541666507721, "learning_rate": 0.0002, "epoch": 4.746584754668175, "step": 29360}, {"loss": 0.5579, "grad_norm": 0.9383015632629395, "learning_rate": 0.0002, "epoch": 4.748201438848921, "step": 29370}, {"loss": 0.5529, "grad_norm": 0.9275668263435364, "learning_rate": 0.0002, "epoch": 4.749818123029666, "step": 29380}, {"loss": 0.5623, "grad_norm": 1.1146225929260254, "learning_rate": 0.0002, "epoch": 4.751434807210411, "step": 29390}, {"loss": 0.6018, "grad_norm": 1.0062453746795654, "learning_rate": 0.0002, "epoch": 4.753051491391156, "step": 29400}, {"loss": 0.5872, "grad_norm": 0.9451895952224731, "learning_rate": 0.0002, "epoch": 4.7546681755719025, "step": 29410}, {"loss": 0.5767, "grad_norm": 0.870457649230957, "learning_rate": 0.0002, "epoch": 4.756284859752648, "step": 29420}, {"loss": 0.57, "grad_norm": 1.0411282777786255, "learning_rate": 0.0002, "epoch": 4.757901543933393, "step": 29430}, {"loss": 0.5688, "grad_norm": 1.1648986339569092, "learning_rate": 0.0002, "epoch": 4.759518228114138, "step": 29440}, {"loss": 0.5432, "grad_norm": 0.8999572992324829, "learning_rate": 0.0002, "epoch": 4.761134912294883, "step": 29450}, {"loss": 0.5667, "grad_norm": 0.9863559007644653, "learning_rate": 0.0002, "epoch": 4.762751596475629, "step": 29460}, {"loss": 0.5779, "grad_norm": 0.9676542282104492, "learning_rate": 0.0002, "epoch": 4.764368280656374, "step": 29470}, {"loss": 0.6075, "grad_norm": 1.004775047302246, "learning_rate": 0.0002, "epoch": 4.765984964837119, "step": 29480}, {"loss": 0.6044, "grad_norm": 1.0937515497207642, "learning_rate": 0.0002, "epoch": 4.767601649017864, "step": 29490}, {"loss": 0.5433, "grad_norm": 0.9551598429679871, "learning_rate": 0.0002, "epoch": 4.7692183331986095, "step": 29500}, {"loss": 0.5609, "grad_norm": 1.0757228136062622, "learning_rate": 0.0002, "epoch": 4.770835017379355, "step": 29510}, {"loss": 0.567, "grad_norm": 1.0588841438293457, "learning_rate": 0.0002, "epoch": 4.7724517015601, "step": 29520}, {"loss": 0.5814, "grad_norm": 1.0744032859802246, "learning_rate": 0.0002, "epoch": 4.774068385740845, "step": 29530}, {"loss": 0.5681, "grad_norm": 1.0066277980804443, "learning_rate": 0.0002, "epoch": 4.7756850699215905, "step": 29540}, {"loss": 0.545, "grad_norm": 1.082319736480713, "learning_rate": 0.0002, "epoch": 4.777301754102336, "step": 29550}, {"loss": 0.5709, "grad_norm": 0.8252472877502441, "learning_rate": 0.0002, "epoch": 4.778918438283082, "step": 29560}, {"loss": 0.5666, "grad_norm": 0.9855340123176575, "learning_rate": 0.0002, "epoch": 4.780535122463827, "step": 29570}, {"loss": 0.6117, "grad_norm": 0.9991421699523926, "learning_rate": 0.0002, "epoch": 4.782151806644572, "step": 29580}, {"loss": 0.5966, "grad_norm": 1.316841959953308, "learning_rate": 0.0002, "epoch": 4.7837684908253175, "step": 29590}, {"loss": 0.6102, "grad_norm": 1.1513035297393799, "learning_rate": 0.0002, "epoch": 4.785385175006063, "step": 29600}, {"loss": 0.5785, "grad_norm": 0.9767683744430542, "learning_rate": 0.0002, "epoch": 4.787001859186808, "step": 29610}, {"loss": 0.6037, "grad_norm": 0.9786278605461121, "learning_rate": 0.0002, "epoch": 4.788618543367553, "step": 29620}, {"loss": 0.6108, "grad_norm": 0.8004973530769348, "learning_rate": 0.0002, "epoch": 4.7902352275482984, "step": 29630}, {"loss": 0.5932, "grad_norm": 1.0997767448425293, "learning_rate": 0.0002, "epoch": 4.791851911729044, "step": 29640}, {"loss": 0.5655, "grad_norm": 0.9752856492996216, "learning_rate": 0.0002, "epoch": 4.793468595909789, "step": 29650}, {"loss": 0.5916, "grad_norm": 1.0518392324447632, "learning_rate": 0.0002, "epoch": 4.795085280090534, "step": 29660}, {"loss": 0.6042, "grad_norm": 1.1050055027008057, "learning_rate": 0.0002, "epoch": 4.796701964271279, "step": 29670}, {"loss": 0.6089, "grad_norm": 0.9933857917785645, "learning_rate": 0.0002, "epoch": 4.798318648452025, "step": 29680}, {"loss": 0.6041, "grad_norm": 1.2804018259048462, "learning_rate": 0.0002, "epoch": 4.79993533263277, "step": 29690}, {"loss": 0.636, "grad_norm": 1.0133371353149414, "learning_rate": 0.0002, "epoch": 4.801552016813515, "step": 29700}, {"loss": 0.5662, "grad_norm": 1.080350637435913, "learning_rate": 0.0002, "epoch": 4.803168700994261, "step": 29710}, {"loss": 0.5603, "grad_norm": 0.9986529350280762, "learning_rate": 0.0002, "epoch": 4.804785385175006, "step": 29720}, {"loss": 0.5894, "grad_norm": 0.975665807723999, "learning_rate": 0.0002, "epoch": 4.806402069355752, "step": 29730}, {"loss": 0.6328, "grad_norm": 0.8458138704299927, "learning_rate": 0.0002, "epoch": 4.808018753536497, "step": 29740}, {"loss": 0.5837, "grad_norm": 0.99330073595047, "learning_rate": 0.0002, "epoch": 4.809635437717242, "step": 29750}, {"loss": 0.5507, "grad_norm": 0.898274302482605, "learning_rate": 0.0002, "epoch": 4.811252121897987, "step": 29760}, {"loss": 0.5842, "grad_norm": 1.0504480600357056, "learning_rate": 0.0002, "epoch": 4.812868806078733, "step": 29770}, {"loss": 0.5821, "grad_norm": 0.937919020652771, "learning_rate": 0.0002, "epoch": 4.814485490259478, "step": 29780}, {"loss": 0.5885, "grad_norm": 0.9593307971954346, "learning_rate": 0.0002, "epoch": 4.816102174440223, "step": 29790}, {"loss": 0.578, "grad_norm": 0.9431198835372925, "learning_rate": 0.0002, "epoch": 4.817718858620968, "step": 29800}, {"loss": 0.5739, "grad_norm": 1.2729957103729248, "learning_rate": 0.0002, "epoch": 4.8193355428017135, "step": 29810}, {"loss": 0.6124, "grad_norm": 0.8876838684082031, "learning_rate": 0.0002, "epoch": 4.820952226982459, "step": 29820}, {"loss": 0.5583, "grad_norm": 1.0185000896453857, "learning_rate": 0.0002, "epoch": 4.822568911163204, "step": 29830}, {"loss": 0.5686, "grad_norm": 1.064276099205017, "learning_rate": 0.0002, "epoch": 4.824185595343949, "step": 29840}, {"loss": 0.5698, "grad_norm": 0.9774803519248962, "learning_rate": 0.0002, "epoch": 4.825802279524694, "step": 29850}, {"loss": 0.5533, "grad_norm": 1.131646990776062, "learning_rate": 0.0002, "epoch": 4.8274189637054405, "step": 29860}, {"loss": 0.6371, "grad_norm": 1.081455945968628, "learning_rate": 0.0002, "epoch": 4.829035647886186, "step": 29870}, {"loss": 0.5793, "grad_norm": 0.990538477897644, "learning_rate": 0.0002, "epoch": 4.830652332066931, "step": 29880}, {"loss": 0.5833, "grad_norm": 0.9750600457191467, "learning_rate": 0.0002, "epoch": 4.832269016247676, "step": 29890}, {"loss": 0.619, "grad_norm": 1.0600621700286865, "learning_rate": 0.0002, "epoch": 4.8338857004284215, "step": 29900}, {"loss": 0.5841, "grad_norm": 0.9237320423126221, "learning_rate": 0.0002, "epoch": 4.835502384609167, "step": 29910}, {"loss": 0.5513, "grad_norm": 0.9739177227020264, "learning_rate": 0.0002, "epoch": 4.837119068789912, "step": 29920}, {"loss": 0.587, "grad_norm": 1.128677248954773, "learning_rate": 0.0002, "epoch": 4.838735752970657, "step": 29930}, {"loss": 0.564, "grad_norm": 1.042604923248291, "learning_rate": 0.0002, "epoch": 4.840352437151402, "step": 29940}, {"loss": 0.5885, "grad_norm": 0.849758505821228, "learning_rate": 0.0002, "epoch": 4.841969121332148, "step": 29950}, {"loss": 0.5952, "grad_norm": 1.2809888124465942, "learning_rate": 0.0002, "epoch": 4.843585805512893, "step": 29960}, {"loss": 0.5703, "grad_norm": 1.0177865028381348, "learning_rate": 0.0002, "epoch": 4.845202489693638, "step": 29970}, {"loss": 0.5946, "grad_norm": 1.0026639699935913, "learning_rate": 0.0002, "epoch": 4.846819173874383, "step": 29980}, {"loss": 0.5897, "grad_norm": 0.9679505228996277, "learning_rate": 0.0002, "epoch": 4.8484358580551286, "step": 29990}, {"loss": 0.5621, "grad_norm": 0.8939532041549683, "learning_rate": 0.0002, "epoch": 4.850052542235874, "step": 30000}, {"loss": 0.5852, "grad_norm": 0.9957457780838013, "learning_rate": 0.0002, "epoch": 4.85166922641662, "step": 30010}, {"loss": 0.6117, "grad_norm": 1.1646790504455566, "learning_rate": 0.0002, "epoch": 4.853285910597365, "step": 30020}, {"loss": 0.5711, "grad_norm": 0.8804680705070496, "learning_rate": 0.0002, "epoch": 4.85490259477811, "step": 30030}, {"loss": 0.5397, "grad_norm": 1.161970853805542, "learning_rate": 0.0002, "epoch": 4.856519278958856, "step": 30040}, {"loss": 0.5552, "grad_norm": 0.9081037640571594, "learning_rate": 0.0002, "epoch": 4.858135963139601, "step": 30050}, {"loss": 0.6024, "grad_norm": 0.9402848482131958, "learning_rate": 0.0002, "epoch": 4.859752647320346, "step": 30060}, {"loss": 0.6256, "grad_norm": 0.9023865461349487, "learning_rate": 0.0002, "epoch": 4.861369331501091, "step": 30070}, {"loss": 0.5926, "grad_norm": 1.0173414945602417, "learning_rate": 0.0002, "epoch": 4.8629860156818365, "step": 30080}, {"loss": 0.6274, "grad_norm": 1.084402322769165, "learning_rate": 0.0002, "epoch": 4.864602699862582, "step": 30090}, {"loss": 0.6311, "grad_norm": 0.9577937126159668, "learning_rate": 0.0002, "epoch": 4.866219384043327, "step": 30100}, {"loss": 0.5724, "grad_norm": 0.9807606935501099, "learning_rate": 0.0002, "epoch": 4.867836068224072, "step": 30110}, {"loss": 0.5786, "grad_norm": 0.978784441947937, "learning_rate": 0.0002, "epoch": 4.8694527524048175, "step": 30120}, {"loss": 0.6194, "grad_norm": 0.9762914776802063, "learning_rate": 0.0002, "epoch": 4.871069436585563, "step": 30130}, {"loss": 0.5892, "grad_norm": 0.9404871463775635, "learning_rate": 0.0002, "epoch": 4.872686120766308, "step": 30140}, {"loss": 0.6182, "grad_norm": 1.0069509744644165, "learning_rate": 0.0002, "epoch": 4.874302804947053, "step": 30150}, {"loss": 0.6225, "grad_norm": 1.1770923137664795, "learning_rate": 0.0002, "epoch": 4.875919489127799, "step": 30160}, {"loss": 0.5657, "grad_norm": 1.021210789680481, "learning_rate": 0.0002, "epoch": 4.8775361733085445, "step": 30170}, {"loss": 0.6033, "grad_norm": 0.8512648940086365, "learning_rate": 0.0002, "epoch": 4.87915285748929, "step": 30180}, {"loss": 0.5519, "grad_norm": 0.9345870018005371, "learning_rate": 0.0002, "epoch": 4.880769541670035, "step": 30190}, {"loss": 0.5682, "grad_norm": 1.0224418640136719, "learning_rate": 0.0002, "epoch": 4.88238622585078, "step": 30200}, {"loss": 0.5807, "grad_norm": 1.0316044092178345, "learning_rate": 0.0002, "epoch": 4.884002910031525, "step": 30210}, {"loss": 0.6065, "grad_norm": 1.102437973022461, "learning_rate": 0.0002, "epoch": 4.885619594212271, "step": 30220}, {"loss": 0.586, "grad_norm": 1.0220023393630981, "learning_rate": 0.0002, "epoch": 4.887236278393016, "step": 30230}, {"loss": 0.5781, "grad_norm": 1.0934523344039917, "learning_rate": 0.0002, "epoch": 4.888852962573761, "step": 30240}, {"loss": 0.6313, "grad_norm": 1.264630913734436, "learning_rate": 0.0002, "epoch": 4.890469646754506, "step": 30250}, {"loss": 0.5712, "grad_norm": 1.0999879837036133, "learning_rate": 0.0002, "epoch": 4.892086330935252, "step": 30260}, {"loss": 0.6413, "grad_norm": 0.9124550223350525, "learning_rate": 0.0002, "epoch": 4.893703015115997, "step": 30270}, {"loss": 0.596, "grad_norm": 0.9853624105453491, "learning_rate": 0.0002, "epoch": 4.895319699296742, "step": 30280}, {"loss": 0.595, "grad_norm": 1.0589802265167236, "learning_rate": 0.0002, "epoch": 4.896936383477488, "step": 30290}, {"loss": 0.6129, "grad_norm": 0.8487226366996765, "learning_rate": 0.0002, "epoch": 4.8985530676582325, "step": 30300}, {"loss": 0.5514, "grad_norm": 1.0212191343307495, "learning_rate": 0.0002, "epoch": 4.900169751838979, "step": 30310}, {"loss": 0.5896, "grad_norm": 1.0187491178512573, "learning_rate": 0.0002, "epoch": 4.901786436019724, "step": 30320}, {"loss": 0.5809, "grad_norm": 1.0013091564178467, "learning_rate": 0.0002, "epoch": 4.903403120200469, "step": 30330}, {"loss": 0.5658, "grad_norm": 1.0017542839050293, "learning_rate": 0.0002, "epoch": 4.905019804381214, "step": 30340}, {"loss": 0.6002, "grad_norm": 0.9665151238441467, "learning_rate": 0.0002, "epoch": 4.9066364885619596, "step": 30350}, {"loss": 0.5864, "grad_norm": 0.8774822950363159, "learning_rate": 0.0002, "epoch": 4.908253172742705, "step": 30360}, {"loss": 0.5771, "grad_norm": 0.9449850916862488, "learning_rate": 0.0002, "epoch": 4.90986985692345, "step": 30370}, {"loss": 0.58, "grad_norm": 0.7368341088294983, "learning_rate": 0.0002, "epoch": 4.911486541104195, "step": 30380}, {"loss": 0.5992, "grad_norm": 0.9669167995452881, "learning_rate": 0.0002, "epoch": 4.9131032252849405, "step": 30390}, {"loss": 0.6202, "grad_norm": 1.1227794885635376, "learning_rate": 0.0002, "epoch": 4.914719909465686, "step": 30400}, {"loss": 0.6181, "grad_norm": 0.9884361028671265, "learning_rate": 0.0002, "epoch": 4.916336593646431, "step": 30410}, {"loss": 0.6185, "grad_norm": 0.9949551224708557, "learning_rate": 0.0002, "epoch": 4.917953277827176, "step": 30420}, {"loss": 0.5866, "grad_norm": 0.9491621851921082, "learning_rate": 0.0002, "epoch": 4.919569962007921, "step": 30430}, {"loss": 0.6005, "grad_norm": 0.78848797082901, "learning_rate": 0.0002, "epoch": 4.9211866461886675, "step": 30440}, {"loss": 0.5561, "grad_norm": 1.0693835020065308, "learning_rate": 0.0002, "epoch": 4.922803330369412, "step": 30450}, {"loss": 0.566, "grad_norm": 0.9573729634284973, "learning_rate": 0.0002, "epoch": 4.924420014550158, "step": 30460}, {"loss": 0.6084, "grad_norm": 0.9975152611732483, "learning_rate": 0.0002, "epoch": 4.926036698730903, "step": 30470}, {"loss": 0.5969, "grad_norm": 0.8695693016052246, "learning_rate": 0.0002, "epoch": 4.9276533829116484, "step": 30480}, {"loss": 0.6144, "grad_norm": 1.145394206047058, "learning_rate": 0.0002, "epoch": 4.929270067092394, "step": 30490}, {"loss": 0.5736, "grad_norm": 0.7668989896774292, "learning_rate": 0.0002, "epoch": 4.930886751273139, "step": 30500}, {"loss": 0.6052, "grad_norm": 0.9630151391029358, "learning_rate": 0.0002, "epoch": 4.932503435453884, "step": 30510}, {"loss": 0.6461, "grad_norm": 0.940705418586731, "learning_rate": 0.0002, "epoch": 4.934120119634629, "step": 30520}, {"loss": 0.6326, "grad_norm": 1.3243348598480225, "learning_rate": 0.0002, "epoch": 4.935736803815375, "step": 30530}, {"loss": 0.6174, "grad_norm": 1.004347801208496, "learning_rate": 0.0002, "epoch": 4.93735348799612, "step": 30540}, {"loss": 0.583, "grad_norm": 0.8711541295051575, "learning_rate": 0.0002, "epoch": 4.938970172176865, "step": 30550}, {"loss": 0.599, "grad_norm": 0.8980631828308105, "learning_rate": 0.0002, "epoch": 4.94058685635761, "step": 30560}, {"loss": 0.6024, "grad_norm": 0.8388893604278564, "learning_rate": 0.0002, "epoch": 4.9422035405383555, "step": 30570}, {"loss": 0.6189, "grad_norm": 1.0991183519363403, "learning_rate": 0.0002, "epoch": 4.943820224719101, "step": 30580}, {"loss": 0.5906, "grad_norm": 0.9731075763702393, "learning_rate": 0.0002, "epoch": 4.945436908899847, "step": 30590}, {"loss": 0.5883, "grad_norm": 1.3904452323913574, "learning_rate": 0.0002, "epoch": 4.947053593080591, "step": 30600}, {"loss": 0.5952, "grad_norm": 1.2489882707595825, "learning_rate": 0.0002, "epoch": 4.948670277261337, "step": 30610}, {"loss": 0.5887, "grad_norm": 1.240072250366211, "learning_rate": 0.0002, "epoch": 4.950286961442083, "step": 30620}, {"loss": 0.5762, "grad_norm": 0.9191411733627319, "learning_rate": 0.0002, "epoch": 4.951903645622828, "step": 30630}, {"loss": 0.5597, "grad_norm": 0.8888895511627197, "learning_rate": 0.0002, "epoch": 4.953520329803573, "step": 30640}, {"loss": 0.6594, "grad_norm": 0.9001450538635254, "learning_rate": 0.0002, "epoch": 4.955137013984318, "step": 30650}, {"loss": 0.6047, "grad_norm": 1.053971767425537, "learning_rate": 0.0002, "epoch": 4.9567536981650635, "step": 30660}, {"loss": 0.6107, "grad_norm": 1.2224042415618896, "learning_rate": 0.0002, "epoch": 4.958370382345809, "step": 30670}, {"loss": 0.6211, "grad_norm": 0.8855111598968506, "learning_rate": 0.0002, "epoch": 4.959987066526554, "step": 30680}, {"loss": 0.5764, "grad_norm": 0.9489575624465942, "learning_rate": 0.0002, "epoch": 4.961603750707299, "step": 30690}, {"loss": 0.5371, "grad_norm": 0.9635404944419861, "learning_rate": 0.0002, "epoch": 4.963220434888044, "step": 30700}, {"loss": 0.6043, "grad_norm": 1.1784121990203857, "learning_rate": 0.0002, "epoch": 4.96483711906879, "step": 30710}, {"loss": 0.5803, "grad_norm": 1.0059462785720825, "learning_rate": 0.0002, "epoch": 4.966453803249535, "step": 30720}, {"loss": 0.5759, "grad_norm": 0.9479738473892212, "learning_rate": 0.0002, "epoch": 4.96807048743028, "step": 30730}, {"loss": 0.584, "grad_norm": 1.0624593496322632, "learning_rate": 0.0002, "epoch": 4.969687171611026, "step": 30740}, {"loss": 0.6202, "grad_norm": 1.1429259777069092, "learning_rate": 0.0002, "epoch": 4.971303855791771, "step": 30750}, {"loss": 0.6174, "grad_norm": 0.9102491140365601, "learning_rate": 0.0002, "epoch": 4.972920539972517, "step": 30760}, {"loss": 0.6025, "grad_norm": 1.1262688636779785, "learning_rate": 0.0002, "epoch": 4.974537224153262, "step": 30770}, {"loss": 0.588, "grad_norm": 1.1415393352508545, "learning_rate": 0.0002, "epoch": 4.976153908334007, "step": 30780}, {"loss": 0.5832, "grad_norm": 1.083078384399414, "learning_rate": 0.0002, "epoch": 4.977770592514752, "step": 30790}, {"loss": 0.6025, "grad_norm": 0.964859127998352, "learning_rate": 0.0002, "epoch": 4.979387276695498, "step": 30800}, {"loss": 0.6095, "grad_norm": 0.8704743385314941, "learning_rate": 0.0002, "epoch": 4.981003960876243, "step": 30810}, {"loss": 0.5666, "grad_norm": 1.0714856386184692, "learning_rate": 0.0002, "epoch": 4.982620645056988, "step": 30820}, {"loss": 0.565, "grad_norm": 0.6818771362304688, "learning_rate": 0.0002, "epoch": 4.984237329237733, "step": 30830}, {"loss": 0.5999, "grad_norm": 1.0454156398773193, "learning_rate": 0.0002, "epoch": 4.985854013418479, "step": 30840}, {"loss": 0.5683, "grad_norm": 0.9410776495933533, "learning_rate": 0.0002, "epoch": 4.987470697599224, "step": 30850}, {"loss": 0.5899, "grad_norm": 1.0878902673721313, "learning_rate": 0.0002, "epoch": 4.989087381779969, "step": 30860}, {"loss": 0.5914, "grad_norm": 0.8916727304458618, "learning_rate": 0.0002, "epoch": 4.990704065960714, "step": 30870}, {"loss": 0.6066, "grad_norm": 1.045776128768921, "learning_rate": 0.0002, "epoch": 4.9923207501414595, "step": 30880}, {"loss": 0.5767, "grad_norm": 0.9861903786659241, "learning_rate": 0.0002, "epoch": 4.993937434322206, "step": 30890}, {"loss": 0.6192, "grad_norm": 0.9275050759315491, "learning_rate": 0.0002, "epoch": 4.995554118502951, "step": 30900}, {"loss": 0.6181, "grad_norm": 0.94013911485672, "learning_rate": 0.0002, "epoch": 4.997170802683696, "step": 30910}, {"loss": 0.614, "grad_norm": 0.9771268367767334, "learning_rate": 0.0002, "epoch": 4.998787486864441, "step": 30920}, {"eval_loss": 1.1968598365783691, "eval_runtime": 122.2519, "eval_samples_per_second": 5.996, "eval_steps_per_second": 0.753, "epoch": 4.9999191657909625, "step": 30927}, {"loss": 0.5238, "grad_norm": 0.8021580576896667, "learning_rate": 0.0002, "epoch": 5.0004041710451865, "step": 30930}, {"loss": 0.4984, "grad_norm": 1.0807327032089233, "learning_rate": 0.0002, "epoch": 5.002020855225932, "step": 30940}, {"loss": 0.514, "grad_norm": 1.1638425588607788, "learning_rate": 0.0002, "epoch": 5.003637539406677, "step": 30950}, {"loss": 0.4621, "grad_norm": 1.1700230836868286, "learning_rate": 0.0002, "epoch": 5.005254223587422, "step": 30960}, {"loss": 0.4657, "grad_norm": 0.9053420424461365, "learning_rate": 0.0002, "epoch": 5.0068709077681675, "step": 30970}, {"loss": 0.4865, "grad_norm": 0.9226111769676208, "learning_rate": 0.0002, "epoch": 5.008487591948913, "step": 30980}, {"loss": 0.5011, "grad_norm": 1.238669514656067, "learning_rate": 0.0002, "epoch": 5.010104276129658, "step": 30990}, {"loss": 0.4754, "grad_norm": 1.0668327808380127, "learning_rate": 0.0002, "epoch": 5.011720960310403, "step": 31000}, {"loss": 0.5414, "grad_norm": 1.0903944969177246, "learning_rate": 0.0002, "epoch": 5.013337644491148, "step": 31010}, {"loss": 0.5117, "grad_norm": 1.0763911008834839, "learning_rate": 0.0002, "epoch": 5.014954328671894, "step": 31020}, {"loss": 0.4908, "grad_norm": 1.0108771324157715, "learning_rate": 0.0002, "epoch": 5.016571012852639, "step": 31030}, {"loss": 0.5052, "grad_norm": 0.8816103935241699, "learning_rate": 0.0002, "epoch": 5.018187697033385, "step": 31040}, {"loss": 0.4985, "grad_norm": 1.11434805393219, "learning_rate": 0.0002, "epoch": 5.01980438121413, "step": 31050}, {"loss": 0.5074, "grad_norm": 1.0727789402008057, "learning_rate": 0.0002, "epoch": 5.021421065394875, "step": 31060}, {"loss": 0.4938, "grad_norm": 1.1480379104614258, "learning_rate": 0.0002, "epoch": 5.023037749575621, "step": 31070}, {"loss": 0.491, "grad_norm": 1.0913071632385254, "learning_rate": 0.0002, "epoch": 5.024654433756366, "step": 31080}, {"loss": 0.4896, "grad_norm": 0.9891864657402039, "learning_rate": 0.0002, "epoch": 5.026271117937111, "step": 31090}, {"loss": 0.4965, "grad_norm": 0.9167473912239075, "learning_rate": 0.0002, "epoch": 5.027887802117856, "step": 31100}, {"loss": 0.5098, "grad_norm": 1.2259035110473633, "learning_rate": 0.0002, "epoch": 5.029504486298602, "step": 31110}, {"loss": 0.5206, "grad_norm": 1.1812787055969238, "learning_rate": 0.0002, "epoch": 5.031121170479347, "step": 31120}, {"loss": 0.4725, "grad_norm": 1.0890522003173828, "learning_rate": 0.0002, "epoch": 5.032737854660092, "step": 31130}, {"loss": 0.4768, "grad_norm": 1.0521091222763062, "learning_rate": 0.0002, "epoch": 5.034354538840837, "step": 31140}, {"loss": 0.4718, "grad_norm": 1.1274569034576416, "learning_rate": 0.0002, "epoch": 5.0359712230215825, "step": 31150}, {"loss": 0.4604, "grad_norm": 1.140974998474121, "learning_rate": 0.0002, "epoch": 5.037587907202328, "step": 31160}, {"loss": 0.5077, "grad_norm": 1.1215609312057495, "learning_rate": 0.0002, "epoch": 5.039204591383073, "step": 31170}, {"loss": 0.4746, "grad_norm": 1.0107218027114868, "learning_rate": 0.0002, "epoch": 5.040821275563818, "step": 31180}, {"loss": 0.5126, "grad_norm": 1.0198770761489868, "learning_rate": 0.0002, "epoch": 5.042437959744564, "step": 31190}, {"loss": 0.5004, "grad_norm": 1.1613430976867676, "learning_rate": 0.0002, "epoch": 5.0440546439253096, "step": 31200}, {"loss": 0.5181, "grad_norm": 0.8555458188056946, "learning_rate": 0.0002, "epoch": 5.045671328106055, "step": 31210}, {"loss": 0.4878, "grad_norm": 1.0235545635223389, "learning_rate": 0.0002, "epoch": 5.0472880122868, "step": 31220}, {"loss": 0.499, "grad_norm": 1.0228750705718994, "learning_rate": 0.0002, "epoch": 5.048904696467545, "step": 31230}, {"loss": 0.4544, "grad_norm": 0.8216419816017151, "learning_rate": 0.0002, "epoch": 5.0505213806482905, "step": 31240}, {"loss": 0.4947, "grad_norm": 0.925828218460083, "learning_rate": 0.0002, "epoch": 5.052138064829036, "step": 31250}, {"loss": 0.4835, "grad_norm": 0.9229369759559631, "learning_rate": 0.0002, "epoch": 5.053754749009781, "step": 31260}, {"loss": 0.5136, "grad_norm": 0.9531727433204651, "learning_rate": 0.0002, "epoch": 5.055371433190526, "step": 31270}, {"loss": 0.5161, "grad_norm": 0.7738548517227173, "learning_rate": 0.0002, "epoch": 5.056988117371271, "step": 31280}, {"loss": 0.5166, "grad_norm": 1.0551451444625854, "learning_rate": 0.0002, "epoch": 5.058604801552017, "step": 31290}, {"loss": 0.4953, "grad_norm": 0.9782299399375916, "learning_rate": 0.0002, "epoch": 5.060221485732762, "step": 31300}, {"loss": 0.4776, "grad_norm": 1.0220632553100586, "learning_rate": 0.0002, "epoch": 5.061838169913507, "step": 31310}, {"loss": 0.5117, "grad_norm": 0.9808892607688904, "learning_rate": 0.0002, "epoch": 5.063454854094252, "step": 31320}, {"loss": 0.501, "grad_norm": 1.0662003755569458, "learning_rate": 0.0002, "epoch": 5.065071538274998, "step": 31330}, {"loss": 0.4844, "grad_norm": 1.0036940574645996, "learning_rate": 0.0002, "epoch": 5.066688222455744, "step": 31340}, {"loss": 0.5299, "grad_norm": 1.1931052207946777, "learning_rate": 0.0002, "epoch": 5.068304906636489, "step": 31350}, {"loss": 0.4646, "grad_norm": 0.9370693564414978, "learning_rate": 0.0002, "epoch": 5.069921590817234, "step": 31360}, {"loss": 0.5274, "grad_norm": 0.9589039087295532, "learning_rate": 0.0002, "epoch": 5.071538274997979, "step": 31370}, {"loss": 0.4669, "grad_norm": 1.0052711963653564, "learning_rate": 0.0002, "epoch": 5.073154959178725, "step": 31380}, {"loss": 0.5283, "grad_norm": 0.9991368651390076, "learning_rate": 0.0002, "epoch": 5.07477164335947, "step": 31390}, {"loss": 0.4579, "grad_norm": 0.8539695739746094, "learning_rate": 0.0002, "epoch": 5.076388327540215, "step": 31400}, {"loss": 0.4609, "grad_norm": 1.048775553703308, "learning_rate": 0.0002, "epoch": 5.07800501172096, "step": 31410}, {"loss": 0.4915, "grad_norm": 0.9983724355697632, "learning_rate": 0.0002, "epoch": 5.0796216959017055, "step": 31420}, {"loss": 0.4594, "grad_norm": 1.0189813375473022, "learning_rate": 0.0002, "epoch": 5.081238380082451, "step": 31430}, {"loss": 0.5449, "grad_norm": 0.9781646728515625, "learning_rate": 0.0002, "epoch": 5.082855064263196, "step": 31440}, {"loss": 0.4698, "grad_norm": 0.9424566030502319, "learning_rate": 0.0002, "epoch": 5.084471748443941, "step": 31450}, {"loss": 0.4768, "grad_norm": 1.0036484003067017, "learning_rate": 0.0002, "epoch": 5.0860884326246865, "step": 31460}, {"loss": 0.487, "grad_norm": 1.0983147621154785, "learning_rate": 0.0002, "epoch": 5.087705116805432, "step": 31470}, {"loss": 0.5236, "grad_norm": 1.0856730937957764, "learning_rate": 0.0002, "epoch": 5.089321800986177, "step": 31480}, {"loss": 0.485, "grad_norm": 1.2191699743270874, "learning_rate": 0.0002, "epoch": 5.090938485166923, "step": 31490}, {"loss": 0.4936, "grad_norm": 0.939346194267273, "learning_rate": 0.0002, "epoch": 5.092555169347668, "step": 31500}, {"loss": 0.5107, "grad_norm": 0.9730121493339539, "learning_rate": 0.0002, "epoch": 5.0941718535284135, "step": 31510}, {"loss": 0.4973, "grad_norm": 0.923686146736145, "learning_rate": 0.0002, "epoch": 5.095788537709159, "step": 31520}, {"loss": 0.4906, "grad_norm": 1.1734349727630615, "learning_rate": 0.0002, "epoch": 5.097405221889904, "step": 31530}, {"loss": 0.5165, "grad_norm": 1.084509015083313, "learning_rate": 0.0002, "epoch": 5.099021906070649, "step": 31540}, {"loss": 0.5078, "grad_norm": 1.0144678354263306, "learning_rate": 0.0002, "epoch": 5.100638590251394, "step": 31550}, {"loss": 0.4719, "grad_norm": 0.9958019256591797, "learning_rate": 0.0002, "epoch": 5.10225527443214, "step": 31560}, {"loss": 0.4876, "grad_norm": 0.8900736570358276, "learning_rate": 0.0002, "epoch": 5.103871958612885, "step": 31570}, {"loss": 0.463, "grad_norm": 1.0921649932861328, "learning_rate": 0.0002, "epoch": 5.10548864279363, "step": 31580}, {"loss": 0.5148, "grad_norm": 1.1613792181015015, "learning_rate": 0.0002, "epoch": 5.107105326974375, "step": 31590}, {"loss": 0.5055, "grad_norm": 0.9211367964744568, "learning_rate": 0.0002, "epoch": 5.108722011155121, "step": 31600}, {"loss": 0.5364, "grad_norm": 1.3315813541412354, "learning_rate": 0.0002, "epoch": 5.110338695335866, "step": 31610}, {"loss": 0.5336, "grad_norm": 1.3765019178390503, "learning_rate": 0.0002, "epoch": 5.111955379516611, "step": 31620}, {"loss": 0.4861, "grad_norm": 1.070198893547058, "learning_rate": 0.0002, "epoch": 5.113572063697356, "step": 31630}, {"loss": 0.5046, "grad_norm": 0.947631299495697, "learning_rate": 0.0002, "epoch": 5.115188747878102, "step": 31640}, {"loss": 0.5297, "grad_norm": 1.0197371244430542, "learning_rate": 0.0002, "epoch": 5.116805432058848, "step": 31650}, {"loss": 0.5014, "grad_norm": 0.8647911548614502, "learning_rate": 0.0002, "epoch": 5.118422116239593, "step": 31660}, {"loss": 0.4705, "grad_norm": 0.8944075107574463, "learning_rate": 0.0002, "epoch": 5.120038800420338, "step": 31670}, {"loss": 0.5175, "grad_norm": 1.124497652053833, "learning_rate": 0.0002, "epoch": 5.121655484601083, "step": 31680}, {"loss": 0.5109, "grad_norm": 0.893131673336029, "learning_rate": 0.0002, "epoch": 5.123272168781829, "step": 31690}, {"loss": 0.4937, "grad_norm": 1.0122284889221191, "learning_rate": 0.0002, "epoch": 5.124888852962574, "step": 31700}, {"loss": 0.5522, "grad_norm": 0.9493719935417175, "learning_rate": 0.0002, "epoch": 5.126505537143319, "step": 31710}, {"loss": 0.5031, "grad_norm": 0.9700539112091064, "learning_rate": 0.0002, "epoch": 5.128122221324064, "step": 31720}, {"loss": 0.5126, "grad_norm": 1.111677646636963, "learning_rate": 0.0002, "epoch": 5.1297389055048095, "step": 31730}, {"loss": 0.5272, "grad_norm": 0.8204274773597717, "learning_rate": 0.0002, "epoch": 5.131355589685555, "step": 31740}, {"loss": 0.5029, "grad_norm": 1.1029267311096191, "learning_rate": 0.0002, "epoch": 5.1329722738663, "step": 31750}, {"loss": 0.505, "grad_norm": 1.065575122833252, "learning_rate": 0.0002, "epoch": 5.134588958047045, "step": 31760}, {"loss": 0.502, "grad_norm": 0.8208706974983215, "learning_rate": 0.0002, "epoch": 5.13620564222779, "step": 31770}, {"loss": 0.5352, "grad_norm": 1.0520979166030884, "learning_rate": 0.0002, "epoch": 5.137822326408536, "step": 31780}, {"loss": 0.4911, "grad_norm": 0.8585538268089294, "learning_rate": 0.0002, "epoch": 5.139439010589282, "step": 31790}, {"loss": 0.5159, "grad_norm": 1.1491447687149048, "learning_rate": 0.0002, "epoch": 5.141055694770027, "step": 31800}, {"loss": 0.5157, "grad_norm": 0.9441081285476685, "learning_rate": 0.0002, "epoch": 5.142672378950772, "step": 31810}, {"loss": 0.5383, "grad_norm": 1.4146889448165894, "learning_rate": 0.0002, "epoch": 5.1442890631315175, "step": 31820}, {"loss": 0.5159, "grad_norm": 1.0326547622680664, "learning_rate": 0.0002, "epoch": 5.145905747312263, "step": 31830}, {"loss": 0.5348, "grad_norm": 0.9879202842712402, "learning_rate": 0.0002, "epoch": 5.147522431493008, "step": 31840}, {"loss": 0.5083, "grad_norm": 1.0374281406402588, "learning_rate": 0.0002, "epoch": 5.149139115673753, "step": 31850}, {"loss": 0.4827, "grad_norm": 1.181229591369629, "learning_rate": 0.0002, "epoch": 5.150755799854498, "step": 31860}, {"loss": 0.5313, "grad_norm": 1.2078537940979004, "learning_rate": 0.0002, "epoch": 5.152372484035244, "step": 31870}, {"loss": 0.5329, "grad_norm": 0.9599190354347229, "learning_rate": 0.0002, "epoch": 5.153989168215989, "step": 31880}, {"loss": 0.4953, "grad_norm": 1.0378568172454834, "learning_rate": 0.0002, "epoch": 5.155605852396734, "step": 31890}, {"loss": 0.5069, "grad_norm": 0.8746536374092102, "learning_rate": 0.0002, "epoch": 5.157222536577479, "step": 31900}, {"loss": 0.5272, "grad_norm": 1.0232136249542236, "learning_rate": 0.0002, "epoch": 5.1588392207582245, "step": 31910}, {"loss": 0.4844, "grad_norm": 0.9827565550804138, "learning_rate": 0.0002, "epoch": 5.16045590493897, "step": 31920}, {"loss": 0.5029, "grad_norm": 1.342657208442688, "learning_rate": 0.0002, "epoch": 5.162072589119716, "step": 31930}, {"loss": 0.513, "grad_norm": 1.18390691280365, "learning_rate": 0.0002, "epoch": 5.163689273300461, "step": 31940}, {"loss": 0.5267, "grad_norm": 0.996350109577179, "learning_rate": 0.0002, "epoch": 5.165305957481206, "step": 31950}, {"loss": 0.5063, "grad_norm": 0.9710391163825989, "learning_rate": 0.0002, "epoch": 5.166922641661952, "step": 31960}, {"loss": 0.5115, "grad_norm": 1.0264002084732056, "learning_rate": 0.0002, "epoch": 5.168539325842697, "step": 31970}, {"loss": 0.4972, "grad_norm": 1.0028311014175415, "learning_rate": 0.0002, "epoch": 5.170156010023442, "step": 31980}, {"loss": 0.5103, "grad_norm": 1.1078234910964966, "learning_rate": 0.0002, "epoch": 5.171772694204187, "step": 31990}, {"loss": 0.495, "grad_norm": 0.9659610390663147, "learning_rate": 0.0002, "epoch": 5.1733893783849325, "step": 32000}, {"loss": 0.5114, "grad_norm": 0.841986894607544, "learning_rate": 0.0002, "epoch": 5.175006062565678, "step": 32010}, {"loss": 0.48, "grad_norm": 1.095332384109497, "learning_rate": 0.0002, "epoch": 5.176622746746423, "step": 32020}, {"loss": 0.4741, "grad_norm": 1.1242377758026123, "learning_rate": 0.0002, "epoch": 5.178239430927168, "step": 32030}, {"loss": 0.5573, "grad_norm": 0.9872292280197144, "learning_rate": 0.0002, "epoch": 5.179856115107913, "step": 32040}, {"loss": 0.48, "grad_norm": 0.936161994934082, "learning_rate": 0.0002, "epoch": 5.181472799288659, "step": 32050}, {"loss": 0.5093, "grad_norm": 1.166100025177002, "learning_rate": 0.0002, "epoch": 5.183089483469404, "step": 32060}, {"loss": 0.5438, "grad_norm": 1.0764425992965698, "learning_rate": 0.0002, "epoch": 5.184706167650149, "step": 32070}, {"loss": 0.4843, "grad_norm": 1.0480051040649414, "learning_rate": 0.0002, "epoch": 5.186322851830895, "step": 32080}, {"loss": 0.5386, "grad_norm": 1.0874916315078735, "learning_rate": 0.0002, "epoch": 5.1879395360116405, "step": 32090}, {"loss": 0.4975, "grad_norm": 1.0817396640777588, "learning_rate": 0.0002, "epoch": 5.189556220192386, "step": 32100}, {"loss": 0.5177, "grad_norm": 1.054111361503601, "learning_rate": 0.0002, "epoch": 5.191172904373131, "step": 32110}, {"loss": 0.5229, "grad_norm": 0.9655823707580566, "learning_rate": 0.0002, "epoch": 5.192789588553876, "step": 32120}, {"loss": 0.5105, "grad_norm": 1.1384109258651733, "learning_rate": 0.0002, "epoch": 5.194406272734621, "step": 32130}, {"loss": 0.5073, "grad_norm": 1.0149348974227905, "learning_rate": 0.0002, "epoch": 5.196022956915367, "step": 32140}, {"loss": 0.5293, "grad_norm": 1.1084046363830566, "learning_rate": 0.0002, "epoch": 5.197639641096112, "step": 32150}, {"loss": 0.4936, "grad_norm": 1.1209309101104736, "learning_rate": 0.0002, "epoch": 5.199256325276857, "step": 32160}, {"loss": 0.5101, "grad_norm": 1.133089542388916, "learning_rate": 0.0002, "epoch": 5.200873009457602, "step": 32170}, {"loss": 0.5242, "grad_norm": 1.0893020629882812, "learning_rate": 0.0002, "epoch": 5.202489693638348, "step": 32180}, {"loss": 0.4872, "grad_norm": 0.90018630027771, "learning_rate": 0.0002, "epoch": 5.204106377819093, "step": 32190}, {"loss": 0.4999, "grad_norm": 0.977622926235199, "learning_rate": 0.0002, "epoch": 5.205723061999838, "step": 32200}, {"loss": 0.5028, "grad_norm": 1.2940177917480469, "learning_rate": 0.0002, "epoch": 5.207339746180583, "step": 32210}, {"loss": 0.5396, "grad_norm": 1.2131710052490234, "learning_rate": 0.0002, "epoch": 5.2089564303613285, "step": 32220}, {"loss": 0.5189, "grad_norm": 1.0234841108322144, "learning_rate": 0.0002, "epoch": 5.210573114542075, "step": 32230}, {"loss": 0.5424, "grad_norm": 1.157975435256958, "learning_rate": 0.0002, "epoch": 5.21218979872282, "step": 32240}, {"loss": 0.5396, "grad_norm": 1.0381282567977905, "learning_rate": 0.0002, "epoch": 5.213806482903565, "step": 32250}, {"loss": 0.5192, "grad_norm": 1.0125395059585571, "learning_rate": 0.0002, "epoch": 5.21542316708431, "step": 32260}, {"loss": 0.5216, "grad_norm": 1.272691011428833, "learning_rate": 0.0002, "epoch": 5.2170398512650555, "step": 32270}, {"loss": 0.52, "grad_norm": 1.0061250925064087, "learning_rate": 0.0002, "epoch": 5.218656535445801, "step": 32280}, {"loss": 0.4739, "grad_norm": 0.9752234816551208, "learning_rate": 0.0002, "epoch": 5.220273219626546, "step": 32290}, {"loss": 0.5471, "grad_norm": 1.1193140745162964, "learning_rate": 0.0002, "epoch": 5.221889903807291, "step": 32300}, {"loss": 0.4976, "grad_norm": 1.0126434564590454, "learning_rate": 0.0002, "epoch": 5.2235065879880365, "step": 32310}, {"loss": 0.5257, "grad_norm": 1.4338394403457642, "learning_rate": 0.0002, "epoch": 5.225123272168782, "step": 32320}, {"loss": 0.5235, "grad_norm": 1.004101276397705, "learning_rate": 0.0002, "epoch": 5.226739956349527, "step": 32330}, {"loss": 0.5091, "grad_norm": 0.8744166493415833, "learning_rate": 0.0002, "epoch": 5.228356640530272, "step": 32340}, {"loss": 0.5388, "grad_norm": 1.0165376663208008, "learning_rate": 0.0002, "epoch": 5.229973324711017, "step": 32350}, {"loss": 0.5469, "grad_norm": 0.8635954260826111, "learning_rate": 0.0002, "epoch": 5.231590008891763, "step": 32360}, {"loss": 0.5609, "grad_norm": 1.1392399072647095, "learning_rate": 0.0002, "epoch": 5.233206693072509, "step": 32370}, {"loss": 0.5173, "grad_norm": 1.0202113389968872, "learning_rate": 0.0002, "epoch": 5.234823377253254, "step": 32380}, {"loss": 0.4983, "grad_norm": 1.0417983531951904, "learning_rate": 0.0002, "epoch": 5.236440061433999, "step": 32390}, {"loss": 0.507, "grad_norm": 0.8729333877563477, "learning_rate": 0.0002, "epoch": 5.238056745614744, "step": 32400}, {"loss": 0.5426, "grad_norm": 1.1626229286193848, "learning_rate": 0.0002, "epoch": 5.23967342979549, "step": 32410}, {"loss": 0.5355, "grad_norm": 0.9086161851882935, "learning_rate": 0.0002, "epoch": 5.241290113976235, "step": 32420}, {"loss": 0.4927, "grad_norm": 1.3999892473220825, "learning_rate": 0.0002, "epoch": 5.24290679815698, "step": 32430}, {"loss": 0.4795, "grad_norm": 1.0356311798095703, "learning_rate": 0.0002, "epoch": 5.244523482337725, "step": 32440}, {"loss": 0.5035, "grad_norm": 0.9655531644821167, "learning_rate": 0.0002, "epoch": 5.246140166518471, "step": 32450}, {"loss": 0.5166, "grad_norm": 1.0411828756332397, "learning_rate": 0.0002, "epoch": 5.247756850699216, "step": 32460}, {"loss": 0.5141, "grad_norm": 1.1199816465377808, "learning_rate": 0.0002, "epoch": 5.249373534879961, "step": 32470}, {"loss": 0.4864, "grad_norm": 1.260321855545044, "learning_rate": 0.0002, "epoch": 5.250990219060706, "step": 32480}, {"loss": 0.4893, "grad_norm": 1.2950857877731323, "learning_rate": 0.0002, "epoch": 5.2526069032414515, "step": 32490}, {"loss": 0.4952, "grad_norm": 0.8982820510864258, "learning_rate": 0.0002, "epoch": 5.254223587422197, "step": 32500}, {"loss": 0.5138, "grad_norm": 0.8512987494468689, "learning_rate": 0.0002, "epoch": 5.255840271602942, "step": 32510}, {"loss": 0.5341, "grad_norm": 1.067443609237671, "learning_rate": 0.0002, "epoch": 5.257456955783688, "step": 32520}, {"loss": 0.4928, "grad_norm": 1.0957417488098145, "learning_rate": 0.0002, "epoch": 5.259073639964433, "step": 32530}, {"loss": 0.5169, "grad_norm": 1.4161807298660278, "learning_rate": 0.0002, "epoch": 5.260690324145179, "step": 32540}, {"loss": 0.5599, "grad_norm": 1.2264093160629272, "learning_rate": 0.0002, "epoch": 5.262307008325924, "step": 32550}, {"loss": 0.5221, "grad_norm": 1.0015931129455566, "learning_rate": 0.0002, "epoch": 5.263923692506669, "step": 32560}, {"loss": 0.5253, "grad_norm": 1.0743094682693481, "learning_rate": 0.0002, "epoch": 5.265540376687414, "step": 32570}, {"loss": 0.5289, "grad_norm": 1.1386840343475342, "learning_rate": 0.0002, "epoch": 5.2671570608681595, "step": 32580}, {"loss": 0.5315, "grad_norm": 1.0093860626220703, "learning_rate": 0.0002, "epoch": 5.268773745048905, "step": 32590}, {"loss": 0.5175, "grad_norm": 0.9593744874000549, "learning_rate": 0.0002, "epoch": 5.27039042922965, "step": 32600}, {"loss": 0.528, "grad_norm": 1.146021842956543, "learning_rate": 0.0002, "epoch": 5.272007113410395, "step": 32610}, {"loss": 0.4983, "grad_norm": 0.9579031467437744, "learning_rate": 0.0002, "epoch": 5.27362379759114, "step": 32620}, {"loss": 0.5376, "grad_norm": 1.0548793077468872, "learning_rate": 0.0002, "epoch": 5.275240481771886, "step": 32630}, {"loss": 0.5267, "grad_norm": 1.0380561351776123, "learning_rate": 0.0002, "epoch": 5.276857165952631, "step": 32640}, {"loss": 0.5182, "grad_norm": 1.2119969129562378, "learning_rate": 0.0002, "epoch": 5.278473850133376, "step": 32650}, {"loss": 0.5298, "grad_norm": 1.0507797002792358, "learning_rate": 0.0002, "epoch": 5.280090534314121, "step": 32660}, {"loss": 0.5253, "grad_norm": 1.0185176134109497, "learning_rate": 0.0002, "epoch": 5.2817072184948675, "step": 32670}, {"loss": 0.4904, "grad_norm": 1.2358098030090332, "learning_rate": 0.0002, "epoch": 5.283323902675613, "step": 32680}, {"loss": 0.5169, "grad_norm": 0.7937114238739014, "learning_rate": 0.0002, "epoch": 5.284940586856358, "step": 32690}, {"loss": 0.495, "grad_norm": 0.9825124740600586, "learning_rate": 0.0002, "epoch": 5.286557271037103, "step": 32700}, {"loss": 0.5149, "grad_norm": 1.2059301137924194, "learning_rate": 0.0002, "epoch": 5.288173955217848, "step": 32710}, {"loss": 0.5272, "grad_norm": 1.0828571319580078, "learning_rate": 0.0002, "epoch": 5.289790639398594, "step": 32720}, {"loss": 0.5383, "grad_norm": 1.0129735469818115, "learning_rate": 0.0002, "epoch": 5.291407323579339, "step": 32730}, {"loss": 0.5216, "grad_norm": 1.0591634511947632, "learning_rate": 0.0002, "epoch": 5.293024007760084, "step": 32740}, {"loss": 0.522, "grad_norm": 0.9256815910339355, "learning_rate": 0.0002, "epoch": 5.294640691940829, "step": 32750}, {"loss": 0.5396, "grad_norm": 1.0928633213043213, "learning_rate": 0.0002, "epoch": 5.2962573761215745, "step": 32760}, {"loss": 0.5093, "grad_norm": 0.9415594935417175, "learning_rate": 0.0002, "epoch": 5.29787406030232, "step": 32770}, {"loss": 0.5252, "grad_norm": 1.141316294670105, "learning_rate": 0.0002, "epoch": 5.299490744483065, "step": 32780}, {"loss": 0.4837, "grad_norm": 1.0646510124206543, "learning_rate": 0.0002, "epoch": 5.30110742866381, "step": 32790}, {"loss": 0.5547, "grad_norm": 1.189661979675293, "learning_rate": 0.0002, "epoch": 5.3027241128445555, "step": 32800}, {"loss": 0.5664, "grad_norm": 0.9568731188774109, "learning_rate": 0.0002, "epoch": 5.304340797025301, "step": 32810}, {"loss": 0.5344, "grad_norm": 1.1556824445724487, "learning_rate": 0.0002, "epoch": 5.305957481206047, "step": 32820}, {"loss": 0.4894, "grad_norm": 0.9353463649749756, "learning_rate": 0.0002, "epoch": 5.307574165386792, "step": 32830}, {"loss": 0.5052, "grad_norm": 1.1208295822143555, "learning_rate": 0.0002, "epoch": 5.309190849567537, "step": 32840}, {"loss": 0.5126, "grad_norm": 1.0894153118133545, "learning_rate": 0.0002, "epoch": 5.3108075337482825, "step": 32850}, {"loss": 0.5046, "grad_norm": 1.090329647064209, "learning_rate": 0.0002, "epoch": 5.312424217929028, "step": 32860}, {"loss": 0.5237, "grad_norm": 1.0781712532043457, "learning_rate": 0.0002, "epoch": 5.314040902109773, "step": 32870}, {"loss": 0.57, "grad_norm": 1.1785295009613037, "learning_rate": 0.0002, "epoch": 5.315657586290518, "step": 32880}, {"loss": 0.4953, "grad_norm": 1.0406851768493652, "learning_rate": 0.0002, "epoch": 5.317274270471263, "step": 32890}, {"loss": 0.514, "grad_norm": 1.0982953310012817, "learning_rate": 0.0002, "epoch": 5.318890954652009, "step": 32900}, {"loss": 0.4944, "grad_norm": 1.2969383001327515, "learning_rate": 0.0002, "epoch": 5.320507638832754, "step": 32910}, {"loss": 0.4786, "grad_norm": 0.9687288999557495, "learning_rate": 0.0002, "epoch": 5.322124323013499, "step": 32920}, {"loss": 0.5286, "grad_norm": 1.136760950088501, "learning_rate": 0.0002, "epoch": 5.323741007194244, "step": 32930}, {"loss": 0.5321, "grad_norm": 1.3045495748519897, "learning_rate": 0.0002, "epoch": 5.32535769137499, "step": 32940}, {"loss": 0.5413, "grad_norm": 1.221675992012024, "learning_rate": 0.0002, "epoch": 5.326974375555735, "step": 32950}, {"loss": 0.4999, "grad_norm": 1.1380633115768433, "learning_rate": 0.0002, "epoch": 5.32859105973648, "step": 32960}, {"loss": 0.5037, "grad_norm": 1.1065956354141235, "learning_rate": 0.0002, "epoch": 5.330207743917226, "step": 32970}, {"loss": 0.4913, "grad_norm": 1.0187175273895264, "learning_rate": 0.0002, "epoch": 5.331824428097971, "step": 32980}, {"loss": 0.5234, "grad_norm": 0.9077118039131165, "learning_rate": 0.0002, "epoch": 5.333441112278717, "step": 32990}, {"loss": 0.5071, "grad_norm": 1.0092815160751343, "learning_rate": 0.0002, "epoch": 5.335057796459462, "step": 33000}, {"loss": 0.498, "grad_norm": 1.0168777704238892, "learning_rate": 0.0002, "epoch": 5.336674480640207, "step": 33010}, {"loss": 0.4952, "grad_norm": 0.996161937713623, "learning_rate": 0.0002, "epoch": 5.338291164820952, "step": 33020}, {"loss": 0.5024, "grad_norm": 0.794463038444519, "learning_rate": 0.0002, "epoch": 5.339907849001698, "step": 33030}, {"loss": 0.5112, "grad_norm": 0.9750674962997437, "learning_rate": 0.0002, "epoch": 5.341524533182443, "step": 33040}, {"loss": 0.528, "grad_norm": 1.2770029306411743, "learning_rate": 0.0002, "epoch": 5.343141217363188, "step": 33050}, {"loss": 0.52, "grad_norm": 1.1500186920166016, "learning_rate": 0.0002, "epoch": 5.344757901543933, "step": 33060}, {"loss": 0.4906, "grad_norm": 1.0726377964019775, "learning_rate": 0.0002, "epoch": 5.3463745857246785, "step": 33070}, {"loss": 0.5212, "grad_norm": 0.9314153790473938, "learning_rate": 0.0002, "epoch": 5.347991269905424, "step": 33080}, {"loss": 0.5434, "grad_norm": 1.344988465309143, "learning_rate": 0.0002, "epoch": 5.349607954086169, "step": 33090}, {"loss": 0.4874, "grad_norm": 0.863196611404419, "learning_rate": 0.0002, "epoch": 5.351224638266914, "step": 33100}, {"loss": 0.534, "grad_norm": 1.128100037574768, "learning_rate": 0.0002, "epoch": 5.352841322447659, "step": 33110}, {"loss": 0.5293, "grad_norm": 1.1673583984375, "learning_rate": 0.0002, "epoch": 5.3544580066284055, "step": 33120}, {"loss": 0.4787, "grad_norm": 0.9416789412498474, "learning_rate": 0.0002, "epoch": 5.356074690809151, "step": 33130}, {"loss": 0.5155, "grad_norm": 1.1855236291885376, "learning_rate": 0.0002, "epoch": 5.357691374989896, "step": 33140}, {"loss": 0.515, "grad_norm": 1.0415170192718506, "learning_rate": 0.0002, "epoch": 5.359308059170641, "step": 33150}, {"loss": 0.545, "grad_norm": 0.9953004121780396, "learning_rate": 0.0002, "epoch": 5.3609247433513865, "step": 33160}, {"loss": 0.5305, "grad_norm": 0.96138596534729, "learning_rate": 0.0002, "epoch": 5.362541427532132, "step": 33170}, {"loss": 0.5064, "grad_norm": 1.341979742050171, "learning_rate": 0.0002, "epoch": 5.364158111712877, "step": 33180}, {"loss": 0.4986, "grad_norm": 1.0136911869049072, "learning_rate": 0.0002, "epoch": 5.365774795893622, "step": 33190}, {"loss": 0.5459, "grad_norm": 0.8685575127601624, "learning_rate": 0.0002, "epoch": 5.367391480074367, "step": 33200}, {"loss": 0.5146, "grad_norm": 0.8833574652671814, "learning_rate": 0.0002, "epoch": 5.369008164255113, "step": 33210}, {"loss": 0.4982, "grad_norm": 0.9123612642288208, "learning_rate": 0.0002, "epoch": 5.370624848435858, "step": 33220}, {"loss": 0.5047, "grad_norm": 1.2720599174499512, "learning_rate": 0.0002, "epoch": 5.372241532616603, "step": 33230}, {"loss": 0.5175, "grad_norm": 1.0596648454666138, "learning_rate": 0.0002, "epoch": 5.373858216797348, "step": 33240}, {"loss": 0.5284, "grad_norm": 1.119701623916626, "learning_rate": 0.0002, "epoch": 5.3754749009780936, "step": 33250}, {"loss": 0.5217, "grad_norm": 1.3000061511993408, "learning_rate": 0.0002, "epoch": 5.377091585158839, "step": 33260}, {"loss": 0.5125, "grad_norm": 1.083891749382019, "learning_rate": 0.0002, "epoch": 5.378708269339585, "step": 33270}, {"loss": 0.5065, "grad_norm": 0.9402718544006348, "learning_rate": 0.0002, "epoch": 5.38032495352033, "step": 33280}, {"loss": 0.5559, "grad_norm": 1.3376892805099487, "learning_rate": 0.0002, "epoch": 5.381941637701075, "step": 33290}, {"loss": 0.5193, "grad_norm": 1.1600074768066406, "learning_rate": 0.0002, "epoch": 5.383558321881821, "step": 33300}, {"loss": 0.4907, "grad_norm": 1.1449427604675293, "learning_rate": 0.0002, "epoch": 5.385175006062566, "step": 33310}, {"loss": 0.5449, "grad_norm": 1.3118891716003418, "learning_rate": 0.0002, "epoch": 5.386791690243311, "step": 33320}, {"loss": 0.547, "grad_norm": 0.743449866771698, "learning_rate": 0.0002, "epoch": 5.388408374424056, "step": 33330}, {"loss": 0.5555, "grad_norm": 0.9358304142951965, "learning_rate": 0.0002, "epoch": 5.3900250586048015, "step": 33340}, {"loss": 0.5558, "grad_norm": 1.0447142124176025, "learning_rate": 0.0002, "epoch": 5.391641742785547, "step": 33350}, {"loss": 0.5106, "grad_norm": 1.1088626384735107, "learning_rate": 0.0002, "epoch": 5.393258426966292, "step": 33360}, {"loss": 0.4929, "grad_norm": 1.1267958879470825, "learning_rate": 0.0002, "epoch": 5.394875111147037, "step": 33370}, {"loss": 0.5165, "grad_norm": 0.9709370136260986, "learning_rate": 0.0002, "epoch": 5.3964917953277824, "step": 33380}, {"loss": 0.5206, "grad_norm": 1.0939103364944458, "learning_rate": 0.0002, "epoch": 5.398108479508528, "step": 33390}, {"loss": 0.5177, "grad_norm": 0.9559304714202881, "learning_rate": 0.0002, "epoch": 5.399725163689273, "step": 33400}, {"loss": 0.5064, "grad_norm": 1.199580430984497, "learning_rate": 0.0002, "epoch": 5.401341847870018, "step": 33410}, {"loss": 0.52, "grad_norm": 0.9097000360488892, "learning_rate": 0.0002, "epoch": 5.402958532050764, "step": 33420}, {"loss": 0.514, "grad_norm": 1.1940981149673462, "learning_rate": 0.0002, "epoch": 5.4045752162315095, "step": 33430}, {"loss": 0.5069, "grad_norm": 1.0530916452407837, "learning_rate": 0.0002, "epoch": 5.406191900412255, "step": 33440}, {"loss": 0.5482, "grad_norm": 1.0482549667358398, "learning_rate": 0.0002, "epoch": 5.407808584593, "step": 33450}, {"loss": 0.501, "grad_norm": 1.2524714469909668, "learning_rate": 0.0002, "epoch": 5.409425268773745, "step": 33460}, {"loss": 0.5597, "grad_norm": 1.1091666221618652, "learning_rate": 0.0002, "epoch": 5.41104195295449, "step": 33470}, {"loss": 0.546, "grad_norm": 0.9981587529182434, "learning_rate": 0.0002, "epoch": 5.412658637135236, "step": 33480}, {"loss": 0.4977, "grad_norm": 1.016681432723999, "learning_rate": 0.0002, "epoch": 5.414275321315981, "step": 33490}, {"loss": 0.5388, "grad_norm": 1.1456854343414307, "learning_rate": 0.0002, "epoch": 5.415892005496726, "step": 33500}, {"loss": 0.5292, "grad_norm": 1.1454259157180786, "learning_rate": 0.0002, "epoch": 5.417508689677471, "step": 33510}, {"loss": 0.5061, "grad_norm": 0.9858416318893433, "learning_rate": 0.0002, "epoch": 5.419125373858217, "step": 33520}, {"loss": 0.5139, "grad_norm": 0.9764766693115234, "learning_rate": 0.0002, "epoch": 5.420742058038962, "step": 33530}, {"loss": 0.5518, "grad_norm": 1.199920892715454, "learning_rate": 0.0002, "epoch": 5.422358742219707, "step": 33540}, {"loss": 0.5182, "grad_norm": 1.3107370138168335, "learning_rate": 0.0002, "epoch": 5.423975426400452, "step": 33550}, {"loss": 0.5149, "grad_norm": 0.9637970328330994, "learning_rate": 0.0002, "epoch": 5.4255921105811975, "step": 33560}, {"loss": 0.526, "grad_norm": 1.023359775543213, "learning_rate": 0.0002, "epoch": 5.427208794761944, "step": 33570}, {"loss": 0.5206, "grad_norm": 1.060417652130127, "learning_rate": 0.0002, "epoch": 5.428825478942689, "step": 33580}, {"loss": 0.5052, "grad_norm": 0.9971120953559875, "learning_rate": 0.0002, "epoch": 5.430442163123434, "step": 33590}, {"loss": 0.5044, "grad_norm": 0.9213743209838867, "learning_rate": 0.0002, "epoch": 5.432058847304179, "step": 33600}, {"loss": 0.5714, "grad_norm": 1.1512309312820435, "learning_rate": 0.0002, "epoch": 5.4336755314849245, "step": 33610}, {"loss": 0.5317, "grad_norm": 1.2198847532272339, "learning_rate": 0.0002, "epoch": 5.43529221566567, "step": 33620}, {"loss": 0.5237, "grad_norm": 1.0329595804214478, "learning_rate": 0.0002, "epoch": 5.436908899846415, "step": 33630}, {"loss": 0.5364, "grad_norm": 1.1075750589370728, "learning_rate": 0.0002, "epoch": 5.43852558402716, "step": 33640}, {"loss": 0.5295, "grad_norm": 1.006342887878418, "learning_rate": 0.0002, "epoch": 5.4401422682079055, "step": 33650}, {"loss": 0.5394, "grad_norm": 0.9179885983467102, "learning_rate": 0.0002, "epoch": 5.441758952388651, "step": 33660}, {"loss": 0.5124, "grad_norm": 1.2799493074417114, "learning_rate": 0.0002, "epoch": 5.443375636569396, "step": 33670}, {"loss": 0.5426, "grad_norm": 1.1153863668441772, "learning_rate": 0.0002, "epoch": 5.444992320750141, "step": 33680}, {"loss": 0.5087, "grad_norm": 1.0681028366088867, "learning_rate": 0.0002, "epoch": 5.446609004930886, "step": 33690}, {"loss": 0.5272, "grad_norm": 0.9788817167282104, "learning_rate": 0.0002, "epoch": 5.448225689111632, "step": 33700}, {"loss": 0.5308, "grad_norm": 0.8481608629226685, "learning_rate": 0.0002, "epoch": 5.449842373292377, "step": 33710}, {"loss": 0.5225, "grad_norm": 1.113756537437439, "learning_rate": 0.0002, "epoch": 5.451459057473123, "step": 33720}, {"loss": 0.5213, "grad_norm": 0.8425475358963013, "learning_rate": 0.0002, "epoch": 5.453075741653868, "step": 33730}, {"loss": 0.571, "grad_norm": 1.0852208137512207, "learning_rate": 0.0002, "epoch": 5.4546924258346134, "step": 33740}, {"loss": 0.5535, "grad_norm": 1.1664748191833496, "learning_rate": 0.0002, "epoch": 5.456309110015359, "step": 33750}, {"loss": 0.5419, "grad_norm": 1.217241644859314, "learning_rate": 0.0002, "epoch": 5.457925794196104, "step": 33760}, {"loss": 0.5351, "grad_norm": 1.1572928428649902, "learning_rate": 0.0002, "epoch": 5.459542478376849, "step": 33770}, {"loss": 0.5161, "grad_norm": 1.0437318086624146, "learning_rate": 0.0002, "epoch": 5.461159162557594, "step": 33780}, {"loss": 0.5266, "grad_norm": 0.9807571768760681, "learning_rate": 0.0002, "epoch": 5.46277584673834, "step": 33790}, {"loss": 0.5384, "grad_norm": 1.1436342000961304, "learning_rate": 0.0002, "epoch": 5.464392530919085, "step": 33800}, {"loss": 0.5338, "grad_norm": 1.1004794836044312, "learning_rate": 0.0002, "epoch": 5.46600921509983, "step": 33810}, {"loss": 0.4868, "grad_norm": 1.2130268812179565, "learning_rate": 0.0002, "epoch": 5.467625899280575, "step": 33820}, {"loss": 0.516, "grad_norm": 1.3154419660568237, "learning_rate": 0.0002, "epoch": 5.4692425834613205, "step": 33830}, {"loss": 0.4934, "grad_norm": 0.7934383749961853, "learning_rate": 0.0002, "epoch": 5.470859267642066, "step": 33840}, {"loss": 0.5133, "grad_norm": 0.7838410139083862, "learning_rate": 0.0002, "epoch": 5.472475951822812, "step": 33850}, {"loss": 0.4926, "grad_norm": 1.0415139198303223, "learning_rate": 0.0002, "epoch": 5.474092636003557, "step": 33860}, {"loss": 0.5323, "grad_norm": 0.9213164448738098, "learning_rate": 0.0002, "epoch": 5.475709320184302, "step": 33870}, {"loss": 0.5125, "grad_norm": 1.0364776849746704, "learning_rate": 0.0002, "epoch": 5.477326004365048, "step": 33880}, {"loss": 0.5212, "grad_norm": 0.9994072318077087, "learning_rate": 0.0002, "epoch": 5.478942688545793, "step": 33890}, {"loss": 0.5396, "grad_norm": 1.196730136871338, "learning_rate": 0.0002, "epoch": 5.480559372726538, "step": 33900}, {"loss": 0.538, "grad_norm": 0.9955780506134033, "learning_rate": 0.0002, "epoch": 5.482176056907283, "step": 33910}, {"loss": 0.5307, "grad_norm": 1.168188214302063, "learning_rate": 0.0002, "epoch": 5.4837927410880285, "step": 33920}, {"loss": 0.5548, "grad_norm": 1.1816450357437134, "learning_rate": 0.0002, "epoch": 5.485409425268774, "step": 33930}, {"loss": 0.5535, "grad_norm": 1.079715609550476, "learning_rate": 0.0002, "epoch": 5.487026109449519, "step": 33940}, {"loss": 0.5262, "grad_norm": 1.153850793838501, "learning_rate": 0.0002, "epoch": 5.488642793630264, "step": 33950}, {"loss": 0.5248, "grad_norm": 1.0207297801971436, "learning_rate": 0.0002, "epoch": 5.490259477811009, "step": 33960}, {"loss": 0.5142, "grad_norm": 1.1290855407714844, "learning_rate": 0.0002, "epoch": 5.491876161991755, "step": 33970}, {"loss": 0.5168, "grad_norm": 1.068058967590332, "learning_rate": 0.0002, "epoch": 5.4934928461725, "step": 33980}, {"loss": 0.5317, "grad_norm": 0.9789979457855225, "learning_rate": 0.0002, "epoch": 5.495109530353245, "step": 33990}, {"loss": 0.5113, "grad_norm": 0.9696692824363708, "learning_rate": 0.0002, "epoch": 5.496726214533991, "step": 34000}, {"loss": 0.5413, "grad_norm": 1.0539981126785278, "learning_rate": 0.0002, "epoch": 5.4983428987147365, "step": 34010}, {"loss": 0.5783, "grad_norm": 1.0249929428100586, "learning_rate": 0.0002, "epoch": 5.499959582895482, "step": 34020}, {"loss": 0.4888, "grad_norm": 0.9577504992485046, "learning_rate": 0.0002, "epoch": 5.501576267076227, "step": 34030}, {"loss": 0.5291, "grad_norm": 1.0963513851165771, "learning_rate": 0.0002, "epoch": 5.503192951256972, "step": 34040}, {"loss": 0.5315, "grad_norm": 0.8339345455169678, "learning_rate": 0.0002, "epoch": 5.504809635437717, "step": 34050}, {"loss": 0.5191, "grad_norm": 1.0138782262802124, "learning_rate": 0.0002, "epoch": 5.506426319618463, "step": 34060}, {"loss": 0.5463, "grad_norm": 1.0180109739303589, "learning_rate": 0.0002, "epoch": 5.508043003799208, "step": 34070}, {"loss": 0.5083, "grad_norm": 1.2790818214416504, "learning_rate": 0.0002, "epoch": 5.509659687979953, "step": 34080}, {"loss": 0.5195, "grad_norm": 1.428247332572937, "learning_rate": 0.0002, "epoch": 5.511276372160698, "step": 34090}, {"loss": 0.5291, "grad_norm": 1.0926059484481812, "learning_rate": 0.0002, "epoch": 5.5128930563414436, "step": 34100}, {"loss": 0.5665, "grad_norm": 1.2353343963623047, "learning_rate": 0.0002, "epoch": 5.514509740522189, "step": 34110}, {"loss": 0.5331, "grad_norm": 0.935587465763092, "learning_rate": 0.0002, "epoch": 5.516126424702934, "step": 34120}, {"loss": 0.5512, "grad_norm": 0.9767586588859558, "learning_rate": 0.0002, "epoch": 5.517743108883679, "step": 34130}, {"loss": 0.5315, "grad_norm": 1.1660610437393188, "learning_rate": 0.0002, "epoch": 5.5193597930644245, "step": 34140}, {"loss": 0.52, "grad_norm": 0.9828870892524719, "learning_rate": 0.0002, "epoch": 5.520976477245171, "step": 34150}, {"loss": 0.5198, "grad_norm": 1.0097278356552124, "learning_rate": 0.0002, "epoch": 5.522593161425916, "step": 34160}, {"loss": 0.5293, "grad_norm": 1.1766167879104614, "learning_rate": 0.0002, "epoch": 5.524209845606661, "step": 34170}, {"loss": 0.5258, "grad_norm": 0.982292115688324, "learning_rate": 0.0002, "epoch": 5.525826529787406, "step": 34180}, {"loss": 0.5114, "grad_norm": 1.0744609832763672, "learning_rate": 0.0002, "epoch": 5.5274432139681515, "step": 34190}, {"loss": 0.5469, "grad_norm": 1.3831160068511963, "learning_rate": 0.0002, "epoch": 5.529059898148897, "step": 34200}, {"loss": 0.5819, "grad_norm": 1.074771761894226, "learning_rate": 0.0002, "epoch": 5.530676582329642, "step": 34210}, {"loss": 0.5399, "grad_norm": 1.016652226448059, "learning_rate": 0.0002, "epoch": 5.532293266510387, "step": 34220}, {"loss": 0.5158, "grad_norm": 1.2231552600860596, "learning_rate": 0.0002, "epoch": 5.5339099506911325, "step": 34230}, {"loss": 0.5091, "grad_norm": 0.8051198720932007, "learning_rate": 0.0002, "epoch": 5.535526634871878, "step": 34240}, {"loss": 0.5583, "grad_norm": 1.1779674291610718, "learning_rate": 0.0002, "epoch": 5.537143319052623, "step": 34250}, {"loss": 0.5044, "grad_norm": 1.2468291521072388, "learning_rate": 0.0002, "epoch": 5.538760003233368, "step": 34260}, {"loss": 0.523, "grad_norm": 1.14818274974823, "learning_rate": 0.0002, "epoch": 5.540376687414113, "step": 34270}, {"loss": 0.5375, "grad_norm": 1.2362616062164307, "learning_rate": 0.0002, "epoch": 5.541993371594859, "step": 34280}, {"loss": 0.4996, "grad_norm": 1.0206977128982544, "learning_rate": 0.0002, "epoch": 5.543610055775604, "step": 34290}, {"loss": 0.5212, "grad_norm": 1.2018457651138306, "learning_rate": 0.0002, "epoch": 5.54522673995635, "step": 34300}, {"loss": 0.5462, "grad_norm": 1.0349043607711792, "learning_rate": 0.0002, "epoch": 5.546843424137095, "step": 34310}, {"loss": 0.5231, "grad_norm": 1.2022006511688232, "learning_rate": 0.0002, "epoch": 5.54846010831784, "step": 34320}, {"loss": 0.5173, "grad_norm": 1.0810624361038208, "learning_rate": 0.0002, "epoch": 5.550076792498586, "step": 34330}, {"loss": 0.5821, "grad_norm": 1.3297529220581055, "learning_rate": 0.0002, "epoch": 5.551693476679331, "step": 34340}, {"loss": 0.5321, "grad_norm": 0.9722549915313721, "learning_rate": 0.0002, "epoch": 5.553310160860076, "step": 34350}, {"loss": 0.4823, "grad_norm": 0.9903425574302673, "learning_rate": 0.0002, "epoch": 5.554926845040821, "step": 34360}, {"loss": 0.5601, "grad_norm": 0.9568067789077759, "learning_rate": 0.0002, "epoch": 5.556543529221567, "step": 34370}, {"loss": 0.5242, "grad_norm": 1.113870620727539, "learning_rate": 0.0002, "epoch": 5.558160213402312, "step": 34380}, {"loss": 0.5278, "grad_norm": 1.0557632446289062, "learning_rate": 0.0002, "epoch": 5.559776897583057, "step": 34390}, {"loss": 0.5501, "grad_norm": 0.9615673422813416, "learning_rate": 0.0002, "epoch": 5.561393581763802, "step": 34400}, {"loss": 0.5066, "grad_norm": 0.9536027312278748, "learning_rate": 0.0002, "epoch": 5.5630102659445475, "step": 34410}, {"loss": 0.4949, "grad_norm": 0.8808749318122864, "learning_rate": 0.0002, "epoch": 5.564626950125293, "step": 34420}, {"loss": 0.5954, "grad_norm": 1.286132574081421, "learning_rate": 0.0002, "epoch": 5.566243634306038, "step": 34430}, {"loss": 0.5507, "grad_norm": 1.259644865989685, "learning_rate": 0.0002, "epoch": 5.567860318486783, "step": 34440}, {"loss": 0.4922, "grad_norm": 0.9920216798782349, "learning_rate": 0.0002, "epoch": 5.569477002667529, "step": 34450}, {"loss": 0.5527, "grad_norm": 1.182926893234253, "learning_rate": 0.0002, "epoch": 5.5710936868482746, "step": 34460}, {"loss": 0.5185, "grad_norm": 1.1434749364852905, "learning_rate": 0.0002, "epoch": 5.57271037102902, "step": 34470}, {"loss": 0.5256, "grad_norm": 1.2420979738235474, "learning_rate": 0.0002, "epoch": 5.574327055209765, "step": 34480}, {"loss": 0.5039, "grad_norm": 0.9338384866714478, "learning_rate": 0.0002, "epoch": 5.57594373939051, "step": 34490}, {"loss": 0.5634, "grad_norm": 1.0196425914764404, "learning_rate": 0.0002, "epoch": 5.5775604235712555, "step": 34500}, {"loss": 0.5132, "grad_norm": 0.9586997032165527, "learning_rate": 0.0002, "epoch": 5.579177107752001, "step": 34510}, {"loss": 0.5336, "grad_norm": 1.2409086227416992, "learning_rate": 0.0002, "epoch": 5.580793791932746, "step": 34520}, {"loss": 0.5364, "grad_norm": 1.1483757495880127, "learning_rate": 0.0002, "epoch": 5.582410476113491, "step": 34530}, {"loss": 0.5325, "grad_norm": 1.1624305248260498, "learning_rate": 0.0002, "epoch": 5.584027160294236, "step": 34540}, {"loss": 0.5342, "grad_norm": 1.2635223865509033, "learning_rate": 0.0002, "epoch": 5.585643844474982, "step": 34550}, {"loss": 0.4924, "grad_norm": 0.9824051856994629, "learning_rate": 0.0002, "epoch": 5.587260528655727, "step": 34560}, {"loss": 0.5395, "grad_norm": 1.0858620405197144, "learning_rate": 0.0002, "epoch": 5.588877212836472, "step": 34570}, {"loss": 0.5459, "grad_norm": 1.1452655792236328, "learning_rate": 0.0002, "epoch": 5.590493897017217, "step": 34580}, {"loss": 0.5746, "grad_norm": 1.110610842704773, "learning_rate": 0.0002, "epoch": 5.592110581197963, "step": 34590}, {"loss": 0.5285, "grad_norm": 0.9976194500923157, "learning_rate": 0.0002, "epoch": 5.593727265378709, "step": 34600}, {"loss": 0.548, "grad_norm": 1.0698920488357544, "learning_rate": 0.0002, "epoch": 5.595343949559454, "step": 34610}, {"loss": 0.5311, "grad_norm": 1.1505171060562134, "learning_rate": 0.0002, "epoch": 5.596960633740199, "step": 34620}, {"loss": 0.5471, "grad_norm": 1.1014643907546997, "learning_rate": 0.0002, "epoch": 5.598577317920944, "step": 34630}, {"loss": 0.55, "grad_norm": 0.915595293045044, "learning_rate": 0.0002, "epoch": 5.60019400210169, "step": 34640}, {"loss": 0.5821, "grad_norm": 1.1856765747070312, "learning_rate": 0.0002, "epoch": 5.601810686282435, "step": 34650}, {"loss": 0.5502, "grad_norm": 1.1357687711715698, "learning_rate": 0.0002, "epoch": 5.60342737046318, "step": 34660}, {"loss": 0.5034, "grad_norm": 1.0232492685317993, "learning_rate": 0.0002, "epoch": 5.605044054643925, "step": 34670}, {"loss": 0.5357, "grad_norm": 0.9375017881393433, "learning_rate": 0.0002, "epoch": 5.6066607388246705, "step": 34680}, {"loss": 0.5518, "grad_norm": 1.0796529054641724, "learning_rate": 0.0002, "epoch": 5.608277423005416, "step": 34690}, {"loss": 0.5173, "grad_norm": 1.1383336782455444, "learning_rate": 0.0002, "epoch": 5.609894107186161, "step": 34700}, {"loss": 0.5477, "grad_norm": 1.0248544216156006, "learning_rate": 0.0002, "epoch": 5.611510791366906, "step": 34710}, {"loss": 0.5669, "grad_norm": 1.0986040830612183, "learning_rate": 0.0002, "epoch": 5.6131274755476515, "step": 34720}, {"loss": 0.5188, "grad_norm": 1.2689568996429443, "learning_rate": 0.0002, "epoch": 5.614744159728397, "step": 34730}, {"loss": 0.5136, "grad_norm": 1.4044264554977417, "learning_rate": 0.0002, "epoch": 5.616360843909142, "step": 34740}, {"loss": 0.5699, "grad_norm": 1.2084474563598633, "learning_rate": 0.0002, "epoch": 5.617977528089888, "step": 34750}, {"loss": 0.5377, "grad_norm": 1.061248540878296, "learning_rate": 0.0002, "epoch": 5.619594212270633, "step": 34760}, {"loss": 0.5669, "grad_norm": 1.0220764875411987, "learning_rate": 0.0002, "epoch": 5.6212108964513785, "step": 34770}, {"loss": 0.54, "grad_norm": 1.0859092473983765, "learning_rate": 0.0002, "epoch": 5.622827580632124, "step": 34780}, {"loss": 0.5308, "grad_norm": 0.9049732089042664, "learning_rate": 0.0002, "epoch": 5.624444264812869, "step": 34790}, {"loss": 0.5433, "grad_norm": 1.2103937864303589, "learning_rate": 0.0002, "epoch": 5.626060948993614, "step": 34800}, {"loss": 0.5513, "grad_norm": 0.9854230284690857, "learning_rate": 0.0002, "epoch": 5.627677633174359, "step": 34810}, {"loss": 0.5274, "grad_norm": 0.9316635131835938, "learning_rate": 0.0002, "epoch": 5.629294317355105, "step": 34820}, {"loss": 0.5393, "grad_norm": 1.105296015739441, "learning_rate": 0.0002, "epoch": 5.63091100153585, "step": 34830}, {"loss": 0.5527, "grad_norm": 0.993383526802063, "learning_rate": 0.0002, "epoch": 5.632527685716595, "step": 34840}, {"loss": 0.5375, "grad_norm": 1.1544116735458374, "learning_rate": 0.0002, "epoch": 5.63414436989734, "step": 34850}, {"loss": 0.5448, "grad_norm": 1.284475326538086, "learning_rate": 0.0002, "epoch": 5.635761054078086, "step": 34860}, {"loss": 0.5069, "grad_norm": 1.121997594833374, "learning_rate": 0.0002, "epoch": 5.637377738258831, "step": 34870}, {"loss": 0.5335, "grad_norm": 1.213040828704834, "learning_rate": 0.0002, "epoch": 5.638994422439576, "step": 34880}, {"loss": 0.5623, "grad_norm": 1.23222017288208, "learning_rate": 0.0002, "epoch": 5.640611106620321, "step": 34890}, {"loss": 0.5622, "grad_norm": 0.9793637990951538, "learning_rate": 0.0002, "epoch": 5.642227790801067, "step": 34900}, {"loss": 0.5405, "grad_norm": 1.38919997215271, "learning_rate": 0.0002, "epoch": 5.643844474981813, "step": 34910}, {"loss": 0.5007, "grad_norm": 0.8390951156616211, "learning_rate": 0.0002, "epoch": 5.645461159162558, "step": 34920}, {"loss": 0.5974, "grad_norm": 0.9465909004211426, "learning_rate": 0.0002, "epoch": 5.647077843343303, "step": 34930}, {"loss": 0.5264, "grad_norm": 1.066957712173462, "learning_rate": 0.0002, "epoch": 5.648694527524048, "step": 34940}, {"loss": 0.5513, "grad_norm": 0.9842154383659363, "learning_rate": 0.0002, "epoch": 5.650311211704794, "step": 34950}, {"loss": 0.567, "grad_norm": 1.1766440868377686, "learning_rate": 0.0002, "epoch": 5.651927895885539, "step": 34960}, {"loss": 0.5462, "grad_norm": 0.9061306118965149, "learning_rate": 0.0002, "epoch": 5.653544580066284, "step": 34970}, {"loss": 0.5446, "grad_norm": 1.2941309213638306, "learning_rate": 0.0002, "epoch": 5.655161264247029, "step": 34980}, {"loss": 0.5704, "grad_norm": 0.9741247892379761, "learning_rate": 0.0002, "epoch": 5.6567779484277745, "step": 34990}, {"loss": 0.5152, "grad_norm": 1.0784187316894531, "learning_rate": 0.0002, "epoch": 5.65839463260852, "step": 35000}, {"loss": 0.5363, "grad_norm": 0.937889814376831, "learning_rate": 0.0002, "epoch": 5.660011316789265, "step": 35010}, {"loss": 0.5019, "grad_norm": 0.9667879939079285, "learning_rate": 0.0002, "epoch": 5.66162800097001, "step": 35020}, {"loss": 0.5209, "grad_norm": 1.0554876327514648, "learning_rate": 0.0002, "epoch": 5.663244685150756, "step": 35030}, {"loss": 0.523, "grad_norm": 1.2030539512634277, "learning_rate": 0.0002, "epoch": 5.664861369331501, "step": 35040}, {"loss": 0.5406, "grad_norm": 1.0849953889846802, "learning_rate": 0.0002, "epoch": 5.666478053512247, "step": 35050}, {"loss": 0.5747, "grad_norm": 1.1598973274230957, "learning_rate": 0.0002, "epoch": 5.668094737692992, "step": 35060}, {"loss": 0.5488, "grad_norm": 1.0233359336853027, "learning_rate": 0.0002, "epoch": 5.669711421873737, "step": 35070}, {"loss": 0.5409, "grad_norm": 1.1124799251556396, "learning_rate": 0.0002, "epoch": 5.6713281060544825, "step": 35080}, {"loss": 0.5578, "grad_norm": 1.2351475954055786, "learning_rate": 0.0002, "epoch": 5.672944790235228, "step": 35090}, {"loss": 0.5638, "grad_norm": 1.0240728855133057, "learning_rate": 0.0002, "epoch": 5.674561474415973, "step": 35100}, {"loss": 0.5192, "grad_norm": 1.0223692655563354, "learning_rate": 0.0002, "epoch": 5.676178158596718, "step": 35110}, {"loss": 0.524, "grad_norm": 1.4569132328033447, "learning_rate": 0.0002, "epoch": 5.677794842777463, "step": 35120}, {"loss": 0.555, "grad_norm": 0.8983587026596069, "learning_rate": 0.0002, "epoch": 5.679411526958209, "step": 35130}, {"loss": 0.5439, "grad_norm": 1.0775383710861206, "learning_rate": 0.0002, "epoch": 5.681028211138954, "step": 35140}, {"loss": 0.5289, "grad_norm": 0.9800270795822144, "learning_rate": 0.0002, "epoch": 5.682644895319699, "step": 35150}, {"loss": 0.533, "grad_norm": 0.9858237504959106, "learning_rate": 0.0002, "epoch": 5.684261579500444, "step": 35160}, {"loss": 0.5671, "grad_norm": 1.031087040901184, "learning_rate": 0.0002, "epoch": 5.6858782636811895, "step": 35170}, {"loss": 0.5528, "grad_norm": 1.0294365882873535, "learning_rate": 0.0002, "epoch": 5.687494947861936, "step": 35180}, {"loss": 0.5581, "grad_norm": 1.108144760131836, "learning_rate": 0.0002, "epoch": 5.68911163204268, "step": 35190}, {"loss": 0.5373, "grad_norm": 1.0813100337982178, "learning_rate": 0.0002, "epoch": 5.690728316223426, "step": 35200}, {"loss": 0.5429, "grad_norm": 1.3146867752075195, "learning_rate": 0.0002, "epoch": 5.692345000404171, "step": 35210}, {"loss": 0.5297, "grad_norm": 1.16780424118042, "learning_rate": 0.0002, "epoch": 5.693961684584917, "step": 35220}, {"loss": 0.577, "grad_norm": 0.9929125905036926, "learning_rate": 0.0002, "epoch": 5.695578368765662, "step": 35230}, {"loss": 0.5441, "grad_norm": 0.9049441814422607, "learning_rate": 0.0002, "epoch": 5.697195052946407, "step": 35240}, {"loss": 0.5349, "grad_norm": 0.9768866300582886, "learning_rate": 0.0002, "epoch": 5.698811737127152, "step": 35250}, {"loss": 0.542, "grad_norm": 0.8306029438972473, "learning_rate": 0.0002, "epoch": 5.7004284213078975, "step": 35260}, {"loss": 0.4771, "grad_norm": 0.8417280316352844, "learning_rate": 0.0002, "epoch": 5.702045105488643, "step": 35270}, {"loss": 0.574, "grad_norm": 0.9954485893249512, "learning_rate": 0.0002, "epoch": 5.703661789669388, "step": 35280}, {"loss": 0.5469, "grad_norm": 1.2417993545532227, "learning_rate": 0.0002, "epoch": 5.705278473850133, "step": 35290}, {"loss": 0.5275, "grad_norm": 1.1696544885635376, "learning_rate": 0.0002, "epoch": 5.706895158030878, "step": 35300}, {"loss": 0.5188, "grad_norm": 1.2424817085266113, "learning_rate": 0.0002, "epoch": 5.708511842211624, "step": 35310}, {"loss": 0.5595, "grad_norm": 1.1791106462478638, "learning_rate": 0.0002, "epoch": 5.710128526392369, "step": 35320}, {"loss": 0.5076, "grad_norm": 1.202181339263916, "learning_rate": 0.0002, "epoch": 5.711745210573115, "step": 35330}, {"loss": 0.5847, "grad_norm": 1.1006861925125122, "learning_rate": 0.0002, "epoch": 5.713361894753859, "step": 35340}, {"loss": 0.5627, "grad_norm": 1.0918344259262085, "learning_rate": 0.0002, "epoch": 5.7149785789346055, "step": 35350}, {"loss": 0.5677, "grad_norm": 1.0427305698394775, "learning_rate": 0.0002, "epoch": 5.716595263115351, "step": 35360}, {"loss": 0.5288, "grad_norm": 1.0818872451782227, "learning_rate": 0.0002, "epoch": 5.718211947296096, "step": 35370}, {"loss": 0.5296, "grad_norm": 1.186006784439087, "learning_rate": 0.0002, "epoch": 5.719828631476841, "step": 35380}, {"loss": 0.5507, "grad_norm": 1.2073674201965332, "learning_rate": 0.0002, "epoch": 5.721445315657586, "step": 35390}, {"loss": 0.5483, "grad_norm": 1.065338134765625, "learning_rate": 0.0002, "epoch": 5.723061999838332, "step": 35400}, {"loss": 0.5195, "grad_norm": 0.9448973536491394, "learning_rate": 0.0002, "epoch": 5.724678684019077, "step": 35410}, {"loss": 0.5276, "grad_norm": 1.1487499475479126, "learning_rate": 0.0002, "epoch": 5.726295368199822, "step": 35420}, {"loss": 0.5435, "grad_norm": 1.1334216594696045, "learning_rate": 0.0002, "epoch": 5.727912052380567, "step": 35430}, {"loss": 0.5074, "grad_norm": 1.1932826042175293, "learning_rate": 0.0002, "epoch": 5.729528736561313, "step": 35440}, {"loss": 0.5502, "grad_norm": 1.2615786790847778, "learning_rate": 0.0002, "epoch": 5.731145420742058, "step": 35450}, {"loss": 0.5612, "grad_norm": 1.2803694009780884, "learning_rate": 0.0002, "epoch": 5.732762104922803, "step": 35460}, {"loss": 0.5458, "grad_norm": 0.9271906614303589, "learning_rate": 0.0002, "epoch": 5.734378789103548, "step": 35470}, {"loss": 0.5342, "grad_norm": 1.0958917140960693, "learning_rate": 0.0002, "epoch": 5.735995473284294, "step": 35480}, {"loss": 0.538, "grad_norm": 1.1072784662246704, "learning_rate": 0.0002, "epoch": 5.737612157465039, "step": 35490}, {"loss": 0.5683, "grad_norm": 1.1641002893447876, "learning_rate": 0.0002, "epoch": 5.739228841645785, "step": 35500}, {"loss": 0.5252, "grad_norm": 1.0246447324752808, "learning_rate": 0.0002, "epoch": 5.74084552582653, "step": 35510}, {"loss": 0.55, "grad_norm": 1.032474398612976, "learning_rate": 0.0002, "epoch": 5.742462210007275, "step": 35520}, {"loss": 0.4965, "grad_norm": 1.1600854396820068, "learning_rate": 0.0002, "epoch": 5.7440788941880205, "step": 35530}, {"loss": 0.5543, "grad_norm": 1.0686054229736328, "learning_rate": 0.0002, "epoch": 5.745695578368766, "step": 35540}, {"loss": 0.5706, "grad_norm": 1.2314637899398804, "learning_rate": 0.0002, "epoch": 5.747312262549511, "step": 35550}, {"loss": 0.5492, "grad_norm": 0.922134280204773, "learning_rate": 0.0002, "epoch": 5.748928946730256, "step": 35560}, {"loss": 0.5495, "grad_norm": 0.933043360710144, "learning_rate": 0.0002, "epoch": 5.7505456309110015, "step": 35570}, {"loss": 0.5007, "grad_norm": 1.1911931037902832, "learning_rate": 0.0002, "epoch": 5.752162315091747, "step": 35580}, {"loss": 0.5244, "grad_norm": 0.8984857797622681, "learning_rate": 0.0002, "epoch": 5.753778999272492, "step": 35590}, {"loss": 0.5493, "grad_norm": 0.9495107531547546, "learning_rate": 0.0002, "epoch": 5.755395683453237, "step": 35600}, {"loss": 0.5326, "grad_norm": 1.2805472612380981, "learning_rate": 0.0002, "epoch": 5.757012367633982, "step": 35610}, {"loss": 0.5276, "grad_norm": 1.1236625909805298, "learning_rate": 0.0002, "epoch": 5.758629051814728, "step": 35620}, {"loss": 0.6102, "grad_norm": 1.0552798509597778, "learning_rate": 0.0002, "epoch": 5.760245735995474, "step": 35630}, {"loss": 0.5479, "grad_norm": 1.119909644126892, "learning_rate": 0.0002, "epoch": 5.761862420176218, "step": 35640}, {"loss": 0.5282, "grad_norm": 0.8786116242408752, "learning_rate": 0.0002, "epoch": 5.763479104356964, "step": 35650}, {"loss": 0.5406, "grad_norm": 1.2417117357254028, "learning_rate": 0.0002, "epoch": 5.765095788537709, "step": 35660}, {"loss": 0.537, "grad_norm": 1.255200982093811, "learning_rate": 0.0002, "epoch": 5.766712472718455, "step": 35670}, {"loss": 0.5308, "grad_norm": 1.0611358880996704, "learning_rate": 0.0002, "epoch": 5.7683291568992, "step": 35680}, {"loss": 0.5614, "grad_norm": 1.1443911790847778, "learning_rate": 0.0002, "epoch": 5.769945841079945, "step": 35690}, {"loss": 0.5386, "grad_norm": 1.1437989473342896, "learning_rate": 0.0002, "epoch": 5.77156252526069, "step": 35700}, {"loss": 0.537, "grad_norm": 1.1375046968460083, "learning_rate": 0.0002, "epoch": 5.773179209441436, "step": 35710}, {"loss": 0.5198, "grad_norm": 1.0777729749679565, "learning_rate": 0.0002, "epoch": 5.774795893622181, "step": 35720}, {"loss": 0.5521, "grad_norm": 1.1160215139389038, "learning_rate": 0.0002, "epoch": 5.776412577802926, "step": 35730}, {"loss": 0.5569, "grad_norm": 1.1268514394760132, "learning_rate": 0.0002, "epoch": 5.778029261983671, "step": 35740}, {"loss": 0.5311, "grad_norm": 1.2752262353897095, "learning_rate": 0.0002, "epoch": 5.7796459461644165, "step": 35750}, {"loss": 0.5625, "grad_norm": 1.0416184663772583, "learning_rate": 0.0002, "epoch": 5.781262630345162, "step": 35760}, {"loss": 0.5438, "grad_norm": 1.0622444152832031, "learning_rate": 0.0002, "epoch": 5.782879314525907, "step": 35770}, {"loss": 0.5268, "grad_norm": 1.1217877864837646, "learning_rate": 0.0002, "epoch": 5.784495998706653, "step": 35780}, {"loss": 0.5225, "grad_norm": 0.9363139867782593, "learning_rate": 0.0002, "epoch": 5.786112682887398, "step": 35790}, {"loss": 0.5524, "grad_norm": 0.96628737449646, "learning_rate": 0.0002, "epoch": 5.787729367068144, "step": 35800}, {"loss": 0.52, "grad_norm": 0.9572572112083435, "learning_rate": 0.0002, "epoch": 5.789346051248889, "step": 35810}, {"loss": 0.5615, "grad_norm": 0.938724935054779, "learning_rate": 0.0002, "epoch": 5.790962735429634, "step": 35820}, {"loss": 0.5391, "grad_norm": 1.3314417600631714, "learning_rate": 0.0002, "epoch": 5.792579419610379, "step": 35830}, {"loss": 0.5441, "grad_norm": 1.0097602605819702, "learning_rate": 0.0002, "epoch": 5.7941961037911245, "step": 35840}, {"loss": 0.591, "grad_norm": 1.1265122890472412, "learning_rate": 0.0002, "epoch": 5.79581278797187, "step": 35850}, {"loss": 0.5333, "grad_norm": 1.2191909551620483, "learning_rate": 0.0002, "epoch": 5.797429472152615, "step": 35860}, {"loss": 0.5274, "grad_norm": 0.9690808057785034, "learning_rate": 0.0002, "epoch": 5.79904615633336, "step": 35870}, {"loss": 0.5425, "grad_norm": 1.0871665477752686, "learning_rate": 0.0002, "epoch": 5.800662840514105, "step": 35880}, {"loss": 0.5602, "grad_norm": 1.1093597412109375, "learning_rate": 0.0002, "epoch": 5.802279524694851, "step": 35890}, {"loss": 0.5475, "grad_norm": 1.2434282302856445, "learning_rate": 0.0002, "epoch": 5.803896208875596, "step": 35900}, {"loss": 0.5288, "grad_norm": 1.2933623790740967, "learning_rate": 0.0002, "epoch": 5.805512893056341, "step": 35910}, {"loss": 0.5554, "grad_norm": 1.0005441904067993, "learning_rate": 0.0002, "epoch": 5.807129577237086, "step": 35920}, {"loss": 0.5318, "grad_norm": 1.2373108863830566, "learning_rate": 0.0002, "epoch": 5.8087462614178325, "step": 35930}, {"loss": 0.5413, "grad_norm": 1.2622692584991455, "learning_rate": 0.0002, "epoch": 5.810362945598578, "step": 35940}, {"loss": 0.5558, "grad_norm": 1.0112963914871216, "learning_rate": 0.0002, "epoch": 5.811979629779323, "step": 35950}, {"loss": 0.5115, "grad_norm": 1.050572395324707, "learning_rate": 0.0002, "epoch": 5.813596313960068, "step": 35960}, {"loss": 0.5288, "grad_norm": 0.9774560928344727, "learning_rate": 0.0002, "epoch": 5.815212998140813, "step": 35970}, {"loss": 0.585, "grad_norm": 1.19438898563385, "learning_rate": 0.0002, "epoch": 5.816829682321559, "step": 35980}, {"loss": 0.5798, "grad_norm": 1.0267130136489868, "learning_rate": 0.0002, "epoch": 5.818446366502304, "step": 35990}, {"loss": 0.5126, "grad_norm": 0.9813851714134216, "learning_rate": 0.0002, "epoch": 5.820063050683049, "step": 36000}, {"loss": 0.5138, "grad_norm": 0.9177457094192505, "learning_rate": 0.0002, "epoch": 5.821679734863794, "step": 36010}, {"loss": 0.5453, "grad_norm": 1.0020731687545776, "learning_rate": 0.0002, "epoch": 5.8232964190445395, "step": 36020}, {"loss": 0.5646, "grad_norm": 1.073222041130066, "learning_rate": 0.0002, "epoch": 5.824913103225285, "step": 36030}, {"loss": 0.5539, "grad_norm": 1.016337513923645, "learning_rate": 0.0002, "epoch": 5.82652978740603, "step": 36040}, {"loss": 0.5592, "grad_norm": 1.267364263534546, "learning_rate": 0.0002, "epoch": 5.828146471586775, "step": 36050}, {"loss": 0.595, "grad_norm": 1.2730127573013306, "learning_rate": 0.0002, "epoch": 5.8297631557675205, "step": 36060}, {"loss": 0.5247, "grad_norm": 1.108442783355713, "learning_rate": 0.0002, "epoch": 5.831379839948266, "step": 36070}, {"loss": 0.5103, "grad_norm": 1.198072075843811, "learning_rate": 0.0002, "epoch": 5.832996524129012, "step": 36080}, {"loss": 0.5479, "grad_norm": 1.0458786487579346, "learning_rate": 0.0002, "epoch": 5.834613208309757, "step": 36090}, {"loss": 0.5564, "grad_norm": 0.9096664786338806, "learning_rate": 0.0002, "epoch": 5.836229892490502, "step": 36100}, {"loss": 0.5602, "grad_norm": 0.9957793951034546, "learning_rate": 0.0002, "epoch": 5.8378465766712475, "step": 36110}, {"loss": 0.5799, "grad_norm": 1.3693058490753174, "learning_rate": 0.0002, "epoch": 5.839463260851993, "step": 36120}, {"loss": 0.5425, "grad_norm": 1.268608808517456, "learning_rate": 0.0002, "epoch": 5.841079945032738, "step": 36130}, {"loss": 0.5653, "grad_norm": 0.8516020178794861, "learning_rate": 0.0002, "epoch": 5.842696629213483, "step": 36140}, {"loss": 0.5475, "grad_norm": 0.90385502576828, "learning_rate": 0.0002, "epoch": 5.844313313394228, "step": 36150}, {"loss": 0.5274, "grad_norm": 1.0910571813583374, "learning_rate": 0.0002, "epoch": 5.845929997574974, "step": 36160}, {"loss": 0.555, "grad_norm": 0.9417795538902283, "learning_rate": 0.0002, "epoch": 5.847546681755719, "step": 36170}, {"loss": 0.5784, "grad_norm": 1.0027360916137695, "learning_rate": 0.0002, "epoch": 5.849163365936464, "step": 36180}, {"loss": 0.5423, "grad_norm": 1.1480516195297241, "learning_rate": 0.0002, "epoch": 5.850780050117209, "step": 36190}, {"loss": 0.5517, "grad_norm": 1.2431457042694092, "learning_rate": 0.0002, "epoch": 5.852396734297955, "step": 36200}, {"loss": 0.5404, "grad_norm": 1.091465950012207, "learning_rate": 0.0002, "epoch": 5.8540134184787, "step": 36210}, {"loss": 0.53, "grad_norm": 0.9693930745124817, "learning_rate": 0.0002, "epoch": 5.855630102659445, "step": 36220}, {"loss": 0.5453, "grad_norm": 0.9937465190887451, "learning_rate": 0.0002, "epoch": 5.857246786840191, "step": 36230}, {"loss": 0.5621, "grad_norm": 1.0731011629104614, "learning_rate": 0.0002, "epoch": 5.858863471020936, "step": 36240}, {"loss": 0.5687, "grad_norm": 1.0869048833847046, "learning_rate": 0.0002, "epoch": 5.860480155201682, "step": 36250}, {"loss": 0.5576, "grad_norm": 0.9226390719413757, "learning_rate": 0.0002, "epoch": 5.862096839382427, "step": 36260}, {"loss": 0.531, "grad_norm": 1.1755430698394775, "learning_rate": 0.0002, "epoch": 5.863713523563172, "step": 36270}, {"loss": 0.558, "grad_norm": 0.8815974593162537, "learning_rate": 0.0002, "epoch": 5.865330207743917, "step": 36280}, {"loss": 0.5065, "grad_norm": 1.3648751974105835, "learning_rate": 0.0002, "epoch": 5.866946891924663, "step": 36290}, {"loss": 0.536, "grad_norm": 0.8729211091995239, "learning_rate": 0.0002, "epoch": 5.868563576105408, "step": 36300}, {"loss": 0.5192, "grad_norm": 1.0870907306671143, "learning_rate": 0.0002, "epoch": 5.870180260286153, "step": 36310}, {"loss": 0.5609, "grad_norm": 1.1164259910583496, "learning_rate": 0.0002, "epoch": 5.871796944466898, "step": 36320}, {"loss": 0.551, "grad_norm": 1.1572535037994385, "learning_rate": 0.0002, "epoch": 5.8734136286476435, "step": 36330}, {"loss": 0.5898, "grad_norm": 1.0456238985061646, "learning_rate": 0.0002, "epoch": 5.875030312828389, "step": 36340}, {"loss": 0.5008, "grad_norm": 1.1310722827911377, "learning_rate": 0.0002, "epoch": 5.876646997009134, "step": 36350}, {"loss": 0.5352, "grad_norm": 1.0004712343215942, "learning_rate": 0.0002, "epoch": 5.878263681189879, "step": 36360}, {"loss": 0.5632, "grad_norm": 1.0991777181625366, "learning_rate": 0.0002, "epoch": 5.879880365370624, "step": 36370}, {"loss": 0.5815, "grad_norm": 1.2789239883422852, "learning_rate": 0.0002, "epoch": 5.8814970495513705, "step": 36380}, {"loss": 0.56, "grad_norm": 0.9524819850921631, "learning_rate": 0.0002, "epoch": 5.883113733732116, "step": 36390}, {"loss": 0.5701, "grad_norm": 1.1115771532058716, "learning_rate": 0.0002, "epoch": 5.884730417912861, "step": 36400}, {"loss": 0.5463, "grad_norm": 1.37419855594635, "learning_rate": 0.0002, "epoch": 5.886347102093606, "step": 36410}, {"loss": 0.5675, "grad_norm": 1.1449527740478516, "learning_rate": 0.0002, "epoch": 5.8879637862743515, "step": 36420}, {"loss": 0.5255, "grad_norm": 1.198046326637268, "learning_rate": 0.0002, "epoch": 5.889580470455097, "step": 36430}, {"loss": 0.5383, "grad_norm": 1.0180530548095703, "learning_rate": 0.0002, "epoch": 5.891197154635842, "step": 36440}, {"loss": 0.5319, "grad_norm": 1.0516417026519775, "learning_rate": 0.0002, "epoch": 5.892813838816587, "step": 36450}, {"loss": 0.5782, "grad_norm": 1.1658052206039429, "learning_rate": 0.0002, "epoch": 5.894430522997332, "step": 36460}, {"loss": 0.5864, "grad_norm": 1.190699577331543, "learning_rate": 0.0002, "epoch": 5.896047207178078, "step": 36470}, {"loss": 0.5451, "grad_norm": 1.1235495805740356, "learning_rate": 0.0002, "epoch": 5.897663891358823, "step": 36480}, {"loss": 0.5284, "grad_norm": 1.1926926374435425, "learning_rate": 0.0002, "epoch": 5.899280575539568, "step": 36490}, {"loss": 0.5686, "grad_norm": 1.1184662580490112, "learning_rate": 0.0002, "epoch": 5.900897259720313, "step": 36500}, {"loss": 0.5147, "grad_norm": 1.000970721244812, "learning_rate": 0.0002, "epoch": 5.9025139439010585, "step": 36510}, {"loss": 0.5351, "grad_norm": 1.0373306274414062, "learning_rate": 0.0002, "epoch": 5.904130628081804, "step": 36520}, {"loss": 0.535, "grad_norm": 1.0840669870376587, "learning_rate": 0.0002, "epoch": 5.90574731226255, "step": 36530}, {"loss": 0.538, "grad_norm": 0.9908381104469299, "learning_rate": 0.0002, "epoch": 5.907363996443295, "step": 36540}, {"loss": 0.5313, "grad_norm": 1.0456029176712036, "learning_rate": 0.0002, "epoch": 5.90898068062404, "step": 36550}, {"loss": 0.5693, "grad_norm": 1.1381454467773438, "learning_rate": 0.0002, "epoch": 5.910597364804786, "step": 36560}, {"loss": 0.5473, "grad_norm": 0.9440900087356567, "learning_rate": 0.0002, "epoch": 5.912214048985531, "step": 36570}, {"loss": 0.5542, "grad_norm": 1.1674573421478271, "learning_rate": 0.0002, "epoch": 5.913830733166276, "step": 36580}, {"loss": 0.526, "grad_norm": 1.1226966381072998, "learning_rate": 0.0002, "epoch": 5.915447417347021, "step": 36590}, {"loss": 0.6091, "grad_norm": 0.9696915745735168, "learning_rate": 0.0002, "epoch": 5.9170641015277665, "step": 36600}, {"loss": 0.5523, "grad_norm": 0.9593005180358887, "learning_rate": 0.0002, "epoch": 5.918680785708512, "step": 36610}, {"loss": 0.5536, "grad_norm": 1.122169852256775, "learning_rate": 0.0002, "epoch": 5.920297469889257, "step": 36620}, {"loss": 0.5039, "grad_norm": 0.9923415780067444, "learning_rate": 0.0002, "epoch": 5.921914154070002, "step": 36630}, {"loss": 0.5893, "grad_norm": 1.063838005065918, "learning_rate": 0.0002, "epoch": 5.923530838250747, "step": 36640}, {"loss": 0.5799, "grad_norm": 0.9083505272865295, "learning_rate": 0.0002, "epoch": 5.925147522431493, "step": 36650}, {"loss": 0.5264, "grad_norm": 0.9439437985420227, "learning_rate": 0.0002, "epoch": 5.926764206612239, "step": 36660}, {"loss": 0.5891, "grad_norm": 0.9778534173965454, "learning_rate": 0.0002, "epoch": 5.928380890792983, "step": 36670}, {"loss": 0.566, "grad_norm": 0.9723961353302002, "learning_rate": 0.0002, "epoch": 5.929997574973729, "step": 36680}, {"loss": 0.5741, "grad_norm": 1.162333607673645, "learning_rate": 0.0002, "epoch": 5.9316142591544745, "step": 36690}, {"loss": 0.5771, "grad_norm": 1.2784897089004517, "learning_rate": 0.0002, "epoch": 5.93323094333522, "step": 36700}, {"loss": 0.5343, "grad_norm": 1.0924867391586304, "learning_rate": 0.0002, "epoch": 5.934847627515965, "step": 36710}, {"loss": 0.5554, "grad_norm": 1.046922206878662, "learning_rate": 0.0002, "epoch": 5.93646431169671, "step": 36720}, {"loss": 0.5476, "grad_norm": 0.8632535338401794, "learning_rate": 0.0002, "epoch": 5.938080995877455, "step": 36730}, {"loss": 0.5456, "grad_norm": 1.358762502670288, "learning_rate": 0.0002, "epoch": 5.939697680058201, "step": 36740}, {"loss": 0.551, "grad_norm": 1.2058624029159546, "learning_rate": 0.0002, "epoch": 5.941314364238946, "step": 36750}, {"loss": 0.5462, "grad_norm": 1.1396408081054688, "learning_rate": 0.0002, "epoch": 5.942931048419691, "step": 36760}, {"loss": 0.5483, "grad_norm": 1.1510354280471802, "learning_rate": 0.0002, "epoch": 5.944547732600436, "step": 36770}, {"loss": 0.5659, "grad_norm": 1.1401607990264893, "learning_rate": 0.0002, "epoch": 5.946164416781182, "step": 36780}, {"loss": 0.5557, "grad_norm": 1.1871325969696045, "learning_rate": 0.0002, "epoch": 5.947781100961927, "step": 36790}, {"loss": 0.4945, "grad_norm": 0.9928333163261414, "learning_rate": 0.0002, "epoch": 5.949397785142672, "step": 36800}, {"loss": 0.5303, "grad_norm": 1.0549445152282715, "learning_rate": 0.0002, "epoch": 5.951014469323418, "step": 36810}, {"loss": 0.5532, "grad_norm": 0.9791563749313354, "learning_rate": 0.0002, "epoch": 5.9526311535041625, "step": 36820}, {"loss": 0.5317, "grad_norm": 1.1268441677093506, "learning_rate": 0.0002, "epoch": 5.954247837684909, "step": 36830}, {"loss": 0.5585, "grad_norm": 1.0533992052078247, "learning_rate": 0.0002, "epoch": 5.955864521865654, "step": 36840}, {"loss": 0.4972, "grad_norm": 1.023358941078186, "learning_rate": 0.0002, "epoch": 5.957481206046399, "step": 36850}, {"loss": 0.5557, "grad_norm": 1.2631961107254028, "learning_rate": 0.0002, "epoch": 5.959097890227144, "step": 36860}, {"loss": 0.5662, "grad_norm": 0.9397698640823364, "learning_rate": 0.0002, "epoch": 5.9607145744078895, "step": 36870}, {"loss": 0.5775, "grad_norm": 1.1678427457809448, "learning_rate": 0.0002, "epoch": 5.962331258588635, "step": 36880}, {"loss": 0.5435, "grad_norm": 1.1403759717941284, "learning_rate": 0.0002, "epoch": 5.96394794276938, "step": 36890}, {"loss": 0.5479, "grad_norm": 1.030572772026062, "learning_rate": 0.0002, "epoch": 5.965564626950125, "step": 36900}, {"loss": 0.5838, "grad_norm": 1.0992497205734253, "learning_rate": 0.0002, "epoch": 5.9671813111308705, "step": 36910}, {"loss": 0.5452, "grad_norm": 1.075466275215149, "learning_rate": 0.0002, "epoch": 5.968797995311616, "step": 36920}, {"loss": 0.5739, "grad_norm": 1.0153694152832031, "learning_rate": 0.0002, "epoch": 5.970414679492361, "step": 36930}, {"loss": 0.5672, "grad_norm": 0.973193883895874, "learning_rate": 0.0002, "epoch": 5.972031363673106, "step": 36940}, {"loss": 0.5585, "grad_norm": 0.8294678926467896, "learning_rate": 0.0002, "epoch": 5.973648047853851, "step": 36950}, {"loss": 0.5631, "grad_norm": 1.0048716068267822, "learning_rate": 0.0002, "epoch": 5.9752647320345975, "step": 36960}, {"loss": 0.5471, "grad_norm": 0.9714070558547974, "learning_rate": 0.0002, "epoch": 5.976881416215342, "step": 36970}, {"loss": 0.5419, "grad_norm": 0.8667682409286499, "learning_rate": 0.0002, "epoch": 5.978498100396088, "step": 36980}, {"loss": 0.5474, "grad_norm": 1.0461409091949463, "learning_rate": 0.0002, "epoch": 5.980114784576833, "step": 36990}, {"loss": 0.5454, "grad_norm": 0.9229754209518433, "learning_rate": 0.0002, "epoch": 5.981731468757578, "step": 37000}, {"loss": 0.5599, "grad_norm": 1.0406876802444458, "learning_rate": 0.0002, "epoch": 5.983348152938324, "step": 37010}, {"loss": 0.5569, "grad_norm": 0.8993828296661377, "learning_rate": 0.0002, "epoch": 5.984964837119069, "step": 37020}, {"loss": 0.5611, "grad_norm": 1.2260479927062988, "learning_rate": 0.0002, "epoch": 5.986581521299814, "step": 37030}, {"loss": 0.5523, "grad_norm": 1.0107380151748657, "learning_rate": 0.0002, "epoch": 5.988198205480559, "step": 37040}, {"loss": 0.5639, "grad_norm": 1.0240139961242676, "learning_rate": 0.0002, "epoch": 5.989814889661305, "step": 37050}, {"loss": 0.5209, "grad_norm": 1.0185275077819824, "learning_rate": 0.0002, "epoch": 5.99143157384205, "step": 37060}, {"loss": 0.5114, "grad_norm": 1.1361802816390991, "learning_rate": 0.0002, "epoch": 5.993048258022795, "step": 37070}, {"loss": 0.5692, "grad_norm": 1.0395532846450806, "learning_rate": 0.0002, "epoch": 5.99466494220354, "step": 37080}, {"loss": 0.594, "grad_norm": 0.9463558197021484, "learning_rate": 0.0002, "epoch": 5.9962816263842855, "step": 37090}, {"loss": 0.5775, "grad_norm": 1.2066948413848877, "learning_rate": 0.0002, "epoch": 5.997898310565031, "step": 37100}, {"loss": 0.5356, "grad_norm": 0.9749386310577393, "learning_rate": 0.0002, "epoch": 5.999514994745777, "step": 37110}, {"eval_loss": 1.2270219326019287, "eval_runtime": 122.2047, "eval_samples_per_second": 5.998, "eval_steps_per_second": 0.753, "epoch": 6.0, "step": 37113}, {"loss": 0.4855, "grad_norm": 0.9641092419624329, "learning_rate": 0.0002, "epoch": 6.001131678926522, "step": 37120}, {"loss": 0.4112, "grad_norm": 1.103379249572754, "learning_rate": 0.0002, "epoch": 6.002748363107267, "step": 37130}, {"loss": 0.4577, "grad_norm": 0.8381665349006653, "learning_rate": 0.0002, "epoch": 6.004365047288013, "step": 37140}, {"loss": 0.4794, "grad_norm": 1.245323896408081, "learning_rate": 0.0002, "epoch": 6.005981731468758, "step": 37150}, {"loss": 0.4503, "grad_norm": 1.3140289783477783, "learning_rate": 0.0002, "epoch": 6.007598415649503, "step": 37160}, {"loss": 0.4456, "grad_norm": 0.8479695916175842, "learning_rate": 0.0002, "epoch": 6.009215099830248, "step": 37170}, {"loss": 0.4573, "grad_norm": 0.8841437101364136, "learning_rate": 0.0002, "epoch": 6.0108317840109935, "step": 37180}, {"loss": 0.4565, "grad_norm": 0.8900154829025269, "learning_rate": 0.0002, "epoch": 6.012448468191739, "step": 37190}, {"loss": 0.457, "grad_norm": 1.2753345966339111, "learning_rate": 0.0002, "epoch": 6.014065152372484, "step": 37200}, {"loss": 0.4365, "grad_norm": 1.4625498056411743, "learning_rate": 0.0002, "epoch": 6.015681836553229, "step": 37210}, {"loss": 0.4252, "grad_norm": 0.7455034852027893, "learning_rate": 0.0002, "epoch": 6.017298520733974, "step": 37220}, {"loss": 0.4433, "grad_norm": 1.1658862829208374, "learning_rate": 0.0002, "epoch": 6.01891520491472, "step": 37230}, {"loss": 0.4499, "grad_norm": 0.9785751104354858, "learning_rate": 0.0002, "epoch": 6.020531889095465, "step": 37240}, {"loss": 0.4956, "grad_norm": 1.3193122148513794, "learning_rate": 0.0002, "epoch": 6.02214857327621, "step": 37250}, {"loss": 0.4727, "grad_norm": 1.038273572921753, "learning_rate": 0.0002, "epoch": 6.023765257456955, "step": 37260}, {"loss": 0.4395, "grad_norm": 1.0550594329833984, "learning_rate": 0.0002, "epoch": 6.0253819416377015, "step": 37270}, {"loss": 0.4767, "grad_norm": 0.9745930433273315, "learning_rate": 0.0002, "epoch": 6.026998625818447, "step": 37280}, {"loss": 0.4233, "grad_norm": 0.9273530840873718, "learning_rate": 0.0002, "epoch": 6.028615309999192, "step": 37290}, {"loss": 0.4195, "grad_norm": 1.3844057321548462, "learning_rate": 0.0002, "epoch": 6.030231994179937, "step": 37300}, {"loss": 0.4768, "grad_norm": 1.2058762311935425, "learning_rate": 0.0002, "epoch": 6.031848678360682, "step": 37310}, {"loss": 0.4499, "grad_norm": 1.242663025856018, "learning_rate": 0.0002, "epoch": 6.033465362541428, "step": 37320}, {"loss": 0.4597, "grad_norm": 1.3504270315170288, "learning_rate": 0.0002, "epoch": 6.035082046722173, "step": 37330}, {"loss": 0.4402, "grad_norm": 0.8734912276268005, "learning_rate": 0.0002, "epoch": 6.036698730902918, "step": 37340}, {"loss": 0.477, "grad_norm": 1.0182311534881592, "learning_rate": 0.0002, "epoch": 6.038315415083663, "step": 37350}, {"loss": 0.4261, "grad_norm": 0.9898499846458435, "learning_rate": 0.0002, "epoch": 6.0399320992644085, "step": 37360}, {"loss": 0.4459, "grad_norm": 1.0637860298156738, "learning_rate": 0.0002, "epoch": 6.041548783445154, "step": 37370}, {"loss": 0.4958, "grad_norm": 1.0099523067474365, "learning_rate": 0.0002, "epoch": 6.043165467625899, "step": 37380}, {"loss": 0.4459, "grad_norm": 1.1080750226974487, "learning_rate": 0.0002, "epoch": 6.044782151806644, "step": 37390}, {"loss": 0.4473, "grad_norm": 1.2551289796829224, "learning_rate": 0.0002, "epoch": 6.0463988359873895, "step": 37400}, {"loss": 0.468, "grad_norm": 0.8959632515907288, "learning_rate": 0.0002, "epoch": 6.048015520168136, "step": 37410}, {"loss": 0.4255, "grad_norm": 1.1748892068862915, "learning_rate": 0.0002, "epoch": 6.049632204348881, "step": 37420}, {"loss": 0.4458, "grad_norm": 1.3122745752334595, "learning_rate": 0.0002, "epoch": 6.051248888529626, "step": 37430}, {"loss": 0.4676, "grad_norm": 1.0227985382080078, "learning_rate": 0.0002, "epoch": 6.052865572710371, "step": 37440}, {"loss": 0.4503, "grad_norm": 1.0380030870437622, "learning_rate": 0.0002, "epoch": 6.0544822568911165, "step": 37450}, {"loss": 0.4686, "grad_norm": 0.8919622898101807, "learning_rate": 0.0002, "epoch": 6.056098941071862, "step": 37460}, {"loss": 0.4406, "grad_norm": 1.4554150104522705, "learning_rate": 0.0002, "epoch": 6.057715625252607, "step": 37470}, {"loss": 0.4688, "grad_norm": 1.2853292226791382, "learning_rate": 0.0002, "epoch": 6.059332309433352, "step": 37480}, {"loss": 0.4489, "grad_norm": 1.2951840162277222, "learning_rate": 0.0002, "epoch": 6.0609489936140974, "step": 37490}, {"loss": 0.4819, "grad_norm": 1.1750973463058472, "learning_rate": 0.0002, "epoch": 6.062565677794843, "step": 37500}, {"loss": 0.4574, "grad_norm": 0.9328424334526062, "learning_rate": 0.0002, "epoch": 6.064182361975588, "step": 37510}, {"loss": 0.4597, "grad_norm": 1.0353537797927856, "learning_rate": 0.0002, "epoch": 6.065799046156333, "step": 37520}, {"loss": 0.4407, "grad_norm": 1.1594274044036865, "learning_rate": 0.0002, "epoch": 6.067415730337078, "step": 37530}, {"loss": 0.4642, "grad_norm": 0.9034168124198914, "learning_rate": 0.0002, "epoch": 6.069032414517824, "step": 37540}, {"loss": 0.4625, "grad_norm": 1.068617820739746, "learning_rate": 0.0002, "epoch": 6.070649098698569, "step": 37550}, {"loss": 0.4378, "grad_norm": 1.0931321382522583, "learning_rate": 0.0002, "epoch": 6.072265782879315, "step": 37560}, {"loss": 0.4527, "grad_norm": 1.2542688846588135, "learning_rate": 0.0002, "epoch": 6.07388246706006, "step": 37570}, {"loss": 0.4725, "grad_norm": 1.273384928703308, "learning_rate": 0.0002, "epoch": 6.075499151240805, "step": 37580}, {"loss": 0.4928, "grad_norm": 1.4771400690078735, "learning_rate": 0.0002, "epoch": 6.077115835421551, "step": 37590}, {"loss": 0.461, "grad_norm": 1.3751444816589355, "learning_rate": 0.0002, "epoch": 6.078732519602296, "step": 37600}, {"loss": 0.4602, "grad_norm": 1.4532550573349, "learning_rate": 0.0002, "epoch": 6.080349203783041, "step": 37610}, {"loss": 0.4428, "grad_norm": 1.3175991773605347, "learning_rate": 0.0002, "epoch": 6.081965887963786, "step": 37620}, {"loss": 0.4746, "grad_norm": 1.0624970197677612, "learning_rate": 0.0002, "epoch": 6.083582572144532, "step": 37630}, {"loss": 0.413, "grad_norm": 1.099715232849121, "learning_rate": 0.0002, "epoch": 6.085199256325277, "step": 37640}, {"loss": 0.4528, "grad_norm": 1.0380114316940308, "learning_rate": 0.0002, "epoch": 6.086815940506022, "step": 37650}, {"loss": 0.4373, "grad_norm": 1.1136109828948975, "learning_rate": 0.0002, "epoch": 6.088432624686767, "step": 37660}, {"loss": 0.4915, "grad_norm": 0.996498703956604, "learning_rate": 0.0002, "epoch": 6.0900493088675125, "step": 37670}, {"loss": 0.4713, "grad_norm": 1.0552574396133423, "learning_rate": 0.0002, "epoch": 6.091665993048258, "step": 37680}, {"loss": 0.4414, "grad_norm": 1.4108527898788452, "learning_rate": 0.0002, "epoch": 6.093282677229003, "step": 37690}, {"loss": 0.4851, "grad_norm": 1.1323093175888062, "learning_rate": 0.0002, "epoch": 6.094899361409748, "step": 37700}, {"loss": 0.4455, "grad_norm": 0.9364377856254578, "learning_rate": 0.0002, "epoch": 6.096516045590494, "step": 37710}, {"loss": 0.4791, "grad_norm": 1.1300561428070068, "learning_rate": 0.0002, "epoch": 6.0981327297712395, "step": 37720}, {"loss": 0.4539, "grad_norm": 1.0616047382354736, "learning_rate": 0.0002, "epoch": 6.099749413951985, "step": 37730}, {"loss": 0.4516, "grad_norm": 1.1205905675888062, "learning_rate": 0.0002, "epoch": 6.10136609813273, "step": 37740}, {"loss": 0.4688, "grad_norm": 0.9592534303665161, "learning_rate": 0.0002, "epoch": 6.102982782313475, "step": 37750}, {"loss": 0.4494, "grad_norm": 0.9797531962394714, "learning_rate": 0.0002, "epoch": 6.1045994664942205, "step": 37760}, {"loss": 0.4237, "grad_norm": 1.093404769897461, "learning_rate": 0.0002, "epoch": 6.106216150674966, "step": 37770}, {"loss": 0.4691, "grad_norm": 1.2172642946243286, "learning_rate": 0.0002, "epoch": 6.107832834855711, "step": 37780}, {"loss": 0.4398, "grad_norm": 1.0467255115509033, "learning_rate": 0.0002, "epoch": 6.109449519036456, "step": 37790}, {"loss": 0.4676, "grad_norm": 1.159318208694458, "learning_rate": 0.0002, "epoch": 6.111066203217201, "step": 37800}, {"loss": 0.4539, "grad_norm": 1.0615603923797607, "learning_rate": 0.0002, "epoch": 6.112682887397947, "step": 37810}, {"loss": 0.4957, "grad_norm": 1.0542045831680298, "learning_rate": 0.0002, "epoch": 6.114299571578692, "step": 37820}, {"loss": 0.4512, "grad_norm": 0.8962697982788086, "learning_rate": 0.0002, "epoch": 6.115916255759437, "step": 37830}, {"loss": 0.4519, "grad_norm": 1.106352686882019, "learning_rate": 0.0002, "epoch": 6.117532939940182, "step": 37840}, {"loss": 0.4421, "grad_norm": 1.1660276651382446, "learning_rate": 0.0002, "epoch": 6.1191496241209276, "step": 37850}, {"loss": 0.4701, "grad_norm": 1.3524385690689087, "learning_rate": 0.0002, "epoch": 6.120766308301674, "step": 37860}, {"loss": 0.4684, "grad_norm": 1.1056050062179565, "learning_rate": 0.0002, "epoch": 6.122382992482419, "step": 37870}, {"loss": 0.4518, "grad_norm": 1.0772725343704224, "learning_rate": 0.0002, "epoch": 6.123999676663164, "step": 37880}, {"loss": 0.4356, "grad_norm": 1.1011115312576294, "learning_rate": 0.0002, "epoch": 6.125616360843909, "step": 37890}, {"loss": 0.4909, "grad_norm": 0.8952536582946777, "learning_rate": 0.0002, "epoch": 6.127233045024655, "step": 37900}, {"loss": 0.4299, "grad_norm": 1.244398593902588, "learning_rate": 0.0002, "epoch": 6.1288497292054, "step": 37910}, {"loss": 0.4764, "grad_norm": 0.9658283591270447, "learning_rate": 0.0002, "epoch": 6.130466413386145, "step": 37920}, {"loss": 0.4378, "grad_norm": 1.0649068355560303, "learning_rate": 0.0002, "epoch": 6.13208309756689, "step": 37930}, {"loss": 0.4638, "grad_norm": 0.94698166847229, "learning_rate": 0.0002, "epoch": 6.1336997817476355, "step": 37940}, {"loss": 0.488, "grad_norm": 1.1450897455215454, "learning_rate": 0.0002, "epoch": 6.135316465928381, "step": 37950}, {"loss": 0.4791, "grad_norm": 1.032482624053955, "learning_rate": 0.0002, "epoch": 6.136933150109126, "step": 37960}, {"loss": 0.4179, "grad_norm": 1.0993428230285645, "learning_rate": 0.0002, "epoch": 6.138549834289871, "step": 37970}, {"loss": 0.4781, "grad_norm": 1.2907029390335083, "learning_rate": 0.0002, "epoch": 6.1401665184706165, "step": 37980}, {"loss": 0.4671, "grad_norm": 1.1007903814315796, "learning_rate": 0.0002, "epoch": 6.141783202651362, "step": 37990}, {"loss": 0.4213, "grad_norm": 0.9286124110221863, "learning_rate": 0.0002, "epoch": 6.143399886832107, "step": 38000}, {"loss": 0.4741, "grad_norm": 1.1426366567611694, "learning_rate": 0.0002, "epoch": 6.145016571012853, "step": 38010}, {"loss": 0.4746, "grad_norm": 1.2608287334442139, "learning_rate": 0.0002, "epoch": 6.146633255193598, "step": 38020}, {"loss": 0.454, "grad_norm": 1.1346837282180786, "learning_rate": 0.0002, "epoch": 6.1482499393743435, "step": 38030}, {"loss": 0.4469, "grad_norm": 1.144080400466919, "learning_rate": 0.0002, "epoch": 6.149866623555089, "step": 38040}, {"loss": 0.4515, "grad_norm": 1.3456705808639526, "learning_rate": 0.0002, "epoch": 6.151483307735834, "step": 38050}, {"loss": 0.4775, "grad_norm": 1.0517960786819458, "learning_rate": 0.0002, "epoch": 6.153099991916579, "step": 38060}, {"loss": 0.4986, "grad_norm": 1.1887445449829102, "learning_rate": 0.0002, "epoch": 6.154716676097324, "step": 38070}, {"loss": 0.4516, "grad_norm": 1.0449163913726807, "learning_rate": 0.0002, "epoch": 6.15633336027807, "step": 38080}, {"loss": 0.4808, "grad_norm": 1.3218743801116943, "learning_rate": 0.0002, "epoch": 6.157950044458815, "step": 38090}, {"loss": 0.4632, "grad_norm": 1.003208875656128, "learning_rate": 0.0002, "epoch": 6.15956672863956, "step": 38100}, {"loss": 0.4978, "grad_norm": 1.008623719215393, "learning_rate": 0.0002, "epoch": 6.161183412820305, "step": 38110}, {"loss": 0.4608, "grad_norm": 1.2122787237167358, "learning_rate": 0.0002, "epoch": 6.162800097001051, "step": 38120}, {"loss": 0.4666, "grad_norm": 1.253403902053833, "learning_rate": 0.0002, "epoch": 6.164416781181796, "step": 38130}, {"loss": 0.4778, "grad_norm": 1.2289724349975586, "learning_rate": 0.0002, "epoch": 6.166033465362541, "step": 38140}, {"loss": 0.4774, "grad_norm": 1.330694556236267, "learning_rate": 0.0002, "epoch": 6.167650149543286, "step": 38150}, {"loss": 0.4699, "grad_norm": 1.0946741104125977, "learning_rate": 0.0002, "epoch": 6.169266833724032, "step": 38160}, {"loss": 0.4816, "grad_norm": 1.0719934701919556, "learning_rate": 0.0002, "epoch": 6.170883517904778, "step": 38170}, {"loss": 0.4678, "grad_norm": 1.1142133474349976, "learning_rate": 0.0002, "epoch": 6.172500202085523, "step": 38180}, {"loss": 0.4911, "grad_norm": 1.1221938133239746, "learning_rate": 0.0002, "epoch": 6.174116886266268, "step": 38190}, {"loss": 0.4462, "grad_norm": 1.1391617059707642, "learning_rate": 0.0002, "epoch": 6.175733570447013, "step": 38200}, {"loss": 0.4867, "grad_norm": 1.2263455390930176, "learning_rate": 0.0002, "epoch": 6.1773502546277586, "step": 38210}, {"loss": 0.4633, "grad_norm": 1.0930434465408325, "learning_rate": 0.0002, "epoch": 6.178966938808504, "step": 38220}, {"loss": 0.4406, "grad_norm": 1.3489030599594116, "learning_rate": 0.0002, "epoch": 6.180583622989249, "step": 38230}, {"loss": 0.4994, "grad_norm": 1.1383486986160278, "learning_rate": 0.0002, "epoch": 6.182200307169994, "step": 38240}, {"loss": 0.4851, "grad_norm": 1.2408897876739502, "learning_rate": 0.0002, "epoch": 6.1838169913507395, "step": 38250}, {"loss": 0.4848, "grad_norm": 1.1436222791671753, "learning_rate": 0.0002, "epoch": 6.185433675531485, "step": 38260}, {"loss": 0.4594, "grad_norm": 1.370117425918579, "learning_rate": 0.0002, "epoch": 6.18705035971223, "step": 38270}, {"loss": 0.5023, "grad_norm": 0.8862423300743103, "learning_rate": 0.0002, "epoch": 6.188667043892975, "step": 38280}, {"loss": 0.4559, "grad_norm": 0.9603779315948486, "learning_rate": 0.0002, "epoch": 6.19028372807372, "step": 38290}, {"loss": 0.4835, "grad_norm": 1.389291524887085, "learning_rate": 0.0002, "epoch": 6.191900412254466, "step": 38300}, {"loss": 0.4435, "grad_norm": 1.0767031908035278, "learning_rate": 0.0002, "epoch": 6.193517096435212, "step": 38310}, {"loss": 0.4683, "grad_norm": 1.1800403594970703, "learning_rate": 0.0002, "epoch": 6.195133780615957, "step": 38320}, {"loss": 0.4608, "grad_norm": 0.997891366481781, "learning_rate": 0.0002, "epoch": 6.196750464796702, "step": 38330}, {"loss": 0.4575, "grad_norm": 1.1201492547988892, "learning_rate": 0.0002, "epoch": 6.1983671489774474, "step": 38340}, {"loss": 0.4952, "grad_norm": 0.9769026637077332, "learning_rate": 0.0002, "epoch": 6.199983833158193, "step": 38350}, {"loss": 0.4563, "grad_norm": 0.9447069764137268, "learning_rate": 0.0002, "epoch": 6.201600517338938, "step": 38360}, {"loss": 0.516, "grad_norm": 1.0959235429763794, "learning_rate": 0.0002, "epoch": 6.203217201519683, "step": 38370}, {"loss": 0.4688, "grad_norm": 1.2495406866073608, "learning_rate": 0.0002, "epoch": 6.204833885700428, "step": 38380}, {"loss": 0.4445, "grad_norm": 0.8589218258857727, "learning_rate": 0.0002, "epoch": 6.206450569881174, "step": 38390}, {"loss": 0.4808, "grad_norm": 0.959155797958374, "learning_rate": 0.0002, "epoch": 6.208067254061919, "step": 38400}, {"loss": 0.4622, "grad_norm": 1.0105533599853516, "learning_rate": 0.0002, "epoch": 6.209683938242664, "step": 38410}, {"loss": 0.4887, "grad_norm": 0.9824615120887756, "learning_rate": 0.0002, "epoch": 6.211300622423409, "step": 38420}, {"loss": 0.4656, "grad_norm": 0.8616500496864319, "learning_rate": 0.0002, "epoch": 6.2129173066041545, "step": 38430}, {"loss": 0.449, "grad_norm": 1.2917758226394653, "learning_rate": 0.0002, "epoch": 6.2145339907849, "step": 38440}, {"loss": 0.4201, "grad_norm": 1.0564531087875366, "learning_rate": 0.0002, "epoch": 6.216150674965646, "step": 38450}, {"loss": 0.4849, "grad_norm": 1.152331829071045, "learning_rate": 0.0002, "epoch": 6.217767359146391, "step": 38460}, {"loss": 0.4887, "grad_norm": 0.9152206778526306, "learning_rate": 0.0002, "epoch": 6.219384043327136, "step": 38470}, {"loss": 0.4686, "grad_norm": 0.9931167960166931, "learning_rate": 0.0002, "epoch": 6.221000727507882, "step": 38480}, {"loss": 0.4765, "grad_norm": 1.3248072862625122, "learning_rate": 0.0002, "epoch": 6.222617411688627, "step": 38490}, {"loss": 0.4636, "grad_norm": 1.3916507959365845, "learning_rate": 0.0002, "epoch": 6.224234095869372, "step": 38500}, {"loss": 0.506, "grad_norm": 1.1775140762329102, "learning_rate": 0.0002, "epoch": 6.225850780050117, "step": 38510}, {"loss": 0.47, "grad_norm": 1.1581059694290161, "learning_rate": 0.0002, "epoch": 6.2274674642308625, "step": 38520}, {"loss": 0.4679, "grad_norm": 1.359320878982544, "learning_rate": 0.0002, "epoch": 6.229084148411608, "step": 38530}, {"loss": 0.4697, "grad_norm": 1.185041904449463, "learning_rate": 0.0002, "epoch": 6.230700832592353, "step": 38540}, {"loss": 0.4815, "grad_norm": 1.1861097812652588, "learning_rate": 0.0002, "epoch": 6.232317516773098, "step": 38550}, {"loss": 0.4925, "grad_norm": 1.126990556716919, "learning_rate": 0.0002, "epoch": 6.233934200953843, "step": 38560}, {"loss": 0.4414, "grad_norm": 0.9744541049003601, "learning_rate": 0.0002, "epoch": 6.235550885134589, "step": 38570}, {"loss": 0.4577, "grad_norm": 1.1260887384414673, "learning_rate": 0.0002, "epoch": 6.237167569315334, "step": 38580}, {"loss": 0.4852, "grad_norm": 1.1290327310562134, "learning_rate": 0.0002, "epoch": 6.238784253496079, "step": 38590}, {"loss": 0.4805, "grad_norm": 1.0952879190444946, "learning_rate": 0.0002, "epoch": 6.240400937676825, "step": 38600}, {"loss": 0.4436, "grad_norm": 1.1037684679031372, "learning_rate": 0.0002, "epoch": 6.2420176218575705, "step": 38610}, {"loss": 0.466, "grad_norm": 1.1356085538864136, "learning_rate": 0.0002, "epoch": 6.243634306038316, "step": 38620}, {"loss": 0.5129, "grad_norm": 1.0677106380462646, "learning_rate": 0.0002, "epoch": 6.245250990219061, "step": 38630}, {"loss": 0.4907, "grad_norm": 1.1573411226272583, "learning_rate": 0.0002, "epoch": 6.246867674399806, "step": 38640}, {"loss": 0.5098, "grad_norm": 1.2707505226135254, "learning_rate": 0.0002, "epoch": 6.248484358580551, "step": 38650}, {"loss": 0.4926, "grad_norm": 1.0480109453201294, "learning_rate": 0.0002, "epoch": 6.250101042761297, "step": 38660}, {"loss": 0.4654, "grad_norm": 1.3668724298477173, "learning_rate": 0.0002, "epoch": 6.251717726942042, "step": 38670}, {"loss": 0.5128, "grad_norm": 1.217289686203003, "learning_rate": 0.0002, "epoch": 6.253334411122787, "step": 38680}, {"loss": 0.4621, "grad_norm": 1.2950236797332764, "learning_rate": 0.0002, "epoch": 6.254951095303532, "step": 38690}, {"loss": 0.5076, "grad_norm": 1.4506934881210327, "learning_rate": 0.0002, "epoch": 6.256567779484278, "step": 38700}, {"loss": 0.4803, "grad_norm": 1.1248667240142822, "learning_rate": 0.0002, "epoch": 6.258184463665023, "step": 38710}, {"loss": 0.4746, "grad_norm": 1.3384023904800415, "learning_rate": 0.0002, "epoch": 6.259801147845768, "step": 38720}, {"loss": 0.473, "grad_norm": 1.128074288368225, "learning_rate": 0.0002, "epoch": 6.261417832026513, "step": 38730}, {"loss": 0.4638, "grad_norm": 1.1169012784957886, "learning_rate": 0.0002, "epoch": 6.263034516207259, "step": 38740}, {"loss": 0.4747, "grad_norm": 1.195198893547058, "learning_rate": 0.0002, "epoch": 6.264651200388005, "step": 38750}, {"loss": 0.4906, "grad_norm": 1.2471518516540527, "learning_rate": 0.0002, "epoch": 6.26626788456875, "step": 38760}, {"loss": 0.4507, "grad_norm": 1.2646394968032837, "learning_rate": 0.0002, "epoch": 6.267884568749495, "step": 38770}, {"loss": 0.4934, "grad_norm": 1.0286450386047363, "learning_rate": 0.0002, "epoch": 6.26950125293024, "step": 38780}, {"loss": 0.4787, "grad_norm": 1.2440695762634277, "learning_rate": 0.0002, "epoch": 6.2711179371109855, "step": 38790}, {"loss": 0.4806, "grad_norm": 0.8941256403923035, "learning_rate": 0.0002, "epoch": 6.272734621291731, "step": 38800}, {"loss": 0.4741, "grad_norm": 1.0693447589874268, "learning_rate": 0.0002, "epoch": 6.274351305472476, "step": 38810}, {"loss": 0.4408, "grad_norm": 1.0936840772628784, "learning_rate": 0.0002, "epoch": 6.275967989653221, "step": 38820}, {"loss": 0.4729, "grad_norm": 1.0961874723434448, "learning_rate": 0.0002, "epoch": 6.2775846738339665, "step": 38830}, {"loss": 0.4504, "grad_norm": 1.1465433835983276, "learning_rate": 0.0002, "epoch": 6.279201358014712, "step": 38840}, {"loss": 0.4771, "grad_norm": 1.2987004518508911, "learning_rate": 0.0002, "epoch": 6.280818042195457, "step": 38850}, {"loss": 0.4945, "grad_norm": 1.1310304403305054, "learning_rate": 0.0002, "epoch": 6.282434726376202, "step": 38860}, {"loss": 0.5346, "grad_norm": 1.306538462638855, "learning_rate": 0.0002, "epoch": 6.284051410556947, "step": 38870}, {"loss": 0.4873, "grad_norm": 1.2405401468276978, "learning_rate": 0.0002, "epoch": 6.285668094737693, "step": 38880}, {"loss": 0.4929, "grad_norm": 1.0934767723083496, "learning_rate": 0.0002, "epoch": 6.287284778918439, "step": 38890}, {"loss": 0.4853, "grad_norm": 1.3370496034622192, "learning_rate": 0.0002, "epoch": 6.288901463099184, "step": 38900}, {"loss": 0.4892, "grad_norm": 1.0319404602050781, "learning_rate": 0.0002, "epoch": 6.290518147279929, "step": 38910}, {"loss": 0.4685, "grad_norm": 0.9734271168708801, "learning_rate": 0.0002, "epoch": 6.292134831460674, "step": 38920}, {"loss": 0.5085, "grad_norm": 1.0940454006195068, "learning_rate": 0.0002, "epoch": 6.29375151564142, "step": 38930}, {"loss": 0.4985, "grad_norm": 1.036500334739685, "learning_rate": 0.0002, "epoch": 6.295368199822165, "step": 38940}, {"loss": 0.4878, "grad_norm": 1.020308256149292, "learning_rate": 0.0002, "epoch": 6.29698488400291, "step": 38950}, {"loss": 0.4668, "grad_norm": 1.1416399478912354, "learning_rate": 0.0002, "epoch": 6.298601568183655, "step": 38960}, {"loss": 0.4727, "grad_norm": 1.2497479915618896, "learning_rate": 0.0002, "epoch": 6.300218252364401, "step": 38970}, {"loss": 0.4721, "grad_norm": 1.1692523956298828, "learning_rate": 0.0002, "epoch": 6.301834936545146, "step": 38980}, {"loss": 0.505, "grad_norm": 1.0693109035491943, "learning_rate": 0.0002, "epoch": 6.303451620725891, "step": 38990}, {"loss": 0.4875, "grad_norm": 0.8883291482925415, "learning_rate": 0.0002, "epoch": 6.305068304906636, "step": 39000}, {"loss": 0.5371, "grad_norm": 1.1445088386535645, "learning_rate": 0.0002, "epoch": 6.3066849890873815, "step": 39010}, {"loss": 0.5089, "grad_norm": 1.226792335510254, "learning_rate": 0.0002, "epoch": 6.308301673268127, "step": 39020}, {"loss": 0.474, "grad_norm": 1.0498932600021362, "learning_rate": 0.0002, "epoch": 6.309918357448872, "step": 39030}, {"loss": 0.4964, "grad_norm": 1.0834535360336304, "learning_rate": 0.0002, "epoch": 6.311535041629618, "step": 39040}, {"loss": 0.4733, "grad_norm": 1.144666075706482, "learning_rate": 0.0002, "epoch": 6.313151725810363, "step": 39050}, {"loss": 0.4784, "grad_norm": 1.1468489170074463, "learning_rate": 0.0002, "epoch": 6.3147684099911086, "step": 39060}, {"loss": 0.4911, "grad_norm": 1.290949821472168, "learning_rate": 0.0002, "epoch": 6.316385094171854, "step": 39070}, {"loss": 0.5002, "grad_norm": 1.087868094444275, "learning_rate": 0.0002, "epoch": 6.318001778352599, "step": 39080}, {"loss": 0.4944, "grad_norm": 1.0156296491622925, "learning_rate": 0.0002, "epoch": 6.319618462533344, "step": 39090}, {"loss": 0.5019, "grad_norm": 1.0805060863494873, "learning_rate": 0.0002, "epoch": 6.3212351467140895, "step": 39100}, {"loss": 0.4598, "grad_norm": 0.9030579924583435, "learning_rate": 0.0002, "epoch": 6.322851830894835, "step": 39110}, {"loss": 0.4635, "grad_norm": 1.1488285064697266, "learning_rate": 0.0002, "epoch": 6.32446851507558, "step": 39120}, {"loss": 0.5368, "grad_norm": 1.2050796747207642, "learning_rate": 0.0002, "epoch": 6.326085199256325, "step": 39130}, {"loss": 0.4854, "grad_norm": 1.093451738357544, "learning_rate": 0.0002, "epoch": 6.32770188343707, "step": 39140}, {"loss": 0.5055, "grad_norm": 1.2046772241592407, "learning_rate": 0.0002, "epoch": 6.329318567617816, "step": 39150}, {"loss": 0.4703, "grad_norm": 1.045777678489685, "learning_rate": 0.0002, "epoch": 6.330935251798561, "step": 39160}, {"loss": 0.513, "grad_norm": 1.2008492946624756, "learning_rate": 0.0002, "epoch": 6.332551935979306, "step": 39170}, {"loss": 0.4909, "grad_norm": 1.0613869428634644, "learning_rate": 0.0002, "epoch": 6.334168620160051, "step": 39180}, {"loss": 0.4708, "grad_norm": 1.058440089225769, "learning_rate": 0.0002, "epoch": 6.3357853043407975, "step": 39190}, {"loss": 0.4719, "grad_norm": 1.195658802986145, "learning_rate": 0.0002, "epoch": 6.337401988521543, "step": 39200}, {"loss": 0.4901, "grad_norm": 1.1595174074172974, "learning_rate": 0.0002, "epoch": 6.339018672702288, "step": 39210}, {"loss": 0.4587, "grad_norm": 1.0674750804901123, "learning_rate": 0.0002, "epoch": 6.340635356883033, "step": 39220}, {"loss": 0.4801, "grad_norm": 1.3306758403778076, "learning_rate": 0.0002, "epoch": 6.342252041063778, "step": 39230}, {"loss": 0.4839, "grad_norm": 1.3582593202590942, "learning_rate": 0.0002, "epoch": 6.343868725244524, "step": 39240}, {"loss": 0.4964, "grad_norm": 1.2351572513580322, "learning_rate": 0.0002, "epoch": 6.345485409425269, "step": 39250}, {"loss": 0.4806, "grad_norm": 1.3623450994491577, "learning_rate": 0.0002, "epoch": 6.347102093606014, "step": 39260}, {"loss": 0.466, "grad_norm": 1.201270580291748, "learning_rate": 0.0002, "epoch": 6.348718777786759, "step": 39270}, {"loss": 0.4899, "grad_norm": 0.9300584197044373, "learning_rate": 0.0002, "epoch": 6.3503354619675045, "step": 39280}, {"loss": 0.4867, "grad_norm": 0.944525957107544, "learning_rate": 0.0002, "epoch": 6.35195214614825, "step": 39290}, {"loss": 0.4954, "grad_norm": 1.4263732433319092, "learning_rate": 0.0002, "epoch": 6.353568830328995, "step": 39300}, {"loss": 0.4982, "grad_norm": 1.392592191696167, "learning_rate": 0.0002, "epoch": 6.35518551450974, "step": 39310}, {"loss": 0.4868, "grad_norm": 1.0753393173217773, "learning_rate": 0.0002, "epoch": 6.3568021986904855, "step": 39320}, {"loss": 0.4896, "grad_norm": 1.0088151693344116, "learning_rate": 0.0002, "epoch": 6.358418882871231, "step": 39330}, {"loss": 0.4684, "grad_norm": 1.1784582138061523, "learning_rate": 0.0002, "epoch": 6.360035567051977, "step": 39340}, {"loss": 0.4732, "grad_norm": 1.020526647567749, "learning_rate": 0.0002, "epoch": 6.361652251232722, "step": 39350}, {"loss": 0.5177, "grad_norm": 1.1400747299194336, "learning_rate": 0.0002, "epoch": 6.363268935413467, "step": 39360}, {"loss": 0.4976, "grad_norm": 0.9960665702819824, "learning_rate": 0.0002, "epoch": 6.3648856195942125, "step": 39370}, {"loss": 0.483, "grad_norm": 1.1547569036483765, "learning_rate": 0.0002, "epoch": 6.366502303774958, "step": 39380}, {"loss": 0.4861, "grad_norm": 1.2180676460266113, "learning_rate": 0.0002, "epoch": 6.368118987955703, "step": 39390}, {"loss": 0.4805, "grad_norm": 1.1391799449920654, "learning_rate": 0.0002, "epoch": 6.369735672136448, "step": 39400}, {"loss": 0.5004, "grad_norm": 1.2893574237823486, "learning_rate": 0.0002, "epoch": 6.371352356317193, "step": 39410}, {"loss": 0.4807, "grad_norm": 1.192878246307373, "learning_rate": 0.0002, "epoch": 6.372969040497939, "step": 39420}, {"loss": 0.4637, "grad_norm": 0.9771704077720642, "learning_rate": 0.0002, "epoch": 6.374585724678684, "step": 39430}, {"loss": 0.4867, "grad_norm": 1.285387635231018, "learning_rate": 0.0002, "epoch": 6.376202408859429, "step": 39440}, {"loss": 0.4593, "grad_norm": 1.019957184791565, "learning_rate": 0.0002, "epoch": 6.377819093040174, "step": 39450}, {"loss": 0.473, "grad_norm": 1.2002915143966675, "learning_rate": 0.0002, "epoch": 6.37943577722092, "step": 39460}, {"loss": 0.5025, "grad_norm": 1.3285092115402222, "learning_rate": 0.0002, "epoch": 6.381052461401665, "step": 39470}, {"loss": 0.4626, "grad_norm": 1.097846269607544, "learning_rate": 0.0002, "epoch": 6.38266914558241, "step": 39480}, {"loss": 0.5109, "grad_norm": 0.9537988305091858, "learning_rate": 0.0002, "epoch": 6.384285829763156, "step": 39490}, {"loss": 0.4492, "grad_norm": 1.0350042581558228, "learning_rate": 0.0002, "epoch": 6.385902513943901, "step": 39500}, {"loss": 0.4824, "grad_norm": 0.9559133052825928, "learning_rate": 0.0002, "epoch": 6.387519198124647, "step": 39510}, {"loss": 0.5189, "grad_norm": 0.9615123271942139, "learning_rate": 0.0002, "epoch": 6.389135882305392, "step": 39520}, {"loss": 0.4915, "grad_norm": 1.0604504346847534, "learning_rate": 0.0002, "epoch": 6.390752566486137, "step": 39530}, {"loss": 0.5315, "grad_norm": 1.2460750341415405, "learning_rate": 0.0002, "epoch": 6.392369250666882, "step": 39540}, {"loss": 0.4929, "grad_norm": 1.1496477127075195, "learning_rate": 0.0002, "epoch": 6.393985934847628, "step": 39550}, {"loss": 0.4872, "grad_norm": 1.048043966293335, "learning_rate": 0.0002, "epoch": 6.395602619028373, "step": 39560}, {"loss": 0.5231, "grad_norm": 1.333539366722107, "learning_rate": 0.0002, "epoch": 6.397219303209118, "step": 39570}, {"loss": 0.4877, "grad_norm": 1.0605626106262207, "learning_rate": 0.0002, "epoch": 6.398835987389863, "step": 39580}, {"loss": 0.4643, "grad_norm": 1.163220763206482, "learning_rate": 0.0002, "epoch": 6.4004526715706085, "step": 39590}, {"loss": 0.4824, "grad_norm": 1.1878494024276733, "learning_rate": 0.0002, "epoch": 6.402069355751354, "step": 39600}, {"loss": 0.5242, "grad_norm": 1.4630796909332275, "learning_rate": 0.0002, "epoch": 6.403686039932099, "step": 39610}, {"loss": 0.4985, "grad_norm": 1.073255181312561, "learning_rate": 0.0002, "epoch": 6.405302724112844, "step": 39620}, {"loss": 0.5108, "grad_norm": 1.0538873672485352, "learning_rate": 0.0002, "epoch": 6.406919408293589, "step": 39630}, {"loss": 0.4801, "grad_norm": 1.015525221824646, "learning_rate": 0.0002, "epoch": 6.4085360924743355, "step": 39640}, {"loss": 0.4781, "grad_norm": 1.1454379558563232, "learning_rate": 0.0002, "epoch": 6.410152776655081, "step": 39650}, {"loss": 0.498, "grad_norm": 1.2801800966262817, "learning_rate": 0.0002, "epoch": 6.411769460835826, "step": 39660}, {"loss": 0.4804, "grad_norm": 1.077579140663147, "learning_rate": 0.0002, "epoch": 6.413386145016571, "step": 39670}, {"loss": 0.51, "grad_norm": 1.376662015914917, "learning_rate": 0.0002, "epoch": 6.4150028291973165, "step": 39680}, {"loss": 0.4956, "grad_norm": 1.2064344882965088, "learning_rate": 0.0002, "epoch": 6.416619513378062, "step": 39690}, {"loss": 0.4762, "grad_norm": 1.0689115524291992, "learning_rate": 0.0002, "epoch": 6.418236197558807, "step": 39700}, {"loss": 0.4762, "grad_norm": 0.9997019171714783, "learning_rate": 0.0002, "epoch": 6.419852881739552, "step": 39710}, {"loss": 0.49, "grad_norm": 1.2368080615997314, "learning_rate": 0.0002, "epoch": 6.421469565920297, "step": 39720}, {"loss": 0.4774, "grad_norm": 1.2085820436477661, "learning_rate": 0.0002, "epoch": 6.423086250101043, "step": 39730}, {"loss": 0.4671, "grad_norm": 1.057246208190918, "learning_rate": 0.0002, "epoch": 6.424702934281788, "step": 39740}, {"loss": 0.5315, "grad_norm": 1.1311043500900269, "learning_rate": 0.0002, "epoch": 6.426319618462533, "step": 39750}, {"loss": 0.5171, "grad_norm": 1.2352231740951538, "learning_rate": 0.0002, "epoch": 6.427936302643278, "step": 39760}, {"loss": 0.466, "grad_norm": 0.953233540058136, "learning_rate": 0.0002, "epoch": 6.4295529868240235, "step": 39770}, {"loss": 0.4834, "grad_norm": 1.0632505416870117, "learning_rate": 0.0002, "epoch": 6.431169671004769, "step": 39780}, {"loss": 0.5053, "grad_norm": 1.0916751623153687, "learning_rate": 0.0002, "epoch": 6.432786355185515, "step": 39790}, {"loss": 0.4788, "grad_norm": 0.9732703566551208, "learning_rate": 0.0002, "epoch": 6.43440303936626, "step": 39800}, {"loss": 0.4982, "grad_norm": 1.1673705577850342, "learning_rate": 0.0002, "epoch": 6.436019723547005, "step": 39810}, {"loss": 0.4484, "grad_norm": 1.1049559116363525, "learning_rate": 0.0002, "epoch": 6.437636407727751, "step": 39820}, {"loss": 0.4784, "grad_norm": 1.345277190208435, "learning_rate": 0.0002, "epoch": 6.439253091908496, "step": 39830}, {"loss": 0.4716, "grad_norm": 1.1118950843811035, "learning_rate": 0.0002, "epoch": 6.440869776089241, "step": 39840}, {"loss": 0.5133, "grad_norm": 1.4872850179672241, "learning_rate": 0.0002, "epoch": 6.442486460269986, "step": 39850}, {"loss": 0.4532, "grad_norm": 1.0763497352600098, "learning_rate": 0.0002, "epoch": 6.4441031444507315, "step": 39860}, {"loss": 0.4572, "grad_norm": 0.9245555400848389, "learning_rate": 0.0002, "epoch": 6.445719828631477, "step": 39870}, {"loss": 0.4917, "grad_norm": 1.4154807329177856, "learning_rate": 0.0002, "epoch": 6.447336512812222, "step": 39880}, {"loss": 0.4852, "grad_norm": 1.0885124206542969, "learning_rate": 0.0002, "epoch": 6.448953196992967, "step": 39890}, {"loss": 0.5399, "grad_norm": 1.3989344835281372, "learning_rate": 0.0002, "epoch": 6.450569881173712, "step": 39900}, {"loss": 0.509, "grad_norm": 0.9763124585151672, "learning_rate": 0.0002, "epoch": 6.452186565354458, "step": 39910}, {"loss": 0.5134, "grad_norm": 1.135272741317749, "learning_rate": 0.0002, "epoch": 6.453803249535203, "step": 39920}, {"loss": 0.4941, "grad_norm": 1.1140081882476807, "learning_rate": 0.0002, "epoch": 6.455419933715948, "step": 39930}, {"loss": 0.5137, "grad_norm": 1.0992448329925537, "learning_rate": 0.0002, "epoch": 6.457036617896694, "step": 39940}, {"loss": 0.4914, "grad_norm": 1.1658501625061035, "learning_rate": 0.0002, "epoch": 6.4586533020774395, "step": 39950}, {"loss": 0.5036, "grad_norm": 1.1122797727584839, "learning_rate": 0.0002, "epoch": 6.460269986258185, "step": 39960}, {"loss": 0.5159, "grad_norm": 0.9664968252182007, "learning_rate": 0.0002, "epoch": 6.46188667043893, "step": 39970}, {"loss": 0.4989, "grad_norm": 1.2513965368270874, "learning_rate": 0.0002, "epoch": 6.463503354619675, "step": 39980}, {"loss": 0.4694, "grad_norm": 1.1198630332946777, "learning_rate": 0.0002, "epoch": 6.46512003880042, "step": 39990}, {"loss": 0.5023, "grad_norm": 0.8783249855041504, "learning_rate": 0.0002, "epoch": 6.466736722981166, "step": 40000}, {"loss": 0.4648, "grad_norm": 1.1313109397888184, "learning_rate": 0.0002, "epoch": 6.468353407161911, "step": 40010}, {"loss": 0.4965, "grad_norm": 1.0854487419128418, "learning_rate": 0.0002, "epoch": 6.469970091342656, "step": 40020}, {"loss": 0.5253, "grad_norm": 1.1738566160202026, "learning_rate": 0.0002, "epoch": 6.471586775523401, "step": 40030}, {"loss": 0.4947, "grad_norm": 0.9720084071159363, "learning_rate": 0.0002, "epoch": 6.473203459704147, "step": 40040}, {"loss": 0.5218, "grad_norm": 1.105618953704834, "learning_rate": 0.0002, "epoch": 6.474820143884892, "step": 40050}, {"loss": 0.4943, "grad_norm": 1.2007657289505005, "learning_rate": 0.0002, "epoch": 6.476436828065637, "step": 40060}, {"loss": 0.4882, "grad_norm": 1.088402509689331, "learning_rate": 0.0002, "epoch": 6.478053512246382, "step": 40070}, {"loss": 0.504, "grad_norm": 1.0775291919708252, "learning_rate": 0.0002, "epoch": 6.4796701964271275, "step": 40080}, {"loss": 0.4791, "grad_norm": 1.1018189191818237, "learning_rate": 0.0002, "epoch": 6.481286880607874, "step": 40090}, {"loss": 0.488, "grad_norm": 1.1676557064056396, "learning_rate": 0.0002, "epoch": 6.482903564788619, "step": 40100}, {"loss": 0.4818, "grad_norm": 0.9619805812835693, "learning_rate": 0.0002, "epoch": 6.484520248969364, "step": 40110}, {"loss": 0.4986, "grad_norm": 1.2408208847045898, "learning_rate": 0.0002, "epoch": 6.486136933150109, "step": 40120}, {"loss": 0.4668, "grad_norm": 1.3488136529922485, "learning_rate": 0.0002, "epoch": 6.4877536173308545, "step": 40130}, {"loss": 0.4774, "grad_norm": 0.9864488244056702, "learning_rate": 0.0002, "epoch": 6.4893703015116, "step": 40140}, {"loss": 0.4651, "grad_norm": 0.9437947273254395, "learning_rate": 0.0002, "epoch": 6.490986985692345, "step": 40150}, {"loss": 0.542, "grad_norm": 1.2005455493927002, "learning_rate": 0.0002, "epoch": 6.49260366987309, "step": 40160}, {"loss": 0.4704, "grad_norm": 1.0796732902526855, "learning_rate": 0.0002, "epoch": 6.4942203540538355, "step": 40170}, {"loss": 0.498, "grad_norm": 1.1347825527191162, "learning_rate": 0.0002, "epoch": 6.495837038234581, "step": 40180}, {"loss": 0.5215, "grad_norm": 1.2311455011367798, "learning_rate": 0.0002, "epoch": 6.497453722415326, "step": 40190}, {"loss": 0.5043, "grad_norm": 1.068609356880188, "learning_rate": 0.0002, "epoch": 6.499070406596071, "step": 40200}, {"loss": 0.4868, "grad_norm": 1.196425437927246, "learning_rate": 0.0002, "epoch": 6.500687090776816, "step": 40210}, {"loss": 0.4881, "grad_norm": 1.183927297592163, "learning_rate": 0.0002, "epoch": 6.5023037749575625, "step": 40220}, {"loss": 0.4958, "grad_norm": 0.9099724292755127, "learning_rate": 0.0002, "epoch": 6.503920459138307, "step": 40230}, {"loss": 0.4816, "grad_norm": 0.9261038899421692, "learning_rate": 0.0002, "epoch": 6.505537143319053, "step": 40240}, {"loss": 0.5151, "grad_norm": 1.185491681098938, "learning_rate": 0.0002, "epoch": 6.507153827499798, "step": 40250}, {"loss": 0.4853, "grad_norm": 1.1866052150726318, "learning_rate": 0.0002, "epoch": 6.508770511680543, "step": 40260}, {"loss": 0.491, "grad_norm": 1.1600912809371948, "learning_rate": 0.0002, "epoch": 6.510387195861289, "step": 40270}, {"loss": 0.5181, "grad_norm": 0.9609426259994507, "learning_rate": 0.0002, "epoch": 6.512003880042034, "step": 40280}, {"loss": 0.4794, "grad_norm": 1.078864336013794, "learning_rate": 0.0002, "epoch": 6.513620564222779, "step": 40290}, {"loss": 0.46, "grad_norm": 1.042761206626892, "learning_rate": 0.0002, "epoch": 6.515237248403524, "step": 40300}, {"loss": 0.5341, "grad_norm": 0.9742481112480164, "learning_rate": 0.0002, "epoch": 6.51685393258427, "step": 40310}, {"loss": 0.5234, "grad_norm": 1.2544835805892944, "learning_rate": 0.0002, "epoch": 6.518470616765015, "step": 40320}, {"loss": 0.4815, "grad_norm": 1.3019760847091675, "learning_rate": 0.0002, "epoch": 6.52008730094576, "step": 40330}, {"loss": 0.5039, "grad_norm": 1.3196964263916016, "learning_rate": 0.0002, "epoch": 6.521703985126505, "step": 40340}, {"loss": 0.4979, "grad_norm": 1.2795668840408325, "learning_rate": 0.0002, "epoch": 6.5233206693072505, "step": 40350}, {"loss": 0.5075, "grad_norm": 1.1618940830230713, "learning_rate": 0.0002, "epoch": 6.524937353487996, "step": 40360}, {"loss": 0.5081, "grad_norm": 1.330543041229248, "learning_rate": 0.0002, "epoch": 6.526554037668742, "step": 40370}, {"loss": 0.5055, "grad_norm": 1.1946901082992554, "learning_rate": 0.0002, "epoch": 6.528170721849486, "step": 40380}, {"loss": 0.4518, "grad_norm": 1.1708201169967651, "learning_rate": 0.0002, "epoch": 6.529787406030232, "step": 40390}, {"loss": 0.4556, "grad_norm": 0.894036591053009, "learning_rate": 0.0002, "epoch": 6.531404090210978, "step": 40400}, {"loss": 0.4919, "grad_norm": 1.1199041604995728, "learning_rate": 0.0002, "epoch": 6.533020774391723, "step": 40410}, {"loss": 0.471, "grad_norm": 1.180317759513855, "learning_rate": 0.0002, "epoch": 6.534637458572468, "step": 40420}, {"loss": 0.4914, "grad_norm": 1.37367582321167, "learning_rate": 0.0002, "epoch": 6.536254142753213, "step": 40430}, {"loss": 0.4561, "grad_norm": 1.134791612625122, "learning_rate": 0.0002, "epoch": 6.5378708269339585, "step": 40440}, {"loss": 0.5337, "grad_norm": 1.1160204410552979, "learning_rate": 0.0002, "epoch": 6.539487511114704, "step": 40450}, {"loss": 0.5299, "grad_norm": 1.268347978591919, "learning_rate": 0.0002, "epoch": 6.541104195295449, "step": 40460}, {"loss": 0.5167, "grad_norm": 1.1424330472946167, "learning_rate": 0.0002, "epoch": 6.542720879476194, "step": 40470}, {"loss": 0.5114, "grad_norm": 1.3098465204238892, "learning_rate": 0.0002, "epoch": 6.544337563656939, "step": 40480}, {"loss": 0.4865, "grad_norm": 1.3439544439315796, "learning_rate": 0.0002, "epoch": 6.545954247837685, "step": 40490}, {"loss": 0.5183, "grad_norm": 1.2708452939987183, "learning_rate": 0.0002, "epoch": 6.54757093201843, "step": 40500}, {"loss": 0.5099, "grad_norm": 1.483680248260498, "learning_rate": 0.0002, "epoch": 6.549187616199175, "step": 40510}, {"loss": 0.4811, "grad_norm": 1.1697806119918823, "learning_rate": 0.0002, "epoch": 6.550804300379921, "step": 40520}, {"loss": 0.4814, "grad_norm": 1.1665642261505127, "learning_rate": 0.0002, "epoch": 6.5524209845606665, "step": 40530}, {"loss": 0.4985, "grad_norm": 1.1243325471878052, "learning_rate": 0.0002, "epoch": 6.554037668741412, "step": 40540}, {"loss": 0.4936, "grad_norm": 1.0277988910675049, "learning_rate": 0.0002, "epoch": 6.555654352922157, "step": 40550}, {"loss": 0.487, "grad_norm": 1.1466810703277588, "learning_rate": 0.0002, "epoch": 6.557271037102902, "step": 40560}, {"loss": 0.4851, "grad_norm": 1.1415363550186157, "learning_rate": 0.0002, "epoch": 6.558887721283647, "step": 40570}, {"loss": 0.4631, "grad_norm": 1.1923491954803467, "learning_rate": 0.0002, "epoch": 6.560504405464393, "step": 40580}, {"loss": 0.5071, "grad_norm": 0.9264549612998962, "learning_rate": 0.0002, "epoch": 6.562121089645138, "step": 40590}, {"loss": 0.466, "grad_norm": 0.8810341954231262, "learning_rate": 0.0002, "epoch": 6.563737773825883, "step": 40600}, {"loss": 0.5085, "grad_norm": 2.3296701908111572, "learning_rate": 0.0002, "epoch": 6.565354458006628, "step": 40610}, {"loss": 0.5196, "grad_norm": 1.0865163803100586, "learning_rate": 0.0002, "epoch": 6.5669711421873735, "step": 40620}, {"loss": 0.5132, "grad_norm": 0.9844607710838318, "learning_rate": 0.0002, "epoch": 6.568587826368119, "step": 40630}, {"loss": 0.5437, "grad_norm": 1.1686855554580688, "learning_rate": 0.0002, "epoch": 6.570204510548864, "step": 40640}, {"loss": 0.5293, "grad_norm": 1.016829252243042, "learning_rate": 0.0002, "epoch": 6.571821194729609, "step": 40650}, {"loss": 0.5243, "grad_norm": 1.2789337635040283, "learning_rate": 0.0002, "epoch": 6.5734378789103545, "step": 40660}, {"loss": 0.4867, "grad_norm": 1.0819072723388672, "learning_rate": 0.0002, "epoch": 6.575054563091101, "step": 40670}, {"loss": 0.5024, "grad_norm": 1.1478345394134521, "learning_rate": 0.0002, "epoch": 6.576671247271846, "step": 40680}, {"loss": 0.5282, "grad_norm": 0.7972208857536316, "learning_rate": 0.0002, "epoch": 6.578287931452591, "step": 40690}, {"loss": 0.4877, "grad_norm": 1.1481789350509644, "learning_rate": 0.0002, "epoch": 6.579904615633336, "step": 40700}, {"loss": 0.5143, "grad_norm": 1.0921871662139893, "learning_rate": 0.0002, "epoch": 6.5815212998140815, "step": 40710}, {"loss": 0.5441, "grad_norm": 1.0230315923690796, "learning_rate": 0.0002, "epoch": 6.583137983994827, "step": 40720}, {"loss": 0.4734, "grad_norm": 1.151049017906189, "learning_rate": 0.0002, "epoch": 6.584754668175572, "step": 40730}, {"loss": 0.4782, "grad_norm": 1.4016883373260498, "learning_rate": 0.0002, "epoch": 6.586371352356317, "step": 40740}, {"loss": 0.5195, "grad_norm": 1.2211825847625732, "learning_rate": 0.0002, "epoch": 6.587988036537062, "step": 40750}, {"loss": 0.4815, "grad_norm": 1.2803404331207275, "learning_rate": 0.0002, "epoch": 6.589604720717808, "step": 40760}, {"loss": 0.5329, "grad_norm": 1.1119942665100098, "learning_rate": 0.0002, "epoch": 6.591221404898553, "step": 40770}, {"loss": 0.5135, "grad_norm": 1.464650273323059, "learning_rate": 0.0002, "epoch": 6.592838089079298, "step": 40780}, {"loss": 0.5181, "grad_norm": 1.1751397848129272, "learning_rate": 0.0002, "epoch": 6.594454773260043, "step": 40790}, {"loss": 0.4772, "grad_norm": 1.0866316556930542, "learning_rate": 0.0002, "epoch": 6.596071457440789, "step": 40800}, {"loss": 0.5132, "grad_norm": 1.1733694076538086, "learning_rate": 0.0002, "epoch": 6.597688141621534, "step": 40810}, {"loss": 0.5138, "grad_norm": 1.184708833694458, "learning_rate": 0.0002, "epoch": 6.59930482580228, "step": 40820}, {"loss": 0.4885, "grad_norm": 1.406081199645996, "learning_rate": 0.0002, "epoch": 6.600921509983025, "step": 40830}, {"loss": 0.499, "grad_norm": 0.9658212661743164, "learning_rate": 0.0002, "epoch": 6.60253819416377, "step": 40840}, {"loss": 0.5113, "grad_norm": 1.1457678079605103, "learning_rate": 0.0002, "epoch": 6.604154878344516, "step": 40850}, {"loss": 0.4916, "grad_norm": 1.0487784147262573, "learning_rate": 0.0002, "epoch": 6.605771562525261, "step": 40860}, {"loss": 0.4682, "grad_norm": 0.9357177019119263, "learning_rate": 0.0002, "epoch": 6.607388246706006, "step": 40870}, {"loss": 0.4751, "grad_norm": 1.1479727029800415, "learning_rate": 0.0002, "epoch": 6.609004930886751, "step": 40880}, {"loss": 0.5493, "grad_norm": 1.3729329109191895, "learning_rate": 0.0002, "epoch": 6.610621615067497, "step": 40890}, {"loss": 0.4886, "grad_norm": 1.0085599422454834, "learning_rate": 0.0002, "epoch": 6.612238299248242, "step": 40900}, {"loss": 0.516, "grad_norm": 1.2750911712646484, "learning_rate": 0.0002, "epoch": 6.613854983428987, "step": 40910}, {"loss": 0.5342, "grad_norm": 1.1929547786712646, "learning_rate": 0.0002, "epoch": 6.615471667609732, "step": 40920}, {"loss": 0.4919, "grad_norm": 1.0821375846862793, "learning_rate": 0.0002, "epoch": 6.6170883517904775, "step": 40930}, {"loss": 0.5057, "grad_norm": 1.197347640991211, "learning_rate": 0.0002, "epoch": 6.618705035971223, "step": 40940}, {"loss": 0.492, "grad_norm": 1.2074699401855469, "learning_rate": 0.0002, "epoch": 6.620321720151968, "step": 40950}, {"loss": 0.5089, "grad_norm": 1.312009572982788, "learning_rate": 0.0002, "epoch": 6.621938404332713, "step": 40960}, {"loss": 0.5476, "grad_norm": 1.4381471872329712, "learning_rate": 0.0002, "epoch": 6.623555088513459, "step": 40970}, {"loss": 0.4904, "grad_norm": 1.1574671268463135, "learning_rate": 0.0002, "epoch": 6.6251717726942045, "step": 40980}, {"loss": 0.531, "grad_norm": 0.885661780834198, "learning_rate": 0.0002, "epoch": 6.62678845687495, "step": 40990}, {"loss": 0.5145, "grad_norm": 1.024571180343628, "learning_rate": 0.0002, "epoch": 6.628405141055695, "step": 41000}, {"loss": 0.4791, "grad_norm": 1.103437900543213, "learning_rate": 0.0002, "epoch": 6.63002182523644, "step": 41010}, {"loss": 0.4671, "grad_norm": 1.122450828552246, "learning_rate": 0.0002, "epoch": 6.6316385094171855, "step": 41020}, {"loss": 0.5134, "grad_norm": 1.2256295680999756, "learning_rate": 0.0002, "epoch": 6.633255193597931, "step": 41030}, {"loss": 0.4908, "grad_norm": 1.364594578742981, "learning_rate": 0.0002, "epoch": 6.634871877778676, "step": 41040}, {"loss": 0.4964, "grad_norm": 0.9550056457519531, "learning_rate": 0.0002, "epoch": 6.636488561959421, "step": 41050}, {"loss": 0.5028, "grad_norm": 1.3174707889556885, "learning_rate": 0.0002, "epoch": 6.638105246140166, "step": 41060}, {"loss": 0.4717, "grad_norm": 1.0835540294647217, "learning_rate": 0.0002, "epoch": 6.639721930320912, "step": 41070}, {"loss": 0.497, "grad_norm": 1.1432770490646362, "learning_rate": 0.0002, "epoch": 6.641338614501657, "step": 41080}, {"loss": 0.4903, "grad_norm": 1.2398556470870972, "learning_rate": 0.0002, "epoch": 6.642955298682402, "step": 41090}, {"loss": 0.4991, "grad_norm": 1.1147747039794922, "learning_rate": 0.0002, "epoch": 6.644571982863147, "step": 41100}, {"loss": 0.505, "grad_norm": 1.0730493068695068, "learning_rate": 0.0002, "epoch": 6.6461886670438926, "step": 41110}, {"loss": 0.486, "grad_norm": 1.3218451738357544, "learning_rate": 0.0002, "epoch": 6.647805351224639, "step": 41120}, {"loss": 0.5276, "grad_norm": 1.3027331829071045, "learning_rate": 0.0002, "epoch": 6.649422035405384, "step": 41130}, {"loss": 0.5263, "grad_norm": 1.0280735492706299, "learning_rate": 0.0002, "epoch": 6.651038719586129, "step": 41140}, {"loss": 0.4952, "grad_norm": 1.109916090965271, "learning_rate": 0.0002, "epoch": 6.652655403766874, "step": 41150}, {"loss": 0.5001, "grad_norm": 1.078734040260315, "learning_rate": 0.0002, "epoch": 6.65427208794762, "step": 41160}, {"loss": 0.484, "grad_norm": 1.1595654487609863, "learning_rate": 0.0002, "epoch": 6.655888772128365, "step": 41170}, {"loss": 0.5101, "grad_norm": 1.1701031923294067, "learning_rate": 0.0002, "epoch": 6.65750545630911, "step": 41180}, {"loss": 0.5341, "grad_norm": 1.0424643754959106, "learning_rate": 0.0002, "epoch": 6.659122140489855, "step": 41190}, {"loss": 0.4863, "grad_norm": 1.22880220413208, "learning_rate": 0.0002, "epoch": 6.6607388246706005, "step": 41200}, {"loss": 0.4987, "grad_norm": 1.1907655000686646, "learning_rate": 0.0002, "epoch": 6.662355508851346, "step": 41210}, {"loss": 0.5343, "grad_norm": 1.0765007734298706, "learning_rate": 0.0002, "epoch": 6.663972193032091, "step": 41220}, {"loss": 0.5039, "grad_norm": 0.9994917511940002, "learning_rate": 0.0002, "epoch": 6.665588877212836, "step": 41230}, {"loss": 0.507, "grad_norm": 0.968578040599823, "learning_rate": 0.0002, "epoch": 6.6672055613935814, "step": 41240}, {"loss": 0.5068, "grad_norm": 1.0576032400131226, "learning_rate": 0.0002, "epoch": 6.668822245574327, "step": 41250}, {"loss": 0.486, "grad_norm": 1.2183765172958374, "learning_rate": 0.0002, "epoch": 6.670438929755072, "step": 41260}, {"loss": 0.4764, "grad_norm": 1.2548623085021973, "learning_rate": 0.0002, "epoch": 6.672055613935818, "step": 41270}, {"loss": 0.5014, "grad_norm": 1.0848388671875, "learning_rate": 0.0002, "epoch": 6.673672298116563, "step": 41280}, {"loss": 0.5404, "grad_norm": 1.21421217918396, "learning_rate": 0.0002, "epoch": 6.6752889822973085, "step": 41290}, {"loss": 0.4911, "grad_norm": 1.1453598737716675, "learning_rate": 0.0002, "epoch": 6.676905666478054, "step": 41300}, {"loss": 0.5033, "grad_norm": 1.2682722806930542, "learning_rate": 0.0002, "epoch": 6.678522350658799, "step": 41310}, {"loss": 0.5313, "grad_norm": 1.1659725904464722, "learning_rate": 0.0002, "epoch": 6.680139034839544, "step": 41320}, {"loss": 0.5505, "grad_norm": 1.36194908618927, "learning_rate": 0.0002, "epoch": 6.681755719020289, "step": 41330}, {"loss": 0.5127, "grad_norm": 1.1712592840194702, "learning_rate": 0.0002, "epoch": 6.683372403201035, "step": 41340}, {"loss": 0.5082, "grad_norm": 1.4168336391448975, "learning_rate": 0.0002, "epoch": 6.68498908738178, "step": 41350}, {"loss": 0.5124, "grad_norm": 1.0395328998565674, "learning_rate": 0.0002, "epoch": 6.686605771562525, "step": 41360}, {"loss": 0.5404, "grad_norm": 1.2511054277420044, "learning_rate": 0.0002, "epoch": 6.68822245574327, "step": 41370}, {"loss": 0.5027, "grad_norm": 1.0438542366027832, "learning_rate": 0.0002, "epoch": 6.689839139924016, "step": 41380}, {"loss": 0.5069, "grad_norm": 1.08684241771698, "learning_rate": 0.0002, "epoch": 6.691455824104761, "step": 41390}, {"loss": 0.5224, "grad_norm": 1.250788927078247, "learning_rate": 0.0002, "epoch": 6.693072508285506, "step": 41400}, {"loss": 0.4921, "grad_norm": 1.313890814781189, "learning_rate": 0.0002, "epoch": 6.694689192466251, "step": 41410}, {"loss": 0.5028, "grad_norm": 1.3218982219696045, "learning_rate": 0.0002, "epoch": 6.696305876646997, "step": 41420}, {"loss": 0.4851, "grad_norm": 1.0366582870483398, "learning_rate": 0.0002, "epoch": 6.697922560827743, "step": 41430}, {"loss": 0.5103, "grad_norm": 1.066121220588684, "learning_rate": 0.0002, "epoch": 6.699539245008488, "step": 41440}, {"loss": 0.4966, "grad_norm": 1.0239925384521484, "learning_rate": 0.0002, "epoch": 6.701155929189233, "step": 41450}, {"loss": 0.4767, "grad_norm": 0.9402176141738892, "learning_rate": 0.0002, "epoch": 6.702772613369978, "step": 41460}, {"loss": 0.5381, "grad_norm": 1.391718864440918, "learning_rate": 0.0002, "epoch": 6.7043892975507235, "step": 41470}, {"loss": 0.512, "grad_norm": 1.215600609779358, "learning_rate": 0.0002, "epoch": 6.706005981731469, "step": 41480}, {"loss": 0.5219, "grad_norm": 1.063722848892212, "learning_rate": 0.0002, "epoch": 6.707622665912214, "step": 41490}, {"loss": 0.492, "grad_norm": 1.132149577140808, "learning_rate": 0.0002, "epoch": 6.709239350092959, "step": 41500}, {"loss": 0.4812, "grad_norm": 1.0302950143814087, "learning_rate": 0.0002, "epoch": 6.7108560342737045, "step": 41510}, {"loss": 0.5141, "grad_norm": 1.5342752933502197, "learning_rate": 0.0002, "epoch": 6.71247271845445, "step": 41520}, {"loss": 0.5123, "grad_norm": 1.177137017250061, "learning_rate": 0.0002, "epoch": 6.714089402635195, "step": 41530}, {"loss": 0.5082, "grad_norm": 1.2335538864135742, "learning_rate": 0.0002, "epoch": 6.71570608681594, "step": 41540}, {"loss": 0.4864, "grad_norm": 1.140604853630066, "learning_rate": 0.0002, "epoch": 6.717322770996686, "step": 41550}, {"loss": 0.4888, "grad_norm": 1.3567465543746948, "learning_rate": 0.0002, "epoch": 6.718939455177431, "step": 41560}, {"loss": 0.5183, "grad_norm": 1.0693929195404053, "learning_rate": 0.0002, "epoch": 6.720556139358177, "step": 41570}, {"loss": 0.5131, "grad_norm": 1.1592605113983154, "learning_rate": 0.0002, "epoch": 6.722172823538922, "step": 41580}, {"loss": 0.5476, "grad_norm": 0.989006519317627, "learning_rate": 0.0002, "epoch": 6.723789507719667, "step": 41590}, {"loss": 0.4952, "grad_norm": 1.04103422164917, "learning_rate": 0.0002, "epoch": 6.7254061919004124, "step": 41600}, {"loss": 0.4823, "grad_norm": 1.1129004955291748, "learning_rate": 0.0002, "epoch": 6.727022876081158, "step": 41610}, {"loss": 0.5032, "grad_norm": 1.1473113298416138, "learning_rate": 0.0002, "epoch": 6.728639560261903, "step": 41620}, {"loss": 0.5253, "grad_norm": 1.348036527633667, "learning_rate": 0.0002, "epoch": 6.730256244442648, "step": 41630}, {"loss": 0.4983, "grad_norm": 1.259942650794983, "learning_rate": 0.0002, "epoch": 6.731872928623393, "step": 41640}, {"loss": 0.5182, "grad_norm": 1.0591514110565186, "learning_rate": 0.0002, "epoch": 6.733489612804139, "step": 41650}, {"loss": 0.4886, "grad_norm": 0.9737129211425781, "learning_rate": 0.0002, "epoch": 6.735106296984884, "step": 41660}, {"loss": 0.5051, "grad_norm": 1.2520451545715332, "learning_rate": 0.0002, "epoch": 6.736722981165629, "step": 41670}, {"loss": 0.5364, "grad_norm": 1.0555530786514282, "learning_rate": 0.0002, "epoch": 6.738339665346374, "step": 41680}, {"loss": 0.4954, "grad_norm": 1.0025697946548462, "learning_rate": 0.0002, "epoch": 6.7399563495271195, "step": 41690}, {"loss": 0.5485, "grad_norm": 1.1114100217819214, "learning_rate": 0.0002, "epoch": 6.741573033707866, "step": 41700}, {"loss": 0.4986, "grad_norm": 1.1537504196166992, "learning_rate": 0.0002, "epoch": 6.74318971788861, "step": 41710}, {"loss": 0.5025, "grad_norm": 1.037880539894104, "learning_rate": 0.0002, "epoch": 6.744806402069356, "step": 41720}, {"loss": 0.482, "grad_norm": 1.0691965818405151, "learning_rate": 0.0002, "epoch": 6.746423086250101, "step": 41730}, {"loss": 0.5272, "grad_norm": 1.376325011253357, "learning_rate": 0.0002, "epoch": 6.748039770430847, "step": 41740}, {"loss": 0.5484, "grad_norm": 1.4667129516601562, "learning_rate": 0.0002, "epoch": 6.749656454611592, "step": 41750}, {"loss": 0.5139, "grad_norm": 1.1517162322998047, "learning_rate": 0.0002, "epoch": 6.751273138792337, "step": 41760}, {"loss": 0.5523, "grad_norm": 1.1454511880874634, "learning_rate": 0.0002, "epoch": 6.752889822973082, "step": 41770}, {"loss": 0.4664, "grad_norm": 1.6323128938674927, "learning_rate": 0.0002, "epoch": 6.7545065071538275, "step": 41780}, {"loss": 0.5153, "grad_norm": 1.0951642990112305, "learning_rate": 0.0002, "epoch": 6.756123191334573, "step": 41790}, {"loss": 0.4998, "grad_norm": 1.0766983032226562, "learning_rate": 0.0002, "epoch": 6.757739875515318, "step": 41800}, {"loss": 0.548, "grad_norm": 1.3472381830215454, "learning_rate": 0.0002, "epoch": 6.759356559696063, "step": 41810}, {"loss": 0.5172, "grad_norm": 1.0248444080352783, "learning_rate": 0.0002, "epoch": 6.760973243876808, "step": 41820}, {"loss": 0.5236, "grad_norm": 1.1276055574417114, "learning_rate": 0.0002, "epoch": 6.762589928057554, "step": 41830}, {"loss": 0.5044, "grad_norm": 1.5398495197296143, "learning_rate": 0.0002, "epoch": 6.764206612238299, "step": 41840}, {"loss": 0.5097, "grad_norm": 1.1886497735977173, "learning_rate": 0.0002, "epoch": 6.765823296419045, "step": 41850}, {"loss": 0.499, "grad_norm": 1.027198076248169, "learning_rate": 0.0002, "epoch": 6.767439980599789, "step": 41860}, {"loss": 0.5444, "grad_norm": 1.4644980430603027, "learning_rate": 0.0002, "epoch": 6.7690566647805355, "step": 41870}, {"loss": 0.5009, "grad_norm": 0.9633586406707764, "learning_rate": 0.0002, "epoch": 6.770673348961281, "step": 41880}, {"loss": 0.484, "grad_norm": 1.0895354747772217, "learning_rate": 0.0002, "epoch": 6.772290033142026, "step": 41890}, {"loss": 0.5172, "grad_norm": 1.1887167692184448, "learning_rate": 0.0002, "epoch": 6.773906717322771, "step": 41900}, {"loss": 0.5399, "grad_norm": 1.3699820041656494, "learning_rate": 0.0002, "epoch": 6.775523401503516, "step": 41910}, {"loss": 0.5504, "grad_norm": 1.0266352891921997, "learning_rate": 0.0002, "epoch": 6.777140085684262, "step": 41920}, {"loss": 0.5105, "grad_norm": 1.0919075012207031, "learning_rate": 0.0002, "epoch": 6.778756769865007, "step": 41930}, {"loss": 0.4842, "grad_norm": 0.9839563369750977, "learning_rate": 0.0002, "epoch": 6.780373454045752, "step": 41940}, {"loss": 0.5081, "grad_norm": 1.2605451345443726, "learning_rate": 0.0002, "epoch": 6.781990138226497, "step": 41950}, {"loss": 0.5391, "grad_norm": 0.9268672466278076, "learning_rate": 0.0002, "epoch": 6.7836068224072426, "step": 41960}, {"loss": 0.4916, "grad_norm": 1.2002313137054443, "learning_rate": 0.0002, "epoch": 6.785223506587988, "step": 41970}, {"loss": 0.5467, "grad_norm": 1.2018438577651978, "learning_rate": 0.0002, "epoch": 6.786840190768733, "step": 41980}, {"loss": 0.5491, "grad_norm": 1.17646062374115, "learning_rate": 0.0002, "epoch": 6.788456874949478, "step": 41990}, {"loss": 0.5354, "grad_norm": 1.1080009937286377, "learning_rate": 0.0002, "epoch": 6.790073559130224, "step": 42000}, {"loss": 0.5384, "grad_norm": 1.1606498956680298, "learning_rate": 0.0002, "epoch": 6.791690243310969, "step": 42010}, {"loss": 0.4931, "grad_norm": 1.2484819889068604, "learning_rate": 0.0002, "epoch": 6.793306927491715, "step": 42020}, {"loss": 0.498, "grad_norm": 1.1363215446472168, "learning_rate": 0.0002, "epoch": 6.79492361167246, "step": 42030}, {"loss": 0.5343, "grad_norm": 1.4469727277755737, "learning_rate": 0.0002, "epoch": 6.796540295853205, "step": 42040}, {"loss": 0.5146, "grad_norm": 1.0617138147354126, "learning_rate": 0.0002, "epoch": 6.7981569800339505, "step": 42050}, {"loss": 0.5188, "grad_norm": 1.1459330320358276, "learning_rate": 0.0002, "epoch": 6.799773664214696, "step": 42060}, {"loss": 0.5116, "grad_norm": 1.2095019817352295, "learning_rate": 0.0002, "epoch": 6.801390348395441, "step": 42070}, {"loss": 0.545, "grad_norm": 1.3200831413269043, "learning_rate": 0.0002, "epoch": 6.803007032576186, "step": 42080}, {"loss": 0.5406, "grad_norm": 1.1633318662643433, "learning_rate": 0.0002, "epoch": 6.8046237167569315, "step": 42090}, {"loss": 0.4938, "grad_norm": 0.8986614942550659, "learning_rate": 0.0002, "epoch": 6.806240400937677, "step": 42100}, {"loss": 0.559, "grad_norm": 1.3705275058746338, "learning_rate": 0.0002, "epoch": 6.807857085118422, "step": 42110}, {"loss": 0.5022, "grad_norm": 1.2418090105056763, "learning_rate": 0.0002, "epoch": 6.809473769299167, "step": 42120}, {"loss": 0.5014, "grad_norm": 1.0818954706192017, "learning_rate": 0.0002, "epoch": 6.811090453479912, "step": 42130}, {"loss": 0.4791, "grad_norm": 0.9293872117996216, "learning_rate": 0.0002, "epoch": 6.812707137660658, "step": 42140}, {"loss": 0.5009, "grad_norm": 0.9791894555091858, "learning_rate": 0.0002, "epoch": 6.814323821841404, "step": 42150}, {"loss": 0.5142, "grad_norm": 1.1956568956375122, "learning_rate": 0.0002, "epoch": 6.815940506022149, "step": 42160}, {"loss": 0.5126, "grad_norm": 0.9643568992614746, "learning_rate": 0.0002, "epoch": 6.817557190202894, "step": 42170}, {"loss": 0.5121, "grad_norm": 1.2499792575836182, "learning_rate": 0.0002, "epoch": 6.819173874383639, "step": 42180}, {"loss": 0.4942, "grad_norm": 1.1779413223266602, "learning_rate": 0.0002, "epoch": 6.820790558564385, "step": 42190}, {"loss": 0.498, "grad_norm": 1.0570595264434814, "learning_rate": 0.0002, "epoch": 6.82240724274513, "step": 42200}, {"loss": 0.4997, "grad_norm": 1.1393938064575195, "learning_rate": 0.0002, "epoch": 6.824023926925875, "step": 42210}, {"loss": 0.4842, "grad_norm": 1.152463436126709, "learning_rate": 0.0002, "epoch": 6.82564061110662, "step": 42220}, {"loss": 0.5234, "grad_norm": 1.3353025913238525, "learning_rate": 0.0002, "epoch": 6.827257295287366, "step": 42230}, {"loss": 0.539, "grad_norm": 1.1719051599502563, "learning_rate": 0.0002, "epoch": 6.828873979468111, "step": 42240}, {"loss": 0.5139, "grad_norm": 1.262141227722168, "learning_rate": 0.0002, "epoch": 6.830490663648856, "step": 42250}, {"loss": 0.5021, "grad_norm": 1.240899920463562, "learning_rate": 0.0002, "epoch": 6.832107347829601, "step": 42260}, {"loss": 0.4961, "grad_norm": 1.0505269765853882, "learning_rate": 0.0002, "epoch": 6.8337240320103465, "step": 42270}, {"loss": 0.4932, "grad_norm": 1.1556071043014526, "learning_rate": 0.0002, "epoch": 6.835340716191092, "step": 42280}, {"loss": 0.5461, "grad_norm": 1.1427719593048096, "learning_rate": 0.0002, "epoch": 6.836957400371837, "step": 42290}, {"loss": 0.5199, "grad_norm": 1.1540080308914185, "learning_rate": 0.0002, "epoch": 6.838574084552583, "step": 42300}, {"loss": 0.5269, "grad_norm": 1.0521200895309448, "learning_rate": 0.0002, "epoch": 6.840190768733328, "step": 42310}, {"loss": 0.541, "grad_norm": 1.0205531120300293, "learning_rate": 0.0002, "epoch": 6.8418074529140736, "step": 42320}, {"loss": 0.5225, "grad_norm": 1.0010193586349487, "learning_rate": 0.0002, "epoch": 6.843424137094819, "step": 42330}, {"loss": 0.5101, "grad_norm": 1.2138770818710327, "learning_rate": 0.0002, "epoch": 6.845040821275564, "step": 42340}, {"loss": 0.5452, "grad_norm": 1.3028651475906372, "learning_rate": 0.0002, "epoch": 6.846657505456309, "step": 42350}, {"loss": 0.4894, "grad_norm": 1.0326353311538696, "learning_rate": 0.0002, "epoch": 6.8482741896370545, "step": 42360}, {"loss": 0.5285, "grad_norm": 1.036085605621338, "learning_rate": 0.0002, "epoch": 6.8498908738178, "step": 42370}, {"loss": 0.505, "grad_norm": 1.0575472116470337, "learning_rate": 0.0002, "epoch": 6.851507557998545, "step": 42380}, {"loss": 0.4997, "grad_norm": 1.1749629974365234, "learning_rate": 0.0002, "epoch": 6.85312424217929, "step": 42390}, {"loss": 0.4961, "grad_norm": 1.1747760772705078, "learning_rate": 0.0002, "epoch": 6.854740926360035, "step": 42400}, {"loss": 0.5138, "grad_norm": 1.1877071857452393, "learning_rate": 0.0002, "epoch": 6.856357610540781, "step": 42410}, {"loss": 0.4972, "grad_norm": 1.1209983825683594, "learning_rate": 0.0002, "epoch": 6.857974294721526, "step": 42420}, {"loss": 0.4939, "grad_norm": 1.2918205261230469, "learning_rate": 0.0002, "epoch": 6.859590978902271, "step": 42430}, {"loss": 0.5012, "grad_norm": 1.2443464994430542, "learning_rate": 0.0002, "epoch": 6.861207663083016, "step": 42440}, {"loss": 0.5226, "grad_norm": 0.9336795210838318, "learning_rate": 0.0002, "epoch": 6.8628243472637624, "step": 42450}, {"loss": 0.5108, "grad_norm": 1.2183542251586914, "learning_rate": 0.0002, "epoch": 6.864441031444508, "step": 42460}, {"loss": 0.5245, "grad_norm": 1.0071234703063965, "learning_rate": 0.0002, "epoch": 6.866057715625253, "step": 42470}, {"loss": 0.4753, "grad_norm": 1.2914012670516968, "learning_rate": 0.0002, "epoch": 6.867674399805998, "step": 42480}, {"loss": 0.4865, "grad_norm": 1.1050426959991455, "learning_rate": 0.0002, "epoch": 6.869291083986743, "step": 42490}, {"loss": 0.5243, "grad_norm": 1.1163811683654785, "learning_rate": 0.0002, "epoch": 6.870907768167489, "step": 42500}, {"loss": 0.5065, "grad_norm": 1.1575818061828613, "learning_rate": 0.0002, "epoch": 6.872524452348234, "step": 42510}, {"loss": 0.5353, "grad_norm": 1.11167311668396, "learning_rate": 0.0002, "epoch": 6.874141136528979, "step": 42520}, {"loss": 0.5141, "grad_norm": 1.0379102230072021, "learning_rate": 0.0002, "epoch": 6.875757820709724, "step": 42530}, {"loss": 0.5355, "grad_norm": 1.2617160081863403, "learning_rate": 0.0002, "epoch": 6.8773745048904695, "step": 42540}, {"loss": 0.4785, "grad_norm": 1.1749719381332397, "learning_rate": 0.0002, "epoch": 6.878991189071215, "step": 42550}, {"loss": 0.5503, "grad_norm": 1.2284821271896362, "learning_rate": 0.0002, "epoch": 6.88060787325196, "step": 42560}, {"loss": 0.5065, "grad_norm": 1.1917030811309814, "learning_rate": 0.0002, "epoch": 6.882224557432705, "step": 42570}, {"loss": 0.5176, "grad_norm": 1.1943914890289307, "learning_rate": 0.0002, "epoch": 6.8838412416134505, "step": 42580}, {"loss": 0.5072, "grad_norm": 1.2641394138336182, "learning_rate": 0.0002, "epoch": 6.885457925794196, "step": 42590}, {"loss": 0.5004, "grad_norm": 1.1280436515808105, "learning_rate": 0.0002, "epoch": 6.887074609974942, "step": 42600}, {"loss": 0.5328, "grad_norm": 0.9865449070930481, "learning_rate": 0.0002, "epoch": 6.888691294155687, "step": 42610}, {"loss": 0.4953, "grad_norm": 0.994987428188324, "learning_rate": 0.0002, "epoch": 6.890307978336432, "step": 42620}, {"loss": 0.4805, "grad_norm": 0.9900388717651367, "learning_rate": 0.0002, "epoch": 6.8919246625171775, "step": 42630}, {"loss": 0.5467, "grad_norm": 1.2992421388626099, "learning_rate": 0.0002, "epoch": 6.893541346697923, "step": 42640}, {"loss": 0.5017, "grad_norm": 1.0152487754821777, "learning_rate": 0.0002, "epoch": 6.895158030878668, "step": 42650}, {"loss": 0.5043, "grad_norm": 1.199453353881836, "learning_rate": 0.0002, "epoch": 6.896774715059413, "step": 42660}, {"loss": 0.5106, "grad_norm": 1.100630521774292, "learning_rate": 0.0002, "epoch": 6.898391399240158, "step": 42670}, {"loss": 0.503, "grad_norm": 1.0489764213562012, "learning_rate": 0.0002, "epoch": 6.900008083420904, "step": 42680}, {"loss": 0.4634, "grad_norm": 1.101407527923584, "learning_rate": 0.0002, "epoch": 6.901624767601649, "step": 42690}, {"loss": 0.5361, "grad_norm": 1.3130593299865723, "learning_rate": 0.0002, "epoch": 6.903241451782394, "step": 42700}, {"loss": 0.5119, "grad_norm": 0.9906072616577148, "learning_rate": 0.0002, "epoch": 6.904858135963139, "step": 42710}, {"loss": 0.5146, "grad_norm": 1.094502329826355, "learning_rate": 0.0002, "epoch": 6.906474820143885, "step": 42720}, {"loss": 0.5165, "grad_norm": 1.1025426387786865, "learning_rate": 0.0002, "epoch": 6.90809150432463, "step": 42730}, {"loss": 0.5463, "grad_norm": 1.0644042491912842, "learning_rate": 0.0002, "epoch": 6.909708188505375, "step": 42740}, {"loss": 0.5024, "grad_norm": 1.0709129571914673, "learning_rate": 0.0002, "epoch": 6.911324872686121, "step": 42750}, {"loss": 0.5093, "grad_norm": 1.2445871829986572, "learning_rate": 0.0002, "epoch": 6.912941556866866, "step": 42760}, {"loss": 0.5305, "grad_norm": 1.020058035850525, "learning_rate": 0.0002, "epoch": 6.914558241047612, "step": 42770}, {"loss": 0.5382, "grad_norm": 0.9795091152191162, "learning_rate": 0.0002, "epoch": 6.916174925228357, "step": 42780}, {"loss": 0.5429, "grad_norm": 0.9369977116584778, "learning_rate": 0.0002, "epoch": 6.917791609409102, "step": 42790}, {"loss": 0.5444, "grad_norm": 1.0741904973983765, "learning_rate": 0.0002, "epoch": 6.919408293589847, "step": 42800}, {"loss": 0.5402, "grad_norm": 1.0702799558639526, "learning_rate": 0.0002, "epoch": 6.921024977770593, "step": 42810}, {"loss": 0.5291, "grad_norm": 1.0383983850479126, "learning_rate": 0.0002, "epoch": 6.922641661951338, "step": 42820}, {"loss": 0.5106, "grad_norm": 1.0761083364486694, "learning_rate": 0.0002, "epoch": 6.924258346132083, "step": 42830}, {"loss": 0.5726, "grad_norm": 1.2332350015640259, "learning_rate": 0.0002, "epoch": 6.925875030312828, "step": 42840}, {"loss": 0.4996, "grad_norm": 1.3184348344802856, "learning_rate": 0.0002, "epoch": 6.9274917144935735, "step": 42850}, {"loss": 0.5503, "grad_norm": 1.0586378574371338, "learning_rate": 0.0002, "epoch": 6.929108398674319, "step": 42860}, {"loss": 0.511, "grad_norm": 1.2294201850891113, "learning_rate": 0.0002, "epoch": 6.930725082855064, "step": 42870}, {"loss": 0.54, "grad_norm": 1.3097991943359375, "learning_rate": 0.0002, "epoch": 6.932341767035809, "step": 42880}, {"loss": 0.5228, "grad_norm": 0.9006873965263367, "learning_rate": 0.0002, "epoch": 6.933958451216554, "step": 42890}, {"loss": 0.4617, "grad_norm": 1.265931248664856, "learning_rate": 0.0002, "epoch": 6.9355751353973005, "step": 42900}, {"loss": 0.5029, "grad_norm": 1.1013522148132324, "learning_rate": 0.0002, "epoch": 6.937191819578046, "step": 42910}, {"loss": 0.5334, "grad_norm": 0.9910131692886353, "learning_rate": 0.0002, "epoch": 6.938808503758791, "step": 42920}, {"loss": 0.5211, "grad_norm": 1.102683424949646, "learning_rate": 0.0002, "epoch": 6.940425187939536, "step": 42930}, {"loss": 0.5588, "grad_norm": 1.232961893081665, "learning_rate": 0.0002, "epoch": 6.9420418721202815, "step": 42940}, {"loss": 0.5357, "grad_norm": 1.1714650392532349, "learning_rate": 0.0002, "epoch": 6.943658556301027, "step": 42950}, {"loss": 0.5232, "grad_norm": 1.1684318780899048, "learning_rate": 0.0002, "epoch": 6.945275240481772, "step": 42960}, {"loss": 0.5035, "grad_norm": 1.2074716091156006, "learning_rate": 0.0002, "epoch": 6.946891924662517, "step": 42970}, {"loss": 0.5111, "grad_norm": 1.2061275243759155, "learning_rate": 0.0002, "epoch": 6.948508608843262, "step": 42980}, {"loss": 0.5066, "grad_norm": 1.1216989755630493, "learning_rate": 0.0002, "epoch": 6.950125293024008, "step": 42990}, {"loss": 0.4948, "grad_norm": 1.304117202758789, "learning_rate": 0.0002, "epoch": 6.951741977204753, "step": 43000}, {"loss": 0.5684, "grad_norm": 1.2377972602844238, "learning_rate": 0.0002, "epoch": 6.953358661385498, "step": 43010}, {"loss": 0.4792, "grad_norm": 1.2332178354263306, "learning_rate": 0.0002, "epoch": 6.954975345566243, "step": 43020}, {"loss": 0.5181, "grad_norm": 1.1919599771499634, "learning_rate": 0.0002, "epoch": 6.956592029746989, "step": 43030}, {"loss": 0.5352, "grad_norm": 1.272700548171997, "learning_rate": 0.0002, "epoch": 6.958208713927734, "step": 43040}, {"loss": 0.5328, "grad_norm": 1.4377546310424805, "learning_rate": 0.0002, "epoch": 6.95982539810848, "step": 43050}, {"loss": 0.4894, "grad_norm": 1.2070353031158447, "learning_rate": 0.0002, "epoch": 6.961442082289225, "step": 43060}, {"loss": 0.525, "grad_norm": 1.090205430984497, "learning_rate": 0.0002, "epoch": 6.96305876646997, "step": 43070}, {"loss": 0.5255, "grad_norm": 1.1832911968231201, "learning_rate": 0.0002, "epoch": 6.964675450650716, "step": 43080}, {"loss": 0.5497, "grad_norm": 1.2921082973480225, "learning_rate": 0.0002, "epoch": 6.966292134831461, "step": 43090}, {"loss": 0.5527, "grad_norm": 1.4303096532821655, "learning_rate": 0.0002, "epoch": 6.967908819012206, "step": 43100}, {"loss": 0.4807, "grad_norm": 1.0788004398345947, "learning_rate": 0.0002, "epoch": 6.969525503192951, "step": 43110}, {"loss": 0.5006, "grad_norm": 1.2192047834396362, "learning_rate": 0.0002, "epoch": 6.9711421873736965, "step": 43120}, {"loss": 0.4714, "grad_norm": 1.0735143423080444, "learning_rate": 0.0002, "epoch": 6.972758871554442, "step": 43130}, {"loss": 0.5307, "grad_norm": 1.0317153930664062, "learning_rate": 0.0002, "epoch": 6.974375555735187, "step": 43140}, {"loss": 0.5154, "grad_norm": 1.0926798582077026, "learning_rate": 0.0002, "epoch": 6.975992239915932, "step": 43150}, {"loss": 0.4976, "grad_norm": 1.1660500764846802, "learning_rate": 0.0002, "epoch": 6.977608924096677, "step": 43160}, {"loss": 0.5456, "grad_norm": 1.3945232629776, "learning_rate": 0.0002, "epoch": 6.979225608277423, "step": 43170}, {"loss": 0.4979, "grad_norm": 1.2684587240219116, "learning_rate": 0.0002, "epoch": 6.980842292458169, "step": 43180}, {"loss": 0.5406, "grad_norm": 1.1574004888534546, "learning_rate": 0.0002, "epoch": 6.982458976638913, "step": 43190}, {"loss": 0.5629, "grad_norm": 1.2534198760986328, "learning_rate": 0.0002, "epoch": 6.984075660819659, "step": 43200}, {"loss": 0.5191, "grad_norm": 1.135245442390442, "learning_rate": 0.0002, "epoch": 6.9856923450004045, "step": 43210}, {"loss": 0.548, "grad_norm": 1.3824104070663452, "learning_rate": 0.0002, "epoch": 6.98730902918115, "step": 43220}, {"loss": 0.5294, "grad_norm": 1.2128452062606812, "learning_rate": 0.0002, "epoch": 6.988925713361895, "step": 43230}, {"loss": 0.505, "grad_norm": 1.0795245170593262, "learning_rate": 0.0002, "epoch": 6.99054239754264, "step": 43240}, {"loss": 0.4889, "grad_norm": 1.337353229522705, "learning_rate": 0.0002, "epoch": 6.992159081723385, "step": 43250}, {"loss": 0.4749, "grad_norm": 1.1731765270233154, "learning_rate": 0.0002, "epoch": 6.993775765904131, "step": 43260}, {"loss": 0.4897, "grad_norm": 1.0203192234039307, "learning_rate": 0.0002, "epoch": 6.995392450084876, "step": 43270}, {"loss": 0.5324, "grad_norm": 0.9261201620101929, "learning_rate": 0.0002, "epoch": 6.997009134265621, "step": 43280}, {"loss": 0.5227, "grad_norm": 1.107865810394287, "learning_rate": 0.0002, "epoch": 6.998625818446366, "step": 43290}]} +{"epoch": 7.999353326327702, "step": 49480, "epoch_duration": 16893.388257026672, "total_accumulated_duration": 135136.67567753792, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7568.4541015625}, "peak_memory_usage": {"GPU_0": 13792.75537109375}, "avg_memory_reserved": {"GPU_0": 17416.0}, "peak_memory_reserved": {"GPU_0": 17416.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/Meta-Llama-3-8B-Instruct_int4_medmcqa_full_con_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-51190-sd-42/checkpoint-6185", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 1.6636, "grad_norm": 0.9894065856933594, "learning_rate": 0.0002, "epoch": 0.0016166841807452913, "step": 10}, {"loss": 1.1528, "grad_norm": 1.7810699939727783, "learning_rate": 0.0002, "epoch": 0.0032333683614905826, "step": 20}, {"loss": 0.9767, "grad_norm": 0.5969577431678772, "learning_rate": 0.0002, "epoch": 0.004850052542235874, "step": 30}, {"loss": 0.9772, "grad_norm": 0.6354120969772339, "learning_rate": 0.0002, "epoch": 0.006466736722981165, "step": 40}, {"loss": 0.8643, "grad_norm": 0.5604607462882996, "learning_rate": 0.0002, "epoch": 0.008083420903726457, "step": 50}, {"loss": 0.8841, "grad_norm": 0.4676193594932556, "learning_rate": 0.0002, "epoch": 0.009700105084471748, "step": 60}, {"loss": 0.9022, "grad_norm": 0.6099211573600769, "learning_rate": 0.0002, "epoch": 0.01131678926521704, "step": 70}, {"loss": 0.9133, "grad_norm": 0.48639994859695435, "learning_rate": 0.0002, "epoch": 0.01293347344596233, "step": 80}, {"loss": 0.8704, "grad_norm": 0.4904264509677887, "learning_rate": 0.0002, "epoch": 0.014550157626707623, "step": 90}, {"loss": 0.8855, "grad_norm": 2.8334362506866455, "learning_rate": 0.0002, "epoch": 0.016166841807452915, "step": 100}, {"loss": 0.8958, "grad_norm": 0.43221670389175415, "learning_rate": 0.0002, "epoch": 0.017783525988198205, "step": 110}, {"loss": 0.8412, "grad_norm": 0.42244166135787964, "learning_rate": 0.0002, "epoch": 0.019400210168943496, "step": 120}, {"loss": 0.8467, "grad_norm": 0.45363298058509827, "learning_rate": 0.0002, "epoch": 0.02101689434968879, "step": 130}, {"loss": 0.8641, "grad_norm": 0.44816508889198303, "learning_rate": 0.0002, "epoch": 0.02263357853043408, "step": 140}, {"loss": 0.8496, "grad_norm": 0.43308213353157043, "learning_rate": 0.0002, "epoch": 0.02425026271117937, "step": 150}, {"loss": 0.8213, "grad_norm": 0.4084763526916504, "learning_rate": 0.0002, "epoch": 0.02586694689192466, "step": 160}, {"loss": 0.8343, "grad_norm": 0.5363703966140747, "learning_rate": 0.0002, "epoch": 0.027483631072669955, "step": 170}, {"loss": 0.8558, "grad_norm": 0.4619699716567993, "learning_rate": 0.0002, "epoch": 0.029100315253415245, "step": 180}, {"loss": 0.8878, "grad_norm": 0.49069908261299133, "learning_rate": 0.0002, "epoch": 0.030716999434160536, "step": 190}, {"loss": 0.8867, "grad_norm": 0.4645835757255554, "learning_rate": 0.0002, "epoch": 0.03233368361490583, "step": 200}, {"loss": 0.8842, "grad_norm": 1.2411243915557861, "learning_rate": 0.0002, "epoch": 0.03395036779565112, "step": 210}, {"loss": 0.8245, "grad_norm": 0.5211851596832275, "learning_rate": 0.0002, "epoch": 0.03556705197639641, "step": 220}, {"loss": 0.8194, "grad_norm": 0.5253691673278809, "learning_rate": 0.0002, "epoch": 0.037183736157141704, "step": 230}, {"loss": 0.8856, "grad_norm": 0.4567478895187378, "learning_rate": 0.0002, "epoch": 0.03880042033788699, "step": 240}, {"loss": 0.838, "grad_norm": 0.5472128391265869, "learning_rate": 0.0002, "epoch": 0.040417104518632285, "step": 250}, {"loss": 0.8201, "grad_norm": 0.42978546023368835, "learning_rate": 0.0002, "epoch": 0.04203378869937758, "step": 260}, {"loss": 0.8334, "grad_norm": 0.601734459400177, "learning_rate": 0.0002, "epoch": 0.043650472880122866, "step": 270}, {"loss": 0.815, "grad_norm": 0.4286513328552246, "learning_rate": 0.0002, "epoch": 0.04526715706086816, "step": 280}, {"loss": 0.8758, "grad_norm": 0.5230861902236938, "learning_rate": 0.0002, "epoch": 0.046883841241613454, "step": 290}, {"loss": 0.8636, "grad_norm": 0.6504611968994141, "learning_rate": 0.0002, "epoch": 0.04850052542235874, "step": 300}, {"loss": 0.8102, "grad_norm": 0.43485215306282043, "learning_rate": 0.0002, "epoch": 0.050117209603104035, "step": 310}, {"loss": 0.8221, "grad_norm": 0.4717007875442505, "learning_rate": 0.0002, "epoch": 0.05173389378384932, "step": 320}, {"loss": 0.8469, "grad_norm": 0.4059787690639496, "learning_rate": 0.0002, "epoch": 0.053350577964594616, "step": 330}, {"loss": 0.8866, "grad_norm": 0.4366913437843323, "learning_rate": 0.0002, "epoch": 0.05496726214533991, "step": 340}, {"loss": 0.7976, "grad_norm": 0.4233848452568054, "learning_rate": 0.0002, "epoch": 0.0565839463260852, "step": 350}, {"loss": 0.8456, "grad_norm": 0.4209108352661133, "learning_rate": 0.0002, "epoch": 0.05820063050683049, "step": 360}, {"loss": 0.816, "grad_norm": 0.41637396812438965, "learning_rate": 0.0002, "epoch": 0.059817314687575784, "step": 370}, {"loss": 0.7976, "grad_norm": 0.46235376596450806, "learning_rate": 0.0002, "epoch": 0.06143399886832107, "step": 380}, {"loss": 0.7966, "grad_norm": 0.4013484716415405, "learning_rate": 0.0002, "epoch": 0.06305068304906636, "step": 390}, {"loss": 0.8253, "grad_norm": 0.47443896532058716, "learning_rate": 0.0002, "epoch": 0.06466736722981166, "step": 400}, {"loss": 0.8666, "grad_norm": 0.3942156434059143, "learning_rate": 0.0002, "epoch": 0.06628405141055695, "step": 410}, {"loss": 0.8402, "grad_norm": 0.4965320825576782, "learning_rate": 0.0002, "epoch": 0.06790073559130223, "step": 420}, {"loss": 0.8317, "grad_norm": 0.4304835796356201, "learning_rate": 0.0002, "epoch": 0.06951741977204753, "step": 430}, {"loss": 0.8528, "grad_norm": 0.511726975440979, "learning_rate": 0.0002, "epoch": 0.07113410395279282, "step": 440}, {"loss": 0.8675, "grad_norm": 0.4040689170360565, "learning_rate": 0.0002, "epoch": 0.07275078813353811, "step": 450}, {"loss": 0.8788, "grad_norm": 0.5402171015739441, "learning_rate": 0.0002, "epoch": 0.07436747231428341, "step": 460}, {"loss": 0.8737, "grad_norm": 0.4174517095088959, "learning_rate": 0.0002, "epoch": 0.0759841564950287, "step": 470}, {"loss": 0.7605, "grad_norm": 0.4306182265281677, "learning_rate": 0.0002, "epoch": 0.07760084067577398, "step": 480}, {"loss": 0.799, "grad_norm": 0.535210132598877, "learning_rate": 0.0002, "epoch": 0.07921752485651928, "step": 490}, {"loss": 0.7825, "grad_norm": 0.5339109897613525, "learning_rate": 0.0002, "epoch": 0.08083420903726457, "step": 500}, {"loss": 0.8985, "grad_norm": 0.45754891633987427, "learning_rate": 0.0002, "epoch": 0.08245089321800986, "step": 510}, {"loss": 0.8144, "grad_norm": 0.43820783495903015, "learning_rate": 0.0002, "epoch": 0.08406757739875516, "step": 520}, {"loss": 0.8001, "grad_norm": 0.4434749186038971, "learning_rate": 0.0002, "epoch": 0.08568426157950045, "step": 530}, {"loss": 0.7857, "grad_norm": 0.43111467361450195, "learning_rate": 0.0002, "epoch": 0.08730094576024573, "step": 540}, {"loss": 0.8418, "grad_norm": 0.4378940165042877, "learning_rate": 0.0002, "epoch": 0.08891762994099103, "step": 550}, {"loss": 0.8361, "grad_norm": 0.4772215187549591, "learning_rate": 0.0002, "epoch": 0.09053431412173632, "step": 560}, {"loss": 0.8268, "grad_norm": 0.6837629079818726, "learning_rate": 0.0002, "epoch": 0.09215099830248161, "step": 570}, {"loss": 0.8607, "grad_norm": 0.42241212725639343, "learning_rate": 0.0002, "epoch": 0.09376768248322691, "step": 580}, {"loss": 0.852, "grad_norm": 0.5165936350822449, "learning_rate": 0.0002, "epoch": 0.0953843666639722, "step": 590}, {"loss": 0.8664, "grad_norm": 0.48737478256225586, "learning_rate": 0.0002, "epoch": 0.09700105084471748, "step": 600}, {"loss": 0.8806, "grad_norm": 0.47419852018356323, "learning_rate": 0.0002, "epoch": 0.09861773502546278, "step": 610}, {"loss": 0.8254, "grad_norm": 0.4975486099720001, "learning_rate": 0.0002, "epoch": 0.10023441920620807, "step": 620}, {"loss": 0.8548, "grad_norm": 0.49123844504356384, "learning_rate": 0.0002, "epoch": 0.10185110338695336, "step": 630}, {"loss": 0.8911, "grad_norm": 0.6288952827453613, "learning_rate": 0.0002, "epoch": 0.10346778756769864, "step": 640}, {"loss": 0.827, "grad_norm": 0.4277345836162567, "learning_rate": 0.0002, "epoch": 0.10508447174844394, "step": 650}, {"loss": 0.7996, "grad_norm": 0.4021061956882477, "learning_rate": 0.0002, "epoch": 0.10670115592918923, "step": 660}, {"loss": 0.87, "grad_norm": 0.3492237329483032, "learning_rate": 0.0002, "epoch": 0.10831784010993452, "step": 670}, {"loss": 0.8698, "grad_norm": 0.4341012239456177, "learning_rate": 0.0002, "epoch": 0.10993452429067982, "step": 680}, {"loss": 0.781, "grad_norm": 0.7296304106712341, "learning_rate": 0.0002, "epoch": 0.1115512084714251, "step": 690}, {"loss": 0.8433, "grad_norm": 0.397494912147522, "learning_rate": 0.0002, "epoch": 0.1131678926521704, "step": 700}, {"loss": 0.827, "grad_norm": 0.396431028842926, "learning_rate": 0.0002, "epoch": 0.1147845768329157, "step": 710}, {"loss": 0.8379, "grad_norm": 0.48842838406562805, "learning_rate": 0.0002, "epoch": 0.11640126101366098, "step": 720}, {"loss": 0.8238, "grad_norm": 0.46322616934776306, "learning_rate": 0.0002, "epoch": 0.11801794519440627, "step": 730}, {"loss": 0.8041, "grad_norm": 0.47990912199020386, "learning_rate": 0.0002, "epoch": 0.11963462937515157, "step": 740}, {"loss": 0.82, "grad_norm": 0.4997142255306244, "learning_rate": 0.0002, "epoch": 0.12125131355589686, "step": 750}, {"loss": 0.7702, "grad_norm": 0.4040526747703552, "learning_rate": 0.0002, "epoch": 0.12286799773664214, "step": 760}, {"loss": 0.863, "grad_norm": 0.453095942735672, "learning_rate": 0.0002, "epoch": 0.12448468191738744, "step": 770}, {"loss": 0.8792, "grad_norm": 0.4636971950531006, "learning_rate": 0.0002, "epoch": 0.12610136609813272, "step": 780}, {"loss": 0.8112, "grad_norm": 0.4279276132583618, "learning_rate": 0.0002, "epoch": 0.12771805027887803, "step": 790}, {"loss": 0.8711, "grad_norm": 0.46212655305862427, "learning_rate": 0.0002, "epoch": 0.12933473445962332, "step": 800}, {"loss": 0.8368, "grad_norm": 0.43127650022506714, "learning_rate": 0.0002, "epoch": 0.1309514186403686, "step": 810}, {"loss": 0.8476, "grad_norm": 0.4201301336288452, "learning_rate": 0.0002, "epoch": 0.1325681028211139, "step": 820}, {"loss": 0.8078, "grad_norm": 0.42583167552948, "learning_rate": 0.0002, "epoch": 0.13418478700185918, "step": 830}, {"loss": 0.8219, "grad_norm": 0.4535622000694275, "learning_rate": 0.0002, "epoch": 0.13580147118260447, "step": 840}, {"loss": 0.8423, "grad_norm": 0.4116036891937256, "learning_rate": 0.0002, "epoch": 0.13741815536334978, "step": 850}, {"loss": 0.8466, "grad_norm": 0.45997580885887146, "learning_rate": 0.0002, "epoch": 0.13903483954409507, "step": 860}, {"loss": 0.8917, "grad_norm": 0.4487837255001068, "learning_rate": 0.0002, "epoch": 0.14065152372484035, "step": 870}, {"loss": 0.8217, "grad_norm": 0.43650057911872864, "learning_rate": 0.0002, "epoch": 0.14226820790558564, "step": 880}, {"loss": 0.8178, "grad_norm": 0.5335358381271362, "learning_rate": 0.0002, "epoch": 0.14388489208633093, "step": 890}, {"loss": 0.7957, "grad_norm": 0.5989000201225281, "learning_rate": 0.0002, "epoch": 0.14550157626707622, "step": 900}, {"loss": 0.8385, "grad_norm": 0.517179012298584, "learning_rate": 0.0002, "epoch": 0.14711826044782153, "step": 910}, {"loss": 0.8255, "grad_norm": 0.44435232877731323, "learning_rate": 0.0002, "epoch": 0.14873494462856682, "step": 920}, {"loss": 0.8305, "grad_norm": 0.42635923624038696, "learning_rate": 0.0002, "epoch": 0.1503516288093121, "step": 930}, {"loss": 0.8043, "grad_norm": 0.49603334069252014, "learning_rate": 0.0002, "epoch": 0.1519683129900574, "step": 940}, {"loss": 0.8377, "grad_norm": 0.40639808773994446, "learning_rate": 0.0002, "epoch": 0.15358499717080268, "step": 950}, {"loss": 0.8529, "grad_norm": 0.4850759208202362, "learning_rate": 0.0002, "epoch": 0.15520168135154797, "step": 960}, {"loss": 0.846, "grad_norm": 0.4427442252635956, "learning_rate": 0.0002, "epoch": 0.15681836553229328, "step": 970}, {"loss": 0.8705, "grad_norm": 0.3760930001735687, "learning_rate": 0.0002, "epoch": 0.15843504971303857, "step": 980}, {"loss": 0.8644, "grad_norm": 0.4794144332408905, "learning_rate": 0.0002, "epoch": 0.16005173389378385, "step": 990}, {"loss": 0.8002, "grad_norm": 0.45828768610954285, "learning_rate": 0.0002, "epoch": 0.16166841807452914, "step": 1000}, {"loss": 0.7658, "grad_norm": 0.6313053369522095, "learning_rate": 0.0002, "epoch": 0.16328510225527443, "step": 1010}, {"loss": 0.8047, "grad_norm": 0.45041006803512573, "learning_rate": 0.0002, "epoch": 0.16490178643601971, "step": 1020}, {"loss": 0.8423, "grad_norm": 0.441403865814209, "learning_rate": 0.0002, "epoch": 0.166518470616765, "step": 1030}, {"loss": 0.8475, "grad_norm": 0.8171296119689941, "learning_rate": 0.0002, "epoch": 0.16813515479751032, "step": 1040}, {"loss": 0.845, "grad_norm": 0.7137420773506165, "learning_rate": 0.0002, "epoch": 0.1697518389782556, "step": 1050}, {"loss": 0.8213, "grad_norm": 0.5236809849739075, "learning_rate": 0.0002, "epoch": 0.1713685231590009, "step": 1060}, {"loss": 0.8265, "grad_norm": 0.5021864175796509, "learning_rate": 0.0002, "epoch": 0.17298520733974618, "step": 1070}, {"loss": 0.8305, "grad_norm": 0.47347521781921387, "learning_rate": 0.0002, "epoch": 0.17460189152049146, "step": 1080}, {"loss": 0.8105, "grad_norm": 0.4631653428077698, "learning_rate": 0.0002, "epoch": 0.17621857570123675, "step": 1090}, {"loss": 0.8166, "grad_norm": 0.49169182777404785, "learning_rate": 0.0002, "epoch": 0.17783525988198207, "step": 1100}, {"loss": 0.8012, "grad_norm": 0.5019739270210266, "learning_rate": 0.0002, "epoch": 0.17945194406272735, "step": 1110}, {"loss": 0.8247, "grad_norm": 0.5100422501564026, "learning_rate": 0.0002, "epoch": 0.18106862824347264, "step": 1120}, {"loss": 0.8142, "grad_norm": 0.3888324499130249, "learning_rate": 0.0002, "epoch": 0.18268531242421793, "step": 1130}, {"loss": 0.8533, "grad_norm": 0.39765217900276184, "learning_rate": 0.0002, "epoch": 0.18430199660496321, "step": 1140}, {"loss": 0.8541, "grad_norm": 0.47190186381340027, "learning_rate": 0.0002, "epoch": 0.1859186807857085, "step": 1150}, {"loss": 0.8301, "grad_norm": 0.4464188814163208, "learning_rate": 0.0002, "epoch": 0.18753536496645382, "step": 1160}, {"loss": 0.8341, "grad_norm": 0.5153930187225342, "learning_rate": 0.0002, "epoch": 0.1891520491471991, "step": 1170}, {"loss": 0.8033, "grad_norm": 0.4779708683490753, "learning_rate": 0.0002, "epoch": 0.1907687333279444, "step": 1180}, {"loss": 0.8187, "grad_norm": 0.4834315776824951, "learning_rate": 0.0002, "epoch": 0.19238541750868968, "step": 1190}, {"loss": 0.7721, "grad_norm": 0.402357816696167, "learning_rate": 0.0002, "epoch": 0.19400210168943496, "step": 1200}, {"loss": 0.7941, "grad_norm": 0.45899084210395813, "learning_rate": 0.0002, "epoch": 0.19561878587018025, "step": 1210}, {"loss": 0.8353, "grad_norm": 0.5106529593467712, "learning_rate": 0.0002, "epoch": 0.19723547005092557, "step": 1220}, {"loss": 0.7816, "grad_norm": 0.45261722803115845, "learning_rate": 0.0002, "epoch": 0.19885215423167085, "step": 1230}, {"loss": 0.8068, "grad_norm": 0.4647127091884613, "learning_rate": 0.0002, "epoch": 0.20046883841241614, "step": 1240}, {"loss": 0.8239, "grad_norm": 0.4849368929862976, "learning_rate": 0.0002, "epoch": 0.20208552259316143, "step": 1250}, {"loss": 0.8514, "grad_norm": 0.4518061578273773, "learning_rate": 0.0002, "epoch": 0.2037022067739067, "step": 1260}, {"loss": 0.8158, "grad_norm": 0.49535325169563293, "learning_rate": 0.0002, "epoch": 0.205318890954652, "step": 1270}, {"loss": 0.8348, "grad_norm": 0.4835205376148224, "learning_rate": 0.0002, "epoch": 0.2069355751353973, "step": 1280}, {"loss": 0.8428, "grad_norm": 0.45308539271354675, "learning_rate": 0.0002, "epoch": 0.2085522593161426, "step": 1290}, {"loss": 0.7993, "grad_norm": 0.5369905233383179, "learning_rate": 0.0002, "epoch": 0.2101689434968879, "step": 1300}, {"loss": 0.8676, "grad_norm": 0.5031622052192688, "learning_rate": 0.0002, "epoch": 0.21178562767763318, "step": 1310}, {"loss": 0.7686, "grad_norm": 0.48010334372520447, "learning_rate": 0.0002, "epoch": 0.21340231185837846, "step": 1320}, {"loss": 0.806, "grad_norm": 0.4905701279640198, "learning_rate": 0.0002, "epoch": 0.21501899603912375, "step": 1330}, {"loss": 0.7885, "grad_norm": 0.43531742691993713, "learning_rate": 0.0002, "epoch": 0.21663568021986904, "step": 1340}, {"loss": 0.8191, "grad_norm": 0.44330692291259766, "learning_rate": 0.0002, "epoch": 0.21825236440061435, "step": 1350}, {"loss": 0.8205, "grad_norm": 0.5384416580200195, "learning_rate": 0.0002, "epoch": 0.21986904858135964, "step": 1360}, {"loss": 0.7726, "grad_norm": 0.4181833863258362, "learning_rate": 0.0002, "epoch": 0.22148573276210493, "step": 1370}, {"loss": 0.8311, "grad_norm": 0.523833692073822, "learning_rate": 0.0002, "epoch": 0.2231024169428502, "step": 1380}, {"loss": 0.7913, "grad_norm": 0.5528736710548401, "learning_rate": 0.0002, "epoch": 0.2247191011235955, "step": 1390}, {"loss": 0.8079, "grad_norm": 0.43515023589134216, "learning_rate": 0.0002, "epoch": 0.2263357853043408, "step": 1400}, {"loss": 0.8403, "grad_norm": 0.48809877038002014, "learning_rate": 0.0002, "epoch": 0.2279524694850861, "step": 1410}, {"loss": 0.8165, "grad_norm": 0.43591251969337463, "learning_rate": 0.0002, "epoch": 0.2295691536658314, "step": 1420}, {"loss": 0.8147, "grad_norm": 0.44625312089920044, "learning_rate": 0.0002, "epoch": 0.23118583784657668, "step": 1430}, {"loss": 0.8134, "grad_norm": 0.4390665292739868, "learning_rate": 0.0002, "epoch": 0.23280252202732196, "step": 1440}, {"loss": 0.8465, "grad_norm": 0.48496049642562866, "learning_rate": 0.0002, "epoch": 0.23441920620806725, "step": 1450}, {"loss": 0.775, "grad_norm": 0.45919957756996155, "learning_rate": 0.0002, "epoch": 0.23603589038881254, "step": 1460}, {"loss": 0.8659, "grad_norm": 0.5471845865249634, "learning_rate": 0.0002, "epoch": 0.23765257456955785, "step": 1470}, {"loss": 0.8164, "grad_norm": 0.47269317507743835, "learning_rate": 0.0002, "epoch": 0.23926925875030314, "step": 1480}, {"loss": 0.854, "grad_norm": 0.4930245578289032, "learning_rate": 0.0002, "epoch": 0.24088594293104842, "step": 1490}, {"loss": 0.8139, "grad_norm": 0.5605630278587341, "learning_rate": 0.0002, "epoch": 0.2425026271117937, "step": 1500}, {"loss": 0.8125, "grad_norm": 0.4435870945453644, "learning_rate": 0.0002, "epoch": 0.244119311292539, "step": 1510}, {"loss": 0.8123, "grad_norm": 0.4941999912261963, "learning_rate": 0.0002, "epoch": 0.24573599547328429, "step": 1520}, {"loss": 0.8427, "grad_norm": 0.5100624561309814, "learning_rate": 0.0002, "epoch": 0.24735267965402957, "step": 1530}, {"loss": 0.8405, "grad_norm": 0.4638267457485199, "learning_rate": 0.0002, "epoch": 0.2489693638347749, "step": 1540}, {"loss": 0.81, "grad_norm": 0.5071570873260498, "learning_rate": 0.0002, "epoch": 0.25058604801552015, "step": 1550}, {"loss": 0.7724, "grad_norm": 0.4291319251060486, "learning_rate": 0.0002, "epoch": 0.25220273219626543, "step": 1560}, {"loss": 0.7984, "grad_norm": 0.5388049483299255, "learning_rate": 0.0002, "epoch": 0.2538194163770108, "step": 1570}, {"loss": 0.8176, "grad_norm": 0.5083683729171753, "learning_rate": 0.0002, "epoch": 0.25543610055775606, "step": 1580}, {"loss": 0.843, "grad_norm": 0.4824463725090027, "learning_rate": 0.0002, "epoch": 0.25705278473850135, "step": 1590}, {"loss": 0.7996, "grad_norm": 0.41177722811698914, "learning_rate": 0.0002, "epoch": 0.25866946891924664, "step": 1600}, {"loss": 0.7772, "grad_norm": 0.5656219124794006, "learning_rate": 0.0002, "epoch": 0.2602861530999919, "step": 1610}, {"loss": 0.7955, "grad_norm": 0.41063204407691956, "learning_rate": 0.0002, "epoch": 0.2619028372807372, "step": 1620}, {"loss": 0.7998, "grad_norm": 0.4897061288356781, "learning_rate": 0.0002, "epoch": 0.2635195214614825, "step": 1630}, {"loss": 0.8198, "grad_norm": 0.4454376697540283, "learning_rate": 0.0002, "epoch": 0.2651362056422278, "step": 1640}, {"loss": 0.8684, "grad_norm": 0.4355238378047943, "learning_rate": 0.0002, "epoch": 0.26675288982297307, "step": 1650}, {"loss": 0.7801, "grad_norm": 0.458310067653656, "learning_rate": 0.0002, "epoch": 0.26836957400371836, "step": 1660}, {"loss": 0.7935, "grad_norm": 0.4752083718776703, "learning_rate": 0.0002, "epoch": 0.26998625818446365, "step": 1670}, {"loss": 0.8267, "grad_norm": 0.4666106402873993, "learning_rate": 0.0002, "epoch": 0.27160294236520893, "step": 1680}, {"loss": 0.8252, "grad_norm": 0.4213818609714508, "learning_rate": 0.0002, "epoch": 0.2732196265459543, "step": 1690}, {"loss": 0.8559, "grad_norm": 0.5768913626670837, "learning_rate": 0.0002, "epoch": 0.27483631072669956, "step": 1700}, {"loss": 0.7931, "grad_norm": 0.4209914803504944, "learning_rate": 0.0002, "epoch": 0.27645299490744485, "step": 1710}, {"loss": 0.8167, "grad_norm": 0.501909613609314, "learning_rate": 0.0002, "epoch": 0.27806967908819014, "step": 1720}, {"loss": 0.7832, "grad_norm": 0.5266261100769043, "learning_rate": 0.0002, "epoch": 0.2796863632689354, "step": 1730}, {"loss": 0.8102, "grad_norm": 0.43806859850883484, "learning_rate": 0.0002, "epoch": 0.2813030474496807, "step": 1740}, {"loss": 0.8157, "grad_norm": 0.46048814058303833, "learning_rate": 0.0002, "epoch": 0.282919731630426, "step": 1750}, {"loss": 0.8596, "grad_norm": 0.44972819089889526, "learning_rate": 0.0002, "epoch": 0.2845364158111713, "step": 1760}, {"loss": 0.8421, "grad_norm": 0.5114831328392029, "learning_rate": 0.0002, "epoch": 0.28615309999191657, "step": 1770}, {"loss": 0.8361, "grad_norm": 0.47931742668151855, "learning_rate": 0.0002, "epoch": 0.28776978417266186, "step": 1780}, {"loss": 0.8265, "grad_norm": 0.5092599987983704, "learning_rate": 0.0002, "epoch": 0.28938646835340714, "step": 1790}, {"loss": 0.8506, "grad_norm": 0.37581443786621094, "learning_rate": 0.0002, "epoch": 0.29100315253415243, "step": 1800}, {"loss": 0.7932, "grad_norm": 0.47097381949424744, "learning_rate": 0.0002, "epoch": 0.2926198367148977, "step": 1810}, {"loss": 0.7787, "grad_norm": 0.48300236463546753, "learning_rate": 0.0002, "epoch": 0.29423652089564306, "step": 1820}, {"loss": 0.8391, "grad_norm": 0.5600419640541077, "learning_rate": 0.0002, "epoch": 0.29585320507638835, "step": 1830}, {"loss": 0.8507, "grad_norm": 0.48555272817611694, "learning_rate": 0.0002, "epoch": 0.29746988925713364, "step": 1840}, {"loss": 0.7657, "grad_norm": 0.3752668499946594, "learning_rate": 0.0002, "epoch": 0.2990865734378789, "step": 1850}, {"loss": 0.7915, "grad_norm": 0.5328747034072876, "learning_rate": 0.0002, "epoch": 0.3007032576186242, "step": 1860}, {"loss": 0.8426, "grad_norm": 0.48716455698013306, "learning_rate": 0.0002, "epoch": 0.3023199417993695, "step": 1870}, {"loss": 0.8335, "grad_norm": 0.5011493563652039, "learning_rate": 0.0002, "epoch": 0.3039366259801148, "step": 1880}, {"loss": 0.852, "grad_norm": 0.46461427211761475, "learning_rate": 0.0002, "epoch": 0.30555331016086007, "step": 1890}, {"loss": 0.8478, "grad_norm": 0.36630210280418396, "learning_rate": 0.0002, "epoch": 0.30716999434160536, "step": 1900}, {"loss": 0.8162, "grad_norm": 0.4217296242713928, "learning_rate": 0.0002, "epoch": 0.30878667852235064, "step": 1910}, {"loss": 0.8128, "grad_norm": 0.4394875466823578, "learning_rate": 0.0002, "epoch": 0.31040336270309593, "step": 1920}, {"loss": 0.8471, "grad_norm": 0.6587965488433838, "learning_rate": 0.0002, "epoch": 0.3120200468838412, "step": 1930}, {"loss": 0.8565, "grad_norm": 0.5469298958778381, "learning_rate": 0.0002, "epoch": 0.31363673106458656, "step": 1940}, {"loss": 0.8236, "grad_norm": 0.4371595084667206, "learning_rate": 0.0002, "epoch": 0.31525341524533185, "step": 1950}, {"loss": 0.887, "grad_norm": 0.4809541404247284, "learning_rate": 0.0002, "epoch": 0.31687009942607713, "step": 1960}, {"loss": 0.7855, "grad_norm": 0.6061086654663086, "learning_rate": 0.0002, "epoch": 0.3184867836068224, "step": 1970}, {"loss": 0.7679, "grad_norm": 0.5342657566070557, "learning_rate": 0.0002, "epoch": 0.3201034677875677, "step": 1980}, {"loss": 0.7955, "grad_norm": 0.5057743787765503, "learning_rate": 0.0002, "epoch": 0.321720151968313, "step": 1990}, {"loss": 0.7774, "grad_norm": 0.528626024723053, "learning_rate": 0.0002, "epoch": 0.3233368361490583, "step": 2000}, {"loss": 0.8845, "grad_norm": 0.46742770075798035, "learning_rate": 0.0002, "epoch": 0.32495352032980357, "step": 2010}, {"loss": 0.8484, "grad_norm": 0.515101432800293, "learning_rate": 0.0002, "epoch": 0.32657020451054886, "step": 2020}, {"loss": 0.8139, "grad_norm": 0.41941216588020325, "learning_rate": 0.0002, "epoch": 0.32818688869129414, "step": 2030}, {"loss": 0.7637, "grad_norm": 0.49902522563934326, "learning_rate": 0.0002, "epoch": 0.32980357287203943, "step": 2040}, {"loss": 0.7822, "grad_norm": 0.4120897650718689, "learning_rate": 0.0002, "epoch": 0.3314202570527847, "step": 2050}, {"loss": 0.8057, "grad_norm": 0.45352041721343994, "learning_rate": 0.0002, "epoch": 0.33303694123353, "step": 2060}, {"loss": 0.7913, "grad_norm": 0.523199737071991, "learning_rate": 0.0002, "epoch": 0.33465362541427535, "step": 2070}, {"loss": 0.8036, "grad_norm": 0.4390358626842499, "learning_rate": 0.0002, "epoch": 0.33627030959502063, "step": 2080}, {"loss": 0.8145, "grad_norm": 0.6752901077270508, "learning_rate": 0.0002, "epoch": 0.3378869937757659, "step": 2090}, {"loss": 0.7807, "grad_norm": 0.547821044921875, "learning_rate": 0.0002, "epoch": 0.3395036779565112, "step": 2100}, {"loss": 0.8561, "grad_norm": 0.5161308646202087, "learning_rate": 0.0002, "epoch": 0.3411203621372565, "step": 2110}, {"loss": 0.7697, "grad_norm": 0.4565401077270508, "learning_rate": 0.0002, "epoch": 0.3427370463180018, "step": 2120}, {"loss": 0.7964, "grad_norm": 0.4666115939617157, "learning_rate": 0.0002, "epoch": 0.34435373049874707, "step": 2130}, {"loss": 0.8189, "grad_norm": 0.4090428352355957, "learning_rate": 0.0002, "epoch": 0.34597041467949236, "step": 2140}, {"loss": 0.8817, "grad_norm": 0.510845422744751, "learning_rate": 0.0002, "epoch": 0.34758709886023764, "step": 2150}, {"loss": 0.8398, "grad_norm": 0.42861923575401306, "learning_rate": 0.0002, "epoch": 0.34920378304098293, "step": 2160}, {"loss": 0.7716, "grad_norm": 0.4476332664489746, "learning_rate": 0.0002, "epoch": 0.3508204672217282, "step": 2170}, {"loss": 0.7845, "grad_norm": 0.6065791249275208, "learning_rate": 0.0002, "epoch": 0.3524371514024735, "step": 2180}, {"loss": 0.8187, "grad_norm": 0.42335066199302673, "learning_rate": 0.0002, "epoch": 0.35405383558321885, "step": 2190}, {"loss": 0.8239, "grad_norm": 0.5094629526138306, "learning_rate": 0.0002, "epoch": 0.35567051976396413, "step": 2200}, {"loss": 0.7807, "grad_norm": 0.5476373434066772, "learning_rate": 0.0002, "epoch": 0.3572872039447094, "step": 2210}, {"loss": 0.814, "grad_norm": 0.3911719024181366, "learning_rate": 0.0002, "epoch": 0.3589038881254547, "step": 2220}, {"loss": 0.8599, "grad_norm": 0.6599636077880859, "learning_rate": 0.0002, "epoch": 0.3605205723062, "step": 2230}, {"loss": 0.7482, "grad_norm": 0.40381914377212524, "learning_rate": 0.0002, "epoch": 0.3621372564869453, "step": 2240}, {"loss": 0.7772, "grad_norm": 0.4433908462524414, "learning_rate": 0.0002, "epoch": 0.36375394066769057, "step": 2250}, {"loss": 0.8503, "grad_norm": 0.578326940536499, "learning_rate": 0.0002, "epoch": 0.36537062484843585, "step": 2260}, {"loss": 0.8178, "grad_norm": 0.5734784007072449, "learning_rate": 0.0002, "epoch": 0.36698730902918114, "step": 2270}, {"loss": 0.8193, "grad_norm": 0.45555487275123596, "learning_rate": 0.0002, "epoch": 0.36860399320992643, "step": 2280}, {"loss": 0.7929, "grad_norm": 0.5666276216506958, "learning_rate": 0.0002, "epoch": 0.3702206773906717, "step": 2290}, {"loss": 0.8292, "grad_norm": 0.5461117625236511, "learning_rate": 0.0002, "epoch": 0.371837361571417, "step": 2300}, {"loss": 0.8204, "grad_norm": 0.6318911910057068, "learning_rate": 0.0002, "epoch": 0.3734540457521623, "step": 2310}, {"loss": 0.7964, "grad_norm": 0.493263304233551, "learning_rate": 0.0002, "epoch": 0.37507072993290763, "step": 2320}, {"loss": 0.8339, "grad_norm": 0.5888760089874268, "learning_rate": 0.0002, "epoch": 0.3766874141136529, "step": 2330}, {"loss": 0.7737, "grad_norm": 0.48671841621398926, "learning_rate": 0.0002, "epoch": 0.3783040982943982, "step": 2340}, {"loss": 0.8367, "grad_norm": 0.4385145306587219, "learning_rate": 0.0002, "epoch": 0.3799207824751435, "step": 2350}, {"loss": 0.812, "grad_norm": 0.5523318648338318, "learning_rate": 0.0002, "epoch": 0.3815374666558888, "step": 2360}, {"loss": 0.8351, "grad_norm": 0.7308220267295837, "learning_rate": 0.0002, "epoch": 0.38315415083663407, "step": 2370}, {"loss": 0.859, "grad_norm": 0.554214358329773, "learning_rate": 0.0002, "epoch": 0.38477083501737935, "step": 2380}, {"loss": 0.8146, "grad_norm": 0.5425800085067749, "learning_rate": 0.0002, "epoch": 0.38638751919812464, "step": 2390}, {"loss": 0.8282, "grad_norm": 0.48811158537864685, "learning_rate": 0.0002, "epoch": 0.3880042033788699, "step": 2400}, {"loss": 0.8074, "grad_norm": 0.49212366342544556, "learning_rate": 0.0002, "epoch": 0.3896208875596152, "step": 2410}, {"loss": 0.7991, "grad_norm": 0.5222218632698059, "learning_rate": 0.0002, "epoch": 0.3912375717403605, "step": 2420}, {"loss": 0.8182, "grad_norm": 0.4699819087982178, "learning_rate": 0.0002, "epoch": 0.3928542559211058, "step": 2430}, {"loss": 0.7919, "grad_norm": 0.46153587102890015, "learning_rate": 0.0002, "epoch": 0.39447094010185113, "step": 2440}, {"loss": 0.8111, "grad_norm": 0.4150611162185669, "learning_rate": 0.0002, "epoch": 0.3960876242825964, "step": 2450}, {"loss": 0.8589, "grad_norm": 0.5799614787101746, "learning_rate": 0.0002, "epoch": 0.3977043084633417, "step": 2460}, {"loss": 0.8085, "grad_norm": 0.56536865234375, "learning_rate": 0.0002, "epoch": 0.399320992644087, "step": 2470}, {"loss": 0.8022, "grad_norm": 0.5451247096061707, "learning_rate": 0.0002, "epoch": 0.4009376768248323, "step": 2480}, {"loss": 0.8217, "grad_norm": 0.5914521217346191, "learning_rate": 0.0002, "epoch": 0.40255436100557757, "step": 2490}, {"loss": 0.7859, "grad_norm": 0.4428117275238037, "learning_rate": 0.0002, "epoch": 0.40417104518632285, "step": 2500}, {"loss": 0.8054, "grad_norm": 0.48580947518348694, "learning_rate": 0.0002, "epoch": 0.40578772936706814, "step": 2510}, {"loss": 0.8405, "grad_norm": 0.436734676361084, "learning_rate": 0.0002, "epoch": 0.4074044135478134, "step": 2520}, {"loss": 0.8209, "grad_norm": 0.5752223134040833, "learning_rate": 0.0002, "epoch": 0.4090210977285587, "step": 2530}, {"loss": 0.8181, "grad_norm": 0.4271308183670044, "learning_rate": 0.0002, "epoch": 0.410637781909304, "step": 2540}, {"loss": 0.8058, "grad_norm": 0.46294718980789185, "learning_rate": 0.0002, "epoch": 0.4122544660900493, "step": 2550}, {"loss": 0.8473, "grad_norm": 0.49407583475112915, "learning_rate": 0.0002, "epoch": 0.4138711502707946, "step": 2560}, {"loss": 0.7881, "grad_norm": 0.4729035496711731, "learning_rate": 0.0002, "epoch": 0.4154878344515399, "step": 2570}, {"loss": 0.7834, "grad_norm": 0.4129747152328491, "learning_rate": 0.0002, "epoch": 0.4171045186322852, "step": 2580}, {"loss": 0.7859, "grad_norm": 0.5684236288070679, "learning_rate": 0.0002, "epoch": 0.4187212028130305, "step": 2590}, {"loss": 0.811, "grad_norm": 0.4862157106399536, "learning_rate": 0.0002, "epoch": 0.4203378869937758, "step": 2600}, {"loss": 0.7582, "grad_norm": 0.46567976474761963, "learning_rate": 0.0002, "epoch": 0.42195457117452106, "step": 2610}, {"loss": 0.7755, "grad_norm": 0.5710650682449341, "learning_rate": 0.0002, "epoch": 0.42357125535526635, "step": 2620}, {"loss": 0.8573, "grad_norm": 0.5660041570663452, "learning_rate": 0.0002, "epoch": 0.42518793953601164, "step": 2630}, {"loss": 0.7812, "grad_norm": 0.47944375872612, "learning_rate": 0.0002, "epoch": 0.4268046237167569, "step": 2640}, {"loss": 0.7459, "grad_norm": 0.537223756313324, "learning_rate": 0.0002, "epoch": 0.4284213078975022, "step": 2650}, {"loss": 0.8246, "grad_norm": 0.41669997572898865, "learning_rate": 0.0002, "epoch": 0.4300379920782475, "step": 2660}, {"loss": 0.7785, "grad_norm": 0.44727686047554016, "learning_rate": 0.0002, "epoch": 0.4316546762589928, "step": 2670}, {"loss": 0.8241, "grad_norm": 0.5600888729095459, "learning_rate": 0.0002, "epoch": 0.4332713604397381, "step": 2680}, {"loss": 0.7708, "grad_norm": 0.39820605516433716, "learning_rate": 0.0002, "epoch": 0.4348880446204834, "step": 2690}, {"loss": 0.8202, "grad_norm": 0.5637655854225159, "learning_rate": 0.0002, "epoch": 0.4365047288012287, "step": 2700}, {"loss": 0.855, "grad_norm": 0.6363666653633118, "learning_rate": 0.0002, "epoch": 0.438121412981974, "step": 2710}, {"loss": 0.8468, "grad_norm": 0.5656129121780396, "learning_rate": 0.0002, "epoch": 0.4397380971627193, "step": 2720}, {"loss": 0.7845, "grad_norm": 0.5600156188011169, "learning_rate": 0.0002, "epoch": 0.44135478134346456, "step": 2730}, {"loss": 0.8405, "grad_norm": 0.5506579875946045, "learning_rate": 0.0002, "epoch": 0.44297146552420985, "step": 2740}, {"loss": 0.7725, "grad_norm": 0.49878305196762085, "learning_rate": 0.0002, "epoch": 0.44458814970495514, "step": 2750}, {"loss": 0.8292, "grad_norm": 0.4569213092327118, "learning_rate": 0.0002, "epoch": 0.4462048338857004, "step": 2760}, {"loss": 0.8028, "grad_norm": 0.6056680083274841, "learning_rate": 0.0002, "epoch": 0.4478215180664457, "step": 2770}, {"loss": 0.8242, "grad_norm": 0.44474557042121887, "learning_rate": 0.0002, "epoch": 0.449438202247191, "step": 2780}, {"loss": 0.801, "grad_norm": 0.46055394411087036, "learning_rate": 0.0002, "epoch": 0.4510548864279363, "step": 2790}, {"loss": 0.7521, "grad_norm": 0.4904133379459381, "learning_rate": 0.0002, "epoch": 0.4526715706086816, "step": 2800}, {"loss": 0.8829, "grad_norm": 0.5647031664848328, "learning_rate": 0.0002, "epoch": 0.45428825478942686, "step": 2810}, {"loss": 0.8622, "grad_norm": 0.5759473443031311, "learning_rate": 0.0002, "epoch": 0.4559049389701722, "step": 2820}, {"loss": 0.7812, "grad_norm": 0.5161895751953125, "learning_rate": 0.0002, "epoch": 0.4575216231509175, "step": 2830}, {"loss": 0.8045, "grad_norm": 0.4248254597187042, "learning_rate": 0.0002, "epoch": 0.4591383073316628, "step": 2840}, {"loss": 0.7838, "grad_norm": 0.45395001769065857, "learning_rate": 0.0002, "epoch": 0.46075499151240806, "step": 2850}, {"loss": 0.8208, "grad_norm": 0.5358697772026062, "learning_rate": 0.0002, "epoch": 0.46237167569315335, "step": 2860}, {"loss": 0.8147, "grad_norm": 0.5379165410995483, "learning_rate": 0.0002, "epoch": 0.46398835987389864, "step": 2870}, {"loss": 0.7403, "grad_norm": 0.4601989686489105, "learning_rate": 0.0002, "epoch": 0.4656050440546439, "step": 2880}, {"loss": 0.8523, "grad_norm": 0.671115517616272, "learning_rate": 0.0002, "epoch": 0.4672217282353892, "step": 2890}, {"loss": 0.8262, "grad_norm": 0.4425133168697357, "learning_rate": 0.0002, "epoch": 0.4688384124161345, "step": 2900}, {"loss": 0.8178, "grad_norm": 0.5446155071258545, "learning_rate": 0.0002, "epoch": 0.4704550965968798, "step": 2910}, {"loss": 0.8106, "grad_norm": 0.603306233882904, "learning_rate": 0.0002, "epoch": 0.47207178077762507, "step": 2920}, {"loss": 0.8044, "grad_norm": 0.5377997159957886, "learning_rate": 0.0002, "epoch": 0.47368846495837036, "step": 2930}, {"loss": 0.8075, "grad_norm": 0.4931027591228485, "learning_rate": 0.0002, "epoch": 0.4753051491391157, "step": 2940}, {"loss": 0.8004, "grad_norm": 0.4711960256099701, "learning_rate": 0.0002, "epoch": 0.476921833319861, "step": 2950}, {"loss": 0.8121, "grad_norm": 0.5020492672920227, "learning_rate": 0.0002, "epoch": 0.4785385175006063, "step": 2960}, {"loss": 0.8221, "grad_norm": 0.5428946614265442, "learning_rate": 0.0002, "epoch": 0.48015520168135156, "step": 2970}, {"loss": 0.7849, "grad_norm": 0.5294089317321777, "learning_rate": 0.0002, "epoch": 0.48177188586209685, "step": 2980}, {"loss": 0.8553, "grad_norm": 0.648289144039154, "learning_rate": 0.0002, "epoch": 0.48338857004284214, "step": 2990}, {"loss": 0.7874, "grad_norm": 0.47916680574417114, "learning_rate": 0.0002, "epoch": 0.4850052542235874, "step": 3000}, {"loss": 0.8087, "grad_norm": 0.43849772214889526, "learning_rate": 0.0002, "epoch": 0.4866219384043327, "step": 3010}, {"loss": 0.7662, "grad_norm": 0.47007861733436584, "learning_rate": 0.0002, "epoch": 0.488238622585078, "step": 3020}, {"loss": 0.757, "grad_norm": 0.6314331293106079, "learning_rate": 0.0002, "epoch": 0.4898553067658233, "step": 3030}, {"loss": 0.7863, "grad_norm": 0.49211493134498596, "learning_rate": 0.0002, "epoch": 0.49147199094656857, "step": 3040}, {"loss": 0.8335, "grad_norm": 0.4537973403930664, "learning_rate": 0.0002, "epoch": 0.49308867512731386, "step": 3050}, {"loss": 0.8095, "grad_norm": 0.47326919436454773, "learning_rate": 0.0002, "epoch": 0.49470535930805914, "step": 3060}, {"loss": 0.8447, "grad_norm": 0.525874137878418, "learning_rate": 0.0002, "epoch": 0.4963220434888045, "step": 3070}, {"loss": 0.8339, "grad_norm": 0.6361091732978821, "learning_rate": 0.0002, "epoch": 0.4979387276695498, "step": 3080}, {"loss": 0.821, "grad_norm": 0.5850642919540405, "learning_rate": 0.0002, "epoch": 0.49955541185029506, "step": 3090}, {"loss": 0.8279, "grad_norm": 0.47299543023109436, "learning_rate": 0.0002, "epoch": 0.5011720960310403, "step": 3100}, {"loss": 0.8681, "grad_norm": 0.473099946975708, "learning_rate": 0.0002, "epoch": 0.5027887802117856, "step": 3110}, {"loss": 0.8223, "grad_norm": 0.48186397552490234, "learning_rate": 0.0002, "epoch": 0.5044054643925309, "step": 3120}, {"loss": 0.8292, "grad_norm": 0.5015401840209961, "learning_rate": 0.0002, "epoch": 0.5060221485732762, "step": 3130}, {"loss": 0.7692, "grad_norm": 0.5617750287055969, "learning_rate": 0.0002, "epoch": 0.5076388327540216, "step": 3140}, {"loss": 0.8708, "grad_norm": 0.5169327259063721, "learning_rate": 0.0002, "epoch": 0.5092555169347668, "step": 3150}, {"loss": 0.7845, "grad_norm": 0.545657753944397, "learning_rate": 0.0002, "epoch": 0.5108722011155121, "step": 3160}, {"loss": 0.799, "grad_norm": 0.512864351272583, "learning_rate": 0.0002, "epoch": 0.5124888852962574, "step": 3170}, {"loss": 0.7794, "grad_norm": 0.4113546311855316, "learning_rate": 0.0002, "epoch": 0.5141055694770027, "step": 3180}, {"loss": 0.8206, "grad_norm": 0.44532445073127747, "learning_rate": 0.0002, "epoch": 0.5157222536577479, "step": 3190}, {"loss": 0.8213, "grad_norm": 0.5623497366905212, "learning_rate": 0.0002, "epoch": 0.5173389378384933, "step": 3200}, {"loss": 0.7928, "grad_norm": 0.5084741115570068, "learning_rate": 0.0002, "epoch": 0.5189556220192385, "step": 3210}, {"loss": 0.8174, "grad_norm": 0.5305403470993042, "learning_rate": 0.0002, "epoch": 0.5205723061999838, "step": 3220}, {"loss": 0.8139, "grad_norm": 0.4708254337310791, "learning_rate": 0.0002, "epoch": 0.5221889903807291, "step": 3230}, {"loss": 0.7639, "grad_norm": 0.43827131390571594, "learning_rate": 0.0002, "epoch": 0.5238056745614744, "step": 3240}, {"loss": 0.7993, "grad_norm": 0.5630002617835999, "learning_rate": 0.0002, "epoch": 0.5254223587422197, "step": 3250}, {"loss": 0.7522, "grad_norm": 0.5010961890220642, "learning_rate": 0.0002, "epoch": 0.527039042922965, "step": 3260}, {"loss": 0.8374, "grad_norm": 0.6303122043609619, "learning_rate": 0.0002, "epoch": 0.5286557271037103, "step": 3270}, {"loss": 0.7727, "grad_norm": 0.5107331275939941, "learning_rate": 0.0002, "epoch": 0.5302724112844556, "step": 3280}, {"loss": 0.8495, "grad_norm": 0.5700443387031555, "learning_rate": 0.0002, "epoch": 0.5318890954652009, "step": 3290}, {"loss": 0.7776, "grad_norm": 0.46296367049217224, "learning_rate": 0.0002, "epoch": 0.5335057796459461, "step": 3300}, {"loss": 0.7931, "grad_norm": 0.531568706035614, "learning_rate": 0.0002, "epoch": 0.5351224638266915, "step": 3310}, {"loss": 0.843, "grad_norm": 0.4686741530895233, "learning_rate": 0.0002, "epoch": 0.5367391480074367, "step": 3320}, {"loss": 0.8104, "grad_norm": 0.5404331088066101, "learning_rate": 0.0002, "epoch": 0.5383558321881821, "step": 3330}, {"loss": 0.7686, "grad_norm": 0.6368790864944458, "learning_rate": 0.0002, "epoch": 0.5399725163689273, "step": 3340}, {"loss": 0.8514, "grad_norm": 0.42300888895988464, "learning_rate": 0.0002, "epoch": 0.5415892005496726, "step": 3350}, {"loss": 0.8236, "grad_norm": 0.5362542867660522, "learning_rate": 0.0002, "epoch": 0.5432058847304179, "step": 3360}, {"loss": 0.858, "grad_norm": 0.497128963470459, "learning_rate": 0.0002, "epoch": 0.5448225689111632, "step": 3370}, {"loss": 0.8519, "grad_norm": 0.5006386041641235, "learning_rate": 0.0002, "epoch": 0.5464392530919085, "step": 3380}, {"loss": 0.7867, "grad_norm": 0.44136837124824524, "learning_rate": 0.0002, "epoch": 0.5480559372726538, "step": 3390}, {"loss": 0.773, "grad_norm": 0.5897833108901978, "learning_rate": 0.0002, "epoch": 0.5496726214533991, "step": 3400}, {"loss": 0.8895, "grad_norm": 0.641075611114502, "learning_rate": 0.0002, "epoch": 0.5512893056341444, "step": 3410}, {"loss": 0.7827, "grad_norm": 0.7251322269439697, "learning_rate": 0.0002, "epoch": 0.5529059898148897, "step": 3420}, {"loss": 0.7626, "grad_norm": 0.47411349415779114, "learning_rate": 0.0002, "epoch": 0.5545226739956349, "step": 3430}, {"loss": 0.8196, "grad_norm": 0.4994310438632965, "learning_rate": 0.0002, "epoch": 0.5561393581763803, "step": 3440}, {"loss": 0.7812, "grad_norm": 0.5814438462257385, "learning_rate": 0.0002, "epoch": 0.5577560423571255, "step": 3450}, {"loss": 0.8805, "grad_norm": 0.6278898119926453, "learning_rate": 0.0002, "epoch": 0.5593727265378708, "step": 3460}, {"loss": 0.813, "grad_norm": 0.46208274364471436, "learning_rate": 0.0002, "epoch": 0.5609894107186161, "step": 3470}, {"loss": 0.8295, "grad_norm": 0.5718930959701538, "learning_rate": 0.0002, "epoch": 0.5626060948993614, "step": 3480}, {"loss": 0.8152, "grad_norm": 0.48178744316101074, "learning_rate": 0.0002, "epoch": 0.5642227790801067, "step": 3490}, {"loss": 0.8244, "grad_norm": 0.47336965799331665, "learning_rate": 0.0002, "epoch": 0.565839463260852, "step": 3500}, {"loss": 0.8099, "grad_norm": 0.43442684412002563, "learning_rate": 0.0002, "epoch": 0.5674561474415973, "step": 3510}, {"loss": 0.7564, "grad_norm": 0.6463358998298645, "learning_rate": 0.0002, "epoch": 0.5690728316223426, "step": 3520}, {"loss": 0.836, "grad_norm": 0.5286486744880676, "learning_rate": 0.0002, "epoch": 0.5706895158030879, "step": 3530}, {"loss": 0.8421, "grad_norm": 0.5405499935150146, "learning_rate": 0.0002, "epoch": 0.5723061999838331, "step": 3540}, {"loss": 0.7614, "grad_norm": 0.6654391884803772, "learning_rate": 0.0002, "epoch": 0.5739228841645785, "step": 3550}, {"loss": 0.7803, "grad_norm": 0.5081980228424072, "learning_rate": 0.0002, "epoch": 0.5755395683453237, "step": 3560}, {"loss": 0.7753, "grad_norm": 0.48978179693222046, "learning_rate": 0.0002, "epoch": 0.5771562525260691, "step": 3570}, {"loss": 0.8151, "grad_norm": 0.5840612053871155, "learning_rate": 0.0002, "epoch": 0.5787729367068143, "step": 3580}, {"loss": 0.8937, "grad_norm": 0.5235261917114258, "learning_rate": 0.0002, "epoch": 0.5803896208875596, "step": 3590}, {"loss": 0.7894, "grad_norm": 0.5672075748443604, "learning_rate": 0.0002, "epoch": 0.5820063050683049, "step": 3600}, {"loss": 0.8347, "grad_norm": 0.5613429546356201, "learning_rate": 0.0002, "epoch": 0.5836229892490502, "step": 3610}, {"loss": 0.8274, "grad_norm": 0.4032273590564728, "learning_rate": 0.0002, "epoch": 0.5852396734297954, "step": 3620}, {"loss": 0.8421, "grad_norm": 0.49559324979782104, "learning_rate": 0.0002, "epoch": 0.5868563576105408, "step": 3630}, {"loss": 0.8332, "grad_norm": 0.6895697712898254, "learning_rate": 0.0002, "epoch": 0.5884730417912861, "step": 3640}, {"loss": 0.7877, "grad_norm": 0.4750136435031891, "learning_rate": 0.0002, "epoch": 0.5900897259720314, "step": 3650}, {"loss": 0.8219, "grad_norm": 0.5176819562911987, "learning_rate": 0.0002, "epoch": 0.5917064101527767, "step": 3660}, {"loss": 0.8151, "grad_norm": 0.5817760229110718, "learning_rate": 0.0002, "epoch": 0.5933230943335219, "step": 3670}, {"loss": 0.7823, "grad_norm": 0.6064626574516296, "learning_rate": 0.0002, "epoch": 0.5949397785142673, "step": 3680}, {"loss": 0.8422, "grad_norm": 0.6728700995445251, "learning_rate": 0.0002, "epoch": 0.5965564626950125, "step": 3690}, {"loss": 0.7679, "grad_norm": 0.609305202960968, "learning_rate": 0.0002, "epoch": 0.5981731468757578, "step": 3700}, {"loss": 0.8048, "grad_norm": 0.4615488350391388, "learning_rate": 0.0002, "epoch": 0.5997898310565031, "step": 3710}, {"loss": 0.8214, "grad_norm": 2.0531179904937744, "learning_rate": 0.0002, "epoch": 0.6014065152372484, "step": 3720}, {"loss": 0.8158, "grad_norm": 0.5091132521629333, "learning_rate": 0.0002, "epoch": 0.6030231994179936, "step": 3730}, {"loss": 0.7833, "grad_norm": 0.5951124429702759, "learning_rate": 0.0002, "epoch": 0.604639883598739, "step": 3740}, {"loss": 0.7784, "grad_norm": 0.5870208144187927, "learning_rate": 0.0002, "epoch": 0.6062565677794842, "step": 3750}, {"loss": 0.8044, "grad_norm": 0.6254619359970093, "learning_rate": 0.0002, "epoch": 0.6078732519602296, "step": 3760}, {"loss": 0.7868, "grad_norm": 0.5577626824378967, "learning_rate": 0.0002, "epoch": 0.6094899361409749, "step": 3770}, {"loss": 0.8108, "grad_norm": 0.5004405379295349, "learning_rate": 0.0002, "epoch": 0.6111066203217201, "step": 3780}, {"loss": 0.8092, "grad_norm": 0.5527383685112, "learning_rate": 0.0002, "epoch": 0.6127233045024655, "step": 3790}, {"loss": 0.8036, "grad_norm": 0.49116113781929016, "learning_rate": 0.0002, "epoch": 0.6143399886832107, "step": 3800}, {"loss": 0.8352, "grad_norm": 0.5299299359321594, "learning_rate": 0.0002, "epoch": 0.6159566728639561, "step": 3810}, {"loss": 0.7737, "grad_norm": 0.464897483587265, "learning_rate": 0.0002, "epoch": 0.6175733570447013, "step": 3820}, {"loss": 0.7923, "grad_norm": 0.6505740880966187, "learning_rate": 0.0002, "epoch": 0.6191900412254466, "step": 3830}, {"loss": 0.8123, "grad_norm": 0.5512559413909912, "learning_rate": 0.0002, "epoch": 0.6208067254061919, "step": 3840}, {"loss": 0.8856, "grad_norm": 0.49427518248558044, "learning_rate": 0.0002, "epoch": 0.6224234095869372, "step": 3850}, {"loss": 0.7751, "grad_norm": 0.3839147090911865, "learning_rate": 0.0002, "epoch": 0.6240400937676824, "step": 3860}, {"loss": 0.8006, "grad_norm": 0.5760218501091003, "learning_rate": 0.0002, "epoch": 0.6256567779484278, "step": 3870}, {"loss": 0.7836, "grad_norm": 0.7226507067680359, "learning_rate": 0.0002, "epoch": 0.6272734621291731, "step": 3880}, {"loss": 0.8244, "grad_norm": 0.676781415939331, "learning_rate": 0.0002, "epoch": 0.6288901463099184, "step": 3890}, {"loss": 0.8239, "grad_norm": 0.4284018278121948, "learning_rate": 0.0002, "epoch": 0.6305068304906637, "step": 3900}, {"loss": 0.7996, "grad_norm": 0.5060628056526184, "learning_rate": 0.0002, "epoch": 0.6321235146714089, "step": 3910}, {"loss": 0.8089, "grad_norm": 0.5524522066116333, "learning_rate": 0.0002, "epoch": 0.6337401988521543, "step": 3920}, {"loss": 0.8276, "grad_norm": 0.6099881529808044, "learning_rate": 0.0002, "epoch": 0.6353568830328995, "step": 3930}, {"loss": 0.809, "grad_norm": 0.43155938386917114, "learning_rate": 0.0002, "epoch": 0.6369735672136448, "step": 3940}, {"loss": 0.8404, "grad_norm": 0.6427084803581238, "learning_rate": 0.0002, "epoch": 0.6385902513943901, "step": 3950}, {"loss": 0.8368, "grad_norm": 0.541220486164093, "learning_rate": 0.0002, "epoch": 0.6402069355751354, "step": 3960}, {"loss": 0.8539, "grad_norm": 0.5414294600486755, "learning_rate": 0.0002, "epoch": 0.6418236197558806, "step": 3970}, {"loss": 0.7996, "grad_norm": 0.46344003081321716, "learning_rate": 0.0002, "epoch": 0.643440303936626, "step": 3980}, {"loss": 0.7474, "grad_norm": 0.45209285616874695, "learning_rate": 0.0002, "epoch": 0.6450569881173712, "step": 3990}, {"loss": 0.8202, "grad_norm": 0.5417284369468689, "learning_rate": 0.0002, "epoch": 0.6466736722981166, "step": 4000}, {"loss": 0.7563, "grad_norm": 0.7995685935020447, "learning_rate": 0.0002, "epoch": 0.6482903564788619, "step": 4010}, {"loss": 0.7812, "grad_norm": 0.6384002566337585, "learning_rate": 0.0002, "epoch": 0.6499070406596071, "step": 4020}, {"loss": 0.732, "grad_norm": 0.4472815692424774, "learning_rate": 0.0002, "epoch": 0.6515237248403525, "step": 4030}, {"loss": 0.8071, "grad_norm": 0.6834294199943542, "learning_rate": 0.0002, "epoch": 0.6531404090210977, "step": 4040}, {"loss": 0.7812, "grad_norm": 0.4612339735031128, "learning_rate": 0.0002, "epoch": 0.654757093201843, "step": 4050}, {"loss": 0.8141, "grad_norm": 0.9266576170921326, "learning_rate": 0.0002, "epoch": 0.6563737773825883, "step": 4060}, {"loss": 0.7991, "grad_norm": 0.4470861852169037, "learning_rate": 0.0002, "epoch": 0.6579904615633336, "step": 4070}, {"loss": 0.8293, "grad_norm": 0.45544925332069397, "learning_rate": 0.0002, "epoch": 0.6596071457440789, "step": 4080}, {"loss": 0.8455, "grad_norm": 0.6144481301307678, "learning_rate": 0.0002, "epoch": 0.6612238299248242, "step": 4090}, {"loss": 0.7877, "grad_norm": 0.5936288237571716, "learning_rate": 0.0002, "epoch": 0.6628405141055694, "step": 4100}, {"loss": 0.7617, "grad_norm": 0.4822963774204254, "learning_rate": 0.0002, "epoch": 0.6644571982863148, "step": 4110}, {"loss": 0.7997, "grad_norm": 0.48432496190071106, "learning_rate": 0.0002, "epoch": 0.66607388246706, "step": 4120}, {"loss": 0.8404, "grad_norm": 0.4901607930660248, "learning_rate": 0.0002, "epoch": 0.6676905666478054, "step": 4130}, {"loss": 0.8085, "grad_norm": 0.5018393397331238, "learning_rate": 0.0002, "epoch": 0.6693072508285507, "step": 4140}, {"loss": 0.8065, "grad_norm": 0.6946378946304321, "learning_rate": 0.0002, "epoch": 0.6709239350092959, "step": 4150}, {"loss": 0.8147, "grad_norm": 0.5997390747070312, "learning_rate": 0.0002, "epoch": 0.6725406191900413, "step": 4160}, {"loss": 0.8268, "grad_norm": 0.6738849878311157, "learning_rate": 0.0002, "epoch": 0.6741573033707865, "step": 4170}, {"loss": 0.7704, "grad_norm": 0.6110581159591675, "learning_rate": 0.0002, "epoch": 0.6757739875515318, "step": 4180}, {"loss": 0.8043, "grad_norm": 0.5703322291374207, "learning_rate": 0.0002, "epoch": 0.6773906717322771, "step": 4190}, {"loss": 0.8099, "grad_norm": 0.4686066210269928, "learning_rate": 0.0002, "epoch": 0.6790073559130224, "step": 4200}, {"loss": 0.8441, "grad_norm": 0.6394643783569336, "learning_rate": 0.0002, "epoch": 0.6806240400937676, "step": 4210}, {"loss": 0.8011, "grad_norm": 0.5454841256141663, "learning_rate": 0.0002, "epoch": 0.682240724274513, "step": 4220}, {"loss": 0.8307, "grad_norm": 0.4859732985496521, "learning_rate": 0.0002, "epoch": 0.6838574084552582, "step": 4230}, {"loss": 0.8161, "grad_norm": 0.5544065833091736, "learning_rate": 0.0002, "epoch": 0.6854740926360036, "step": 4240}, {"loss": 0.7839, "grad_norm": 0.4902505576610565, "learning_rate": 0.0002, "epoch": 0.6870907768167488, "step": 4250}, {"loss": 0.7977, "grad_norm": 0.4768051505088806, "learning_rate": 0.0002, "epoch": 0.6887074609974941, "step": 4260}, {"loss": 0.7539, "grad_norm": 0.49982190132141113, "learning_rate": 0.0002, "epoch": 0.6903241451782395, "step": 4270}, {"loss": 0.7353, "grad_norm": 0.6351838111877441, "learning_rate": 0.0002, "epoch": 0.6919408293589847, "step": 4280}, {"loss": 0.7664, "grad_norm": 0.5647561550140381, "learning_rate": 0.0002, "epoch": 0.69355751353973, "step": 4290}, {"loss": 0.7618, "grad_norm": 0.5340486764907837, "learning_rate": 0.0002, "epoch": 0.6951741977204753, "step": 4300}, {"loss": 0.8526, "grad_norm": 0.5649092793464661, "learning_rate": 0.0002, "epoch": 0.6967908819012206, "step": 4310}, {"loss": 0.8246, "grad_norm": 0.6183916926383972, "learning_rate": 0.0002, "epoch": 0.6984075660819659, "step": 4320}, {"loss": 0.792, "grad_norm": 0.6154509782791138, "learning_rate": 0.0002, "epoch": 0.7000242502627112, "step": 4330}, {"loss": 0.8397, "grad_norm": 0.5156264305114746, "learning_rate": 0.0002, "epoch": 0.7016409344434564, "step": 4340}, {"loss": 0.8512, "grad_norm": 0.562171459197998, "learning_rate": 0.0002, "epoch": 0.7032576186242018, "step": 4350}, {"loss": 0.7882, "grad_norm": 0.4949502646923065, "learning_rate": 0.0002, "epoch": 0.704874302804947, "step": 4360}, {"loss": 0.738, "grad_norm": 0.5171684622764587, "learning_rate": 0.0002, "epoch": 0.7064909869856923, "step": 4370}, {"loss": 0.8001, "grad_norm": 0.6198443174362183, "learning_rate": 0.0002, "epoch": 0.7081076711664377, "step": 4380}, {"loss": 0.7606, "grad_norm": 0.5802276134490967, "learning_rate": 0.0002, "epoch": 0.7097243553471829, "step": 4390}, {"loss": 0.8797, "grad_norm": 0.41096967458724976, "learning_rate": 0.0002, "epoch": 0.7113410395279283, "step": 4400}, {"loss": 0.805, "grad_norm": 0.4397392272949219, "learning_rate": 0.0002, "epoch": 0.7129577237086735, "step": 4410}, {"loss": 0.7651, "grad_norm": 0.45228442549705505, "learning_rate": 0.0002, "epoch": 0.7145744078894188, "step": 4420}, {"loss": 0.7938, "grad_norm": 0.4839673936367035, "learning_rate": 0.0002, "epoch": 0.7161910920701641, "step": 4430}, {"loss": 0.8362, "grad_norm": 0.6140755414962769, "learning_rate": 0.0002, "epoch": 0.7178077762509094, "step": 4440}, {"loss": 0.7722, "grad_norm": 0.6841378808021545, "learning_rate": 0.0002, "epoch": 0.7194244604316546, "step": 4450}, {"loss": 0.8177, "grad_norm": 0.6664239168167114, "learning_rate": 0.0002, "epoch": 0.7210411446124, "step": 4460}, {"loss": 0.7983, "grad_norm": 0.47552719712257385, "learning_rate": 0.0002, "epoch": 0.7226578287931452, "step": 4470}, {"loss": 0.8982, "grad_norm": 0.6649776101112366, "learning_rate": 0.0002, "epoch": 0.7242745129738906, "step": 4480}, {"loss": 0.8074, "grad_norm": 0.5159541964530945, "learning_rate": 0.0002, "epoch": 0.7258911971546358, "step": 4490}, {"loss": 0.7786, "grad_norm": 0.6693112850189209, "learning_rate": 0.0002, "epoch": 0.7275078813353811, "step": 4500}, {"loss": 0.8655, "grad_norm": 0.48870977759361267, "learning_rate": 0.0002, "epoch": 0.7291245655161265, "step": 4510}, {"loss": 0.7337, "grad_norm": 0.4857887923717499, "learning_rate": 0.0002, "epoch": 0.7307412496968717, "step": 4520}, {"loss": 0.8026, "grad_norm": 0.5515662431716919, "learning_rate": 0.0002, "epoch": 0.732357933877617, "step": 4530}, {"loss": 0.8031, "grad_norm": 0.6292222738265991, "learning_rate": 0.0002, "epoch": 0.7339746180583623, "step": 4540}, {"loss": 0.7749, "grad_norm": 0.48265689611434937, "learning_rate": 0.0002, "epoch": 0.7355913022391076, "step": 4550}, {"loss": 0.8499, "grad_norm": 0.8044266104698181, "learning_rate": 0.0002, "epoch": 0.7372079864198529, "step": 4560}, {"loss": 0.8162, "grad_norm": 0.6111769676208496, "learning_rate": 0.0002, "epoch": 0.7388246706005982, "step": 4570}, {"loss": 0.7291, "grad_norm": 0.5229553580284119, "learning_rate": 0.0002, "epoch": 0.7404413547813434, "step": 4580}, {"loss": 0.8038, "grad_norm": 0.6054152250289917, "learning_rate": 0.0002, "epoch": 0.7420580389620888, "step": 4590}, {"loss": 0.8169, "grad_norm": 0.5574966669082642, "learning_rate": 0.0002, "epoch": 0.743674723142834, "step": 4600}, {"loss": 0.8439, "grad_norm": 0.5395817160606384, "learning_rate": 0.0002, "epoch": 0.7452914073235793, "step": 4610}, {"loss": 0.8495, "grad_norm": 0.7116472721099854, "learning_rate": 0.0002, "epoch": 0.7469080915043246, "step": 4620}, {"loss": 0.7743, "grad_norm": 0.5618700981140137, "learning_rate": 0.0002, "epoch": 0.7485247756850699, "step": 4630}, {"loss": 0.7744, "grad_norm": 0.5802770853042603, "learning_rate": 0.0002, "epoch": 0.7501414598658153, "step": 4640}, {"loss": 0.7924, "grad_norm": 0.5690428018569946, "learning_rate": 0.0002, "epoch": 0.7517581440465605, "step": 4650}, {"loss": 0.8017, "grad_norm": 0.4813360273838043, "learning_rate": 0.0002, "epoch": 0.7533748282273058, "step": 4660}, {"loss": 0.8108, "grad_norm": 0.5434042811393738, "learning_rate": 0.0002, "epoch": 0.7549915124080511, "step": 4670}, {"loss": 0.7824, "grad_norm": 0.5502099990844727, "learning_rate": 0.0002, "epoch": 0.7566081965887964, "step": 4680}, {"loss": 0.8598, "grad_norm": 0.6020621061325073, "learning_rate": 0.0002, "epoch": 0.7582248807695416, "step": 4690}, {"loss": 0.7937, "grad_norm": 0.4922301471233368, "learning_rate": 0.0002, "epoch": 0.759841564950287, "step": 4700}, {"loss": 0.788, "grad_norm": 0.6492828726768494, "learning_rate": 0.0002, "epoch": 0.7614582491310322, "step": 4710}, {"loss": 0.8313, "grad_norm": 0.4865580201148987, "learning_rate": 0.0002, "epoch": 0.7630749333117776, "step": 4720}, {"loss": 0.7966, "grad_norm": 0.5971422791481018, "learning_rate": 0.0002, "epoch": 0.7646916174925228, "step": 4730}, {"loss": 0.8298, "grad_norm": 0.6832674145698547, "learning_rate": 0.0002, "epoch": 0.7663083016732681, "step": 4740}, {"loss": 0.8156, "grad_norm": 0.500908613204956, "learning_rate": 0.0002, "epoch": 0.7679249858540134, "step": 4750}, {"loss": 0.8383, "grad_norm": 0.6112465858459473, "learning_rate": 0.0002, "epoch": 0.7695416700347587, "step": 4760}, {"loss": 0.76, "grad_norm": 0.5753506422042847, "learning_rate": 0.0002, "epoch": 0.771158354215504, "step": 4770}, {"loss": 0.8297, "grad_norm": 0.6529405117034912, "learning_rate": 0.0002, "epoch": 0.7727750383962493, "step": 4780}, {"loss": 0.8171, "grad_norm": 0.5916843414306641, "learning_rate": 0.0002, "epoch": 0.7743917225769946, "step": 4790}, {"loss": 0.83, "grad_norm": 0.4821224510669708, "learning_rate": 0.0002, "epoch": 0.7760084067577399, "step": 4800}, {"loss": 0.7703, "grad_norm": 0.5532580018043518, "learning_rate": 0.0002, "epoch": 0.7776250909384852, "step": 4810}, {"loss": 0.7363, "grad_norm": 0.4604877233505249, "learning_rate": 0.0002, "epoch": 0.7792417751192304, "step": 4820}, {"loss": 0.7506, "grad_norm": 0.5009613037109375, "learning_rate": 0.0002, "epoch": 0.7808584592999758, "step": 4830}, {"loss": 0.7863, "grad_norm": 0.6448560357093811, "learning_rate": 0.0002, "epoch": 0.782475143480721, "step": 4840}, {"loss": 0.7957, "grad_norm": 0.44327953457832336, "learning_rate": 0.0002, "epoch": 0.7840918276614663, "step": 4850}, {"loss": 0.7925, "grad_norm": 0.5355411171913147, "learning_rate": 0.0002, "epoch": 0.7857085118422116, "step": 4860}, {"loss": 0.7754, "grad_norm": 0.5635677576065063, "learning_rate": 0.0002, "epoch": 0.7873251960229569, "step": 4870}, {"loss": 0.7931, "grad_norm": 0.5417491793632507, "learning_rate": 0.0002, "epoch": 0.7889418802037023, "step": 4880}, {"loss": 0.7819, "grad_norm": 0.4567430913448334, "learning_rate": 0.0002, "epoch": 0.7905585643844475, "step": 4890}, {"loss": 0.8454, "grad_norm": 0.44651296734809875, "learning_rate": 0.0002, "epoch": 0.7921752485651928, "step": 4900}, {"loss": 0.7959, "grad_norm": 0.5741217136383057, "learning_rate": 0.0002, "epoch": 0.7937919327459381, "step": 4910}, {"loss": 0.8093, "grad_norm": 0.6605045199394226, "learning_rate": 0.0002, "epoch": 0.7954086169266834, "step": 4920}, {"loss": 0.77, "grad_norm": 0.5126531720161438, "learning_rate": 0.0002, "epoch": 0.7970253011074286, "step": 4930}, {"loss": 0.7793, "grad_norm": 0.513648271560669, "learning_rate": 0.0002, "epoch": 0.798641985288174, "step": 4940}, {"loss": 0.8314, "grad_norm": 0.5350404381752014, "learning_rate": 0.0002, "epoch": 0.8002586694689192, "step": 4950}, {"loss": 0.7649, "grad_norm": 0.5731674432754517, "learning_rate": 0.0002, "epoch": 0.8018753536496646, "step": 4960}, {"loss": 0.8572, "grad_norm": 0.5974258184432983, "learning_rate": 0.0002, "epoch": 0.8034920378304098, "step": 4970}, {"loss": 0.7972, "grad_norm": 0.8774799704551697, "learning_rate": 0.0002, "epoch": 0.8051087220111551, "step": 4980}, {"loss": 0.7899, "grad_norm": 0.5994430184364319, "learning_rate": 0.0002, "epoch": 0.8067254061919004, "step": 4990}, {"loss": 0.7736, "grad_norm": 0.4894903004169464, "learning_rate": 0.0002, "epoch": 0.8083420903726457, "step": 5000}, {"loss": 0.78, "grad_norm": 0.5218459367752075, "learning_rate": 0.0002, "epoch": 0.809958774553391, "step": 5010}, {"loss": 0.817, "grad_norm": 0.5232468843460083, "learning_rate": 0.0002, "epoch": 0.8115754587341363, "step": 5020}, {"loss": 0.7704, "grad_norm": 0.44358372688293457, "learning_rate": 0.0002, "epoch": 0.8131921429148816, "step": 5030}, {"loss": 0.785, "grad_norm": 0.6202037334442139, "learning_rate": 0.0002, "epoch": 0.8148088270956269, "step": 5040}, {"loss": 0.7351, "grad_norm": 0.7721474170684814, "learning_rate": 0.0002, "epoch": 0.8164255112763722, "step": 5050}, {"loss": 0.8297, "grad_norm": 0.5568501353263855, "learning_rate": 0.0002, "epoch": 0.8180421954571174, "step": 5060}, {"loss": 0.7733, "grad_norm": 0.49148809909820557, "learning_rate": 0.0002, "epoch": 0.8196588796378628, "step": 5070}, {"loss": 0.8054, "grad_norm": 0.4956012964248657, "learning_rate": 0.0002, "epoch": 0.821275563818608, "step": 5080}, {"loss": 0.8201, "grad_norm": 0.6078833937644958, "learning_rate": 0.0002, "epoch": 0.8228922479993533, "step": 5090}, {"loss": 0.828, "grad_norm": 0.46906954050064087, "learning_rate": 0.0002, "epoch": 0.8245089321800986, "step": 5100}, {"loss": 0.7703, "grad_norm": 0.50812166929245, "learning_rate": 0.0002, "epoch": 0.8261256163608439, "step": 5110}, {"loss": 0.8243, "grad_norm": 0.5319661498069763, "learning_rate": 0.0002, "epoch": 0.8277423005415891, "step": 5120}, {"loss": 0.7798, "grad_norm": 0.4949689209461212, "learning_rate": 0.0002, "epoch": 0.8293589847223345, "step": 5130}, {"loss": 0.7428, "grad_norm": 0.5151591300964355, "learning_rate": 0.0002, "epoch": 0.8309756689030798, "step": 5140}, {"loss": 0.8147, "grad_norm": 0.5530214309692383, "learning_rate": 0.0002, "epoch": 0.8325923530838251, "step": 5150}, {"loss": 0.8251, "grad_norm": 0.6297410130500793, "learning_rate": 0.0002, "epoch": 0.8342090372645704, "step": 5160}, {"loss": 0.8067, "grad_norm": 0.5466840267181396, "learning_rate": 0.0002, "epoch": 0.8358257214453156, "step": 5170}, {"loss": 0.7875, "grad_norm": 0.652913510799408, "learning_rate": 0.0002, "epoch": 0.837442405626061, "step": 5180}, {"loss": 0.8295, "grad_norm": 0.5811293125152588, "learning_rate": 0.0002, "epoch": 0.8390590898068062, "step": 5190}, {"loss": 0.7412, "grad_norm": 0.5109550952911377, "learning_rate": 0.0002, "epoch": 0.8406757739875516, "step": 5200}, {"loss": 0.8077, "grad_norm": 0.4551706612110138, "learning_rate": 0.0002, "epoch": 0.8422924581682968, "step": 5210}, {"loss": 0.7827, "grad_norm": 0.5813754200935364, "learning_rate": 0.0002, "epoch": 0.8439091423490421, "step": 5220}, {"loss": 0.802, "grad_norm": 0.5856947898864746, "learning_rate": 0.0002, "epoch": 0.8455258265297874, "step": 5230}, {"loss": 0.7957, "grad_norm": 0.5482739210128784, "learning_rate": 0.0002, "epoch": 0.8471425107105327, "step": 5240}, {"loss": 0.8295, "grad_norm": 0.49023720622062683, "learning_rate": 0.0002, "epoch": 0.8487591948912779, "step": 5250}, {"loss": 0.8022, "grad_norm": 0.49472475051879883, "learning_rate": 0.0002, "epoch": 0.8503758790720233, "step": 5260}, {"loss": 0.8001, "grad_norm": 0.5490226745605469, "learning_rate": 0.0002, "epoch": 0.8519925632527686, "step": 5270}, {"loss": 0.8333, "grad_norm": 0.5340665578842163, "learning_rate": 0.0002, "epoch": 0.8536092474335139, "step": 5280}, {"loss": 0.8277, "grad_norm": 0.5962483882904053, "learning_rate": 0.0002, "epoch": 0.8552259316142592, "step": 5290}, {"loss": 0.8765, "grad_norm": 0.586358368396759, "learning_rate": 0.0002, "epoch": 0.8568426157950044, "step": 5300}, {"loss": 0.7831, "grad_norm": 0.49120277166366577, "learning_rate": 0.0002, "epoch": 0.8584592999757498, "step": 5310}, {"loss": 0.8162, "grad_norm": 0.5887332558631897, "learning_rate": 0.0002, "epoch": 0.860075984156495, "step": 5320}, {"loss": 0.7464, "grad_norm": 0.42496153712272644, "learning_rate": 0.0002, "epoch": 0.8616926683372403, "step": 5330}, {"loss": 0.7905, "grad_norm": 0.5489874482154846, "learning_rate": 0.0002, "epoch": 0.8633093525179856, "step": 5340}, {"loss": 0.7958, "grad_norm": 0.5850813984870911, "learning_rate": 0.0002, "epoch": 0.8649260366987309, "step": 5350}, {"loss": 0.7642, "grad_norm": 0.517487108707428, "learning_rate": 0.0002, "epoch": 0.8665427208794761, "step": 5360}, {"loss": 0.7801, "grad_norm": 0.5339142680168152, "learning_rate": 0.0002, "epoch": 0.8681594050602215, "step": 5370}, {"loss": 0.818, "grad_norm": 0.6236387491226196, "learning_rate": 0.0002, "epoch": 0.8697760892409668, "step": 5380}, {"loss": 0.7708, "grad_norm": 0.5752192735671997, "learning_rate": 0.0002, "epoch": 0.8713927734217121, "step": 5390}, {"loss": 0.8542, "grad_norm": 0.6724614500999451, "learning_rate": 0.0002, "epoch": 0.8730094576024574, "step": 5400}, {"loss": 0.7581, "grad_norm": 0.5280613303184509, "learning_rate": 0.0002, "epoch": 0.8746261417832026, "step": 5410}, {"loss": 0.8231, "grad_norm": 0.44033288955688477, "learning_rate": 0.0002, "epoch": 0.876242825963948, "step": 5420}, {"loss": 0.8839, "grad_norm": 0.5199708342552185, "learning_rate": 0.0002, "epoch": 0.8778595101446932, "step": 5430}, {"loss": 0.7852, "grad_norm": 0.46778348088264465, "learning_rate": 0.0002, "epoch": 0.8794761943254386, "step": 5440}, {"loss": 0.7834, "grad_norm": 0.4657754898071289, "learning_rate": 0.0002, "epoch": 0.8810928785061838, "step": 5450}, {"loss": 0.7799, "grad_norm": 0.5472902655601501, "learning_rate": 0.0002, "epoch": 0.8827095626869291, "step": 5460}, {"loss": 0.8253, "grad_norm": 0.4876766800880432, "learning_rate": 0.0002, "epoch": 0.8843262468676744, "step": 5470}, {"loss": 0.7906, "grad_norm": 0.5057248473167419, "learning_rate": 0.0002, "epoch": 0.8859429310484197, "step": 5480}, {"loss": 0.8124, "grad_norm": 0.4637320637702942, "learning_rate": 0.0002, "epoch": 0.8875596152291649, "step": 5490}, {"loss": 0.781, "grad_norm": 0.471955806016922, "learning_rate": 0.0002, "epoch": 0.8891762994099103, "step": 5500}, {"loss": 0.8057, "grad_norm": 0.5209813714027405, "learning_rate": 0.0002, "epoch": 0.8907929835906556, "step": 5510}, {"loss": 0.8106, "grad_norm": 0.6213834285736084, "learning_rate": 0.0002, "epoch": 0.8924096677714008, "step": 5520}, {"loss": 0.7787, "grad_norm": 0.5215408205986023, "learning_rate": 0.0002, "epoch": 0.8940263519521462, "step": 5530}, {"loss": 0.8174, "grad_norm": 0.580478310585022, "learning_rate": 0.0002, "epoch": 0.8956430361328914, "step": 5540}, {"loss": 0.8371, "grad_norm": 0.49102169275283813, "learning_rate": 0.0002, "epoch": 0.8972597203136368, "step": 5550}, {"loss": 0.7806, "grad_norm": 0.6043479442596436, "learning_rate": 0.0002, "epoch": 0.898876404494382, "step": 5560}, {"loss": 0.7754, "grad_norm": 0.5636463165283203, "learning_rate": 0.0002, "epoch": 0.9004930886751273, "step": 5570}, {"loss": 0.8145, "grad_norm": 0.5620124340057373, "learning_rate": 0.0002, "epoch": 0.9021097728558726, "step": 5580}, {"loss": 0.8083, "grad_norm": 0.5206354856491089, "learning_rate": 0.0002, "epoch": 0.9037264570366179, "step": 5590}, {"loss": 0.8557, "grad_norm": 0.5798229575157166, "learning_rate": 0.0002, "epoch": 0.9053431412173631, "step": 5600}, {"loss": 0.8097, "grad_norm": 0.6428212523460388, "learning_rate": 0.0002, "epoch": 0.9069598253981085, "step": 5610}, {"loss": 0.7839, "grad_norm": 0.48064687848091125, "learning_rate": 0.0002, "epoch": 0.9085765095788537, "step": 5620}, {"loss": 0.8343, "grad_norm": 0.6347860097885132, "learning_rate": 0.0002, "epoch": 0.9101931937595991, "step": 5630}, {"loss": 0.851, "grad_norm": 0.5353913307189941, "learning_rate": 0.0002, "epoch": 0.9118098779403444, "step": 5640}, {"loss": 0.7736, "grad_norm": 0.5323944091796875, "learning_rate": 0.0002, "epoch": 0.9134265621210896, "step": 5650}, {"loss": 0.8393, "grad_norm": 0.5261843204498291, "learning_rate": 0.0002, "epoch": 0.915043246301835, "step": 5660}, {"loss": 0.7355, "grad_norm": 0.5451326966285706, "learning_rate": 0.0002, "epoch": 0.9166599304825802, "step": 5670}, {"loss": 0.8012, "grad_norm": 0.5183324217796326, "learning_rate": 0.0002, "epoch": 0.9182766146633256, "step": 5680}, {"loss": 0.7659, "grad_norm": 0.47229018807411194, "learning_rate": 0.0002, "epoch": 0.9198932988440708, "step": 5690}, {"loss": 0.7757, "grad_norm": 0.49180513620376587, "learning_rate": 0.0002, "epoch": 0.9215099830248161, "step": 5700}, {"loss": 0.8735, "grad_norm": 0.5419785380363464, "learning_rate": 0.0002, "epoch": 0.9231266672055614, "step": 5710}, {"loss": 0.7378, "grad_norm": 0.5408698916435242, "learning_rate": 0.0002, "epoch": 0.9247433513863067, "step": 5720}, {"loss": 0.7701, "grad_norm": 0.5286232829093933, "learning_rate": 0.0002, "epoch": 0.9263600355670519, "step": 5730}, {"loss": 0.8242, "grad_norm": 0.7539758086204529, "learning_rate": 0.0002, "epoch": 0.9279767197477973, "step": 5740}, {"loss": 0.8118, "grad_norm": 0.5166944861412048, "learning_rate": 0.0002, "epoch": 0.9295934039285425, "step": 5750}, {"loss": 0.783, "grad_norm": 0.6601425409317017, "learning_rate": 0.0002, "epoch": 0.9312100881092878, "step": 5760}, {"loss": 0.7873, "grad_norm": 0.5029960870742798, "learning_rate": 0.0002, "epoch": 0.9328267722900332, "step": 5770}, {"loss": 0.7989, "grad_norm": 0.4926645755767822, "learning_rate": 0.0002, "epoch": 0.9344434564707784, "step": 5780}, {"loss": 0.8174, "grad_norm": 0.5739615559577942, "learning_rate": 0.0002, "epoch": 0.9360601406515238, "step": 5790}, {"loss": 0.8037, "grad_norm": 0.5058279037475586, "learning_rate": 0.0002, "epoch": 0.937676824832269, "step": 5800}, {"loss": 0.8537, "grad_norm": 0.5260962247848511, "learning_rate": 0.0002, "epoch": 0.9392935090130143, "step": 5810}, {"loss": 0.7486, "grad_norm": 0.5768588185310364, "learning_rate": 0.0002, "epoch": 0.9409101931937596, "step": 5820}, {"loss": 0.8215, "grad_norm": 0.5170126557350159, "learning_rate": 0.0002, "epoch": 0.9425268773745049, "step": 5830}, {"loss": 0.7422, "grad_norm": 0.5745864510536194, "learning_rate": 0.0002, "epoch": 0.9441435615552501, "step": 5840}, {"loss": 0.7824, "grad_norm": 0.5551357865333557, "learning_rate": 0.0002, "epoch": 0.9457602457359955, "step": 5850}, {"loss": 0.8529, "grad_norm": 0.5776078701019287, "learning_rate": 0.0002, "epoch": 0.9473769299167407, "step": 5860}, {"loss": 0.8527, "grad_norm": 0.5340062379837036, "learning_rate": 0.0002, "epoch": 0.9489936140974861, "step": 5870}, {"loss": 0.8217, "grad_norm": 0.6447290182113647, "learning_rate": 0.0002, "epoch": 0.9506102982782314, "step": 5880}, {"loss": 0.7945, "grad_norm": 0.5123815536499023, "learning_rate": 0.0002, "epoch": 0.9522269824589766, "step": 5890}, {"loss": 0.8209, "grad_norm": 0.48547613620758057, "learning_rate": 0.0002, "epoch": 0.953843666639722, "step": 5900}, {"loss": 0.7896, "grad_norm": 0.5791414976119995, "learning_rate": 0.0002, "epoch": 0.9554603508204672, "step": 5910}, {"loss": 0.8408, "grad_norm": 0.6195011734962463, "learning_rate": 0.0002, "epoch": 0.9570770350012126, "step": 5920}, {"loss": 0.7805, "grad_norm": 0.6323803067207336, "learning_rate": 0.0002, "epoch": 0.9586937191819578, "step": 5930}, {"loss": 0.8484, "grad_norm": 0.45552879571914673, "learning_rate": 0.0002, "epoch": 0.9603104033627031, "step": 5940}, {"loss": 0.7367, "grad_norm": 0.5796473622322083, "learning_rate": 0.0002, "epoch": 0.9619270875434484, "step": 5950}, {"loss": 0.7672, "grad_norm": 0.647261381149292, "learning_rate": 0.0002, "epoch": 0.9635437717241937, "step": 5960}, {"loss": 0.8086, "grad_norm": 0.5487682819366455, "learning_rate": 0.0002, "epoch": 0.9651604559049389, "step": 5970}, {"loss": 0.7973, "grad_norm": 0.5743663907051086, "learning_rate": 0.0002, "epoch": 0.9667771400856843, "step": 5980}, {"loss": 0.8153, "grad_norm": 0.5470591187477112, "learning_rate": 0.0002, "epoch": 0.9683938242664295, "step": 5990}, {"loss": 0.8119, "grad_norm": 0.5901660323143005, "learning_rate": 0.0002, "epoch": 0.9700105084471748, "step": 6000}, {"loss": 0.8147, "grad_norm": 0.6544759273529053, "learning_rate": 0.0002, "epoch": 0.9716271926279202, "step": 6010}, {"loss": 0.7536, "grad_norm": 0.6288470029830933, "learning_rate": 0.0002, "epoch": 0.9732438768086654, "step": 6020}, {"loss": 0.7989, "grad_norm": 0.673153817653656, "learning_rate": 0.0002, "epoch": 0.9748605609894108, "step": 6030}, {"loss": 0.7556, "grad_norm": 0.42854753136634827, "learning_rate": 0.0002, "epoch": 0.976477245170156, "step": 6040}, {"loss": 0.8006, "grad_norm": 0.5227066278457642, "learning_rate": 0.0002, "epoch": 0.9780939293509013, "step": 6050}, {"loss": 0.795, "grad_norm": 0.5372416973114014, "learning_rate": 0.0002, "epoch": 0.9797106135316466, "step": 6060}, {"loss": 0.7591, "grad_norm": 0.6026402115821838, "learning_rate": 0.0002, "epoch": 0.9813272977123919, "step": 6070}, {"loss": 0.8347, "grad_norm": 0.49547791481018066, "learning_rate": 0.0002, "epoch": 0.9829439818931371, "step": 6080}, {"loss": 0.7722, "grad_norm": 0.4641951322555542, "learning_rate": 0.0002, "epoch": 0.9845606660738825, "step": 6090}, {"loss": 0.8125, "grad_norm": 0.5818535089492798, "learning_rate": 0.0002, "epoch": 0.9861773502546277, "step": 6100}, {"loss": 0.81, "grad_norm": 0.63955157995224, "learning_rate": 0.0002, "epoch": 0.9877940344353731, "step": 6110}, {"loss": 0.7547, "grad_norm": 0.5649438500404358, "learning_rate": 0.0002, "epoch": 0.9894107186161183, "step": 6120}, {"loss": 0.7861, "grad_norm": 0.5290433168411255, "learning_rate": 0.0002, "epoch": 0.9910274027968636, "step": 6130}, {"loss": 0.8109, "grad_norm": 0.6399374008178711, "learning_rate": 0.0002, "epoch": 0.992644086977609, "step": 6140}, {"loss": 0.8373, "grad_norm": 0.6736576557159424, "learning_rate": 0.0002, "epoch": 0.9942607711583542, "step": 6150}, {"loss": 0.7915, "grad_norm": 0.515420138835907, "learning_rate": 0.0002, "epoch": 0.9958774553390995, "step": 6160}, {"loss": 0.8032, "grad_norm": 0.562677800655365, "learning_rate": 0.0002, "epoch": 0.9974941395198448, "step": 6170}, {"loss": 0.8187, "grad_norm": 0.7113858461380005, "learning_rate": 0.0002, "epoch": 0.9991108237005901, "step": 6180}, {"eval_loss": 1.0871200561523438, "eval_runtime": 122.2071, "eval_samples_per_second": 5.998, "eval_steps_per_second": 0.753, "epoch": 0.9999191657909627, "step": 6185}, {"loss": 0.7507, "grad_norm": 0.7111801505088806, "learning_rate": 0.0002, "epoch": 1.0007275078813354, "step": 6190}, {"loss": 0.6865, "grad_norm": 0.5402125716209412, "learning_rate": 0.0002, "epoch": 1.0023441920620806, "step": 6200}, {"loss": 0.7625, "grad_norm": 0.6098830103874207, "learning_rate": 0.0002, "epoch": 1.003960876242826, "step": 6210}, {"loss": 0.7631, "grad_norm": 0.5829983353614807, "learning_rate": 0.0002, "epoch": 1.0055775604235713, "step": 6220}, {"loss": 0.7188, "grad_norm": 0.5614621043205261, "learning_rate": 0.0002, "epoch": 1.0071942446043165, "step": 6230}, {"loss": 0.7505, "grad_norm": 0.5954238772392273, "learning_rate": 0.0002, "epoch": 1.0088109287850617, "step": 6240}, {"loss": 0.7448, "grad_norm": 0.6480574607849121, "learning_rate": 0.0002, "epoch": 1.0104276129658072, "step": 6250}, {"loss": 0.7514, "grad_norm": 0.6051128506660461, "learning_rate": 0.0002, "epoch": 1.0120442971465524, "step": 6260}, {"loss": 0.7237, "grad_norm": 0.6318870782852173, "learning_rate": 0.0002, "epoch": 1.0136609813272976, "step": 6270}, {"loss": 0.7178, "grad_norm": 0.5048980116844177, "learning_rate": 0.0002, "epoch": 1.015277665508043, "step": 6280}, {"loss": 0.7391, "grad_norm": 0.6346936225891113, "learning_rate": 0.0002, "epoch": 1.0168943496887883, "step": 6290}, {"loss": 0.7486, "grad_norm": 0.5711665749549866, "learning_rate": 0.0002, "epoch": 1.0185110338695336, "step": 6300}, {"loss": 0.6808, "grad_norm": 0.5175361037254333, "learning_rate": 0.0002, "epoch": 1.0201277180502788, "step": 6310}, {"loss": 0.7539, "grad_norm": 0.5360831618309021, "learning_rate": 0.0002, "epoch": 1.0217444022310243, "step": 6320}, {"loss": 0.7112, "grad_norm": 0.614675760269165, "learning_rate": 0.0002, "epoch": 1.0233610864117695, "step": 6330}, {"loss": 0.7748, "grad_norm": 0.5626118183135986, "learning_rate": 0.0002, "epoch": 1.0249777705925147, "step": 6340}, {"loss": 0.7375, "grad_norm": 0.574897289276123, "learning_rate": 0.0002, "epoch": 1.02659445477326, "step": 6350}, {"loss": 0.759, "grad_norm": 0.7185447812080383, "learning_rate": 0.0002, "epoch": 1.0282111389540054, "step": 6360}, {"loss": 0.703, "grad_norm": 0.6705799698829651, "learning_rate": 0.0002, "epoch": 1.0298278231347506, "step": 6370}, {"loss": 0.7139, "grad_norm": 0.6740428805351257, "learning_rate": 0.0002, "epoch": 1.0314445073154959, "step": 6380}, {"loss": 0.7252, "grad_norm": 0.663902759552002, "learning_rate": 0.0002, "epoch": 1.0330611914962413, "step": 6390}, {"loss": 0.7065, "grad_norm": 0.5029543042182922, "learning_rate": 0.0002, "epoch": 1.0346778756769865, "step": 6400}, {"loss": 0.711, "grad_norm": 0.7813863158226013, "learning_rate": 0.0002, "epoch": 1.0362945598577318, "step": 6410}, {"loss": 0.7433, "grad_norm": 0.5396282076835632, "learning_rate": 0.0002, "epoch": 1.037911244038477, "step": 6420}, {"loss": 0.7222, "grad_norm": 0.5253293514251709, "learning_rate": 0.0002, "epoch": 1.0395279282192225, "step": 6430}, {"loss": 0.715, "grad_norm": 0.7236770987510681, "learning_rate": 0.0002, "epoch": 1.0411446123999677, "step": 6440}, {"loss": 0.7259, "grad_norm": 0.5670917630195618, "learning_rate": 0.0002, "epoch": 1.042761296580713, "step": 6450}, {"loss": 0.7195, "grad_norm": 0.6031978726387024, "learning_rate": 0.0002, "epoch": 1.0443779807614582, "step": 6460}, {"loss": 0.7648, "grad_norm": 0.5309213399887085, "learning_rate": 0.0002, "epoch": 1.0459946649422036, "step": 6470}, {"loss": 0.7161, "grad_norm": 0.7114651799201965, "learning_rate": 0.0002, "epoch": 1.0476113491229488, "step": 6480}, {"loss": 0.7583, "grad_norm": 0.5591610670089722, "learning_rate": 0.0002, "epoch": 1.049228033303694, "step": 6490}, {"loss": 0.6645, "grad_norm": 0.5185961127281189, "learning_rate": 0.0002, "epoch": 1.0508447174844395, "step": 6500}, {"loss": 0.7654, "grad_norm": 0.6510552167892456, "learning_rate": 0.0002, "epoch": 1.0524614016651848, "step": 6510}, {"loss": 0.7057, "grad_norm": 0.6557928919792175, "learning_rate": 0.0002, "epoch": 1.05407808584593, "step": 6520}, {"loss": 0.8056, "grad_norm": 0.6973192691802979, "learning_rate": 0.0002, "epoch": 1.0556947700266752, "step": 6530}, {"loss": 0.6793, "grad_norm": 0.6226583123207092, "learning_rate": 0.0002, "epoch": 1.0573114542074207, "step": 6540}, {"loss": 0.7151, "grad_norm": 0.5633195638656616, "learning_rate": 0.0002, "epoch": 1.058928138388166, "step": 6550}, {"loss": 0.7082, "grad_norm": 0.7466658353805542, "learning_rate": 0.0002, "epoch": 1.0605448225689111, "step": 6560}, {"loss": 0.7059, "grad_norm": 0.6462772488594055, "learning_rate": 0.0002, "epoch": 1.0621615067496564, "step": 6570}, {"loss": 0.7046, "grad_norm": 0.5266856551170349, "learning_rate": 0.0002, "epoch": 1.0637781909304018, "step": 6580}, {"loss": 0.7157, "grad_norm": 0.534392774105072, "learning_rate": 0.0002, "epoch": 1.065394875111147, "step": 6590}, {"loss": 0.7115, "grad_norm": 0.7514177560806274, "learning_rate": 0.0002, "epoch": 1.0670115592918923, "step": 6600}, {"loss": 0.7545, "grad_norm": 0.7593035697937012, "learning_rate": 0.0002, "epoch": 1.0686282434726375, "step": 6610}, {"loss": 0.6836, "grad_norm": 0.5277858972549438, "learning_rate": 0.0002, "epoch": 1.070244927653383, "step": 6620}, {"loss": 0.7405, "grad_norm": 0.5573670268058777, "learning_rate": 0.0002, "epoch": 1.0718616118341282, "step": 6630}, {"loss": 0.6774, "grad_norm": 0.6802396774291992, "learning_rate": 0.0002, "epoch": 1.0734782960148734, "step": 6640}, {"loss": 0.723, "grad_norm": 0.7367215752601624, "learning_rate": 0.0002, "epoch": 1.0750949801956189, "step": 6650}, {"loss": 0.7429, "grad_norm": 0.5961891412734985, "learning_rate": 0.0002, "epoch": 1.0767116643763641, "step": 6660}, {"loss": 0.6791, "grad_norm": 0.5736313462257385, "learning_rate": 0.0002, "epoch": 1.0783283485571094, "step": 6670}, {"loss": 0.7178, "grad_norm": 0.619219183921814, "learning_rate": 0.0002, "epoch": 1.0799450327378546, "step": 6680}, {"loss": 0.7318, "grad_norm": 0.6214390993118286, "learning_rate": 0.0002, "epoch": 1.0815617169186, "step": 6690}, {"loss": 0.7554, "grad_norm": 0.564536988735199, "learning_rate": 0.0002, "epoch": 1.0831784010993453, "step": 6700}, {"loss": 0.7362, "grad_norm": 0.5838140249252319, "learning_rate": 0.0002, "epoch": 1.0847950852800905, "step": 6710}, {"loss": 0.739, "grad_norm": 0.7000553607940674, "learning_rate": 0.0002, "epoch": 1.0864117694608357, "step": 6720}, {"loss": 0.7369, "grad_norm": 0.7078263759613037, "learning_rate": 0.0002, "epoch": 1.0880284536415812, "step": 6730}, {"loss": 0.7654, "grad_norm": 0.8353848457336426, "learning_rate": 0.0002, "epoch": 1.0896451378223264, "step": 6740}, {"loss": 0.7015, "grad_norm": 0.5615518689155579, "learning_rate": 0.0002, "epoch": 1.0912618220030716, "step": 6750}, {"loss": 0.7396, "grad_norm": 0.5475581288337708, "learning_rate": 0.0002, "epoch": 1.0928785061838169, "step": 6760}, {"loss": 0.7652, "grad_norm": 0.5835978388786316, "learning_rate": 0.0002, "epoch": 1.0944951903645623, "step": 6770}, {"loss": 0.7541, "grad_norm": 0.5516105890274048, "learning_rate": 0.0002, "epoch": 1.0961118745453076, "step": 6780}, {"loss": 0.6842, "grad_norm": 0.5875251889228821, "learning_rate": 0.0002, "epoch": 1.0977285587260528, "step": 6790}, {"loss": 0.6903, "grad_norm": 0.7376947999000549, "learning_rate": 0.0002, "epoch": 1.0993452429067982, "step": 6800}, {"loss": 0.7512, "grad_norm": 0.5656165480613708, "learning_rate": 0.0002, "epoch": 1.1009619270875435, "step": 6810}, {"loss": 0.7409, "grad_norm": 0.6365954279899597, "learning_rate": 0.0002, "epoch": 1.1025786112682887, "step": 6820}, {"loss": 0.7392, "grad_norm": 0.5033080577850342, "learning_rate": 0.0002, "epoch": 1.104195295449034, "step": 6830}, {"loss": 0.6909, "grad_norm": 0.617396891117096, "learning_rate": 0.0002, "epoch": 1.1058119796297794, "step": 6840}, {"loss": 0.7006, "grad_norm": 0.6395374536514282, "learning_rate": 0.0002, "epoch": 1.1074286638105246, "step": 6850}, {"loss": 0.7335, "grad_norm": 0.6775295734405518, "learning_rate": 0.0002, "epoch": 1.1090453479912699, "step": 6860}, {"loss": 0.764, "grad_norm": 0.6655223965644836, "learning_rate": 0.0002, "epoch": 1.1106620321720153, "step": 6870}, {"loss": 0.7553, "grad_norm": 0.676655113697052, "learning_rate": 0.0002, "epoch": 1.1122787163527605, "step": 6880}, {"loss": 0.7342, "grad_norm": 0.6062718629837036, "learning_rate": 0.0002, "epoch": 1.1138954005335058, "step": 6890}, {"loss": 0.7446, "grad_norm": 0.590943455696106, "learning_rate": 0.0002, "epoch": 1.115512084714251, "step": 6900}, {"loss": 0.6705, "grad_norm": 0.6315317153930664, "learning_rate": 0.0002, "epoch": 1.1171287688949965, "step": 6910}, {"loss": 0.6912, "grad_norm": 0.47979024052619934, "learning_rate": 0.0002, "epoch": 1.1187454530757417, "step": 6920}, {"loss": 0.7002, "grad_norm": 0.647298276424408, "learning_rate": 0.0002, "epoch": 1.120362137256487, "step": 6930}, {"loss": 0.7502, "grad_norm": 0.7336484789848328, "learning_rate": 0.0002, "epoch": 1.1219788214372322, "step": 6940}, {"loss": 0.693, "grad_norm": 0.5071424245834351, "learning_rate": 0.0002, "epoch": 1.1235955056179776, "step": 6950}, {"loss": 0.7378, "grad_norm": 0.6527144312858582, "learning_rate": 0.0002, "epoch": 1.1252121897987228, "step": 6960}, {"loss": 0.7228, "grad_norm": 0.6935935020446777, "learning_rate": 0.0002, "epoch": 1.126828873979468, "step": 6970}, {"loss": 0.699, "grad_norm": 0.8026931881904602, "learning_rate": 0.0002, "epoch": 1.1284455581602133, "step": 6980}, {"loss": 0.7361, "grad_norm": 0.5210393667221069, "learning_rate": 0.0002, "epoch": 1.1300622423409588, "step": 6990}, {"loss": 0.7456, "grad_norm": 0.60475093126297, "learning_rate": 0.0002, "epoch": 1.131678926521704, "step": 7000}, {"loss": 0.7495, "grad_norm": 0.6417073607444763, "learning_rate": 0.0002, "epoch": 1.1332956107024492, "step": 7010}, {"loss": 0.7459, "grad_norm": 0.6732175946235657, "learning_rate": 0.0002, "epoch": 1.1349122948831947, "step": 7020}, {"loss": 0.7278, "grad_norm": 0.6719491481781006, "learning_rate": 0.0002, "epoch": 1.13652897906394, "step": 7030}, {"loss": 0.7694, "grad_norm": 0.5708295106887817, "learning_rate": 0.0002, "epoch": 1.1381456632446851, "step": 7040}, {"loss": 0.7823, "grad_norm": 0.7141719460487366, "learning_rate": 0.0002, "epoch": 1.1397623474254304, "step": 7050}, {"loss": 0.764, "grad_norm": 0.6187017560005188, "learning_rate": 0.0002, "epoch": 1.1413790316061758, "step": 7060}, {"loss": 0.7657, "grad_norm": 0.50581294298172, "learning_rate": 0.0002, "epoch": 1.142995715786921, "step": 7070}, {"loss": 0.7357, "grad_norm": 0.5620143413543701, "learning_rate": 0.0002, "epoch": 1.1446123999676663, "step": 7080}, {"loss": 0.7287, "grad_norm": 0.6231929659843445, "learning_rate": 0.0002, "epoch": 1.1462290841484115, "step": 7090}, {"loss": 0.7328, "grad_norm": 0.5775774121284485, "learning_rate": 0.0002, "epoch": 1.147845768329157, "step": 7100}, {"loss": 0.7728, "grad_norm": 0.6492809653282166, "learning_rate": 0.0002, "epoch": 1.1494624525099022, "step": 7110}, {"loss": 0.7545, "grad_norm": 0.6434972286224365, "learning_rate": 0.0002, "epoch": 1.1510791366906474, "step": 7120}, {"loss": 0.7374, "grad_norm": 0.6191812753677368, "learning_rate": 0.0002, "epoch": 1.1526958208713927, "step": 7130}, {"loss": 0.7276, "grad_norm": 0.6690331697463989, "learning_rate": 0.0002, "epoch": 1.1543125050521381, "step": 7140}, {"loss": 0.7704, "grad_norm": 0.5977938175201416, "learning_rate": 0.0002, "epoch": 1.1559291892328833, "step": 7150}, {"loss": 0.7251, "grad_norm": 0.6195854544639587, "learning_rate": 0.0002, "epoch": 1.1575458734136286, "step": 7160}, {"loss": 0.7249, "grad_norm": 0.5752048492431641, "learning_rate": 0.0002, "epoch": 1.159162557594374, "step": 7170}, {"loss": 0.7593, "grad_norm": 0.589081883430481, "learning_rate": 0.0002, "epoch": 1.1607792417751193, "step": 7180}, {"loss": 0.704, "grad_norm": 0.756996750831604, "learning_rate": 0.0002, "epoch": 1.1623959259558645, "step": 7190}, {"loss": 0.7404, "grad_norm": 0.7614967226982117, "learning_rate": 0.0002, "epoch": 1.1640126101366097, "step": 7200}, {"loss": 0.7867, "grad_norm": 0.6120437979698181, "learning_rate": 0.0002, "epoch": 1.1656292943173552, "step": 7210}, {"loss": 0.7384, "grad_norm": 0.6210004687309265, "learning_rate": 0.0002, "epoch": 1.1672459784981004, "step": 7220}, {"loss": 0.7251, "grad_norm": 0.6044116020202637, "learning_rate": 0.0002, "epoch": 1.1688626626788456, "step": 7230}, {"loss": 0.7361, "grad_norm": 0.5418457388877869, "learning_rate": 0.0002, "epoch": 1.170479346859591, "step": 7240}, {"loss": 0.6938, "grad_norm": 0.6413537263870239, "learning_rate": 0.0002, "epoch": 1.1720960310403363, "step": 7250}, {"loss": 0.6978, "grad_norm": 0.5777867436408997, "learning_rate": 0.0002, "epoch": 1.1737127152210816, "step": 7260}, {"loss": 0.7503, "grad_norm": 0.7092402577400208, "learning_rate": 0.0002, "epoch": 1.1753293994018268, "step": 7270}, {"loss": 0.7487, "grad_norm": 0.6351709365844727, "learning_rate": 0.0002, "epoch": 1.176946083582572, "step": 7280}, {"loss": 0.7527, "grad_norm": 0.6172189712524414, "learning_rate": 0.0002, "epoch": 1.1785627677633175, "step": 7290}, {"loss": 0.7319, "grad_norm": 0.6801714897155762, "learning_rate": 0.0002, "epoch": 1.1801794519440627, "step": 7300}, {"loss": 0.6941, "grad_norm": 0.6044712066650391, "learning_rate": 0.0002, "epoch": 1.181796136124808, "step": 7310}, {"loss": 0.6951, "grad_norm": 0.7413212060928345, "learning_rate": 0.0002, "epoch": 1.1834128203055534, "step": 7320}, {"loss": 0.7396, "grad_norm": 0.5303856134414673, "learning_rate": 0.0002, "epoch": 1.1850295044862986, "step": 7330}, {"loss": 0.6915, "grad_norm": 0.5647098422050476, "learning_rate": 0.0002, "epoch": 1.1866461886670439, "step": 7340}, {"loss": 0.7506, "grad_norm": 0.7374135255813599, "learning_rate": 0.0002, "epoch": 1.188262872847789, "step": 7350}, {"loss": 0.7041, "grad_norm": 0.5710089206695557, "learning_rate": 0.0002, "epoch": 1.1898795570285345, "step": 7360}, {"loss": 0.8289, "grad_norm": 0.6073619723320007, "learning_rate": 0.0002, "epoch": 1.1914962412092798, "step": 7370}, {"loss": 0.7722, "grad_norm": 0.5899916887283325, "learning_rate": 0.0002, "epoch": 1.193112925390025, "step": 7380}, {"loss": 0.756, "grad_norm": 0.7762434482574463, "learning_rate": 0.0002, "epoch": 1.1947296095707705, "step": 7390}, {"loss": 0.7319, "grad_norm": 0.679949939250946, "learning_rate": 0.0002, "epoch": 1.1963462937515157, "step": 7400}, {"loss": 0.7599, "grad_norm": 0.6106849312782288, "learning_rate": 0.0002, "epoch": 1.197962977932261, "step": 7410}, {"loss": 0.7648, "grad_norm": 0.682461678981781, "learning_rate": 0.0002, "epoch": 1.1995796621130062, "step": 7420}, {"loss": 0.7741, "grad_norm": 0.6087017059326172, "learning_rate": 0.0002, "epoch": 1.2011963462937516, "step": 7430}, {"loss": 0.7642, "grad_norm": 0.63739013671875, "learning_rate": 0.0002, "epoch": 1.2028130304744968, "step": 7440}, {"loss": 0.7611, "grad_norm": 0.6154777407646179, "learning_rate": 0.0002, "epoch": 1.204429714655242, "step": 7450}, {"loss": 0.7565, "grad_norm": 0.7491534948348999, "learning_rate": 0.0002, "epoch": 1.2060463988359873, "step": 7460}, {"loss": 0.698, "grad_norm": 0.6664797067642212, "learning_rate": 0.0002, "epoch": 1.2076630830167328, "step": 7470}, {"loss": 0.7456, "grad_norm": 0.6660266518592834, "learning_rate": 0.0002, "epoch": 1.209279767197478, "step": 7480}, {"loss": 0.714, "grad_norm": 0.6972551345825195, "learning_rate": 0.0002, "epoch": 1.2108964513782232, "step": 7490}, {"loss": 0.7023, "grad_norm": 0.6157945990562439, "learning_rate": 0.0002, "epoch": 1.2125131355589684, "step": 7500}, {"loss": 0.7326, "grad_norm": 0.5199310183525085, "learning_rate": 0.0002, "epoch": 1.214129819739714, "step": 7510}, {"loss": 0.7586, "grad_norm": 0.577610433101654, "learning_rate": 0.0002, "epoch": 1.2157465039204591, "step": 7520}, {"loss": 0.7179, "grad_norm": 0.53652423620224, "learning_rate": 0.0002, "epoch": 1.2173631881012044, "step": 7530}, {"loss": 0.7393, "grad_norm": 0.6479050517082214, "learning_rate": 0.0002, "epoch": 1.2189798722819498, "step": 7540}, {"loss": 0.7534, "grad_norm": 0.618748128414154, "learning_rate": 0.0002, "epoch": 1.220596556462695, "step": 7550}, {"loss": 0.6886, "grad_norm": 0.6311424374580383, "learning_rate": 0.0002, "epoch": 1.2222132406434403, "step": 7560}, {"loss": 0.7272, "grad_norm": 0.6595825552940369, "learning_rate": 0.0002, "epoch": 1.2238299248241855, "step": 7570}, {"loss": 0.7353, "grad_norm": 0.5198960900306702, "learning_rate": 0.0002, "epoch": 1.225446609004931, "step": 7580}, {"loss": 0.674, "grad_norm": 0.578650712966919, "learning_rate": 0.0002, "epoch": 1.2270632931856762, "step": 7590}, {"loss": 0.7507, "grad_norm": 0.6080220937728882, "learning_rate": 0.0002, "epoch": 1.2286799773664214, "step": 7600}, {"loss": 0.7733, "grad_norm": 0.7050248384475708, "learning_rate": 0.0002, "epoch": 1.2302966615471669, "step": 7610}, {"loss": 0.7032, "grad_norm": 0.6652196049690247, "learning_rate": 0.0002, "epoch": 1.2319133457279121, "step": 7620}, {"loss": 0.7085, "grad_norm": 0.7322776317596436, "learning_rate": 0.0002, "epoch": 1.2335300299086573, "step": 7630}, {"loss": 0.7402, "grad_norm": 0.4998728036880493, "learning_rate": 0.0002, "epoch": 1.2351467140894026, "step": 7640}, {"loss": 0.7214, "grad_norm": 0.6428788900375366, "learning_rate": 0.0002, "epoch": 1.2367633982701478, "step": 7650}, {"loss": 0.7699, "grad_norm": 0.585242509841919, "learning_rate": 0.0002, "epoch": 1.2383800824508933, "step": 7660}, {"loss": 0.7621, "grad_norm": 0.5211917757987976, "learning_rate": 0.0002, "epoch": 1.2399967666316385, "step": 7670}, {"loss": 0.746, "grad_norm": 0.6490384340286255, "learning_rate": 0.0002, "epoch": 1.2416134508123837, "step": 7680}, {"loss": 0.7186, "grad_norm": 0.6249763369560242, "learning_rate": 0.0002, "epoch": 1.2432301349931292, "step": 7690}, {"loss": 0.7761, "grad_norm": 0.71870356798172, "learning_rate": 0.0002, "epoch": 1.2448468191738744, "step": 7700}, {"loss": 0.7525, "grad_norm": 0.6761967539787292, "learning_rate": 0.0002, "epoch": 1.2464635033546196, "step": 7710}, {"loss": 0.7501, "grad_norm": 0.6500617265701294, "learning_rate": 0.0002, "epoch": 1.2480801875353649, "step": 7720}, {"loss": 0.7903, "grad_norm": 0.8069869875907898, "learning_rate": 0.0002, "epoch": 1.2496968717161103, "step": 7730}, {"loss": 0.6747, "grad_norm": 0.6044608950614929, "learning_rate": 0.0002, "epoch": 1.2513135558968556, "step": 7740}, {"loss": 0.6825, "grad_norm": 0.6573283076286316, "learning_rate": 0.0002, "epoch": 1.2529302400776008, "step": 7750}, {"loss": 0.7617, "grad_norm": 0.625430166721344, "learning_rate": 0.0002, "epoch": 1.2545469242583462, "step": 7760}, {"loss": 0.7041, "grad_norm": 0.5442022681236267, "learning_rate": 0.0002, "epoch": 1.2561636084390915, "step": 7770}, {"loss": 0.7172, "grad_norm": 0.6818386912345886, "learning_rate": 0.0002, "epoch": 1.2577802926198367, "step": 7780}, {"loss": 0.696, "grad_norm": 0.6381874084472656, "learning_rate": 0.0002, "epoch": 1.259396976800582, "step": 7790}, {"loss": 0.6834, "grad_norm": 0.6269212961196899, "learning_rate": 0.0002, "epoch": 1.2610136609813272, "step": 7800}, {"loss": 0.7821, "grad_norm": 0.600121259689331, "learning_rate": 0.0002, "epoch": 1.2626303451620726, "step": 7810}, {"loss": 0.7761, "grad_norm": 0.6337703466415405, "learning_rate": 0.0002, "epoch": 1.2642470293428179, "step": 7820}, {"loss": 0.732, "grad_norm": 0.7234963774681091, "learning_rate": 0.0002, "epoch": 1.2658637135235633, "step": 7830}, {"loss": 0.785, "grad_norm": 0.800184965133667, "learning_rate": 0.0002, "epoch": 1.2674803977043085, "step": 7840}, {"loss": 0.7426, "grad_norm": 0.7539464831352234, "learning_rate": 0.0002, "epoch": 1.2690970818850538, "step": 7850}, {"loss": 0.7496, "grad_norm": 0.5493760704994202, "learning_rate": 0.0002, "epoch": 1.270713766065799, "step": 7860}, {"loss": 0.7537, "grad_norm": 0.7477145791053772, "learning_rate": 0.0002, "epoch": 1.2723304502465442, "step": 7870}, {"loss": 0.7573, "grad_norm": 0.6366362571716309, "learning_rate": 0.0002, "epoch": 1.2739471344272897, "step": 7880}, {"loss": 0.7608, "grad_norm": 0.7419533729553223, "learning_rate": 0.0002, "epoch": 1.275563818608035, "step": 7890}, {"loss": 0.7873, "grad_norm": 0.6141223311424255, "learning_rate": 0.0002, "epoch": 1.2771805027887801, "step": 7900}, {"loss": 0.6916, "grad_norm": 0.7522598505020142, "learning_rate": 0.0002, "epoch": 1.2787971869695256, "step": 7910}, {"loss": 0.7097, "grad_norm": 0.6935804486274719, "learning_rate": 0.0002, "epoch": 1.2804138711502708, "step": 7920}, {"loss": 0.7185, "grad_norm": 0.7239290475845337, "learning_rate": 0.0002, "epoch": 1.282030555331016, "step": 7930}, {"loss": 0.7145, "grad_norm": 0.8800187110900879, "learning_rate": 0.0002, "epoch": 1.2836472395117613, "step": 7940}, {"loss": 0.6991, "grad_norm": 0.540458083152771, "learning_rate": 0.0002, "epoch": 1.2852639236925067, "step": 7950}, {"loss": 0.7139, "grad_norm": 0.6492934226989746, "learning_rate": 0.0002, "epoch": 1.286880607873252, "step": 7960}, {"loss": 0.7742, "grad_norm": 0.6543959379196167, "learning_rate": 0.0002, "epoch": 1.2884972920539972, "step": 7970}, {"loss": 0.7316, "grad_norm": 0.5804705619812012, "learning_rate": 0.0002, "epoch": 1.2901139762347427, "step": 7980}, {"loss": 0.796, "grad_norm": 0.7074727416038513, "learning_rate": 0.0002, "epoch": 1.291730660415488, "step": 7990}, {"loss": 0.7034, "grad_norm": 0.5347974300384521, "learning_rate": 0.0002, "epoch": 1.2933473445962331, "step": 8000}, {"loss": 0.738, "grad_norm": 0.6457298398017883, "learning_rate": 0.0002, "epoch": 1.2949640287769784, "step": 8010}, {"loss": 0.7634, "grad_norm": 0.6407219171524048, "learning_rate": 0.0002, "epoch": 1.2965807129577236, "step": 8020}, {"loss": 0.7506, "grad_norm": 0.828439474105835, "learning_rate": 0.0002, "epoch": 1.298197397138469, "step": 8030}, {"loss": 0.735, "grad_norm": 0.4840380549430847, "learning_rate": 0.0002, "epoch": 1.2998140813192143, "step": 8040}, {"loss": 0.7283, "grad_norm": 0.5921024680137634, "learning_rate": 0.0002, "epoch": 1.3014307654999595, "step": 8050}, {"loss": 0.7477, "grad_norm": 0.6170315146446228, "learning_rate": 0.0002, "epoch": 1.303047449680705, "step": 8060}, {"loss": 0.7534, "grad_norm": 0.5374847054481506, "learning_rate": 0.0002, "epoch": 1.3046641338614502, "step": 8070}, {"loss": 0.7593, "grad_norm": 0.545758068561554, "learning_rate": 0.0002, "epoch": 1.3062808180421954, "step": 8080}, {"loss": 0.7463, "grad_norm": 0.55641770362854, "learning_rate": 0.0002, "epoch": 1.3078975022229407, "step": 8090}, {"loss": 0.7594, "grad_norm": 0.6724897027015686, "learning_rate": 0.0002, "epoch": 1.309514186403686, "step": 8100}, {"loss": 0.7105, "grad_norm": 0.6923972368240356, "learning_rate": 0.0002, "epoch": 1.3111308705844313, "step": 8110}, {"loss": 0.7149, "grad_norm": 0.5136841535568237, "learning_rate": 0.0002, "epoch": 1.3127475547651766, "step": 8120}, {"loss": 0.7504, "grad_norm": 0.6766283512115479, "learning_rate": 0.0002, "epoch": 1.314364238945922, "step": 8130}, {"loss": 0.7489, "grad_norm": 0.6283926367759705, "learning_rate": 0.0002, "epoch": 1.3159809231266673, "step": 8140}, {"loss": 0.7459, "grad_norm": 0.644216001033783, "learning_rate": 0.0002, "epoch": 1.3175976073074125, "step": 8150}, {"loss": 0.7125, "grad_norm": 0.7827503085136414, "learning_rate": 0.0002, "epoch": 1.3192142914881577, "step": 8160}, {"loss": 0.7271, "grad_norm": 0.6651390790939331, "learning_rate": 0.0002, "epoch": 1.320830975668903, "step": 8170}, {"loss": 0.7778, "grad_norm": 0.5547412633895874, "learning_rate": 0.0002, "epoch": 1.3224476598496484, "step": 8180}, {"loss": 0.7402, "grad_norm": 0.6765179634094238, "learning_rate": 0.0002, "epoch": 1.3240643440303936, "step": 8190}, {"loss": 0.7106, "grad_norm": 0.6822077035903931, "learning_rate": 0.0002, "epoch": 1.325681028211139, "step": 8200}, {"loss": 0.7288, "grad_norm": 0.5941002368927002, "learning_rate": 0.0002, "epoch": 1.3272977123918843, "step": 8210}, {"loss": 0.7494, "grad_norm": 0.4850037097930908, "learning_rate": 0.0002, "epoch": 1.3289143965726296, "step": 8220}, {"loss": 0.7474, "grad_norm": 0.6162990927696228, "learning_rate": 0.0002, "epoch": 1.3305310807533748, "step": 8230}, {"loss": 0.7751, "grad_norm": 0.6665613651275635, "learning_rate": 0.0002, "epoch": 1.33214776493412, "step": 8240}, {"loss": 0.759, "grad_norm": 0.618192732334137, "learning_rate": 0.0002, "epoch": 1.3337644491148655, "step": 8250}, {"loss": 0.7532, "grad_norm": 0.710418701171875, "learning_rate": 0.0002, "epoch": 1.3353811332956107, "step": 8260}, {"loss": 0.7306, "grad_norm": 0.5109876990318298, "learning_rate": 0.0002, "epoch": 1.336997817476356, "step": 8270}, {"loss": 0.7303, "grad_norm": 0.6791711449623108, "learning_rate": 0.0002, "epoch": 1.3386145016571014, "step": 8280}, {"loss": 0.7594, "grad_norm": 0.6836432814598083, "learning_rate": 0.0002, "epoch": 1.3402311858378466, "step": 8290}, {"loss": 0.7594, "grad_norm": 0.5579386353492737, "learning_rate": 0.0002, "epoch": 1.3418478700185918, "step": 8300}, {"loss": 0.7377, "grad_norm": 0.6713546514511108, "learning_rate": 0.0002, "epoch": 1.343464554199337, "step": 8310}, {"loss": 0.7756, "grad_norm": 0.5353720188140869, "learning_rate": 0.0002, "epoch": 1.3450812383800825, "step": 8320}, {"loss": 0.718, "grad_norm": 0.5813682675361633, "learning_rate": 0.0002, "epoch": 1.3466979225608278, "step": 8330}, {"loss": 0.7294, "grad_norm": 0.8158791661262512, "learning_rate": 0.0002, "epoch": 1.348314606741573, "step": 8340}, {"loss": 0.6992, "grad_norm": 0.6193785071372986, "learning_rate": 0.0002, "epoch": 1.3499312909223184, "step": 8350}, {"loss": 0.7654, "grad_norm": 0.6353939771652222, "learning_rate": 0.0002, "epoch": 1.3515479751030637, "step": 8360}, {"loss": 0.7519, "grad_norm": 0.6925048232078552, "learning_rate": 0.0002, "epoch": 1.353164659283809, "step": 8370}, {"loss": 0.736, "grad_norm": 0.988264799118042, "learning_rate": 0.0002, "epoch": 1.3547813434645541, "step": 8380}, {"loss": 0.7744, "grad_norm": 0.6476002931594849, "learning_rate": 0.0002, "epoch": 1.3563980276452994, "step": 8390}, {"loss": 0.776, "grad_norm": 0.7120398879051208, "learning_rate": 0.0002, "epoch": 1.3580147118260448, "step": 8400}, {"loss": 0.7368, "grad_norm": 0.9048416614532471, "learning_rate": 0.0002, "epoch": 1.35963139600679, "step": 8410}, {"loss": 0.7544, "grad_norm": 0.7000672817230225, "learning_rate": 0.0002, "epoch": 1.3612480801875353, "step": 8420}, {"loss": 0.7358, "grad_norm": 0.6015632152557373, "learning_rate": 0.0002, "epoch": 1.3628647643682807, "step": 8430}, {"loss": 0.7298, "grad_norm": 0.612516462802887, "learning_rate": 0.0002, "epoch": 1.364481448549026, "step": 8440}, {"loss": 0.7055, "grad_norm": 0.5969301462173462, "learning_rate": 0.0002, "epoch": 1.3660981327297712, "step": 8450}, {"loss": 0.7754, "grad_norm": 0.6730654239654541, "learning_rate": 0.0002, "epoch": 1.3677148169105164, "step": 8460}, {"loss": 0.7465, "grad_norm": 0.6386392116546631, "learning_rate": 0.0002, "epoch": 1.369331501091262, "step": 8470}, {"loss": 0.7433, "grad_norm": 0.739544153213501, "learning_rate": 0.0002, "epoch": 1.3709481852720071, "step": 8480}, {"loss": 0.7892, "grad_norm": 0.6462782621383667, "learning_rate": 0.0002, "epoch": 1.3725648694527524, "step": 8490}, {"loss": 0.7302, "grad_norm": 0.7346843481063843, "learning_rate": 0.0002, "epoch": 1.3741815536334978, "step": 8500}, {"loss": 0.7634, "grad_norm": 0.6884821057319641, "learning_rate": 0.0002, "epoch": 1.375798237814243, "step": 8510}, {"loss": 0.7614, "grad_norm": 0.6999333500862122, "learning_rate": 0.0002, "epoch": 1.3774149219949883, "step": 8520}, {"loss": 0.729, "grad_norm": 0.5378713011741638, "learning_rate": 0.0002, "epoch": 1.3790316061757335, "step": 8530}, {"loss": 0.6797, "grad_norm": 0.5417906641960144, "learning_rate": 0.0002, "epoch": 1.3806482903564787, "step": 8540}, {"loss": 0.7499, "grad_norm": 0.6602526307106018, "learning_rate": 0.0002, "epoch": 1.3822649745372242, "step": 8550}, {"loss": 0.7356, "grad_norm": 0.7073674201965332, "learning_rate": 0.0002, "epoch": 1.3838816587179694, "step": 8560}, {"loss": 0.75, "grad_norm": 0.5841707587242126, "learning_rate": 0.0002, "epoch": 1.3854983428987149, "step": 8570}, {"loss": 0.732, "grad_norm": 0.7031095027923584, "learning_rate": 0.0002, "epoch": 1.38711502707946, "step": 8580}, {"loss": 0.7464, "grad_norm": 0.5198570489883423, "learning_rate": 0.0002, "epoch": 1.3887317112602053, "step": 8590}, {"loss": 0.7354, "grad_norm": 0.7261320352554321, "learning_rate": 0.0002, "epoch": 1.3903483954409506, "step": 8600}, {"loss": 0.7339, "grad_norm": 0.5616350173950195, "learning_rate": 0.0002, "epoch": 1.3919650796216958, "step": 8610}, {"loss": 0.7382, "grad_norm": 0.5185914635658264, "learning_rate": 0.0002, "epoch": 1.3935817638024413, "step": 8620}, {"loss": 0.7456, "grad_norm": 0.5814694762229919, "learning_rate": 0.0002, "epoch": 1.3951984479831865, "step": 8630}, {"loss": 0.7413, "grad_norm": 0.6977371573448181, "learning_rate": 0.0002, "epoch": 1.3968151321639317, "step": 8640}, {"loss": 0.7574, "grad_norm": 0.6855689883232117, "learning_rate": 0.0002, "epoch": 1.3984318163446772, "step": 8650}, {"loss": 0.7802, "grad_norm": 0.5414357781410217, "learning_rate": 0.0002, "epoch": 1.4000485005254224, "step": 8660}, {"loss": 0.7487, "grad_norm": 0.6970012784004211, "learning_rate": 0.0002, "epoch": 1.4016651847061676, "step": 8670}, {"loss": 0.7421, "grad_norm": 0.526079535484314, "learning_rate": 0.0002, "epoch": 1.4032818688869129, "step": 8680}, {"loss": 0.737, "grad_norm": 0.758712887763977, "learning_rate": 0.0002, "epoch": 1.404898553067658, "step": 8690}, {"loss": 0.7612, "grad_norm": 0.7118762731552124, "learning_rate": 0.0002, "epoch": 1.4065152372484035, "step": 8700}, {"loss": 0.7628, "grad_norm": 0.5696909427642822, "learning_rate": 0.0002, "epoch": 1.4081319214291488, "step": 8710}, {"loss": 0.7156, "grad_norm": 0.7995436787605286, "learning_rate": 0.0002, "epoch": 1.4097486056098942, "step": 8720}, {"loss": 0.7521, "grad_norm": 0.7237521409988403, "learning_rate": 0.0002, "epoch": 1.4113652897906395, "step": 8730}, {"loss": 0.7661, "grad_norm": 0.744628369808197, "learning_rate": 0.0002, "epoch": 1.4129819739713847, "step": 8740}, {"loss": 0.7073, "grad_norm": 0.6082926988601685, "learning_rate": 0.0002, "epoch": 1.41459865815213, "step": 8750}, {"loss": 0.7282, "grad_norm": 0.5185243487358093, "learning_rate": 0.0002, "epoch": 1.4162153423328752, "step": 8760}, {"loss": 0.7592, "grad_norm": 0.5183082222938538, "learning_rate": 0.0002, "epoch": 1.4178320265136206, "step": 8770}, {"loss": 0.7509, "grad_norm": 0.7326041460037231, "learning_rate": 0.0002, "epoch": 1.4194487106943658, "step": 8780}, {"loss": 0.7398, "grad_norm": 0.7174660563468933, "learning_rate": 0.0002, "epoch": 1.421065394875111, "step": 8790}, {"loss": 0.7507, "grad_norm": 0.8080165982246399, "learning_rate": 0.0002, "epoch": 1.4226820790558565, "step": 8800}, {"loss": 0.72, "grad_norm": 0.5061507821083069, "learning_rate": 0.0002, "epoch": 1.4242987632366018, "step": 8810}, {"loss": 0.7563, "grad_norm": 0.801602840423584, "learning_rate": 0.0002, "epoch": 1.425915447417347, "step": 8820}, {"loss": 0.7287, "grad_norm": 0.6150273084640503, "learning_rate": 0.0002, "epoch": 1.4275321315980922, "step": 8830}, {"loss": 0.7452, "grad_norm": 0.8786525726318359, "learning_rate": 0.0002, "epoch": 1.4291488157788377, "step": 8840}, {"loss": 0.7257, "grad_norm": 0.6371538639068604, "learning_rate": 0.0002, "epoch": 1.430765499959583, "step": 8850}, {"loss": 0.711, "grad_norm": 0.6409295797348022, "learning_rate": 0.0002, "epoch": 1.4323821841403281, "step": 8860}, {"loss": 0.7891, "grad_norm": 0.6452359557151794, "learning_rate": 0.0002, "epoch": 1.4339988683210736, "step": 8870}, {"loss": 0.7588, "grad_norm": 0.5842334628105164, "learning_rate": 0.0002, "epoch": 1.4356155525018188, "step": 8880}, {"loss": 0.7446, "grad_norm": 0.696761965751648, "learning_rate": 0.0002, "epoch": 1.437232236682564, "step": 8890}, {"loss": 0.7541, "grad_norm": 0.6384600400924683, "learning_rate": 0.0002, "epoch": 1.4388489208633093, "step": 8900}, {"loss": 0.7049, "grad_norm": 0.5981136560440063, "learning_rate": 0.0002, "epoch": 1.4404656050440545, "step": 8910}, {"loss": 0.795, "grad_norm": 0.6355637907981873, "learning_rate": 0.0002, "epoch": 1.4420822892248, "step": 8920}, {"loss": 0.7653, "grad_norm": 0.6374830603599548, "learning_rate": 0.0002, "epoch": 1.4436989734055452, "step": 8930}, {"loss": 0.8108, "grad_norm": 0.559013307094574, "learning_rate": 0.0002, "epoch": 1.4453156575862904, "step": 8940}, {"loss": 0.7045, "grad_norm": 0.7289170026779175, "learning_rate": 0.0002, "epoch": 1.446932341767036, "step": 8950}, {"loss": 0.7484, "grad_norm": 0.8649206757545471, "learning_rate": 0.0002, "epoch": 1.4485490259477811, "step": 8960}, {"loss": 0.7745, "grad_norm": 0.7664689421653748, "learning_rate": 0.0002, "epoch": 1.4501657101285264, "step": 8970}, {"loss": 0.7431, "grad_norm": 0.7109952569007874, "learning_rate": 0.0002, "epoch": 1.4517823943092716, "step": 8980}, {"loss": 0.7997, "grad_norm": 0.6312844753265381, "learning_rate": 0.0002, "epoch": 1.453399078490017, "step": 8990}, {"loss": 0.7467, "grad_norm": 0.6616617441177368, "learning_rate": 0.0002, "epoch": 1.4550157626707623, "step": 9000}, {"loss": 0.7518, "grad_norm": 0.7384068965911865, "learning_rate": 0.0002, "epoch": 1.4566324468515075, "step": 9010}, {"loss": 0.7483, "grad_norm": 0.6549670100212097, "learning_rate": 0.0002, "epoch": 1.458249131032253, "step": 9020}, {"loss": 0.7423, "grad_norm": 0.6254119277000427, "learning_rate": 0.0002, "epoch": 1.4598658152129982, "step": 9030}, {"loss": 0.7645, "grad_norm": 0.6806328892707825, "learning_rate": 0.0002, "epoch": 1.4614824993937434, "step": 9040}, {"loss": 0.7221, "grad_norm": 0.6803115010261536, "learning_rate": 0.0002, "epoch": 1.4630991835744886, "step": 9050}, {"loss": 0.7264, "grad_norm": 0.48529282212257385, "learning_rate": 0.0002, "epoch": 1.4647158677552339, "step": 9060}, {"loss": 0.7542, "grad_norm": 0.5995030999183655, "learning_rate": 0.0002, "epoch": 1.4663325519359793, "step": 9070}, {"loss": 0.7894, "grad_norm": 0.6005427837371826, "learning_rate": 0.0002, "epoch": 1.4679492361167246, "step": 9080}, {"loss": 0.7288, "grad_norm": 0.718564510345459, "learning_rate": 0.0002, "epoch": 1.46956592029747, "step": 9090}, {"loss": 0.7089, "grad_norm": 0.7003577351570129, "learning_rate": 0.0002, "epoch": 1.4711826044782153, "step": 9100}, {"loss": 0.8069, "grad_norm": 0.5888323783874512, "learning_rate": 0.0002, "epoch": 1.4727992886589605, "step": 9110}, {"loss": 0.7275, "grad_norm": 0.6417609453201294, "learning_rate": 0.0002, "epoch": 1.4744159728397057, "step": 9120}, {"loss": 0.7441, "grad_norm": 0.572294294834137, "learning_rate": 0.0002, "epoch": 1.476032657020451, "step": 9130}, {"loss": 0.8053, "grad_norm": 0.8200714588165283, "learning_rate": 0.0002, "epoch": 1.4776493412011964, "step": 9140}, {"loss": 0.7382, "grad_norm": 0.6343288421630859, "learning_rate": 0.0002, "epoch": 1.4792660253819416, "step": 9150}, {"loss": 0.7641, "grad_norm": 0.7017961144447327, "learning_rate": 0.0002, "epoch": 1.4808827095626869, "step": 9160}, {"loss": 0.7619, "grad_norm": 0.6202912926673889, "learning_rate": 0.0002, "epoch": 1.4824993937434323, "step": 9170}, {"loss": 0.7428, "grad_norm": 0.6677869558334351, "learning_rate": 0.0002, "epoch": 1.4841160779241775, "step": 9180}, {"loss": 0.7648, "grad_norm": 0.6052267551422119, "learning_rate": 0.0002, "epoch": 1.4857327621049228, "step": 9190}, {"loss": 0.7152, "grad_norm": 0.6638872027397156, "learning_rate": 0.0002, "epoch": 1.487349446285668, "step": 9200}, {"loss": 0.7448, "grad_norm": 0.6245523691177368, "learning_rate": 0.0002, "epoch": 1.4889661304664135, "step": 9210}, {"loss": 0.6958, "grad_norm": 0.5761767625808716, "learning_rate": 0.0002, "epoch": 1.4905828146471587, "step": 9220}, {"loss": 0.8012, "grad_norm": 0.8175981640815735, "learning_rate": 0.0002, "epoch": 1.492199498827904, "step": 9230}, {"loss": 0.683, "grad_norm": 0.9144009947776794, "learning_rate": 0.0002, "epoch": 1.4938161830086494, "step": 9240}, {"loss": 0.7623, "grad_norm": 0.5742552876472473, "learning_rate": 0.0002, "epoch": 1.4954328671893946, "step": 9250}, {"loss": 0.7418, "grad_norm": 0.534534215927124, "learning_rate": 0.0002, "epoch": 1.4970495513701398, "step": 9260}, {"loss": 0.7194, "grad_norm": 0.7836225032806396, "learning_rate": 0.0002, "epoch": 1.498666235550885, "step": 9270}, {"loss": 0.7453, "grad_norm": 0.5292993187904358, "learning_rate": 0.0002, "epoch": 1.5002829197316303, "step": 9280}, {"loss": 0.7168, "grad_norm": 0.8044071793556213, "learning_rate": 0.0002, "epoch": 1.5018996039123758, "step": 9290}, {"loss": 0.7229, "grad_norm": 0.6185805201530457, "learning_rate": 0.0002, "epoch": 1.503516288093121, "step": 9300}, {"loss": 0.684, "grad_norm": 0.6093607544898987, "learning_rate": 0.0002, "epoch": 1.5051329722738664, "step": 9310}, {"loss": 0.7973, "grad_norm": 0.5891730189323425, "learning_rate": 0.0002, "epoch": 1.5067496564546117, "step": 9320}, {"loss": 0.7474, "grad_norm": 0.6331129670143127, "learning_rate": 0.0002, "epoch": 1.508366340635357, "step": 9330}, {"loss": 0.7074, "grad_norm": 0.7690958380699158, "learning_rate": 0.0002, "epoch": 1.5099830248161021, "step": 9340}, {"loss": 0.672, "grad_norm": 0.6548877358436584, "learning_rate": 0.0002, "epoch": 1.5115997089968474, "step": 9350}, {"loss": 0.7408, "grad_norm": 0.6545143127441406, "learning_rate": 0.0002, "epoch": 1.5132163931775926, "step": 9360}, {"loss": 0.7432, "grad_norm": 0.553247332572937, "learning_rate": 0.0002, "epoch": 1.514833077358338, "step": 9370}, {"loss": 0.7265, "grad_norm": 0.8145074844360352, "learning_rate": 0.0002, "epoch": 1.5164497615390833, "step": 9380}, {"loss": 0.7379, "grad_norm": 0.7636994123458862, "learning_rate": 0.0002, "epoch": 1.5180664457198287, "step": 9390}, {"loss": 0.7413, "grad_norm": 0.6838982701301575, "learning_rate": 0.0002, "epoch": 1.519683129900574, "step": 9400}, {"loss": 0.7367, "grad_norm": 0.8599441647529602, "learning_rate": 0.0002, "epoch": 1.5212998140813192, "step": 9410}, {"loss": 0.7663, "grad_norm": 0.7020329833030701, "learning_rate": 0.0002, "epoch": 1.5229164982620644, "step": 9420}, {"loss": 0.7928, "grad_norm": 0.6964772343635559, "learning_rate": 0.0002, "epoch": 1.5245331824428097, "step": 9430}, {"loss": 0.7168, "grad_norm": 0.6916600465774536, "learning_rate": 0.0002, "epoch": 1.5261498666235551, "step": 9440}, {"loss": 0.7519, "grad_norm": 0.7282621264457703, "learning_rate": 0.0002, "epoch": 1.5277665508043003, "step": 9450}, {"loss": 0.7628, "grad_norm": 0.5363983511924744, "learning_rate": 0.0002, "epoch": 1.5293832349850458, "step": 9460}, {"loss": 0.7154, "grad_norm": 0.6184861063957214, "learning_rate": 0.0002, "epoch": 1.530999919165791, "step": 9470}, {"loss": 0.7837, "grad_norm": 0.5991285443305969, "learning_rate": 0.0002, "epoch": 1.5326166033465363, "step": 9480}, {"loss": 0.7827, "grad_norm": 0.8176587820053101, "learning_rate": 0.0002, "epoch": 1.5342332875272815, "step": 9490}, {"loss": 0.7415, "grad_norm": 0.6473721861839294, "learning_rate": 0.0002, "epoch": 1.5358499717080267, "step": 9500}, {"loss": 0.7632, "grad_norm": 0.7319952845573425, "learning_rate": 0.0002, "epoch": 1.5374666558887722, "step": 9510}, {"loss": 0.7706, "grad_norm": 0.702900230884552, "learning_rate": 0.0002, "epoch": 1.5390833400695174, "step": 9520}, {"loss": 0.7754, "grad_norm": 0.7971600294113159, "learning_rate": 0.0002, "epoch": 1.5407000242502629, "step": 9530}, {"loss": 0.7352, "grad_norm": 0.6527525186538696, "learning_rate": 0.0002, "epoch": 1.542316708431008, "step": 9540}, {"loss": 0.7425, "grad_norm": 0.5791676044464111, "learning_rate": 0.0002, "epoch": 1.5439333926117533, "step": 9550}, {"loss": 0.7585, "grad_norm": 0.5619390606880188, "learning_rate": 0.0002, "epoch": 1.5455500767924986, "step": 9560}, {"loss": 0.7894, "grad_norm": 0.5701689124107361, "learning_rate": 0.0002, "epoch": 1.5471667609732438, "step": 9570}, {"loss": 0.793, "grad_norm": 0.47549352049827576, "learning_rate": 0.0002, "epoch": 1.548783445153989, "step": 9580}, {"loss": 0.7276, "grad_norm": 0.8730611205101013, "learning_rate": 0.0002, "epoch": 1.5504001293347345, "step": 9590}, {"loss": 0.798, "grad_norm": 0.6842091083526611, "learning_rate": 0.0002, "epoch": 1.5520168135154797, "step": 9600}, {"loss": 0.7528, "grad_norm": 0.6675129532814026, "learning_rate": 0.0002, "epoch": 1.5536334976962252, "step": 9610}, {"loss": 0.7954, "grad_norm": 0.8173956274986267, "learning_rate": 0.0002, "epoch": 1.5552501818769704, "step": 9620}, {"loss": 0.7535, "grad_norm": 0.724947452545166, "learning_rate": 0.0002, "epoch": 1.5568668660577156, "step": 9630}, {"loss": 0.7738, "grad_norm": 0.6154758930206299, "learning_rate": 0.0002, "epoch": 1.5584835502384609, "step": 9640}, {"loss": 0.7568, "grad_norm": 0.6072008013725281, "learning_rate": 0.0002, "epoch": 1.560100234419206, "step": 9650}, {"loss": 0.7219, "grad_norm": 0.659010648727417, "learning_rate": 0.0002, "epoch": 1.5617169185999515, "step": 9660}, {"loss": 0.673, "grad_norm": 0.65857994556427, "learning_rate": 0.0002, "epoch": 1.5633336027806968, "step": 9670}, {"loss": 0.7156, "grad_norm": 0.5914267301559448, "learning_rate": 0.0002, "epoch": 1.5649502869614422, "step": 9680}, {"loss": 0.7414, "grad_norm": 0.6248020529747009, "learning_rate": 0.0002, "epoch": 1.5665669711421875, "step": 9690}, {"loss": 0.694, "grad_norm": 0.7147795557975769, "learning_rate": 0.0002, "epoch": 1.5681836553229327, "step": 9700}, {"loss": 0.7335, "grad_norm": 0.7076232433319092, "learning_rate": 0.0002, "epoch": 1.569800339503678, "step": 9710}, {"loss": 0.7413, "grad_norm": 0.6217400431632996, "learning_rate": 0.0002, "epoch": 1.5714170236844232, "step": 9720}, {"loss": 0.7296, "grad_norm": 0.6709911227226257, "learning_rate": 0.0002, "epoch": 1.5730337078651684, "step": 9730}, {"loss": 0.7306, "grad_norm": 0.749171257019043, "learning_rate": 0.0002, "epoch": 1.5746503920459138, "step": 9740}, {"loss": 0.7242, "grad_norm": 0.6241145730018616, "learning_rate": 0.0002, "epoch": 1.576267076226659, "step": 9750}, {"loss": 0.7384, "grad_norm": 0.4960934817790985, "learning_rate": 0.0002, "epoch": 1.5778837604074045, "step": 9760}, {"loss": 0.725, "grad_norm": 0.6593309640884399, "learning_rate": 0.0002, "epoch": 1.5795004445881498, "step": 9770}, {"loss": 0.7531, "grad_norm": 0.5814042091369629, "learning_rate": 0.0002, "epoch": 1.581117128768895, "step": 9780}, {"loss": 0.7109, "grad_norm": 0.5936070680618286, "learning_rate": 0.0002, "epoch": 1.5827338129496402, "step": 9790}, {"loss": 0.7769, "grad_norm": 0.6454403400421143, "learning_rate": 0.0002, "epoch": 1.5843504971303854, "step": 9800}, {"loss": 0.7677, "grad_norm": 0.7612107992172241, "learning_rate": 0.0002, "epoch": 1.585967181311131, "step": 9810}, {"loss": 0.7649, "grad_norm": 0.6494482755661011, "learning_rate": 0.0002, "epoch": 1.5875838654918761, "step": 9820}, {"loss": 0.7569, "grad_norm": 0.7825694680213928, "learning_rate": 0.0002, "epoch": 1.5892005496726216, "step": 9830}, {"loss": 0.706, "grad_norm": 0.6757757663726807, "learning_rate": 0.0002, "epoch": 1.5908172338533668, "step": 9840}, {"loss": 0.7803, "grad_norm": 0.7105609178543091, "learning_rate": 0.0002, "epoch": 1.592433918034112, "step": 9850}, {"loss": 0.7925, "grad_norm": 0.7596991062164307, "learning_rate": 0.0002, "epoch": 1.5940506022148573, "step": 9860}, {"loss": 0.7108, "grad_norm": 0.5681525468826294, "learning_rate": 0.0002, "epoch": 1.5956672863956025, "step": 9870}, {"loss": 0.7811, "grad_norm": 0.6090980768203735, "learning_rate": 0.0002, "epoch": 1.5972839705763477, "step": 9880}, {"loss": 0.7339, "grad_norm": 0.6271613240242004, "learning_rate": 0.0002, "epoch": 1.5989006547570932, "step": 9890}, {"loss": 0.7419, "grad_norm": 0.7656369805335999, "learning_rate": 0.0002, "epoch": 1.6005173389378387, "step": 9900}, {"loss": 0.7336, "grad_norm": 0.7504446506500244, "learning_rate": 0.0002, "epoch": 1.6021340231185839, "step": 9910}, {"loss": 0.7479, "grad_norm": 0.659656286239624, "learning_rate": 0.0002, "epoch": 1.6037507072993291, "step": 9920}, {"loss": 0.7483, "grad_norm": 0.6006826162338257, "learning_rate": 0.0002, "epoch": 1.6053673914800743, "step": 9930}, {"loss": 0.732, "grad_norm": 0.7872757911682129, "learning_rate": 0.0002, "epoch": 1.6069840756608196, "step": 9940}, {"loss": 0.768, "grad_norm": 0.5545852780342102, "learning_rate": 0.0002, "epoch": 1.6086007598415648, "step": 9950}, {"loss": 0.8064, "grad_norm": 0.7429468631744385, "learning_rate": 0.0002, "epoch": 1.6102174440223103, "step": 9960}, {"loss": 0.714, "grad_norm": 0.6873556971549988, "learning_rate": 0.0002, "epoch": 1.6118341282030555, "step": 9970}, {"loss": 0.7324, "grad_norm": 0.5874287486076355, "learning_rate": 0.0002, "epoch": 1.613450812383801, "step": 9980}, {"loss": 0.7141, "grad_norm": 0.6039386987686157, "learning_rate": 0.0002, "epoch": 1.6150674965645462, "step": 9990}, {"loss": 0.6674, "grad_norm": 0.6233575940132141, "learning_rate": 0.0002, "epoch": 1.6166841807452914, "step": 10000}, {"loss": 0.7602, "grad_norm": 0.7676448225975037, "learning_rate": 0.0002, "epoch": 1.6183008649260366, "step": 10010}, {"loss": 0.7784, "grad_norm": 0.6565698385238647, "learning_rate": 0.0002, "epoch": 1.6199175491067819, "step": 10020}, {"loss": 0.7104, "grad_norm": 0.6787590384483337, "learning_rate": 0.0002, "epoch": 1.6215342332875273, "step": 10030}, {"loss": 0.7464, "grad_norm": 0.6137678027153015, "learning_rate": 0.0002, "epoch": 1.6231509174682726, "step": 10040}, {"loss": 0.7646, "grad_norm": 0.5236800312995911, "learning_rate": 0.0002, "epoch": 1.624767601649018, "step": 10050}, {"loss": 0.7437, "grad_norm": 0.7626367807388306, "learning_rate": 0.0002, "epoch": 1.6263842858297632, "step": 10060}, {"loss": 0.7273, "grad_norm": 0.5657260417938232, "learning_rate": 0.0002, "epoch": 1.6280009700105085, "step": 10070}, {"loss": 0.7354, "grad_norm": 0.4913991391658783, "learning_rate": 0.0002, "epoch": 1.6296176541912537, "step": 10080}, {"loss": 0.7596, "grad_norm": 0.7715556621551514, "learning_rate": 0.0002, "epoch": 1.631234338371999, "step": 10090}, {"loss": 0.7105, "grad_norm": 0.6509000062942505, "learning_rate": 0.0002, "epoch": 1.6328510225527442, "step": 10100}, {"loss": 0.7274, "grad_norm": 0.6215850114822388, "learning_rate": 0.0002, "epoch": 1.6344677067334896, "step": 10110}, {"loss": 0.7705, "grad_norm": 0.6956844329833984, "learning_rate": 0.0002, "epoch": 1.6360843909142349, "step": 10120}, {"loss": 0.7129, "grad_norm": 0.6111597418785095, "learning_rate": 0.0002, "epoch": 1.6377010750949803, "step": 10130}, {"loss": 0.6955, "grad_norm": 0.6518288850784302, "learning_rate": 0.0002, "epoch": 1.6393177592757255, "step": 10140}, {"loss": 0.731, "grad_norm": 0.6914522051811218, "learning_rate": 0.0002, "epoch": 1.6409344434564708, "step": 10150}, {"loss": 0.7295, "grad_norm": 0.63785719871521, "learning_rate": 0.0002, "epoch": 1.642551127637216, "step": 10160}, {"loss": 0.7355, "grad_norm": 0.6379287838935852, "learning_rate": 0.0002, "epoch": 1.6441678118179612, "step": 10170}, {"loss": 0.7359, "grad_norm": 0.6793403029441833, "learning_rate": 0.0002, "epoch": 1.6457844959987067, "step": 10180}, {"loss": 0.7402, "grad_norm": 0.6099132895469666, "learning_rate": 0.0002, "epoch": 1.647401180179452, "step": 10190}, {"loss": 0.7353, "grad_norm": 0.5869854092597961, "learning_rate": 0.0002, "epoch": 1.6490178643601974, "step": 10200}, {"loss": 0.8308, "grad_norm": 0.7716999053955078, "learning_rate": 0.0002, "epoch": 1.6506345485409426, "step": 10210}, {"loss": 0.7215, "grad_norm": 0.6854110360145569, "learning_rate": 0.0002, "epoch": 1.6522512327216878, "step": 10220}, {"loss": 0.782, "grad_norm": 0.6957170367240906, "learning_rate": 0.0002, "epoch": 1.653867916902433, "step": 10230}, {"loss": 0.7282, "grad_norm": 0.6932903528213501, "learning_rate": 0.0002, "epoch": 1.6554846010831783, "step": 10240}, {"loss": 0.7478, "grad_norm": 0.7713165283203125, "learning_rate": 0.0002, "epoch": 1.6571012852639235, "step": 10250}, {"loss": 0.7099, "grad_norm": 0.7455793619155884, "learning_rate": 0.0002, "epoch": 1.658717969444669, "step": 10260}, {"loss": 0.7524, "grad_norm": 0.5464168190956116, "learning_rate": 0.0002, "epoch": 1.6603346536254144, "step": 10270}, {"loss": 0.7328, "grad_norm": 0.6782926321029663, "learning_rate": 0.0002, "epoch": 1.6619513378061597, "step": 10280}, {"loss": 0.7801, "grad_norm": 0.7962649464607239, "learning_rate": 0.0002, "epoch": 1.663568021986905, "step": 10290}, {"loss": 0.7142, "grad_norm": 0.6814526319503784, "learning_rate": 0.0002, "epoch": 1.6651847061676501, "step": 10300}, {"loss": 0.7285, "grad_norm": 0.656895101070404, "learning_rate": 0.0002, "epoch": 1.6668013903483954, "step": 10310}, {"loss": 0.7358, "grad_norm": 0.6085672378540039, "learning_rate": 0.0002, "epoch": 1.6684180745291406, "step": 10320}, {"loss": 0.7074, "grad_norm": 0.585508406162262, "learning_rate": 0.0002, "epoch": 1.670034758709886, "step": 10330}, {"loss": 0.7604, "grad_norm": 0.6930184364318848, "learning_rate": 0.0002, "epoch": 1.6716514428906313, "step": 10340}, {"loss": 0.7169, "grad_norm": 0.575663149356842, "learning_rate": 0.0002, "epoch": 1.6732681270713767, "step": 10350}, {"loss": 0.7198, "grad_norm": 0.582502543926239, "learning_rate": 0.0002, "epoch": 1.674884811252122, "step": 10360}, {"loss": 0.7793, "grad_norm": 0.5668916702270508, "learning_rate": 0.0002, "epoch": 1.6765014954328672, "step": 10370}, {"loss": 0.7478, "grad_norm": 0.6070065498352051, "learning_rate": 0.0002, "epoch": 1.6781181796136124, "step": 10380}, {"loss": 0.7939, "grad_norm": 0.6141316294670105, "learning_rate": 0.0002, "epoch": 1.6797348637943577, "step": 10390}, {"loss": 0.7573, "grad_norm": 0.8359124064445496, "learning_rate": 0.0002, "epoch": 1.6813515479751031, "step": 10400}, {"loss": 0.7488, "grad_norm": 0.5378185510635376, "learning_rate": 0.0002, "epoch": 1.6829682321558483, "step": 10410}, {"loss": 0.7588, "grad_norm": 0.6959536075592041, "learning_rate": 0.0002, "epoch": 1.6845849163365938, "step": 10420}, {"loss": 0.7872, "grad_norm": 0.6514357328414917, "learning_rate": 0.0002, "epoch": 1.686201600517339, "step": 10430}, {"loss": 0.725, "grad_norm": 0.7706646919250488, "learning_rate": 0.0002, "epoch": 1.6878182846980843, "step": 10440}, {"loss": 0.7673, "grad_norm": 0.6183337569236755, "learning_rate": 0.0002, "epoch": 1.6894349688788295, "step": 10450}, {"loss": 0.7566, "grad_norm": 0.6123278141021729, "learning_rate": 0.0002, "epoch": 1.6910516530595747, "step": 10460}, {"loss": 0.7169, "grad_norm": 0.6894851326942444, "learning_rate": 0.0002, "epoch": 1.69266833724032, "step": 10470}, {"loss": 0.7435, "grad_norm": 0.7497312426567078, "learning_rate": 0.0002, "epoch": 1.6942850214210654, "step": 10480}, {"loss": 0.7544, "grad_norm": 0.5968214273452759, "learning_rate": 0.0002, "epoch": 1.6959017056018106, "step": 10490}, {"loss": 0.6793, "grad_norm": 0.6747927069664001, "learning_rate": 0.0002, "epoch": 1.697518389782556, "step": 10500}, {"loss": 0.7415, "grad_norm": 0.5708310008049011, "learning_rate": 0.0002, "epoch": 1.6991350739633013, "step": 10510}, {"loss": 0.7385, "grad_norm": 0.606526792049408, "learning_rate": 0.0002, "epoch": 1.7007517581440466, "step": 10520}, {"loss": 0.7204, "grad_norm": 0.662011981010437, "learning_rate": 0.0002, "epoch": 1.7023684423247918, "step": 10530}, {"loss": 0.7999, "grad_norm": 0.7583045363426208, "learning_rate": 0.0002, "epoch": 1.703985126505537, "step": 10540}, {"loss": 0.7563, "grad_norm": 0.721632182598114, "learning_rate": 0.0002, "epoch": 1.7056018106862825, "step": 10550}, {"loss": 0.7407, "grad_norm": 0.6107715368270874, "learning_rate": 0.0002, "epoch": 1.7072184948670277, "step": 10560}, {"loss": 0.7519, "grad_norm": 0.6652471423149109, "learning_rate": 0.0002, "epoch": 1.7088351790477732, "step": 10570}, {"loss": 0.7767, "grad_norm": 0.6308087110519409, "learning_rate": 0.0002, "epoch": 1.7104518632285184, "step": 10580}, {"loss": 0.7659, "grad_norm": 0.5464386940002441, "learning_rate": 0.0002, "epoch": 1.7120685474092636, "step": 10590}, {"loss": 0.7063, "grad_norm": 0.6558911204338074, "learning_rate": 0.0002, "epoch": 1.7136852315900089, "step": 10600}, {"loss": 0.7126, "grad_norm": 0.5665024518966675, "learning_rate": 0.0002, "epoch": 1.715301915770754, "step": 10610}, {"loss": 0.6958, "grad_norm": 0.7888094186782837, "learning_rate": 0.0002, "epoch": 1.7169185999514993, "step": 10620}, {"loss": 0.7785, "grad_norm": 0.7084909081459045, "learning_rate": 0.0002, "epoch": 1.7185352841322448, "step": 10630}, {"loss": 0.7557, "grad_norm": 0.7982324361801147, "learning_rate": 0.0002, "epoch": 1.7201519683129902, "step": 10640}, {"loss": 0.7345, "grad_norm": 0.6418732404708862, "learning_rate": 0.0002, "epoch": 1.7217686524937355, "step": 10650}, {"loss": 0.7734, "grad_norm": 0.7636681795120239, "learning_rate": 0.0002, "epoch": 1.7233853366744807, "step": 10660}, {"loss": 0.7541, "grad_norm": 0.5646875500679016, "learning_rate": 0.0002, "epoch": 1.725002020855226, "step": 10670}, {"loss": 0.7642, "grad_norm": 0.5231260657310486, "learning_rate": 0.0002, "epoch": 1.7266187050359711, "step": 10680}, {"loss": 0.7846, "grad_norm": 0.7635011672973633, "learning_rate": 0.0002, "epoch": 1.7282353892167164, "step": 10690}, {"loss": 0.7471, "grad_norm": 0.7518259286880493, "learning_rate": 0.0002, "epoch": 1.7298520733974618, "step": 10700}, {"loss": 0.751, "grad_norm": 0.7295602560043335, "learning_rate": 0.0002, "epoch": 1.731468757578207, "step": 10710}, {"loss": 0.731, "grad_norm": 0.6984632015228271, "learning_rate": 0.0002, "epoch": 1.7330854417589525, "step": 10720}, {"loss": 0.7921, "grad_norm": 0.6198219060897827, "learning_rate": 0.0002, "epoch": 1.7347021259396977, "step": 10730}, {"loss": 0.7642, "grad_norm": 0.6957576274871826, "learning_rate": 0.0002, "epoch": 1.736318810120443, "step": 10740}, {"loss": 0.7917, "grad_norm": 0.6430263519287109, "learning_rate": 0.0002, "epoch": 1.7379354943011882, "step": 10750}, {"loss": 0.7156, "grad_norm": 0.6134995222091675, "learning_rate": 0.0002, "epoch": 1.7395521784819334, "step": 10760}, {"loss": 0.7584, "grad_norm": 0.7209452986717224, "learning_rate": 0.0002, "epoch": 1.741168862662679, "step": 10770}, {"loss": 0.7528, "grad_norm": 0.6735447645187378, "learning_rate": 0.0002, "epoch": 1.7427855468434241, "step": 10780}, {"loss": 0.756, "grad_norm": 0.5605693459510803, "learning_rate": 0.0002, "epoch": 1.7444022310241696, "step": 10790}, {"loss": 0.7759, "grad_norm": 0.6882363557815552, "learning_rate": 0.0002, "epoch": 1.7460189152049148, "step": 10800}, {"loss": 0.7544, "grad_norm": 0.6386259198188782, "learning_rate": 0.0002, "epoch": 1.74763559938566, "step": 10810}, {"loss": 0.7697, "grad_norm": 0.6529015302658081, "learning_rate": 0.0002, "epoch": 1.7492522835664053, "step": 10820}, {"loss": 0.7219, "grad_norm": 0.5664082765579224, "learning_rate": 0.0002, "epoch": 1.7508689677471505, "step": 10830}, {"loss": 0.7586, "grad_norm": 0.7532684206962585, "learning_rate": 0.0002, "epoch": 1.7524856519278957, "step": 10840}, {"loss": 0.6919, "grad_norm": 0.77171391248703, "learning_rate": 0.0002, "epoch": 1.7541023361086412, "step": 10850}, {"loss": 0.785, "grad_norm": 0.7255431413650513, "learning_rate": 0.0002, "epoch": 1.7557190202893864, "step": 10860}, {"loss": 0.7458, "grad_norm": 0.763083279132843, "learning_rate": 0.0002, "epoch": 1.7573357044701319, "step": 10870}, {"loss": 0.7846, "grad_norm": 0.6042402982711792, "learning_rate": 0.0002, "epoch": 1.758952388650877, "step": 10880}, {"loss": 0.7027, "grad_norm": 0.7642518281936646, "learning_rate": 0.0002, "epoch": 1.7605690728316223, "step": 10890}, {"loss": 0.746, "grad_norm": 0.6347904801368713, "learning_rate": 0.0002, "epoch": 1.7621857570123676, "step": 10900}, {"loss": 0.7458, "grad_norm": 0.5371627807617188, "learning_rate": 0.0002, "epoch": 1.7638024411931128, "step": 10910}, {"loss": 0.7466, "grad_norm": 0.6840225458145142, "learning_rate": 0.0002, "epoch": 1.7654191253738583, "step": 10920}, {"loss": 0.725, "grad_norm": 0.5288469195365906, "learning_rate": 0.0002, "epoch": 1.7670358095546035, "step": 10930}, {"loss": 0.7863, "grad_norm": 0.69020676612854, "learning_rate": 0.0002, "epoch": 1.768652493735349, "step": 10940}, {"loss": 0.7468, "grad_norm": 0.5943242311477661, "learning_rate": 0.0002, "epoch": 1.7702691779160942, "step": 10950}, {"loss": 0.7244, "grad_norm": 0.5616418123245239, "learning_rate": 0.0002, "epoch": 1.7718858620968394, "step": 10960}, {"loss": 0.7137, "grad_norm": 0.7209470868110657, "learning_rate": 0.0002, "epoch": 1.7735025462775846, "step": 10970}, {"loss": 0.7459, "grad_norm": 0.6657957434654236, "learning_rate": 0.0002, "epoch": 1.7751192304583299, "step": 10980}, {"loss": 0.7076, "grad_norm": 0.6469064950942993, "learning_rate": 0.0002, "epoch": 1.776735914639075, "step": 10990}, {"loss": 0.7321, "grad_norm": 0.6615678071975708, "learning_rate": 0.0002, "epoch": 1.7783525988198206, "step": 11000}, {"loss": 0.747, "grad_norm": 0.6722439527511597, "learning_rate": 0.0002, "epoch": 1.779969283000566, "step": 11010}, {"loss": 0.7302, "grad_norm": 0.634136974811554, "learning_rate": 0.0002, "epoch": 1.7815859671813112, "step": 11020}, {"loss": 0.8105, "grad_norm": 0.6024377346038818, "learning_rate": 0.0002, "epoch": 1.7832026513620565, "step": 11030}, {"loss": 0.7855, "grad_norm": 0.6909403800964355, "learning_rate": 0.0002, "epoch": 1.7848193355428017, "step": 11040}, {"loss": 0.7471, "grad_norm": 0.7148767709732056, "learning_rate": 0.0002, "epoch": 1.786436019723547, "step": 11050}, {"loss": 0.7145, "grad_norm": 0.7442979216575623, "learning_rate": 0.0002, "epoch": 1.7880527039042922, "step": 11060}, {"loss": 0.7215, "grad_norm": 0.6830431818962097, "learning_rate": 0.0002, "epoch": 1.7896693880850376, "step": 11070}, {"loss": 0.7625, "grad_norm": 0.9172667264938354, "learning_rate": 0.0002, "epoch": 1.7912860722657828, "step": 11080}, {"loss": 0.76, "grad_norm": 0.6799490451812744, "learning_rate": 0.0002, "epoch": 1.7929027564465283, "step": 11090}, {"loss": 0.7716, "grad_norm": 0.7617024779319763, "learning_rate": 0.0002, "epoch": 1.7945194406272735, "step": 11100}, {"loss": 0.7586, "grad_norm": 0.7701810002326965, "learning_rate": 0.0002, "epoch": 1.7961361248080188, "step": 11110}, {"loss": 0.7843, "grad_norm": 0.7454385757446289, "learning_rate": 0.0002, "epoch": 1.797752808988764, "step": 11120}, {"loss": 0.7873, "grad_norm": 0.6121436953544617, "learning_rate": 0.0002, "epoch": 1.7993694931695092, "step": 11130}, {"loss": 0.7305, "grad_norm": 0.6237571835517883, "learning_rate": 0.0002, "epoch": 1.8009861773502547, "step": 11140}, {"loss": 0.6827, "grad_norm": 0.6818515658378601, "learning_rate": 0.0002, "epoch": 1.802602861531, "step": 11150}, {"loss": 0.6876, "grad_norm": 0.7768308520317078, "learning_rate": 0.0002, "epoch": 1.8042195457117454, "step": 11160}, {"loss": 0.7533, "grad_norm": 0.6875537633895874, "learning_rate": 0.0002, "epoch": 1.8058362298924906, "step": 11170}, {"loss": 0.761, "grad_norm": 0.7950584888458252, "learning_rate": 0.0002, "epoch": 1.8074529140732358, "step": 11180}, {"loss": 0.7623, "grad_norm": 0.8210248351097107, "learning_rate": 0.0002, "epoch": 1.809069598253981, "step": 11190}, {"loss": 0.7556, "grad_norm": 0.6674110889434814, "learning_rate": 0.0002, "epoch": 1.8106862824347263, "step": 11200}, {"loss": 0.7663, "grad_norm": 0.6261674761772156, "learning_rate": 0.0002, "epoch": 1.8123029666154715, "step": 11210}, {"loss": 0.7122, "grad_norm": 0.6484741568565369, "learning_rate": 0.0002, "epoch": 1.813919650796217, "step": 11220}, {"loss": 0.7718, "grad_norm": 0.6231244206428528, "learning_rate": 0.0002, "epoch": 1.8155363349769622, "step": 11230}, {"loss": 0.7152, "grad_norm": 0.7243146896362305, "learning_rate": 0.0002, "epoch": 1.8171530191577077, "step": 11240}, {"loss": 0.7448, "grad_norm": 0.6776193380355835, "learning_rate": 0.0002, "epoch": 1.818769703338453, "step": 11250}, {"loss": 0.7317, "grad_norm": 0.5973618030548096, "learning_rate": 0.0002, "epoch": 1.8203863875191981, "step": 11260}, {"loss": 0.7961, "grad_norm": 0.6451361179351807, "learning_rate": 0.0002, "epoch": 1.8220030716999434, "step": 11270}, {"loss": 0.7611, "grad_norm": 0.5963068008422852, "learning_rate": 0.0002, "epoch": 1.8236197558806886, "step": 11280}, {"loss": 0.7466, "grad_norm": 0.536902129650116, "learning_rate": 0.0002, "epoch": 1.825236440061434, "step": 11290}, {"loss": 0.708, "grad_norm": 0.6993787288665771, "learning_rate": 0.0002, "epoch": 1.8268531242421793, "step": 11300}, {"loss": 0.7153, "grad_norm": 0.6135255098342896, "learning_rate": 0.0002, "epoch": 1.8284698084229247, "step": 11310}, {"loss": 0.7423, "grad_norm": 0.6057423949241638, "learning_rate": 0.0002, "epoch": 1.83008649260367, "step": 11320}, {"loss": 0.735, "grad_norm": 0.6598812341690063, "learning_rate": 0.0002, "epoch": 1.8317031767844152, "step": 11330}, {"loss": 0.7278, "grad_norm": 0.6075948476791382, "learning_rate": 0.0002, "epoch": 1.8333198609651604, "step": 11340}, {"loss": 0.7846, "grad_norm": 0.7065447568893433, "learning_rate": 0.0002, "epoch": 1.8349365451459057, "step": 11350}, {"loss": 0.7365, "grad_norm": 0.680526614189148, "learning_rate": 0.0002, "epoch": 1.8365532293266509, "step": 11360}, {"loss": 0.7152, "grad_norm": 0.6356695294380188, "learning_rate": 0.0002, "epoch": 1.8381699135073963, "step": 11370}, {"loss": 0.721, "grad_norm": 0.6399052143096924, "learning_rate": 0.0002, "epoch": 1.8397865976881416, "step": 11380}, {"loss": 0.7618, "grad_norm": 0.6125704050064087, "learning_rate": 0.0002, "epoch": 1.841403281868887, "step": 11390}, {"loss": 0.755, "grad_norm": 0.7124643325805664, "learning_rate": 0.0002, "epoch": 1.8430199660496323, "step": 11400}, {"loss": 0.7972, "grad_norm": 0.6099604964256287, "learning_rate": 0.0002, "epoch": 1.8446366502303775, "step": 11410}, {"loss": 0.7187, "grad_norm": 0.7338208556175232, "learning_rate": 0.0002, "epoch": 1.8462533344111227, "step": 11420}, {"loss": 0.7007, "grad_norm": 0.7534668445587158, "learning_rate": 0.0002, "epoch": 1.847870018591868, "step": 11430}, {"loss": 0.7464, "grad_norm": 0.6135470271110535, "learning_rate": 0.0002, "epoch": 1.8494867027726134, "step": 11440}, {"loss": 0.7955, "grad_norm": 0.6229309439659119, "learning_rate": 0.0002, "epoch": 1.8511033869533586, "step": 11450}, {"loss": 0.7594, "grad_norm": 0.706423282623291, "learning_rate": 0.0002, "epoch": 1.852720071134104, "step": 11460}, {"loss": 0.7411, "grad_norm": 0.5460049510002136, "learning_rate": 0.0002, "epoch": 1.8543367553148493, "step": 11470}, {"loss": 0.7416, "grad_norm": 0.6616711020469666, "learning_rate": 0.0002, "epoch": 1.8559534394955945, "step": 11480}, {"loss": 0.729, "grad_norm": 0.6372783184051514, "learning_rate": 0.0002, "epoch": 1.8575701236763398, "step": 11490}, {"loss": 0.7333, "grad_norm": 0.7162668108940125, "learning_rate": 0.0002, "epoch": 1.859186807857085, "step": 11500}, {"loss": 0.7747, "grad_norm": 0.6605209708213806, "learning_rate": 0.0002, "epoch": 1.8608034920378305, "step": 11510}, {"loss": 0.7258, "grad_norm": 0.6933956742286682, "learning_rate": 0.0002, "epoch": 1.8624201762185757, "step": 11520}, {"loss": 0.7243, "grad_norm": 0.6582090854644775, "learning_rate": 0.0002, "epoch": 1.8640368603993211, "step": 11530}, {"loss": 0.7313, "grad_norm": 0.6416500806808472, "learning_rate": 0.0002, "epoch": 1.8656535445800664, "step": 11540}, {"loss": 0.7372, "grad_norm": 0.5434312224388123, "learning_rate": 0.0002, "epoch": 1.8672702287608116, "step": 11550}, {"loss": 0.7635, "grad_norm": 0.6827567219734192, "learning_rate": 0.0002, "epoch": 1.8688869129415568, "step": 11560}, {"loss": 0.7137, "grad_norm": 0.7354370951652527, "learning_rate": 0.0002, "epoch": 1.870503597122302, "step": 11570}, {"loss": 0.7526, "grad_norm": 0.590372622013092, "learning_rate": 0.0002, "epoch": 1.8721202813030473, "step": 11580}, {"loss": 0.731, "grad_norm": 0.853183925151825, "learning_rate": 0.0002, "epoch": 1.8737369654837928, "step": 11590}, {"loss": 0.7487, "grad_norm": 0.822678804397583, "learning_rate": 0.0002, "epoch": 1.875353649664538, "step": 11600}, {"loss": 0.7427, "grad_norm": 0.6591550707817078, "learning_rate": 0.0002, "epoch": 1.8769703338452834, "step": 11610}, {"loss": 0.7054, "grad_norm": 0.7475301623344421, "learning_rate": 0.0002, "epoch": 1.8785870180260287, "step": 11620}, {"loss": 0.811, "grad_norm": 0.6390765309333801, "learning_rate": 0.0002, "epoch": 1.880203702206774, "step": 11630}, {"loss": 0.7531, "grad_norm": 0.6589758992195129, "learning_rate": 0.0002, "epoch": 1.8818203863875191, "step": 11640}, {"loss": 0.7475, "grad_norm": 0.6765508651733398, "learning_rate": 0.0002, "epoch": 1.8834370705682644, "step": 11650}, {"loss": 0.738, "grad_norm": 0.6527857780456543, "learning_rate": 0.0002, "epoch": 1.8850537547490098, "step": 11660}, {"loss": 0.7504, "grad_norm": 0.6642923951148987, "learning_rate": 0.0002, "epoch": 1.886670438929755, "step": 11670}, {"loss": 0.7701, "grad_norm": 0.6945584416389465, "learning_rate": 0.0002, "epoch": 1.8882871231105005, "step": 11680}, {"loss": 0.7711, "grad_norm": 0.694018542766571, "learning_rate": 0.0002, "epoch": 1.8899038072912457, "step": 11690}, {"loss": 0.7195, "grad_norm": 0.7237417101860046, "learning_rate": 0.0002, "epoch": 1.891520491471991, "step": 11700}, {"loss": 0.7491, "grad_norm": 0.7401309609413147, "learning_rate": 0.0002, "epoch": 1.8931371756527362, "step": 11710}, {"loss": 0.805, "grad_norm": 0.6537784337997437, "learning_rate": 0.0002, "epoch": 1.8947538598334814, "step": 11720}, {"loss": 0.793, "grad_norm": 0.7398539185523987, "learning_rate": 0.0002, "epoch": 1.8963705440142267, "step": 11730}, {"loss": 0.7561, "grad_norm": 0.6696075797080994, "learning_rate": 0.0002, "epoch": 1.8979872281949721, "step": 11740}, {"loss": 0.7353, "grad_norm": 0.6014142036437988, "learning_rate": 0.0002, "epoch": 1.8996039123757174, "step": 11750}, {"loss": 0.7714, "grad_norm": 0.7023524641990662, "learning_rate": 0.0002, "epoch": 1.9012205965564628, "step": 11760}, {"loss": 0.7088, "grad_norm": 0.739973783493042, "learning_rate": 0.0002, "epoch": 1.902837280737208, "step": 11770}, {"loss": 0.7848, "grad_norm": 0.5576770901679993, "learning_rate": 0.0002, "epoch": 1.9044539649179533, "step": 11780}, {"loss": 0.7483, "grad_norm": 0.6907393932342529, "learning_rate": 0.0002, "epoch": 1.9060706490986985, "step": 11790}, {"loss": 0.7827, "grad_norm": 0.6934581995010376, "learning_rate": 0.0002, "epoch": 1.9076873332794437, "step": 11800}, {"loss": 0.7199, "grad_norm": 0.591774582862854, "learning_rate": 0.0002, "epoch": 1.9093040174601892, "step": 11810}, {"loss": 0.7333, "grad_norm": 0.6249791383743286, "learning_rate": 0.0002, "epoch": 1.9109207016409344, "step": 11820}, {"loss": 0.7581, "grad_norm": 0.6755744218826294, "learning_rate": 0.0002, "epoch": 1.9125373858216799, "step": 11830}, {"loss": 0.696, "grad_norm": 0.7286285161972046, "learning_rate": 0.0002, "epoch": 1.914154070002425, "step": 11840}, {"loss": 0.7509, "grad_norm": 0.7867850065231323, "learning_rate": 0.0002, "epoch": 1.9157707541831703, "step": 11850}, {"loss": 0.735, "grad_norm": 0.6283972859382629, "learning_rate": 0.0002, "epoch": 1.9173874383639156, "step": 11860}, {"loss": 0.7296, "grad_norm": 0.605823814868927, "learning_rate": 0.0002, "epoch": 1.9190041225446608, "step": 11870}, {"loss": 0.6598, "grad_norm": 0.5927976965904236, "learning_rate": 0.0002, "epoch": 1.920620806725406, "step": 11880}, {"loss": 0.7649, "grad_norm": 0.5974002480506897, "learning_rate": 0.0002, "epoch": 1.9222374909061515, "step": 11890}, {"loss": 0.7843, "grad_norm": 0.7091866135597229, "learning_rate": 0.0002, "epoch": 1.923854175086897, "step": 11900}, {"loss": 0.775, "grad_norm": 0.72496497631073, "learning_rate": 0.0002, "epoch": 1.9254708592676422, "step": 11910}, {"loss": 0.7153, "grad_norm": 0.6131896376609802, "learning_rate": 0.0002, "epoch": 1.9270875434483874, "step": 11920}, {"loss": 0.7228, "grad_norm": 0.6556436419487, "learning_rate": 0.0002, "epoch": 1.9287042276291326, "step": 11930}, {"loss": 0.7319, "grad_norm": 0.622932493686676, "learning_rate": 0.0002, "epoch": 1.9303209118098779, "step": 11940}, {"loss": 0.7592, "grad_norm": 0.6618631482124329, "learning_rate": 0.0002, "epoch": 1.931937595990623, "step": 11950}, {"loss": 0.8332, "grad_norm": 0.630966305732727, "learning_rate": 0.0002, "epoch": 1.9335542801713685, "step": 11960}, {"loss": 0.6854, "grad_norm": 0.6336734890937805, "learning_rate": 0.0002, "epoch": 1.9351709643521138, "step": 11970}, {"loss": 0.7433, "grad_norm": 0.655403196811676, "learning_rate": 0.0002, "epoch": 1.9367876485328592, "step": 11980}, {"loss": 0.7282, "grad_norm": 0.5640574097633362, "learning_rate": 0.0002, "epoch": 1.9384043327136045, "step": 11990}, {"loss": 0.7289, "grad_norm": 0.6322951316833496, "learning_rate": 0.0002, "epoch": 1.9400210168943497, "step": 12000}, {"loss": 0.7627, "grad_norm": 0.615703821182251, "learning_rate": 0.0002, "epoch": 1.941637701075095, "step": 12010}, {"loss": 0.786, "grad_norm": 0.6487536430358887, "learning_rate": 0.0002, "epoch": 1.9432543852558402, "step": 12020}, {"loss": 0.7435, "grad_norm": 0.9209630489349365, "learning_rate": 0.0002, "epoch": 1.9448710694365856, "step": 12030}, {"loss": 0.7274, "grad_norm": 0.67485511302948, "learning_rate": 0.0002, "epoch": 1.9464877536173308, "step": 12040}, {"loss": 0.7551, "grad_norm": 0.6831230521202087, "learning_rate": 0.0002, "epoch": 1.9481044377980763, "step": 12050}, {"loss": 0.7546, "grad_norm": 0.6578302383422852, "learning_rate": 0.0002, "epoch": 1.9497211219788215, "step": 12060}, {"loss": 0.6989, "grad_norm": 0.9975938200950623, "learning_rate": 0.0002, "epoch": 1.9513378061595668, "step": 12070}, {"loss": 0.7952, "grad_norm": 0.6637365221977234, "learning_rate": 0.0002, "epoch": 1.952954490340312, "step": 12080}, {"loss": 0.7482, "grad_norm": 0.605707049369812, "learning_rate": 0.0002, "epoch": 1.9545711745210572, "step": 12090}, {"loss": 0.7768, "grad_norm": 0.6584440469741821, "learning_rate": 0.0002, "epoch": 1.9561878587018025, "step": 12100}, {"loss": 0.7187, "grad_norm": 0.6070835590362549, "learning_rate": 0.0002, "epoch": 1.957804542882548, "step": 12110}, {"loss": 0.7491, "grad_norm": 0.7862601280212402, "learning_rate": 0.0002, "epoch": 1.9594212270632931, "step": 12120}, {"loss": 0.7972, "grad_norm": 0.8175255060195923, "learning_rate": 0.0002, "epoch": 1.9610379112440386, "step": 12130}, {"loss": 0.7242, "grad_norm": 0.5648472905158997, "learning_rate": 0.0002, "epoch": 1.9626545954247838, "step": 12140}, {"loss": 0.7321, "grad_norm": 0.6591973304748535, "learning_rate": 0.0002, "epoch": 1.964271279605529, "step": 12150}, {"loss": 0.739, "grad_norm": 0.5960676074028015, "learning_rate": 0.0002, "epoch": 1.9658879637862743, "step": 12160}, {"loss": 0.7254, "grad_norm": 0.7272544503211975, "learning_rate": 0.0002, "epoch": 1.9675046479670195, "step": 12170}, {"loss": 0.7376, "grad_norm": 0.7176699042320251, "learning_rate": 0.0002, "epoch": 1.969121332147765, "step": 12180}, {"loss": 0.7525, "grad_norm": 0.6927123665809631, "learning_rate": 0.0002, "epoch": 1.9707380163285102, "step": 12190}, {"loss": 0.7318, "grad_norm": 0.5536034107208252, "learning_rate": 0.0002, "epoch": 1.9723547005092557, "step": 12200}, {"loss": 0.7737, "grad_norm": 0.8348390460014343, "learning_rate": 0.0002, "epoch": 1.9739713846900009, "step": 12210}, {"loss": 0.7494, "grad_norm": 0.6591181755065918, "learning_rate": 0.0002, "epoch": 1.9755880688707461, "step": 12220}, {"loss": 0.763, "grad_norm": 1.0624109506607056, "learning_rate": 0.0002, "epoch": 1.9772047530514913, "step": 12230}, {"loss": 0.7541, "grad_norm": 0.9265586137771606, "learning_rate": 0.0002, "epoch": 1.9788214372322366, "step": 12240}, {"loss": 0.7533, "grad_norm": 0.5998196005821228, "learning_rate": 0.0002, "epoch": 1.9804381214129818, "step": 12250}, {"loss": 0.7225, "grad_norm": 0.6960851550102234, "learning_rate": 0.0002, "epoch": 1.9820548055937273, "step": 12260}, {"loss": 0.7398, "grad_norm": 0.7674502730369568, "learning_rate": 0.0002, "epoch": 1.9836714897744727, "step": 12270}, {"loss": 0.7185, "grad_norm": 0.6407275795936584, "learning_rate": 0.0002, "epoch": 1.985288173955218, "step": 12280}, {"loss": 0.7382, "grad_norm": 0.6673079133033752, "learning_rate": 0.0002, "epoch": 1.9869048581359632, "step": 12290}, {"loss": 0.7326, "grad_norm": 0.6989844441413879, "learning_rate": 0.0002, "epoch": 1.9885215423167084, "step": 12300}, {"loss": 0.7559, "grad_norm": 0.7564442157745361, "learning_rate": 0.0002, "epoch": 1.9901382264974536, "step": 12310}, {"loss": 0.7719, "grad_norm": 0.6385478973388672, "learning_rate": 0.0002, "epoch": 1.9917549106781989, "step": 12320}, {"loss": 0.7369, "grad_norm": 0.7193717956542969, "learning_rate": 0.0002, "epoch": 1.9933715948589443, "step": 12330}, {"loss": 0.7583, "grad_norm": 0.7987112402915955, "learning_rate": 0.0002, "epoch": 1.9949882790396896, "step": 12340}, {"loss": 0.7793, "grad_norm": 0.7260826826095581, "learning_rate": 0.0002, "epoch": 1.996604963220435, "step": 12350}, {"loss": 0.7505, "grad_norm": 0.7968255281448364, "learning_rate": 0.0002, "epoch": 1.9982216474011802, "step": 12360}, {"loss": 0.717, "grad_norm": 0.6893062591552734, "learning_rate": 0.0002, "epoch": 1.9998383315819255, "step": 12370}, {"eval_loss": 1.1044032573699951, "eval_runtime": 122.1508, "eval_samples_per_second": 6.001, "eval_steps_per_second": 0.753, "epoch": 2.0, "step": 12371}, {"loss": 0.6604, "grad_norm": 0.7775409817695618, "learning_rate": 0.0002, "epoch": 2.0014550157626707, "step": 12380}, {"loss": 0.6845, "grad_norm": 0.76218581199646, "learning_rate": 0.0002, "epoch": 2.003071699943416, "step": 12390}, {"loss": 0.6909, "grad_norm": 0.5677764415740967, "learning_rate": 0.0002, "epoch": 2.004688384124161, "step": 12400}, {"loss": 0.6584, "grad_norm": 0.808442234992981, "learning_rate": 0.0002, "epoch": 2.006305068304907, "step": 12410}, {"loss": 0.659, "grad_norm": 0.7144765257835388, "learning_rate": 0.0002, "epoch": 2.007921752485652, "step": 12420}, {"loss": 0.6666, "grad_norm": 0.6914031505584717, "learning_rate": 0.0002, "epoch": 2.0095384366663973, "step": 12430}, {"loss": 0.6596, "grad_norm": 0.7581454515457153, "learning_rate": 0.0002, "epoch": 2.0111551208471425, "step": 12440}, {"loss": 0.6785, "grad_norm": 0.8388504981994629, "learning_rate": 0.0002, "epoch": 2.0127718050278878, "step": 12450}, {"loss": 0.6942, "grad_norm": 0.6716406941413879, "learning_rate": 0.0002, "epoch": 2.014388489208633, "step": 12460}, {"loss": 0.6441, "grad_norm": 0.898902416229248, "learning_rate": 0.0002, "epoch": 2.0160051733893782, "step": 12470}, {"loss": 0.6655, "grad_norm": 0.6432679891586304, "learning_rate": 0.0002, "epoch": 2.0176218575701235, "step": 12480}, {"loss": 0.6521, "grad_norm": 0.8021109104156494, "learning_rate": 0.0002, "epoch": 2.019238541750869, "step": 12490}, {"loss": 0.6581, "grad_norm": 0.7039216756820679, "learning_rate": 0.0002, "epoch": 2.0208552259316144, "step": 12500}, {"loss": 0.6521, "grad_norm": 0.646531879901886, "learning_rate": 0.0002, "epoch": 2.0224719101123596, "step": 12510}, {"loss": 0.6302, "grad_norm": 0.783704400062561, "learning_rate": 0.0002, "epoch": 2.024088594293105, "step": 12520}, {"loss": 0.6288, "grad_norm": 0.8805046677589417, "learning_rate": 0.0002, "epoch": 2.02570527847385, "step": 12530}, {"loss": 0.6288, "grad_norm": 0.7289270758628845, "learning_rate": 0.0002, "epoch": 2.0273219626545953, "step": 12540}, {"loss": 0.6663, "grad_norm": 0.71653151512146, "learning_rate": 0.0002, "epoch": 2.0289386468353405, "step": 12550}, {"loss": 0.625, "grad_norm": 0.73281329870224, "learning_rate": 0.0002, "epoch": 2.030555331016086, "step": 12560}, {"loss": 0.6448, "grad_norm": 0.6657090187072754, "learning_rate": 0.0002, "epoch": 2.0321720151968314, "step": 12570}, {"loss": 0.6983, "grad_norm": 0.8241133093833923, "learning_rate": 0.0002, "epoch": 2.0337886993775767, "step": 12580}, {"loss": 0.6488, "grad_norm": 0.5834135413169861, "learning_rate": 0.0002, "epoch": 2.035405383558322, "step": 12590}, {"loss": 0.6188, "grad_norm": 0.84502112865448, "learning_rate": 0.0002, "epoch": 2.037022067739067, "step": 12600}, {"loss": 0.6349, "grad_norm": 0.8952481746673584, "learning_rate": 0.0002, "epoch": 2.0386387519198124, "step": 12610}, {"loss": 0.6923, "grad_norm": 0.7801461815834045, "learning_rate": 0.0002, "epoch": 2.0402554361005576, "step": 12620}, {"loss": 0.6176, "grad_norm": 0.6788367033004761, "learning_rate": 0.0002, "epoch": 2.041872120281303, "step": 12630}, {"loss": 0.6162, "grad_norm": 0.7241756319999695, "learning_rate": 0.0002, "epoch": 2.0434888044620485, "step": 12640}, {"loss": 0.655, "grad_norm": 0.6933388113975525, "learning_rate": 0.0002, "epoch": 2.0451054886427937, "step": 12650}, {"loss": 0.6431, "grad_norm": 0.8029746413230896, "learning_rate": 0.0002, "epoch": 2.046722172823539, "step": 12660}, {"loss": 0.7164, "grad_norm": 0.946399986743927, "learning_rate": 0.0002, "epoch": 2.048338857004284, "step": 12670}, {"loss": 0.638, "grad_norm": 0.7072678804397583, "learning_rate": 0.0002, "epoch": 2.0499555411850294, "step": 12680}, {"loss": 0.6487, "grad_norm": 0.6810618042945862, "learning_rate": 0.0002, "epoch": 2.0515722253657747, "step": 12690}, {"loss": 0.6554, "grad_norm": 0.7661160230636597, "learning_rate": 0.0002, "epoch": 2.05318890954652, "step": 12700}, {"loss": 0.6799, "grad_norm": 0.6350653767585754, "learning_rate": 0.0002, "epoch": 2.0548055937272656, "step": 12710}, {"loss": 0.6654, "grad_norm": 0.861890971660614, "learning_rate": 0.0002, "epoch": 2.056422277908011, "step": 12720}, {"loss": 0.6286, "grad_norm": 0.6489875912666321, "learning_rate": 0.0002, "epoch": 2.058038962088756, "step": 12730}, {"loss": 0.6811, "grad_norm": 0.8268506526947021, "learning_rate": 0.0002, "epoch": 2.0596556462695013, "step": 12740}, {"loss": 0.6524, "grad_norm": 0.607679545879364, "learning_rate": 0.0002, "epoch": 2.0612723304502465, "step": 12750}, {"loss": 0.6649, "grad_norm": 0.6754153370857239, "learning_rate": 0.0002, "epoch": 2.0628890146309917, "step": 12760}, {"loss": 0.6549, "grad_norm": 0.7263124585151672, "learning_rate": 0.0002, "epoch": 2.064505698811737, "step": 12770}, {"loss": 0.6189, "grad_norm": 0.6986154317855835, "learning_rate": 0.0002, "epoch": 2.0661223829924826, "step": 12780}, {"loss": 0.6723, "grad_norm": 0.7768576741218567, "learning_rate": 0.0002, "epoch": 2.067739067173228, "step": 12790}, {"loss": 0.677, "grad_norm": 0.7546762824058533, "learning_rate": 0.0002, "epoch": 2.069355751353973, "step": 12800}, {"loss": 0.6485, "grad_norm": 0.7588880062103271, "learning_rate": 0.0002, "epoch": 2.0709724355347183, "step": 12810}, {"loss": 0.6989, "grad_norm": 0.7457242608070374, "learning_rate": 0.0002, "epoch": 2.0725891197154636, "step": 12820}, {"loss": 0.6489, "grad_norm": 0.6983516812324524, "learning_rate": 0.0002, "epoch": 2.074205803896209, "step": 12830}, {"loss": 0.651, "grad_norm": 0.7950928807258606, "learning_rate": 0.0002, "epoch": 2.075822488076954, "step": 12840}, {"loss": 0.6603, "grad_norm": 0.9248087406158447, "learning_rate": 0.0002, "epoch": 2.0774391722576993, "step": 12850}, {"loss": 0.6847, "grad_norm": 0.7229493260383606, "learning_rate": 0.0002, "epoch": 2.079055856438445, "step": 12860}, {"loss": 0.6702, "grad_norm": 0.5710847973823547, "learning_rate": 0.0002, "epoch": 2.08067254061919, "step": 12870}, {"loss": 0.6974, "grad_norm": 0.9580423831939697, "learning_rate": 0.0002, "epoch": 2.0822892247999354, "step": 12880}, {"loss": 0.6341, "grad_norm": 0.7399665713310242, "learning_rate": 0.0002, "epoch": 2.0839059089806806, "step": 12890}, {"loss": 0.6993, "grad_norm": 0.7981410622596741, "learning_rate": 0.0002, "epoch": 2.085522593161426, "step": 12900}, {"loss": 0.6976, "grad_norm": 0.870759904384613, "learning_rate": 0.0002, "epoch": 2.087139277342171, "step": 12910}, {"loss": 0.7194, "grad_norm": 0.7001481652259827, "learning_rate": 0.0002, "epoch": 2.0887559615229163, "step": 12920}, {"loss": 0.6383, "grad_norm": 0.6745418310165405, "learning_rate": 0.0002, "epoch": 2.090372645703662, "step": 12930}, {"loss": 0.6519, "grad_norm": 0.7739067673683167, "learning_rate": 0.0002, "epoch": 2.0919893298844072, "step": 12940}, {"loss": 0.6856, "grad_norm": 0.6742934584617615, "learning_rate": 0.0002, "epoch": 2.0936060140651525, "step": 12950}, {"loss": 0.6279, "grad_norm": 0.7270349860191345, "learning_rate": 0.0002, "epoch": 2.0952226982458977, "step": 12960}, {"loss": 0.6783, "grad_norm": 0.7150624394416809, "learning_rate": 0.0002, "epoch": 2.096839382426643, "step": 12970}, {"loss": 0.6093, "grad_norm": 0.7734767198562622, "learning_rate": 0.0002, "epoch": 2.098456066607388, "step": 12980}, {"loss": 0.6534, "grad_norm": 0.7618662118911743, "learning_rate": 0.0002, "epoch": 2.1000727507881334, "step": 12990}, {"loss": 0.6707, "grad_norm": 0.6557944416999817, "learning_rate": 0.0002, "epoch": 2.101689434968879, "step": 13000}, {"loss": 0.7268, "grad_norm": 0.8786448240280151, "learning_rate": 0.0002, "epoch": 2.1033061191496243, "step": 13010}, {"loss": 0.6677, "grad_norm": 0.6878724098205566, "learning_rate": 0.0002, "epoch": 2.1049228033303695, "step": 13020}, {"loss": 0.6824, "grad_norm": 0.822318971157074, "learning_rate": 0.0002, "epoch": 2.1065394875111147, "step": 13030}, {"loss": 0.6228, "grad_norm": 0.831468939781189, "learning_rate": 0.0002, "epoch": 2.10815617169186, "step": 13040}, {"loss": 0.6511, "grad_norm": 0.7699505686759949, "learning_rate": 0.0002, "epoch": 2.109772855872605, "step": 13050}, {"loss": 0.6671, "grad_norm": 0.7559016346931458, "learning_rate": 0.0002, "epoch": 2.1113895400533504, "step": 13060}, {"loss": 0.6215, "grad_norm": 0.6942209601402283, "learning_rate": 0.0002, "epoch": 2.1130062242340957, "step": 13070}, {"loss": 0.6449, "grad_norm": 0.6098947525024414, "learning_rate": 0.0002, "epoch": 2.1146229084148414, "step": 13080}, {"loss": 0.7091, "grad_norm": 0.6499016284942627, "learning_rate": 0.0002, "epoch": 2.1162395925955866, "step": 13090}, {"loss": 0.6247, "grad_norm": 0.7719953060150146, "learning_rate": 0.0002, "epoch": 2.117856276776332, "step": 13100}, {"loss": 0.6064, "grad_norm": 0.6708134412765503, "learning_rate": 0.0002, "epoch": 2.119472960957077, "step": 13110}, {"loss": 0.6056, "grad_norm": 0.8119585514068604, "learning_rate": 0.0002, "epoch": 2.1210896451378223, "step": 13120}, {"loss": 0.6628, "grad_norm": 0.6947157979011536, "learning_rate": 0.0002, "epoch": 2.1227063293185675, "step": 13130}, {"loss": 0.6375, "grad_norm": 0.8831837773323059, "learning_rate": 0.0002, "epoch": 2.1243230134993127, "step": 13140}, {"loss": 0.6997, "grad_norm": 0.7266910672187805, "learning_rate": 0.0002, "epoch": 2.1259396976800584, "step": 13150}, {"loss": 0.6446, "grad_norm": 0.8864351511001587, "learning_rate": 0.0002, "epoch": 2.1275563818608036, "step": 13160}, {"loss": 0.6762, "grad_norm": 0.8104248046875, "learning_rate": 0.0002, "epoch": 2.129173066041549, "step": 13170}, {"loss": 0.6581, "grad_norm": 0.6077079772949219, "learning_rate": 0.0002, "epoch": 2.130789750222294, "step": 13180}, {"loss": 0.6572, "grad_norm": 0.6874213814735413, "learning_rate": 0.0002, "epoch": 2.1324064344030393, "step": 13190}, {"loss": 0.642, "grad_norm": 0.7134367823600769, "learning_rate": 0.0002, "epoch": 2.1340231185837846, "step": 13200}, {"loss": 0.7016, "grad_norm": 0.6101235151290894, "learning_rate": 0.0002, "epoch": 2.13563980276453, "step": 13210}, {"loss": 0.6529, "grad_norm": 0.6042411923408508, "learning_rate": 0.0002, "epoch": 2.137256486945275, "step": 13220}, {"loss": 0.7179, "grad_norm": 0.914601743221283, "learning_rate": 0.0002, "epoch": 2.1388731711260207, "step": 13230}, {"loss": 0.6513, "grad_norm": 0.7104284167289734, "learning_rate": 0.0002, "epoch": 2.140489855306766, "step": 13240}, {"loss": 0.6607, "grad_norm": 0.664395272731781, "learning_rate": 0.0002, "epoch": 2.142106539487511, "step": 13250}, {"loss": 0.7211, "grad_norm": 0.6991241574287415, "learning_rate": 0.0002, "epoch": 2.1437232236682564, "step": 13260}, {"loss": 0.6484, "grad_norm": 0.5469560623168945, "learning_rate": 0.0002, "epoch": 2.1453399078490016, "step": 13270}, {"loss": 0.6765, "grad_norm": 0.8454998135566711, "learning_rate": 0.0002, "epoch": 2.146956592029747, "step": 13280}, {"loss": 0.6683, "grad_norm": 0.7088868618011475, "learning_rate": 0.0002, "epoch": 2.148573276210492, "step": 13290}, {"loss": 0.6835, "grad_norm": 0.7002687454223633, "learning_rate": 0.0002, "epoch": 2.1501899603912378, "step": 13300}, {"loss": 0.6399, "grad_norm": 0.7785214781761169, "learning_rate": 0.0002, "epoch": 2.151806644571983, "step": 13310}, {"loss": 0.67, "grad_norm": 0.8049132227897644, "learning_rate": 0.0002, "epoch": 2.1534233287527282, "step": 13320}, {"loss": 0.6495, "grad_norm": 0.8062595129013062, "learning_rate": 0.0002, "epoch": 2.1550400129334735, "step": 13330}, {"loss": 0.6603, "grad_norm": 0.6208319067955017, "learning_rate": 0.0002, "epoch": 2.1566566971142187, "step": 13340}, {"loss": 0.6584, "grad_norm": 0.7519655823707581, "learning_rate": 0.0002, "epoch": 2.158273381294964, "step": 13350}, {"loss": 0.6457, "grad_norm": 0.7645747065544128, "learning_rate": 0.0002, "epoch": 2.159890065475709, "step": 13360}, {"loss": 0.645, "grad_norm": 0.6847302913665771, "learning_rate": 0.0002, "epoch": 2.1615067496564544, "step": 13370}, {"loss": 0.6903, "grad_norm": 0.8630441427230835, "learning_rate": 0.0002, "epoch": 2.1631234338372, "step": 13380}, {"loss": 0.6742, "grad_norm": 0.7947702407836914, "learning_rate": 0.0002, "epoch": 2.1647401180179453, "step": 13390}, {"loss": 0.7206, "grad_norm": 0.6836977005004883, "learning_rate": 0.0002, "epoch": 2.1663568021986905, "step": 13400}, {"loss": 0.6304, "grad_norm": 0.7340566515922546, "learning_rate": 0.0002, "epoch": 2.1679734863794358, "step": 13410}, {"loss": 0.6528, "grad_norm": 0.7075738906860352, "learning_rate": 0.0002, "epoch": 2.169590170560181, "step": 13420}, {"loss": 0.6585, "grad_norm": 0.7080879807472229, "learning_rate": 0.0002, "epoch": 2.1712068547409262, "step": 13430}, {"loss": 0.6615, "grad_norm": 0.6218613386154175, "learning_rate": 0.0002, "epoch": 2.1728235389216715, "step": 13440}, {"loss": 0.6488, "grad_norm": 0.8211479187011719, "learning_rate": 0.0002, "epoch": 2.174440223102417, "step": 13450}, {"loss": 0.6738, "grad_norm": 0.864466667175293, "learning_rate": 0.0002, "epoch": 2.1760569072831624, "step": 13460}, {"loss": 0.679, "grad_norm": 0.7943857908248901, "learning_rate": 0.0002, "epoch": 2.1776735914639076, "step": 13470}, {"loss": 0.6838, "grad_norm": 0.78728187084198, "learning_rate": 0.0002, "epoch": 2.179290275644653, "step": 13480}, {"loss": 0.6397, "grad_norm": 0.697527289390564, "learning_rate": 0.0002, "epoch": 2.180906959825398, "step": 13490}, {"loss": 0.669, "grad_norm": 0.8205804228782654, "learning_rate": 0.0002, "epoch": 2.1825236440061433, "step": 13500}, {"loss": 0.7227, "grad_norm": 0.8709042072296143, "learning_rate": 0.0002, "epoch": 2.1841403281868885, "step": 13510}, {"loss": 0.6313, "grad_norm": 0.6228537559509277, "learning_rate": 0.0002, "epoch": 2.1857570123676338, "step": 13520}, {"loss": 0.7025, "grad_norm": 0.9566980004310608, "learning_rate": 0.0002, "epoch": 2.1873736965483794, "step": 13530}, {"loss": 0.6755, "grad_norm": 0.7128894329071045, "learning_rate": 0.0002, "epoch": 2.1889903807291247, "step": 13540}, {"loss": 0.6827, "grad_norm": 0.6888654232025146, "learning_rate": 0.0002, "epoch": 2.19060706490987, "step": 13550}, {"loss": 0.6961, "grad_norm": 0.6444337368011475, "learning_rate": 0.0002, "epoch": 2.192223749090615, "step": 13560}, {"loss": 0.656, "grad_norm": 0.8008806705474854, "learning_rate": 0.0002, "epoch": 2.1938404332713604, "step": 13570}, {"loss": 0.7, "grad_norm": 0.8482748866081238, "learning_rate": 0.0002, "epoch": 2.1954571174521056, "step": 13580}, {"loss": 0.7326, "grad_norm": 0.8584157228469849, "learning_rate": 0.0002, "epoch": 2.197073801632851, "step": 13590}, {"loss": 0.7014, "grad_norm": 0.7513734698295593, "learning_rate": 0.0002, "epoch": 2.1986904858135965, "step": 13600}, {"loss": 0.6632, "grad_norm": 0.7864262461662292, "learning_rate": 0.0002, "epoch": 2.2003071699943417, "step": 13610}, {"loss": 0.6879, "grad_norm": 0.8493645191192627, "learning_rate": 0.0002, "epoch": 2.201923854175087, "step": 13620}, {"loss": 0.6617, "grad_norm": 0.6902140974998474, "learning_rate": 0.0002, "epoch": 2.203540538355832, "step": 13630}, {"loss": 0.6655, "grad_norm": 0.8711254596710205, "learning_rate": 0.0002, "epoch": 2.2051572225365774, "step": 13640}, {"loss": 0.6359, "grad_norm": 0.7832191586494446, "learning_rate": 0.0002, "epoch": 2.2067739067173227, "step": 13650}, {"loss": 0.6723, "grad_norm": 0.5668176412582397, "learning_rate": 0.0002, "epoch": 2.208390590898068, "step": 13660}, {"loss": 0.635, "grad_norm": 0.8648375272750854, "learning_rate": 0.0002, "epoch": 2.2100072750788136, "step": 13670}, {"loss": 0.653, "grad_norm": 0.7643089890480042, "learning_rate": 0.0002, "epoch": 2.211623959259559, "step": 13680}, {"loss": 0.6765, "grad_norm": 0.6293777823448181, "learning_rate": 0.0002, "epoch": 2.213240643440304, "step": 13690}, {"loss": 0.6842, "grad_norm": 0.6459372639656067, "learning_rate": 0.0002, "epoch": 2.2148573276210493, "step": 13700}, {"loss": 0.6526, "grad_norm": 0.7060744166374207, "learning_rate": 0.0002, "epoch": 2.2164740118017945, "step": 13710}, {"loss": 0.7101, "grad_norm": 0.674109160900116, "learning_rate": 0.0002, "epoch": 2.2180906959825397, "step": 13720}, {"loss": 0.6529, "grad_norm": 0.830392062664032, "learning_rate": 0.0002, "epoch": 2.219707380163285, "step": 13730}, {"loss": 0.6733, "grad_norm": 0.6474477052688599, "learning_rate": 0.0002, "epoch": 2.2213240643440306, "step": 13740}, {"loss": 0.6413, "grad_norm": 0.7037909626960754, "learning_rate": 0.0002, "epoch": 2.222940748524776, "step": 13750}, {"loss": 0.6417, "grad_norm": 0.6554131507873535, "learning_rate": 0.0002, "epoch": 2.224557432705521, "step": 13760}, {"loss": 0.6907, "grad_norm": 0.7822230458259583, "learning_rate": 0.0002, "epoch": 2.2261741168862663, "step": 13770}, {"loss": 0.6505, "grad_norm": 0.9082167744636536, "learning_rate": 0.0002, "epoch": 2.2277908010670116, "step": 13780}, {"loss": 0.6878, "grad_norm": 0.7918276190757751, "learning_rate": 0.0002, "epoch": 2.229407485247757, "step": 13790}, {"loss": 0.6669, "grad_norm": 0.7354569435119629, "learning_rate": 0.0002, "epoch": 2.231024169428502, "step": 13800}, {"loss": 0.6503, "grad_norm": 0.8265249133110046, "learning_rate": 0.0002, "epoch": 2.2326408536092472, "step": 13810}, {"loss": 0.6871, "grad_norm": 0.6653847098350525, "learning_rate": 0.0002, "epoch": 2.234257537789993, "step": 13820}, {"loss": 0.6413, "grad_norm": 0.7157923579216003, "learning_rate": 0.0002, "epoch": 2.235874221970738, "step": 13830}, {"loss": 0.6306, "grad_norm": 0.7110323309898376, "learning_rate": 0.0002, "epoch": 2.2374909061514834, "step": 13840}, {"loss": 0.6913, "grad_norm": 0.7155357599258423, "learning_rate": 0.0002, "epoch": 2.2391075903322286, "step": 13850}, {"loss": 0.6579, "grad_norm": 1.0177817344665527, "learning_rate": 0.0002, "epoch": 2.240724274512974, "step": 13860}, {"loss": 0.635, "grad_norm": 0.7601948380470276, "learning_rate": 0.0002, "epoch": 2.242340958693719, "step": 13870}, {"loss": 0.6679, "grad_norm": 0.7628820538520813, "learning_rate": 0.0002, "epoch": 2.2439576428744643, "step": 13880}, {"loss": 0.6805, "grad_norm": 0.7089297771453857, "learning_rate": 0.0002, "epoch": 2.24557432705521, "step": 13890}, {"loss": 0.7236, "grad_norm": 0.695178210735321, "learning_rate": 0.0002, "epoch": 2.247191011235955, "step": 13900}, {"loss": 0.7084, "grad_norm": 0.7631948590278625, "learning_rate": 0.0002, "epoch": 2.2488076954167004, "step": 13910}, {"loss": 0.685, "grad_norm": 0.8203101754188538, "learning_rate": 0.0002, "epoch": 2.2504243795974457, "step": 13920}, {"loss": 0.653, "grad_norm": 0.8099079728126526, "learning_rate": 0.0002, "epoch": 2.252041063778191, "step": 13930}, {"loss": 0.694, "grad_norm": 0.6498546004295349, "learning_rate": 0.0002, "epoch": 2.253657747958936, "step": 13940}, {"loss": 0.6684, "grad_norm": 0.7797415256500244, "learning_rate": 0.0002, "epoch": 2.2552744321396814, "step": 13950}, {"loss": 0.683, "grad_norm": 0.8254124522209167, "learning_rate": 0.0002, "epoch": 2.2568911163204266, "step": 13960}, {"loss": 0.6806, "grad_norm": 0.6327953338623047, "learning_rate": 0.0002, "epoch": 2.2585078005011723, "step": 13970}, {"loss": 0.668, "grad_norm": 0.734194278717041, "learning_rate": 0.0002, "epoch": 2.2601244846819175, "step": 13980}, {"loss": 0.6912, "grad_norm": 0.9014202952384949, "learning_rate": 0.0002, "epoch": 2.2617411688626627, "step": 13990}, {"loss": 0.692, "grad_norm": 0.7643631100654602, "learning_rate": 0.0002, "epoch": 2.263357853043408, "step": 14000}, {"loss": 0.6657, "grad_norm": 0.8882834911346436, "learning_rate": 0.0002, "epoch": 2.264974537224153, "step": 14010}, {"loss": 0.6453, "grad_norm": 0.7975873351097107, "learning_rate": 0.0002, "epoch": 2.2665912214048984, "step": 14020}, {"loss": 0.7193, "grad_norm": 0.7765783071517944, "learning_rate": 0.0002, "epoch": 2.2682079055856437, "step": 14030}, {"loss": 0.662, "grad_norm": 0.8846288323402405, "learning_rate": 0.0002, "epoch": 2.2698245897663893, "step": 14040}, {"loss": 0.6494, "grad_norm": 0.9006744027137756, "learning_rate": 0.0002, "epoch": 2.2714412739471346, "step": 14050}, {"loss": 0.6423, "grad_norm": 0.7420173287391663, "learning_rate": 0.0002, "epoch": 2.27305795812788, "step": 14060}, {"loss": 0.7068, "grad_norm": 0.7956424951553345, "learning_rate": 0.0002, "epoch": 2.274674642308625, "step": 14070}, {"loss": 0.6581, "grad_norm": 0.7783209085464478, "learning_rate": 0.0002, "epoch": 2.2762913264893703, "step": 14080}, {"loss": 0.7202, "grad_norm": 0.7597188949584961, "learning_rate": 0.0002, "epoch": 2.2779080106701155, "step": 14090}, {"loss": 0.6778, "grad_norm": 0.6718921661376953, "learning_rate": 0.0002, "epoch": 2.2795246948508607, "step": 14100}, {"loss": 0.632, "grad_norm": 0.7528082132339478, "learning_rate": 0.0002, "epoch": 2.281141379031606, "step": 14110}, {"loss": 0.7608, "grad_norm": 0.8379864692687988, "learning_rate": 0.0002, "epoch": 2.2827580632123516, "step": 14120}, {"loss": 0.6767, "grad_norm": 0.748613715171814, "learning_rate": 0.0002, "epoch": 2.284374747393097, "step": 14130}, {"loss": 0.6641, "grad_norm": 0.7435423135757446, "learning_rate": 0.0002, "epoch": 2.285991431573842, "step": 14140}, {"loss": 0.6849, "grad_norm": 0.7580803632736206, "learning_rate": 0.0002, "epoch": 2.2876081157545873, "step": 14150}, {"loss": 0.6604, "grad_norm": 0.6278321146965027, "learning_rate": 0.0002, "epoch": 2.2892247999353326, "step": 14160}, {"loss": 0.6573, "grad_norm": 0.7663896083831787, "learning_rate": 0.0002, "epoch": 2.290841484116078, "step": 14170}, {"loss": 0.6655, "grad_norm": 0.9716812372207642, "learning_rate": 0.0002, "epoch": 2.292458168296823, "step": 14180}, {"loss": 0.7067, "grad_norm": 0.8993458151817322, "learning_rate": 0.0002, "epoch": 2.2940748524775687, "step": 14190}, {"loss": 0.6172, "grad_norm": 0.6156117916107178, "learning_rate": 0.0002, "epoch": 2.295691536658314, "step": 14200}, {"loss": 0.6318, "grad_norm": 0.8911278247833252, "learning_rate": 0.0002, "epoch": 2.297308220839059, "step": 14210}, {"loss": 0.6364, "grad_norm": 0.6422147154808044, "learning_rate": 0.0002, "epoch": 2.2989249050198044, "step": 14220}, {"loss": 0.6795, "grad_norm": 0.6866879463195801, "learning_rate": 0.0002, "epoch": 2.3005415892005496, "step": 14230}, {"loss": 0.6907, "grad_norm": 0.9297130107879639, "learning_rate": 0.0002, "epoch": 2.302158273381295, "step": 14240}, {"loss": 0.6823, "grad_norm": 0.7501356601715088, "learning_rate": 0.0002, "epoch": 2.30377495756204, "step": 14250}, {"loss": 0.6414, "grad_norm": 0.8363515138626099, "learning_rate": 0.0002, "epoch": 2.3053916417427853, "step": 14260}, {"loss": 0.6362, "grad_norm": 0.9083868265151978, "learning_rate": 0.0002, "epoch": 2.307008325923531, "step": 14270}, {"loss": 0.6862, "grad_norm": 0.7791516780853271, "learning_rate": 0.0002, "epoch": 2.3086250101042762, "step": 14280}, {"loss": 0.6569, "grad_norm": 0.8766953349113464, "learning_rate": 0.0002, "epoch": 2.3102416942850215, "step": 14290}, {"loss": 0.6698, "grad_norm": 0.7916635274887085, "learning_rate": 0.0002, "epoch": 2.3118583784657667, "step": 14300}, {"loss": 0.6927, "grad_norm": 0.627525269985199, "learning_rate": 0.0002, "epoch": 2.313475062646512, "step": 14310}, {"loss": 0.6541, "grad_norm": 0.8856783509254456, "learning_rate": 0.0002, "epoch": 2.315091746827257, "step": 14320}, {"loss": 0.6806, "grad_norm": 0.6758689284324646, "learning_rate": 0.0002, "epoch": 2.316708431008003, "step": 14330}, {"loss": 0.6794, "grad_norm": 0.6428321003913879, "learning_rate": 0.0002, "epoch": 2.318325115188748, "step": 14340}, {"loss": 0.682, "grad_norm": 0.9032121300697327, "learning_rate": 0.0002, "epoch": 2.3199417993694933, "step": 14350}, {"loss": 0.6569, "grad_norm": 0.8035986423492432, "learning_rate": 0.0002, "epoch": 2.3215584835502385, "step": 14360}, {"loss": 0.7067, "grad_norm": 0.7974579334259033, "learning_rate": 0.0002, "epoch": 2.3231751677309838, "step": 14370}, {"loss": 0.6451, "grad_norm": 0.8356034755706787, "learning_rate": 0.0002, "epoch": 2.324791851911729, "step": 14380}, {"loss": 0.6623, "grad_norm": 0.998760998249054, "learning_rate": 0.0002, "epoch": 2.326408536092474, "step": 14390}, {"loss": 0.649, "grad_norm": 0.6518142223358154, "learning_rate": 0.0002, "epoch": 2.3280252202732195, "step": 14400}, {"loss": 0.7146, "grad_norm": 0.7443506717681885, "learning_rate": 0.0002, "epoch": 2.3296419044539647, "step": 14410}, {"loss": 0.648, "grad_norm": 0.8436172604560852, "learning_rate": 0.0002, "epoch": 2.3312585886347104, "step": 14420}, {"loss": 0.6585, "grad_norm": 0.7411080598831177, "learning_rate": 0.0002, "epoch": 2.3328752728154556, "step": 14430}, {"loss": 0.6781, "grad_norm": 0.8839048743247986, "learning_rate": 0.0002, "epoch": 2.334491956996201, "step": 14440}, {"loss": 0.6565, "grad_norm": 0.8360885977745056, "learning_rate": 0.0002, "epoch": 2.336108641176946, "step": 14450}, {"loss": 0.6662, "grad_norm": 0.7608986496925354, "learning_rate": 0.0002, "epoch": 2.3377253253576913, "step": 14460}, {"loss": 0.6685, "grad_norm": 0.8179867267608643, "learning_rate": 0.0002, "epoch": 2.3393420095384365, "step": 14470}, {"loss": 0.7055, "grad_norm": 0.5989999771118164, "learning_rate": 0.0002, "epoch": 2.340958693719182, "step": 14480}, {"loss": 0.644, "grad_norm": 0.9450054168701172, "learning_rate": 0.0002, "epoch": 2.3425753778999274, "step": 14490}, {"loss": 0.6983, "grad_norm": 0.7885149717330933, "learning_rate": 0.0002, "epoch": 2.3441920620806727, "step": 14500}, {"loss": 0.6819, "grad_norm": 0.8152616620063782, "learning_rate": 0.0002, "epoch": 2.345808746261418, "step": 14510}, {"loss": 0.6989, "grad_norm": 0.7193838953971863, "learning_rate": 0.0002, "epoch": 2.347425430442163, "step": 14520}, {"loss": 0.6594, "grad_norm": 0.6701092720031738, "learning_rate": 0.0002, "epoch": 2.3490421146229084, "step": 14530}, {"loss": 0.6559, "grad_norm": 0.7529364228248596, "learning_rate": 0.0002, "epoch": 2.3506587988036536, "step": 14540}, {"loss": 0.6306, "grad_norm": 0.6599733829498291, "learning_rate": 0.0002, "epoch": 2.352275482984399, "step": 14550}, {"loss": 0.706, "grad_norm": 0.9502474069595337, "learning_rate": 0.0002, "epoch": 2.353892167165144, "step": 14560}, {"loss": 0.717, "grad_norm": 0.7619650959968567, "learning_rate": 0.0002, "epoch": 2.3555088513458897, "step": 14570}, {"loss": 0.6684, "grad_norm": 0.9854652285575867, "learning_rate": 0.0002, "epoch": 2.357125535526635, "step": 14580}, {"loss": 0.6455, "grad_norm": 0.727439284324646, "learning_rate": 0.0002, "epoch": 2.35874221970738, "step": 14590}, {"loss": 0.6645, "grad_norm": 0.6994746327400208, "learning_rate": 0.0002, "epoch": 2.3603589038881254, "step": 14600}, {"loss": 0.6587, "grad_norm": 0.7117531299591064, "learning_rate": 0.0002, "epoch": 2.3619755880688706, "step": 14610}, {"loss": 0.6804, "grad_norm": 0.6403067708015442, "learning_rate": 0.0002, "epoch": 2.363592272249616, "step": 14620}, {"loss": 0.7055, "grad_norm": 0.8377841711044312, "learning_rate": 0.0002, "epoch": 2.3652089564303616, "step": 14630}, {"loss": 0.6778, "grad_norm": 0.749171257019043, "learning_rate": 0.0002, "epoch": 2.366825640611107, "step": 14640}, {"loss": 0.6552, "grad_norm": 0.8418586254119873, "learning_rate": 0.0002, "epoch": 2.368442324791852, "step": 14650}, {"loss": 0.6685, "grad_norm": 0.6178573369979858, "learning_rate": 0.0002, "epoch": 2.3700590089725972, "step": 14660}, {"loss": 0.6774, "grad_norm": 0.6368302702903748, "learning_rate": 0.0002, "epoch": 2.3716756931533425, "step": 14670}, {"loss": 0.6136, "grad_norm": 0.9122977256774902, "learning_rate": 0.0002, "epoch": 2.3732923773340877, "step": 14680}, {"loss": 0.6675, "grad_norm": 0.7086195349693298, "learning_rate": 0.0002, "epoch": 2.374909061514833, "step": 14690}, {"loss": 0.6582, "grad_norm": 0.7500800490379333, "learning_rate": 0.0002, "epoch": 2.376525745695578, "step": 14700}, {"loss": 0.6792, "grad_norm": 0.6634900569915771, "learning_rate": 0.0002, "epoch": 2.378142429876324, "step": 14710}, {"loss": 0.6614, "grad_norm": 0.839898407459259, "learning_rate": 0.0002, "epoch": 2.379759114057069, "step": 14720}, {"loss": 0.6453, "grad_norm": 0.7578426003456116, "learning_rate": 0.0002, "epoch": 2.3813757982378143, "step": 14730}, {"loss": 0.7282, "grad_norm": 1.0213173627853394, "learning_rate": 0.0002, "epoch": 2.3829924824185595, "step": 14740}, {"loss": 0.6704, "grad_norm": 0.7855949401855469, "learning_rate": 0.0002, "epoch": 2.3846091665993048, "step": 14750}, {"loss": 0.6694, "grad_norm": 0.7224128842353821, "learning_rate": 0.0002, "epoch": 2.38622585078005, "step": 14760}, {"loss": 0.7017, "grad_norm": 0.8040381669998169, "learning_rate": 0.0002, "epoch": 2.3878425349607952, "step": 14770}, {"loss": 0.6799, "grad_norm": 0.7705281376838684, "learning_rate": 0.0002, "epoch": 2.389459219141541, "step": 14780}, {"loss": 0.6326, "grad_norm": 0.667966902256012, "learning_rate": 0.0002, "epoch": 2.391075903322286, "step": 14790}, {"loss": 0.7061, "grad_norm": 0.6611011028289795, "learning_rate": 0.0002, "epoch": 2.3926925875030314, "step": 14800}, {"loss": 0.6527, "grad_norm": 0.6862651705741882, "learning_rate": 0.0002, "epoch": 2.3943092716837766, "step": 14810}, {"loss": 0.6537, "grad_norm": 0.8086010217666626, "learning_rate": 0.0002, "epoch": 2.395925955864522, "step": 14820}, {"loss": 0.7189, "grad_norm": 0.7189689874649048, "learning_rate": 0.0002, "epoch": 2.397542640045267, "step": 14830}, {"loss": 0.6709, "grad_norm": 0.6280009150505066, "learning_rate": 0.0002, "epoch": 2.3991593242260123, "step": 14840}, {"loss": 0.706, "grad_norm": 0.7826612591743469, "learning_rate": 0.0002, "epoch": 2.4007760084067575, "step": 14850}, {"loss": 0.6738, "grad_norm": 0.7681610584259033, "learning_rate": 0.0002, "epoch": 2.402392692587503, "step": 14860}, {"loss": 0.636, "grad_norm": 0.720966100692749, "learning_rate": 0.0002, "epoch": 2.4040093767682484, "step": 14870}, {"loss": 0.6667, "grad_norm": 0.8202250599861145, "learning_rate": 0.0002, "epoch": 2.4056260609489937, "step": 14880}, {"loss": 0.6935, "grad_norm": 0.786212682723999, "learning_rate": 0.0002, "epoch": 2.407242745129739, "step": 14890}, {"loss": 0.6628, "grad_norm": 0.6647164821624756, "learning_rate": 0.0002, "epoch": 2.408859429310484, "step": 14900}, {"loss": 0.6706, "grad_norm": 0.7566399574279785, "learning_rate": 0.0002, "epoch": 2.4104761134912294, "step": 14910}, {"loss": 0.7188, "grad_norm": 0.748814582824707, "learning_rate": 0.0002, "epoch": 2.4120927976719746, "step": 14920}, {"loss": 0.6684, "grad_norm": 0.7624038457870483, "learning_rate": 0.0002, "epoch": 2.4137094818527203, "step": 14930}, {"loss": 0.6483, "grad_norm": 0.8267335295677185, "learning_rate": 0.0002, "epoch": 2.4153261660334655, "step": 14940}, {"loss": 0.6612, "grad_norm": 0.8785360455513, "learning_rate": 0.0002, "epoch": 2.4169428502142107, "step": 14950}, {"loss": 0.6718, "grad_norm": 0.679887592792511, "learning_rate": 0.0002, "epoch": 2.418559534394956, "step": 14960}, {"loss": 0.6136, "grad_norm": 0.7218474745750427, "learning_rate": 0.0002, "epoch": 2.420176218575701, "step": 14970}, {"loss": 0.648, "grad_norm": 0.6342799663543701, "learning_rate": 0.0002, "epoch": 2.4217929027564464, "step": 14980}, {"loss": 0.6617, "grad_norm": 0.7098712921142578, "learning_rate": 0.0002, "epoch": 2.4234095869371917, "step": 14990}, {"loss": 0.6942, "grad_norm": 0.7497431635856628, "learning_rate": 0.0002, "epoch": 2.425026271117937, "step": 15000}, {"loss": 0.6772, "grad_norm": 0.934836208820343, "learning_rate": 0.0002, "epoch": 2.4266429552986826, "step": 15010}, {"loss": 0.7221, "grad_norm": 0.8430966734886169, "learning_rate": 0.0002, "epoch": 2.428259639479428, "step": 15020}, {"loss": 0.6985, "grad_norm": 0.7032104730606079, "learning_rate": 0.0002, "epoch": 2.429876323660173, "step": 15030}, {"loss": 0.6715, "grad_norm": 0.7746111750602722, "learning_rate": 0.0002, "epoch": 2.4314930078409183, "step": 15040}, {"loss": 0.7177, "grad_norm": 0.7661406397819519, "learning_rate": 0.0002, "epoch": 2.4331096920216635, "step": 15050}, {"loss": 0.6517, "grad_norm": 0.6941645741462708, "learning_rate": 0.0002, "epoch": 2.4347263762024087, "step": 15060}, {"loss": 0.6421, "grad_norm": 0.7487249374389648, "learning_rate": 0.0002, "epoch": 2.436343060383154, "step": 15070}, {"loss": 0.6796, "grad_norm": 0.7639912962913513, "learning_rate": 0.0002, "epoch": 2.4379597445638996, "step": 15080}, {"loss": 0.7087, "grad_norm": 0.7708953619003296, "learning_rate": 0.0002, "epoch": 2.439576428744645, "step": 15090}, {"loss": 0.7065, "grad_norm": 0.9135832190513611, "learning_rate": 0.0002, "epoch": 2.44119311292539, "step": 15100}, {"loss": 0.672, "grad_norm": 0.8283005356788635, "learning_rate": 0.0002, "epoch": 2.4428097971061353, "step": 15110}, {"loss": 0.6551, "grad_norm": 0.925299346446991, "learning_rate": 0.0002, "epoch": 2.4444264812868806, "step": 15120}, {"loss": 0.687, "grad_norm": 0.7013528943061829, "learning_rate": 0.0002, "epoch": 2.446043165467626, "step": 15130}, {"loss": 0.6842, "grad_norm": 0.622303307056427, "learning_rate": 0.0002, "epoch": 2.447659849648371, "step": 15140}, {"loss": 0.6676, "grad_norm": 0.876569390296936, "learning_rate": 0.0002, "epoch": 2.4492765338291163, "step": 15150}, {"loss": 0.6463, "grad_norm": 0.6836351752281189, "learning_rate": 0.0002, "epoch": 2.450893218009862, "step": 15160}, {"loss": 0.6781, "grad_norm": 0.7886684536933899, "learning_rate": 0.0002, "epoch": 2.452509902190607, "step": 15170}, {"loss": 0.6794, "grad_norm": 0.6647440791130066, "learning_rate": 0.0002, "epoch": 2.4541265863713524, "step": 15180}, {"loss": 0.6353, "grad_norm": 0.7477722764015198, "learning_rate": 0.0002, "epoch": 2.4557432705520976, "step": 15190}, {"loss": 0.698, "grad_norm": 0.8192033767700195, "learning_rate": 0.0002, "epoch": 2.457359954732843, "step": 15200}, {"loss": 0.6735, "grad_norm": 0.847537100315094, "learning_rate": 0.0002, "epoch": 2.458976638913588, "step": 15210}, {"loss": 0.6962, "grad_norm": 0.9027776122093201, "learning_rate": 0.0002, "epoch": 2.4605933230943338, "step": 15220}, {"loss": 0.7084, "grad_norm": 0.7217772006988525, "learning_rate": 0.0002, "epoch": 2.462210007275079, "step": 15230}, {"loss": 0.691, "grad_norm": 0.7994546294212341, "learning_rate": 0.0002, "epoch": 2.4638266914558242, "step": 15240}, {"loss": 0.6828, "grad_norm": 0.939916729927063, "learning_rate": 0.0002, "epoch": 2.4654433756365695, "step": 15250}, {"loss": 0.6893, "grad_norm": 1.0009053945541382, "learning_rate": 0.0002, "epoch": 2.4670600598173147, "step": 15260}, {"loss": 0.643, "grad_norm": 0.625555694103241, "learning_rate": 0.0002, "epoch": 2.46867674399806, "step": 15270}, {"loss": 0.688, "grad_norm": 0.7924878597259521, "learning_rate": 0.0002, "epoch": 2.470293428178805, "step": 15280}, {"loss": 0.6789, "grad_norm": 0.8536689877510071, "learning_rate": 0.0002, "epoch": 2.4719101123595504, "step": 15290}, {"loss": 0.6924, "grad_norm": 0.8572589755058289, "learning_rate": 0.0002, "epoch": 2.4735267965402956, "step": 15300}, {"loss": 0.604, "grad_norm": 0.773279070854187, "learning_rate": 0.0002, "epoch": 2.4751434807210413, "step": 15310}, {"loss": 0.6573, "grad_norm": 0.7708749771118164, "learning_rate": 0.0002, "epoch": 2.4767601649017865, "step": 15320}, {"loss": 0.7065, "grad_norm": 0.770905077457428, "learning_rate": 0.0002, "epoch": 2.4783768490825318, "step": 15330}, {"loss": 0.6878, "grad_norm": 0.8238571882247925, "learning_rate": 0.0002, "epoch": 2.479993533263277, "step": 15340}, {"loss": 0.6772, "grad_norm": 0.7670477032661438, "learning_rate": 0.0002, "epoch": 2.481610217444022, "step": 15350}, {"loss": 0.7759, "grad_norm": 0.905036985874176, "learning_rate": 0.0002, "epoch": 2.4832269016247674, "step": 15360}, {"loss": 0.706, "grad_norm": 0.6672089695930481, "learning_rate": 0.0002, "epoch": 2.484843585805513, "step": 15370}, {"loss": 0.6722, "grad_norm": 0.625095784664154, "learning_rate": 0.0002, "epoch": 2.4864602699862584, "step": 15380}, {"loss": 0.6396, "grad_norm": 0.679772675037384, "learning_rate": 0.0002, "epoch": 2.4880769541670036, "step": 15390}, {"loss": 0.6778, "grad_norm": 0.711492121219635, "learning_rate": 0.0002, "epoch": 2.489693638347749, "step": 15400}, {"loss": 0.6966, "grad_norm": 0.876189112663269, "learning_rate": 0.0002, "epoch": 2.491310322528494, "step": 15410}, {"loss": 0.7307, "grad_norm": 0.7236915230751038, "learning_rate": 0.0002, "epoch": 2.4929270067092393, "step": 15420}, {"loss": 0.647, "grad_norm": 0.6629832983016968, "learning_rate": 0.0002, "epoch": 2.4945436908899845, "step": 15430}, {"loss": 0.6669, "grad_norm": 0.9756859540939331, "learning_rate": 0.0002, "epoch": 2.4961603750707297, "step": 15440}, {"loss": 0.7559, "grad_norm": 0.6896940469741821, "learning_rate": 0.0002, "epoch": 2.4977770592514754, "step": 15450}, {"loss": 0.6818, "grad_norm": 0.7105149626731873, "learning_rate": 0.0002, "epoch": 2.4993937434322206, "step": 15460}, {"loss": 0.6859, "grad_norm": 0.8374546766281128, "learning_rate": 0.0002, "epoch": 2.501010427612966, "step": 15470}, {"loss": 0.6512, "grad_norm": 0.7320070266723633, "learning_rate": 0.0002, "epoch": 2.502627111793711, "step": 15480}, {"loss": 0.685, "grad_norm": 0.8306367993354797, "learning_rate": 0.0002, "epoch": 2.5042437959744563, "step": 15490}, {"loss": 0.7253, "grad_norm": 0.7472721338272095, "learning_rate": 0.0002, "epoch": 2.5058604801552016, "step": 15500}, {"loss": 0.6699, "grad_norm": 0.6147692203521729, "learning_rate": 0.0002, "epoch": 2.507477164335947, "step": 15510}, {"loss": 0.7158, "grad_norm": 0.7788505554199219, "learning_rate": 0.0002, "epoch": 2.5090938485166925, "step": 15520}, {"loss": 0.6521, "grad_norm": 0.8807527422904968, "learning_rate": 0.0002, "epoch": 2.5107105326974377, "step": 15530}, {"loss": 0.6792, "grad_norm": 0.7521643042564392, "learning_rate": 0.0002, "epoch": 2.512327216878183, "step": 15540}, {"loss": 0.6772, "grad_norm": 0.6900225281715393, "learning_rate": 0.0002, "epoch": 2.513943901058928, "step": 15550}, {"loss": 0.6769, "grad_norm": 0.6601938605308533, "learning_rate": 0.0002, "epoch": 2.5155605852396734, "step": 15560}, {"loss": 0.6648, "grad_norm": 0.8179984092712402, "learning_rate": 0.0002, "epoch": 2.5171772694204186, "step": 15570}, {"loss": 0.7028, "grad_norm": 0.792556881904602, "learning_rate": 0.0002, "epoch": 2.518793953601164, "step": 15580}, {"loss": 0.6464, "grad_norm": 0.7081938982009888, "learning_rate": 0.0002, "epoch": 2.520410637781909, "step": 15590}, {"loss": 0.6691, "grad_norm": 0.8733121156692505, "learning_rate": 0.0002, "epoch": 2.5220273219626543, "step": 15600}, {"loss": 0.6969, "grad_norm": 0.7980992794036865, "learning_rate": 0.0002, "epoch": 2.5236440061434, "step": 15610}, {"loss": 0.7124, "grad_norm": 0.883664071559906, "learning_rate": 0.0002, "epoch": 2.5252606903241452, "step": 15620}, {"loss": 0.7022, "grad_norm": 0.6963341236114502, "learning_rate": 0.0002, "epoch": 2.5268773745048905, "step": 15630}, {"loss": 0.7334, "grad_norm": 0.6433573365211487, "learning_rate": 0.0002, "epoch": 2.5284940586856357, "step": 15640}, {"loss": 0.6889, "grad_norm": 0.8538183569908142, "learning_rate": 0.0002, "epoch": 2.530110742866381, "step": 15650}, {"loss": 0.6841, "grad_norm": 0.9748201370239258, "learning_rate": 0.0002, "epoch": 2.5317274270471266, "step": 15660}, {"loss": 0.6765, "grad_norm": 0.7670575380325317, "learning_rate": 0.0002, "epoch": 2.533344111227872, "step": 15670}, {"loss": 0.6435, "grad_norm": 0.8738890290260315, "learning_rate": 0.0002, "epoch": 2.534960795408617, "step": 15680}, {"loss": 0.6802, "grad_norm": 0.8391636610031128, "learning_rate": 0.0002, "epoch": 2.5365774795893623, "step": 15690}, {"loss": 0.6901, "grad_norm": 0.7239366769790649, "learning_rate": 0.0002, "epoch": 2.5381941637701075, "step": 15700}, {"loss": 0.7011, "grad_norm": 0.8498379588127136, "learning_rate": 0.0002, "epoch": 2.5398108479508528, "step": 15710}, {"loss": 0.6998, "grad_norm": 0.8029484152793884, "learning_rate": 0.0002, "epoch": 2.541427532131598, "step": 15720}, {"loss": 0.6678, "grad_norm": 1.0639333724975586, "learning_rate": 0.0002, "epoch": 2.5430442163123432, "step": 15730}, {"loss": 0.6341, "grad_norm": 0.6401297450065613, "learning_rate": 0.0002, "epoch": 2.5446609004930885, "step": 15740}, {"loss": 0.7196, "grad_norm": 0.7123814821243286, "learning_rate": 0.0002, "epoch": 2.5462775846738337, "step": 15750}, {"loss": 0.654, "grad_norm": 0.7874974608421326, "learning_rate": 0.0002, "epoch": 2.5478942688545794, "step": 15760}, {"loss": 0.6721, "grad_norm": 0.8046808838844299, "learning_rate": 0.0002, "epoch": 2.5495109530353246, "step": 15770}, {"loss": 0.6665, "grad_norm": 0.7888661623001099, "learning_rate": 0.0002, "epoch": 2.55112763721607, "step": 15780}, {"loss": 0.6893, "grad_norm": 0.8445866107940674, "learning_rate": 0.0002, "epoch": 2.552744321396815, "step": 15790}, {"loss": 0.6815, "grad_norm": 0.7475846409797668, "learning_rate": 0.0002, "epoch": 2.5543610055775603, "step": 15800}, {"loss": 0.6711, "grad_norm": 0.7455102801322937, "learning_rate": 0.0002, "epoch": 2.555977689758306, "step": 15810}, {"loss": 0.6932, "grad_norm": 0.8226983547210693, "learning_rate": 0.0002, "epoch": 2.557594373939051, "step": 15820}, {"loss": 0.651, "grad_norm": 0.8920368552207947, "learning_rate": 0.0002, "epoch": 2.5592110581197964, "step": 15830}, {"loss": 0.6297, "grad_norm": 0.8413904905319214, "learning_rate": 0.0002, "epoch": 2.5608277423005417, "step": 15840}, {"loss": 0.7106, "grad_norm": 0.8483649492263794, "learning_rate": 0.0002, "epoch": 2.562444426481287, "step": 15850}, {"loss": 0.6957, "grad_norm": 0.5923284292221069, "learning_rate": 0.0002, "epoch": 2.564061110662032, "step": 15860}, {"loss": 0.6847, "grad_norm": 0.8518726229667664, "learning_rate": 0.0002, "epoch": 2.5656777948427774, "step": 15870}, {"loss": 0.6362, "grad_norm": 0.731235146522522, "learning_rate": 0.0002, "epoch": 2.5672944790235226, "step": 15880}, {"loss": 0.7611, "grad_norm": 0.7517194151878357, "learning_rate": 0.0002, "epoch": 2.568911163204268, "step": 15890}, {"loss": 0.6907, "grad_norm": 0.8378692269325256, "learning_rate": 0.0002, "epoch": 2.5705278473850135, "step": 15900}, {"loss": 0.7055, "grad_norm": 0.843701958656311, "learning_rate": 0.0002, "epoch": 2.5721445315657587, "step": 15910}, {"loss": 0.6882, "grad_norm": 0.7254629731178284, "learning_rate": 0.0002, "epoch": 2.573761215746504, "step": 15920}, {"loss": 0.6872, "grad_norm": 0.8863335847854614, "learning_rate": 0.0002, "epoch": 2.575377899927249, "step": 15930}, {"loss": 0.6813, "grad_norm": 0.7675097584724426, "learning_rate": 0.0002, "epoch": 2.5769945841079944, "step": 15940}, {"loss": 0.7357, "grad_norm": 0.82063889503479, "learning_rate": 0.0002, "epoch": 2.5786112682887397, "step": 15950}, {"loss": 0.662, "grad_norm": 0.7729717493057251, "learning_rate": 0.0002, "epoch": 2.5802279524694853, "step": 15960}, {"loss": 0.633, "grad_norm": 0.8301846981048584, "learning_rate": 0.0002, "epoch": 2.5818446366502306, "step": 15970}, {"loss": 0.6897, "grad_norm": 0.7906861305236816, "learning_rate": 0.0002, "epoch": 2.583461320830976, "step": 15980}, {"loss": 0.7175, "grad_norm": 0.6749057173728943, "learning_rate": 0.0002, "epoch": 2.585078005011721, "step": 15990}, {"loss": 0.7212, "grad_norm": 0.9386842846870422, "learning_rate": 0.0002, "epoch": 2.5866946891924663, "step": 16000}, {"loss": 0.6934, "grad_norm": 0.7868891358375549, "learning_rate": 0.0002, "epoch": 2.5883113733732115, "step": 16010}, {"loss": 0.7036, "grad_norm": 0.8674671053886414, "learning_rate": 0.0002, "epoch": 2.5899280575539567, "step": 16020}, {"loss": 0.7217, "grad_norm": 0.7043559551239014, "learning_rate": 0.0002, "epoch": 2.591544741734702, "step": 16030}, {"loss": 0.6967, "grad_norm": 0.5846083760261536, "learning_rate": 0.0002, "epoch": 2.593161425915447, "step": 16040}, {"loss": 0.7322, "grad_norm": 0.7323982119560242, "learning_rate": 0.0002, "epoch": 2.594778110096193, "step": 16050}, {"loss": 0.6794, "grad_norm": 0.9069556593894958, "learning_rate": 0.0002, "epoch": 2.596394794276938, "step": 16060}, {"loss": 0.7076, "grad_norm": 0.7522736191749573, "learning_rate": 0.0002, "epoch": 2.5980114784576833, "step": 16070}, {"loss": 0.6477, "grad_norm": 0.8149648308753967, "learning_rate": 0.0002, "epoch": 2.5996281626384286, "step": 16080}, {"loss": 0.6664, "grad_norm": 0.6214233040809631, "learning_rate": 0.0002, "epoch": 2.601244846819174, "step": 16090}, {"loss": 0.7307, "grad_norm": 0.6803743839263916, "learning_rate": 0.0002, "epoch": 2.602861530999919, "step": 16100}, {"loss": 0.7244, "grad_norm": 0.7223997116088867, "learning_rate": 0.0002, "epoch": 2.6044782151806647, "step": 16110}, {"loss": 0.6867, "grad_norm": 0.7324174642562866, "learning_rate": 0.0002, "epoch": 2.60609489936141, "step": 16120}, {"loss": 0.7159, "grad_norm": 0.9594739675521851, "learning_rate": 0.0002, "epoch": 2.607711583542155, "step": 16130}, {"loss": 0.6451, "grad_norm": 0.9485327005386353, "learning_rate": 0.0002, "epoch": 2.6093282677229004, "step": 16140}, {"loss": 0.6815, "grad_norm": 0.8449000120162964, "learning_rate": 0.0002, "epoch": 2.6109449519036456, "step": 16150}, {"loss": 0.7152, "grad_norm": 0.8520140051841736, "learning_rate": 0.0002, "epoch": 2.612561636084391, "step": 16160}, {"loss": 0.6759, "grad_norm": 0.7456524968147278, "learning_rate": 0.0002, "epoch": 2.614178320265136, "step": 16170}, {"loss": 0.6893, "grad_norm": 0.9912857413291931, "learning_rate": 0.0002, "epoch": 2.6157950044458813, "step": 16180}, {"loss": 0.7243, "grad_norm": 0.9001946449279785, "learning_rate": 0.0002, "epoch": 2.6174116886266265, "step": 16190}, {"loss": 0.6825, "grad_norm": 0.6568667888641357, "learning_rate": 0.0002, "epoch": 2.619028372807372, "step": 16200}, {"loss": 0.7013, "grad_norm": 1.0248128175735474, "learning_rate": 0.0002, "epoch": 2.6206450569881174, "step": 16210}, {"loss": 0.7045, "grad_norm": 0.6509039998054504, "learning_rate": 0.0002, "epoch": 2.6222617411688627, "step": 16220}, {"loss": 0.72, "grad_norm": 0.7626351118087769, "learning_rate": 0.0002, "epoch": 2.623878425349608, "step": 16230}, {"loss": 0.6556, "grad_norm": 0.6938552260398865, "learning_rate": 0.0002, "epoch": 2.625495109530353, "step": 16240}, {"loss": 0.65, "grad_norm": 0.6434680819511414, "learning_rate": 0.0002, "epoch": 2.6271117937110984, "step": 16250}, {"loss": 0.6943, "grad_norm": 0.7111515998840332, "learning_rate": 0.0002, "epoch": 2.628728477891844, "step": 16260}, {"loss": 0.679, "grad_norm": 0.7712395787239075, "learning_rate": 0.0002, "epoch": 2.6303451620725893, "step": 16270}, {"loss": 0.6886, "grad_norm": 0.792209267616272, "learning_rate": 0.0002, "epoch": 2.6319618462533345, "step": 16280}, {"loss": 0.6554, "grad_norm": 0.6801066398620605, "learning_rate": 0.0002, "epoch": 2.6335785304340797, "step": 16290}, {"loss": 0.73, "grad_norm": 0.7802573442459106, "learning_rate": 0.0002, "epoch": 2.635195214614825, "step": 16300}, {"loss": 0.7484, "grad_norm": 0.7742244601249695, "learning_rate": 0.0002, "epoch": 2.63681189879557, "step": 16310}, {"loss": 0.6524, "grad_norm": 0.664184033870697, "learning_rate": 0.0002, "epoch": 2.6384285829763154, "step": 16320}, {"loss": 0.6442, "grad_norm": 0.9242228865623474, "learning_rate": 0.0002, "epoch": 2.6400452671570607, "step": 16330}, {"loss": 0.6792, "grad_norm": 0.9661325216293335, "learning_rate": 0.0002, "epoch": 2.641661951337806, "step": 16340}, {"loss": 0.6847, "grad_norm": 0.837526798248291, "learning_rate": 0.0002, "epoch": 2.6432786355185516, "step": 16350}, {"loss": 0.7686, "grad_norm": 1.1834373474121094, "learning_rate": 0.0002, "epoch": 2.644895319699297, "step": 16360}, {"loss": 0.6746, "grad_norm": 0.7467831373214722, "learning_rate": 0.0002, "epoch": 2.646512003880042, "step": 16370}, {"loss": 0.6935, "grad_norm": 0.8627146482467651, "learning_rate": 0.0002, "epoch": 2.6481286880607873, "step": 16380}, {"loss": 0.715, "grad_norm": 0.790447473526001, "learning_rate": 0.0002, "epoch": 2.6497453722415325, "step": 16390}, {"loss": 0.723, "grad_norm": 0.8447365164756775, "learning_rate": 0.0002, "epoch": 2.651362056422278, "step": 16400}, {"loss": 0.6628, "grad_norm": 0.7831417918205261, "learning_rate": 0.0002, "epoch": 2.6529787406030234, "step": 16410}, {"loss": 0.6691, "grad_norm": 0.6837952136993408, "learning_rate": 0.0002, "epoch": 2.6545954247837686, "step": 16420}, {"loss": 0.6139, "grad_norm": 0.7031801342964172, "learning_rate": 0.0002, "epoch": 2.656212108964514, "step": 16430}, {"loss": 0.7382, "grad_norm": 0.8963770866394043, "learning_rate": 0.0002, "epoch": 2.657828793145259, "step": 16440}, {"loss": 0.6439, "grad_norm": 0.6852328181266785, "learning_rate": 0.0002, "epoch": 2.6594454773260043, "step": 16450}, {"loss": 0.6278, "grad_norm": 0.8069294095039368, "learning_rate": 0.0002, "epoch": 2.6610621615067496, "step": 16460}, {"loss": 0.6939, "grad_norm": 0.7503686547279358, "learning_rate": 0.0002, "epoch": 2.662678845687495, "step": 16470}, {"loss": 0.6777, "grad_norm": 0.6430956125259399, "learning_rate": 0.0002, "epoch": 2.66429552986824, "step": 16480}, {"loss": 0.6863, "grad_norm": 0.7894312739372253, "learning_rate": 0.0002, "epoch": 2.6659122140489853, "step": 16490}, {"loss": 0.7165, "grad_norm": 0.7277431488037109, "learning_rate": 0.0002, "epoch": 2.667528898229731, "step": 16500}, {"loss": 0.6772, "grad_norm": 0.6816153526306152, "learning_rate": 0.0002, "epoch": 2.669145582410476, "step": 16510}, {"loss": 0.691, "grad_norm": 0.8145235776901245, "learning_rate": 0.0002, "epoch": 2.6707622665912214, "step": 16520}, {"loss": 0.709, "grad_norm": 0.8645890355110168, "learning_rate": 0.0002, "epoch": 2.6723789507719666, "step": 16530}, {"loss": 0.6946, "grad_norm": 0.704393208026886, "learning_rate": 0.0002, "epoch": 2.673995634952712, "step": 16540}, {"loss": 0.6378, "grad_norm": 1.0120846033096313, "learning_rate": 0.0002, "epoch": 2.6756123191334575, "step": 16550}, {"loss": 0.7241, "grad_norm": 0.6919328570365906, "learning_rate": 0.0002, "epoch": 2.6772290033142028, "step": 16560}, {"loss": 0.7098, "grad_norm": 0.6924574971199036, "learning_rate": 0.0002, "epoch": 2.678845687494948, "step": 16570}, {"loss": 0.731, "grad_norm": 0.9679301381111145, "learning_rate": 0.0002, "epoch": 2.6804623716756932, "step": 16580}, {"loss": 0.7124, "grad_norm": 0.6810211539268494, "learning_rate": 0.0002, "epoch": 2.6820790558564385, "step": 16590}, {"loss": 0.6688, "grad_norm": 0.9730555415153503, "learning_rate": 0.0002, "epoch": 2.6836957400371837, "step": 16600}, {"loss": 0.7344, "grad_norm": 0.7852821350097656, "learning_rate": 0.0002, "epoch": 2.685312424217929, "step": 16610}, {"loss": 0.6401, "grad_norm": 0.6059057116508484, "learning_rate": 0.0002, "epoch": 2.686929108398674, "step": 16620}, {"loss": 0.6796, "grad_norm": 0.9395958781242371, "learning_rate": 0.0002, "epoch": 2.6885457925794194, "step": 16630}, {"loss": 0.7174, "grad_norm": 0.7473729848861694, "learning_rate": 0.0002, "epoch": 2.690162476760165, "step": 16640}, {"loss": 0.7087, "grad_norm": 0.765934407711029, "learning_rate": 0.0002, "epoch": 2.6917791609409103, "step": 16650}, {"loss": 0.707, "grad_norm": 0.8496677279472351, "learning_rate": 0.0002, "epoch": 2.6933958451216555, "step": 16660}, {"loss": 0.7084, "grad_norm": 0.7641879916191101, "learning_rate": 0.0002, "epoch": 2.6950125293024008, "step": 16670}, {"loss": 0.6566, "grad_norm": 0.8471952676773071, "learning_rate": 0.0002, "epoch": 2.696629213483146, "step": 16680}, {"loss": 0.6635, "grad_norm": 0.6946060657501221, "learning_rate": 0.0002, "epoch": 2.6982458976638912, "step": 16690}, {"loss": 0.7027, "grad_norm": 0.7361312508583069, "learning_rate": 0.0002, "epoch": 2.699862581844637, "step": 16700}, {"loss": 0.6767, "grad_norm": 0.6605038046836853, "learning_rate": 0.0002, "epoch": 2.701479266025382, "step": 16710}, {"loss": 0.6885, "grad_norm": 0.7164411544799805, "learning_rate": 0.0002, "epoch": 2.7030959502061274, "step": 16720}, {"loss": 0.6736, "grad_norm": 0.6496201157569885, "learning_rate": 0.0002, "epoch": 2.7047126343868726, "step": 16730}, {"loss": 0.6942, "grad_norm": 0.7826663851737976, "learning_rate": 0.0002, "epoch": 2.706329318567618, "step": 16740}, {"loss": 0.6773, "grad_norm": 0.7639131546020508, "learning_rate": 0.0002, "epoch": 2.707946002748363, "step": 16750}, {"loss": 0.69, "grad_norm": 0.7976210713386536, "learning_rate": 0.0002, "epoch": 2.7095626869291083, "step": 16760}, {"loss": 0.6735, "grad_norm": 0.6836577653884888, "learning_rate": 0.0002, "epoch": 2.7111793711098535, "step": 16770}, {"loss": 0.6596, "grad_norm": 0.8025202751159668, "learning_rate": 0.0002, "epoch": 2.7127960552905988, "step": 16780}, {"loss": 0.6324, "grad_norm": 0.7636463642120361, "learning_rate": 0.0002, "epoch": 2.7144127394713444, "step": 16790}, {"loss": 0.6227, "grad_norm": 0.7481677532196045, "learning_rate": 0.0002, "epoch": 2.7160294236520897, "step": 16800}, {"loss": 0.6925, "grad_norm": 0.7566834688186646, "learning_rate": 0.0002, "epoch": 2.717646107832835, "step": 16810}, {"loss": 0.6531, "grad_norm": 0.7931267619132996, "learning_rate": 0.0002, "epoch": 2.71926279201358, "step": 16820}, {"loss": 0.6672, "grad_norm": 0.8811662197113037, "learning_rate": 0.0002, "epoch": 2.7208794761943254, "step": 16830}, {"loss": 0.6675, "grad_norm": 0.8561240434646606, "learning_rate": 0.0002, "epoch": 2.7224961603750706, "step": 16840}, {"loss": 0.7135, "grad_norm": 0.7121599316596985, "learning_rate": 0.0002, "epoch": 2.7241128445558163, "step": 16850}, {"loss": 0.6825, "grad_norm": 0.8066257238388062, "learning_rate": 0.0002, "epoch": 2.7257295287365615, "step": 16860}, {"loss": 0.6839, "grad_norm": 0.7699271440505981, "learning_rate": 0.0002, "epoch": 2.7273462129173067, "step": 16870}, {"loss": 0.699, "grad_norm": 1.1828432083129883, "learning_rate": 0.0002, "epoch": 2.728962897098052, "step": 16880}, {"loss": 0.6518, "grad_norm": 0.9989302754402161, "learning_rate": 0.0002, "epoch": 2.730579581278797, "step": 16890}, {"loss": 0.7015, "grad_norm": 0.8100560307502747, "learning_rate": 0.0002, "epoch": 2.7321962654595424, "step": 16900}, {"loss": 0.6851, "grad_norm": 0.8615233898162842, "learning_rate": 0.0002, "epoch": 2.7338129496402876, "step": 16910}, {"loss": 0.6322, "grad_norm": 0.8633756041526794, "learning_rate": 0.0002, "epoch": 2.735429633821033, "step": 16920}, {"loss": 0.6488, "grad_norm": 0.7769348621368408, "learning_rate": 0.0002, "epoch": 2.737046318001778, "step": 16930}, {"loss": 0.6582, "grad_norm": 0.6943058371543884, "learning_rate": 0.0002, "epoch": 2.738663002182524, "step": 16940}, {"loss": 0.6516, "grad_norm": 0.8510736227035522, "learning_rate": 0.0002, "epoch": 2.740279686363269, "step": 16950}, {"loss": 0.7275, "grad_norm": 0.7732602953910828, "learning_rate": 0.0002, "epoch": 2.7418963705440142, "step": 16960}, {"loss": 0.6553, "grad_norm": 0.5981788635253906, "learning_rate": 0.0002, "epoch": 2.7435130547247595, "step": 16970}, {"loss": 0.6777, "grad_norm": 0.7604416012763977, "learning_rate": 0.0002, "epoch": 2.7451297389055047, "step": 16980}, {"loss": 0.6981, "grad_norm": 0.7377738356590271, "learning_rate": 0.0002, "epoch": 2.74674642308625, "step": 16990}, {"loss": 0.6294, "grad_norm": 0.9400289058685303, "learning_rate": 0.0002, "epoch": 2.7483631072669956, "step": 17000}, {"loss": 0.6952, "grad_norm": 0.6340599656105042, "learning_rate": 0.0002, "epoch": 2.749979791447741, "step": 17010}, {"loss": 0.7222, "grad_norm": 0.7297601103782654, "learning_rate": 0.0002, "epoch": 2.751596475628486, "step": 17020}, {"loss": 0.6659, "grad_norm": 0.9479979872703552, "learning_rate": 0.0002, "epoch": 2.7532131598092313, "step": 17030}, {"loss": 0.691, "grad_norm": 0.8461511135101318, "learning_rate": 0.0002, "epoch": 2.7548298439899765, "step": 17040}, {"loss": 0.6764, "grad_norm": 0.7477551698684692, "learning_rate": 0.0002, "epoch": 2.7564465281707218, "step": 17050}, {"loss": 0.684, "grad_norm": 1.019270420074463, "learning_rate": 0.0002, "epoch": 2.758063212351467, "step": 17060}, {"loss": 0.7119, "grad_norm": 0.7730235457420349, "learning_rate": 0.0002, "epoch": 2.7596798965322122, "step": 17070}, {"loss": 0.6886, "grad_norm": 0.8216866254806519, "learning_rate": 0.0002, "epoch": 2.7612965807129575, "step": 17080}, {"loss": 0.6811, "grad_norm": 0.7235931754112244, "learning_rate": 0.0002, "epoch": 2.762913264893703, "step": 17090}, {"loss": 0.7031, "grad_norm": 0.7352296710014343, "learning_rate": 0.0002, "epoch": 2.7645299490744484, "step": 17100}, {"loss": 0.6951, "grad_norm": 0.8129373788833618, "learning_rate": 0.0002, "epoch": 2.7661466332551936, "step": 17110}, {"loss": 0.6703, "grad_norm": 0.7387019991874695, "learning_rate": 0.0002, "epoch": 2.767763317435939, "step": 17120}, {"loss": 0.6789, "grad_norm": 0.9149190187454224, "learning_rate": 0.0002, "epoch": 2.769380001616684, "step": 17130}, {"loss": 0.6038, "grad_norm": 0.7352971434593201, "learning_rate": 0.0002, "epoch": 2.7709966857974297, "step": 17140}, {"loss": 0.6728, "grad_norm": 0.7903780341148376, "learning_rate": 0.0002, "epoch": 2.772613369978175, "step": 17150}, {"loss": 0.6988, "grad_norm": 0.8255927562713623, "learning_rate": 0.0002, "epoch": 2.77423005415892, "step": 17160}, {"loss": 0.6694, "grad_norm": 0.7235927581787109, "learning_rate": 0.0002, "epoch": 2.7758467383396654, "step": 17170}, {"loss": 0.7161, "grad_norm": 0.8281434774398804, "learning_rate": 0.0002, "epoch": 2.7774634225204107, "step": 17180}, {"loss": 0.682, "grad_norm": 0.7586921453475952, "learning_rate": 0.0002, "epoch": 2.779080106701156, "step": 17190}, {"loss": 0.6427, "grad_norm": 0.7161715030670166, "learning_rate": 0.0002, "epoch": 2.780696790881901, "step": 17200}, {"loss": 0.6426, "grad_norm": 0.762868344783783, "learning_rate": 0.0002, "epoch": 2.7823134750626464, "step": 17210}, {"loss": 0.705, "grad_norm": 0.9285483360290527, "learning_rate": 0.0002, "epoch": 2.7839301592433916, "step": 17220}, {"loss": 0.7084, "grad_norm": 0.6900462508201599, "learning_rate": 0.0002, "epoch": 2.785546843424137, "step": 17230}, {"loss": 0.6988, "grad_norm": 0.780384361743927, "learning_rate": 0.0002, "epoch": 2.7871635276048825, "step": 17240}, {"loss": 0.7073, "grad_norm": 0.7580406665802002, "learning_rate": 0.0002, "epoch": 2.7887802117856277, "step": 17250}, {"loss": 0.6833, "grad_norm": 0.8145199418067932, "learning_rate": 0.0002, "epoch": 2.790396895966373, "step": 17260}, {"loss": 0.6909, "grad_norm": 0.9159596562385559, "learning_rate": 0.0002, "epoch": 2.792013580147118, "step": 17270}, {"loss": 0.6008, "grad_norm": 0.9590014219284058, "learning_rate": 0.0002, "epoch": 2.7936302643278634, "step": 17280}, {"loss": 0.6704, "grad_norm": 0.7603529691696167, "learning_rate": 0.0002, "epoch": 2.795246948508609, "step": 17290}, {"loss": 0.7165, "grad_norm": 0.8039976358413696, "learning_rate": 0.0002, "epoch": 2.7968636326893543, "step": 17300}, {"loss": 0.7037, "grad_norm": 0.8364847302436829, "learning_rate": 0.0002, "epoch": 2.7984803168700996, "step": 17310}, {"loss": 0.6749, "grad_norm": 0.8763046860694885, "learning_rate": 0.0002, "epoch": 2.800097001050845, "step": 17320}, {"loss": 0.6844, "grad_norm": 0.8409647941589355, "learning_rate": 0.0002, "epoch": 2.80171368523159, "step": 17330}, {"loss": 0.6936, "grad_norm": 0.7649006247520447, "learning_rate": 0.0002, "epoch": 2.8033303694123353, "step": 17340}, {"loss": 0.7051, "grad_norm": 0.7970262169837952, "learning_rate": 0.0002, "epoch": 2.8049470535930805, "step": 17350}, {"loss": 0.6533, "grad_norm": 0.9088607430458069, "learning_rate": 0.0002, "epoch": 2.8065637377738257, "step": 17360}, {"loss": 0.675, "grad_norm": 0.6454846858978271, "learning_rate": 0.0002, "epoch": 2.808180421954571, "step": 17370}, {"loss": 0.7069, "grad_norm": 0.7744787931442261, "learning_rate": 0.0002, "epoch": 2.809797106135316, "step": 17380}, {"loss": 0.6772, "grad_norm": 0.6678640842437744, "learning_rate": 0.0002, "epoch": 2.811413790316062, "step": 17390}, {"loss": 0.6784, "grad_norm": 0.772676944732666, "learning_rate": 0.0002, "epoch": 2.813030474496807, "step": 17400}, {"loss": 0.7252, "grad_norm": 0.7088175415992737, "learning_rate": 0.0002, "epoch": 2.8146471586775523, "step": 17410}, {"loss": 0.7086, "grad_norm": 0.8280573487281799, "learning_rate": 0.0002, "epoch": 2.8162638428582976, "step": 17420}, {"loss": 0.6732, "grad_norm": 0.6665388345718384, "learning_rate": 0.0002, "epoch": 2.817880527039043, "step": 17430}, {"loss": 0.6675, "grad_norm": 0.6427883505821228, "learning_rate": 0.0002, "epoch": 2.8194972112197885, "step": 17440}, {"loss": 0.6972, "grad_norm": 0.9697760343551636, "learning_rate": 0.0002, "epoch": 2.8211138954005337, "step": 17450}, {"loss": 0.6838, "grad_norm": 0.7573966383934021, "learning_rate": 0.0002, "epoch": 2.822730579581279, "step": 17460}, {"loss": 0.7243, "grad_norm": 0.878688633441925, "learning_rate": 0.0002, "epoch": 2.824347263762024, "step": 17470}, {"loss": 0.6666, "grad_norm": 0.7752242684364319, "learning_rate": 0.0002, "epoch": 2.8259639479427694, "step": 17480}, {"loss": 0.6638, "grad_norm": 0.6135398745536804, "learning_rate": 0.0002, "epoch": 2.8275806321235146, "step": 17490}, {"loss": 0.6829, "grad_norm": 0.6924924850463867, "learning_rate": 0.0002, "epoch": 2.82919731630426, "step": 17500}, {"loss": 0.6731, "grad_norm": 0.7471627593040466, "learning_rate": 0.0002, "epoch": 2.830814000485005, "step": 17510}, {"loss": 0.7016, "grad_norm": 0.7145499587059021, "learning_rate": 0.0002, "epoch": 2.8324306846657503, "step": 17520}, {"loss": 0.6787, "grad_norm": 0.7415414452552795, "learning_rate": 0.0002, "epoch": 2.834047368846496, "step": 17530}, {"loss": 0.6811, "grad_norm": 0.7328441739082336, "learning_rate": 0.0002, "epoch": 2.8356640530272412, "step": 17540}, {"loss": 0.6866, "grad_norm": 0.8267839550971985, "learning_rate": 0.0002, "epoch": 2.8372807372079865, "step": 17550}, {"loss": 0.6787, "grad_norm": 0.8877885341644287, "learning_rate": 0.0002, "epoch": 2.8388974213887317, "step": 17560}, {"loss": 0.7136, "grad_norm": 0.857138454914093, "learning_rate": 0.0002, "epoch": 2.840514105569477, "step": 17570}, {"loss": 0.6454, "grad_norm": 0.8470779657363892, "learning_rate": 0.0002, "epoch": 2.842130789750222, "step": 17580}, {"loss": 0.6976, "grad_norm": 0.8553254008293152, "learning_rate": 0.0002, "epoch": 2.843747473930968, "step": 17590}, {"loss": 0.7297, "grad_norm": 0.8033196926116943, "learning_rate": 0.0002, "epoch": 2.845364158111713, "step": 17600}, {"loss": 0.7062, "grad_norm": 0.7949087023735046, "learning_rate": 0.0002, "epoch": 2.8469808422924583, "step": 17610}, {"loss": 0.651, "grad_norm": 0.9241406321525574, "learning_rate": 0.0002, "epoch": 2.8485975264732035, "step": 17620}, {"loss": 0.6601, "grad_norm": 0.7721285223960876, "learning_rate": 0.0002, "epoch": 2.8502142106539488, "step": 17630}, {"loss": 0.6183, "grad_norm": 1.0246692895889282, "learning_rate": 0.0002, "epoch": 2.851830894834694, "step": 17640}, {"loss": 0.7007, "grad_norm": 0.9244589805603027, "learning_rate": 0.0002, "epoch": 2.853447579015439, "step": 17650}, {"loss": 0.7274, "grad_norm": 0.7243508696556091, "learning_rate": 0.0002, "epoch": 2.8550642631961844, "step": 17660}, {"loss": 0.6471, "grad_norm": 0.8943371176719666, "learning_rate": 0.0002, "epoch": 2.8566809473769297, "step": 17670}, {"loss": 0.686, "grad_norm": 0.6531758904457092, "learning_rate": 0.0002, "epoch": 2.8582976315576754, "step": 17680}, {"loss": 0.6253, "grad_norm": 0.8367000818252563, "learning_rate": 0.0002, "epoch": 2.8599143157384206, "step": 17690}, {"loss": 0.6943, "grad_norm": 0.7868556380271912, "learning_rate": 0.0002, "epoch": 2.861530999919166, "step": 17700}, {"loss": 0.6919, "grad_norm": 0.7213859558105469, "learning_rate": 0.0002, "epoch": 2.863147684099911, "step": 17710}, {"loss": 0.6657, "grad_norm": 0.7383931279182434, "learning_rate": 0.0002, "epoch": 2.8647643682806563, "step": 17720}, {"loss": 0.6841, "grad_norm": 0.7566812634468079, "learning_rate": 0.0002, "epoch": 2.8663810524614015, "step": 17730}, {"loss": 0.6449, "grad_norm": 0.6930373311042786, "learning_rate": 0.0002, "epoch": 2.867997736642147, "step": 17740}, {"loss": 0.6764, "grad_norm": 0.7911090850830078, "learning_rate": 0.0002, "epoch": 2.8696144208228924, "step": 17750}, {"loss": 0.6554, "grad_norm": 0.8484548926353455, "learning_rate": 0.0002, "epoch": 2.8712311050036377, "step": 17760}, {"loss": 0.6931, "grad_norm": 0.7647597193717957, "learning_rate": 0.0002, "epoch": 2.872847789184383, "step": 17770}, {"loss": 0.6945, "grad_norm": 0.8791151642799377, "learning_rate": 0.0002, "epoch": 2.874464473365128, "step": 17780}, {"loss": 0.7078, "grad_norm": 0.7253178358078003, "learning_rate": 0.0002, "epoch": 2.8760811575458733, "step": 17790}, {"loss": 0.6474, "grad_norm": 0.7956077456474304, "learning_rate": 0.0002, "epoch": 2.8776978417266186, "step": 17800}, {"loss": 0.6687, "grad_norm": 0.8657688498497009, "learning_rate": 0.0002, "epoch": 2.879314525907364, "step": 17810}, {"loss": 0.7171, "grad_norm": 0.7059141993522644, "learning_rate": 0.0002, "epoch": 2.880931210088109, "step": 17820}, {"loss": 0.683, "grad_norm": 0.8886896967887878, "learning_rate": 0.0002, "epoch": 2.8825478942688547, "step": 17830}, {"loss": 0.669, "grad_norm": 0.821032702922821, "learning_rate": 0.0002, "epoch": 2.8841645784496, "step": 17840}, {"loss": 0.6805, "grad_norm": 0.7183963656425476, "learning_rate": 0.0002, "epoch": 2.885781262630345, "step": 17850}, {"loss": 0.7088, "grad_norm": 0.6222899556159973, "learning_rate": 0.0002, "epoch": 2.8873979468110904, "step": 17860}, {"loss": 0.6626, "grad_norm": 0.8187434077262878, "learning_rate": 0.0002, "epoch": 2.8890146309918356, "step": 17870}, {"loss": 0.6815, "grad_norm": 0.9838479161262512, "learning_rate": 0.0002, "epoch": 2.890631315172581, "step": 17880}, {"loss": 0.6967, "grad_norm": 0.7567742466926575, "learning_rate": 0.0002, "epoch": 2.8922479993533265, "step": 17890}, {"loss": 0.7073, "grad_norm": 0.6875903606414795, "learning_rate": 0.0002, "epoch": 2.893864683534072, "step": 17900}, {"loss": 0.6415, "grad_norm": 0.8043789267539978, "learning_rate": 0.0002, "epoch": 2.895481367714817, "step": 17910}, {"loss": 0.6588, "grad_norm": 0.8062626719474792, "learning_rate": 0.0002, "epoch": 2.8970980518955622, "step": 17920}, {"loss": 0.7151, "grad_norm": 1.0251191854476929, "learning_rate": 0.0002, "epoch": 2.8987147360763075, "step": 17930}, {"loss": 0.6605, "grad_norm": 0.882253110408783, "learning_rate": 0.0002, "epoch": 2.9003314202570527, "step": 17940}, {"loss": 0.6719, "grad_norm": 0.8683299422264099, "learning_rate": 0.0002, "epoch": 2.901948104437798, "step": 17950}, {"loss": 0.6896, "grad_norm": 0.7167282104492188, "learning_rate": 0.0002, "epoch": 2.903564788618543, "step": 17960}, {"loss": 0.663, "grad_norm": 0.7093694806098938, "learning_rate": 0.0002, "epoch": 2.9051814727992884, "step": 17970}, {"loss": 0.6591, "grad_norm": 0.8549879193305969, "learning_rate": 0.0002, "epoch": 2.906798156980034, "step": 17980}, {"loss": 0.6962, "grad_norm": 0.6989606618881226, "learning_rate": 0.0002, "epoch": 2.9084148411607793, "step": 17990}, {"loss": 0.6635, "grad_norm": 0.9482976794242859, "learning_rate": 0.0002, "epoch": 2.9100315253415245, "step": 18000}, {"loss": 0.6586, "grad_norm": 0.7182440161705017, "learning_rate": 0.0002, "epoch": 2.9116482095222698, "step": 18010}, {"loss": 0.6827, "grad_norm": 0.7732226252555847, "learning_rate": 0.0002, "epoch": 2.913264893703015, "step": 18020}, {"loss": 0.7123, "grad_norm": 0.7936875224113464, "learning_rate": 0.0002, "epoch": 2.9148815778837607, "step": 18030}, {"loss": 0.6736, "grad_norm": 0.8825615644454956, "learning_rate": 0.0002, "epoch": 2.916498262064506, "step": 18040}, {"loss": 0.7139, "grad_norm": 0.6778587102890015, "learning_rate": 0.0002, "epoch": 2.918114946245251, "step": 18050}, {"loss": 0.6588, "grad_norm": 0.7529265880584717, "learning_rate": 0.0002, "epoch": 2.9197316304259964, "step": 18060}, {"loss": 0.737, "grad_norm": 0.7111883163452148, "learning_rate": 0.0002, "epoch": 2.9213483146067416, "step": 18070}, {"loss": 0.7475, "grad_norm": 0.7214767932891846, "learning_rate": 0.0002, "epoch": 2.922964998787487, "step": 18080}, {"loss": 0.6672, "grad_norm": 0.800417423248291, "learning_rate": 0.0002, "epoch": 2.924581682968232, "step": 18090}, {"loss": 0.6694, "grad_norm": 1.248575210571289, "learning_rate": 0.0002, "epoch": 2.9261983671489773, "step": 18100}, {"loss": 0.7004, "grad_norm": 0.757788360118866, "learning_rate": 0.0002, "epoch": 2.9278150513297225, "step": 18110}, {"loss": 0.6999, "grad_norm": 1.0583995580673218, "learning_rate": 0.0002, "epoch": 2.9294317355104678, "step": 18120}, {"loss": 0.6365, "grad_norm": 0.8228777647018433, "learning_rate": 0.0002, "epoch": 2.9310484196912134, "step": 18130}, {"loss": 0.6791, "grad_norm": 0.8374035358428955, "learning_rate": 0.0002, "epoch": 2.9326651038719587, "step": 18140}, {"loss": 0.6399, "grad_norm": 0.7976473569869995, "learning_rate": 0.0002, "epoch": 2.934281788052704, "step": 18150}, {"loss": 0.6585, "grad_norm": 0.8009907603263855, "learning_rate": 0.0002, "epoch": 2.935898472233449, "step": 18160}, {"loss": 0.7485, "grad_norm": 0.835213303565979, "learning_rate": 0.0002, "epoch": 2.9375151564141944, "step": 18170}, {"loss": 0.7376, "grad_norm": 0.7982219457626343, "learning_rate": 0.0002, "epoch": 2.93913184059494, "step": 18180}, {"loss": 0.6348, "grad_norm": 0.7070978879928589, "learning_rate": 0.0002, "epoch": 2.9407485247756853, "step": 18190}, {"loss": 0.6608, "grad_norm": 0.8619440197944641, "learning_rate": 0.0002, "epoch": 2.9423652089564305, "step": 18200}, {"loss": 0.666, "grad_norm": 0.6693987250328064, "learning_rate": 0.0002, "epoch": 2.9439818931371757, "step": 18210}, {"loss": 0.728, "grad_norm": 0.6747021079063416, "learning_rate": 0.0002, "epoch": 2.945598577317921, "step": 18220}, {"loss": 0.6686, "grad_norm": 0.860387921333313, "learning_rate": 0.0002, "epoch": 2.947215261498666, "step": 18230}, {"loss": 0.6945, "grad_norm": 0.799976646900177, "learning_rate": 0.0002, "epoch": 2.9488319456794114, "step": 18240}, {"loss": 0.7243, "grad_norm": 0.7864769101142883, "learning_rate": 0.0002, "epoch": 2.9504486298601567, "step": 18250}, {"loss": 0.6785, "grad_norm": 0.6713884472846985, "learning_rate": 0.0002, "epoch": 2.952065314040902, "step": 18260}, {"loss": 0.7429, "grad_norm": 0.9031508564949036, "learning_rate": 0.0002, "epoch": 2.9536819982216476, "step": 18270}, {"loss": 0.7055, "grad_norm": 0.7205073237419128, "learning_rate": 0.0002, "epoch": 2.955298682402393, "step": 18280}, {"loss": 0.7298, "grad_norm": 0.7746205925941467, "learning_rate": 0.0002, "epoch": 2.956915366583138, "step": 18290}, {"loss": 0.6218, "grad_norm": 0.6533427834510803, "learning_rate": 0.0002, "epoch": 2.9585320507638833, "step": 18300}, {"loss": 0.6674, "grad_norm": 0.9083208441734314, "learning_rate": 0.0002, "epoch": 2.9601487349446285, "step": 18310}, {"loss": 0.7359, "grad_norm": 0.7446991801261902, "learning_rate": 0.0002, "epoch": 2.9617654191253737, "step": 18320}, {"loss": 0.6738, "grad_norm": 0.6514461636543274, "learning_rate": 0.0002, "epoch": 2.9633821033061194, "step": 18330}, {"loss": 0.6677, "grad_norm": 0.8580465912818909, "learning_rate": 0.0002, "epoch": 2.9649987874868646, "step": 18340}, {"loss": 0.6971, "grad_norm": 0.7074266076087952, "learning_rate": 0.0002, "epoch": 2.96661547166761, "step": 18350}, {"loss": 0.6804, "grad_norm": 0.899892270565033, "learning_rate": 0.0002, "epoch": 2.968232155848355, "step": 18360}, {"loss": 0.7094, "grad_norm": 0.8217641711235046, "learning_rate": 0.0002, "epoch": 2.9698488400291003, "step": 18370}, {"loss": 0.6916, "grad_norm": 0.8611799478530884, "learning_rate": 0.0002, "epoch": 2.9714655242098456, "step": 18380}, {"loss": 0.6677, "grad_norm": 0.6909302473068237, "learning_rate": 0.0002, "epoch": 2.973082208390591, "step": 18390}, {"loss": 0.7247, "grad_norm": 0.6554358005523682, "learning_rate": 0.0002, "epoch": 2.974698892571336, "step": 18400}, {"loss": 0.6516, "grad_norm": 0.7803071737289429, "learning_rate": 0.0002, "epoch": 2.9763155767520812, "step": 18410}, {"loss": 0.7322, "grad_norm": 0.7838954925537109, "learning_rate": 0.0002, "epoch": 2.977932260932827, "step": 18420}, {"loss": 0.6522, "grad_norm": 0.7098495364189148, "learning_rate": 0.0002, "epoch": 2.979548945113572, "step": 18430}, {"loss": 0.739, "grad_norm": 0.8981785774230957, "learning_rate": 0.0002, "epoch": 2.9811656292943174, "step": 18440}, {"loss": 0.6689, "grad_norm": 0.7197171449661255, "learning_rate": 0.0002, "epoch": 2.9827823134750626, "step": 18450}, {"loss": 0.706, "grad_norm": 0.793185293674469, "learning_rate": 0.0002, "epoch": 2.984398997655808, "step": 18460}, {"loss": 0.7124, "grad_norm": 0.8531473875045776, "learning_rate": 0.0002, "epoch": 2.986015681836553, "step": 18470}, {"loss": 0.6901, "grad_norm": 0.6627361178398132, "learning_rate": 0.0002, "epoch": 2.9876323660172988, "step": 18480}, {"loss": 0.6591, "grad_norm": 0.5708155035972595, "learning_rate": 0.0002, "epoch": 2.989249050198044, "step": 18490}, {"loss": 0.6725, "grad_norm": 0.8227280378341675, "learning_rate": 0.0002, "epoch": 2.990865734378789, "step": 18500}, {"loss": 0.6701, "grad_norm": 0.7102749943733215, "learning_rate": 0.0002, "epoch": 2.9924824185595345, "step": 18510}, {"loss": 0.7091, "grad_norm": 0.839485228061676, "learning_rate": 0.0002, "epoch": 2.9940991027402797, "step": 18520}, {"loss": 0.6521, "grad_norm": 0.9038704037666321, "learning_rate": 0.0002, "epoch": 2.995715786921025, "step": 18530}, {"loss": 0.7186, "grad_norm": 0.8737510442733765, "learning_rate": 0.0002, "epoch": 2.99733247110177, "step": 18540}, {"loss": 0.6819, "grad_norm": 0.7323142886161804, "learning_rate": 0.0002, "epoch": 2.9989491552825154, "step": 18550}, {"eval_loss": 1.1262480020523071, "eval_runtime": 122.0868, "eval_samples_per_second": 6.004, "eval_steps_per_second": 0.754, "epoch": 2.9999191657909625, "step": 18556}, {"loss": 0.6337, "grad_norm": 0.8465463519096375, "learning_rate": 0.0002, "epoch": 3.000565839463261, "step": 18560}, {"loss": 0.6064, "grad_norm": 0.9134138822555542, "learning_rate": 0.0002, "epoch": 3.0021825236440063, "step": 18570}, {"loss": 0.5804, "grad_norm": 0.760715126991272, "learning_rate": 0.0002, "epoch": 3.0037992078247515, "step": 18580}, {"loss": 0.5571, "grad_norm": 0.9208743572235107, "learning_rate": 0.0002, "epoch": 3.0054158920054967, "step": 18590}, {"loss": 0.5731, "grad_norm": 0.9232364892959595, "learning_rate": 0.0002, "epoch": 3.007032576186242, "step": 18600}, {"loss": 0.6299, "grad_norm": 1.1881544589996338, "learning_rate": 0.0002, "epoch": 3.008649260366987, "step": 18610}, {"loss": 0.5482, "grad_norm": 0.9372987747192383, "learning_rate": 0.0002, "epoch": 3.0102659445477324, "step": 18620}, {"loss": 0.5709, "grad_norm": 0.6900241374969482, "learning_rate": 0.0002, "epoch": 3.0118826287284777, "step": 18630}, {"loss": 0.5256, "grad_norm": 0.8451071381568909, "learning_rate": 0.0002, "epoch": 3.0134993129092233, "step": 18640}, {"loss": 0.5916, "grad_norm": 0.7763112187385559, "learning_rate": 0.0002, "epoch": 3.0151159970899686, "step": 18650}, {"loss": 0.6095, "grad_norm": 1.043653964996338, "learning_rate": 0.0002, "epoch": 3.016732681270714, "step": 18660}, {"loss": 0.6228, "grad_norm": 1.0170660018920898, "learning_rate": 0.0002, "epoch": 3.018349365451459, "step": 18670}, {"loss": 0.5671, "grad_norm": 0.7534180283546448, "learning_rate": 0.0002, "epoch": 3.0199660496322043, "step": 18680}, {"loss": 0.6015, "grad_norm": 0.7507367730140686, "learning_rate": 0.0002, "epoch": 3.0215827338129495, "step": 18690}, {"loss": 0.6201, "grad_norm": 0.7861620187759399, "learning_rate": 0.0002, "epoch": 3.0231994179936947, "step": 18700}, {"loss": 0.5802, "grad_norm": 1.0580339431762695, "learning_rate": 0.0002, "epoch": 3.0248161021744404, "step": 18710}, {"loss": 0.5975, "grad_norm": 0.7542710900306702, "learning_rate": 0.0002, "epoch": 3.0264327863551856, "step": 18720}, {"loss": 0.5695, "grad_norm": 0.8189544677734375, "learning_rate": 0.0002, "epoch": 3.028049470535931, "step": 18730}, {"loss": 0.6109, "grad_norm": 0.9126611351966858, "learning_rate": 0.0002, "epoch": 3.029666154716676, "step": 18740}, {"loss": 0.6443, "grad_norm": 0.8891341686248779, "learning_rate": 0.0002, "epoch": 3.0312828388974213, "step": 18750}, {"loss": 0.6207, "grad_norm": 0.8419283032417297, "learning_rate": 0.0002, "epoch": 3.0328995230781666, "step": 18760}, {"loss": 0.5818, "grad_norm": 0.8048048615455627, "learning_rate": 0.0002, "epoch": 3.034516207258912, "step": 18770}, {"loss": 0.6381, "grad_norm": 0.7820217609405518, "learning_rate": 0.0002, "epoch": 3.0361328914396575, "step": 18780}, {"loss": 0.5843, "grad_norm": 0.854721188545227, "learning_rate": 0.0002, "epoch": 3.0377495756204027, "step": 18790}, {"loss": 0.5784, "grad_norm": 0.912092924118042, "learning_rate": 0.0002, "epoch": 3.039366259801148, "step": 18800}, {"loss": 0.5734, "grad_norm": 0.6596226096153259, "learning_rate": 0.0002, "epoch": 3.040982943981893, "step": 18810}, {"loss": 0.5969, "grad_norm": 0.6351348757743835, "learning_rate": 0.0002, "epoch": 3.0425996281626384, "step": 18820}, {"loss": 0.5953, "grad_norm": 0.778188943862915, "learning_rate": 0.0002, "epoch": 3.0442163123433836, "step": 18830}, {"loss": 0.602, "grad_norm": 0.68234783411026, "learning_rate": 0.0002, "epoch": 3.045832996524129, "step": 18840}, {"loss": 0.5785, "grad_norm": 0.998628556728363, "learning_rate": 0.0002, "epoch": 3.047449680704874, "step": 18850}, {"loss": 0.6231, "grad_norm": 0.7393841743469238, "learning_rate": 0.0002, "epoch": 3.0490663648856198, "step": 18860}, {"loss": 0.568, "grad_norm": 0.84438556432724, "learning_rate": 0.0002, "epoch": 3.050683049066365, "step": 18870}, {"loss": 0.6205, "grad_norm": 0.8857501745223999, "learning_rate": 0.0002, "epoch": 3.0522997332471102, "step": 18880}, {"loss": 0.6335, "grad_norm": 0.7208474278450012, "learning_rate": 0.0002, "epoch": 3.0539164174278555, "step": 18890}, {"loss": 0.5998, "grad_norm": 0.7135229110717773, "learning_rate": 0.0002, "epoch": 3.0555331016086007, "step": 18900}, {"loss": 0.5575, "grad_norm": 0.9130001664161682, "learning_rate": 0.0002, "epoch": 3.057149785789346, "step": 18910}, {"loss": 0.5955, "grad_norm": 0.9001716375350952, "learning_rate": 0.0002, "epoch": 3.058766469970091, "step": 18920}, {"loss": 0.6052, "grad_norm": 0.8667559623718262, "learning_rate": 0.0002, "epoch": 3.060383154150837, "step": 18930}, {"loss": 0.5818, "grad_norm": 0.8943959474563599, "learning_rate": 0.0002, "epoch": 3.061999838331582, "step": 18940}, {"loss": 0.5978, "grad_norm": 0.8298377990722656, "learning_rate": 0.0002, "epoch": 3.0636165225123273, "step": 18950}, {"loss": 0.5782, "grad_norm": 0.7935267686843872, "learning_rate": 0.0002, "epoch": 3.0652332066930725, "step": 18960}, {"loss": 0.6434, "grad_norm": 1.1506379842758179, "learning_rate": 0.0002, "epoch": 3.0668498908738178, "step": 18970}, {"loss": 0.5571, "grad_norm": 0.7693049907684326, "learning_rate": 0.0002, "epoch": 3.068466575054563, "step": 18980}, {"loss": 0.5971, "grad_norm": 0.8040135502815247, "learning_rate": 0.0002, "epoch": 3.0700832592353082, "step": 18990}, {"loss": 0.5541, "grad_norm": 0.828404426574707, "learning_rate": 0.0002, "epoch": 3.0716999434160535, "step": 19000}, {"loss": 0.6048, "grad_norm": 0.8811164498329163, "learning_rate": 0.0002, "epoch": 3.073316627596799, "step": 19010}, {"loss": 0.5845, "grad_norm": 1.036205768585205, "learning_rate": 0.0002, "epoch": 3.0749333117775444, "step": 19020}, {"loss": 0.5838, "grad_norm": 0.8857285976409912, "learning_rate": 0.0002, "epoch": 3.0765499959582896, "step": 19030}, {"loss": 0.592, "grad_norm": 0.8392079472541809, "learning_rate": 0.0002, "epoch": 3.078166680139035, "step": 19040}, {"loss": 0.5927, "grad_norm": 1.0287401676177979, "learning_rate": 0.0002, "epoch": 3.07978336431978, "step": 19050}, {"loss": 0.5964, "grad_norm": 1.0086315870285034, "learning_rate": 0.0002, "epoch": 3.0814000485005253, "step": 19060}, {"loss": 0.5567, "grad_norm": 0.9245324730873108, "learning_rate": 0.0002, "epoch": 3.0830167326812705, "step": 19070}, {"loss": 0.5797, "grad_norm": 0.8680877089500427, "learning_rate": 0.0002, "epoch": 3.084633416862016, "step": 19080}, {"loss": 0.5611, "grad_norm": 0.8814793825149536, "learning_rate": 0.0002, "epoch": 3.0862501010427614, "step": 19090}, {"loss": 0.6051, "grad_norm": 0.9234458208084106, "learning_rate": 0.0002, "epoch": 3.0878667852235067, "step": 19100}, {"loss": 0.6209, "grad_norm": 1.1291664838790894, "learning_rate": 0.0002, "epoch": 3.089483469404252, "step": 19110}, {"loss": 0.5695, "grad_norm": 0.9191402792930603, "learning_rate": 0.0002, "epoch": 3.091100153584997, "step": 19120}, {"loss": 0.5856, "grad_norm": 0.7103154063224792, "learning_rate": 0.0002, "epoch": 3.0927168377657424, "step": 19130}, {"loss": 0.6479, "grad_norm": 0.9368883967399597, "learning_rate": 0.0002, "epoch": 3.0943335219464876, "step": 19140}, {"loss": 0.6167, "grad_norm": 0.9676656723022461, "learning_rate": 0.0002, "epoch": 3.095950206127233, "step": 19150}, {"loss": 0.5794, "grad_norm": 0.8739792704582214, "learning_rate": 0.0002, "epoch": 3.0975668903079785, "step": 19160}, {"loss": 0.6112, "grad_norm": 0.8530174493789673, "learning_rate": 0.0002, "epoch": 3.0991835744887237, "step": 19170}, {"loss": 0.6568, "grad_norm": 0.794945478439331, "learning_rate": 0.0002, "epoch": 3.100800258669469, "step": 19180}, {"loss": 0.5928, "grad_norm": 0.9508888125419617, "learning_rate": 0.0002, "epoch": 3.102416942850214, "step": 19190}, {"loss": 0.5757, "grad_norm": 1.0599955320358276, "learning_rate": 0.0002, "epoch": 3.1040336270309594, "step": 19200}, {"loss": 0.6151, "grad_norm": 1.0673625469207764, "learning_rate": 0.0002, "epoch": 3.1056503112117047, "step": 19210}, {"loss": 0.6043, "grad_norm": 0.7739115953445435, "learning_rate": 0.0002, "epoch": 3.10726699539245, "step": 19220}, {"loss": 0.6046, "grad_norm": 0.9884951114654541, "learning_rate": 0.0002, "epoch": 3.1088836795731956, "step": 19230}, {"loss": 0.5932, "grad_norm": 0.862260103225708, "learning_rate": 0.0002, "epoch": 3.110500363753941, "step": 19240}, {"loss": 0.6098, "grad_norm": 0.7690284848213196, "learning_rate": 0.0002, "epoch": 3.112117047934686, "step": 19250}, {"loss": 0.5791, "grad_norm": 0.8758958578109741, "learning_rate": 0.0002, "epoch": 3.1137337321154313, "step": 19260}, {"loss": 0.6136, "grad_norm": 1.0356395244598389, "learning_rate": 0.0002, "epoch": 3.1153504162961765, "step": 19270}, {"loss": 0.6159, "grad_norm": 0.6950937509536743, "learning_rate": 0.0002, "epoch": 3.1169671004769217, "step": 19280}, {"loss": 0.592, "grad_norm": 0.760998010635376, "learning_rate": 0.0002, "epoch": 3.118583784657667, "step": 19290}, {"loss": 0.575, "grad_norm": 0.9335789084434509, "learning_rate": 0.0002, "epoch": 3.1202004688384126, "step": 19300}, {"loss": 0.6139, "grad_norm": 0.9636204242706299, "learning_rate": 0.0002, "epoch": 3.121817153019158, "step": 19310}, {"loss": 0.6001, "grad_norm": 1.0820997953414917, "learning_rate": 0.0002, "epoch": 3.123433837199903, "step": 19320}, {"loss": 0.6542, "grad_norm": 0.7333487272262573, "learning_rate": 0.0002, "epoch": 3.1250505213806483, "step": 19330}, {"loss": 0.6178, "grad_norm": 1.0417509078979492, "learning_rate": 0.0002, "epoch": 3.1266672055613935, "step": 19340}, {"loss": 0.603, "grad_norm": 0.9267749190330505, "learning_rate": 0.0002, "epoch": 3.128283889742139, "step": 19350}, {"loss": 0.6063, "grad_norm": 0.777798593044281, "learning_rate": 0.0002, "epoch": 3.129900573922884, "step": 19360}, {"loss": 0.5913, "grad_norm": 0.8425456881523132, "learning_rate": 0.0002, "epoch": 3.1315172581036297, "step": 19370}, {"loss": 0.6042, "grad_norm": 0.9617102146148682, "learning_rate": 0.0002, "epoch": 3.133133942284375, "step": 19380}, {"loss": 0.633, "grad_norm": 1.0052828788757324, "learning_rate": 0.0002, "epoch": 3.13475062646512, "step": 19390}, {"loss": 0.5713, "grad_norm": 0.7637009024620056, "learning_rate": 0.0002, "epoch": 3.1363673106458654, "step": 19400}, {"loss": 0.5497, "grad_norm": 0.7958088517189026, "learning_rate": 0.0002, "epoch": 3.1379839948266106, "step": 19410}, {"loss": 0.6283, "grad_norm": 0.9161727428436279, "learning_rate": 0.0002, "epoch": 3.139600679007356, "step": 19420}, {"loss": 0.5638, "grad_norm": 0.8402149677276611, "learning_rate": 0.0002, "epoch": 3.141217363188101, "step": 19430}, {"loss": 0.5848, "grad_norm": 1.0056525468826294, "learning_rate": 0.0002, "epoch": 3.1428340473688463, "step": 19440}, {"loss": 0.5954, "grad_norm": 1.0129190683364868, "learning_rate": 0.0002, "epoch": 3.144450731549592, "step": 19450}, {"loss": 0.5808, "grad_norm": 0.790825366973877, "learning_rate": 0.0002, "epoch": 3.146067415730337, "step": 19460}, {"loss": 0.5607, "grad_norm": 1.441665530204773, "learning_rate": 0.0002, "epoch": 3.1476840999110824, "step": 19470}, {"loss": 0.5785, "grad_norm": 0.7846331596374512, "learning_rate": 0.0002, "epoch": 3.1493007840918277, "step": 19480}, {"loss": 0.5892, "grad_norm": 0.7915332913398743, "learning_rate": 0.0002, "epoch": 3.150917468272573, "step": 19490}, {"loss": 0.5759, "grad_norm": 0.933982253074646, "learning_rate": 0.0002, "epoch": 3.152534152453318, "step": 19500}, {"loss": 0.6206, "grad_norm": 1.038408637046814, "learning_rate": 0.0002, "epoch": 3.1541508366340634, "step": 19510}, {"loss": 0.6271, "grad_norm": 1.018935203552246, "learning_rate": 0.0002, "epoch": 3.155767520814809, "step": 19520}, {"loss": 0.6173, "grad_norm": 0.9618112444877625, "learning_rate": 0.0002, "epoch": 3.1573842049955543, "step": 19530}, {"loss": 0.5972, "grad_norm": 0.8900452852249146, "learning_rate": 0.0002, "epoch": 3.1590008891762995, "step": 19540}, {"loss": 0.5925, "grad_norm": 0.8254160284996033, "learning_rate": 0.0002, "epoch": 3.1606175733570447, "step": 19550}, {"loss": 0.625, "grad_norm": 1.004376769065857, "learning_rate": 0.0002, "epoch": 3.16223425753779, "step": 19560}, {"loss": 0.5775, "grad_norm": 1.0490446090698242, "learning_rate": 0.0002, "epoch": 3.163850941718535, "step": 19570}, {"loss": 0.5986, "grad_norm": 0.7387403845787048, "learning_rate": 0.0002, "epoch": 3.1654676258992804, "step": 19580}, {"loss": 0.5898, "grad_norm": 0.7611538171768188, "learning_rate": 0.0002, "epoch": 3.1670843100800257, "step": 19590}, {"loss": 0.5937, "grad_norm": 0.8239886164665222, "learning_rate": 0.0002, "epoch": 3.1687009942607713, "step": 19600}, {"loss": 0.6068, "grad_norm": 0.9327243566513062, "learning_rate": 0.0002, "epoch": 3.1703176784415166, "step": 19610}, {"loss": 0.572, "grad_norm": 0.9662560224533081, "learning_rate": 0.0002, "epoch": 3.171934362622262, "step": 19620}, {"loss": 0.5988, "grad_norm": 0.9183341860771179, "learning_rate": 0.0002, "epoch": 3.173551046803007, "step": 19630}, {"loss": 0.5909, "grad_norm": 0.875066876411438, "learning_rate": 0.0002, "epoch": 3.1751677309837523, "step": 19640}, {"loss": 0.5956, "grad_norm": 0.8567508459091187, "learning_rate": 0.0002, "epoch": 3.1767844151644975, "step": 19650}, {"loss": 0.5805, "grad_norm": 0.6805780529975891, "learning_rate": 0.0002, "epoch": 3.1784010993452427, "step": 19660}, {"loss": 0.6204, "grad_norm": 0.8776944279670715, "learning_rate": 0.0002, "epoch": 3.1800177835259884, "step": 19670}, {"loss": 0.6108, "grad_norm": 0.9036329984664917, "learning_rate": 0.0002, "epoch": 3.1816344677067336, "step": 19680}, {"loss": 0.6238, "grad_norm": 0.8527372479438782, "learning_rate": 0.0002, "epoch": 3.183251151887479, "step": 19690}, {"loss": 0.6089, "grad_norm": 1.1045585870742798, "learning_rate": 0.0002, "epoch": 3.184867836068224, "step": 19700}, {"loss": 0.5491, "grad_norm": 0.9213830828666687, "learning_rate": 0.0002, "epoch": 3.1864845202489693, "step": 19710}, {"loss": 0.618, "grad_norm": 0.8865814805030823, "learning_rate": 0.0002, "epoch": 3.1881012044297146, "step": 19720}, {"loss": 0.5785, "grad_norm": 0.7939388751983643, "learning_rate": 0.0002, "epoch": 3.18971788861046, "step": 19730}, {"loss": 0.5682, "grad_norm": 0.6966729760169983, "learning_rate": 0.0002, "epoch": 3.191334572791205, "step": 19740}, {"loss": 0.5839, "grad_norm": 0.8023673295974731, "learning_rate": 0.0002, "epoch": 3.1929512569719507, "step": 19750}, {"loss": 0.6267, "grad_norm": 0.7992037534713745, "learning_rate": 0.0002, "epoch": 3.194567941152696, "step": 19760}, {"loss": 0.6141, "grad_norm": 0.7412247657775879, "learning_rate": 0.0002, "epoch": 3.196184625333441, "step": 19770}, {"loss": 0.6179, "grad_norm": 0.9598729014396667, "learning_rate": 0.0002, "epoch": 3.1978013095141864, "step": 19780}, {"loss": 0.5685, "grad_norm": 0.8331366777420044, "learning_rate": 0.0002, "epoch": 3.1994179936949316, "step": 19790}, {"loss": 0.6104, "grad_norm": 0.8939169645309448, "learning_rate": 0.0002, "epoch": 3.201034677875677, "step": 19800}, {"loss": 0.6147, "grad_norm": 0.9219734072685242, "learning_rate": 0.0002, "epoch": 3.202651362056422, "step": 19810}, {"loss": 0.6051, "grad_norm": 0.869490385055542, "learning_rate": 0.0002, "epoch": 3.2042680462371678, "step": 19820}, {"loss": 0.5946, "grad_norm": 0.8989706635475159, "learning_rate": 0.0002, "epoch": 3.205884730417913, "step": 19830}, {"loss": 0.5866, "grad_norm": 0.8477165102958679, "learning_rate": 0.0002, "epoch": 3.2075014145986582, "step": 19840}, {"loss": 0.6176, "grad_norm": 0.8720678687095642, "learning_rate": 0.0002, "epoch": 3.2091180987794035, "step": 19850}, {"loss": 0.5694, "grad_norm": 0.861406683921814, "learning_rate": 0.0002, "epoch": 3.2107347829601487, "step": 19860}, {"loss": 0.6264, "grad_norm": 0.8228686451911926, "learning_rate": 0.0002, "epoch": 3.212351467140894, "step": 19870}, {"loss": 0.625, "grad_norm": 0.7936596870422363, "learning_rate": 0.0002, "epoch": 3.213968151321639, "step": 19880}, {"loss": 0.5698, "grad_norm": 1.097377896308899, "learning_rate": 0.0002, "epoch": 3.2155848355023844, "step": 19890}, {"loss": 0.6725, "grad_norm": 0.9544782638549805, "learning_rate": 0.0002, "epoch": 3.21720151968313, "step": 19900}, {"loss": 0.6022, "grad_norm": 0.8240751624107361, "learning_rate": 0.0002, "epoch": 3.2188182038638753, "step": 19910}, {"loss": 0.5659, "grad_norm": 0.8332096338272095, "learning_rate": 0.0002, "epoch": 3.2204348880446205, "step": 19920}, {"loss": 0.6274, "grad_norm": 1.0954567193984985, "learning_rate": 0.0002, "epoch": 3.2220515722253658, "step": 19930}, {"loss": 0.652, "grad_norm": 0.7790525555610657, "learning_rate": 0.0002, "epoch": 3.223668256406111, "step": 19940}, {"loss": 0.5986, "grad_norm": 0.7966814041137695, "learning_rate": 0.0002, "epoch": 3.225284940586856, "step": 19950}, {"loss": 0.5911, "grad_norm": 0.9751881957054138, "learning_rate": 0.0002, "epoch": 3.2269016247676015, "step": 19960}, {"loss": 0.6071, "grad_norm": 0.9856047630310059, "learning_rate": 0.0002, "epoch": 3.228518308948347, "step": 19970}, {"loss": 0.5837, "grad_norm": 1.3062353134155273, "learning_rate": 0.0002, "epoch": 3.2301349931290924, "step": 19980}, {"loss": 0.6588, "grad_norm": 0.9510692358016968, "learning_rate": 0.0002, "epoch": 3.2317516773098376, "step": 19990}, {"loss": 0.6264, "grad_norm": 0.8630342483520508, "learning_rate": 0.0002, "epoch": 3.233368361490583, "step": 20000}, {"loss": 0.6073, "grad_norm": 0.8966519236564636, "learning_rate": 0.0002, "epoch": 3.234985045671328, "step": 20010}, {"loss": 0.612, "grad_norm": 0.7093510627746582, "learning_rate": 0.0002, "epoch": 3.2366017298520733, "step": 20020}, {"loss": 0.585, "grad_norm": 0.7771096229553223, "learning_rate": 0.0002, "epoch": 3.2382184140328185, "step": 20030}, {"loss": 0.5821, "grad_norm": 0.841058075428009, "learning_rate": 0.0002, "epoch": 3.2398350982135637, "step": 20040}, {"loss": 0.6519, "grad_norm": 0.909712553024292, "learning_rate": 0.0002, "epoch": 3.2414517823943094, "step": 20050}, {"loss": 0.6089, "grad_norm": 0.8321019411087036, "learning_rate": 0.0002, "epoch": 3.2430684665750547, "step": 20060}, {"loss": 0.6115, "grad_norm": 0.779901921749115, "learning_rate": 0.0002, "epoch": 3.2446851507558, "step": 20070}, {"loss": 0.6107, "grad_norm": 0.6249170303344727, "learning_rate": 0.0002, "epoch": 3.246301834936545, "step": 20080}, {"loss": 0.603, "grad_norm": 0.8000940680503845, "learning_rate": 0.0002, "epoch": 3.2479185191172903, "step": 20090}, {"loss": 0.6273, "grad_norm": 0.7627735137939453, "learning_rate": 0.0002, "epoch": 3.2495352032980356, "step": 20100}, {"loss": 0.6223, "grad_norm": 0.8780747056007385, "learning_rate": 0.0002, "epoch": 3.2511518874787813, "step": 20110}, {"loss": 0.5969, "grad_norm": 0.772037148475647, "learning_rate": 0.0002, "epoch": 3.2527685716595265, "step": 20120}, {"loss": 0.5843, "grad_norm": 1.0086580514907837, "learning_rate": 0.0002, "epoch": 3.2543852558402717, "step": 20130}, {"loss": 0.5777, "grad_norm": 0.9360289573669434, "learning_rate": 0.0002, "epoch": 3.256001940021017, "step": 20140}, {"loss": 0.5777, "grad_norm": 1.2099586725234985, "learning_rate": 0.0002, "epoch": 3.257618624201762, "step": 20150}, {"loss": 0.624, "grad_norm": 0.8368481397628784, "learning_rate": 0.0002, "epoch": 3.2592353083825074, "step": 20160}, {"loss": 0.5626, "grad_norm": 0.7391039133071899, "learning_rate": 0.0002, "epoch": 3.2608519925632526, "step": 20170}, {"loss": 0.6041, "grad_norm": 0.9122273325920105, "learning_rate": 0.0002, "epoch": 3.262468676743998, "step": 20180}, {"loss": 0.5868, "grad_norm": 0.8502281904220581, "learning_rate": 0.0002, "epoch": 3.264085360924743, "step": 20190}, {"loss": 0.5841, "grad_norm": 1.0926852226257324, "learning_rate": 0.0002, "epoch": 3.265702045105489, "step": 20200}, {"loss": 0.6027, "grad_norm": 0.7902828454971313, "learning_rate": 0.0002, "epoch": 3.267318729286234, "step": 20210}, {"loss": 0.6089, "grad_norm": 0.8724729418754578, "learning_rate": 0.0002, "epoch": 3.2689354134669792, "step": 20220}, {"loss": 0.6242, "grad_norm": 0.8469277024269104, "learning_rate": 0.0002, "epoch": 3.2705520976477245, "step": 20230}, {"loss": 0.644, "grad_norm": 0.8865092992782593, "learning_rate": 0.0002, "epoch": 3.2721687818284697, "step": 20240}, {"loss": 0.6464, "grad_norm": 1.0979334115982056, "learning_rate": 0.0002, "epoch": 3.273785466009215, "step": 20250}, {"loss": 0.647, "grad_norm": 1.0860793590545654, "learning_rate": 0.0002, "epoch": 3.2754021501899606, "step": 20260}, {"loss": 0.6105, "grad_norm": 0.981745183467865, "learning_rate": 0.0002, "epoch": 3.277018834370706, "step": 20270}, {"loss": 0.627, "grad_norm": 0.9155020713806152, "learning_rate": 0.0002, "epoch": 3.278635518551451, "step": 20280}, {"loss": 0.5899, "grad_norm": 0.8436718583106995, "learning_rate": 0.0002, "epoch": 3.2802522027321963, "step": 20290}, {"loss": 0.6371, "grad_norm": 1.0329409837722778, "learning_rate": 0.0002, "epoch": 3.2818688869129415, "step": 20300}, {"loss": 0.6, "grad_norm": 0.9876394271850586, "learning_rate": 0.0002, "epoch": 3.2834855710936868, "step": 20310}, {"loss": 0.5463, "grad_norm": 0.8052917718887329, "learning_rate": 0.0002, "epoch": 3.285102255274432, "step": 20320}, {"loss": 0.5949, "grad_norm": 0.8390680551528931, "learning_rate": 0.0002, "epoch": 3.2867189394551772, "step": 20330}, {"loss": 0.6492, "grad_norm": 0.9515735507011414, "learning_rate": 0.0002, "epoch": 3.288335623635923, "step": 20340}, {"loss": 0.596, "grad_norm": 0.8028870224952698, "learning_rate": 0.0002, "epoch": 3.289952307816668, "step": 20350}, {"loss": 0.634, "grad_norm": 0.862592339515686, "learning_rate": 0.0002, "epoch": 3.2915689919974134, "step": 20360}, {"loss": 0.6345, "grad_norm": 0.7451621890068054, "learning_rate": 0.0002, "epoch": 3.2931856761781586, "step": 20370}, {"loss": 0.6458, "grad_norm": 0.8966776728630066, "learning_rate": 0.0002, "epoch": 3.294802360358904, "step": 20380}, {"loss": 0.5967, "grad_norm": 0.9289216995239258, "learning_rate": 0.0002, "epoch": 3.296419044539649, "step": 20390}, {"loss": 0.6599, "grad_norm": 0.9649626612663269, "learning_rate": 0.0002, "epoch": 3.2980357287203943, "step": 20400}, {"loss": 0.5781, "grad_norm": 1.1953798532485962, "learning_rate": 0.0002, "epoch": 3.29965241290114, "step": 20410}, {"loss": 0.5997, "grad_norm": 0.8929083943367004, "learning_rate": 0.0002, "epoch": 3.301269097081885, "step": 20420}, {"loss": 0.597, "grad_norm": 0.8922014236450195, "learning_rate": 0.0002, "epoch": 3.3028857812626304, "step": 20430}, {"loss": 0.5766, "grad_norm": 0.9754860401153564, "learning_rate": 0.0002, "epoch": 3.3045024654433757, "step": 20440}, {"loss": 0.5653, "grad_norm": 0.8873140215873718, "learning_rate": 0.0002, "epoch": 3.306119149624121, "step": 20450}, {"loss": 0.6138, "grad_norm": 0.857271671295166, "learning_rate": 0.0002, "epoch": 3.307735833804866, "step": 20460}, {"loss": 0.633, "grad_norm": 0.9022141098976135, "learning_rate": 0.0002, "epoch": 3.3093525179856114, "step": 20470}, {"loss": 0.6654, "grad_norm": 0.8614798188209534, "learning_rate": 0.0002, "epoch": 3.3109692021663566, "step": 20480}, {"loss": 0.6254, "grad_norm": 0.8838164210319519, "learning_rate": 0.0002, "epoch": 3.3125858863471023, "step": 20490}, {"loss": 0.5849, "grad_norm": 0.8709736466407776, "learning_rate": 0.0002, "epoch": 3.3142025705278475, "step": 20500}, {"loss": 0.6146, "grad_norm": 0.9533300995826721, "learning_rate": 0.0002, "epoch": 3.3158192547085927, "step": 20510}, {"loss": 0.6029, "grad_norm": 0.8259269595146179, "learning_rate": 0.0002, "epoch": 3.317435938889338, "step": 20520}, {"loss": 0.6268, "grad_norm": 0.8607608079910278, "learning_rate": 0.0002, "epoch": 3.319052623070083, "step": 20530}, {"loss": 0.5676, "grad_norm": 1.0863020420074463, "learning_rate": 0.0002, "epoch": 3.3206693072508284, "step": 20540}, {"loss": 0.6412, "grad_norm": 1.011489987373352, "learning_rate": 0.0002, "epoch": 3.3222859914315737, "step": 20550}, {"loss": 0.6247, "grad_norm": 0.6952177882194519, "learning_rate": 0.0002, "epoch": 3.3239026756123193, "step": 20560}, {"loss": 0.6229, "grad_norm": 0.9638974070549011, "learning_rate": 0.0002, "epoch": 3.3255193597930646, "step": 20570}, {"loss": 0.5882, "grad_norm": 1.0310138463974, "learning_rate": 0.0002, "epoch": 3.32713604397381, "step": 20580}, {"loss": 0.594, "grad_norm": 0.9371318221092224, "learning_rate": 0.0002, "epoch": 3.328752728154555, "step": 20590}, {"loss": 0.6137, "grad_norm": 0.8756691813468933, "learning_rate": 0.0002, "epoch": 3.3303694123353003, "step": 20600}, {"loss": 0.5994, "grad_norm": 1.054175853729248, "learning_rate": 0.0002, "epoch": 3.3319860965160455, "step": 20610}, {"loss": 0.6169, "grad_norm": 0.9074128270149231, "learning_rate": 0.0002, "epoch": 3.3336027806967907, "step": 20620}, {"loss": 0.6138, "grad_norm": 0.906900942325592, "learning_rate": 0.0002, "epoch": 3.335219464877536, "step": 20630}, {"loss": 0.571, "grad_norm": 0.8689333200454712, "learning_rate": 0.0002, "epoch": 3.3368361490582816, "step": 20640}, {"loss": 0.6079, "grad_norm": 0.9889747500419617, "learning_rate": 0.0002, "epoch": 3.338452833239027, "step": 20650}, {"loss": 0.6073, "grad_norm": 1.0685805082321167, "learning_rate": 0.0002, "epoch": 3.340069517419772, "step": 20660}, {"loss": 0.6091, "grad_norm": 0.7495010495185852, "learning_rate": 0.0002, "epoch": 3.3416862016005173, "step": 20670}, {"loss": 0.5883, "grad_norm": 0.8747848272323608, "learning_rate": 0.0002, "epoch": 3.3433028857812626, "step": 20680}, {"loss": 0.604, "grad_norm": 0.9762673377990723, "learning_rate": 0.0002, "epoch": 3.344919569962008, "step": 20690}, {"loss": 0.6784, "grad_norm": 1.0284489393234253, "learning_rate": 0.0002, "epoch": 3.346536254142753, "step": 20700}, {"loss": 0.6464, "grad_norm": 0.7293812036514282, "learning_rate": 0.0002, "epoch": 3.3481529383234987, "step": 20710}, {"loss": 0.609, "grad_norm": 0.8330199122428894, "learning_rate": 0.0002, "epoch": 3.349769622504244, "step": 20720}, {"loss": 0.5729, "grad_norm": 0.9808499217033386, "learning_rate": 0.0002, "epoch": 3.351386306684989, "step": 20730}, {"loss": 0.6315, "grad_norm": 0.9508825540542603, "learning_rate": 0.0002, "epoch": 3.3530029908657344, "step": 20740}, {"loss": 0.5965, "grad_norm": 0.790483832359314, "learning_rate": 0.0002, "epoch": 3.3546196750464796, "step": 20750}, {"loss": 0.6327, "grad_norm": 1.022793173789978, "learning_rate": 0.0002, "epoch": 3.356236359227225, "step": 20760}, {"loss": 0.6439, "grad_norm": 0.8318950533866882, "learning_rate": 0.0002, "epoch": 3.35785304340797, "step": 20770}, {"loss": 0.6037, "grad_norm": 0.7980858087539673, "learning_rate": 0.0002, "epoch": 3.3594697275887153, "step": 20780}, {"loss": 0.6746, "grad_norm": 0.8114802241325378, "learning_rate": 0.0002, "epoch": 3.361086411769461, "step": 20790}, {"loss": 0.6017, "grad_norm": 0.8522519469261169, "learning_rate": 0.0002, "epoch": 3.3627030959502062, "step": 20800}, {"loss": 0.5864, "grad_norm": 0.9142431616783142, "learning_rate": 0.0002, "epoch": 3.3643197801309515, "step": 20810}, {"loss": 0.6331, "grad_norm": 0.771170437335968, "learning_rate": 0.0002, "epoch": 3.3659364643116967, "step": 20820}, {"loss": 0.5879, "grad_norm": 1.0628231763839722, "learning_rate": 0.0002, "epoch": 3.367553148492442, "step": 20830}, {"loss": 0.6533, "grad_norm": 0.9384352564811707, "learning_rate": 0.0002, "epoch": 3.369169832673187, "step": 20840}, {"loss": 0.6292, "grad_norm": 1.1286591291427612, "learning_rate": 0.0002, "epoch": 3.370786516853933, "step": 20850}, {"loss": 0.5986, "grad_norm": 1.1349513530731201, "learning_rate": 0.0002, "epoch": 3.372403201034678, "step": 20860}, {"loss": 0.6413, "grad_norm": 1.0127464532852173, "learning_rate": 0.0002, "epoch": 3.3740198852154233, "step": 20870}, {"loss": 0.6414, "grad_norm": 0.9111971855163574, "learning_rate": 0.0002, "epoch": 3.3756365693961685, "step": 20880}, {"loss": 0.6101, "grad_norm": 0.871356725692749, "learning_rate": 0.0002, "epoch": 3.3772532535769137, "step": 20890}, {"loss": 0.5995, "grad_norm": 0.7774117588996887, "learning_rate": 0.0002, "epoch": 3.378869937757659, "step": 20900}, {"loss": 0.6062, "grad_norm": 1.0089964866638184, "learning_rate": 0.0002, "epoch": 3.380486621938404, "step": 20910}, {"loss": 0.5908, "grad_norm": 0.7855867147445679, "learning_rate": 0.0002, "epoch": 3.3821033061191494, "step": 20920}, {"loss": 0.6373, "grad_norm": 1.3713710308074951, "learning_rate": 0.0002, "epoch": 3.3837199902998947, "step": 20930}, {"loss": 0.6627, "grad_norm": 0.8599116206169128, "learning_rate": 0.0002, "epoch": 3.3853366744806404, "step": 20940}, {"loss": 0.6224, "grad_norm": 0.9392673373222351, "learning_rate": 0.0002, "epoch": 3.3869533586613856, "step": 20950}, {"loss": 0.5855, "grad_norm": 0.8764075040817261, "learning_rate": 0.0002, "epoch": 3.388570042842131, "step": 20960}, {"loss": 0.5734, "grad_norm": 0.8240136504173279, "learning_rate": 0.0002, "epoch": 3.390186727022876, "step": 20970}, {"loss": 0.5783, "grad_norm": 1.0982369184494019, "learning_rate": 0.0002, "epoch": 3.3918034112036213, "step": 20980}, {"loss": 0.5451, "grad_norm": 1.0599013566970825, "learning_rate": 0.0002, "epoch": 3.3934200953843665, "step": 20990}, {"loss": 0.6356, "grad_norm": 0.895438015460968, "learning_rate": 0.0002, "epoch": 3.395036779565112, "step": 21000}, {"loss": 0.6065, "grad_norm": 0.6974841356277466, "learning_rate": 0.0002, "epoch": 3.3966534637458574, "step": 21010}, {"loss": 0.5704, "grad_norm": 0.9571719765663147, "learning_rate": 0.0002, "epoch": 3.3982701479266026, "step": 21020}, {"loss": 0.679, "grad_norm": 0.831912636756897, "learning_rate": 0.0002, "epoch": 3.399886832107348, "step": 21030}, {"loss": 0.6051, "grad_norm": 0.831936240196228, "learning_rate": 0.0002, "epoch": 3.401503516288093, "step": 21040}, {"loss": 0.5857, "grad_norm": 0.7388373613357544, "learning_rate": 0.0002, "epoch": 3.4031202004688383, "step": 21050}, {"loss": 0.6245, "grad_norm": 0.938667356967926, "learning_rate": 0.0002, "epoch": 3.4047368846495836, "step": 21060}, {"loss": 0.6121, "grad_norm": 0.9202313423156738, "learning_rate": 0.0002, "epoch": 3.406353568830329, "step": 21070}, {"loss": 0.6388, "grad_norm": 0.9888381958007812, "learning_rate": 0.0002, "epoch": 3.4079702530110745, "step": 21080}, {"loss": 0.6245, "grad_norm": 0.8526970744132996, "learning_rate": 0.0002, "epoch": 3.4095869371918197, "step": 21090}, {"loss": 0.5914, "grad_norm": 0.7939383387565613, "learning_rate": 0.0002, "epoch": 3.411203621372565, "step": 21100}, {"loss": 0.6066, "grad_norm": 0.9986352920532227, "learning_rate": 0.0002, "epoch": 3.41282030555331, "step": 21110}, {"loss": 0.5947, "grad_norm": 0.8895300030708313, "learning_rate": 0.0002, "epoch": 3.4144369897340554, "step": 21120}, {"loss": 0.6264, "grad_norm": 0.9559482932090759, "learning_rate": 0.0002, "epoch": 3.4160536739148006, "step": 21130}, {"loss": 0.6491, "grad_norm": 0.8351506590843201, "learning_rate": 0.0002, "epoch": 3.417670358095546, "step": 21140}, {"loss": 0.567, "grad_norm": 0.8224456906318665, "learning_rate": 0.0002, "epoch": 3.4192870422762915, "step": 21150}, {"loss": 0.5871, "grad_norm": 1.0110299587249756, "learning_rate": 0.0002, "epoch": 3.4209037264570368, "step": 21160}, {"loss": 0.6116, "grad_norm": 0.82564777135849, "learning_rate": 0.0002, "epoch": 3.422520410637782, "step": 21170}, {"loss": 0.595, "grad_norm": 1.004738688468933, "learning_rate": 0.0002, "epoch": 3.4241370948185272, "step": 21180}, {"loss": 0.6286, "grad_norm": 0.7545676827430725, "learning_rate": 0.0002, "epoch": 3.4257537789992725, "step": 21190}, {"loss": 0.5868, "grad_norm": 0.8918704390525818, "learning_rate": 0.0002, "epoch": 3.4273704631800177, "step": 21200}, {"loss": 0.6542, "grad_norm": 0.8336876034736633, "learning_rate": 0.0002, "epoch": 3.428987147360763, "step": 21210}, {"loss": 0.5824, "grad_norm": 0.8928771018981934, "learning_rate": 0.0002, "epoch": 3.430603831541508, "step": 21220}, {"loss": 0.6468, "grad_norm": 0.7663705945014954, "learning_rate": 0.0002, "epoch": 3.432220515722254, "step": 21230}, {"loss": 0.6693, "grad_norm": 0.8392598628997803, "learning_rate": 0.0002, "epoch": 3.433837199902999, "step": 21240}, {"loss": 0.5971, "grad_norm": 0.8819600343704224, "learning_rate": 0.0002, "epoch": 3.4354538840837443, "step": 21250}, {"loss": 0.6791, "grad_norm": 0.9124642014503479, "learning_rate": 0.0002, "epoch": 3.4370705682644895, "step": 21260}, {"loss": 0.5925, "grad_norm": 0.8329763412475586, "learning_rate": 0.0002, "epoch": 3.4386872524452348, "step": 21270}, {"loss": 0.6541, "grad_norm": 0.9982839822769165, "learning_rate": 0.0002, "epoch": 3.44030393662598, "step": 21280}, {"loss": 0.6441, "grad_norm": 0.9105954766273499, "learning_rate": 0.0002, "epoch": 3.4419206208067252, "step": 21290}, {"loss": 0.6028, "grad_norm": 0.8182359337806702, "learning_rate": 0.0002, "epoch": 3.443537304987471, "step": 21300}, {"loss": 0.5991, "grad_norm": 1.0568904876708984, "learning_rate": 0.0002, "epoch": 3.445153989168216, "step": 21310}, {"loss": 0.6117, "grad_norm": 0.968539834022522, "learning_rate": 0.0002, "epoch": 3.4467706733489614, "step": 21320}, {"loss": 0.6219, "grad_norm": 0.8774511218070984, "learning_rate": 0.0002, "epoch": 3.4483873575297066, "step": 21330}, {"loss": 0.6438, "grad_norm": 0.7598156332969666, "learning_rate": 0.0002, "epoch": 3.450004041710452, "step": 21340}, {"loss": 0.6033, "grad_norm": 1.1012897491455078, "learning_rate": 0.0002, "epoch": 3.451620725891197, "step": 21350}, {"loss": 0.6137, "grad_norm": 0.8040637373924255, "learning_rate": 0.0002, "epoch": 3.4532374100719423, "step": 21360}, {"loss": 0.6173, "grad_norm": 0.8497496247291565, "learning_rate": 0.0002, "epoch": 3.4548540942526875, "step": 21370}, {"loss": 0.6005, "grad_norm": 0.8429915904998779, "learning_rate": 0.0002, "epoch": 3.456470778433433, "step": 21380}, {"loss": 0.6182, "grad_norm": 0.8107112646102905, "learning_rate": 0.0002, "epoch": 3.4580874626141784, "step": 21390}, {"loss": 0.6109, "grad_norm": 1.00872004032135, "learning_rate": 0.0002, "epoch": 3.4597041467949237, "step": 21400}, {"loss": 0.5712, "grad_norm": 0.8266542553901672, "learning_rate": 0.0002, "epoch": 3.461320830975669, "step": 21410}, {"loss": 0.6457, "grad_norm": 0.8972568511962891, "learning_rate": 0.0002, "epoch": 3.462937515156414, "step": 21420}, {"loss": 0.6081, "grad_norm": 1.0781476497650146, "learning_rate": 0.0002, "epoch": 3.4645541993371594, "step": 21430}, {"loss": 0.6303, "grad_norm": 0.9571592807769775, "learning_rate": 0.0002, "epoch": 3.4661708835179046, "step": 21440}, {"loss": 0.6309, "grad_norm": 0.881547212600708, "learning_rate": 0.0002, "epoch": 3.4677875676986503, "step": 21450}, {"loss": 0.6076, "grad_norm": 0.6955338716506958, "learning_rate": 0.0002, "epoch": 3.4694042518793955, "step": 21460}, {"loss": 0.6205, "grad_norm": 0.901187539100647, "learning_rate": 0.0002, "epoch": 3.4710209360601407, "step": 21470}, {"loss": 0.639, "grad_norm": 0.7063511610031128, "learning_rate": 0.0002, "epoch": 3.472637620240886, "step": 21480}, {"loss": 0.6154, "grad_norm": 0.8462792038917542, "learning_rate": 0.0002, "epoch": 3.474254304421631, "step": 21490}, {"loss": 0.61, "grad_norm": 1.1861060857772827, "learning_rate": 0.0002, "epoch": 3.4758709886023764, "step": 21500}, {"loss": 0.6586, "grad_norm": 0.70503169298172, "learning_rate": 0.0002, "epoch": 3.4774876727831217, "step": 21510}, {"loss": 0.6475, "grad_norm": 0.9650066494941711, "learning_rate": 0.0002, "epoch": 3.479104356963867, "step": 21520}, {"loss": 0.6452, "grad_norm": 1.0266852378845215, "learning_rate": 0.0002, "epoch": 3.4807210411446126, "step": 21530}, {"loss": 0.6553, "grad_norm": 0.956372857093811, "learning_rate": 0.0002, "epoch": 3.482337725325358, "step": 21540}, {"loss": 0.6667, "grad_norm": 0.8848432898521423, "learning_rate": 0.0002, "epoch": 3.483954409506103, "step": 21550}, {"loss": 0.6375, "grad_norm": 1.0805351734161377, "learning_rate": 0.0002, "epoch": 3.4855710936868483, "step": 21560}, {"loss": 0.6958, "grad_norm": 0.9279725551605225, "learning_rate": 0.0002, "epoch": 3.4871877778675935, "step": 21570}, {"loss": 0.6354, "grad_norm": 0.9049562215805054, "learning_rate": 0.0002, "epoch": 3.4888044620483387, "step": 21580}, {"loss": 0.6071, "grad_norm": 0.9619429111480713, "learning_rate": 0.0002, "epoch": 3.4904211462290844, "step": 21590}, {"loss": 0.5927, "grad_norm": 0.8508906960487366, "learning_rate": 0.0002, "epoch": 3.4920378304098296, "step": 21600}, {"loss": 0.6115, "grad_norm": 0.8692502379417419, "learning_rate": 0.0002, "epoch": 3.493654514590575, "step": 21610}, {"loss": 0.5878, "grad_norm": 0.8187332153320312, "learning_rate": 0.0002, "epoch": 3.49527119877132, "step": 21620}, {"loss": 0.5874, "grad_norm": 1.145400047302246, "learning_rate": 0.0002, "epoch": 3.4968878829520653, "step": 21630}, {"loss": 0.6313, "grad_norm": 0.8281388282775879, "learning_rate": 0.0002, "epoch": 3.4985045671328105, "step": 21640}, {"loss": 0.6624, "grad_norm": 0.82256019115448, "learning_rate": 0.0002, "epoch": 3.500121251313556, "step": 21650}, {"loss": 0.6346, "grad_norm": 0.9315484762191772, "learning_rate": 0.0002, "epoch": 3.501737935494301, "step": 21660}, {"loss": 0.6086, "grad_norm": 0.7626111507415771, "learning_rate": 0.0002, "epoch": 3.5033546196750462, "step": 21670}, {"loss": 0.6177, "grad_norm": 0.9275059103965759, "learning_rate": 0.0002, "epoch": 3.504971303855792, "step": 21680}, {"loss": 0.64, "grad_norm": 0.7906724810600281, "learning_rate": 0.0002, "epoch": 3.506587988036537, "step": 21690}, {"loss": 0.6015, "grad_norm": 0.8289761543273926, "learning_rate": 0.0002, "epoch": 3.5082046722172824, "step": 21700}, {"loss": 0.6246, "grad_norm": 0.8316431045532227, "learning_rate": 0.0002, "epoch": 3.5098213563980276, "step": 21710}, {"loss": 0.619, "grad_norm": 1.0451812744140625, "learning_rate": 0.0002, "epoch": 3.511438040578773, "step": 21720}, {"loss": 0.632, "grad_norm": 0.928252637386322, "learning_rate": 0.0002, "epoch": 3.513054724759518, "step": 21730}, {"loss": 0.6062, "grad_norm": 0.7985895276069641, "learning_rate": 0.0002, "epoch": 3.5146714089402638, "step": 21740}, {"loss": 0.6463, "grad_norm": 0.6740974187850952, "learning_rate": 0.0002, "epoch": 3.516288093121009, "step": 21750}, {"loss": 0.6138, "grad_norm": 0.8482223749160767, "learning_rate": 0.0002, "epoch": 3.517904777301754, "step": 21760}, {"loss": 0.6277, "grad_norm": 0.889947772026062, "learning_rate": 0.0002, "epoch": 3.5195214614824994, "step": 21770}, {"loss": 0.6174, "grad_norm": 0.8304598927497864, "learning_rate": 0.0002, "epoch": 3.5211381456632447, "step": 21780}, {"loss": 0.6156, "grad_norm": 0.8002981543540955, "learning_rate": 0.0002, "epoch": 3.52275482984399, "step": 21790}, {"loss": 0.5896, "grad_norm": 0.8115083575248718, "learning_rate": 0.0002, "epoch": 3.524371514024735, "step": 21800}, {"loss": 0.6041, "grad_norm": 0.9715048670768738, "learning_rate": 0.0002, "epoch": 3.5259881982054804, "step": 21810}, {"loss": 0.6715, "grad_norm": 1.0910786390304565, "learning_rate": 0.0002, "epoch": 3.5276048823862256, "step": 21820}, {"loss": 0.6543, "grad_norm": 0.8438942432403564, "learning_rate": 0.0002, "epoch": 3.5292215665669713, "step": 21830}, {"loss": 0.6509, "grad_norm": 0.8813382983207703, "learning_rate": 0.0002, "epoch": 3.5308382507477165, "step": 21840}, {"loss": 0.6049, "grad_norm": 0.7092908024787903, "learning_rate": 0.0002, "epoch": 3.5324549349284617, "step": 21850}, {"loss": 0.5678, "grad_norm": 0.8332187533378601, "learning_rate": 0.0002, "epoch": 3.534071619109207, "step": 21860}, {"loss": 0.5896, "grad_norm": 0.8958209156990051, "learning_rate": 0.0002, "epoch": 3.535688303289952, "step": 21870}, {"loss": 0.6476, "grad_norm": 0.824138879776001, "learning_rate": 0.0002, "epoch": 3.5373049874706974, "step": 21880}, {"loss": 0.6022, "grad_norm": 0.8375158309936523, "learning_rate": 0.0002, "epoch": 3.538921671651443, "step": 21890}, {"loss": 0.6019, "grad_norm": 1.0274608135223389, "learning_rate": 0.0002, "epoch": 3.5405383558321883, "step": 21900}, {"loss": 0.6194, "grad_norm": 0.7088932394981384, "learning_rate": 0.0002, "epoch": 3.5421550400129336, "step": 21910}, {"loss": 0.6554, "grad_norm": 0.8172445297241211, "learning_rate": 0.0002, "epoch": 3.543771724193679, "step": 21920}, {"loss": 0.6711, "grad_norm": 0.9904135465621948, "learning_rate": 0.0002, "epoch": 3.545388408374424, "step": 21930}, {"loss": 0.6001, "grad_norm": 0.9900432229042053, "learning_rate": 0.0002, "epoch": 3.5470050925551693, "step": 21940}, {"loss": 0.6195, "grad_norm": 0.8963301181793213, "learning_rate": 0.0002, "epoch": 3.5486217767359145, "step": 21950}, {"loss": 0.5972, "grad_norm": 0.8551464676856995, "learning_rate": 0.0002, "epoch": 3.5502384609166597, "step": 21960}, {"loss": 0.6206, "grad_norm": 1.0916603803634644, "learning_rate": 0.0002, "epoch": 3.551855145097405, "step": 21970}, {"loss": 0.6523, "grad_norm": 0.841598391532898, "learning_rate": 0.0002, "epoch": 3.5534718292781506, "step": 21980}, {"loss": 0.617, "grad_norm": 0.8566757440567017, "learning_rate": 0.0002, "epoch": 3.555088513458896, "step": 21990}, {"loss": 0.6192, "grad_norm": 1.0145052671432495, "learning_rate": 0.0002, "epoch": 3.556705197639641, "step": 22000}, {"loss": 0.6173, "grad_norm": 0.9293754696846008, "learning_rate": 0.0002, "epoch": 3.5583218818203863, "step": 22010}, {"loss": 0.612, "grad_norm": 0.9568536281585693, "learning_rate": 0.0002, "epoch": 3.5599385660011316, "step": 22020}, {"loss": 0.641, "grad_norm": 0.8613139986991882, "learning_rate": 0.0002, "epoch": 3.5615552501818772, "step": 22030}, {"loss": 0.6496, "grad_norm": 0.8179237246513367, "learning_rate": 0.0002, "epoch": 3.5631719343626225, "step": 22040}, {"loss": 0.574, "grad_norm": 0.9059830904006958, "learning_rate": 0.0002, "epoch": 3.5647886185433677, "step": 22050}, {"loss": 0.6448, "grad_norm": 1.0068252086639404, "learning_rate": 0.0002, "epoch": 3.566405302724113, "step": 22060}, {"loss": 0.6239, "grad_norm": 0.9682072997093201, "learning_rate": 0.0002, "epoch": 3.568021986904858, "step": 22070}, {"loss": 0.6808, "grad_norm": 0.8514005541801453, "learning_rate": 0.0002, "epoch": 3.5696386710856034, "step": 22080}, {"loss": 0.5956, "grad_norm": 0.8327770829200745, "learning_rate": 0.0002, "epoch": 3.5712553552663486, "step": 22090}, {"loss": 0.5976, "grad_norm": 1.024976372718811, "learning_rate": 0.0002, "epoch": 3.572872039447094, "step": 22100}, {"loss": 0.624, "grad_norm": 0.7721174955368042, "learning_rate": 0.0002, "epoch": 3.574488723627839, "step": 22110}, {"loss": 0.5896, "grad_norm": 1.0351054668426514, "learning_rate": 0.0002, "epoch": 3.5761054078085843, "step": 22120}, {"loss": 0.6379, "grad_norm": 0.9680907130241394, "learning_rate": 0.0002, "epoch": 3.57772209198933, "step": 22130}, {"loss": 0.6194, "grad_norm": 0.8016974925994873, "learning_rate": 0.0002, "epoch": 3.5793387761700752, "step": 22140}, {"loss": 0.6387, "grad_norm": 1.0109003782272339, "learning_rate": 0.0002, "epoch": 3.5809554603508205, "step": 22150}, {"loss": 0.6368, "grad_norm": 1.0473392009735107, "learning_rate": 0.0002, "epoch": 3.5825721445315657, "step": 22160}, {"loss": 0.6353, "grad_norm": 0.8686613440513611, "learning_rate": 0.0002, "epoch": 3.584188828712311, "step": 22170}, {"loss": 0.5791, "grad_norm": 0.869149923324585, "learning_rate": 0.0002, "epoch": 3.5858055128930566, "step": 22180}, {"loss": 0.5895, "grad_norm": 0.9769062995910645, "learning_rate": 0.0002, "epoch": 3.587422197073802, "step": 22190}, {"loss": 0.5939, "grad_norm": 0.779636561870575, "learning_rate": 0.0002, "epoch": 3.589038881254547, "step": 22200}, {"loss": 0.5875, "grad_norm": 0.9063841104507446, "learning_rate": 0.0002, "epoch": 3.5906555654352923, "step": 22210}, {"loss": 0.5671, "grad_norm": 0.9216037392616272, "learning_rate": 0.0002, "epoch": 3.5922722496160375, "step": 22220}, {"loss": 0.6484, "grad_norm": 1.0217336416244507, "learning_rate": 0.0002, "epoch": 3.5938889337967828, "step": 22230}, {"loss": 0.6511, "grad_norm": 0.8513161540031433, "learning_rate": 0.0002, "epoch": 3.595505617977528, "step": 22240}, {"loss": 0.6301, "grad_norm": 0.8084813952445984, "learning_rate": 0.0002, "epoch": 3.597122302158273, "step": 22250}, {"loss": 0.6197, "grad_norm": 0.8524802923202515, "learning_rate": 0.0002, "epoch": 3.5987389863390185, "step": 22260}, {"loss": 0.5599, "grad_norm": 0.9356237649917603, "learning_rate": 0.0002, "epoch": 3.600355670519764, "step": 22270}, {"loss": 0.628, "grad_norm": 1.009600281715393, "learning_rate": 0.0002, "epoch": 3.6019723547005094, "step": 22280}, {"loss": 0.6179, "grad_norm": 0.9900581240653992, "learning_rate": 0.0002, "epoch": 3.6035890388812546, "step": 22290}, {"loss": 0.5725, "grad_norm": 1.062495231628418, "learning_rate": 0.0002, "epoch": 3.605205723062, "step": 22300}, {"loss": 0.607, "grad_norm": 0.8832381367683411, "learning_rate": 0.0002, "epoch": 3.606822407242745, "step": 22310}, {"loss": 0.6215, "grad_norm": 0.9284297823905945, "learning_rate": 0.0002, "epoch": 3.6084390914234903, "step": 22320}, {"loss": 0.685, "grad_norm": 1.2381829023361206, "learning_rate": 0.0002, "epoch": 3.610055775604236, "step": 22330}, {"loss": 0.6181, "grad_norm": 0.929434597492218, "learning_rate": 0.0002, "epoch": 3.611672459784981, "step": 22340}, {"loss": 0.6141, "grad_norm": 0.9714490175247192, "learning_rate": 0.0002, "epoch": 3.6132891439657264, "step": 22350}, {"loss": 0.6861, "grad_norm": 0.808014988899231, "learning_rate": 0.0002, "epoch": 3.6149058281464717, "step": 22360}, {"loss": 0.6428, "grad_norm": 1.0364398956298828, "learning_rate": 0.0002, "epoch": 3.616522512327217, "step": 22370}, {"loss": 0.6337, "grad_norm": 0.7858489751815796, "learning_rate": 0.0002, "epoch": 3.618139196507962, "step": 22380}, {"loss": 0.6214, "grad_norm": 0.9920870065689087, "learning_rate": 0.0002, "epoch": 3.6197558806887074, "step": 22390}, {"loss": 0.6659, "grad_norm": 0.9183220863342285, "learning_rate": 0.0002, "epoch": 3.6213725648694526, "step": 22400}, {"loss": 0.6036, "grad_norm": 0.9826246500015259, "learning_rate": 0.0002, "epoch": 3.622989249050198, "step": 22410}, {"loss": 0.6441, "grad_norm": 0.8632931113243103, "learning_rate": 0.0002, "epoch": 3.6246059332309435, "step": 22420}, {"loss": 0.6124, "grad_norm": 0.8468965291976929, "learning_rate": 0.0002, "epoch": 3.6262226174116887, "step": 22430}, {"loss": 0.6328, "grad_norm": 0.8466871976852417, "learning_rate": 0.0002, "epoch": 3.627839301592434, "step": 22440}, {"loss": 0.5941, "grad_norm": 0.9501169919967651, "learning_rate": 0.0002, "epoch": 3.629455985773179, "step": 22450}, {"loss": 0.6069, "grad_norm": 0.8906720876693726, "learning_rate": 0.0002, "epoch": 3.6310726699539244, "step": 22460}, {"loss": 0.6928, "grad_norm": 0.7400227189064026, "learning_rate": 0.0002, "epoch": 3.6326893541346696, "step": 22470}, {"loss": 0.6337, "grad_norm": 0.9756355881690979, "learning_rate": 0.0002, "epoch": 3.6343060383154153, "step": 22480}, {"loss": 0.6203, "grad_norm": 0.7504993081092834, "learning_rate": 0.0002, "epoch": 3.6359227224961606, "step": 22490}, {"loss": 0.6302, "grad_norm": 0.9270039200782776, "learning_rate": 0.0002, "epoch": 3.637539406676906, "step": 22500}, {"loss": 0.6026, "grad_norm": 0.8841686844825745, "learning_rate": 0.0002, "epoch": 3.639156090857651, "step": 22510}, {"loss": 0.6098, "grad_norm": 0.8533213138580322, "learning_rate": 0.0002, "epoch": 3.6407727750383962, "step": 22520}, {"loss": 0.6412, "grad_norm": 1.0052043199539185, "learning_rate": 0.0002, "epoch": 3.6423894592191415, "step": 22530}, {"loss": 0.6363, "grad_norm": 1.0323461294174194, "learning_rate": 0.0002, "epoch": 3.6440061433998867, "step": 22540}, {"loss": 0.6545, "grad_norm": 0.8654312491416931, "learning_rate": 0.0002, "epoch": 3.645622827580632, "step": 22550}, {"loss": 0.6155, "grad_norm": 0.6400038003921509, "learning_rate": 0.0002, "epoch": 3.647239511761377, "step": 22560}, {"loss": 0.5829, "grad_norm": 0.8061298727989197, "learning_rate": 0.0002, "epoch": 3.648856195942123, "step": 22570}, {"loss": 0.6388, "grad_norm": 0.9257854223251343, "learning_rate": 0.0002, "epoch": 3.650472880122868, "step": 22580}, {"loss": 0.6409, "grad_norm": 0.8439396619796753, "learning_rate": 0.0002, "epoch": 3.6520895643036133, "step": 22590}, {"loss": 0.5996, "grad_norm": 0.7764544486999512, "learning_rate": 0.0002, "epoch": 3.6537062484843585, "step": 22600}, {"loss": 0.6434, "grad_norm": 1.125451683998108, "learning_rate": 0.0002, "epoch": 3.6553229326651038, "step": 22610}, {"loss": 0.6579, "grad_norm": 0.7523018717765808, "learning_rate": 0.0002, "epoch": 3.656939616845849, "step": 22620}, {"loss": 0.6476, "grad_norm": 1.071026086807251, "learning_rate": 0.0002, "epoch": 3.6585563010265947, "step": 22630}, {"loss": 0.6459, "grad_norm": 0.945791482925415, "learning_rate": 0.0002, "epoch": 3.66017298520734, "step": 22640}, {"loss": 0.659, "grad_norm": 0.8001811504364014, "learning_rate": 0.0002, "epoch": 3.661789669388085, "step": 22650}, {"loss": 0.6385, "grad_norm": 0.9700816869735718, "learning_rate": 0.0002, "epoch": 3.6634063535688304, "step": 22660}, {"loss": 0.6337, "grad_norm": 0.9053242206573486, "learning_rate": 0.0002, "epoch": 3.6650230377495756, "step": 22670}, {"loss": 0.6335, "grad_norm": 0.944362461566925, "learning_rate": 0.0002, "epoch": 3.666639721930321, "step": 22680}, {"loss": 0.6235, "grad_norm": 1.067489504814148, "learning_rate": 0.0002, "epoch": 3.668256406111066, "step": 22690}, {"loss": 0.698, "grad_norm": 1.0984995365142822, "learning_rate": 0.0002, "epoch": 3.6698730902918113, "step": 22700}, {"loss": 0.6717, "grad_norm": 0.9336317777633667, "learning_rate": 0.0002, "epoch": 3.6714897744725565, "step": 22710}, {"loss": 0.6195, "grad_norm": 0.9261918663978577, "learning_rate": 0.0002, "epoch": 3.673106458653302, "step": 22720}, {"loss": 0.6332, "grad_norm": 0.8648008704185486, "learning_rate": 0.0002, "epoch": 3.6747231428340474, "step": 22730}, {"loss": 0.6576, "grad_norm": 0.7225083708763123, "learning_rate": 0.0002, "epoch": 3.6763398270147927, "step": 22740}, {"loss": 0.6406, "grad_norm": 0.9258282780647278, "learning_rate": 0.0002, "epoch": 3.677956511195538, "step": 22750}, {"loss": 0.6397, "grad_norm": 0.70876145362854, "learning_rate": 0.0002, "epoch": 3.679573195376283, "step": 22760}, {"loss": 0.6821, "grad_norm": 0.8780210018157959, "learning_rate": 0.0002, "epoch": 3.681189879557029, "step": 22770}, {"loss": 0.6036, "grad_norm": 0.8075440526008606, "learning_rate": 0.0002, "epoch": 3.682806563737774, "step": 22780}, {"loss": 0.6561, "grad_norm": 0.8503130674362183, "learning_rate": 0.0002, "epoch": 3.6844232479185193, "step": 22790}, {"loss": 0.6082, "grad_norm": 0.8413618206977844, "learning_rate": 0.0002, "epoch": 3.6860399320992645, "step": 22800}, {"loss": 0.614, "grad_norm": 0.8675165176391602, "learning_rate": 0.0002, "epoch": 3.6876566162800097, "step": 22810}, {"loss": 0.6157, "grad_norm": 0.8235884308815002, "learning_rate": 0.0002, "epoch": 3.689273300460755, "step": 22820}, {"loss": 0.5708, "grad_norm": 0.9477725625038147, "learning_rate": 0.0002, "epoch": 3.6908899846415, "step": 22830}, {"loss": 0.6481, "grad_norm": 0.7883533835411072, "learning_rate": 0.0002, "epoch": 3.6925066688222454, "step": 22840}, {"loss": 0.5872, "grad_norm": 1.047913908958435, "learning_rate": 0.0002, "epoch": 3.6941233530029907, "step": 22850}, {"loss": 0.6176, "grad_norm": 0.9171528816223145, "learning_rate": 0.0002, "epoch": 3.695740037183736, "step": 22860}, {"loss": 0.6204, "grad_norm": 0.9338192343711853, "learning_rate": 0.0002, "epoch": 3.6973567213644816, "step": 22870}, {"loss": 0.686, "grad_norm": 0.8799443244934082, "learning_rate": 0.0002, "epoch": 3.698973405545227, "step": 22880}, {"loss": 0.6206, "grad_norm": 0.8515434861183167, "learning_rate": 0.0002, "epoch": 3.700590089725972, "step": 22890}, {"loss": 0.5954, "grad_norm": 0.7805591821670532, "learning_rate": 0.0002, "epoch": 3.7022067739067173, "step": 22900}, {"loss": 0.6108, "grad_norm": 0.8470911979675293, "learning_rate": 0.0002, "epoch": 3.7038234580874625, "step": 22910}, {"loss": 0.6557, "grad_norm": 0.9452309012413025, "learning_rate": 0.0002, "epoch": 3.705440142268208, "step": 22920}, {"loss": 0.6529, "grad_norm": 0.950243353843689, "learning_rate": 0.0002, "epoch": 3.7070568264489534, "step": 22930}, {"loss": 0.6364, "grad_norm": 0.7882499098777771, "learning_rate": 0.0002, "epoch": 3.7086735106296986, "step": 22940}, {"loss": 0.6462, "grad_norm": 0.8307787775993347, "learning_rate": 0.0002, "epoch": 3.710290194810444, "step": 22950}, {"loss": 0.6371, "grad_norm": 1.0970630645751953, "learning_rate": 0.0002, "epoch": 3.711906878991189, "step": 22960}, {"loss": 0.6281, "grad_norm": 0.8269566297531128, "learning_rate": 0.0002, "epoch": 3.7135235631719343, "step": 22970}, {"loss": 0.6561, "grad_norm": 0.8306704759597778, "learning_rate": 0.0002, "epoch": 3.7151402473526796, "step": 22980}, {"loss": 0.6418, "grad_norm": 0.9710225462913513, "learning_rate": 0.0002, "epoch": 3.716756931533425, "step": 22990}, {"loss": 0.6639, "grad_norm": 0.8890530467033386, "learning_rate": 0.0002, "epoch": 3.71837361571417, "step": 23000}, {"loss": 0.6084, "grad_norm": 0.883522629737854, "learning_rate": 0.0002, "epoch": 3.7199902998949153, "step": 23010}, {"loss": 0.6183, "grad_norm": 0.8662652373313904, "learning_rate": 0.0002, "epoch": 3.721606984075661, "step": 23020}, {"loss": 0.6266, "grad_norm": 0.7228406667709351, "learning_rate": 0.0002, "epoch": 3.723223668256406, "step": 23030}, {"loss": 0.6417, "grad_norm": 1.060792088508606, "learning_rate": 0.0002, "epoch": 3.7248403524371514, "step": 23040}, {"loss": 0.6346, "grad_norm": 1.0119613409042358, "learning_rate": 0.0002, "epoch": 3.7264570366178966, "step": 23050}, {"loss": 0.6466, "grad_norm": 0.9212996959686279, "learning_rate": 0.0002, "epoch": 3.728073720798642, "step": 23060}, {"loss": 0.6454, "grad_norm": 0.925690233707428, "learning_rate": 0.0002, "epoch": 3.7296904049793875, "step": 23070}, {"loss": 0.615, "grad_norm": 0.8323310613632202, "learning_rate": 0.0002, "epoch": 3.7313070891601328, "step": 23080}, {"loss": 0.679, "grad_norm": 0.8966048955917358, "learning_rate": 0.0002, "epoch": 3.732923773340878, "step": 23090}, {"loss": 0.6151, "grad_norm": 0.8995837569236755, "learning_rate": 0.0002, "epoch": 3.7345404575216232, "step": 23100}, {"loss": 0.6143, "grad_norm": 0.8748890161514282, "learning_rate": 0.0002, "epoch": 3.7361571417023685, "step": 23110}, {"loss": 0.6246, "grad_norm": 0.7985540628433228, "learning_rate": 0.0002, "epoch": 3.7377738258831137, "step": 23120}, {"loss": 0.6279, "grad_norm": 1.0240917205810547, "learning_rate": 0.0002, "epoch": 3.739390510063859, "step": 23130}, {"loss": 0.6747, "grad_norm": 0.9181789755821228, "learning_rate": 0.0002, "epoch": 3.741007194244604, "step": 23140}, {"loss": 0.6026, "grad_norm": 0.8896583914756775, "learning_rate": 0.0002, "epoch": 3.7426238784253494, "step": 23150}, {"loss": 0.5972, "grad_norm": 0.8635515570640564, "learning_rate": 0.0002, "epoch": 3.744240562606095, "step": 23160}, {"loss": 0.6683, "grad_norm": 0.8873575329780579, "learning_rate": 0.0002, "epoch": 3.7458572467868403, "step": 23170}, {"loss": 0.6143, "grad_norm": 0.9807148575782776, "learning_rate": 0.0002, "epoch": 3.7474739309675855, "step": 23180}, {"loss": 0.6381, "grad_norm": 0.900477945804596, "learning_rate": 0.0002, "epoch": 3.7490906151483308, "step": 23190}, {"loss": 0.6542, "grad_norm": 0.9379992485046387, "learning_rate": 0.0002, "epoch": 3.750707299329076, "step": 23200}, {"loss": 0.6015, "grad_norm": 0.9649890661239624, "learning_rate": 0.0002, "epoch": 3.752323983509821, "step": 23210}, {"loss": 0.6735, "grad_norm": 0.824442446231842, "learning_rate": 0.0002, "epoch": 3.753940667690567, "step": 23220}, {"loss": 0.5992, "grad_norm": 0.8896150588989258, "learning_rate": 0.0002, "epoch": 3.755557351871312, "step": 23230}, {"loss": 0.6081, "grad_norm": 0.751249372959137, "learning_rate": 0.0002, "epoch": 3.7571740360520574, "step": 23240}, {"loss": 0.629, "grad_norm": 0.9392193555831909, "learning_rate": 0.0002, "epoch": 3.7587907202328026, "step": 23250}, {"loss": 0.6209, "grad_norm": 0.9284586310386658, "learning_rate": 0.0002, "epoch": 3.760407404413548, "step": 23260}, {"loss": 0.6414, "grad_norm": 0.7738175392150879, "learning_rate": 0.0002, "epoch": 3.762024088594293, "step": 23270}, {"loss": 0.6743, "grad_norm": 0.9252978563308716, "learning_rate": 0.0002, "epoch": 3.7636407727750383, "step": 23280}, {"loss": 0.5984, "grad_norm": 0.9501895904541016, "learning_rate": 0.0002, "epoch": 3.7652574569557835, "step": 23290}, {"loss": 0.6568, "grad_norm": 0.9416276216506958, "learning_rate": 0.0002, "epoch": 3.7668741411365287, "step": 23300}, {"loss": 0.6507, "grad_norm": 0.7076631784439087, "learning_rate": 0.0002, "epoch": 3.7684908253172744, "step": 23310}, {"loss": 0.6329, "grad_norm": 0.9864492416381836, "learning_rate": 0.0002, "epoch": 3.7701075094980196, "step": 23320}, {"loss": 0.6537, "grad_norm": 0.8450456261634827, "learning_rate": 0.0002, "epoch": 3.771724193678765, "step": 23330}, {"loss": 0.658, "grad_norm": 1.0768941640853882, "learning_rate": 0.0002, "epoch": 3.77334087785951, "step": 23340}, {"loss": 0.6408, "grad_norm": 0.9956819415092468, "learning_rate": 0.0002, "epoch": 3.7749575620402553, "step": 23350}, {"loss": 0.6464, "grad_norm": 0.9234658479690552, "learning_rate": 0.0002, "epoch": 3.7765742462210006, "step": 23360}, {"loss": 0.6542, "grad_norm": 1.0993858575820923, "learning_rate": 0.0002, "epoch": 3.7781909304017463, "step": 23370}, {"loss": 0.6391, "grad_norm": 0.923159658908844, "learning_rate": 0.0002, "epoch": 3.7798076145824915, "step": 23380}, {"loss": 0.6625, "grad_norm": 0.9311541318893433, "learning_rate": 0.0002, "epoch": 3.7814242987632367, "step": 23390}, {"loss": 0.6535, "grad_norm": 0.919681191444397, "learning_rate": 0.0002, "epoch": 3.783040982943982, "step": 23400}, {"loss": 0.6138, "grad_norm": 1.7406195402145386, "learning_rate": 0.0002, "epoch": 3.784657667124727, "step": 23410}, {"loss": 0.657, "grad_norm": 0.7789074182510376, "learning_rate": 0.0002, "epoch": 3.7862743513054724, "step": 23420}, {"loss": 0.658, "grad_norm": 0.8302814960479736, "learning_rate": 0.0002, "epoch": 3.7878910354862176, "step": 23430}, {"loss": 0.649, "grad_norm": 0.8089349269866943, "learning_rate": 0.0002, "epoch": 3.789507719666963, "step": 23440}, {"loss": 0.6682, "grad_norm": 0.9006284475326538, "learning_rate": 0.0002, "epoch": 3.791124403847708, "step": 23450}, {"loss": 0.6335, "grad_norm": 0.8426766991615295, "learning_rate": 0.0002, "epoch": 3.7927410880284538, "step": 23460}, {"loss": 0.6364, "grad_norm": 1.2576252222061157, "learning_rate": 0.0002, "epoch": 3.794357772209199, "step": 23470}, {"loss": 0.6324, "grad_norm": 1.0307610034942627, "learning_rate": 0.0002, "epoch": 3.7959744563899442, "step": 23480}, {"loss": 0.6262, "grad_norm": 0.8525972962379456, "learning_rate": 0.0002, "epoch": 3.7975911405706895, "step": 23490}, {"loss": 0.6757, "grad_norm": 1.159039855003357, "learning_rate": 0.0002, "epoch": 3.7992078247514347, "step": 23500}, {"loss": 0.6414, "grad_norm": 1.4193549156188965, "learning_rate": 0.0002, "epoch": 3.80082450893218, "step": 23510}, {"loss": 0.6413, "grad_norm": 0.8245543837547302, "learning_rate": 0.0002, "epoch": 3.8024411931129256, "step": 23520}, {"loss": 0.6417, "grad_norm": 0.8847230076789856, "learning_rate": 0.0002, "epoch": 3.804057877293671, "step": 23530}, {"loss": 0.6415, "grad_norm": 0.9574624300003052, "learning_rate": 0.0002, "epoch": 3.805674561474416, "step": 23540}, {"loss": 0.5765, "grad_norm": 1.048020601272583, "learning_rate": 0.0002, "epoch": 3.8072912456551613, "step": 23550}, {"loss": 0.6497, "grad_norm": 0.8302255868911743, "learning_rate": 0.0002, "epoch": 3.8089079298359065, "step": 23560}, {"loss": 0.6534, "grad_norm": 0.8269215822219849, "learning_rate": 0.0002, "epoch": 3.8105246140166518, "step": 23570}, {"loss": 0.6294, "grad_norm": 0.9375753402709961, "learning_rate": 0.0002, "epoch": 3.812141298197397, "step": 23580}, {"loss": 0.6132, "grad_norm": 1.0234097242355347, "learning_rate": 0.0002, "epoch": 3.8137579823781422, "step": 23590}, {"loss": 0.6625, "grad_norm": 0.8978445529937744, "learning_rate": 0.0002, "epoch": 3.8153746665588875, "step": 23600}, {"loss": 0.6315, "grad_norm": 0.7929515838623047, "learning_rate": 0.0002, "epoch": 3.816991350739633, "step": 23610}, {"loss": 0.6387, "grad_norm": 1.3255881071090698, "learning_rate": 0.0002, "epoch": 3.8186080349203784, "step": 23620}, {"loss": 0.5947, "grad_norm": 0.9188598990440369, "learning_rate": 0.0002, "epoch": 3.8202247191011236, "step": 23630}, {"loss": 0.6152, "grad_norm": 0.8811675906181335, "learning_rate": 0.0002, "epoch": 3.821841403281869, "step": 23640}, {"loss": 0.6253, "grad_norm": 0.8061038255691528, "learning_rate": 0.0002, "epoch": 3.823458087462614, "step": 23650}, {"loss": 0.6517, "grad_norm": 0.9975376129150391, "learning_rate": 0.0002, "epoch": 3.8250747716433597, "step": 23660}, {"loss": 0.6288, "grad_norm": 0.8036105036735535, "learning_rate": 0.0002, "epoch": 3.826691455824105, "step": 23670}, {"loss": 0.6845, "grad_norm": 0.7401984333992004, "learning_rate": 0.0002, "epoch": 3.82830814000485, "step": 23680}, {"loss": 0.6423, "grad_norm": 0.829753041267395, "learning_rate": 0.0002, "epoch": 3.8299248241855954, "step": 23690}, {"loss": 0.6611, "grad_norm": 0.8753240704536438, "learning_rate": 0.0002, "epoch": 3.8315415083663407, "step": 23700}, {"loss": 0.6686, "grad_norm": 0.8157842755317688, "learning_rate": 0.0002, "epoch": 3.833158192547086, "step": 23710}, {"loss": 0.6181, "grad_norm": 0.6183798909187317, "learning_rate": 0.0002, "epoch": 3.834774876727831, "step": 23720}, {"loss": 0.5965, "grad_norm": 0.9548442363739014, "learning_rate": 0.0002, "epoch": 3.8363915609085764, "step": 23730}, {"loss": 0.6456, "grad_norm": 0.8319669961929321, "learning_rate": 0.0002, "epoch": 3.8380082450893216, "step": 23740}, {"loss": 0.6585, "grad_norm": 0.9718693494796753, "learning_rate": 0.0002, "epoch": 3.839624929270067, "step": 23750}, {"loss": 0.6518, "grad_norm": 0.8672235012054443, "learning_rate": 0.0002, "epoch": 3.8412416134508125, "step": 23760}, {"loss": 0.6774, "grad_norm": 1.1210707426071167, "learning_rate": 0.0002, "epoch": 3.8428582976315577, "step": 23770}, {"loss": 0.5923, "grad_norm": 0.9177767634391785, "learning_rate": 0.0002, "epoch": 3.844474981812303, "step": 23780}, {"loss": 0.6286, "grad_norm": 0.8714171648025513, "learning_rate": 0.0002, "epoch": 3.846091665993048, "step": 23790}, {"loss": 0.6302, "grad_norm": 1.1853246688842773, "learning_rate": 0.0002, "epoch": 3.8477083501737934, "step": 23800}, {"loss": 0.6144, "grad_norm": 0.8091260194778442, "learning_rate": 0.0002, "epoch": 3.849325034354539, "step": 23810}, {"loss": 0.658, "grad_norm": 0.9710774421691895, "learning_rate": 0.0002, "epoch": 3.8509417185352843, "step": 23820}, {"loss": 0.6151, "grad_norm": 0.7648707628250122, "learning_rate": 0.0002, "epoch": 3.8525584027160296, "step": 23830}, {"loss": 0.6013, "grad_norm": 0.7809253931045532, "learning_rate": 0.0002, "epoch": 3.854175086896775, "step": 23840}, {"loss": 0.6006, "grad_norm": 0.8337951898574829, "learning_rate": 0.0002, "epoch": 3.85579177107752, "step": 23850}, {"loss": 0.6456, "grad_norm": 0.9271913170814514, "learning_rate": 0.0002, "epoch": 3.8574084552582653, "step": 23860}, {"loss": 0.6671, "grad_norm": 0.985334038734436, "learning_rate": 0.0002, "epoch": 3.8590251394390105, "step": 23870}, {"loss": 0.6693, "grad_norm": 0.8458583354949951, "learning_rate": 0.0002, "epoch": 3.8606418236197557, "step": 23880}, {"loss": 0.6207, "grad_norm": 1.015348196029663, "learning_rate": 0.0002, "epoch": 3.862258507800501, "step": 23890}, {"loss": 0.649, "grad_norm": 1.0121688842773438, "learning_rate": 0.0002, "epoch": 3.8638751919812466, "step": 23900}, {"loss": 0.5921, "grad_norm": 0.8883971571922302, "learning_rate": 0.0002, "epoch": 3.865491876161992, "step": 23910}, {"loss": 0.6597, "grad_norm": 1.028086543083191, "learning_rate": 0.0002, "epoch": 3.867108560342737, "step": 23920}, {"loss": 0.6654, "grad_norm": 0.9645734429359436, "learning_rate": 0.0002, "epoch": 3.8687252445234823, "step": 23930}, {"loss": 0.6328, "grad_norm": 0.8235350251197815, "learning_rate": 0.0002, "epoch": 3.8703419287042276, "step": 23940}, {"loss": 0.6387, "grad_norm": 1.0298916101455688, "learning_rate": 0.0002, "epoch": 3.871958612884973, "step": 23950}, {"loss": 0.5966, "grad_norm": 1.0063377618789673, "learning_rate": 0.0002, "epoch": 3.8735752970657185, "step": 23960}, {"loss": 0.6234, "grad_norm": 0.9230626821517944, "learning_rate": 0.0002, "epoch": 3.8751919812464637, "step": 23970}, {"loss": 0.6159, "grad_norm": 0.9243063926696777, "learning_rate": 0.0002, "epoch": 3.876808665427209, "step": 23980}, {"loss": 0.6035, "grad_norm": 1.0211291313171387, "learning_rate": 0.0002, "epoch": 3.878425349607954, "step": 23990}, {"loss": 0.6351, "grad_norm": 0.7800535559654236, "learning_rate": 0.0002, "epoch": 3.8800420337886994, "step": 24000}, {"loss": 0.7, "grad_norm": 0.7904248833656311, "learning_rate": 0.0002, "epoch": 3.8816587179694446, "step": 24010}, {"loss": 0.6516, "grad_norm": 1.1975988149642944, "learning_rate": 0.0002, "epoch": 3.88327540215019, "step": 24020}, {"loss": 0.6006, "grad_norm": 1.0626593828201294, "learning_rate": 0.0002, "epoch": 3.884892086330935, "step": 24030}, {"loss": 0.6115, "grad_norm": 0.9012193083763123, "learning_rate": 0.0002, "epoch": 3.8865087705116803, "step": 24040}, {"loss": 0.6786, "grad_norm": 1.1159172058105469, "learning_rate": 0.0002, "epoch": 3.888125454692426, "step": 24050}, {"loss": 0.6635, "grad_norm": 1.276838779449463, "learning_rate": 0.0002, "epoch": 3.889742138873171, "step": 24060}, {"loss": 0.5985, "grad_norm": 0.8467690348625183, "learning_rate": 0.0002, "epoch": 3.8913588230539164, "step": 24070}, {"loss": 0.6655, "grad_norm": 0.9862841963768005, "learning_rate": 0.0002, "epoch": 3.8929755072346617, "step": 24080}, {"loss": 0.6098, "grad_norm": 0.7134621739387512, "learning_rate": 0.0002, "epoch": 3.894592191415407, "step": 24090}, {"loss": 0.618, "grad_norm": 0.8178175091743469, "learning_rate": 0.0002, "epoch": 3.896208875596152, "step": 24100}, {"loss": 0.6147, "grad_norm": 0.9229172468185425, "learning_rate": 0.0002, "epoch": 3.897825559776898, "step": 24110}, {"loss": 0.6554, "grad_norm": 1.0878316164016724, "learning_rate": 0.0002, "epoch": 3.899442243957643, "step": 24120}, {"loss": 0.6616, "grad_norm": 0.971645712852478, "learning_rate": 0.0002, "epoch": 3.9010589281383883, "step": 24130}, {"loss": 0.6228, "grad_norm": 0.8862188458442688, "learning_rate": 0.0002, "epoch": 3.9026756123191335, "step": 24140}, {"loss": 0.6192, "grad_norm": 0.9126982688903809, "learning_rate": 0.0002, "epoch": 3.9042922964998787, "step": 24150}, {"loss": 0.6734, "grad_norm": 0.8833470940589905, "learning_rate": 0.0002, "epoch": 3.905908980680624, "step": 24160}, {"loss": 0.5832, "grad_norm": 0.8320947885513306, "learning_rate": 0.0002, "epoch": 3.907525664861369, "step": 24170}, {"loss": 0.6247, "grad_norm": 0.9156602025032043, "learning_rate": 0.0002, "epoch": 3.9091423490421144, "step": 24180}, {"loss": 0.6678, "grad_norm": 1.029181957244873, "learning_rate": 0.0002, "epoch": 3.9107590332228597, "step": 24190}, {"loss": 0.6565, "grad_norm": 0.9052802324295044, "learning_rate": 0.0002, "epoch": 3.9123757174036053, "step": 24200}, {"loss": 0.6346, "grad_norm": 0.8847255110740662, "learning_rate": 0.0002, "epoch": 3.9139924015843506, "step": 24210}, {"loss": 0.6343, "grad_norm": 0.9642062187194824, "learning_rate": 0.0002, "epoch": 3.915609085765096, "step": 24220}, {"loss": 0.6557, "grad_norm": 0.8629093766212463, "learning_rate": 0.0002, "epoch": 3.917225769945841, "step": 24230}, {"loss": 0.6086, "grad_norm": 0.8674976825714111, "learning_rate": 0.0002, "epoch": 3.9188424541265863, "step": 24240}, {"loss": 0.5874, "grad_norm": 1.104846477508545, "learning_rate": 0.0002, "epoch": 3.9204591383073315, "step": 24250}, {"loss": 0.6501, "grad_norm": 1.0874955654144287, "learning_rate": 0.0002, "epoch": 3.922075822488077, "step": 24260}, {"loss": 0.6455, "grad_norm": 0.8689812421798706, "learning_rate": 0.0002, "epoch": 3.9236925066688224, "step": 24270}, {"loss": 0.5893, "grad_norm": 0.9724617004394531, "learning_rate": 0.0002, "epoch": 3.9253091908495676, "step": 24280}, {"loss": 0.6616, "grad_norm": 0.9165538549423218, "learning_rate": 0.0002, "epoch": 3.926925875030313, "step": 24290}, {"loss": 0.645, "grad_norm": 0.9307710528373718, "learning_rate": 0.0002, "epoch": 3.928542559211058, "step": 24300}, {"loss": 0.6071, "grad_norm": 0.8589295148849487, "learning_rate": 0.0002, "epoch": 3.9301592433918033, "step": 24310}, {"loss": 0.6662, "grad_norm": 0.9151099920272827, "learning_rate": 0.0002, "epoch": 3.9317759275725486, "step": 24320}, {"loss": 0.7075, "grad_norm": 0.9633517265319824, "learning_rate": 0.0002, "epoch": 3.933392611753294, "step": 24330}, {"loss": 0.6432, "grad_norm": 0.9521116018295288, "learning_rate": 0.0002, "epoch": 3.935009295934039, "step": 24340}, {"loss": 0.6457, "grad_norm": 0.8366776704788208, "learning_rate": 0.0002, "epoch": 3.9366259801147847, "step": 24350}, {"loss": 0.6139, "grad_norm": 0.8972663283348083, "learning_rate": 0.0002, "epoch": 3.93824266429553, "step": 24360}, {"loss": 0.661, "grad_norm": 0.8102919459342957, "learning_rate": 0.0002, "epoch": 3.939859348476275, "step": 24370}, {"loss": 0.6388, "grad_norm": 0.8189975023269653, "learning_rate": 0.0002, "epoch": 3.9414760326570204, "step": 24380}, {"loss": 0.6818, "grad_norm": 0.9569464921951294, "learning_rate": 0.0002, "epoch": 3.9430927168377656, "step": 24390}, {"loss": 0.6999, "grad_norm": 0.7459101676940918, "learning_rate": 0.0002, "epoch": 3.9447094010185113, "step": 24400}, {"loss": 0.6069, "grad_norm": 0.8536974787712097, "learning_rate": 0.0002, "epoch": 3.9463260851992565, "step": 24410}, {"loss": 0.5683, "grad_norm": 0.8763698935508728, "learning_rate": 0.0002, "epoch": 3.9479427693800018, "step": 24420}, {"loss": 0.6478, "grad_norm": 0.9381106495857239, "learning_rate": 0.0002, "epoch": 3.949559453560747, "step": 24430}, {"loss": 0.6371, "grad_norm": 0.934440016746521, "learning_rate": 0.0002, "epoch": 3.9511761377414922, "step": 24440}, {"loss": 0.6393, "grad_norm": 0.903918981552124, "learning_rate": 0.0002, "epoch": 3.9527928219222375, "step": 24450}, {"loss": 0.6175, "grad_norm": 0.8771953582763672, "learning_rate": 0.0002, "epoch": 3.9544095061029827, "step": 24460}, {"loss": 0.6971, "grad_norm": 1.0375410318374634, "learning_rate": 0.0002, "epoch": 3.956026190283728, "step": 24470}, {"loss": 0.6313, "grad_norm": 0.9439185261726379, "learning_rate": 0.0002, "epoch": 3.957642874464473, "step": 24480}, {"loss": 0.6076, "grad_norm": 0.935467004776001, "learning_rate": 0.0002, "epoch": 3.9592595586452184, "step": 24490}, {"loss": 0.6437, "grad_norm": 0.6900772452354431, "learning_rate": 0.0002, "epoch": 3.960876242825964, "step": 24500}, {"loss": 0.6445, "grad_norm": 1.0172916650772095, "learning_rate": 0.0002, "epoch": 3.9624929270067093, "step": 24510}, {"loss": 0.6308, "grad_norm": 0.9167046546936035, "learning_rate": 0.0002, "epoch": 3.9641096111874545, "step": 24520}, {"loss": 0.6519, "grad_norm": 0.7230527997016907, "learning_rate": 0.0002, "epoch": 3.9657262953681998, "step": 24530}, {"loss": 0.6564, "grad_norm": 0.8980403542518616, "learning_rate": 0.0002, "epoch": 3.967342979548945, "step": 24540}, {"loss": 0.6099, "grad_norm": 0.8555465936660767, "learning_rate": 0.0002, "epoch": 3.9689596637296907, "step": 24550}, {"loss": 0.6617, "grad_norm": 0.7825445532798767, "learning_rate": 0.0002, "epoch": 3.970576347910436, "step": 24560}, {"loss": 0.604, "grad_norm": 0.7273133993148804, "learning_rate": 0.0002, "epoch": 3.972193032091181, "step": 24570}, {"loss": 0.6427, "grad_norm": 0.9612047672271729, "learning_rate": 0.0002, "epoch": 3.9738097162719264, "step": 24580}, {"loss": 0.6426, "grad_norm": 0.9865460991859436, "learning_rate": 0.0002, "epoch": 3.9754264004526716, "step": 24590}, {"loss": 0.6052, "grad_norm": 0.8638762831687927, "learning_rate": 0.0002, "epoch": 3.977043084633417, "step": 24600}, {"loss": 0.6097, "grad_norm": 1.0096198320388794, "learning_rate": 0.0002, "epoch": 3.978659768814162, "step": 24610}, {"loss": 0.6664, "grad_norm": 0.8475532531738281, "learning_rate": 0.0002, "epoch": 3.9802764529949073, "step": 24620}, {"loss": 0.6711, "grad_norm": 0.9696195721626282, "learning_rate": 0.0002, "epoch": 3.9818931371756525, "step": 24630}, {"loss": 0.6446, "grad_norm": 0.7499843239784241, "learning_rate": 0.0002, "epoch": 3.9835098213563978, "step": 24640}, {"loss": 0.6054, "grad_norm": 0.8865424990653992, "learning_rate": 0.0002, "epoch": 3.9851265055371434, "step": 24650}, {"loss": 0.5975, "grad_norm": 0.8089959025382996, "learning_rate": 0.0002, "epoch": 3.9867431897178887, "step": 24660}, {"loss": 0.6677, "grad_norm": 0.6946012377738953, "learning_rate": 0.0002, "epoch": 3.988359873898634, "step": 24670}, {"loss": 0.6329, "grad_norm": 0.7991759181022644, "learning_rate": 0.0002, "epoch": 3.989976558079379, "step": 24680}, {"loss": 0.6449, "grad_norm": 0.8803931474685669, "learning_rate": 0.0002, "epoch": 3.9915932422601244, "step": 24690}, {"loss": 0.7091, "grad_norm": 0.8848299980163574, "learning_rate": 0.0002, "epoch": 3.99320992644087, "step": 24700}, {"loss": 0.6551, "grad_norm": 0.7448889017105103, "learning_rate": 0.0002, "epoch": 3.9948266106216153, "step": 24710}, {"loss": 0.6432, "grad_norm": 0.9361620545387268, "learning_rate": 0.0002, "epoch": 3.9964432948023605, "step": 24720}, {"loss": 0.5917, "grad_norm": 0.9958081245422363, "learning_rate": 0.0002, "epoch": 3.9980599789831057, "step": 24730}, {"loss": 0.6567, "grad_norm": 1.026004672050476, "learning_rate": 0.0002, "epoch": 3.999676663163851, "step": 24740}, {"eval_loss": 1.1524168252944946, "eval_runtime": 122.1585, "eval_samples_per_second": 6.0, "eval_steps_per_second": 0.753, "epoch": 4.0, "step": 24742}, {"loss": 0.6057, "grad_norm": 1.0664808750152588, "learning_rate": 0.0002, "epoch": 4.001293347344596, "step": 24750}, {"loss": 0.5644, "grad_norm": 1.0113720893859863, "learning_rate": 0.0002, "epoch": 4.002910031525341, "step": 24760}, {"loss": 0.5628, "grad_norm": 0.991486668586731, "learning_rate": 0.0002, "epoch": 4.004526715706087, "step": 24770}, {"loss": 0.508, "grad_norm": 0.951754629611969, "learning_rate": 0.0002, "epoch": 4.006143399886832, "step": 24780}, {"loss": 0.5314, "grad_norm": 1.13059401512146, "learning_rate": 0.0002, "epoch": 4.007760084067577, "step": 24790}, {"loss": 0.5323, "grad_norm": 0.9343926310539246, "learning_rate": 0.0002, "epoch": 4.009376768248322, "step": 24800}, {"loss": 0.5161, "grad_norm": 1.0680590867996216, "learning_rate": 0.0002, "epoch": 4.010993452429068, "step": 24810}, {"loss": 0.513, "grad_norm": 1.0022706985473633, "learning_rate": 0.0002, "epoch": 4.012610136609814, "step": 24820}, {"loss": 0.543, "grad_norm": 1.0285297632217407, "learning_rate": 0.0002, "epoch": 4.014226820790559, "step": 24830}, {"loss": 0.5311, "grad_norm": 0.8347002863883972, "learning_rate": 0.0002, "epoch": 4.015843504971304, "step": 24840}, {"loss": 0.5655, "grad_norm": 0.9675396680831909, "learning_rate": 0.0002, "epoch": 4.017460189152049, "step": 24850}, {"loss": 0.5625, "grad_norm": 0.9238511323928833, "learning_rate": 0.0002, "epoch": 4.019076873332795, "step": 24860}, {"loss": 0.5327, "grad_norm": 1.1576941013336182, "learning_rate": 0.0002, "epoch": 4.02069355751354, "step": 24870}, {"loss": 0.5533, "grad_norm": 0.8583757281303406, "learning_rate": 0.0002, "epoch": 4.022310241694285, "step": 24880}, {"loss": 0.5483, "grad_norm": 0.9816817045211792, "learning_rate": 0.0002, "epoch": 4.02392692587503, "step": 24890}, {"loss": 0.5605, "grad_norm": 0.955073893070221, "learning_rate": 0.0002, "epoch": 4.0255436100557755, "step": 24900}, {"loss": 0.4896, "grad_norm": 1.1054974794387817, "learning_rate": 0.0002, "epoch": 4.027160294236521, "step": 24910}, {"loss": 0.5246, "grad_norm": 1.1240060329437256, "learning_rate": 0.0002, "epoch": 4.028776978417266, "step": 24920}, {"loss": 0.5451, "grad_norm": 0.9512825012207031, "learning_rate": 0.0002, "epoch": 4.030393662598011, "step": 24930}, {"loss": 0.5584, "grad_norm": 0.85965496301651, "learning_rate": 0.0002, "epoch": 4.0320103467787565, "step": 24940}, {"loss": 0.5564, "grad_norm": 0.9378061294555664, "learning_rate": 0.0002, "epoch": 4.033627030959502, "step": 24950}, {"loss": 0.5008, "grad_norm": 0.9655424356460571, "learning_rate": 0.0002, "epoch": 4.035243715140247, "step": 24960}, {"loss": 0.5538, "grad_norm": 1.1393707990646362, "learning_rate": 0.0002, "epoch": 4.036860399320993, "step": 24970}, {"loss": 0.5785, "grad_norm": 1.0220451354980469, "learning_rate": 0.0002, "epoch": 4.038477083501738, "step": 24980}, {"loss": 0.5813, "grad_norm": 0.9785808324813843, "learning_rate": 0.0002, "epoch": 4.0400937676824835, "step": 24990}, {"loss": 0.5153, "grad_norm": 1.0257649421691895, "learning_rate": 0.0002, "epoch": 4.041710451863229, "step": 25000}, {"loss": 0.5658, "grad_norm": 0.9737892150878906, "learning_rate": 0.0002, "epoch": 4.043327136043974, "step": 25010}, {"loss": 0.5515, "grad_norm": 0.7416959404945374, "learning_rate": 0.0002, "epoch": 4.044943820224719, "step": 25020}, {"loss": 0.5372, "grad_norm": 0.7909596562385559, "learning_rate": 0.0002, "epoch": 4.046560504405464, "step": 25030}, {"loss": 0.5265, "grad_norm": 0.8923130631446838, "learning_rate": 0.0002, "epoch": 4.04817718858621, "step": 25040}, {"loss": 0.5035, "grad_norm": 0.9044941663742065, "learning_rate": 0.0002, "epoch": 4.049793872766955, "step": 25050}, {"loss": 0.5135, "grad_norm": 0.866352379322052, "learning_rate": 0.0002, "epoch": 4.0514105569477, "step": 25060}, {"loss": 0.5956, "grad_norm": 1.544549822807312, "learning_rate": 0.0002, "epoch": 4.053027241128445, "step": 25070}, {"loss": 0.5418, "grad_norm": 0.8426995277404785, "learning_rate": 0.0002, "epoch": 4.054643925309191, "step": 25080}, {"loss": 0.5537, "grad_norm": 0.9797548651695251, "learning_rate": 0.0002, "epoch": 4.056260609489936, "step": 25090}, {"loss": 0.55, "grad_norm": 0.8468434810638428, "learning_rate": 0.0002, "epoch": 4.057877293670681, "step": 25100}, {"loss": 0.5242, "grad_norm": 0.9294559955596924, "learning_rate": 0.0002, "epoch": 4.059493977851426, "step": 25110}, {"loss": 0.5295, "grad_norm": 0.9686688780784607, "learning_rate": 0.0002, "epoch": 4.061110662032172, "step": 25120}, {"loss": 0.5642, "grad_norm": 0.8042728304862976, "learning_rate": 0.0002, "epoch": 4.062727346212918, "step": 25130}, {"loss": 0.548, "grad_norm": 1.165160894393921, "learning_rate": 0.0002, "epoch": 4.064344030393663, "step": 25140}, {"loss": 0.5473, "grad_norm": 1.2161961793899536, "learning_rate": 0.0002, "epoch": 4.065960714574408, "step": 25150}, {"loss": 0.5217, "grad_norm": 1.0762810707092285, "learning_rate": 0.0002, "epoch": 4.067577398755153, "step": 25160}, {"loss": 0.5886, "grad_norm": 0.7580869793891907, "learning_rate": 0.0002, "epoch": 4.069194082935899, "step": 25170}, {"loss": 0.5401, "grad_norm": 0.9630117416381836, "learning_rate": 0.0002, "epoch": 4.070810767116644, "step": 25180}, {"loss": 0.5378, "grad_norm": 0.9049716591835022, "learning_rate": 0.0002, "epoch": 4.072427451297389, "step": 25190}, {"loss": 0.5266, "grad_norm": 1.1536930799484253, "learning_rate": 0.0002, "epoch": 4.074044135478134, "step": 25200}, {"loss": 0.5523, "grad_norm": 0.901461124420166, "learning_rate": 0.0002, "epoch": 4.0756608196588795, "step": 25210}, {"loss": 0.5132, "grad_norm": 1.3318437337875366, "learning_rate": 0.0002, "epoch": 4.077277503839625, "step": 25220}, {"loss": 0.5317, "grad_norm": 0.8811455368995667, "learning_rate": 0.0002, "epoch": 4.07889418802037, "step": 25230}, {"loss": 0.5798, "grad_norm": 1.0564165115356445, "learning_rate": 0.0002, "epoch": 4.080510872201115, "step": 25240}, {"loss": 0.5472, "grad_norm": 1.1008027791976929, "learning_rate": 0.0002, "epoch": 4.08212755638186, "step": 25250}, {"loss": 0.5195, "grad_norm": 1.150097131729126, "learning_rate": 0.0002, "epoch": 4.083744240562606, "step": 25260}, {"loss": 0.5321, "grad_norm": 0.9339924454689026, "learning_rate": 0.0002, "epoch": 4.085360924743352, "step": 25270}, {"loss": 0.5597, "grad_norm": 1.0902045965194702, "learning_rate": 0.0002, "epoch": 4.086977608924097, "step": 25280}, {"loss": 0.5203, "grad_norm": 0.8483911156654358, "learning_rate": 0.0002, "epoch": 4.088594293104842, "step": 25290}, {"loss": 0.5697, "grad_norm": 0.9477024674415588, "learning_rate": 0.0002, "epoch": 4.0902109772855875, "step": 25300}, {"loss": 0.5384, "grad_norm": 0.9500215649604797, "learning_rate": 0.0002, "epoch": 4.091827661466333, "step": 25310}, {"loss": 0.5045, "grad_norm": 1.040468454360962, "learning_rate": 0.0002, "epoch": 4.093444345647078, "step": 25320}, {"loss": 0.5488, "grad_norm": 0.7457592487335205, "learning_rate": 0.0002, "epoch": 4.095061029827823, "step": 25330}, {"loss": 0.609, "grad_norm": 1.2092097997665405, "learning_rate": 0.0002, "epoch": 4.096677714008568, "step": 25340}, {"loss": 0.5174, "grad_norm": 0.9652107954025269, "learning_rate": 0.0002, "epoch": 4.098294398189314, "step": 25350}, {"loss": 0.5559, "grad_norm": 0.8464955687522888, "learning_rate": 0.0002, "epoch": 4.099911082370059, "step": 25360}, {"loss": 0.5635, "grad_norm": 0.875026285648346, "learning_rate": 0.0002, "epoch": 4.101527766550804, "step": 25370}, {"loss": 0.5774, "grad_norm": 0.9241740107536316, "learning_rate": 0.0002, "epoch": 4.103144450731549, "step": 25380}, {"loss": 0.5578, "grad_norm": 0.9769546389579773, "learning_rate": 0.0002, "epoch": 4.1047611349122946, "step": 25390}, {"loss": 0.567, "grad_norm": 1.1501960754394531, "learning_rate": 0.0002, "epoch": 4.10637781909304, "step": 25400}, {"loss": 0.5241, "grad_norm": 0.9135243892669678, "learning_rate": 0.0002, "epoch": 4.107994503273786, "step": 25410}, {"loss": 0.5152, "grad_norm": 0.9905396103858948, "learning_rate": 0.0002, "epoch": 4.109611187454531, "step": 25420}, {"loss": 0.5064, "grad_norm": 0.9845104217529297, "learning_rate": 0.0002, "epoch": 4.111227871635276, "step": 25430}, {"loss": 0.5029, "grad_norm": 0.8326883912086487, "learning_rate": 0.0002, "epoch": 4.112844555816022, "step": 25440}, {"loss": 0.5312, "grad_norm": 0.9264556765556335, "learning_rate": 0.0002, "epoch": 4.114461239996767, "step": 25450}, {"loss": 0.5968, "grad_norm": 1.043080449104309, "learning_rate": 0.0002, "epoch": 4.116077924177512, "step": 25460}, {"loss": 0.5773, "grad_norm": 0.8533386588096619, "learning_rate": 0.0002, "epoch": 4.117694608358257, "step": 25470}, {"loss": 0.5584, "grad_norm": 1.0133965015411377, "learning_rate": 0.0002, "epoch": 4.1193112925390025, "step": 25480}, {"loss": 0.566, "grad_norm": 0.7476310133934021, "learning_rate": 0.0002, "epoch": 4.120927976719748, "step": 25490}, {"loss": 0.5189, "grad_norm": 1.1247259378433228, "learning_rate": 0.0002, "epoch": 4.122544660900493, "step": 25500}, {"loss": 0.5751, "grad_norm": 1.0764678716659546, "learning_rate": 0.0002, "epoch": 4.124161345081238, "step": 25510}, {"loss": 0.5391, "grad_norm": 0.7679798007011414, "learning_rate": 0.0002, "epoch": 4.1257780292619834, "step": 25520}, {"loss": 0.5233, "grad_norm": 0.8877071142196655, "learning_rate": 0.0002, "epoch": 4.127394713442729, "step": 25530}, {"loss": 0.5769, "grad_norm": 1.0440239906311035, "learning_rate": 0.0002, "epoch": 4.129011397623474, "step": 25540}, {"loss": 0.5723, "grad_norm": 0.984145998954773, "learning_rate": 0.0002, "epoch": 4.130628081804219, "step": 25550}, {"loss": 0.5741, "grad_norm": 0.8667055368423462, "learning_rate": 0.0002, "epoch": 4.132244765984965, "step": 25560}, {"loss": 0.5816, "grad_norm": 1.1300835609436035, "learning_rate": 0.0002, "epoch": 4.1338614501657105, "step": 25570}, {"loss": 0.524, "grad_norm": 0.9314348101615906, "learning_rate": 0.0002, "epoch": 4.135478134346456, "step": 25580}, {"loss": 0.5283, "grad_norm": 0.7731879949569702, "learning_rate": 0.0002, "epoch": 4.137094818527201, "step": 25590}, {"loss": 0.5307, "grad_norm": 1.0080097913742065, "learning_rate": 0.0002, "epoch": 4.138711502707946, "step": 25600}, {"loss": 0.5759, "grad_norm": 1.2475038766860962, "learning_rate": 0.0002, "epoch": 4.140328186888691, "step": 25610}, {"loss": 0.55, "grad_norm": 0.9912930727005005, "learning_rate": 0.0002, "epoch": 4.141944871069437, "step": 25620}, {"loss": 0.5624, "grad_norm": 0.9088651537895203, "learning_rate": 0.0002, "epoch": 4.143561555250182, "step": 25630}, {"loss": 0.5393, "grad_norm": 0.8940697312355042, "learning_rate": 0.0002, "epoch": 4.145178239430927, "step": 25640}, {"loss": 0.5341, "grad_norm": 1.0798203945159912, "learning_rate": 0.0002, "epoch": 4.146794923611672, "step": 25650}, {"loss": 0.5987, "grad_norm": 0.955172061920166, "learning_rate": 0.0002, "epoch": 4.148411607792418, "step": 25660}, {"loss": 0.569, "grad_norm": 0.9692716002464294, "learning_rate": 0.0002, "epoch": 4.150028291973163, "step": 25670}, {"loss": 0.5478, "grad_norm": 1.0813939571380615, "learning_rate": 0.0002, "epoch": 4.151644976153908, "step": 25680}, {"loss": 0.5383, "grad_norm": 1.135675072669983, "learning_rate": 0.0002, "epoch": 4.153261660334653, "step": 25690}, {"loss": 0.5247, "grad_norm": 1.0392236709594727, "learning_rate": 0.0002, "epoch": 4.1548783445153985, "step": 25700}, {"loss": 0.5204, "grad_norm": 0.9473116993904114, "learning_rate": 0.0002, "epoch": 4.156495028696145, "step": 25710}, {"loss": 0.5339, "grad_norm": 0.712493896484375, "learning_rate": 0.0002, "epoch": 4.15811171287689, "step": 25720}, {"loss": 0.5781, "grad_norm": 0.8724465370178223, "learning_rate": 0.0002, "epoch": 4.159728397057635, "step": 25730}, {"loss": 0.5325, "grad_norm": 0.9870015978813171, "learning_rate": 0.0002, "epoch": 4.16134508123838, "step": 25740}, {"loss": 0.5503, "grad_norm": 1.025273084640503, "learning_rate": 0.0002, "epoch": 4.1629617654191255, "step": 25750}, {"loss": 0.5223, "grad_norm": 0.9243090152740479, "learning_rate": 0.0002, "epoch": 4.164578449599871, "step": 25760}, {"loss": 0.5177, "grad_norm": 1.1656451225280762, "learning_rate": 0.0002, "epoch": 4.166195133780616, "step": 25770}, {"loss": 0.5334, "grad_norm": 0.936358630657196, "learning_rate": 0.0002, "epoch": 4.167811817961361, "step": 25780}, {"loss": 0.5236, "grad_norm": 0.8618208169937134, "learning_rate": 0.0002, "epoch": 4.1694285021421065, "step": 25790}, {"loss": 0.5186, "grad_norm": 0.8580600023269653, "learning_rate": 0.0002, "epoch": 4.171045186322852, "step": 25800}, {"loss": 0.5212, "grad_norm": 1.0128562450408936, "learning_rate": 0.0002, "epoch": 4.172661870503597, "step": 25810}, {"loss": 0.5404, "grad_norm": 0.854865312576294, "learning_rate": 0.0002, "epoch": 4.174278554684342, "step": 25820}, {"loss": 0.5377, "grad_norm": 1.235082745552063, "learning_rate": 0.0002, "epoch": 4.175895238865087, "step": 25830}, {"loss": 0.5614, "grad_norm": 0.9796220660209656, "learning_rate": 0.0002, "epoch": 4.177511923045833, "step": 25840}, {"loss": 0.5689, "grad_norm": 0.8922094702720642, "learning_rate": 0.0002, "epoch": 4.179128607226578, "step": 25850}, {"loss": 0.5806, "grad_norm": 0.9672530293464661, "learning_rate": 0.0002, "epoch": 4.180745291407324, "step": 25860}, {"loss": 0.5074, "grad_norm": 0.8662548661231995, "learning_rate": 0.0002, "epoch": 4.182361975588069, "step": 25870}, {"loss": 0.5329, "grad_norm": 0.7938798069953918, "learning_rate": 0.0002, "epoch": 4.1839786597688144, "step": 25880}, {"loss": 0.5427, "grad_norm": 1.0517958402633667, "learning_rate": 0.0002, "epoch": 4.18559534394956, "step": 25890}, {"loss": 0.5147, "grad_norm": 0.8939275145530701, "learning_rate": 0.0002, "epoch": 4.187212028130305, "step": 25900}, {"loss": 0.5199, "grad_norm": 1.0296672582626343, "learning_rate": 0.0002, "epoch": 4.18882871231105, "step": 25910}, {"loss": 0.5522, "grad_norm": 0.8104017972946167, "learning_rate": 0.0002, "epoch": 4.190445396491795, "step": 25920}, {"loss": 0.596, "grad_norm": 0.9984509944915771, "learning_rate": 0.0002, "epoch": 4.192062080672541, "step": 25930}, {"loss": 0.5356, "grad_norm": 0.9844784736633301, "learning_rate": 0.0002, "epoch": 4.193678764853286, "step": 25940}, {"loss": 0.5198, "grad_norm": 0.8168622255325317, "learning_rate": 0.0002, "epoch": 4.195295449034031, "step": 25950}, {"loss": 0.542, "grad_norm": 1.0878913402557373, "learning_rate": 0.0002, "epoch": 4.196912133214776, "step": 25960}, {"loss": 0.5414, "grad_norm": 0.927126407623291, "learning_rate": 0.0002, "epoch": 4.1985288173955215, "step": 25970}, {"loss": 0.5794, "grad_norm": 0.838586688041687, "learning_rate": 0.0002, "epoch": 4.200145501576267, "step": 25980}, {"loss": 0.5454, "grad_norm": 1.2572145462036133, "learning_rate": 0.0002, "epoch": 4.201762185757012, "step": 25990}, {"loss": 0.5048, "grad_norm": 1.0476740598678589, "learning_rate": 0.0002, "epoch": 4.203378869937758, "step": 26000}, {"loss": 0.5127, "grad_norm": 1.0873368978500366, "learning_rate": 0.0002, "epoch": 4.204995554118503, "step": 26010}, {"loss": 0.5679, "grad_norm": 1.2664896249771118, "learning_rate": 0.0002, "epoch": 4.206612238299249, "step": 26020}, {"loss": 0.5814, "grad_norm": 1.0312391519546509, "learning_rate": 0.0002, "epoch": 4.208228922479994, "step": 26030}, {"loss": 0.571, "grad_norm": 1.0235042572021484, "learning_rate": 0.0002, "epoch": 4.209845606660739, "step": 26040}, {"loss": 0.5766, "grad_norm": 0.8882219195365906, "learning_rate": 0.0002, "epoch": 4.211462290841484, "step": 26050}, {"loss": 0.5557, "grad_norm": 0.9115961790084839, "learning_rate": 0.0002, "epoch": 4.2130789750222295, "step": 26060}, {"loss": 0.5455, "grad_norm": 1.0218228101730347, "learning_rate": 0.0002, "epoch": 4.214695659202975, "step": 26070}, {"loss": 0.5462, "grad_norm": 1.0802232027053833, "learning_rate": 0.0002, "epoch": 4.21631234338372, "step": 26080}, {"loss": 0.557, "grad_norm": 1.1488053798675537, "learning_rate": 0.0002, "epoch": 4.217929027564465, "step": 26090}, {"loss": 0.52, "grad_norm": 1.0487725734710693, "learning_rate": 0.0002, "epoch": 4.21954571174521, "step": 26100}, {"loss": 0.5568, "grad_norm": 0.9131165742874146, "learning_rate": 0.0002, "epoch": 4.221162395925956, "step": 26110}, {"loss": 0.5206, "grad_norm": 0.9012845158576965, "learning_rate": 0.0002, "epoch": 4.222779080106701, "step": 26120}, {"loss": 0.561, "grad_norm": 0.8389840126037598, "learning_rate": 0.0002, "epoch": 4.224395764287446, "step": 26130}, {"loss": 0.5268, "grad_norm": 0.8924660682678223, "learning_rate": 0.0002, "epoch": 4.226012448468191, "step": 26140}, {"loss": 0.5715, "grad_norm": 0.8556463718414307, "learning_rate": 0.0002, "epoch": 4.2276291326489375, "step": 26150}, {"loss": 0.5695, "grad_norm": 0.9643129110336304, "learning_rate": 0.0002, "epoch": 4.229245816829683, "step": 26160}, {"loss": 0.5321, "grad_norm": 0.9865712523460388, "learning_rate": 0.0002, "epoch": 4.230862501010428, "step": 26170}, {"loss": 0.5406, "grad_norm": 1.152641773223877, "learning_rate": 0.0002, "epoch": 4.232479185191173, "step": 26180}, {"loss": 0.5632, "grad_norm": 0.9157698154449463, "learning_rate": 0.0002, "epoch": 4.234095869371918, "step": 26190}, {"loss": 0.5717, "grad_norm": 0.8418048620223999, "learning_rate": 0.0002, "epoch": 4.235712553552664, "step": 26200}, {"loss": 0.5624, "grad_norm": 0.9430168867111206, "learning_rate": 0.0002, "epoch": 4.237329237733409, "step": 26210}, {"loss": 0.5574, "grad_norm": 1.012582778930664, "learning_rate": 0.0002, "epoch": 4.238945921914154, "step": 26220}, {"loss": 0.5693, "grad_norm": 1.112619400024414, "learning_rate": 0.0002, "epoch": 4.240562606094899, "step": 26230}, {"loss": 0.6037, "grad_norm": 0.9243621826171875, "learning_rate": 0.0002, "epoch": 4.2421792902756446, "step": 26240}, {"loss": 0.569, "grad_norm": 0.6977595686912537, "learning_rate": 0.0002, "epoch": 4.24379597445639, "step": 26250}, {"loss": 0.5379, "grad_norm": 0.9600721597671509, "learning_rate": 0.0002, "epoch": 4.245412658637135, "step": 26260}, {"loss": 0.5658, "grad_norm": 0.882641613483429, "learning_rate": 0.0002, "epoch": 4.24702934281788, "step": 26270}, {"loss": 0.55, "grad_norm": 1.010920763015747, "learning_rate": 0.0002, "epoch": 4.2486460269986255, "step": 26280}, {"loss": 0.5803, "grad_norm": 0.9289400577545166, "learning_rate": 0.0002, "epoch": 4.250262711179371, "step": 26290}, {"loss": 0.541, "grad_norm": 1.137397289276123, "learning_rate": 0.0002, "epoch": 4.251879395360117, "step": 26300}, {"loss": 0.5204, "grad_norm": 1.0136182308197021, "learning_rate": 0.0002, "epoch": 4.253496079540862, "step": 26310}, {"loss": 0.5708, "grad_norm": 0.9387356042861938, "learning_rate": 0.0002, "epoch": 4.255112763721607, "step": 26320}, {"loss": 0.5948, "grad_norm": 1.1833957433700562, "learning_rate": 0.0002, "epoch": 4.2567294479023525, "step": 26330}, {"loss": 0.5905, "grad_norm": 0.9415934681892395, "learning_rate": 0.0002, "epoch": 4.258346132083098, "step": 26340}, {"loss": 0.5539, "grad_norm": 0.8550165891647339, "learning_rate": 0.0002, "epoch": 4.259962816263843, "step": 26350}, {"loss": 0.555, "grad_norm": 9.924622535705566, "learning_rate": 0.0002, "epoch": 4.261579500444588, "step": 26360}, {"loss": 0.5689, "grad_norm": 1.0104902982711792, "learning_rate": 0.0002, "epoch": 4.2631961846253335, "step": 26370}, {"loss": 0.5698, "grad_norm": 0.890794038772583, "learning_rate": 0.0002, "epoch": 4.264812868806079, "step": 26380}, {"loss": 0.563, "grad_norm": 1.0560191869735718, "learning_rate": 0.0002, "epoch": 4.266429552986824, "step": 26390}, {"loss": 0.5119, "grad_norm": 1.0135581493377686, "learning_rate": 0.0002, "epoch": 4.268046237167569, "step": 26400}, {"loss": 0.5359, "grad_norm": 1.1304140090942383, "learning_rate": 0.0002, "epoch": 4.269662921348314, "step": 26410}, {"loss": 0.5615, "grad_norm": 0.9899303913116455, "learning_rate": 0.0002, "epoch": 4.27127960552906, "step": 26420}, {"loss": 0.5815, "grad_norm": 1.0505329370498657, "learning_rate": 0.0002, "epoch": 4.272896289709805, "step": 26430}, {"loss": 0.5384, "grad_norm": 0.9389396905899048, "learning_rate": 0.0002, "epoch": 4.27451297389055, "step": 26440}, {"loss": 0.5558, "grad_norm": 0.875328779220581, "learning_rate": 0.0002, "epoch": 4.276129658071296, "step": 26450}, {"loss": 0.5601, "grad_norm": 1.0689256191253662, "learning_rate": 0.0002, "epoch": 4.277746342252041, "step": 26460}, {"loss": 0.546, "grad_norm": 0.9988957643508911, "learning_rate": 0.0002, "epoch": 4.279363026432787, "step": 26470}, {"loss": 0.5478, "grad_norm": 0.8721813559532166, "learning_rate": 0.0002, "epoch": 4.280979710613532, "step": 26480}, {"loss": 0.5424, "grad_norm": 1.100109577178955, "learning_rate": 0.0002, "epoch": 4.282596394794277, "step": 26490}, {"loss": 0.572, "grad_norm": 1.1607271432876587, "learning_rate": 0.0002, "epoch": 4.284213078975022, "step": 26500}, {"loss": 0.6287, "grad_norm": 0.879088819026947, "learning_rate": 0.0002, "epoch": 4.285829763155768, "step": 26510}, {"loss": 0.573, "grad_norm": 0.9891700744628906, "learning_rate": 0.0002, "epoch": 4.287446447336513, "step": 26520}, {"loss": 0.6018, "grad_norm": 1.0831127166748047, "learning_rate": 0.0002, "epoch": 4.289063131517258, "step": 26530}, {"loss": 0.5693, "grad_norm": 1.4108285903930664, "learning_rate": 0.0002, "epoch": 4.290679815698003, "step": 26540}, {"loss": 0.5888, "grad_norm": 1.0630289316177368, "learning_rate": 0.0002, "epoch": 4.2922964998787485, "step": 26550}, {"loss": 0.5817, "grad_norm": 1.0854572057724, "learning_rate": 0.0002, "epoch": 4.293913184059494, "step": 26560}, {"loss": 0.5586, "grad_norm": 0.9561646580696106, "learning_rate": 0.0002, "epoch": 4.295529868240239, "step": 26570}, {"loss": 0.5674, "grad_norm": 0.9064981937408447, "learning_rate": 0.0002, "epoch": 4.297146552420984, "step": 26580}, {"loss": 0.5847, "grad_norm": 1.0082972049713135, "learning_rate": 0.0002, "epoch": 4.298763236601729, "step": 26590}, {"loss": 0.5711, "grad_norm": 1.1613214015960693, "learning_rate": 0.0002, "epoch": 4.3003799207824756, "step": 26600}, {"loss": 0.551, "grad_norm": 0.9847695231437683, "learning_rate": 0.0002, "epoch": 4.301996604963221, "step": 26610}, {"loss": 0.6089, "grad_norm": 1.0980697870254517, "learning_rate": 0.0002, "epoch": 4.303613289143966, "step": 26620}, {"loss": 0.5797, "grad_norm": 0.8861175179481506, "learning_rate": 0.0002, "epoch": 4.305229973324711, "step": 26630}, {"loss": 0.5716, "grad_norm": 0.8917363286018372, "learning_rate": 0.0002, "epoch": 4.3068466575054565, "step": 26640}, {"loss": 0.5892, "grad_norm": 1.0458378791809082, "learning_rate": 0.0002, "epoch": 4.308463341686202, "step": 26650}, {"loss": 0.5883, "grad_norm": 1.4859240055084229, "learning_rate": 0.0002, "epoch": 4.310080025866947, "step": 26660}, {"loss": 0.5296, "grad_norm": 1.1376359462738037, "learning_rate": 0.0002, "epoch": 4.311696710047692, "step": 26670}, {"loss": 0.5671, "grad_norm": 0.991349995136261, "learning_rate": 0.0002, "epoch": 4.313313394228437, "step": 26680}, {"loss": 0.5338, "grad_norm": 0.9995543956756592, "learning_rate": 0.0002, "epoch": 4.314930078409183, "step": 26690}, {"loss": 0.5542, "grad_norm": 1.0515851974487305, "learning_rate": 0.0002, "epoch": 4.316546762589928, "step": 26700}, {"loss": 0.5473, "grad_norm": 1.008023977279663, "learning_rate": 0.0002, "epoch": 4.318163446770673, "step": 26710}, {"loss": 0.5506, "grad_norm": 1.0184582471847534, "learning_rate": 0.0002, "epoch": 4.319780130951418, "step": 26720}, {"loss": 0.5828, "grad_norm": 1.161071538925171, "learning_rate": 0.0002, "epoch": 4.321396815132164, "step": 26730}, {"loss": 0.5633, "grad_norm": 0.9580779671669006, "learning_rate": 0.0002, "epoch": 4.323013499312909, "step": 26740}, {"loss": 0.5785, "grad_norm": 1.0189911127090454, "learning_rate": 0.0002, "epoch": 4.324630183493655, "step": 26750}, {"loss": 0.5237, "grad_norm": 0.7484358549118042, "learning_rate": 0.0002, "epoch": 4.3262468676744, "step": 26760}, {"loss": 0.5728, "grad_norm": 1.0015908479690552, "learning_rate": 0.0002, "epoch": 4.327863551855145, "step": 26770}, {"loss": 0.5597, "grad_norm": 0.8972945809364319, "learning_rate": 0.0002, "epoch": 4.329480236035891, "step": 26780}, {"loss": 0.5857, "grad_norm": 1.01099693775177, "learning_rate": 0.0002, "epoch": 4.331096920216636, "step": 26790}, {"loss": 0.5591, "grad_norm": 0.846958339214325, "learning_rate": 0.0002, "epoch": 4.332713604397381, "step": 26800}, {"loss": 0.5547, "grad_norm": 1.0792603492736816, "learning_rate": 0.0002, "epoch": 4.334330288578126, "step": 26810}, {"loss": 0.5747, "grad_norm": 1.0373345613479614, "learning_rate": 0.0002, "epoch": 4.3359469727588715, "step": 26820}, {"loss": 0.558, "grad_norm": 0.9779167771339417, "learning_rate": 0.0002, "epoch": 4.337563656939617, "step": 26830}, {"loss": 0.5821, "grad_norm": 1.0235520601272583, "learning_rate": 0.0002, "epoch": 4.339180341120362, "step": 26840}, {"loss": 0.5843, "grad_norm": 1.04195237159729, "learning_rate": 0.0002, "epoch": 4.340797025301107, "step": 26850}, {"loss": 0.5474, "grad_norm": 0.9479565620422363, "learning_rate": 0.0002, "epoch": 4.3424137094818525, "step": 26860}, {"loss": 0.5646, "grad_norm": 0.9526172280311584, "learning_rate": 0.0002, "epoch": 4.344030393662598, "step": 26870}, {"loss": 0.521, "grad_norm": 0.8571456074714661, "learning_rate": 0.0002, "epoch": 4.345647077843343, "step": 26880}, {"loss": 0.5846, "grad_norm": 0.9475828409194946, "learning_rate": 0.0002, "epoch": 4.347263762024088, "step": 26890}, {"loss": 0.5815, "grad_norm": 1.0529576539993286, "learning_rate": 0.0002, "epoch": 4.348880446204834, "step": 26900}, {"loss": 0.56, "grad_norm": 0.9648140072822571, "learning_rate": 0.0002, "epoch": 4.3504971303855795, "step": 26910}, {"loss": 0.5162, "grad_norm": 1.0488841533660889, "learning_rate": 0.0002, "epoch": 4.352113814566325, "step": 26920}, {"loss": 0.5842, "grad_norm": 0.8771942257881165, "learning_rate": 0.0002, "epoch": 4.35373049874707, "step": 26930}, {"loss": 0.5966, "grad_norm": 0.9411202073097229, "learning_rate": 0.0002, "epoch": 4.355347182927815, "step": 26940}, {"loss": 0.6001, "grad_norm": 1.0997588634490967, "learning_rate": 0.0002, "epoch": 4.35696386710856, "step": 26950}, {"loss": 0.5528, "grad_norm": 0.968754768371582, "learning_rate": 0.0002, "epoch": 4.358580551289306, "step": 26960}, {"loss": 0.5881, "grad_norm": 0.9990773797035217, "learning_rate": 0.0002, "epoch": 4.360197235470051, "step": 26970}, {"loss": 0.5761, "grad_norm": 1.0210620164871216, "learning_rate": 0.0002, "epoch": 4.361813919650796, "step": 26980}, {"loss": 0.5768, "grad_norm": 0.855462908744812, "learning_rate": 0.0002, "epoch": 4.363430603831541, "step": 26990}, {"loss": 0.5493, "grad_norm": 0.9169660806655884, "learning_rate": 0.0002, "epoch": 4.365047288012287, "step": 27000}, {"loss": 0.5697, "grad_norm": 1.089629888534546, "learning_rate": 0.0002, "epoch": 4.366663972193032, "step": 27010}, {"loss": 0.5854, "grad_norm": 1.0932867527008057, "learning_rate": 0.0002, "epoch": 4.368280656373777, "step": 27020}, {"loss": 0.5656, "grad_norm": 0.9290956854820251, "learning_rate": 0.0002, "epoch": 4.369897340554522, "step": 27030}, {"loss": 0.5727, "grad_norm": 1.2800624370574951, "learning_rate": 0.0002, "epoch": 4.3715140247352675, "step": 27040}, {"loss": 0.5837, "grad_norm": 0.8993493318557739, "learning_rate": 0.0002, "epoch": 4.373130708916014, "step": 27050}, {"loss": 0.6232, "grad_norm": 1.1566431522369385, "learning_rate": 0.0002, "epoch": 4.374747393096759, "step": 27060}, {"loss": 0.5902, "grad_norm": 0.9479052424430847, "learning_rate": 0.0002, "epoch": 4.376364077277504, "step": 27070}, {"loss": 0.6189, "grad_norm": 1.0063648223876953, "learning_rate": 0.0002, "epoch": 4.377980761458249, "step": 27080}, {"loss": 0.561, "grad_norm": 0.8342045545578003, "learning_rate": 0.0002, "epoch": 4.379597445638995, "step": 27090}, {"loss": 0.5515, "grad_norm": 1.1390739679336548, "learning_rate": 0.0002, "epoch": 4.38121412981974, "step": 27100}, {"loss": 0.5372, "grad_norm": 0.9547637104988098, "learning_rate": 0.0002, "epoch": 4.382830814000485, "step": 27110}, {"loss": 0.5728, "grad_norm": 1.0503804683685303, "learning_rate": 0.0002, "epoch": 4.38444749818123, "step": 27120}, {"loss": 0.5787, "grad_norm": 0.9064017534255981, "learning_rate": 0.0002, "epoch": 4.3860641823619755, "step": 27130}, {"loss": 0.5798, "grad_norm": 0.9382519125938416, "learning_rate": 0.0002, "epoch": 4.387680866542721, "step": 27140}, {"loss": 0.5791, "grad_norm": 1.0410341024398804, "learning_rate": 0.0002, "epoch": 4.389297550723466, "step": 27150}, {"loss": 0.6034, "grad_norm": 0.9218655824661255, "learning_rate": 0.0002, "epoch": 4.390914234904211, "step": 27160}, {"loss": 0.5204, "grad_norm": 0.8119737505912781, "learning_rate": 0.0002, "epoch": 4.392530919084956, "step": 27170}, {"loss": 0.5612, "grad_norm": 0.8584722876548767, "learning_rate": 0.0002, "epoch": 4.394147603265702, "step": 27180}, {"loss": 0.5772, "grad_norm": 0.9668293595314026, "learning_rate": 0.0002, "epoch": 4.395764287446447, "step": 27190}, {"loss": 0.6009, "grad_norm": 1.022334098815918, "learning_rate": 0.0002, "epoch": 4.397380971627193, "step": 27200}, {"loss": 0.5573, "grad_norm": 0.9553216099739075, "learning_rate": 0.0002, "epoch": 4.398997655807938, "step": 27210}, {"loss": 0.5604, "grad_norm": 0.9282339215278625, "learning_rate": 0.0002, "epoch": 4.4006143399886835, "step": 27220}, {"loss": 0.5599, "grad_norm": 1.0232292413711548, "learning_rate": 0.0002, "epoch": 4.402231024169429, "step": 27230}, {"loss": 0.6078, "grad_norm": 0.9915700554847717, "learning_rate": 0.0002, "epoch": 4.403847708350174, "step": 27240}, {"loss": 0.5778, "grad_norm": 1.0014961957931519, "learning_rate": 0.0002, "epoch": 4.405464392530919, "step": 27250}, {"loss": 0.5824, "grad_norm": 1.1172103881835938, "learning_rate": 0.0002, "epoch": 4.407081076711664, "step": 27260}, {"loss": 0.5286, "grad_norm": 0.8583093285560608, "learning_rate": 0.0002, "epoch": 4.40869776089241, "step": 27270}, {"loss": 0.5507, "grad_norm": 0.7609201669692993, "learning_rate": 0.0002, "epoch": 4.410314445073155, "step": 27280}, {"loss": 0.575, "grad_norm": 1.0619351863861084, "learning_rate": 0.0002, "epoch": 4.4119311292539, "step": 27290}, {"loss": 0.5579, "grad_norm": 1.0177674293518066, "learning_rate": 0.0002, "epoch": 4.413547813434645, "step": 27300}, {"loss": 0.5628, "grad_norm": 0.9921218156814575, "learning_rate": 0.0002, "epoch": 4.4151644976153905, "step": 27310}, {"loss": 0.6018, "grad_norm": 1.126244306564331, "learning_rate": 0.0002, "epoch": 4.416781181796136, "step": 27320}, {"loss": 0.5743, "grad_norm": 1.0678540468215942, "learning_rate": 0.0002, "epoch": 4.418397865976881, "step": 27330}, {"loss": 0.5665, "grad_norm": 0.8705704212188721, "learning_rate": 0.0002, "epoch": 4.420014550157627, "step": 27340}, {"loss": 0.5763, "grad_norm": 1.272074818611145, "learning_rate": 0.0002, "epoch": 4.421631234338372, "step": 27350}, {"loss": 0.561, "grad_norm": 0.8740444183349609, "learning_rate": 0.0002, "epoch": 4.423247918519118, "step": 27360}, {"loss": 0.5492, "grad_norm": 1.0584250688552856, "learning_rate": 0.0002, "epoch": 4.424864602699863, "step": 27370}, {"loss": 0.589, "grad_norm": 1.059870719909668, "learning_rate": 0.0002, "epoch": 4.426481286880608, "step": 27380}, {"loss": 0.5551, "grad_norm": 1.072265863418579, "learning_rate": 0.0002, "epoch": 4.428097971061353, "step": 27390}, {"loss": 0.5584, "grad_norm": 0.871481716632843, "learning_rate": 0.0002, "epoch": 4.4297146552420985, "step": 27400}, {"loss": 0.5372, "grad_norm": 0.9555448293685913, "learning_rate": 0.0002, "epoch": 4.431331339422844, "step": 27410}, {"loss": 0.5593, "grad_norm": 1.0402292013168335, "learning_rate": 0.0002, "epoch": 4.432948023603589, "step": 27420}, {"loss": 0.5532, "grad_norm": 1.12587571144104, "learning_rate": 0.0002, "epoch": 4.434564707784334, "step": 27430}, {"loss": 0.5403, "grad_norm": 1.0783193111419678, "learning_rate": 0.0002, "epoch": 4.436181391965079, "step": 27440}, {"loss": 0.5313, "grad_norm": 1.024133563041687, "learning_rate": 0.0002, "epoch": 4.437798076145825, "step": 27450}, {"loss": 0.5621, "grad_norm": 0.9156768918037415, "learning_rate": 0.0002, "epoch": 4.43941476032657, "step": 27460}, {"loss": 0.5307, "grad_norm": 1.0215224027633667, "learning_rate": 0.0002, "epoch": 4.441031444507315, "step": 27470}, {"loss": 0.5188, "grad_norm": 1.082116961479187, "learning_rate": 0.0002, "epoch": 4.442648128688061, "step": 27480}, {"loss": 0.6203, "grad_norm": 1.0412873029708862, "learning_rate": 0.0002, "epoch": 4.4442648128688065, "step": 27490}, {"loss": 0.5939, "grad_norm": 1.0509289503097534, "learning_rate": 0.0002, "epoch": 4.445881497049552, "step": 27500}, {"loss": 0.5503, "grad_norm": 0.9291498064994812, "learning_rate": 0.0002, "epoch": 4.447498181230297, "step": 27510}, {"loss": 0.5408, "grad_norm": 0.970184326171875, "learning_rate": 0.0002, "epoch": 4.449114865411042, "step": 27520}, {"loss": 0.5705, "grad_norm": 0.8418883681297302, "learning_rate": 0.0002, "epoch": 4.450731549591787, "step": 27530}, {"loss": 0.5124, "grad_norm": 0.8823825120925903, "learning_rate": 0.0002, "epoch": 4.452348233772533, "step": 27540}, {"loss": 0.5867, "grad_norm": 1.1909019947052002, "learning_rate": 0.0002, "epoch": 4.453964917953278, "step": 27550}, {"loss": 0.5685, "grad_norm": 1.0317302942276, "learning_rate": 0.0002, "epoch": 4.455581602134023, "step": 27560}, {"loss": 0.5538, "grad_norm": 0.9977751970291138, "learning_rate": 0.0002, "epoch": 4.457198286314768, "step": 27570}, {"loss": 0.5628, "grad_norm": 0.8909519910812378, "learning_rate": 0.0002, "epoch": 4.458814970495514, "step": 27580}, {"loss": 0.6099, "grad_norm": 0.8653029799461365, "learning_rate": 0.0002, "epoch": 4.460431654676259, "step": 27590}, {"loss": 0.5622, "grad_norm": 1.0783653259277344, "learning_rate": 0.0002, "epoch": 4.462048338857004, "step": 27600}, {"loss": 0.579, "grad_norm": 1.1235394477844238, "learning_rate": 0.0002, "epoch": 4.463665023037749, "step": 27610}, {"loss": 0.5545, "grad_norm": 0.9386643767356873, "learning_rate": 0.0002, "epoch": 4.4652817072184945, "step": 27620}, {"loss": 0.5554, "grad_norm": 1.0605148077011108, "learning_rate": 0.0002, "epoch": 4.466898391399241, "step": 27630}, {"loss": 0.5886, "grad_norm": 1.1283893585205078, "learning_rate": 0.0002, "epoch": 4.468515075579986, "step": 27640}, {"loss": 0.5801, "grad_norm": 1.0583468675613403, "learning_rate": 0.0002, "epoch": 4.470131759760731, "step": 27650}, {"loss": 0.5601, "grad_norm": 0.9563992023468018, "learning_rate": 0.0002, "epoch": 4.471748443941476, "step": 27660}, {"loss": 0.5687, "grad_norm": 1.100598931312561, "learning_rate": 0.0002, "epoch": 4.4733651281222215, "step": 27670}, {"loss": 0.589, "grad_norm": 0.9386957287788391, "learning_rate": 0.0002, "epoch": 4.474981812302967, "step": 27680}, {"loss": 0.6241, "grad_norm": 1.2946288585662842, "learning_rate": 0.0002, "epoch": 4.476598496483712, "step": 27690}, {"loss": 0.6075, "grad_norm": 1.0325199365615845, "learning_rate": 0.0002, "epoch": 4.478215180664457, "step": 27700}, {"loss": 0.588, "grad_norm": 1.0318928956985474, "learning_rate": 0.0002, "epoch": 4.4798318648452025, "step": 27710}, {"loss": 0.5656, "grad_norm": 0.8721024394035339, "learning_rate": 0.0002, "epoch": 4.481448549025948, "step": 27720}, {"loss": 0.5421, "grad_norm": 1.17376708984375, "learning_rate": 0.0002, "epoch": 4.483065233206693, "step": 27730}, {"loss": 0.5657, "grad_norm": 1.0926326513290405, "learning_rate": 0.0002, "epoch": 4.484681917387438, "step": 27740}, {"loss": 0.5514, "grad_norm": 0.9043852686882019, "learning_rate": 0.0002, "epoch": 4.486298601568183, "step": 27750}, {"loss": 0.582, "grad_norm": 1.064600944519043, "learning_rate": 0.0002, "epoch": 4.487915285748929, "step": 27760}, {"loss": 0.6108, "grad_norm": 0.7833460569381714, "learning_rate": 0.0002, "epoch": 4.489531969929674, "step": 27770}, {"loss": 0.5985, "grad_norm": 1.1073496341705322, "learning_rate": 0.0002, "epoch": 4.49114865411042, "step": 27780}, {"loss": 0.5577, "grad_norm": 1.0799397230148315, "learning_rate": 0.0002, "epoch": 4.492765338291165, "step": 27790}, {"loss": 0.5601, "grad_norm": 1.1062238216400146, "learning_rate": 0.0002, "epoch": 4.49438202247191, "step": 27800}, {"loss": 0.6126, "grad_norm": 1.0568242073059082, "learning_rate": 0.0002, "epoch": 4.495998706652656, "step": 27810}, {"loss": 0.5913, "grad_norm": 0.8861091732978821, "learning_rate": 0.0002, "epoch": 4.497615390833401, "step": 27820}, {"loss": 0.5858, "grad_norm": 1.2297543287277222, "learning_rate": 0.0002, "epoch": 4.499232075014146, "step": 27830}, {"loss": 0.5859, "grad_norm": 0.9600302577018738, "learning_rate": 0.0002, "epoch": 4.500848759194891, "step": 27840}, {"loss": 0.6124, "grad_norm": 1.057051181793213, "learning_rate": 0.0002, "epoch": 4.502465443375637, "step": 27850}, {"loss": 0.5788, "grad_norm": 0.9839690923690796, "learning_rate": 0.0002, "epoch": 4.504082127556382, "step": 27860}, {"loss": 0.555, "grad_norm": 1.1479853391647339, "learning_rate": 0.0002, "epoch": 4.505698811737127, "step": 27870}, {"loss": 0.6039, "grad_norm": 1.0550768375396729, "learning_rate": 0.0002, "epoch": 4.507315495917872, "step": 27880}, {"loss": 0.563, "grad_norm": 0.898209273815155, "learning_rate": 0.0002, "epoch": 4.5089321800986175, "step": 27890}, {"loss": 0.5734, "grad_norm": 0.9460315108299255, "learning_rate": 0.0002, "epoch": 4.510548864279363, "step": 27900}, {"loss": 0.5702, "grad_norm": 0.9499884247779846, "learning_rate": 0.0002, "epoch": 4.512165548460108, "step": 27910}, {"loss": 0.5385, "grad_norm": 0.7801318764686584, "learning_rate": 0.0002, "epoch": 4.513782232640853, "step": 27920}, {"loss": 0.5391, "grad_norm": 0.9286966323852539, "learning_rate": 0.0002, "epoch": 4.515398916821599, "step": 27930}, {"loss": 0.5717, "grad_norm": 0.9539980292320251, "learning_rate": 0.0002, "epoch": 4.517015601002345, "step": 27940}, {"loss": 0.6073, "grad_norm": 1.1053401231765747, "learning_rate": 0.0002, "epoch": 4.51863228518309, "step": 27950}, {"loss": 0.6087, "grad_norm": 0.7535534501075745, "learning_rate": 0.0002, "epoch": 4.520248969363835, "step": 27960}, {"loss": 0.5701, "grad_norm": 1.076926589012146, "learning_rate": 0.0002, "epoch": 4.52186565354458, "step": 27970}, {"loss": 0.6028, "grad_norm": 1.181935429573059, "learning_rate": 0.0002, "epoch": 4.5234823377253255, "step": 27980}, {"loss": 0.6033, "grad_norm": 0.9293407201766968, "learning_rate": 0.0002, "epoch": 4.525099021906071, "step": 27990}, {"loss": 0.5815, "grad_norm": 0.8953009247779846, "learning_rate": 0.0002, "epoch": 4.526715706086816, "step": 28000}, {"loss": 0.5564, "grad_norm": 1.0850225687026978, "learning_rate": 0.0002, "epoch": 4.528332390267561, "step": 28010}, {"loss": 0.5459, "grad_norm": 0.9125663042068481, "learning_rate": 0.0002, "epoch": 4.529949074448306, "step": 28020}, {"loss": 0.5922, "grad_norm": 0.8745216727256775, "learning_rate": 0.0002, "epoch": 4.531565758629052, "step": 28030}, {"loss": 0.567, "grad_norm": 1.0783463716506958, "learning_rate": 0.0002, "epoch": 4.533182442809797, "step": 28040}, {"loss": 0.5754, "grad_norm": 0.7513844966888428, "learning_rate": 0.0002, "epoch": 4.534799126990542, "step": 28050}, {"loss": 0.5608, "grad_norm": 1.0135776996612549, "learning_rate": 0.0002, "epoch": 4.536415811171287, "step": 28060}, {"loss": 0.5827, "grad_norm": 0.8886825442314148, "learning_rate": 0.0002, "epoch": 4.538032495352033, "step": 28070}, {"loss": 0.5605, "grad_norm": 0.8153995275497437, "learning_rate": 0.0002, "epoch": 4.539649179532779, "step": 28080}, {"loss": 0.6377, "grad_norm": 0.9853341579437256, "learning_rate": 0.0002, "epoch": 4.541265863713524, "step": 28090}, {"loss": 0.5957, "grad_norm": 0.9365800023078918, "learning_rate": 0.0002, "epoch": 4.542882547894269, "step": 28100}, {"loss": 0.5477, "grad_norm": 0.9765017628669739, "learning_rate": 0.0002, "epoch": 4.544499232075014, "step": 28110}, {"loss": 0.6185, "grad_norm": 0.9811279773712158, "learning_rate": 0.0002, "epoch": 4.54611591625576, "step": 28120}, {"loss": 0.6095, "grad_norm": 1.0387924909591675, "learning_rate": 0.0002, "epoch": 4.547732600436505, "step": 28130}, {"loss": 0.6534, "grad_norm": 1.0684878826141357, "learning_rate": 0.0002, "epoch": 4.54934928461725, "step": 28140}, {"loss": 0.5701, "grad_norm": 1.0000102519989014, "learning_rate": 0.0002, "epoch": 4.550965968797995, "step": 28150}, {"loss": 0.5327, "grad_norm": 1.0717930793762207, "learning_rate": 0.0002, "epoch": 4.5525826529787405, "step": 28160}, {"loss": 0.5594, "grad_norm": 0.990074634552002, "learning_rate": 0.0002, "epoch": 4.554199337159486, "step": 28170}, {"loss": 0.5452, "grad_norm": 0.8673754930496216, "learning_rate": 0.0002, "epoch": 4.555816021340231, "step": 28180}, {"loss": 0.5773, "grad_norm": 0.864247739315033, "learning_rate": 0.0002, "epoch": 4.557432705520976, "step": 28190}, {"loss": 0.5516, "grad_norm": 0.8280200958251953, "learning_rate": 0.0002, "epoch": 4.5590493897017215, "step": 28200}, {"loss": 0.5709, "grad_norm": 1.1312172412872314, "learning_rate": 0.0002, "epoch": 4.560666073882467, "step": 28210}, {"loss": 0.5776, "grad_norm": 0.9147403240203857, "learning_rate": 0.0002, "epoch": 4.562282758063212, "step": 28220}, {"loss": 0.5591, "grad_norm": 1.0321218967437744, "learning_rate": 0.0002, "epoch": 4.563899442243958, "step": 28230}, {"loss": 0.5508, "grad_norm": 1.168332815170288, "learning_rate": 0.0002, "epoch": 4.565516126424703, "step": 28240}, {"loss": 0.5649, "grad_norm": 1.0067222118377686, "learning_rate": 0.0002, "epoch": 4.5671328106054485, "step": 28250}, {"loss": 0.5853, "grad_norm": 1.0283393859863281, "learning_rate": 0.0002, "epoch": 4.568749494786194, "step": 28260}, {"loss": 0.5772, "grad_norm": 0.9912363886833191, "learning_rate": 0.0002, "epoch": 4.570366178966939, "step": 28270}, {"loss": 0.5757, "grad_norm": 1.108032464981079, "learning_rate": 0.0002, "epoch": 4.571982863147684, "step": 28280}, {"loss": 0.5529, "grad_norm": 0.8260078430175781, "learning_rate": 0.0002, "epoch": 4.573599547328429, "step": 28290}, {"loss": 0.5625, "grad_norm": 0.8946247100830078, "learning_rate": 0.0002, "epoch": 4.575216231509175, "step": 28300}, {"loss": 0.5533, "grad_norm": 0.8273587822914124, "learning_rate": 0.0002, "epoch": 4.57683291568992, "step": 28310}, {"loss": 0.6058, "grad_norm": 0.9040093421936035, "learning_rate": 0.0002, "epoch": 4.578449599870665, "step": 28320}, {"loss": 0.5521, "grad_norm": 0.8435290455818176, "learning_rate": 0.0002, "epoch": 4.58006628405141, "step": 28330}, {"loss": 0.6086, "grad_norm": 1.164088249206543, "learning_rate": 0.0002, "epoch": 4.581682968232156, "step": 28340}, {"loss": 0.5603, "grad_norm": 0.9861085414886475, "learning_rate": 0.0002, "epoch": 4.583299652412901, "step": 28350}, {"loss": 0.5701, "grad_norm": 0.8892980813980103, "learning_rate": 0.0002, "epoch": 4.584916336593646, "step": 28360}, {"loss": 0.598, "grad_norm": 1.240574836730957, "learning_rate": 0.0002, "epoch": 4.586533020774391, "step": 28370}, {"loss": 0.5797, "grad_norm": 0.8669408559799194, "learning_rate": 0.0002, "epoch": 4.588149704955137, "step": 28380}, {"loss": 0.5603, "grad_norm": 0.9145985841751099, "learning_rate": 0.0002, "epoch": 4.589766389135883, "step": 28390}, {"loss": 0.5765, "grad_norm": 0.8584614992141724, "learning_rate": 0.0002, "epoch": 4.591383073316628, "step": 28400}, {"loss": 0.5898, "grad_norm": 1.118829369544983, "learning_rate": 0.0002, "epoch": 4.592999757497373, "step": 28410}, {"loss": 0.5641, "grad_norm": 1.1411553621292114, "learning_rate": 0.0002, "epoch": 4.594616441678118, "step": 28420}, {"loss": 0.549, "grad_norm": 0.9433278441429138, "learning_rate": 0.0002, "epoch": 4.596233125858864, "step": 28430}, {"loss": 0.5496, "grad_norm": 0.816830039024353, "learning_rate": 0.0002, "epoch": 4.597849810039609, "step": 28440}, {"loss": 0.5543, "grad_norm": 1.2124968767166138, "learning_rate": 0.0002, "epoch": 4.599466494220354, "step": 28450}, {"loss": 0.5759, "grad_norm": 0.9658762216567993, "learning_rate": 0.0002, "epoch": 4.601083178401099, "step": 28460}, {"loss": 0.5902, "grad_norm": 0.836100161075592, "learning_rate": 0.0002, "epoch": 4.6026998625818445, "step": 28470}, {"loss": 0.5749, "grad_norm": 0.9989104270935059, "learning_rate": 0.0002, "epoch": 4.60431654676259, "step": 28480}, {"loss": 0.5616, "grad_norm": 1.1298956871032715, "learning_rate": 0.0002, "epoch": 4.605933230943335, "step": 28490}, {"loss": 0.5846, "grad_norm": 1.1731704473495483, "learning_rate": 0.0002, "epoch": 4.60754991512408, "step": 28500}, {"loss": 0.5816, "grad_norm": 0.9624714255332947, "learning_rate": 0.0002, "epoch": 4.609166599304825, "step": 28510}, {"loss": 0.5868, "grad_norm": 1.364073634147644, "learning_rate": 0.0002, "epoch": 4.610783283485571, "step": 28520}, {"loss": 0.6237, "grad_norm": 1.1827356815338135, "learning_rate": 0.0002, "epoch": 4.612399967666317, "step": 28530}, {"loss": 0.5643, "grad_norm": 0.6651531457901001, "learning_rate": 0.0002, "epoch": 4.614016651847062, "step": 28540}, {"loss": 0.6051, "grad_norm": 1.1640995740890503, "learning_rate": 0.0002, "epoch": 4.615633336027807, "step": 28550}, {"loss": 0.5995, "grad_norm": 1.028918743133545, "learning_rate": 0.0002, "epoch": 4.6172500202085525, "step": 28560}, {"loss": 0.5607, "grad_norm": 0.8252120614051819, "learning_rate": 0.0002, "epoch": 4.618866704389298, "step": 28570}, {"loss": 0.5769, "grad_norm": 1.3536735773086548, "learning_rate": 0.0002, "epoch": 4.620483388570043, "step": 28580}, {"loss": 0.6006, "grad_norm": 1.2146915197372437, "learning_rate": 0.0002, "epoch": 4.622100072750788, "step": 28590}, {"loss": 0.5503, "grad_norm": 1.0122549533843994, "learning_rate": 0.0002, "epoch": 4.623716756931533, "step": 28600}, {"loss": 0.6072, "grad_norm": 0.9977872967720032, "learning_rate": 0.0002, "epoch": 4.625333441112279, "step": 28610}, {"loss": 0.5669, "grad_norm": 1.0159751176834106, "learning_rate": 0.0002, "epoch": 4.626950125293024, "step": 28620}, {"loss": 0.5935, "grad_norm": 1.0028325319290161, "learning_rate": 0.0002, "epoch": 4.628566809473769, "step": 28630}, {"loss": 0.5515, "grad_norm": 0.901638388633728, "learning_rate": 0.0002, "epoch": 4.630183493654514, "step": 28640}, {"loss": 0.595, "grad_norm": 0.9450507164001465, "learning_rate": 0.0002, "epoch": 4.6318001778352595, "step": 28650}, {"loss": 0.5972, "grad_norm": 0.9987545013427734, "learning_rate": 0.0002, "epoch": 4.633416862016006, "step": 28660}, {"loss": 0.5863, "grad_norm": 0.9574332237243652, "learning_rate": 0.0002, "epoch": 4.63503354619675, "step": 28670}, {"loss": 0.5804, "grad_norm": 1.2215653657913208, "learning_rate": 0.0002, "epoch": 4.636650230377496, "step": 28680}, {"loss": 0.5798, "grad_norm": 0.9798858761787415, "learning_rate": 0.0002, "epoch": 4.638266914558241, "step": 28690}, {"loss": 0.5773, "grad_norm": 1.0648466348648071, "learning_rate": 0.0002, "epoch": 4.639883598738987, "step": 28700}, {"loss": 0.6108, "grad_norm": 1.0606504678726196, "learning_rate": 0.0002, "epoch": 4.641500282919732, "step": 28710}, {"loss": 0.5801, "grad_norm": 1.0892442464828491, "learning_rate": 0.0002, "epoch": 4.643116967100477, "step": 28720}, {"loss": 0.5492, "grad_norm": 0.914391040802002, "learning_rate": 0.0002, "epoch": 4.644733651281222, "step": 28730}, {"loss": 0.5439, "grad_norm": 0.9782370328903198, "learning_rate": 0.0002, "epoch": 4.6463503354619675, "step": 28740}, {"loss": 0.6035, "grad_norm": 1.0344339609146118, "learning_rate": 0.0002, "epoch": 4.647967019642713, "step": 28750}, {"loss": 0.5775, "grad_norm": 1.0513931512832642, "learning_rate": 0.0002, "epoch": 4.649583703823458, "step": 28760}, {"loss": 0.546, "grad_norm": 0.9711475968360901, "learning_rate": 0.0002, "epoch": 4.651200388004203, "step": 28770}, {"loss": 0.5472, "grad_norm": 0.977519690990448, "learning_rate": 0.0002, "epoch": 4.652817072184948, "step": 28780}, {"loss": 0.5826, "grad_norm": 0.9150224924087524, "learning_rate": 0.0002, "epoch": 4.654433756365694, "step": 28790}, {"loss": 0.5382, "grad_norm": 1.0973542928695679, "learning_rate": 0.0002, "epoch": 4.656050440546439, "step": 28800}, {"loss": 0.6147, "grad_norm": 0.944877564907074, "learning_rate": 0.0002, "epoch": 4.657667124727185, "step": 28810}, {"loss": 0.5537, "grad_norm": 0.9508748650550842, "learning_rate": 0.0002, "epoch": 4.659283808907929, "step": 28820}, {"loss": 0.5537, "grad_norm": 0.9681721329689026, "learning_rate": 0.0002, "epoch": 4.6609004930886755, "step": 28830}, {"loss": 0.592, "grad_norm": 1.0214351415634155, "learning_rate": 0.0002, "epoch": 4.662517177269421, "step": 28840}, {"loss": 0.6031, "grad_norm": 0.9748611450195312, "learning_rate": 0.0002, "epoch": 4.664133861450166, "step": 28850}, {"loss": 0.572, "grad_norm": 0.8484147191047668, "learning_rate": 0.0002, "epoch": 4.665750545630911, "step": 28860}, {"loss": 0.5699, "grad_norm": 1.1252986192703247, "learning_rate": 0.0002, "epoch": 4.667367229811656, "step": 28870}, {"loss": 0.5724, "grad_norm": 0.8706206679344177, "learning_rate": 0.0002, "epoch": 4.668983913992402, "step": 28880}, {"loss": 0.6002, "grad_norm": 1.1432424783706665, "learning_rate": 0.0002, "epoch": 4.670600598173147, "step": 28890}, {"loss": 0.5675, "grad_norm": 1.017029047012329, "learning_rate": 0.0002, "epoch": 4.672217282353892, "step": 28900}, {"loss": 0.5831, "grad_norm": 1.085597038269043, "learning_rate": 0.0002, "epoch": 4.673833966534637, "step": 28910}, {"loss": 0.5678, "grad_norm": 0.9275796413421631, "learning_rate": 0.0002, "epoch": 4.675450650715383, "step": 28920}, {"loss": 0.5603, "grad_norm": 0.9518964886665344, "learning_rate": 0.0002, "epoch": 4.677067334896128, "step": 28930}, {"loss": 0.6232, "grad_norm": 1.0352122783660889, "learning_rate": 0.0002, "epoch": 4.678684019076873, "step": 28940}, {"loss": 0.5786, "grad_norm": 1.090124249458313, "learning_rate": 0.0002, "epoch": 4.680300703257618, "step": 28950}, {"loss": 0.5728, "grad_norm": 0.8799563050270081, "learning_rate": 0.0002, "epoch": 4.681917387438364, "step": 28960}, {"loss": 0.5787, "grad_norm": 1.0929821729660034, "learning_rate": 0.0002, "epoch": 4.683534071619109, "step": 28970}, {"loss": 0.6134, "grad_norm": 0.903727650642395, "learning_rate": 0.0002, "epoch": 4.685150755799855, "step": 28980}, {"loss": 0.5522, "grad_norm": 0.9752424955368042, "learning_rate": 0.0002, "epoch": 4.6867674399806, "step": 28990}, {"loss": 0.5762, "grad_norm": 0.9351571202278137, "learning_rate": 0.0002, "epoch": 4.688384124161345, "step": 29000}, {"loss": 0.5811, "grad_norm": 0.923877477645874, "learning_rate": 0.0002, "epoch": 4.6900008083420905, "step": 29010}, {"loss": 0.5682, "grad_norm": 1.045389175415039, "learning_rate": 0.0002, "epoch": 4.691617492522836, "step": 29020}, {"loss": 0.584, "grad_norm": 1.0200831890106201, "learning_rate": 0.0002, "epoch": 4.693234176703581, "step": 29030}, {"loss": 0.5514, "grad_norm": 1.1499706506729126, "learning_rate": 0.0002, "epoch": 4.694850860884326, "step": 29040}, {"loss": 0.5745, "grad_norm": 0.860118567943573, "learning_rate": 0.0002, "epoch": 4.6964675450650715, "step": 29050}, {"loss": 0.5741, "grad_norm": 0.9774864315986633, "learning_rate": 0.0002, "epoch": 4.698084229245817, "step": 29060}, {"loss": 0.5765, "grad_norm": 1.0323210954666138, "learning_rate": 0.0002, "epoch": 4.699700913426562, "step": 29070}, {"loss": 0.5452, "grad_norm": 0.8492481112480164, "learning_rate": 0.0002, "epoch": 4.701317597607307, "step": 29080}, {"loss": 0.5985, "grad_norm": 1.131951093673706, "learning_rate": 0.0002, "epoch": 4.702934281788052, "step": 29090}, {"loss": 0.6412, "grad_norm": 0.8763113021850586, "learning_rate": 0.0002, "epoch": 4.704550965968798, "step": 29100}, {"loss": 0.575, "grad_norm": 1.045028805732727, "learning_rate": 0.0002, "epoch": 4.706167650149544, "step": 29110}, {"loss": 0.5548, "grad_norm": 0.9961401224136353, "learning_rate": 0.0002, "epoch": 4.707784334330288, "step": 29120}, {"loss": 0.559, "grad_norm": 0.9282503724098206, "learning_rate": 0.0002, "epoch": 4.709401018511034, "step": 29130}, {"loss": 0.5744, "grad_norm": 1.1418932676315308, "learning_rate": 0.0002, "epoch": 4.711017702691779, "step": 29140}, {"loss": 0.5394, "grad_norm": 0.9950099587440491, "learning_rate": 0.0002, "epoch": 4.712634386872525, "step": 29150}, {"loss": 0.6177, "grad_norm": 0.8304893374443054, "learning_rate": 0.0002, "epoch": 4.71425107105327, "step": 29160}, {"loss": 0.6074, "grad_norm": 1.115626335144043, "learning_rate": 0.0002, "epoch": 4.715867755234015, "step": 29170}, {"loss": 0.6265, "grad_norm": 1.079818606376648, "learning_rate": 0.0002, "epoch": 4.71748443941476, "step": 29180}, {"loss": 0.561, "grad_norm": 1.1929082870483398, "learning_rate": 0.0002, "epoch": 4.719101123595506, "step": 29190}, {"loss": 0.5708, "grad_norm": 0.9621080756187439, "learning_rate": 0.0002, "epoch": 4.720717807776251, "step": 29200}, {"loss": 0.546, "grad_norm": 0.8549222350120544, "learning_rate": 0.0002, "epoch": 4.722334491956996, "step": 29210}, {"loss": 0.5775, "grad_norm": 0.9341941475868225, "learning_rate": 0.0002, "epoch": 4.723951176137741, "step": 29220}, {"loss": 0.5436, "grad_norm": 1.075406789779663, "learning_rate": 0.0002, "epoch": 4.7255678603184865, "step": 29230}, {"loss": 0.576, "grad_norm": 1.0859880447387695, "learning_rate": 0.0002, "epoch": 4.727184544499232, "step": 29240}, {"loss": 0.5525, "grad_norm": 0.8475605249404907, "learning_rate": 0.0002, "epoch": 4.728801228679977, "step": 29250}, {"loss": 0.5659, "grad_norm": 0.9331845641136169, "learning_rate": 0.0002, "epoch": 4.730417912860723, "step": 29260}, {"loss": 0.5901, "grad_norm": 0.9279314279556274, "learning_rate": 0.0002, "epoch": 4.7320345970414674, "step": 29270}, {"loss": 0.597, "grad_norm": 0.7803558707237244, "learning_rate": 0.0002, "epoch": 4.733651281222214, "step": 29280}, {"loss": 0.5968, "grad_norm": 1.0159329175949097, "learning_rate": 0.0002, "epoch": 4.735267965402959, "step": 29290}, {"loss": 0.5333, "grad_norm": 0.9448670744895935, "learning_rate": 0.0002, "epoch": 4.736884649583704, "step": 29300}, {"loss": 0.574, "grad_norm": 1.0732197761535645, "learning_rate": 0.0002, "epoch": 4.738501333764449, "step": 29310}, {"loss": 0.6066, "grad_norm": 0.901830792427063, "learning_rate": 0.0002, "epoch": 4.7401180179451945, "step": 29320}, {"loss": 0.6105, "grad_norm": 0.9141789674758911, "learning_rate": 0.0002, "epoch": 4.74173470212594, "step": 29330}, {"loss": 0.5481, "grad_norm": 0.9733418226242065, "learning_rate": 0.0002, "epoch": 4.743351386306685, "step": 29340}, {"loss": 0.612, "grad_norm": 0.909810483455658, "learning_rate": 0.0002, "epoch": 4.74496807048743, "step": 29350}, {"loss": 0.5911, "grad_norm": 0.909541666507721, "learning_rate": 0.0002, "epoch": 4.746584754668175, "step": 29360}, {"loss": 0.5579, "grad_norm": 0.9383015632629395, "learning_rate": 0.0002, "epoch": 4.748201438848921, "step": 29370}, {"loss": 0.5529, "grad_norm": 0.9275668263435364, "learning_rate": 0.0002, "epoch": 4.749818123029666, "step": 29380}, {"loss": 0.5623, "grad_norm": 1.1146225929260254, "learning_rate": 0.0002, "epoch": 4.751434807210411, "step": 29390}, {"loss": 0.6018, "grad_norm": 1.0062453746795654, "learning_rate": 0.0002, "epoch": 4.753051491391156, "step": 29400}, {"loss": 0.5872, "grad_norm": 0.9451895952224731, "learning_rate": 0.0002, "epoch": 4.7546681755719025, "step": 29410}, {"loss": 0.5767, "grad_norm": 0.870457649230957, "learning_rate": 0.0002, "epoch": 4.756284859752648, "step": 29420}, {"loss": 0.57, "grad_norm": 1.0411282777786255, "learning_rate": 0.0002, "epoch": 4.757901543933393, "step": 29430}, {"loss": 0.5688, "grad_norm": 1.1648986339569092, "learning_rate": 0.0002, "epoch": 4.759518228114138, "step": 29440}, {"loss": 0.5432, "grad_norm": 0.8999572992324829, "learning_rate": 0.0002, "epoch": 4.761134912294883, "step": 29450}, {"loss": 0.5667, "grad_norm": 0.9863559007644653, "learning_rate": 0.0002, "epoch": 4.762751596475629, "step": 29460}, {"loss": 0.5779, "grad_norm": 0.9676542282104492, "learning_rate": 0.0002, "epoch": 4.764368280656374, "step": 29470}, {"loss": 0.6075, "grad_norm": 1.004775047302246, "learning_rate": 0.0002, "epoch": 4.765984964837119, "step": 29480}, {"loss": 0.6044, "grad_norm": 1.0937515497207642, "learning_rate": 0.0002, "epoch": 4.767601649017864, "step": 29490}, {"loss": 0.5433, "grad_norm": 0.9551598429679871, "learning_rate": 0.0002, "epoch": 4.7692183331986095, "step": 29500}, {"loss": 0.5609, "grad_norm": 1.0757228136062622, "learning_rate": 0.0002, "epoch": 4.770835017379355, "step": 29510}, {"loss": 0.567, "grad_norm": 1.0588841438293457, "learning_rate": 0.0002, "epoch": 4.7724517015601, "step": 29520}, {"loss": 0.5814, "grad_norm": 1.0744032859802246, "learning_rate": 0.0002, "epoch": 4.774068385740845, "step": 29530}, {"loss": 0.5681, "grad_norm": 1.0066277980804443, "learning_rate": 0.0002, "epoch": 4.7756850699215905, "step": 29540}, {"loss": 0.545, "grad_norm": 1.082319736480713, "learning_rate": 0.0002, "epoch": 4.777301754102336, "step": 29550}, {"loss": 0.5709, "grad_norm": 0.8252472877502441, "learning_rate": 0.0002, "epoch": 4.778918438283082, "step": 29560}, {"loss": 0.5666, "grad_norm": 0.9855340123176575, "learning_rate": 0.0002, "epoch": 4.780535122463827, "step": 29570}, {"loss": 0.6117, "grad_norm": 0.9991421699523926, "learning_rate": 0.0002, "epoch": 4.782151806644572, "step": 29580}, {"loss": 0.5966, "grad_norm": 1.316841959953308, "learning_rate": 0.0002, "epoch": 4.7837684908253175, "step": 29590}, {"loss": 0.6102, "grad_norm": 1.1513035297393799, "learning_rate": 0.0002, "epoch": 4.785385175006063, "step": 29600}, {"loss": 0.5785, "grad_norm": 0.9767683744430542, "learning_rate": 0.0002, "epoch": 4.787001859186808, "step": 29610}, {"loss": 0.6037, "grad_norm": 0.9786278605461121, "learning_rate": 0.0002, "epoch": 4.788618543367553, "step": 29620}, {"loss": 0.6108, "grad_norm": 0.8004973530769348, "learning_rate": 0.0002, "epoch": 4.7902352275482984, "step": 29630}, {"loss": 0.5932, "grad_norm": 1.0997767448425293, "learning_rate": 0.0002, "epoch": 4.791851911729044, "step": 29640}, {"loss": 0.5655, "grad_norm": 0.9752856492996216, "learning_rate": 0.0002, "epoch": 4.793468595909789, "step": 29650}, {"loss": 0.5916, "grad_norm": 1.0518392324447632, "learning_rate": 0.0002, "epoch": 4.795085280090534, "step": 29660}, {"loss": 0.6042, "grad_norm": 1.1050055027008057, "learning_rate": 0.0002, "epoch": 4.796701964271279, "step": 29670}, {"loss": 0.6089, "grad_norm": 0.9933857917785645, "learning_rate": 0.0002, "epoch": 4.798318648452025, "step": 29680}, {"loss": 0.6041, "grad_norm": 1.2804018259048462, "learning_rate": 0.0002, "epoch": 4.79993533263277, "step": 29690}, {"loss": 0.636, "grad_norm": 1.0133371353149414, "learning_rate": 0.0002, "epoch": 4.801552016813515, "step": 29700}, {"loss": 0.5662, "grad_norm": 1.080350637435913, "learning_rate": 0.0002, "epoch": 4.803168700994261, "step": 29710}, {"loss": 0.5603, "grad_norm": 0.9986529350280762, "learning_rate": 0.0002, "epoch": 4.804785385175006, "step": 29720}, {"loss": 0.5894, "grad_norm": 0.975665807723999, "learning_rate": 0.0002, "epoch": 4.806402069355752, "step": 29730}, {"loss": 0.6328, "grad_norm": 0.8458138704299927, "learning_rate": 0.0002, "epoch": 4.808018753536497, "step": 29740}, {"loss": 0.5837, "grad_norm": 0.99330073595047, "learning_rate": 0.0002, "epoch": 4.809635437717242, "step": 29750}, {"loss": 0.5507, "grad_norm": 0.898274302482605, "learning_rate": 0.0002, "epoch": 4.811252121897987, "step": 29760}, {"loss": 0.5842, "grad_norm": 1.0504480600357056, "learning_rate": 0.0002, "epoch": 4.812868806078733, "step": 29770}, {"loss": 0.5821, "grad_norm": 0.937919020652771, "learning_rate": 0.0002, "epoch": 4.814485490259478, "step": 29780}, {"loss": 0.5885, "grad_norm": 0.9593307971954346, "learning_rate": 0.0002, "epoch": 4.816102174440223, "step": 29790}, {"loss": 0.578, "grad_norm": 0.9431198835372925, "learning_rate": 0.0002, "epoch": 4.817718858620968, "step": 29800}, {"loss": 0.5739, "grad_norm": 1.2729957103729248, "learning_rate": 0.0002, "epoch": 4.8193355428017135, "step": 29810}, {"loss": 0.6124, "grad_norm": 0.8876838684082031, "learning_rate": 0.0002, "epoch": 4.820952226982459, "step": 29820}, {"loss": 0.5583, "grad_norm": 1.0185000896453857, "learning_rate": 0.0002, "epoch": 4.822568911163204, "step": 29830}, {"loss": 0.5686, "grad_norm": 1.064276099205017, "learning_rate": 0.0002, "epoch": 4.824185595343949, "step": 29840}, {"loss": 0.5698, "grad_norm": 0.9774803519248962, "learning_rate": 0.0002, "epoch": 4.825802279524694, "step": 29850}, {"loss": 0.5533, "grad_norm": 1.131646990776062, "learning_rate": 0.0002, "epoch": 4.8274189637054405, "step": 29860}, {"loss": 0.6371, "grad_norm": 1.081455945968628, "learning_rate": 0.0002, "epoch": 4.829035647886186, "step": 29870}, {"loss": 0.5793, "grad_norm": 0.990538477897644, "learning_rate": 0.0002, "epoch": 4.830652332066931, "step": 29880}, {"loss": 0.5833, "grad_norm": 0.9750600457191467, "learning_rate": 0.0002, "epoch": 4.832269016247676, "step": 29890}, {"loss": 0.619, "grad_norm": 1.0600621700286865, "learning_rate": 0.0002, "epoch": 4.8338857004284215, "step": 29900}, {"loss": 0.5841, "grad_norm": 0.9237320423126221, "learning_rate": 0.0002, "epoch": 4.835502384609167, "step": 29910}, {"loss": 0.5513, "grad_norm": 0.9739177227020264, "learning_rate": 0.0002, "epoch": 4.837119068789912, "step": 29920}, {"loss": 0.587, "grad_norm": 1.128677248954773, "learning_rate": 0.0002, "epoch": 4.838735752970657, "step": 29930}, {"loss": 0.564, "grad_norm": 1.042604923248291, "learning_rate": 0.0002, "epoch": 4.840352437151402, "step": 29940}, {"loss": 0.5885, "grad_norm": 0.849758505821228, "learning_rate": 0.0002, "epoch": 4.841969121332148, "step": 29950}, {"loss": 0.5952, "grad_norm": 1.2809888124465942, "learning_rate": 0.0002, "epoch": 4.843585805512893, "step": 29960}, {"loss": 0.5703, "grad_norm": 1.0177865028381348, "learning_rate": 0.0002, "epoch": 4.845202489693638, "step": 29970}, {"loss": 0.5946, "grad_norm": 1.0026639699935913, "learning_rate": 0.0002, "epoch": 4.846819173874383, "step": 29980}, {"loss": 0.5897, "grad_norm": 0.9679505228996277, "learning_rate": 0.0002, "epoch": 4.8484358580551286, "step": 29990}, {"loss": 0.5621, "grad_norm": 0.8939532041549683, "learning_rate": 0.0002, "epoch": 4.850052542235874, "step": 30000}, {"loss": 0.5852, "grad_norm": 0.9957457780838013, "learning_rate": 0.0002, "epoch": 4.85166922641662, "step": 30010}, {"loss": 0.6117, "grad_norm": 1.1646790504455566, "learning_rate": 0.0002, "epoch": 4.853285910597365, "step": 30020}, {"loss": 0.5711, "grad_norm": 0.8804680705070496, "learning_rate": 0.0002, "epoch": 4.85490259477811, "step": 30030}, {"loss": 0.5397, "grad_norm": 1.161970853805542, "learning_rate": 0.0002, "epoch": 4.856519278958856, "step": 30040}, {"loss": 0.5552, "grad_norm": 0.9081037640571594, "learning_rate": 0.0002, "epoch": 4.858135963139601, "step": 30050}, {"loss": 0.6024, "grad_norm": 0.9402848482131958, "learning_rate": 0.0002, "epoch": 4.859752647320346, "step": 30060}, {"loss": 0.6256, "grad_norm": 0.9023865461349487, "learning_rate": 0.0002, "epoch": 4.861369331501091, "step": 30070}, {"loss": 0.5926, "grad_norm": 1.0173414945602417, "learning_rate": 0.0002, "epoch": 4.8629860156818365, "step": 30080}, {"loss": 0.6274, "grad_norm": 1.084402322769165, "learning_rate": 0.0002, "epoch": 4.864602699862582, "step": 30090}, {"loss": 0.6311, "grad_norm": 0.9577937126159668, "learning_rate": 0.0002, "epoch": 4.866219384043327, "step": 30100}, {"loss": 0.5724, "grad_norm": 0.9807606935501099, "learning_rate": 0.0002, "epoch": 4.867836068224072, "step": 30110}, {"loss": 0.5786, "grad_norm": 0.978784441947937, "learning_rate": 0.0002, "epoch": 4.8694527524048175, "step": 30120}, {"loss": 0.6194, "grad_norm": 0.9762914776802063, "learning_rate": 0.0002, "epoch": 4.871069436585563, "step": 30130}, {"loss": 0.5892, "grad_norm": 0.9404871463775635, "learning_rate": 0.0002, "epoch": 4.872686120766308, "step": 30140}, {"loss": 0.6182, "grad_norm": 1.0069509744644165, "learning_rate": 0.0002, "epoch": 4.874302804947053, "step": 30150}, {"loss": 0.6225, "grad_norm": 1.1770923137664795, "learning_rate": 0.0002, "epoch": 4.875919489127799, "step": 30160}, {"loss": 0.5657, "grad_norm": 1.021210789680481, "learning_rate": 0.0002, "epoch": 4.8775361733085445, "step": 30170}, {"loss": 0.6033, "grad_norm": 0.8512648940086365, "learning_rate": 0.0002, "epoch": 4.87915285748929, "step": 30180}, {"loss": 0.5519, "grad_norm": 0.9345870018005371, "learning_rate": 0.0002, "epoch": 4.880769541670035, "step": 30190}, {"loss": 0.5682, "grad_norm": 1.0224418640136719, "learning_rate": 0.0002, "epoch": 4.88238622585078, "step": 30200}, {"loss": 0.5807, "grad_norm": 1.0316044092178345, "learning_rate": 0.0002, "epoch": 4.884002910031525, "step": 30210}, {"loss": 0.6065, "grad_norm": 1.102437973022461, "learning_rate": 0.0002, "epoch": 4.885619594212271, "step": 30220}, {"loss": 0.586, "grad_norm": 1.0220023393630981, "learning_rate": 0.0002, "epoch": 4.887236278393016, "step": 30230}, {"loss": 0.5781, "grad_norm": 1.0934523344039917, "learning_rate": 0.0002, "epoch": 4.888852962573761, "step": 30240}, {"loss": 0.6313, "grad_norm": 1.264630913734436, "learning_rate": 0.0002, "epoch": 4.890469646754506, "step": 30250}, {"loss": 0.5712, "grad_norm": 1.0999879837036133, "learning_rate": 0.0002, "epoch": 4.892086330935252, "step": 30260}, {"loss": 0.6413, "grad_norm": 0.9124550223350525, "learning_rate": 0.0002, "epoch": 4.893703015115997, "step": 30270}, {"loss": 0.596, "grad_norm": 0.9853624105453491, "learning_rate": 0.0002, "epoch": 4.895319699296742, "step": 30280}, {"loss": 0.595, "grad_norm": 1.0589802265167236, "learning_rate": 0.0002, "epoch": 4.896936383477488, "step": 30290}, {"loss": 0.6129, "grad_norm": 0.8487226366996765, "learning_rate": 0.0002, "epoch": 4.8985530676582325, "step": 30300}, {"loss": 0.5514, "grad_norm": 1.0212191343307495, "learning_rate": 0.0002, "epoch": 4.900169751838979, "step": 30310}, {"loss": 0.5896, "grad_norm": 1.0187491178512573, "learning_rate": 0.0002, "epoch": 4.901786436019724, "step": 30320}, {"loss": 0.5809, "grad_norm": 1.0013091564178467, "learning_rate": 0.0002, "epoch": 4.903403120200469, "step": 30330}, {"loss": 0.5658, "grad_norm": 1.0017542839050293, "learning_rate": 0.0002, "epoch": 4.905019804381214, "step": 30340}, {"loss": 0.6002, "grad_norm": 0.9665151238441467, "learning_rate": 0.0002, "epoch": 4.9066364885619596, "step": 30350}, {"loss": 0.5864, "grad_norm": 0.8774822950363159, "learning_rate": 0.0002, "epoch": 4.908253172742705, "step": 30360}, {"loss": 0.5771, "grad_norm": 0.9449850916862488, "learning_rate": 0.0002, "epoch": 4.90986985692345, "step": 30370}, {"loss": 0.58, "grad_norm": 0.7368341088294983, "learning_rate": 0.0002, "epoch": 4.911486541104195, "step": 30380}, {"loss": 0.5992, "grad_norm": 0.9669167995452881, "learning_rate": 0.0002, "epoch": 4.9131032252849405, "step": 30390}, {"loss": 0.6202, "grad_norm": 1.1227794885635376, "learning_rate": 0.0002, "epoch": 4.914719909465686, "step": 30400}, {"loss": 0.6181, "grad_norm": 0.9884361028671265, "learning_rate": 0.0002, "epoch": 4.916336593646431, "step": 30410}, {"loss": 0.6185, "grad_norm": 0.9949551224708557, "learning_rate": 0.0002, "epoch": 4.917953277827176, "step": 30420}, {"loss": 0.5866, "grad_norm": 0.9491621851921082, "learning_rate": 0.0002, "epoch": 4.919569962007921, "step": 30430}, {"loss": 0.6005, "grad_norm": 0.78848797082901, "learning_rate": 0.0002, "epoch": 4.9211866461886675, "step": 30440}, {"loss": 0.5561, "grad_norm": 1.0693835020065308, "learning_rate": 0.0002, "epoch": 4.922803330369412, "step": 30450}, {"loss": 0.566, "grad_norm": 0.9573729634284973, "learning_rate": 0.0002, "epoch": 4.924420014550158, "step": 30460}, {"loss": 0.6084, "grad_norm": 0.9975152611732483, "learning_rate": 0.0002, "epoch": 4.926036698730903, "step": 30470}, {"loss": 0.5969, "grad_norm": 0.8695693016052246, "learning_rate": 0.0002, "epoch": 4.9276533829116484, "step": 30480}, {"loss": 0.6144, "grad_norm": 1.145394206047058, "learning_rate": 0.0002, "epoch": 4.929270067092394, "step": 30490}, {"loss": 0.5736, "grad_norm": 0.7668989896774292, "learning_rate": 0.0002, "epoch": 4.930886751273139, "step": 30500}, {"loss": 0.6052, "grad_norm": 0.9630151391029358, "learning_rate": 0.0002, "epoch": 4.932503435453884, "step": 30510}, {"loss": 0.6461, "grad_norm": 0.940705418586731, "learning_rate": 0.0002, "epoch": 4.934120119634629, "step": 30520}, {"loss": 0.6326, "grad_norm": 1.3243348598480225, "learning_rate": 0.0002, "epoch": 4.935736803815375, "step": 30530}, {"loss": 0.6174, "grad_norm": 1.004347801208496, "learning_rate": 0.0002, "epoch": 4.93735348799612, "step": 30540}, {"loss": 0.583, "grad_norm": 0.8711541295051575, "learning_rate": 0.0002, "epoch": 4.938970172176865, "step": 30550}, {"loss": 0.599, "grad_norm": 0.8980631828308105, "learning_rate": 0.0002, "epoch": 4.94058685635761, "step": 30560}, {"loss": 0.6024, "grad_norm": 0.8388893604278564, "learning_rate": 0.0002, "epoch": 4.9422035405383555, "step": 30570}, {"loss": 0.6189, "grad_norm": 1.0991183519363403, "learning_rate": 0.0002, "epoch": 4.943820224719101, "step": 30580}, {"loss": 0.5906, "grad_norm": 0.9731075763702393, "learning_rate": 0.0002, "epoch": 4.945436908899847, "step": 30590}, {"loss": 0.5883, "grad_norm": 1.3904452323913574, "learning_rate": 0.0002, "epoch": 4.947053593080591, "step": 30600}, {"loss": 0.5952, "grad_norm": 1.2489882707595825, "learning_rate": 0.0002, "epoch": 4.948670277261337, "step": 30610}, {"loss": 0.5887, "grad_norm": 1.240072250366211, "learning_rate": 0.0002, "epoch": 4.950286961442083, "step": 30620}, {"loss": 0.5762, "grad_norm": 0.9191411733627319, "learning_rate": 0.0002, "epoch": 4.951903645622828, "step": 30630}, {"loss": 0.5597, "grad_norm": 0.8888895511627197, "learning_rate": 0.0002, "epoch": 4.953520329803573, "step": 30640}, {"loss": 0.6594, "grad_norm": 0.9001450538635254, "learning_rate": 0.0002, "epoch": 4.955137013984318, "step": 30650}, {"loss": 0.6047, "grad_norm": 1.053971767425537, "learning_rate": 0.0002, "epoch": 4.9567536981650635, "step": 30660}, {"loss": 0.6107, "grad_norm": 1.2224042415618896, "learning_rate": 0.0002, "epoch": 4.958370382345809, "step": 30670}, {"loss": 0.6211, "grad_norm": 0.8855111598968506, "learning_rate": 0.0002, "epoch": 4.959987066526554, "step": 30680}, {"loss": 0.5764, "grad_norm": 0.9489575624465942, "learning_rate": 0.0002, "epoch": 4.961603750707299, "step": 30690}, {"loss": 0.5371, "grad_norm": 0.9635404944419861, "learning_rate": 0.0002, "epoch": 4.963220434888044, "step": 30700}, {"loss": 0.6043, "grad_norm": 1.1784121990203857, "learning_rate": 0.0002, "epoch": 4.96483711906879, "step": 30710}, {"loss": 0.5803, "grad_norm": 1.0059462785720825, "learning_rate": 0.0002, "epoch": 4.966453803249535, "step": 30720}, {"loss": 0.5759, "grad_norm": 0.9479738473892212, "learning_rate": 0.0002, "epoch": 4.96807048743028, "step": 30730}, {"loss": 0.584, "grad_norm": 1.0624593496322632, "learning_rate": 0.0002, "epoch": 4.969687171611026, "step": 30740}, {"loss": 0.6202, "grad_norm": 1.1429259777069092, "learning_rate": 0.0002, "epoch": 4.971303855791771, "step": 30750}, {"loss": 0.6174, "grad_norm": 0.9102491140365601, "learning_rate": 0.0002, "epoch": 4.972920539972517, "step": 30760}, {"loss": 0.6025, "grad_norm": 1.1262688636779785, "learning_rate": 0.0002, "epoch": 4.974537224153262, "step": 30770}, {"loss": 0.588, "grad_norm": 1.1415393352508545, "learning_rate": 0.0002, "epoch": 4.976153908334007, "step": 30780}, {"loss": 0.5832, "grad_norm": 1.083078384399414, "learning_rate": 0.0002, "epoch": 4.977770592514752, "step": 30790}, {"loss": 0.6025, "grad_norm": 0.964859127998352, "learning_rate": 0.0002, "epoch": 4.979387276695498, "step": 30800}, {"loss": 0.6095, "grad_norm": 0.8704743385314941, "learning_rate": 0.0002, "epoch": 4.981003960876243, "step": 30810}, {"loss": 0.5666, "grad_norm": 1.0714856386184692, "learning_rate": 0.0002, "epoch": 4.982620645056988, "step": 30820}, {"loss": 0.565, "grad_norm": 0.6818771362304688, "learning_rate": 0.0002, "epoch": 4.984237329237733, "step": 30830}, {"loss": 0.5999, "grad_norm": 1.0454156398773193, "learning_rate": 0.0002, "epoch": 4.985854013418479, "step": 30840}, {"loss": 0.5683, "grad_norm": 0.9410776495933533, "learning_rate": 0.0002, "epoch": 4.987470697599224, "step": 30850}, {"loss": 0.5899, "grad_norm": 1.0878902673721313, "learning_rate": 0.0002, "epoch": 4.989087381779969, "step": 30860}, {"loss": 0.5914, "grad_norm": 0.8916727304458618, "learning_rate": 0.0002, "epoch": 4.990704065960714, "step": 30870}, {"loss": 0.6066, "grad_norm": 1.045776128768921, "learning_rate": 0.0002, "epoch": 4.9923207501414595, "step": 30880}, {"loss": 0.5767, "grad_norm": 0.9861903786659241, "learning_rate": 0.0002, "epoch": 4.993937434322206, "step": 30890}, {"loss": 0.6192, "grad_norm": 0.9275050759315491, "learning_rate": 0.0002, "epoch": 4.995554118502951, "step": 30900}, {"loss": 0.6181, "grad_norm": 0.94013911485672, "learning_rate": 0.0002, "epoch": 4.997170802683696, "step": 30910}, {"loss": 0.614, "grad_norm": 0.9771268367767334, "learning_rate": 0.0002, "epoch": 4.998787486864441, "step": 30920}, {"eval_loss": 1.1968598365783691, "eval_runtime": 122.2519, "eval_samples_per_second": 5.996, "eval_steps_per_second": 0.753, "epoch": 4.9999191657909625, "step": 30927}, {"loss": 0.5238, "grad_norm": 0.8021580576896667, "learning_rate": 0.0002, "epoch": 5.0004041710451865, "step": 30930}, {"loss": 0.4984, "grad_norm": 1.0807327032089233, "learning_rate": 0.0002, "epoch": 5.002020855225932, "step": 30940}, {"loss": 0.514, "grad_norm": 1.1638425588607788, "learning_rate": 0.0002, "epoch": 5.003637539406677, "step": 30950}, {"loss": 0.4621, "grad_norm": 1.1700230836868286, "learning_rate": 0.0002, "epoch": 5.005254223587422, "step": 30960}, {"loss": 0.4657, "grad_norm": 0.9053420424461365, "learning_rate": 0.0002, "epoch": 5.0068709077681675, "step": 30970}, {"loss": 0.4865, "grad_norm": 0.9226111769676208, "learning_rate": 0.0002, "epoch": 5.008487591948913, "step": 30980}, {"loss": 0.5011, "grad_norm": 1.238669514656067, "learning_rate": 0.0002, "epoch": 5.010104276129658, "step": 30990}, {"loss": 0.4754, "grad_norm": 1.0668327808380127, "learning_rate": 0.0002, "epoch": 5.011720960310403, "step": 31000}, {"loss": 0.5414, "grad_norm": 1.0903944969177246, "learning_rate": 0.0002, "epoch": 5.013337644491148, "step": 31010}, {"loss": 0.5117, "grad_norm": 1.0763911008834839, "learning_rate": 0.0002, "epoch": 5.014954328671894, "step": 31020}, {"loss": 0.4908, "grad_norm": 1.0108771324157715, "learning_rate": 0.0002, "epoch": 5.016571012852639, "step": 31030}, {"loss": 0.5052, "grad_norm": 0.8816103935241699, "learning_rate": 0.0002, "epoch": 5.018187697033385, "step": 31040}, {"loss": 0.4985, "grad_norm": 1.11434805393219, "learning_rate": 0.0002, "epoch": 5.01980438121413, "step": 31050}, {"loss": 0.5074, "grad_norm": 1.0727789402008057, "learning_rate": 0.0002, "epoch": 5.021421065394875, "step": 31060}, {"loss": 0.4938, "grad_norm": 1.1480379104614258, "learning_rate": 0.0002, "epoch": 5.023037749575621, "step": 31070}, {"loss": 0.491, "grad_norm": 1.0913071632385254, "learning_rate": 0.0002, "epoch": 5.024654433756366, "step": 31080}, {"loss": 0.4896, "grad_norm": 0.9891864657402039, "learning_rate": 0.0002, "epoch": 5.026271117937111, "step": 31090}, {"loss": 0.4965, "grad_norm": 0.9167473912239075, "learning_rate": 0.0002, "epoch": 5.027887802117856, "step": 31100}, {"loss": 0.5098, "grad_norm": 1.2259035110473633, "learning_rate": 0.0002, "epoch": 5.029504486298602, "step": 31110}, {"loss": 0.5206, "grad_norm": 1.1812787055969238, "learning_rate": 0.0002, "epoch": 5.031121170479347, "step": 31120}, {"loss": 0.4725, "grad_norm": 1.0890522003173828, "learning_rate": 0.0002, "epoch": 5.032737854660092, "step": 31130}, {"loss": 0.4768, "grad_norm": 1.0521091222763062, "learning_rate": 0.0002, "epoch": 5.034354538840837, "step": 31140}, {"loss": 0.4718, "grad_norm": 1.1274569034576416, "learning_rate": 0.0002, "epoch": 5.0359712230215825, "step": 31150}, {"loss": 0.4604, "grad_norm": 1.140974998474121, "learning_rate": 0.0002, "epoch": 5.037587907202328, "step": 31160}, {"loss": 0.5077, "grad_norm": 1.1215609312057495, "learning_rate": 0.0002, "epoch": 5.039204591383073, "step": 31170}, {"loss": 0.4746, "grad_norm": 1.0107218027114868, "learning_rate": 0.0002, "epoch": 5.040821275563818, "step": 31180}, {"loss": 0.5126, "grad_norm": 1.0198770761489868, "learning_rate": 0.0002, "epoch": 5.042437959744564, "step": 31190}, {"loss": 0.5004, "grad_norm": 1.1613430976867676, "learning_rate": 0.0002, "epoch": 5.0440546439253096, "step": 31200}, {"loss": 0.5181, "grad_norm": 0.8555458188056946, "learning_rate": 0.0002, "epoch": 5.045671328106055, "step": 31210}, {"loss": 0.4878, "grad_norm": 1.0235545635223389, "learning_rate": 0.0002, "epoch": 5.0472880122868, "step": 31220}, {"loss": 0.499, "grad_norm": 1.0228750705718994, "learning_rate": 0.0002, "epoch": 5.048904696467545, "step": 31230}, {"loss": 0.4544, "grad_norm": 0.8216419816017151, "learning_rate": 0.0002, "epoch": 5.0505213806482905, "step": 31240}, {"loss": 0.4947, "grad_norm": 0.925828218460083, "learning_rate": 0.0002, "epoch": 5.052138064829036, "step": 31250}, {"loss": 0.4835, "grad_norm": 0.9229369759559631, "learning_rate": 0.0002, "epoch": 5.053754749009781, "step": 31260}, {"loss": 0.5136, "grad_norm": 0.9531727433204651, "learning_rate": 0.0002, "epoch": 5.055371433190526, "step": 31270}, {"loss": 0.5161, "grad_norm": 0.7738548517227173, "learning_rate": 0.0002, "epoch": 5.056988117371271, "step": 31280}, {"loss": 0.5166, "grad_norm": 1.0551451444625854, "learning_rate": 0.0002, "epoch": 5.058604801552017, "step": 31290}, {"loss": 0.4953, "grad_norm": 0.9782299399375916, "learning_rate": 0.0002, "epoch": 5.060221485732762, "step": 31300}, {"loss": 0.4776, "grad_norm": 1.0220632553100586, "learning_rate": 0.0002, "epoch": 5.061838169913507, "step": 31310}, {"loss": 0.5117, "grad_norm": 0.9808892607688904, "learning_rate": 0.0002, "epoch": 5.063454854094252, "step": 31320}, {"loss": 0.501, "grad_norm": 1.0662003755569458, "learning_rate": 0.0002, "epoch": 5.065071538274998, "step": 31330}, {"loss": 0.4844, "grad_norm": 1.0036940574645996, "learning_rate": 0.0002, "epoch": 5.066688222455744, "step": 31340}, {"loss": 0.5299, "grad_norm": 1.1931052207946777, "learning_rate": 0.0002, "epoch": 5.068304906636489, "step": 31350}, {"loss": 0.4646, "grad_norm": 0.9370693564414978, "learning_rate": 0.0002, "epoch": 5.069921590817234, "step": 31360}, {"loss": 0.5274, "grad_norm": 0.9589039087295532, "learning_rate": 0.0002, "epoch": 5.071538274997979, "step": 31370}, {"loss": 0.4669, "grad_norm": 1.0052711963653564, "learning_rate": 0.0002, "epoch": 5.073154959178725, "step": 31380}, {"loss": 0.5283, "grad_norm": 0.9991368651390076, "learning_rate": 0.0002, "epoch": 5.07477164335947, "step": 31390}, {"loss": 0.4579, "grad_norm": 0.8539695739746094, "learning_rate": 0.0002, "epoch": 5.076388327540215, "step": 31400}, {"loss": 0.4609, "grad_norm": 1.048775553703308, "learning_rate": 0.0002, "epoch": 5.07800501172096, "step": 31410}, {"loss": 0.4915, "grad_norm": 0.9983724355697632, "learning_rate": 0.0002, "epoch": 5.0796216959017055, "step": 31420}, {"loss": 0.4594, "grad_norm": 1.0189813375473022, "learning_rate": 0.0002, "epoch": 5.081238380082451, "step": 31430}, {"loss": 0.5449, "grad_norm": 0.9781646728515625, "learning_rate": 0.0002, "epoch": 5.082855064263196, "step": 31440}, {"loss": 0.4698, "grad_norm": 0.9424566030502319, "learning_rate": 0.0002, "epoch": 5.084471748443941, "step": 31450}, {"loss": 0.4768, "grad_norm": 1.0036484003067017, "learning_rate": 0.0002, "epoch": 5.0860884326246865, "step": 31460}, {"loss": 0.487, "grad_norm": 1.0983147621154785, "learning_rate": 0.0002, "epoch": 5.087705116805432, "step": 31470}, {"loss": 0.5236, "grad_norm": 1.0856730937957764, "learning_rate": 0.0002, "epoch": 5.089321800986177, "step": 31480}, {"loss": 0.485, "grad_norm": 1.2191699743270874, "learning_rate": 0.0002, "epoch": 5.090938485166923, "step": 31490}, {"loss": 0.4936, "grad_norm": 0.939346194267273, "learning_rate": 0.0002, "epoch": 5.092555169347668, "step": 31500}, {"loss": 0.5107, "grad_norm": 0.9730121493339539, "learning_rate": 0.0002, "epoch": 5.0941718535284135, "step": 31510}, {"loss": 0.4973, "grad_norm": 0.923686146736145, "learning_rate": 0.0002, "epoch": 5.095788537709159, "step": 31520}, {"loss": 0.4906, "grad_norm": 1.1734349727630615, "learning_rate": 0.0002, "epoch": 5.097405221889904, "step": 31530}, {"loss": 0.5165, "grad_norm": 1.084509015083313, "learning_rate": 0.0002, "epoch": 5.099021906070649, "step": 31540}, {"loss": 0.5078, "grad_norm": 1.0144678354263306, "learning_rate": 0.0002, "epoch": 5.100638590251394, "step": 31550}, {"loss": 0.4719, "grad_norm": 0.9958019256591797, "learning_rate": 0.0002, "epoch": 5.10225527443214, "step": 31560}, {"loss": 0.4876, "grad_norm": 0.8900736570358276, "learning_rate": 0.0002, "epoch": 5.103871958612885, "step": 31570}, {"loss": 0.463, "grad_norm": 1.0921649932861328, "learning_rate": 0.0002, "epoch": 5.10548864279363, "step": 31580}, {"loss": 0.5148, "grad_norm": 1.1613792181015015, "learning_rate": 0.0002, "epoch": 5.107105326974375, "step": 31590}, {"loss": 0.5055, "grad_norm": 0.9211367964744568, "learning_rate": 0.0002, "epoch": 5.108722011155121, "step": 31600}, {"loss": 0.5364, "grad_norm": 1.3315813541412354, "learning_rate": 0.0002, "epoch": 5.110338695335866, "step": 31610}, {"loss": 0.5336, "grad_norm": 1.3765019178390503, "learning_rate": 0.0002, "epoch": 5.111955379516611, "step": 31620}, {"loss": 0.4861, "grad_norm": 1.070198893547058, "learning_rate": 0.0002, "epoch": 5.113572063697356, "step": 31630}, {"loss": 0.5046, "grad_norm": 0.947631299495697, "learning_rate": 0.0002, "epoch": 5.115188747878102, "step": 31640}, {"loss": 0.5297, "grad_norm": 1.0197371244430542, "learning_rate": 0.0002, "epoch": 5.116805432058848, "step": 31650}, {"loss": 0.5014, "grad_norm": 0.8647911548614502, "learning_rate": 0.0002, "epoch": 5.118422116239593, "step": 31660}, {"loss": 0.4705, "grad_norm": 0.8944075107574463, "learning_rate": 0.0002, "epoch": 5.120038800420338, "step": 31670}, {"loss": 0.5175, "grad_norm": 1.124497652053833, "learning_rate": 0.0002, "epoch": 5.121655484601083, "step": 31680}, {"loss": 0.5109, "grad_norm": 0.893131673336029, "learning_rate": 0.0002, "epoch": 5.123272168781829, "step": 31690}, {"loss": 0.4937, "grad_norm": 1.0122284889221191, "learning_rate": 0.0002, "epoch": 5.124888852962574, "step": 31700}, {"loss": 0.5522, "grad_norm": 0.9493719935417175, "learning_rate": 0.0002, "epoch": 5.126505537143319, "step": 31710}, {"loss": 0.5031, "grad_norm": 0.9700539112091064, "learning_rate": 0.0002, "epoch": 5.128122221324064, "step": 31720}, {"loss": 0.5126, "grad_norm": 1.111677646636963, "learning_rate": 0.0002, "epoch": 5.1297389055048095, "step": 31730}, {"loss": 0.5272, "grad_norm": 0.8204274773597717, "learning_rate": 0.0002, "epoch": 5.131355589685555, "step": 31740}, {"loss": 0.5029, "grad_norm": 1.1029267311096191, "learning_rate": 0.0002, "epoch": 5.1329722738663, "step": 31750}, {"loss": 0.505, "grad_norm": 1.065575122833252, "learning_rate": 0.0002, "epoch": 5.134588958047045, "step": 31760}, {"loss": 0.502, "grad_norm": 0.8208706974983215, "learning_rate": 0.0002, "epoch": 5.13620564222779, "step": 31770}, {"loss": 0.5352, "grad_norm": 1.0520979166030884, "learning_rate": 0.0002, "epoch": 5.137822326408536, "step": 31780}, {"loss": 0.4911, "grad_norm": 0.8585538268089294, "learning_rate": 0.0002, "epoch": 5.139439010589282, "step": 31790}, {"loss": 0.5159, "grad_norm": 1.1491447687149048, "learning_rate": 0.0002, "epoch": 5.141055694770027, "step": 31800}, {"loss": 0.5157, "grad_norm": 0.9441081285476685, "learning_rate": 0.0002, "epoch": 5.142672378950772, "step": 31810}, {"loss": 0.5383, "grad_norm": 1.4146889448165894, "learning_rate": 0.0002, "epoch": 5.1442890631315175, "step": 31820}, {"loss": 0.5159, "grad_norm": 1.0326547622680664, "learning_rate": 0.0002, "epoch": 5.145905747312263, "step": 31830}, {"loss": 0.5348, "grad_norm": 0.9879202842712402, "learning_rate": 0.0002, "epoch": 5.147522431493008, "step": 31840}, {"loss": 0.5083, "grad_norm": 1.0374281406402588, "learning_rate": 0.0002, "epoch": 5.149139115673753, "step": 31850}, {"loss": 0.4827, "grad_norm": 1.181229591369629, "learning_rate": 0.0002, "epoch": 5.150755799854498, "step": 31860}, {"loss": 0.5313, "grad_norm": 1.2078537940979004, "learning_rate": 0.0002, "epoch": 5.152372484035244, "step": 31870}, {"loss": 0.5329, "grad_norm": 0.9599190354347229, "learning_rate": 0.0002, "epoch": 5.153989168215989, "step": 31880}, {"loss": 0.4953, "grad_norm": 1.0378568172454834, "learning_rate": 0.0002, "epoch": 5.155605852396734, "step": 31890}, {"loss": 0.5069, "grad_norm": 0.8746536374092102, "learning_rate": 0.0002, "epoch": 5.157222536577479, "step": 31900}, {"loss": 0.5272, "grad_norm": 1.0232136249542236, "learning_rate": 0.0002, "epoch": 5.1588392207582245, "step": 31910}, {"loss": 0.4844, "grad_norm": 0.9827565550804138, "learning_rate": 0.0002, "epoch": 5.16045590493897, "step": 31920}, {"loss": 0.5029, "grad_norm": 1.342657208442688, "learning_rate": 0.0002, "epoch": 5.162072589119716, "step": 31930}, {"loss": 0.513, "grad_norm": 1.18390691280365, "learning_rate": 0.0002, "epoch": 5.163689273300461, "step": 31940}, {"loss": 0.5267, "grad_norm": 0.996350109577179, "learning_rate": 0.0002, "epoch": 5.165305957481206, "step": 31950}, {"loss": 0.5063, "grad_norm": 0.9710391163825989, "learning_rate": 0.0002, "epoch": 5.166922641661952, "step": 31960}, {"loss": 0.5115, "grad_norm": 1.0264002084732056, "learning_rate": 0.0002, "epoch": 5.168539325842697, "step": 31970}, {"loss": 0.4972, "grad_norm": 1.0028311014175415, "learning_rate": 0.0002, "epoch": 5.170156010023442, "step": 31980}, {"loss": 0.5103, "grad_norm": 1.1078234910964966, "learning_rate": 0.0002, "epoch": 5.171772694204187, "step": 31990}, {"loss": 0.495, "grad_norm": 0.9659610390663147, "learning_rate": 0.0002, "epoch": 5.1733893783849325, "step": 32000}, {"loss": 0.5114, "grad_norm": 0.841986894607544, "learning_rate": 0.0002, "epoch": 5.175006062565678, "step": 32010}, {"loss": 0.48, "grad_norm": 1.095332384109497, "learning_rate": 0.0002, "epoch": 5.176622746746423, "step": 32020}, {"loss": 0.4741, "grad_norm": 1.1242377758026123, "learning_rate": 0.0002, "epoch": 5.178239430927168, "step": 32030}, {"loss": 0.5573, "grad_norm": 0.9872292280197144, "learning_rate": 0.0002, "epoch": 5.179856115107913, "step": 32040}, {"loss": 0.48, "grad_norm": 0.936161994934082, "learning_rate": 0.0002, "epoch": 5.181472799288659, "step": 32050}, {"loss": 0.5093, "grad_norm": 1.166100025177002, "learning_rate": 0.0002, "epoch": 5.183089483469404, "step": 32060}, {"loss": 0.5438, "grad_norm": 1.0764425992965698, "learning_rate": 0.0002, "epoch": 5.184706167650149, "step": 32070}, {"loss": 0.4843, "grad_norm": 1.0480051040649414, "learning_rate": 0.0002, "epoch": 5.186322851830895, "step": 32080}, {"loss": 0.5386, "grad_norm": 1.0874916315078735, "learning_rate": 0.0002, "epoch": 5.1879395360116405, "step": 32090}, {"loss": 0.4975, "grad_norm": 1.0817396640777588, "learning_rate": 0.0002, "epoch": 5.189556220192386, "step": 32100}, {"loss": 0.5177, "grad_norm": 1.054111361503601, "learning_rate": 0.0002, "epoch": 5.191172904373131, "step": 32110}, {"loss": 0.5229, "grad_norm": 0.9655823707580566, "learning_rate": 0.0002, "epoch": 5.192789588553876, "step": 32120}, {"loss": 0.5105, "grad_norm": 1.1384109258651733, "learning_rate": 0.0002, "epoch": 5.194406272734621, "step": 32130}, {"loss": 0.5073, "grad_norm": 1.0149348974227905, "learning_rate": 0.0002, "epoch": 5.196022956915367, "step": 32140}, {"loss": 0.5293, "grad_norm": 1.1084046363830566, "learning_rate": 0.0002, "epoch": 5.197639641096112, "step": 32150}, {"loss": 0.4936, "grad_norm": 1.1209309101104736, "learning_rate": 0.0002, "epoch": 5.199256325276857, "step": 32160}, {"loss": 0.5101, "grad_norm": 1.133089542388916, "learning_rate": 0.0002, "epoch": 5.200873009457602, "step": 32170}, {"loss": 0.5242, "grad_norm": 1.0893020629882812, "learning_rate": 0.0002, "epoch": 5.202489693638348, "step": 32180}, {"loss": 0.4872, "grad_norm": 0.90018630027771, "learning_rate": 0.0002, "epoch": 5.204106377819093, "step": 32190}, {"loss": 0.4999, "grad_norm": 0.977622926235199, "learning_rate": 0.0002, "epoch": 5.205723061999838, "step": 32200}, {"loss": 0.5028, "grad_norm": 1.2940177917480469, "learning_rate": 0.0002, "epoch": 5.207339746180583, "step": 32210}, {"loss": 0.5396, "grad_norm": 1.2131710052490234, "learning_rate": 0.0002, "epoch": 5.2089564303613285, "step": 32220}, {"loss": 0.5189, "grad_norm": 1.0234841108322144, "learning_rate": 0.0002, "epoch": 5.210573114542075, "step": 32230}, {"loss": 0.5424, "grad_norm": 1.157975435256958, "learning_rate": 0.0002, "epoch": 5.21218979872282, "step": 32240}, {"loss": 0.5396, "grad_norm": 1.0381282567977905, "learning_rate": 0.0002, "epoch": 5.213806482903565, "step": 32250}, {"loss": 0.5192, "grad_norm": 1.0125395059585571, "learning_rate": 0.0002, "epoch": 5.21542316708431, "step": 32260}, {"loss": 0.5216, "grad_norm": 1.272691011428833, "learning_rate": 0.0002, "epoch": 5.2170398512650555, "step": 32270}, {"loss": 0.52, "grad_norm": 1.0061250925064087, "learning_rate": 0.0002, "epoch": 5.218656535445801, "step": 32280}, {"loss": 0.4739, "grad_norm": 0.9752234816551208, "learning_rate": 0.0002, "epoch": 5.220273219626546, "step": 32290}, {"loss": 0.5471, "grad_norm": 1.1193140745162964, "learning_rate": 0.0002, "epoch": 5.221889903807291, "step": 32300}, {"loss": 0.4976, "grad_norm": 1.0126434564590454, "learning_rate": 0.0002, "epoch": 5.2235065879880365, "step": 32310}, {"loss": 0.5257, "grad_norm": 1.4338394403457642, "learning_rate": 0.0002, "epoch": 5.225123272168782, "step": 32320}, {"loss": 0.5235, "grad_norm": 1.004101276397705, "learning_rate": 0.0002, "epoch": 5.226739956349527, "step": 32330}, {"loss": 0.5091, "grad_norm": 0.8744166493415833, "learning_rate": 0.0002, "epoch": 5.228356640530272, "step": 32340}, {"loss": 0.5388, "grad_norm": 1.0165376663208008, "learning_rate": 0.0002, "epoch": 5.229973324711017, "step": 32350}, {"loss": 0.5469, "grad_norm": 0.8635954260826111, "learning_rate": 0.0002, "epoch": 5.231590008891763, "step": 32360}, {"loss": 0.5609, "grad_norm": 1.1392399072647095, "learning_rate": 0.0002, "epoch": 5.233206693072509, "step": 32370}, {"loss": 0.5173, "grad_norm": 1.0202113389968872, "learning_rate": 0.0002, "epoch": 5.234823377253254, "step": 32380}, {"loss": 0.4983, "grad_norm": 1.0417983531951904, "learning_rate": 0.0002, "epoch": 5.236440061433999, "step": 32390}, {"loss": 0.507, "grad_norm": 0.8729333877563477, "learning_rate": 0.0002, "epoch": 5.238056745614744, "step": 32400}, {"loss": 0.5426, "grad_norm": 1.1626229286193848, "learning_rate": 0.0002, "epoch": 5.23967342979549, "step": 32410}, {"loss": 0.5355, "grad_norm": 0.9086161851882935, "learning_rate": 0.0002, "epoch": 5.241290113976235, "step": 32420}, {"loss": 0.4927, "grad_norm": 1.3999892473220825, "learning_rate": 0.0002, "epoch": 5.24290679815698, "step": 32430}, {"loss": 0.4795, "grad_norm": 1.0356311798095703, "learning_rate": 0.0002, "epoch": 5.244523482337725, "step": 32440}, {"loss": 0.5035, "grad_norm": 0.9655531644821167, "learning_rate": 0.0002, "epoch": 5.246140166518471, "step": 32450}, {"loss": 0.5166, "grad_norm": 1.0411828756332397, "learning_rate": 0.0002, "epoch": 5.247756850699216, "step": 32460}, {"loss": 0.5141, "grad_norm": 1.1199816465377808, "learning_rate": 0.0002, "epoch": 5.249373534879961, "step": 32470}, {"loss": 0.4864, "grad_norm": 1.260321855545044, "learning_rate": 0.0002, "epoch": 5.250990219060706, "step": 32480}, {"loss": 0.4893, "grad_norm": 1.2950857877731323, "learning_rate": 0.0002, "epoch": 5.2526069032414515, "step": 32490}, {"loss": 0.4952, "grad_norm": 0.8982820510864258, "learning_rate": 0.0002, "epoch": 5.254223587422197, "step": 32500}, {"loss": 0.5138, "grad_norm": 0.8512987494468689, "learning_rate": 0.0002, "epoch": 5.255840271602942, "step": 32510}, {"loss": 0.5341, "grad_norm": 1.067443609237671, "learning_rate": 0.0002, "epoch": 5.257456955783688, "step": 32520}, {"loss": 0.4928, "grad_norm": 1.0957417488098145, "learning_rate": 0.0002, "epoch": 5.259073639964433, "step": 32530}, {"loss": 0.5169, "grad_norm": 1.4161807298660278, "learning_rate": 0.0002, "epoch": 5.260690324145179, "step": 32540}, {"loss": 0.5599, "grad_norm": 1.2264093160629272, "learning_rate": 0.0002, "epoch": 5.262307008325924, "step": 32550}, {"loss": 0.5221, "grad_norm": 1.0015931129455566, "learning_rate": 0.0002, "epoch": 5.263923692506669, "step": 32560}, {"loss": 0.5253, "grad_norm": 1.0743094682693481, "learning_rate": 0.0002, "epoch": 5.265540376687414, "step": 32570}, {"loss": 0.5289, "grad_norm": 1.1386840343475342, "learning_rate": 0.0002, "epoch": 5.2671570608681595, "step": 32580}, {"loss": 0.5315, "grad_norm": 1.0093860626220703, "learning_rate": 0.0002, "epoch": 5.268773745048905, "step": 32590}, {"loss": 0.5175, "grad_norm": 0.9593744874000549, "learning_rate": 0.0002, "epoch": 5.27039042922965, "step": 32600}, {"loss": 0.528, "grad_norm": 1.146021842956543, "learning_rate": 0.0002, "epoch": 5.272007113410395, "step": 32610}, {"loss": 0.4983, "grad_norm": 0.9579031467437744, "learning_rate": 0.0002, "epoch": 5.27362379759114, "step": 32620}, {"loss": 0.5376, "grad_norm": 1.0548793077468872, "learning_rate": 0.0002, "epoch": 5.275240481771886, "step": 32630}, {"loss": 0.5267, "grad_norm": 1.0380561351776123, "learning_rate": 0.0002, "epoch": 5.276857165952631, "step": 32640}, {"loss": 0.5182, "grad_norm": 1.2119969129562378, "learning_rate": 0.0002, "epoch": 5.278473850133376, "step": 32650}, {"loss": 0.5298, "grad_norm": 1.0507797002792358, "learning_rate": 0.0002, "epoch": 5.280090534314121, "step": 32660}, {"loss": 0.5253, "grad_norm": 1.0185176134109497, "learning_rate": 0.0002, "epoch": 5.2817072184948675, "step": 32670}, {"loss": 0.4904, "grad_norm": 1.2358098030090332, "learning_rate": 0.0002, "epoch": 5.283323902675613, "step": 32680}, {"loss": 0.5169, "grad_norm": 0.7937114238739014, "learning_rate": 0.0002, "epoch": 5.284940586856358, "step": 32690}, {"loss": 0.495, "grad_norm": 0.9825124740600586, "learning_rate": 0.0002, "epoch": 5.286557271037103, "step": 32700}, {"loss": 0.5149, "grad_norm": 1.2059301137924194, "learning_rate": 0.0002, "epoch": 5.288173955217848, "step": 32710}, {"loss": 0.5272, "grad_norm": 1.0828571319580078, "learning_rate": 0.0002, "epoch": 5.289790639398594, "step": 32720}, {"loss": 0.5383, "grad_norm": 1.0129735469818115, "learning_rate": 0.0002, "epoch": 5.291407323579339, "step": 32730}, {"loss": 0.5216, "grad_norm": 1.0591634511947632, "learning_rate": 0.0002, "epoch": 5.293024007760084, "step": 32740}, {"loss": 0.522, "grad_norm": 0.9256815910339355, "learning_rate": 0.0002, "epoch": 5.294640691940829, "step": 32750}, {"loss": 0.5396, "grad_norm": 1.0928633213043213, "learning_rate": 0.0002, "epoch": 5.2962573761215745, "step": 32760}, {"loss": 0.5093, "grad_norm": 0.9415594935417175, "learning_rate": 0.0002, "epoch": 5.29787406030232, "step": 32770}, {"loss": 0.5252, "grad_norm": 1.141316294670105, "learning_rate": 0.0002, "epoch": 5.299490744483065, "step": 32780}, {"loss": 0.4837, "grad_norm": 1.0646510124206543, "learning_rate": 0.0002, "epoch": 5.30110742866381, "step": 32790}, {"loss": 0.5547, "grad_norm": 1.189661979675293, "learning_rate": 0.0002, "epoch": 5.3027241128445555, "step": 32800}, {"loss": 0.5664, "grad_norm": 0.9568731188774109, "learning_rate": 0.0002, "epoch": 5.304340797025301, "step": 32810}, {"loss": 0.5344, "grad_norm": 1.1556824445724487, "learning_rate": 0.0002, "epoch": 5.305957481206047, "step": 32820}, {"loss": 0.4894, "grad_norm": 0.9353463649749756, "learning_rate": 0.0002, "epoch": 5.307574165386792, "step": 32830}, {"loss": 0.5052, "grad_norm": 1.1208295822143555, "learning_rate": 0.0002, "epoch": 5.309190849567537, "step": 32840}, {"loss": 0.5126, "grad_norm": 1.0894153118133545, "learning_rate": 0.0002, "epoch": 5.3108075337482825, "step": 32850}, {"loss": 0.5046, "grad_norm": 1.090329647064209, "learning_rate": 0.0002, "epoch": 5.312424217929028, "step": 32860}, {"loss": 0.5237, "grad_norm": 1.0781712532043457, "learning_rate": 0.0002, "epoch": 5.314040902109773, "step": 32870}, {"loss": 0.57, "grad_norm": 1.1785295009613037, "learning_rate": 0.0002, "epoch": 5.315657586290518, "step": 32880}, {"loss": 0.4953, "grad_norm": 1.0406851768493652, "learning_rate": 0.0002, "epoch": 5.317274270471263, "step": 32890}, {"loss": 0.514, "grad_norm": 1.0982953310012817, "learning_rate": 0.0002, "epoch": 5.318890954652009, "step": 32900}, {"loss": 0.4944, "grad_norm": 1.2969383001327515, "learning_rate": 0.0002, "epoch": 5.320507638832754, "step": 32910}, {"loss": 0.4786, "grad_norm": 0.9687288999557495, "learning_rate": 0.0002, "epoch": 5.322124323013499, "step": 32920}, {"loss": 0.5286, "grad_norm": 1.136760950088501, "learning_rate": 0.0002, "epoch": 5.323741007194244, "step": 32930}, {"loss": 0.5321, "grad_norm": 1.3045495748519897, "learning_rate": 0.0002, "epoch": 5.32535769137499, "step": 32940}, {"loss": 0.5413, "grad_norm": 1.221675992012024, "learning_rate": 0.0002, "epoch": 5.326974375555735, "step": 32950}, {"loss": 0.4999, "grad_norm": 1.1380633115768433, "learning_rate": 0.0002, "epoch": 5.32859105973648, "step": 32960}, {"loss": 0.5037, "grad_norm": 1.1065956354141235, "learning_rate": 0.0002, "epoch": 5.330207743917226, "step": 32970}, {"loss": 0.4913, "grad_norm": 1.0187175273895264, "learning_rate": 0.0002, "epoch": 5.331824428097971, "step": 32980}, {"loss": 0.5234, "grad_norm": 0.9077118039131165, "learning_rate": 0.0002, "epoch": 5.333441112278717, "step": 32990}, {"loss": 0.5071, "grad_norm": 1.0092815160751343, "learning_rate": 0.0002, "epoch": 5.335057796459462, "step": 33000}, {"loss": 0.498, "grad_norm": 1.0168777704238892, "learning_rate": 0.0002, "epoch": 5.336674480640207, "step": 33010}, {"loss": 0.4952, "grad_norm": 0.996161937713623, "learning_rate": 0.0002, "epoch": 5.338291164820952, "step": 33020}, {"loss": 0.5024, "grad_norm": 0.794463038444519, "learning_rate": 0.0002, "epoch": 5.339907849001698, "step": 33030}, {"loss": 0.5112, "grad_norm": 0.9750674962997437, "learning_rate": 0.0002, "epoch": 5.341524533182443, "step": 33040}, {"loss": 0.528, "grad_norm": 1.2770029306411743, "learning_rate": 0.0002, "epoch": 5.343141217363188, "step": 33050}, {"loss": 0.52, "grad_norm": 1.1500186920166016, "learning_rate": 0.0002, "epoch": 5.344757901543933, "step": 33060}, {"loss": 0.4906, "grad_norm": 1.0726377964019775, "learning_rate": 0.0002, "epoch": 5.3463745857246785, "step": 33070}, {"loss": 0.5212, "grad_norm": 0.9314153790473938, "learning_rate": 0.0002, "epoch": 5.347991269905424, "step": 33080}, {"loss": 0.5434, "grad_norm": 1.344988465309143, "learning_rate": 0.0002, "epoch": 5.349607954086169, "step": 33090}, {"loss": 0.4874, "grad_norm": 0.863196611404419, "learning_rate": 0.0002, "epoch": 5.351224638266914, "step": 33100}, {"loss": 0.534, "grad_norm": 1.128100037574768, "learning_rate": 0.0002, "epoch": 5.352841322447659, "step": 33110}, {"loss": 0.5293, "grad_norm": 1.1673583984375, "learning_rate": 0.0002, "epoch": 5.3544580066284055, "step": 33120}, {"loss": 0.4787, "grad_norm": 0.9416789412498474, "learning_rate": 0.0002, "epoch": 5.356074690809151, "step": 33130}, {"loss": 0.5155, "grad_norm": 1.1855236291885376, "learning_rate": 0.0002, "epoch": 5.357691374989896, "step": 33140}, {"loss": 0.515, "grad_norm": 1.0415170192718506, "learning_rate": 0.0002, "epoch": 5.359308059170641, "step": 33150}, {"loss": 0.545, "grad_norm": 0.9953004121780396, "learning_rate": 0.0002, "epoch": 5.3609247433513865, "step": 33160}, {"loss": 0.5305, "grad_norm": 0.96138596534729, "learning_rate": 0.0002, "epoch": 5.362541427532132, "step": 33170}, {"loss": 0.5064, "grad_norm": 1.341979742050171, "learning_rate": 0.0002, "epoch": 5.364158111712877, "step": 33180}, {"loss": 0.4986, "grad_norm": 1.0136911869049072, "learning_rate": 0.0002, "epoch": 5.365774795893622, "step": 33190}, {"loss": 0.5459, "grad_norm": 0.8685575127601624, "learning_rate": 0.0002, "epoch": 5.367391480074367, "step": 33200}, {"loss": 0.5146, "grad_norm": 0.8833574652671814, "learning_rate": 0.0002, "epoch": 5.369008164255113, "step": 33210}, {"loss": 0.4982, "grad_norm": 0.9123612642288208, "learning_rate": 0.0002, "epoch": 5.370624848435858, "step": 33220}, {"loss": 0.5047, "grad_norm": 1.2720599174499512, "learning_rate": 0.0002, "epoch": 5.372241532616603, "step": 33230}, {"loss": 0.5175, "grad_norm": 1.0596648454666138, "learning_rate": 0.0002, "epoch": 5.373858216797348, "step": 33240}, {"loss": 0.5284, "grad_norm": 1.119701623916626, "learning_rate": 0.0002, "epoch": 5.3754749009780936, "step": 33250}, {"loss": 0.5217, "grad_norm": 1.3000061511993408, "learning_rate": 0.0002, "epoch": 5.377091585158839, "step": 33260}, {"loss": 0.5125, "grad_norm": 1.083891749382019, "learning_rate": 0.0002, "epoch": 5.378708269339585, "step": 33270}, {"loss": 0.5065, "grad_norm": 0.9402718544006348, "learning_rate": 0.0002, "epoch": 5.38032495352033, "step": 33280}, {"loss": 0.5559, "grad_norm": 1.3376892805099487, "learning_rate": 0.0002, "epoch": 5.381941637701075, "step": 33290}, {"loss": 0.5193, "grad_norm": 1.1600074768066406, "learning_rate": 0.0002, "epoch": 5.383558321881821, "step": 33300}, {"loss": 0.4907, "grad_norm": 1.1449427604675293, "learning_rate": 0.0002, "epoch": 5.385175006062566, "step": 33310}, {"loss": 0.5449, "grad_norm": 1.3118891716003418, "learning_rate": 0.0002, "epoch": 5.386791690243311, "step": 33320}, {"loss": 0.547, "grad_norm": 0.743449866771698, "learning_rate": 0.0002, "epoch": 5.388408374424056, "step": 33330}, {"loss": 0.5555, "grad_norm": 0.9358304142951965, "learning_rate": 0.0002, "epoch": 5.3900250586048015, "step": 33340}, {"loss": 0.5558, "grad_norm": 1.0447142124176025, "learning_rate": 0.0002, "epoch": 5.391641742785547, "step": 33350}, {"loss": 0.5106, "grad_norm": 1.1088626384735107, "learning_rate": 0.0002, "epoch": 5.393258426966292, "step": 33360}, {"loss": 0.4929, "grad_norm": 1.1267958879470825, "learning_rate": 0.0002, "epoch": 5.394875111147037, "step": 33370}, {"loss": 0.5165, "grad_norm": 0.9709370136260986, "learning_rate": 0.0002, "epoch": 5.3964917953277824, "step": 33380}, {"loss": 0.5206, "grad_norm": 1.0939103364944458, "learning_rate": 0.0002, "epoch": 5.398108479508528, "step": 33390}, {"loss": 0.5177, "grad_norm": 0.9559304714202881, "learning_rate": 0.0002, "epoch": 5.399725163689273, "step": 33400}, {"loss": 0.5064, "grad_norm": 1.199580430984497, "learning_rate": 0.0002, "epoch": 5.401341847870018, "step": 33410}, {"loss": 0.52, "grad_norm": 0.9097000360488892, "learning_rate": 0.0002, "epoch": 5.402958532050764, "step": 33420}, {"loss": 0.514, "grad_norm": 1.1940981149673462, "learning_rate": 0.0002, "epoch": 5.4045752162315095, "step": 33430}, {"loss": 0.5069, "grad_norm": 1.0530916452407837, "learning_rate": 0.0002, "epoch": 5.406191900412255, "step": 33440}, {"loss": 0.5482, "grad_norm": 1.0482549667358398, "learning_rate": 0.0002, "epoch": 5.407808584593, "step": 33450}, {"loss": 0.501, "grad_norm": 1.2524714469909668, "learning_rate": 0.0002, "epoch": 5.409425268773745, "step": 33460}, {"loss": 0.5597, "grad_norm": 1.1091666221618652, "learning_rate": 0.0002, "epoch": 5.41104195295449, "step": 33470}, {"loss": 0.546, "grad_norm": 0.9981587529182434, "learning_rate": 0.0002, "epoch": 5.412658637135236, "step": 33480}, {"loss": 0.4977, "grad_norm": 1.016681432723999, "learning_rate": 0.0002, "epoch": 5.414275321315981, "step": 33490}, {"loss": 0.5388, "grad_norm": 1.1456854343414307, "learning_rate": 0.0002, "epoch": 5.415892005496726, "step": 33500}, {"loss": 0.5292, "grad_norm": 1.1454259157180786, "learning_rate": 0.0002, "epoch": 5.417508689677471, "step": 33510}, {"loss": 0.5061, "grad_norm": 0.9858416318893433, "learning_rate": 0.0002, "epoch": 5.419125373858217, "step": 33520}, {"loss": 0.5139, "grad_norm": 0.9764766693115234, "learning_rate": 0.0002, "epoch": 5.420742058038962, "step": 33530}, {"loss": 0.5518, "grad_norm": 1.199920892715454, "learning_rate": 0.0002, "epoch": 5.422358742219707, "step": 33540}, {"loss": 0.5182, "grad_norm": 1.3107370138168335, "learning_rate": 0.0002, "epoch": 5.423975426400452, "step": 33550}, {"loss": 0.5149, "grad_norm": 0.9637970328330994, "learning_rate": 0.0002, "epoch": 5.4255921105811975, "step": 33560}, {"loss": 0.526, "grad_norm": 1.023359775543213, "learning_rate": 0.0002, "epoch": 5.427208794761944, "step": 33570}, {"loss": 0.5206, "grad_norm": 1.060417652130127, "learning_rate": 0.0002, "epoch": 5.428825478942689, "step": 33580}, {"loss": 0.5052, "grad_norm": 0.9971120953559875, "learning_rate": 0.0002, "epoch": 5.430442163123434, "step": 33590}, {"loss": 0.5044, "grad_norm": 0.9213743209838867, "learning_rate": 0.0002, "epoch": 5.432058847304179, "step": 33600}, {"loss": 0.5714, "grad_norm": 1.1512309312820435, "learning_rate": 0.0002, "epoch": 5.4336755314849245, "step": 33610}, {"loss": 0.5317, "grad_norm": 1.2198847532272339, "learning_rate": 0.0002, "epoch": 5.43529221566567, "step": 33620}, {"loss": 0.5237, "grad_norm": 1.0329595804214478, "learning_rate": 0.0002, "epoch": 5.436908899846415, "step": 33630}, {"loss": 0.5364, "grad_norm": 1.1075750589370728, "learning_rate": 0.0002, "epoch": 5.43852558402716, "step": 33640}, {"loss": 0.5295, "grad_norm": 1.006342887878418, "learning_rate": 0.0002, "epoch": 5.4401422682079055, "step": 33650}, {"loss": 0.5394, "grad_norm": 0.9179885983467102, "learning_rate": 0.0002, "epoch": 5.441758952388651, "step": 33660}, {"loss": 0.5124, "grad_norm": 1.2799493074417114, "learning_rate": 0.0002, "epoch": 5.443375636569396, "step": 33670}, {"loss": 0.5426, "grad_norm": 1.1153863668441772, "learning_rate": 0.0002, "epoch": 5.444992320750141, "step": 33680}, {"loss": 0.5087, "grad_norm": 1.0681028366088867, "learning_rate": 0.0002, "epoch": 5.446609004930886, "step": 33690}, {"loss": 0.5272, "grad_norm": 0.9788817167282104, "learning_rate": 0.0002, "epoch": 5.448225689111632, "step": 33700}, {"loss": 0.5308, "grad_norm": 0.8481608629226685, "learning_rate": 0.0002, "epoch": 5.449842373292377, "step": 33710}, {"loss": 0.5225, "grad_norm": 1.113756537437439, "learning_rate": 0.0002, "epoch": 5.451459057473123, "step": 33720}, {"loss": 0.5213, "grad_norm": 0.8425475358963013, "learning_rate": 0.0002, "epoch": 5.453075741653868, "step": 33730}, {"loss": 0.571, "grad_norm": 1.0852208137512207, "learning_rate": 0.0002, "epoch": 5.4546924258346134, "step": 33740}, {"loss": 0.5535, "grad_norm": 1.1664748191833496, "learning_rate": 0.0002, "epoch": 5.456309110015359, "step": 33750}, {"loss": 0.5419, "grad_norm": 1.217241644859314, "learning_rate": 0.0002, "epoch": 5.457925794196104, "step": 33760}, {"loss": 0.5351, "grad_norm": 1.1572928428649902, "learning_rate": 0.0002, "epoch": 5.459542478376849, "step": 33770}, {"loss": 0.5161, "grad_norm": 1.0437318086624146, "learning_rate": 0.0002, "epoch": 5.461159162557594, "step": 33780}, {"loss": 0.5266, "grad_norm": 0.9807571768760681, "learning_rate": 0.0002, "epoch": 5.46277584673834, "step": 33790}, {"loss": 0.5384, "grad_norm": 1.1436342000961304, "learning_rate": 0.0002, "epoch": 5.464392530919085, "step": 33800}, {"loss": 0.5338, "grad_norm": 1.1004794836044312, "learning_rate": 0.0002, "epoch": 5.46600921509983, "step": 33810}, {"loss": 0.4868, "grad_norm": 1.2130268812179565, "learning_rate": 0.0002, "epoch": 5.467625899280575, "step": 33820}, {"loss": 0.516, "grad_norm": 1.3154419660568237, "learning_rate": 0.0002, "epoch": 5.4692425834613205, "step": 33830}, {"loss": 0.4934, "grad_norm": 0.7934383749961853, "learning_rate": 0.0002, "epoch": 5.470859267642066, "step": 33840}, {"loss": 0.5133, "grad_norm": 0.7838410139083862, "learning_rate": 0.0002, "epoch": 5.472475951822812, "step": 33850}, {"loss": 0.4926, "grad_norm": 1.0415139198303223, "learning_rate": 0.0002, "epoch": 5.474092636003557, "step": 33860}, {"loss": 0.5323, "grad_norm": 0.9213164448738098, "learning_rate": 0.0002, "epoch": 5.475709320184302, "step": 33870}, {"loss": 0.5125, "grad_norm": 1.0364776849746704, "learning_rate": 0.0002, "epoch": 5.477326004365048, "step": 33880}, {"loss": 0.5212, "grad_norm": 0.9994072318077087, "learning_rate": 0.0002, "epoch": 5.478942688545793, "step": 33890}, {"loss": 0.5396, "grad_norm": 1.196730136871338, "learning_rate": 0.0002, "epoch": 5.480559372726538, "step": 33900}, {"loss": 0.538, "grad_norm": 0.9955780506134033, "learning_rate": 0.0002, "epoch": 5.482176056907283, "step": 33910}, {"loss": 0.5307, "grad_norm": 1.168188214302063, "learning_rate": 0.0002, "epoch": 5.4837927410880285, "step": 33920}, {"loss": 0.5548, "grad_norm": 1.1816450357437134, "learning_rate": 0.0002, "epoch": 5.485409425268774, "step": 33930}, {"loss": 0.5535, "grad_norm": 1.079715609550476, "learning_rate": 0.0002, "epoch": 5.487026109449519, "step": 33940}, {"loss": 0.5262, "grad_norm": 1.153850793838501, "learning_rate": 0.0002, "epoch": 5.488642793630264, "step": 33950}, {"loss": 0.5248, "grad_norm": 1.0207297801971436, "learning_rate": 0.0002, "epoch": 5.490259477811009, "step": 33960}, {"loss": 0.5142, "grad_norm": 1.1290855407714844, "learning_rate": 0.0002, "epoch": 5.491876161991755, "step": 33970}, {"loss": 0.5168, "grad_norm": 1.068058967590332, "learning_rate": 0.0002, "epoch": 5.4934928461725, "step": 33980}, {"loss": 0.5317, "grad_norm": 0.9789979457855225, "learning_rate": 0.0002, "epoch": 5.495109530353245, "step": 33990}, {"loss": 0.5113, "grad_norm": 0.9696692824363708, "learning_rate": 0.0002, "epoch": 5.496726214533991, "step": 34000}, {"loss": 0.5413, "grad_norm": 1.0539981126785278, "learning_rate": 0.0002, "epoch": 5.4983428987147365, "step": 34010}, {"loss": 0.5783, "grad_norm": 1.0249929428100586, "learning_rate": 0.0002, "epoch": 5.499959582895482, "step": 34020}, {"loss": 0.4888, "grad_norm": 0.9577504992485046, "learning_rate": 0.0002, "epoch": 5.501576267076227, "step": 34030}, {"loss": 0.5291, "grad_norm": 1.0963513851165771, "learning_rate": 0.0002, "epoch": 5.503192951256972, "step": 34040}, {"loss": 0.5315, "grad_norm": 0.8339345455169678, "learning_rate": 0.0002, "epoch": 5.504809635437717, "step": 34050}, {"loss": 0.5191, "grad_norm": 1.0138782262802124, "learning_rate": 0.0002, "epoch": 5.506426319618463, "step": 34060}, {"loss": 0.5463, "grad_norm": 1.0180109739303589, "learning_rate": 0.0002, "epoch": 5.508043003799208, "step": 34070}, {"loss": 0.5083, "grad_norm": 1.2790818214416504, "learning_rate": 0.0002, "epoch": 5.509659687979953, "step": 34080}, {"loss": 0.5195, "grad_norm": 1.428247332572937, "learning_rate": 0.0002, "epoch": 5.511276372160698, "step": 34090}, {"loss": 0.5291, "grad_norm": 1.0926059484481812, "learning_rate": 0.0002, "epoch": 5.5128930563414436, "step": 34100}, {"loss": 0.5665, "grad_norm": 1.2353343963623047, "learning_rate": 0.0002, "epoch": 5.514509740522189, "step": 34110}, {"loss": 0.5331, "grad_norm": 0.935587465763092, "learning_rate": 0.0002, "epoch": 5.516126424702934, "step": 34120}, {"loss": 0.5512, "grad_norm": 0.9767586588859558, "learning_rate": 0.0002, "epoch": 5.517743108883679, "step": 34130}, {"loss": 0.5315, "grad_norm": 1.1660610437393188, "learning_rate": 0.0002, "epoch": 5.5193597930644245, "step": 34140}, {"loss": 0.52, "grad_norm": 0.9828870892524719, "learning_rate": 0.0002, "epoch": 5.520976477245171, "step": 34150}, {"loss": 0.5198, "grad_norm": 1.0097278356552124, "learning_rate": 0.0002, "epoch": 5.522593161425916, "step": 34160}, {"loss": 0.5293, "grad_norm": 1.1766167879104614, "learning_rate": 0.0002, "epoch": 5.524209845606661, "step": 34170}, {"loss": 0.5258, "grad_norm": 0.982292115688324, "learning_rate": 0.0002, "epoch": 5.525826529787406, "step": 34180}, {"loss": 0.5114, "grad_norm": 1.0744609832763672, "learning_rate": 0.0002, "epoch": 5.5274432139681515, "step": 34190}, {"loss": 0.5469, "grad_norm": 1.3831160068511963, "learning_rate": 0.0002, "epoch": 5.529059898148897, "step": 34200}, {"loss": 0.5819, "grad_norm": 1.074771761894226, "learning_rate": 0.0002, "epoch": 5.530676582329642, "step": 34210}, {"loss": 0.5399, "grad_norm": 1.016652226448059, "learning_rate": 0.0002, "epoch": 5.532293266510387, "step": 34220}, {"loss": 0.5158, "grad_norm": 1.2231552600860596, "learning_rate": 0.0002, "epoch": 5.5339099506911325, "step": 34230}, {"loss": 0.5091, "grad_norm": 0.8051198720932007, "learning_rate": 0.0002, "epoch": 5.535526634871878, "step": 34240}, {"loss": 0.5583, "grad_norm": 1.1779674291610718, "learning_rate": 0.0002, "epoch": 5.537143319052623, "step": 34250}, {"loss": 0.5044, "grad_norm": 1.2468291521072388, "learning_rate": 0.0002, "epoch": 5.538760003233368, "step": 34260}, {"loss": 0.523, "grad_norm": 1.14818274974823, "learning_rate": 0.0002, "epoch": 5.540376687414113, "step": 34270}, {"loss": 0.5375, "grad_norm": 1.2362616062164307, "learning_rate": 0.0002, "epoch": 5.541993371594859, "step": 34280}, {"loss": 0.4996, "grad_norm": 1.0206977128982544, "learning_rate": 0.0002, "epoch": 5.543610055775604, "step": 34290}, {"loss": 0.5212, "grad_norm": 1.2018457651138306, "learning_rate": 0.0002, "epoch": 5.54522673995635, "step": 34300}, {"loss": 0.5462, "grad_norm": 1.0349043607711792, "learning_rate": 0.0002, "epoch": 5.546843424137095, "step": 34310}, {"loss": 0.5231, "grad_norm": 1.2022006511688232, "learning_rate": 0.0002, "epoch": 5.54846010831784, "step": 34320}, {"loss": 0.5173, "grad_norm": 1.0810624361038208, "learning_rate": 0.0002, "epoch": 5.550076792498586, "step": 34330}, {"loss": 0.5821, "grad_norm": 1.3297529220581055, "learning_rate": 0.0002, "epoch": 5.551693476679331, "step": 34340}, {"loss": 0.5321, "grad_norm": 0.9722549915313721, "learning_rate": 0.0002, "epoch": 5.553310160860076, "step": 34350}, {"loss": 0.4823, "grad_norm": 0.9903425574302673, "learning_rate": 0.0002, "epoch": 5.554926845040821, "step": 34360}, {"loss": 0.5601, "grad_norm": 0.9568067789077759, "learning_rate": 0.0002, "epoch": 5.556543529221567, "step": 34370}, {"loss": 0.5242, "grad_norm": 1.113870620727539, "learning_rate": 0.0002, "epoch": 5.558160213402312, "step": 34380}, {"loss": 0.5278, "grad_norm": 1.0557632446289062, "learning_rate": 0.0002, "epoch": 5.559776897583057, "step": 34390}, {"loss": 0.5501, "grad_norm": 0.9615673422813416, "learning_rate": 0.0002, "epoch": 5.561393581763802, "step": 34400}, {"loss": 0.5066, "grad_norm": 0.9536027312278748, "learning_rate": 0.0002, "epoch": 5.5630102659445475, "step": 34410}, {"loss": 0.4949, "grad_norm": 0.8808749318122864, "learning_rate": 0.0002, "epoch": 5.564626950125293, "step": 34420}, {"loss": 0.5954, "grad_norm": 1.286132574081421, "learning_rate": 0.0002, "epoch": 5.566243634306038, "step": 34430}, {"loss": 0.5507, "grad_norm": 1.259644865989685, "learning_rate": 0.0002, "epoch": 5.567860318486783, "step": 34440}, {"loss": 0.4922, "grad_norm": 0.9920216798782349, "learning_rate": 0.0002, "epoch": 5.569477002667529, "step": 34450}, {"loss": 0.5527, "grad_norm": 1.182926893234253, "learning_rate": 0.0002, "epoch": 5.5710936868482746, "step": 34460}, {"loss": 0.5185, "grad_norm": 1.1434749364852905, "learning_rate": 0.0002, "epoch": 5.57271037102902, "step": 34470}, {"loss": 0.5256, "grad_norm": 1.2420979738235474, "learning_rate": 0.0002, "epoch": 5.574327055209765, "step": 34480}, {"loss": 0.5039, "grad_norm": 0.9338384866714478, "learning_rate": 0.0002, "epoch": 5.57594373939051, "step": 34490}, {"loss": 0.5634, "grad_norm": 1.0196425914764404, "learning_rate": 0.0002, "epoch": 5.5775604235712555, "step": 34500}, {"loss": 0.5132, "grad_norm": 0.9586997032165527, "learning_rate": 0.0002, "epoch": 5.579177107752001, "step": 34510}, {"loss": 0.5336, "grad_norm": 1.2409086227416992, "learning_rate": 0.0002, "epoch": 5.580793791932746, "step": 34520}, {"loss": 0.5364, "grad_norm": 1.1483757495880127, "learning_rate": 0.0002, "epoch": 5.582410476113491, "step": 34530}, {"loss": 0.5325, "grad_norm": 1.1624305248260498, "learning_rate": 0.0002, "epoch": 5.584027160294236, "step": 34540}, {"loss": 0.5342, "grad_norm": 1.2635223865509033, "learning_rate": 0.0002, "epoch": 5.585643844474982, "step": 34550}, {"loss": 0.4924, "grad_norm": 0.9824051856994629, "learning_rate": 0.0002, "epoch": 5.587260528655727, "step": 34560}, {"loss": 0.5395, "grad_norm": 1.0858620405197144, "learning_rate": 0.0002, "epoch": 5.588877212836472, "step": 34570}, {"loss": 0.5459, "grad_norm": 1.1452655792236328, "learning_rate": 0.0002, "epoch": 5.590493897017217, "step": 34580}, {"loss": 0.5746, "grad_norm": 1.110610842704773, "learning_rate": 0.0002, "epoch": 5.592110581197963, "step": 34590}, {"loss": 0.5285, "grad_norm": 0.9976194500923157, "learning_rate": 0.0002, "epoch": 5.593727265378709, "step": 34600}, {"loss": 0.548, "grad_norm": 1.0698920488357544, "learning_rate": 0.0002, "epoch": 5.595343949559454, "step": 34610}, {"loss": 0.5311, "grad_norm": 1.1505171060562134, "learning_rate": 0.0002, "epoch": 5.596960633740199, "step": 34620}, {"loss": 0.5471, "grad_norm": 1.1014643907546997, "learning_rate": 0.0002, "epoch": 5.598577317920944, "step": 34630}, {"loss": 0.55, "grad_norm": 0.915595293045044, "learning_rate": 0.0002, "epoch": 5.60019400210169, "step": 34640}, {"loss": 0.5821, "grad_norm": 1.1856765747070312, "learning_rate": 0.0002, "epoch": 5.601810686282435, "step": 34650}, {"loss": 0.5502, "grad_norm": 1.1357687711715698, "learning_rate": 0.0002, "epoch": 5.60342737046318, "step": 34660}, {"loss": 0.5034, "grad_norm": 1.0232492685317993, "learning_rate": 0.0002, "epoch": 5.605044054643925, "step": 34670}, {"loss": 0.5357, "grad_norm": 0.9375017881393433, "learning_rate": 0.0002, "epoch": 5.6066607388246705, "step": 34680}, {"loss": 0.5518, "grad_norm": 1.0796529054641724, "learning_rate": 0.0002, "epoch": 5.608277423005416, "step": 34690}, {"loss": 0.5173, "grad_norm": 1.1383336782455444, "learning_rate": 0.0002, "epoch": 5.609894107186161, "step": 34700}, {"loss": 0.5477, "grad_norm": 1.0248544216156006, "learning_rate": 0.0002, "epoch": 5.611510791366906, "step": 34710}, {"loss": 0.5669, "grad_norm": 1.0986040830612183, "learning_rate": 0.0002, "epoch": 5.6131274755476515, "step": 34720}, {"loss": 0.5188, "grad_norm": 1.2689568996429443, "learning_rate": 0.0002, "epoch": 5.614744159728397, "step": 34730}, {"loss": 0.5136, "grad_norm": 1.4044264554977417, "learning_rate": 0.0002, "epoch": 5.616360843909142, "step": 34740}, {"loss": 0.5699, "grad_norm": 1.2084474563598633, "learning_rate": 0.0002, "epoch": 5.617977528089888, "step": 34750}, {"loss": 0.5377, "grad_norm": 1.061248540878296, "learning_rate": 0.0002, "epoch": 5.619594212270633, "step": 34760}, {"loss": 0.5669, "grad_norm": 1.0220764875411987, "learning_rate": 0.0002, "epoch": 5.6212108964513785, "step": 34770}, {"loss": 0.54, "grad_norm": 1.0859092473983765, "learning_rate": 0.0002, "epoch": 5.622827580632124, "step": 34780}, {"loss": 0.5308, "grad_norm": 0.9049732089042664, "learning_rate": 0.0002, "epoch": 5.624444264812869, "step": 34790}, {"loss": 0.5433, "grad_norm": 1.2103937864303589, "learning_rate": 0.0002, "epoch": 5.626060948993614, "step": 34800}, {"loss": 0.5513, "grad_norm": 0.9854230284690857, "learning_rate": 0.0002, "epoch": 5.627677633174359, "step": 34810}, {"loss": 0.5274, "grad_norm": 0.9316635131835938, "learning_rate": 0.0002, "epoch": 5.629294317355105, "step": 34820}, {"loss": 0.5393, "grad_norm": 1.105296015739441, "learning_rate": 0.0002, "epoch": 5.63091100153585, "step": 34830}, {"loss": 0.5527, "grad_norm": 0.993383526802063, "learning_rate": 0.0002, "epoch": 5.632527685716595, "step": 34840}, {"loss": 0.5375, "grad_norm": 1.1544116735458374, "learning_rate": 0.0002, "epoch": 5.63414436989734, "step": 34850}, {"loss": 0.5448, "grad_norm": 1.284475326538086, "learning_rate": 0.0002, "epoch": 5.635761054078086, "step": 34860}, {"loss": 0.5069, "grad_norm": 1.121997594833374, "learning_rate": 0.0002, "epoch": 5.637377738258831, "step": 34870}, {"loss": 0.5335, "grad_norm": 1.213040828704834, "learning_rate": 0.0002, "epoch": 5.638994422439576, "step": 34880}, {"loss": 0.5623, "grad_norm": 1.23222017288208, "learning_rate": 0.0002, "epoch": 5.640611106620321, "step": 34890}, {"loss": 0.5622, "grad_norm": 0.9793637990951538, "learning_rate": 0.0002, "epoch": 5.642227790801067, "step": 34900}, {"loss": 0.5405, "grad_norm": 1.38919997215271, "learning_rate": 0.0002, "epoch": 5.643844474981813, "step": 34910}, {"loss": 0.5007, "grad_norm": 0.8390951156616211, "learning_rate": 0.0002, "epoch": 5.645461159162558, "step": 34920}, {"loss": 0.5974, "grad_norm": 0.9465909004211426, "learning_rate": 0.0002, "epoch": 5.647077843343303, "step": 34930}, {"loss": 0.5264, "grad_norm": 1.066957712173462, "learning_rate": 0.0002, "epoch": 5.648694527524048, "step": 34940}, {"loss": 0.5513, "grad_norm": 0.9842154383659363, "learning_rate": 0.0002, "epoch": 5.650311211704794, "step": 34950}, {"loss": 0.567, "grad_norm": 1.1766440868377686, "learning_rate": 0.0002, "epoch": 5.651927895885539, "step": 34960}, {"loss": 0.5462, "grad_norm": 0.9061306118965149, "learning_rate": 0.0002, "epoch": 5.653544580066284, "step": 34970}, {"loss": 0.5446, "grad_norm": 1.2941309213638306, "learning_rate": 0.0002, "epoch": 5.655161264247029, "step": 34980}, {"loss": 0.5704, "grad_norm": 0.9741247892379761, "learning_rate": 0.0002, "epoch": 5.6567779484277745, "step": 34990}, {"loss": 0.5152, "grad_norm": 1.0784187316894531, "learning_rate": 0.0002, "epoch": 5.65839463260852, "step": 35000}, {"loss": 0.5363, "grad_norm": 0.937889814376831, "learning_rate": 0.0002, "epoch": 5.660011316789265, "step": 35010}, {"loss": 0.5019, "grad_norm": 0.9667879939079285, "learning_rate": 0.0002, "epoch": 5.66162800097001, "step": 35020}, {"loss": 0.5209, "grad_norm": 1.0554876327514648, "learning_rate": 0.0002, "epoch": 5.663244685150756, "step": 35030}, {"loss": 0.523, "grad_norm": 1.2030539512634277, "learning_rate": 0.0002, "epoch": 5.664861369331501, "step": 35040}, {"loss": 0.5406, "grad_norm": 1.0849953889846802, "learning_rate": 0.0002, "epoch": 5.666478053512247, "step": 35050}, {"loss": 0.5747, "grad_norm": 1.1598973274230957, "learning_rate": 0.0002, "epoch": 5.668094737692992, "step": 35060}, {"loss": 0.5488, "grad_norm": 1.0233359336853027, "learning_rate": 0.0002, "epoch": 5.669711421873737, "step": 35070}, {"loss": 0.5409, "grad_norm": 1.1124799251556396, "learning_rate": 0.0002, "epoch": 5.6713281060544825, "step": 35080}, {"loss": 0.5578, "grad_norm": 1.2351475954055786, "learning_rate": 0.0002, "epoch": 5.672944790235228, "step": 35090}, {"loss": 0.5638, "grad_norm": 1.0240728855133057, "learning_rate": 0.0002, "epoch": 5.674561474415973, "step": 35100}, {"loss": 0.5192, "grad_norm": 1.0223692655563354, "learning_rate": 0.0002, "epoch": 5.676178158596718, "step": 35110}, {"loss": 0.524, "grad_norm": 1.4569132328033447, "learning_rate": 0.0002, "epoch": 5.677794842777463, "step": 35120}, {"loss": 0.555, "grad_norm": 0.8983587026596069, "learning_rate": 0.0002, "epoch": 5.679411526958209, "step": 35130}, {"loss": 0.5439, "grad_norm": 1.0775383710861206, "learning_rate": 0.0002, "epoch": 5.681028211138954, "step": 35140}, {"loss": 0.5289, "grad_norm": 0.9800270795822144, "learning_rate": 0.0002, "epoch": 5.682644895319699, "step": 35150}, {"loss": 0.533, "grad_norm": 0.9858237504959106, "learning_rate": 0.0002, "epoch": 5.684261579500444, "step": 35160}, {"loss": 0.5671, "grad_norm": 1.031087040901184, "learning_rate": 0.0002, "epoch": 5.6858782636811895, "step": 35170}, {"loss": 0.5528, "grad_norm": 1.0294365882873535, "learning_rate": 0.0002, "epoch": 5.687494947861936, "step": 35180}, {"loss": 0.5581, "grad_norm": 1.108144760131836, "learning_rate": 0.0002, "epoch": 5.68911163204268, "step": 35190}, {"loss": 0.5373, "grad_norm": 1.0813100337982178, "learning_rate": 0.0002, "epoch": 5.690728316223426, "step": 35200}, {"loss": 0.5429, "grad_norm": 1.3146867752075195, "learning_rate": 0.0002, "epoch": 5.692345000404171, "step": 35210}, {"loss": 0.5297, "grad_norm": 1.16780424118042, "learning_rate": 0.0002, "epoch": 5.693961684584917, "step": 35220}, {"loss": 0.577, "grad_norm": 0.9929125905036926, "learning_rate": 0.0002, "epoch": 5.695578368765662, "step": 35230}, {"loss": 0.5441, "grad_norm": 0.9049441814422607, "learning_rate": 0.0002, "epoch": 5.697195052946407, "step": 35240}, {"loss": 0.5349, "grad_norm": 0.9768866300582886, "learning_rate": 0.0002, "epoch": 5.698811737127152, "step": 35250}, {"loss": 0.542, "grad_norm": 0.8306029438972473, "learning_rate": 0.0002, "epoch": 5.7004284213078975, "step": 35260}, {"loss": 0.4771, "grad_norm": 0.8417280316352844, "learning_rate": 0.0002, "epoch": 5.702045105488643, "step": 35270}, {"loss": 0.574, "grad_norm": 0.9954485893249512, "learning_rate": 0.0002, "epoch": 5.703661789669388, "step": 35280}, {"loss": 0.5469, "grad_norm": 1.2417993545532227, "learning_rate": 0.0002, "epoch": 5.705278473850133, "step": 35290}, {"loss": 0.5275, "grad_norm": 1.1696544885635376, "learning_rate": 0.0002, "epoch": 5.706895158030878, "step": 35300}, {"loss": 0.5188, "grad_norm": 1.2424817085266113, "learning_rate": 0.0002, "epoch": 5.708511842211624, "step": 35310}, {"loss": 0.5595, "grad_norm": 1.1791106462478638, "learning_rate": 0.0002, "epoch": 5.710128526392369, "step": 35320}, {"loss": 0.5076, "grad_norm": 1.202181339263916, "learning_rate": 0.0002, "epoch": 5.711745210573115, "step": 35330}, {"loss": 0.5847, "grad_norm": 1.1006861925125122, "learning_rate": 0.0002, "epoch": 5.713361894753859, "step": 35340}, {"loss": 0.5627, "grad_norm": 1.0918344259262085, "learning_rate": 0.0002, "epoch": 5.7149785789346055, "step": 35350}, {"loss": 0.5677, "grad_norm": 1.0427305698394775, "learning_rate": 0.0002, "epoch": 5.716595263115351, "step": 35360}, {"loss": 0.5288, "grad_norm": 1.0818872451782227, "learning_rate": 0.0002, "epoch": 5.718211947296096, "step": 35370}, {"loss": 0.5296, "grad_norm": 1.186006784439087, "learning_rate": 0.0002, "epoch": 5.719828631476841, "step": 35380}, {"loss": 0.5507, "grad_norm": 1.2073674201965332, "learning_rate": 0.0002, "epoch": 5.721445315657586, "step": 35390}, {"loss": 0.5483, "grad_norm": 1.065338134765625, "learning_rate": 0.0002, "epoch": 5.723061999838332, "step": 35400}, {"loss": 0.5195, "grad_norm": 0.9448973536491394, "learning_rate": 0.0002, "epoch": 5.724678684019077, "step": 35410}, {"loss": 0.5276, "grad_norm": 1.1487499475479126, "learning_rate": 0.0002, "epoch": 5.726295368199822, "step": 35420}, {"loss": 0.5435, "grad_norm": 1.1334216594696045, "learning_rate": 0.0002, "epoch": 5.727912052380567, "step": 35430}, {"loss": 0.5074, "grad_norm": 1.1932826042175293, "learning_rate": 0.0002, "epoch": 5.729528736561313, "step": 35440}, {"loss": 0.5502, "grad_norm": 1.2615786790847778, "learning_rate": 0.0002, "epoch": 5.731145420742058, "step": 35450}, {"loss": 0.5612, "grad_norm": 1.2803694009780884, "learning_rate": 0.0002, "epoch": 5.732762104922803, "step": 35460}, {"loss": 0.5458, "grad_norm": 0.9271906614303589, "learning_rate": 0.0002, "epoch": 5.734378789103548, "step": 35470}, {"loss": 0.5342, "grad_norm": 1.0958917140960693, "learning_rate": 0.0002, "epoch": 5.735995473284294, "step": 35480}, {"loss": 0.538, "grad_norm": 1.1072784662246704, "learning_rate": 0.0002, "epoch": 5.737612157465039, "step": 35490}, {"loss": 0.5683, "grad_norm": 1.1641002893447876, "learning_rate": 0.0002, "epoch": 5.739228841645785, "step": 35500}, {"loss": 0.5252, "grad_norm": 1.0246447324752808, "learning_rate": 0.0002, "epoch": 5.74084552582653, "step": 35510}, {"loss": 0.55, "grad_norm": 1.032474398612976, "learning_rate": 0.0002, "epoch": 5.742462210007275, "step": 35520}, {"loss": 0.4965, "grad_norm": 1.1600854396820068, "learning_rate": 0.0002, "epoch": 5.7440788941880205, "step": 35530}, {"loss": 0.5543, "grad_norm": 1.0686054229736328, "learning_rate": 0.0002, "epoch": 5.745695578368766, "step": 35540}, {"loss": 0.5706, "grad_norm": 1.2314637899398804, "learning_rate": 0.0002, "epoch": 5.747312262549511, "step": 35550}, {"loss": 0.5492, "grad_norm": 0.922134280204773, "learning_rate": 0.0002, "epoch": 5.748928946730256, "step": 35560}, {"loss": 0.5495, "grad_norm": 0.933043360710144, "learning_rate": 0.0002, "epoch": 5.7505456309110015, "step": 35570}, {"loss": 0.5007, "grad_norm": 1.1911931037902832, "learning_rate": 0.0002, "epoch": 5.752162315091747, "step": 35580}, {"loss": 0.5244, "grad_norm": 0.8984857797622681, "learning_rate": 0.0002, "epoch": 5.753778999272492, "step": 35590}, {"loss": 0.5493, "grad_norm": 0.9495107531547546, "learning_rate": 0.0002, "epoch": 5.755395683453237, "step": 35600}, {"loss": 0.5326, "grad_norm": 1.2805472612380981, "learning_rate": 0.0002, "epoch": 5.757012367633982, "step": 35610}, {"loss": 0.5276, "grad_norm": 1.1236625909805298, "learning_rate": 0.0002, "epoch": 5.758629051814728, "step": 35620}, {"loss": 0.6102, "grad_norm": 1.0552798509597778, "learning_rate": 0.0002, "epoch": 5.760245735995474, "step": 35630}, {"loss": 0.5479, "grad_norm": 1.119909644126892, "learning_rate": 0.0002, "epoch": 5.761862420176218, "step": 35640}, {"loss": 0.5282, "grad_norm": 0.8786116242408752, "learning_rate": 0.0002, "epoch": 5.763479104356964, "step": 35650}, {"loss": 0.5406, "grad_norm": 1.2417117357254028, "learning_rate": 0.0002, "epoch": 5.765095788537709, "step": 35660}, {"loss": 0.537, "grad_norm": 1.255200982093811, "learning_rate": 0.0002, "epoch": 5.766712472718455, "step": 35670}, {"loss": 0.5308, "grad_norm": 1.0611358880996704, "learning_rate": 0.0002, "epoch": 5.7683291568992, "step": 35680}, {"loss": 0.5614, "grad_norm": 1.1443911790847778, "learning_rate": 0.0002, "epoch": 5.769945841079945, "step": 35690}, {"loss": 0.5386, "grad_norm": 1.1437989473342896, "learning_rate": 0.0002, "epoch": 5.77156252526069, "step": 35700}, {"loss": 0.537, "grad_norm": 1.1375046968460083, "learning_rate": 0.0002, "epoch": 5.773179209441436, "step": 35710}, {"loss": 0.5198, "grad_norm": 1.0777729749679565, "learning_rate": 0.0002, "epoch": 5.774795893622181, "step": 35720}, {"loss": 0.5521, "grad_norm": 1.1160215139389038, "learning_rate": 0.0002, "epoch": 5.776412577802926, "step": 35730}, {"loss": 0.5569, "grad_norm": 1.1268514394760132, "learning_rate": 0.0002, "epoch": 5.778029261983671, "step": 35740}, {"loss": 0.5311, "grad_norm": 1.2752262353897095, "learning_rate": 0.0002, "epoch": 5.7796459461644165, "step": 35750}, {"loss": 0.5625, "grad_norm": 1.0416184663772583, "learning_rate": 0.0002, "epoch": 5.781262630345162, "step": 35760}, {"loss": 0.5438, "grad_norm": 1.0622444152832031, "learning_rate": 0.0002, "epoch": 5.782879314525907, "step": 35770}, {"loss": 0.5268, "grad_norm": 1.1217877864837646, "learning_rate": 0.0002, "epoch": 5.784495998706653, "step": 35780}, {"loss": 0.5225, "grad_norm": 0.9363139867782593, "learning_rate": 0.0002, "epoch": 5.786112682887398, "step": 35790}, {"loss": 0.5524, "grad_norm": 0.96628737449646, "learning_rate": 0.0002, "epoch": 5.787729367068144, "step": 35800}, {"loss": 0.52, "grad_norm": 0.9572572112083435, "learning_rate": 0.0002, "epoch": 5.789346051248889, "step": 35810}, {"loss": 0.5615, "grad_norm": 0.938724935054779, "learning_rate": 0.0002, "epoch": 5.790962735429634, "step": 35820}, {"loss": 0.5391, "grad_norm": 1.3314417600631714, "learning_rate": 0.0002, "epoch": 5.792579419610379, "step": 35830}, {"loss": 0.5441, "grad_norm": 1.0097602605819702, "learning_rate": 0.0002, "epoch": 5.7941961037911245, "step": 35840}, {"loss": 0.591, "grad_norm": 1.1265122890472412, "learning_rate": 0.0002, "epoch": 5.79581278797187, "step": 35850}, {"loss": 0.5333, "grad_norm": 1.2191909551620483, "learning_rate": 0.0002, "epoch": 5.797429472152615, "step": 35860}, {"loss": 0.5274, "grad_norm": 0.9690808057785034, "learning_rate": 0.0002, "epoch": 5.79904615633336, "step": 35870}, {"loss": 0.5425, "grad_norm": 1.0871665477752686, "learning_rate": 0.0002, "epoch": 5.800662840514105, "step": 35880}, {"loss": 0.5602, "grad_norm": 1.1093597412109375, "learning_rate": 0.0002, "epoch": 5.802279524694851, "step": 35890}, {"loss": 0.5475, "grad_norm": 1.2434282302856445, "learning_rate": 0.0002, "epoch": 5.803896208875596, "step": 35900}, {"loss": 0.5288, "grad_norm": 1.2933623790740967, "learning_rate": 0.0002, "epoch": 5.805512893056341, "step": 35910}, {"loss": 0.5554, "grad_norm": 1.0005441904067993, "learning_rate": 0.0002, "epoch": 5.807129577237086, "step": 35920}, {"loss": 0.5318, "grad_norm": 1.2373108863830566, "learning_rate": 0.0002, "epoch": 5.8087462614178325, "step": 35930}, {"loss": 0.5413, "grad_norm": 1.2622692584991455, "learning_rate": 0.0002, "epoch": 5.810362945598578, "step": 35940}, {"loss": 0.5558, "grad_norm": 1.0112963914871216, "learning_rate": 0.0002, "epoch": 5.811979629779323, "step": 35950}, {"loss": 0.5115, "grad_norm": 1.050572395324707, "learning_rate": 0.0002, "epoch": 5.813596313960068, "step": 35960}, {"loss": 0.5288, "grad_norm": 0.9774560928344727, "learning_rate": 0.0002, "epoch": 5.815212998140813, "step": 35970}, {"loss": 0.585, "grad_norm": 1.19438898563385, "learning_rate": 0.0002, "epoch": 5.816829682321559, "step": 35980}, {"loss": 0.5798, "grad_norm": 1.0267130136489868, "learning_rate": 0.0002, "epoch": 5.818446366502304, "step": 35990}, {"loss": 0.5126, "grad_norm": 0.9813851714134216, "learning_rate": 0.0002, "epoch": 5.820063050683049, "step": 36000}, {"loss": 0.5138, "grad_norm": 0.9177457094192505, "learning_rate": 0.0002, "epoch": 5.821679734863794, "step": 36010}, {"loss": 0.5453, "grad_norm": 1.0020731687545776, "learning_rate": 0.0002, "epoch": 5.8232964190445395, "step": 36020}, {"loss": 0.5646, "grad_norm": 1.073222041130066, "learning_rate": 0.0002, "epoch": 5.824913103225285, "step": 36030}, {"loss": 0.5539, "grad_norm": 1.016337513923645, "learning_rate": 0.0002, "epoch": 5.82652978740603, "step": 36040}, {"loss": 0.5592, "grad_norm": 1.267364263534546, "learning_rate": 0.0002, "epoch": 5.828146471586775, "step": 36050}, {"loss": 0.595, "grad_norm": 1.2730127573013306, "learning_rate": 0.0002, "epoch": 5.8297631557675205, "step": 36060}, {"loss": 0.5247, "grad_norm": 1.108442783355713, "learning_rate": 0.0002, "epoch": 5.831379839948266, "step": 36070}, {"loss": 0.5103, "grad_norm": 1.198072075843811, "learning_rate": 0.0002, "epoch": 5.832996524129012, "step": 36080}, {"loss": 0.5479, "grad_norm": 1.0458786487579346, "learning_rate": 0.0002, "epoch": 5.834613208309757, "step": 36090}, {"loss": 0.5564, "grad_norm": 0.9096664786338806, "learning_rate": 0.0002, "epoch": 5.836229892490502, "step": 36100}, {"loss": 0.5602, "grad_norm": 0.9957793951034546, "learning_rate": 0.0002, "epoch": 5.8378465766712475, "step": 36110}, {"loss": 0.5799, "grad_norm": 1.3693058490753174, "learning_rate": 0.0002, "epoch": 5.839463260851993, "step": 36120}, {"loss": 0.5425, "grad_norm": 1.268608808517456, "learning_rate": 0.0002, "epoch": 5.841079945032738, "step": 36130}, {"loss": 0.5653, "grad_norm": 0.8516020178794861, "learning_rate": 0.0002, "epoch": 5.842696629213483, "step": 36140}, {"loss": 0.5475, "grad_norm": 0.90385502576828, "learning_rate": 0.0002, "epoch": 5.844313313394228, "step": 36150}, {"loss": 0.5274, "grad_norm": 1.0910571813583374, "learning_rate": 0.0002, "epoch": 5.845929997574974, "step": 36160}, {"loss": 0.555, "grad_norm": 0.9417795538902283, "learning_rate": 0.0002, "epoch": 5.847546681755719, "step": 36170}, {"loss": 0.5784, "grad_norm": 1.0027360916137695, "learning_rate": 0.0002, "epoch": 5.849163365936464, "step": 36180}, {"loss": 0.5423, "grad_norm": 1.1480516195297241, "learning_rate": 0.0002, "epoch": 5.850780050117209, "step": 36190}, {"loss": 0.5517, "grad_norm": 1.2431457042694092, "learning_rate": 0.0002, "epoch": 5.852396734297955, "step": 36200}, {"loss": 0.5404, "grad_norm": 1.091465950012207, "learning_rate": 0.0002, "epoch": 5.8540134184787, "step": 36210}, {"loss": 0.53, "grad_norm": 0.9693930745124817, "learning_rate": 0.0002, "epoch": 5.855630102659445, "step": 36220}, {"loss": 0.5453, "grad_norm": 0.9937465190887451, "learning_rate": 0.0002, "epoch": 5.857246786840191, "step": 36230}, {"loss": 0.5621, "grad_norm": 1.0731011629104614, "learning_rate": 0.0002, "epoch": 5.858863471020936, "step": 36240}, {"loss": 0.5687, "grad_norm": 1.0869048833847046, "learning_rate": 0.0002, "epoch": 5.860480155201682, "step": 36250}, {"loss": 0.5576, "grad_norm": 0.9226390719413757, "learning_rate": 0.0002, "epoch": 5.862096839382427, "step": 36260}, {"loss": 0.531, "grad_norm": 1.1755430698394775, "learning_rate": 0.0002, "epoch": 5.863713523563172, "step": 36270}, {"loss": 0.558, "grad_norm": 0.8815974593162537, "learning_rate": 0.0002, "epoch": 5.865330207743917, "step": 36280}, {"loss": 0.5065, "grad_norm": 1.3648751974105835, "learning_rate": 0.0002, "epoch": 5.866946891924663, "step": 36290}, {"loss": 0.536, "grad_norm": 0.8729211091995239, "learning_rate": 0.0002, "epoch": 5.868563576105408, "step": 36300}, {"loss": 0.5192, "grad_norm": 1.0870907306671143, "learning_rate": 0.0002, "epoch": 5.870180260286153, "step": 36310}, {"loss": 0.5609, "grad_norm": 1.1164259910583496, "learning_rate": 0.0002, "epoch": 5.871796944466898, "step": 36320}, {"loss": 0.551, "grad_norm": 1.1572535037994385, "learning_rate": 0.0002, "epoch": 5.8734136286476435, "step": 36330}, {"loss": 0.5898, "grad_norm": 1.0456238985061646, "learning_rate": 0.0002, "epoch": 5.875030312828389, "step": 36340}, {"loss": 0.5008, "grad_norm": 1.1310722827911377, "learning_rate": 0.0002, "epoch": 5.876646997009134, "step": 36350}, {"loss": 0.5352, "grad_norm": 1.0004712343215942, "learning_rate": 0.0002, "epoch": 5.878263681189879, "step": 36360}, {"loss": 0.5632, "grad_norm": 1.0991777181625366, "learning_rate": 0.0002, "epoch": 5.879880365370624, "step": 36370}, {"loss": 0.5815, "grad_norm": 1.2789239883422852, "learning_rate": 0.0002, "epoch": 5.8814970495513705, "step": 36380}, {"loss": 0.56, "grad_norm": 0.9524819850921631, "learning_rate": 0.0002, "epoch": 5.883113733732116, "step": 36390}, {"loss": 0.5701, "grad_norm": 1.1115771532058716, "learning_rate": 0.0002, "epoch": 5.884730417912861, "step": 36400}, {"loss": 0.5463, "grad_norm": 1.37419855594635, "learning_rate": 0.0002, "epoch": 5.886347102093606, "step": 36410}, {"loss": 0.5675, "grad_norm": 1.1449527740478516, "learning_rate": 0.0002, "epoch": 5.8879637862743515, "step": 36420}, {"loss": 0.5255, "grad_norm": 1.198046326637268, "learning_rate": 0.0002, "epoch": 5.889580470455097, "step": 36430}, {"loss": 0.5383, "grad_norm": 1.0180530548095703, "learning_rate": 0.0002, "epoch": 5.891197154635842, "step": 36440}, {"loss": 0.5319, "grad_norm": 1.0516417026519775, "learning_rate": 0.0002, "epoch": 5.892813838816587, "step": 36450}, {"loss": 0.5782, "grad_norm": 1.1658052206039429, "learning_rate": 0.0002, "epoch": 5.894430522997332, "step": 36460}, {"loss": 0.5864, "grad_norm": 1.190699577331543, "learning_rate": 0.0002, "epoch": 5.896047207178078, "step": 36470}, {"loss": 0.5451, "grad_norm": 1.1235495805740356, "learning_rate": 0.0002, "epoch": 5.897663891358823, "step": 36480}, {"loss": 0.5284, "grad_norm": 1.1926926374435425, "learning_rate": 0.0002, "epoch": 5.899280575539568, "step": 36490}, {"loss": 0.5686, "grad_norm": 1.1184662580490112, "learning_rate": 0.0002, "epoch": 5.900897259720313, "step": 36500}, {"loss": 0.5147, "grad_norm": 1.000970721244812, "learning_rate": 0.0002, "epoch": 5.9025139439010585, "step": 36510}, {"loss": 0.5351, "grad_norm": 1.0373306274414062, "learning_rate": 0.0002, "epoch": 5.904130628081804, "step": 36520}, {"loss": 0.535, "grad_norm": 1.0840669870376587, "learning_rate": 0.0002, "epoch": 5.90574731226255, "step": 36530}, {"loss": 0.538, "grad_norm": 0.9908381104469299, "learning_rate": 0.0002, "epoch": 5.907363996443295, "step": 36540}, {"loss": 0.5313, "grad_norm": 1.0456029176712036, "learning_rate": 0.0002, "epoch": 5.90898068062404, "step": 36550}, {"loss": 0.5693, "grad_norm": 1.1381454467773438, "learning_rate": 0.0002, "epoch": 5.910597364804786, "step": 36560}, {"loss": 0.5473, "grad_norm": 0.9440900087356567, "learning_rate": 0.0002, "epoch": 5.912214048985531, "step": 36570}, {"loss": 0.5542, "grad_norm": 1.1674573421478271, "learning_rate": 0.0002, "epoch": 5.913830733166276, "step": 36580}, {"loss": 0.526, "grad_norm": 1.1226966381072998, "learning_rate": 0.0002, "epoch": 5.915447417347021, "step": 36590}, {"loss": 0.6091, "grad_norm": 0.9696915745735168, "learning_rate": 0.0002, "epoch": 5.9170641015277665, "step": 36600}, {"loss": 0.5523, "grad_norm": 0.9593005180358887, "learning_rate": 0.0002, "epoch": 5.918680785708512, "step": 36610}, {"loss": 0.5536, "grad_norm": 1.122169852256775, "learning_rate": 0.0002, "epoch": 5.920297469889257, "step": 36620}, {"loss": 0.5039, "grad_norm": 0.9923415780067444, "learning_rate": 0.0002, "epoch": 5.921914154070002, "step": 36630}, {"loss": 0.5893, "grad_norm": 1.063838005065918, "learning_rate": 0.0002, "epoch": 5.923530838250747, "step": 36640}, {"loss": 0.5799, "grad_norm": 0.9083505272865295, "learning_rate": 0.0002, "epoch": 5.925147522431493, "step": 36650}, {"loss": 0.5264, "grad_norm": 0.9439437985420227, "learning_rate": 0.0002, "epoch": 5.926764206612239, "step": 36660}, {"loss": 0.5891, "grad_norm": 0.9778534173965454, "learning_rate": 0.0002, "epoch": 5.928380890792983, "step": 36670}, {"loss": 0.566, "grad_norm": 0.9723961353302002, "learning_rate": 0.0002, "epoch": 5.929997574973729, "step": 36680}, {"loss": 0.5741, "grad_norm": 1.162333607673645, "learning_rate": 0.0002, "epoch": 5.9316142591544745, "step": 36690}, {"loss": 0.5771, "grad_norm": 1.2784897089004517, "learning_rate": 0.0002, "epoch": 5.93323094333522, "step": 36700}, {"loss": 0.5343, "grad_norm": 1.0924867391586304, "learning_rate": 0.0002, "epoch": 5.934847627515965, "step": 36710}, {"loss": 0.5554, "grad_norm": 1.046922206878662, "learning_rate": 0.0002, "epoch": 5.93646431169671, "step": 36720}, {"loss": 0.5476, "grad_norm": 0.8632535338401794, "learning_rate": 0.0002, "epoch": 5.938080995877455, "step": 36730}, {"loss": 0.5456, "grad_norm": 1.358762502670288, "learning_rate": 0.0002, "epoch": 5.939697680058201, "step": 36740}, {"loss": 0.551, "grad_norm": 1.2058624029159546, "learning_rate": 0.0002, "epoch": 5.941314364238946, "step": 36750}, {"loss": 0.5462, "grad_norm": 1.1396408081054688, "learning_rate": 0.0002, "epoch": 5.942931048419691, "step": 36760}, {"loss": 0.5483, "grad_norm": 1.1510354280471802, "learning_rate": 0.0002, "epoch": 5.944547732600436, "step": 36770}, {"loss": 0.5659, "grad_norm": 1.1401607990264893, "learning_rate": 0.0002, "epoch": 5.946164416781182, "step": 36780}, {"loss": 0.5557, "grad_norm": 1.1871325969696045, "learning_rate": 0.0002, "epoch": 5.947781100961927, "step": 36790}, {"loss": 0.4945, "grad_norm": 0.9928333163261414, "learning_rate": 0.0002, "epoch": 5.949397785142672, "step": 36800}, {"loss": 0.5303, "grad_norm": 1.0549445152282715, "learning_rate": 0.0002, "epoch": 5.951014469323418, "step": 36810}, {"loss": 0.5532, "grad_norm": 0.9791563749313354, "learning_rate": 0.0002, "epoch": 5.9526311535041625, "step": 36820}, {"loss": 0.5317, "grad_norm": 1.1268441677093506, "learning_rate": 0.0002, "epoch": 5.954247837684909, "step": 36830}, {"loss": 0.5585, "grad_norm": 1.0533992052078247, "learning_rate": 0.0002, "epoch": 5.955864521865654, "step": 36840}, {"loss": 0.4972, "grad_norm": 1.023358941078186, "learning_rate": 0.0002, "epoch": 5.957481206046399, "step": 36850}, {"loss": 0.5557, "grad_norm": 1.2631961107254028, "learning_rate": 0.0002, "epoch": 5.959097890227144, "step": 36860}, {"loss": 0.5662, "grad_norm": 0.9397698640823364, "learning_rate": 0.0002, "epoch": 5.9607145744078895, "step": 36870}, {"loss": 0.5775, "grad_norm": 1.1678427457809448, "learning_rate": 0.0002, "epoch": 5.962331258588635, "step": 36880}, {"loss": 0.5435, "grad_norm": 1.1403759717941284, "learning_rate": 0.0002, "epoch": 5.96394794276938, "step": 36890}, {"loss": 0.5479, "grad_norm": 1.030572772026062, "learning_rate": 0.0002, "epoch": 5.965564626950125, "step": 36900}, {"loss": 0.5838, "grad_norm": 1.0992497205734253, "learning_rate": 0.0002, "epoch": 5.9671813111308705, "step": 36910}, {"loss": 0.5452, "grad_norm": 1.075466275215149, "learning_rate": 0.0002, "epoch": 5.968797995311616, "step": 36920}, {"loss": 0.5739, "grad_norm": 1.0153694152832031, "learning_rate": 0.0002, "epoch": 5.970414679492361, "step": 36930}, {"loss": 0.5672, "grad_norm": 0.973193883895874, "learning_rate": 0.0002, "epoch": 5.972031363673106, "step": 36940}, {"loss": 0.5585, "grad_norm": 0.8294678926467896, "learning_rate": 0.0002, "epoch": 5.973648047853851, "step": 36950}, {"loss": 0.5631, "grad_norm": 1.0048716068267822, "learning_rate": 0.0002, "epoch": 5.9752647320345975, "step": 36960}, {"loss": 0.5471, "grad_norm": 0.9714070558547974, "learning_rate": 0.0002, "epoch": 5.976881416215342, "step": 36970}, {"loss": 0.5419, "grad_norm": 0.8667682409286499, "learning_rate": 0.0002, "epoch": 5.978498100396088, "step": 36980}, {"loss": 0.5474, "grad_norm": 1.0461409091949463, "learning_rate": 0.0002, "epoch": 5.980114784576833, "step": 36990}, {"loss": 0.5454, "grad_norm": 0.9229754209518433, "learning_rate": 0.0002, "epoch": 5.981731468757578, "step": 37000}, {"loss": 0.5599, "grad_norm": 1.0406876802444458, "learning_rate": 0.0002, "epoch": 5.983348152938324, "step": 37010}, {"loss": 0.5569, "grad_norm": 0.8993828296661377, "learning_rate": 0.0002, "epoch": 5.984964837119069, "step": 37020}, {"loss": 0.5611, "grad_norm": 1.2260479927062988, "learning_rate": 0.0002, "epoch": 5.986581521299814, "step": 37030}, {"loss": 0.5523, "grad_norm": 1.0107380151748657, "learning_rate": 0.0002, "epoch": 5.988198205480559, "step": 37040}, {"loss": 0.5639, "grad_norm": 1.0240139961242676, "learning_rate": 0.0002, "epoch": 5.989814889661305, "step": 37050}, {"loss": 0.5209, "grad_norm": 1.0185275077819824, "learning_rate": 0.0002, "epoch": 5.99143157384205, "step": 37060}, {"loss": 0.5114, "grad_norm": 1.1361802816390991, "learning_rate": 0.0002, "epoch": 5.993048258022795, "step": 37070}, {"loss": 0.5692, "grad_norm": 1.0395532846450806, "learning_rate": 0.0002, "epoch": 5.99466494220354, "step": 37080}, {"loss": 0.594, "grad_norm": 0.9463558197021484, "learning_rate": 0.0002, "epoch": 5.9962816263842855, "step": 37090}, {"loss": 0.5775, "grad_norm": 1.2066948413848877, "learning_rate": 0.0002, "epoch": 5.997898310565031, "step": 37100}, {"loss": 0.5356, "grad_norm": 0.9749386310577393, "learning_rate": 0.0002, "epoch": 5.999514994745777, "step": 37110}, {"eval_loss": 1.2270219326019287, "eval_runtime": 122.2047, "eval_samples_per_second": 5.998, "eval_steps_per_second": 0.753, "epoch": 6.0, "step": 37113}, {"loss": 0.4855, "grad_norm": 0.9641092419624329, "learning_rate": 0.0002, "epoch": 6.001131678926522, "step": 37120}, {"loss": 0.4112, "grad_norm": 1.103379249572754, "learning_rate": 0.0002, "epoch": 6.002748363107267, "step": 37130}, {"loss": 0.4577, "grad_norm": 0.8381665349006653, "learning_rate": 0.0002, "epoch": 6.004365047288013, "step": 37140}, {"loss": 0.4794, "grad_norm": 1.245323896408081, "learning_rate": 0.0002, "epoch": 6.005981731468758, "step": 37150}, {"loss": 0.4503, "grad_norm": 1.3140289783477783, "learning_rate": 0.0002, "epoch": 6.007598415649503, "step": 37160}, {"loss": 0.4456, "grad_norm": 0.8479695916175842, "learning_rate": 0.0002, "epoch": 6.009215099830248, "step": 37170}, {"loss": 0.4573, "grad_norm": 0.8841437101364136, "learning_rate": 0.0002, "epoch": 6.0108317840109935, "step": 37180}, {"loss": 0.4565, "grad_norm": 0.8900154829025269, "learning_rate": 0.0002, "epoch": 6.012448468191739, "step": 37190}, {"loss": 0.457, "grad_norm": 1.2753345966339111, "learning_rate": 0.0002, "epoch": 6.014065152372484, "step": 37200}, {"loss": 0.4365, "grad_norm": 1.4625498056411743, "learning_rate": 0.0002, "epoch": 6.015681836553229, "step": 37210}, {"loss": 0.4252, "grad_norm": 0.7455034852027893, "learning_rate": 0.0002, "epoch": 6.017298520733974, "step": 37220}, {"loss": 0.4433, "grad_norm": 1.1658862829208374, "learning_rate": 0.0002, "epoch": 6.01891520491472, "step": 37230}, {"loss": 0.4499, "grad_norm": 0.9785751104354858, "learning_rate": 0.0002, "epoch": 6.020531889095465, "step": 37240}, {"loss": 0.4956, "grad_norm": 1.3193122148513794, "learning_rate": 0.0002, "epoch": 6.02214857327621, "step": 37250}, {"loss": 0.4727, "grad_norm": 1.038273572921753, "learning_rate": 0.0002, "epoch": 6.023765257456955, "step": 37260}, {"loss": 0.4395, "grad_norm": 1.0550594329833984, "learning_rate": 0.0002, "epoch": 6.0253819416377015, "step": 37270}, {"loss": 0.4767, "grad_norm": 0.9745930433273315, "learning_rate": 0.0002, "epoch": 6.026998625818447, "step": 37280}, {"loss": 0.4233, "grad_norm": 0.9273530840873718, "learning_rate": 0.0002, "epoch": 6.028615309999192, "step": 37290}, {"loss": 0.4195, "grad_norm": 1.3844057321548462, "learning_rate": 0.0002, "epoch": 6.030231994179937, "step": 37300}, {"loss": 0.4768, "grad_norm": 1.2058762311935425, "learning_rate": 0.0002, "epoch": 6.031848678360682, "step": 37310}, {"loss": 0.4499, "grad_norm": 1.242663025856018, "learning_rate": 0.0002, "epoch": 6.033465362541428, "step": 37320}, {"loss": 0.4597, "grad_norm": 1.3504270315170288, "learning_rate": 0.0002, "epoch": 6.035082046722173, "step": 37330}, {"loss": 0.4402, "grad_norm": 0.8734912276268005, "learning_rate": 0.0002, "epoch": 6.036698730902918, "step": 37340}, {"loss": 0.477, "grad_norm": 1.0182311534881592, "learning_rate": 0.0002, "epoch": 6.038315415083663, "step": 37350}, {"loss": 0.4261, "grad_norm": 0.9898499846458435, "learning_rate": 0.0002, "epoch": 6.0399320992644085, "step": 37360}, {"loss": 0.4459, "grad_norm": 1.0637860298156738, "learning_rate": 0.0002, "epoch": 6.041548783445154, "step": 37370}, {"loss": 0.4958, "grad_norm": 1.0099523067474365, "learning_rate": 0.0002, "epoch": 6.043165467625899, "step": 37380}, {"loss": 0.4459, "grad_norm": 1.1080750226974487, "learning_rate": 0.0002, "epoch": 6.044782151806644, "step": 37390}, {"loss": 0.4473, "grad_norm": 1.2551289796829224, "learning_rate": 0.0002, "epoch": 6.0463988359873895, "step": 37400}, {"loss": 0.468, "grad_norm": 0.8959632515907288, "learning_rate": 0.0002, "epoch": 6.048015520168136, "step": 37410}, {"loss": 0.4255, "grad_norm": 1.1748892068862915, "learning_rate": 0.0002, "epoch": 6.049632204348881, "step": 37420}, {"loss": 0.4458, "grad_norm": 1.3122745752334595, "learning_rate": 0.0002, "epoch": 6.051248888529626, "step": 37430}, {"loss": 0.4676, "grad_norm": 1.0227985382080078, "learning_rate": 0.0002, "epoch": 6.052865572710371, "step": 37440}, {"loss": 0.4503, "grad_norm": 1.0380030870437622, "learning_rate": 0.0002, "epoch": 6.0544822568911165, "step": 37450}, {"loss": 0.4686, "grad_norm": 0.8919622898101807, "learning_rate": 0.0002, "epoch": 6.056098941071862, "step": 37460}, {"loss": 0.4406, "grad_norm": 1.4554150104522705, "learning_rate": 0.0002, "epoch": 6.057715625252607, "step": 37470}, {"loss": 0.4688, "grad_norm": 1.2853292226791382, "learning_rate": 0.0002, "epoch": 6.059332309433352, "step": 37480}, {"loss": 0.4489, "grad_norm": 1.2951840162277222, "learning_rate": 0.0002, "epoch": 6.0609489936140974, "step": 37490}, {"loss": 0.4819, "grad_norm": 1.1750973463058472, "learning_rate": 0.0002, "epoch": 6.062565677794843, "step": 37500}, {"loss": 0.4574, "grad_norm": 0.9328424334526062, "learning_rate": 0.0002, "epoch": 6.064182361975588, "step": 37510}, {"loss": 0.4597, "grad_norm": 1.0353537797927856, "learning_rate": 0.0002, "epoch": 6.065799046156333, "step": 37520}, {"loss": 0.4407, "grad_norm": 1.1594274044036865, "learning_rate": 0.0002, "epoch": 6.067415730337078, "step": 37530}, {"loss": 0.4642, "grad_norm": 0.9034168124198914, "learning_rate": 0.0002, "epoch": 6.069032414517824, "step": 37540}, {"loss": 0.4625, "grad_norm": 1.068617820739746, "learning_rate": 0.0002, "epoch": 6.070649098698569, "step": 37550}, {"loss": 0.4378, "grad_norm": 1.0931321382522583, "learning_rate": 0.0002, "epoch": 6.072265782879315, "step": 37560}, {"loss": 0.4527, "grad_norm": 1.2542688846588135, "learning_rate": 0.0002, "epoch": 6.07388246706006, "step": 37570}, {"loss": 0.4725, "grad_norm": 1.273384928703308, "learning_rate": 0.0002, "epoch": 6.075499151240805, "step": 37580}, {"loss": 0.4928, "grad_norm": 1.4771400690078735, "learning_rate": 0.0002, "epoch": 6.077115835421551, "step": 37590}, {"loss": 0.461, "grad_norm": 1.3751444816589355, "learning_rate": 0.0002, "epoch": 6.078732519602296, "step": 37600}, {"loss": 0.4602, "grad_norm": 1.4532550573349, "learning_rate": 0.0002, "epoch": 6.080349203783041, "step": 37610}, {"loss": 0.4428, "grad_norm": 1.3175991773605347, "learning_rate": 0.0002, "epoch": 6.081965887963786, "step": 37620}, {"loss": 0.4746, "grad_norm": 1.0624970197677612, "learning_rate": 0.0002, "epoch": 6.083582572144532, "step": 37630}, {"loss": 0.413, "grad_norm": 1.099715232849121, "learning_rate": 0.0002, "epoch": 6.085199256325277, "step": 37640}, {"loss": 0.4528, "grad_norm": 1.0380114316940308, "learning_rate": 0.0002, "epoch": 6.086815940506022, "step": 37650}, {"loss": 0.4373, "grad_norm": 1.1136109828948975, "learning_rate": 0.0002, "epoch": 6.088432624686767, "step": 37660}, {"loss": 0.4915, "grad_norm": 0.996498703956604, "learning_rate": 0.0002, "epoch": 6.0900493088675125, "step": 37670}, {"loss": 0.4713, "grad_norm": 1.0552574396133423, "learning_rate": 0.0002, "epoch": 6.091665993048258, "step": 37680}, {"loss": 0.4414, "grad_norm": 1.4108527898788452, "learning_rate": 0.0002, "epoch": 6.093282677229003, "step": 37690}, {"loss": 0.4851, "grad_norm": 1.1323093175888062, "learning_rate": 0.0002, "epoch": 6.094899361409748, "step": 37700}, {"loss": 0.4455, "grad_norm": 0.9364377856254578, "learning_rate": 0.0002, "epoch": 6.096516045590494, "step": 37710}, {"loss": 0.4791, "grad_norm": 1.1300561428070068, "learning_rate": 0.0002, "epoch": 6.0981327297712395, "step": 37720}, {"loss": 0.4539, "grad_norm": 1.0616047382354736, "learning_rate": 0.0002, "epoch": 6.099749413951985, "step": 37730}, {"loss": 0.4516, "grad_norm": 1.1205905675888062, "learning_rate": 0.0002, "epoch": 6.10136609813273, "step": 37740}, {"loss": 0.4688, "grad_norm": 0.9592534303665161, "learning_rate": 0.0002, "epoch": 6.102982782313475, "step": 37750}, {"loss": 0.4494, "grad_norm": 0.9797531962394714, "learning_rate": 0.0002, "epoch": 6.1045994664942205, "step": 37760}, {"loss": 0.4237, "grad_norm": 1.093404769897461, "learning_rate": 0.0002, "epoch": 6.106216150674966, "step": 37770}, {"loss": 0.4691, "grad_norm": 1.2172642946243286, "learning_rate": 0.0002, "epoch": 6.107832834855711, "step": 37780}, {"loss": 0.4398, "grad_norm": 1.0467255115509033, "learning_rate": 0.0002, "epoch": 6.109449519036456, "step": 37790}, {"loss": 0.4676, "grad_norm": 1.159318208694458, "learning_rate": 0.0002, "epoch": 6.111066203217201, "step": 37800}, {"loss": 0.4539, "grad_norm": 1.0615603923797607, "learning_rate": 0.0002, "epoch": 6.112682887397947, "step": 37810}, {"loss": 0.4957, "grad_norm": 1.0542045831680298, "learning_rate": 0.0002, "epoch": 6.114299571578692, "step": 37820}, {"loss": 0.4512, "grad_norm": 0.8962697982788086, "learning_rate": 0.0002, "epoch": 6.115916255759437, "step": 37830}, {"loss": 0.4519, "grad_norm": 1.106352686882019, "learning_rate": 0.0002, "epoch": 6.117532939940182, "step": 37840}, {"loss": 0.4421, "grad_norm": 1.1660276651382446, "learning_rate": 0.0002, "epoch": 6.1191496241209276, "step": 37850}, {"loss": 0.4701, "grad_norm": 1.3524385690689087, "learning_rate": 0.0002, "epoch": 6.120766308301674, "step": 37860}, {"loss": 0.4684, "grad_norm": 1.1056050062179565, "learning_rate": 0.0002, "epoch": 6.122382992482419, "step": 37870}, {"loss": 0.4518, "grad_norm": 1.0772725343704224, "learning_rate": 0.0002, "epoch": 6.123999676663164, "step": 37880}, {"loss": 0.4356, "grad_norm": 1.1011115312576294, "learning_rate": 0.0002, "epoch": 6.125616360843909, "step": 37890}, {"loss": 0.4909, "grad_norm": 0.8952536582946777, "learning_rate": 0.0002, "epoch": 6.127233045024655, "step": 37900}, {"loss": 0.4299, "grad_norm": 1.244398593902588, "learning_rate": 0.0002, "epoch": 6.1288497292054, "step": 37910}, {"loss": 0.4764, "grad_norm": 0.9658283591270447, "learning_rate": 0.0002, "epoch": 6.130466413386145, "step": 37920}, {"loss": 0.4378, "grad_norm": 1.0649068355560303, "learning_rate": 0.0002, "epoch": 6.13208309756689, "step": 37930}, {"loss": 0.4638, "grad_norm": 0.94698166847229, "learning_rate": 0.0002, "epoch": 6.1336997817476355, "step": 37940}, {"loss": 0.488, "grad_norm": 1.1450897455215454, "learning_rate": 0.0002, "epoch": 6.135316465928381, "step": 37950}, {"loss": 0.4791, "grad_norm": 1.032482624053955, "learning_rate": 0.0002, "epoch": 6.136933150109126, "step": 37960}, {"loss": 0.4179, "grad_norm": 1.0993428230285645, "learning_rate": 0.0002, "epoch": 6.138549834289871, "step": 37970}, {"loss": 0.4781, "grad_norm": 1.2907029390335083, "learning_rate": 0.0002, "epoch": 6.1401665184706165, "step": 37980}, {"loss": 0.4671, "grad_norm": 1.1007903814315796, "learning_rate": 0.0002, "epoch": 6.141783202651362, "step": 37990}, {"loss": 0.4213, "grad_norm": 0.9286124110221863, "learning_rate": 0.0002, "epoch": 6.143399886832107, "step": 38000}, {"loss": 0.4741, "grad_norm": 1.1426366567611694, "learning_rate": 0.0002, "epoch": 6.145016571012853, "step": 38010}, {"loss": 0.4746, "grad_norm": 1.2608287334442139, "learning_rate": 0.0002, "epoch": 6.146633255193598, "step": 38020}, {"loss": 0.454, "grad_norm": 1.1346837282180786, "learning_rate": 0.0002, "epoch": 6.1482499393743435, "step": 38030}, {"loss": 0.4469, "grad_norm": 1.144080400466919, "learning_rate": 0.0002, "epoch": 6.149866623555089, "step": 38040}, {"loss": 0.4515, "grad_norm": 1.3456705808639526, "learning_rate": 0.0002, "epoch": 6.151483307735834, "step": 38050}, {"loss": 0.4775, "grad_norm": 1.0517960786819458, "learning_rate": 0.0002, "epoch": 6.153099991916579, "step": 38060}, {"loss": 0.4986, "grad_norm": 1.1887445449829102, "learning_rate": 0.0002, "epoch": 6.154716676097324, "step": 38070}, {"loss": 0.4516, "grad_norm": 1.0449163913726807, "learning_rate": 0.0002, "epoch": 6.15633336027807, "step": 38080}, {"loss": 0.4808, "grad_norm": 1.3218743801116943, "learning_rate": 0.0002, "epoch": 6.157950044458815, "step": 38090}, {"loss": 0.4632, "grad_norm": 1.003208875656128, "learning_rate": 0.0002, "epoch": 6.15956672863956, "step": 38100}, {"loss": 0.4978, "grad_norm": 1.008623719215393, "learning_rate": 0.0002, "epoch": 6.161183412820305, "step": 38110}, {"loss": 0.4608, "grad_norm": 1.2122787237167358, "learning_rate": 0.0002, "epoch": 6.162800097001051, "step": 38120}, {"loss": 0.4666, "grad_norm": 1.253403902053833, "learning_rate": 0.0002, "epoch": 6.164416781181796, "step": 38130}, {"loss": 0.4778, "grad_norm": 1.2289724349975586, "learning_rate": 0.0002, "epoch": 6.166033465362541, "step": 38140}, {"loss": 0.4774, "grad_norm": 1.330694556236267, "learning_rate": 0.0002, "epoch": 6.167650149543286, "step": 38150}, {"loss": 0.4699, "grad_norm": 1.0946741104125977, "learning_rate": 0.0002, "epoch": 6.169266833724032, "step": 38160}, {"loss": 0.4816, "grad_norm": 1.0719934701919556, "learning_rate": 0.0002, "epoch": 6.170883517904778, "step": 38170}, {"loss": 0.4678, "grad_norm": 1.1142133474349976, "learning_rate": 0.0002, "epoch": 6.172500202085523, "step": 38180}, {"loss": 0.4911, "grad_norm": 1.1221938133239746, "learning_rate": 0.0002, "epoch": 6.174116886266268, "step": 38190}, {"loss": 0.4462, "grad_norm": 1.1391617059707642, "learning_rate": 0.0002, "epoch": 6.175733570447013, "step": 38200}, {"loss": 0.4867, "grad_norm": 1.2263455390930176, "learning_rate": 0.0002, "epoch": 6.1773502546277586, "step": 38210}, {"loss": 0.4633, "grad_norm": 1.0930434465408325, "learning_rate": 0.0002, "epoch": 6.178966938808504, "step": 38220}, {"loss": 0.4406, "grad_norm": 1.3489030599594116, "learning_rate": 0.0002, "epoch": 6.180583622989249, "step": 38230}, {"loss": 0.4994, "grad_norm": 1.1383486986160278, "learning_rate": 0.0002, "epoch": 6.182200307169994, "step": 38240}, {"loss": 0.4851, "grad_norm": 1.2408897876739502, "learning_rate": 0.0002, "epoch": 6.1838169913507395, "step": 38250}, {"loss": 0.4848, "grad_norm": 1.1436222791671753, "learning_rate": 0.0002, "epoch": 6.185433675531485, "step": 38260}, {"loss": 0.4594, "grad_norm": 1.370117425918579, "learning_rate": 0.0002, "epoch": 6.18705035971223, "step": 38270}, {"loss": 0.5023, "grad_norm": 0.8862423300743103, "learning_rate": 0.0002, "epoch": 6.188667043892975, "step": 38280}, {"loss": 0.4559, "grad_norm": 0.9603779315948486, "learning_rate": 0.0002, "epoch": 6.19028372807372, "step": 38290}, {"loss": 0.4835, "grad_norm": 1.389291524887085, "learning_rate": 0.0002, "epoch": 6.191900412254466, "step": 38300}, {"loss": 0.4435, "grad_norm": 1.0767031908035278, "learning_rate": 0.0002, "epoch": 6.193517096435212, "step": 38310}, {"loss": 0.4683, "grad_norm": 1.1800403594970703, "learning_rate": 0.0002, "epoch": 6.195133780615957, "step": 38320}, {"loss": 0.4608, "grad_norm": 0.997891366481781, "learning_rate": 0.0002, "epoch": 6.196750464796702, "step": 38330}, {"loss": 0.4575, "grad_norm": 1.1201492547988892, "learning_rate": 0.0002, "epoch": 6.1983671489774474, "step": 38340}, {"loss": 0.4952, "grad_norm": 0.9769026637077332, "learning_rate": 0.0002, "epoch": 6.199983833158193, "step": 38350}, {"loss": 0.4563, "grad_norm": 0.9447069764137268, "learning_rate": 0.0002, "epoch": 6.201600517338938, "step": 38360}, {"loss": 0.516, "grad_norm": 1.0959235429763794, "learning_rate": 0.0002, "epoch": 6.203217201519683, "step": 38370}, {"loss": 0.4688, "grad_norm": 1.2495406866073608, "learning_rate": 0.0002, "epoch": 6.204833885700428, "step": 38380}, {"loss": 0.4445, "grad_norm": 0.8589218258857727, "learning_rate": 0.0002, "epoch": 6.206450569881174, "step": 38390}, {"loss": 0.4808, "grad_norm": 0.959155797958374, "learning_rate": 0.0002, "epoch": 6.208067254061919, "step": 38400}, {"loss": 0.4622, "grad_norm": 1.0105533599853516, "learning_rate": 0.0002, "epoch": 6.209683938242664, "step": 38410}, {"loss": 0.4887, "grad_norm": 0.9824615120887756, "learning_rate": 0.0002, "epoch": 6.211300622423409, "step": 38420}, {"loss": 0.4656, "grad_norm": 0.8616500496864319, "learning_rate": 0.0002, "epoch": 6.2129173066041545, "step": 38430}, {"loss": 0.449, "grad_norm": 1.2917758226394653, "learning_rate": 0.0002, "epoch": 6.2145339907849, "step": 38440}, {"loss": 0.4201, "grad_norm": 1.0564531087875366, "learning_rate": 0.0002, "epoch": 6.216150674965646, "step": 38450}, {"loss": 0.4849, "grad_norm": 1.152331829071045, "learning_rate": 0.0002, "epoch": 6.217767359146391, "step": 38460}, {"loss": 0.4887, "grad_norm": 0.9152206778526306, "learning_rate": 0.0002, "epoch": 6.219384043327136, "step": 38470}, {"loss": 0.4686, "grad_norm": 0.9931167960166931, "learning_rate": 0.0002, "epoch": 6.221000727507882, "step": 38480}, {"loss": 0.4765, "grad_norm": 1.3248072862625122, "learning_rate": 0.0002, "epoch": 6.222617411688627, "step": 38490}, {"loss": 0.4636, "grad_norm": 1.3916507959365845, "learning_rate": 0.0002, "epoch": 6.224234095869372, "step": 38500}, {"loss": 0.506, "grad_norm": 1.1775140762329102, "learning_rate": 0.0002, "epoch": 6.225850780050117, "step": 38510}, {"loss": 0.47, "grad_norm": 1.1581059694290161, "learning_rate": 0.0002, "epoch": 6.2274674642308625, "step": 38520}, {"loss": 0.4679, "grad_norm": 1.359320878982544, "learning_rate": 0.0002, "epoch": 6.229084148411608, "step": 38530}, {"loss": 0.4697, "grad_norm": 1.185041904449463, "learning_rate": 0.0002, "epoch": 6.230700832592353, "step": 38540}, {"loss": 0.4815, "grad_norm": 1.1861097812652588, "learning_rate": 0.0002, "epoch": 6.232317516773098, "step": 38550}, {"loss": 0.4925, "grad_norm": 1.126990556716919, "learning_rate": 0.0002, "epoch": 6.233934200953843, "step": 38560}, {"loss": 0.4414, "grad_norm": 0.9744541049003601, "learning_rate": 0.0002, "epoch": 6.235550885134589, "step": 38570}, {"loss": 0.4577, "grad_norm": 1.1260887384414673, "learning_rate": 0.0002, "epoch": 6.237167569315334, "step": 38580}, {"loss": 0.4852, "grad_norm": 1.1290327310562134, "learning_rate": 0.0002, "epoch": 6.238784253496079, "step": 38590}, {"loss": 0.4805, "grad_norm": 1.0952879190444946, "learning_rate": 0.0002, "epoch": 6.240400937676825, "step": 38600}, {"loss": 0.4436, "grad_norm": 1.1037684679031372, "learning_rate": 0.0002, "epoch": 6.2420176218575705, "step": 38610}, {"loss": 0.466, "grad_norm": 1.1356085538864136, "learning_rate": 0.0002, "epoch": 6.243634306038316, "step": 38620}, {"loss": 0.5129, "grad_norm": 1.0677106380462646, "learning_rate": 0.0002, "epoch": 6.245250990219061, "step": 38630}, {"loss": 0.4907, "grad_norm": 1.1573411226272583, "learning_rate": 0.0002, "epoch": 6.246867674399806, "step": 38640}, {"loss": 0.5098, "grad_norm": 1.2707505226135254, "learning_rate": 0.0002, "epoch": 6.248484358580551, "step": 38650}, {"loss": 0.4926, "grad_norm": 1.0480109453201294, "learning_rate": 0.0002, "epoch": 6.250101042761297, "step": 38660}, {"loss": 0.4654, "grad_norm": 1.3668724298477173, "learning_rate": 0.0002, "epoch": 6.251717726942042, "step": 38670}, {"loss": 0.5128, "grad_norm": 1.217289686203003, "learning_rate": 0.0002, "epoch": 6.253334411122787, "step": 38680}, {"loss": 0.4621, "grad_norm": 1.2950236797332764, "learning_rate": 0.0002, "epoch": 6.254951095303532, "step": 38690}, {"loss": 0.5076, "grad_norm": 1.4506934881210327, "learning_rate": 0.0002, "epoch": 6.256567779484278, "step": 38700}, {"loss": 0.4803, "grad_norm": 1.1248667240142822, "learning_rate": 0.0002, "epoch": 6.258184463665023, "step": 38710}, {"loss": 0.4746, "grad_norm": 1.3384023904800415, "learning_rate": 0.0002, "epoch": 6.259801147845768, "step": 38720}, {"loss": 0.473, "grad_norm": 1.128074288368225, "learning_rate": 0.0002, "epoch": 6.261417832026513, "step": 38730}, {"loss": 0.4638, "grad_norm": 1.1169012784957886, "learning_rate": 0.0002, "epoch": 6.263034516207259, "step": 38740}, {"loss": 0.4747, "grad_norm": 1.195198893547058, "learning_rate": 0.0002, "epoch": 6.264651200388005, "step": 38750}, {"loss": 0.4906, "grad_norm": 1.2471518516540527, "learning_rate": 0.0002, "epoch": 6.26626788456875, "step": 38760}, {"loss": 0.4507, "grad_norm": 1.2646394968032837, "learning_rate": 0.0002, "epoch": 6.267884568749495, "step": 38770}, {"loss": 0.4934, "grad_norm": 1.0286450386047363, "learning_rate": 0.0002, "epoch": 6.26950125293024, "step": 38780}, {"loss": 0.4787, "grad_norm": 1.2440695762634277, "learning_rate": 0.0002, "epoch": 6.2711179371109855, "step": 38790}, {"loss": 0.4806, "grad_norm": 0.8941256403923035, "learning_rate": 0.0002, "epoch": 6.272734621291731, "step": 38800}, {"loss": 0.4741, "grad_norm": 1.0693447589874268, "learning_rate": 0.0002, "epoch": 6.274351305472476, "step": 38810}, {"loss": 0.4408, "grad_norm": 1.0936840772628784, "learning_rate": 0.0002, "epoch": 6.275967989653221, "step": 38820}, {"loss": 0.4729, "grad_norm": 1.0961874723434448, "learning_rate": 0.0002, "epoch": 6.2775846738339665, "step": 38830}, {"loss": 0.4504, "grad_norm": 1.1465433835983276, "learning_rate": 0.0002, "epoch": 6.279201358014712, "step": 38840}, {"loss": 0.4771, "grad_norm": 1.2987004518508911, "learning_rate": 0.0002, "epoch": 6.280818042195457, "step": 38850}, {"loss": 0.4945, "grad_norm": 1.1310304403305054, "learning_rate": 0.0002, "epoch": 6.282434726376202, "step": 38860}, {"loss": 0.5346, "grad_norm": 1.306538462638855, "learning_rate": 0.0002, "epoch": 6.284051410556947, "step": 38870}, {"loss": 0.4873, "grad_norm": 1.2405401468276978, "learning_rate": 0.0002, "epoch": 6.285668094737693, "step": 38880}, {"loss": 0.4929, "grad_norm": 1.0934767723083496, "learning_rate": 0.0002, "epoch": 6.287284778918439, "step": 38890}, {"loss": 0.4853, "grad_norm": 1.3370496034622192, "learning_rate": 0.0002, "epoch": 6.288901463099184, "step": 38900}, {"loss": 0.4892, "grad_norm": 1.0319404602050781, "learning_rate": 0.0002, "epoch": 6.290518147279929, "step": 38910}, {"loss": 0.4685, "grad_norm": 0.9734271168708801, "learning_rate": 0.0002, "epoch": 6.292134831460674, "step": 38920}, {"loss": 0.5085, "grad_norm": 1.0940454006195068, "learning_rate": 0.0002, "epoch": 6.29375151564142, "step": 38930}, {"loss": 0.4985, "grad_norm": 1.036500334739685, "learning_rate": 0.0002, "epoch": 6.295368199822165, "step": 38940}, {"loss": 0.4878, "grad_norm": 1.020308256149292, "learning_rate": 0.0002, "epoch": 6.29698488400291, "step": 38950}, {"loss": 0.4668, "grad_norm": 1.1416399478912354, "learning_rate": 0.0002, "epoch": 6.298601568183655, "step": 38960}, {"loss": 0.4727, "grad_norm": 1.2497479915618896, "learning_rate": 0.0002, "epoch": 6.300218252364401, "step": 38970}, {"loss": 0.4721, "grad_norm": 1.1692523956298828, "learning_rate": 0.0002, "epoch": 6.301834936545146, "step": 38980}, {"loss": 0.505, "grad_norm": 1.0693109035491943, "learning_rate": 0.0002, "epoch": 6.303451620725891, "step": 38990}, {"loss": 0.4875, "grad_norm": 0.8883291482925415, "learning_rate": 0.0002, "epoch": 6.305068304906636, "step": 39000}, {"loss": 0.5371, "grad_norm": 1.1445088386535645, "learning_rate": 0.0002, "epoch": 6.3066849890873815, "step": 39010}, {"loss": 0.5089, "grad_norm": 1.226792335510254, "learning_rate": 0.0002, "epoch": 6.308301673268127, "step": 39020}, {"loss": 0.474, "grad_norm": 1.0498932600021362, "learning_rate": 0.0002, "epoch": 6.309918357448872, "step": 39030}, {"loss": 0.4964, "grad_norm": 1.0834535360336304, "learning_rate": 0.0002, "epoch": 6.311535041629618, "step": 39040}, {"loss": 0.4733, "grad_norm": 1.144666075706482, "learning_rate": 0.0002, "epoch": 6.313151725810363, "step": 39050}, {"loss": 0.4784, "grad_norm": 1.1468489170074463, "learning_rate": 0.0002, "epoch": 6.3147684099911086, "step": 39060}, {"loss": 0.4911, "grad_norm": 1.290949821472168, "learning_rate": 0.0002, "epoch": 6.316385094171854, "step": 39070}, {"loss": 0.5002, "grad_norm": 1.087868094444275, "learning_rate": 0.0002, "epoch": 6.318001778352599, "step": 39080}, {"loss": 0.4944, "grad_norm": 1.0156296491622925, "learning_rate": 0.0002, "epoch": 6.319618462533344, "step": 39090}, {"loss": 0.5019, "grad_norm": 1.0805060863494873, "learning_rate": 0.0002, "epoch": 6.3212351467140895, "step": 39100}, {"loss": 0.4598, "grad_norm": 0.9030579924583435, "learning_rate": 0.0002, "epoch": 6.322851830894835, "step": 39110}, {"loss": 0.4635, "grad_norm": 1.1488285064697266, "learning_rate": 0.0002, "epoch": 6.32446851507558, "step": 39120}, {"loss": 0.5368, "grad_norm": 1.2050796747207642, "learning_rate": 0.0002, "epoch": 6.326085199256325, "step": 39130}, {"loss": 0.4854, "grad_norm": 1.093451738357544, "learning_rate": 0.0002, "epoch": 6.32770188343707, "step": 39140}, {"loss": 0.5055, "grad_norm": 1.2046772241592407, "learning_rate": 0.0002, "epoch": 6.329318567617816, "step": 39150}, {"loss": 0.4703, "grad_norm": 1.045777678489685, "learning_rate": 0.0002, "epoch": 6.330935251798561, "step": 39160}, {"loss": 0.513, "grad_norm": 1.2008492946624756, "learning_rate": 0.0002, "epoch": 6.332551935979306, "step": 39170}, {"loss": 0.4909, "grad_norm": 1.0613869428634644, "learning_rate": 0.0002, "epoch": 6.334168620160051, "step": 39180}, {"loss": 0.4708, "grad_norm": 1.058440089225769, "learning_rate": 0.0002, "epoch": 6.3357853043407975, "step": 39190}, {"loss": 0.4719, "grad_norm": 1.195658802986145, "learning_rate": 0.0002, "epoch": 6.337401988521543, "step": 39200}, {"loss": 0.4901, "grad_norm": 1.1595174074172974, "learning_rate": 0.0002, "epoch": 6.339018672702288, "step": 39210}, {"loss": 0.4587, "grad_norm": 1.0674750804901123, "learning_rate": 0.0002, "epoch": 6.340635356883033, "step": 39220}, {"loss": 0.4801, "grad_norm": 1.3306758403778076, "learning_rate": 0.0002, "epoch": 6.342252041063778, "step": 39230}, {"loss": 0.4839, "grad_norm": 1.3582593202590942, "learning_rate": 0.0002, "epoch": 6.343868725244524, "step": 39240}, {"loss": 0.4964, "grad_norm": 1.2351572513580322, "learning_rate": 0.0002, "epoch": 6.345485409425269, "step": 39250}, {"loss": 0.4806, "grad_norm": 1.3623450994491577, "learning_rate": 0.0002, "epoch": 6.347102093606014, "step": 39260}, {"loss": 0.466, "grad_norm": 1.201270580291748, "learning_rate": 0.0002, "epoch": 6.348718777786759, "step": 39270}, {"loss": 0.4899, "grad_norm": 0.9300584197044373, "learning_rate": 0.0002, "epoch": 6.3503354619675045, "step": 39280}, {"loss": 0.4867, "grad_norm": 0.944525957107544, "learning_rate": 0.0002, "epoch": 6.35195214614825, "step": 39290}, {"loss": 0.4954, "grad_norm": 1.4263732433319092, "learning_rate": 0.0002, "epoch": 6.353568830328995, "step": 39300}, {"loss": 0.4982, "grad_norm": 1.392592191696167, "learning_rate": 0.0002, "epoch": 6.35518551450974, "step": 39310}, {"loss": 0.4868, "grad_norm": 1.0753393173217773, "learning_rate": 0.0002, "epoch": 6.3568021986904855, "step": 39320}, {"loss": 0.4896, "grad_norm": 1.0088151693344116, "learning_rate": 0.0002, "epoch": 6.358418882871231, "step": 39330}, {"loss": 0.4684, "grad_norm": 1.1784582138061523, "learning_rate": 0.0002, "epoch": 6.360035567051977, "step": 39340}, {"loss": 0.4732, "grad_norm": 1.020526647567749, "learning_rate": 0.0002, "epoch": 6.361652251232722, "step": 39350}, {"loss": 0.5177, "grad_norm": 1.1400747299194336, "learning_rate": 0.0002, "epoch": 6.363268935413467, "step": 39360}, {"loss": 0.4976, "grad_norm": 0.9960665702819824, "learning_rate": 0.0002, "epoch": 6.3648856195942125, "step": 39370}, {"loss": 0.483, "grad_norm": 1.1547569036483765, "learning_rate": 0.0002, "epoch": 6.366502303774958, "step": 39380}, {"loss": 0.4861, "grad_norm": 1.2180676460266113, "learning_rate": 0.0002, "epoch": 6.368118987955703, "step": 39390}, {"loss": 0.4805, "grad_norm": 1.1391799449920654, "learning_rate": 0.0002, "epoch": 6.369735672136448, "step": 39400}, {"loss": 0.5004, "grad_norm": 1.2893574237823486, "learning_rate": 0.0002, "epoch": 6.371352356317193, "step": 39410}, {"loss": 0.4807, "grad_norm": 1.192878246307373, "learning_rate": 0.0002, "epoch": 6.372969040497939, "step": 39420}, {"loss": 0.4637, "grad_norm": 0.9771704077720642, "learning_rate": 0.0002, "epoch": 6.374585724678684, "step": 39430}, {"loss": 0.4867, "grad_norm": 1.285387635231018, "learning_rate": 0.0002, "epoch": 6.376202408859429, "step": 39440}, {"loss": 0.4593, "grad_norm": 1.019957184791565, "learning_rate": 0.0002, "epoch": 6.377819093040174, "step": 39450}, {"loss": 0.473, "grad_norm": 1.2002915143966675, "learning_rate": 0.0002, "epoch": 6.37943577722092, "step": 39460}, {"loss": 0.5025, "grad_norm": 1.3285092115402222, "learning_rate": 0.0002, "epoch": 6.381052461401665, "step": 39470}, {"loss": 0.4626, "grad_norm": 1.097846269607544, "learning_rate": 0.0002, "epoch": 6.38266914558241, "step": 39480}, {"loss": 0.5109, "grad_norm": 0.9537988305091858, "learning_rate": 0.0002, "epoch": 6.384285829763156, "step": 39490}, {"loss": 0.4492, "grad_norm": 1.0350042581558228, "learning_rate": 0.0002, "epoch": 6.385902513943901, "step": 39500}, {"loss": 0.4824, "grad_norm": 0.9559133052825928, "learning_rate": 0.0002, "epoch": 6.387519198124647, "step": 39510}, {"loss": 0.5189, "grad_norm": 0.9615123271942139, "learning_rate": 0.0002, "epoch": 6.389135882305392, "step": 39520}, {"loss": 0.4915, "grad_norm": 1.0604504346847534, "learning_rate": 0.0002, "epoch": 6.390752566486137, "step": 39530}, {"loss": 0.5315, "grad_norm": 1.2460750341415405, "learning_rate": 0.0002, "epoch": 6.392369250666882, "step": 39540}, {"loss": 0.4929, "grad_norm": 1.1496477127075195, "learning_rate": 0.0002, "epoch": 6.393985934847628, "step": 39550}, {"loss": 0.4872, "grad_norm": 1.048043966293335, "learning_rate": 0.0002, "epoch": 6.395602619028373, "step": 39560}, {"loss": 0.5231, "grad_norm": 1.333539366722107, "learning_rate": 0.0002, "epoch": 6.397219303209118, "step": 39570}, {"loss": 0.4877, "grad_norm": 1.0605626106262207, "learning_rate": 0.0002, "epoch": 6.398835987389863, "step": 39580}, {"loss": 0.4643, "grad_norm": 1.163220763206482, "learning_rate": 0.0002, "epoch": 6.4004526715706085, "step": 39590}, {"loss": 0.4824, "grad_norm": 1.1878494024276733, "learning_rate": 0.0002, "epoch": 6.402069355751354, "step": 39600}, {"loss": 0.5242, "grad_norm": 1.4630796909332275, "learning_rate": 0.0002, "epoch": 6.403686039932099, "step": 39610}, {"loss": 0.4985, "grad_norm": 1.073255181312561, "learning_rate": 0.0002, "epoch": 6.405302724112844, "step": 39620}, {"loss": 0.5108, "grad_norm": 1.0538873672485352, "learning_rate": 0.0002, "epoch": 6.406919408293589, "step": 39630}, {"loss": 0.4801, "grad_norm": 1.015525221824646, "learning_rate": 0.0002, "epoch": 6.4085360924743355, "step": 39640}, {"loss": 0.4781, "grad_norm": 1.1454379558563232, "learning_rate": 0.0002, "epoch": 6.410152776655081, "step": 39650}, {"loss": 0.498, "grad_norm": 1.2801800966262817, "learning_rate": 0.0002, "epoch": 6.411769460835826, "step": 39660}, {"loss": 0.4804, "grad_norm": 1.077579140663147, "learning_rate": 0.0002, "epoch": 6.413386145016571, "step": 39670}, {"loss": 0.51, "grad_norm": 1.376662015914917, "learning_rate": 0.0002, "epoch": 6.4150028291973165, "step": 39680}, {"loss": 0.4956, "grad_norm": 1.2064344882965088, "learning_rate": 0.0002, "epoch": 6.416619513378062, "step": 39690}, {"loss": 0.4762, "grad_norm": 1.0689115524291992, "learning_rate": 0.0002, "epoch": 6.418236197558807, "step": 39700}, {"loss": 0.4762, "grad_norm": 0.9997019171714783, "learning_rate": 0.0002, "epoch": 6.419852881739552, "step": 39710}, {"loss": 0.49, "grad_norm": 1.2368080615997314, "learning_rate": 0.0002, "epoch": 6.421469565920297, "step": 39720}, {"loss": 0.4774, "grad_norm": 1.2085820436477661, "learning_rate": 0.0002, "epoch": 6.423086250101043, "step": 39730}, {"loss": 0.4671, "grad_norm": 1.057246208190918, "learning_rate": 0.0002, "epoch": 6.424702934281788, "step": 39740}, {"loss": 0.5315, "grad_norm": 1.1311043500900269, "learning_rate": 0.0002, "epoch": 6.426319618462533, "step": 39750}, {"loss": 0.5171, "grad_norm": 1.2352231740951538, "learning_rate": 0.0002, "epoch": 6.427936302643278, "step": 39760}, {"loss": 0.466, "grad_norm": 0.953233540058136, "learning_rate": 0.0002, "epoch": 6.4295529868240235, "step": 39770}, {"loss": 0.4834, "grad_norm": 1.0632505416870117, "learning_rate": 0.0002, "epoch": 6.431169671004769, "step": 39780}, {"loss": 0.5053, "grad_norm": 1.0916751623153687, "learning_rate": 0.0002, "epoch": 6.432786355185515, "step": 39790}, {"loss": 0.4788, "grad_norm": 0.9732703566551208, "learning_rate": 0.0002, "epoch": 6.43440303936626, "step": 39800}, {"loss": 0.4982, "grad_norm": 1.1673705577850342, "learning_rate": 0.0002, "epoch": 6.436019723547005, "step": 39810}, {"loss": 0.4484, "grad_norm": 1.1049559116363525, "learning_rate": 0.0002, "epoch": 6.437636407727751, "step": 39820}, {"loss": 0.4784, "grad_norm": 1.345277190208435, "learning_rate": 0.0002, "epoch": 6.439253091908496, "step": 39830}, {"loss": 0.4716, "grad_norm": 1.1118950843811035, "learning_rate": 0.0002, "epoch": 6.440869776089241, "step": 39840}, {"loss": 0.5133, "grad_norm": 1.4872850179672241, "learning_rate": 0.0002, "epoch": 6.442486460269986, "step": 39850}, {"loss": 0.4532, "grad_norm": 1.0763497352600098, "learning_rate": 0.0002, "epoch": 6.4441031444507315, "step": 39860}, {"loss": 0.4572, "grad_norm": 0.9245555400848389, "learning_rate": 0.0002, "epoch": 6.445719828631477, "step": 39870}, {"loss": 0.4917, "grad_norm": 1.4154807329177856, "learning_rate": 0.0002, "epoch": 6.447336512812222, "step": 39880}, {"loss": 0.4852, "grad_norm": 1.0885124206542969, "learning_rate": 0.0002, "epoch": 6.448953196992967, "step": 39890}, {"loss": 0.5399, "grad_norm": 1.3989344835281372, "learning_rate": 0.0002, "epoch": 6.450569881173712, "step": 39900}, {"loss": 0.509, "grad_norm": 0.9763124585151672, "learning_rate": 0.0002, "epoch": 6.452186565354458, "step": 39910}, {"loss": 0.5134, "grad_norm": 1.135272741317749, "learning_rate": 0.0002, "epoch": 6.453803249535203, "step": 39920}, {"loss": 0.4941, "grad_norm": 1.1140081882476807, "learning_rate": 0.0002, "epoch": 6.455419933715948, "step": 39930}, {"loss": 0.5137, "grad_norm": 1.0992448329925537, "learning_rate": 0.0002, "epoch": 6.457036617896694, "step": 39940}, {"loss": 0.4914, "grad_norm": 1.1658501625061035, "learning_rate": 0.0002, "epoch": 6.4586533020774395, "step": 39950}, {"loss": 0.5036, "grad_norm": 1.1122797727584839, "learning_rate": 0.0002, "epoch": 6.460269986258185, "step": 39960}, {"loss": 0.5159, "grad_norm": 0.9664968252182007, "learning_rate": 0.0002, "epoch": 6.46188667043893, "step": 39970}, {"loss": 0.4989, "grad_norm": 1.2513965368270874, "learning_rate": 0.0002, "epoch": 6.463503354619675, "step": 39980}, {"loss": 0.4694, "grad_norm": 1.1198630332946777, "learning_rate": 0.0002, "epoch": 6.46512003880042, "step": 39990}, {"loss": 0.5023, "grad_norm": 0.8783249855041504, "learning_rate": 0.0002, "epoch": 6.466736722981166, "step": 40000}, {"loss": 0.4648, "grad_norm": 1.1313109397888184, "learning_rate": 0.0002, "epoch": 6.468353407161911, "step": 40010}, {"loss": 0.4965, "grad_norm": 1.0854487419128418, "learning_rate": 0.0002, "epoch": 6.469970091342656, "step": 40020}, {"loss": 0.5253, "grad_norm": 1.1738566160202026, "learning_rate": 0.0002, "epoch": 6.471586775523401, "step": 40030}, {"loss": 0.4947, "grad_norm": 0.9720084071159363, "learning_rate": 0.0002, "epoch": 6.473203459704147, "step": 40040}, {"loss": 0.5218, "grad_norm": 1.105618953704834, "learning_rate": 0.0002, "epoch": 6.474820143884892, "step": 40050}, {"loss": 0.4943, "grad_norm": 1.2007657289505005, "learning_rate": 0.0002, "epoch": 6.476436828065637, "step": 40060}, {"loss": 0.4882, "grad_norm": 1.088402509689331, "learning_rate": 0.0002, "epoch": 6.478053512246382, "step": 40070}, {"loss": 0.504, "grad_norm": 1.0775291919708252, "learning_rate": 0.0002, "epoch": 6.4796701964271275, "step": 40080}, {"loss": 0.4791, "grad_norm": 1.1018189191818237, "learning_rate": 0.0002, "epoch": 6.481286880607874, "step": 40090}, {"loss": 0.488, "grad_norm": 1.1676557064056396, "learning_rate": 0.0002, "epoch": 6.482903564788619, "step": 40100}, {"loss": 0.4818, "grad_norm": 0.9619805812835693, "learning_rate": 0.0002, "epoch": 6.484520248969364, "step": 40110}, {"loss": 0.4986, "grad_norm": 1.2408208847045898, "learning_rate": 0.0002, "epoch": 6.486136933150109, "step": 40120}, {"loss": 0.4668, "grad_norm": 1.3488136529922485, "learning_rate": 0.0002, "epoch": 6.4877536173308545, "step": 40130}, {"loss": 0.4774, "grad_norm": 0.9864488244056702, "learning_rate": 0.0002, "epoch": 6.4893703015116, "step": 40140}, {"loss": 0.4651, "grad_norm": 0.9437947273254395, "learning_rate": 0.0002, "epoch": 6.490986985692345, "step": 40150}, {"loss": 0.542, "grad_norm": 1.2005455493927002, "learning_rate": 0.0002, "epoch": 6.49260366987309, "step": 40160}, {"loss": 0.4704, "grad_norm": 1.0796732902526855, "learning_rate": 0.0002, "epoch": 6.4942203540538355, "step": 40170}, {"loss": 0.498, "grad_norm": 1.1347825527191162, "learning_rate": 0.0002, "epoch": 6.495837038234581, "step": 40180}, {"loss": 0.5215, "grad_norm": 1.2311455011367798, "learning_rate": 0.0002, "epoch": 6.497453722415326, "step": 40190}, {"loss": 0.5043, "grad_norm": 1.068609356880188, "learning_rate": 0.0002, "epoch": 6.499070406596071, "step": 40200}, {"loss": 0.4868, "grad_norm": 1.196425437927246, "learning_rate": 0.0002, "epoch": 6.500687090776816, "step": 40210}, {"loss": 0.4881, "grad_norm": 1.183927297592163, "learning_rate": 0.0002, "epoch": 6.5023037749575625, "step": 40220}, {"loss": 0.4958, "grad_norm": 0.9099724292755127, "learning_rate": 0.0002, "epoch": 6.503920459138307, "step": 40230}, {"loss": 0.4816, "grad_norm": 0.9261038899421692, "learning_rate": 0.0002, "epoch": 6.505537143319053, "step": 40240}, {"loss": 0.5151, "grad_norm": 1.185491681098938, "learning_rate": 0.0002, "epoch": 6.507153827499798, "step": 40250}, {"loss": 0.4853, "grad_norm": 1.1866052150726318, "learning_rate": 0.0002, "epoch": 6.508770511680543, "step": 40260}, {"loss": 0.491, "grad_norm": 1.1600912809371948, "learning_rate": 0.0002, "epoch": 6.510387195861289, "step": 40270}, {"loss": 0.5181, "grad_norm": 0.9609426259994507, "learning_rate": 0.0002, "epoch": 6.512003880042034, "step": 40280}, {"loss": 0.4794, "grad_norm": 1.078864336013794, "learning_rate": 0.0002, "epoch": 6.513620564222779, "step": 40290}, {"loss": 0.46, "grad_norm": 1.042761206626892, "learning_rate": 0.0002, "epoch": 6.515237248403524, "step": 40300}, {"loss": 0.5341, "grad_norm": 0.9742481112480164, "learning_rate": 0.0002, "epoch": 6.51685393258427, "step": 40310}, {"loss": 0.5234, "grad_norm": 1.2544835805892944, "learning_rate": 0.0002, "epoch": 6.518470616765015, "step": 40320}, {"loss": 0.4815, "grad_norm": 1.3019760847091675, "learning_rate": 0.0002, "epoch": 6.52008730094576, "step": 40330}, {"loss": 0.5039, "grad_norm": 1.3196964263916016, "learning_rate": 0.0002, "epoch": 6.521703985126505, "step": 40340}, {"loss": 0.4979, "grad_norm": 1.2795668840408325, "learning_rate": 0.0002, "epoch": 6.5233206693072505, "step": 40350}, {"loss": 0.5075, "grad_norm": 1.1618940830230713, "learning_rate": 0.0002, "epoch": 6.524937353487996, "step": 40360}, {"loss": 0.5081, "grad_norm": 1.330543041229248, "learning_rate": 0.0002, "epoch": 6.526554037668742, "step": 40370}, {"loss": 0.5055, "grad_norm": 1.1946901082992554, "learning_rate": 0.0002, "epoch": 6.528170721849486, "step": 40380}, {"loss": 0.4518, "grad_norm": 1.1708201169967651, "learning_rate": 0.0002, "epoch": 6.529787406030232, "step": 40390}, {"loss": 0.4556, "grad_norm": 0.894036591053009, "learning_rate": 0.0002, "epoch": 6.531404090210978, "step": 40400}, {"loss": 0.4919, "grad_norm": 1.1199041604995728, "learning_rate": 0.0002, "epoch": 6.533020774391723, "step": 40410}, {"loss": 0.471, "grad_norm": 1.180317759513855, "learning_rate": 0.0002, "epoch": 6.534637458572468, "step": 40420}, {"loss": 0.4914, "grad_norm": 1.37367582321167, "learning_rate": 0.0002, "epoch": 6.536254142753213, "step": 40430}, {"loss": 0.4561, "grad_norm": 1.134791612625122, "learning_rate": 0.0002, "epoch": 6.5378708269339585, "step": 40440}, {"loss": 0.5337, "grad_norm": 1.1160204410552979, "learning_rate": 0.0002, "epoch": 6.539487511114704, "step": 40450}, {"loss": 0.5299, "grad_norm": 1.268347978591919, "learning_rate": 0.0002, "epoch": 6.541104195295449, "step": 40460}, {"loss": 0.5167, "grad_norm": 1.1424330472946167, "learning_rate": 0.0002, "epoch": 6.542720879476194, "step": 40470}, {"loss": 0.5114, "grad_norm": 1.3098465204238892, "learning_rate": 0.0002, "epoch": 6.544337563656939, "step": 40480}, {"loss": 0.4865, "grad_norm": 1.3439544439315796, "learning_rate": 0.0002, "epoch": 6.545954247837685, "step": 40490}, {"loss": 0.5183, "grad_norm": 1.2708452939987183, "learning_rate": 0.0002, "epoch": 6.54757093201843, "step": 40500}, {"loss": 0.5099, "grad_norm": 1.483680248260498, "learning_rate": 0.0002, "epoch": 6.549187616199175, "step": 40510}, {"loss": 0.4811, "grad_norm": 1.1697806119918823, "learning_rate": 0.0002, "epoch": 6.550804300379921, "step": 40520}, {"loss": 0.4814, "grad_norm": 1.1665642261505127, "learning_rate": 0.0002, "epoch": 6.5524209845606665, "step": 40530}, {"loss": 0.4985, "grad_norm": 1.1243325471878052, "learning_rate": 0.0002, "epoch": 6.554037668741412, "step": 40540}, {"loss": 0.4936, "grad_norm": 1.0277988910675049, "learning_rate": 0.0002, "epoch": 6.555654352922157, "step": 40550}, {"loss": 0.487, "grad_norm": 1.1466810703277588, "learning_rate": 0.0002, "epoch": 6.557271037102902, "step": 40560}, {"loss": 0.4851, "grad_norm": 1.1415363550186157, "learning_rate": 0.0002, "epoch": 6.558887721283647, "step": 40570}, {"loss": 0.4631, "grad_norm": 1.1923491954803467, "learning_rate": 0.0002, "epoch": 6.560504405464393, "step": 40580}, {"loss": 0.5071, "grad_norm": 0.9264549612998962, "learning_rate": 0.0002, "epoch": 6.562121089645138, "step": 40590}, {"loss": 0.466, "grad_norm": 0.8810341954231262, "learning_rate": 0.0002, "epoch": 6.563737773825883, "step": 40600}, {"loss": 0.5085, "grad_norm": 2.3296701908111572, "learning_rate": 0.0002, "epoch": 6.565354458006628, "step": 40610}, {"loss": 0.5196, "grad_norm": 1.0865163803100586, "learning_rate": 0.0002, "epoch": 6.5669711421873735, "step": 40620}, {"loss": 0.5132, "grad_norm": 0.9844607710838318, "learning_rate": 0.0002, "epoch": 6.568587826368119, "step": 40630}, {"loss": 0.5437, "grad_norm": 1.1686855554580688, "learning_rate": 0.0002, "epoch": 6.570204510548864, "step": 40640}, {"loss": 0.5293, "grad_norm": 1.016829252243042, "learning_rate": 0.0002, "epoch": 6.571821194729609, "step": 40650}, {"loss": 0.5243, "grad_norm": 1.2789337635040283, "learning_rate": 0.0002, "epoch": 6.5734378789103545, "step": 40660}, {"loss": 0.4867, "grad_norm": 1.0819072723388672, "learning_rate": 0.0002, "epoch": 6.575054563091101, "step": 40670}, {"loss": 0.5024, "grad_norm": 1.1478345394134521, "learning_rate": 0.0002, "epoch": 6.576671247271846, "step": 40680}, {"loss": 0.5282, "grad_norm": 0.7972208857536316, "learning_rate": 0.0002, "epoch": 6.578287931452591, "step": 40690}, {"loss": 0.4877, "grad_norm": 1.1481789350509644, "learning_rate": 0.0002, "epoch": 6.579904615633336, "step": 40700}, {"loss": 0.5143, "grad_norm": 1.0921871662139893, "learning_rate": 0.0002, "epoch": 6.5815212998140815, "step": 40710}, {"loss": 0.5441, "grad_norm": 1.0230315923690796, "learning_rate": 0.0002, "epoch": 6.583137983994827, "step": 40720}, {"loss": 0.4734, "grad_norm": 1.151049017906189, "learning_rate": 0.0002, "epoch": 6.584754668175572, "step": 40730}, {"loss": 0.4782, "grad_norm": 1.4016883373260498, "learning_rate": 0.0002, "epoch": 6.586371352356317, "step": 40740}, {"loss": 0.5195, "grad_norm": 1.2211825847625732, "learning_rate": 0.0002, "epoch": 6.587988036537062, "step": 40750}, {"loss": 0.4815, "grad_norm": 1.2803404331207275, "learning_rate": 0.0002, "epoch": 6.589604720717808, "step": 40760}, {"loss": 0.5329, "grad_norm": 1.1119942665100098, "learning_rate": 0.0002, "epoch": 6.591221404898553, "step": 40770}, {"loss": 0.5135, "grad_norm": 1.464650273323059, "learning_rate": 0.0002, "epoch": 6.592838089079298, "step": 40780}, {"loss": 0.5181, "grad_norm": 1.1751397848129272, "learning_rate": 0.0002, "epoch": 6.594454773260043, "step": 40790}, {"loss": 0.4772, "grad_norm": 1.0866316556930542, "learning_rate": 0.0002, "epoch": 6.596071457440789, "step": 40800}, {"loss": 0.5132, "grad_norm": 1.1733694076538086, "learning_rate": 0.0002, "epoch": 6.597688141621534, "step": 40810}, {"loss": 0.5138, "grad_norm": 1.184708833694458, "learning_rate": 0.0002, "epoch": 6.59930482580228, "step": 40820}, {"loss": 0.4885, "grad_norm": 1.406081199645996, "learning_rate": 0.0002, "epoch": 6.600921509983025, "step": 40830}, {"loss": 0.499, "grad_norm": 0.9658212661743164, "learning_rate": 0.0002, "epoch": 6.60253819416377, "step": 40840}, {"loss": 0.5113, "grad_norm": 1.1457678079605103, "learning_rate": 0.0002, "epoch": 6.604154878344516, "step": 40850}, {"loss": 0.4916, "grad_norm": 1.0487784147262573, "learning_rate": 0.0002, "epoch": 6.605771562525261, "step": 40860}, {"loss": 0.4682, "grad_norm": 0.9357177019119263, "learning_rate": 0.0002, "epoch": 6.607388246706006, "step": 40870}, {"loss": 0.4751, "grad_norm": 1.1479727029800415, "learning_rate": 0.0002, "epoch": 6.609004930886751, "step": 40880}, {"loss": 0.5493, "grad_norm": 1.3729329109191895, "learning_rate": 0.0002, "epoch": 6.610621615067497, "step": 40890}, {"loss": 0.4886, "grad_norm": 1.0085599422454834, "learning_rate": 0.0002, "epoch": 6.612238299248242, "step": 40900}, {"loss": 0.516, "grad_norm": 1.2750911712646484, "learning_rate": 0.0002, "epoch": 6.613854983428987, "step": 40910}, {"loss": 0.5342, "grad_norm": 1.1929547786712646, "learning_rate": 0.0002, "epoch": 6.615471667609732, "step": 40920}, {"loss": 0.4919, "grad_norm": 1.0821375846862793, "learning_rate": 0.0002, "epoch": 6.6170883517904775, "step": 40930}, {"loss": 0.5057, "grad_norm": 1.197347640991211, "learning_rate": 0.0002, "epoch": 6.618705035971223, "step": 40940}, {"loss": 0.492, "grad_norm": 1.2074699401855469, "learning_rate": 0.0002, "epoch": 6.620321720151968, "step": 40950}, {"loss": 0.5089, "grad_norm": 1.312009572982788, "learning_rate": 0.0002, "epoch": 6.621938404332713, "step": 40960}, {"loss": 0.5476, "grad_norm": 1.4381471872329712, "learning_rate": 0.0002, "epoch": 6.623555088513459, "step": 40970}, {"loss": 0.4904, "grad_norm": 1.1574671268463135, "learning_rate": 0.0002, "epoch": 6.6251717726942045, "step": 40980}, {"loss": 0.531, "grad_norm": 0.885661780834198, "learning_rate": 0.0002, "epoch": 6.62678845687495, "step": 40990}, {"loss": 0.5145, "grad_norm": 1.024571180343628, "learning_rate": 0.0002, "epoch": 6.628405141055695, "step": 41000}, {"loss": 0.4791, "grad_norm": 1.103437900543213, "learning_rate": 0.0002, "epoch": 6.63002182523644, "step": 41010}, {"loss": 0.4671, "grad_norm": 1.122450828552246, "learning_rate": 0.0002, "epoch": 6.6316385094171855, "step": 41020}, {"loss": 0.5134, "grad_norm": 1.2256295680999756, "learning_rate": 0.0002, "epoch": 6.633255193597931, "step": 41030}, {"loss": 0.4908, "grad_norm": 1.364594578742981, "learning_rate": 0.0002, "epoch": 6.634871877778676, "step": 41040}, {"loss": 0.4964, "grad_norm": 0.9550056457519531, "learning_rate": 0.0002, "epoch": 6.636488561959421, "step": 41050}, {"loss": 0.5028, "grad_norm": 1.3174707889556885, "learning_rate": 0.0002, "epoch": 6.638105246140166, "step": 41060}, {"loss": 0.4717, "grad_norm": 1.0835540294647217, "learning_rate": 0.0002, "epoch": 6.639721930320912, "step": 41070}, {"loss": 0.497, "grad_norm": 1.1432770490646362, "learning_rate": 0.0002, "epoch": 6.641338614501657, "step": 41080}, {"loss": 0.4903, "grad_norm": 1.2398556470870972, "learning_rate": 0.0002, "epoch": 6.642955298682402, "step": 41090}, {"loss": 0.4991, "grad_norm": 1.1147747039794922, "learning_rate": 0.0002, "epoch": 6.644571982863147, "step": 41100}, {"loss": 0.505, "grad_norm": 1.0730493068695068, "learning_rate": 0.0002, "epoch": 6.6461886670438926, "step": 41110}, {"loss": 0.486, "grad_norm": 1.3218451738357544, "learning_rate": 0.0002, "epoch": 6.647805351224639, "step": 41120}, {"loss": 0.5276, "grad_norm": 1.3027331829071045, "learning_rate": 0.0002, "epoch": 6.649422035405384, "step": 41130}, {"loss": 0.5263, "grad_norm": 1.0280735492706299, "learning_rate": 0.0002, "epoch": 6.651038719586129, "step": 41140}, {"loss": 0.4952, "grad_norm": 1.109916090965271, "learning_rate": 0.0002, "epoch": 6.652655403766874, "step": 41150}, {"loss": 0.5001, "grad_norm": 1.078734040260315, "learning_rate": 0.0002, "epoch": 6.65427208794762, "step": 41160}, {"loss": 0.484, "grad_norm": 1.1595654487609863, "learning_rate": 0.0002, "epoch": 6.655888772128365, "step": 41170}, {"loss": 0.5101, "grad_norm": 1.1701031923294067, "learning_rate": 0.0002, "epoch": 6.65750545630911, "step": 41180}, {"loss": 0.5341, "grad_norm": 1.0424643754959106, "learning_rate": 0.0002, "epoch": 6.659122140489855, "step": 41190}, {"loss": 0.4863, "grad_norm": 1.22880220413208, "learning_rate": 0.0002, "epoch": 6.6607388246706005, "step": 41200}, {"loss": 0.4987, "grad_norm": 1.1907655000686646, "learning_rate": 0.0002, "epoch": 6.662355508851346, "step": 41210}, {"loss": 0.5343, "grad_norm": 1.0765007734298706, "learning_rate": 0.0002, "epoch": 6.663972193032091, "step": 41220}, {"loss": 0.5039, "grad_norm": 0.9994917511940002, "learning_rate": 0.0002, "epoch": 6.665588877212836, "step": 41230}, {"loss": 0.507, "grad_norm": 0.968578040599823, "learning_rate": 0.0002, "epoch": 6.6672055613935814, "step": 41240}, {"loss": 0.5068, "grad_norm": 1.0576032400131226, "learning_rate": 0.0002, "epoch": 6.668822245574327, "step": 41250}, {"loss": 0.486, "grad_norm": 1.2183765172958374, "learning_rate": 0.0002, "epoch": 6.670438929755072, "step": 41260}, {"loss": 0.4764, "grad_norm": 1.2548623085021973, "learning_rate": 0.0002, "epoch": 6.672055613935818, "step": 41270}, {"loss": 0.5014, "grad_norm": 1.0848388671875, "learning_rate": 0.0002, "epoch": 6.673672298116563, "step": 41280}, {"loss": 0.5404, "grad_norm": 1.21421217918396, "learning_rate": 0.0002, "epoch": 6.6752889822973085, "step": 41290}, {"loss": 0.4911, "grad_norm": 1.1453598737716675, "learning_rate": 0.0002, "epoch": 6.676905666478054, "step": 41300}, {"loss": 0.5033, "grad_norm": 1.2682722806930542, "learning_rate": 0.0002, "epoch": 6.678522350658799, "step": 41310}, {"loss": 0.5313, "grad_norm": 1.1659725904464722, "learning_rate": 0.0002, "epoch": 6.680139034839544, "step": 41320}, {"loss": 0.5505, "grad_norm": 1.36194908618927, "learning_rate": 0.0002, "epoch": 6.681755719020289, "step": 41330}, {"loss": 0.5127, "grad_norm": 1.1712592840194702, "learning_rate": 0.0002, "epoch": 6.683372403201035, "step": 41340}, {"loss": 0.5082, "grad_norm": 1.4168336391448975, "learning_rate": 0.0002, "epoch": 6.68498908738178, "step": 41350}, {"loss": 0.5124, "grad_norm": 1.0395328998565674, "learning_rate": 0.0002, "epoch": 6.686605771562525, "step": 41360}, {"loss": 0.5404, "grad_norm": 1.2511054277420044, "learning_rate": 0.0002, "epoch": 6.68822245574327, "step": 41370}, {"loss": 0.5027, "grad_norm": 1.0438542366027832, "learning_rate": 0.0002, "epoch": 6.689839139924016, "step": 41380}, {"loss": 0.5069, "grad_norm": 1.08684241771698, "learning_rate": 0.0002, "epoch": 6.691455824104761, "step": 41390}, {"loss": 0.5224, "grad_norm": 1.250788927078247, "learning_rate": 0.0002, "epoch": 6.693072508285506, "step": 41400}, {"loss": 0.4921, "grad_norm": 1.313890814781189, "learning_rate": 0.0002, "epoch": 6.694689192466251, "step": 41410}, {"loss": 0.5028, "grad_norm": 1.3218982219696045, "learning_rate": 0.0002, "epoch": 6.696305876646997, "step": 41420}, {"loss": 0.4851, "grad_norm": 1.0366582870483398, "learning_rate": 0.0002, "epoch": 6.697922560827743, "step": 41430}, {"loss": 0.5103, "grad_norm": 1.066121220588684, "learning_rate": 0.0002, "epoch": 6.699539245008488, "step": 41440}, {"loss": 0.4966, "grad_norm": 1.0239925384521484, "learning_rate": 0.0002, "epoch": 6.701155929189233, "step": 41450}, {"loss": 0.4767, "grad_norm": 0.9402176141738892, "learning_rate": 0.0002, "epoch": 6.702772613369978, "step": 41460}, {"loss": 0.5381, "grad_norm": 1.391718864440918, "learning_rate": 0.0002, "epoch": 6.7043892975507235, "step": 41470}, {"loss": 0.512, "grad_norm": 1.215600609779358, "learning_rate": 0.0002, "epoch": 6.706005981731469, "step": 41480}, {"loss": 0.5219, "grad_norm": 1.063722848892212, "learning_rate": 0.0002, "epoch": 6.707622665912214, "step": 41490}, {"loss": 0.492, "grad_norm": 1.132149577140808, "learning_rate": 0.0002, "epoch": 6.709239350092959, "step": 41500}, {"loss": 0.4812, "grad_norm": 1.0302950143814087, "learning_rate": 0.0002, "epoch": 6.7108560342737045, "step": 41510}, {"loss": 0.5141, "grad_norm": 1.5342752933502197, "learning_rate": 0.0002, "epoch": 6.71247271845445, "step": 41520}, {"loss": 0.5123, "grad_norm": 1.177137017250061, "learning_rate": 0.0002, "epoch": 6.714089402635195, "step": 41530}, {"loss": 0.5082, "grad_norm": 1.2335538864135742, "learning_rate": 0.0002, "epoch": 6.71570608681594, "step": 41540}, {"loss": 0.4864, "grad_norm": 1.140604853630066, "learning_rate": 0.0002, "epoch": 6.717322770996686, "step": 41550}, {"loss": 0.4888, "grad_norm": 1.3567465543746948, "learning_rate": 0.0002, "epoch": 6.718939455177431, "step": 41560}, {"loss": 0.5183, "grad_norm": 1.0693929195404053, "learning_rate": 0.0002, "epoch": 6.720556139358177, "step": 41570}, {"loss": 0.5131, "grad_norm": 1.1592605113983154, "learning_rate": 0.0002, "epoch": 6.722172823538922, "step": 41580}, {"loss": 0.5476, "grad_norm": 0.989006519317627, "learning_rate": 0.0002, "epoch": 6.723789507719667, "step": 41590}, {"loss": 0.4952, "grad_norm": 1.04103422164917, "learning_rate": 0.0002, "epoch": 6.7254061919004124, "step": 41600}, {"loss": 0.4823, "grad_norm": 1.1129004955291748, "learning_rate": 0.0002, "epoch": 6.727022876081158, "step": 41610}, {"loss": 0.5032, "grad_norm": 1.1473113298416138, "learning_rate": 0.0002, "epoch": 6.728639560261903, "step": 41620}, {"loss": 0.5253, "grad_norm": 1.348036527633667, "learning_rate": 0.0002, "epoch": 6.730256244442648, "step": 41630}, {"loss": 0.4983, "grad_norm": 1.259942650794983, "learning_rate": 0.0002, "epoch": 6.731872928623393, "step": 41640}, {"loss": 0.5182, "grad_norm": 1.0591514110565186, "learning_rate": 0.0002, "epoch": 6.733489612804139, "step": 41650}, {"loss": 0.4886, "grad_norm": 0.9737129211425781, "learning_rate": 0.0002, "epoch": 6.735106296984884, "step": 41660}, {"loss": 0.5051, "grad_norm": 1.2520451545715332, "learning_rate": 0.0002, "epoch": 6.736722981165629, "step": 41670}, {"loss": 0.5364, "grad_norm": 1.0555530786514282, "learning_rate": 0.0002, "epoch": 6.738339665346374, "step": 41680}, {"loss": 0.4954, "grad_norm": 1.0025697946548462, "learning_rate": 0.0002, "epoch": 6.7399563495271195, "step": 41690}, {"loss": 0.5485, "grad_norm": 1.1114100217819214, "learning_rate": 0.0002, "epoch": 6.741573033707866, "step": 41700}, {"loss": 0.4986, "grad_norm": 1.1537504196166992, "learning_rate": 0.0002, "epoch": 6.74318971788861, "step": 41710}, {"loss": 0.5025, "grad_norm": 1.037880539894104, "learning_rate": 0.0002, "epoch": 6.744806402069356, "step": 41720}, {"loss": 0.482, "grad_norm": 1.0691965818405151, "learning_rate": 0.0002, "epoch": 6.746423086250101, "step": 41730}, {"loss": 0.5272, "grad_norm": 1.376325011253357, "learning_rate": 0.0002, "epoch": 6.748039770430847, "step": 41740}, {"loss": 0.5484, "grad_norm": 1.4667129516601562, "learning_rate": 0.0002, "epoch": 6.749656454611592, "step": 41750}, {"loss": 0.5139, "grad_norm": 1.1517162322998047, "learning_rate": 0.0002, "epoch": 6.751273138792337, "step": 41760}, {"loss": 0.5523, "grad_norm": 1.1454511880874634, "learning_rate": 0.0002, "epoch": 6.752889822973082, "step": 41770}, {"loss": 0.4664, "grad_norm": 1.6323128938674927, "learning_rate": 0.0002, "epoch": 6.7545065071538275, "step": 41780}, {"loss": 0.5153, "grad_norm": 1.0951642990112305, "learning_rate": 0.0002, "epoch": 6.756123191334573, "step": 41790}, {"loss": 0.4998, "grad_norm": 1.0766983032226562, "learning_rate": 0.0002, "epoch": 6.757739875515318, "step": 41800}, {"loss": 0.548, "grad_norm": 1.3472381830215454, "learning_rate": 0.0002, "epoch": 6.759356559696063, "step": 41810}, {"loss": 0.5172, "grad_norm": 1.0248444080352783, "learning_rate": 0.0002, "epoch": 6.760973243876808, "step": 41820}, {"loss": 0.5236, "grad_norm": 1.1276055574417114, "learning_rate": 0.0002, "epoch": 6.762589928057554, "step": 41830}, {"loss": 0.5044, "grad_norm": 1.5398495197296143, "learning_rate": 0.0002, "epoch": 6.764206612238299, "step": 41840}, {"loss": 0.5097, "grad_norm": 1.1886497735977173, "learning_rate": 0.0002, "epoch": 6.765823296419045, "step": 41850}, {"loss": 0.499, "grad_norm": 1.027198076248169, "learning_rate": 0.0002, "epoch": 6.767439980599789, "step": 41860}, {"loss": 0.5444, "grad_norm": 1.4644980430603027, "learning_rate": 0.0002, "epoch": 6.7690566647805355, "step": 41870}, {"loss": 0.5009, "grad_norm": 0.9633586406707764, "learning_rate": 0.0002, "epoch": 6.770673348961281, "step": 41880}, {"loss": 0.484, "grad_norm": 1.0895354747772217, "learning_rate": 0.0002, "epoch": 6.772290033142026, "step": 41890}, {"loss": 0.5172, "grad_norm": 1.1887167692184448, "learning_rate": 0.0002, "epoch": 6.773906717322771, "step": 41900}, {"loss": 0.5399, "grad_norm": 1.3699820041656494, "learning_rate": 0.0002, "epoch": 6.775523401503516, "step": 41910}, {"loss": 0.5504, "grad_norm": 1.0266352891921997, "learning_rate": 0.0002, "epoch": 6.777140085684262, "step": 41920}, {"loss": 0.5105, "grad_norm": 1.0919075012207031, "learning_rate": 0.0002, "epoch": 6.778756769865007, "step": 41930}, {"loss": 0.4842, "grad_norm": 0.9839563369750977, "learning_rate": 0.0002, "epoch": 6.780373454045752, "step": 41940}, {"loss": 0.5081, "grad_norm": 1.2605451345443726, "learning_rate": 0.0002, "epoch": 6.781990138226497, "step": 41950}, {"loss": 0.5391, "grad_norm": 0.9268672466278076, "learning_rate": 0.0002, "epoch": 6.7836068224072426, "step": 41960}, {"loss": 0.4916, "grad_norm": 1.2002313137054443, "learning_rate": 0.0002, "epoch": 6.785223506587988, "step": 41970}, {"loss": 0.5467, "grad_norm": 1.2018438577651978, "learning_rate": 0.0002, "epoch": 6.786840190768733, "step": 41980}, {"loss": 0.5491, "grad_norm": 1.17646062374115, "learning_rate": 0.0002, "epoch": 6.788456874949478, "step": 41990}, {"loss": 0.5354, "grad_norm": 1.1080009937286377, "learning_rate": 0.0002, "epoch": 6.790073559130224, "step": 42000}, {"loss": 0.5384, "grad_norm": 1.1606498956680298, "learning_rate": 0.0002, "epoch": 6.791690243310969, "step": 42010}, {"loss": 0.4931, "grad_norm": 1.2484819889068604, "learning_rate": 0.0002, "epoch": 6.793306927491715, "step": 42020}, {"loss": 0.498, "grad_norm": 1.1363215446472168, "learning_rate": 0.0002, "epoch": 6.79492361167246, "step": 42030}, {"loss": 0.5343, "grad_norm": 1.4469727277755737, "learning_rate": 0.0002, "epoch": 6.796540295853205, "step": 42040}, {"loss": 0.5146, "grad_norm": 1.0617138147354126, "learning_rate": 0.0002, "epoch": 6.7981569800339505, "step": 42050}, {"loss": 0.5188, "grad_norm": 1.1459330320358276, "learning_rate": 0.0002, "epoch": 6.799773664214696, "step": 42060}, {"loss": 0.5116, "grad_norm": 1.2095019817352295, "learning_rate": 0.0002, "epoch": 6.801390348395441, "step": 42070}, {"loss": 0.545, "grad_norm": 1.3200831413269043, "learning_rate": 0.0002, "epoch": 6.803007032576186, "step": 42080}, {"loss": 0.5406, "grad_norm": 1.1633318662643433, "learning_rate": 0.0002, "epoch": 6.8046237167569315, "step": 42090}, {"loss": 0.4938, "grad_norm": 0.8986614942550659, "learning_rate": 0.0002, "epoch": 6.806240400937677, "step": 42100}, {"loss": 0.559, "grad_norm": 1.3705275058746338, "learning_rate": 0.0002, "epoch": 6.807857085118422, "step": 42110}, {"loss": 0.5022, "grad_norm": 1.2418090105056763, "learning_rate": 0.0002, "epoch": 6.809473769299167, "step": 42120}, {"loss": 0.5014, "grad_norm": 1.0818954706192017, "learning_rate": 0.0002, "epoch": 6.811090453479912, "step": 42130}, {"loss": 0.4791, "grad_norm": 0.9293872117996216, "learning_rate": 0.0002, "epoch": 6.812707137660658, "step": 42140}, {"loss": 0.5009, "grad_norm": 0.9791894555091858, "learning_rate": 0.0002, "epoch": 6.814323821841404, "step": 42150}, {"loss": 0.5142, "grad_norm": 1.1956568956375122, "learning_rate": 0.0002, "epoch": 6.815940506022149, "step": 42160}, {"loss": 0.5126, "grad_norm": 0.9643568992614746, "learning_rate": 0.0002, "epoch": 6.817557190202894, "step": 42170}, {"loss": 0.5121, "grad_norm": 1.2499792575836182, "learning_rate": 0.0002, "epoch": 6.819173874383639, "step": 42180}, {"loss": 0.4942, "grad_norm": 1.1779413223266602, "learning_rate": 0.0002, "epoch": 6.820790558564385, "step": 42190}, {"loss": 0.498, "grad_norm": 1.0570595264434814, "learning_rate": 0.0002, "epoch": 6.82240724274513, "step": 42200}, {"loss": 0.4997, "grad_norm": 1.1393938064575195, "learning_rate": 0.0002, "epoch": 6.824023926925875, "step": 42210}, {"loss": 0.4842, "grad_norm": 1.152463436126709, "learning_rate": 0.0002, "epoch": 6.82564061110662, "step": 42220}, {"loss": 0.5234, "grad_norm": 1.3353025913238525, "learning_rate": 0.0002, "epoch": 6.827257295287366, "step": 42230}, {"loss": 0.539, "grad_norm": 1.1719051599502563, "learning_rate": 0.0002, "epoch": 6.828873979468111, "step": 42240}, {"loss": 0.5139, "grad_norm": 1.262141227722168, "learning_rate": 0.0002, "epoch": 6.830490663648856, "step": 42250}, {"loss": 0.5021, "grad_norm": 1.240899920463562, "learning_rate": 0.0002, "epoch": 6.832107347829601, "step": 42260}, {"loss": 0.4961, "grad_norm": 1.0505269765853882, "learning_rate": 0.0002, "epoch": 6.8337240320103465, "step": 42270}, {"loss": 0.4932, "grad_norm": 1.1556071043014526, "learning_rate": 0.0002, "epoch": 6.835340716191092, "step": 42280}, {"loss": 0.5461, "grad_norm": 1.1427719593048096, "learning_rate": 0.0002, "epoch": 6.836957400371837, "step": 42290}, {"loss": 0.5199, "grad_norm": 1.1540080308914185, "learning_rate": 0.0002, "epoch": 6.838574084552583, "step": 42300}, {"loss": 0.5269, "grad_norm": 1.0521200895309448, "learning_rate": 0.0002, "epoch": 6.840190768733328, "step": 42310}, {"loss": 0.541, "grad_norm": 1.0205531120300293, "learning_rate": 0.0002, "epoch": 6.8418074529140736, "step": 42320}, {"loss": 0.5225, "grad_norm": 1.0010193586349487, "learning_rate": 0.0002, "epoch": 6.843424137094819, "step": 42330}, {"loss": 0.5101, "grad_norm": 1.2138770818710327, "learning_rate": 0.0002, "epoch": 6.845040821275564, "step": 42340}, {"loss": 0.5452, "grad_norm": 1.3028651475906372, "learning_rate": 0.0002, "epoch": 6.846657505456309, "step": 42350}, {"loss": 0.4894, "grad_norm": 1.0326353311538696, "learning_rate": 0.0002, "epoch": 6.8482741896370545, "step": 42360}, {"loss": 0.5285, "grad_norm": 1.036085605621338, "learning_rate": 0.0002, "epoch": 6.8498908738178, "step": 42370}, {"loss": 0.505, "grad_norm": 1.0575472116470337, "learning_rate": 0.0002, "epoch": 6.851507557998545, "step": 42380}, {"loss": 0.4997, "grad_norm": 1.1749629974365234, "learning_rate": 0.0002, "epoch": 6.85312424217929, "step": 42390}, {"loss": 0.4961, "grad_norm": 1.1747760772705078, "learning_rate": 0.0002, "epoch": 6.854740926360035, "step": 42400}, {"loss": 0.5138, "grad_norm": 1.1877071857452393, "learning_rate": 0.0002, "epoch": 6.856357610540781, "step": 42410}, {"loss": 0.4972, "grad_norm": 1.1209983825683594, "learning_rate": 0.0002, "epoch": 6.857974294721526, "step": 42420}, {"loss": 0.4939, "grad_norm": 1.2918205261230469, "learning_rate": 0.0002, "epoch": 6.859590978902271, "step": 42430}, {"loss": 0.5012, "grad_norm": 1.2443464994430542, "learning_rate": 0.0002, "epoch": 6.861207663083016, "step": 42440}, {"loss": 0.5226, "grad_norm": 0.9336795210838318, "learning_rate": 0.0002, "epoch": 6.8628243472637624, "step": 42450}, {"loss": 0.5108, "grad_norm": 1.2183542251586914, "learning_rate": 0.0002, "epoch": 6.864441031444508, "step": 42460}, {"loss": 0.5245, "grad_norm": 1.0071234703063965, "learning_rate": 0.0002, "epoch": 6.866057715625253, "step": 42470}, {"loss": 0.4753, "grad_norm": 1.2914012670516968, "learning_rate": 0.0002, "epoch": 6.867674399805998, "step": 42480}, {"loss": 0.4865, "grad_norm": 1.1050426959991455, "learning_rate": 0.0002, "epoch": 6.869291083986743, "step": 42490}, {"loss": 0.5243, "grad_norm": 1.1163811683654785, "learning_rate": 0.0002, "epoch": 6.870907768167489, "step": 42500}, {"loss": 0.5065, "grad_norm": 1.1575818061828613, "learning_rate": 0.0002, "epoch": 6.872524452348234, "step": 42510}, {"loss": 0.5353, "grad_norm": 1.11167311668396, "learning_rate": 0.0002, "epoch": 6.874141136528979, "step": 42520}, {"loss": 0.5141, "grad_norm": 1.0379102230072021, "learning_rate": 0.0002, "epoch": 6.875757820709724, "step": 42530}, {"loss": 0.5355, "grad_norm": 1.2617160081863403, "learning_rate": 0.0002, "epoch": 6.8773745048904695, "step": 42540}, {"loss": 0.4785, "grad_norm": 1.1749719381332397, "learning_rate": 0.0002, "epoch": 6.878991189071215, "step": 42550}, {"loss": 0.5503, "grad_norm": 1.2284821271896362, "learning_rate": 0.0002, "epoch": 6.88060787325196, "step": 42560}, {"loss": 0.5065, "grad_norm": 1.1917030811309814, "learning_rate": 0.0002, "epoch": 6.882224557432705, "step": 42570}, {"loss": 0.5176, "grad_norm": 1.1943914890289307, "learning_rate": 0.0002, "epoch": 6.8838412416134505, "step": 42580}, {"loss": 0.5072, "grad_norm": 1.2641394138336182, "learning_rate": 0.0002, "epoch": 6.885457925794196, "step": 42590}, {"loss": 0.5004, "grad_norm": 1.1280436515808105, "learning_rate": 0.0002, "epoch": 6.887074609974942, "step": 42600}, {"loss": 0.5328, "grad_norm": 0.9865449070930481, "learning_rate": 0.0002, "epoch": 6.888691294155687, "step": 42610}, {"loss": 0.4953, "grad_norm": 0.994987428188324, "learning_rate": 0.0002, "epoch": 6.890307978336432, "step": 42620}, {"loss": 0.4805, "grad_norm": 0.9900388717651367, "learning_rate": 0.0002, "epoch": 6.8919246625171775, "step": 42630}, {"loss": 0.5467, "grad_norm": 1.2992421388626099, "learning_rate": 0.0002, "epoch": 6.893541346697923, "step": 42640}, {"loss": 0.5017, "grad_norm": 1.0152487754821777, "learning_rate": 0.0002, "epoch": 6.895158030878668, "step": 42650}, {"loss": 0.5043, "grad_norm": 1.199453353881836, "learning_rate": 0.0002, "epoch": 6.896774715059413, "step": 42660}, {"loss": 0.5106, "grad_norm": 1.100630521774292, "learning_rate": 0.0002, "epoch": 6.898391399240158, "step": 42670}, {"loss": 0.503, "grad_norm": 1.0489764213562012, "learning_rate": 0.0002, "epoch": 6.900008083420904, "step": 42680}, {"loss": 0.4634, "grad_norm": 1.101407527923584, "learning_rate": 0.0002, "epoch": 6.901624767601649, "step": 42690}, {"loss": 0.5361, "grad_norm": 1.3130593299865723, "learning_rate": 0.0002, "epoch": 6.903241451782394, "step": 42700}, {"loss": 0.5119, "grad_norm": 0.9906072616577148, "learning_rate": 0.0002, "epoch": 6.904858135963139, "step": 42710}, {"loss": 0.5146, "grad_norm": 1.094502329826355, "learning_rate": 0.0002, "epoch": 6.906474820143885, "step": 42720}, {"loss": 0.5165, "grad_norm": 1.1025426387786865, "learning_rate": 0.0002, "epoch": 6.90809150432463, "step": 42730}, {"loss": 0.5463, "grad_norm": 1.0644042491912842, "learning_rate": 0.0002, "epoch": 6.909708188505375, "step": 42740}, {"loss": 0.5024, "grad_norm": 1.0709129571914673, "learning_rate": 0.0002, "epoch": 6.911324872686121, "step": 42750}, {"loss": 0.5093, "grad_norm": 1.2445871829986572, "learning_rate": 0.0002, "epoch": 6.912941556866866, "step": 42760}, {"loss": 0.5305, "grad_norm": 1.020058035850525, "learning_rate": 0.0002, "epoch": 6.914558241047612, "step": 42770}, {"loss": 0.5382, "grad_norm": 0.9795091152191162, "learning_rate": 0.0002, "epoch": 6.916174925228357, "step": 42780}, {"loss": 0.5429, "grad_norm": 0.9369977116584778, "learning_rate": 0.0002, "epoch": 6.917791609409102, "step": 42790}, {"loss": 0.5444, "grad_norm": 1.0741904973983765, "learning_rate": 0.0002, "epoch": 6.919408293589847, "step": 42800}, {"loss": 0.5402, "grad_norm": 1.0702799558639526, "learning_rate": 0.0002, "epoch": 6.921024977770593, "step": 42810}, {"loss": 0.5291, "grad_norm": 1.0383983850479126, "learning_rate": 0.0002, "epoch": 6.922641661951338, "step": 42820}, {"loss": 0.5106, "grad_norm": 1.0761083364486694, "learning_rate": 0.0002, "epoch": 6.924258346132083, "step": 42830}, {"loss": 0.5726, "grad_norm": 1.2332350015640259, "learning_rate": 0.0002, "epoch": 6.925875030312828, "step": 42840}, {"loss": 0.4996, "grad_norm": 1.3184348344802856, "learning_rate": 0.0002, "epoch": 6.9274917144935735, "step": 42850}, {"loss": 0.5503, "grad_norm": 1.0586378574371338, "learning_rate": 0.0002, "epoch": 6.929108398674319, "step": 42860}, {"loss": 0.511, "grad_norm": 1.2294201850891113, "learning_rate": 0.0002, "epoch": 6.930725082855064, "step": 42870}, {"loss": 0.54, "grad_norm": 1.3097991943359375, "learning_rate": 0.0002, "epoch": 6.932341767035809, "step": 42880}, {"loss": 0.5228, "grad_norm": 0.9006873965263367, "learning_rate": 0.0002, "epoch": 6.933958451216554, "step": 42890}, {"loss": 0.4617, "grad_norm": 1.265931248664856, "learning_rate": 0.0002, "epoch": 6.9355751353973005, "step": 42900}, {"loss": 0.5029, "grad_norm": 1.1013522148132324, "learning_rate": 0.0002, "epoch": 6.937191819578046, "step": 42910}, {"loss": 0.5334, "grad_norm": 0.9910131692886353, "learning_rate": 0.0002, "epoch": 6.938808503758791, "step": 42920}, {"loss": 0.5211, "grad_norm": 1.102683424949646, "learning_rate": 0.0002, "epoch": 6.940425187939536, "step": 42930}, {"loss": 0.5588, "grad_norm": 1.232961893081665, "learning_rate": 0.0002, "epoch": 6.9420418721202815, "step": 42940}, {"loss": 0.5357, "grad_norm": 1.1714650392532349, "learning_rate": 0.0002, "epoch": 6.943658556301027, "step": 42950}, {"loss": 0.5232, "grad_norm": 1.1684318780899048, "learning_rate": 0.0002, "epoch": 6.945275240481772, "step": 42960}, {"loss": 0.5035, "grad_norm": 1.2074716091156006, "learning_rate": 0.0002, "epoch": 6.946891924662517, "step": 42970}, {"loss": 0.5111, "grad_norm": 1.2061275243759155, "learning_rate": 0.0002, "epoch": 6.948508608843262, "step": 42980}, {"loss": 0.5066, "grad_norm": 1.1216989755630493, "learning_rate": 0.0002, "epoch": 6.950125293024008, "step": 42990}, {"loss": 0.4948, "grad_norm": 1.304117202758789, "learning_rate": 0.0002, "epoch": 6.951741977204753, "step": 43000}, {"loss": 0.5684, "grad_norm": 1.2377972602844238, "learning_rate": 0.0002, "epoch": 6.953358661385498, "step": 43010}, {"loss": 0.4792, "grad_norm": 1.2332178354263306, "learning_rate": 0.0002, "epoch": 6.954975345566243, "step": 43020}, {"loss": 0.5181, "grad_norm": 1.1919599771499634, "learning_rate": 0.0002, "epoch": 6.956592029746989, "step": 43030}, {"loss": 0.5352, "grad_norm": 1.272700548171997, "learning_rate": 0.0002, "epoch": 6.958208713927734, "step": 43040}, {"loss": 0.5328, "grad_norm": 1.4377546310424805, "learning_rate": 0.0002, "epoch": 6.95982539810848, "step": 43050}, {"loss": 0.4894, "grad_norm": 1.2070353031158447, "learning_rate": 0.0002, "epoch": 6.961442082289225, "step": 43060}, {"loss": 0.525, "grad_norm": 1.090205430984497, "learning_rate": 0.0002, "epoch": 6.96305876646997, "step": 43070}, {"loss": 0.5255, "grad_norm": 1.1832911968231201, "learning_rate": 0.0002, "epoch": 6.964675450650716, "step": 43080}, {"loss": 0.5497, "grad_norm": 1.2921082973480225, "learning_rate": 0.0002, "epoch": 6.966292134831461, "step": 43090}, {"loss": 0.5527, "grad_norm": 1.4303096532821655, "learning_rate": 0.0002, "epoch": 6.967908819012206, "step": 43100}, {"loss": 0.4807, "grad_norm": 1.0788004398345947, "learning_rate": 0.0002, "epoch": 6.969525503192951, "step": 43110}, {"loss": 0.5006, "grad_norm": 1.2192047834396362, "learning_rate": 0.0002, "epoch": 6.9711421873736965, "step": 43120}, {"loss": 0.4714, "grad_norm": 1.0735143423080444, "learning_rate": 0.0002, "epoch": 6.972758871554442, "step": 43130}, {"loss": 0.5307, "grad_norm": 1.0317153930664062, "learning_rate": 0.0002, "epoch": 6.974375555735187, "step": 43140}, {"loss": 0.5154, "grad_norm": 1.0926798582077026, "learning_rate": 0.0002, "epoch": 6.975992239915932, "step": 43150}, {"loss": 0.4976, "grad_norm": 1.1660500764846802, "learning_rate": 0.0002, "epoch": 6.977608924096677, "step": 43160}, {"loss": 0.5456, "grad_norm": 1.3945232629776, "learning_rate": 0.0002, "epoch": 6.979225608277423, "step": 43170}, {"loss": 0.4979, "grad_norm": 1.2684587240219116, "learning_rate": 0.0002, "epoch": 6.980842292458169, "step": 43180}, {"loss": 0.5406, "grad_norm": 1.1574004888534546, "learning_rate": 0.0002, "epoch": 6.982458976638913, "step": 43190}, {"loss": 0.5629, "grad_norm": 1.2534198760986328, "learning_rate": 0.0002, "epoch": 6.984075660819659, "step": 43200}, {"loss": 0.5191, "grad_norm": 1.135245442390442, "learning_rate": 0.0002, "epoch": 6.9856923450004045, "step": 43210}, {"loss": 0.548, "grad_norm": 1.3824104070663452, "learning_rate": 0.0002, "epoch": 6.98730902918115, "step": 43220}, {"loss": 0.5294, "grad_norm": 1.2128452062606812, "learning_rate": 0.0002, "epoch": 6.988925713361895, "step": 43230}, {"loss": 0.505, "grad_norm": 1.0795245170593262, "learning_rate": 0.0002, "epoch": 6.99054239754264, "step": 43240}, {"loss": 0.4889, "grad_norm": 1.337353229522705, "learning_rate": 0.0002, "epoch": 6.992159081723385, "step": 43250}, {"loss": 0.4749, "grad_norm": 1.1731765270233154, "learning_rate": 0.0002, "epoch": 6.993775765904131, "step": 43260}, {"loss": 0.4897, "grad_norm": 1.0203192234039307, "learning_rate": 0.0002, "epoch": 6.995392450084876, "step": 43270}, {"loss": 0.5324, "grad_norm": 0.9261201620101929, "learning_rate": 0.0002, "epoch": 6.997009134265621, "step": 43280}, {"loss": 0.5227, "grad_norm": 1.107865810394287, "learning_rate": 0.0002, "epoch": 6.998625818446366, "step": 43290}, {"eval_loss": 1.2679380178451538, "eval_runtime": 122.202, "eval_samples_per_second": 5.998, "eval_steps_per_second": 0.753, "epoch": 6.9999191657909625, "step": 43298}, {"loss": 0.4651, "grad_norm": 0.9555306434631348, "learning_rate": 0.0002, "epoch": 7.000242502627112, "step": 43300}, {"loss": 0.4301, "grad_norm": 1.3280415534973145, "learning_rate": 0.0002, "epoch": 7.001859186807857, "step": 43310}, {"loss": 0.437, "grad_norm": 1.5583289861679077, "learning_rate": 0.0002, "epoch": 7.003475870988602, "step": 43320}, {"loss": 0.4532, "grad_norm": 1.0714443922042847, "learning_rate": 0.0002, "epoch": 7.005092555169347, "step": 43330}, {"loss": 0.4048, "grad_norm": 1.048075795173645, "learning_rate": 0.0002, "epoch": 7.006709239350093, "step": 43340}, {"loss": 0.4119, "grad_norm": 1.1053836345672607, "learning_rate": 0.0002, "epoch": 7.008325923530839, "step": 43350}, {"loss": 0.4352, "grad_norm": 0.8911725282669067, "learning_rate": 0.0002, "epoch": 7.009942607711584, "step": 43360}, {"loss": 0.4236, "grad_norm": 0.9404396414756775, "learning_rate": 0.0002, "epoch": 7.011559291892329, "step": 43370}, {"loss": 0.4529, "grad_norm": 1.152365803718567, "learning_rate": 0.0002, "epoch": 7.013175976073074, "step": 43380}, {"loss": 0.4381, "grad_norm": 1.2118251323699951, "learning_rate": 0.0002, "epoch": 7.0147926602538195, "step": 43390}, {"loss": 0.4404, "grad_norm": 1.2046295404434204, "learning_rate": 0.0002, "epoch": 7.016409344434565, "step": 43400}, {"loss": 0.4177, "grad_norm": 0.929465115070343, "learning_rate": 0.0002, "epoch": 7.01802602861531, "step": 43410}, {"loss": 0.4086, "grad_norm": 1.3720149993896484, "learning_rate": 0.0002, "epoch": 7.019642712796055, "step": 43420}, {"loss": 0.4174, "grad_norm": 1.1316810846328735, "learning_rate": 0.0002, "epoch": 7.0212593969768005, "step": 43430}, {"loss": 0.451, "grad_norm": 1.0342087745666504, "learning_rate": 0.0002, "epoch": 7.022876081157546, "step": 43440}, {"loss": 0.4084, "grad_norm": 1.1455655097961426, "learning_rate": 0.0002, "epoch": 7.024492765338291, "step": 43450}, {"loss": 0.4168, "grad_norm": 1.1308859586715698, "learning_rate": 0.0002, "epoch": 7.026109449519036, "step": 43460}, {"loss": 0.4099, "grad_norm": 1.0796722173690796, "learning_rate": 0.0002, "epoch": 7.027726133699781, "step": 43470}, {"loss": 0.4047, "grad_norm": 1.0031877756118774, "learning_rate": 0.0002, "epoch": 7.029342817880527, "step": 43480}, {"loss": 0.3968, "grad_norm": 1.2391340732574463, "learning_rate": 0.0002, "epoch": 7.030959502061273, "step": 43490}, {"loss": 0.4155, "grad_norm": 1.0807358026504517, "learning_rate": 0.0002, "epoch": 7.032576186242018, "step": 43500}, {"loss": 0.4322, "grad_norm": 1.230995535850525, "learning_rate": 0.0002, "epoch": 7.034192870422763, "step": 43510}, {"loss": 0.3971, "grad_norm": 1.2200509309768677, "learning_rate": 0.0002, "epoch": 7.035809554603508, "step": 43520}, {"loss": 0.4242, "grad_norm": 0.9785236120223999, "learning_rate": 0.0002, "epoch": 7.037426238784254, "step": 43530}, {"loss": 0.4173, "grad_norm": 1.0009595155715942, "learning_rate": 0.0002, "epoch": 7.039042922964999, "step": 43540}, {"loss": 0.4175, "grad_norm": 0.9783103466033936, "learning_rate": 0.0002, "epoch": 7.040659607145744, "step": 43550}, {"loss": 0.4307, "grad_norm": 1.1303530931472778, "learning_rate": 0.0002, "epoch": 7.042276291326489, "step": 43560}, {"loss": 0.4066, "grad_norm": 1.1768499612808228, "learning_rate": 0.0002, "epoch": 7.043892975507235, "step": 43570}, {"loss": 0.4492, "grad_norm": 1.1040459871292114, "learning_rate": 0.0002, "epoch": 7.04550965968798, "step": 43580}, {"loss": 0.4314, "grad_norm": 1.0673959255218506, "learning_rate": 0.0002, "epoch": 7.047126343868725, "step": 43590}, {"loss": 0.402, "grad_norm": 1.1220765113830566, "learning_rate": 0.0002, "epoch": 7.04874302804947, "step": 43600}, {"loss": 0.4108, "grad_norm": 1.1746923923492432, "learning_rate": 0.0002, "epoch": 7.0503597122302155, "step": 43610}, {"loss": 0.4618, "grad_norm": 1.2764517068862915, "learning_rate": 0.0002, "epoch": 7.051976396410961, "step": 43620}, {"loss": 0.4243, "grad_norm": 1.1180157661437988, "learning_rate": 0.0002, "epoch": 7.053593080591706, "step": 43630}, {"loss": 0.4593, "grad_norm": 1.3558318614959717, "learning_rate": 0.0002, "epoch": 7.055209764772452, "step": 43640}, {"loss": 0.4351, "grad_norm": 0.9804982542991638, "learning_rate": 0.0002, "epoch": 7.056826448953197, "step": 43650}, {"loss": 0.4309, "grad_norm": 1.106404423713684, "learning_rate": 0.0002, "epoch": 7.058443133133943, "step": 43660}, {"loss": 0.4183, "grad_norm": 0.9469243884086609, "learning_rate": 0.0002, "epoch": 7.060059817314688, "step": 43670}, {"loss": 0.4335, "grad_norm": 1.272987723350525, "learning_rate": 0.0002, "epoch": 7.061676501495433, "step": 43680}, {"loss": 0.4017, "grad_norm": 1.0536233186721802, "learning_rate": 0.0002, "epoch": 7.063293185676178, "step": 43690}, {"loss": 0.4597, "grad_norm": 1.1730698347091675, "learning_rate": 0.0002, "epoch": 7.0649098698569235, "step": 43700}, {"loss": 0.4304, "grad_norm": 1.150707483291626, "learning_rate": 0.0002, "epoch": 7.066526554037669, "step": 43710}, {"loss": 0.4136, "grad_norm": 1.4583828449249268, "learning_rate": 0.0002, "epoch": 7.068143238218414, "step": 43720}, {"loss": 0.4385, "grad_norm": 1.569705843925476, "learning_rate": 0.0002, "epoch": 7.069759922399159, "step": 43730}, {"loss": 0.4051, "grad_norm": 1.156192660331726, "learning_rate": 0.0002, "epoch": 7.071376606579904, "step": 43740}, {"loss": 0.4375, "grad_norm": 1.25005304813385, "learning_rate": 0.0002, "epoch": 7.07299329076065, "step": 43750}, {"loss": 0.4096, "grad_norm": 1.0468846559524536, "learning_rate": 0.0002, "epoch": 7.074609974941395, "step": 43760}, {"loss": 0.4253, "grad_norm": 1.2045108079910278, "learning_rate": 0.0002, "epoch": 7.07622665912214, "step": 43770}, {"loss": 0.4248, "grad_norm": 1.1341021060943604, "learning_rate": 0.0002, "epoch": 7.077843343302886, "step": 43780}, {"loss": 0.394, "grad_norm": 1.0712201595306396, "learning_rate": 0.0002, "epoch": 7.0794600274836315, "step": 43790}, {"loss": 0.4093, "grad_norm": 1.0421321392059326, "learning_rate": 0.0002, "epoch": 7.081076711664377, "step": 43800}, {"loss": 0.4317, "grad_norm": 1.2241183519363403, "learning_rate": 0.0002, "epoch": 7.082693395845122, "step": 43810}, {"loss": 0.4064, "grad_norm": 1.0945624113082886, "learning_rate": 0.0002, "epoch": 7.084310080025867, "step": 43820}, {"loss": 0.4049, "grad_norm": 1.2772969007492065, "learning_rate": 0.0002, "epoch": 7.085926764206612, "step": 43830}, {"loss": 0.4098, "grad_norm": 1.1715508699417114, "learning_rate": 0.0002, "epoch": 7.087543448387358, "step": 43840}, {"loss": 0.4756, "grad_norm": 1.1975586414337158, "learning_rate": 0.0002, "epoch": 7.089160132568103, "step": 43850}, {"loss": 0.4272, "grad_norm": 1.1673274040222168, "learning_rate": 0.0002, "epoch": 7.090776816748848, "step": 43860}, {"loss": 0.4435, "grad_norm": 1.096590518951416, "learning_rate": 0.0002, "epoch": 7.092393500929593, "step": 43870}, {"loss": 0.4329, "grad_norm": 1.0174020528793335, "learning_rate": 0.0002, "epoch": 7.0940101851103385, "step": 43880}, {"loss": 0.4307, "grad_norm": 1.0147380828857422, "learning_rate": 0.0002, "epoch": 7.095626869291084, "step": 43890}, {"loss": 0.4115, "grad_norm": 1.0056098699569702, "learning_rate": 0.0002, "epoch": 7.097243553471829, "step": 43900}, {"loss": 0.4181, "grad_norm": 1.4678083658218384, "learning_rate": 0.0002, "epoch": 7.098860237652574, "step": 43910}, {"loss": 0.4404, "grad_norm": 1.3740565776824951, "learning_rate": 0.0002, "epoch": 7.1004769218333195, "step": 43920}, {"loss": 0.4435, "grad_norm": 1.0279403924942017, "learning_rate": 0.0002, "epoch": 7.102093606014066, "step": 43930}, {"loss": 0.4247, "grad_norm": 1.186720371246338, "learning_rate": 0.0002, "epoch": 7.103710290194811, "step": 43940}, {"loss": 0.4001, "grad_norm": 1.3767904043197632, "learning_rate": 0.0002, "epoch": 7.105326974375556, "step": 43950}, {"loss": 0.4314, "grad_norm": 1.1637471914291382, "learning_rate": 0.0002, "epoch": 7.106943658556301, "step": 43960}, {"loss": 0.3996, "grad_norm": 1.1860042810440063, "learning_rate": 0.0002, "epoch": 7.1085603427370465, "step": 43970}, {"loss": 0.4014, "grad_norm": 1.080944538116455, "learning_rate": 0.0002, "epoch": 7.110177026917792, "step": 43980}, {"loss": 0.4152, "grad_norm": 1.0119353532791138, "learning_rate": 0.0002, "epoch": 7.111793711098537, "step": 43990}, {"loss": 0.4354, "grad_norm": 1.179388403892517, "learning_rate": 0.0002, "epoch": 7.113410395279282, "step": 44000}, {"loss": 0.4494, "grad_norm": 0.9202800989151001, "learning_rate": 0.0002, "epoch": 7.115027079460027, "step": 44010}, {"loss": 0.4356, "grad_norm": 1.142206072807312, "learning_rate": 0.0002, "epoch": 7.116643763640773, "step": 44020}, {"loss": 0.4197, "grad_norm": 1.17897367477417, "learning_rate": 0.0002, "epoch": 7.118260447821518, "step": 44030}, {"loss": 0.4394, "grad_norm": 1.238087773323059, "learning_rate": 0.0002, "epoch": 7.119877132002263, "step": 44040}, {"loss": 0.4844, "grad_norm": 1.5113195180892944, "learning_rate": 0.0002, "epoch": 7.121493816183008, "step": 44050}, {"loss": 0.4526, "grad_norm": 1.1819349527359009, "learning_rate": 0.0002, "epoch": 7.123110500363754, "step": 44060}, {"loss": 0.4071, "grad_norm": 1.1062556505203247, "learning_rate": 0.0002, "epoch": 7.124727184544499, "step": 44070}, {"loss": 0.4282, "grad_norm": 0.986954927444458, "learning_rate": 0.0002, "epoch": 7.126343868725245, "step": 44080}, {"loss": 0.4497, "grad_norm": 0.9641291499137878, "learning_rate": 0.0002, "epoch": 7.12796055290599, "step": 44090}, {"loss": 0.4348, "grad_norm": 0.9519979953765869, "learning_rate": 0.0002, "epoch": 7.129577237086735, "step": 44100}, {"loss": 0.4527, "grad_norm": 1.0477287769317627, "learning_rate": 0.0002, "epoch": 7.131193921267481, "step": 44110}, {"loss": 0.4168, "grad_norm": 0.9185389280319214, "learning_rate": 0.0002, "epoch": 7.132810605448226, "step": 44120}, {"loss": 0.4255, "grad_norm": 1.0224069356918335, "learning_rate": 0.0002, "epoch": 7.134427289628971, "step": 44130}, {"loss": 0.4598, "grad_norm": 1.0762630701065063, "learning_rate": 0.0002, "epoch": 7.136043973809716, "step": 44140}, {"loss": 0.4308, "grad_norm": 1.330917477607727, "learning_rate": 0.0002, "epoch": 7.137660657990462, "step": 44150}, {"loss": 0.4548, "grad_norm": 1.220115303993225, "learning_rate": 0.0002, "epoch": 7.139277342171207, "step": 44160}, {"loss": 0.4089, "grad_norm": 0.9959004521369934, "learning_rate": 0.0002, "epoch": 7.140894026351952, "step": 44170}, {"loss": 0.4475, "grad_norm": 1.272449016571045, "learning_rate": 0.0002, "epoch": 7.142510710532697, "step": 44180}, {"loss": 0.4268, "grad_norm": 1.0696483850479126, "learning_rate": 0.0002, "epoch": 7.1441273947134425, "step": 44190}, {"loss": 0.4218, "grad_norm": 1.347206711769104, "learning_rate": 0.0002, "epoch": 7.145744078894188, "step": 44200}, {"loss": 0.4652, "grad_norm": 1.1455401182174683, "learning_rate": 0.0002, "epoch": 7.147360763074933, "step": 44210}, {"loss": 0.4186, "grad_norm": 1.1443370580673218, "learning_rate": 0.0002, "epoch": 7.148977447255678, "step": 44220}, {"loss": 0.4669, "grad_norm": 1.0239921808242798, "learning_rate": 0.0002, "epoch": 7.150594131436424, "step": 44230}, {"loss": 0.4601, "grad_norm": 1.1596333980560303, "learning_rate": 0.0002, "epoch": 7.1522108156171695, "step": 44240}, {"loss": 0.44, "grad_norm": 1.2471510171890259, "learning_rate": 0.0002, "epoch": 7.153827499797915, "step": 44250}, {"loss": 0.426, "grad_norm": 1.0713822841644287, "learning_rate": 0.0002, "epoch": 7.15544418397866, "step": 44260}, {"loss": 0.4381, "grad_norm": 1.3523266315460205, "learning_rate": 0.0002, "epoch": 7.157060868159405, "step": 44270}, {"loss": 0.4101, "grad_norm": 1.1620066165924072, "learning_rate": 0.0002, "epoch": 7.1586775523401505, "step": 44280}, {"loss": 0.4195, "grad_norm": 1.2988619804382324, "learning_rate": 0.0002, "epoch": 7.160294236520896, "step": 44290}, {"loss": 0.4405, "grad_norm": 1.2527822256088257, "learning_rate": 0.0002, "epoch": 7.161910920701641, "step": 44300}, {"loss": 0.4813, "grad_norm": 1.2322553396224976, "learning_rate": 0.0002, "epoch": 7.163527604882386, "step": 44310}, {"loss": 0.4274, "grad_norm": 1.0497055053710938, "learning_rate": 0.0002, "epoch": 7.165144289063131, "step": 44320}, {"loss": 0.4236, "grad_norm": 1.1928341388702393, "learning_rate": 0.0002, "epoch": 7.166760973243877, "step": 44330}, {"loss": 0.4511, "grad_norm": 1.0016584396362305, "learning_rate": 0.0002, "epoch": 7.168377657424622, "step": 44340}, {"loss": 0.4566, "grad_norm": 1.0385509729385376, "learning_rate": 0.0002, "epoch": 7.169994341605367, "step": 44350}, {"loss": 0.4178, "grad_norm": 1.3217328786849976, "learning_rate": 0.0002, "epoch": 7.171611025786112, "step": 44360}, {"loss": 0.425, "grad_norm": 1.240696668624878, "learning_rate": 0.0002, "epoch": 7.1732277099668575, "step": 44370}, {"loss": 0.4572, "grad_norm": 1.1037760972976685, "learning_rate": 0.0002, "epoch": 7.174844394147604, "step": 44380}, {"loss": 0.4525, "grad_norm": 1.062762975692749, "learning_rate": 0.0002, "epoch": 7.176461078328349, "step": 44390}, {"loss": 0.4766, "grad_norm": 1.2859047651290894, "learning_rate": 0.0002, "epoch": 7.178077762509094, "step": 44400}, {"loss": 0.4511, "grad_norm": 1.1852408647537231, "learning_rate": 0.0002, "epoch": 7.179694446689839, "step": 44410}, {"loss": 0.4386, "grad_norm": 1.315587043762207, "learning_rate": 0.0002, "epoch": 7.181311130870585, "step": 44420}, {"loss": 0.4491, "grad_norm": 0.889542281627655, "learning_rate": 0.0002, "epoch": 7.18292781505133, "step": 44430}, {"loss": 0.4328, "grad_norm": 1.0123721361160278, "learning_rate": 0.0002, "epoch": 7.184544499232075, "step": 44440}, {"loss": 0.4096, "grad_norm": 1.0503462553024292, "learning_rate": 0.0002, "epoch": 7.18616118341282, "step": 44450}, {"loss": 0.44, "grad_norm": 1.338188886642456, "learning_rate": 0.0002, "epoch": 7.1877778675935655, "step": 44460}, {"loss": 0.4451, "grad_norm": 1.206543207168579, "learning_rate": 0.0002, "epoch": 7.189394551774311, "step": 44470}, {"loss": 0.4415, "grad_norm": 1.2013356685638428, "learning_rate": 0.0002, "epoch": 7.191011235955056, "step": 44480}, {"loss": 0.4291, "grad_norm": 1.1124168634414673, "learning_rate": 0.0002, "epoch": 7.192627920135801, "step": 44490}, {"loss": 0.4182, "grad_norm": 1.199379324913025, "learning_rate": 0.0002, "epoch": 7.194244604316546, "step": 44500}, {"loss": 0.4525, "grad_norm": 1.196746587753296, "learning_rate": 0.0002, "epoch": 7.195861288497292, "step": 44510}, {"loss": 0.4876, "grad_norm": 0.9684673547744751, "learning_rate": 0.0002, "epoch": 7.197477972678037, "step": 44520}, {"loss": 0.4403, "grad_norm": 1.5727651119232178, "learning_rate": 0.0002, "epoch": 7.199094656858783, "step": 44530}, {"loss": 0.4424, "grad_norm": 0.8371674418449402, "learning_rate": 0.0002, "epoch": 7.200711341039528, "step": 44540}, {"loss": 0.4366, "grad_norm": 1.0343716144561768, "learning_rate": 0.0002, "epoch": 7.2023280252202735, "step": 44550}, {"loss": 0.4557, "grad_norm": 1.1839478015899658, "learning_rate": 0.0002, "epoch": 7.203944709401019, "step": 44560}, {"loss": 0.4293, "grad_norm": 0.9466627836227417, "learning_rate": 0.0002, "epoch": 7.205561393581764, "step": 44570}, {"loss": 0.4651, "grad_norm": 1.1452360153198242, "learning_rate": 0.0002, "epoch": 7.207178077762509, "step": 44580}, {"loss": 0.5037, "grad_norm": 1.63698410987854, "learning_rate": 0.0002, "epoch": 7.208794761943254, "step": 44590}, {"loss": 0.4212, "grad_norm": 1.1124789714813232, "learning_rate": 0.0002, "epoch": 7.210411446124, "step": 44600}, {"loss": 0.4323, "grad_norm": 1.4233685731887817, "learning_rate": 0.0002, "epoch": 7.212028130304745, "step": 44610}, {"loss": 0.4176, "grad_norm": 1.302145004272461, "learning_rate": 0.0002, "epoch": 7.21364481448549, "step": 44620}, {"loss": 0.457, "grad_norm": 1.2115466594696045, "learning_rate": 0.0002, "epoch": 7.215261498666235, "step": 44630}, {"loss": 0.4419, "grad_norm": 1.0771325826644897, "learning_rate": 0.0002, "epoch": 7.216878182846981, "step": 44640}, {"loss": 0.4183, "grad_norm": 1.1603602170944214, "learning_rate": 0.0002, "epoch": 7.218494867027726, "step": 44650}, {"loss": 0.468, "grad_norm": 1.4013969898223877, "learning_rate": 0.0002, "epoch": 7.220111551208471, "step": 44660}, {"loss": 0.4867, "grad_norm": 1.2145777940750122, "learning_rate": 0.0002, "epoch": 7.221728235389216, "step": 44670}, {"loss": 0.4436, "grad_norm": 1.2084238529205322, "learning_rate": 0.0002, "epoch": 7.223344919569962, "step": 44680}, {"loss": 0.4423, "grad_norm": 1.1801965236663818, "learning_rate": 0.0002, "epoch": 7.224961603750708, "step": 44690}, {"loss": 0.426, "grad_norm": 0.9561195969581604, "learning_rate": 0.0002, "epoch": 7.226578287931453, "step": 44700}, {"loss": 0.4895, "grad_norm": 1.1857006549835205, "learning_rate": 0.0002, "epoch": 7.228194972112198, "step": 44710}, {"loss": 0.4382, "grad_norm": 1.1576673984527588, "learning_rate": 0.0002, "epoch": 7.229811656292943, "step": 44720}, {"loss": 0.4717, "grad_norm": 1.3517892360687256, "learning_rate": 0.0002, "epoch": 7.2314283404736885, "step": 44730}, {"loss": 0.4495, "grad_norm": 1.1489306688308716, "learning_rate": 0.0002, "epoch": 7.233045024654434, "step": 44740}, {"loss": 0.438, "grad_norm": 1.0758644342422485, "learning_rate": 0.0002, "epoch": 7.234661708835179, "step": 44750}, {"loss": 0.4431, "grad_norm": 1.1679041385650635, "learning_rate": 0.0002, "epoch": 7.236278393015924, "step": 44760}, {"loss": 0.504, "grad_norm": 1.1404961347579956, "learning_rate": 0.0002, "epoch": 7.2378950771966695, "step": 44770}, {"loss": 0.4499, "grad_norm": 1.2602572441101074, "learning_rate": 0.0002, "epoch": 7.239511761377415, "step": 44780}, {"loss": 0.4669, "grad_norm": 1.2912664413452148, "learning_rate": 0.0002, "epoch": 7.24112844555816, "step": 44790}, {"loss": 0.4336, "grad_norm": 1.340198278427124, "learning_rate": 0.0002, "epoch": 7.242745129738905, "step": 44800}, {"loss": 0.4336, "grad_norm": 1.0613332986831665, "learning_rate": 0.0002, "epoch": 7.24436181391965, "step": 44810}, {"loss": 0.4433, "grad_norm": 1.1658564805984497, "learning_rate": 0.0002, "epoch": 7.2459784981003965, "step": 44820}, {"loss": 0.4532, "grad_norm": 1.046440839767456, "learning_rate": 0.0002, "epoch": 7.247595182281142, "step": 44830}, {"loss": 0.4332, "grad_norm": 1.2335407733917236, "learning_rate": 0.0002, "epoch": 7.249211866461887, "step": 44840}, {"loss": 0.455, "grad_norm": 1.3742769956588745, "learning_rate": 0.0002, "epoch": 7.250828550642632, "step": 44850}, {"loss": 0.4297, "grad_norm": 1.1744071245193481, "learning_rate": 0.0002, "epoch": 7.252445234823377, "step": 44860}, {"loss": 0.4348, "grad_norm": 1.4268226623535156, "learning_rate": 0.0002, "epoch": 7.254061919004123, "step": 44870}, {"loss": 0.4485, "grad_norm": 1.1255686283111572, "learning_rate": 0.0002, "epoch": 7.255678603184868, "step": 44880}, {"loss": 0.4264, "grad_norm": 1.255053162574768, "learning_rate": 0.0002, "epoch": 7.257295287365613, "step": 44890}, {"loss": 0.455, "grad_norm": 1.4957616329193115, "learning_rate": 0.0002, "epoch": 7.258911971546358, "step": 44900}, {"loss": 0.4465, "grad_norm": 1.0546756982803345, "learning_rate": 0.0002, "epoch": 7.260528655727104, "step": 44910}, {"loss": 0.4802, "grad_norm": 1.4683036804199219, "learning_rate": 0.0002, "epoch": 7.262145339907849, "step": 44920}, {"loss": 0.4175, "grad_norm": 1.2027722597122192, "learning_rate": 0.0002, "epoch": 7.263762024088594, "step": 44930}, {"loss": 0.4316, "grad_norm": 1.277767539024353, "learning_rate": 0.0002, "epoch": 7.265378708269339, "step": 44940}, {"loss": 0.4749, "grad_norm": 1.4894379377365112, "learning_rate": 0.0002, "epoch": 7.2669953924500845, "step": 44950}, {"loss": 0.46, "grad_norm": 1.0998231172561646, "learning_rate": 0.0002, "epoch": 7.26861207663083, "step": 44960}, {"loss": 0.4414, "grad_norm": 1.3713536262512207, "learning_rate": 0.0002, "epoch": 7.270228760811576, "step": 44970}, {"loss": 0.4431, "grad_norm": 1.473396897315979, "learning_rate": 0.0002, "epoch": 7.271845444992321, "step": 44980}, {"loss": 0.4582, "grad_norm": 1.0893826484680176, "learning_rate": 0.0002, "epoch": 7.273462129173066, "step": 44990}, {"loss": 0.4297, "grad_norm": 1.4798463582992554, "learning_rate": 0.0002, "epoch": 7.275078813353812, "step": 45000}, {"loss": 0.4604, "grad_norm": 1.0536930561065674, "learning_rate": 0.0002, "epoch": 7.276695497534557, "step": 45010}, {"loss": 0.4396, "grad_norm": 1.064450740814209, "learning_rate": 0.0002, "epoch": 7.278312181715302, "step": 45020}, {"loss": 0.4445, "grad_norm": 1.3605865240097046, "learning_rate": 0.0002, "epoch": 7.279928865896047, "step": 45030}, {"loss": 0.4123, "grad_norm": 1.1779286861419678, "learning_rate": 0.0002, "epoch": 7.2815455500767925, "step": 45040}, {"loss": 0.4588, "grad_norm": 1.1568892002105713, "learning_rate": 0.0002, "epoch": 7.283162234257538, "step": 45050}, {"loss": 0.4557, "grad_norm": 1.0677175521850586, "learning_rate": 0.0002, "epoch": 7.284778918438283, "step": 45060}, {"loss": 0.4765, "grad_norm": 1.1939430236816406, "learning_rate": 0.0002, "epoch": 7.286395602619028, "step": 45070}, {"loss": 0.443, "grad_norm": 1.0273144245147705, "learning_rate": 0.0002, "epoch": 7.288012286799773, "step": 45080}, {"loss": 0.4472, "grad_norm": 1.358487844467163, "learning_rate": 0.0002, "epoch": 7.289628970980519, "step": 45090}, {"loss": 0.4207, "grad_norm": 1.2139160633087158, "learning_rate": 0.0002, "epoch": 7.291245655161264, "step": 45100}, {"loss": 0.4221, "grad_norm": 1.2484227418899536, "learning_rate": 0.0002, "epoch": 7.29286233934201, "step": 45110}, {"loss": 0.4351, "grad_norm": 1.2373738288879395, "learning_rate": 0.0002, "epoch": 7.294479023522755, "step": 45120}, {"loss": 0.472, "grad_norm": 1.3877158164978027, "learning_rate": 0.0002, "epoch": 7.2960957077035005, "step": 45130}, {"loss": 0.4741, "grad_norm": 1.1372028589248657, "learning_rate": 0.0002, "epoch": 7.297712391884246, "step": 45140}, {"loss": 0.4465, "grad_norm": 1.259987711906433, "learning_rate": 0.0002, "epoch": 7.299329076064991, "step": 45150}, {"loss": 0.4795, "grad_norm": 1.6501492261886597, "learning_rate": 0.0002, "epoch": 7.300945760245736, "step": 45160}, {"loss": 0.4441, "grad_norm": 1.5927983522415161, "learning_rate": 0.0002, "epoch": 7.302562444426481, "step": 45170}, {"loss": 0.4513, "grad_norm": 0.957084596157074, "learning_rate": 0.0002, "epoch": 7.304179128607227, "step": 45180}, {"loss": 0.4367, "grad_norm": 1.7777647972106934, "learning_rate": 0.0002, "epoch": 7.305795812787972, "step": 45190}, {"loss": 0.4365, "grad_norm": 1.1905052661895752, "learning_rate": 0.0002, "epoch": 7.307412496968717, "step": 45200}, {"loss": 0.4354, "grad_norm": 1.0944236516952515, "learning_rate": 0.0002, "epoch": 7.309029181149462, "step": 45210}, {"loss": 0.4558, "grad_norm": 1.171034336090088, "learning_rate": 0.0002, "epoch": 7.3106458653302075, "step": 45220}, {"loss": 0.4518, "grad_norm": 1.421743392944336, "learning_rate": 0.0002, "epoch": 7.312262549510953, "step": 45230}, {"loss": 0.4713, "grad_norm": 1.1282994747161865, "learning_rate": 0.0002, "epoch": 7.313879233691698, "step": 45240}, {"loss": 0.471, "grad_norm": 1.0742822885513306, "learning_rate": 0.0002, "epoch": 7.315495917872443, "step": 45250}, {"loss": 0.4887, "grad_norm": 1.2697997093200684, "learning_rate": 0.0002, "epoch": 7.317112602053189, "step": 45260}, {"loss": 0.461, "grad_norm": 1.2066359519958496, "learning_rate": 0.0002, "epoch": 7.318729286233935, "step": 45270}, {"loss": 0.463, "grad_norm": 1.0044163465499878, "learning_rate": 0.0002, "epoch": 7.32034597041468, "step": 45280}, {"loss": 0.4394, "grad_norm": 1.2365968227386475, "learning_rate": 0.0002, "epoch": 7.321962654595425, "step": 45290}, {"loss": 0.4305, "grad_norm": 1.0731542110443115, "learning_rate": 0.0002, "epoch": 7.32357933877617, "step": 45300}, {"loss": 0.4744, "grad_norm": 1.1595830917358398, "learning_rate": 0.0002, "epoch": 7.3251960229569155, "step": 45310}, {"loss": 0.4393, "grad_norm": 1.3445849418640137, "learning_rate": 0.0002, "epoch": 7.326812707137661, "step": 45320}, {"loss": 0.4288, "grad_norm": 1.3067926168441772, "learning_rate": 0.0002, "epoch": 7.328429391318406, "step": 45330}, {"loss": 0.4569, "grad_norm": 1.200667381286621, "learning_rate": 0.0002, "epoch": 7.330046075499151, "step": 45340}, {"loss": 0.4449, "grad_norm": 0.9936319589614868, "learning_rate": 0.0002, "epoch": 7.3316627596798964, "step": 45350}, {"loss": 0.4481, "grad_norm": 1.1291998624801636, "learning_rate": 0.0002, "epoch": 7.333279443860642, "step": 45360}, {"loss": 0.4643, "grad_norm": 1.3663034439086914, "learning_rate": 0.0002, "epoch": 7.334896128041387, "step": 45370}, {"loss": 0.4548, "grad_norm": 1.0762227773666382, "learning_rate": 0.0002, "epoch": 7.336512812222132, "step": 45380}, {"loss": 0.4495, "grad_norm": 0.9525768160820007, "learning_rate": 0.0002, "epoch": 7.338129496402877, "step": 45390}, {"loss": 0.472, "grad_norm": 1.1143709421157837, "learning_rate": 0.0002, "epoch": 7.339746180583623, "step": 45400}, {"loss": 0.4432, "grad_norm": 1.0711175203323364, "learning_rate": 0.0002, "epoch": 7.341362864764369, "step": 45410}, {"loss": 0.4603, "grad_norm": 1.2650856971740723, "learning_rate": 0.0002, "epoch": 7.342979548945114, "step": 45420}, {"loss": 0.5021, "grad_norm": 1.194861888885498, "learning_rate": 0.0002, "epoch": 7.344596233125859, "step": 45430}, {"loss": 0.467, "grad_norm": 1.4936751127243042, "learning_rate": 0.0002, "epoch": 7.346212917306604, "step": 45440}, {"loss": 0.4798, "grad_norm": 1.2938975095748901, "learning_rate": 0.0002, "epoch": 7.34782960148735, "step": 45450}, {"loss": 0.4589, "grad_norm": 1.2841941118240356, "learning_rate": 0.0002, "epoch": 7.349446285668095, "step": 45460}, {"loss": 0.4398, "grad_norm": 1.5376560688018799, "learning_rate": 0.0002, "epoch": 7.35106296984884, "step": 45470}, {"loss": 0.4583, "grad_norm": 1.1307156085968018, "learning_rate": 0.0002, "epoch": 7.352679654029585, "step": 45480}, {"loss": 0.4678, "grad_norm": 1.4883167743682861, "learning_rate": 0.0002, "epoch": 7.354296338210331, "step": 45490}, {"loss": 0.4966, "grad_norm": 1.0547393560409546, "learning_rate": 0.0002, "epoch": 7.355913022391076, "step": 45500}, {"loss": 0.4601, "grad_norm": 1.5476845502853394, "learning_rate": 0.0002, "epoch": 7.357529706571821, "step": 45510}, {"loss": 0.4466, "grad_norm": 1.1916698217391968, "learning_rate": 0.0002, "epoch": 7.359146390752566, "step": 45520}, {"loss": 0.4791, "grad_norm": 1.238319754600525, "learning_rate": 0.0002, "epoch": 7.3607630749333115, "step": 45530}, {"loss": 0.4818, "grad_norm": 1.4216728210449219, "learning_rate": 0.0002, "epoch": 7.362379759114057, "step": 45540}, {"loss": 0.4828, "grad_norm": 1.303995132446289, "learning_rate": 0.0002, "epoch": 7.363996443294802, "step": 45550}, {"loss": 0.464, "grad_norm": 1.2453089952468872, "learning_rate": 0.0002, "epoch": 7.365613127475548, "step": 45560}, {"loss": 0.4735, "grad_norm": 1.1971137523651123, "learning_rate": 0.0002, "epoch": 7.367229811656293, "step": 45570}, {"loss": 0.4415, "grad_norm": 1.0801963806152344, "learning_rate": 0.0002, "epoch": 7.3688464958370385, "step": 45580}, {"loss": 0.4946, "grad_norm": 1.1602367162704468, "learning_rate": 0.0002, "epoch": 7.370463180017784, "step": 45590}, {"loss": 0.45, "grad_norm": 1.1623423099517822, "learning_rate": 0.0002, "epoch": 7.372079864198529, "step": 45600}, {"loss": 0.4648, "grad_norm": 1.108467936515808, "learning_rate": 0.0002, "epoch": 7.373696548379274, "step": 45610}, {"loss": 0.4566, "grad_norm": 1.087322473526001, "learning_rate": 0.0002, "epoch": 7.3753132325600195, "step": 45620}, {"loss": 0.4505, "grad_norm": 1.0945587158203125, "learning_rate": 0.0002, "epoch": 7.376929916740765, "step": 45630}, {"loss": 0.4864, "grad_norm": 1.6565983295440674, "learning_rate": 0.0002, "epoch": 7.37854660092151, "step": 45640}, {"loss": 0.4491, "grad_norm": 1.1279444694519043, "learning_rate": 0.0002, "epoch": 7.380163285102255, "step": 45650}, {"loss": 0.4606, "grad_norm": 1.0888527631759644, "learning_rate": 0.0002, "epoch": 7.381779969283, "step": 45660}, {"loss": 0.429, "grad_norm": 1.1114956140518188, "learning_rate": 0.0002, "epoch": 7.383396653463746, "step": 45670}, {"loss": 0.4726, "grad_norm": 1.195497751235962, "learning_rate": 0.0002, "epoch": 7.385013337644491, "step": 45680}, {"loss": 0.4643, "grad_norm": 1.3111436367034912, "learning_rate": 0.0002, "epoch": 7.386630021825236, "step": 45690}, {"loss": 0.4777, "grad_norm": 1.22647225856781, "learning_rate": 0.0002, "epoch": 7.388246706005981, "step": 45700}, {"loss": 0.4877, "grad_norm": 0.9309225678443909, "learning_rate": 0.0002, "epoch": 7.389863390186727, "step": 45710}, {"loss": 0.4789, "grad_norm": 1.198773741722107, "learning_rate": 0.0002, "epoch": 7.391480074367473, "step": 45720}, {"loss": 0.4496, "grad_norm": 1.2208130359649658, "learning_rate": 0.0002, "epoch": 7.393096758548218, "step": 45730}, {"loss": 0.4614, "grad_norm": 1.0756449699401855, "learning_rate": 0.0002, "epoch": 7.394713442728963, "step": 45740}, {"loss": 0.4469, "grad_norm": 1.0117692947387695, "learning_rate": 0.0002, "epoch": 7.396330126909708, "step": 45750}, {"loss": 0.4217, "grad_norm": 1.1144468784332275, "learning_rate": 0.0002, "epoch": 7.397946811090454, "step": 45760}, {"loss": 0.4737, "grad_norm": 1.140549898147583, "learning_rate": 0.0002, "epoch": 7.399563495271199, "step": 45770}, {"loss": 0.453, "grad_norm": 1.2335172891616821, "learning_rate": 0.0002, "epoch": 7.401180179451944, "step": 45780}, {"loss": 0.4501, "grad_norm": 1.296393632888794, "learning_rate": 0.0002, "epoch": 7.402796863632689, "step": 45790}, {"loss": 0.4716, "grad_norm": 1.2551302909851074, "learning_rate": 0.0002, "epoch": 7.4044135478134345, "step": 45800}, {"loss": 0.451, "grad_norm": 1.1909204721450806, "learning_rate": 0.0002, "epoch": 7.40603023199418, "step": 45810}, {"loss": 0.4296, "grad_norm": 1.17038893699646, "learning_rate": 0.0002, "epoch": 7.407646916174925, "step": 45820}, {"loss": 0.4574, "grad_norm": 1.0033377408981323, "learning_rate": 0.0002, "epoch": 7.40926360035567, "step": 45830}, {"loss": 0.4413, "grad_norm": 1.2957805395126343, "learning_rate": 0.0002, "epoch": 7.4108802845364155, "step": 45840}, {"loss": 0.4597, "grad_norm": 1.347462773323059, "learning_rate": 0.0002, "epoch": 7.412496968717161, "step": 45850}, {"loss": 0.4499, "grad_norm": 1.3187026977539062, "learning_rate": 0.0002, "epoch": 7.414113652897907, "step": 45860}, {"loss": 0.4624, "grad_norm": 1.092236876487732, "learning_rate": 0.0002, "epoch": 7.415730337078652, "step": 45870}, {"loss": 0.4636, "grad_norm": 1.075634241104126, "learning_rate": 0.0002, "epoch": 7.417347021259397, "step": 45880}, {"loss": 0.4465, "grad_norm": 1.0200046300888062, "learning_rate": 0.0002, "epoch": 7.4189637054401425, "step": 45890}, {"loss": 0.47, "grad_norm": 1.1419479846954346, "learning_rate": 0.0002, "epoch": 7.420580389620888, "step": 45900}, {"loss": 0.4409, "grad_norm": 1.0798102617263794, "learning_rate": 0.0002, "epoch": 7.422197073801633, "step": 45910}, {"loss": 0.5173, "grad_norm": 0.9999571442604065, "learning_rate": 0.0002, "epoch": 7.423813757982378, "step": 45920}, {"loss": 0.4714, "grad_norm": 1.2220723628997803, "learning_rate": 0.0002, "epoch": 7.425430442163123, "step": 45930}, {"loss": 0.4844, "grad_norm": 1.1209388971328735, "learning_rate": 0.0002, "epoch": 7.427047126343869, "step": 45940}, {"loss": 0.4534, "grad_norm": 1.1198307275772095, "learning_rate": 0.0002, "epoch": 7.428663810524614, "step": 45950}, {"loss": 0.4486, "grad_norm": 1.0170516967773438, "learning_rate": 0.0002, "epoch": 7.430280494705359, "step": 45960}, {"loss": 0.488, "grad_norm": 1.2963446378707886, "learning_rate": 0.0002, "epoch": 7.431897178886104, "step": 45970}, {"loss": 0.4346, "grad_norm": 1.4202494621276855, "learning_rate": 0.0002, "epoch": 7.43351386306685, "step": 45980}, {"loss": 0.4917, "grad_norm": 1.066774845123291, "learning_rate": 0.0002, "epoch": 7.435130547247595, "step": 45990}, {"loss": 0.4897, "grad_norm": 1.2760428190231323, "learning_rate": 0.0002, "epoch": 7.43674723142834, "step": 46000}, {"loss": 0.4562, "grad_norm": 1.530720829963684, "learning_rate": 0.0002, "epoch": 7.438363915609086, "step": 46010}, {"loss": 0.4691, "grad_norm": 1.1914178133010864, "learning_rate": 0.0002, "epoch": 7.439980599789831, "step": 46020}, {"loss": 0.5038, "grad_norm": 1.466650128364563, "learning_rate": 0.0002, "epoch": 7.441597283970577, "step": 46030}, {"loss": 0.4673, "grad_norm": 1.1567928791046143, "learning_rate": 0.0002, "epoch": 7.443213968151322, "step": 46040}, {"loss": 0.4778, "grad_norm": 1.252336025238037, "learning_rate": 0.0002, "epoch": 7.444830652332067, "step": 46050}, {"loss": 0.4493, "grad_norm": 1.2095589637756348, "learning_rate": 0.0002, "epoch": 7.446447336512812, "step": 46060}, {"loss": 0.4407, "grad_norm": 1.4075263738632202, "learning_rate": 0.0002, "epoch": 7.4480640206935576, "step": 46070}, {"loss": 0.4328, "grad_norm": 1.2527226209640503, "learning_rate": 0.0002, "epoch": 7.449680704874303, "step": 46080}, {"loss": 0.4922, "grad_norm": 1.3044105768203735, "learning_rate": 0.0002, "epoch": 7.451297389055048, "step": 46090}, {"loss": 0.4465, "grad_norm": 1.2888941764831543, "learning_rate": 0.0002, "epoch": 7.452914073235793, "step": 46100}, {"loss": 0.4414, "grad_norm": 1.3148317337036133, "learning_rate": 0.0002, "epoch": 7.4545307574165385, "step": 46110}, {"loss": 0.4431, "grad_norm": 0.9526162147521973, "learning_rate": 0.0002, "epoch": 7.456147441597284, "step": 46120}, {"loss": 0.4422, "grad_norm": 1.2618519067764282, "learning_rate": 0.0002, "epoch": 7.457764125778029, "step": 46130}, {"loss": 0.4745, "grad_norm": 1.0392966270446777, "learning_rate": 0.0002, "epoch": 7.459380809958774, "step": 46140}, {"loss": 0.4589, "grad_norm": 1.3286794424057007, "learning_rate": 0.0002, "epoch": 7.460997494139519, "step": 46150}, {"loss": 0.4762, "grad_norm": 1.2377561330795288, "learning_rate": 0.0002, "epoch": 7.4626141783202655, "step": 46160}, {"loss": 0.4119, "grad_norm": 1.034134030342102, "learning_rate": 0.0002, "epoch": 7.464230862501011, "step": 46170}, {"loss": 0.4487, "grad_norm": 1.1719683408737183, "learning_rate": 0.0002, "epoch": 7.465847546681756, "step": 46180}, {"loss": 0.4423, "grad_norm": 1.182691216468811, "learning_rate": 0.0002, "epoch": 7.467464230862501, "step": 46190}, {"loss": 0.4341, "grad_norm": 1.1898412704467773, "learning_rate": 0.0002, "epoch": 7.4690809150432464, "step": 46200}, {"loss": 0.4753, "grad_norm": 1.0543978214263916, "learning_rate": 0.0002, "epoch": 7.470697599223992, "step": 46210}, {"loss": 0.4673, "grad_norm": 1.176971673965454, "learning_rate": 0.0002, "epoch": 7.472314283404737, "step": 46220}, {"loss": 0.4598, "grad_norm": 1.129456639289856, "learning_rate": 0.0002, "epoch": 7.473930967585482, "step": 46230}, {"loss": 0.4805, "grad_norm": 1.1782855987548828, "learning_rate": 0.0002, "epoch": 7.475547651766227, "step": 46240}, {"loss": 0.4979, "grad_norm": 1.1678800582885742, "learning_rate": 0.0002, "epoch": 7.477164335946973, "step": 46250}, {"loss": 0.4374, "grad_norm": 0.9768722653388977, "learning_rate": 0.0002, "epoch": 7.478781020127718, "step": 46260}, {"loss": 0.4683, "grad_norm": 1.3222670555114746, "learning_rate": 0.0002, "epoch": 7.480397704308463, "step": 46270}, {"loss": 0.459, "grad_norm": 1.0573948621749878, "learning_rate": 0.0002, "epoch": 7.482014388489208, "step": 46280}, {"loss": 0.5019, "grad_norm": 1.3233898878097534, "learning_rate": 0.0002, "epoch": 7.4836310726699535, "step": 46290}, {"loss": 0.4689, "grad_norm": 0.9695420265197754, "learning_rate": 0.0002, "epoch": 7.485247756850699, "step": 46300}, {"loss": 0.471, "grad_norm": 1.2072020769119263, "learning_rate": 0.0002, "epoch": 7.486864441031445, "step": 46310}, {"loss": 0.486, "grad_norm": 1.2161253690719604, "learning_rate": 0.0002, "epoch": 7.48848112521219, "step": 46320}, {"loss": 0.4581, "grad_norm": 1.185958743095398, "learning_rate": 0.0002, "epoch": 7.490097809392935, "step": 46330}, {"loss": 0.4617, "grad_norm": 1.3741549253463745, "learning_rate": 0.0002, "epoch": 7.491714493573681, "step": 46340}, {"loss": 0.4772, "grad_norm": 1.0586212873458862, "learning_rate": 0.0002, "epoch": 7.493331177754426, "step": 46350}, {"loss": 0.4644, "grad_norm": 1.2000513076782227, "learning_rate": 0.0002, "epoch": 7.494947861935171, "step": 46360}, {"loss": 0.4584, "grad_norm": 1.3326879739761353, "learning_rate": 0.0002, "epoch": 7.496564546115916, "step": 46370}, {"loss": 0.4741, "grad_norm": 1.3452857732772827, "learning_rate": 0.0002, "epoch": 7.4981812302966615, "step": 46380}, {"loss": 0.4747, "grad_norm": 1.2885284423828125, "learning_rate": 0.0002, "epoch": 7.499797914477407, "step": 46390}, {"loss": 0.4648, "grad_norm": 1.097342610359192, "learning_rate": 0.0002, "epoch": 7.501414598658152, "step": 46400}, {"loss": 0.4714, "grad_norm": 1.2342469692230225, "learning_rate": 0.0002, "epoch": 7.503031282838897, "step": 46410}, {"loss": 0.4421, "grad_norm": 1.0151721239089966, "learning_rate": 0.0002, "epoch": 7.504647967019642, "step": 46420}, {"loss": 0.4347, "grad_norm": 1.2487123012542725, "learning_rate": 0.0002, "epoch": 7.506264651200388, "step": 46430}, {"loss": 0.4768, "grad_norm": 0.9319046139717102, "learning_rate": 0.0002, "epoch": 7.507881335381134, "step": 46440}, {"loss": 0.4693, "grad_norm": 1.1362226009368896, "learning_rate": 0.0002, "epoch": 7.509498019561878, "step": 46450}, {"loss": 0.5007, "grad_norm": 1.2883973121643066, "learning_rate": 0.0002, "epoch": 7.511114703742624, "step": 46460}, {"loss": 0.4455, "grad_norm": 1.0892037153244019, "learning_rate": 0.0002, "epoch": 7.5127313879233695, "step": 46470}, {"loss": 0.4721, "grad_norm": 1.1870533227920532, "learning_rate": 0.0002, "epoch": 7.514348072104115, "step": 46480}, {"loss": 0.4824, "grad_norm": 1.2103877067565918, "learning_rate": 0.0002, "epoch": 7.51596475628486, "step": 46490}, {"loss": 0.4573, "grad_norm": 1.0980644226074219, "learning_rate": 0.0002, "epoch": 7.517581440465605, "step": 46500}, {"loss": 0.4759, "grad_norm": 1.4729726314544678, "learning_rate": 0.0002, "epoch": 7.51919812464635, "step": 46510}, {"loss": 0.4413, "grad_norm": 1.1808913946151733, "learning_rate": 0.0002, "epoch": 7.520814808827096, "step": 46520}, {"loss": 0.4278, "grad_norm": 1.2347747087478638, "learning_rate": 0.0002, "epoch": 7.522431493007841, "step": 46530}, {"loss": 0.4745, "grad_norm": 1.5921525955200195, "learning_rate": 0.0002, "epoch": 7.524048177188586, "step": 46540}, {"loss": 0.4525, "grad_norm": 1.1328861713409424, "learning_rate": 0.0002, "epoch": 7.525664861369331, "step": 46550}, {"loss": 0.4771, "grad_norm": 1.289947748184204, "learning_rate": 0.0002, "epoch": 7.527281545550077, "step": 46560}, {"loss": 0.4711, "grad_norm": 1.0198370218276978, "learning_rate": 0.0002, "epoch": 7.528898229730822, "step": 46570}, {"loss": 0.504, "grad_norm": 1.3007137775421143, "learning_rate": 0.0002, "epoch": 7.530514913911567, "step": 46580}, {"loss": 0.4496, "grad_norm": 1.2864280939102173, "learning_rate": 0.0002, "epoch": 7.532131598092313, "step": 46590}, {"loss": 0.463, "grad_norm": 1.1005513668060303, "learning_rate": 0.0002, "epoch": 7.5337482822730575, "step": 46600}, {"loss": 0.4426, "grad_norm": 0.9998318552970886, "learning_rate": 0.0002, "epoch": 7.535364966453804, "step": 46610}, {"loss": 0.4762, "grad_norm": 1.2042466402053833, "learning_rate": 0.0002, "epoch": 7.536981650634549, "step": 46620}, {"loss": 0.4685, "grad_norm": 1.3240692615509033, "learning_rate": 0.0002, "epoch": 7.538598334815294, "step": 46630}, {"loss": 0.4608, "grad_norm": 1.2145483493804932, "learning_rate": 0.0002, "epoch": 7.540215018996039, "step": 46640}, {"loss": 0.4608, "grad_norm": 1.169691801071167, "learning_rate": 0.0002, "epoch": 7.5418317031767845, "step": 46650}, {"loss": 0.4527, "grad_norm": 1.194045901298523, "learning_rate": 0.0002, "epoch": 7.54344838735753, "step": 46660}, {"loss": 0.4599, "grad_norm": 1.0481327772140503, "learning_rate": 0.0002, "epoch": 7.545065071538275, "step": 46670}, {"loss": 0.4729, "grad_norm": 1.0714460611343384, "learning_rate": 0.0002, "epoch": 7.54668175571902, "step": 46680}, {"loss": 0.4703, "grad_norm": 1.1811443567276, "learning_rate": 0.0002, "epoch": 7.5482984398997655, "step": 46690}, {"loss": 0.4628, "grad_norm": 1.2794281244277954, "learning_rate": 0.0002, "epoch": 7.549915124080511, "step": 46700}, {"loss": 0.4659, "grad_norm": 1.001287817955017, "learning_rate": 0.0002, "epoch": 7.551531808261256, "step": 46710}, {"loss": 0.4938, "grad_norm": 1.3598867654800415, "learning_rate": 0.0002, "epoch": 7.553148492442001, "step": 46720}, {"loss": 0.4731, "grad_norm": 1.206254482269287, "learning_rate": 0.0002, "epoch": 7.554765176622746, "step": 46730}, {"loss": 0.4581, "grad_norm": 1.1095832586288452, "learning_rate": 0.0002, "epoch": 7.5563818608034925, "step": 46740}, {"loss": 0.4625, "grad_norm": 1.3912206888198853, "learning_rate": 0.0002, "epoch": 7.557998544984237, "step": 46750}, {"loss": 0.4464, "grad_norm": 0.9883413314819336, "learning_rate": 0.0002, "epoch": 7.559615229164983, "step": 46760}, {"loss": 0.4535, "grad_norm": 1.0965087413787842, "learning_rate": 0.0002, "epoch": 7.561231913345728, "step": 46770}, {"loss": 0.469, "grad_norm": 1.092261552810669, "learning_rate": 0.0002, "epoch": 7.562848597526473, "step": 46780}, {"loss": 0.488, "grad_norm": 1.0443673133850098, "learning_rate": 0.0002, "epoch": 7.564465281707219, "step": 46790}, {"loss": 0.4875, "grad_norm": 1.2420614957809448, "learning_rate": 0.0002, "epoch": 7.566081965887964, "step": 46800}, {"loss": 0.4911, "grad_norm": 1.0510783195495605, "learning_rate": 0.0002, "epoch": 7.567698650068709, "step": 46810}, {"loss": 0.4541, "grad_norm": 1.0291800498962402, "learning_rate": 0.0002, "epoch": 7.569315334249454, "step": 46820}, {"loss": 0.4591, "grad_norm": 1.1784595251083374, "learning_rate": 0.0002, "epoch": 7.5709320184302, "step": 46830}, {"loss": 0.5154, "grad_norm": 1.0424436330795288, "learning_rate": 0.0002, "epoch": 7.572548702610945, "step": 46840}, {"loss": 0.4612, "grad_norm": 1.182131290435791, "learning_rate": 0.0002, "epoch": 7.57416538679169, "step": 46850}, {"loss": 0.446, "grad_norm": 0.9917051792144775, "learning_rate": 0.0002, "epoch": 7.575782070972435, "step": 46860}, {"loss": 0.4428, "grad_norm": 1.1616078615188599, "learning_rate": 0.0002, "epoch": 7.5773987551531805, "step": 46870}, {"loss": 0.4769, "grad_norm": 1.401071548461914, "learning_rate": 0.0002, "epoch": 7.579015439333926, "step": 46880}, {"loss": 0.4635, "grad_norm": 0.874487578868866, "learning_rate": 0.0002, "epoch": 7.580632123514672, "step": 46890}, {"loss": 0.4641, "grad_norm": 1.2511193752288818, "learning_rate": 0.0002, "epoch": 7.582248807695416, "step": 46900}, {"loss": 0.4715, "grad_norm": 1.7548277378082275, "learning_rate": 0.0002, "epoch": 7.583865491876162, "step": 46910}, {"loss": 0.4681, "grad_norm": 1.349366545677185, "learning_rate": 0.0002, "epoch": 7.5854821760569076, "step": 46920}, {"loss": 0.4819, "grad_norm": 1.0609583854675293, "learning_rate": 0.0002, "epoch": 7.587098860237653, "step": 46930}, {"loss": 0.4498, "grad_norm": 1.031512975692749, "learning_rate": 0.0002, "epoch": 7.588715544418398, "step": 46940}, {"loss": 0.4688, "grad_norm": 1.1440242528915405, "learning_rate": 0.0002, "epoch": 7.590332228599143, "step": 46950}, {"loss": 0.4568, "grad_norm": 1.2762987613677979, "learning_rate": 0.0002, "epoch": 7.5919489127798885, "step": 46960}, {"loss": 0.4569, "grad_norm": 1.167269229888916, "learning_rate": 0.0002, "epoch": 7.593565596960634, "step": 46970}, {"loss": 0.461, "grad_norm": 1.131127953529358, "learning_rate": 0.0002, "epoch": 7.595182281141379, "step": 46980}, {"loss": 0.4666, "grad_norm": 1.4527075290679932, "learning_rate": 0.0002, "epoch": 7.596798965322124, "step": 46990}, {"loss": 0.4973, "grad_norm": 1.330132007598877, "learning_rate": 0.0002, "epoch": 7.598415649502869, "step": 47000}, {"loss": 0.4969, "grad_norm": 1.4223501682281494, "learning_rate": 0.0002, "epoch": 7.600032333683615, "step": 47010}, {"loss": 0.4572, "grad_norm": 1.2045072317123413, "learning_rate": 0.0002, "epoch": 7.60164901786436, "step": 47020}, {"loss": 0.4666, "grad_norm": 1.1549896001815796, "learning_rate": 0.0002, "epoch": 7.603265702045105, "step": 47030}, {"loss": 0.4383, "grad_norm": 1.2221543788909912, "learning_rate": 0.0002, "epoch": 7.604882386225851, "step": 47040}, {"loss": 0.4826, "grad_norm": 1.1171326637268066, "learning_rate": 0.0002, "epoch": 7.6064990704065965, "step": 47050}, {"loss": 0.4465, "grad_norm": 1.073671817779541, "learning_rate": 0.0002, "epoch": 7.608115754587342, "step": 47060}, {"loss": 0.4623, "grad_norm": 1.2524123191833496, "learning_rate": 0.0002, "epoch": 7.609732438768087, "step": 47070}, {"loss": 0.4538, "grad_norm": 1.2015056610107422, "learning_rate": 0.0002, "epoch": 7.611349122948832, "step": 47080}, {"loss": 0.4871, "grad_norm": 1.2454534769058228, "learning_rate": 0.0002, "epoch": 7.612965807129577, "step": 47090}, {"loss": 0.5064, "grad_norm": 0.9815779328346252, "learning_rate": 0.0002, "epoch": 7.614582491310323, "step": 47100}, {"loss": 0.4841, "grad_norm": 1.1437602043151855, "learning_rate": 0.0002, "epoch": 7.616199175491068, "step": 47110}, {"loss": 0.453, "grad_norm": 1.1004078388214111, "learning_rate": 0.0002, "epoch": 7.617815859671813, "step": 47120}, {"loss": 0.4552, "grad_norm": 1.069453477859497, "learning_rate": 0.0002, "epoch": 7.619432543852558, "step": 47130}, {"loss": 0.4627, "grad_norm": 1.1434191465377808, "learning_rate": 0.0002, "epoch": 7.6210492280333035, "step": 47140}, {"loss": 0.4882, "grad_norm": 1.216845989227295, "learning_rate": 0.0002, "epoch": 7.622665912214049, "step": 47150}, {"loss": 0.481, "grad_norm": 1.2302134037017822, "learning_rate": 0.0002, "epoch": 7.624282596394794, "step": 47160}, {"loss": 0.4806, "grad_norm": 1.4284924268722534, "learning_rate": 0.0002, "epoch": 7.625899280575539, "step": 47170}, {"loss": 0.4458, "grad_norm": 1.3359615802764893, "learning_rate": 0.0002, "epoch": 7.6275159647562845, "step": 47180}, {"loss": 0.4842, "grad_norm": 1.0242379903793335, "learning_rate": 0.0002, "epoch": 7.629132648937031, "step": 47190}, {"loss": 0.5137, "grad_norm": 1.249513030052185, "learning_rate": 0.0002, "epoch": 7.630749333117776, "step": 47200}, {"loss": 0.4462, "grad_norm": 1.0881463289260864, "learning_rate": 0.0002, "epoch": 7.632366017298521, "step": 47210}, {"loss": 0.4864, "grad_norm": 1.2903773784637451, "learning_rate": 0.0002, "epoch": 7.633982701479266, "step": 47220}, {"loss": 0.4729, "grad_norm": 1.1671710014343262, "learning_rate": 0.0002, "epoch": 7.6355993856600115, "step": 47230}, {"loss": 0.4936, "grad_norm": 1.1960735321044922, "learning_rate": 0.0002, "epoch": 7.637216069840757, "step": 47240}, {"loss": 0.4884, "grad_norm": 1.2692298889160156, "learning_rate": 0.0002, "epoch": 7.638832754021502, "step": 47250}, {"loss": 0.423, "grad_norm": 0.9812195301055908, "learning_rate": 0.0002, "epoch": 7.640449438202247, "step": 47260}, {"loss": 0.4737, "grad_norm": 1.3986053466796875, "learning_rate": 0.0002, "epoch": 7.642066122382992, "step": 47270}, {"loss": 0.4834, "grad_norm": 1.2692067623138428, "learning_rate": 0.0002, "epoch": 7.643682806563738, "step": 47280}, {"loss": 0.4893, "grad_norm": 1.1185054779052734, "learning_rate": 0.0002, "epoch": 7.645299490744483, "step": 47290}, {"loss": 0.4828, "grad_norm": 1.2837327718734741, "learning_rate": 0.0002, "epoch": 7.646916174925228, "step": 47300}, {"loss": 0.4891, "grad_norm": 1.8518418073654175, "learning_rate": 0.0002, "epoch": 7.648532859105973, "step": 47310}, {"loss": 0.4626, "grad_norm": 0.9781302213668823, "learning_rate": 0.0002, "epoch": 7.650149543286719, "step": 47320}, {"loss": 0.501, "grad_norm": 1.0777910947799683, "learning_rate": 0.0002, "epoch": 7.651766227467464, "step": 47330}, {"loss": 0.4927, "grad_norm": 1.2031499147415161, "learning_rate": 0.0002, "epoch": 7.65338291164821, "step": 47340}, {"loss": 0.439, "grad_norm": 1.14322829246521, "learning_rate": 0.0002, "epoch": 7.654999595828955, "step": 47350}, {"loss": 0.4481, "grad_norm": 1.3211992979049683, "learning_rate": 0.0002, "epoch": 7.6566162800097, "step": 47360}, {"loss": 0.4462, "grad_norm": 1.3632899522781372, "learning_rate": 0.0002, "epoch": 7.658232964190446, "step": 47370}, {"loss": 0.4934, "grad_norm": 1.2593929767608643, "learning_rate": 0.0002, "epoch": 7.659849648371191, "step": 47380}, {"loss": 0.4645, "grad_norm": 1.442670464515686, "learning_rate": 0.0002, "epoch": 7.661466332551936, "step": 47390}, {"loss": 0.4584, "grad_norm": 1.2304763793945312, "learning_rate": 0.0002, "epoch": 7.663083016732681, "step": 47400}, {"loss": 0.464, "grad_norm": 1.0182652473449707, "learning_rate": 0.0002, "epoch": 7.664699700913427, "step": 47410}, {"loss": 0.457, "grad_norm": 1.365441083908081, "learning_rate": 0.0002, "epoch": 7.666316385094172, "step": 47420}, {"loss": 0.4787, "grad_norm": 1.1578556299209595, "learning_rate": 0.0002, "epoch": 7.667933069274917, "step": 47430}, {"loss": 0.486, "grad_norm": 1.0346194505691528, "learning_rate": 0.0002, "epoch": 7.669549753455662, "step": 47440}, {"loss": 0.4703, "grad_norm": 1.2567378282546997, "learning_rate": 0.0002, "epoch": 7.6711664376364075, "step": 47450}, {"loss": 0.4853, "grad_norm": 1.1669118404388428, "learning_rate": 0.0002, "epoch": 7.672783121817153, "step": 47460}, {"loss": 0.4869, "grad_norm": 1.0174756050109863, "learning_rate": 0.0002, "epoch": 7.674399805997898, "step": 47470}, {"loss": 0.4601, "grad_norm": 1.0962231159210205, "learning_rate": 0.0002, "epoch": 7.676016490178643, "step": 47480}, {"loss": 0.4866, "grad_norm": 1.1098674535751343, "learning_rate": 0.0002, "epoch": 7.677633174359389, "step": 47490}, {"loss": 0.4682, "grad_norm": 1.1441160440444946, "learning_rate": 0.0002, "epoch": 7.6792498585401345, "step": 47500}, {"loss": 0.4552, "grad_norm": 1.0473432540893555, "learning_rate": 0.0002, "epoch": 7.68086654272088, "step": 47510}, {"loss": 0.4771, "grad_norm": 1.2954738140106201, "learning_rate": 0.0002, "epoch": 7.682483226901625, "step": 47520}, {"loss": 0.5012, "grad_norm": 1.2931294441223145, "learning_rate": 0.0002, "epoch": 7.68409991108237, "step": 47530}, {"loss": 0.4808, "grad_norm": 1.4005156755447388, "learning_rate": 0.0002, "epoch": 7.6857165952631155, "step": 47540}, {"loss": 0.4847, "grad_norm": 1.0998929738998413, "learning_rate": 0.0002, "epoch": 7.687333279443861, "step": 47550}, {"loss": 0.4839, "grad_norm": 1.3478347063064575, "learning_rate": 0.0002, "epoch": 7.688949963624606, "step": 47560}, {"loss": 0.4918, "grad_norm": 1.2991969585418701, "learning_rate": 0.0002, "epoch": 7.690566647805351, "step": 47570}, {"loss": 0.4673, "grad_norm": 1.0892608165740967, "learning_rate": 0.0002, "epoch": 7.692183331986096, "step": 47580}, {"loss": 0.4937, "grad_norm": 1.2230998277664185, "learning_rate": 0.0002, "epoch": 7.693800016166842, "step": 47590}, {"loss": 0.5222, "grad_norm": 1.2635555267333984, "learning_rate": 0.0002, "epoch": 7.695416700347587, "step": 47600}, {"loss": 0.4883, "grad_norm": 1.1720705032348633, "learning_rate": 0.0002, "epoch": 7.697033384528332, "step": 47610}, {"loss": 0.452, "grad_norm": 1.1134333610534668, "learning_rate": 0.0002, "epoch": 7.698650068709077, "step": 47620}, {"loss": 0.4859, "grad_norm": 1.2643009424209595, "learning_rate": 0.0002, "epoch": 7.7002667528898225, "step": 47630}, {"loss": 0.4825, "grad_norm": 1.1145045757293701, "learning_rate": 0.0002, "epoch": 7.701883437070569, "step": 47640}, {"loss": 0.4735, "grad_norm": 1.1808549165725708, "learning_rate": 0.0002, "epoch": 7.703500121251314, "step": 47650}, {"loss": 0.4841, "grad_norm": 1.2996630668640137, "learning_rate": 0.0002, "epoch": 7.705116805432059, "step": 47660}, {"loss": 0.4712, "grad_norm": 1.2786413431167603, "learning_rate": 0.0002, "epoch": 7.706733489612804, "step": 47670}, {"loss": 0.4694, "grad_norm": 1.3245121240615845, "learning_rate": 0.0002, "epoch": 7.70835017379355, "step": 47680}, {"loss": 0.4467, "grad_norm": 1.4168202877044678, "learning_rate": 0.0002, "epoch": 7.709966857974295, "step": 47690}, {"loss": 0.5143, "grad_norm": 1.0354000329971313, "learning_rate": 0.0002, "epoch": 7.71158354215504, "step": 47700}, {"loss": 0.4703, "grad_norm": 0.9630362391471863, "learning_rate": 0.0002, "epoch": 7.713200226335785, "step": 47710}, {"loss": 0.4996, "grad_norm": 1.1045806407928467, "learning_rate": 0.0002, "epoch": 7.7148169105165305, "step": 47720}, {"loss": 0.4756, "grad_norm": 1.2403767108917236, "learning_rate": 0.0002, "epoch": 7.716433594697276, "step": 47730}, {"loss": 0.4658, "grad_norm": 0.9893410801887512, "learning_rate": 0.0002, "epoch": 7.718050278878021, "step": 47740}, {"loss": 0.4463, "grad_norm": 1.0749315023422241, "learning_rate": 0.0002, "epoch": 7.719666963058766, "step": 47750}, {"loss": 0.467, "grad_norm": 1.2851510047912598, "learning_rate": 0.0002, "epoch": 7.721283647239511, "step": 47760}, {"loss": 0.489, "grad_norm": 1.2964261770248413, "learning_rate": 0.0002, "epoch": 7.722900331420257, "step": 47770}, {"loss": 0.4702, "grad_norm": 1.0603861808776855, "learning_rate": 0.0002, "epoch": 7.724517015601002, "step": 47780}, {"loss": 0.478, "grad_norm": 1.2728440761566162, "learning_rate": 0.0002, "epoch": 7.726133699781748, "step": 47790}, {"loss": 0.4746, "grad_norm": 1.20509934425354, "learning_rate": 0.0002, "epoch": 7.727750383962493, "step": 47800}, {"loss": 0.4556, "grad_norm": 1.397595763206482, "learning_rate": 0.0002, "epoch": 7.7293670681432385, "step": 47810}, {"loss": 0.4736, "grad_norm": 1.2595560550689697, "learning_rate": 0.0002, "epoch": 7.730983752323984, "step": 47820}, {"loss": 0.5061, "grad_norm": 1.166074514389038, "learning_rate": 0.0002, "epoch": 7.732600436504729, "step": 47830}, {"loss": 0.4907, "grad_norm": 1.258192777633667, "learning_rate": 0.0002, "epoch": 7.734217120685474, "step": 47840}, {"loss": 0.5256, "grad_norm": 1.0394890308380127, "learning_rate": 0.0002, "epoch": 7.735833804866219, "step": 47850}, {"loss": 0.4863, "grad_norm": 1.2017768621444702, "learning_rate": 0.0002, "epoch": 7.737450489046965, "step": 47860}, {"loss": 0.4784, "grad_norm": 1.1070265769958496, "learning_rate": 0.0002, "epoch": 7.73906717322771, "step": 47870}, {"loss": 0.4616, "grad_norm": 1.0544345378875732, "learning_rate": 0.0002, "epoch": 7.740683857408455, "step": 47880}, {"loss": 0.4519, "grad_norm": 1.0194088220596313, "learning_rate": 0.0002, "epoch": 7.7423005415892, "step": 47890}, {"loss": 0.4758, "grad_norm": 1.3095234632492065, "learning_rate": 0.0002, "epoch": 7.743917225769946, "step": 47900}, {"loss": 0.4646, "grad_norm": 1.0579626560211182, "learning_rate": 0.0002, "epoch": 7.745533909950691, "step": 47910}, {"loss": 0.4532, "grad_norm": 1.012990951538086, "learning_rate": 0.0002, "epoch": 7.747150594131437, "step": 47920}, {"loss": 0.4775, "grad_norm": 1.485148549079895, "learning_rate": 0.0002, "epoch": 7.748767278312181, "step": 47930}, {"loss": 0.4892, "grad_norm": 1.3595696687698364, "learning_rate": 0.0002, "epoch": 7.750383962492927, "step": 47940}, {"loss": 0.4609, "grad_norm": 0.9945753216743469, "learning_rate": 0.0002, "epoch": 7.752000646673673, "step": 47950}, {"loss": 0.5138, "grad_norm": 1.2098956108093262, "learning_rate": 0.0002, "epoch": 7.753617330854418, "step": 47960}, {"loss": 0.4815, "grad_norm": 1.3056198358535767, "learning_rate": 0.0002, "epoch": 7.755234015035163, "step": 47970}, {"loss": 0.4761, "grad_norm": 1.2247772216796875, "learning_rate": 0.0002, "epoch": 7.756850699215908, "step": 47980}, {"loss": 0.5023, "grad_norm": 1.397642970085144, "learning_rate": 0.0002, "epoch": 7.7584673833966535, "step": 47990}, {"loss": 0.4901, "grad_norm": 1.2565888166427612, "learning_rate": 0.0002, "epoch": 7.760084067577399, "step": 48000}, {"loss": 0.469, "grad_norm": 1.0065099000930786, "learning_rate": 0.0002, "epoch": 7.761700751758144, "step": 48010}, {"loss": 0.4886, "grad_norm": 1.1466305255889893, "learning_rate": 0.0002, "epoch": 7.763317435938889, "step": 48020}, {"loss": 0.4898, "grad_norm": 1.4492419958114624, "learning_rate": 0.0002, "epoch": 7.7649341201196345, "step": 48030}, {"loss": 0.489, "grad_norm": 1.0945932865142822, "learning_rate": 0.0002, "epoch": 7.76655080430038, "step": 48040}, {"loss": 0.4968, "grad_norm": 1.1938602924346924, "learning_rate": 0.0002, "epoch": 7.768167488481125, "step": 48050}, {"loss": 0.4497, "grad_norm": 1.168890357017517, "learning_rate": 0.0002, "epoch": 7.76978417266187, "step": 48060}, {"loss": 0.4881, "grad_norm": 1.3134305477142334, "learning_rate": 0.0002, "epoch": 7.771400856842616, "step": 48070}, {"loss": 0.5226, "grad_norm": 1.044438123703003, "learning_rate": 0.0002, "epoch": 7.773017541023361, "step": 48080}, {"loss": 0.4497, "grad_norm": 1.1275628805160522, "learning_rate": 0.0002, "epoch": 7.774634225204107, "step": 48090}, {"loss": 0.5063, "grad_norm": 1.0877318382263184, "learning_rate": 0.0002, "epoch": 7.776250909384852, "step": 48100}, {"loss": 0.4795, "grad_norm": 1.4800893068313599, "learning_rate": 0.0002, "epoch": 7.777867593565597, "step": 48110}, {"loss": 0.4984, "grad_norm": 1.1495977640151978, "learning_rate": 0.0002, "epoch": 7.779484277746342, "step": 48120}, {"loss": 0.4662, "grad_norm": 1.2175556421279907, "learning_rate": 0.0002, "epoch": 7.781100961927088, "step": 48130}, {"loss": 0.4935, "grad_norm": 1.150556206703186, "learning_rate": 0.0002, "epoch": 7.782717646107833, "step": 48140}, {"loss": 0.5039, "grad_norm": 1.051145315170288, "learning_rate": 0.0002, "epoch": 7.784334330288578, "step": 48150}, {"loss": 0.4611, "grad_norm": 1.2842742204666138, "learning_rate": 0.0002, "epoch": 7.785951014469323, "step": 48160}, {"loss": 0.4589, "grad_norm": 1.2251030206680298, "learning_rate": 0.0002, "epoch": 7.787567698650069, "step": 48170}, {"loss": 0.4905, "grad_norm": 1.2809321880340576, "learning_rate": 0.0002, "epoch": 7.789184382830814, "step": 48180}, {"loss": 0.4569, "grad_norm": 1.005690336227417, "learning_rate": 0.0002, "epoch": 7.790801067011559, "step": 48190}, {"loss": 0.4862, "grad_norm": 1.325501561164856, "learning_rate": 0.0002, "epoch": 7.792417751192304, "step": 48200}, {"loss": 0.4384, "grad_norm": 1.4551857709884644, "learning_rate": 0.0002, "epoch": 7.7940344353730495, "step": 48210}, {"loss": 0.4696, "grad_norm": 1.3399626016616821, "learning_rate": 0.0002, "epoch": 7.795651119553796, "step": 48220}, {"loss": 0.4654, "grad_norm": 1.0379714965820312, "learning_rate": 0.0002, "epoch": 7.79726780373454, "step": 48230}, {"loss": 0.4915, "grad_norm": 0.9725802540779114, "learning_rate": 0.0002, "epoch": 7.798884487915286, "step": 48240}, {"loss": 0.4583, "grad_norm": 1.0202224254608154, "learning_rate": 0.0002, "epoch": 7.800501172096031, "step": 48250}, {"loss": 0.4792, "grad_norm": 0.9477742910385132, "learning_rate": 0.0002, "epoch": 7.802117856276777, "step": 48260}, {"loss": 0.4836, "grad_norm": 1.2726924419403076, "learning_rate": 0.0002, "epoch": 7.803734540457522, "step": 48270}, {"loss": 0.494, "grad_norm": 1.453190565109253, "learning_rate": 0.0002, "epoch": 7.805351224638267, "step": 48280}, {"loss": 0.4559, "grad_norm": 1.2806978225708008, "learning_rate": 0.0002, "epoch": 7.806967908819012, "step": 48290}, {"loss": 0.4867, "grad_norm": 1.0897129774093628, "learning_rate": 0.0002, "epoch": 7.8085845929997575, "step": 48300}, {"loss": 0.4939, "grad_norm": 1.381636381149292, "learning_rate": 0.0002, "epoch": 7.810201277180503, "step": 48310}, {"loss": 0.4797, "grad_norm": 0.9954851269721985, "learning_rate": 0.0002, "epoch": 7.811817961361248, "step": 48320}, {"loss": 0.4995, "grad_norm": 1.1756198406219482, "learning_rate": 0.0002, "epoch": 7.813434645541993, "step": 48330}, {"loss": 0.4904, "grad_norm": 1.2087817192077637, "learning_rate": 0.0002, "epoch": 7.815051329722738, "step": 48340}, {"loss": 0.4935, "grad_norm": 1.3075505495071411, "learning_rate": 0.0002, "epoch": 7.816668013903484, "step": 48350}, {"loss": 0.486, "grad_norm": 1.1872076988220215, "learning_rate": 0.0002, "epoch": 7.818284698084229, "step": 48360}, {"loss": 0.538, "grad_norm": 1.2134783267974854, "learning_rate": 0.0002, "epoch": 7.819901382264975, "step": 48370}, {"loss": 0.4759, "grad_norm": 1.28566312789917, "learning_rate": 0.0002, "epoch": 7.821518066445719, "step": 48380}, {"loss": 0.4962, "grad_norm": 1.0578798055648804, "learning_rate": 0.0002, "epoch": 7.8231347506264655, "step": 48390}, {"loss": 0.4924, "grad_norm": 1.1225441694259644, "learning_rate": 0.0002, "epoch": 7.824751434807211, "step": 48400}, {"loss": 0.5081, "grad_norm": 1.2029428482055664, "learning_rate": 0.0002, "epoch": 7.826368118987956, "step": 48410}, {"loss": 0.4669, "grad_norm": 1.252485990524292, "learning_rate": 0.0002, "epoch": 7.827984803168701, "step": 48420}, {"loss": 0.4932, "grad_norm": 1.1822574138641357, "learning_rate": 0.0002, "epoch": 7.829601487349446, "step": 48430}, {"loss": 0.4692, "grad_norm": 1.2428245544433594, "learning_rate": 0.0002, "epoch": 7.831218171530192, "step": 48440}, {"loss": 0.4796, "grad_norm": 1.0565894842147827, "learning_rate": 0.0002, "epoch": 7.832834855710937, "step": 48450}, {"loss": 0.5016, "grad_norm": 1.363452672958374, "learning_rate": 0.0002, "epoch": 7.834451539891682, "step": 48460}, {"loss": 0.463, "grad_norm": 1.2436026334762573, "learning_rate": 0.0002, "epoch": 7.836068224072427, "step": 48470}, {"loss": 0.4794, "grad_norm": 1.2623029947280884, "learning_rate": 0.0002, "epoch": 7.8376849082531725, "step": 48480}, {"loss": 0.4863, "grad_norm": 1.0942288637161255, "learning_rate": 0.0002, "epoch": 7.839301592433918, "step": 48490}, {"loss": 0.4723, "grad_norm": 1.1791462898254395, "learning_rate": 0.0002, "epoch": 7.840918276614663, "step": 48500}, {"loss": 0.501, "grad_norm": 1.3342814445495605, "learning_rate": 0.0002, "epoch": 7.842534960795408, "step": 48510}, {"loss": 0.4654, "grad_norm": 1.0511828660964966, "learning_rate": 0.0002, "epoch": 7.844151644976154, "step": 48520}, {"loss": 0.5051, "grad_norm": 1.48568594455719, "learning_rate": 0.0002, "epoch": 7.845768329156899, "step": 48530}, {"loss": 0.4572, "grad_norm": 1.296844720840454, "learning_rate": 0.0002, "epoch": 7.847385013337645, "step": 48540}, {"loss": 0.5216, "grad_norm": 1.3032835721969604, "learning_rate": 0.0002, "epoch": 7.84900169751839, "step": 48550}, {"loss": 0.472, "grad_norm": 1.260769009590149, "learning_rate": 0.0002, "epoch": 7.850618381699135, "step": 48560}, {"loss": 0.5094, "grad_norm": 1.3309531211853027, "learning_rate": 0.0002, "epoch": 7.8522350658798805, "step": 48570}, {"loss": 0.4759, "grad_norm": 1.1907469034194946, "learning_rate": 0.0002, "epoch": 7.853851750060626, "step": 48580}, {"loss": 0.485, "grad_norm": 0.9690865874290466, "learning_rate": 0.0002, "epoch": 7.855468434241371, "step": 48590}, {"loss": 0.4899, "grad_norm": 1.2417343854904175, "learning_rate": 0.0002, "epoch": 7.857085118422116, "step": 48600}, {"loss": 0.4888, "grad_norm": 1.1366082429885864, "learning_rate": 0.0002, "epoch": 7.858701802602861, "step": 48610}, {"loss": 0.4909, "grad_norm": 1.4737876653671265, "learning_rate": 0.0002, "epoch": 7.860318486783607, "step": 48620}, {"loss": 0.4476, "grad_norm": 1.3934144973754883, "learning_rate": 0.0002, "epoch": 7.861935170964352, "step": 48630}, {"loss": 0.5141, "grad_norm": 0.9997506737709045, "learning_rate": 0.0002, "epoch": 7.863551855145097, "step": 48640}, {"loss": 0.4766, "grad_norm": 1.3827011585235596, "learning_rate": 0.0002, "epoch": 7.865168539325842, "step": 48650}, {"loss": 0.4926, "grad_norm": 1.2811808586120605, "learning_rate": 0.0002, "epoch": 7.866785223506588, "step": 48660}, {"loss": 0.4898, "grad_norm": 1.394400715827942, "learning_rate": 0.0002, "epoch": 7.868401907687334, "step": 48670}, {"loss": 0.4889, "grad_norm": 1.5635628700256348, "learning_rate": 0.0002, "epoch": 7.870018591868079, "step": 48680}, {"loss": 0.4822, "grad_norm": 1.147349238395691, "learning_rate": 0.0002, "epoch": 7.871635276048824, "step": 48690}, {"loss": 0.485, "grad_norm": 1.2417502403259277, "learning_rate": 0.0002, "epoch": 7.873251960229569, "step": 48700}, {"loss": 0.4976, "grad_norm": 1.0380291938781738, "learning_rate": 0.0002, "epoch": 7.874868644410315, "step": 48710}, {"loss": 0.5119, "grad_norm": 1.2139482498168945, "learning_rate": 0.0002, "epoch": 7.87648532859106, "step": 48720}, {"loss": 0.497, "grad_norm": 1.2833739519119263, "learning_rate": 0.0002, "epoch": 7.878102012771805, "step": 48730}, {"loss": 0.4776, "grad_norm": 1.2405574321746826, "learning_rate": 0.0002, "epoch": 7.87971869695255, "step": 48740}, {"loss": 0.5085, "grad_norm": 1.1267465353012085, "learning_rate": 0.0002, "epoch": 7.881335381133296, "step": 48750}, {"loss": 0.4894, "grad_norm": 1.3052713871002197, "learning_rate": 0.0002, "epoch": 7.882952065314041, "step": 48760}, {"loss": 0.5077, "grad_norm": 1.0581550598144531, "learning_rate": 0.0002, "epoch": 7.884568749494786, "step": 48770}, {"loss": 0.4598, "grad_norm": 1.1074683666229248, "learning_rate": 0.0002, "epoch": 7.886185433675531, "step": 48780}, {"loss": 0.4813, "grad_norm": 1.0812418460845947, "learning_rate": 0.0002, "epoch": 7.8878021178562765, "step": 48790}, {"loss": 0.477, "grad_norm": 1.3083902597427368, "learning_rate": 0.0002, "epoch": 7.889418802037022, "step": 48800}, {"loss": 0.4717, "grad_norm": 1.457373023033142, "learning_rate": 0.0002, "epoch": 7.891035486217767, "step": 48810}, {"loss": 0.4607, "grad_norm": 1.048091173171997, "learning_rate": 0.0002, "epoch": 7.892652170398513, "step": 48820}, {"loss": 0.4848, "grad_norm": 1.1420985460281372, "learning_rate": 0.0002, "epoch": 7.894268854579258, "step": 48830}, {"loss": 0.5019, "grad_norm": 1.0286061763763428, "learning_rate": 0.0002, "epoch": 7.8958855387600035, "step": 48840}, {"loss": 0.5054, "grad_norm": 1.0361840724945068, "learning_rate": 0.0002, "epoch": 7.897502222940749, "step": 48850}, {"loss": 0.4908, "grad_norm": 1.1862726211547852, "learning_rate": 0.0002, "epoch": 7.899118907121494, "step": 48860}, {"loss": 0.4491, "grad_norm": 1.2256416082382202, "learning_rate": 0.0002, "epoch": 7.900735591302239, "step": 48870}, {"loss": 0.4732, "grad_norm": 1.0664557218551636, "learning_rate": 0.0002, "epoch": 7.9023522754829845, "step": 48880}, {"loss": 0.4741, "grad_norm": 1.3960802555084229, "learning_rate": 0.0002, "epoch": 7.90396895966373, "step": 48890}, {"loss": 0.5061, "grad_norm": 1.230430245399475, "learning_rate": 0.0002, "epoch": 7.905585643844475, "step": 48900}, {"loss": 0.4698, "grad_norm": 1.0949305295944214, "learning_rate": 0.0002, "epoch": 7.90720232802522, "step": 48910}, {"loss": 0.4964, "grad_norm": 1.4402074813842773, "learning_rate": 0.0002, "epoch": 7.908819012205965, "step": 48920}, {"loss": 0.5057, "grad_norm": 1.1064879894256592, "learning_rate": 0.0002, "epoch": 7.910435696386711, "step": 48930}, {"loss": 0.482, "grad_norm": 0.9874461889266968, "learning_rate": 0.0002, "epoch": 7.912052380567456, "step": 48940}, {"loss": 0.4851, "grad_norm": 1.2584952116012573, "learning_rate": 0.0002, "epoch": 7.913669064748201, "step": 48950}, {"loss": 0.4744, "grad_norm": 1.3016353845596313, "learning_rate": 0.0002, "epoch": 7.915285748928946, "step": 48960}, {"loss": 0.4734, "grad_norm": 1.104179859161377, "learning_rate": 0.0002, "epoch": 7.916902433109692, "step": 48970}, {"loss": 0.5143, "grad_norm": 1.26803457736969, "learning_rate": 0.0002, "epoch": 7.918519117290438, "step": 48980}, {"loss": 0.4808, "grad_norm": 1.0336869955062866, "learning_rate": 0.0002, "epoch": 7.920135801471183, "step": 48990}, {"loss": 0.4938, "grad_norm": 1.0630918741226196, "learning_rate": 0.0002, "epoch": 7.921752485651928, "step": 49000}, {"loss": 0.4988, "grad_norm": 1.2257622480392456, "learning_rate": 0.0002, "epoch": 7.923369169832673, "step": 49010}, {"loss": 0.5116, "grad_norm": 1.1722705364227295, "learning_rate": 0.0002, "epoch": 7.924985854013419, "step": 49020}, {"loss": 0.4737, "grad_norm": 1.4473323822021484, "learning_rate": 0.0002, "epoch": 7.926602538194164, "step": 49030}, {"loss": 0.5412, "grad_norm": 1.3780192136764526, "learning_rate": 0.0002, "epoch": 7.928219222374909, "step": 49040}, {"loss": 0.4849, "grad_norm": 1.253423810005188, "learning_rate": 0.0002, "epoch": 7.929835906555654, "step": 49050}, {"loss": 0.5103, "grad_norm": 1.1733828783035278, "learning_rate": 0.0002, "epoch": 7.9314525907363995, "step": 49060}, {"loss": 0.4967, "grad_norm": 1.249990701675415, "learning_rate": 0.0002, "epoch": 7.933069274917145, "step": 49070}, {"loss": 0.51, "grad_norm": 1.4012458324432373, "learning_rate": 0.0002, "epoch": 7.93468595909789, "step": 49080}, {"loss": 0.4785, "grad_norm": 1.268652319908142, "learning_rate": 0.0002, "epoch": 7.936302643278635, "step": 49090}, {"loss": 0.5319, "grad_norm": 1.0469073057174683, "learning_rate": 0.0002, "epoch": 7.9379193274593804, "step": 49100}, {"loss": 0.4836, "grad_norm": 1.3028813600540161, "learning_rate": 0.0002, "epoch": 7.939536011640126, "step": 49110}, {"loss": 0.4791, "grad_norm": 1.0998128652572632, "learning_rate": 0.0002, "epoch": 7.941152695820872, "step": 49120}, {"loss": 0.5022, "grad_norm": 1.300884485244751, "learning_rate": 0.0002, "epoch": 7.942769380001617, "step": 49130}, {"loss": 0.5193, "grad_norm": 1.257865071296692, "learning_rate": 0.0002, "epoch": 7.944386064182362, "step": 49140}, {"loss": 0.4755, "grad_norm": 1.074731707572937, "learning_rate": 0.0002, "epoch": 7.9460027483631075, "step": 49150}, {"loss": 0.4675, "grad_norm": 1.1055876016616821, "learning_rate": 0.0002, "epoch": 7.947619432543853, "step": 49160}, {"loss": 0.4801, "grad_norm": 1.1986541748046875, "learning_rate": 0.0002, "epoch": 7.949236116724598, "step": 49170}, {"loss": 0.488, "grad_norm": 1.094555139541626, "learning_rate": 0.0002, "epoch": 7.950852800905343, "step": 49180}, {"loss": 0.4974, "grad_norm": 1.2922005653381348, "learning_rate": 0.0002, "epoch": 7.952469485086088, "step": 49190}, {"loss": 0.4973, "grad_norm": 1.1557104587554932, "learning_rate": 0.0002, "epoch": 7.954086169266834, "step": 49200}, {"loss": 0.4806, "grad_norm": 1.2414908409118652, "learning_rate": 0.0002, "epoch": 7.955702853447579, "step": 49210}, {"loss": 0.4848, "grad_norm": 1.3606830835342407, "learning_rate": 0.0002, "epoch": 7.957319537628324, "step": 49220}, {"loss": 0.4981, "grad_norm": 0.9592481851577759, "learning_rate": 0.0002, "epoch": 7.958936221809069, "step": 49230}, {"loss": 0.4731, "grad_norm": 1.2130779027938843, "learning_rate": 0.0002, "epoch": 7.960552905989815, "step": 49240}, {"loss": 0.4529, "grad_norm": 1.1078767776489258, "learning_rate": 0.0002, "epoch": 7.96216959017056, "step": 49250}, {"loss": 0.4983, "grad_norm": 1.0684230327606201, "learning_rate": 0.0002, "epoch": 7.963786274351305, "step": 49260}, {"loss": 0.4832, "grad_norm": 1.1368396282196045, "learning_rate": 0.0002, "epoch": 7.965402958532051, "step": 49270}, {"loss": 0.5226, "grad_norm": 1.2161095142364502, "learning_rate": 0.0002, "epoch": 7.967019642712796, "step": 49280}, {"loss": 0.4938, "grad_norm": 1.2087634801864624, "learning_rate": 0.0002, "epoch": 7.968636326893542, "step": 49290}, {"loss": 0.4969, "grad_norm": 1.1078447103500366, "learning_rate": 0.0002, "epoch": 7.970253011074287, "step": 49300}, {"loss": 0.5333, "grad_norm": 1.3378221988677979, "learning_rate": 0.0002, "epoch": 7.971869695255032, "step": 49310}, {"loss": 0.4736, "grad_norm": 1.0475801229476929, "learning_rate": 0.0002, "epoch": 7.973486379435777, "step": 49320}, {"loss": 0.4515, "grad_norm": 0.9948194622993469, "learning_rate": 0.0002, "epoch": 7.9751030636165225, "step": 49330}, {"loss": 0.4685, "grad_norm": 1.06312894821167, "learning_rate": 0.0002, "epoch": 7.976719747797268, "step": 49340}, {"loss": 0.453, "grad_norm": 1.4047085046768188, "learning_rate": 0.0002, "epoch": 7.978336431978013, "step": 49350}, {"loss": 0.5054, "grad_norm": 1.086578130722046, "learning_rate": 0.0002, "epoch": 7.979953116158758, "step": 49360}, {"loss": 0.5024, "grad_norm": 1.2896746397018433, "learning_rate": 0.0002, "epoch": 7.9815698003395035, "step": 49370}, {"loss": 0.5102, "grad_norm": 1.260717511177063, "learning_rate": 0.0002, "epoch": 7.983186484520249, "step": 49380}, {"loss": 0.4836, "grad_norm": 1.4238426685333252, "learning_rate": 0.0002, "epoch": 7.984803168700994, "step": 49390}, {"loss": 0.4797, "grad_norm": 1.1800259351730347, "learning_rate": 0.0002, "epoch": 7.986419852881739, "step": 49400}, {"loss": 0.4911, "grad_norm": 1.128868579864502, "learning_rate": 0.0002, "epoch": 7.988036537062484, "step": 49410}, {"loss": 0.4674, "grad_norm": 1.1832106113433838, "learning_rate": 0.0002, "epoch": 7.9896532212432305, "step": 49420}, {"loss": 0.5016, "grad_norm": 1.1728334426879883, "learning_rate": 0.0002, "epoch": 7.991269905423976, "step": 49430}, {"loss": 0.4706, "grad_norm": 1.2403929233551025, "learning_rate": 0.0002, "epoch": 7.992886589604721, "step": 49440}, {"loss": 0.4989, "grad_norm": 1.245354175567627, "learning_rate": 0.0002, "epoch": 7.994503273785466, "step": 49450}, {"loss": 0.5088, "grad_norm": 1.3526462316513062, "learning_rate": 0.0002, "epoch": 7.9961199579662114, "step": 49460}, {"loss": 0.4703, "grad_norm": 1.2117315530776978, "learning_rate": 0.0002, "epoch": 7.997736642146957, "step": 49470}, {"loss": 0.4802, "grad_norm": 1.0393620729446411, "learning_rate": 0.0002, "epoch": 7.999353326327702, "step": 49480}]}